From 3f677f9c998a18d11641b5a2de6f11d0af32a153 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Fri, 14 Jun 2019 15:41:04 -0700 Subject: [PATCH 001/153] drivers: md: Unify common definitions of raid1 and raid10 These definitions are being moved to raid1-10.c. Signed-off-by: Marcos Paulo de Souza Signed-off-by: Song Liu Signed-off-by: Jens Axboe --- drivers/md/raid1-10.c | 25 +++++++++++++++++++++++++ drivers/md/raid1.c | 29 ++--------------------------- drivers/md/raid10.c | 27 +-------------------------- 3 files changed, 28 insertions(+), 53 deletions(-) diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 400001b815db..7d968bf08e54 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -3,6 +3,31 @@ #define RESYNC_BLOCK_SIZE (64*1024) #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) +/* + * Number of guaranteed raid bios in case of extreme VM load: + */ +#define NR_RAID_BIOS 256 + +/* when we get a read error on a read-only array, we redirect to another + * device without failing the first device, or trying to over-write to + * correct the read error. To keep track of bad blocks on a per-bio + * level, we store IO_BLOCKED in the appropriate 'bios' pointer + */ +#define IO_BLOCKED ((struct bio *)1) +/* When we successfully write to a known bad-block, we need to remove the + * bad-block marking which must be done from process context. So we record + * the success by setting devs[n].bio to IO_MADE_GOOD + */ +#define IO_MADE_GOOD ((struct bio *)2) + +#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) + +/* When there are this many requests queue to be written by + * the raid thread, we become 'congested' to provide back-pressure + * for writeback. + */ +static int max_queued_requests = 1024; + /* for managing resync I/O pages */ struct resync_pages { void *raid_bio; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 2aa36e570e04..e331b433d00c 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -42,31 +42,6 @@ (1L << MD_HAS_PPL) | \ (1L << MD_HAS_MULTIPLE_PPLS)) -/* - * Number of guaranteed r1bios in case of extreme VM load: - */ -#define NR_RAID1_BIOS 256 - -/* when we get a read error on a read-only array, we redirect to another - * device without failing the first device, or trying to over-write to - * correct the read error. To keep track of bad blocks on a per-bio - * level, we store IO_BLOCKED in the appropriate 'bios' pointer - */ -#define IO_BLOCKED ((struct bio *)1) -/* When we successfully write to a known bad-block, we need to remove the - * bad-block marking which must be done from process context. So we record - * the success by setting devs[n].bio to IO_MADE_GOOD - */ -#define IO_MADE_GOOD ((struct bio *)2) - -#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) - -/* When there are this many requests queue to be written by - * the raid1 thread, we become 'congested' to provide back-pressure - * for writeback. - */ -static int max_queued_requests = 1024; - static void allow_barrier(struct r1conf *conf, sector_t sector_nr); static void lower_barrier(struct r1conf *conf, sector_t sector_nr); @@ -2947,7 +2922,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (!conf->poolinfo) goto abort; conf->poolinfo->raid_disks = mddev->raid_disks * 2; - err = mempool_init(&conf->r1bio_pool, NR_RAID1_BIOS, r1bio_pool_alloc, + err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS, r1bio_pool_alloc, r1bio_pool_free, conf->poolinfo); if (err) goto abort; @@ -3232,7 +3207,7 @@ static int raid1_reshape(struct mddev *mddev) newpoolinfo->mddev = mddev; newpoolinfo->raid_disks = raid_disks * 2; - ret = mempool_init(&newpool, NR_RAID1_BIOS, r1bio_pool_alloc, + ret = mempool_init(&newpool, NR_RAID_BIOS, r1bio_pool_alloc, r1bio_pool_free, newpoolinfo); if (ret) { kfree(newpoolinfo); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index aea11476fee6..1facd0153399 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -64,31 +64,6 @@ * [B A] [D C] [B A] [E C D] */ -/* - * Number of guaranteed r10bios in case of extreme VM load: - */ -#define NR_RAID10_BIOS 256 - -/* when we get a read error on a read-only array, we redirect to another - * device without failing the first device, or trying to over-write to - * correct the read error. To keep track of bad blocks on a per-bio - * level, we store IO_BLOCKED in the appropriate 'bios' pointer - */ -#define IO_BLOCKED ((struct bio *)1) -/* When we successfully write to a known bad-block, we need to remove the - * bad-block marking which must be done from process context. So we record - * the success by setting devs[n].bio to IO_MADE_GOOD - */ -#define IO_MADE_GOOD ((struct bio *)2) - -#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) - -/* When there are this many requests queued to be written by - * the raid10 thread, we become 'congested' to provide back-pressure - * for writeback. - */ -static int max_queued_requests = 1024; - static void allow_barrier(struct r10conf *conf); static void lower_barrier(struct r10conf *conf); static int _enough(struct r10conf *conf, int previous, int ignore); @@ -3675,7 +3650,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) conf->geo = geo; conf->copies = copies; - err = mempool_init(&conf->r10bio_pool, NR_RAID10_BIOS, r10bio_pool_alloc, + err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc, r10bio_pool_free, conf); if (err) goto out; From d9771f5ec46c282d518b453c793635dbdc3a2a94 Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Fri, 14 Jun 2019 15:41:05 -0700 Subject: [PATCH 002/153] raid5-cache: Need to do start() part job after adding journal device commit d5d885fd514f ("md: introduce new personality funciton start()") splits the init job to two parts. The first part run() does the jobs that do not require the md threads. The second part start() does the jobs that require the md threads. Now it just does run() in adding new journal device. It needs to do the second part start() too. Fixes: d5d885fd514f ("md: introduce new personality funciton start()") Cc: stable@vger.kernel.org #v4.9+ Reported-by: Michal Soltys Signed-off-by: Xiao Ni Signed-off-by: Song Liu Signed-off-by: Jens Axboe --- drivers/md/raid5.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b83bce2beb66..da94cbaa1a9e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7672,7 +7672,7 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) { struct r5conf *conf = mddev->private; - int err = -EEXIST; + int ret, err = -EEXIST; int disk; struct disk_info *p; int first = 0; @@ -7687,7 +7687,14 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) * The array is in readonly mode if journal is missing, so no * write requests running. We should be safe */ - log_init(conf, rdev, false); + ret = log_init(conf, rdev, false); + if (ret) + return ret; + + ret = r5l_start(conf->log); + if (ret) + return ret; + return 0; } if (mddev->recovery_disabled == conf->recovery_disabled) From 168b305b0cfb7467a6691993f922ecbdcfc00c98 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Fri, 14 Jun 2019 15:41:06 -0700 Subject: [PATCH 003/153] md: md.c: Return -ENODEV when mddev is NULL in rdev_attr_show Commit c42d3240990814eec1e4b2b93fa0487fc4873aed ("md: return -ENODEV if rdev has no mddev assigned") changed rdev_attr_store to return -ENODEV when rdev->mddev is NULL, now do the same to rdev_attr_show. Signed-off-by: Marcos Paulo de Souza Signed-off-by: Song Liu Signed-off-by: Jens Axboe --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 04f4f131f9d6..86f4f2b5a724 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3356,7 +3356,7 @@ rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) if (!entry->show) return -EIO; if (!rdev->mddev) - return -EBUSY; + return -ENODEV; return entry->show(rdev, page); } From e5b521ee9b58c8954ad4d75ccaed9428f4b1a0ca Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Fri, 14 Jun 2019 15:41:07 -0700 Subject: [PATCH 004/153] md: fix spelling typo and add necessary space This patch fix a spelling typo and add necessary space for code. In addition, the patch get rid of the unnecessary 'if'. Signed-off-by: Yufen Yu Signed-off-by: Song Liu Signed-off-by: Jens Axboe --- drivers/md/md.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 86f4f2b5a724..1f37a1adc926 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5639,8 +5639,7 @@ int md_run(struct mddev *mddev) spin_unlock(&mddev->lock); rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) - if (sysfs_link_rdev(mddev, rdev)) - /* failure here is OK */; + sysfs_link_rdev(mddev, rdev); /* failure here is OK */ if (mddev->degraded && !mddev->ro) /* This ensures that recovering status is reported immediately @@ -8190,8 +8189,7 @@ void md_do_sync(struct md_thread *thread) { struct mddev *mddev = thread->mddev; struct mddev *mddev2; - unsigned int currspeed = 0, - window; + unsigned int currspeed = 0, window; sector_t max_sectors,j, io_sectors, recovery_done; unsigned long mark[SYNC_MARKS]; unsigned long update_time; @@ -8248,7 +8246,7 @@ void md_do_sync(struct md_thread *thread) * 0 == not engaged in resync at all * 2 == checking that there is no conflict with another sync * 1 == like 2, but have yielded to allow conflicting resync to - * commense + * commence * other == active in resync - this many blocks * * Before starting a resync we must have set curr_resync to @@ -8379,7 +8377,7 @@ void md_do_sync(struct md_thread *thread) /* * Tune reconstruction: */ - window = 32*(PAGE_SIZE/512); + window = 32 * (PAGE_SIZE / 512); pr_debug("md: using %dk window, over a total of %lluk.\n", window/2, (unsigned long long)max_sectors/2); @@ -9192,7 +9190,6 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) * perform resync with the new activated disk */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); - } /* device faulty * We just want to do the minimum to mark the disk From ebfeb444fa6fd9bc7be62694fff838bc57e19a7d Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Fri, 14 Jun 2019 15:41:08 -0700 Subject: [PATCH 005/153] md/raid1: get rid of extra blank line and space This patch get rid of extra blank line and space, and add necessary space for code. Signed-off-by: Yufen Yu Signed-off-by: Song Liu Signed-off-by: Jens Axboe --- drivers/md/raid1.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e331b433d00c..869c32fea1b8 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1424,7 +1424,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (!r1_bio->bios[i]) continue; - if (first_clone) { /* do behind I/O ? * Not if there are too many, or cannot @@ -1704,9 +1703,8 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) first = last = rdev->saved_raid_disk; for (mirror = first; mirror <= last; mirror++) { - p = conf->mirrors+mirror; + p = conf->mirrors + mirror; if (!p->rdev) { - if (mddev->gendisk) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); @@ -2863,7 +2861,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (read_targets == 1) bio->bi_opf &= ~MD_FAILFAST; generic_make_request(bio); - } return nr_sectors; } @@ -3064,7 +3061,7 @@ static int raid1_run(struct mddev *mddev) } mddev->degraded = 0; - for (i=0; i < conf->raid_disks; i++) + for (i = 0; i < conf->raid_disks; i++) if (conf->mirrors[i].rdev == NULL || !test_bit(In_sync, &conf->mirrors[i].rdev->flags) || test_bit(Faulty, &conf->mirrors[i].rdev->flags)) @@ -3099,7 +3096,7 @@ static int raid1_run(struct mddev *mddev) mddev->queue); } - ret = md_integrity_register(mddev); + ret = md_integrity_register(mddev); if (ret) { md_unregister_thread(&mddev->thread); raid1_free(mddev, conf); From 8cf05a7841e1cfd894741d6bab43067b0ca85eb8 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 14 Jun 2019 15:41:09 -0700 Subject: [PATCH 006/153] md: raid10: Use struct_size() in kmalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct foo { int stuff; struct boo entry[]; }; instance = kmalloc(size, GFP_KERNEL); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper: instance = kmalloc(struct_size(instance, entry, count), GFP_KERNEL); This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Song Liu Signed-off-by: Jens Axboe --- drivers/md/raid10.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 1facd0153399..f35e076ee47d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4755,8 +4755,7 @@ static int handle_reshape_read_error(struct mddev *mddev, int idx = 0; struct page **pages; - r10b = kmalloc(sizeof(*r10b) + - sizeof(struct r10dev) * conf->copies, GFP_NOIO); + r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO); if (!r10b) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); return -ENOMEM; From c7afa8034b09bc2bb664d86de7db34466401f352 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Fri, 14 Jun 2019 15:41:10 -0700 Subject: [PATCH 007/153] md: raid1-10: Unify r{1,10}bio_pool_free Avoiding duplicated code, since they just execute a kfree. Signed-off-by: Marcos Paulo de Souza Signed-off-by: Jens Axboe --- drivers/md/raid1-10.c | 5 +++++ drivers/md/raid1.c | 13 ++++--------- drivers/md/raid10.c | 11 +++-------- 3 files changed, 12 insertions(+), 17 deletions(-) diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 7d968bf08e54..54db34163968 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -34,6 +34,11 @@ struct resync_pages { struct page *pages[RESYNC_PAGES]; }; +static void rbio_pool_free(void *rbio, void *data) +{ + kfree(rbio); +} + static inline int resync_alloc_pages(struct resync_pages *rp, gfp_t gfp_flags) { diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 869c32fea1b8..a7860b5f33f2 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -68,11 +68,6 @@ static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) return kzalloc(size, gfp_flags); } -static void r1bio_pool_free(void *r1_bio, void *data) -{ - kfree(r1_bio); -} - #define RESYNC_DEPTH 32 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) #define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) @@ -148,7 +143,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) kfree(rps); out_free_r1bio: - r1bio_pool_free(r1_bio, data); + rbio_pool_free(r1_bio, data); return NULL; } @@ -168,7 +163,7 @@ static void r1buf_pool_free(void *__r1_bio, void *data) /* resync pages array stored in the 1st bio's .bi_private */ kfree(rp); - r1bio_pool_free(r1bio, data); + rbio_pool_free(r1bio, data); } static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio) @@ -2920,7 +2915,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) goto abort; conf->poolinfo->raid_disks = mddev->raid_disks * 2; err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS, r1bio_pool_alloc, - r1bio_pool_free, conf->poolinfo); + rbio_pool_free, conf->poolinfo); if (err) goto abort; @@ -3205,7 +3200,7 @@ static int raid1_reshape(struct mddev *mddev) newpoolinfo->raid_disks = raid_disks * 2; ret = mempool_init(&newpool, NR_RAID_BIOS, r1bio_pool_alloc, - r1bio_pool_free, newpoolinfo); + rbio_pool_free, newpoolinfo); if (ret) { kfree(newpoolinfo); return ret; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f35e076ee47d..c9a149b2ec86 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -98,11 +98,6 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) return kzalloc(size, gfp_flags); } -static void r10bio_pool_free(void *r10_bio, void *data) -{ - kfree(r10_bio); -} - #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) /* amount of memory to reserve for resync requests */ #define RESYNC_WINDOW (1024*1024) @@ -208,7 +203,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) } kfree(rps); out_free_r10bio: - r10bio_pool_free(r10_bio, conf); + rbio_pool_free(r10_bio, conf); return NULL; } @@ -236,7 +231,7 @@ static void r10buf_pool_free(void *__r10_bio, void *data) /* resync pages array stored in the 1st bio's .bi_private */ kfree(rp); - r10bio_pool_free(r10bio, conf); + rbio_pool_free(r10bio, conf); } static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) @@ -3651,7 +3646,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) conf->geo = geo; conf->copies = copies; err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc, - r10bio_pool_free, conf); + rbio_pool_free, conf); if (err) goto out; From e9eeba28a1e01a55b49cdcf9c7a346d2aaa0aa7d Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 14 Jun 2019 15:41:11 -0700 Subject: [PATCH 008/153] md/raid10: read balance chooses idlest disk for SSD Andy reported that raid10 array with SSD disks has poor read performance. Compared with raid1, RAID-1 can be 3x faster than RAID-10 sometimes [1]. The thing is that raid10 chooses the low distance disk for read request, however, the approach doesn't work well for SSD device since it doesn't have spindle like HDD, we should just read from the SSD which has less pending IO like commit 9dedf60313fa4 ("md/raid1: read balance chooses idlest disk for SSD"). So this commit selects the idlest SSD disk for read if array has none rotational disk, otherwise, read_balance uses the previous distance priority algorithm. With the change, the performance of raid10 gets increased largely per Andy's test [2]. [1]. https://marc.info/?l=linux-raid&m=155915890004761&w=2 [2]. https://marc.info/?l=linux-raid&m=155990654223786&w=2 Tested-by: Andy Smith Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu Signed-off-by: Jens Axboe --- drivers/md/raid10.c | 45 +++++++++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c9a149b2ec86..8a1354a08a1a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -707,15 +707,19 @@ static struct md_rdev *read_balance(struct r10conf *conf, int sectors = r10_bio->sectors; int best_good_sectors; sector_t new_distance, best_dist; - struct md_rdev *best_rdev, *rdev = NULL; + struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL; int do_balance; - int best_slot; + int best_dist_slot, best_pending_slot; + bool has_nonrot_disk = false; + unsigned int min_pending; struct geom *geo = &conf->geo; raid10_find_phys(conf, r10_bio); rcu_read_lock(); - best_slot = -1; - best_rdev = NULL; + best_dist_slot = -1; + min_pending = UINT_MAX; + best_dist_rdev = NULL; + best_pending_rdev = NULL; best_dist = MaxSector; best_good_sectors = 0; do_balance = 1; @@ -737,6 +741,8 @@ static struct md_rdev *read_balance(struct r10conf *conf, sector_t first_bad; int bad_sectors; sector_t dev_sector; + unsigned int pending; + bool nonrot; if (r10_bio->devs[slot].bio == IO_BLOCKED) continue; @@ -773,8 +779,8 @@ static struct md_rdev *read_balance(struct r10conf *conf, first_bad - dev_sector; if (good_sectors > best_good_sectors) { best_good_sectors = good_sectors; - best_slot = slot; - best_rdev = rdev; + best_dist_slot = slot; + best_dist_rdev = rdev; } if (!do_balance) /* Must read from here */ @@ -787,14 +793,23 @@ static struct md_rdev *read_balance(struct r10conf *conf, if (!do_balance) break; - if (best_slot >= 0) + nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev)); + has_nonrot_disk |= nonrot; + pending = atomic_read(&rdev->nr_pending); + if (min_pending > pending && nonrot) { + min_pending = pending; + best_pending_slot = slot; + best_pending_rdev = rdev; + } + + if (best_dist_slot >= 0) /* At least 2 disks to choose from so failfast is OK */ set_bit(R10BIO_FailFast, &r10_bio->state); /* This optimisation is debatable, and completely destroys * sequential read speed for 'far copies' arrays. So only * keep it for 'near' arrays, and review those later. */ - if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending)) + if (geo->near_copies > 1 && !pending) new_distance = 0; /* for far > 1 always use the lowest address */ @@ -803,15 +818,21 @@ static struct md_rdev *read_balance(struct r10conf *conf, else new_distance = abs(r10_bio->devs[slot].addr - conf->mirrors[disk].head_position); + if (new_distance < best_dist) { best_dist = new_distance; - best_slot = slot; - best_rdev = rdev; + best_dist_slot = slot; + best_dist_rdev = rdev; } } if (slot >= conf->copies) { - slot = best_slot; - rdev = best_rdev; + if (has_nonrot_disk) { + slot = best_pending_slot; + rdev = best_pending_rdev; + } else { + slot = best_dist_slot; + rdev = best_dist_rdev; + } } if (slot >= 0) { From 315eb656649db680af06c5e56c6cb77c858cbe45 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 14 Jun 2019 14:39:26 +0300 Subject: [PATCH 009/153] blk-mq/debugfs: Fix improper print qualifier struct blk_rq_stat::mean is a u64 value, so use %llu Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 2489ddbb21db..f0550be60824 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -17,7 +17,7 @@ static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) { if (stat->nr_samples) { - seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu", + seq_printf(m, "samples=%d, mean=%llu, min=%llu, max=%llu", stat->nr_samples, stat->mean, stat->min, stat->max); } else { seq_puts(m, "samples=0"); From 7602843fd873cae43a444b83b14dfdd114a9659c Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Sat, 15 Jun 2019 01:43:48 -0600 Subject: [PATCH 010/153] block: null_blk: fix race condition for null_del_dev Dulicate call of null_del_dev() will trigger null pointer error like below. The reason is a race condition between nullb_device_power_store() and nullb_group_drop_item(). CPU#0 CPU#1 ---------------- ----------------- do_rmdir() >configfs_rmdir() >client_drop_item() >nullb_group_drop_item() nullb_device_power_store() >null_del_dev() >test_and_clear_bit(NULLB_DEV_FL_UP >null_del_dev() ^^^^^ Duplicated null_dev_dev() triger null pointer error >clear_bit(NULLB_DEV_FL_UP The fix could be keep the sequnce of clear NULLB_DEV_FL_UP and null_del_dev(). [ 698.613600] BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 [ 698.613608] #PF error: [normal kernel read fault] [ 698.613611] PGD 0 P4D 0 [ 698.613619] Oops: 0000 [#1] SMP PTI [ 698.613627] CPU: 3 PID: 6382 Comm: rmdir Not tainted 5.0.0+ #35 [ 698.613631] Hardware name: LENOVO 20LJS2EV08/20LJS2EV08, BIOS R0SET33W (1.17 ) 07/18/2018 [ 698.613644] RIP: 0010:null_del_dev+0xc/0x110 [null_blk] [ 698.613649] Code: 00 00 00 5b 41 5c 41 5d 41 5e 41 5f 5d c3 0f 0b eb 97 e8 47 bb 2a e8 0f 1f 80 00 00 00 00 0f 1f 44 00 00 55 48 89 e5 41 54 53 <8b> 77 18 48 89 fb 4c 8b 27 48 c7 c7 40 57 1e c1 e8 bf c7 cb e8 48 [ 698.613654] RSP: 0018:ffffb887888bfde0 EFLAGS: 00010286 [ 698.613659] RAX: 0000000000000000 RBX: ffff9d436d92bc00 RCX: ffff9d43a9184681 [ 698.613663] RDX: ffffffffc11e5c30 RSI: 0000000068be6540 RDI: 0000000000000000 [ 698.613667] RBP: ffffb887888bfdf0 R08: 0000000000000001 R09: 0000000000000000 [ 698.613671] R10: ffffb887888bfdd8 R11: 0000000000000f16 R12: ffff9d436d92bc08 [ 698.613675] R13: ffff9d436d94e630 R14: ffffffffc11e5088 R15: ffffffffc11e5000 [ 698.613680] FS: 00007faa68be6540(0000) GS:ffff9d43d14c0000(0000) knlGS:0000000000000000 [ 698.613685] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 698.613689] CR2: 0000000000000018 CR3: 000000042f70c002 CR4: 00000000003606e0 [ 698.613693] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 698.613697] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 698.613700] Call Trace: [ 698.613712] nullb_group_drop_item+0x50/0x70 [null_blk] [ 698.613722] client_drop_item+0x29/0x40 [ 698.613728] configfs_rmdir+0x1ed/0x300 [ 698.613738] vfs_rmdir+0xb2/0x130 [ 698.613743] do_rmdir+0x1c7/0x1e0 [ 698.613750] __x64_sys_rmdir+0x17/0x20 [ 698.613759] do_syscall_64+0x5a/0x110 [ 698.613768] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Signed-off-by: Bob Liu Signed-off-by: Jens Axboe --- drivers/block/null_blk_main.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 447d635c79a2..2a4f8bc4f930 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -327,11 +327,12 @@ static ssize_t nullb_device_power_store(struct config_item *item, set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); dev->power = newp; } else if (dev->power && !newp) { - mutex_lock(&lock); - dev->power = newp; - null_del_dev(dev->nullb); - mutex_unlock(&lock); - clear_bit(NULLB_DEV_FL_UP, &dev->flags); + if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { + mutex_lock(&lock); + dev->power = newp; + null_del_dev(dev->nullb); + mutex_unlock(&lock); + } clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); } From 78b90a2ce8424eb4be4a6a1623dc7c07af8303aa Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 31 May 2019 13:47:54 -0500 Subject: [PATCH 011/153] block: genhd: Use struct_size() helper Make use of the struct_size() helper instead of an open-coded version in order to avoid any potential type mistakes, in particular in the context in which this code is being used. So, replace the following form: sizeof(*new_ptbl) + target * sizeof(new_ptbl->part[0]) with: struct_size(new_ptbl, part, target) Also, notice that variable size is unnecessary, hence it is removed. This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Jens Axboe --- block/genhd.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 24654e1d83e6..97887e59f3b2 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1281,7 +1281,6 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) struct disk_part_tbl *new_ptbl; int len = old_ptbl ? old_ptbl->len : 0; int i, target; - size_t size; /* * check for int overflow, since we can get here from blkpg_ioctl() @@ -1298,8 +1297,8 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) if (target <= len) return 0; - size = sizeof(*new_ptbl) + target * sizeof(new_ptbl->part[0]); - new_ptbl = kzalloc_node(size, GFP_KERNEL, disk->node_id); + new_ptbl = kzalloc_node(struct_size(new_ptbl, part, target), GFP_KERNEL, + disk->node_id); if (!new_ptbl) return -ENOMEM; From f1f8f292cd12292289cae87aac3a5c035186ec54 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 10 Jun 2019 10:04:12 -0500 Subject: [PATCH 012/153] block: bio: Use struct_size() in kmalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct bio_map_data { ... struct iovec iov[]; }; instance = kmalloc(sizeof(sizeof(struct bio_map_data) + sizeof(struct iovec) * count, GFP_KERNEL); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper: instance = kmalloc(struct_size(instance, iov, count), GFP_KERNEL); This code was detected with the help of Coccinelle. Reviewed-by: Kees Cook Signed-off-by: Gustavo A. R. Silva Signed-off-by: Jens Axboe --- block/bio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/bio.c b/block/bio.c index 683cbb40f051..4bcdcd3f63f4 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1120,8 +1120,7 @@ static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data, if (data->nr_segs > UIO_MAXIOV) return NULL; - bmd = kmalloc(sizeof(struct bio_map_data) + - sizeof(struct iovec) * data->nr_segs, gfp_mask); + bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask); if (!bmd) return NULL; memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs); From 5de0073fcd50cc1f150895a7bb04d3cf8067b1d7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jun 2019 15:30:37 -0700 Subject: [PATCH 013/153] blk-iolatency: clear use_delay when io.latency is set to zero If use_delay was non-zero when the latency target of a cgroup was set to zero, it will stay stuck until io.latency is enabled on the cgroup again. This keeps readahead disabled for the cgroup impacting performance negatively. Signed-off-by: Tejun Heo Cc: Josef Bacik Fixes: d70675121546 ("block: introduce blk-iolatency io controller") Cc: stable@vger.kernel.org # v4.19+ Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index d22e61bced86..17896bb3aaf2 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -778,8 +778,10 @@ static int iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) if (!oldval && val) return 1; - if (oldval && !val) + if (oldval && !val) { + blkcg_clear_delay(blkg); return -1; + } return 0; } From f539da82f2158916e154d206054e0efd5df7ab61 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jun 2019 15:30:38 -0700 Subject: [PATCH 014/153] blkcg: update blkcg_print_stat() to handle larger outputs Depending on the number of devices, blkcg stats can go over the default seqfile buf size. seqfile normally retries with a larger buffer but since the ->pd_stat() addition, blkcg_print_stat() doesn't tell seqfile that overflow has happened and the output gets printed truncated. Fix it by calling seq_commit() w/ -1 on possible overflows. Signed-off-by: Tejun Heo Fixes: 903d23f0a354 ("blk-cgroup: allow controllers to output their own stats") Cc: stable@vger.kernel.org # v4.19+ Cc: Josef Bacik Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1f7127b03490..e4715b35d42c 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1006,8 +1006,12 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) } next: if (has_stats) { - off += scnprintf(buf+off, size-off, "\n"); - seq_commit(sf, off); + if (off < size - 1) { + off += scnprintf(buf+off, size-off, "\n"); + seq_commit(sf, off); + } else { + seq_commit(sf, -1); + } } } From ef069b97feec11c2399bbc5f6f347b35482105dc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jun 2019 15:30:39 -0700 Subject: [PATCH 015/153] blkcg: perpcu_ref init/exit should be done from blkg_alloc/free() blkg alloc is performed as a separate step from the rest of blkg creation so that GFP_KERNEL allocations can be used when creating blkgs from configuration file writes because otherwise user actions may fail due to failures of opportunistic GFP_NOWAIT allocations. While making blkgs use percpu_ref, 7fcf2b033b84 ("blkcg: change blkg reference counting to use percpu_ref") incorrectly added unconditional opportunistic percpu_ref_init() to blkg_create() breaking this guarantee. This patch moves percpu_ref_init() to blkg_alloc() so makes it use @gfp_mask that blkg_alloc() is called with. Also, percpu_ref_exit() is moved to blkg_free() for consistency. Signed-off-by: Tejun Heo Fixes: 7fcf2b033b84 ("blkcg: change blkg reference counting to use percpu_ref") Cc: Dennis Zhou Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index e4715b35d42c..04d286934c5e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -79,6 +79,7 @@ static void blkg_free(struct blkcg_gq *blkg) blkg_rwstat_exit(&blkg->stat_ios); blkg_rwstat_exit(&blkg->stat_bytes); + percpu_ref_exit(&blkg->refcnt); kfree(blkg); } @@ -86,8 +87,6 @@ static void __blkg_release(struct rcu_head *rcu) { struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); - percpu_ref_exit(&blkg->refcnt); - /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); if (blkg->parent) @@ -132,6 +131,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, if (!blkg) return NULL; + if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask)) + goto err_free; + if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) || blkg_rwstat_init(&blkg->stat_ios, gfp_mask)) goto err_free; @@ -244,11 +246,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg_get(blkg->parent); } - ret = percpu_ref_init(&blkg->refcnt, blkg_release, 0, - GFP_NOWAIT | __GFP_NOWARN); - if (ret) - goto err_cancel_ref; - /* invoke per-policy init */ for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -281,8 +278,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg_put(blkg); return ERR_PTR(ret); -err_cancel_ref: - percpu_ref_exit(&blkg->refcnt); err_put_congested: wb_congested_put(wb_congested); err_put_css: From 71c814077de60b2e7415dac6f5c4e98f59d521fd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jun 2019 15:30:40 -0700 Subject: [PATCH 016/153] blkcg: blkcg_activate_policy() should initialize ancestors first When blkcg_activate_policy() is creating blkg_policy_data for existing blkgs, it did in the wrong order - descendants first. Fix it. None of the existing controllers seem affected by this. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 04d286934c5e..440797293235 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1390,7 +1390,8 @@ int blkcg_activate_policy(struct request_queue *q, spin_lock_irq(&q->queue_lock); - list_for_each_entry(blkg, &q->blkg_list, q_node) { + /* blkg_list is pushed at the head, reverse walk to init parents first */ + list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { struct blkg_policy_data *pd; if (blkg->pd[pol->plid]) From 6631142229005e1b1c311a09efe9fb3cfdac8559 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jun 2019 15:30:41 -0700 Subject: [PATCH 017/153] blkcg, writeback: dead memcgs shouldn't contribute to writeback ownership arbitration wbc_account_io() collects information on cgroup ownership of writeback pages to determine which cgroup should own the inode. Pages can stay associated with dead memcgs but we want to avoid attributing IOs to dead blkcgs as much as possible as the association is likely to be stale. However, currently, pages associated with dead memcgs contribute to the accounting delaying and/or confusing the arbitration. Fix it by ignoring pages associated with dead memcgs. Signed-off-by: Tejun Heo Cc: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e41cbe8e81b9..9ebfb1b28430 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -715,6 +715,7 @@ void wbc_detach_inode(struct writeback_control *wbc) void wbc_account_io(struct writeback_control *wbc, struct page *page, size_t bytes) { + struct cgroup_subsys_state *css; int id; /* @@ -726,7 +727,12 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page, if (!wbc->wb) return; - id = mem_cgroup_css_from_page(page)->id; + css = mem_cgroup_css_from_page(page); + /* dead cgroups shouldn't contribute to inode ownership arbitration */ + if (!(css->flags & CSS_ONLINE)) + return; + + id = css->id; if (id == wbc->wb_id) { wbc->wb_bytes += bytes; From f9bc64a0f0f884036d76d71edeaafb994c5ceddf Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 13 Jun 2019 07:14:21 -0700 Subject: [PATCH 018/153] block: use req_op() to maintain consistency This is a pure code cleanup patch and doesn't change any functionality. In block layer to identify the request operation req_op() macro is used, so change the open coding the req_op() in the blk-mq-debugfs.c. Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index f0550be60824..5d940ff124a5 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -341,7 +341,7 @@ static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state) int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) { const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; - const unsigned int op = rq->cmd_flags & REQ_OP_MASK; + const unsigned int op = req_op(rq); seq_printf(m, "%p {.op=", rq); if (op < ARRAY_SIZE(op_name) && op_name[op]) From 2af47c10e80baf91cff56c44cec47402e05ac45c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 19 Jun 2019 15:19:44 +0200 Subject: [PATCH 019/153] floppy: fix harmless clang build warning clang warns about unusual code in floppy.c that looks like it was intended to be a bit mask operation, checking for a specific bit in the UDP->cmos variable (FLOPPY1_TYPE expands to '4' on ARM): drivers/block/floppy.c:3902:17: error: use of logical '&&' with constant operand [-Werror,-Wconstant-logical-operand] if (!UDP->cmos && FLOPPY1_TYPE) ^ ~~~~~~~~~~~~ drivers/block/floppy.c:3902:17: note: use '&' for a bitwise operation if (!UDP->cmos && FLOPPY1_TYPE) The check here is redundant anyway, if FLOPPY1_TYPE is zero, then assigning it to a zero UDP->cmos field does not change anything, so removing the extra check here has no effect other than shutting up the warning. On x86, this will no longer read a hardware register, as the FLOPPY1_TYPE macro is not expanded if UDP->cmos is already zero, but the result is the same. Cc: Robert Elliott Cc: Keith Busch Link: https://patchwork.kernel.org/patch/10851841/ Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 9fb9b312ab6b..b933a7eea52b 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3900,7 +3900,7 @@ static void __init config_types(void) if (!UDP->cmos) UDP->cmos = FLOPPY0_TYPE; drive = 1; - if (!UDP->cmos && FLOPPY1_TYPE) + if (!UDP->cmos) UDP->cmos = FLOPPY1_TYPE; /* FIXME: additional physical CMOS drive detection should go here */ From 2f578aaf51624aa6fcff041fc7dc5c2d4dfa447f Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 9 Jun 2019 05:15:51 +0900 Subject: [PATCH 020/153] block: move tag field position in struct request __data_len and __sector are internal fields which should not be accessed directly in driver-level like the comment above it. But, tag field can be accessed by driver level directly so that we need to make the comment right by moving it to some other place. Cc: Jens Axboe Cc: linux-block@vger.kernel.org Signed-off-by: Minwoo Im Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 592669bcc536..90e6914bea0c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -137,11 +137,11 @@ struct request { unsigned int cmd_flags; /* op and common flags */ req_flags_t rq_flags; + int tag; int internal_tag; /* the following two fields are internal, NEVER access directly */ unsigned int __data_len; /* total data len */ - int tag; sector_t __sector; /* sector cursor */ struct bio *bio; From 8c54803b98d5907b45fe98270be5ed4fbc7e4c4c Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 19 Jun 2019 21:56:58 -0700 Subject: [PATCH 021/153] null_blk: remove duplicate 0 initialization In function null_add_dev() struct nullb *nullb member is allocated using kzalloc_node() which returns 0red memory. In function setup_queues() which is called from the null_add_dev(), on successful queue allocation we set the nullb->nr_queues = 0 which is not needed due to earlier use of kzalloc_node(). Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- drivers/block/null_blk_main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 2a4f8bc4f930..22303e59a274 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -1489,7 +1489,6 @@ static int setup_queues(struct nullb *nullb) if (!nullb->queues) return -ENOMEM; - nullb->nr_queues = 0; nullb->queue_depth = nullb->dev->hw_queue_depth; return 0; From ee1e03598f7961f471367e075edcdbd8492a2239 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 19 Jun 2019 15:01:48 -0700 Subject: [PATCH 022/153] block: get rid of redundant else This is a pure code cleanup patch and doesn't change any functionality. This removes the redundant else in the code which is not needed since we are returning from function anyway. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 5d940ff124a5..84394835e2b0 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -779,8 +779,8 @@ static int blk_mq_debugfs_release(struct inode *inode, struct file *file) if (attr->show) return single_release(inode, file); - else - return seq_release(inode, file); + + return seq_release(inode, file); } static const struct file_operations blk_mq_debugfs_fops = { From 3f6d385f818029d353fb932fcac38c3f11eeeb20 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 19 Jun 2019 15:01:49 -0700 Subject: [PATCH 023/153] block: use right format specifier for op In function __blk_mq_debugfs_rq_show variable op has unsigned int type. Since op can never be negative use %u format specifier to match the variable type. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 84394835e2b0..03b6aabbe602 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -347,7 +347,7 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) if (op < ARRAY_SIZE(op_name) && op_name[op]) seq_printf(m, "%s", op_name[op]); else - seq_printf(m, "%d", op); + seq_printf(m, "%u", op); seq_puts(m, ", .cmd_flags="); blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name, ARRAY_SIZE(cmd_flag_name)); From 243d9f78d942c4ed4a684202814c6cd0d1bcd954 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 19 Jun 2019 15:01:50 -0700 Subject: [PATCH 024/153] block: code cleanup queue_poll_stat_show() This is a pure code cleanup patch and doesn't change any functionality. Having multiple coding styles in the code creates confusion when someone tries to add a new code. Make queue_poll_stat_show() consistent by adding spaces around binary operators with the rest of the code. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 03b6aabbe602..a8376cc06a39 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -29,13 +29,13 @@ static int queue_poll_stat_show(void *data, struct seq_file *m) struct request_queue *q = data; int bucket; - for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS/2; bucket++) { - seq_printf(m, "read (%d Bytes): ", 1 << (9+bucket)); - print_stat(m, &q->poll_stat[2*bucket]); + for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) { + seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket)); + print_stat(m, &q->poll_stat[2 * bucket]); seq_puts(m, "\n"); - seq_printf(m, "write (%d Bytes): ", 1 << (9+bucket)); - print_stat(m, &q->poll_stat[2*bucket+1]); + seq_printf(m, "write (%d Bytes): ", 1 << (9 + bucket)); + print_stat(m, &q->poll_stat[2 * bucket + 1]); seq_puts(m, "\n"); } return 0; From 3a211b71529fdd0a89095b18fb19155db0c8fb5d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 23 May 2019 18:43:11 +0300 Subject: [PATCH 025/153] blk-core: Remove blk_end_request*() declarations Commit a1ce35fa49852db60fc6e268 ("block: remove dead elevator code") deleted blk_end_request() and friends, but some declaration are still left. Purge them. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- include/linux/blkdev.h | 12 ------------ 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 8340f69670d8..94c6520bc786 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1348,7 +1348,7 @@ EXPORT_SYMBOL_GPL(blk_steal_bios); * * This special helper function is only for request stacking drivers * (e.g. request-based dm) so that they can handle partial completion. - * Actual device drivers should use blk_end_request instead. + * Actual device drivers should use blk_mq_end_request instead. * * Passing the result of blk_rq_bytes() as @nr_bytes guarantees * %false return from this function. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 90e6914bea0c..ad49a775c54f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1026,21 +1026,9 @@ void blk_steal_bios(struct bio_list *list, struct request *rq); * * blk_update_request() completes given number of bytes and updates * the request without completing it. - * - * blk_end_request() and friends. __blk_end_request() must be called - * with the request queue spinlock acquired. - * - * Several drivers define their own end_request and call - * blk_end_request() for parts of the original function. - * This prevents code duplication in drivers. */ extern bool blk_update_request(struct request *rq, blk_status_t error, unsigned int nr_bytes); -extern void blk_end_request_all(struct request *rq, blk_status_t error); -extern bool __blk_end_request(struct request *rq, blk_status_t error, - unsigned int nr_bytes); -extern void __blk_end_request_all(struct request *rq, blk_status_t error); -extern bool __blk_end_request_cur(struct request *rq, blk_status_t error); extern void __blk_complete_request(struct request *); extern void blk_abort_request(struct request *); From d27e84a305980ac61df0a6841059d0eb09b8283d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 18 Jun 2019 17:45:49 +0200 Subject: [PATCH 026/153] block: drbd: no need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Cc: Philipp Reisner Cc: Lars Ellenberg Cc: Jens Axboe Cc: drbd-dev@lists.linbit.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_debugfs.c | 64 +------------------------------ drivers/block/drbd/drbd_debugfs.h | 4 +- drivers/block/drbd/drbd_main.c | 3 +- 3 files changed, 5 insertions(+), 66 deletions(-) diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c index f13b48ff5f43..b3b9cd5628fd 100644 --- a/drivers/block/drbd/drbd_debugfs.c +++ b/drivers/block/drbd/drbd_debugfs.c @@ -465,35 +465,20 @@ static const struct file_operations in_flight_summary_fops = { void drbd_debugfs_resource_add(struct drbd_resource *resource) { struct dentry *dentry; - if (!drbd_debugfs_resources) - return; dentry = debugfs_create_dir(resource->name, drbd_debugfs_resources); - if (IS_ERR_OR_NULL(dentry)) - goto fail; resource->debugfs_res = dentry; dentry = debugfs_create_dir("volumes", resource->debugfs_res); - if (IS_ERR_OR_NULL(dentry)) - goto fail; resource->debugfs_res_volumes = dentry; dentry = debugfs_create_dir("connections", resource->debugfs_res); - if (IS_ERR_OR_NULL(dentry)) - goto fail; resource->debugfs_res_connections = dentry; dentry = debugfs_create_file("in_flight_summary", 0440, resource->debugfs_res, resource, &in_flight_summary_fops); - if (IS_ERR_OR_NULL(dentry)) - goto fail; resource->debugfs_res_in_flight_summary = dentry; - return; - -fail: - drbd_debugfs_resource_cleanup(resource); - drbd_err(resource, "failed to create debugfs dentry\n"); } static void drbd_debugfs_remove(struct dentry **dp) @@ -636,35 +621,22 @@ void drbd_debugfs_connection_add(struct drbd_connection *connection) { struct dentry *conns_dir = connection->resource->debugfs_res_connections; struct dentry *dentry; - if (!conns_dir) - return; /* Once we enable mutliple peers, * these connections will have descriptive names. * For now, it is just the one connection to the (only) "peer". */ dentry = debugfs_create_dir("peer", conns_dir); - if (IS_ERR_OR_NULL(dentry)) - goto fail; connection->debugfs_conn = dentry; dentry = debugfs_create_file("callback_history", 0440, connection->debugfs_conn, connection, &connection_callback_history_fops); - if (IS_ERR_OR_NULL(dentry)) - goto fail; connection->debugfs_conn_callback_history = dentry; dentry = debugfs_create_file("oldest_requests", 0440, connection->debugfs_conn, connection, &connection_oldest_requests_fops); - if (IS_ERR_OR_NULL(dentry)) - goto fail; connection->debugfs_conn_oldest_requests = dentry; - return; - -fail: - drbd_debugfs_connection_cleanup(connection); - drbd_err(connection, "failed to create debugfs dentry\n"); } void drbd_debugfs_connection_cleanup(struct drbd_connection *connection) @@ -809,8 +781,6 @@ void drbd_debugfs_device_add(struct drbd_device *device) snprintf(vnr_buf, sizeof(vnr_buf), "%u", device->vnr); dentry = debugfs_create_dir(vnr_buf, vols_dir); - if (IS_ERR_OR_NULL(dentry)) - goto fail; device->debugfs_vol = dentry; snprintf(minor_buf, sizeof(minor_buf), "%u", device->minor); @@ -819,18 +789,14 @@ void drbd_debugfs_device_add(struct drbd_device *device) if (!slink_name) goto fail; dentry = debugfs_create_symlink(minor_buf, drbd_debugfs_minors, slink_name); + device->debugfs_minor = dentry; kfree(slink_name); slink_name = NULL; - if (IS_ERR_OR_NULL(dentry)) - goto fail; - device->debugfs_minor = dentry; #define DCF(name) do { \ dentry = debugfs_create_file(#name, 0440, \ device->debugfs_vol, device, \ &device_ ## name ## _fops); \ - if (IS_ERR_OR_NULL(dentry)) \ - goto fail; \ device->debugfs_vol_ ## name = dentry; \ } while (0) @@ -864,19 +830,9 @@ void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device) struct dentry *dentry; char vnr_buf[8]; - if (!conn_dir) - return; - snprintf(vnr_buf, sizeof(vnr_buf), "%u", peer_device->device->vnr); dentry = debugfs_create_dir(vnr_buf, conn_dir); - if (IS_ERR_OR_NULL(dentry)) - goto fail; peer_device->debugfs_peer_dev = dentry; - return; - -fail: - drbd_debugfs_peer_device_cleanup(peer_device); - drbd_err(peer_device, "failed to create debugfs entries\n"); } void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device) @@ -917,35 +873,19 @@ void drbd_debugfs_cleanup(void) drbd_debugfs_remove(&drbd_debugfs_root); } -int __init drbd_debugfs_init(void) +void __init drbd_debugfs_init(void) { struct dentry *dentry; dentry = debugfs_create_dir("drbd", NULL); - if (IS_ERR_OR_NULL(dentry)) - goto fail; drbd_debugfs_root = dentry; dentry = debugfs_create_file("version", 0444, drbd_debugfs_root, NULL, &drbd_version_fops); - if (IS_ERR_OR_NULL(dentry)) - goto fail; drbd_debugfs_version = dentry; dentry = debugfs_create_dir("resources", drbd_debugfs_root); - if (IS_ERR_OR_NULL(dentry)) - goto fail; drbd_debugfs_resources = dentry; dentry = debugfs_create_dir("minors", drbd_debugfs_root); - if (IS_ERR_OR_NULL(dentry)) - goto fail; drbd_debugfs_minors = dentry; - return 0; - -fail: - drbd_debugfs_cleanup(); - if (dentry) - return PTR_ERR(dentry); - else - return -EINVAL; } diff --git a/drivers/block/drbd/drbd_debugfs.h b/drivers/block/drbd/drbd_debugfs.h index 4ecfbb3358d7..58e31cef0844 100644 --- a/drivers/block/drbd/drbd_debugfs.h +++ b/drivers/block/drbd/drbd_debugfs.h @@ -6,7 +6,7 @@ #include "drbd_int.h" #ifdef CONFIG_DEBUG_FS -int __init drbd_debugfs_init(void); +void __init drbd_debugfs_init(void); void drbd_debugfs_cleanup(void); void drbd_debugfs_resource_add(struct drbd_resource *resource); @@ -22,7 +22,7 @@ void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device); void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device); #else -static inline int __init drbd_debugfs_init(void) { return -ENODEV; } +static inline void __init drbd_debugfs_init(void) { } static inline void drbd_debugfs_cleanup(void) { } static inline void drbd_debugfs_resource_add(struct drbd_resource *resource) { } diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 541b31fa42b3..40edaf87241c 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3009,8 +3009,7 @@ static int __init drbd_init(void) spin_lock_init(&retry.lock); INIT_LIST_HEAD(&retry.writes); - if (drbd_debugfs_init()) - pr_notice("failed to initialize debugfs -- will not be available\n"); + drbd_debugfs_init(); pr_info("initialized. " "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n", From a3fb01ba5af066521f3f3421839e501bb2c71805 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Thu, 23 May 2019 16:10:18 -0400 Subject: [PATCH 027/153] blk-iolatency: only account submitted bios As is, iolatency recognizes done_bio and cleanup as ending paths. If a request is marked REQ_NOWAIT and fails to get a request, the bio is cleaned up via rq_qos_cleanup() and ended in bio_wouldblock_error(). This results in underflowing the inflight counter. Fix this by only accounting bios that were actually submitted. Signed-off-by: Dennis Zhou Cc: Josef Bacik Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 17896bb3aaf2..e8859350ab6e 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -600,6 +600,10 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) if (!blkg || !bio_flagged(bio, BIO_TRACKED)) return; + /* We didn't actually submit this bio, don't account it. */ + if (bio->bi_status == BLK_STS_AGAIN) + return; + iolat = blkg_to_lat(bio->bi_blkg); if (!iolat) return; From 0c8cf8c2a553f01732f23ba407fae8edb0a18ff5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Jun 2019 12:28:59 +0200 Subject: [PATCH 028/153] block: initialize the write priority in blk_rq_bio_prep The priority field also makes sense for passthrough requests, so initialize it in blk_rq_bio_prep. Reviewed-by: Minwoo Im Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 94c6520bc786..b6f22f219389 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -693,7 +693,6 @@ void blk_init_request_from_bio(struct request *req, struct bio *bio) req->cmd_flags |= REQ_FAILFAST_MASK; req->__sector = bio->bi_iter.bi_sector; - req->ioprio = bio_prio(bio); req->write_hint = bio->bi_write_hint; blk_rq_bio_prep(req->q, req, bio); } @@ -1449,6 +1448,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; + rq->ioprio = bio_prio(bio); if (bio->bi_disk) rq->rq_disk = bio->bi_disk; From f924cddebc900f7cb10d5538d69523e558fa681c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Jun 2019 12:29:00 +0200 Subject: [PATCH 029/153] block: remove blk_init_request_from_bio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit lightnvm should have never used this function, as it is sending passthrough requests, so switch it to blk_rq_append_bio like all the other passthrough request users. Inline blk_init_request_from_bio into the only remaining caller. Reviewed-by: Hannes Reinecke Reviewed-by: Minwoo Im Reviewed-by: Javier González Reviewed-by: Matias Bjørling Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 11 ----------- block/blk-mq.c | 7 ++++++- drivers/nvme/host/lightnvm.c | 2 +- include/linux/blkdev.h | 1 - 4 files changed, 7 insertions(+), 14 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index b6f22f219389..d1c7c69a20dd 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -687,17 +687,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, return false; } -void blk_init_request_from_bio(struct request *req, struct bio *bio) -{ - if (bio->bi_opf & REQ_RAHEAD) - req->cmd_flags |= REQ_FAILFAST_MASK; - - req->__sector = bio->bi_iter.bi_sector; - req->write_hint = bio->bi_write_hint; - blk_rq_bio_prep(req->q, req, bio); -} -EXPORT_SYMBOL_GPL(blk_init_request_from_bio); - static void handle_bad_sector(struct bio *bio, sector_t maxsector) { char b[BDEVNAME_SIZE]; diff --git a/block/blk-mq.c b/block/blk-mq.c index ce0f5f4ede70..61457bffa55f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1766,7 +1766,12 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) { - blk_init_request_from_bio(rq, bio); + if (bio->bi_opf & REQ_RAHEAD) + rq->cmd_flags |= REQ_FAILFAST_MASK; + + rq->__sector = bio->bi_iter.bi_sector; + rq->write_hint = bio->bi_write_hint; + blk_rq_bio_prep(rq->q, rq, bio); blk_account_io_start(rq, true); } diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 4f20a10b39d3..ba009d4c9dfa 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -660,7 +660,7 @@ static struct request *nvme_nvm_alloc_request(struct request_queue *q, rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; if (rqd->bio) - blk_init_request_from_bio(rq, rqd->bio); + blk_rq_append_bio(rq, &rqd->bio); else rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ad49a775c54f..2d4dfe82767a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -828,7 +828,6 @@ extern void blk_unregister_queue(struct gendisk *disk); extern blk_qc_t generic_make_request(struct bio *bio); extern blk_qc_t direct_make_request(struct bio *bio); extern void blk_rq_init(struct request_queue *q, struct request *rq); -extern void blk_init_request_from_bio(struct request *req, struct bio *bio); extern void blk_put_request(struct request *); extern struct request *blk_get_request(struct request_queue *, unsigned int op, blk_mq_req_flags_t flags); From 14ccb66b3f585b2bc21e7256c96090abed5a512c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Jun 2019 12:29:01 +0200 Subject: [PATCH 030/153] block: remove the bi_phys_segments field in struct bio We only need the number of segments in the blk-mq submission path. Remove the field from struct bio, and return it from a variant of blk_queue_split instead of that it can passed as an argument to those functions that need the value. This also means we stop recounting segments except for cloning and partial segments. To keep the number of arguments in this how path down remove pointless struct request_queue arguments from any of the functions that had it and grew a nr_segs argument. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/block/biodoc.txt | 1 - block/bfq-iosched.c | 5 ++- block/bio.c | 15 +------ block/blk-core.c | 32 +++++++-------- block/blk-map.c | 10 ++++- block/blk-merge.c | 75 ++++++++++++---------------------- block/blk-mq-sched.c | 26 +++++++----- block/blk-mq-sched.h | 10 +++-- block/blk-mq.c | 23 ++++++----- block/blk.h | 23 ++++++----- block/kyber-iosched.c | 5 ++- block/mq-deadline.c | 5 ++- drivers/md/raid5.c | 1 - include/linux/bio.h | 1 - include/linux/blk-mq.h | 2 +- include/linux/blk_types.h | 6 --- include/linux/blkdev.h | 1 - include/linux/elevator.h | 2 +- 18 files changed, 106 insertions(+), 137 deletions(-) diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index ac18b488cb5e..31c177663ed5 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -436,7 +436,6 @@ struct bio { struct bvec_iter bi_iter; /* current index into bio_vec array */ unsigned int bi_size; /* total size in bytes */ - unsigned short bi_phys_segments; /* segments after physaddr coalesce*/ unsigned short bi_hw_segments; /* segments after DMA remapping */ unsigned int bi_max; /* max bio_vecs we can hold used as index into pool */ diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index f8d430f88d25..a6bf842cbe16 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2027,7 +2027,8 @@ static void bfq_remove_request(struct request_queue *q, } -static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) +static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio, + unsigned int nr_segs) { struct request_queue *q = hctx->queue; struct bfq_data *bfqd = q->elevator->elevator_data; @@ -2050,7 +2051,7 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) bfqd->bio_bfqq = NULL; bfqd->bio_bic = bic; - ret = blk_mq_sched_try_merge(q, bio, &free); + ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); if (free) blk_mq_free_request(free); diff --git a/block/bio.c b/block/bio.c index 4bcdcd3f63f4..ad9c3aa9bf7d 100644 --- a/block/bio.c +++ b/block/bio.c @@ -558,14 +558,6 @@ void bio_put(struct bio *bio) } EXPORT_SYMBOL(bio_put); -int bio_phys_segments(struct request_queue *q, struct bio *bio) -{ - if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) - blk_recount_segments(q, bio); - - return bio->bi_phys_segments; -} - /** * __bio_clone_fast - clone a bio that shares the original bio's biovec * @bio: destination bio @@ -739,7 +731,7 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio, if (bio_full(bio)) return 0; - if (bio->bi_phys_segments >= queue_max_segments(q)) + if (bio->bi_vcnt >= queue_max_segments(q)) return 0; bvec = &bio->bi_io_vec[bio->bi_vcnt]; @@ -749,8 +741,6 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio, bio->bi_vcnt++; done: bio->bi_iter.bi_size += len; - bio->bi_phys_segments = bio->bi_vcnt; - bio_set_flag(bio, BIO_SEG_VALID); return len; } @@ -1909,10 +1899,7 @@ void bio_trim(struct bio *bio, int offset, int size) if (offset == 0 && size == bio->bi_iter.bi_size) return; - bio_clear_flag(bio, BIO_SEG_VALID); - bio_advance(bio, offset << 9); - bio->bi_iter.bi_size = size; if (bio_integrity(bio)) diff --git a/block/blk-core.c b/block/blk-core.c index d1c7c69a20dd..ef998a724b27 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -550,15 +550,15 @@ void blk_put_request(struct request *req) } EXPORT_SYMBOL(blk_put_request); -bool bio_attempt_back_merge(struct request_queue *q, struct request *req, - struct bio *bio) +bool bio_attempt_back_merge(struct request *req, struct bio *bio, + unsigned int nr_segs) { const int ff = bio->bi_opf & REQ_FAILFAST_MASK; - if (!ll_back_merge_fn(q, req, bio)) + if (!ll_back_merge_fn(req, bio, nr_segs)) return false; - trace_block_bio_backmerge(q, req, bio); + trace_block_bio_backmerge(req->q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) blk_rq_set_mixed_merge(req); @@ -571,15 +571,15 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req, return true; } -bool bio_attempt_front_merge(struct request_queue *q, struct request *req, - struct bio *bio) +bool bio_attempt_front_merge(struct request *req, struct bio *bio, + unsigned int nr_segs) { const int ff = bio->bi_opf & REQ_FAILFAST_MASK; - if (!ll_front_merge_fn(q, req, bio)) + if (!ll_front_merge_fn(req, bio, nr_segs)) return false; - trace_block_bio_frontmerge(q, req, bio); + trace_block_bio_frontmerge(req->q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) blk_rq_set_mixed_merge(req); @@ -621,6 +621,7 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, * blk_attempt_plug_merge - try to merge with %current's plugged list * @q: request_queue new bio is being queued at * @bio: new bio being queued + * @nr_segs: number of segments in @bio * @same_queue_rq: pointer to &struct request that gets filled in when * another request associated with @q is found on the plug list * (optional, may be %NULL) @@ -639,7 +640,7 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, * Caller must ensure !blk_queue_nomerges(q) beforehand. */ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - struct request **same_queue_rq) + unsigned int nr_segs, struct request **same_queue_rq) { struct blk_plug *plug; struct request *rq; @@ -668,10 +669,10 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, switch (blk_try_merge(rq, bio)) { case ELEVATOR_BACK_MERGE: - merged = bio_attempt_back_merge(q, rq, bio); + merged = bio_attempt_back_merge(rq, bio, nr_segs); break; case ELEVATOR_FRONT_MERGE: - merged = bio_attempt_front_merge(q, rq, bio); + merged = bio_attempt_front_merge(rq, bio, nr_segs); break; case ELEVATOR_DISCARD_MERGE: merged = bio_attempt_discard_merge(q, rq, bio); @@ -1427,14 +1428,9 @@ bool blk_update_request(struct request *req, blk_status_t error, } EXPORT_SYMBOL_GPL(blk_update_request); -void blk_rq_bio_prep(struct request_queue *q, struct request *rq, - struct bio *bio) +void blk_rq_bio_prep(struct request *rq, struct bio *bio, unsigned int nr_segs) { - if (bio_has_data(bio)) - rq->nr_phys_segments = bio_phys_segments(q, bio); - else if (bio_op(bio) == REQ_OP_DISCARD) - rq->nr_phys_segments = 1; - + rq->nr_phys_segments = nr_segs; rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; rq->ioprio = bio_prio(bio); diff --git a/block/blk-map.c b/block/blk-map.c index db9373bd31ac..3a62e471d81b 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -18,13 +18,19 @@ int blk_rq_append_bio(struct request *rq, struct bio **bio) { struct bio *orig_bio = *bio; + struct bvec_iter iter; + struct bio_vec bv; + unsigned int nr_segs = 0; blk_queue_bounce(rq->q, bio); + bio_for_each_bvec(bv, *bio, iter) + nr_segs++; + if (!rq->bio) { - blk_rq_bio_prep(rq->q, rq, *bio); + blk_rq_bio_prep(rq, *bio, nr_segs); } else { - if (!ll_back_merge_fn(rq->q, rq, *bio)) { + if (!ll_back_merge_fn(rq, *bio, nr_segs)) { if (orig_bio != *bio) { bio_put(*bio); *bio = orig_bio; diff --git a/block/blk-merge.c b/block/blk-merge.c index 17713d7d98d5..72b4fd89a22d 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -258,32 +258,29 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, return do_split ? new : NULL; } -void blk_queue_split(struct request_queue *q, struct bio **bio) +void __blk_queue_split(struct request_queue *q, struct bio **bio, + unsigned int *nr_segs) { - struct bio *split, *res; - unsigned nsegs; + struct bio *split; switch (bio_op(*bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: - split = blk_bio_discard_split(q, *bio, &q->bio_split, &nsegs); + split = blk_bio_discard_split(q, *bio, &q->bio_split, nr_segs); break; case REQ_OP_WRITE_ZEROES: - split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split, &nsegs); + split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split, + nr_segs); break; case REQ_OP_WRITE_SAME: - split = blk_bio_write_same_split(q, *bio, &q->bio_split, &nsegs); + split = blk_bio_write_same_split(q, *bio, &q->bio_split, + nr_segs); break; default: - split = blk_bio_segment_split(q, *bio, &q->bio_split, &nsegs); + split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs); break; } - /* physical segments can be figured out during splitting */ - res = split ? split : *bio; - res->bi_phys_segments = nsegs; - bio_set_flag(res, BIO_SEG_VALID); - if (split) { /* there isn't chance to merge the splitted bio */ split->bi_opf |= REQ_NOMERGE; @@ -304,6 +301,13 @@ void blk_queue_split(struct request_queue *q, struct bio **bio) *bio = split; } } + +void blk_queue_split(struct request_queue *q, struct bio **bio) +{ + unsigned int nr_segs; + + __blk_queue_split(q, bio, &nr_segs); +} EXPORT_SYMBOL(blk_queue_split); static unsigned int __blk_recalc_rq_segments(struct request_queue *q, @@ -338,17 +342,6 @@ void blk_recalc_rq_segments(struct request *rq) rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio); } -void blk_recount_segments(struct request_queue *q, struct bio *bio) -{ - struct bio *nxt = bio->bi_next; - - bio->bi_next = NULL; - bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio); - bio->bi_next = nxt; - - bio_set_flag(bio, BIO_SEG_VALID); -} - static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) { @@ -519,16 +512,13 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL(blk_rq_map_sg); -static inline int ll_new_hw_segment(struct request_queue *q, - struct request *req, - struct bio *bio) +static inline int ll_new_hw_segment(struct request *req, struct bio *bio, + unsigned int nr_phys_segs) { - int nr_phys_segs = bio_phys_segments(q, bio); - - if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) + if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(req->q)) goto no_merge; - if (blk_integrity_merge_bio(q, req, bio) == false) + if (blk_integrity_merge_bio(req->q, req, bio) == false) goto no_merge; /* @@ -539,12 +529,11 @@ static inline int ll_new_hw_segment(struct request_queue *q, return 1; no_merge: - req_set_nomerge(q, req); + req_set_nomerge(req->q, req); return 0; } -int ll_back_merge_fn(struct request_queue *q, struct request *req, - struct bio *bio) +int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs) { if (req_gap_back_merge(req, bio)) return 0; @@ -553,21 +542,15 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req, blk_rq_pos(req))) { - req_set_nomerge(q, req); + req_set_nomerge(req->q, req); return 0; } - if (!bio_flagged(req->biotail, BIO_SEG_VALID)) - blk_recount_segments(q, req->biotail); - if (!bio_flagged(bio, BIO_SEG_VALID)) - blk_recount_segments(q, bio); - return ll_new_hw_segment(q, req, bio); + return ll_new_hw_segment(req, bio, nr_segs); } -int ll_front_merge_fn(struct request_queue *q, struct request *req, - struct bio *bio) +int ll_front_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs) { - if (req_gap_front_merge(req, bio)) return 0; if (blk_integrity_rq(req) && @@ -575,15 +558,11 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) { - req_set_nomerge(q, req); + req_set_nomerge(req->q, req); return 0; } - if (!bio_flagged(bio, BIO_SEG_VALID)) - blk_recount_segments(q, bio); - if (!bio_flagged(req->bio, BIO_SEG_VALID)) - blk_recount_segments(q, req->bio); - return ll_new_hw_segment(q, req, bio); + return ll_new_hw_segment(req, bio, nr_segs); } static bool req_attempt_discard_merge(struct request_queue *q, struct request *req, diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 2766066a15db..956a7aa9a637 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -224,7 +224,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) } bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, - struct request **merged_request) + unsigned int nr_segs, struct request **merged_request) { struct request *rq; @@ -232,7 +232,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, case ELEVATOR_BACK_MERGE: if (!blk_mq_sched_allow_merge(q, rq, bio)) return false; - if (!bio_attempt_back_merge(q, rq, bio)) + if (!bio_attempt_back_merge(rq, bio, nr_segs)) return false; *merged_request = attempt_back_merge(q, rq); if (!*merged_request) @@ -241,7 +241,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, case ELEVATOR_FRONT_MERGE: if (!blk_mq_sched_allow_merge(q, rq, bio)) return false; - if (!bio_attempt_front_merge(q, rq, bio)) + if (!bio_attempt_front_merge(rq, bio, nr_segs)) return false; *merged_request = attempt_front_merge(q, rq); if (!*merged_request) @@ -260,7 +260,7 @@ EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge); * of them. */ bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, - struct bio *bio) + struct bio *bio, unsigned int nr_segs) { struct request *rq; int checked = 8; @@ -277,11 +277,13 @@ bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, switch (blk_try_merge(rq, bio)) { case ELEVATOR_BACK_MERGE: if (blk_mq_sched_allow_merge(q, rq, bio)) - merged = bio_attempt_back_merge(q, rq, bio); + merged = bio_attempt_back_merge(rq, bio, + nr_segs); break; case ELEVATOR_FRONT_MERGE: if (blk_mq_sched_allow_merge(q, rq, bio)) - merged = bio_attempt_front_merge(q, rq, bio); + merged = bio_attempt_front_merge(rq, bio, + nr_segs); break; case ELEVATOR_DISCARD_MERGE: merged = bio_attempt_discard_merge(q, rq, bio); @@ -304,13 +306,14 @@ EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge); */ static bool blk_mq_attempt_merge(struct request_queue *q, struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx, struct bio *bio) + struct blk_mq_ctx *ctx, struct bio *bio, + unsigned int nr_segs) { enum hctx_type type = hctx->type; lockdep_assert_held(&ctx->lock); - if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio)) { + if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) { ctx->rq_merged++; return true; } @@ -318,7 +321,8 @@ static bool blk_mq_attempt_merge(struct request_queue *q, return false; } -bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) +bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs) { struct elevator_queue *e = q->elevator; struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); @@ -328,7 +332,7 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) if (e && e->type->ops.bio_merge) { blk_mq_put_ctx(ctx); - return e->type->ops.bio_merge(hctx, bio); + return e->type->ops.bio_merge(hctx, bio, nr_segs); } type = hctx->type; @@ -336,7 +340,7 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) !list_empty_careful(&ctx->rq_lists[type])) { /* default per sw-queue merge */ spin_lock(&ctx->lock); - ret = blk_mq_attempt_merge(q, hctx, ctx, bio); + ret = blk_mq_attempt_merge(q, hctx, ctx, bio, nr_segs); spin_unlock(&ctx->lock); } diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 3cf92cbbd8ac..cf22ab00fefb 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -12,8 +12,9 @@ void blk_mq_sched_assign_ioc(struct request *rq); void blk_mq_sched_request_inserted(struct request *rq); bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, - struct request **merged_request); -bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio); + unsigned int nr_segs, struct request **merged_request); +bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs); bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); @@ -31,12 +32,13 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); void blk_mq_sched_free_requests(struct request_queue *q); static inline bool -blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) +blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs) { if (blk_queue_nomerges(q) || !bio_mergeable(bio)) return false; - return __blk_mq_sched_bio_merge(q, bio); + return __blk_mq_sched_bio_merge(q, bio, nr_segs); } static inline bool diff --git a/block/blk-mq.c b/block/blk-mq.c index 61457bffa55f..d89383847d09 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1764,14 +1764,15 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) } } -static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) +static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, + unsigned int nr_segs) { if (bio->bi_opf & REQ_RAHEAD) rq->cmd_flags |= REQ_FAILFAST_MASK; rq->__sector = bio->bi_iter.bi_sector; rq->write_hint = bio->bi_write_hint; - blk_rq_bio_prep(rq->q, rq, bio); + blk_rq_bio_prep(rq, bio, nr_segs); blk_account_io_start(rq, true); } @@ -1941,20 +1942,20 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) struct request *rq; struct blk_plug *plug; struct request *same_queue_rq = NULL; + unsigned int nr_segs; blk_qc_t cookie; blk_queue_bounce(q, &bio); - - blk_queue_split(q, &bio); + __blk_queue_split(q, &bio, &nr_segs); if (!bio_integrity_prep(bio)) return BLK_QC_T_NONE; if (!is_flush_fua && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &same_queue_rq)) + blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq)) return BLK_QC_T_NONE; - if (blk_mq_sched_bio_merge(q, bio)) + if (blk_mq_sched_bio_merge(q, bio, nr_segs)) return BLK_QC_T_NONE; rq_qos_throttle(q, bio); @@ -1977,7 +1978,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) plug = current->plug; if (unlikely(is_flush_fua)) { blk_mq_put_ctx(data.ctx); - blk_mq_bio_to_request(rq, bio); + blk_mq_bio_to_request(rq, bio, nr_segs); /* bypass scheduler for flush rq */ blk_insert_flush(rq); @@ -1991,7 +1992,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) struct request *last = NULL; blk_mq_put_ctx(data.ctx); - blk_mq_bio_to_request(rq, bio); + blk_mq_bio_to_request(rq, bio, nr_segs); if (!request_count) trace_block_plug(q); @@ -2006,7 +2007,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_add_rq_to_plug(plug, rq); } else if (plug && !blk_queue_nomerges(q)) { - blk_mq_bio_to_request(rq, bio); + blk_mq_bio_to_request(rq, bio, nr_segs); /* * We do limited plugging. If the bio can be merged, do that. @@ -2035,11 +2036,11 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && !data.hctx->dispatch_busy)) { blk_mq_put_ctx(data.ctx); - blk_mq_bio_to_request(rq, bio); + blk_mq_bio_to_request(rq, bio, nr_segs); blk_mq_try_issue_directly(data.hctx, rq, &cookie); } else { blk_mq_put_ctx(data.ctx); - blk_mq_bio_to_request(rq, bio); + blk_mq_bio_to_request(rq, bio, nr_segs); blk_mq_sched_insert_request(rq, false, true, true); } diff --git a/block/blk.h b/block/blk.h index 7814aa207153..a1d33cb65842 100644 --- a/block/blk.h +++ b/block/blk.h @@ -51,8 +51,7 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, int node, int cmd_size, gfp_t flags); void blk_free_flush_queue(struct blk_flush_queue *q); -void blk_rq_bio_prep(struct request_queue *q, struct request *rq, - struct bio *bio); +void blk_rq_bio_prep(struct request *rq, struct bio *bio, unsigned int nr_segs); void blk_freeze_queue(struct request_queue *q); static inline void blk_queue_enter_live(struct request_queue *q) @@ -154,14 +153,14 @@ static inline bool bio_integrity_endio(struct bio *bio) unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); -bool bio_attempt_front_merge(struct request_queue *q, struct request *req, - struct bio *bio); -bool bio_attempt_back_merge(struct request_queue *q, struct request *req, - struct bio *bio); +bool bio_attempt_front_merge(struct request *req, struct bio *bio, + unsigned int nr_segs); +bool bio_attempt_back_merge(struct request *req, struct bio *bio, + unsigned int nr_segs); bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, struct bio *bio); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - struct request **same_queue_rq); + unsigned int nr_segs, struct request **same_queue_rq); void blk_account_io_start(struct request *req, bool new_io); void blk_account_io_completion(struct request *req, unsigned int bytes); @@ -202,10 +201,12 @@ static inline int blk_should_fake_timeout(struct request_queue *q) } #endif -int ll_back_merge_fn(struct request_queue *q, struct request *req, - struct bio *bio); -int ll_front_merge_fn(struct request_queue *q, struct request *req, - struct bio *bio); +void __blk_queue_split(struct request_queue *q, struct bio **bio, + unsigned int *nr_segs); +int ll_back_merge_fn(struct request *req, struct bio *bio, + unsigned int nr_segs); +int ll_front_merge_fn(struct request *req, struct bio *bio, + unsigned int nr_segs); struct request *attempt_back_merge(struct request_queue *q, struct request *rq); struct request *attempt_front_merge(struct request_queue *q, struct request *rq); int blk_attempt_req_merge(struct request_queue *q, struct request *rq, diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index c3b05119cebd..3c2602601741 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -562,7 +562,8 @@ static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) } } -static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) +static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio, + unsigned int nr_segs) { struct kyber_hctx_data *khd = hctx->sched_data; struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue); @@ -572,7 +573,7 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) bool merged; spin_lock(&kcq->lock); - merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio); + merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio, nr_segs); spin_unlock(&kcq->lock); blk_mq_put_ctx(ctx); diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 1876f5712bfd..b8a682b5a1bb 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -469,7 +469,8 @@ static int dd_request_merge(struct request_queue *q, struct request **rq, return ELEVATOR_NO_MERGE; } -static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) +static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio, + unsigned int nr_segs) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; @@ -477,7 +478,7 @@ static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) bool ret; spin_lock(&dd->lock); - ret = blk_mq_sched_try_merge(q, bio, &free); + ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); spin_unlock(&dd->lock); if (free) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index da94cbaa1a9e..3de4e13bde98 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5251,7 +5251,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) rcu_read_unlock(); raid_bio->bi_next = (void*)rdev; bio_set_dev(align_bi, rdev->bdev); - bio_clear_flag(align_bi, BIO_SEG_VALID); if (is_badblock(rdev, align_bi->bi_iter.bi_sector, bio_sectors(align_bi), diff --git a/include/linux/bio.h b/include/linux/bio.h index 0f23b5682640..ee11c4324751 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -408,7 +408,6 @@ static inline void bio_wouldblock_error(struct bio *bio) } struct request_queue; -extern int bio_phys_segments(struct request_queue *, struct bio *); extern int submit_bio_wait(struct bio *bio); extern void bio_advance(struct bio *, unsigned); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 15d1aa53d96c..3fa1fa59f9b2 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -306,7 +306,7 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs bool blk_mq_complete_request(struct request *rq); void blk_mq_complete_request_sync(struct request *rq); bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, - struct bio *bio); + struct bio *bio, unsigned int nr_segs); bool blk_mq_queue_stopped(struct request_queue *q); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 95202f80676c..6a53799c3fe2 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -154,11 +154,6 @@ struct bio { blk_status_t bi_status; u8 bi_partno; - /* Number of segments in this BIO after - * physical address coalescing is performed. - */ - unsigned int bi_phys_segments; - struct bvec_iter bi_iter; atomic_t __bi_remaining; @@ -210,7 +205,6 @@ struct bio { */ enum { BIO_NO_PAGE_REF, /* don't put release vec pages */ - BIO_SEG_VALID, /* bi_phys_segments valid */ BIO_CLONED, /* doesn't own data */ BIO_BOUNCED, /* bio is a bounce bio */ BIO_USER_MAPPED, /* contains user pages */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2d4dfe82767a..d5d3bb45dfb6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -841,7 +841,6 @@ extern blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq); extern int blk_rq_append_bio(struct request *rq, struct bio **bio); extern void blk_queue_split(struct request_queue *, struct bio **); -extern void blk_recount_segments(struct request_queue *, struct bio *); extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int); extern int scsi_cmd_blk_ioctl(struct block_device *, fmode_t, unsigned int, void __user *); diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 6e8bc53740f0..169bb2e02516 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -34,7 +34,7 @@ struct elevator_mq_ops { void (*depth_updated)(struct blk_mq_hw_ctx *); bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); - bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *); + bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *, unsigned int); int (*request_merge)(struct request_queue *q, struct request **, struct bio *); void (*request_merged)(struct request_queue *, struct request *, enum elv_merge); void (*requests_merged)(struct request_queue *, struct request *, struct request *); From e9cd19c0c198aa1c893e142b015fde6da862ed52 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Jun 2019 12:29:02 +0200 Subject: [PATCH 031/153] block: simplify blk_recalc_rq_segments Return the segement and let the callers assign them, which makes the code a littler more obvious. Also pass the request instead of q plus bio chain, allowing for the use of rq_for_each_bvec. Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 4 ++-- block/blk-merge.c | 21 ++++++--------------- block/blk.h | 2 +- 3 files changed, 9 insertions(+), 18 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index ef998a724b27..ccba87bb5267 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1152,7 +1152,7 @@ static int blk_cloned_rq_check_limits(struct request_queue *q, * Recalculate it to check the request correctly on this queue's * limitation. */ - blk_recalc_rq_segments(rq); + rq->nr_phys_segments = blk_recalc_rq_segments(rq); if (rq->nr_phys_segments > queue_max_segments(q)) { printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n", __func__, rq->nr_phys_segments, queue_max_segments(q)); @@ -1421,7 +1421,7 @@ bool blk_update_request(struct request *req, blk_status_t error, } /* recalculate the number of segments */ - blk_recalc_rq_segments(req); + req->nr_phys_segments = blk_recalc_rq_segments(req); } return true; diff --git a/block/blk-merge.c b/block/blk-merge.c index 72b4fd89a22d..2ea21ffd5f72 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -310,17 +310,16 @@ void blk_queue_split(struct request_queue *q, struct bio **bio) } EXPORT_SYMBOL(blk_queue_split); -static unsigned int __blk_recalc_rq_segments(struct request_queue *q, - struct bio *bio) +unsigned int blk_recalc_rq_segments(struct request *rq) { unsigned int nr_phys_segs = 0; - struct bvec_iter iter; + struct req_iterator iter; struct bio_vec bv; - if (!bio) + if (!rq->bio) return 0; - switch (bio_op(bio)) { + switch (bio_op(rq->bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: case REQ_OP_WRITE_ZEROES: @@ -329,19 +328,11 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, return 1; } - for_each_bio(bio) { - bio_for_each_bvec(bv, bio, iter) - bvec_split_segs(q, &bv, &nr_phys_segs, NULL, UINT_MAX); - } - + rq_for_each_bvec(bv, rq, iter) + bvec_split_segs(rq->q, &bv, &nr_phys_segs, NULL, UINT_MAX); return nr_phys_segs; } -void blk_recalc_rq_segments(struct request *rq) -{ - rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio); -} - static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) { diff --git a/block/blk.h b/block/blk.h index a1d33cb65842..c62e801b2582 100644 --- a/block/blk.h +++ b/block/blk.h @@ -211,7 +211,7 @@ struct request *attempt_back_merge(struct request_queue *q, struct request *rq); struct request *attempt_front_merge(struct request_queue *q, struct request *rq); int blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next); -void blk_recalc_rq_segments(struct request *rq); +unsigned int blk_recalc_rq_segments(struct request *rq); void blk_rq_set_mixed_merge(struct request *rq); bool blk_rq_merge_ok(struct request *rq, struct bio *bio); enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); From d627065d88469933bc1527f97c539c464482f0bb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Jun 2019 12:29:03 +0200 Subject: [PATCH 032/153] block: untangle the end of blk_bio_segment_split Now that we don't need to assign the front/back segment sizes, we can duplicating the segs assignment for the split vs no-split case and remove a whole chunk of boilerplate code. Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-merge.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 2ea21ffd5f72..ca45eb51c669 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -202,8 +202,6 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; unsigned nsegs = 0, sectors = 0; - bool do_split = true; - struct bio *new = NULL; const unsigned max_sectors = get_max_io_size(q, bio); const unsigned max_segs = queue_max_segments(q); @@ -245,17 +243,11 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, } } - do_split = false; + *segs = nsegs; + return NULL; split: *segs = nsegs; - - if (do_split) { - new = bio_split(bio, sectors, GFP_NOIO, bs); - if (new) - bio = new; - } - - return do_split ? new : NULL; + return bio_split(bio, sectors, GFP_NOIO, bs); } void __blk_queue_split(struct request_queue *q, struct bio **bio, From 1aa0a133fbabeca9e8785fb11de471841009d6d9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Jun 2019 12:29:04 +0200 Subject: [PATCH 033/153] block: mark blk_rq_bio_prep as inline This function just has a few trivial assignments, has two callers with one of them being in the fastpath. Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 11 ----------- block/blk.h | 13 ++++++++++++- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index ccba87bb5267..e1b77113671e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1428,17 +1428,6 @@ bool blk_update_request(struct request *req, blk_status_t error, } EXPORT_SYMBOL_GPL(blk_update_request); -void blk_rq_bio_prep(struct request *rq, struct bio *bio, unsigned int nr_segs) -{ - rq->nr_phys_segments = nr_segs; - rq->__data_len = bio->bi_iter.bi_size; - rq->bio = rq->biotail = bio; - rq->ioprio = bio_prio(bio); - - if (bio->bi_disk) - rq->rq_disk = bio->bi_disk; -} - #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE /** * rq_flush_dcache_pages - Helper function to flush all pages in a request diff --git a/block/blk.h b/block/blk.h index c62e801b2582..de6b2e146d6e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -51,7 +51,6 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, int node, int cmd_size, gfp_t flags); void blk_free_flush_queue(struct blk_flush_queue *q); -void blk_rq_bio_prep(struct request *rq, struct bio *bio, unsigned int nr_segs); void blk_freeze_queue(struct request_queue *q); static inline void blk_queue_enter_live(struct request_queue *q) @@ -100,6 +99,18 @@ static inline bool bvec_gap_to_prev(struct request_queue *q, return __bvec_gap_to_prev(q, bprv, offset); } +static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, + unsigned int nr_segs) +{ + rq->nr_phys_segments = nr_segs; + rq->__data_len = bio->bi_iter.bi_size; + rq->bio = rq->biotail = bio; + rq->ioprio = bio_prio(bio); + + if (bio->bi_disk) + rq->rq_disk = bio->bi_disk; +} + #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_flush_integrity(void); bool __bio_integrity_endio(struct bio *); From 239eeb085753d4356f731a773f363eb5bed4fe81 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Jun 2019 12:26:19 +0200 Subject: [PATCH 034/153] blk-cgroup: factor out a helper to read rwstat counter Trying to break up the crazy statements to something readable. Also switch to an unsigned counter as it can't ever turn negative. Reviewed-by: Chaitanya Kulkarni Acked-by: Tejun Heo Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 5 ++--- include/linux/blk-cgroup.h | 7 +++++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 440797293235..0778e52b1db2 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -745,7 +745,7 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; struct blkg_rwstat sum = { }; - int i; + unsigned int i; lockdep_assert_held(&blkg->q->queue_lock); @@ -762,8 +762,7 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, rwstat = (void *)pos_blkg + off; for (i = 0; i < BLKG_RWSTAT_NR; i++) - atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) + - percpu_counter_sum_positive(&rwstat->cpu_cnt[i]), + atomic64_add(blkg_rwstat_read_counter(rwstat, i), &sum.aux_cnt[i]); } rcu_read_unlock(); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 76c61318fda5..06236f56a840 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -198,6 +198,13 @@ int blkcg_activate_policy(struct request_queue *q, void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol); +static inline u64 blkg_rwstat_read_counter(struct blkg_rwstat *rwstat, + unsigned int idx) +{ + return atomic64_read(&rwstat->aux_cnt[idx]) + + percpu_counter_sum_positive(&rwstat->cpu_cnt[idx]); +} + const char *blkg_dev_name(struct blkcg_gq *blkg); void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, From 5d0b6e48cbef3219c0ed75e0e746c4ed259303c2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Jun 2019 12:26:20 +0200 Subject: [PATCH 035/153] blk-cgroup: pass blkg_rwstat structures by reference Returning a structure generates rather bad code, so switch to passing by reference. Also don't require the structure to be zeroed and add to the 0-initialized counters, but actually set the counters to the calculated value. Acked-by: Tejun Heo Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 15 +++++++++------ block/blk-cgroup.c | 31 ++++++++++++++++--------------- include/linux/blk-cgroup.h | 14 +++++++------- 3 files changed, 32 insertions(+), 28 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index b3796a40a61a..66abc82179f3 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -935,9 +935,9 @@ static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), - &blkcg_policy_bfq, - off); + struct blkg_rwstat sum; + + blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum); return __blkg_prfill_rwstat(sf, pd, &sum); } @@ -975,9 +975,12 @@ static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, - offsetof(struct blkcg_gq, stat_bytes)); - u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + + struct blkg_rwstat tmp; + u64 sum; + + blkg_rwstat_recursive_sum(pd->blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes), &tmp); + sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); return __blkg_prfill_u64(sf, pd, sum >> 9); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0778e52b1db2..db039a869d95 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -597,8 +597,9 @@ EXPORT_SYMBOL_GPL(blkg_prfill_stat); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off); + struct blkg_rwstat rwstat = { }; + blkg_rwstat_read((void *)pd + off, &rwstat); return __blkg_prfill_rwstat(sf, pd, &rwstat); } EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); @@ -606,8 +607,9 @@ EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); static u64 blkg_prfill_rwstat_field(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off); + struct blkg_rwstat rwstat = { }; + blkg_rwstat_read((void *)pd->blkg + off, &rwstat); return __blkg_prfill_rwstat(sf, pd, &rwstat); } @@ -649,8 +651,9 @@ static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg, - NULL, off); + struct blkg_rwstat rwstat; + + blkg_rwstat_recursive_sum(pd->blkg, NULL, off, &rwstat); return __blkg_prfill_rwstat(sf, pd, &rwstat); } @@ -731,6 +734,7 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); * @blkg: blkg of interest * @pol: blkcg_policy which contains the blkg_rwstat * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg + * @sum: blkg_rwstat structure containing the results * * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its * online descendants and their aux counts. The caller must be holding the @@ -739,12 +743,11 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it * is at @off bytes into @blkg's blkg_policy_data of the policy. */ -struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, - struct blkcg_policy *pol, int off) +void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, + int off, struct blkg_rwstat *sum) { struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; - struct blkg_rwstat sum = { }; unsigned int i; lockdep_assert_held(&blkg->q->queue_lock); @@ -762,12 +765,10 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, rwstat = (void *)pos_blkg + off; for (i = 0; i < BLKG_RWSTAT_NR; i++) - atomic64_add(blkg_rwstat_read_counter(rwstat, i), - &sum.aux_cnt[i]); + atomic64_set(&sum->aux_cnt[i], + blkg_rwstat_read_counter(rwstat, i)); } rcu_read_unlock(); - - return sum; } EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); @@ -953,14 +954,14 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) spin_lock_irq(&blkg->q->queue_lock); - rwstat = blkg_rwstat_recursive_sum(blkg, NULL, - offsetof(struct blkcg_gq, stat_bytes)); + blkg_rwstat_recursive_sum(blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes), &rwstat); rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]); - rwstat = blkg_rwstat_recursive_sum(blkg, NULL, - offsetof(struct blkcg_gq, stat_ios)); + blkg_rwstat_recursive_sum(blkg, NULL, + offsetof(struct blkcg_gq, stat_ios), &rwstat); rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 06236f56a840..3ee858111274 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -224,8 +224,8 @@ int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v); u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, int off); -struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, - struct blkcg_policy *pol, int off); +void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, + int off, struct blkg_rwstat *sum); struct blkg_conf_ctx { struct gendisk *disk; @@ -700,15 +700,14 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, * * Read the current snapshot of @rwstat and return it in the aux counts. */ -static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) +static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat, + struct blkg_rwstat *result) { - struct blkg_rwstat result; int i; for (i = 0; i < BLKG_RWSTAT_NR; i++) - atomic64_set(&result.aux_cnt[i], + atomic64_set(&result->aux_cnt[i], percpu_counter_sum_positive(&rwstat->cpu_cnt[i])); - return result; } /** @@ -721,8 +720,9 @@ static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) */ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) { - struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); + struct blkg_rwstat tmp = { }; + blkg_rwstat_read(rwstat, &tmp); return atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); } From 7af6fd9112ba310a889c60d0606b4b74049cfe14 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Jun 2019 12:26:21 +0200 Subject: [PATCH 036/153] blk-cgroup: introduce a new struct blkg_rwstat_sample When sampling the blkcg counts we don't need atomics or per-cpu variables. Introduce a new structure just containing plain u64 counters. Acked-by: Tejun Heo Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 10 ++++------ block/blk-cgroup.c | 39 +++++++++++++++++++------------------- include/linux/blk-cgroup.h | 22 +++++++++++---------- 3 files changed, 35 insertions(+), 36 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 66abc82179f3..624374a99c6e 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -935,7 +935,7 @@ static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat sum; + struct blkg_rwstat_sample sum; blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum); return __blkg_prfill_rwstat(sf, pd, &sum); @@ -975,15 +975,13 @@ static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat tmp; - u64 sum; + struct blkg_rwstat_sample tmp; blkg_rwstat_recursive_sum(pd->blkg, NULL, offsetof(struct blkcg_gq, stat_bytes), &tmp); - sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + - atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); - return __blkg_prfill_u64(sf, pd, sum >> 9); + return __blkg_prfill_u64(sf, pd, + (tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]) >> 9); } static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index db039a869d95..664c09866839 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -544,7 +544,7 @@ EXPORT_SYMBOL_GPL(__blkg_prfill_u64); * Print @rwstat to @sf for the device assocaited with @pd. */ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - const struct blkg_rwstat *rwstat) + const struct blkg_rwstat_sample *rwstat) { static const char *rwstr[] = { [BLKG_RWSTAT_READ] = "Read", @@ -562,12 +562,12 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, for (i = 0; i < BLKG_RWSTAT_NR; i++) seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], - (unsigned long long)atomic64_read(&rwstat->aux_cnt[i])); + rwstat->cnt[i]); - v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) + - atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) + - atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]); - seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); + v = rwstat->cnt[BLKG_RWSTAT_READ] + + rwstat->cnt[BLKG_RWSTAT_WRITE] + + rwstat->cnt[BLKG_RWSTAT_DISCARD]; + seq_printf(sf, "%s Total %llu\n", dname, v); return v; } EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat); @@ -597,7 +597,7 @@ EXPORT_SYMBOL_GPL(blkg_prfill_stat); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat rwstat = { }; + struct blkg_rwstat_sample rwstat = { }; blkg_rwstat_read((void *)pd + off, &rwstat); return __blkg_prfill_rwstat(sf, pd, &rwstat); @@ -607,7 +607,7 @@ EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); static u64 blkg_prfill_rwstat_field(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat rwstat = { }; + struct blkg_rwstat_sample rwstat = { }; blkg_rwstat_read((void *)pd->blkg + off, &rwstat); return __blkg_prfill_rwstat(sf, pd, &rwstat); @@ -651,7 +651,7 @@ static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat rwstat; + struct blkg_rwstat_sample rwstat; blkg_rwstat_recursive_sum(pd->blkg, NULL, off, &rwstat); return __blkg_prfill_rwstat(sf, pd, &rwstat); @@ -734,7 +734,7 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); * @blkg: blkg of interest * @pol: blkcg_policy which contains the blkg_rwstat * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg - * @sum: blkg_rwstat structure containing the results + * @sum: blkg_rwstat_sample structure containing the results * * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its * online descendants and their aux counts. The caller must be holding the @@ -744,7 +744,7 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); * is at @off bytes into @blkg's blkg_policy_data of the policy. */ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, - int off, struct blkg_rwstat *sum) + int off, struct blkg_rwstat_sample *sum) { struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; @@ -765,8 +765,7 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, rwstat = (void *)pos_blkg + off; for (i = 0; i < BLKG_RWSTAT_NR; i++) - atomic64_set(&sum->aux_cnt[i], - blkg_rwstat_read_counter(rwstat, i)); + sum->cnt[i] = blkg_rwstat_read_counter(rwstat, i); } rcu_read_unlock(); } @@ -934,7 +933,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { const char *dname; char *buf; - struct blkg_rwstat rwstat; + struct blkg_rwstat_sample rwstat; u64 rbytes, wbytes, rios, wios, dbytes, dios; size_t size = seq_get_buf(sf, &buf), off = 0; int i; @@ -956,15 +955,15 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) blkg_rwstat_recursive_sum(blkg, NULL, offsetof(struct blkcg_gq, stat_bytes), &rwstat); - rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); - wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); - dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]); + rbytes = rwstat.cnt[BLKG_RWSTAT_READ]; + wbytes = rwstat.cnt[BLKG_RWSTAT_WRITE]; + dbytes = rwstat.cnt[BLKG_RWSTAT_DISCARD]; blkg_rwstat_recursive_sum(blkg, NULL, offsetof(struct blkcg_gq, stat_ios), &rwstat); - rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); - wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); - dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]); + rios = rwstat.cnt[BLKG_RWSTAT_READ]; + wios = rwstat.cnt[BLKG_RWSTAT_WRITE]; + dios = rwstat.cnt[BLKG_RWSTAT_DISCARD]; spin_unlock_irq(&blkg->q->queue_lock); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 3ee858111274..e4a81767e111 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -63,8 +63,7 @@ struct blkcg { /* * blkg_[rw]stat->aux_cnt is excluded for local stats but included for - * recursive. Used to carry stats of dead children, and, for blkg_rwstat, - * to carry result values from read and sum operations. + * recursive. Used to carry stats of dead children. */ struct blkg_stat { struct percpu_counter cpu_cnt; @@ -76,6 +75,10 @@ struct blkg_rwstat { atomic64_t aux_cnt[BLKG_RWSTAT_NR]; }; +struct blkg_rwstat_sample { + u64 cnt[BLKG_RWSTAT_NR]; +}; + /* * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a * request_queue (q). This is used by blkcg policies which need to track @@ -213,7 +216,7 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, bool show_total); u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - const struct blkg_rwstat *rwstat); + const struct blkg_rwstat_sample *rwstat); u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off); @@ -225,7 +228,7 @@ int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v); u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, int off); void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, - int off, struct blkg_rwstat *sum); + int off, struct blkg_rwstat_sample *sum); struct blkg_conf_ctx { struct gendisk *disk; @@ -701,13 +704,13 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, * Read the current snapshot of @rwstat and return it in the aux counts. */ static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat, - struct blkg_rwstat *result) + struct blkg_rwstat_sample *result) { int i; for (i = 0; i < BLKG_RWSTAT_NR; i++) - atomic64_set(&result->aux_cnt[i], - percpu_counter_sum_positive(&rwstat->cpu_cnt[i])); + result->cnt[i] = + percpu_counter_sum_positive(&rwstat->cpu_cnt[i]); } /** @@ -720,11 +723,10 @@ static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat, */ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) { - struct blkg_rwstat tmp = { }; + struct blkg_rwstat_sample tmp = { }; blkg_rwstat_read(rwstat, &tmp); - return atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + - atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); + return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; } /** From c0ce79dca5b0e8373a546ebea2af7b3df94c584e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Jun 2019 12:26:22 +0200 Subject: [PATCH 037/153] blk-cgroup: move struct blkg_stat to bfq This structure and assorted infrastructure is only used by the bfq I/O scheduler. Move it there instead of bloating the common code. Acked-by: Tejun Heo Acked-by: Paolo Valente Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 192 ++++++++++++++++++++++++++++++------- block/bfq-iosched.h | 19 ++-- block/blk-cgroup.c | 56 ----------- include/linux/blk-cgroup.h | 71 -------------- 4 files changed, 167 insertions(+), 171 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 624374a99c6e..a691dca7e966 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -17,6 +17,124 @@ #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) +static int bfq_stat_init(struct bfq_stat *stat, gfp_t gfp) +{ + int ret; + + ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp); + if (ret) + return ret; + + atomic64_set(&stat->aux_cnt, 0); + return 0; +} + +static void bfq_stat_exit(struct bfq_stat *stat) +{ + percpu_counter_destroy(&stat->cpu_cnt); +} + +/** + * bfq_stat_add - add a value to a bfq_stat + * @stat: target bfq_stat + * @val: value to add + * + * Add @val to @stat. The caller must ensure that IRQ on the same CPU + * don't re-enter this function for the same counter. + */ +static inline void bfq_stat_add(struct bfq_stat *stat, uint64_t val) +{ + percpu_counter_add_batch(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH); +} + +/** + * bfq_stat_read - read the current value of a bfq_stat + * @stat: bfq_stat to read + */ +static inline uint64_t bfq_stat_read(struct bfq_stat *stat) +{ + return percpu_counter_sum_positive(&stat->cpu_cnt); +} + +/** + * bfq_stat_reset - reset a bfq_stat + * @stat: bfq_stat to reset + */ +static inline void bfq_stat_reset(struct bfq_stat *stat) +{ + percpu_counter_set(&stat->cpu_cnt, 0); + atomic64_set(&stat->aux_cnt, 0); +} + +/** + * bfq_stat_add_aux - add a bfq_stat into another's aux count + * @to: the destination bfq_stat + * @from: the source + * + * Add @from's count including the aux one to @to's aux count. + */ +static inline void bfq_stat_add_aux(struct bfq_stat *to, + struct bfq_stat *from) +{ + atomic64_add(bfq_stat_read(from) + atomic64_read(&from->aux_cnt), + &to->aux_cnt); +} + +/** + * bfq_stat_recursive_sum - collect hierarchical bfq_stat + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the bfq_stat + * @off: offset to the bfq_stat in blkg_policy_data or @blkg + * + * Collect the bfq_stat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. + * + * If @pol is NULL, bfq_stat is at @off bytes into @blkg; otherwise, it is + * at @off bytes into @blkg's blkg_policy_data of the policy. + */ +static u64 bfq_stat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off) +{ + struct blkcg_gq *pos_blkg; + struct cgroup_subsys_state *pos_css; + u64 sum = 0; + + lockdep_assert_held(&blkg->q->queue_lock); + + rcu_read_lock(); + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct bfq_stat *stat; + + if (!pos_blkg->online) + continue; + + if (pol) + stat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + stat = (void *)blkg + off; + + sum += bfq_stat_read(stat) + atomic64_read(&stat->aux_cnt); + } + rcu_read_unlock(); + + return sum; +} + +/** + * blkg_prfill_stat - prfill callback for bfq_stat + * @sf: seq_file to print to + * @pd: policy private data of interest + * @off: offset to the bfq_stat in @pd + * + * prfill callback for printing a bfq_stat. + */ +static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + return __blkg_prfill_u64(sf, pd, bfq_stat_read((void *)pd + off)); +} + /* bfqg stats flags */ enum bfqg_stats_flags { BFQG_stats_waiting = 0, @@ -53,7 +171,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) now = ktime_get_ns(); if (now > stats->start_group_wait_time) - blkg_stat_add(&stats->group_wait_time, + bfq_stat_add(&stats->group_wait_time, now - stats->start_group_wait_time); bfqg_stats_clear_waiting(stats); } @@ -82,14 +200,14 @@ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) now = ktime_get_ns(); if (now > stats->start_empty_time) - blkg_stat_add(&stats->empty_time, + bfq_stat_add(&stats->empty_time, now - stats->start_empty_time); bfqg_stats_clear_empty(stats); } void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { - blkg_stat_add(&bfqg->stats.dequeue, 1); + bfq_stat_add(&bfqg->stats.dequeue, 1); } void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) @@ -119,7 +237,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg) u64 now = ktime_get_ns(); if (now > stats->start_idle_time) - blkg_stat_add(&stats->idle_time, + bfq_stat_add(&stats->idle_time, now - stats->start_idle_time); bfqg_stats_clear_idling(stats); } @@ -137,9 +255,9 @@ void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats; - blkg_stat_add(&stats->avg_queue_size_sum, + bfq_stat_add(&stats->avg_queue_size_sum, blkg_rwstat_total(&stats->queued)); - blkg_stat_add(&stats->avg_queue_size_samples, 1); + bfq_stat_add(&stats->avg_queue_size_samples, 1); bfqg_stats_update_group_wait_time(stats); } @@ -279,13 +397,13 @@ static void bfqg_stats_reset(struct bfqg_stats *stats) blkg_rwstat_reset(&stats->merged); blkg_rwstat_reset(&stats->service_time); blkg_rwstat_reset(&stats->wait_time); - blkg_stat_reset(&stats->time); - blkg_stat_reset(&stats->avg_queue_size_sum); - blkg_stat_reset(&stats->avg_queue_size_samples); - blkg_stat_reset(&stats->dequeue); - blkg_stat_reset(&stats->group_wait_time); - blkg_stat_reset(&stats->idle_time); - blkg_stat_reset(&stats->empty_time); + bfq_stat_reset(&stats->time); + bfq_stat_reset(&stats->avg_queue_size_sum); + bfq_stat_reset(&stats->avg_queue_size_samples); + bfq_stat_reset(&stats->dequeue); + bfq_stat_reset(&stats->group_wait_time); + bfq_stat_reset(&stats->idle_time); + bfq_stat_reset(&stats->empty_time); #endif } @@ -300,14 +418,14 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) blkg_rwstat_add_aux(&to->merged, &from->merged); blkg_rwstat_add_aux(&to->service_time, &from->service_time); blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); - blkg_stat_add_aux(&from->time, &from->time); - blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); - blkg_stat_add_aux(&to->avg_queue_size_samples, + bfq_stat_add_aux(&from->time, &from->time); + bfq_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); + bfq_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); - blkg_stat_add_aux(&to->dequeue, &from->dequeue); - blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); - blkg_stat_add_aux(&to->idle_time, &from->idle_time); - blkg_stat_add_aux(&to->empty_time, &from->empty_time); + bfq_stat_add_aux(&to->dequeue, &from->dequeue); + bfq_stat_add_aux(&to->group_wait_time, &from->group_wait_time); + bfq_stat_add_aux(&to->idle_time, &from->idle_time); + bfq_stat_add_aux(&to->empty_time, &from->empty_time); #endif } @@ -360,13 +478,13 @@ static void bfqg_stats_exit(struct bfqg_stats *stats) blkg_rwstat_exit(&stats->service_time); blkg_rwstat_exit(&stats->wait_time); blkg_rwstat_exit(&stats->queued); - blkg_stat_exit(&stats->time); - blkg_stat_exit(&stats->avg_queue_size_sum); - blkg_stat_exit(&stats->avg_queue_size_samples); - blkg_stat_exit(&stats->dequeue); - blkg_stat_exit(&stats->group_wait_time); - blkg_stat_exit(&stats->idle_time); - blkg_stat_exit(&stats->empty_time); + bfq_stat_exit(&stats->time); + bfq_stat_exit(&stats->avg_queue_size_sum); + bfq_stat_exit(&stats->avg_queue_size_samples); + bfq_stat_exit(&stats->dequeue); + bfq_stat_exit(&stats->group_wait_time); + bfq_stat_exit(&stats->idle_time); + bfq_stat_exit(&stats->empty_time); #endif } @@ -377,13 +495,13 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) blkg_rwstat_init(&stats->service_time, gfp) || blkg_rwstat_init(&stats->wait_time, gfp) || blkg_rwstat_init(&stats->queued, gfp) || - blkg_stat_init(&stats->time, gfp) || - blkg_stat_init(&stats->avg_queue_size_sum, gfp) || - blkg_stat_init(&stats->avg_queue_size_samples, gfp) || - blkg_stat_init(&stats->dequeue, gfp) || - blkg_stat_init(&stats->group_wait_time, gfp) || - blkg_stat_init(&stats->idle_time, gfp) || - blkg_stat_init(&stats->empty_time, gfp)) { + bfq_stat_init(&stats->time, gfp) || + bfq_stat_init(&stats->avg_queue_size_sum, gfp) || + bfq_stat_init(&stats->avg_queue_size_samples, gfp) || + bfq_stat_init(&stats->dequeue, gfp) || + bfq_stat_init(&stats->group_wait_time, gfp) || + bfq_stat_init(&stats->idle_time, gfp) || + bfq_stat_init(&stats->empty_time, gfp)) { bfqg_stats_exit(stats); return -ENOMEM; } @@ -927,7 +1045,7 @@ static int bfqg_print_rwstat(struct seq_file *sf, void *v) static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), + u64 sum = bfq_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); return __blkg_prfill_u64(sf, pd, sum); } @@ -996,11 +1114,11 @@ static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct bfq_group *bfqg = pd_to_bfqg(pd); - u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); + u64 samples = bfq_stat_read(&bfqg->stats.avg_queue_size_samples); u64 v = 0; if (samples) { - v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum); + v = bfq_stat_read(&bfqg->stats.avg_queue_size_sum); v = div64_u64(v, samples); } __blkg_prfill_u64(sf, pd, v); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index c2faa77824f8..aef4fa0046b8 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -777,6 +777,11 @@ enum bfqq_expiration { BFQQE_PREEMPTED /* preemption in progress */ }; +struct bfq_stat { + struct percpu_counter cpu_cnt; + atomic64_t aux_cnt; +}; + struct bfqg_stats { #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) /* number of ios merged */ @@ -788,19 +793,19 @@ struct bfqg_stats { /* number of IOs queued up */ struct blkg_rwstat queued; /* total disk time and nr sectors dispatched by this group */ - struct blkg_stat time; + struct bfq_stat time; /* sum of number of ios queued across all samples */ - struct blkg_stat avg_queue_size_sum; + struct bfq_stat avg_queue_size_sum; /* count of samples taken for average */ - struct blkg_stat avg_queue_size_samples; + struct bfq_stat avg_queue_size_samples; /* how many times this group has been removed from service tree */ - struct blkg_stat dequeue; + struct bfq_stat dequeue; /* total time spent waiting for it to be assigned a timeslice. */ - struct blkg_stat group_wait_time; + struct bfq_stat group_wait_time; /* time spent idling for this blkcg_gq */ - struct blkg_stat idle_time; + struct bfq_stat idle_time; /* total time with empty current active q with other requests queued */ - struct blkg_stat empty_time; + struct bfq_stat empty_time; /* fields after this shouldn't be cleared on stat reset */ u64 start_group_wait_time; u64 start_idle_time; diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 664c09866839..53b7bd4c7000 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -572,20 +572,6 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, } EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat); -/** - * blkg_prfill_stat - prfill callback for blkg_stat - * @sf: seq_file to print to - * @pd: policy private data of interest - * @off: offset to the blkg_stat in @pd - * - * prfill callback for printing a blkg_stat. - */ -u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off) -{ - return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off)); -} -EXPORT_SYMBOL_GPL(blkg_prfill_stat); - /** * blkg_prfill_rwstat - prfill callback for blkg_rwstat * @sf: seq_file to print to @@ -687,48 +673,6 @@ int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v) } EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive); -/** - * blkg_stat_recursive_sum - collect hierarchical blkg_stat - * @blkg: blkg of interest - * @pol: blkcg_policy which contains the blkg_stat - * @off: offset to the blkg_stat in blkg_policy_data or @blkg - * - * Collect the blkg_stat specified by @blkg, @pol and @off and all its - * online descendants and their aux counts. The caller must be holding the - * queue lock for online tests. - * - * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is - * at @off bytes into @blkg's blkg_policy_data of the policy. - */ -u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, - struct blkcg_policy *pol, int off) -{ - struct blkcg_gq *pos_blkg; - struct cgroup_subsys_state *pos_css; - u64 sum = 0; - - lockdep_assert_held(&blkg->q->queue_lock); - - rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { - struct blkg_stat *stat; - - if (!pos_blkg->online) - continue; - - if (pol) - stat = (void *)blkg_to_pd(pos_blkg, pol) + off; - else - stat = (void *)blkg + off; - - sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt); - } - rcu_read_unlock(); - - return sum; -} -EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); - /** * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat * @blkg: blkg of interest diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index e4a81767e111..33f23a858438 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -65,11 +65,6 @@ struct blkcg { * blkg_[rw]stat->aux_cnt is excluded for local stats but included for * recursive. Used to carry stats of dead children. */ -struct blkg_stat { - struct percpu_counter cpu_cnt; - atomic64_t aux_cnt; -}; - struct blkg_rwstat { struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR]; atomic64_t aux_cnt[BLKG_RWSTAT_NR]; @@ -217,7 +212,6 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, const struct blkg_rwstat_sample *rwstat); -u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off); int blkg_print_stat_bytes(struct seq_file *sf, void *v); @@ -225,8 +219,6 @@ int blkg_print_stat_ios(struct seq_file *sf, void *v); int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v); int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v); -u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, - struct blkcg_policy *pol, int off); void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, int off, struct blkg_rwstat_sample *sum); @@ -579,69 +571,6 @@ static inline void blkg_put(struct blkcg_gq *blkg) if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q, false))) -static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp) -{ - int ret; - - ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp); - if (ret) - return ret; - - atomic64_set(&stat->aux_cnt, 0); - return 0; -} - -static inline void blkg_stat_exit(struct blkg_stat *stat) -{ - percpu_counter_destroy(&stat->cpu_cnt); -} - -/** - * blkg_stat_add - add a value to a blkg_stat - * @stat: target blkg_stat - * @val: value to add - * - * Add @val to @stat. The caller must ensure that IRQ on the same CPU - * don't re-enter this function for the same counter. - */ -static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) -{ - percpu_counter_add_batch(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH); -} - -/** - * blkg_stat_read - read the current value of a blkg_stat - * @stat: blkg_stat to read - */ -static inline uint64_t blkg_stat_read(struct blkg_stat *stat) -{ - return percpu_counter_sum_positive(&stat->cpu_cnt); -} - -/** - * blkg_stat_reset - reset a blkg_stat - * @stat: blkg_stat to reset - */ -static inline void blkg_stat_reset(struct blkg_stat *stat) -{ - percpu_counter_set(&stat->cpu_cnt, 0); - atomic64_set(&stat->aux_cnt, 0); -} - -/** - * blkg_stat_add_aux - add a blkg_stat into another's aux count - * @to: the destination blkg_stat - * @from: the source - * - * Add @from's count including the aux one to @to's aux count. - */ -static inline void blkg_stat_add_aux(struct blkg_stat *to, - struct blkg_stat *from) -{ - atomic64_add(blkg_stat_read(from) + atomic64_read(&from->aux_cnt), - &to->aux_cnt); -} - static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp) { int i, ret; From d6258980daf207f986676e59e6ea295204cdc84e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Jun 2019 12:26:23 +0200 Subject: [PATCH 038/153] bfq-iosched: move bfq_stat_recursive_sum into the only caller This function was moved from core block code and is way to generic. Fold it into the only caller and simplify it based on the actually passed arguments. Acked-by: Tejun Heo Acked-by: Paolo Valente Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 62 ++++++++++++++-------------------------------- 1 file changed, 19 insertions(+), 43 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index a691dca7e966..d84302445e30 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -80,47 +80,6 @@ static inline void bfq_stat_add_aux(struct bfq_stat *to, &to->aux_cnt); } -/** - * bfq_stat_recursive_sum - collect hierarchical bfq_stat - * @blkg: blkg of interest - * @pol: blkcg_policy which contains the bfq_stat - * @off: offset to the bfq_stat in blkg_policy_data or @blkg - * - * Collect the bfq_stat specified by @blkg, @pol and @off and all its - * online descendants and their aux counts. The caller must be holding the - * queue lock for online tests. - * - * If @pol is NULL, bfq_stat is at @off bytes into @blkg; otherwise, it is - * at @off bytes into @blkg's blkg_policy_data of the policy. - */ -static u64 bfq_stat_recursive_sum(struct blkcg_gq *blkg, - struct blkcg_policy *pol, int off) -{ - struct blkcg_gq *pos_blkg; - struct cgroup_subsys_state *pos_css; - u64 sum = 0; - - lockdep_assert_held(&blkg->q->queue_lock); - - rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { - struct bfq_stat *stat; - - if (!pos_blkg->online) - continue; - - if (pol) - stat = (void *)blkg_to_pd(pos_blkg, pol) + off; - else - stat = (void *)blkg + off; - - sum += bfq_stat_read(stat) + atomic64_read(&stat->aux_cnt); - } - rcu_read_unlock(); - - return sum; -} - /** * blkg_prfill_stat - prfill callback for bfq_stat * @sf: seq_file to print to @@ -1045,8 +1004,25 @@ static int bfqg_print_rwstat(struct seq_file *sf, void *v) static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - u64 sum = bfq_stat_recursive_sum(pd_to_blkg(pd), - &blkcg_policy_bfq, off); + struct blkcg_gq *blkg = pd_to_blkg(pd); + struct blkcg_gq *pos_blkg; + struct cgroup_subsys_state *pos_css; + u64 sum = 0; + + lockdep_assert_held(&blkg->q->queue_lock); + + rcu_read_lock(); + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct bfq_stat *stat; + + if (!pos_blkg->online) + continue; + + stat = (void *)blkg_to_pd(pos_blkg, &blkcg_policy_bfq) + off; + sum += bfq_stat_read(stat) + atomic64_read(&stat->aux_cnt); + } + rcu_read_unlock(); + return __blkg_prfill_u64(sf, pd, sum); } From 8060c47ba853f147c46bf1e6f6d93d1726fcb57a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Jun 2019 12:26:24 +0200 Subject: [PATCH 039/153] block: rename CONFIG_DEBUG_BLK_CGROUP to CONFIG_BFQ_CGROUP_DEBUG This option is entirely bfq specific, give it an appropinquate name. Also make it depend on CONFIG_BFQ_GROUP_IOSCHED in Kconfig, as all the functionality already does so anyway. Acked-by: Tejun Heo Acked-by: Paolo Valente Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/block/bfq-iosched.txt | 12 ++++----- Documentation/cgroup-v1/blkio-controller.txt | 12 ++++----- block/Kconfig.iosched | 7 +++++ block/bfq-cgroup.c | 27 ++++++++++---------- block/bfq-iosched.c | 8 +++--- block/bfq-iosched.h | 4 +-- init/Kconfig | 8 ------ 7 files changed, 38 insertions(+), 40 deletions(-) diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt index 1a0f2ac02eb6..f02163fabf80 100644 --- a/Documentation/block/bfq-iosched.txt +++ b/Documentation/block/bfq-iosched.txt @@ -38,13 +38,13 @@ stack). To give an idea of the limits with BFQ, on slow or average CPUs, here are, first, the limits of BFQ for three different CPUs, on, respectively, an average laptop, an old desktop, and a cheap embedded system, in case full hierarchical support is enabled (i.e., -CONFIG_BFQ_GROUP_IOSCHED is set), but CONFIG_DEBUG_BLK_CGROUP is not +CONFIG_BFQ_GROUP_IOSCHED is set), but CONFIG_BFQ_CGROUP_DEBUG is not set (Section 4-2): - Intel i7-4850HQ: 400 KIOPS - AMD A8-3850: 250 KIOPS - ARM CortexTM-A53 Octa-core: 80 KIOPS -If CONFIG_DEBUG_BLK_CGROUP is set (and of course full hierarchical +If CONFIG_BFQ_CGROUP_DEBUG is set (and of course full hierarchical support is enabled), then the sustainable throughput with BFQ decreases, because all blkio.bfq* statistics are created and updated (Section 4-2). For BFQ, this leads to the following maximum @@ -537,19 +537,19 @@ or io.bfq.weight. As for cgroups-v1 (blkio controller), the exact set of stat files created, and kept up-to-date by bfq, depends on whether -CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq creates all +CONFIG_BFQ_CGROUP_DEBUG is set. If it is set, then bfq creates all the stat files documented in Documentation/cgroup-v1/blkio-controller.txt. If, instead, -CONFIG_DEBUG_BLK_CGROUP is not set, then bfq creates only the files +CONFIG_BFQ_CGROUP_DEBUG is not set, then bfq creates only the files blkio.bfq.io_service_bytes blkio.bfq.io_service_bytes_recursive blkio.bfq.io_serviced blkio.bfq.io_serviced_recursive -The value of CONFIG_DEBUG_BLK_CGROUP greatly influences the maximum +The value of CONFIG_BFQ_CGROUP_DEBUG greatly influences the maximum throughput sustainable with bfq, because updating the blkio.bfq.* stats is rather costly, especially for some of the stats enabled by -CONFIG_DEBUG_BLK_CGROUP. +CONFIG_BFQ_CGROUP_DEBUG. Parameters to set ----------------- diff --git a/Documentation/cgroup-v1/blkio-controller.txt b/Documentation/cgroup-v1/blkio-controller.txt index d1a1b7bdd03a..78ec4500f220 100644 --- a/Documentation/cgroup-v1/blkio-controller.txt +++ b/Documentation/cgroup-v1/blkio-controller.txt @@ -77,7 +77,7 @@ Various user visible config options CONFIG_BLK_CGROUP - Block IO controller. -CONFIG_DEBUG_BLK_CGROUP +CONFIG_BFQ_CGROUP_DEBUG - Debug help. Right now some additional stats file show up in cgroup if this option is enabled. @@ -193,13 +193,13 @@ Proportional weight policy files write, sync or async. - blkio.avg_queue_size - - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. + - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. The average queue size for this cgroup over the entire time of this cgroup's existence. Queue size samples are taken each time one of the queues of this cgroup gets a timeslice. - blkio.group_wait_time - - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. + - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This is the amount of time the cgroup had to wait since it became busy (i.e., went from 0 to 1 request queued) to get a timeslice for one of its queues. This is different from the io_wait_time which is the @@ -210,7 +210,7 @@ Proportional weight policy files got a timeslice and will not include the current delta. - blkio.empty_time - - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. + - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This is the amount of time a cgroup spends without any pending requests when not being served, i.e., it does not include any time spent idling for one of the queues of the cgroup. This is in @@ -219,7 +219,7 @@ Proportional weight policy files time it had a pending request and will not include the current delta. - blkio.idle_time - - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. + - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This is the amount of time spent by the IO scheduler idling for a given cgroup in anticipation of a better request than the existing ones from other queues/cgroups. This is in nanoseconds. If this is read @@ -228,7 +228,7 @@ Proportional weight policy files the current delta. - blkio.dequeue - - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This + - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This gives the statistics about how many a times a group was dequeued from service tree of the device. First two fields specify the major and minor number of the device and third field specifies the number diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 4626b88b2d5a..7a6b2f29a582 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -36,6 +36,13 @@ config BFQ_GROUP_IOSCHED Enable hierarchical scheduling in BFQ, using the blkio (cgroups-v1) or io (cgroups-v2) controller. +config BFQ_CGROUP_DEBUG + bool "BFQ IO controller debugging" + depends on BFQ_GROUP_IOSCHED + ---help--- + Enable some debugging help. Currently it exports additional stat + files in a cgroup which can be useful for debugging. + endmenu endif diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index d84302445e30..0f6cd688924f 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -15,8 +15,7 @@ #include "bfq-iosched.h" -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) - +#ifdef CONFIG_BFQ_CGROUP_DEBUG static int bfq_stat_init(struct bfq_stat *stat, gfp_t gfp) { int ret; @@ -253,7 +252,7 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, io_start_time_ns - start_time_ns); } -#else /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ +#else /* CONFIG_BFQ_CGROUP_DEBUG */ void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, unsigned int op) { } @@ -267,7 +266,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } -#endif /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ +#endif /* CONFIG_BFQ_CGROUP_DEBUG */ #ifdef CONFIG_BFQ_GROUP_IOSCHED @@ -351,7 +350,7 @@ void bfqg_and_blkg_put(struct bfq_group *bfqg) /* @stats = 0 */ static void bfqg_stats_reset(struct bfqg_stats *stats) { -#ifdef CONFIG_DEBUG_BLK_CGROUP +#ifdef CONFIG_BFQ_CGROUP_DEBUG /* queued stats shouldn't be cleared */ blkg_rwstat_reset(&stats->merged); blkg_rwstat_reset(&stats->service_time); @@ -372,7 +371,7 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) if (!to || !from) return; -#ifdef CONFIG_DEBUG_BLK_CGROUP +#ifdef CONFIG_BFQ_CGROUP_DEBUG /* queued stats shouldn't be cleared */ blkg_rwstat_add_aux(&to->merged, &from->merged); blkg_rwstat_add_aux(&to->service_time, &from->service_time); @@ -432,7 +431,7 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) static void bfqg_stats_exit(struct bfqg_stats *stats) { -#ifdef CONFIG_DEBUG_BLK_CGROUP +#ifdef CONFIG_BFQ_CGROUP_DEBUG blkg_rwstat_exit(&stats->merged); blkg_rwstat_exit(&stats->service_time); blkg_rwstat_exit(&stats->wait_time); @@ -449,7 +448,7 @@ static void bfqg_stats_exit(struct bfqg_stats *stats) static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) { -#ifdef CONFIG_DEBUG_BLK_CGROUP +#ifdef CONFIG_BFQ_CGROUP_DEBUG if (blkg_rwstat_init(&stats->merged, gfp) || blkg_rwstat_init(&stats->service_time, gfp) || blkg_rwstat_init(&stats->wait_time, gfp) || @@ -986,7 +985,7 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, return ret ?: nbytes; } -#ifdef CONFIG_DEBUG_BLK_CGROUP +#ifdef CONFIG_BFQ_CGROUP_DEBUG static int bfqg_print_stat(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, @@ -1109,7 +1108,7 @@ static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) 0, false); return 0; } -#endif /* CONFIG_DEBUG_BLK_CGROUP */ +#endif /* CONFIG_BFQ_CGROUP_DEBUG */ struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) { @@ -1157,7 +1156,7 @@ struct cftype bfq_blkcg_legacy_files[] = { .private = (unsigned long)&blkcg_policy_bfq, .seq_show = blkg_print_stat_ios, }, -#ifdef CONFIG_DEBUG_BLK_CGROUP +#ifdef CONFIG_BFQ_CGROUP_DEBUG { .name = "bfq.time", .private = offsetof(struct bfq_group, stats.time), @@ -1187,7 +1186,7 @@ struct cftype bfq_blkcg_legacy_files[] = { .private = offsetof(struct bfq_group, stats.queued), .seq_show = bfqg_print_rwstat, }, -#endif /* CONFIG_DEBUG_BLK_CGROUP */ +#endif /* CONFIG_BFQ_CGROUP_DEBUG */ /* the same statistics which cover the bfqg and its descendants */ { @@ -1200,7 +1199,7 @@ struct cftype bfq_blkcg_legacy_files[] = { .private = (unsigned long)&blkcg_policy_bfq, .seq_show = blkg_print_stat_ios_recursive, }, -#ifdef CONFIG_DEBUG_BLK_CGROUP +#ifdef CONFIG_BFQ_CGROUP_DEBUG { .name = "bfq.time_recursive", .private = offsetof(struct bfq_group, stats.time), @@ -1254,7 +1253,7 @@ struct cftype bfq_blkcg_legacy_files[] = { .private = offsetof(struct bfq_group, stats.dequeue), .seq_show = bfqg_print_stat, }, -#endif /* CONFIG_DEBUG_BLK_CGROUP */ +#endif /* CONFIG_BFQ_CGROUP_DEBUG */ { } /* terminate */ }; diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index a6bf842cbe16..44c6bbcd7720 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4404,7 +4404,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) return rq; } -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) +#ifdef CONFIG_BFQ_CGROUP_DEBUG static void bfq_update_dispatch_stats(struct request_queue *q, struct request *rq, struct bfq_queue *in_serv_queue, @@ -4454,7 +4454,7 @@ static inline void bfq_update_dispatch_stats(struct request_queue *q, struct request *rq, struct bfq_queue *in_serv_queue, bool idle_timer_disabled) {} -#endif +#endif /* CONFIG_BFQ_CGROUP_DEBUG */ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) { @@ -5008,7 +5008,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) return idle_timer_disabled; } -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) +#ifdef CONFIG_BFQ_CGROUP_DEBUG static void bfq_update_insert_stats(struct request_queue *q, struct bfq_queue *bfqq, bool idle_timer_disabled, @@ -5038,7 +5038,7 @@ static inline void bfq_update_insert_stats(struct request_queue *q, struct bfq_queue *bfqq, bool idle_timer_disabled, unsigned int cmd_flags) {} -#endif +#endif /* CONFIG_BFQ_CGROUP_DEBUG */ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head) diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index aef4fa0046b8..584d3c9ed8ba 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -783,7 +783,7 @@ struct bfq_stat { }; struct bfqg_stats { -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) +#ifdef CONFIG_BFQ_CGROUP_DEBUG /* number of ios merged */ struct blkg_rwstat merged; /* total time spent on device in ns, may not be accurate w/ queueing */ @@ -811,7 +811,7 @@ struct bfqg_stats { u64 start_idle_time; u64 start_empty_time; uint16_t flags; -#endif /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ +#endif /* CONFIG_BFQ_CGROUP_DEBUG */ }; #ifdef CONFIG_BFQ_GROUP_IOSCHED diff --git a/init/Kconfig b/init/Kconfig index 0e2344389501..a41d8fbe09d8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -799,14 +799,6 @@ config BLK_CGROUP See Documentation/cgroup-v1/blkio-controller.txt for more information. -config DEBUG_BLK_CGROUP - bool "IO controller debugging" - depends on BLK_CGROUP - default n - ---help--- - Enable some debugging help. Currently it exports additional stat - files in a cgroup which can be useful for debugging. - config CGROUP_WRITEBACK bool depends on MEMCG && BLK_CGROUP From 178cc590e54a9e04a749a0474fcaac0e8c20888f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Jun 2019 10:59:15 -0700 Subject: [PATCH 040/153] block: improve print_req_error Print the calling function instead of print_req_error as a prefix, and print the operation and op_flags separately instead of the whole field. Reviewed-by: Bart Van Assche Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-core.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index e1b77113671e..c97da29ddc07 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -167,18 +167,20 @@ int blk_status_to_errno(blk_status_t status) } EXPORT_SYMBOL_GPL(blk_status_to_errno); -static void print_req_error(struct request *req, blk_status_t status) +static void print_req_error(struct request *req, blk_status_t status, + const char *caller) { int idx = (__force int)status; if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) return; - printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu flags %x\n", - __func__, blk_errors[idx].name, - req->rq_disk ? req->rq_disk->disk_name : "?", - (unsigned long long)blk_rq_pos(req), - req->cmd_flags); + printk_ratelimited(KERN_ERR + "%s: %s error, dev %s, sector %llu op 0x%x flags 0x%x\n", + caller, blk_errors[idx].name, + req->rq_disk ? req->rq_disk->disk_name : "?", + blk_rq_pos(req), req_op(req), + req->cmd_flags & ~REQ_OP_MASK); } static void req_bio_endio(struct request *rq, struct bio *bio, @@ -1362,7 +1364,7 @@ bool blk_update_request(struct request *req, blk_status_t error, if (unlikely(error && !blk_rq_is_passthrough(req) && !(req->rq_flags & RQF_QUIET))) - print_req_error(req, error); + print_req_error(req, error, __func__); blk_account_io_completion(req, nr_bytes); From e47bc4eda953928644109101d07c9c95dc29a458 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 20 Jun 2019 10:59:16 -0700 Subject: [PATCH 041/153] block: add centralize REQ_OP_XXX to string helper In order to centralize the REQ_OP_XXX to string conversion which can be used in the block layer and different places in the kernel like f2fs, this patch adds a new helper function along with an array similar to the one present in the blk-mq-debugfs.c. We keep this helper functionality centralize under blk-core.c instead of blk-mq-debugfs.c since blk-core.c is configured using CONFIG_BLOCK and it will not be dependent on blk-mq-debugfs.c which is configured using CONFIG_BLK_DEBUG_FS. Next patch adjusts the code in the blk-mq-debugfs.c with newly introduced helper. Reviewed-by: Bart Van Assche Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-core.c | 36 ++++++++++++++++++++++++++++++++++++ include/linux/blkdev.h | 3 +++ 2 files changed, 39 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index c97da29ddc07..129204dd3bae 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -120,6 +120,42 @@ void blk_rq_init(struct request_queue *q, struct request *rq) } EXPORT_SYMBOL(blk_rq_init); +#define REQ_OP_NAME(name) [REQ_OP_##name] = #name +static const char *const blk_op_name[] = { + REQ_OP_NAME(READ), + REQ_OP_NAME(WRITE), + REQ_OP_NAME(FLUSH), + REQ_OP_NAME(DISCARD), + REQ_OP_NAME(SECURE_ERASE), + REQ_OP_NAME(ZONE_RESET), + REQ_OP_NAME(WRITE_SAME), + REQ_OP_NAME(WRITE_ZEROES), + REQ_OP_NAME(SCSI_IN), + REQ_OP_NAME(SCSI_OUT), + REQ_OP_NAME(DRV_IN), + REQ_OP_NAME(DRV_OUT), +}; +#undef REQ_OP_NAME + +/** + * blk_op_str - Return string XXX in the REQ_OP_XXX. + * @op: REQ_OP_XXX. + * + * Description: Centralize block layer function to convert REQ_OP_XXX into + * string format. Useful in the debugging and tracing bio or request. For + * invalid REQ_OP_XXX it returns string "UNKNOWN". + */ +inline const char *blk_op_str(unsigned int op) +{ + const char *op_str = "UNKNOWN"; + + if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op]) + op_str = blk_op_name[op]; + + return op_str; +} +EXPORT_SYMBOL_GPL(blk_op_str); + static const struct { int errno; const char *name; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d5d3bb45dfb6..0c482371c8b3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -865,6 +865,9 @@ extern void blk_execute_rq(struct request_queue *, struct gendisk *, extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); +/* Helper to convert REQ_OP_XXX to its string format XXX */ +extern const char *blk_op_str(unsigned int op); + int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); From 874c893bf07b88d25c6d1db3e1d14e582f838281 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 20 Jun 2019 10:59:17 -0700 Subject: [PATCH 042/153] block: use blk_op_str() in blk-mq-debugfs.c Now that we've a helper function blk_op_str() to convert the REQ_OP_XXX to string XXX, adjust the code to use that. Get rid of the duplicate array op_name which is now present in the blk-core.c which we renamed it to "blk_op_name" and open coding in the blk-mq-debugfs.c. Reviewed-by: Bart Van Assche Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index a8376cc06a39..748164f4e8b1 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -261,23 +261,6 @@ static int hctx_flags_show(void *data, struct seq_file *m) return 0; } -#define REQ_OP_NAME(name) [REQ_OP_##name] = #name -static const char *const op_name[] = { - REQ_OP_NAME(READ), - REQ_OP_NAME(WRITE), - REQ_OP_NAME(FLUSH), - REQ_OP_NAME(DISCARD), - REQ_OP_NAME(SECURE_ERASE), - REQ_OP_NAME(ZONE_RESET), - REQ_OP_NAME(WRITE_SAME), - REQ_OP_NAME(WRITE_ZEROES), - REQ_OP_NAME(SCSI_IN), - REQ_OP_NAME(SCSI_OUT), - REQ_OP_NAME(DRV_IN), - REQ_OP_NAME(DRV_OUT), -}; -#undef REQ_OP_NAME - #define CMD_FLAG_NAME(name) [__REQ_##name] = #name static const char *const cmd_flag_name[] = { CMD_FLAG_NAME(FAILFAST_DEV), @@ -342,12 +325,13 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) { const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; const unsigned int op = req_op(rq); + const char *op_str = blk_op_str(op); seq_printf(m, "%p {.op=", rq); - if (op < ARRAY_SIZE(op_name) && op_name[op]) - seq_printf(m, "%s", op_name[op]); - else + if (strcmp(op_str, "UNKNOWN") == 0) seq_printf(m, "%u", op); + else + seq_printf(m, "%s", op_str); seq_puts(m, ", .cmd_flags="); blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name, ARRAY_SIZE(cmd_flag_name)); From b0e5168a77387d19caee1622e30f77464c369885 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 20 Jun 2019 10:59:18 -0700 Subject: [PATCH 043/153] block: update print_req_error() Improve the print_req_error with additional request fields which are helpful for debugging. Use newly introduced blk_op_str() to print the REQ_OP_XXX in the string format. Reviewed-by: Chao Yu Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-core.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 129204dd3bae..5d1fc8e17dd1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -212,11 +212,14 @@ static void print_req_error(struct request *req, blk_status_t status, return; printk_ratelimited(KERN_ERR - "%s: %s error, dev %s, sector %llu op 0x%x flags 0x%x\n", + "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " + "phys_seg %u prio class %u\n", caller, blk_errors[idx].name, - req->rq_disk ? req->rq_disk->disk_name : "?", - blk_rq_pos(req), req_op(req), - req->cmd_flags & ~REQ_OP_MASK); + req->rq_disk ? req->rq_disk->disk_name : "?", + blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), + req->cmd_flags & ~REQ_OP_MASK, + req->nr_phys_segments, + IOPRIO_PRIO_CLASS(req->ioprio)); } static void req_bio_endio(struct request *rq, struct bio *bio, From 0ce353794b6c4dc88592b942e94b33cd1bf2ef54 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 20 Jun 2019 10:59:19 -0700 Subject: [PATCH 044/153] f2fs: use block layer helper for show_bio_op macro Adjust the f2fs tracing code to use newly introduced block layer function blk_op_str() which converts the REQ_OP_XXX into the string XXX. Reviewed-by: Chao Yu Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- include/trace/events/f2fs.h | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 53b96f12300c..e3dc031af7f5 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -76,16 +76,7 @@ TRACE_DEFINE_ENUM(CP_TRIMMED); #define show_bio_type(op,op_flags) show_bio_op(op), \ show_bio_op_flags(op_flags) -#define show_bio_op(op) \ - __print_symbolic(op, \ - { REQ_OP_READ, "READ" }, \ - { REQ_OP_WRITE, "WRITE" }, \ - { REQ_OP_FLUSH, "FLUSH" }, \ - { REQ_OP_DISCARD, "DISCARD" }, \ - { REQ_OP_SECURE_ERASE, "SECURE_ERASE" }, \ - { REQ_OP_ZONE_RESET, "ZONE_RESET" }, \ - { REQ_OP_WRITE_SAME, "WRITE_SAME" }, \ - { REQ_OP_WRITE_ZEROES, "WRITE_ZEROES" }) +#define show_bio_op(op) blk_op_str(op) #define show_bio_op_flags(flags) \ __print_flags(F2FS_BIO_FLAG_MASK(flags), "|", \ From 3e148a3209792e04f63ec99701235c960765fc9a Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 19 Jun 2019 17:30:46 +0800 Subject: [PATCH 045/153] md/raid1: fix potential data inconsistency issue with write behind device For write-behind mode, we think write IO is complete once it has reached all the non-writemostly devices. It works fine for single queue devices. But for multiqueue device, if there are lots of IOs come from upper layer, then the write-behind device could issue those IOs to different queues, depends on the each queue's delay, so there is no guarantee that those IOs can arrive in order. To address the issue, we need to check the collision among write behind IOs, we can only continue without collision, otherwise wait for the completion of previous collisioned IO. And WBCollision is introduced for multiqueue device which is worked under write-behind mode. But this patch doesn't handle below cases which could have the data inconsistency issue as well, these cases will be handled in later patches. 1. modify max_write_behind by write backlog node. 2. add or remove array's bitmap dynamically. 3. the change of member disk. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 41 ++++++++++++++++++++++++++++ drivers/md/md.h | 21 ++++++++++++++ drivers/md/raid1.c | 68 +++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 129 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 1f37a1adc926..9a9762c83cc8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -124,6 +124,19 @@ static inline int speed_max(struct mddev *mddev) mddev->sync_speed_max : sysctl_speed_limit_max; } +static int rdev_init_wb(struct md_rdev *rdev) +{ + if (rdev->bdev->bd_queue->nr_hw_queues == 1) + return 0; + + spin_lock_init(&rdev->wb_list_lock); + INIT_LIST_HEAD(&rdev->wb_list); + init_waitqueue_head(&rdev->wb_io_wait); + set_bit(WBCollisionCheck, &rdev->flags); + + return 1; +} + static struct ctl_table_header *raid_table_header; static struct ctl_table raid_table[] = { @@ -5597,6 +5610,32 @@ int md_run(struct mddev *mddev) md_bitmap_destroy(mddev); goto abort; } + + if (mddev->bitmap_info.max_write_behind > 0) { + bool creat_pool = false; + + rdev_for_each(rdev, mddev) { + if (test_bit(WriteMostly, &rdev->flags) && + rdev_init_wb(rdev)) + creat_pool = true; + } + if (creat_pool && mddev->wb_info_pool == NULL) { + mddev->wb_info_pool = + mempool_create_kmalloc_pool(NR_WB_INFOS, + sizeof(struct wb_info)); + if (!mddev->wb_info_pool) { + err = -ENOMEM; + mddev_detach(mddev); + if (mddev->private) + pers->free(mddev, mddev->private); + mddev->private = NULL; + module_put(pers->owner); + md_bitmap_destroy(mddev); + goto abort; + } + } + } + if (mddev->queue) { bool nonrot = true; @@ -5825,6 +5864,8 @@ static void __md_stop_writes(struct mddev *mddev) mddev->in_sync = 1; md_update_sb(mddev, 1); } + mempool_destroy(mddev->wb_info_pool); + mddev->wb_info_pool = NULL; } void md_stop_writes(struct mddev *mddev) diff --git a/drivers/md/md.h b/drivers/md/md.h index 7c930c091193..d449d514cff9 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -109,6 +109,14 @@ struct md_rdev { * for reporting to userspace and storing * in superblock. */ + + /* + * The members for check collision of write behind IOs. + */ + struct list_head wb_list; + spinlock_t wb_list_lock; + wait_queue_head_t wb_io_wait; + struct work_struct del_work; /* used for delayed sysfs removal */ struct kernfs_node *sysfs_state; /* handle for 'state' @@ -193,6 +201,10 @@ enum flag_bits { * it didn't fail, so don't use FailFast * any more for metadata */ + WBCollisionCheck, /* + * multiqueue device should check if there + * is collision between write behind bios. + */ }; static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, @@ -245,6 +257,14 @@ enum mddev_sb_flags { MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ }; +#define NR_WB_INFOS 8 +/* record current range of write behind IOs */ +struct wb_info { + sector_t lo; + sector_t hi; + struct list_head list; +}; + struct mddev { void *private; struct md_personality *pers; @@ -461,6 +481,7 @@ struct mddev { */ struct work_struct flush_work; struct work_struct event_work; /* used by dm to report failure event */ + mempool_t *wb_info_pool; void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; unsigned int good_device_nr; /* good device num within cluster raid */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a7860b5f33f2..3d44da663797 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -50,6 +50,57 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr); #include "raid1-10.c" +static int check_and_add_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) +{ + struct wb_info *wi, *temp_wi; + unsigned long flags; + int ret = 0; + struct mddev *mddev = rdev->mddev; + + wi = mempool_alloc(mddev->wb_info_pool, GFP_NOIO); + + spin_lock_irqsave(&rdev->wb_list_lock, flags); + list_for_each_entry(temp_wi, &rdev->wb_list, list) { + /* collision happened */ + if (hi > temp_wi->lo && lo < temp_wi->hi) { + ret = -EBUSY; + break; + } + } + + if (!ret) { + wi->lo = lo; + wi->hi = hi; + list_add(&wi->list, &rdev->wb_list); + } else + mempool_free(wi, mddev->wb_info_pool); + spin_unlock_irqrestore(&rdev->wb_list_lock, flags); + + return ret; +} + +static void remove_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) +{ + struct wb_info *wi; + unsigned long flags; + int found = 0; + struct mddev *mddev = rdev->mddev; + + spin_lock_irqsave(&rdev->wb_list_lock, flags); + list_for_each_entry(wi, &rdev->wb_list, list) + if (hi == wi->hi && lo == wi->lo) { + list_del(&wi->list); + mempool_free(wi, mddev->wb_info_pool); + found = 1; + break; + } + + if (!found) + WARN_ON("The write behind IO is not recorded\n"); + spin_unlock_irqrestore(&rdev->wb_list_lock, flags); + wake_up(&rdev->wb_io_wait); +} + /* * for resync bio, r1bio pointer can be retrieved from the per-bio * 'struct resync_pages'. @@ -446,6 +497,12 @@ static void raid1_end_write_request(struct bio *bio) } if (behind) { + if (test_bit(WBCollisionCheck, &rdev->flags)) { + sector_t lo = r1_bio->sector; + sector_t hi = r1_bio->sector + r1_bio->sectors; + + remove_wb(rdev, lo, hi); + } if (test_bit(WriteMostly, &rdev->flags)) atomic_dec(&r1_bio->behind_remaining); @@ -1443,7 +1500,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); if (r1_bio->behind_master_bio) { - if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) + struct md_rdev *rdev = conf->mirrors[i].rdev; + + if (test_bit(WBCollisionCheck, &rdev->flags)) { + sector_t lo = r1_bio->sector; + sector_t hi = r1_bio->sector + r1_bio->sectors; + + wait_event(rdev->wb_io_wait, + check_and_add_wb(rdev, lo, hi) == 0); + } + if (test_bit(WriteMostly, &rdev->flags)) atomic_inc(&r1_bio->behind_remaining); } From 963c555e75b033202dd76cf6325a7b7c83d08d5f Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 14 Jun 2019 17:10:36 +0800 Subject: [PATCH 046/153] md: introduce mddev_create/destroy_wb_pool for the change of member device Previously, we called rdev_init_wb to avoid potential data inconsistency when array is created. Now, we need to call the function and create mempool if a device is added or just be flaged as "writemostly". So mddev_create_wb_pool is introduced and called accordingly. And for safety reason, we mark implicit GFP_NOIO allocation scope for create mempool during mddev_suspend/mddev_resume. And mempool should be removed conversely after remove a member device or its's "writemostly" flag, which is done by call mddev_destroy_wb_pool. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++ drivers/md/md.h | 2 ++ 2 files changed, 67 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 9a9762c83cc8..b43207ab00b2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -37,6 +37,7 @@ */ +#include #include #include #include @@ -137,6 +138,64 @@ static int rdev_init_wb(struct md_rdev *rdev) return 1; } +/* + * Create wb_info_pool if rdev is the first multi-queue device flaged + * with writemostly, also write-behind mode is enabled. + */ +void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend) +{ + if (mddev->bitmap_info.max_write_behind == 0) + return; + + if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_wb(rdev)) + return; + + if (mddev->wb_info_pool == NULL) { + unsigned int noio_flag; + + if (!is_suspend) + mddev_suspend(mddev); + noio_flag = memalloc_noio_save(); + mddev->wb_info_pool = mempool_create_kmalloc_pool(NR_WB_INFOS, + sizeof(struct wb_info)); + memalloc_noio_restore(noio_flag); + if (!mddev->wb_info_pool) + pr_err("can't alloc memory pool for writemostly\n"); + if (!is_suspend) + mddev_resume(mddev); + } +} +EXPORT_SYMBOL_GPL(mddev_create_wb_pool); + +/* + * destroy wb_info_pool if rdev is the last device flaged with WBCollisionCheck. + */ +static void mddev_destroy_wb_pool(struct mddev *mddev, struct md_rdev *rdev) +{ + if (!test_and_clear_bit(WBCollisionCheck, &rdev->flags)) + return; + + if (mddev->wb_info_pool) { + struct md_rdev *temp; + int num = 0; + + /* + * Check if other rdevs need wb_info_pool. + */ + rdev_for_each(temp, mddev) + if (temp != rdev && + test_bit(WBCollisionCheck, &temp->flags)) + num++; + if (!num) { + mddev_suspend(rdev->mddev); + mempool_destroy(mddev->wb_info_pool); + mddev->wb_info_pool = NULL; + mddev_resume(rdev->mddev); + } + } +} + static struct ctl_table_header *raid_table_header; static struct ctl_table raid_table[] = { @@ -2223,6 +2282,9 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) rdev->mddev = mddev; pr_debug("md: bind<%s>\n", b); + if (mddev->raid_disks) + mddev_create_wb_pool(mddev, rdev, false); + if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; @@ -2259,6 +2321,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev) bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b)); + mddev_destroy_wb_pool(rdev->mddev, rdev); rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); @@ -2771,8 +2834,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); + mddev_create_wb_pool(rdev->mddev, rdev, false); err = 0; } else if (cmd_match(buf, "-writemostly")) { + mddev_destroy_wb_pool(rdev->mddev, rdev); clear_bit(WriteMostly, &rdev->flags); err = 0; } else if (cmd_match(buf, "blocked")) { diff --git a/drivers/md/md.h b/drivers/md/md.h index d449d514cff9..10f98200e2f8 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -730,6 +730,8 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); +extern void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); From 10c92fca636e40dcb15d85ffe06b1b6843cd28fc Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 14 Jun 2019 17:10:37 +0800 Subject: [PATCH 047/153] md-bitmap: create and destroy wb_info_pool with the change of backlog Since we can enable write-behind mode by write backlog node, so create wb_info_pool if the mode is just enabled, also call call md_bitmap_update_sb to make user aware the write-behind mode is enabled. Conversely, wb_info_pool should be destroyed when write-behind mode is disabled. Beside above, it is better to update bitmap sb if we change the number of max_write_behind. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index c01d41198f5e..15dd817fe83b 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2462,12 +2462,26 @@ static ssize_t backlog_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long backlog; + unsigned long old_mwb = mddev->bitmap_info.max_write_behind; int rv = kstrtoul(buf, 10, &backlog); if (rv) return rv; if (backlog > COUNTER_MAX) return -EINVAL; mddev->bitmap_info.max_write_behind = backlog; + if (!backlog && mddev->wb_info_pool) { + /* wb_info_pool is not needed if backlog is zero */ + mempool_destroy(mddev->wb_info_pool); + mddev->wb_info_pool = NULL; + } else if (backlog && !mddev->wb_info_pool) { + /* wb_info_pool is needed since backlog is not zero */ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) + mddev_create_wb_pool(mddev, rdev, false); + } + if (old_mwb != backlog) + md_bitmap_update_sb(mddev->bitmap); return len; } From 617b194a13c0f3b0a6d14fc6227c222877c23b4e Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 14 Jun 2019 17:10:38 +0800 Subject: [PATCH 048/153] md-bitmap: create and destroy wb_info_pool with the change of bitmap The write-behind attribute is part of bitmap, since bitmap can be added/removed dynamically with the following. 1. mdadm --grow /dev/md0 --bitmap=none 2. mdadm --grow /dev/md0 --bitmap=internal --write-behind So we need to destroy wb_info_pool in md_bitmap_destroy, and create the pool before load bitmap. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 15dd817fe83b..b092c7b5282f 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1790,6 +1790,8 @@ void md_bitmap_destroy(struct mddev *mddev) return; md_bitmap_wait_behind_writes(mddev); + mempool_destroy(mddev->wb_info_pool); + mddev->wb_info_pool = NULL; mutex_lock(&mddev->bitmap_info.mutex); spin_lock(&mddev->lock); @@ -1900,10 +1902,14 @@ int md_bitmap_load(struct mddev *mddev) sector_t start = 0; sector_t sector = 0; struct bitmap *bitmap = mddev->bitmap; + struct md_rdev *rdev; if (!bitmap) goto out; + rdev_for_each(rdev, mddev) + mddev_create_wb_pool(mddev, rdev, true); + if (mddev_is_clustered(mddev)) md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); From d494549ac8852ec42854d1491dd17bb9350a0abc Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 14 Jun 2019 17:10:39 +0800 Subject: [PATCH 049/153] md: add bitmap_abort label in md_run Now, there are two places need to consider about the failure of destroy bitmap, so move the common part between bitmap_abort and abort label. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index b43207ab00b2..692fc365e73c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5666,15 +5666,8 @@ int md_run(struct mddev *mddev) mddev->bitmap = bitmap; } - if (err) { - mddev_detach(mddev); - if (mddev->private) - pers->free(mddev, mddev->private); - mddev->private = NULL; - module_put(pers->owner); - md_bitmap_destroy(mddev); - goto abort; - } + if (err) + goto bitmap_abort; if (mddev->bitmap_info.max_write_behind > 0) { bool creat_pool = false; @@ -5690,13 +5683,7 @@ int md_run(struct mddev *mddev) sizeof(struct wb_info)); if (!mddev->wb_info_pool) { err = -ENOMEM; - mddev_detach(mddev); - if (mddev->private) - pers->free(mddev, mddev->private); - mddev->private = NULL; - module_put(pers->owner); - md_bitmap_destroy(mddev); - goto abort; + goto bitmap_abort; } } } @@ -5761,6 +5748,13 @@ int md_run(struct mddev *mddev) sysfs_notify(&mddev->kobj, NULL, "degraded"); return 0; +bitmap_abort: + mddev_detach(mddev); + if (mddev->private) + pers->free(mddev, mddev->private); + mddev->private = NULL; + module_put(pers->owner); + md_bitmap_destroy(mddev); abort: bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); From 474a280036e8d319ef852f1dec59bedf4eda0107 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jun 2019 10:22:48 +0200 Subject: [PATCH 050/153] cgroup: export css_next_descendant_pre for bfq The bfq schedule now uses css_next_descendant_pre directly after the stats functionality depending on it has been from the core blk-cgroup code to bfq. Export the symbol so that bfq can still be build modular. Fixes: d6258980daf2 ("bfq-iosched: move bfq_stat_recursive_sum into the only caller") Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- kernel/cgroup/cgroup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 426a0026225c..30aba80858e3 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4221,6 +4221,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, return NULL; } +EXPORT_SYMBOL_GPL(css_next_descendant_pre); /** * css_rightmost_descendant - return the rightmost descendant of a css From 9d09dd8d7626b9124ce4bc081aabcb0590173b27 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 14 May 2019 14:58:02 -0700 Subject: [PATCH 051/153] nvmet: add transport discovery change op Some transports, such as FC-NVME, support discovery controller change events without the use of a persistent discovery controller. FC receives events via RSCN from the FC Fabric Controller or subsystem FC port. This patch adds a nvmet transport op that is called whenever a discovery change event occurs in the nvmet layer. To facilitate the callback without adding another layer to cross into core.c to reference the transport ops, the port structure snapshots the transport ops when the port is enabled and clears them when disabled. Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Reviewed-by: Arun Easi Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 2 ++ drivers/nvme/target/discovery.c | 4 ++++ drivers/nvme/target/nvmet.h | 2 ++ 3 files changed, 8 insertions(+) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 7734a6acff85..43e8c4adc1f4 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -311,6 +311,7 @@ int nvmet_enable_port(struct nvmet_port *port) port->inline_data_size = 0; port->enabled = true; + port->tr_ops = ops; return 0; } @@ -321,6 +322,7 @@ void nvmet_disable_port(struct nvmet_port *port) lockdep_assert_held(&nvmet_config_sem); port->enabled = false; + port->tr_ops = NULL; ops = nvmet_transports[port->disc_addr.trtype]; ops->remove_port(port); diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 5baf269f3f8a..8efca26b4776 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -41,6 +41,10 @@ void nvmet_port_disc_changed(struct nvmet_port *port, __nvmet_disc_changed(port, ctrl); } mutex_unlock(&nvmet_disc_subsys->lock); + + /* If transport can signal change, notify transport */ + if (port->tr_ops && port->tr_ops->discovery_chg) + port->tr_ops->discovery_chg(port); } static void __nvmet_subsys_disc_changed(struct nvmet_port *port, diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index c25d88fc9dec..dc270944bb25 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -140,6 +140,7 @@ struct nvmet_port { void *priv; bool enabled; int inline_data_size; + const struct nvmet_fabrics_ops *tr_ops; }; static inline struct nvmet_port *to_nvmet_port(struct config_item *item) @@ -277,6 +278,7 @@ struct nvmet_fabrics_ops { void (*disc_traddr)(struct nvmet_req *req, struct nvmet_port *port, char *traddr); u16 (*install_queue)(struct nvmet_sq *nvme_sq); + void (*discovery_chg)(struct nvmet_port *port); }; #define NVMET_MAX_INLINE_BIOVEC 8 From 150d71f725fd2f5a0015b7fa8df0816a207d4e4b Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 14 May 2019 14:58:03 -0700 Subject: [PATCH 052/153] nvmet-fc: add transport discovery change event callback support This patch adds support for the nvmet discovery_change transport op. In turn, the transport adds it's own LLDD api callback discovery_event op to request the LLDD to generate an RSCN for the discovery change. Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Reviewed-by: Arun Easi Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fc.c | 11 +++++++++++ include/linux/nvme-fc-driver.h | 6 ++++++ 2 files changed, 17 insertions(+) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 508661af0f50..1f252c9a953a 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -2549,6 +2549,16 @@ nvmet_fc_remove_port(struct nvmet_port *port) kfree(pe); } +static void +nvmet_fc_discovery_chg(struct nvmet_port *port) +{ + struct nvmet_fc_port_entry *pe = port->priv; + struct nvmet_fc_tgtport *tgtport = pe->tgtport; + + if (tgtport && tgtport->ops->discovery_event) + tgtport->ops->discovery_event(&tgtport->fc_target_port); +} + static const struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = { .owner = THIS_MODULE, .type = NVMF_TRTYPE_FC, @@ -2557,6 +2567,7 @@ static const struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = { .remove_port = nvmet_fc_remove_port, .queue_response = nvmet_fc_fcp_nvme_cmd_done, .delete_ctrl = nvmet_fc_delete_ctrl, + .discovery_chg = nvmet_fc_discovery_chg, }; static int __init nvmet_fc_init_module(void) diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index c48e96436f56..98d904961b33 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -791,6 +791,11 @@ struct nvmet_fc_target_port { * nvmefc_tgt_fcp_req. * Entrypoint is Optional. * + * @discovery_event: Called by the transport to generate an RSCN + * change notifications to NVME initiators. The RSCN notifications + * should cause the initiator to rescan the discovery controller + * on the targetport. + * * @max_hw_queues: indicates the maximum number of hw queues the LLDD * supports for cpu affinitization. * Value is Mandatory. Must be at least 1. @@ -832,6 +837,7 @@ struct nvmet_fc_target_template { struct nvmefc_tgt_fcp_req *fcpreq); void (*defer_rcv)(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_fcp_req *fcpreq); + void (*discovery_event)(struct nvmet_fc_target_port *tgtport); u32 max_hw_queues; u16 max_sgl_segments; From 4cf7c363b41552d76331fcf1e7ce600c8deeddc3 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 14 May 2019 14:58:04 -0700 Subject: [PATCH 053/153] nvme-fcloop: add support for nvmet discovery_event op Update fcloop to support the discovery_event operation and invoke a nvme rescan. In a real fc adapter, this would generate an RSCN, which the host would receive and convert into a nvme rescan on the remote port specified in the rscn payload. Signed-off-by: James Smart [kbuild-bot: fcloop_tgt_discovery_evt can be static] Reviewed-by: Hannes Reinecke Reviewed-by: Arun Easi Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 37 ++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 381b5a90c48b..b8c1cc54a0db 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -231,6 +231,11 @@ struct fcloop_lsreq { int status; }; +struct fcloop_rscn { + struct fcloop_tport *tport; + struct work_struct work; +}; + enum { INI_IO_START = 0, INI_IO_ACTIVE = 1, @@ -348,6 +353,37 @@ fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *tport, return 0; } +/* + * Simulate reception of RSCN and converting it to a initiator transport + * call to rescan a remote port. + */ +static void +fcloop_tgt_rscn_work(struct work_struct *work) +{ + struct fcloop_rscn *tgt_rscn = + container_of(work, struct fcloop_rscn, work); + struct fcloop_tport *tport = tgt_rscn->tport; + + if (tport->remoteport) + nvme_fc_rescan_remoteport(tport->remoteport); + kfree(tgt_rscn); +} + +static void +fcloop_tgt_discovery_evt(struct nvmet_fc_target_port *tgtport) +{ + struct fcloop_rscn *tgt_rscn; + + tgt_rscn = kzalloc(sizeof(*tgt_rscn), GFP_KERNEL); + if (!tgt_rscn) + return; + + tgt_rscn->tport = tgtport->private; + INIT_WORK(&tgt_rscn->work, fcloop_tgt_rscn_work); + + schedule_work(&tgt_rscn->work); +} + static void fcloop_tfcp_req_free(struct kref *ref) { @@ -839,6 +875,7 @@ static struct nvmet_fc_target_template tgttemplate = { .fcp_op = fcloop_fcp_op, .fcp_abort = fcloop_tgt_fcp_abort, .fcp_req_release = fcloop_fcp_req_release, + .discovery_event = fcloop_tgt_discovery_evt, .max_hw_queues = FCLOOP_HW_QUEUES, .max_sgl_segments = FCLOOP_SGL_SEGS, .max_dif_sgl_segments = FCLOOP_SGL_SEGS, From f60cb93bbfecf1ad13713af285c3793e861fc9b2 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 14 May 2019 14:58:05 -0700 Subject: [PATCH 054/153] lpfc: add support to generate RSCN events for nport This patch adds general RSCN support: - The ability to transmit an RSCN to the port on the other end of the link (regular port if pt2pt, or fabric controller if fabric). - And general recognition of an RSCN ELS when an ELS is received. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Reviewed-by: Arun Easi Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/scsi/lpfc/lpfc.h | 1 + drivers/scsi/lpfc/lpfc_crtn.h | 2 + drivers/scsi/lpfc/lpfc_els.c | 122 +++++++++++++++++++++++++++++++ drivers/scsi/lpfc/lpfc_hbadisc.c | 35 +++++++++ drivers/scsi/lpfc/lpfc_hw.h | 2 + drivers/scsi/lpfc/lpfc_sli.c | 1 + 6 files changed, 163 insertions(+) diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h index aafcffaa25f7..14293c546772 100644 --- a/drivers/scsi/lpfc/lpfc.h +++ b/drivers/scsi/lpfc/lpfc.h @@ -274,6 +274,7 @@ struct lpfc_stats { uint32_t elsXmitADISC; uint32_t elsXmitLOGO; uint32_t elsXmitSCR; + uint32_t elsXmitRSCN; uint32_t elsXmitRNID; uint32_t elsXmitFARP; uint32_t elsXmitFARPR; diff --git a/drivers/scsi/lpfc/lpfc_crtn.h b/drivers/scsi/lpfc/lpfc_crtn.h index e0b14d791b8c..4b8eb9107b85 100644 --- a/drivers/scsi/lpfc/lpfc_crtn.h +++ b/drivers/scsi/lpfc/lpfc_crtn.h @@ -141,6 +141,7 @@ int lpfc_issue_els_adisc(struct lpfc_vport *, struct lpfc_nodelist *, uint8_t); int lpfc_issue_els_logo(struct lpfc_vport *, struct lpfc_nodelist *, uint8_t); int lpfc_issue_els_npiv_logo(struct lpfc_vport *, struct lpfc_nodelist *); int lpfc_issue_els_scr(struct lpfc_vport *, uint32_t, uint8_t); +int lpfc_issue_els_rscn(struct lpfc_vport *vport, uint8_t retry); int lpfc_issue_fabric_reglogin(struct lpfc_vport *); int lpfc_els_free_iocb(struct lpfc_hba *, struct lpfc_iocbq *); int lpfc_ct_free_iocb(struct lpfc_hba *, struct lpfc_iocbq *); @@ -355,6 +356,7 @@ void lpfc_mbox_timeout_handler(struct lpfc_hba *); struct lpfc_nodelist *lpfc_findnode_did(struct lpfc_vport *, uint32_t); struct lpfc_nodelist *lpfc_findnode_wwpn(struct lpfc_vport *, struct lpfc_name *); +struct lpfc_nodelist *lpfc_findnode_mapped(struct lpfc_vport *vport); int lpfc_sli_issue_mbox_wait(struct lpfc_hba *, LPFC_MBOXQ_t *, uint32_t); diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index 5ac4f8d76b91..00f5d9d547f9 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -30,6 +30,8 @@ #include #include #include +#include +#include #include "lpfc_hw4.h" #include "lpfc_hw.h" @@ -3078,6 +3080,116 @@ lpfc_issue_els_scr(struct lpfc_vport *vport, uint32_t nportid, uint8_t retry) return 0; } +/** + * lpfc_issue_els_rscn - Issue an RSCN to the Fabric Controller (Fabric) + * or the other nport (pt2pt). + * @vport: pointer to a host virtual N_Port data structure. + * @retry: number of retries to the command IOCB. + * + * This routine issues a RSCN to the Fabric Controller (DID 0xFFFFFD) + * when connected to a fabric, or to the remote port when connected + * in point-to-point mode. When sent to the Fabric Controller, it will + * replay the RSCN to registered recipients. + * + * Note that, in lpfc_prep_els_iocb() routine, the reference count of ndlp + * will be incremented by 1 for holding the ndlp and the reference to ndlp + * will be stored into the context1 field of the IOCB for the completion + * callback function to the RSCN ELS command. + * + * Return code + * 0 - Successfully issued RSCN command + * 1 - Failed to issue RSCN command + **/ +int +lpfc_issue_els_rscn(struct lpfc_vport *vport, uint8_t retry) +{ + struct lpfc_hba *phba = vport->phba; + struct lpfc_iocbq *elsiocb; + struct lpfc_nodelist *ndlp; + struct { + struct fc_els_rscn rscn; + struct fc_els_rscn_page portid; + } *event; + uint32_t nportid; + uint16_t cmdsize = sizeof(*event); + + /* Not supported for private loop */ + if (phba->fc_topology == LPFC_TOPOLOGY_LOOP && + !(vport->fc_flag & FC_PUBLIC_LOOP)) + return 1; + + if (vport->fc_flag & FC_PT2PT) { + /* find any mapped nport - that would be the other nport */ + ndlp = lpfc_findnode_mapped(vport); + if (!ndlp) + return 1; + } else { + nportid = FC_FID_FCTRL; + /* find the fabric controller node */ + ndlp = lpfc_findnode_did(vport, nportid); + if (!ndlp) { + /* if one didn't exist, make one */ + ndlp = lpfc_nlp_init(vport, nportid); + if (!ndlp) + return 1; + lpfc_enqueue_node(vport, ndlp); + } else if (!NLP_CHK_NODE_ACT(ndlp)) { + ndlp = lpfc_enable_node(vport, ndlp, + NLP_STE_UNUSED_NODE); + if (!ndlp) + return 1; + } + } + + elsiocb = lpfc_prep_els_iocb(vport, 1, cmdsize, retry, ndlp, + ndlp->nlp_DID, ELS_CMD_RSCN_XMT); + + if (!elsiocb) { + /* This will trigger the release of the node just + * allocated + */ + lpfc_nlp_put(ndlp); + return 1; + } + + event = ((struct lpfc_dmabuf *)elsiocb->context2)->virt; + + event->rscn.rscn_cmd = ELS_RSCN; + event->rscn.rscn_page_len = sizeof(struct fc_els_rscn_page); + event->rscn.rscn_plen = cpu_to_be16(cmdsize); + + nportid = vport->fc_myDID; + /* appears that page flags must be 0 for fabric to broadcast RSCN */ + event->portid.rscn_page_flags = 0; + event->portid.rscn_fid[0] = (nportid & 0x00FF0000) >> 16; + event->portid.rscn_fid[1] = (nportid & 0x0000FF00) >> 8; + event->portid.rscn_fid[2] = nportid & 0x000000FF; + + lpfc_debugfs_disc_trc(vport, LPFC_DISC_TRC_ELS_CMD, + "Issue RSCN: did:x%x", + ndlp->nlp_DID, 0, 0); + + phba->fc_stat.elsXmitRSCN++; + elsiocb->iocb_cmpl = lpfc_cmpl_els_cmd; + if (lpfc_sli_issue_iocb(phba, LPFC_ELS_RING, elsiocb, 0) == + IOCB_ERROR) { + /* The additional lpfc_nlp_put will cause the following + * lpfc_els_free_iocb routine to trigger the rlease of + * the node. + */ + lpfc_nlp_put(ndlp); + lpfc_els_free_iocb(phba, elsiocb); + return 1; + } + /* This will cause the callback-function lpfc_cmpl_els_cmd to + * trigger the release of node. + */ + if (!(vport->fc_flag & FC_PT2PT)) + lpfc_nlp_put(ndlp); + + return 0; +} + /** * lpfc_issue_els_farpr - Issue a farp to an node on a vport * @vport: pointer to a host virtual N_Port data structure. @@ -6318,6 +6430,16 @@ lpfc_els_rcv_rscn(struct lpfc_vport *vport, struct lpfc_iocbq *cmdiocb, fc_host_post_event(shost, fc_get_event_number(), FCH_EVT_RSCN, lp[i]); + /* Check if RSCN is coming from a direct-connected remote NPort */ + if (vport->fc_flag & FC_PT2PT) { + /* If so, just ACC it, no other action needed for now */ + lpfc_printf_vlog(vport, KERN_INFO, LOG_ELS, + "2024 pt2pt RSCN %08x Data: x%x x%x\n", + *lp, vport->fc_flag, payload_len); + lpfc_els_rsp_acc(vport, ELS_CMD_ACC, cmdiocb, ndlp, NULL); + return 0; + } + /* If we are about to begin discovery, just ACC the RSCN. * Discovery processing will satisfy it. */ diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c index c43852f97f25..28ecaa7fc715 100644 --- a/drivers/scsi/lpfc/lpfc_hbadisc.c +++ b/drivers/scsi/lpfc/lpfc_hbadisc.c @@ -5276,6 +5276,41 @@ lpfc_findnode_did(struct lpfc_vport *vport, uint32_t did) return ndlp; } +struct lpfc_nodelist * +lpfc_findnode_mapped(struct lpfc_vport *vport) +{ + struct Scsi_Host *shost = lpfc_shost_from_vport(vport); + struct lpfc_nodelist *ndlp; + uint32_t data1; + unsigned long iflags; + + spin_lock_irqsave(shost->host_lock, iflags); + + list_for_each_entry(ndlp, &vport->fc_nodes, nlp_listp) { + if (ndlp->nlp_state == NLP_STE_UNMAPPED_NODE || + ndlp->nlp_state == NLP_STE_MAPPED_NODE) { + data1 = (((uint32_t)ndlp->nlp_state << 24) | + ((uint32_t)ndlp->nlp_xri << 16) | + ((uint32_t)ndlp->nlp_type << 8) | + ((uint32_t)ndlp->nlp_rpi & 0xff)); + spin_unlock_irqrestore(shost->host_lock, iflags); + lpfc_printf_vlog(vport, KERN_INFO, LOG_NODE, + "2025 FIND node DID " + "Data: x%p x%x x%x x%x %p\n", + ndlp, ndlp->nlp_DID, + ndlp->nlp_flag, data1, + ndlp->active_rrqs_xri_bitmap); + return ndlp; + } + } + spin_unlock_irqrestore(shost->host_lock, iflags); + + /* FIND node did NOT FOUND */ + lpfc_printf_vlog(vport, KERN_INFO, LOG_NODE, + "2026 FIND mapped did NOT FOUND.\n"); + return NULL; +} + struct lpfc_nodelist * lpfc_setup_disc_node(struct lpfc_vport *vport, uint32_t did) { diff --git a/drivers/scsi/lpfc/lpfc_hw.h b/drivers/scsi/lpfc/lpfc_hw.h index edd8f3982023..5b439a6dcde1 100644 --- a/drivers/scsi/lpfc/lpfc_hw.h +++ b/drivers/scsi/lpfc/lpfc_hw.h @@ -601,6 +601,7 @@ struct fc_vft_header { #define ELS_CMD_RPL 0x57000000 #define ELS_CMD_FAN 0x60000000 #define ELS_CMD_RSCN 0x61040000 +#define ELS_CMD_RSCN_XMT 0x61040008 #define ELS_CMD_SCR 0x62000000 #define ELS_CMD_RNID 0x78000000 #define ELS_CMD_LIRR 0x7A000000 @@ -642,6 +643,7 @@ struct fc_vft_header { #define ELS_CMD_RPL 0x57 #define ELS_CMD_FAN 0x60 #define ELS_CMD_RSCN 0x0461 +#define ELS_CMD_RSCN_XMT 0x08000461 #define ELS_CMD_SCR 0x62 #define ELS_CMD_RNID 0x78 #define ELS_CMD_LIRR 0x7A diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index d1512e4f9791..4329cc44bb55 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -9398,6 +9398,7 @@ lpfc_sli4_iocb2wqe(struct lpfc_hba *phba, struct lpfc_iocbq *iocbq, if (if_type >= LPFC_SLI_INTF_IF_TYPE_2) { if (pcmd && (*pcmd == ELS_CMD_FLOGI || *pcmd == ELS_CMD_SCR || + *pcmd == ELS_CMD_RSCN_XMT || *pcmd == ELS_CMD_FDISC || *pcmd == ELS_CMD_LOGO || *pcmd == ELS_CMD_PLOGI)) { From ab723121a8eade04ecc6bd7116924c359336f4eb Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 14 May 2019 14:58:06 -0700 Subject: [PATCH 055/153] lpfc: add nvmet discovery_event op support This patch adds support for the nvmet discovery op. When the callback routine is called, the driver will call the routine to generate an RSCN to the port on the other end of the link. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Reviewed-by: Arun Easi Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/scsi/lpfc/lpfc_nvmet.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c index d74bfd264495..06170824a69b 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.c +++ b/drivers/scsi/lpfc/lpfc_nvmet.c @@ -1139,6 +1139,22 @@ lpfc_nvmet_defer_rcv(struct nvmet_fc_target_port *tgtport, spin_unlock_irqrestore(&ctxp->ctxlock, iflag); } +static void +lpfc_nvmet_discovery_event(struct nvmet_fc_target_port *tgtport) +{ + struct lpfc_nvmet_tgtport *tgtp; + struct lpfc_hba *phba; + uint32_t rc; + + tgtp = tgtport->private; + phba = tgtp->phba; + + rc = lpfc_issue_els_rscn(phba->pport, 0); + lpfc_printf_log(phba, KERN_ERR, LOG_NVME, + "6420 NVMET subsystem change: Notification %s\n", + (rc) ? "Failed" : "Sent"); +} + static struct nvmet_fc_target_template lpfc_tgttemplate = { .targetport_delete = lpfc_nvmet_targetport_delete, .xmt_ls_rsp = lpfc_nvmet_xmt_ls_rsp, @@ -1146,6 +1162,7 @@ static struct nvmet_fc_target_template lpfc_tgttemplate = { .fcp_abort = lpfc_nvmet_xmt_fcp_abort, .fcp_req_release = lpfc_nvmet_xmt_fcp_release, .defer_rcv = lpfc_nvmet_defer_rcv, + .discovery_event = lpfc_nvmet_discovery_event, .max_hw_queues = 1, .max_sgl_segments = LPFC_NVMET_DEFAULT_SEGS, From 6f2589f478795c46a61696d7d7c2f47a0bc6cfe3 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 14 May 2019 14:58:07 -0700 Subject: [PATCH 056/153] lpfc: add support for translating an RSCN rcv into a discovery rescan This patch updates RSCN receive processing to check for the remote port being an NVME port, and if so, invoke the nvme_fc callback to rescan the remote port. The rescan will generate a discovery udev event. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Reviewed-by: Arun Easi Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/scsi/lpfc/lpfc_crtn.h | 2 ++ drivers/scsi/lpfc/lpfc_els.c | 5 ++++ drivers/scsi/lpfc/lpfc_nvme.c | 44 +++++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+) diff --git a/drivers/scsi/lpfc/lpfc_crtn.h b/drivers/scsi/lpfc/lpfc_crtn.h index 4b8eb9107b85..866374801140 100644 --- a/drivers/scsi/lpfc/lpfc_crtn.h +++ b/drivers/scsi/lpfc/lpfc_crtn.h @@ -557,6 +557,8 @@ void lpfc_ras_stop_fwlog(struct lpfc_hba *phba); int lpfc_check_fwlog_support(struct lpfc_hba *phba); /* NVME interfaces. */ +void lpfc_nvme_rescan_port(struct lpfc_vport *vport, + struct lpfc_nodelist *ndlp); void lpfc_nvme_unregister_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp); int lpfc_nvme_register_port(struct lpfc_vport *vport, diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index 00f5d9d547f9..968ed0fd37f7 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -6326,6 +6326,8 @@ lpfc_rscn_recovery_check(struct lpfc_vport *vport) continue; } + if (ndlp->nlp_fc4_type & NLP_FC4_NVME) + lpfc_nvme_rescan_port(vport, ndlp); lpfc_disc_state_machine(vport, ndlp, NULL, NLP_EVT_DEVICE_RECOVERY); @@ -6437,6 +6439,9 @@ lpfc_els_rcv_rscn(struct lpfc_vport *vport, struct lpfc_iocbq *cmdiocb, "2024 pt2pt RSCN %08x Data: x%x x%x\n", *lp, vport->fc_flag, payload_len); lpfc_els_rsp_acc(vport, ELS_CMD_ACC, cmdiocb, ndlp, NULL); + + if (ndlp->nlp_fc4_type & NLP_FC4_NVME) + lpfc_nvme_rescan_port(vport, ndlp); return 0; } diff --git a/drivers/scsi/lpfc/lpfc_nvme.c b/drivers/scsi/lpfc/lpfc_nvme.c index 9d99cb915390..fdd16d9f55a1 100644 --- a/drivers/scsi/lpfc/lpfc_nvme.c +++ b/drivers/scsi/lpfc/lpfc_nvme.c @@ -2402,6 +2402,50 @@ lpfc_nvme_register_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp) #endif } +/** + * lpfc_nvme_rescan_port - Check to see if we should rescan this remoteport + * + * If the ndlp represents an NVME Target, that we are logged into, + * ping the NVME FC Transport layer to initiate a device rescan + * on this remote NPort. + */ +void +lpfc_nvme_rescan_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp) +{ +#if (IS_ENABLED(CONFIG_NVME_FC)) + struct lpfc_nvme_rport *rport; + struct nvme_fc_remote_port *remoteport; + + rport = ndlp->nrport; + + lpfc_printf_vlog(vport, KERN_INFO, LOG_NVME_DISC, + "6170 Rescan NPort DID x%06x type x%x " + "state x%x rport %p\n", + ndlp->nlp_DID, ndlp->nlp_type, ndlp->nlp_state, rport); + if (!rport) + goto input_err; + remoteport = rport->remoteport; + if (!remoteport) + goto input_err; + + /* Only rescan if we are an NVME target in the MAPPED state */ + if (remoteport->port_role & FC_PORT_ROLE_NVME_DISCOVERY && + ndlp->nlp_state == NLP_STE_MAPPED_NODE) { + nvme_fc_rescan_remoteport(remoteport); + + lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_DISC, + "6172 NVME rescanned DID x%06x " + "port_state x%x\n", + ndlp->nlp_DID, remoteport->port_state); + } + return; +input_err: + lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_DISC, + "6169 State error: lport %p, rport%p FCID x%06x\n", + vport->localport, ndlp->rport, ndlp->nlp_DID); +#endif +} + /* lpfc_nvme_unregister_port - unbind the DID and port_role from this rport. * * There is no notion of Devloss or rport recovery from the current From 41b194b843a255d5a6e9468edd3ab1d71a24abb3 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 14 May 2019 14:58:08 -0700 Subject: [PATCH 057/153] lpfc: add sysfs interface to post NVME RSCN To support scenarios which aren't bound to nvmetcli add port scenarios, which is currently where the nvmet_fc transport invokes the discovery event callbacks, a syfs attribute is added to lpfc which can be written to cause an RSCN to be generated for the nport. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Reviewed-by: Arun Easi Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/scsi/lpfc/lpfc.h | 1 + drivers/scsi/lpfc/lpfc_attr.c | 60 +++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h index 14293c546772..2c3bb8a966e5 100644 --- a/drivers/scsi/lpfc/lpfc.h +++ b/drivers/scsi/lpfc/lpfc.h @@ -820,6 +820,7 @@ struct lpfc_hba { uint32_t cfg_use_msi; uint32_t cfg_auto_imax; uint32_t cfg_fcp_imax; + uint32_t cfg_force_rscn; uint32_t cfg_cq_poll_threshold; uint32_t cfg_cq_max_proc_limit; uint32_t cfg_fcp_cpu_map; diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index d4c65e2109e2..2bd1e014103b 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -4958,6 +4958,64 @@ static DEVICE_ATTR(lpfc_req_fw_upgrade, S_IRUGO | S_IWUSR, lpfc_request_firmware_upgrade_show, lpfc_request_firmware_upgrade_store); +/** + * lpfc_force_rscn_store + * + * @dev: class device that is converted into a Scsi_host. + * @attr: device attribute, not used. + * @buf: unused string + * @count: unused variable. + * + * Description: + * Force the switch to send a RSCN to all other NPorts in our zone + * If we are direct connect pt2pt, build the RSCN command ourself + * and send to the other NPort. Not supported for private loop. + * + * Returns: + * 0 - on success + * -EIO - if command is not sent + **/ +static ssize_t +lpfc_force_rscn_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct lpfc_vport *vport = (struct lpfc_vport *)shost->hostdata; + int i; + + i = lpfc_issue_els_rscn(vport, 0); + if (i) + return -EIO; + return strlen(buf); +} + +/* + * lpfc_force_rscn: Force an RSCN to be sent to all remote NPorts + * connected to the HBA. + * + * Value range is any ascii value + */ +static int lpfc_force_rscn; +module_param(lpfc_force_rscn, int, 0644); +MODULE_PARM_DESC(lpfc_force_rscn, + "Force an RSCN to be sent to all remote NPorts"); +lpfc_param_show(force_rscn) + +/** + * lpfc_force_rscn_init - Force an RSCN to be sent to all remote NPorts + * @phba: lpfc_hba pointer. + * @val: unused value. + * + * Returns: + * zero if val saved. + **/ +static int +lpfc_force_rscn_init(struct lpfc_hba *phba, int val) +{ + return 0; +} +static DEVICE_ATTR_RW(lpfc_force_rscn); + /** * lpfc_fcp_imax_store * @@ -5958,6 +6016,7 @@ struct device_attribute *lpfc_hba_attrs[] = { &dev_attr_lpfc_nvme_oas, &dev_attr_lpfc_nvme_embed_cmd, &dev_attr_lpfc_fcp_imax, + &dev_attr_lpfc_force_rscn, &dev_attr_lpfc_cq_poll_threshold, &dev_attr_lpfc_cq_max_proc_limit, &dev_attr_lpfc_fcp_cpu_map, @@ -7005,6 +7064,7 @@ lpfc_get_cfgparam(struct lpfc_hba *phba) lpfc_nvme_oas_init(phba, lpfc_nvme_oas); lpfc_nvme_embed_cmd_init(phba, lpfc_nvme_embed_cmd); lpfc_fcp_imax_init(phba, lpfc_fcp_imax); + lpfc_force_rscn_init(phba, lpfc_force_rscn); lpfc_cq_poll_threshold_init(phba, lpfc_cq_poll_threshold); lpfc_cq_max_proc_limit_init(phba, lpfc_cq_max_proc_limit); lpfc_fcp_cpu_map_init(phba, lpfc_fcp_cpu_map); From 4bea364f161810523032f37a8ae0b7d92cf28eea Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 29 May 2019 15:25:26 -0700 Subject: [PATCH 058/153] nvme-fc: add message when creating new association When looking at console messages to troubleshoot, there are one maybe two messages before creation of the controller is complete. However, a lot of io takes place to reach that point. It's unclear when things have started. Add a message when the controller is attempting to create a new association. Thus we know what controller, between what host and remote port, and what NQN is being put into place for any subsequent success or failure messages. Signed-off-by: James Smart Reviewed-by: Chaitanya Kulkarni Reviewed-by: Giridhar Malavali Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index dd8169bbf0d2..9b497d785ed7 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2607,6 +2607,12 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (nvme_fc_ctlr_active_on_rport(ctrl)) return -ENOTUNIQ; + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: create association : host wwpn 0x%016llx " + " rport wwpn 0x%016llx: NQN \"%s\"\n", + ctrl->cnum, ctrl->lport->localport.port_name, + ctrl->rport->remoteport.port_name, ctrl->ctrl.opts->subsysnqn); + /* * Create the admin queue */ From 2181e455612a8db2761eabbf126640552a451e96 Mon Sep 17 00:00:00 2001 From: Anton Eidelman Date: Thu, 20 Jun 2019 08:48:10 +0200 Subject: [PATCH 059/153] nvme: fix possible io failures when removing multipathed ns When a shared namespace is removed, we call blk_cleanup_queue() when the device can still be accessed as the current path and this can result in submission to a dying queue. Hence, direct_make_request() called by our mpath device may fail (propagating the failure to userspace). Instead, we want to failover this I/O to a different path if one exists. Thus, before we cleanup the request queue, we make sure that the device is cleared from the current path nor it can be selected again as such. Fix this by: - clear the ns from the head->list and synchronize rcu to make sure there is no concurrent path search that restores it as the current path - clear the mpath current path in order to trigger a subsequent path search and sync srcu to wait for any ongoing request submissions - safely continue to namespace removal and blk_cleanup_queue Signed-off-by: Anton Eidelman Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 120fb593d1da..22c68e3b71d5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3344,6 +3344,14 @@ static void nvme_ns_remove(struct nvme_ns *ns) return; nvme_fault_inject_fini(ns); + + mutex_lock(&ns->ctrl->subsys->lock); + list_del_rcu(&ns->siblings); + mutex_unlock(&ns->ctrl->subsys->lock); + synchronize_rcu(); /* guarantee not available in head->list */ + nvme_mpath_clear_current_path(ns); + synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */ + if (ns->disk && ns->disk->flags & GENHD_FL_UP) { del_gendisk(ns->disk); blk_cleanup_queue(ns->queue); @@ -3351,16 +3359,10 @@ static void nvme_ns_remove(struct nvme_ns *ns) blk_integrity_unregister(ns->disk); } - mutex_lock(&ns->ctrl->subsys->lock); - list_del_rcu(&ns->siblings); - nvme_mpath_clear_current_path(ns); - mutex_unlock(&ns->ctrl->subsys->lock); - down_write(&ns->ctrl->namespaces_rwsem); list_del_init(&ns->list); up_write(&ns->ctrl->namespaces_rwsem); - synchronize_srcu(&ns->head->srcu); nvme_mpath_check_last_path(ns); nvme_put_ns(ns); } From 1a87ee657c530bb2f3e39e4ac184d48f5f959cda Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 27 May 2019 01:29:01 +0900 Subject: [PATCH 060/153] nvme: export get and set features Future use intends to make use of both, so export these functions. And since their implementation is identical except for the opcode, provide a new function that implement both. [akinobu.mita@gmail.com>: fix line over 80 characters] Signed-off-by: Keith Busch Signed-off-by: Akinobu Mita Reviewed-by: Chaitanya Kulkarni Reviewed-by: Minwoo Im Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 24 +++++++++++++++++++++--- drivers/nvme/host/nvme.h | 6 ++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 22c68e3b71d5..3b3960e0c31f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1113,15 +1113,15 @@ static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl, return id; } -static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, - void *buffer, size_t buflen, u32 *result) +static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid, + unsigned int dword11, void *buffer, size_t buflen, u32 *result) { struct nvme_command c; union nvme_result res; int ret; memset(&c, 0, sizeof(c)); - c.features.opcode = nvme_admin_set_features; + c.features.opcode = op; c.features.fid = cpu_to_le32(fid); c.features.dword11 = cpu_to_le32(dword11); @@ -1132,6 +1132,24 @@ static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword return ret; } +int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid, + unsigned int dword11, void *buffer, size_t buflen, + u32 *result) +{ + return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer, + buflen, result); +} +EXPORT_SYMBOL_GPL(nvme_set_features); + +int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid, + unsigned int dword11, void *buffer, size_t buflen, + u32 *result) +{ + return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer, + buflen, result); +} +EXPORT_SYMBOL_GPL(nvme_get_features); + int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) { u32 q_count = (*count - 1) | ((*count - 1) << 16); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 55553d293a98..038b8931d9e5 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -459,6 +459,12 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, unsigned timeout, int qid, int at_head, blk_mq_req_flags_t flags, bool poll); +int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid, + unsigned int dword11, void *buffer, size_t buflen, + u32 *result); +int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid, + unsigned int dword11, void *buffer, size_t buflen, + u32 *result); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl); From 7a1f46e3f75cff5042dfa1bb80c9929a0e412abc Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Thu, 6 Jun 2019 14:30:14 +0900 Subject: [PATCH 061/153] nvme: introduce nvme_is_fabrics to check fabrics cmd This patch introduces a nvme_is_fabrics() inline function to check whether or not the given command structure is for fabrics. Signed-off-by: Minwoo Im Reviewed-by: Sagi Grimberg Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 2 +- drivers/nvme/target/core.c | 2 +- drivers/nvme/target/fabrics-cmd.c | 2 +- drivers/nvme/target/fc.c | 2 +- include/linux/nvme.h | 7 ++++++- 5 files changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 5838f7cd53ac..1994d5b42f94 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -578,7 +578,7 @@ bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, switch (ctrl->state) { case NVME_CTRL_NEW: case NVME_CTRL_CONNECTING: - if (req->cmd->common.opcode == nvme_fabrics_command && + if (nvme_is_fabrics(req->cmd) && req->cmd->fabrics.fctype == nvme_fabrics_type_connect) return true; break; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 43e8c4adc1f4..0587707b1a25 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -873,7 +873,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, status = nvmet_parse_connect_cmd(req); else if (likely(req->sq->qid != 0)) status = nvmet_parse_io_cmd(req); - else if (req->cmd->common.opcode == nvme_fabrics_command) + else if (nvme_is_fabrics(req->cmd)) status = nvmet_parse_fabrics_cmd(req); else if (req->sq->ctrl->subsys->type == NVME_NQN_DISC) status = nvmet_parse_discovery_cmd(req); diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 3b9f79aba98f..d16b55ffe79f 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -268,7 +268,7 @@ u16 nvmet_parse_connect_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; - if (cmd->common.opcode != nvme_fabrics_command) { + if (!nvme_is_fabrics(cmd)) { pr_err("invalid command 0x%x on unconnected queue.\n", cmd->fabrics.opcode); req->error_loc = offsetof(struct nvme_common_command, opcode); diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 1f252c9a953a..ce8d819f86cc 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1806,7 +1806,7 @@ nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport, */ rspcnt = atomic_inc_return(&fod->queue->zrspcnt); if (!(rspcnt % fod->queue->ersp_ratio) || - sqe->opcode == nvme_fabrics_command || + nvme_is_fabrics((struct nvme_command *) sqe) || xfr_length != fod->req.transfer_len || (le16_to_cpu(cqe->status) & 0xFFFE) || cqewd[0] || cqewd[1] || (sqe->flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND)) || diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 8028adacaff3..7080923e78d1 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1165,6 +1165,11 @@ struct nvme_command { }; }; +static inline bool nvme_is_fabrics(struct nvme_command *cmd) +{ + return cmd->common.opcode == nvme_fabrics_command; +} + struct nvme_error_slot { __le64 error_count; __le16 sqid; @@ -1186,7 +1191,7 @@ static inline bool nvme_is_write(struct nvme_command *cmd) * * Why can't we simply have a Fabrics In and Fabrics out command? */ - if (unlikely(cmd->common.opcode == nvme_fabrics_command)) + if (unlikely(nvme_is_fabrics(cmd))) return cmd->fabrics.fctype & 1; return cmd->common.opcode & 1; } From d916b1be94b6dc8d293abed2451f3062f6af7551 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 23 May 2019 09:27:35 -0600 Subject: [PATCH 062/153] nvme-pci: use host managed power state for suspend The nvme pci driver prepares its devices for power loss during suspend by shutting down the controllers. The power setting is deferred to pci driver's power management before the platform removes power. The suspend-to-idle mode, however, does not remove power. NVMe devices that implement host managed power settings can achieve lower power and better transition latencies than using generic PCI power settings. Try to use this feature if the platform is not involved with the suspend. If successful, restore the previous power state on resume. Tested-by: Kai-Heng Feng Tested-by: Mario Limonciello Reviewed-by: Rafael J. Wysocki Signed-off-by: Keith Busch Signed-off-by: Sagi Grimberg [hch: fixed the compilation for the !CONFIG_PM_SLEEP case] Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 95 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 92 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 524d6bd6d095..eeae5789303a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -116,6 +117,7 @@ struct nvme_dev { u32 cmbsz; u32 cmbloc; struct nvme_ctrl ctrl; + u32 last_ps; mempool_t *iod_mempool; @@ -2835,16 +2837,94 @@ static void nvme_remove(struct pci_dev *pdev) } #ifdef CONFIG_PM_SLEEP +static int nvme_get_power_state(struct nvme_ctrl *ctrl, u32 *ps) +{ + return nvme_get_features(ctrl, NVME_FEAT_POWER_MGMT, 0, NULL, 0, ps); +} + +static int nvme_set_power_state(struct nvme_ctrl *ctrl, u32 ps) +{ + return nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, ps, NULL, 0, NULL); +} + +static int nvme_resume(struct device *dev) +{ + struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev)); + struct nvme_ctrl *ctrl = &ndev->ctrl; + + if (pm_resume_via_firmware() || !ctrl->npss || + nvme_set_power_state(ctrl, ndev->last_ps) != 0) + nvme_reset_ctrl(ctrl); + return 0; +} + static int nvme_suspend(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); struct nvme_dev *ndev = pci_get_drvdata(pdev); + struct nvme_ctrl *ctrl = &ndev->ctrl; + int ret = -EBUSY; + + /* + * The platform does not remove power for a kernel managed suspend so + * use host managed nvme power settings for lowest idle power if + * possible. This should have quicker resume latency than a full device + * shutdown. But if the firmware is involved after the suspend or the + * device does not support any non-default power states, shut down the + * device fully. + */ + if (pm_suspend_via_firmware() || !ctrl->npss) { + nvme_dev_disable(ndev, true); + return 0; + } + + nvme_start_freeze(ctrl); + nvme_wait_freeze(ctrl); + nvme_sync_queues(ctrl); + + if (ctrl->state != NVME_CTRL_LIVE && + ctrl->state != NVME_CTRL_ADMIN_ONLY) + goto unfreeze; + + ndev->last_ps = 0; + ret = nvme_get_power_state(ctrl, &ndev->last_ps); + if (ret < 0) + goto unfreeze; + + ret = nvme_set_power_state(ctrl, ctrl->npss); + if (ret < 0) + goto unfreeze; + + if (ret) { + /* + * Clearing npss forces a controller reset on resume. The + * correct value will be resdicovered then. + */ + nvme_dev_disable(ndev, true); + ctrl->npss = 0; + ret = 0; + goto unfreeze; + } + /* + * A saved state prevents pci pm from generically controlling the + * device's power. If we're using protocol specific settings, we don't + * want pci interfering. + */ + pci_save_state(pdev); +unfreeze: + nvme_unfreeze(ctrl); + return ret; +} + +static int nvme_simple_suspend(struct device *dev) +{ + struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev)); nvme_dev_disable(ndev, true); return 0; } -static int nvme_resume(struct device *dev) +static int nvme_simple_resume(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); struct nvme_dev *ndev = pci_get_drvdata(pdev); @@ -2852,9 +2932,16 @@ static int nvme_resume(struct device *dev) nvme_reset_ctrl(&ndev->ctrl); return 0; } -#endif -static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume); +const struct dev_pm_ops nvme_dev_pm_ops = { + .suspend = nvme_suspend, + .resume = nvme_resume, + .freeze = nvme_simple_suspend, + .thaw = nvme_simple_resume, + .poweroff = nvme_simple_suspend, + .restore = nvme_simple_resume, +}; +#endif /* CONFIG_PM_SLEEP */ static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev, pci_channel_state_t state) @@ -2959,9 +3046,11 @@ static struct pci_driver nvme_driver = { .probe = nvme_probe, .remove = nvme_remove, .shutdown = nvme_shutdown, +#ifdef CONFIG_PM_SLEEP .driver = { .pm = &nvme_dev_pm_ops, }, +#endif .sriov_configure = pci_sriov_configure_simple, .err_handler = &nvme_err_handler, }; From a232ea0ebffeaab48ec24cf795dcb07280a55ea1 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 9 Jun 2019 03:02:17 +0900 Subject: [PATCH 063/153] nvme-pci: remove unnecessary zero for static var poll_queues will be zero even without zero initialization here. Signed-off-by: Minwoo Im Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index eeae5789303a..02216b45613d 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -80,7 +80,7 @@ MODULE_PARM_DESC(write_queues, "Number of queues to use for writes. If not set, reads and writes " "will share a queue set."); -static int poll_queues = 0; +static int poll_queues; module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644); MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO."); From 483178f38cbe55a0b1854a93ceef715a0fc2ef9f Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 9 Jun 2019 03:02:18 +0900 Subject: [PATCH 064/153] nvme-pci: remove queue_count_ops for write_queues and poll_queues queue_count_set() seems like that it has been provided to limit the number of queue entries for write/poll queues. But, the queue_count_set() has been doing nothing but a parameter check even it has num_possible_cpus() which is nop. This patch removes entire queue_count_ops from the write_queues and poll_queues. Signed-off-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 02216b45613d..007f8becde4a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -68,20 +68,14 @@ static int io_queue_depth = 1024; module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); -static int queue_count_set(const char *val, const struct kernel_param *kp); -static const struct kernel_param_ops queue_count_ops = { - .set = queue_count_set, - .get = param_get_int, -}; - static int write_queues; -module_param_cb(write_queues, &queue_count_ops, &write_queues, 0644); +module_param(write_queues, int, 0644); MODULE_PARM_DESC(write_queues, "Number of queues to use for writes. If not set, reads and writes " "will share a queue set."); static int poll_queues; -module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644); +module_param(poll_queues, int, 0644); MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO."); struct nvme_dev; @@ -146,19 +140,6 @@ static int io_queue_depth_set(const char *val, const struct kernel_param *kp) return param_set_int(val, kp); } -static int queue_count_set(const char *val, const struct kernel_param *kp) -{ - int n, ret; - - ret = kstrtoint(val, 10, &n); - if (ret) - return ret; - if (n > num_possible_cpus()) - n = num_possible_cpus(); - - return param_set_int(val, kp); -} - static inline unsigned int sq_idx(unsigned int qid, u32 stride) { return qid * 2 * stride; From dad77d63903e91a2e97a0c984cabe5d36e91ba60 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 9 Jun 2019 03:02:19 +0900 Subject: [PATCH 065/153] nvme-pci: adjust irq max_vector using num_possible_cpus() If the "irq_queues" are greater than num_possible_cpus(), nvme_calc_irq_sets() can have irq set_size for HCTX_TYPE_DEFAULT greater than it can be afforded. 2039 affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues; It might cause a WARN() from the irq_build_affinity_masks() like [1]: 220 if (nr_present < numvecs) 221 WARN_ON(nr_present + nr_others < numvecs); This patch prevents it from the WARN() by adjusting the max_vector value from the nvme_setup_irqs(). [1] WARN messages when modprobe nvme write_queues=32 poll_queues=0: root@target:~/nvme# nproc 8 root@target:~/nvme# modprobe nvme write_queues=32 poll_queues=0 [ 17.925326] nvme nvme0: pci function 0000:00:04.0 [ 17.940601] WARNING: CPU: 3 PID: 1030 at kernel/irq/affinity.c:221 irq_create_affinity_masks+0x222/0x330 [ 17.940602] Modules linked in: nvme nvme_core [last unloaded: nvme] [ 17.940605] CPU: 3 PID: 1030 Comm: kworker/u17:4 Tainted: G W 5.1.0+ #156 [ 17.940605] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 [ 17.940608] Workqueue: nvme-reset-wq nvme_reset_work [nvme] [ 17.940609] RIP: 0010:irq_create_affinity_masks+0x222/0x330 [ 17.940611] Code: 4c 8d 4c 24 28 4c 8d 44 24 30 e8 c9 fa ff ff 89 44 24 18 e8 c0 38 fa ff 8b 44 24 18 44 8b 54 24 1c 5a 44 01 d0 41 39 c4 76 02 <0f> 0b 48 89 df 44 01 e5 e8 f1 ce 10 00 48 8b 34 24 44 89 f0 44 01 [ 17.940611] RSP: 0018:ffffc90002277c50 EFLAGS: 00010216 [ 17.940612] RAX: 0000000000000008 RBX: ffff88807ca48860 RCX: 0000000000000000 [ 17.940612] RDX: ffff88807bc03800 RSI: 0000000000000020 RDI: 0000000000000000 [ 17.940613] RBP: 0000000000000001 R08: ffffc90002277c78 R09: ffffc90002277c70 [ 17.940613] R10: 0000000000000008 R11: 0000000000000001 R12: 0000000000000020 [ 17.940614] R13: 0000000000025d08 R14: 0000000000000001 R15: ffff88807bc03800 [ 17.940614] FS: 0000000000000000(0000) GS:ffff88807db80000(0000) knlGS:0000000000000000 [ 17.940616] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 17.940617] CR2: 00005635e583f790 CR3: 000000000240a000 CR4: 00000000000006e0 [ 17.940617] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 17.940618] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 17.940618] Call Trace: [ 17.940622] __pci_enable_msix_range+0x215/0x540 [ 17.940623] ? kernfs_put+0x117/0x160 [ 17.940625] pci_alloc_irq_vectors_affinity+0x74/0x110 [ 17.940626] nvme_reset_work+0xc30/0x1397 [nvme] [ 17.940628] ? __switch_to_asm+0x34/0x70 [ 17.940628] ? __switch_to_asm+0x40/0x70 [ 17.940629] ? __switch_to_asm+0x34/0x70 [ 17.940630] ? __switch_to_asm+0x40/0x70 [ 17.940630] ? __switch_to_asm+0x34/0x70 [ 17.940631] ? __switch_to_asm+0x40/0x70 [ 17.940632] ? nvme_irq_check+0x30/0x30 [nvme] [ 17.940633] process_one_work+0x20b/0x3e0 [ 17.940634] worker_thread+0x1f9/0x3d0 [ 17.940635] ? cancel_delayed_work+0xa0/0xa0 [ 17.940636] kthread+0x117/0x120 [ 17.940637] ? kthread_stop+0xf0/0xf0 [ 17.940638] ret_from_fork+0x3a/0x50 [ 17.940639] ---[ end trace aca8a131361cd42a ]--- [ 17.942124] nvme nvme0: 7/1/0 default/read/poll queues Signed-off-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 007f8becde4a..c98b73da38e2 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2051,6 +2051,7 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) .priv = dev, }; unsigned int irq_queues, this_p_queues; + unsigned int nr_cpus = num_possible_cpus(); /* * Poll queues don't need interrupts, but we need at least one IO @@ -2061,7 +2062,10 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) this_p_queues = nr_io_queues - 1; irq_queues = 1; } else { - irq_queues = nr_io_queues - this_p_queues + 1; + if (nr_cpus < nr_io_queues - this_p_queues) + irq_queues = nr_cpus + 1; + else + irq_queues = nr_io_queues - this_p_queues + 1; } dev->io_queues[HCTX_TYPE_POLL] = this_p_queues; From e71afda49335620e3d9adf56015676db33a3bd86 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sat, 8 Jun 2019 13:01:02 -0700 Subject: [PATCH 066/153] nvme-pci: set the errno on ctrl state change error This patch removes the confusing assignment of the variable result at the time of declaration and sets the value in error cases next to the places where the actual error is happening. Here we also set the result value to -ENODEV when we fail at the final ctrl state transition in nvme_reset_work(). Without this assignment result will hold 0 from nvme_setup_io_queue() and on failure 0 will be passed to he nvme_remove_dead_ctrl() from final state transition. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c98b73da38e2..092c8403b306 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2467,11 +2467,13 @@ static void nvme_reset_work(struct work_struct *work) struct nvme_dev *dev = container_of(work, struct nvme_dev, ctrl.reset_work); bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); - int result = -ENODEV; + int result; enum nvme_ctrl_state new_state = NVME_CTRL_LIVE; - if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) + if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) { + result = -ENODEV; goto out; + } /* * If we're called to reset a live controller first shut it down before @@ -2575,6 +2577,7 @@ static void nvme_reset_work(struct work_struct *work) if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) { dev_warn(dev->ctrl.device, "failed to mark controller state %d\n", new_state); + result = -ENODEV; goto out; } From cee6c269b016ba89c62e34d6bccb103ee2c7de4f Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 9 Jun 2019 03:35:20 +0900 Subject: [PATCH 067/153] nvme-pci: properly report state change failure in nvme_reset_work If the state change to NVME_CTRL_CONNECTING fails, the dmesg is going to be like: [ 293.689160] nvme nvme0: failed to mark controller CONNECTING [ 293.689160] nvme nvme0: Removing after probe failure status: 0 Even it prints the first line to indicate the situation, the second line is not proper because the status is 0 which means normally success of the previous operation. This patch makes it indicate the proper error value when it fails. [ 25.932367] nvme nvme0: failed to mark controller CONNECTING [ 25.932369] nvme nvme0: Removing after probe failure status: -16 This situation is able to be easily reproduced by: root@target:~# rmmod nvme && modprobe nvme && rmmod nvme Signed-off-by: Minwoo Im Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 092c8403b306..d308ae7e2e11 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2517,6 +2517,7 @@ static void nvme_reset_work(struct work_struct *work) if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) { dev_warn(dev->ctrl.device, "failed to mark controller CONNECTING\n"); + result = -EBUSY; goto out; } From 7c1ce408eb320b3d4051570d167852ffbd7778ce Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sat, 8 Jun 2019 13:16:32 -0700 Subject: [PATCH 068/153] nvme-pci: clean up nvme_remove_dead_ctrl a bit Remove the status parameter o nvme_remove_dead_ctrl(), which is only used for printing it. We move the print message to the same function where actual error is occurring. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d308ae7e2e11..189352081994 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2451,10 +2451,8 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) kfree(dev); } -static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) +static void nvme_remove_dead_ctrl(struct nvme_dev *dev) { - dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status); - nvme_get_ctrl(&dev->ctrl); nvme_dev_disable(dev, false); nvme_kill_queues(&dev->ctrl); @@ -2588,7 +2586,10 @@ static void nvme_reset_work(struct work_struct *work) out_unlock: mutex_unlock(&dev->shutdown_lock); out: - nvme_remove_dead_ctrl(dev, result); + if (result) + dev_warn(dev->ctrl.device, + "Removing after probe failure status: %d\n", result); + nvme_remove_dead_ctrl(dev); } static void nvme_remove_dead_ctrl_work(struct work_struct *work) From 7183a46a4879b1640ed428334a8468f3f9b0a4bb Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Wed, 12 Jun 2019 21:45:29 +0900 Subject: [PATCH 069/153] nvme-trace: do not export nvme_trace_disk_name nvme_trace_disk_name() is now already being invoked with the function prototype in trace.h. We don't need to export this symbol at all. The following patches are going to provide target-side trace feature with the exactly same function with this so that this patch removes the EXPORT_SYMBOL() for this function. Signed-off-by: Minwoo Im Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/trace.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 5f24ea7a28eb..14b0d2993cbe 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -145,6 +145,5 @@ const char *nvme_trace_disk_name(struct trace_seq *p, char *name) return ret; } -EXPORT_SYMBOL_GPL(nvme_trace_disk_name); EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_sq); From 26f2990d85838caa650744a0ded9e38988a2bd7f Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Wed, 12 Jun 2019 21:45:30 +0900 Subject: [PATCH 070/153] nvme-trace: move opcode symbol print to nvme.h The following patches are going to provide the target-side trace which might need these kind of macros. It would be great if it can be shared between host and target side both. Signed-off-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/trace.h | 44 -------------------------------------- include/linux/nvme.h | 45 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 44 deletions(-) diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h index e71502d141ed..62ee29107c32 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/host/trace.h @@ -16,50 +16,6 @@ #include "nvme.h" -#define nvme_admin_opcode_name(opcode) { opcode, #opcode } -#define show_admin_opcode_name(val) \ - __print_symbolic(val, \ - nvme_admin_opcode_name(nvme_admin_delete_sq), \ - nvme_admin_opcode_name(nvme_admin_create_sq), \ - nvme_admin_opcode_name(nvme_admin_get_log_page), \ - nvme_admin_opcode_name(nvme_admin_delete_cq), \ - nvme_admin_opcode_name(nvme_admin_create_cq), \ - nvme_admin_opcode_name(nvme_admin_identify), \ - nvme_admin_opcode_name(nvme_admin_abort_cmd), \ - nvme_admin_opcode_name(nvme_admin_set_features), \ - nvme_admin_opcode_name(nvme_admin_get_features), \ - nvme_admin_opcode_name(nvme_admin_async_event), \ - nvme_admin_opcode_name(nvme_admin_ns_mgmt), \ - nvme_admin_opcode_name(nvme_admin_activate_fw), \ - nvme_admin_opcode_name(nvme_admin_download_fw), \ - nvme_admin_opcode_name(nvme_admin_ns_attach), \ - nvme_admin_opcode_name(nvme_admin_keep_alive), \ - nvme_admin_opcode_name(nvme_admin_directive_send), \ - nvme_admin_opcode_name(nvme_admin_directive_recv), \ - nvme_admin_opcode_name(nvme_admin_dbbuf), \ - nvme_admin_opcode_name(nvme_admin_format_nvm), \ - nvme_admin_opcode_name(nvme_admin_security_send), \ - nvme_admin_opcode_name(nvme_admin_security_recv), \ - nvme_admin_opcode_name(nvme_admin_sanitize_nvm)) - -#define nvme_opcode_name(opcode) { opcode, #opcode } -#define show_nvm_opcode_name(val) \ - __print_symbolic(val, \ - nvme_opcode_name(nvme_cmd_flush), \ - nvme_opcode_name(nvme_cmd_write), \ - nvme_opcode_name(nvme_cmd_read), \ - nvme_opcode_name(nvme_cmd_write_uncor), \ - nvme_opcode_name(nvme_cmd_compare), \ - nvme_opcode_name(nvme_cmd_write_zeroes), \ - nvme_opcode_name(nvme_cmd_dsm), \ - nvme_opcode_name(nvme_cmd_resv_register), \ - nvme_opcode_name(nvme_cmd_resv_report), \ - nvme_opcode_name(nvme_cmd_resv_acquire), \ - nvme_opcode_name(nvme_cmd_resv_release)) - -#define show_opcode_name(qid, opcode) \ - (qid ? show_nvm_opcode_name(opcode) : show_admin_opcode_name(opcode)) - const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, u8 *cdw10); const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 7080923e78d1..86b3d04baf20 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -562,6 +562,22 @@ enum nvme_opcode { nvme_cmd_resv_release = 0x15, }; +#define nvme_opcode_name(opcode) { opcode, #opcode } +#define show_nvm_opcode_name(val) \ + __print_symbolic(val, \ + nvme_opcode_name(nvme_cmd_flush), \ + nvme_opcode_name(nvme_cmd_write), \ + nvme_opcode_name(nvme_cmd_read), \ + nvme_opcode_name(nvme_cmd_write_uncor), \ + nvme_opcode_name(nvme_cmd_compare), \ + nvme_opcode_name(nvme_cmd_write_zeroes), \ + nvme_opcode_name(nvme_cmd_dsm), \ + nvme_opcode_name(nvme_cmd_resv_register), \ + nvme_opcode_name(nvme_cmd_resv_report), \ + nvme_opcode_name(nvme_cmd_resv_acquire), \ + nvme_opcode_name(nvme_cmd_resv_release)) + + /* * Descriptor subtype - lower 4 bits of nvme_(keyed_)sgl_desc identifier * @@ -794,6 +810,35 @@ enum nvme_admin_opcode { nvme_admin_sanitize_nvm = 0x84, }; +#define nvme_admin_opcode_name(opcode) { opcode, #opcode } +#define show_admin_opcode_name(val) \ + __print_symbolic(val, \ + nvme_admin_opcode_name(nvme_admin_delete_sq), \ + nvme_admin_opcode_name(nvme_admin_create_sq), \ + nvme_admin_opcode_name(nvme_admin_get_log_page), \ + nvme_admin_opcode_name(nvme_admin_delete_cq), \ + nvme_admin_opcode_name(nvme_admin_create_cq), \ + nvme_admin_opcode_name(nvme_admin_identify), \ + nvme_admin_opcode_name(nvme_admin_abort_cmd), \ + nvme_admin_opcode_name(nvme_admin_set_features), \ + nvme_admin_opcode_name(nvme_admin_get_features), \ + nvme_admin_opcode_name(nvme_admin_async_event), \ + nvme_admin_opcode_name(nvme_admin_ns_mgmt), \ + nvme_admin_opcode_name(nvme_admin_activate_fw), \ + nvme_admin_opcode_name(nvme_admin_download_fw), \ + nvme_admin_opcode_name(nvme_admin_ns_attach), \ + nvme_admin_opcode_name(nvme_admin_keep_alive), \ + nvme_admin_opcode_name(nvme_admin_directive_send), \ + nvme_admin_opcode_name(nvme_admin_directive_recv), \ + nvme_admin_opcode_name(nvme_admin_dbbuf), \ + nvme_admin_opcode_name(nvme_admin_format_nvm), \ + nvme_admin_opcode_name(nvme_admin_security_send), \ + nvme_admin_opcode_name(nvme_admin_security_recv), \ + nvme_admin_opcode_name(nvme_admin_sanitize_nvm)) + +#define show_opcode_name(qid, opcode) \ + (qid ? show_nvm_opcode_name(opcode) : show_admin_opcode_name(opcode)) + enum { NVME_QUEUE_PHYS_CONTIG = (1 << 0), NVME_CQ_IRQ_ENABLED = (1 << 1), From ad795e47cdef078bfd9e48745040d12104005aab Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Wed, 12 Jun 2019 21:45:31 +0900 Subject: [PATCH 071/153] nvme-trace: support for fabrics commands in host-side This patch introduces fabrics commands tracing feature from host-side. This patch does not include any changes for the previous host-side tracing, but just add fabrics commands parsing in cmd=() format. Signed-off-by: Minwoo Im [hch: fixed some whitespace damage] Signed-off-by: Christoph Hellwig --- drivers/nvme/host/trace.c | 63 +++++++++++++++++++++++++++++++++++++++ drivers/nvme/host/trace.h | 20 +++++++++---- include/linux/nvme.h | 20 +++++++++++-- 3 files changed, 94 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 14b0d2993cbe..f01ad0fd60bb 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -135,6 +135,69 @@ const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, } } +static const char *nvme_trace_fabrics_property_set(struct trace_seq *p, u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 attrib = spc[0]; + u32 ofst = get_unaligned_le32(spc + 4); + u64 value = get_unaligned_le64(spc + 8); + + trace_seq_printf(p, "attrib=%u, ofst=0x%x, value=0x%llx", + attrib, ofst, value); + trace_seq_putc(p, 0); + return ret; +} + +static const char *nvme_trace_fabrics_connect(struct trace_seq *p, u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + u16 recfmt = get_unaligned_le16(spc); + u16 qid = get_unaligned_le16(spc + 2); + u16 sqsize = get_unaligned_le16(spc + 4); + u8 cattr = spc[6]; + u32 kato = get_unaligned_le32(spc + 8); + + trace_seq_printf(p, "recfmt=%u, qid=%u, sqsize=%u, cattr=%u, kato=%u", + recfmt, qid, sqsize, cattr, kato); + trace_seq_putc(p, 0); + return ret; +} + +static const char *nvme_trace_fabrics_property_get(struct trace_seq *p, u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 attrib = spc[0]; + u32 ofst = get_unaligned_le32(spc + 4); + + trace_seq_printf(p, "attrib=%u, ofst=0x%x", attrib, ofst); + trace_seq_putc(p, 0); + return ret; +} + +static const char *nvme_trace_fabrics_common(struct trace_seq *p, u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_printf(p, "spcecific=%*ph", 24, spc); + trace_seq_putc(p, 0); + return ret; +} + +const char *nvme_trace_parse_fabrics_cmd(struct trace_seq *p, + u8 fctype, u8 *spc) +{ + switch (fctype) { + case nvme_fabrics_type_property_set: + return nvme_trace_fabrics_property_set(p, spc); + case nvme_fabrics_type_connect: + return nvme_trace_fabrics_connect(p, spc); + case nvme_fabrics_type_property_get: + return nvme_trace_fabrics_property_get(p, spc); + default: + return nvme_trace_fabrics_common(p, spc); + } +} + const char *nvme_trace_disk_name(struct trace_seq *p, char *name) { const char *ret = trace_seq_buffer_ptr(p); diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h index 62ee29107c32..19a18c87fb7b 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/host/trace.h @@ -20,11 +20,15 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, u8 *cdw10); const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, u8 *cdw10); +const char *nvme_trace_parse_fabrics_cmd(struct trace_seq *p, u8 fctype, + u8 *spc); -#define parse_nvme_cmd(qid, opcode, cdw10) \ - (qid ? \ - nvme_trace_parse_nvm_cmd(p, opcode, cdw10) : \ - nvme_trace_parse_admin_cmd(p, opcode, cdw10)) +#define parse_nvme_cmd(qid, opcode, fctype, cdw10) \ + ((opcode) == nvme_fabrics_command ? \ + nvme_trace_parse_fabrics_cmd(p, fctype, cdw10) : \ + ((qid) ? \ + nvme_trace_parse_nvm_cmd(p, opcode, cdw10) : \ + nvme_trace_parse_admin_cmd(p, opcode, cdw10))) const char *nvme_trace_disk_name(struct trace_seq *p, char *name); #define __print_disk_name(name) \ @@ -49,6 +53,7 @@ TRACE_EVENT(nvme_setup_cmd, __field(int, qid) __field(u8, opcode) __field(u8, flags) + __field(u8, fctype) __field(u16, cid) __field(u32, nsid) __field(u64, metadata) @@ -62,6 +67,7 @@ TRACE_EVENT(nvme_setup_cmd, __entry->cid = cmd->common.command_id; __entry->nsid = le32_to_cpu(cmd->common.nsid); __entry->metadata = le64_to_cpu(cmd->common.metadata); + __entry->fctype = cmd->fabrics.fctype; __assign_disk_name(__entry->disk, req->rq_disk); memcpy(__entry->cdw10, &cmd->common.cdw10, sizeof(__entry->cdw10)); @@ -70,8 +76,10 @@ TRACE_EVENT(nvme_setup_cmd, __entry->ctrl_id, __print_disk_name(__entry->disk), __entry->qid, __entry->cid, __entry->nsid, __entry->flags, __entry->metadata, - show_opcode_name(__entry->qid, __entry->opcode), - parse_nvme_cmd(__entry->qid, __entry->opcode, __entry->cdw10)) + show_opcode_name(__entry->qid, __entry->opcode, + __entry->fctype), + parse_nvme_cmd(__entry->qid, __entry->opcode, + __entry->fctype, __entry->cdw10)) ); TRACE_EVENT(nvme_complete_rq, diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 86b3d04baf20..d98b2d8baf4e 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -836,9 +836,6 @@ enum nvme_admin_opcode { nvme_admin_opcode_name(nvme_admin_security_recv), \ nvme_admin_opcode_name(nvme_admin_sanitize_nvm)) -#define show_opcode_name(qid, opcode) \ - (qid ? show_nvm_opcode_name(opcode) : show_admin_opcode_name(opcode)) - enum { NVME_QUEUE_PHYS_CONTIG = (1 << 0), NVME_CQ_IRQ_ENABLED = (1 << 1), @@ -1053,6 +1050,23 @@ enum nvmf_capsule_command { nvme_fabrics_type_property_get = 0x04, }; +#define nvme_fabrics_type_name(type) { type, #type } +#define show_fabrics_type_name(type) \ + __print_symbolic(type, \ + nvme_fabrics_type_name(nvme_fabrics_type_property_set), \ + nvme_fabrics_type_name(nvme_fabrics_type_connect), \ + nvme_fabrics_type_name(nvme_fabrics_type_property_get)) + +/* + * If not fabrics command, fctype will be ignored. + */ +#define show_opcode_name(qid, opcode, fctype) \ + ((opcode) == nvme_fabrics_command ? \ + show_fabrics_type_name(fctype) : \ + ((qid) ? \ + show_nvm_opcode_name(opcode) : \ + show_admin_opcode_name(opcode))) + struct nvmf_common_command { __u8 opcode; __u8 resv1; From 5f965f4fd92303066b319db4b4bbdb53cb924582 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Wed, 12 Jun 2019 21:45:32 +0900 Subject: [PATCH 072/153] nvme-trace: print result and status in hex format The "result" field is in 64bit to be printed out which means it could be like: nvme_complete_rq: nvme0: qid=0, cmdid=0, res=18446612684158962624, etries=0, flags=0x0, status=0 Switch both the result and status field to be printed in hexadecimal format to be easier to read. Signed-off-by: Minwoo Im Reviewed-by: Christoph Hellwig Signed-off-by: Christoph Hellwig --- drivers/nvme/host/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h index 19a18c87fb7b..daaf700eae79 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/host/trace.h @@ -105,7 +105,7 @@ TRACE_EVENT(nvme_complete_rq, __entry->status = nvme_req(req)->status; __assign_disk_name(__entry->disk, req->rq_disk); ), - TP_printk("nvme%d: %sqid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u", + TP_printk("nvme%d: %sqid=%d, cmdid=%u, res=%#llx, retries=%u, flags=0x%x, status=%#x", __entry->ctrl_id, __print_disk_name(__entry->disk), __entry->qid, __entry->cid, __entry->result, __entry->retries, __entry->flags, __entry->status) From 510fd8ea98fcb586c01aef93d87c060a159ac30a Mon Sep 17 00:00:00 2001 From: Heiner Litz Date: Fri, 21 Jun 2019 11:11:59 +0200 Subject: [PATCH 073/153] lightnvm: pblk: fix freeing of merged pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bio_add_pc_page() may merge pages when a bio is padded due to a flush. Fix iteration over the bio to free the correct pages in case of a merge. Signed-off-by: Heiner Litz Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 773537804319..f546e6f28b8a 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -323,14 +323,16 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type) void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, int nr_pages) { - struct bio_vec bv; - int i; + struct bio_vec *bv; + struct page *page; + int i, e, nbv = 0; - WARN_ON(off + nr_pages != bio->bi_vcnt); - - for (i = off; i < nr_pages + off; i++) { - bv = bio->bi_io_vec[i]; - mempool_free(bv.bv_page, &pblk->page_bio_pool); + for (i = 0; i < bio->bi_vcnt; i++) { + bv = &bio->bi_io_vec[i]; + page = bv->bv_page; + for (e = 0; e < bv->bv_len; e += PBLK_EXPOSED_PAGE_SIZE, nbv++) + if (nbv >= off) + mempool_free(page++, &pblk->page_bio_pool); } } From 2f5af4ab7de14bd35f3435e6a47300276bbb6c17 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 21 Jun 2019 11:12:00 +0200 Subject: [PATCH 074/153] lightnvm: fix uninitialized pointer in nvm_remove_tgt() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With gcc 4.1: drivers/lightnvm/core.c: In function ‘nvm_remove_tgt’: drivers/lightnvm/core.c:510: warning: ‘t’ is used uninitialized in this function Indeed, if no NVM devices have been registered, t will be an uninitialized pointer, and may be dereferenced later. A call to nvm_remove_tgt() can be triggered from userspace by issuing the NVM_DEV_REMOVE ioctl on the lightnvm control device. Fix this by preinitializing t to NULL. Fixes: 843f2edbdde085b4 ("lightnvm: do not remove instance under global lock") Signed-off-by: Geert Uytterhoeven Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 7d555b110ecd..a600934fdd9c 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -478,7 +478,7 @@ static void __nvm_remove_target(struct nvm_target *t, bool graceful) */ static int nvm_remove_tgt(struct nvm_ioctl_remove *remove) { - struct nvm_target *t; + struct nvm_target *t = NULL; struct nvm_dev *dev; down_read(&nvm_lock); From a5448fdc469d67da99728a132229aba5fce8f67a Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Wed, 12 Jun 2019 21:45:33 +0900 Subject: [PATCH 075/153] nvmet: introduce target-side trace This patch introduces target-side request tracing. As Christoph suggested, the trace would not be in a core or module to avoid disadvantages like cache miss: http://lists.infradead.org/pipermail/linux-nvme/2019-June/024721.html The target-side trace code is entirely based on the Johannes's trace code from the host side. It has lots of codes duplicated, but it would be better than having advantages mentioned above. It also traces not only fabrics commands, but also nvme normal commands. Once the codes to be shared gets bigger, then we can make it common as suggsted. This also removed the create_sq and create_cq trace parsing functions because it will be done by the connect fabrics command. Example: echo 1 > /sys/kernel/debug/tracing/event/nvmet/nvmet_req_init/enable echo 1 > /sys/kernel/debug/tracing/event/nvmet/nvmet_req_complete/enable cat /sys/kernel/debug/tracing/trace Signed-off-by: Minwoo Im [hch: fixed the symbol namespace and a an endianess conversion] Signed-off-by: Christoph Hellwig --- drivers/nvme/target/Makefile | 3 + drivers/nvme/target/core.c | 8 ++ drivers/nvme/target/trace.c | 201 +++++++++++++++++++++++++++++++++++ drivers/nvme/target/trace.h | 141 ++++++++++++++++++++++++ 4 files changed, 353 insertions(+) create mode 100644 drivers/nvme/target/trace.c create mode 100644 drivers/nvme/target/trace.h diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile index 8c3ad0fb6860..2b33836f3d3e 100644 --- a/drivers/nvme/target/Makefile +++ b/drivers/nvme/target/Makefile @@ -1,5 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 +ccflags-y += -I$(src) + obj-$(CONFIG_NVME_TARGET) += nvmet.o obj-$(CONFIG_NVME_TARGET_LOOP) += nvme-loop.o obj-$(CONFIG_NVME_TARGET_RDMA) += nvmet-rdma.o @@ -14,3 +16,4 @@ nvmet-rdma-y += rdma.o nvmet-fc-y += fc.o nvme-fcloop-y += fcloop.o nvmet-tcp-y += tcp.o +nvmet-$(CONFIG_TRACING) += trace.o diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 0587707b1a25..dad0243c7c96 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -10,6 +10,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include "trace.h" + #include "nvmet.h" struct workqueue_struct *buffered_io_wq; @@ -691,6 +694,9 @@ static void __nvmet_req_complete(struct nvmet_req *req, u16 status) if (unlikely(status)) nvmet_set_error(req, status); + + trace_nvmet_req_complete(req); + if (req->ns) nvmet_put_namespace(req->ns); req->ops->queue_response(req); @@ -850,6 +856,8 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, req->error_loc = NVMET_NO_ERROR_LOC; req->error_slba = 0; + trace_nvmet_req_init(req, req->cmd); + /* no support for fused commands yet */ if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) { req->error_loc = offsetof(struct nvme_common_command, flags); diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c new file mode 100644 index 000000000000..cdcdd14c6408 --- /dev/null +++ b/drivers/nvme/target/trace.c @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVM Express target device driver tracepoints + * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH + */ + +#include +#include "trace.h" + +static const char *nvmet_trace_admin_identify(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 cns = cdw10[0]; + u16 ctrlid = get_unaligned_le16(cdw10 + 2); + + trace_seq_printf(p, "cns=%u, ctrlid=%u", cns, ctrlid); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvmet_trace_admin_get_features(struct trace_seq *p, + u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 fid = cdw10[0]; + u8 sel = cdw10[1] & 0x7; + u32 cdw11 = get_unaligned_le32(cdw10 + 4); + + trace_seq_printf(p, "fid=0x%x sel=0x%x cdw11=0x%x", fid, sel, cdw11); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvmet_trace_read_write(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u64 slba = get_unaligned_le64(cdw10); + u16 length = get_unaligned_le16(cdw10 + 8); + u16 control = get_unaligned_le16(cdw10 + 10); + u32 dsmgmt = get_unaligned_le32(cdw10 + 12); + u32 reftag = get_unaligned_le32(cdw10 + 16); + + trace_seq_printf(p, + "slba=%llu, len=%u, ctrl=0x%x, dsmgmt=%u, reftag=%u", + slba, length, control, dsmgmt, reftag); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvmet_trace_dsm(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_printf(p, "nr=%u, attributes=%u", + get_unaligned_le32(cdw10), + get_unaligned_le32(cdw10 + 4)); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvmet_trace_common(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_printf(p, "cdw10=%*ph", 24, cdw10); + trace_seq_putc(p, 0); + + return ret; +} + +const char *nvmet_trace_parse_admin_cmd(struct trace_seq *p, + u8 opcode, u8 *cdw10) +{ + switch (opcode) { + case nvme_admin_identify: + return nvmet_trace_admin_identify(p, cdw10); + case nvme_admin_get_features: + return nvmet_trace_admin_get_features(p, cdw10); + default: + return nvmet_trace_common(p, cdw10); + } +} + +const char *nvmet_trace_parse_nvm_cmd(struct trace_seq *p, + u8 opcode, u8 *cdw10) +{ + switch (opcode) { + case nvme_cmd_read: + case nvme_cmd_write: + case nvme_cmd_write_zeroes: + return nvmet_trace_read_write(p, cdw10); + case nvme_cmd_dsm: + return nvmet_trace_dsm(p, cdw10); + default: + return nvmet_trace_common(p, cdw10); + } +} + +static const char *nvmet_trace_fabrics_property_set(struct trace_seq *p, + u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 attrib = spc[0]; + u32 ofst = get_unaligned_le32(spc + 4); + u64 value = get_unaligned_le64(spc + 8); + + trace_seq_printf(p, "attrib=%u, ofst=0x%x, value=0x%llx", + attrib, ofst, value); + trace_seq_putc(p, 0); + return ret; +} + +static const char *nvmet_trace_fabrics_connect(struct trace_seq *p, + u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + u16 recfmt = get_unaligned_le16(spc); + u16 qid = get_unaligned_le16(spc + 2); + u16 sqsize = get_unaligned_le16(spc + 4); + u8 cattr = spc[6]; + u32 kato = get_unaligned_le32(spc + 8); + + trace_seq_printf(p, "recfmt=%u, qid=%u, sqsize=%u, cattr=%u, kato=%u", + recfmt, qid, sqsize, cattr, kato); + trace_seq_putc(p, 0); + return ret; +} + +static const char *nvmet_trace_fabrics_property_get(struct trace_seq *p, + u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 attrib = spc[0]; + u32 ofst = get_unaligned_le32(spc + 4); + + trace_seq_printf(p, "attrib=%u, ofst=0x%x", attrib, ofst); + trace_seq_putc(p, 0); + return ret; +} + +static const char *nvmet_trace_fabrics_common(struct trace_seq *p, u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_printf(p, "spcecific=%*ph", 24, spc); + trace_seq_putc(p, 0); + return ret; +} + +const char *nvmet_trace_parse_fabrics_cmd(struct trace_seq *p, + u8 fctype, u8 *spc) +{ + switch (fctype) { + case nvme_fabrics_type_property_set: + return nvmet_trace_fabrics_property_set(p, spc); + case nvme_fabrics_type_connect: + return nvmet_trace_fabrics_connect(p, spc); + case nvme_fabrics_type_property_get: + return nvmet_trace_fabrics_property_get(p, spc); + default: + return nvmet_trace_fabrics_common(p, spc); + } +} + +const char *nvmet_trace_disk_name(struct trace_seq *p, char *name) +{ + const char *ret = trace_seq_buffer_ptr(p); + + if (*name) + trace_seq_printf(p, "disk=%s, ", name); + trace_seq_putc(p, 0); + + return ret; +} + +const char *nvmet_trace_ctrl_name(struct trace_seq *p, struct nvmet_ctrl *ctrl) +{ + const char *ret = trace_seq_buffer_ptr(p); + + /* + * XXX: We don't know the controller instance before executing the + * connect command itself because the connect command for the admin + * queue will not provide the cntlid which will be allocated in this + * command. In case of io queues, the controller instance will be + * mapped by the extra data of the connect command. + * If we can know the extra data of the connect command in this stage, + * we can update this print statement later. + */ + if (ctrl) + trace_seq_printf(p, "%d", ctrl->cntlid); + else + trace_seq_printf(p, "_"); + trace_seq_putc(p, 0); + + return ret; +} + diff --git a/drivers/nvme/target/trace.h b/drivers/nvme/target/trace.h new file mode 100644 index 000000000000..e645caa882dd --- /dev/null +++ b/drivers/nvme/target/trace.h @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * NVM Express target device driver tracepoints + * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH + * + * This is entirely based on drivers/nvme/host/trace.h + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nvmet + +#if !defined(_TRACE_NVMET_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NVMET_H + +#include +#include +#include + +#include "nvmet.h" + +const char *nvmet_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, + u8 *cdw10); +const char *nvmet_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, + u8 *cdw10); +const char *nvmet_trace_parse_fabrics_cmd(struct trace_seq *p, u8 fctype, + u8 *spc); + +#define parse_nvme_cmd(qid, opcode, fctype, cdw10) \ + ((opcode) == nvme_fabrics_command ? \ + nvmet_trace_parse_fabrics_cmd(p, fctype, cdw10) : \ + (qid ? \ + nvmet_trace_parse_nvm_cmd(p, opcode, cdw10) : \ + nvmet_trace_parse_admin_cmd(p, opcode, cdw10))) + +const char *nvmet_trace_ctrl_name(struct trace_seq *p, struct nvmet_ctrl *ctrl); +#define __print_ctrl_name(ctrl) \ + nvmet_trace_ctrl_name(p, ctrl) + +const char *nvmet_trace_disk_name(struct trace_seq *p, char *name); +#define __print_disk_name(name) \ + nvmet_trace_disk_name(p, name) + +#ifndef TRACE_HEADER_MULTI_READ +static inline struct nvmet_ctrl *nvmet_req_to_ctrl(struct nvmet_req *req) +{ + return req->sq->ctrl; +} + +static inline void __assign_disk_name(char *name, struct nvmet_req *req, + bool init) +{ + struct nvmet_ctrl *ctrl = nvmet_req_to_ctrl(req); + struct nvmet_ns *ns; + + if ((init && req->sq->qid) || (!init && req->cq->qid)) { + ns = nvmet_find_namespace(ctrl, req->cmd->rw.nsid); + strncpy(name, ns->device_path, DISK_NAME_LEN); + return; + } + + memset(name, 0, DISK_NAME_LEN); +} +#endif + +TRACE_EVENT(nvmet_req_init, + TP_PROTO(struct nvmet_req *req, struct nvme_command *cmd), + TP_ARGS(req, cmd), + TP_STRUCT__entry( + __field(struct nvme_command *, cmd) + __field(struct nvmet_ctrl *, ctrl) + __array(char, disk, DISK_NAME_LEN) + __field(int, qid) + __field(u16, cid) + __field(u8, opcode) + __field(u8, fctype) + __field(u8, flags) + __field(u32, nsid) + __field(u64, metadata) + __array(u8, cdw10, 24) + ), + TP_fast_assign( + __entry->cmd = cmd; + __entry->ctrl = nvmet_req_to_ctrl(req); + __assign_disk_name(__entry->disk, req, true); + __entry->qid = req->sq->qid; + __entry->cid = cmd->common.command_id; + __entry->opcode = cmd->common.opcode; + __entry->fctype = cmd->fabrics.fctype; + __entry->flags = cmd->common.flags; + __entry->nsid = le32_to_cpu(cmd->common.nsid); + __entry->metadata = le64_to_cpu(cmd->common.metadata); + memcpy(__entry->cdw10, &cmd->common.cdw10, + sizeof(__entry->cdw10)); + ), + TP_printk("nvmet%s: %sqid=%d, cmdid=%u, nsid=%u, flags=%#x, " + "meta=%#llx, cmd=(%s, %s)", + __print_ctrl_name(__entry->ctrl), + __print_disk_name(__entry->disk), + __entry->qid, __entry->cid, __entry->nsid, + __entry->flags, __entry->metadata, + show_opcode_name(__entry->qid, __entry->opcode, + __entry->fctype), + parse_nvme_cmd(__entry->qid, __entry->opcode, + __entry->fctype, __entry->cdw10)) +); + +TRACE_EVENT(nvmet_req_complete, + TP_PROTO(struct nvmet_req *req), + TP_ARGS(req), + TP_STRUCT__entry( + __field(struct nvmet_ctrl *, ctrl) + __array(char, disk, DISK_NAME_LEN) + __field(int, qid) + __field(int, cid) + __field(u64, result) + __field(u16, status) + ), + TP_fast_assign( + __entry->ctrl = nvmet_req_to_ctrl(req); + __entry->qid = req->cq->qid; + __entry->cid = req->cqe->command_id; + __entry->result = le64_to_cpu(req->cqe->result.u64); + __entry->status = le16_to_cpu(req->cqe->status) >> 1; + __assign_disk_name(__entry->disk, req, false); + ), + TP_printk("nvmet%s: %sqid=%d, cmdid=%u, res=%#llx, status=%#x", + __print_ctrl_name(__entry->ctrl), + __print_disk_name(__entry->disk), + __entry->qid, __entry->cid, __entry->result, __entry->status) + +); + +#endif /* _TRACE_NVMET_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include From a3646451edd52ba238cbe4f618aaf6eb9bf9d60c Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Thu, 20 Jun 2019 08:49:02 +0200 Subject: [PATCH 076/153] nvme: prepare for fault injection into admin commands Currenlty fault injection support for nvme only enables to inject errors into the commands submitted to I/O queues. In preparation for fault injection into the admin commands, this makes the helper functions independent of struct nvme_ns. Signed-off-by: Akinobu Mita Reviewed-by: Minwoo Im Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 4 ++-- drivers/nvme/host/fault_inject.c | 36 ++++++++++++++++++-------------- drivers/nvme/host/nvme.h | 34 +++++++++++++++++------------- 3 files changed, 41 insertions(+), 33 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3b3960e0c31f..625605f8a0b5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3336,7 +3336,7 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups); nvme_mpath_add_disk(ns, id); - nvme_fault_inject_init(ns); + nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name); kfree(id); return 0; @@ -3361,7 +3361,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) return; - nvme_fault_inject_fini(ns); + nvme_fault_inject_fini(&ns->fault_inject); mutex_lock(&ns->ctrl->subsys->lock); list_del_rcu(&ns->siblings); diff --git a/drivers/nvme/host/fault_inject.c b/drivers/nvme/host/fault_inject.c index 4cfd2c9222d4..e37b8c2fddea 100644 --- a/drivers/nvme/host/fault_inject.c +++ b/drivers/nvme/host/fault_inject.c @@ -15,11 +15,10 @@ static DECLARE_FAULT_ATTR(fail_default_attr); static char *fail_request; module_param(fail_request, charp, 0000); -void nvme_fault_inject_init(struct nvme_ns *ns) +void nvme_fault_inject_init(struct nvme_fault_inject *fault_inj, + const char *dev_name) { struct dentry *dir, *parent; - char *name = ns->disk->disk_name; - struct nvme_fault_inject *fault_inj = &ns->fault_inject; struct fault_attr *attr = &fault_inj->attr; /* set default fault injection attribute */ @@ -27,20 +26,20 @@ void nvme_fault_inject_init(struct nvme_ns *ns) setup_fault_attr(&fail_default_attr, fail_request); /* create debugfs directory and attribute */ - parent = debugfs_create_dir(name, NULL); + parent = debugfs_create_dir(dev_name, NULL); if (!parent) { - pr_warn("%s: failed to create debugfs directory\n", name); + pr_warn("%s: failed to create debugfs directory\n", dev_name); return; } *attr = fail_default_attr; dir = fault_create_debugfs_attr("fault_inject", parent, attr); if (IS_ERR(dir)) { - pr_warn("%s: failed to create debugfs attr\n", name); + pr_warn("%s: failed to create debugfs attr\n", dev_name); debugfs_remove_recursive(parent); return; } - ns->fault_inject.parent = parent; + fault_inj->parent = parent; /* create debugfs for status code and dont_retry */ fault_inj->status = NVME_SC_INVALID_OPCODE; @@ -49,29 +48,34 @@ void nvme_fault_inject_init(struct nvme_ns *ns) debugfs_create_bool("dont_retry", 0600, dir, &fault_inj->dont_retry); } -void nvme_fault_inject_fini(struct nvme_ns *ns) +void nvme_fault_inject_fini(struct nvme_fault_inject *fault_inject) { /* remove debugfs directories */ - debugfs_remove_recursive(ns->fault_inject.parent); + debugfs_remove_recursive(fault_inject->parent); } void nvme_should_fail(struct request *req) { struct gendisk *disk = req->rq_disk; - struct nvme_ns *ns = NULL; + struct nvme_fault_inject *fault_inject = NULL; u16 status; /* * make sure this request is coming from a valid namespace */ - if (!disk) - return; + if (disk) { + struct nvme_ns *ns = disk->private_data; - ns = disk->private_data; - if (ns && should_fail(&ns->fault_inject.attr, 1)) { + if (ns) + fault_inject = &ns->fault_inject; + else + WARN_ONCE(1, "No namespace found for request\n"); + } + + if (fault_inject && should_fail(&fault_inject->attr, 1)) { /* inject status code and DNR bit */ - status = ns->fault_inject.status; - if (ns->fault_inject.dont_retry) + status = fault_inject->status; + if (fault_inject->dont_retry) status |= NVME_SC_DNR; nvme_req(req)->status = status; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 038b8931d9e5..8f907576efb6 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -146,6 +146,15 @@ enum nvme_ctrl_state { NVME_CTRL_DEAD, }; +struct nvme_fault_inject { +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + struct fault_attr attr; + struct dentry *parent; + bool dont_retry; /* DNR, do not retry */ + u16 status; /* status code */ +#endif +}; + struct nvme_ctrl { bool comp_seen; enum nvme_ctrl_state state; @@ -313,15 +322,6 @@ struct nvme_ns_head { #endif }; -#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS -struct nvme_fault_inject { - struct fault_attr attr; - struct dentry *parent; - bool dont_retry; /* DNR, do not retry */ - u16 status; /* status code */ -}; -#endif - struct nvme_ns { struct list_head list; @@ -349,9 +349,7 @@ struct nvme_ns { #define NVME_NS_ANA_PENDING 2 u16 noiob; -#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS struct nvme_fault_inject fault_inject; -#endif }; @@ -372,12 +370,18 @@ struct nvme_ctrl_ops { }; #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS -void nvme_fault_inject_init(struct nvme_ns *ns); -void nvme_fault_inject_fini(struct nvme_ns *ns); +void nvme_fault_inject_init(struct nvme_fault_inject *fault_inj, + const char *dev_name); +void nvme_fault_inject_fini(struct nvme_fault_inject *fault_inject); void nvme_should_fail(struct request *req); #else -static inline void nvme_fault_inject_init(struct nvme_ns *ns) {} -static inline void nvme_fault_inject_fini(struct nvme_ns *ns) {} +static inline void nvme_fault_inject_init(struct nvme_fault_inject *fault_inj, + const char *dev_name) +{ +} +static inline void nvme_fault_inject_fini(struct nvme_fault_inject *fault_inj) +{ +} static inline void nvme_should_fail(struct request *req) {} #endif From f79d5fda4ea08c33a114087573d86f703149ee0e Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sun, 9 Jun 2019 23:17:01 +0900 Subject: [PATCH 077/153] nvme: enable to inject errors into admin commands This enables to inject errors into the commands submitted to the admin queue. It is useful to test error handling in the controller initialization. # echo 100 > /sys/kernel/debug/nvme0/fault_inject/probability # echo 1 > /sys/kernel/debug/nvme0/fault_inject/times # echo 10 > /sys/kernel/debug/nvme0/fault_inject/space # nvme reset /dev/nvme0 # dmesg ... nvme nvme0: Could not set queue count (16385) nvme nvme0: IO queues not created Signed-off-by: Akinobu Mita Reviewed-by: Minwoo Im Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 3 +++ drivers/nvme/host/fault_inject.c | 5 ++--- drivers/nvme/host/nvme.h | 2 ++ 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 625605f8a0b5..b2dd4e391f5c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3722,6 +3722,7 @@ EXPORT_SYMBOL_GPL(nvme_start_ctrl); void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) { + nvme_fault_inject_fini(&ctrl->fault_inject); dev_pm_qos_hide_latency_tolerance(ctrl->device); cdev_device_del(&ctrl->cdev, ctrl->device); } @@ -3817,6 +3818,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, dev_pm_qos_update_user_latency_tolerance(ctrl->device, min(default_ps_max_latency_us, (unsigned long)S32_MAX)); + nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device)); + return 0; out_free_name: kfree_const(ctrl->device->kobj.name); diff --git a/drivers/nvme/host/fault_inject.c b/drivers/nvme/host/fault_inject.c index e37b8c2fddea..1352159733b0 100644 --- a/drivers/nvme/host/fault_inject.c +++ b/drivers/nvme/host/fault_inject.c @@ -60,9 +60,6 @@ void nvme_should_fail(struct request *req) struct nvme_fault_inject *fault_inject = NULL; u16 status; - /* - * make sure this request is coming from a valid namespace - */ if (disk) { struct nvme_ns *ns = disk->private_data; @@ -70,6 +67,8 @@ void nvme_should_fail(struct request *req) fault_inject = &ns->fault_inject; else WARN_ONCE(1, "No namespace found for request\n"); + } else { + fault_inject = &nvme_req(req)->ctrl->fault_inject; } if (fault_inject && should_fail(&fault_inject->attr, 1)) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 8f907576efb6..ea45d7d393ad 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -256,6 +256,8 @@ struct nvme_ctrl { struct page *discard_page; unsigned long discard_page_busy; + + struct nvme_fault_inject fault_inject; }; enum nvme_iopolicy { From 7e31d8215fd8cb1c13b47e23f1136545010e00de Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sun, 9 Jun 2019 23:17:02 +0900 Subject: [PATCH 078/153] Documentation: nvme: add an example for nvme fault injection This adds an example of how to inject errors into admin commands. Suggested-by: Thomas Tai Signed-off-by: Akinobu Mita Reviewed-by: Chaitanya Kulkarni Reviewed-by: Minwoo Im Reviewed-by: Christoph Hellwig Signed-off-by: Christoph Hellwig --- .../fault-injection/nvme-fault-injection.txt | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/Documentation/fault-injection/nvme-fault-injection.txt b/Documentation/fault-injection/nvme-fault-injection.txt index 8fbf3bf60b62..efcb339a3add 100644 --- a/Documentation/fault-injection/nvme-fault-injection.txt +++ b/Documentation/fault-injection/nvme-fault-injection.txt @@ -114,3 +114,59 @@ R13: ffff88011a3c9680 R14: 0000000000000000 R15: 0000000000000000 cpu_startup_entry+0x6f/0x80 start_secondary+0x187/0x1e0 secondary_startup_64+0xa5/0xb0 + +Example 3: Inject an error into the 10th admin command +------------------------------------------------------ + +echo 100 > /sys/kernel/debug/nvme0/fault_inject/probability +echo 10 > /sys/kernel/debug/nvme0/fault_inject/space +echo 1 > /sys/kernel/debug/nvme0/fault_inject/times +nvme reset /dev/nvme0 + +Expected Result: + +After NVMe controller reset, the reinitialization may or may not succeed. +It depends on which admin command is actually forced to fail. + +Message from dmesg: + +nvme nvme0: resetting controller +FAULT_INJECTION: forcing a failure. +name fault_inject, interval 1, probability 100, space 1, times 1 +CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.2.0-rc2+ #2 +Hardware name: MSI MS-7A45/B150M MORTAR ARCTIC (MS-7A45), BIOS 1.50 04/25/2017 +Call Trace: + + dump_stack+0x63/0x85 + should_fail+0x14a/0x170 + nvme_should_fail+0x38/0x80 [nvme_core] + nvme_irq+0x129/0x280 [nvme] + ? blk_mq_end_request+0xb3/0x120 + __handle_irq_event_percpu+0x84/0x1a0 + handle_irq_event_percpu+0x32/0x80 + handle_irq_event+0x3b/0x60 + handle_edge_irq+0x7f/0x1a0 + handle_irq+0x20/0x30 + do_IRQ+0x4e/0xe0 + common_interrupt+0xf/0xf + +RIP: 0010:cpuidle_enter_state+0xc5/0x460 +Code: ff e8 8f 5f 86 ff 80 7d c7 00 74 17 9c 58 0f 1f 44 00 00 f6 c4 02 0f 85 69 03 00 00 31 ff e8 62 aa 8c ff fb 66 0f 1f 44 00 00 <45> 85 ed 0f 88 37 03 00 00 4c 8b 45 d0 4c 2b 45 b8 48 ba cf f7 53 +RSP: 0018:ffffffff88c03dd0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffdc +RAX: ffff9dac25a2ac80 RBX: ffffffff88d53760 RCX: 000000000000001f +RDX: 0000000000000000 RSI: 000000002d958403 RDI: 0000000000000000 +RBP: ffffffff88c03e18 R08: fffffff75e35ffb7 R09: 00000a49a56c0b48 +R10: ffffffff88c03da0 R11: 0000000000001b0c R12: ffff9dac25a34d00 +R13: 0000000000000006 R14: 0000000000000006 R15: ffffffff88d53760 + cpuidle_enter+0x2e/0x40 + call_cpuidle+0x23/0x40 + do_idle+0x201/0x280 + cpu_startup_entry+0x1d/0x20 + rest_init+0xaa/0xb0 + arch_call_rest_init+0xe/0x1b + start_kernel+0x51c/0x53b + x86_64_start_reservations+0x24/0x26 + x86_64_start_kernel+0x74/0x77 + secondary_startup_64+0xa4/0xb0 +nvme nvme0: Could not set queue count (16385) +nvme nvme0: IO queues not created From 766d61412ef840295f55e98e2c5fb0fc110c6ca4 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 25 Jun 2019 07:12:43 +0200 Subject: [PATCH 079/153] block, bfq: reset inject limit when think-time state changes Until the base value of the request service times gets finally computed for a bfq_queue, the inject limit does depend on the think-time state (short|long). The limit must be 0 or 1 if the think time is deemed, respectively, as short or long. However, such a check and possible limit update is performed only periodically, once per second. So, to make the injection mechanism much more reactive, this commit performs the update also every time the think-time state changes. In addition, in the following special case, this commit lets the inject limit of a bfq_queue bfqq remain equal to 1 even if bfqq's think time is short: bfqq's I/O is synchronized with that of some other queue, i.e., bfqq may receive new I/O only after the I/O of the other queue is completed. Keeping the inject limit to 1 allows the blocking I/O to be served while bfqq is in service. And this is very convenient both for bfqq and for the total throughput, as explained in detail in the comments in bfq_update_has_short_ttime(). Reported-by: Srivatsa S. Bhat (VMware) Tested-by: Srivatsa S. Bhat (VMware) Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 219 ++++++++++++++++++++++++++++++-------------- 1 file changed, 151 insertions(+), 68 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 44c6bbcd7720..9bc10198ddff 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1735,6 +1735,72 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, false, BFQQE_PREEMPTED); } +static void bfq_reset_inject_limit(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + /* invalidate baseline total service time */ + bfqq->last_serv_time_ns = 0; + + /* + * Reset pointer in case we are waiting for + * some request completion. + */ + bfqd->waited_rq = NULL; + + /* + * If bfqq has a short think time, then start by setting the + * inject limit to 0 prudentially, because the service time of + * an injected I/O request may be higher than the think time + * of bfqq, and therefore, if one request was injected when + * bfqq remains empty, this injected request might delay the + * service of the next I/O request for bfqq significantly. In + * case bfqq can actually tolerate some injection, then the + * adaptive update will however raise the limit soon. This + * lucky circumstance holds exactly because bfqq has a short + * think time, and thus, after remaining empty, is likely to + * get new I/O enqueued---and then completed---before being + * expired. This is the very pattern that gives the + * limit-update algorithm the chance to measure the effect of + * injection on request service times, and then to update the + * limit accordingly. + * + * However, in the following special case, the inject limit is + * left to 1 even if the think time is short: bfqq's I/O is + * synchronized with that of some other queue, i.e., bfqq may + * receive new I/O only after the I/O of the other queue is + * completed. Keeping the inject limit to 1 allows the + * blocking I/O to be served while bfqq is in service. And + * this is very convenient both for bfqq and for overall + * throughput, as explained in detail in the comments in + * bfq_update_has_short_ttime(). + * + * On the opposite end, if bfqq has a long think time, then + * start directly by 1, because: + * a) on the bright side, keeping at most one request in + * service in the drive is unlikely to cause any harm to the + * latency of bfqq's requests, as the service time of a single + * request is likely to be lower than the think time of bfqq; + * b) on the downside, after becoming empty, bfqq is likely to + * expire before getting its next request. With this request + * arrival pattern, it is very hard to sample total service + * times and update the inject limit accordingly (see comments + * on bfq_update_inject_limit()). So the limit is likely to be + * never, or at least seldom, updated. As a consequence, by + * setting the limit to 1, we avoid that no injection ever + * occurs with bfqq. On the downside, this proactive step + * further reduces chances to actually compute the baseline + * total service time. Thus it reduces chances to execute the + * limit-update algorithm and possibly raise the limit to more + * than 1. + */ + if (bfq_bfqq_has_short_ttime(bfqq)) + bfqq->inject_limit = 0; + else + bfqq->inject_limit = 1; + + bfqq->decrease_time_jif = jiffies; +} + static void bfq_add_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); @@ -1755,71 +1821,8 @@ static void bfq_add_request(struct request *rq) * bfq_update_inject_limit(). */ if (time_is_before_eq_jiffies(bfqq->decrease_time_jif + - msecs_to_jiffies(1000))) { - /* invalidate baseline total service time */ - bfqq->last_serv_time_ns = 0; - - /* - * Reset pointer in case we are waiting for - * some request completion. - */ - bfqd->waited_rq = NULL; - - /* - * If bfqq has a short think time, then start - * by setting the inject limit to 0 - * prudentially, because the service time of - * an injected I/O request may be higher than - * the think time of bfqq, and therefore, if - * one request was injected when bfqq remains - * empty, this injected request might delay - * the service of the next I/O request for - * bfqq significantly. In case bfqq can - * actually tolerate some injection, then the - * adaptive update will however raise the - * limit soon. This lucky circumstance holds - * exactly because bfqq has a short think - * time, and thus, after remaining empty, is - * likely to get new I/O enqueued---and then - * completed---before being expired. This is - * the very pattern that gives the - * limit-update algorithm the chance to - * measure the effect of injection on request - * service times, and then to update the limit - * accordingly. - * - * On the opposite end, if bfqq has a long - * think time, then start directly by 1, - * because: - * a) on the bright side, keeping at most one - * request in service in the drive is unlikely - * to cause any harm to the latency of bfqq's - * requests, as the service time of a single - * request is likely to be lower than the - * think time of bfqq; - * b) on the downside, after becoming empty, - * bfqq is likely to expire before getting its - * next request. With this request arrival - * pattern, it is very hard to sample total - * service times and update the inject limit - * accordingly (see comments on - * bfq_update_inject_limit()). So the limit is - * likely to be never, or at least seldom, - * updated. As a consequence, by setting the - * limit to 1, we avoid that no injection ever - * occurs with bfqq. On the downside, this - * proactive step further reduces chances to - * actually compute the baseline total service - * time. Thus it reduces chances to execute the - * limit-update algorithm and possibly raise the - * limit to more than 1. - */ - if (bfq_bfqq_has_short_ttime(bfqq)) - bfqq->inject_limit = 0; - else - bfqq->inject_limit = 1; - bfqq->decrease_time_jif = jiffies; - } + msecs_to_jiffies(1000))) + bfq_reset_inject_limit(bfqd, bfqq); /* * The following conditions must hold to setup a new @@ -4855,7 +4858,7 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_io_cq *bic) { - bool has_short_ttime = true; + bool has_short_ttime = true, state_changed; /* * No need to update has_short_ttime if bfqq is async or in @@ -4880,13 +4883,93 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) has_short_ttime = false; - bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", - has_short_ttime); + state_changed = has_short_ttime != bfq_bfqq_has_short_ttime(bfqq); if (has_short_ttime) bfq_mark_bfqq_has_short_ttime(bfqq); else bfq_clear_bfqq_has_short_ttime(bfqq); + + /* + * Until the base value for the total service time gets + * finally computed for bfqq, the inject limit does depend on + * the think-time state (short|long). In particular, the limit + * is 0 or 1 if the think time is deemed, respectively, as + * short or long (details in the comments in + * bfq_update_inject_limit()). Accordingly, the next + * instructions reset the inject limit if the think-time state + * has changed and the above base value is still to be + * computed. + * + * However, the reset is performed only if more than 100 ms + * have elapsed since the last update of the inject limit, or + * (inclusive) if the change is from short to long think + * time. The reason for this waiting is as follows. + * + * bfqq may have a long think time because of a + * synchronization with some other queue, i.e., because the + * I/O of some other queue may need to be completed for bfqq + * to receive new I/O. This happens, e.g., if bfqq is + * associated with a process that does some sync. A sync + * generates extra blocking I/O, which must be completed + * before the process associated with bfqq can go on with its + * I/O. + * + * If such a synchronization is actually in place, then, + * without injection on bfqq, the blocking I/O cannot happen + * to served while bfqq is in service. As a consequence, if + * bfqq is granted I/O-dispatch-plugging, then bfqq remains + * empty, and no I/O is dispatched, until the idle timeout + * fires. This is likely to result in lower bandwidth and + * higher latencies for bfqq, and in a severe loss of total + * throughput. + * + * On the opposite end, a non-zero inject limit may allow the + * I/O that blocks bfqq to be executed soon, and therefore + * bfqq to receive new I/O soon. But, if this actually + * happens, then the next think-time sample for bfqq may be + * very low. This in turn may cause bfqq's think time to be + * deemed short. Without the 100 ms barrier, this new state + * change would cause the body of the next if to be executed + * immediately. But this would set to 0 the inject + * limit. Without injection, the blocking I/O would cause the + * think time of bfqq to become long again, and therefore the + * inject limit to be raised again, and so on. The only effect + * of such a steady oscillation between the two think-time + * states would be to prevent effective injection on bfqq. + * + * In contrast, if the inject limit is not reset during such a + * long time interval as 100 ms, then the number of short + * think time samples can grow significantly before the reset + * is allowed. As a consequence, the think time state can + * become stable before the reset. There will be no state + * change when the 100 ms elapse, and therefore no reset of + * the inject limit. The inject limit remains steadily equal + * to 1 both during and after the 100 ms. So injection can be + * performed at all times, and throughput gets boosted. + * + * An inject limit equal to 1 is however in conflict, in + * general, with the fact that the think time of bfqq is + * short, because injection may be likely to delay bfqq's I/O + * (as explained in the comments in + * bfq_update_inject_limit()). But this does not happen in + * this special case, because bfqq's low think time is due to + * an effective handling of a synchronization, through + * injection. In this special case, bfqq's I/O does not get + * delayed by injection; on the contrary, bfqq's I/O is + * brought forward, because it is not blocked for + * milliseconds. + * + * In addition, during the 100 ms, the base value for the + * total service time is likely to get finally computed, + * freeing the inject limit from its relation with the think + * time. + */ + if (state_changed && bfqq->last_serv_time_ns == 0 && + (time_is_before_eq_jiffies(bfqq->decrease_time_jif + + msecs_to_jiffies(100)) || + !has_short_ttime)) + bfq_reset_inject_limit(bfqd, bfqq); } /* From db599f9ed9bd31b018b6c48ad7c6b21d5b790ecf Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 25 Jun 2019 07:12:44 +0200 Subject: [PATCH 080/153] block, bfq: fix rq_in_driver check in bfq_update_inject_limit One of the cases where the parameters for injection may be updated is when there are no more in-flight I/O requests. The number of in-flight requests is stored in the field bfqd->rq_in_driver of the descriptor bfqd of the device. So, the controlled condition is bfqd->rq_in_driver == 0. Unfortunately, this is wrong because, the instruction that checks this condition is in the code path that handles the completion of a request, and, in particular, the instruction is executed before bfqd->rq_in_driver is decremented in such a code path. This commit fixes this issue by just replacing 0 with 1 in the comparison. Reported-by: Srivatsa S. Bhat (VMware) Tested-by: Srivatsa S. Bhat (VMware) Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 9bc10198ddff..05041f84b8da 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5481,8 +5481,14 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, * total service time, and there seem to be the right * conditions to do it, or we can lower the last base value * computed. + * + * NOTE: (bfqd->rq_in_driver == 1) means that there is no I/O + * request in flight, because this function is in the code + * path that handles the completion of a request of bfqq, and, + * in particular, this function is executed before + * bfqd->rq_in_driver is decremented in such a code path. */ - if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 0) || + if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) || tot_time_ns < bfqq->last_serv_time_ns) { bfqq->last_serv_time_ns = tot_time_ns; /* From 24792ad01cb659c8b5899de2af6e8ca250f93df3 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 25 Jun 2019 07:12:45 +0200 Subject: [PATCH 081/153] block, bfq: update base request service times when possible I/O injection gets reduced if it increases the request service times of the victim queue beyond a certain threshold. The threshold, in its turn, is computed as a function of the base service time enjoyed by the queue when it undergoes no injection. As a consequence, for injection to work properly, the above base value has to be accurate. In this respect, such a value may vary over time. For example, it varies if the size or the spatial locality of the I/O requests in the queue change. It is then important to update this value whenever possible. This commit performs this update. Reported-by: Srivatsa S. Bhat (VMware) Tested-by: Srivatsa S. Bhat (VMware) Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 05041f84b8da..62442083b147 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5496,7 +5496,18 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, * start trying injection. */ bfqq->inject_limit = max_t(unsigned int, 1, old_limit); - } + } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1) + /* + * No I/O injected and no request still in service in + * the drive: these are the exact conditions for + * computing the base value of the total service time + * for bfqq. So let's update this value, because it is + * rather variable. For example, it varies if the size + * or the spatial locality of the I/O requests in bfqq + * change. + */ + bfqq->last_serv_time_ns = tot_time_ns; + /* update complete, not waiting for any request completion any longer */ bfqd->waited_rq = NULL; From a3f9bce3697a5b4039ff7096db4a1ee897349276 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 25 Jun 2019 07:12:46 +0200 Subject: [PATCH 082/153] block, bfq: bring forward seek&think time update Until the base value for request service times gets finally computed for a bfq_queue, the inject limit for that queue does depend on the think-time state (short|long) of the queue. A timely update of the think time then guarantees a quicker activation or deactivation of the injection. Fortunately, the think time of a bfq_queue is updated in the same code path as the inject limit; but after the inject limit. This commits moves the update of the think time before the update of the inject limit. For coherence, it moves the update of the seek time too. Reported-by: Srivatsa S. Bhat (VMware) Tested-by: Srivatsa S. Bhat (VMware) Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 62442083b147..d5bc32371ace 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4979,19 +4979,9 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct request *rq) { - struct bfq_io_cq *bic = RQ_BIC(rq); - if (rq->cmd_flags & REQ_META) bfqq->meta_pending++; - bfq_update_io_thinktime(bfqd, bfqq); - bfq_update_has_short_ttime(bfqd, bfqq, bic); - bfq_update_io_seektime(bfqd, bfqq, rq); - - bfq_log_bfqq(bfqd, bfqq, - "rq_enqueued: has_short_ttime=%d (seeky %d)", - bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); - bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { @@ -5079,6 +5069,10 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) bfqq = new_bfqq; } + bfq_update_io_thinktime(bfqd, bfqq); + bfq_update_has_short_ttime(bfqd, bfqq, RQ_BIC(rq)); + bfq_update_io_seektime(bfqd, bfqq, rq); + waiting = bfqq && bfq_bfqq_wait_request(bfqq); bfq_add_request(rq); idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq); From 13a857a4c4e826c587cde3a69bc3d1162d247d9d Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 25 Jun 2019 07:12:47 +0200 Subject: [PATCH 083/153] block, bfq: detect wakers and unconditionally inject their I/O A bfq_queue Q may happen to be synchronized with another bfq_queue Q2, i.e., the I/O of Q2 may need to be completed for Q to receive new I/O. We call Q2 "waker queue". If I/O plugging is being performed for Q, and Q is not receiving any more I/O because of the above synchronization, then, thanks to BFQ's injection mechanism, the waker queue is likely to get served before the I/O-plugging timeout fires. Unfortunately, this fact may not be sufficient to guarantee a high throughput during the I/O plugging, because the inject limit for Q may be too low to guarantee a lot of injected I/O. In addition, the duration of the plugging, i.e., the time before Q finally receives new I/O, may not be minimized, because the waker queue may happen to be served only after other queues. To address these issues, this commit introduces the explicit detection of the waker queue, and the unconditional injection of a pending I/O request of the waker queue on each invocation of bfq_dispatch_request(). One may be concerned that this systematic injection of I/O from the waker queue delays the service of Q's I/O. Fortunately, it doesn't. On the contrary, next Q's I/O is brought forward dramatically, for it is not blocked for milliseconds. Reported-by: Srivatsa S. Bhat (VMware) Tested-by: Srivatsa S. Bhat (VMware) Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 270 ++++++++++++++++++++++++++++++++++++++------ block/bfq-iosched.h | 25 +++- 2 files changed, 261 insertions(+), 34 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index d5bc32371ace..9e2fbb7d1fb6 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -157,6 +157,7 @@ BFQ_BFQQ_FNS(in_large_burst); BFQ_BFQQ_FNS(coop); BFQ_BFQQ_FNS(split_coop); BFQ_BFQQ_FNS(softrt_update); +BFQ_BFQQ_FNS(has_waker); #undef BFQ_BFQQ_FNS \ /* Expiration time of sync (0) and async (1) requests, in ns. */ @@ -1814,6 +1815,111 @@ static void bfq_add_request(struct request *rq) bfqd->queued++; if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) { + /* + * Detect whether bfqq's I/O seems synchronized with + * that of some other queue, i.e., whether bfqq, after + * remaining empty, happens to receive new I/O only + * right after some I/O request of the other queue has + * been completed. We call waker queue the other + * queue, and we assume, for simplicity, that bfqq may + * have at most one waker queue. + * + * A remarkable throughput boost can be reached by + * unconditionally injecting the I/O of the waker + * queue, every time a new bfq_dispatch_request + * happens to be invoked while I/O is being plugged + * for bfqq. In addition to boosting throughput, this + * unblocks bfqq's I/O, thereby improving bandwidth + * and latency for bfqq. Note that these same results + * may be achieved with the general injection + * mechanism, but less effectively. For details on + * this aspect, see the comments on the choice of the + * queue for injection in bfq_select_queue(). + * + * Turning back to the detection of a waker queue, a + * queue Q is deemed as a waker queue for bfqq if, for + * two consecutive times, bfqq happens to become non + * empty right after a request of Q has been + * completed. In particular, on the first time, Q is + * tentatively set as a candidate waker queue, while + * on the second time, the flag + * bfq_bfqq_has_waker(bfqq) is set to confirm that Q + * is a waker queue for bfqq. These detection steps + * are performed only if bfqq has a long think time, + * so as to make it more likely that bfqq's I/O is + * actually being blocked by a synchronization. This + * last filter, plus the above two-times requirement, + * make false positives less likely. + * + * NOTE + * + * The sooner a waker queue is detected, the sooner + * throughput can be boosted by injecting I/O from the + * waker queue. Fortunately, detection is likely to be + * actually fast, for the following reasons. While + * blocked by synchronization, bfqq has a long think + * time. This implies that bfqq's inject limit is at + * least equal to 1 (see the comments in + * bfq_update_inject_limit()). So, thanks to + * injection, the waker queue is likely to be served + * during the very first I/O-plugging time interval + * for bfqq. This triggers the first step of the + * detection mechanism. Thanks again to injection, the + * candidate waker queue is then likely to be + * confirmed no later than during the next + * I/O-plugging interval for bfqq. + */ + if (!bfq_bfqq_has_short_ttime(bfqq) && + ktime_get_ns() - bfqd->last_completion < + 200 * NSEC_PER_USEC) { + if (bfqd->last_completed_rq_bfqq != bfqq && + bfqd->last_completed_rq_bfqq != + bfqq->waker_bfqq) { + /* + * First synchronization detected with + * a candidate waker queue, or with a + * different candidate waker queue + * from the current one. + */ + bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq; + + /* + * If the waker queue disappears, then + * bfqq->waker_bfqq must be reset. To + * this goal, we maintain in each + * waker queue a list, woken_list, of + * all the queues that reference the + * waker queue through their + * waker_bfqq pointer. When the waker + * queue exits, the waker_bfqq pointer + * of all the queues in the woken_list + * is reset. + * + * In addition, if bfqq is already in + * the woken_list of a waker queue, + * then, before being inserted into + * the woken_list of a new waker + * queue, bfqq must be removed from + * the woken_list of the old waker + * queue. + */ + if (!hlist_unhashed(&bfqq->woken_list_node)) + hlist_del_init(&bfqq->woken_list_node); + hlist_add_head(&bfqq->woken_list_node, + &bfqd->last_completed_rq_bfqq->woken_list); + + bfq_clear_bfqq_has_waker(bfqq); + } else if (bfqd->last_completed_rq_bfqq == + bfqq->waker_bfqq && + !bfq_bfqq_has_waker(bfqq)) { + /* + * synchronization with waker_bfqq + * seen for the second time + */ + bfq_mark_bfqq_has_waker(bfqq); + } + } + /* * Periodically reset inject limit, to make sure that * the latter eventually drops in case workload @@ -4164,18 +4270,89 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) bfqq->bic->bfqq[0] : NULL; /* - * If the process associated with bfqq has also async - * I/O pending, then inject it - * unconditionally. Injecting I/O from the same - * process can cause no harm to the process. On the - * contrary, it can only increase bandwidth and reduce - * latency for the process. + * The next three mutually-exclusive ifs decide + * whether to try injection, and choose the queue to + * pick an I/O request from. + * + * The first if checks whether the process associated + * with bfqq has also async I/O pending. If so, it + * injects such I/O unconditionally. Injecting async + * I/O from the same process can cause no harm to the + * process. On the contrary, it can only increase + * bandwidth and reduce latency for the process. + * + * The second if checks whether there happens to be a + * non-empty waker queue for bfqq, i.e., a queue whose + * I/O needs to be completed for bfqq to receive new + * I/O. This happens, e.g., if bfqq is associated with + * a process that does some sync. A sync generates + * extra blocking I/O, which must be completed before + * the process associated with bfqq can go on with its + * I/O. If the I/O of the waker queue is not served, + * then bfqq remains empty, and no I/O is dispatched, + * until the idle timeout fires for bfqq. This is + * likely to result in lower bandwidth and higher + * latencies for bfqq, and in a severe loss of total + * throughput. The best action to take is therefore to + * serve the waker queue as soon as possible. So do it + * (without relying on the third alternative below for + * eventually serving waker_bfqq's I/O; see the last + * paragraph for further details). This systematic + * injection of I/O from the waker queue does not + * cause any delay to bfqq's I/O. On the contrary, + * next bfqq's I/O is brought forward dramatically, + * for it is not blocked for milliseconds. + * + * The third if checks whether bfqq is a queue for + * which it is better to avoid injection. It is so if + * bfqq delivers more throughput when served without + * any further I/O from other queues in the middle, or + * if the service times of bfqq's I/O requests both + * count more than overall throughput, and may be + * easily increased by injection (this happens if bfqq + * has a short think time). If none of these + * conditions holds, then a candidate queue for + * injection is looked for through + * bfq_choose_bfqq_for_injection(). Note that the + * latter may return NULL (for example if the inject + * limit for bfqq is currently 0). + * + * NOTE: motivation for the second alternative + * + * Thanks to the way the inject limit is updated in + * bfq_update_has_short_ttime(), it is rather likely + * that, if I/O is being plugged for bfqq and the + * waker queue has pending I/O requests that are + * blocking bfqq's I/O, then the third alternative + * above lets the waker queue get served before the + * I/O-plugging timeout fires. So one may deem the + * second alternative superfluous. It is not, because + * the third alternative may be way less effective in + * case of a synchronization. For two main + * reasons. First, throughput may be low because the + * inject limit may be too low to guarantee the same + * amount of injected I/O, from the waker queue or + * other queues, that the second alternative + * guarantees (the second alternative unconditionally + * injects a pending I/O request of the waker queue + * for each bfq_dispatch_request()). Second, with the + * third alternative, the duration of the plugging, + * i.e., the time before bfqq finally receives new I/O, + * may not be minimized, because the waker queue may + * happen to be served only after other queues. */ if (async_bfqq && icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic && bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <= bfq_bfqq_budget_left(async_bfqq)) bfqq = bfqq->bic->bfqq[0]; + else if (bfq_bfqq_has_waker(bfqq) && + bfq_bfqq_busy(bfqq->waker_bfqq) && + bfq_serv_to_charge(bfqq->waker_bfqq->next_rq, + bfqq->waker_bfqq) <= + bfq_bfqq_budget_left(bfqq->waker_bfqq) + ) + bfqq = bfqq->waker_bfqq; else if (!idling_boosts_thr_without_issues(bfqd, bfqq) && (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 || !bfq_bfqq_has_short_ttime(bfqq))) @@ -4564,6 +4741,9 @@ static void bfq_put_cooperator(struct bfq_queue *bfqq) static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) { + struct bfq_queue *item; + struct hlist_node *n; + if (bfqq == bfqd->in_service_queue) { __bfq_bfqq_expire(bfqd, bfqq); bfq_schedule_dispatch(bfqd); @@ -4573,6 +4753,18 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_put_cooperator(bfqq); + /* remove bfqq from woken list */ + if (!hlist_unhashed(&bfqq->woken_list_node)) + hlist_del_init(&bfqq->woken_list_node); + + /* reset waker for all queues in woken list */ + hlist_for_each_entry_safe(item, n, &bfqq->woken_list, + woken_list_node) { + item->waker_bfqq = NULL; + bfq_clear_bfqq_has_waker(item); + hlist_del_init(&item->woken_list_node); + } + bfq_put_queue(bfqq); /* release process reference */ } @@ -4691,6 +4883,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, RB_CLEAR_NODE(&bfqq->entity.rb_node); INIT_LIST_HEAD(&bfqq->fifo); INIT_HLIST_NODE(&bfqq->burst_list_node); + INIT_HLIST_NODE(&bfqq->woken_list_node); + INIT_HLIST_HEAD(&bfqq->woken_list); bfqq->ref = 0; bfqq->bfqd = bfqd; @@ -4909,28 +5103,27 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, * bfqq may have a long think time because of a * synchronization with some other queue, i.e., because the * I/O of some other queue may need to be completed for bfqq - * to receive new I/O. This happens, e.g., if bfqq is - * associated with a process that does some sync. A sync - * generates extra blocking I/O, which must be completed - * before the process associated with bfqq can go on with its - * I/O. + * to receive new I/O. Details in the comments on the choice + * of the queue for injection in bfq_select_queue(). * - * If such a synchronization is actually in place, then, - * without injection on bfqq, the blocking I/O cannot happen - * to served while bfqq is in service. As a consequence, if - * bfqq is granted I/O-dispatch-plugging, then bfqq remains - * empty, and no I/O is dispatched, until the idle timeout - * fires. This is likely to result in lower bandwidth and - * higher latencies for bfqq, and in a severe loss of total - * throughput. + * As stressed in those comments, if such a synchronization is + * actually in place, then, without injection on bfqq, the + * blocking I/O cannot happen to served while bfqq is in + * service. As a consequence, if bfqq is granted + * I/O-dispatch-plugging, then bfqq remains empty, and no I/O + * is dispatched, until the idle timeout fires. This is likely + * to result in lower bandwidth and higher latencies for bfqq, + * and in a severe loss of total throughput. * * On the opposite end, a non-zero inject limit may allow the * I/O that blocks bfqq to be executed soon, and therefore - * bfqq to receive new I/O soon. But, if this actually - * happens, then the next think-time sample for bfqq may be - * very low. This in turn may cause bfqq's think time to be - * deemed short. Without the 100 ms barrier, this new state - * change would cause the body of the next if to be executed + * bfqq to receive new I/O soon. + * + * But, if the blocking gets actually eliminated, then the + * next think-time sample for bfqq may be very low. This in + * turn may cause bfqq's think time to be deemed + * short. Without the 100 ms barrier, this new state change + * would cause the body of the next if to be executed * immediately. But this would set to 0 the inject * limit. Without injection, the blocking I/O would cause the * think time of bfqq to become long again, and therefore the @@ -4941,11 +5134,11 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, * In contrast, if the inject limit is not reset during such a * long time interval as 100 ms, then the number of short * think time samples can grow significantly before the reset - * is allowed. As a consequence, the think time state can - * become stable before the reset. There will be no state - * change when the 100 ms elapse, and therefore no reset of - * the inject limit. The inject limit remains steadily equal - * to 1 both during and after the 100 ms. So injection can be + * is performed. As a consequence, the think time state can + * become stable before the reset. Therefore there will be no + * state change when the 100 ms elapse, and no reset of the + * inject limit. The inject limit remains steadily equal to 1 + * both during and after the 100 ms. So injection can be * performed at all times, and throughput gets boosted. * * An inject limit equal to 1 is however in conflict, in @@ -4960,10 +5153,20 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, * brought forward, because it is not blocked for * milliseconds. * - * In addition, during the 100 ms, the base value for the - * total service time is likely to get finally computed, - * freeing the inject limit from its relation with the think - * time. + * In addition, serving the blocking I/O much sooner, and much + * more frequently than once per I/O-plugging timeout, makes + * it much quicker to detect a waker queue (the concept of + * waker queue is defined in the comments in + * bfq_add_request()). This makes it possible to start sooner + * to boost throughput more effectively, by injecting the I/O + * of the waker queue unconditionally on every + * bfq_dispatch_request(). + * + * One last, important benefit of not resetting the inject + * limit before 100 ms is that, during this time interval, the + * base value for the total service time is likely to get + * finally computed for bfqq, freeing the inject limit from + * its relation with the think time. */ if (state_changed && bfqq->last_serv_time_ns == 0 && (time_is_before_eq_jiffies(bfqq->decrease_time_jif + @@ -5278,6 +5481,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) 1UL<<(BFQ_RATE_SHIFT - 10)) bfq_update_rate_reset(bfqd, NULL); bfqd->last_completion = now_ns; + bfqd->last_completed_rq_bfqq = bfqq; /* * If we are waiting to discover whether the request pattern diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 584d3c9ed8ba..e80adf822bbe 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -357,6 +357,24 @@ struct bfq_queue { /* max service rate measured so far */ u32 max_service_rate; + + /* + * Pointer to the waker queue for this queue, i.e., to the + * queue Q such that this queue happens to get new I/O right + * after some I/O request of Q is completed. For details, see + * the comments on the choice of the queue for injection in + * bfq_select_queue(). + */ + struct bfq_queue *waker_bfqq; + /* node for woken_list, see below */ + struct hlist_node woken_list_node; + /* + * Head of the list of the woken queues for this queue, i.e., + * of the list of the queues for which this queue is a waker + * queue. This list is used to reset the waker_bfqq pointer in + * the woken queues when this queue exits. + */ + struct hlist_head woken_list; }; /** @@ -533,6 +551,9 @@ struct bfq_data { /* time of last request completion (ns) */ u64 last_completion; + /* bfqq owning the last completed rq */ + struct bfq_queue *last_completed_rq_bfqq; + /* time of last transition from empty to non-empty (ns) */ u64 last_empty_occupied_ns; @@ -743,7 +764,8 @@ enum bfqq_state_flags { * update */ BFQQF_coop, /* bfqq is shared */ - BFQQF_split_coop /* shared bfqq will be split */ + BFQQF_split_coop, /* shared bfqq will be split */ + BFQQF_has_waker /* bfqq has a waker queue */ }; #define BFQ_BFQQ_FNS(name) \ @@ -763,6 +785,7 @@ BFQ_BFQQ_FNS(in_large_burst); BFQ_BFQQ_FNS(coop); BFQ_BFQQ_FNS(split_coop); BFQ_BFQQ_FNS(softrt_update); +BFQ_BFQQ_FNS(has_waker); #undef BFQ_BFQQ_FNS /* Expiration reasons. */ From 96a291c38c329910738c002de83a9e3f6bf8c6e7 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 25 Jun 2019 07:12:48 +0200 Subject: [PATCH 084/153] block, bfq: preempt lower-weight or lower-priority queues BFQ enqueues the I/O coming from each process into a separate bfq_queue, and serves bfq_queues one at a time. Each bfq_queue may be served for at most timeout_sync milliseconds (default: 125 ms). This service scheme is prone to the following inaccuracy. While a bfq_queue Q1 is in service, some empty bfq_queue Q2 may receive I/O, and, according to BFQ's scheduling policy, may become the right bfq_queue to serve, in place of the currently in-service bfq_queue. In this respect, postponing the service of Q2 to after the service of Q1 finishes may delay the completion of Q2's I/O, compared with an ideal service in which all non-empty bfq_queues are served in parallel, and every non-empty bfq_queue is served at a rate proportional to the bfq_queue's weight. This additional delay is equal at most to the time Q1 may unjustly remain in service before switching to Q2. If Q1 and Q2 have the same weight, then this time is most likely negligible compared with the completion time to be guaranteed to Q2's I/O. In addition, first, one of the reasons why BFQ may want to serve Q1 for a while is that this boosts throughput and, second, serving Q1 longer reduces BFQ's overhead. As a conclusion, it is usually better not to preempt Q1 if both Q1 and Q2 have the same weight. In contrast, as Q2's weight or priority becomes higher and higher compared with that of Q1, the above delay becomes larger and larger, compared with the I/O completion times that have to be guaranteed to Q2 according to Q2's weight. So reducing this delay may be more important than avoiding the costs of preempting Q1. Accordingly, this commit preempts Q1 if Q2 has a higher weight or a higher priority than Q1. Preemption causes Q1 to be re-scheduled, and triggers a new choice of the next bfq_queue to serve. If Q2 really is the next bfq_queue to serve, then Q2 will be set in service immediately. This change reduces the component of the I/O latency caused by the above delay by about 80%. For example, on an (old) PLEXTOR PX-256M5 SSD, the maximum latency reported by fio drops from 15.1 to 3.2 ms for a process doing sporadic random reads while another process is doing continuous sequential reads. Signed-off-by: Nicola Bottura Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 95 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 75 insertions(+), 20 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 9e2fbb7d1fb6..6a3d05023300 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1428,17 +1428,19 @@ static int bfq_min_budget(struct bfq_data *bfqd) * mechanism may be re-designed in such a way to make it possible to * know whether preemption is needed without needing to update service * trees). In addition, queue preemptions almost always cause random - * I/O, and thus loss of throughput. Because of these facts, the next - * function adopts the following simple scheme to avoid both costly - * operations and too frequent preemptions: it requests the expiration - * of the in-service queue (unconditionally) only for queues that need - * to recover a hole, or that either are weight-raised or deserve to - * be weight-raised. + * I/O, which may in turn cause loss of throughput. Finally, there may + * even be no in-service queue when the next function is invoked (so, + * no queue to compare timestamps with). Because of these facts, the + * next function adopts the following simple scheme to avoid costly + * operations, too frequent preemptions and too many dependencies on + * the state of the scheduler: it requests the expiration of the + * in-service queue (unconditionally) only for queues that need to + * recover a hole. Then it delegates to other parts of the code the + * responsibility of handling the above case 2. */ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool arrived_in_time, - bool wr_or_deserves_wr) + bool arrived_in_time) { struct bfq_entity *entity = &bfqq->entity; @@ -1493,7 +1495,7 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, entity->budget = max_t(unsigned long, bfqq->max_budget, bfq_serv_to_charge(bfqq->next_rq, bfqq)); bfq_clear_bfqq_non_blocking_wait_rq(bfqq); - return wr_or_deserves_wr; + return false; } /* @@ -1611,6 +1613,36 @@ static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, bfqd->bfq_wr_min_idle_time); } + +/* + * Return true if bfqq is in a higher priority class, or has a higher + * weight than the in-service queue. + */ +static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq, + struct bfq_queue *in_serv_bfqq) +{ + int bfqq_weight, in_serv_weight; + + if (bfqq->ioprio_class < in_serv_bfqq->ioprio_class) + return true; + + if (in_serv_bfqq->entity.parent == bfqq->entity.parent) { + bfqq_weight = bfqq->entity.weight; + in_serv_weight = in_serv_bfqq->entity.weight; + } else { + if (bfqq->entity.parent) + bfqq_weight = bfqq->entity.parent->weight; + else + bfqq_weight = bfqq->entity.weight; + if (in_serv_bfqq->entity.parent) + in_serv_weight = in_serv_bfqq->entity.parent->weight; + else + in_serv_weight = in_serv_bfqq->entity.weight; + } + + return bfqq_weight > in_serv_weight; +} + static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, struct bfq_queue *bfqq, int old_wr_coeff, @@ -1655,8 +1687,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, */ bfqq_wants_to_preempt = bfq_bfqq_update_budg_for_activation(bfqd, bfqq, - arrived_in_time, - wr_or_deserves_wr); + arrived_in_time); /* * If bfqq happened to be activated in a burst, but has been @@ -1721,16 +1752,40 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, /* * Expire in-service queue only if preemption may be needed - * for guarantees. In this respect, the function - * next_queue_may_preempt just checks a simple, necessary - * condition, and not a sufficient condition based on - * timestamps. In fact, for the latter condition to be - * evaluated, timestamps would need first to be updated, and - * this operation is quite costly (see the comments on the - * function bfq_bfqq_update_budg_for_activation). + * for guarantees. In particular, we care only about two + * cases. The first is that bfqq has to recover a service + * hole, as explained in the comments on + * bfq_bfqq_update_budg_for_activation(), i.e., that + * bfqq_wants_to_preempt is true. However, if bfqq does not + * carry time-critical I/O, then bfqq's bandwidth is less + * important than that of queues that carry time-critical I/O. + * So, as a further constraint, we consider this case only if + * bfqq is at least as weight-raised, i.e., at least as time + * critical, as the in-service queue. + * + * The second case is that bfqq is in a higher priority class, + * or has a higher weight than the in-service queue. If this + * condition does not hold, we don't care because, even if + * bfqq does not start to be served immediately, the resulting + * delay for bfqq's I/O is however lower or much lower than + * the ideal completion time to be guaranteed to bfqq's I/O. + * + * In both cases, preemption is needed only if, according to + * the timestamps of both bfqq and of the in-service queue, + * bfqq actually is the next queue to serve. So, to reduce + * useless preemptions, the return value of + * next_queue_may_preempt() is considered in the next compound + * condition too. Yet next_queue_may_preempt() just checks a + * simple, necessary condition for bfqq to be the next queue + * to serve. In fact, to evaluate a sufficient condition, the + * timestamps of the in-service queue would need to be + * updated, and this operation is quite costly (see the + * comments on bfq_bfqq_update_budg_for_activation()). */ - if (bfqd->in_service_queue && bfqq_wants_to_preempt && - bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && + if (bfqd->in_service_queue && + ((bfqq_wants_to_preempt && + bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) || + bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) && next_queue_may_preempt(bfqd)) bfq_bfqq_expire(bfqd, bfqd->in_service_queue, false, BFQQE_PREEMPTED); From 3726112ec7316068625a1adefa101b9522c588ba Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 25 Jun 2019 07:12:49 +0200 Subject: [PATCH 085/153] block, bfq: re-schedule empty queues if they deserve I/O plugging Consider, on one side, a bfq_queue Q that remains empty while in service, and, on the other side, the pending I/O of bfq_queues that, according to their timestamps, have to be served after Q. If an uncontrolled amount of I/O from the latter bfq_queues were dispatched while Q is waiting for its new I/O to arrive, then Q's bandwidth guarantees would be violated. To prevent this, I/O dispatch is plugged until Q receives new I/O (except for a properly controlled amount of injected I/O). Unfortunately, preemption breaks I/O-dispatch plugging, for the following reason. Preemption is performed in two steps. First, Q is expired and re-scheduled. Second, the new bfq_queue to serve is chosen. The first step is needed by the second, as the second can be performed only after Q's timestamps have been properly updated (done in the expiration step), and Q has been re-queued for service. This dependency is a consequence of the way how BFQ's scheduling algorithm is currently implemented. But Q is not re-scheduled at all in the first step, because Q is empty. As a consequence, an uncontrolled amount of I/O may be dispatched until Q becomes non empty again. This breaks Q's service guarantees. This commit addresses this issue by re-scheduling Q even if it is empty. This in turn breaks the assumption that all scheduled queues are non empty. Then a few extra checks are now needed. Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 387 +++++++++++++++++++++++--------------------- 1 file changed, 203 insertions(+), 184 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 6a3d05023300..72840ebf953e 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3210,7 +3210,186 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) bfq_remove_request(q, rq); } -static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) +/* + * There is a case where idling does not have to be performed for + * throughput concerns, but to preserve the throughput share of + * the process associated with bfqq. + * + * To introduce this case, we can note that allowing the drive + * to enqueue more than one request at a time, and hence + * delegating de facto final scheduling decisions to the + * drive's internal scheduler, entails loss of control on the + * actual request service order. In particular, the critical + * situation is when requests from different processes happen + * to be present, at the same time, in the internal queue(s) + * of the drive. In such a situation, the drive, by deciding + * the service order of the internally-queued requests, does + * determine also the actual throughput distribution among + * these processes. But the drive typically has no notion or + * concern about per-process throughput distribution, and + * makes its decisions only on a per-request basis. Therefore, + * the service distribution enforced by the drive's internal + * scheduler is likely to coincide with the desired throughput + * distribution only in a completely symmetric, or favorably + * skewed scenario where: + * (i-a) each of these processes must get the same throughput as + * the others, + * (i-b) in case (i-a) does not hold, it holds that the process + * associated with bfqq must receive a lower or equal + * throughput than any of the other processes; + * (ii) the I/O of each process has the same properties, in + * terms of locality (sequential or random), direction + * (reads or writes), request sizes, greediness + * (from I/O-bound to sporadic), and so on; + + * In fact, in such a scenario, the drive tends to treat the requests + * of each process in about the same way as the requests of the + * others, and thus to provide each of these processes with about the + * same throughput. This is exactly the desired throughput + * distribution if (i-a) holds, or, if (i-b) holds instead, this is an + * even more convenient distribution for (the process associated with) + * bfqq. + * + * In contrast, in any asymmetric or unfavorable scenario, device + * idling (I/O-dispatch plugging) is certainly needed to guarantee + * that bfqq receives its assigned fraction of the device throughput + * (see [1] for details). + * + * The problem is that idling may significantly reduce throughput with + * certain combinations of types of I/O and devices. An important + * example is sync random I/O on flash storage with command + * queueing. So, unless bfqq falls in cases where idling also boosts + * throughput, it is important to check conditions (i-a), i(-b) and + * (ii) accurately, so as to avoid idling when not strictly needed for + * service guarantees. + * + * Unfortunately, it is extremely difficult to thoroughly check + * condition (ii). And, in case there are active groups, it becomes + * very difficult to check conditions (i-a) and (i-b) too. In fact, + * if there are active groups, then, for conditions (i-a) or (i-b) to + * become false 'indirectly', it is enough that an active group + * contains more active processes or sub-groups than some other active + * group. More precisely, for conditions (i-a) or (i-b) to become + * false because of such a group, it is not even necessary that the + * group is (still) active: it is sufficient that, even if the group + * has become inactive, some of its descendant processes still have + * some request already dispatched but still waiting for + * completion. In fact, requests have still to be guaranteed their + * share of the throughput even after being dispatched. In this + * respect, it is easy to show that, if a group frequently becomes + * inactive while still having in-flight requests, and if, when this + * happens, the group is not considered in the calculation of whether + * the scenario is asymmetric, then the group may fail to be + * guaranteed its fair share of the throughput (basically because + * idling may not be performed for the descendant processes of the + * group, but it had to be). We address this issue with the following + * bi-modal behavior, implemented in the function + * bfq_asymmetric_scenario(). + * + * If there are groups with requests waiting for completion + * (as commented above, some of these groups may even be + * already inactive), then the scenario is tagged as + * asymmetric, conservatively, without checking any of the + * conditions (i-a), (i-b) or (ii). So the device is idled for bfqq. + * This behavior matches also the fact that groups are created + * exactly if controlling I/O is a primary concern (to + * preserve bandwidth and latency guarantees). + * + * On the opposite end, if there are no groups with requests waiting + * for completion, then only conditions (i-a) and (i-b) are actually + * controlled, i.e., provided that conditions (i-a) or (i-b) holds, + * idling is not performed, regardless of whether condition (ii) + * holds. In other words, only if conditions (i-a) and (i-b) do not + * hold, then idling is allowed, and the device tends to be prevented + * from queueing many requests, possibly of several processes. Since + * there are no groups with requests waiting for completion, then, to + * control conditions (i-a) and (i-b) it is enough to check just + * whether all the queues with requests waiting for completion also + * have the same weight. + * + * Not checking condition (ii) evidently exposes bfqq to the + * risk of getting less throughput than its fair share. + * However, for queues with the same weight, a further + * mechanism, preemption, mitigates or even eliminates this + * problem. And it does so without consequences on overall + * throughput. This mechanism and its benefits are explained + * in the next three paragraphs. + * + * Even if a queue, say Q, is expired when it remains idle, Q + * can still preempt the new in-service queue if the next + * request of Q arrives soon (see the comments on + * bfq_bfqq_update_budg_for_activation). If all queues and + * groups have the same weight, this form of preemption, + * combined with the hole-recovery heuristic described in the + * comments on function bfq_bfqq_update_budg_for_activation, + * are enough to preserve a correct bandwidth distribution in + * the mid term, even without idling. In fact, even if not + * idling allows the internal queues of the device to contain + * many requests, and thus to reorder requests, we can rather + * safely assume that the internal scheduler still preserves a + * minimum of mid-term fairness. + * + * More precisely, this preemption-based, idleless approach + * provides fairness in terms of IOPS, and not sectors per + * second. This can be seen with a simple example. Suppose + * that there are two queues with the same weight, but that + * the first queue receives requests of 8 sectors, while the + * second queue receives requests of 1024 sectors. In + * addition, suppose that each of the two queues contains at + * most one request at a time, which implies that each queue + * always remains idle after it is served. Finally, after + * remaining idle, each queue receives very quickly a new + * request. It follows that the two queues are served + * alternatively, preempting each other if needed. This + * implies that, although both queues have the same weight, + * the queue with large requests receives a service that is + * 1024/8 times as high as the service received by the other + * queue. + * + * The motivation for using preemption instead of idling (for + * queues with the same weight) is that, by not idling, + * service guarantees are preserved (completely or at least in + * part) without minimally sacrificing throughput. And, if + * there is no active group, then the primary expectation for + * this device is probably a high throughput. + * + * We are now left only with explaining the additional + * compound condition that is checked below for deciding + * whether the scenario is asymmetric. To explain this + * compound condition, we need to add that the function + * bfq_asymmetric_scenario checks the weights of only + * non-weight-raised queues, for efficiency reasons (see + * comments on bfq_weights_tree_add()). Then the fact that + * bfqq is weight-raised is checked explicitly here. More + * precisely, the compound condition below takes into account + * also the fact that, even if bfqq is being weight-raised, + * the scenario is still symmetric if all queues with requests + * waiting for completion happen to be + * weight-raised. Actually, we should be even more precise + * here, and differentiate between interactive weight raising + * and soft real-time weight raising. + * + * As a side note, it is worth considering that the above + * device-idling countermeasures may however fail in the + * following unlucky scenario: if idling is (correctly) + * disabled in a time period during which all symmetry + * sub-conditions hold, and hence the device is allowed to + * enqueue many requests, but at some later point in time some + * sub-condition stops to hold, then it may become impossible + * to let requests be served in the desired order until all + * the requests already queued in the device have been served. + */ +static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + return (bfqq->wr_coeff > 1 && + bfqd->wr_busy_queues < + bfq_tot_busy_queues(bfqd)) || + bfq_asymmetric_scenario(bfqd, bfqq); +} + +static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, + enum bfqq_expiration reason) { /* * If this bfqq is shared between multiple processes, check @@ -3221,7 +3400,22 @@ static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) bfq_mark_bfqq_split_coop(bfqq); - if (RB_EMPTY_ROOT(&bfqq->sort_list)) { + /* + * Consider queues with a higher finish virtual time than + * bfqq. If idling_needed_for_service_guarantees(bfqq) returns + * true, then bfqq's bandwidth would be violated if an + * uncontrolled amount of I/O from these queues were + * dispatched while bfqq is waiting for its new I/O to + * arrive. This is exactly what may happen if this is a forced + * expiration caused by a preemption attempt, and if bfqq is + * not re-scheduled. To prevent this from happening, re-queue + * bfqq if it needs I/O-dispatch plugging, even if it is + * empty. By doing so, bfqq is granted to be served before the + * above queues (provided that bfqq is of course eligible). + */ + if (RB_EMPTY_ROOT(&bfqq->sort_list) && + !(reason == BFQQE_PREEMPTED && + idling_needed_for_service_guarantees(bfqd, bfqq))) { if (bfqq->dispatched == 0) /* * Overloading budget_timeout field to store @@ -3238,7 +3432,8 @@ static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) * Resort priority tree of potential close cooperators. * See comments on bfq_pos_tree_add_move() for the unlikely(). */ - if (unlikely(!bfqd->nonrot_with_queueing)) + if (unlikely(!bfqd->nonrot_with_queueing && + !RB_EMPTY_ROOT(&bfqq->sort_list))) bfq_pos_tree_add_move(bfqd, bfqq); } @@ -3739,7 +3934,7 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, * reason. */ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); - if (__bfq_bfqq_expire(bfqd, bfqq)) + if (__bfq_bfqq_expire(bfqd, bfqq, reason)) /* bfqq is gone, no more actions on it */ return; @@ -3885,184 +4080,6 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, bfqd->wr_busy_queues == 0; } -/* - * There is a case where idling does not have to be performed for - * throughput concerns, but to preserve the throughput share of - * the process associated with bfqq. - * - * To introduce this case, we can note that allowing the drive - * to enqueue more than one request at a time, and hence - * delegating de facto final scheduling decisions to the - * drive's internal scheduler, entails loss of control on the - * actual request service order. In particular, the critical - * situation is when requests from different processes happen - * to be present, at the same time, in the internal queue(s) - * of the drive. In such a situation, the drive, by deciding - * the service order of the internally-queued requests, does - * determine also the actual throughput distribution among - * these processes. But the drive typically has no notion or - * concern about per-process throughput distribution, and - * makes its decisions only on a per-request basis. Therefore, - * the service distribution enforced by the drive's internal - * scheduler is likely to coincide with the desired throughput - * distribution only in a completely symmetric, or favorably - * skewed scenario where: - * (i-a) each of these processes must get the same throughput as - * the others, - * (i-b) in case (i-a) does not hold, it holds that the process - * associated with bfqq must receive a lower or equal - * throughput than any of the other processes; - * (ii) the I/O of each process has the same properties, in - * terms of locality (sequential or random), direction - * (reads or writes), request sizes, greediness - * (from I/O-bound to sporadic), and so on; - - * In fact, in such a scenario, the drive tends to treat the requests - * of each process in about the same way as the requests of the - * others, and thus to provide each of these processes with about the - * same throughput. This is exactly the desired throughput - * distribution if (i-a) holds, or, if (i-b) holds instead, this is an - * even more convenient distribution for (the process associated with) - * bfqq. - * - * In contrast, in any asymmetric or unfavorable scenario, device - * idling (I/O-dispatch plugging) is certainly needed to guarantee - * that bfqq receives its assigned fraction of the device throughput - * (see [1] for details). - * - * The problem is that idling may significantly reduce throughput with - * certain combinations of types of I/O and devices. An important - * example is sync random I/O on flash storage with command - * queueing. So, unless bfqq falls in cases where idling also boosts - * throughput, it is important to check conditions (i-a), i(-b) and - * (ii) accurately, so as to avoid idling when not strictly needed for - * service guarantees. - * - * Unfortunately, it is extremely difficult to thoroughly check - * condition (ii). And, in case there are active groups, it becomes - * very difficult to check conditions (i-a) and (i-b) too. In fact, - * if there are active groups, then, for conditions (i-a) or (i-b) to - * become false 'indirectly', it is enough that an active group - * contains more active processes or sub-groups than some other active - * group. More precisely, for conditions (i-a) or (i-b) to become - * false because of such a group, it is not even necessary that the - * group is (still) active: it is sufficient that, even if the group - * has become inactive, some of its descendant processes still have - * some request already dispatched but still waiting for - * completion. In fact, requests have still to be guaranteed their - * share of the throughput even after being dispatched. In this - * respect, it is easy to show that, if a group frequently becomes - * inactive while still having in-flight requests, and if, when this - * happens, the group is not considered in the calculation of whether - * the scenario is asymmetric, then the group may fail to be - * guaranteed its fair share of the throughput (basically because - * idling may not be performed for the descendant processes of the - * group, but it had to be). We address this issue with the following - * bi-modal behavior, implemented in the function - * bfq_asymmetric_scenario(). - * - * If there are groups with requests waiting for completion - * (as commented above, some of these groups may even be - * already inactive), then the scenario is tagged as - * asymmetric, conservatively, without checking any of the - * conditions (i-a), (i-b) or (ii). So the device is idled for bfqq. - * This behavior matches also the fact that groups are created - * exactly if controlling I/O is a primary concern (to - * preserve bandwidth and latency guarantees). - * - * On the opposite end, if there are no groups with requests waiting - * for completion, then only conditions (i-a) and (i-b) are actually - * controlled, i.e., provided that conditions (i-a) or (i-b) holds, - * idling is not performed, regardless of whether condition (ii) - * holds. In other words, only if conditions (i-a) and (i-b) do not - * hold, then idling is allowed, and the device tends to be prevented - * from queueing many requests, possibly of several processes. Since - * there are no groups with requests waiting for completion, then, to - * control conditions (i-a) and (i-b) it is enough to check just - * whether all the queues with requests waiting for completion also - * have the same weight. - * - * Not checking condition (ii) evidently exposes bfqq to the - * risk of getting less throughput than its fair share. - * However, for queues with the same weight, a further - * mechanism, preemption, mitigates or even eliminates this - * problem. And it does so without consequences on overall - * throughput. This mechanism and its benefits are explained - * in the next three paragraphs. - * - * Even if a queue, say Q, is expired when it remains idle, Q - * can still preempt the new in-service queue if the next - * request of Q arrives soon (see the comments on - * bfq_bfqq_update_budg_for_activation). If all queues and - * groups have the same weight, this form of preemption, - * combined with the hole-recovery heuristic described in the - * comments on function bfq_bfqq_update_budg_for_activation, - * are enough to preserve a correct bandwidth distribution in - * the mid term, even without idling. In fact, even if not - * idling allows the internal queues of the device to contain - * many requests, and thus to reorder requests, we can rather - * safely assume that the internal scheduler still preserves a - * minimum of mid-term fairness. - * - * More precisely, this preemption-based, idleless approach - * provides fairness in terms of IOPS, and not sectors per - * second. This can be seen with a simple example. Suppose - * that there are two queues with the same weight, but that - * the first queue receives requests of 8 sectors, while the - * second queue receives requests of 1024 sectors. In - * addition, suppose that each of the two queues contains at - * most one request at a time, which implies that each queue - * always remains idle after it is served. Finally, after - * remaining idle, each queue receives very quickly a new - * request. It follows that the two queues are served - * alternatively, preempting each other if needed. This - * implies that, although both queues have the same weight, - * the queue with large requests receives a service that is - * 1024/8 times as high as the service received by the other - * queue. - * - * The motivation for using preemption instead of idling (for - * queues with the same weight) is that, by not idling, - * service guarantees are preserved (completely or at least in - * part) without minimally sacrificing throughput. And, if - * there is no active group, then the primary expectation for - * this device is probably a high throughput. - * - * We are now left only with explaining the additional - * compound condition that is checked below for deciding - * whether the scenario is asymmetric. To explain this - * compound condition, we need to add that the function - * bfq_asymmetric_scenario checks the weights of only - * non-weight-raised queues, for efficiency reasons (see - * comments on bfq_weights_tree_add()). Then the fact that - * bfqq is weight-raised is checked explicitly here. More - * precisely, the compound condition below takes into account - * also the fact that, even if bfqq is being weight-raised, - * the scenario is still symmetric if all queues with requests - * waiting for completion happen to be - * weight-raised. Actually, we should be even more precise - * here, and differentiate between interactive weight raising - * and soft real-time weight raising. - * - * As a side note, it is worth considering that the above - * device-idling countermeasures may however fail in the - * following unlucky scenario: if idling is (correctly) - * disabled in a time period during which all symmetry - * sub-conditions hold, and hence the device is allowed to - * enqueue many requests, but at some later point in time some - * sub-condition stops to hold, then it may become impossible - * to let requests be served in the desired order until all - * the requests already queued in the device have been served. - */ -static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - return (bfqq->wr_coeff > 1 && - bfqd->wr_busy_queues < - bfq_tot_busy_queues(bfqd)) || - bfq_asymmetric_scenario(bfqd, bfqq); -} - /* * For a queue that becomes empty, device idling is allowed only if * this function returns true for that queue. As a consequence, since @@ -4321,7 +4338,8 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { struct bfq_queue *async_bfqq = bfqq->bic && bfqq->bic->bfqq[0] && - bfq_bfqq_busy(bfqq->bic->bfqq[0]) ? + bfq_bfqq_busy(bfqq->bic->bfqq[0]) && + bfqq->bic->bfqq[0]->next_rq ? bfqq->bic->bfqq[0] : NULL; /* @@ -4403,6 +4421,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) bfqq = bfqq->bic->bfqq[0]; else if (bfq_bfqq_has_waker(bfqq) && bfq_bfqq_busy(bfqq->waker_bfqq) && + bfqq->next_rq && bfq_serv_to_charge(bfqq->waker_bfqq->next_rq, bfqq->waker_bfqq) <= bfq_bfqq_budget_left(bfqq->waker_bfqq) @@ -4800,7 +4819,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) struct hlist_node *n; if (bfqq == bfqd->in_service_queue) { - __bfq_bfqq_expire(bfqd, bfqq); + __bfq_bfqq_expire(bfqd, bfqq, BFQQE_BUDGET_TIMEOUT); bfq_schedule_dispatch(bfqd); } From 16d4b74654ff7c3c5d0b6446278ef51b1de41484 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 26 Jun 2019 12:42:51 +0300 Subject: [PATCH 086/153] md/raid1: Fix a warning message in remove_wb() The WARN_ON() macro doesn't take an error message, it just takes a condition. I've changed this to use WARN(1, "...") instead. Fixes: 3e148a320979 ("md/raid1: fix potential data inconsistency issue with write behind device") Signed-off-by: Dan Carpenter Signed-off-by: Song Liu --- drivers/md/raid1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3d44da663797..34e26834ad28 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -96,7 +96,7 @@ static void remove_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) } if (!found) - WARN_ON("The write behind IO is not recorded\n"); + WARN(1, "The write behind IO is not recorded\n"); spin_unlock_irqrestore(&rdev->wb_list_lock, flags); wake_up(&rdev->wb_io_wait); } From 2b50f230f76f8ef954f12ac34a648e1978f6adf0 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 26 Jun 2019 12:59:19 -0700 Subject: [PATCH 087/153] block, bfq: Init saved_wr_start_at_switch_to_srt in unlikely case Some debug code suggested by Paolo was tripping when I did reboot stress tests. Specifically in bfq_bfqq_resume_state() "bic->saved_wr_start_at_switch_to_srt" was later than the current value of "jiffies". A bit of debugging showed that "bic->saved_wr_start_at_switch_to_srt" was actually 0 and a bit more debugging showed that was because we had run through the "unlikely" case in the bfq_bfqq_save_state() function. Let's init "saved_wr_start_at_switch_to_srt" in the unlikely case to something sane. NOTE: this fixes no known real-world errors. Reviewed-by: Paolo Valente Reviewed-by: Guenter Roeck Signed-off-by: Douglas Anderson Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 72840ebf953e..008c93d6b8d7 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2678,6 +2678,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) * to enjoy weight raising if split soon. */ bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; + bic->saved_wr_start_at_switch_to_srt = bfq_smallest_from_now(); bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); bic->saved_last_wr_start_finish = jiffies; } else { From a5b47a40bed8b19e956872fb55097d676a68f59e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 27 Jun 2019 11:59:41 +0900 Subject: [PATCH 088/153] block: Remove unused code bio_flush_dcache_pages() is unused. Remove it. Reviewed-by: Christoph Hellwig Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/bio.c | 12 ------------ include/linux/bio.h | 11 ----------- 2 files changed, 23 deletions(-) diff --git a/block/bio.c b/block/bio.c index ad9c3aa9bf7d..bb55b94bb361 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1760,18 +1760,6 @@ void generic_end_io_acct(struct request_queue *q, int req_op, } EXPORT_SYMBOL(generic_end_io_acct); -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -void bio_flush_dcache_pages(struct bio *bi) -{ - struct bio_vec bvec; - struct bvec_iter iter; - - bio_for_each_segment(bvec, bi, iter) - flush_dcache_page(bvec.bv_page); -} -EXPORT_SYMBOL(bio_flush_dcache_pages); -#endif - static inline bool bio_remaining_done(struct bio *bio) { /* diff --git a/include/linux/bio.h b/include/linux/bio.h index ee11c4324751..5a8ae56e09ff 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -443,17 +443,6 @@ void generic_end_io_acct(struct request_queue *q, int op, struct hd_struct *part, unsigned long start_time); -#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" -#endif -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -extern void bio_flush_dcache_pages(struct bio *bi); -#else -static inline void bio_flush_dcache_pages(struct bio *bi) -{ -} -#endif - extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter); extern void bio_copy_data(struct bio *dst, struct bio *src); From 141df8bb5dc052f605de8f48a7aa10290e1384ae Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:24 +0800 Subject: [PATCH 089/153] bcache: don't set max writeback rate if gc is running When gc is running, user space I/O processes may wait inside bcache code, so no new I/O coming. Indeed this is not a real idle time, maximum writeback rate should not be set in such situation. Otherwise a faster writeback thread may compete locks with gc thread and makes garbage collection slower, which results a longer I/O freeze period. This patch checks c->gc_mark_valid in set_at_max_writeback_rate(). If c->gc_mark_valid is 0 (gc running), set_at_max_writeback_rate() returns false, then update_writeback_rate() will not set writeback rate to maximum value even c->idle_counter reaches an idle threshold. Now writeback thread won't interfere gc thread performance. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 73f0efac2b9f..262f7ef20992 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -122,6 +122,9 @@ static void __update_writeback_rate(struct cached_dev *dc) static bool set_at_max_writeback_rate(struct cache_set *c, struct cached_dev *dc) { + /* Don't set max writeback rate if gc is running */ + if (!c->gc_mark_valid) + return false; /* * Idle_counter is increased everytime when update_writeback_rate() is * called. If all backing devices attached to the same cache set have From b387e9b58679c60f5b1e4313939bd4878204fc37 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:25 +0800 Subject: [PATCH 090/153] bcache: check c->gc_thread by IS_ERR_OR_NULL in cache_set_flush() When system memory is in heavy pressure, bch_gc_thread_start() from run_cache_set() may fail due to out of memory. In such condition, c->gc_thread is assigned to -ENOMEM, not NULL pointer. Then in following failure code path bch_cache_set_error(), when cache_set_flush() gets called, the code piece to stop c->gc_thread is broken, if (!IS_ERR_OR_NULL(c->gc_thread)) kthread_stop(c->gc_thread); And KASAN catches such NULL pointer deference problem, with the warning information: [ 561.207881] ================================================================== [ 561.207900] BUG: KASAN: null-ptr-deref in kthread_stop+0x3b/0x440 [ 561.207904] Write of size 4 at addr 000000000000001c by task kworker/15:1/313 [ 561.207913] CPU: 15 PID: 313 Comm: kworker/15:1 Tainted: G W 5.0.0-vanilla+ #3 [ 561.207916] Hardware name: Lenovo ThinkSystem SR650 -[7X05CTO1WW]-/-[7X05CTO1WW]-, BIOS -[IVE136T-2.10]- 03/22/2019 [ 561.207935] Workqueue: events cache_set_flush [bcache] [ 561.207940] Call Trace: [ 561.207948] dump_stack+0x9a/0xeb [ 561.207955] ? kthread_stop+0x3b/0x440 [ 561.207960] ? kthread_stop+0x3b/0x440 [ 561.207965] kasan_report+0x176/0x192 [ 561.207973] ? kthread_stop+0x3b/0x440 [ 561.207981] kthread_stop+0x3b/0x440 [ 561.207995] cache_set_flush+0xd4/0x6d0 [bcache] [ 561.208008] process_one_work+0x856/0x1620 [ 561.208015] ? find_held_lock+0x39/0x1d0 [ 561.208028] ? drain_workqueue+0x380/0x380 [ 561.208048] worker_thread+0x87/0xb80 [ 561.208058] ? __kthread_parkme+0xb6/0x180 [ 561.208067] ? process_one_work+0x1620/0x1620 [ 561.208072] kthread+0x326/0x3e0 [ 561.208079] ? kthread_create_worker_on_cpu+0xc0/0xc0 [ 561.208090] ret_from_fork+0x3a/0x50 [ 561.208110] ================================================================== [ 561.208113] Disabling lock debugging due to kernel taint [ 561.208115] irq event stamp: 11800231 [ 561.208126] hardirqs last enabled at (11800231): [] do_syscall_64+0x18/0x410 [ 561.208127] BUG: unable to handle kernel NULL pointer dereference at 000000000000001c [ 561.208129] #PF error: [WRITE] [ 561.312253] hardirqs last disabled at (11800230): [] trace_hardirqs_off_thunk+0x1a/0x1c [ 561.312259] softirqs last enabled at (11799832): [] __do_softirq+0x5c7/0x8c3 [ 561.405975] PGD 0 P4D 0 [ 561.442494] softirqs last disabled at (11799821): [] irq_exit+0x1ac/0x1e0 [ 561.791359] Oops: 0002 [#1] SMP KASAN NOPTI [ 561.791362] CPU: 15 PID: 313 Comm: kworker/15:1 Tainted: G B W 5.0.0-vanilla+ #3 [ 561.791363] Hardware name: Lenovo ThinkSystem SR650 -[7X05CTO1WW]-/-[7X05CTO1WW]-, BIOS -[IVE136T-2.10]- 03/22/2019 [ 561.791371] Workqueue: events cache_set_flush [bcache] [ 561.791374] RIP: 0010:kthread_stop+0x3b/0x440 [ 561.791376] Code: 00 00 65 8b 05 26 d5 e0 7c 89 c0 48 0f a3 05 ec aa df 02 0f 82 dc 02 00 00 4c 8d 63 20 be 04 00 00 00 4c 89 e7 e8 65 c5 53 00 ff 43 20 48 8d 7b 24 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 [ 561.791377] RSP: 0018:ffff88872fc8fd10 EFLAGS: 00010286 [ 561.838895] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. [ 561.838916] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. [ 561.838934] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. [ 561.838948] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. [ 561.838966] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. [ 561.838979] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. [ 561.838996] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. [ 563.067028] RAX: 0000000000000000 RBX: fffffffffffffffc RCX: ffffffff832dd314 [ 563.067030] RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000297 [ 563.067032] RBP: ffff88872fc8fe88 R08: fffffbfff0b8213d R09: fffffbfff0b8213d [ 563.067034] R10: 0000000000000001 R11: fffffbfff0b8213c R12: 000000000000001c [ 563.408618] R13: ffff88dc61cc0f68 R14: ffff888102b94900 R15: ffff88dc61cc0f68 [ 563.408620] FS: 0000000000000000(0000) GS:ffff888f7dc00000(0000) knlGS:0000000000000000 [ 563.408622] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 563.408623] CR2: 000000000000001c CR3: 0000000f48a1a004 CR4: 00000000007606e0 [ 563.408625] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 563.408627] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 563.904795] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. [ 563.915796] PKRU: 55555554 [ 563.915797] Call Trace: [ 563.915807] cache_set_flush+0xd4/0x6d0 [bcache] [ 563.915812] process_one_work+0x856/0x1620 [ 564.001226] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. [ 564.033563] ? find_held_lock+0x39/0x1d0 [ 564.033567] ? drain_workqueue+0x380/0x380 [ 564.033574] worker_thread+0x87/0xb80 [ 564.062823] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. [ 564.118042] ? __kthread_parkme+0xb6/0x180 [ 564.118046] ? process_one_work+0x1620/0x1620 [ 564.118048] kthread+0x326/0x3e0 [ 564.118050] ? kthread_create_worker_on_cpu+0xc0/0xc0 [ 564.167066] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. [ 564.252441] ret_from_fork+0x3a/0x50 [ 564.252447] Modules linked in: msr rpcrdma sunrpc rdma_ucm ib_iser ib_umad rdma_cm ib_ipoib i40iw configfs iw_cm ib_cm libiscsi scsi_transport_iscsi mlx4_ib ib_uverbs mlx4_en ib_core nls_iso8859_1 nls_cp437 vfat fat intel_rapl skx_edac x86_pkg_temp_thermal coretemp iTCO_wdt iTCO_vendor_support crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel ses raid0 aesni_intel cdc_ether enclosure usbnet ipmi_ssif joydev aes_x86_64 i40e scsi_transport_sas mii bcache md_mod crypto_simd mei_me ioatdma crc64 ptp cryptd pcspkr i2c_i801 mlx4_core glue_helper pps_core mei lpc_ich dca wmi ipmi_si ipmi_devintf nd_pmem dax_pmem nd_btt ipmi_msghandler device_dax pcc_cpufreq button hid_generic usbhid mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect xhci_pci sysimgblt fb_sys_fops xhci_hcd ttm megaraid_sas drm usbcore nfit libnvdimm sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua efivarfs [ 564.299390] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. [ 564.348360] CR2: 000000000000001c [ 564.348362] ---[ end trace b7f0e5cc7b2103b0 ]--- Therefore, it is not enough to only check whether c->gc_thread is NULL, we should use IS_ERR_OR_NULL() to check both NULL pointer and error value. This patch changes the above buggy code piece in this way, if (!IS_ERR_OR_NULL(c->gc_thread)) kthread_stop(c->gc_thread); Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 1b63ac876169..64d9de89a63f 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1564,7 +1564,7 @@ static void cache_set_flush(struct closure *cl) kobject_put(&c->internal); kobject_del(&c->kobj); - if (c->gc_thread) + if (!IS_ERR_OR_NULL(c->gc_thread)) kthread_stop(c->gc_thread); if (!IS_ERR_OR_NULL(c->root)) From 0ae49cb7aa005ed18fe8f4d6ccf73019b78ac7b2 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:26 +0800 Subject: [PATCH 091/153] bcache: fix return value error in bch_journal_read() When everything is OK in bch_journal_read(), finally the return value is returned by, return ret; which assumes ret will be 0 here. This assumption is wrong when all journal buckets as are full and filled with valid journal entries. In such cache the last location referencess read_bucket() sets 'ret' to 1, which means new jset added into jset list. The jset list is list 'journal' in caller run_cache_set(). Return 1 to run_cache_set() means something wrong and the cache set won't start, but indeed everything is OK. This patch changes the line at end of bch_journal_read() to directly return 0 since everything if verything is good. Then a bogus error is fixed. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 12dae9348147..4e5fc05720fc 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -268,7 +268,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) struct journal_replay, list)->j.seq; - return ret; + return 0; #undef read_bucket } From 695277f16b3a102fcc22c97fdf2de77c7b19f0b3 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:27 +0800 Subject: [PATCH 092/153] Revert "bcache: set CACHE_SET_IO_DISABLE in bch_cached_dev_error()" This reverts commit 6147305c73e4511ca1a975b766b97a779d442567. Although this patch helps the failed bcache device to stop faster when too many I/O errors detected on corresponding cached device, setting CACHE_SET_IO_DISABLE bit to cache set c->flags was not a good idea. This operation will disable all I/Os on cache set, which means other attached bcache devices won't work neither. Without this patch, the failed bcache device can also be stopped eventually if internal I/O accomplished (e.g. writeback). Therefore here I revert it. Fixes: 6147305c73e4 ("bcache: set CACHE_SET_IO_DISABLE in bch_cached_dev_error()") Reported-by: Yong Li Signed-off-by: Coly Li Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 64d9de89a63f..ba2ad093bc80 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1437,8 +1437,6 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) bool bch_cached_dev_error(struct cached_dev *dc) { - struct cache_set *c; - if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) return false; @@ -1449,21 +1447,6 @@ bool bch_cached_dev_error(struct cached_dev *dc) pr_err("stop %s: too many IO errors on backing device %s\n", dc->disk.disk->disk_name, dc->backing_dev_name); - /* - * If the cached device is still attached to a cache set, - * even dc->io_disable is true and no more I/O requests - * accepted, cache device internal I/O (writeback scan or - * garbage collection) may still prevent bcache device from - * being stopped. So here CACHE_SET_IO_DISABLE should be - * set to c->flags too, to make the internal I/O to cache - * device rejected and stopped immediately. - * If c is NULL, that means the bcache device is not attached - * to any cache set, then no CACHE_SET_IO_DISABLE bit to set. - */ - c = dc->disk.c; - if (c && test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags)) - pr_info("CACHE_SET_IO_DISABLE already set"); - bcache_device_stop(&dc->disk); return true; } From e6dcbd3e6c91b7828cb305ec324eb7fd9bdea8a0 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:28 +0800 Subject: [PATCH 093/153] bcache: avoid flushing btree node in cache_set_flush() if io disabled When cache_set_flush() is called for too many I/O errors detected on cache device and the cache set is retiring, inside the function it doesn't make sense to flushing cached btree nodes from c->btree_cache because CACHE_SET_IO_DISABLE is set on c->flags already and all I/Os onto cache device will be rejected. This patch checks in cache_set_flush() that whether CACHE_SET_IO_DISABLE is set. If yes, then avoids to flush the cached btree nodes to reduce more time and make cache set retiring more faster. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index ba2ad093bc80..dc6702c2c4b6 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1553,13 +1553,17 @@ static void cache_set_flush(struct closure *cl) if (!IS_ERR_OR_NULL(c->root)) list_add(&c->root->list, &c->btree_cache); - /* Should skip this if we're unregistering because of an error */ - list_for_each_entry(b, &c->btree_cache, list) { - mutex_lock(&b->write_lock); - if (btree_node_dirty(b)) - __bch_btree_node_write(b, NULL); - mutex_unlock(&b->write_lock); - } + /* + * Avoid flushing cached nodes if cache set is retiring + * due to too many I/O errors detected. + */ + if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags)) + list_for_each_entry(b, &c->btree_cache, list) { + mutex_lock(&b->write_lock); + if (btree_node_dirty(b)) + __bch_btree_node_write(b, NULL); + mutex_unlock(&b->write_lock); + } for_each_cache(ca, c, i) if (ca->alloc_thread) From 578df99b1b0531d19af956530fe4da63d01a1604 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:29 +0800 Subject: [PATCH 094/153] bcache: ignore read-ahead request failure on backing device When md raid device (e.g. raid456) is used as backing device, read-ahead requests on a degrading and recovering md raid device might be failured immediately by md raid code, but indeed this md raid array can still be read or write for normal I/O requests. Therefore such failed read-ahead request are not real hardware failure. Further more, after degrading and recovering accomplished, read-ahead requests will be handled by md raid array again. For such condition, I/O failures of read-ahead requests don't indicate real health status (because normal I/O still be served), they should not be counted into I/O error counter dc->io_errors. Since there is no simple way to detect whether the backing divice is a md raid device, this patch simply ignores I/O failures for read-ahead bios on backing device, to avoid bogus backing device failure on a degrading md raid array. Suggested-and-tested-by: Thorsten Knabe Signed-off-by: Coly Li Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/io.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index c25097968319..4d93f07f63e5 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -58,6 +58,18 @@ void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio) WARN_ONCE(!dc, "NULL pointer of struct cached_dev"); + /* + * Read-ahead requests on a degrading and recovering md raid + * (e.g. raid6) device might be failured immediately by md + * raid code, which is not a real hardware media failure. So + * we shouldn't count failed REQ_RAHEAD bio to dc->io_errors. + */ + if (bio->bi_opf & REQ_RAHEAD) { + pr_warn_ratelimited("%s: Read-ahead I/O failed on backing device, ignore", + dc->backing_dev_name); + return; + } + errors = atomic_add_return(1, &dc->io_errors); if (errors < dc->error_limit) pr_err("%s: IO error on backing device, unrecoverable", From 08ec1e6282f271698f0053983fab89de6e1a8217 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:30 +0800 Subject: [PATCH 095/153] bcache: add io error counting in write_bdev_super_endio() When backing device super block is written by bch_write_bdev_super(), the bio complete callback write_bdev_super_endio() simply ignores I/O status. Indeed such write request also contribute to backing device health status if the request failed. This patch checkes bio->bi_status in write_bdev_super_endio(), if there is error, bch_count_backing_io_errors() will be called to count an I/O error to dc->io_errors. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index dc6702c2c4b6..73466bda12a7 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -197,7 +197,9 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, static void write_bdev_super_endio(struct bio *bio) { struct cached_dev *dc = bio->bi_private; - /* XXX: error checking */ + + if (bio->bi_status) + bch_count_backing_io_errors(dc, bio); closure_put(&dc->sb_write); } From f960facb399ece6ff88a7a2d4b4a5515e3a467a0 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:31 +0800 Subject: [PATCH 096/153] bcache: remove unnecessary prefetch() in bset_search_tree() In function bset_search_tree(), when p >= t->size, t->tree[0] will be prefetched by the following code piece, 974 unsigned int p = n << 4; 975 976 p &= ((int) (p - t->size)) >> 31; 977 978 prefetch(&t->tree[p]); The purpose of the above code is to avoid a branch instruction, but when p >= t->size, prefetch(&t->tree[0]) has no positive performance contribution at all. This patch avoids the unncessary prefetch by only calling prefetch() when p < t->size. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bset.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 268f1b685084..e36a108d3648 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -970,22 +970,10 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, unsigned int inorder, j, n = 1; do { - /* - * A bit trick here. - * If p < t->size, (int)(p - t->size) is a minus value and - * the most significant bit is set, right shifting 31 bits - * gets 1. If p >= t->size, the most significant bit is - * not set, right shifting 31 bits gets 0. - * So the following 2 lines equals to - * if (p >= t->size) - * p = 0; - * but a branch instruction is avoided. - */ unsigned int p = n << 4; - p &= ((int) (p - t->size)) >> 31; - - prefetch(&t->tree[p]); + if (p < t->size) + prefetch(&t->tree[p]); j = n; f = &t->tree[j]; From 89e0341af082dbc170019f908846f4a424efc86b Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Fri, 28 Jun 2019 19:59:32 +0800 Subject: [PATCH 097/153] bcache: use sysfs_match_string() instead of __sysfs_match_string() The arrays (of strings) that are passed to __sysfs_match_string() are static, so use sysfs_match_string() which does an implicit ARRAY_SIZE() over these arrays. Functionally, this doesn't change anything. The change is more cosmetic. It only shrinks the static arrays by 1 byte each. Signed-off-by: Alexandru Ardelean Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index bfb437ffb13c..760cf8951338 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -21,28 +21,24 @@ static const char * const bch_cache_modes[] = { "writethrough", "writeback", "writearound", - "none", - NULL + "none" }; /* Default is 0 ("auto") */ static const char * const bch_stop_on_failure_modes[] = { "auto", - "always", - NULL + "always" }; static const char * const cache_replacement_policies[] = { "lru", "fifo", - "random", - NULL + "random" }; static const char * const error_actions[] = { "unregister", - "panic", - NULL + "panic" }; write_attribute(attach); @@ -333,7 +329,7 @@ STORE(__cached_dev) bch_cached_dev_run(dc); if (attr == &sysfs_cache_mode) { - v = __sysfs_match_string(bch_cache_modes, -1, buf); + v = sysfs_match_string(bch_cache_modes, buf); if (v < 0) return v; @@ -344,7 +340,7 @@ STORE(__cached_dev) } if (attr == &sysfs_stop_when_cache_set_failed) { - v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf); + v = sysfs_match_string(bch_stop_on_failure_modes, buf); if (v < 0) return v; @@ -799,7 +795,7 @@ STORE(__bch_cache_set) 0, UINT_MAX); if (attr == &sysfs_errors) { - v = __sysfs_match_string(error_actions, -1, buf); + v = sysfs_match_string(error_actions, buf); if (v < 0) return v; @@ -1063,7 +1059,7 @@ STORE(__bch_cache) } if (attr == &sysfs_cache_replacement_policy) { - v = __sysfs_match_string(cache_replacement_policies, -1, buf); + v = sysfs_match_string(cache_replacement_policies, buf); if (v < 0) return v; From 0b13efecf5f25ce5e31f2ab3930335015cb65a7d Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:33 +0800 Subject: [PATCH 098/153] bcache: add return value check to bch_cached_dev_run() This patch adds return value check to bch_cached_dev_run(), now if there is error happens inside bch_cached_dev_run(), it can be catched. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/super.c | 33 ++++++++++++++++++++++++++------- drivers/md/bcache/sysfs.c | 7 +++++-- 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index fdf75352e16a..73a97586a2ef 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -1006,7 +1006,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size); int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, uint8_t *set_uuid); void bch_cached_dev_detach(struct cached_dev *dc); -void bch_cached_dev_run(struct cached_dev *dc); +int bch_cached_dev_run(struct cached_dev *dc); void bcache_device_stop(struct bcache_device *d); void bch_cache_set_unregister(struct cache_set *c); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 73466bda12a7..0abee44092bf 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -910,7 +910,7 @@ static int cached_dev_status_update(void *arg) } -void bch_cached_dev_run(struct cached_dev *dc) +int bch_cached_dev_run(struct cached_dev *dc) { struct bcache_device *d = &dc->disk; char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL); @@ -921,11 +921,14 @@ void bch_cached_dev_run(struct cached_dev *dc) NULL, }; + if (dc->io_disable) + return -EIO; + if (atomic_xchg(&dc->running, 1)) { kfree(env[1]); kfree(env[2]); kfree(buf); - return; + return -EBUSY; } if (!d->c && @@ -951,8 +954,11 @@ void bch_cached_dev_run(struct cached_dev *dc) kfree(buf); if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || - sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) + sysfs_create_link(&disk_to_dev(d->disk)->kobj, + &d->kobj, "bcache")) { pr_debug("error creating sysfs link"); + return -ENOMEM; + } dc->status_update_thread = kthread_run(cached_dev_status_update, dc, "bcache_status_update"); @@ -961,6 +967,8 @@ void bch_cached_dev_run(struct cached_dev *dc) "continue to run without monitoring backing " "device status"); } + + return 0; } /* @@ -1056,6 +1064,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds()); struct uuid_entry *u; struct cached_dev *exist_dc, *t; + int ret = 0; if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) || (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))) @@ -1165,7 +1174,12 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, bch_sectors_dirty_init(&dc->disk); - bch_cached_dev_run(dc); + ret = bch_cached_dev_run(dc); + if (ret && (ret != -EBUSY)) { + up_write(&dc->writeback_lock); + return ret; + } + bcache_device_link(&dc->disk, c, "bdev"); atomic_inc(&c->attached_dev_nr); @@ -1292,6 +1306,7 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page, { const char *err = "cannot allocate memory"; struct cache_set *c; + int ret = -ENOMEM; bdevname(bdev, dc->backing_dev_name); memcpy(&dc->sb, sb, sizeof(struct cache_sb)); @@ -1321,14 +1336,18 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page, bch_cached_dev_attach(dc, c, NULL); if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE || - BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) - bch_cached_dev_run(dc); + BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) { + err = "failed to run cached device"; + ret = bch_cached_dev_run(dc); + if (ret) + goto err; + } return 0; err: pr_notice("error %s: %s", dc->backing_dev_name, err); bcache_device_stop(&dc->disk); - return -EIO; + return ret; } /* Flash only volumes */ diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 760cf8951338..eb678e43ac00 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -325,8 +325,11 @@ STORE(__cached_dev) bch_cache_accounting_clear(&dc->accounting); if (attr == &sysfs_running && - strtoul_or_return(buf)) - bch_cached_dev_run(dc); + strtoul_or_return(buf)) { + v = bch_cached_dev_run(dc); + if (v) + return v; + } if (attr == &sysfs_cache_mode) { v = sysfs_match_string(bch_cache_modes, buf); From bd9026c8a7f33ebe25543b7b7e6276b49db60f7e Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:34 +0800 Subject: [PATCH 099/153] bcache: remove unncessary code in bch_btree_keys_init() Function bch_btree_keys_init() initializes b->set[].size and b->set[].data to zero. As the code comments indicates, these code indeed is unncessary, because both struct btree_keys and struct bset_tree are nested embedded into struct btree, when struct btree is filled with 0 bits by kzalloc() in mca_bucket_alloc(), b->set[].size and b->set[].data are initialized to 0 (a.k.a NULL) already. This patch removes the redundant code, and add comments in bch_btree_keys_init() and mca_bucket_alloc() to explain why it's safe. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bset.c | 15 ++++++--------- drivers/md/bcache/btree.c | 4 ++++ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index e36a108d3648..8af9509e78bd 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -347,22 +347,19 @@ EXPORT_SYMBOL(bch_btree_keys_alloc); void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, bool *expensive_debug_checks) { - unsigned int i; - b->ops = ops; b->expensive_debug_checks = expensive_debug_checks; b->nsets = 0; b->last_set_unwritten = 0; - /* XXX: shouldn't be needed */ - for (i = 0; i < MAX_BSETS; i++) - b->set[i].size = 0; /* - * Second loop starts at 1 because b->keys[0]->data is the memory we - * allocated + * struct btree_keys in embedded in struct btree, and struct + * bset_tree is embedded into struct btree_keys. They are all + * initialized as 0 by kzalloc() in mca_bucket_alloc(), and + * b->set[0].data is allocated in bch_btree_keys_alloc(), so we + * don't have to initiate b->set[].size and b->set[].data here + * any more. */ - for (i = 1; i < MAX_BSETS; i++) - b->set[i].data = NULL; } EXPORT_SYMBOL(bch_btree_keys_init); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 773f5fdad25f..cf38a1b031fa 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -613,6 +613,10 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) static struct btree *mca_bucket_alloc(struct cache_set *c, struct bkey *k, gfp_t gfp) { + /* + * kzalloc() is necessary here for initialization, + * see code comments in bch_btree_keys_init(). + */ struct btree *b = kzalloc(sizeof(struct btree), gfp); if (!b) From e775339e1ae1205b47d94881db124c11385e597c Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:35 +0800 Subject: [PATCH 100/153] bcache: check CACHE_SET_IO_DISABLE in allocator code If CACHE_SET_IO_DISABLE of a cache set flag is set by too many I/O errors, currently allocator routines can still continue allocate space which may introduce inconsistent metadata state. This patch checkes CACHE_SET_IO_DISABLE bit in following allocator routines, - bch_bucket_alloc() - __bch_bucket_alloc_set() Once CACHE_SET_IO_DISABLE is set on cache set, the allocator routines may reject allocation request earlier to avoid potential inconsistent metadata. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index f8986effcb50..6f776823b9ba 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -393,6 +393,11 @@ long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait) struct bucket *b; long r; + + /* No allocation if CACHE_SET_IO_DISABLE bit is set */ + if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags))) + return -1; + /* fastpath */ if (fifo_pop(&ca->free[RESERVE_NONE], r) || fifo_pop(&ca->free[reserve], r)) @@ -484,6 +489,10 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, { int i; + /* No allocation if CACHE_SET_IO_DISABLE bit is set */ + if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) + return -1; + lockdep_assert_held(&c->bucket_lock); BUG_ON(!n || n > c->caches_loaded || n > MAX_CACHES_PER_SET); From 383ff2183ad16a8842d1fbd9dd3e1cbd66813e64 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:36 +0800 Subject: [PATCH 101/153] bcache: check CACHE_SET_IO_DISABLE bit in bch_journal() When too many I/O errors happen on cache set and CACHE_SET_IO_DISABLE bit is set, bch_journal() may continue to work because the journaling bkey might be still in write set yet. The caller of bch_journal() may believe the journal still work but the truth is in-memory journal write set won't be written into cache device any more. This behavior may introduce potential inconsistent metadata status. This patch checks CACHE_SET_IO_DISABLE bit at the head of bch_journal(), if the bit is set, bch_journal() returns NULL immediately to notice caller to know journal does not work. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/journal.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 4e5fc05720fc..54f8886b6177 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -811,6 +811,10 @@ atomic_t *bch_journal(struct cache_set *c, struct journal_write *w; atomic_t *ret; + /* No journaling if CACHE_SET_IO_DISABLE set already */ + if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) + return NULL; + if (!CACHE_SYNC(&c->sb)) return NULL; From 4b6efb4bdbce25097f1a6329e18c2b77c4f27722 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:37 +0800 Subject: [PATCH 102/153] bcache: more detailed error message to bcache_device_link() This patch adds more accurate error message for specific ssyfs_create_link() call, to help debugging failure during bcache device start tup. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 0abee44092bf..d4d8d1300faf 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -693,6 +693,7 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c, { unsigned int i; struct cache *ca; + int ret; for_each_cache(ca, d->c, i) bd_link_disk_holder(ca->bdev, d->disk); @@ -700,9 +701,13 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c, snprintf(d->name, BCACHEDEVNAME_SIZE, "%s%u", name, d->id); - WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") || - sysfs_create_link(&c->kobj, &d->kobj, d->name), - "Couldn't create device <-> cache set symlinks"); + ret = sysfs_create_link(&d->kobj, &c->kobj, "cache"); + if (ret < 0) + pr_err("Couldn't create device -> cache set symlink"); + + ret = sysfs_create_link(&c->kobj, &d->kobj, d->name); + if (ret < 0) + pr_err("Couldn't create cache set -> device symlink"); clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags); } From 633bb2ce60b949e2990c15324be162c54788c027 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:38 +0800 Subject: [PATCH 103/153] bcache: add more error message in bch_cached_dev_attach() This patch adds more error message for attaching cached device, this is helpful to debug code failure during bache device start up. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index d4d8d1300faf..a836910ef368 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1169,6 +1169,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, down_write(&dc->writeback_lock); if (bch_cached_dev_writeback_start(dc)) { up_write(&dc->writeback_lock); + pr_err("Couldn't start writeback facilities for %s", + dc->disk.disk->disk_name); return -ENOMEM; } @@ -1182,6 +1184,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, ret = bch_cached_dev_run(dc); if (ret && (ret != -EBUSY)) { up_write(&dc->writeback_lock); + pr_err("Couldn't run cached device %s", + dc->backing_dev_name); return ret; } From e0faa3d7f79f7e1abb43de168e88c76061518ea4 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:39 +0800 Subject: [PATCH 104/153] bcache: improve error message in bch_cached_dev_run() This patch adds more error message in bch_cached_dev_run() to indicate the exact reason why an error value is returned. Please notice when printing out the "is running already" message, pr_info() is used here, because in this case also -EBUSY is returned, the bcache device can continue to attach to the cache devince and run, so it won't be an error level message in kernel message. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a836910ef368..e9e6d653bf70 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -926,13 +926,18 @@ int bch_cached_dev_run(struct cached_dev *dc) NULL, }; - if (dc->io_disable) + if (dc->io_disable) { + pr_err("I/O disabled on cached dev %s", + dc->backing_dev_name); return -EIO; + } if (atomic_xchg(&dc->running, 1)) { kfree(env[1]); kfree(env[2]); kfree(buf); + pr_info("cached dev %s is running already", + dc->backing_dev_name); return -EBUSY; } @@ -961,7 +966,7 @@ int bch_cached_dev_run(struct cached_dev *dc) if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) { - pr_debug("error creating sysfs link"); + pr_err("Couldn't create bcache dev <-> disk sysfs symlinks"); return -ENOMEM; } From 68a53c95a0fce541321fbca74a7f72c71361f496 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:40 +0800 Subject: [PATCH 105/153] bcache: remove "XXX:" comment line from run_cache_set() In previous bcache patches for Linux v5.2, the failure code path of run_cache_set() is tested and fixed. So now the following comment line can be removed from run_cache_set(), /* XXX: test this, it's broken */ Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e9e6d653bf70..c53fe0f1629f 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1979,7 +1979,7 @@ static int run_cache_set(struct cache_set *c) } closure_sync(&cl); - /* XXX: test this, it's broken */ + bch_cache_set_error(c, "%s", err); return -EIO; From 944a4f340a65c21ee311d2d3e617034bef9d0b25 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:41 +0800 Subject: [PATCH 106/153] bcache: make bset_search_tree() be more understandable The purpose of following code in bset_search_tree() is to avoid a branch instruction, 994 if (likely(f->exponent != 127)) 995 n = j * 2 + (((unsigned int) 996 (f->mantissa - 997 bfloat_mantissa(search, f))) >> 31); 998 else 999 n = (bkey_cmp(tree_to_bkey(t, j), search) > 0) 1000 ? j * 2 1001 : j * 2 + 1; This piece of code is not very clear to understand, even when I tried to add code comment for it, I made mistake. This patch removes the implict bit operation and uses explicit branch to calculate next location in binary tree search. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bset.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 8af9509e78bd..08768796b543 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -975,25 +975,17 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, j = n; f = &t->tree[j]; - /* - * Similar bit trick, use subtract operation to avoid a branch - * instruction. - * - * n = (f->mantissa > bfloat_mantissa()) - * ? j * 2 - * : j * 2 + 1; - * - * We need to subtract 1 from f->mantissa for the sign bit trick - * to work - that's done in make_bfloat() - */ - if (likely(f->exponent != 127)) - n = j * 2 + (((unsigned int) - (f->mantissa - - bfloat_mantissa(search, f))) >> 31); - else - n = (bkey_cmp(tree_to_bkey(t, j), search) > 0) - ? j * 2 - : j * 2 + 1; + if (likely(f->exponent != 127)) { + if (f->mantissa >= bfloat_mantissa(search, f)) + n = j * 2; + else + n = j * 2 + 1; + } else { + if (bkey_cmp(tree_to_bkey(t, j), search) > 0) + n = j * 2; + else + n = j * 2 + 1; + } } while (n < t->size); inorder = to_inorder(j, t); From 0c277e211aae056b26513358fc060291d8523747 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:42 +0800 Subject: [PATCH 107/153] bcache: add pendings_cleanup to stop pending bcache device If a bcache device is in dirty state and its cache set is not registered, this bcache device will not appear in /dev/bcache, and there is no way to stop it or remove the bcache kernel module. This is an as-designed behavior, but sometimes people has to reboot whole system to release or stop the pending backing device. This sysfs interface may remove such pending bcache devices when write anything into the sysfs file manually. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 55 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c53fe0f1629f..c4c4b2d99dc2 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2273,9 +2273,13 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *buffer, size_t size); +static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, + struct kobj_attribute *attr, + const char *buffer, size_t size); kobj_attribute_write(register, register_bcache); kobj_attribute_write(register_quiet, register_bcache); +kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); static bool bch_is_open_backing(struct block_device *bdev) { @@ -2400,6 +2404,56 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, goto out; } + +struct pdev { + struct list_head list; + struct cached_dev *dc; +}; + +static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, + struct kobj_attribute *attr, + const char *buffer, + size_t size) +{ + LIST_HEAD(pending_devs); + ssize_t ret = size; + struct cached_dev *dc, *tdc; + struct pdev *pdev, *tpdev; + struct cache_set *c, *tc; + + mutex_lock(&bch_register_lock); + list_for_each_entry_safe(dc, tdc, &uncached_devices, list) { + pdev = kmalloc(sizeof(struct pdev), GFP_KERNEL); + if (!pdev) + break; + pdev->dc = dc; + list_add(&pdev->list, &pending_devs); + } + + list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) { + char *pdev_set_uuid = pdev->dc->sb.set_uuid; + char *set_uuid = c->sb.uuid; + + if (!memcmp(pdev_set_uuid, set_uuid, 16)) { + list_del(&pdev->list); + kfree(pdev); + break; + } + } + } + mutex_unlock(&bch_register_lock); + + list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { + pr_info("delete pdev %p", pdev); + list_del(&pdev->list); + bcache_device_stop(&pdev->dc->disk); + kfree(pdev); + } + + return ret; +} + static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) { if (code == SYS_DOWN || @@ -2518,6 +2572,7 @@ static int __init bcache_init(void) static const struct attribute *files[] = { &ksysfs_register.attr, &ksysfs_register_quiet.attr, + &ksysfs_pendings_cleanup.attr, NULL }; From 5461999848e0462c14f306a62923d22de820a59c Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:43 +0800 Subject: [PATCH 108/153] bcache: fix mistaken sysfs entry for io_error counter In bch_cached_dev_files[] from driver/md/bcache/sysfs.c, sysfs_errors is incorrectly inserted in. The correct entry should be sysfs_io_errors. This patch fixes the problem and now I/O errors of cached device can be read from /sys/block/bcache/bcache/io_errors. Fixes: c7b7bd07404c5 ("bcache: add io_disable to struct cached_dev") Signed-off-by: Coly Li Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index eb678e43ac00..dddb8d4048ce 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -176,7 +176,7 @@ SHOW(__bch_cached_dev) var_print(writeback_percent); sysfs_hprint(writeback_rate, wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0); - sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); + sysfs_printf(io_errors, "%i", atomic_read(&dc->io_errors)); sysfs_printf(io_error_limit, "%i", dc->error_limit); sysfs_printf(io_disable, "%i", dc->io_disable); var_print(writeback_rate_update_seconds); @@ -463,7 +463,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_writeback_rate_p_term_inverse, &sysfs_writeback_rate_minimum, &sysfs_writeback_rate_debug, - &sysfs_errors, + &sysfs_io_errors, &sysfs_io_error_limit, &sysfs_io_disable, &sysfs_dirty_data, From f54d801dda14942dbefa00541d10603015b7859c Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:44 +0800 Subject: [PATCH 109/153] bcache: destroy dc->writeback_write_wq if failed to create dc->writeback_thread Commit 9baf30972b55 ("bcache: fix for gc and write-back race") added a new work queue dc->writeback_write_wq, but forgot to destroy it in the error condition when creating dc->writeback_thread failed. This patch destroys dc->writeback_write_wq if kthread_create() returns error pointer to dc->writeback_thread, then a memory leak is avoided. Fixes: 9baf30972b55 ("bcache: fix for gc and write-back race") Signed-off-by: Coly Li Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 262f7ef20992..21081febcb59 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -833,6 +833,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) "bcache_writeback"); if (IS_ERR(dc->writeback_thread)) { cached_dev_put(dc); + destroy_workqueue(dc->writeback_write_wq); return PTR_ERR(dc->writeback_thread); } dc->writeback_running = true; From 5c2a634cbfaf1971cb6453fe5f86d83585257790 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:45 +0800 Subject: [PATCH 110/153] bcache: stop writeback kthread and kworker when bch_cached_dev_run() failed In bch_cached_dev_attach() after bch_cached_dev_writeback_start() called, the wrireback kthread and writeback rate update kworker of the cached device are created, if the following bch_cached_dev_run() failed, bch_cached_dev_attach() will return with -ENOMEM without stopping the writeback related kthread and kworker. This patch stops writeback kthread and writeback rate update kworker before returning -ENOMEM if bch_cached_dev_run() returns error. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c4c4b2d99dc2..791cb930b353 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1189,6 +1189,14 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, ret = bch_cached_dev_run(dc); if (ret && (ret != -EBUSY)) { up_write(&dc->writeback_lock); + /* + * bch_register_lock is held, bcache_device_stop() is not + * able to be directly called. The kthread and kworker + * created previously in bch_cached_dev_writeback_start() + * have to be stopped manually here. + */ + kthread_stop(dc->writeback_thread); + cancel_writeback_rate_update_dwork(dc); pr_err("Couldn't run cached device %s", dc->backing_dev_name); return ret; From a59ff6ccc2bf2e2934b31bbf734f0bc04b5ec78a Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:46 +0800 Subject: [PATCH 111/153] bcache: avoid a deadlock in bcache_reboot() It is quite frequently to observe deadlock in bcache_reboot() happens and hang the system reboot process. The reason is, in bcache_reboot() when calling bch_cache_set_stop() and bcache_device_stop() the mutex bch_register_lock is held. But in the process to stop cache set and bcache device, bch_register_lock will be acquired again. If this mutex is held here, deadlock will happen inside the stopping process. The aftermath of the deadlock is, whole system reboot gets hung. The fix is to avoid holding bch_register_lock for the following loops in bcache_reboot(), list_for_each_entry_safe(c, tc, &bch_cache_sets, list) bch_cache_set_stop(c); list_for_each_entry_safe(dc, tdc, &uncached_devices, list) bcache_device_stop(&dc->disk); A module range variable 'bcache_is_reboot' is added, it sets to true in bcache_reboot(). In register_bcache(), if bcache_is_reboot is checked to be true, reject the registration by returning -EBUSY immediately. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 40 ++++++++++++++++++++++++++++++++++++++- drivers/md/bcache/sysfs.c | 26 +++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 791cb930b353..a88238ad5da1 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -40,6 +40,7 @@ static const char invalid_uuid[] = { static struct kobject *bcache_kobj; struct mutex bch_register_lock; +bool bcache_is_reboot; LIST_HEAD(bch_cache_sets); static LIST_HEAD(uncached_devices); @@ -49,6 +50,7 @@ static wait_queue_head_t unregister_wait; struct workqueue_struct *bcache_wq; struct workqueue_struct *bch_journal_wq; + #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) /* limitation of partitions number on single bcache device */ #define BCACHE_MINORS 128 @@ -2335,6 +2337,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!try_module_get(THIS_MODULE)) return -EBUSY; + /* For latest state of bcache_is_reboot */ + smp_mb(); + if (bcache_is_reboot) + return -EBUSY; + path = kstrndup(buffer, size, GFP_KERNEL); if (!path) goto err; @@ -2464,6 +2471,9 @@ static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) { + if (bcache_is_reboot) + return NOTIFY_DONE; + if (code == SYS_DOWN || code == SYS_HALT || code == SYS_POWER_OFF) { @@ -2476,19 +2486,45 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) mutex_lock(&bch_register_lock); + if (bcache_is_reboot) + goto out; + + /* New registration is rejected since now */ + bcache_is_reboot = true; + /* + * Make registering caller (if there is) on other CPU + * core know bcache_is_reboot set to true earlier + */ + smp_mb(); + if (list_empty(&bch_cache_sets) && list_empty(&uncached_devices)) goto out; + mutex_unlock(&bch_register_lock); + pr_info("Stopping all devices:"); + /* + * The reason bch_register_lock is not held to call + * bch_cache_set_stop() and bcache_device_stop() is to + * avoid potential deadlock during reboot, because cache + * set or bcache device stopping process will acqurie + * bch_register_lock too. + * + * We are safe here because bcache_is_reboot sets to + * true already, register_bcache() will reject new + * registration now. bcache_is_reboot also makes sure + * bcache_reboot() won't be re-entered on by other thread, + * so there is no race in following list iteration by + * list_for_each_entry_safe(). + */ list_for_each_entry_safe(c, tc, &bch_cache_sets, list) bch_cache_set_stop(c); list_for_each_entry_safe(dc, tdc, &uncached_devices, list) bcache_device_stop(&dc->disk); - mutex_unlock(&bch_register_lock); /* * Give an early chance for other kthreads and @@ -2616,6 +2652,8 @@ static int __init bcache_init(void) bch_debug_init(); closure_debug_init(); + bcache_is_reboot = false; + return 0; err: bcache_exit(); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index dddb8d4048ce..d62e28643109 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -16,6 +16,8 @@ #include #include +extern bool bcache_is_reboot; + /* Default is 0 ("writethrough") */ static const char * const bch_cache_modes[] = { "writethrough", @@ -267,6 +269,10 @@ STORE(__cached_dev) struct cache_set *c; struct kobj_uevent_env *env; + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + #define d_strtoul(var) sysfs_strtoul(var, dc->var) #define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX) #define d_strtoi_h(var) sysfs_hatoi(var, dc->var) @@ -407,6 +413,10 @@ STORE(bch_cached_dev) struct cached_dev *dc = container_of(kobj, struct cached_dev, disk.kobj); + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + mutex_lock(&bch_register_lock); size = __cached_dev_store(kobj, attr, buf, size); @@ -510,6 +520,10 @@ STORE(__bch_flash_dev) kobj); struct uuid_entry *u = &d->c->uuids[d->id]; + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + sysfs_strtoul(data_csum, d->data_csum); if (attr == &sysfs_size) { @@ -745,6 +759,10 @@ STORE(__bch_cache_set) struct cache_set *c = container_of(kobj, struct cache_set, kobj); ssize_t v; + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + if (attr == &sysfs_unregister) bch_cache_set_unregister(c); @@ -864,6 +882,10 @@ STORE(bch_cache_set_internal) { struct cache_set *c = container_of(kobj, struct cache_set, internal); + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + return bch_cache_set_store(&c->kobj, attr, buf, size); } @@ -1049,6 +1071,10 @@ STORE(__bch_cache) struct cache *ca = container_of(kobj, struct cache, kobj); ssize_t v; + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + if (attr == &sysfs_discard) { bool v = strtoul_or_return(buf); From 97ba3b816e2cdea798398bc8486125f79f2453c1 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:47 +0800 Subject: [PATCH 112/153] bcache: acquire bch_register_lock later in cached_dev_detach_finish() Now there is variable bcache_is_reboot to prevent device register or unregister during reboot, it is unncessary to still hold mutex lock bch_register_lock before stopping writeback_rate_update kworker and writeback kthread. And if the stopping kworker or kthread holding bch_register_lock inside their routine (we used to have such problem in writeback thread, thanks to Junhui Wang fixed it), it is very easy to introduce deadlock during reboot/shutdown procedure. Therefore in this patch, the location to acquire bch_register_lock is moved to the location before calling calc_cached_dev_sectors(). Which is later then original location in cached_dev_detach_finish(). Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a88238ad5da1..40d857e690f9 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1018,7 +1018,6 @@ static void cached_dev_detach_finish(struct work_struct *w) BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); BUG_ON(refcount_read(&dc->count)); - mutex_lock(&bch_register_lock); if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) cancel_writeback_rate_update_dwork(dc); @@ -1034,6 +1033,8 @@ static void cached_dev_detach_finish(struct work_struct *w) bch_write_bdev_super(dc, &cl); closure_sync(&cl); + mutex_lock(&bch_register_lock); + calc_cached_dev_sectors(dc->disk.c); bcache_device_detach(&dc->disk); list_move(&dc->list, &uncached_devices); From 80265d8dfd77792e133793cef44a21323aac2908 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:48 +0800 Subject: [PATCH 113/153] bcache: acquire bch_register_lock later in cached_dev_free() When enable lockdep engine, a lockdep warning can be observed when reboot or shutdown system, [ 3142.764557][ T1] bcache: bcache_reboot() Stopping all devices: [ 3142.776265][ T2649] [ 3142.777159][ T2649] ====================================================== [ 3142.780039][ T2649] WARNING: possible circular locking dependency detected [ 3142.782869][ T2649] 5.2.0-rc4-lp151.20-default+ #1 Tainted: G W [ 3142.785684][ T2649] ------------------------------------------------------ [ 3142.788479][ T2649] kworker/3:67/2649 is trying to acquire lock: [ 3142.790738][ T2649] 00000000aaf02291 ((wq_completion)bcache_writeback_wq){+.+.}, at: flush_workqueue+0x87/0x4c0 [ 3142.794678][ T2649] [ 3142.794678][ T2649] but task is already holding lock: [ 3142.797402][ T2649] 000000004fcf89c5 (&bch_register_lock){+.+.}, at: cached_dev_free+0x17/0x120 [bcache] [ 3142.801462][ T2649] [ 3142.801462][ T2649] which lock already depends on the new lock. [ 3142.801462][ T2649] [ 3142.805277][ T2649] [ 3142.805277][ T2649] the existing dependency chain (in reverse order) is: [ 3142.808902][ T2649] [ 3142.808902][ T2649] -> #2 (&bch_register_lock){+.+.}: [ 3142.812396][ T2649] __mutex_lock+0x7a/0x9d0 [ 3142.814184][ T2649] cached_dev_free+0x17/0x120 [bcache] [ 3142.816415][ T2649] process_one_work+0x2a4/0x640 [ 3142.818413][ T2649] worker_thread+0x39/0x3f0 [ 3142.820276][ T2649] kthread+0x125/0x140 [ 3142.822061][ T2649] ret_from_fork+0x3a/0x50 [ 3142.823965][ T2649] [ 3142.823965][ T2649] -> #1 ((work_completion)(&cl->work)#2){+.+.}: [ 3142.827244][ T2649] process_one_work+0x277/0x640 [ 3142.829160][ T2649] worker_thread+0x39/0x3f0 [ 3142.830958][ T2649] kthread+0x125/0x140 [ 3142.832674][ T2649] ret_from_fork+0x3a/0x50 [ 3142.834915][ T2649] [ 3142.834915][ T2649] -> #0 ((wq_completion)bcache_writeback_wq){+.+.}: [ 3142.838121][ T2649] lock_acquire+0xb4/0x1c0 [ 3142.840025][ T2649] flush_workqueue+0xae/0x4c0 [ 3142.842035][ T2649] drain_workqueue+0xa9/0x180 [ 3142.844042][ T2649] destroy_workqueue+0x17/0x250 [ 3142.846142][ T2649] cached_dev_free+0x52/0x120 [bcache] [ 3142.848530][ T2649] process_one_work+0x2a4/0x640 [ 3142.850663][ T2649] worker_thread+0x39/0x3f0 [ 3142.852464][ T2649] kthread+0x125/0x140 [ 3142.854106][ T2649] ret_from_fork+0x3a/0x50 [ 3142.855880][ T2649] [ 3142.855880][ T2649] other info that might help us debug this: [ 3142.855880][ T2649] [ 3142.859663][ T2649] Chain exists of: [ 3142.859663][ T2649] (wq_completion)bcache_writeback_wq --> (work_completion)(&cl->work)#2 --> &bch_register_lock [ 3142.859663][ T2649] [ 3142.865424][ T2649] Possible unsafe locking scenario: [ 3142.865424][ T2649] [ 3142.868022][ T2649] CPU0 CPU1 [ 3142.869885][ T2649] ---- ---- [ 3142.871751][ T2649] lock(&bch_register_lock); [ 3142.873379][ T2649] lock((work_completion)(&cl->work)#2); [ 3142.876399][ T2649] lock(&bch_register_lock); [ 3142.879727][ T2649] lock((wq_completion)bcache_writeback_wq); [ 3142.882064][ T2649] [ 3142.882064][ T2649] *** DEADLOCK *** [ 3142.882064][ T2649] [ 3142.885060][ T2649] 3 locks held by kworker/3:67/2649: [ 3142.887245][ T2649] #0: 00000000e774cdd0 ((wq_completion)events){+.+.}, at: process_one_work+0x21e/0x640 [ 3142.890815][ T2649] #1: 00000000f7df89da ((work_completion)(&cl->work)#2){+.+.}, at: process_one_work+0x21e/0x640 [ 3142.894884][ T2649] #2: 000000004fcf89c5 (&bch_register_lock){+.+.}, at: cached_dev_free+0x17/0x120 [bcache] [ 3142.898797][ T2649] [ 3142.898797][ T2649] stack backtrace: [ 3142.900961][ T2649] CPU: 3 PID: 2649 Comm: kworker/3:67 Tainted: G W 5.2.0-rc4-lp151.20-default+ #1 [ 3142.904789][ T2649] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/13/2018 [ 3142.909168][ T2649] Workqueue: events cached_dev_free [bcache] [ 3142.911422][ T2649] Call Trace: [ 3142.912656][ T2649] dump_stack+0x85/0xcb [ 3142.914181][ T2649] print_circular_bug+0x19a/0x1f0 [ 3142.916193][ T2649] __lock_acquire+0x16cd/0x1850 [ 3142.917936][ T2649] ? __lock_acquire+0x6a8/0x1850 [ 3142.919704][ T2649] ? lock_acquire+0xb4/0x1c0 [ 3142.921335][ T2649] ? find_held_lock+0x34/0xa0 [ 3142.923052][ T2649] lock_acquire+0xb4/0x1c0 [ 3142.924635][ T2649] ? flush_workqueue+0x87/0x4c0 [ 3142.926375][ T2649] flush_workqueue+0xae/0x4c0 [ 3142.928047][ T2649] ? flush_workqueue+0x87/0x4c0 [ 3142.929824][ T2649] ? drain_workqueue+0xa9/0x180 [ 3142.931686][ T2649] drain_workqueue+0xa9/0x180 [ 3142.933534][ T2649] destroy_workqueue+0x17/0x250 [ 3142.935787][ T2649] cached_dev_free+0x52/0x120 [bcache] [ 3142.937795][ T2649] process_one_work+0x2a4/0x640 [ 3142.939803][ T2649] worker_thread+0x39/0x3f0 [ 3142.941487][ T2649] ? process_one_work+0x640/0x640 [ 3142.943389][ T2649] kthread+0x125/0x140 [ 3142.944894][ T2649] ? kthread_create_worker_on_cpu+0x70/0x70 [ 3142.947744][ T2649] ret_from_fork+0x3a/0x50 [ 3142.970358][ T2649] bcache: bcache_device_free() bcache0 stopped Here is how the deadlock happens. 1) bcache_reboot() calls bcache_device_stop(), then inside bcache_device_stop() BCACHE_DEV_CLOSING bit is set on d->flags. Then closure_queue(&d->cl) is called to invoke cached_dev_flush(). 2) In cached_dev_flush(), cached_dev_free() is called by continu_at(). 3) In cached_dev_free(), when stopping the writeback kthread of the cached device by kthread_stop(), dc->writeback_thread will be waken up to quite the kthread while-loop, then cached_dev_put() is called in bch_writeback_thread(). 4) Calling cached_dev_put() in writeback kthread may drop dc->count to 0, then dc->detach kworker is scheduled, which is initialized as cached_dev_detach_finish(). 5) Inside cached_dev_detach_finish(), the last line of code is to call closure_put(&dc->disk.cl), which drops the last reference counter of closrure dc->disk.cl, then the callback cached_dev_flush() gets called. Now cached_dev_flush() is called for second time in the code path, the first time is in step 2). And again bch_register_lock will be acquired again, and a A-A lock (lockdep terminology) is happening. The root cause of the above A-A lock is in cached_dev_free(), mutex bch_register_lock is held before stopping writeback kthread and other kworkers. Fortunately now we have variable 'bcache_is_reboot', which may prevent device registration or unregistration during reboot/shutdown time, so it is unncessary to hold bch_register_lock such early now. This is how this patch fixes the reboot/shutdown time A-A lock issue: After moving mutex_lock(&bch_register_lock) to a later location where before atomic_read(&dc->running) in cached_dev_free(), such A-A lock problem can be solved without any reboot time registration race. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 40d857e690f9..8a12a8313367 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1231,8 +1231,6 @@ static void cached_dev_free(struct closure *cl) { struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); - mutex_lock(&bch_register_lock); - if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) cancel_writeback_rate_update_dwork(dc); @@ -1243,6 +1241,8 @@ static void cached_dev_free(struct closure *cl) if (!IS_ERR_OR_NULL(dc->status_update_thread)) kthread_stop(dc->status_update_thread); + mutex_lock(&bch_register_lock); + if (atomic_read(&dc->running)) bd_unlink_disk_holder(dc->bdev, dc->disk.disk); bcache_device_free(&dc->disk); From 7e865eba00a3df2dc8c4746173a8ca1c1c7f042e Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:49 +0800 Subject: [PATCH 114/153] bcache: fix potential deadlock in cached_def_free() When enable lockdep and reboot system with a writeback mode bcache device, the following potential deadlock warning is reported by lockdep engine. [ 101.536569][ T401] kworker/2:2/401 is trying to acquire lock: [ 101.538575][ T401] 00000000bbf6e6c7 ((wq_completion)bcache_writeback_wq){+.+.}, at: flush_workqueue+0x87/0x4c0 [ 101.542054][ T401] [ 101.542054][ T401] but task is already holding lock: [ 101.544587][ T401] 00000000f5f305b3 ((work_completion)(&cl->work)#2){+.+.}, at: process_one_work+0x21e/0x640 [ 101.548386][ T401] [ 101.548386][ T401] which lock already depends on the new lock. [ 101.548386][ T401] [ 101.551874][ T401] [ 101.551874][ T401] the existing dependency chain (in reverse order) is: [ 101.555000][ T401] [ 101.555000][ T401] -> #1 ((work_completion)(&cl->work)#2){+.+.}: [ 101.557860][ T401] process_one_work+0x277/0x640 [ 101.559661][ T401] worker_thread+0x39/0x3f0 [ 101.561340][ T401] kthread+0x125/0x140 [ 101.562963][ T401] ret_from_fork+0x3a/0x50 [ 101.564718][ T401] [ 101.564718][ T401] -> #0 ((wq_completion)bcache_writeback_wq){+.+.}: [ 101.567701][ T401] lock_acquire+0xb4/0x1c0 [ 101.569651][ T401] flush_workqueue+0xae/0x4c0 [ 101.571494][ T401] drain_workqueue+0xa9/0x180 [ 101.573234][ T401] destroy_workqueue+0x17/0x250 [ 101.575109][ T401] cached_dev_free+0x44/0x120 [bcache] [ 101.577304][ T401] process_one_work+0x2a4/0x640 [ 101.579357][ T401] worker_thread+0x39/0x3f0 [ 101.581055][ T401] kthread+0x125/0x140 [ 101.582709][ T401] ret_from_fork+0x3a/0x50 [ 101.584592][ T401] [ 101.584592][ T401] other info that might help us debug this: [ 101.584592][ T401] [ 101.588355][ T401] Possible unsafe locking scenario: [ 101.588355][ T401] [ 101.590974][ T401] CPU0 CPU1 [ 101.592889][ T401] ---- ---- [ 101.594743][ T401] lock((work_completion)(&cl->work)#2); [ 101.596785][ T401] lock((wq_completion)bcache_writeback_wq); [ 101.600072][ T401] lock((work_completion)(&cl->work)#2); [ 101.602971][ T401] lock((wq_completion)bcache_writeback_wq); [ 101.605255][ T401] [ 101.605255][ T401] *** DEADLOCK *** [ 101.605255][ T401] [ 101.608310][ T401] 2 locks held by kworker/2:2/401: [ 101.610208][ T401] #0: 00000000cf2c7d17 ((wq_completion)events){+.+.}, at: process_one_work+0x21e/0x640 [ 101.613709][ T401] #1: 00000000f5f305b3 ((work_completion)(&cl->work)#2){+.+.}, at: process_one_work+0x21e/0x640 [ 101.617480][ T401] [ 101.617480][ T401] stack backtrace: [ 101.619539][ T401] CPU: 2 PID: 401 Comm: kworker/2:2 Tainted: G W 5.2.0-rc4-lp151.20-default+ #1 [ 101.623225][ T401] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/13/2018 [ 101.627210][ T401] Workqueue: events cached_dev_free [bcache] [ 101.629239][ T401] Call Trace: [ 101.630360][ T401] dump_stack+0x85/0xcb [ 101.631777][ T401] print_circular_bug+0x19a/0x1f0 [ 101.633485][ T401] __lock_acquire+0x16cd/0x1850 [ 101.635184][ T401] ? __lock_acquire+0x6a8/0x1850 [ 101.636863][ T401] ? lock_acquire+0xb4/0x1c0 [ 101.638421][ T401] ? find_held_lock+0x34/0xa0 [ 101.640015][ T401] lock_acquire+0xb4/0x1c0 [ 101.641513][ T401] ? flush_workqueue+0x87/0x4c0 [ 101.643248][ T401] flush_workqueue+0xae/0x4c0 [ 101.644832][ T401] ? flush_workqueue+0x87/0x4c0 [ 101.646476][ T401] ? drain_workqueue+0xa9/0x180 [ 101.648303][ T401] drain_workqueue+0xa9/0x180 [ 101.649867][ T401] destroy_workqueue+0x17/0x250 [ 101.651503][ T401] cached_dev_free+0x44/0x120 [bcache] [ 101.653328][ T401] process_one_work+0x2a4/0x640 [ 101.655029][ T401] worker_thread+0x39/0x3f0 [ 101.656693][ T401] ? process_one_work+0x640/0x640 [ 101.658501][ T401] kthread+0x125/0x140 [ 101.660012][ T401] ? kthread_create_worker_on_cpu+0x70/0x70 [ 101.661985][ T401] ret_from_fork+0x3a/0x50 [ 101.691318][ T401] bcache: bcache_device_free() bcache0 stopped Here is how the above potential deadlock may happen in reboot/shutdown code path, 1) bcache_reboot() is called firstly in the reboot/shutdown code path, then in bcache_reboot(), bcache_device_stop() is called. 2) bcache_device_stop() sets BCACHE_DEV_CLOSING on d->falgs, then call closure_queue(&d->cl) to invoke cached_dev_flush(). And in turn cached_dev_flush() calls cached_dev_free() via closure_at() 3) In cached_dev_free(), after stopped writebach kthread dc->writeback_thread, the kwork dc->writeback_write_wq is stopping by destroy_workqueue(). 4) Inside destroy_workqueue(), drain_workqueue() is called. Inside drain_workqueue(), flush_workqueue() is called. Then wq->lockdep_map is acquired by lock_map_acquire() in flush_workqueue(). After the lock acquired the rest part of flush_workqueue() just wait for the workqueue to complete. 5) Now we look back at writeback thread routine bch_writeback_thread(), in the main while-loop, write_dirty() is called via continue_at() in read_dirty_submit(), which is called via continue_at() in while-loop level called function read_dirty(). Inside write_dirty() it may be re-called on workqueeu dc->writeback_write_wq via continue_at(). It means when the writeback kthread is stopped in cached_dev_free() there might be still one kworker queued on dc->writeback_write_wq to execute write_dirty() again. 6) Now this kworker is scheduled on dc->writeback_write_wq to run by process_one_work() (which is called by worker_thread()). Before calling the kwork routine, wq->lockdep_map is acquired. 7) But wq->lockdep_map is acquired already in step 4), so a A-A lock (lockdep terminology) scenario happens. Indeed on multiple cores syatem, the above deadlock is very rare to happen, just as the code comments in process_one_work() says, 2263 * AFAICT there is no possible deadlock scenario between the 2264 * flush_work() and complete() primitives (except for single-threaded 2265 * workqueues), so hiding them isn't a problem. But it is still good to fix such lockdep warning, even no one running bcache on single core system. The fix is simple. This patch solves the above potential deadlock by, - Do not destroy workqueue dc->writeback_write_wq in cached_dev_free(). - Flush and destroy dc->writeback_write_wq in writebach kthread routine bch_writeback_thread(), where after quit the thread main while-loop and before cached_dev_put() is called. By this fix, dc->writeback_write_wq will be stopped and destroy before the writeback kthread stopped, so the chance for a A-A locking on wq->lockdep_map is disappeared, such A-A deadlock won't happen any more. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 2 -- drivers/md/bcache/writeback.c | 4 ++++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 8a12a8313367..a8ea4e2086a9 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1236,8 +1236,6 @@ static void cached_dev_free(struct closure *cl) if (!IS_ERR_OR_NULL(dc->writeback_thread)) kthread_stop(dc->writeback_thread); - if (dc->writeback_write_wq) - destroy_workqueue(dc->writeback_write_wq); if (!IS_ERR_OR_NULL(dc->status_update_thread)) kthread_stop(dc->status_update_thread); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 21081febcb59..d60268fe49e1 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -738,6 +738,10 @@ static int bch_writeback_thread(void *arg) } } + if (dc->writeback_write_wq) { + flush_workqueue(dc->writeback_write_wq); + destroy_workqueue(dc->writeback_write_wq); + } cached_dev_put(dc); wait_for_kthread_stop(); From 2464b693148e5d5ca42b6064bb40c1a275ea61cd Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:50 +0800 Subject: [PATCH 115/153] bcache: add code comments for journal_read_bucket() This patch adds more code comments in journal_read_bucket(), this is an effort to make the code to be more understandable. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/journal.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 54f8886b6177..98ee467ec3f7 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -100,6 +100,20 @@ reread: left = ca->sb.bucket_size - offset; blocks = set_blocks(j, block_bytes(ca->set)); + /* + * Nodes in 'list' are in linear increasing order of + * i->j.seq, the node on head has the smallest (oldest) + * journal seq, the node on tail has the biggest + * (latest) journal seq. + */ + + /* + * Check from the oldest jset for last_seq. If + * i->j.seq < j->last_seq, it means the oldest jset + * in list is expired and useless, remove it from + * this list. Otherwise, j is a condidate jset for + * further following checks. + */ while (!list_empty(list)) { i = list_first_entry(list, struct journal_replay, list); @@ -109,13 +123,22 @@ reread: left = ca->sb.bucket_size - offset; kfree(i); } + /* iterate list in reverse order (from latest jset) */ list_for_each_entry_reverse(i, list, list) { if (j->seq == i->j.seq) goto next_set; + /* + * if j->seq is less than any i->j.last_seq + * in list, j is an expired and useless jset. + */ if (j->seq < i->j.last_seq) goto next_set; + /* + * 'where' points to first jset in list which + * is elder then j. + */ if (j->seq > i->j.seq) { where = &i->list; goto add; @@ -129,6 +152,7 @@ reread: left = ca->sb.bucket_size - offset; if (!i) return -ENOMEM; memcpy(&i->j, j, bytes); + /* Add to the location after 'where' points to */ list_add(&i->list, where); ret = 1; From a231f07a5fe30a522b402011c5190cb936641a66 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:51 +0800 Subject: [PATCH 116/153] bcache: set largest seq to ja->seq[bucket_index] in journal_read_bucket() In journal_read_bucket() when setting ja->seq[bucket_index], there might be potential case that a later non-maximum overwrites a better sequence number to ja->seq[bucket_index]. This patch adds a check to make sure that ja->seq[bucket_index] will be only set a new value if it is bigger then current value. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/journal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 98ee467ec3f7..3d321bffddc9 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -156,7 +156,8 @@ reread: left = ca->sb.bucket_size - offset; list_add(&i->list, where); ret = 1; - ja->seq[bucket_index] = j->seq; + if (j->seq > ja->seq[bucket_index]) + ja->seq[bucket_index] = j->seq; next_set: offset += blocks * ca->sb.block_size; len -= blocks * ca->sb.block_size; From 1df3877ff6a4810054237c3259d900ded4468969 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:52 +0800 Subject: [PATCH 117/153] bcache: shrink btree node cache after bch_btree_check() When cache set starts, bch_btree_check() will check all bkeys on cache device by calculating the checksum. This operation will consume a huge number of system memory if there are a lot of data cached. Since bcache uses its own mca cache to maintain all its read-in btree nodes, and only releases the cache space when system memory manage code starts to shrink caches. Then before memory manager code to call the mca cache shrinker callback, bcache mca cache will compete memory resource with user space application, which may have nagive effect to performance of user space workloads (e.g. data base, or I/O service of distributed storage node). This patch tries to call bcache mca shrinker routine to proactively release mca cache memory, to decrease the memory pressure of system and avoid negative effort of the overall system I/O performance. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a8ea4e2086a9..26e374fbf57c 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1880,6 +1880,23 @@ static int run_cache_set(struct cache_set *c) if (bch_btree_check(c)) goto err; + /* + * bch_btree_check() may occupy too much system memory which + * has negative effects to user space application (e.g. data + * base) performance. Shrink the mca cache memory proactively + * here to avoid competing memory with user space workloads.. + */ + if (!c->shrinker_disabled) { + struct shrink_control sc; + + sc.gfp_mask = GFP_KERNEL; + sc.nr_to_scan = c->btree_cache_used * c->btree_pages; + /* first run to clear b->accessed tag */ + c->shrink.scan_objects(&c->shrink, &sc); + /* second run to reap non-accessed nodes */ + c->shrink.scan_objects(&c->shrink, &sc); + } + bch_journal_mark(c, &journal); bch_initial_gc_finish(c); pr_debug("btree_check() done"); From ba82c1ac1667d6efb91a268edb13fc9cdaecec9b Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:53 +0800 Subject: [PATCH 118/153] bcache: Revert "bcache: free heap cache_set->flush_btree in bch_journal_free" This reverts commit 6268dc2c4703aabfb0b35681be709acf4c2826c6. This patch depends on commit c4dc2497d50d ("bcache: fix high CPU occupancy during journal") which is reverted in previous patch. So revert this one too. Fixes: 6268dc2c4703 ("bcache: free heap cache_set->flush_btree in bch_journal_free") Signed-off-by: Coly Li Cc: stable@vger.kernel.org Cc: Shenghui Wang Signed-off-by: Jens Axboe --- drivers/md/bcache/journal.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 3d321bffddc9..11d8c93b88bb 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -884,7 +884,6 @@ void bch_journal_free(struct cache_set *c) free_pages((unsigned long) c->journal.w[1].data, JSET_BITS); free_pages((unsigned long) c->journal.w[0].data, JSET_BITS); free_fifo(&c->journal.pin); - free_heap(&c->flush_btree); } int bch_journal_alloc(struct cache_set *c) From 249a5f6da57c28a903c75d81505d58ec8c10030d Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:54 +0800 Subject: [PATCH 119/153] bcache: Revert "bcache: fix high CPU occupancy during journal" This reverts commit c4dc2497d50d9c6fb16aa0d07b6a14f3b2adb1e0. This patch enlarges a race between normal btree flush code path and flush_btree_write(), which causes deadlock when journal space is exhausted. Reverts this patch makes the race window from 128 btree nodes to only 1 btree nodes. Fixes: c4dc2497d50d ("bcache: fix high CPU occupancy during journal") Signed-off-by: Coly Li Cc: stable@vger.kernel.org Cc: Tang Junhui Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 2 -- drivers/md/bcache/journal.c | 47 ++++++++++++------------------------- drivers/md/bcache/util.h | 2 -- 3 files changed, 15 insertions(+), 36 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 73a97586a2ef..cb268d7c6cea 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -726,8 +726,6 @@ struct cache_set { #define BUCKET_HASH_BITS 12 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; - - DECLARE_HEAP(struct btree *, flush_btree); }; struct bbio { diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 11d8c93b88bb..14a4e2c44de9 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -416,12 +416,6 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) } /* Journalling */ -#define journal_max_cmp(l, r) \ - (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \ - fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) -#define journal_min_cmp(l, r) \ - (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \ - fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) static void btree_flush_write(struct cache_set *c) { @@ -429,35 +423,25 @@ static void btree_flush_write(struct cache_set *c) * Try to find the btree node with that references the oldest journal * entry, best is our current candidate and is locked if non NULL: */ - struct btree *b; - int i; + struct btree *b, *best; + unsigned int i; atomic_long_inc(&c->flush_write); - retry: - spin_lock(&c->journal.lock); - if (heap_empty(&c->flush_btree)) { - for_each_cached_btree(b, c, i) - if (btree_current_write(b)->journal) { - if (!heap_full(&c->flush_btree)) - heap_add(&c->flush_btree, b, - journal_max_cmp); - else if (journal_max_cmp(b, - heap_peek(&c->flush_btree))) { - c->flush_btree.data[0] = b; - heap_sift(&c->flush_btree, 0, - journal_max_cmp); - } + best = NULL; + + for_each_cached_btree(b, c, i) + if (btree_current_write(b)->journal) { + if (!best) + best = b; + else if (journal_pin_cmp(c, + btree_current_write(best)->journal, + btree_current_write(b)->journal)) { + best = b; } + } - for (i = c->flush_btree.used / 2 - 1; i >= 0; --i) - heap_sift(&c->flush_btree, i, journal_min_cmp); - } - - b = NULL; - heap_pop(&c->flush_btree, b, journal_min_cmp); - spin_unlock(&c->journal.lock); - + b = best; if (b) { mutex_lock(&b->write_lock); if (!btree_current_write(b)->journal) { @@ -898,8 +882,7 @@ int bch_journal_alloc(struct cache_set *c) j->w[0].c = c; j->w[1].c = c; - if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) || - !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || + if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) return -ENOMEM; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 1fbced94e4cc..c029f7443190 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -113,8 +113,6 @@ do { \ #define heap_full(h) ((h)->used == (h)->size) -#define heap_empty(h) ((h)->used == 0) - #define DECLARE_FIFO(type, name) \ struct { \ size_t front, back, size, mask; \ From e5ec5f4765ada9c75fb3eee93a6e72f0e50599d5 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:55 +0800 Subject: [PATCH 120/153] bcache: only clear BTREE_NODE_dirty bit when it is set In bch_btree_cache_free() and btree_node_free(), BTREE_NODE_dirty is always set no matter btree node is dirty or not. The code looks like this, if (btree_node_dirty(b)) btree_complete_write(b, btree_current_write(b)); clear_bit(BTREE_NODE_dirty, &b->flags); Indeed if btree_node_dirty(b) returns false, it means BTREE_NODE_dirty bit is cleared, then it is unnecessary to clear the bit again. This patch only clears BTREE_NODE_dirty when btree_node_dirty(b) is true (the bit is set), to save a few CPU cycles. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index cf38a1b031fa..88e5aa3fbb07 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -782,10 +782,10 @@ void bch_btree_cache_free(struct cache_set *c) while (!list_empty(&c->btree_cache)) { b = list_first_entry(&c->btree_cache, struct btree, list); - if (btree_node_dirty(b)) + if (btree_node_dirty(b)) { btree_complete_write(b, btree_current_write(b)); - clear_bit(BTREE_NODE_dirty, &b->flags); - + clear_bit(BTREE_NODE_dirty, &b->flags); + } mca_data_free(b); } @@ -1073,9 +1073,10 @@ static void btree_node_free(struct btree *b) mutex_lock(&b->write_lock); - if (btree_node_dirty(b)) + if (btree_node_dirty(b)) { btree_complete_write(b, btree_current_write(b)); - clear_bit(BTREE_NODE_dirty, &b->flags); + clear_bit(BTREE_NODE_dirty, &b->flags); + } mutex_unlock(&b->write_lock); From 41508bb7d46b74dba631017e5a702a86caf1db8c Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:56 +0800 Subject: [PATCH 121/153] bcache: add comments for mutex_lock(&b->write_lock) When accessing or modifying BTREE_NODE_dirty bit, it is not always necessary to acquire b->write_lock. In bch_btree_cache_free() and mca_reap() acquiring b->write_lock is necessary, and this patch adds comments to explain why mutex_lock(&b->write_lock) is necessary for checking or clearing BTREE_NODE_dirty bit there. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 88e5aa3fbb07..846306c3a887 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -659,6 +659,11 @@ static int mca_reap(struct btree *b, unsigned int min_order, bool flush) up(&b->io_mutex); } + /* + * BTREE_NODE_dirty might be cleared in btree_flush_btree() by + * __bch_btree_node_write(). To avoid an extra flush, acquire + * b->write_lock before checking BTREE_NODE_dirty bit. + */ mutex_lock(&b->write_lock); if (btree_node_dirty(b)) __bch_btree_node_write(b, &cl); @@ -782,6 +787,11 @@ void bch_btree_cache_free(struct cache_set *c) while (!list_empty(&c->btree_cache)) { b = list_first_entry(&c->btree_cache, struct btree, list); + /* + * This function is called by cache_set_free(), no I/O + * request on cache now, it is unnecessary to acquire + * b->write_lock before clearing BTREE_NODE_dirty anymore. + */ if (btree_node_dirty(b)) { btree_complete_write(b, btree_current_write(b)); clear_bit(BTREE_NODE_dirty, &b->flags); From d91ce7574daf48a4567ba62733d43284f5d2a3f4 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:57 +0800 Subject: [PATCH 122/153] bcache: remove retry_flush_write from struct cache_set In struct cache_set, retry_flush_write is added for commit c4dc2497d50d ("bcache: fix high CPU occupancy during journal") which is reverted in previous patch. Now it is useless anymore, and this patch removes it from bcache code. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 1 - drivers/md/bcache/journal.c | 1 - drivers/md/bcache/sysfs.c | 5 ----- 3 files changed, 7 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index cb268d7c6cea..35396248a7d5 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -706,7 +706,6 @@ struct cache_set { atomic_long_t reclaim; atomic_long_t flush_write; - atomic_long_t retry_flush_write; enum { ON_ERROR_UNREGISTER, diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 14a4e2c44de9..1218e3cada3c 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -447,7 +447,6 @@ static void btree_flush_write(struct cache_set *c) if (!btree_current_write(b)->journal) { mutex_unlock(&b->write_lock); /* We raced */ - atomic_long_inc(&c->retry_flush_write); goto retry; } diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index d62e28643109..701a386a954c 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -83,7 +83,6 @@ read_attribute(state); read_attribute(cache_read_races); read_attribute(reclaim); read_attribute(flush_write); -read_attribute(retry_flush_write); read_attribute(writeback_keys_done); read_attribute(writeback_keys_failed); read_attribute(io_errors); @@ -709,9 +708,6 @@ SHOW(__bch_cache_set) sysfs_print(flush_write, atomic_long_read(&c->flush_write)); - sysfs_print(retry_flush_write, - atomic_long_read(&c->retry_flush_write)); - sysfs_print(writeback_keys_done, atomic_long_read(&c->writeback_keys_done)); sysfs_print(writeback_keys_failed, @@ -936,7 +932,6 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_cache_read_races, &sysfs_reclaim, &sysfs_flush_write, - &sysfs_retry_flush_write, &sysfs_writeback_keys_done, &sysfs_writeback_keys_failed, From 50a260e859964002dab162513a10f91ae9d3bcd3 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:58 +0800 Subject: [PATCH 123/153] bcache: fix race in btree_flush_write() There is a race between mca_reap(), btree_node_free() and journal code btree_flush_write(), which results very rare and strange deadlock or panic and are very hard to reproduce. Let me explain how the race happens. In btree_flush_write() one btree node with oldest journal pin is selected, then it is flushed to cache device, the select-and-flush is a two steps operation. Between these two steps, there are something may happen inside the race window, - The selected btree node was reaped by mca_reap() and allocated to other requesters for other btree node. - The slected btree node was selected, flushed and released by mca shrink callback bch_mca_scan(). When btree_flush_write() tries to flush the selected btree node, firstly b->write_lock is held by mutex_lock(). If the race happens and the memory of selected btree node is allocated to other btree node, if that btree node's write_lock is held already, a deadlock very probably happens here. A worse case is the memory of the selected btree node is released, then all references to this btree node (e.g. b->write_lock) will trigger NULL pointer deference panic. This race was introduced in commit cafe56359144 ("bcache: A block layer cache"), and enlarged by commit c4dc2497d50d ("bcache: fix high CPU occupancy during journal"), which selected 128 btree nodes and flushed them one-by-one in a quite long time period. Such race is not easy to reproduce before. On a Lenovo SR650 server with 48 Xeon cores, and configure 1 NVMe SSD as cache device, a MD raid0 device assembled by 3 NVMe SSDs as backing device, this race can be observed around every 10,000 times btree_flush_write() gets called. Both deadlock and kernel panic all happened as aftermath of the race. The idea of the fix is to add a btree flag BTREE_NODE_journal_flush. It is set when selecting btree nodes, and cleared after btree nodes flushed. Then when mca_reap() selects a btree node with this bit set, this btree node will be skipped. Since mca_reap() only reaps btree node without BTREE_NODE_journal_flush flag, such race is avoided. Once corner case should be noticed, that is btree_node_free(). It might be called in some error handling code path. For example the following code piece from btree_split(), 2149 err_free2: 2150 bkey_put(b->c, &n2->key); 2151 btree_node_free(n2); 2152 rw_unlock(true, n2); 2153 err_free1: 2154 bkey_put(b->c, &n1->key); 2155 btree_node_free(n1); 2156 rw_unlock(true, n1); At line 2151 and 2155, the btree node n2 and n1 are released without mac_reap(), so BTREE_NODE_journal_flush also needs to be checked here. If btree_node_free() is called directly in such error handling path, and the selected btree node has BTREE_NODE_journal_flush bit set, just delay for 1 us and retry again. In this case this btree node won't be skipped, just retry until the BTREE_NODE_journal_flush bit cleared, and free the btree node memory. Fixes: cafe56359144 ("bcache: A block layer cache") Signed-off-by: Coly Li Reported-and-tested-by: kbuild test robot Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 28 +++++++++++++++++++++++++++- drivers/md/bcache/btree.h | 2 ++ drivers/md/bcache/journal.c | 7 +++++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 846306c3a887..ba434d9ac720 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -35,7 +35,7 @@ #include #include #include - +#include #include /* @@ -659,12 +659,25 @@ static int mca_reap(struct btree *b, unsigned int min_order, bool flush) up(&b->io_mutex); } +retry: /* * BTREE_NODE_dirty might be cleared in btree_flush_btree() by * __bch_btree_node_write(). To avoid an extra flush, acquire * b->write_lock before checking BTREE_NODE_dirty bit. */ mutex_lock(&b->write_lock); + /* + * If this btree node is selected in btree_flush_write() by journal + * code, delay and retry until the node is flushed by journal code + * and BTREE_NODE_journal_flush bit cleared by btree_flush_write(). + */ + if (btree_node_journal_flush(b)) { + pr_debug("bnode %p is flushing by journal, retry", b); + mutex_unlock(&b->write_lock); + udelay(1); + goto retry; + } + if (btree_node_dirty(b)) __bch_btree_node_write(b, &cl); mutex_unlock(&b->write_lock); @@ -1081,7 +1094,20 @@ static void btree_node_free(struct btree *b) BUG_ON(b == b->c->root); +retry: mutex_lock(&b->write_lock); + /* + * If the btree node is selected and flushing in btree_flush_write(), + * delay and retry until the BTREE_NODE_journal_flush bit cleared, + * then it is safe to free the btree node here. Otherwise this btree + * node will be in race condition. + */ + if (btree_node_journal_flush(b)) { + mutex_unlock(&b->write_lock); + pr_debug("bnode %p journal_flush set, retry", b); + udelay(1); + goto retry; + } if (btree_node_dirty(b)) { btree_complete_write(b, btree_current_write(b)); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index d1c72ef64edf..76cfd121a486 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -158,11 +158,13 @@ enum btree_flags { BTREE_NODE_io_error, BTREE_NODE_dirty, BTREE_NODE_write_idx, + BTREE_NODE_journal_flush, }; BTREE_FLAG(io_error); BTREE_FLAG(dirty); BTREE_FLAG(write_idx); +BTREE_FLAG(journal_flush); static inline struct btree_write *btree_current_write(struct btree *b) { diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 1218e3cada3c..a1e3e1fcea6e 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -430,6 +430,7 @@ static void btree_flush_write(struct cache_set *c) retry: best = NULL; + mutex_lock(&c->bucket_lock); for_each_cached_btree(b, c, i) if (btree_current_write(b)->journal) { if (!best) @@ -442,15 +443,21 @@ static void btree_flush_write(struct cache_set *c) } b = best; + if (b) + set_btree_node_journal_flush(b); + mutex_unlock(&c->bucket_lock); + if (b) { mutex_lock(&b->write_lock); if (!btree_current_write(b)->journal) { + clear_bit(BTREE_NODE_journal_flush, &b->flags); mutex_unlock(&b->write_lock); /* We raced */ goto retry; } __bch_btree_node_write(b, NULL); + clear_bit(BTREE_NODE_journal_flush, &b->flags); mutex_unlock(&b->write_lock); } } From 91be66e1318f67ed5888ca10e10cc8ffdc24f661 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:59 +0800 Subject: [PATCH 124/153] bcache: performance improvement for btree_flush_write() This patch improves performance for btree_flush_write() in following ways, - Use another spinlock journal.flush_write_lock to replace the very hot journal.lock. We don't have to use journal.lock here, selecting candidate btree nodes takes a lot of time, hold journal.lock here will block other jouranling threads and drop the overall I/O performance. - Only select flushing btree node from c->btree_cache list. When the machine has a large system memory, mca cache may have a huge number of cached btree nodes. Iterating all the cached nodes will take a lot of CPU time, and most of the nodes on c->btree_cache_freeable and c->btree_cache_freed lists are cleared and have need to flush. So only travel mca list c->btree_cache to select flushing btree node should be enough for most of the cases. - Don't iterate whole c->btree_cache list, only reversely select first BTREE_FLUSH_NR btree nodes to flush. Iterate all btree nodes from c->btree_cache and select the oldest journal pin btree nodes consumes huge number of CPU cycles if the list is huge (push and pop a node into/out of a heap is expensive). The last several dirty btree nodes on the tail of c->btree_cache list are earlest allocated and cached btree nodes, they are relative to the oldest journal pin btree nodes. Therefore only flushing BTREE_FLUSH_NR btree nodes from tail of c->btree_cache probably includes the oldest journal pin btree nodes. In my testing, the above change decreases 50%+ CPU consumption when journal space is full. Some times IOPS drops to 0 for 5-8 seconds, comparing blocking I/O for 120+ seconds in previous code, this is much better. Maybe there is room to improve in future, but at this momment the fix looks fine and performs well in my testing. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/journal.c | 85 +++++++++++++++++++++++++++---------- drivers/md/bcache/journal.h | 4 ++ 2 files changed, 67 insertions(+), 22 deletions(-) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index a1e3e1fcea6e..8bcd8f1bf8cb 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -419,47 +419,87 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) static void btree_flush_write(struct cache_set *c) { - /* - * Try to find the btree node with that references the oldest journal - * entry, best is our current candidate and is locked if non NULL: - */ - struct btree *b, *best; - unsigned int i; + struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR]; + unsigned int i, n; + + if (c->journal.btree_flushing) + return; + + spin_lock(&c->journal.flush_write_lock); + if (c->journal.btree_flushing) { + spin_unlock(&c->journal.flush_write_lock); + return; + } + c->journal.btree_flushing = true; + spin_unlock(&c->journal.flush_write_lock); atomic_long_inc(&c->flush_write); -retry: - best = NULL; + memset(btree_nodes, 0, sizeof(btree_nodes)); + n = 0; mutex_lock(&c->bucket_lock); - for_each_cached_btree(b, c, i) - if (btree_current_write(b)->journal) { - if (!best) - best = b; - else if (journal_pin_cmp(c, - btree_current_write(best)->journal, - btree_current_write(b)->journal)) { - best = b; - } + list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { + if (btree_node_journal_flush(b)) + pr_err("BUG: flush_write bit should not be set here!"); + + mutex_lock(&b->write_lock); + + if (!btree_node_dirty(b)) { + mutex_unlock(&b->write_lock); + continue; + } + + if (!btree_current_write(b)->journal) { + mutex_unlock(&b->write_lock); + continue; } - b = best; - if (b) set_btree_node_journal_flush(b); + + mutex_unlock(&b->write_lock); + + btree_nodes[n++] = b; + if (n == BTREE_FLUSH_NR) + break; + } mutex_unlock(&c->bucket_lock); - if (b) { + for (i = 0; i < n; i++) { + b = btree_nodes[i]; + if (!b) { + pr_err("BUG: btree_nodes[%d] is NULL", i); + continue; + } + + /* safe to check without holding b->write_lock */ + if (!btree_node_journal_flush(b)) { + pr_err("BUG: bnode %p: journal_flush bit cleaned", b); + continue; + } + mutex_lock(&b->write_lock); if (!btree_current_write(b)->journal) { clear_bit(BTREE_NODE_journal_flush, &b->flags); mutex_unlock(&b->write_lock); - /* We raced */ - goto retry; + pr_debug("bnode %p: written by others", b); + continue; + } + + if (!btree_node_dirty(b)) { + clear_bit(BTREE_NODE_journal_flush, &b->flags); + mutex_unlock(&b->write_lock); + pr_debug("bnode %p: dirty bit cleaned by others", b); + continue; } __bch_btree_node_write(b, NULL); clear_bit(BTREE_NODE_journal_flush, &b->flags); mutex_unlock(&b->write_lock); } + + spin_lock(&c->journal.flush_write_lock); + c->journal.btree_flushing = false; + spin_unlock(&c->journal.flush_write_lock); } #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) @@ -881,6 +921,7 @@ int bch_journal_alloc(struct cache_set *c) struct journal *j = &c->journal; spin_lock_init(&j->lock); + spin_lock_init(&j->flush_write_lock); INIT_DELAYED_WORK(&j->work, journal_write_work); c->journal_delay_ms = 100; diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index 66f0facff84b..f2ea34d5f431 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -103,6 +103,8 @@ struct journal_write { /* Embedded in struct cache_set */ struct journal { spinlock_t lock; + spinlock_t flush_write_lock; + bool btree_flushing; /* used when waiting because the journal was full */ struct closure_waitlist wait; struct closure io; @@ -154,6 +156,8 @@ struct journal_device { struct bio_vec bv[8]; }; +#define BTREE_FLUSH_NR 8 + #define journal_pin_cmp(c, l, r) \ (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) From dff90d58a1c815b87b2603295382c97e78064349 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 20:00:00 +0800 Subject: [PATCH 125/153] bcache: add reclaimed_journal_buckets to struct cache_set Now we have counters for how many times jouranl is reclaimed, how many times cached dirty btree nodes are flushed, but we don't know how many jouranl buckets are really reclaimed. This patch adds reclaimed_journal_buckets into struct cache_set, this is an increasing only counter, to tell how many journal buckets are reclaimed since cache set runs. From all these three counters (reclaim, reclaimed_journal_buckets, flush_write), we can have idea how well current journal space reclaim code works. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/journal.c | 1 + drivers/md/bcache/sysfs.c | 5 +++++ 3 files changed, 7 insertions(+) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 35396248a7d5..013e35a9e317 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -705,6 +705,7 @@ struct cache_set { atomic_long_t writeback_keys_failed; atomic_long_t reclaim; + atomic_long_t reclaimed_journal_buckets; atomic_long_t flush_write; enum { diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 8bcd8f1bf8cb..be2a2a201603 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -614,6 +614,7 @@ static void journal_reclaim(struct cache_set *c) k->ptr[n++] = MAKE_PTR(0, bucket_to_sector(c, ca->sb.d[ja->cur_idx]), ca->sb.nr_this_dev); + atomic_long_inc(&c->reclaimed_journal_buckets); } if (n) { diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 701a386a954c..9f0826712845 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -82,6 +82,7 @@ read_attribute(bset_tree_stats); read_attribute(state); read_attribute(cache_read_races); read_attribute(reclaim); +read_attribute(reclaimed_journal_buckets); read_attribute(flush_write); read_attribute(writeback_keys_done); read_attribute(writeback_keys_failed); @@ -705,6 +706,9 @@ SHOW(__bch_cache_set) sysfs_print(reclaim, atomic_long_read(&c->reclaim)); + sysfs_print(reclaimed_journal_buckets, + atomic_long_read(&c->reclaimed_journal_buckets)); + sysfs_print(flush_write, atomic_long_read(&c->flush_write)); @@ -931,6 +935,7 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_bset_tree_stats, &sysfs_cache_read_races, &sysfs_reclaim, + &sysfs_reclaimed_journal_buckets, &sysfs_flush_write, &sysfs_writeback_keys_done, &sysfs_writeback_keys_failed, From dbc3117d4ca9e17819ac73501e914b8422686750 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 27 Jun 2019 21:44:09 -0700 Subject: [PATCH 126/153] block, bfq: NULL out the bic when it's no longer valid In reboot tests on several devices we were seeing a "use after free" when slub_debug or KASAN was enabled. The kernel complained about: Unable to handle kernel paging request at virtual address 6b6b6c2b ...which is a classic sign of use after free under slub_debug. The stack crawl in kgdb looked like: 0 test_bit (addr=, nr=) 1 bfq_bfqq_busy (bfqq=) 2 bfq_select_queue (bfqd=) 3 __bfq_dispatch_request (hctx=) 4 bfq_dispatch_request (hctx=) 5 0xc056ef00 in blk_mq_do_dispatch_sched (hctx=0xed249440) 6 0xc056f728 in blk_mq_sched_dispatch_requests (hctx=0xed249440) 7 0xc0568d24 in __blk_mq_run_hw_queue (hctx=0xed249440) 8 0xc0568d94 in blk_mq_run_work_fn (work=) 9 0xc024c5c4 in process_one_work (worker=0xec6d4640, work=0xed249480) 10 0xc024cff4 in worker_thread (__worker=0xec6d4640) Digging in kgdb, it could be found that, though bfqq looked fine, bfqq->bic had been freed. Through further digging, I postulated that perhaps it is illegal to access a "bic" (AKA an "icq") after bfq_exit_icq() had been called because the "bic" can be freed at some point in time after this call is made. I confirmed that there certainly were cases where the exact crashing code path would access the "bic" after bfq_exit_icq() had been called. Sspecifically I set the "bfqq->bic" to (void *)0x7 and saw that the bic was 0x7 at the time of the crash. To understand a bit more about why this crash was fairly uncommon (I saw it only once in a few hundred reboots), you can see that much of the time bfq_exit_icq_fbqq() fully frees the bfqq and thus it can't access the ->bic anymore. The only case it doesn't is if bfq_put_queue() sees a reference still held. However, even in the case when bfqq isn't freed, the crash is still rare. Why? I tracked what happened to the "bic" after the exit routine. It doesn't get freed right away. Rather, put_io_context_active() eventually called put_io_context() which queued up freeing on a workqueue. The freeing then actually happened later than that through call_rcu(). Despite all these delays, some extra debugging showed that all the hoops could be jumped through in time and the memory could be freed causing the original crash. Phew! To make a long story short, assuming it truly is illegal to access an icq after the "exit_icq" callback is finished, this patch is needed. Cc: stable@vger.kernel.org Reviewed-by: Paolo Valente Signed-off-by: Douglas Anderson Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 008c93d6b8d7..06c9b00507b6 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4855,6 +4855,7 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) unsigned long flags; spin_lock_irqsave(&bfqd->lock, flags); + bfqq->bic = NULL; bfq_exit_bfqq(bfqd, bfqq); bic_set_bfqq(bic, NULL, is_sync); spin_unlock_irqrestore(&bfqd->lock, flags); From 152c762e92609965b542c31a7627ad05893f70d9 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Fri, 28 Jun 2019 16:29:04 -0700 Subject: [PATCH 127/153] null_blk: fix type mismatch null_handle_cmd() In null_handle_cmd() when device is configured as zoned, variable op is decalred as an int, where it is used to hold values of type REQ_OP_XXX which is of type enum req_opf. Change the type from int to enum req_opf. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- drivers/block/null_blk_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 22303e59a274..99328ded60d1 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -1198,7 +1198,7 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd) if (!cmd->error && dev->zoned) { sector_t sector; unsigned int nr_sectors; - int op; + enum req_opf op; if (dev->queue_mode == NULL_Q_BIO) { op = bio_op(cmd->bio); From 152c7776b9442f2f094da7d81e5a8f345dedb397 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 28 Jun 2019 13:07:42 -0700 Subject: [PATCH 128/153] block, documentation: Fix wbt_lat_usec documentation Fix the spelling of the wbt_lat_usec sysfs attribute. Fixes: 87760e5eef35 ("block: hook up writeback throttling") # v4.10. Reviewed-by: Martin K. Petersen Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- Documentation/block/queue-sysfs.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt index 83b457e24bba..3eaf86806621 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.txt @@ -185,8 +185,8 @@ This is the number of bytes the device can write in a single write-same command. A value of '0' means write-same is not supported by this device. -wb_lat_usec (RW) ----------------- +wbt_lat_usec (RW) +----------------- If the device is registered for writeback throttling, then this file shows the target minimum read latency. If this latency is exceeded in a given window of time (see wb_window_usec), then the writeback throttling will start From 6728ac3396265184abe93f18b32aca329981e5ce Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 28 Jun 2019 13:07:43 -0700 Subject: [PATCH 129/153] block, documentation: Sort queue sysfs attribute names alphabetically Commit f9824952ee1c ("block: update sysfs documentation") # v5.0 broke the alphabetical order of the sysfs attribute names. List queue sysfs attribute names alphabetically. Cc: Damien Le Moal Reviewed-by: Martin K. Petersen Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- Documentation/block/queue-sysfs.txt | 30 ++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt index 3eaf86806621..f6da2efe2105 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.txt @@ -14,6 +14,15 @@ add_random (RW) This file allows to turn off the disk entropy contribution. Default value of this file is '1'(on). +chunk_sectors (RO) +------------------ +This has different meaning depending on the type of the block device. +For a RAID device (dm-raid), chunk_sectors indicates the size in 512B sectors +of the RAID volume stripe segment. For a zoned block device, either host-aware +or host-managed, chunk_sectors indicates the size in 512B sectors of the zones +of the device, with the eventual exception of the last zone of the device which +may be smaller. + dax (RO) -------- This file indicates whether the device supports Direct Access (DAX), @@ -132,6 +141,12 @@ per-block-cgroup request pool. IOW, if there are N block cgroups, each request queue may have up to N request pools, each independently regulated by nr_requests. +nr_zones (RO) +------------- +For zoned block devices (zoned attribute indicating "host-managed" or +"host-aware"), this indicates the total number of zones of the device. +This is always 0 for regular block devices. + optimal_io_size (RO) -------------------- This is the optimal IO size reported by the device. @@ -213,19 +228,4 @@ devices are described in the ZBC (Zoned Block Commands) and ZAC do not support zone commands, they will be treated as regular block devices and zoned will report "none". -nr_zones (RO) -------------- -For zoned block devices (zoned attribute indicating "host-managed" or -"host-aware"), this indicates the total number of zones of the device. -This is always 0 for regular block devices. - -chunk_sectors (RO) ------------------- -This has different meaning depending on the type of the block device. -For a RAID device (dm-raid), chunk_sectors indicates the size in 512B sectors -of the RAID volume stripe segment. For a zoned block device, either host-aware -or host-managed, chunk_sectors indicates the size in 512B sectors of the zones -of the device, with the eventual exception of the last zone of the device which -may be smaller. - Jens Axboe , February 2009 From 0c766e78bda6d4edf40779fc0cd48d0867a04d84 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 28 Jun 2019 13:07:44 -0700 Subject: [PATCH 130/153] block, documentation: Explain the word 'segments' Several block layer users who are not kernel developers do not know that the word 'segment' refers to an element in a DMA scatter/gather list. Make the block layer documentation easier to understand by stating explicitly what the word 'segment' stands for. Reviewed-by: Martin K. Petersen Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- Documentation/block/queue-sysfs.txt | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt index f6da2efe2105..1515dcf3dec4 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.txt @@ -98,8 +98,9 @@ This is the maximum number of kilobytes supported in a single data transfer. max_integrity_segments (RO) --------------------------- -When read, this file shows the max limit of integrity segments as -set by block layer which a hardware controller can handle. +Maximum number of elements in a DMA scatter/gather list with integrity +data that will be submitted by the block layer core to the associated +block driver. max_sectors_kb (RW) ------------------- @@ -109,11 +110,12 @@ size allowed by the hardware. max_segments (RO) ----------------- -Maximum number of segments of the device. +Maximum number of elements in a DMA scatter/gather list that is submitted +to the associated block driver. max_segment_size (RO) --------------------- -Maximum segment size of the device. +Maximum size in bytes of a single element in a DMA scatter/gather list. minimum_io_size (RO) -------------------- From fbbe7c86b483878da4a2ec7b899e0814195942af Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 28 Jun 2019 13:07:45 -0700 Subject: [PATCH 131/153] block, documentation: Document discard_zeroes_data, fua, max_discard_segments and write_zeroes_max_bytes Reviewed-by: Martin K. Petersen Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- Documentation/block/queue-sysfs.txt | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt index 1515dcf3dec4..b40b5b7cebd9 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.txt @@ -52,6 +52,16 @@ large discards are issued, setting this value lower will make Linux issue smaller discards and potentially help reduce latencies induced by large discard operations. +discard_zeroes_data (RO) +------------------------ +Obsolete. Always zero. + +fua (RO) +-------- +Whether or not the block driver supports the FUA flag for write requests. +FUA stands for Force Unit Access. If the FUA flag is set that means that +write requests must bypass the volatile cache of the storage device. + hw_sector_size (RO) ------------------- This is the hardware sector size of the device, in bytes. @@ -92,6 +102,10 @@ logical_block_size (RO) ----------------------- This is the logical block size of the device, in bytes. +max_discard_segments (RO) +------------------------- +The maximum number of DMA scatter/gather entries in a discard request. + max_hw_sectors_kb (RO) ---------------------- This is the maximum number of kilobytes supported in a single data transfer. @@ -218,6 +232,12 @@ blk-throttle makes decision based on the samplings. Lower time means cgroups have more smooth throughput, but higher CPU overhead. This exists only when CONFIG_BLK_DEV_THROTTLING_LOW is enabled. +write_zeroes_max_bytes (RO) +--------------------------- +For block drivers that support REQ_OP_WRITE_ZEROES, the maximum number of +bytes that can be zeroed at once. The value 0 means that REQ_OP_WRITE_ZEROES +is not supported. + zoned (RO) ---------- This indicates if the device is a zoned block device and the zone model of the From 5e4c7cf60ec3cad59703c203de1dfb31ea608e6e Mon Sep 17 00:00:00 2001 From: Revanth Rajashekar Date: Thu, 27 Jun 2019 16:30:02 -0600 Subject: [PATCH 132/153] block: sed-opal: PSID reverttper capability PSID is a 32 character password printed on the drive label, to prove its physical access. This PSID reverttper function is very useful to regain the control over the drive when it is locked and the user can no longer access it because of some failures. However, *all the data on the drive is completely erased*. This method is advisable only when the user is exhausted of all other recovery methods. PSID capabilities are described in: https://trustedcomputinggroup.org/wp-content/uploads/TCG_Storage-Opal_Feature_Set_PSID_v1.00_r1.00.pdf Signed-off-by: Revanth Rajashekar Signed-off-by: Jens Axboe --- block/sed-opal.c | 33 +++++++++++++++++++++++++++++---- include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 1 + 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index a46e8d13e16d..bb8ef7963d11 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1307,6 +1307,7 @@ static int start_generic_opal_session(struct opal_dev *dev, break; case OPAL_ADMIN1_UID: case OPAL_SID_UID: + case OPAL_PSID_UID: add_token_u8(&err, dev, OPAL_STARTNAME); add_token_u8(&err, dev, 0); /* HostChallenge */ add_token_bytestring(&err, dev, key, key_len); @@ -1367,6 +1368,16 @@ static int start_admin1LSP_opal_session(struct opal_dev *dev, void *data) key->key, key->key_len); } +static int start_PSID_opal_session(struct opal_dev *dev, void *data) +{ + const struct opal_key *okey = data; + + return start_generic_opal_session(dev, OPAL_PSID_UID, + OPAL_ADMINSP_UID, + okey->key, + okey->key_len); +} + static int start_auth_opal_session(struct opal_dev *dev, void *data) { struct opal_session_info *session = data; @@ -2030,17 +2041,28 @@ static int opal_add_user_to_lr(struct opal_dev *dev, return ret; } -static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal) +static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal, bool psid) { + /* controller will terminate session */ const struct opal_step revert_steps[] = { { start_SIDASP_opal_session, opal }, - { revert_tper, } /* controller will terminate session */ + { revert_tper, } }; + const struct opal_step psid_revert_steps[] = { + { start_PSID_opal_session, opal }, + { revert_tper, } + }; + int ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); - ret = execute_steps(dev, revert_steps, ARRAY_SIZE(revert_steps)); + if (psid) + ret = execute_steps(dev, psid_revert_steps, + ARRAY_SIZE(psid_revert_steps)); + else + ret = execute_steps(dev, revert_steps, + ARRAY_SIZE(revert_steps)); mutex_unlock(&dev->dev_lock); /* @@ -2280,7 +2302,7 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) ret = opal_activate_user(dev, p); break; case IOC_OPAL_REVERT_TPR: - ret = opal_reverttper(dev, p); + ret = opal_reverttper(dev, p, false); break; case IOC_OPAL_LR_SETUP: ret = opal_setup_locking_range(dev, p); @@ -2297,6 +2319,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_SECURE_ERASE_LR: ret = opal_secure_erase_locking_range(dev, p); break; + case IOC_OPAL_PSID_REVERT_TPR: + ret = opal_reverttper(dev, p, true); + break; default: break; } diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 3e76b6d7d97f..f03bbffd3281 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -39,6 +39,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_ENABLE_DISABLE_MBR: case IOC_OPAL_ERASE_LR: case IOC_OPAL_SECURE_ERASE_LR: + case IOC_OPAL_PSID_REVERT_TPR: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 33e53b80cd1f..7a03e5b4df6e 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -107,5 +107,6 @@ struct opal_mbr_data { #define IOC_OPAL_ENABLE_DISABLE_MBR _IOW('p', 229, struct opal_mbr_data) #define IOC_OPAL_ERASE_LR _IOW('p', 230, struct opal_session_info) #define IOC_OPAL_SECURE_ERASE_LR _IOW('p', 231, struct opal_session_info) +#define IOC_OPAL_PSID_REVERT_TPR _IOW('p', 232, struct opal_key) #endif /* _UAPI_SED_OPAL_H */ From 15ddffcb341392ba56a28a0ff5d19d8f8cde1b80 Mon Sep 17 00:00:00 2001 From: Revanth Rajashekar Date: Thu, 27 Jun 2019 16:31:09 -0600 Subject: [PATCH 133/153] block: sed-opal: "Never True" conditions 'who' an unsigned variable in stucture opal_session_info can never be lesser than zero. Hence, the condition "who < OPAL_ADMIN1" can never be true. Signed-off-by: Revanth Rajashekar Signed-off-by: Jens Axboe --- block/sed-opal.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index bb8ef7963d11..c54019c11e91 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -2114,8 +2114,7 @@ static int opal_lock_unlock(struct opal_dev *dev, { int ret; - if (lk_unlk->session.who < OPAL_ADMIN1 || - lk_unlk->session.who > OPAL_USER9) + if (lk_unlk->session.who > OPAL_USER9) return -EINVAL; mutex_lock(&dev->dev_lock); @@ -2193,9 +2192,7 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) }; int ret; - if (opal_pw->session.who < OPAL_ADMIN1 || - opal_pw->session.who > OPAL_USER9 || - opal_pw->new_user_pw.who < OPAL_ADMIN1 || + if (opal_pw->session.who > OPAL_USER9 || opal_pw->new_user_pw.who > OPAL_USER9) return -EINVAL; From b71e8c13fa57811b64939ba7eb603775fb13c2eb Mon Sep 17 00:00:00 2001 From: Fuqian Huang Date: Fri, 28 Jun 2019 01:35:04 +0800 Subject: [PATCH 134/153] block: mtip32xx: Remove call to memset after dma_alloc_coherent In commit af7ddd8a627c ("Merge tag 'dma-mapping-4.21' of git://git.infradead.org/users/hch/dma-mapping"), dma_alloc_coherent has already zeroed the memory. So memset is not needed. Signed-off-by: Fuqian Huang Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index a14b09ab3a41..964f78cfffa0 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -1577,7 +1577,6 @@ static int exec_drive_command(struct mtip_port *port, u8 *command, ATA_SECT_SIZE * xfer_sz); return -ENOMEM; } - memset(buf, 0, ATA_SECT_SIZE * xfer_sz); } /* Build the FIS. */ @@ -2776,7 +2775,6 @@ static int mtip_dma_alloc(struct driver_data *dd) &port->block1_dma, GFP_KERNEL); if (!port->block1) return -ENOMEM; - memset(port->block1, 0, BLOCK_DMA_ALLOC_SZ); /* Allocate dma memory for command list */ port->command_list = @@ -2789,7 +2787,6 @@ static int mtip_dma_alloc(struct driver_data *dd) port->block1_dma = 0; return -ENOMEM; } - memset(port->command_list, 0, AHCI_CMD_TBL_SZ); /* Setup all pointers into first DMA region */ port->rxfis = port->block1 + AHCI_RX_FIS_OFFSET; @@ -3529,8 +3526,6 @@ static int mtip_init_cmd(struct blk_mq_tag_set *set, struct request *rq, if (!cmd->command) return -ENOMEM; - memset(cmd->command, 0, CMD_DMA_ALLOC_SZ); - sg_init_table(cmd->sg, MTIP_MAX_SG); return 0; } From 5f2ab0c1c896764ef3b2d01d9e40d138c2bfd791 Mon Sep 17 00:00:00 2001 From: Fuqian Huang Date: Fri, 28 Jun 2019 01:35:16 +0800 Subject: [PATCH 135/153] block: skd_main.c: Remove call to memset after dma_alloc_coherent In commit af7ddd8a627c ("Merge tag 'dma-mapping-4.21' of git://git.infradead.org/users/hch/dma-mapping"), dma_alloc_coherent has already zeroed the memory. So memset is not needed. Reviewed-by: Chaitanya Kulkarni Signed-off-by: Fuqian Huang Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 7d3ad6c22ee5..abeae7616f04 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -2696,7 +2696,6 @@ static int skd_cons_skmsg(struct skd_device *skdev) (FIT_QCMD_ALIGN - 1), "not aligned: msg_buf %p mb_dma_address %pad\n", skmsg->msg_buf, &skmsg->mb_dma_address); - memset(skmsg->msg_buf, 0, SKD_N_FITMSG_BYTES); } err_out: From b2d0d99135ad145667765cbd27f148c1a4cd50d1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 15:49:20 +0200 Subject: [PATCH 136/153] block: move the BIO_NO_PAGE_REF check into bio_release_pages Move the BIO_NO_PAGE_REF check into bio_release_pages instead of duplicating it in both callers. Also make the function available outside of bio.c so that we can reuse it in other direct I/O implementations. Reviewed-by: Minwoo Im Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 11 ++++++----- include/linux/bio.h | 1 + 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/block/bio.c b/block/bio.c index bb55b94bb361..b35356c6093b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -845,11 +845,14 @@ static void bio_get_pages(struct bio *bio) get_page(bvec->bv_page); } -static void bio_release_pages(struct bio *bio) +void bio_release_pages(struct bio *bio) { struct bvec_iter_all iter_all; struct bio_vec *bvec; + if (bio_flagged(bio, BIO_NO_PAGE_REF)) + return; + bio_for_each_segment_all(bvec, bio, iter_all) put_page(bvec->bv_page); } @@ -1681,8 +1684,7 @@ static void bio_dirty_fn(struct work_struct *work) next = bio->bi_private; bio_set_pages_dirty(bio); - if (!bio_flagged(bio, BIO_NO_PAGE_REF)) - bio_release_pages(bio); + bio_release_pages(bio); bio_put(bio); } } @@ -1698,8 +1700,7 @@ void bio_check_pages_dirty(struct bio *bio) goto defer; } - if (!bio_flagged(bio, BIO_NO_PAGE_REF)) - bio_release_pages(bio); + bio_release_pages(bio); bio_put(bio); return; defer: diff --git a/include/linux/bio.h b/include/linux/bio.h index 5a8ae56e09ff..6d82b4856282 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -426,6 +426,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page, void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off); int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); +void bio_release_pages(struct bio *bio); struct rq_map_data; extern struct bio *bio_map_user_iov(struct request_queue *, struct iov_iter *, gfp_t); From d241a95f3514a5eb544dfd8d9d141ffd1c89b707 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 15:49:21 +0200 Subject: [PATCH 137/153] block: optionally mark pages dirty in bio_release_pages A lot of callers of bio_release_pages also want to mark the released pages as dirty. Add a mark_dirty parameter to avoid a second relatively expensive bio_for_each_segment_all loop. Reviewed-by: Minwoo Im Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 12 +++++++----- include/linux/bio.h | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/block/bio.c b/block/bio.c index b35356c6093b..8a7b315630ce 100644 --- a/block/bio.c +++ b/block/bio.c @@ -845,7 +845,7 @@ static void bio_get_pages(struct bio *bio) get_page(bvec->bv_page); } -void bio_release_pages(struct bio *bio) +void bio_release_pages(struct bio *bio, bool mark_dirty) { struct bvec_iter_all iter_all; struct bio_vec *bvec; @@ -853,8 +853,11 @@ void bio_release_pages(struct bio *bio) if (bio_flagged(bio, BIO_NO_PAGE_REF)) return; - bio_for_each_segment_all(bvec, bio, iter_all) + bio_for_each_segment_all(bvec, bio, iter_all) { + if (mark_dirty && !PageCompound(bvec->bv_page)) + set_page_dirty_lock(bvec->bv_page); put_page(bvec->bv_page); + } } static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) @@ -1683,8 +1686,7 @@ static void bio_dirty_fn(struct work_struct *work) while ((bio = next) != NULL) { next = bio->bi_private; - bio_set_pages_dirty(bio); - bio_release_pages(bio); + bio_release_pages(bio, true); bio_put(bio); } } @@ -1700,7 +1702,7 @@ void bio_check_pages_dirty(struct bio *bio) goto defer; } - bio_release_pages(bio); + bio_release_pages(bio, false); bio_put(bio); return; defer: diff --git a/include/linux/bio.h b/include/linux/bio.h index 6d82b4856282..2d8c73f0ecaf 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -426,7 +426,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page, void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off); int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); -void bio_release_pages(struct bio *bio); +void bio_release_pages(struct bio *bio, bool mark_dirty); struct rq_map_data; extern struct bio *bio_map_user_iov(struct request_queue *, struct iov_iter *, gfp_t); From 163cc2d3cd87af82b589bc2327285505eeac3842 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 15:49:22 +0200 Subject: [PATCH 138/153] block: use bio_release_pages in bio_unmap_user Use bio_release_pages instead of open coding it. Reviewed-by: Chaitanya Kulkarni Reviewed-by: Minwoo Im Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/block/bio.c b/block/bio.c index 8a7b315630ce..c759f5598513 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1437,24 +1437,6 @@ struct bio *bio_map_user_iov(struct request_queue *q, return ERR_PTR(ret); } -static void __bio_unmap_user(struct bio *bio) -{ - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - /* - * make sure we dirty pages we wrote to - */ - bio_for_each_segment_all(bvec, bio, iter_all) { - if (bio_data_dir(bio) == READ) - set_page_dirty_lock(bvec->bv_page); - - put_page(bvec->bv_page); - } - - bio_put(bio); -} - /** * bio_unmap_user - unmap a bio * @bio: the bio being unmapped @@ -1466,7 +1448,8 @@ static void __bio_unmap_user(struct bio *bio) */ void bio_unmap_user(struct bio *bio) { - __bio_unmap_user(bio); + bio_release_pages(bio, bio_data_dir(bio) == READ); + bio_put(bio); bio_put(bio); } From 506e0798479ed54d48f063547b1b62d33b18d54c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 15:49:23 +0200 Subject: [PATCH 139/153] block: use bio_release_pages in bio_map_user_iov Use bio_release_pages instead of open coding it. Reviewed-by: Minwoo Im Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/block/bio.c b/block/bio.c index c759f5598513..1cbf2a7c245e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1362,8 +1362,6 @@ struct bio *bio_map_user_iov(struct request_queue *q, int j; struct bio *bio; int ret; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; if (!iov_iter_count(iter)) return ERR_PTR(-EINVAL); @@ -1430,9 +1428,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, return bio; out_unmap: - bio_for_each_segment_all(bvec, bio, iter_all) { - put_page(bvec->bv_page); - } + bio_release_pages(bio, false); bio_put(bio); return ERR_PTR(ret); } From 147a60538d91ddd431c970f83527b3365d72eefc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 15:49:24 +0200 Subject: [PATCH 140/153] iomap: use bio_release_pages in iomap_dio_bio_end_io Use bio_release_pages instead of duplicating it. Reviewed-by: Minwoo Im Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/iomap.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/iomap.c b/fs/iomap.c index 23ef63fd1669..3798eaf789d7 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1595,13 +1595,7 @@ static void iomap_dio_bio_end_io(struct bio *bio) if (should_dirty) { bio_check_pages_dirty(bio); } else { - if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { - struct bvec_iter_all iter_all; - struct bio_vec *bvec; - - bio_for_each_segment_all(bvec, bio, iter_all) - put_page(bvec->bv_page); - } + bio_release_pages(bio, false); bio_put(bio); } } From 57dfe3ce10e5cecfb3d073e90c0454f432a86873 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 15:49:25 +0200 Subject: [PATCH 141/153] block_dev: use bio_release_pages in blkdev_bio_end_io Use bio_release_pages instead of duplicating it. Reviewed-by: Minwoo Im Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 749f5984425d..a6572a811880 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -335,13 +335,7 @@ static void blkdev_bio_end_io(struct bio *bio) if (should_dirty) { bio_check_pages_dirty(bio); } else { - if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { - struct bvec_iter_all iter_all; - struct bio_vec *bvec; - - bio_for_each_segment_all(bvec, bio, iter_all) - put_page(bvec->bv_page); - } + bio_release_pages(bio, false); bio_put(bio); } } From 9fec4a21886e67c3337e1762eda9f71a1822cba8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 15:49:26 +0200 Subject: [PATCH 142/153] block_dev: use bio_release_pages in bio_unmap_user Use bio_release_pages instead of duplicating it. Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index a6572a811880..f00b569a9f89 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -203,13 +203,12 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, { struct file *file = iocb->ki_filp; struct block_device *bdev = I_BDEV(bdev_file_inode(file)); - struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs, *bvec; + struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; loff_t pos = iocb->ki_pos; bool should_dirty = false; struct bio bio; ssize_t ret; blk_qc_t qc; - struct bvec_iter_all iter_all; if ((pos | iov_iter_alignment(iter)) & (bdev_logical_block_size(bdev) - 1)) @@ -259,13 +258,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, } __set_current_state(TASK_RUNNING); - bio_for_each_segment_all(bvec, &bio, iter_all) { - if (should_dirty && !PageCompound(bvec->bv_page)) - set_page_dirty_lock(bvec->bv_page); - if (!bio_flagged(&bio, BIO_NO_PAGE_REF)) - put_page(bvec->bv_page); - } - + bio_release_pages(&bio, should_dirty); if (unlikely(bio.bi_status)) ret = blk_status_to_errno(bio.bi_status); From d7c8aa85ed204ccebf0c18f29b941d52b33deae3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 15:49:27 +0200 Subject: [PATCH 143/153] direct-io: use bio_release_pages in dio_bio_complete Use bio_release_pages instead of duplicating it. Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/direct-io.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index ac7fb19b6ade..ae196784f487 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -538,8 +538,8 @@ static struct bio *dio_await_one(struct dio *dio) */ static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio) { - struct bio_vec *bvec; blk_status_t err = bio->bi_status; + bool should_dirty = dio->op == REQ_OP_READ && dio->should_dirty; if (err) { if (err == BLK_STS_AGAIN && (bio->bi_opf & REQ_NOWAIT)) @@ -548,19 +548,10 @@ static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio) dio->io_error = -EIO; } - if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) { + if (dio->is_async && should_dirty) { bio_check_pages_dirty(bio); /* transfers ownership */ } else { - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - - if (dio->op == REQ_OP_READ && !PageCompound(page) && - dio->should_dirty) - set_page_dirty_lock(page); - put_page(page); - } + bio_release_pages(bio, should_dirty); bio_put(bio); } return err; From b620743077e291ae7d0debd21f50413a8c266229 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 15:49:28 +0200 Subject: [PATCH 144/153] block: never take page references for ITER_BVEC If we pass pages through an iov_iter we always already have a reference in the caller. Thus remove the ITER_BVEC_FLAG_NO_REF and don't take reference to pages by default for bvec backed iov_iters. Reviewed-by: Minwoo Im Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 14 +------------- drivers/block/loop.c | 16 ++++------------ fs/io_uring.c | 3 --- include/linux/uio.h | 10 +--------- 4 files changed, 6 insertions(+), 37 deletions(-) diff --git a/block/bio.c b/block/bio.c index 1cbf2a7c245e..5733b9426231 100644 --- a/block/bio.c +++ b/block/bio.c @@ -836,15 +836,6 @@ int bio_add_page(struct bio *bio, struct page *page, } EXPORT_SYMBOL(bio_add_page); -static void bio_get_pages(struct bio *bio) -{ - struct bvec_iter_all iter_all; - struct bio_vec *bvec; - - bio_for_each_segment_all(bvec, bio, iter_all) - get_page(bvec->bv_page); -} - void bio_release_pages(struct bio *bio, bool mark_dirty) { struct bvec_iter_all iter_all; @@ -960,11 +951,8 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) ret = __bio_iov_iter_get_pages(bio, iter); } while (!ret && iov_iter_count(iter) && !bio_full(bio)); - if (iov_iter_bvec_no_ref(iter)) + if (is_bvec) bio_set_flag(bio, BIO_NO_PAGE_REF); - else if (is_bvec) - bio_get_pages(bio); - return bio->bi_vcnt ? 0 : ret; } diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f11b7dc16e9d..44c9985f352a 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -264,20 +264,12 @@ lo_do_transfer(struct loop_device *lo, int cmd, return ret; } -static inline void loop_iov_iter_bvec(struct iov_iter *i, - unsigned int direction, const struct bio_vec *bvec, - unsigned long nr_segs, size_t count) -{ - iov_iter_bvec(i, direction, bvec, nr_segs, count); - i->type |= ITER_BVEC_FLAG_NO_REF; -} - static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos) { struct iov_iter i; ssize_t bw; - loop_iov_iter_bvec(&i, WRITE, bvec, 1, bvec->bv_len); + iov_iter_bvec(&i, WRITE, bvec, 1, bvec->bv_len); file_start_write(file); bw = vfs_iter_write(file, &i, ppos, 0); @@ -355,7 +347,7 @@ static int lo_read_simple(struct loop_device *lo, struct request *rq, ssize_t len; rq_for_each_segment(bvec, rq, iter) { - loop_iov_iter_bvec(&i, READ, &bvec, 1, bvec.bv_len); + iov_iter_bvec(&i, READ, &bvec, 1, bvec.bv_len); len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0); if (len < 0) return len; @@ -396,7 +388,7 @@ static int lo_read_transfer(struct loop_device *lo, struct request *rq, b.bv_offset = 0; b.bv_len = bvec.bv_len; - loop_iov_iter_bvec(&i, READ, &b, 1, b.bv_len); + iov_iter_bvec(&i, READ, &b, 1, b.bv_len); len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0); if (len < 0) { ret = len; @@ -563,7 +555,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, } atomic_set(&cmd->ref, 2); - loop_iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq)); + iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq)); iter.iov_offset = offset; cmd->iocb.ki_pos = pos; diff --git a/fs/io_uring.c b/fs/io_uring.c index 86a2bd721900..eb6ab1507913 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -997,9 +997,6 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw, iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len); if (offset) iov_iter_advance(iter, offset); - - /* don't drop a reference to these pages */ - iter->type |= ITER_BVEC_FLAG_NO_REF; return 0; } diff --git a/include/linux/uio.h b/include/linux/uio.h index 2c90a0842ee8..cea1761c5672 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -19,9 +19,6 @@ struct kvec { }; enum iter_type { - /* set if ITER_BVEC doesn't hold a bv_page ref */ - ITER_BVEC_FLAG_NO_REF = 2, - /* iter types */ ITER_IOVEC = 4, ITER_KVEC = 8, @@ -56,7 +53,7 @@ struct iov_iter { static inline enum iter_type iov_iter_type(const struct iov_iter *i) { - return i->type & ~(READ | WRITE | ITER_BVEC_FLAG_NO_REF); + return i->type & ~(READ | WRITE); } static inline bool iter_is_iovec(const struct iov_iter *i) @@ -89,11 +86,6 @@ static inline unsigned char iov_iter_rw(const struct iov_iter *i) return i->type & (READ | WRITE); } -static inline bool iov_iter_bvec_no_ref(const struct iov_iter *i) -{ - return (i->type & ITER_BVEC_FLAG_NO_REF) != 0; -} - /* * Total number of bytes covered by an iovec. * From c9888443413e4e06013e482fc484dbb9c559c145 Mon Sep 17 00:00:00 2001 From: Jonas Rabenstein Date: Tue, 21 May 2019 22:46:44 +0200 Subject: [PATCH 145/153] block: sed-opal: add ioctl for done-mark of shadow mbr Enable users to mark the shadow mbr as done without completely deactivating the shadow mbr feature. This may be useful on reboots, when the power to the disk is not disconnected in between and the shadow mbr stores the required boot files. Of course, this saves also the (few) commands required to enable the feature if it is already enabled and one only wants to mark the shadow mbr as done. Co-authored-by: David Kozub Signed-off-by: Jonas Rabenstein Signed-off-by: David Kozub Reviewed-by: Christoph Hellwig Reviewed by: Scott Bauer Reviewed-by: Jon Derrick Signed-off-by: Jens Axboe --- block/sed-opal.c | 27 +++++++++++++++++++++++++++ include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 12 ++++++++++++ 3 files changed, 40 insertions(+) diff --git a/block/sed-opal.c b/block/sed-opal.c index c54019c11e91..f94f359dd688 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1989,6 +1989,30 @@ static int opal_enable_disable_shadow_mbr(struct opal_dev *dev, return ret; } +static int opal_set_mbr_done(struct opal_dev *dev, + struct opal_mbr_done *mbr_done) +{ + u8 mbr_done_tf = mbr_done->done_flag == OPAL_MBR_DONE ? + OPAL_TRUE : OPAL_FALSE; + + const struct opal_step mbr_steps[] = { + { start_admin1LSP_opal_session, &mbr_done->key }, + { set_mbr_done, &mbr_done_tf }, + { end_opal_session, } + }; + int ret; + + if (mbr_done->done_flag != OPAL_MBR_DONE && + mbr_done->done_flag != OPAL_MBR_NOT_DONE) + return -EINVAL; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); + mutex_unlock(&dev->dev_lock); + return ret; +} + static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk) { struct opal_suspend_data *suspend; @@ -2310,6 +2334,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_ENABLE_DISABLE_MBR: ret = opal_enable_disable_shadow_mbr(dev, p); break; + case IOC_OPAL_MBR_DONE: + ret = opal_set_mbr_done(dev, p); + break; case IOC_OPAL_ERASE_LR: ret = opal_erase_locking_range(dev, p); break; diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index f03bbffd3281..f834e8a1495f 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -40,6 +40,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_ERASE_LR: case IOC_OPAL_SECURE_ERASE_LR: case IOC_OPAL_PSID_REVERT_TPR: + case IOC_OPAL_MBR_DONE: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 7a03e5b4df6e..5681f55d334b 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -20,6 +20,11 @@ enum opal_mbr { OPAL_MBR_DISABLE = 0x01, }; +enum opal_mbr_done_flag { + OPAL_MBR_NOT_DONE = 0x0, + OPAL_MBR_DONE = 0x01 +}; + enum opal_user { OPAL_ADMIN1 = 0x0, OPAL_USER1 = 0x01, @@ -95,6 +100,12 @@ struct opal_mbr_data { __u8 __align[7]; }; +struct opal_mbr_done { + struct opal_key key; + __u8 done_flag; + __u8 __align[7]; +}; + #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) @@ -108,5 +119,6 @@ struct opal_mbr_data { #define IOC_OPAL_ERASE_LR _IOW('p', 230, struct opal_session_info) #define IOC_OPAL_SECURE_ERASE_LR _IOW('p', 231, struct opal_session_info) #define IOC_OPAL_PSID_REVERT_TPR _IOW('p', 232, struct opal_key) +#define IOC_OPAL_MBR_DONE _IOW('p', 233, struct opal_mbr_done) #endif /* _UAPI_SED_OPAL_H */ From a9b25b4cf2b76d320afc999f881ccb805fecdd84 Mon Sep 17 00:00:00 2001 From: Jonas Rabenstein Date: Tue, 21 May 2019 22:46:45 +0200 Subject: [PATCH 146/153] block: sed-opal: ioctl for writing to shadow mbr Allow modification of the shadow mbr. If the shadow mbr is not marked as done, this data will be presented read only as the device content. Only after marking the shadow mbr as done and unlocking a locking range the actual content is accessible. Co-authored-by: David Kozub Signed-off-by: Jonas Rabenstein Signed-off-by: David Kozub Reviewed-by: Scott Bauer Reviewed-by: Jon Derrick Signed-off-by: Jens Axboe --- block/sed-opal.c | 91 ++++++++++++++++++++++++++++++++++- include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 8 +++ 3 files changed, 98 insertions(+), 2 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index f94f359dd688..b02ef2ff0d75 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -26,6 +26,9 @@ #define IO_BUFFER_LENGTH 2048 #define MAX_TOKS 64 +/* Number of bytes needed by cmd_finalize. */ +#define CMD_FINALIZE_BYTES_NEEDED 7 + struct opal_step { int (*fn)(struct opal_dev *dev, void *data); void *data; @@ -523,12 +526,17 @@ static int opal_discovery0_step(struct opal_dev *dev) return execute_step(dev, &discovery0_step, 0); } +static size_t remaining_size(struct opal_dev *cmd) +{ + return IO_BUFFER_LENGTH - cmd->pos; +} + static bool can_add(int *err, struct opal_dev *cmd, size_t len) { if (*err) return false; - if (len > IO_BUFFER_LENGTH || cmd->pos > IO_BUFFER_LENGTH - len) { + if (remaining_size(cmd) < len) { pr_debug("Error adding %zu bytes: end of buffer.\n", len); *err = -ERANGE; return false; @@ -674,7 +682,11 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn) struct opal_header *hdr; int err = 0; - /* close the parameter list opened from cmd_start */ + /* + * Close the parameter list opened from cmd_start. + * The number of bytes added must be equal to + * CMD_FINALIZE_BYTES_NEEDED. + */ add_token_u8(&err, cmd, OPAL_ENDLIST); add_token_u8(&err, cmd, OPAL_ENDOFDATA); @@ -1536,6 +1548,58 @@ static int set_mbr_enable_disable(struct opal_dev *dev, void *data) return finalize_and_send(dev, parse_and_check_status); } +static int write_shadow_mbr(struct opal_dev *dev, void *data) +{ + struct opal_shadow_mbr *shadow = data; + const u8 __user *src; + u8 *dst; + size_t off = 0; + u64 len; + int err = 0; + + /* do the actual transmission(s) */ + src = (u8 __user *)(uintptr_t)shadow->data; + while (off < shadow->size) { + err = cmd_start(dev, opaluid[OPAL_MBR], opalmethod[OPAL_SET]); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_WHERE); + add_token_u64(&err, dev, shadow->offset + off); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + + /* + * The bytestring header is either 1 or 2 bytes, so assume 2. + * There also needs to be enough space to accommodate the + * trailing OPAL_ENDNAME (1 byte) and tokens added by + * cmd_finalize. + */ + len = min(remaining_size(dev) - (2+1+CMD_FINALIZE_BYTES_NEEDED), + (size_t)(shadow->size - off)); + pr_debug("MBR: write bytes %zu+%llu/%llu\n", + off, len, shadow->size); + + dst = add_bytestring_header(&err, dev, len); + if (!dst) + break; + if (copy_from_user(dst, src + off, len)) + err = -EFAULT; + dev->pos += len; + + add_token_u8(&err, dev, OPAL_ENDNAME); + if (err) + break; + + err = finalize_and_send(dev, parse_and_check_status); + if (err) + break; + + off += len; + } + return err; +} + static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid, struct opal_dev *dev) { @@ -2013,6 +2077,26 @@ static int opal_set_mbr_done(struct opal_dev *dev, return ret; } +static int opal_write_shadow_mbr(struct opal_dev *dev, + struct opal_shadow_mbr *info) +{ + const struct opal_step mbr_steps[] = { + { start_admin1LSP_opal_session, &info->key }, + { write_shadow_mbr, info }, + { end_opal_session, } + }; + int ret; + + if (info->size == 0) + return 0; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); + mutex_unlock(&dev->dev_lock); + return ret; +} + static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk) { struct opal_suspend_data *suspend; @@ -2337,6 +2421,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_MBR_DONE: ret = opal_set_mbr_done(dev, p); break; + case IOC_OPAL_WRITE_SHADOW_MBR: + ret = opal_write_shadow_mbr(dev, p); + break; case IOC_OPAL_ERASE_LR: ret = opal_erase_locking_range(dev, p); break; diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index f834e8a1495f..53c28d750a45 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -41,6 +41,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_SECURE_ERASE_LR: case IOC_OPAL_PSID_REVERT_TPR: case IOC_OPAL_MBR_DONE: + case IOC_OPAL_WRITE_SHADOW_MBR: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 5681f55d334b..c6d035fa1b6c 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -106,6 +106,13 @@ struct opal_mbr_done { __u8 __align[7]; }; +struct opal_shadow_mbr { + struct opal_key key; + const __u64 data; + __u64 offset; + __u64 size; +}; + #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) @@ -120,5 +127,6 @@ struct opal_mbr_done { #define IOC_OPAL_SECURE_ERASE_LR _IOW('p', 231, struct opal_session_info) #define IOC_OPAL_PSID_REVERT_TPR _IOW('p', 232, struct opal_key) #define IOC_OPAL_MBR_DONE _IOW('p', 233, struct opal_mbr_done) +#define IOC_OPAL_WRITE_SHADOW_MBR _IOW('p', 234, struct opal_shadow_mbr) #endif /* _UAPI_SED_OPAL_H */ From ff91064ea37c8323eba31cc3d2e22464f107b50d Mon Sep 17 00:00:00 2001 From: Jonas Rabenstein Date: Tue, 21 May 2019 22:46:46 +0200 Subject: [PATCH 147/153] block: sed-opal: check size of shadow mbr Check whether the shadow mbr does fit in the provided space on the target. Also a proper firmware should handle this case and return an error we may prevent problems or even damage with crappy firmwares. Signed-off-by: Jonas Rabenstein Signed-off-by: David Kozub Reviewed-by: Scott Bauer Reviewed-by: Jon Derrick Signed-off-by: Jens Axboe --- block/opal_proto.h | 16 ++++++++++++++++ block/sed-opal.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/block/opal_proto.h b/block/opal_proto.h index d9a05ad02eb5..466ec7be16ef 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -98,6 +98,7 @@ enum opal_uid { OPAL_ENTERPRISE_BANDMASTER0_UID, OPAL_ENTERPRISE_ERASEMASTER_UID, /* tables */ + OPAL_TABLE_TABLE, OPAL_LOCKINGRANGE_GLOBAL, OPAL_LOCKINGRANGE_ACE_RDLOCKED, OPAL_LOCKINGRANGE_ACE_WRLOCKED, @@ -152,6 +153,21 @@ enum opal_token { OPAL_STARTCOLUMN = 0x03, OPAL_ENDCOLUMN = 0x04, OPAL_VALUES = 0x01, + /* table table */ + OPAL_TABLE_UID = 0x00, + OPAL_TABLE_NAME = 0x01, + OPAL_TABLE_COMMON = 0x02, + OPAL_TABLE_TEMPLATE = 0x03, + OPAL_TABLE_KIND = 0x04, + OPAL_TABLE_COLUMN = 0x05, + OPAL_TABLE_COLUMNS = 0x06, + OPAL_TABLE_ROWS = 0x07, + OPAL_TABLE_ROWS_FREE = 0x08, + OPAL_TABLE_ROW_BYTES = 0x09, + OPAL_TABLE_LASTID = 0x0A, + OPAL_TABLE_MIN = 0x0B, + OPAL_TABLE_MAX = 0x0C, + /* authority table */ OPAL_PIN = 0x03, /* locking tokens */ diff --git a/block/sed-opal.c b/block/sed-opal.c index b02ef2ff0d75..7e1a444a25b2 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -130,6 +130,8 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = { /* tables */ + [OPAL_TABLE_TABLE] + { 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01 }, [OPAL_LOCKINGRANGE_GLOBAL] = { 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 }, [OPAL_LOCKINGRANGE_ACE_RDLOCKED] = @@ -1131,6 +1133,29 @@ static int generic_get_column(struct opal_dev *dev, const u8 *table, return finalize_and_send(dev, parse_and_check_status); } +/* + * see TCG SAS 5.3.2.3 for a description of the available columns + * + * the result is provided in dev->resp->tok[4] + */ +static int generic_get_table_info(struct opal_dev *dev, enum opal_uid table, + u64 column) +{ + u8 uid[OPAL_UID_LENGTH]; + const unsigned int half = OPAL_UID_LENGTH/2; + + /* sed-opal UIDs can be split in two halves: + * first: actual table index + * second: relative index in the table + * so we have to get the first half of the OPAL_TABLE_TABLE and use the + * first part of the target table as relative index into that table + */ + memcpy(uid, opaluid[OPAL_TABLE_TABLE], half); + memcpy(uid+half, opaluid[table], half); + + return generic_get_column(dev, uid, column); +} + static int gen_key(struct opal_dev *dev, void *data) { u8 uid[OPAL_UID_LENGTH]; @@ -1557,6 +1582,20 @@ static int write_shadow_mbr(struct opal_dev *dev, void *data) u64 len; int err = 0; + /* do we fit in the available shadow mbr space? */ + err = generic_get_table_info(dev, OPAL_MBR, OPAL_TABLE_ROWS); + if (err) { + pr_debug("MBR: could not get shadow size\n"); + return err; + } + + len = response_get_u64(&dev->parsed, 4); + if (shadow->size > len || shadow->offset > len - shadow->size) { + pr_debug("MBR: does not fit in shadow (%llu vs. %llu)\n", + shadow->offset + shadow->size, len); + return -ENOSPC; + } + /* do the actual transmission(s) */ src = (u8 __user *)(uintptr_t)shadow->data; while (off < shadow->size) { From 79d08f89bb1b5c2c1ff90d9bb95497ab9e8aa7e0 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 1 Jul 2019 15:14:46 +0800 Subject: [PATCH 148/153] block: fix .bi_size overflow 'bio->bi_iter.bi_size' is 'unsigned int', which at most hold 4G - 1 bytes. Before 07173c3ec276 ("block: enable multipage bvecs"), one bio can include very limited pages, and usually at most 256, so the fs bio size won't be bigger than 1M bytes most of times. Since we support multi-page bvec, in theory one fs bio really can be added > 1M pages, especially in case of hugepage, or big writeback with too many dirty pages. Then there is chance in which .bi_size is overflowed. Fixes this issue by using bio_full() to check if the added segment may overflow .bi_size. Cc: Liu Yiding Cc: kernel test robot Cc: "Darrick J. Wong" Cc: linux-xfs@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: stable@vger.kernel.org Fixes: 07173c3ec276 ("block: enable multipage bvecs") Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/bio.c | 10 +++++----- fs/iomap.c | 2 +- fs/xfs/xfs_aops.c | 2 +- include/linux/bio.h | 18 ++++++++++++++++-- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/block/bio.c b/block/bio.c index 933c1e36643b..29cd6cf4da51 100644 --- a/block/bio.c +++ b/block/bio.c @@ -723,7 +723,7 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio, } } - if (bio_full(bio)) + if (bio_full(bio, len)) return 0; if (bio->bi_vcnt >= queue_max_segments(q)) @@ -797,7 +797,7 @@ void __bio_add_page(struct bio *bio, struct page *page, struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); - WARN_ON_ONCE(bio_full(bio)); + WARN_ON_ONCE(bio_full(bio, len)); bv->bv_page = page; bv->bv_offset = off; @@ -824,7 +824,7 @@ int bio_add_page(struct bio *bio, struct page *page, bool same_page = false; if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { - if (bio_full(bio)) + if (bio_full(bio, len)) return 0; __bio_add_page(bio, page, len, offset); } @@ -909,7 +909,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) if (same_page) put_page(page); } else { - if (WARN_ON_ONCE(bio_full(bio))) + if (WARN_ON_ONCE(bio_full(bio, len))) return -EINVAL; __bio_add_page(bio, page, len, offset); } @@ -953,7 +953,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) ret = __bio_iov_bvec_add_pages(bio, iter); else ret = __bio_iov_iter_get_pages(bio, iter); - } while (!ret && iov_iter_count(iter) && !bio_full(bio)); + } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); if (is_bvec) bio_set_flag(bio, BIO_NO_PAGE_REF); diff --git a/fs/iomap.c b/fs/iomap.c index 4f94788db43b..7a147aa0c4d9 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -333,7 +333,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (iop) atomic_inc(&iop->read_count); - if (!ctx->bio || !is_contig || bio_full(ctx->bio)) { + if (!ctx->bio || !is_contig || bio_full(ctx->bio, plen)) { gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 8da5e6637771..11f703d4a605 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -782,7 +782,7 @@ xfs_add_to_ioend( atomic_inc(&iop->write_count); if (!merged) { - if (bio_full(wpc->ioend->io_bio)) + if (bio_full(wpc->ioend->io_bio, len)) xfs_chain_bio(wpc->ioend, wbc, bdev, sector); bio_add_page(wpc->ioend->io_bio, page, len, poff); } diff --git a/include/linux/bio.h b/include/linux/bio.h index dc630b05e6e5..3cdb84cdc488 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -102,9 +102,23 @@ static inline void *bio_data(struct bio *bio) return NULL; } -static inline bool bio_full(struct bio *bio) +/** + * bio_full - check if the bio is full + * @bio: bio to check + * @len: length of one segment to be added + * + * Return true if @bio is full and one segment with @len bytes can't be + * added to the bio, otherwise return false + */ +static inline bool bio_full(struct bio *bio, unsigned len) { - return bio->bi_vcnt >= bio->bi_max_vecs; + if (bio->bi_vcnt >= bio->bi_max_vecs) + return true; + + if (bio->bi_iter.bi_size > UINT_MAX - len) + return true; + + return false; } static inline bool bio_next_segment(const struct bio *bio, From 417232880c8a646739dbf4666a231505a1917fcb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 23 May 2019 18:39:16 +0300 Subject: [PATCH 149/153] sbitmap: Replace cmpxchg with xchg cmpxchg() with an immediate value could be replaced with less expensive xchg(). The same true if new value don't _depend_ on the old one. In the second block, atomic_cmpxchg() return value isn't checked, so after atomic_cmpxchg() -> atomic_xchg() conversion it could be replaced with atomic_set(). Comparison with atomic_read() in the second chunk was left as an optimisation (if that was the initial intention). Reviewed-by: Omar Sandoval Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- lib/sbitmap.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 54f57cd117c6..969e5400a615 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -26,9 +26,7 @@ static inline bool sbitmap_deferred_clear(struct sbitmap *sb, int index) /* * First get a stable cleared mask, setting the old mask to 0. */ - do { - mask = sb->map[index].cleared; - } while (cmpxchg(&sb->map[index].cleared, mask, 0) != mask); + mask = xchg(&sb->map[index].cleared, 0); /* * Now clear the masked bits in our free word @@ -516,10 +514,8 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) struct sbq_wait_state *ws = &sbq->ws[wake_index]; if (waitqueue_active(&ws->wait)) { - int o = atomic_read(&sbq->wake_index); - - if (wake_index != o) - atomic_cmpxchg(&sbq->wake_index, o, wake_index); + if (wake_index != atomic_read(&sbq->wake_index)) + atomic_set(&sbq->wake_index, wake_index); return ws; } From c05f42206f4de12b6807270fc669b45472f1bdb7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 1 Jul 2019 08:47:29 -0700 Subject: [PATCH 150/153] blk-mq: remove blk_mq_put_ctx() No code that occurs between blk_mq_get_ctx() and blk_mq_put_ctx() depends on preemption being disabled for its correctness. Since removing the CPU preemption calls does not measurably affect performance, simplify the blk-mq code by removing the blk_mq_put_ctx() function and also by not disabling preemption in blk_mq_get_ctx(). Cc: Hannes Reinecke Cc: Omar Sandoval Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 5 +---- block/blk-mq-tag.c | 8 -------- block/blk-mq.c | 16 +++------------- block/blk-mq.h | 7 +------ block/kyber-iosched.c | 1 - 5 files changed, 5 insertions(+), 32 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 956a7aa9a637..c9d183d6c499 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -330,10 +330,8 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, bool ret = false; enum hctx_type type; - if (e && e->type->ops.bio_merge) { - blk_mq_put_ctx(ctx); + if (e && e->type->ops.bio_merge) return e->type->ops.bio_merge(hctx, bio, nr_segs); - } type = hctx->type; if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && @@ -344,7 +342,6 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, spin_unlock(&ctx->lock); } - blk_mq_put_ctx(ctx); return ret; } diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 7513c8eaabee..da19f0bc8876 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -113,7 +113,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) struct sbq_wait_state *ws; DEFINE_SBQ_WAIT(wait); unsigned int tag_offset; - bool drop_ctx; int tag; if (data->flags & BLK_MQ_REQ_RESERVED) { @@ -136,7 +135,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) return BLK_MQ_TAG_FAIL; ws = bt_wait_ptr(bt, data->hctx); - drop_ctx = data->ctx == NULL; do { struct sbitmap_queue *bt_prev; @@ -161,9 +159,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) if (tag != -1) break; - if (data->ctx) - blk_mq_put_ctx(data->ctx); - bt_prev = bt; io_schedule(); @@ -189,9 +184,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) ws = bt_wait_ptr(bt, data->hctx); } while (1); - if (drop_ctx && data->ctx) - blk_mq_put_ctx(data->ctx); - sbitmap_finish_wait(bt, ws, &wait); found_tag: diff --git a/block/blk-mq.c b/block/blk-mq.c index d89383847d09..0cb1b152f320 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -355,13 +355,13 @@ static struct request *blk_mq_get_request(struct request_queue *q, struct elevator_queue *e = q->elevator; struct request *rq; unsigned int tag; - bool put_ctx_on_error = false; + bool clear_ctx_on_error = false; blk_queue_enter_live(q); data->q = q; if (likely(!data->ctx)) { data->ctx = blk_mq_get_ctx(q); - put_ctx_on_error = true; + clear_ctx_on_error = true; } if (likely(!data->hctx)) data->hctx = blk_mq_map_queue(q, data->cmd_flags, @@ -387,10 +387,8 @@ static struct request *blk_mq_get_request(struct request_queue *q, tag = blk_mq_get_tag(data); if (tag == BLK_MQ_TAG_FAIL) { - if (put_ctx_on_error) { - blk_mq_put_ctx(data->ctx); + if (clear_ctx_on_error) data->ctx = NULL; - } blk_queue_exit(q); return NULL; } @@ -427,8 +425,6 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, if (!rq) return ERR_PTR(-EWOULDBLOCK); - blk_mq_put_ctx(alloc_data.ctx); - rq->__data_len = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; @@ -1977,7 +1973,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) plug = current->plug; if (unlikely(is_flush_fua)) { - blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio, nr_segs); /* bypass scheduler for flush rq */ @@ -1991,7 +1986,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) unsigned int request_count = plug->rq_count; struct request *last = NULL; - blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio, nr_segs); if (!request_count) @@ -2025,8 +2019,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_add_rq_to_plug(plug, rq); trace_block_plug(q); - blk_mq_put_ctx(data.ctx); - if (same_queue_rq) { data.hctx = same_queue_rq->mq_hctx; trace_block_unplug(q, 1, true); @@ -2035,11 +2027,9 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) } } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && !data.hctx->dispatch_busy)) { - blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio, nr_segs); blk_mq_try_issue_directly(data.hctx, rq, &cookie); } else { - blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio, nr_segs); blk_mq_sched_insert_request(rq, false, true, true); } diff --git a/block/blk-mq.h b/block/blk-mq.h index 633a5a77ee8b..f4bf5161333e 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -151,12 +151,7 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, */ static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) { - return __blk_mq_get_ctx(q, get_cpu()); -} - -static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx) -{ - put_cpu(); + return __blk_mq_get_ctx(q, raw_smp_processor_id()); } struct blk_mq_alloc_data { diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 3c2602601741..34dcea0ef637 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -575,7 +575,6 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio, spin_lock(&kcq->lock); merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio, nr_segs); spin_unlock(&kcq->lock); - blk_mq_put_ctx(ctx); return merged; } From 970d168de636ddac8221cbd4a11d7678943e7379 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 1 Jul 2019 08:47:30 -0700 Subject: [PATCH 151/153] blk-mq: simplify blk_mq_make_request() Move the blk_mq_bio_to_request() call in front of the if-statement. Cc: Hannes Reinecke Cc: Omar Sandoval Reviewed-by: Minwoo Im Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 0cb1b152f320..e5ef40c603ca 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1971,10 +1971,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) cookie = request_to_qc_t(data.hctx, rq); + blk_mq_bio_to_request(rq, bio, nr_segs); + plug = current->plug; if (unlikely(is_flush_fua)) { - blk_mq_bio_to_request(rq, bio, nr_segs); - /* bypass scheduler for flush rq */ blk_insert_flush(rq); blk_mq_run_hw_queue(data.hctx, true); @@ -1986,8 +1986,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) unsigned int request_count = plug->rq_count; struct request *last = NULL; - blk_mq_bio_to_request(rq, bio, nr_segs); - if (!request_count) trace_block_plug(q); else @@ -2001,8 +1999,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_add_rq_to_plug(plug, rq); } else if (plug && !blk_queue_nomerges(q)) { - blk_mq_bio_to_request(rq, bio, nr_segs); - /* * We do limited plugging. If the bio can be merged, do that. * Otherwise the existing request in the plug list will be @@ -2027,10 +2023,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) } } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && !data.hctx->dispatch_busy)) { - blk_mq_bio_to_request(rq, bio, nr_segs); blk_mq_try_issue_directly(data.hctx, rq, &cookie); } else { - blk_mq_bio_to_request(rq, bio, nr_segs); blk_mq_sched_insert_request(rq, false, true, true); } From d665e12aa713e598a1100a320e5679c3f73823ed Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 Jul 2019 05:24:35 -0700 Subject: [PATCH 152/153] block: nr_phys_segments needs to be zero for REQ_OP_WRITE_ZEROES Fix a regression introduced when removing bi_phys_segments for Write Zeroes requests, which need to have a segment count of zero, as they don't have a payload. Fixes: 14ccb66b3f58 ("block: remove the bi_phys_segments field in struct bio") Reported-by: Jens Axboe Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index ca45eb51c669..57f7990b342d 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -105,7 +105,7 @@ static struct bio *blk_bio_discard_split(struct request_queue *q, static struct bio *blk_bio_write_zeroes_split(struct request_queue *q, struct bio *bio, struct bio_set *bs, unsigned *nsegs) { - *nsegs = 1; + *nsegs = 0; if (!q->limits.max_write_zeroes_sectors) return NULL; From c9b3007feca018d3f7061f5d5a14cb00766ffe9b Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Fri, 5 Jul 2019 17:09:09 -0400 Subject: [PATCH 153/153] blk-iolatency: fix STS_AGAIN handling The iolatency controller is based on rq_qos. It increments on rq_qos_throttle() and decrements on either rq_qos_cleanup() or rq_qos_done_bio(). a3fb01ba5af0 fixes the double accounting issue where blk_mq_make_request() may call both rq_qos_cleanup() and rq_qos_done_bio() on REQ_NO_WAIT. So checking STS_AGAIN prevents the double decrement. The above works upstream as the only way we can get STS_AGAIN is from blk_mq_get_request() failing. The STS_AGAIN handling isn't a real problem as bio_endio() skipping only happens on reserved tag allocation failures which can only be caused by driver bugs and already triggers WARN. However, the fix creates a not so great dependency on how STS_AGAIN can be propagated. Internally, we (Facebook) carry a patch that kills read ahead if a cgroup is io congested or a fatal signal is pending. This combined with chained bios progagate their bi_status to the parent is not already set can can cause the parent bio to not clean up properly even though it was successful. This consequently leaks the inflight counter and can hang all IOs under that blkg. To nip the adverse interaction early, this removes the rq_qos_cleanup() callback in iolatency in favor of cleaning up always on the rq_qos_done_bio() path. Fixes: a3fb01ba5af0 ("blk-iolatency: only account submitted bios") Debugged-by: Tejun Heo Debugged-by: Josef Bacik Signed-off-by: Dennis Zhou Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 51 ++++++++++++------------------------------- 1 file changed, 14 insertions(+), 37 deletions(-) diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index e8859350ab6e..d973c38ee4fd 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -600,10 +600,6 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) if (!blkg || !bio_flagged(bio, BIO_TRACKED)) return; - /* We didn't actually submit this bio, don't account it. */ - if (bio->bi_status == BLK_STS_AGAIN) - return; - iolat = blkg_to_lat(bio->bi_blkg); if (!iolat) return; @@ -622,44 +618,26 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) inflight = atomic_dec_return(&rqw->inflight); WARN_ON_ONCE(inflight < 0); - if (iolat->min_lat_nsec == 0) - goto next; - iolatency_record_time(iolat, &bio->bi_issue, now, - issue_as_root); - window_start = atomic64_read(&iolat->window_start); - if (now > window_start && - (now - window_start) >= iolat->cur_win_nsec) { - if (atomic64_cmpxchg(&iolat->window_start, - window_start, now) == window_start) - iolatency_check_latencies(iolat, now); + /* + * If bi_status is BLK_STS_AGAIN, the bio wasn't actually + * submitted, so do not account for it. + */ + if (iolat->min_lat_nsec && bio->bi_status != BLK_STS_AGAIN) { + iolatency_record_time(iolat, &bio->bi_issue, now, + issue_as_root); + window_start = atomic64_read(&iolat->window_start); + if (now > window_start && + (now - window_start) >= iolat->cur_win_nsec) { + if (atomic64_cmpxchg(&iolat->window_start, + window_start, now) == window_start) + iolatency_check_latencies(iolat, now); + } } -next: wake_up(&rqw->wait); blkg = blkg->parent; } } -static void blkcg_iolatency_cleanup(struct rq_qos *rqos, struct bio *bio) -{ - struct blkcg_gq *blkg; - - blkg = bio->bi_blkg; - while (blkg && blkg->parent) { - struct rq_wait *rqw; - struct iolatency_grp *iolat; - - iolat = blkg_to_lat(blkg); - if (!iolat) - goto next; - - rqw = &iolat->rq_wait; - atomic_dec(&rqw->inflight); - wake_up(&rqw->wait); -next: - blkg = blkg->parent; - } -} - static void blkcg_iolatency_exit(struct rq_qos *rqos) { struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); @@ -671,7 +649,6 @@ static void blkcg_iolatency_exit(struct rq_qos *rqos) static struct rq_qos_ops blkcg_iolatency_ops = { .throttle = blkcg_iolatency_throttle, - .cleanup = blkcg_iolatency_cleanup, .done_bio = blkcg_iolatency_done_bio, .exit = blkcg_iolatency_exit, };