From e3f948cd3283e4fbe5907f1f3967c839912f480e Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Thu, 6 Oct 2016 14:09:16 -0700
Subject: [PATCH 1/8] RAID1: ignore discard error

If a write error occurs, raid1 will try to rewrite the bio in small
chunks. If the rewrite fails, raid1 will record the error in a bad
block. narrow_write_error always uses WRITE for the bio, but it could
actually be a discard. Since a discard bio has no payload, writing the
bio out causes various issues. A discard error isn't fatal, however,
so we can safely ignore it, which is what this patch does.

This issue should have existed since discard support was added, but it
was only exposed by the recent arbitrary bio size feature.

Reported-and-tested-by: Sitsofe Wheeler
Cc: stable@vger.kernel.org (v3.6)
Signed-off-by: Shaohua Li
---
 drivers/md/raid1.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 1961d827dbd1..db536a68b2ee 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -403,11 +403,14 @@ static void raid1_end_write_request(struct bio *bio)
 	struct bio *to_put = NULL;
 	int mirror = find_bio_disk(r1_bio, bio);
 	struct md_rdev *rdev = conf->mirrors[mirror].rdev;
+	bool discard_error;
+
+	discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD;
 
 	/*
 	 * 'one mirror IO has finished' event handler:
 	 */
-	if (bio->bi_error) {
+	if (bio->bi_error && !discard_error) {
 		set_bit(WriteErrorSeen, &rdev->flags);
 		if (!test_and_set_bit(WantReplacement, &rdev->flags))
 			set_bit(MD_RECOVERY_NEEDED, &
@@ -444,7 +447,7 @@ static void raid1_end_write_request(struct bio *bio)
 
 	/* Maybe we can clear some bad blocks. */
 	if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
-			&first_bad, &bad_sectors)) {
+			&first_bad, &bad_sectors) && !discard_error) {
 		r1_bio->bios[mirror] = IO_MADE_GOOD;
 		set_bit(R1BIO_MadeGood, &r1_bio->state);
 	}

From 579ed34f7b751b8add233cba4cf755258dbdd60a Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Thu, 6 Oct 2016 14:13:52 -0700
Subject: [PATCH 2/8] RAID10: ignore discard error

This is the raid10 counterpart of the raid1 fix. If a write error
occurs, raid10 will try to rewrite the bio in small chunks. If the
rewrite fails, raid10 will record the error in a bad block.
narrow_write_error always uses WRITE for the bio, but it could
actually be a discard. Since a discard bio has no payload, writing the
bio out causes various issues. A discard error isn't fatal, however,
so we can safely ignore it, which is what this patch does.

This issue should have existed since discard support was added, but it
was only exposed by the recent arbitrary bio size feature.

Cc: Sitsofe Wheeler
Cc: stable@vger.kernel.org (v3.6)
Signed-off-by: Shaohua Li
---
 drivers/md/raid10.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index be1a9fca3b2d..39fddda2fef2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -447,6 +447,9 @@ static void raid10_end_write_request(struct bio *bio)
 	struct r10conf *conf = r10_bio->mddev->private;
 	int slot, repl;
 	struct md_rdev *rdev = NULL;
+	bool discard_error;
+
+	discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD;
 
 	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
 
@@ -460,7 +463,7 @@ static void raid10_end_write_request(struct bio *bio)
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
-	if (bio->bi_error) {
+	if (bio->bi_error && !discard_error) {
 		if (repl)
 			/* Never record new bad blocks to replacement,
			 * just fail it.
@@ -503,7 +506,7 @@ static void raid10_end_write_request(struct bio *bio)
 	if (is_badblock(rdev,
 			r10_bio->devs[slot].addr,
 			r10_bio->sectors,
-			&first_bad, &bad_sectors)) {
+			&first_bad, &bad_sectors) && !discard_error) {
 		bio_put(bio);
 		if (repl)
 			r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
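To make the check in the two discard patches above concrete outside the
kernel, here is a small self-contained C model of the new logic. Everything
in it (the bio_model type, the counts_as_write_error helper) is a
hypothetical stand-in invented for illustration, not a kernel API:

	#include <stdbool.h>
	#include <stdio.h>

	/* Stand-ins for the kernel's bio fields, just to model the check. */
	enum req_op { REQ_OP_WRITE, REQ_OP_DISCARD };

	struct bio_model {
		int bi_error;      /* non-zero on I/O error */
		enum req_op op;    /* what bio_op(bio) would return */
	};

	/* Mirrors the patch: only non-discard errors count as write errors. */
	static bool counts_as_write_error(const struct bio_model *bio)
	{
		bool discard_error = bio->bi_error && bio->op == REQ_OP_DISCARD;
		return bio->bi_error && !discard_error;
	}

	int main(void)
	{
		struct bio_model failed_write = { .bi_error = -5, .op = REQ_OP_WRITE };
		struct bio_model failed_discard = { .bi_error = -5, .op = REQ_OP_DISCARD };

		printf("failed WRITE enters error path:   %d\n",
		       counts_as_write_error(&failed_write));   /* 1 */
		printf("failed DISCARD enters error path: %d\n",
		       counts_as_write_error(&failed_discard)); /* 0 */
		return 0;
	}

A failed discard is recognized and excluded both from the bad-block
bookkeeping and from the "maybe clear bad blocks" path, while a failed
write still trips the error handler as before.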
From 28cd88e2b4c54a466dcae7eea1efac766d42386b Mon Sep 17 00:00:00 2001
From: Zhengyuan Liu
Date: Mon, 24 Oct 2016 09:55:20 +0800
Subject: [PATCH 3/8] md/raid5: initialize next_checkpoint field before use

This field was not initialized when we load/recover the log; it was
assigned only when I/O to the raid disks finished. So r5l_quiesce may
use a wrong next_checkpoint to reclaim log space, which would confuse
the reclaimable space calculation.

Signed-off-by: Zhengyuan Liu
Signed-off-by: Shaohua Li
---
 drivers/md/raid5-cache.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 1b1ab4a1d132..998ea0025dd0 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1096,6 +1096,8 @@ static int r5l_recovery_log(struct r5l_log *log)
 		log->seq = ctx.seq + 11;
 		log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
 		r5l_write_super(log, ctx.pos);
+		log->last_checkpoint = ctx.pos;
+		log->next_checkpoint = ctx.pos;
 	} else {
 		log->log_start = ctx.pos;
 		log->seq = ctx.seq;
@@ -1168,6 +1170,7 @@ static int r5l_load_log(struct r5l_log *log)
 	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
 		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
 	log->last_checkpoint = cp;
+	log->next_checkpoint = cp;
 
 	__free_page(page);

From 56056c2e7d58ee705755efbe780aefff987a1dc8 Mon Sep 17 00:00:00 2001
From: Zhengyuan Liu
Date: Mon, 24 Oct 2016 16:15:59 +0800
Subject: [PATCH 4/8] md/raid5: write an empty meta-block when creating log super-block

If the superblock points to an invalid meta block, r5l_load_log will
set create_super to true and create a new superblock. This runtime
path is always taken if we have done no write I/O to the array since
it was created. Writing an empty meta block avoids this unnecessary
action the first time we create the log superblock.

Another reason is the correctness of log recovery. Currently we have
the code below to guarantee that log recovery is correct:

	if (ctx.seq > log->last_cp_seq + 1) {
		int ret;

		ret = r5l_log_write_empty_meta_block(log,
						     ctx.pos, ctx.seq + 10);
		if (ret)
			return ret;
		log->seq = ctx.seq + 11;
		log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
		r5l_write_super(log, ctx.pos);
	} else {
		log->log_start = ctx.pos;
		log->seq = ctx.seq;
	}

If we have just created an array with a journal device, log->log_start
and log->last_checkpoint are both 0. Suppose we then write three meta
blocks, all valid except the middle one, and a crash happens. After
recovery, ctx.seq equals log->last_cp_seq + 1 and log->log_start is
set to the position of the middle, invalid meta block. This leads to
problems that this patch avoids.

Signed-off-by: Zhengyuan Liu
Signed-off-by: Shaohua Li
---
 drivers/md/raid5-cache.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 998ea0025dd0..981f85515191 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1156,6 +1156,7 @@ static int r5l_load_log(struct r5l_log *log)
 	if (create_super) {
 		log->last_cp_seq = prandom_u32();
 		cp = 0;
+		r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
 		/*
 		 * Make sure super points to correct address. Log might have
 		 * data very soon. If super hasn't correct log tail address,
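The crash scenario above hinges on the guard quoted in the commit
message. The following is a hypothetical, self-contained C model of
that guard (invented helper names, not kernel code): after the recovery
scan, ctx.seq == last_cp_seq + n, where n is the number of valid meta
blocks recovered before hitting an invalid one. Patch 6/8 later in this
series relaxes the condition so that any recovered block (n >= 1)
triggers the empty-block rewrite:

	#include <stdbool.h>
	#include <stdio.h>

	/* Model of the tail decision in r5l_recovery_log(). */
	static bool old_guard(unsigned long ctx_seq, unsigned long last_cp_seq)
	{
		return ctx_seq > last_cp_seq + 1; /* rewrites only if n >= 2 */
	}

	static bool new_guard(unsigned long ctx_seq, unsigned long last_cp_seq)
	{
		return ctx_seq > last_cp_seq;     /* rewrites if n >= 1 */
	}

	int main(void)
	{
		unsigned long last_cp_seq = 100;
		/* Commit-message scenario: meta 1 valid, meta 2 invalid,
		 * meta 3 valid; the scan stops after one block, so n == 1. */
		unsigned long ctx_seq = last_cp_seq + 1;

		printf("old guard rewrites tail: %d\n",
		       old_guard(ctx_seq, last_cp_seq)); /* 0: log_start left on bad block */
		printf("new guard rewrites tail: %d\n",
		       new_guard(ctx_seq, last_cp_seq)); /* 1 */
		return 0;
	}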
From 16f889499a5214ebe038f8bd00f4c0094ed0ed75 Mon Sep 17 00:00:00 2001
From: Tomasz Majchrzak
Date: Mon, 24 Oct 2016 12:47:28 +0200
Subject: [PATCH 5/8] md: report 'write_pending' state when array in sync

If there is a bad block on a disk and a recovery is performed from
that disk, the same bad block is reported for the new disk. This
involves setting the MD_CHANGE_PENDING flag in rdev_set_badblocks. For
external metadata this flag is never cleared, because the array state
is reported as 'clean'. A read request to the bad block in a RAID5
array then gets stuck waiting for the flag to be cleared - as per
commit c3cce6cda162 ("md/raid5: ensure device failure recorded before
write request returns.").

The meaning of the MD_CHANGE_PENDING and MD_CHANGE_CLEAN flags was
clarified in commit 070dc6dd7103 ("md: resolve confusion of
MD_CHANGE_CLEAN"), however the MD_CHANGE_PENDING flag has since been
used in personality error handlers, and that use doesn't fully comply
with its initial purpose. It was supposed to signal that a write
request is about to start; now it is also used to request a metadata
update. Initially (in md_allow_write, md_write_start) the
MD_CHANGE_PENDING flag was set and in_sync was set to 0 at the same
time. Error handlers just set the flag without modifying in_sync. The
sysfs array state is a single value, so it currently reports 'clean'
when the MD_CHANGE_PENDING flag is set but in_sync is 1, and userspace
has no idea it is expected to take any action.

Swap the order in which the array state is checked, so that
'write_pending' is reported ahead of 'clean' ('write_pending' is a
misleading name, but it is too late to rename it now).

Signed-off-by: Tomasz Majchrzak
Signed-off-by: Shaohua Li
---
 drivers/md/md.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index eac84d8ff724..25b57a55db1a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3887,10 +3887,10 @@ array_state_show(struct mddev *mddev, char *page)
 			st = read_auto;
 			break;
 		case 0:
-			if (mddev->in_sync)
-				st = clean;
-			else if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
+			if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
 				st = write_pending;
+			else if (mddev->in_sync)
+				st = clean;
 			else if (mddev->safemode)
 				st = active_idle;
 			else
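The ordering matters because the sysfs file can expose only one state
at a time. Here is a hypothetical, self-contained C model of the
reordered case-0 checks (the array_state_of helper and its inputs are
invented for illustration, not the kernel function):

	#include <stdbool.h>
	#include <stdio.h>

	/* A pending metadata update now wins over 'clean', so userspace
	 * sees 'write_pending' even while in_sync == 1. */
	static const char *array_state_of(bool change_pending, bool in_sync,
					  bool safemode)
	{
		if (change_pending)
			return "write_pending"; /* checked first after this patch */
		else if (in_sync)
			return "clean";
		else if (safemode)
			return "active-idle";
		else
			return "active";
	}

	int main(void)
	{
		/* An error handler set MD_CHANGE_PENDING without clearing
		 * in_sync: before the patch this was reported as 'clean'. */
		printf("%s\n", array_state_of(true, true, false));  /* write_pending */
		printf("%s\n", array_state_of(false, true, false)); /* clean */
		return 0;
	}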
From 9a8b27fac5bbb77337cc2e5d31d37c9936782d87 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Thu, 27 Oct 2016 15:22:13 -0700
Subject: [PATCH 6/8] raid5-cache: correct condition for empty metadata write

As long as we recover at least one metadata block, we should write out
the empty metadata block. The original code could leave recovery
corrupted if only one meta block is valid.

Reported-by: Zhengyuan Liu
Signed-off-by: Shaohua Li
---
 drivers/md/raid5-cache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 981f85515191..a227a9f3ee65 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1087,7 +1087,7 @@ static int r5l_recovery_log(struct r5l_log *log)
 	 * 1's seq + 10 and let superblock points to meta2. The same recovery will
 	 * not think meta 3 is a valid meta, because its seq doesn't match
 	 */
-	if (ctx.seq > log->last_cp_seq + 1) {
+	if (ctx.seq > log->last_cp_seq) {
 		int ret;
 
 		ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);

From 7449f699b2fb23bdee0a0f03aa4efb5f96fd403f Mon Sep 17 00:00:00 2001
From: Tomasz Majchrzak
Date: Fri, 28 Oct 2016 14:45:58 +0200
Subject: [PATCH 7/8] raid1: handle read error also in readonly mode

If a write is the first operation on a disk and it happens not to be
aligned to page size, the block layer sends a read request first. If
that read fails, the disk is set as failed, because no attempt to fix
the error is made while the array is in auto-readonly mode. Similarly,
the disk is set as failed for a read-only array.

Take the same approach as in raid10: don't fail the disk if the array
is in readonly or auto-readonly mode. Try to redirect the request
first and, if that is unsuccessful, return a read error.

Signed-off-by: Tomasz Majchrzak
Signed-off-by: Shaohua Li
---
 drivers/md/raid1.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index db536a68b2ee..29e2df5cd77b 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2297,17 +2297,23 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 	 * This is all done synchronously while the array is
 	 * frozen
 	 */
+
+	bio = r1_bio->bios[r1_bio->read_disk];
+	bdevname(bio->bi_bdev, b);
+	bio_put(bio);
+	r1_bio->bios[r1_bio->read_disk] = NULL;
+
 	if (mddev->ro == 0) {
 		freeze_array(conf, 1);
 		fix_read_error(conf, r1_bio->read_disk,
 			       r1_bio->sector, r1_bio->sectors);
 		unfreeze_array(conf);
-	} else
-		md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
+	} else {
+		r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
+	}
+
 	rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev);
 
-	bio = r1_bio->bios[r1_bio->read_disk];
-	bdevname(bio->bi_bdev, b);
 read_more:
 	disk = read_balance(conf, r1_bio, &max_sectors);
 	if (disk == -1) {
@@ -2318,11 +2324,6 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 	} else {
 		const unsigned long do_sync
 			= r1_bio->master_bio->bi_opf & REQ_SYNC;
-		if (bio) {
-			r1_bio->bios[r1_bio->read_disk] =
-				mddev->ro ? IO_BLOCKED : NULL;
-			bio_put(bio);
-		}
 		r1_bio->read_disk = disk;
 		bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
 		bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector,
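The policy change in patch 7 can be summarized in a hypothetical,
self-contained C model (all names invented for illustration, not
kernel code): instead of failing the disk via md_error(), the failed
read source is marked blocked and the read is retried on another
mirror; only if no mirror remains does the read fail:

	#include <stdio.h>

	enum disk_state { DISK_OK, DISK_BLOCKED };

	static int redirect_read(enum disk_state mirrors[], int nmirrors,
				 int failed)
	{
		mirrors[failed] = DISK_BLOCKED; /* IO_BLOCKED stand-in; the
						 * disk itself is NOT failed */
		for (int i = 0; i < nmirrors; i++)
			if (mirrors[i] == DISK_OK)
				return i;	/* read_balance() would pick this */
		return -1;			/* no candidate: read error */
	}

	int main(void)
	{
		enum disk_state mirrors[2] = { DISK_OK, DISK_OK };

		printf("redirected to mirror %d\n",
		       redirect_read(mirrors, 2, 0)); /* 1 */
		printf("second failure -> %d (read error)\n",
		       redirect_read(mirrors, 2, 1)); /* -1 */
		return 0;
	}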
From 1217e1d1999ed6c9c1e1b1acae0a74ab70464ae2 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Fri, 28 Oct 2016 15:59:41 +1100
Subject: [PATCH 8/8] md: be careful not to leak internal curr_resync value into metadata

mddev->curr_resync usually records where the current resync is up to,
but during the starting phase it has some "magic" values.

 1 - means that the array is trying to start a resync, but has yielded
     to another array which shares physical devices, and also needs to
     start a resync.
 2 - means the array is trying to start a resync, but has found another
     array which shares physical devices and has already started a
     resync.
 3 - means that a resync has commenced, but it is possible that nothing
     has actually been resynced yet.

It is important that these values not be visible to user-space, and
particularly that they don't get written to the metadata as the resync
or recovery checkpoint. In part, this is because the value may be
slightly higher than the correct value, though this is very rare. In
part, it is because the value is not a multiple of 4K, and some
devices only support 4K-aligned accesses.

There are two places where this value is propagated into either
->curr_resync_completed or ->recovery_cp or ->recovery_offset. These
currently avoid the propagation of values 1 and 2, but will allow 3 to
leak through. Change them to only propagate the value if it is > 3.

As this can cause an array to fail, the patch is suitable for -stable.

Cc: stable@vger.kernel.org (v3.7+)
Reported-by: Viswesh
Signed-off-by: NeilBrown
Signed-off-by: Shaohua Li
---
 drivers/md/md.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 25b57a55db1a..2089d46b0eb8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8144,14 +8144,14 @@ void md_do_sync(struct md_thread *thread)
 
 	if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 	    !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
-	    mddev->curr_resync > 2) {
+	    mddev->curr_resync > 3) {
 		mddev->curr_resync_completed = mddev->curr_resync;
 		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 	}
 	mddev->pers->sync_request(mddev, max_sectors, &skipped);
 
 	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
-	    mddev->curr_resync > 2) {
+	    mddev->curr_resync > 3) {
 		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
 				if (mddev->curr_resync >= mddev->recovery_cp) {
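To see what the tightened guard protects against, here is a
hypothetical, self-contained C model of the checkpoint-propagation
decision (the may_propagate helper and sector_t stand-in are invented
for illustration, not kernel code):

	#include <stdbool.h>
	#include <stdio.h>

	typedef unsigned long long sector_t; /* stand-in for the kernel typedef */

	/* Magic values 1, 2 and 3 must never be treated as a real resync
	 * position, so nothing <= 3 may reach curr_resync_completed,
	 * recovery_cp or recovery_offset. */
	static bool may_propagate(sector_t curr_resync)
	{
		return curr_resync > 3; /* was "> 2", which let 3 leak through */
	}

	int main(void)
	{
		printf("curr_resync = 3 propagated?    %d\n",
		       may_propagate(3));    /* 0 */
		printf("curr_resync = 8192 propagated? %d\n",
		       may_propagate(8192)); /* 1 */
		return 0;
	}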