From 2d5b569b665ea6d0b15c52529ff06300de81a7ce Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 6 Jul 2015 12:49:23 +1000 Subject: [PATCH 01/10] md/raid5: avoid races when changing cache size. Cache size can grow or shrink due to various pressures at any time. So when we resize the cache as part of a 'grow' operation (i.e. change the size to allow more devices) we need to blocks that automatic growing/shrinking. So introduce a mutex. auto grow/shrink uses mutex_trylock() and just doesn't bother if there is a blockage. Resizing the whole cache holds the mutex to ensure that the correct number of new stripes is allocated. This bug can result in some stripes not being freed when an array is stopped. This leads to the kmem_cache not being freed and a subsequent array can try to use the same kmem_cache and get confused. Fixes: edbe83ab4c27 ("md/raid5: allow the stripe_cache to grow and shrink.") Cc: stable@vger.kernel.org (4.1 - please delay until 2 weeks after release of 4.2) Signed-off-by: NeilBrown --- drivers/md/raid5.c | 31 +++++++++++++++++++++++++------ drivers/md/raid5.h | 3 ++- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 59e44e99eef3..38300ee3c4e1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2162,6 +2162,9 @@ static int resize_stripes(struct r5conf *conf, int newsize) if (!sc) return -ENOMEM; + /* Need to ensure auto-resizing doesn't interfere */ + mutex_lock(&conf->cache_size_mutex); + for (i = conf->max_nr_stripes; i; i--) { nsh = alloc_stripe(sc, GFP_KERNEL); if (!nsh) @@ -2178,6 +2181,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) kmem_cache_free(sc, nsh); } kmem_cache_destroy(sc); + mutex_unlock(&conf->cache_size_mutex); return -ENOMEM; } /* Step 2 - Must use GFP_NOIO now. @@ -2224,6 +2228,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) } else err = -ENOMEM; + mutex_unlock(&conf->cache_size_mutex); /* Step 4, return new stripes to service */ while(!list_empty(&newstripes)) { nsh = list_entry(newstripes.next, struct stripe_head, lru); @@ -5857,12 +5862,14 @@ static void raid5d(struct md_thread *thread) pr_debug("%d stripes handled\n", handled); spin_unlock_irq(&conf->device_lock); - if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) { + if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) && + mutex_trylock(&conf->cache_size_mutex)) { grow_one_stripe(conf, __GFP_NOWARN); /* Set flag even if allocation failed. This helps * slow down allocation requests when mem is short */ set_bit(R5_DID_ALLOC, &conf->cache_state); + mutex_unlock(&conf->cache_size_mutex); } async_tx_issue_pending_all(); @@ -5894,18 +5901,22 @@ raid5_set_cache_size(struct mddev *mddev, int size) return -EINVAL; conf->min_nr_stripes = size; + mutex_lock(&conf->cache_size_mutex); while (size < conf->max_nr_stripes && drop_one_stripe(conf)) ; + mutex_unlock(&conf->cache_size_mutex); err = md_allow_write(mddev); if (err) return err; + mutex_lock(&conf->cache_size_mutex); while (size > conf->max_nr_stripes) if (!grow_one_stripe(conf, GFP_KERNEL)) break; + mutex_unlock(&conf->cache_size_mutex); return 0; } @@ -6371,11 +6382,18 @@ static unsigned long raid5_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); - int ret = 0; - while (ret < sc->nr_to_scan) { - if (drop_one_stripe(conf) == 0) - return SHRINK_STOP; - ret++; + unsigned long ret = SHRINK_STOP; + + if (mutex_trylock(&conf->cache_size_mutex)) { + ret= 0; + while (ret < sc->nr_to_scan) { + if (drop_one_stripe(conf) == 0) { + ret = SHRINK_STOP; + break; + } + ret++; + } + mutex_unlock(&conf->cache_size_mutex); } return ret; } @@ -6444,6 +6462,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) goto abort; spin_lock_init(&conf->device_lock); seqcount_init(&conf->gen_lock); + mutex_init(&conf->cache_size_mutex); init_waitqueue_head(&conf->wait_for_quiescent); for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) { init_waitqueue_head(&conf->wait_for_stripe[i]); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 02c3bf8fbfe7..d05144278690 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -482,7 +482,8 @@ struct r5conf { */ int active_name; char cache_name[2][32]; - struct kmem_cache *slab_cache; /* for allocating stripes */ + struct kmem_cache *slab_cache; /* for allocating stripes */ + struct mutex cache_size_mutex; /* Protect changes to cache size */ int seq_flush, seq_write; int quiesce; From 299b0685e31c9f3dcc2d58ee3beca761a40b44b3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 6 Jul 2015 17:37:49 +1000 Subject: [PATCH 02/10] md/raid10: always set reshape_safe when initializing reshape_position. 'reshape_position' tracks where in the reshape we have reached. 'reshape_safe' tracks where in the reshape we have safely recorded in the metadata. These are compared to determine when to update the metadata. So it is important that reshape_safe is initialised properly. Currently it isn't. When starting a reshape from the beginning it usually has the correct value by luck. But when reducing the number of devices in a RAID10, it has the wrong value and this leads to the metadata not being updated correctly. This can lead to corruption if the reshape is not allowed to complete. This patch is suitable for any -stable kernel which supports RAID10 reshape, which is 3.5 and later. Fixes: 3ea7daa5d7fd ("md/raid10: add reshape support") Cc: stable@vger.kernel.org (v3.5+ please wait for -final to be out for 2 weeks) Signed-off-by: NeilBrown --- drivers/md/raid10.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 940f2f365461..38c58e19cfce 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3556,6 +3556,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) /* far_copies must be 1 */ conf->prev.stride = conf->dev_sectors; } + conf->reshape_safe = conf->reshape_progress; spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); @@ -3760,7 +3761,6 @@ static int run(struct mddev *mddev) } conf->offset_diff = min_offset_diff; - conf->reshape_safe = conf->reshape_progress; clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); @@ -4103,6 +4103,7 @@ static int raid10_start_reshape(struct mddev *mddev) conf->reshape_progress = size; } else conf->reshape_progress = 0; + conf->reshape_safe = conf->reshape_progress; spin_unlock_irq(&conf->device_lock); if (mddev->delta_disks && mddev->bitmap) { @@ -4170,6 +4171,7 @@ static int raid10_start_reshape(struct mddev *mddev) rdev->new_data_offset = rdev->data_offset; smp_wmb(); conf->reshape_progress = MaxSector; + conf->reshape_safe = MaxSector; mddev->reshape_position = MaxSector; spin_unlock_irq(&conf->device_lock); return ret; @@ -4524,6 +4526,7 @@ static void end_reshape(struct r10conf *conf) md_finish_reshape(conf->mddev); smp_wmb(); conf->reshape_progress = MaxSector; + conf->reshape_safe = MaxSector; spin_unlock_irq(&conf->device_lock); /* read-ahead size must cover two whole stripes, which is From ee5d004fd0591536a061451eba2b187092e9127c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 22 Jul 2015 10:20:07 +1000 Subject: [PATCH 03/10] md: flush ->event_work before stopping array. The 'event_work' worker used by dm-raid may still be running when the array is stopped. This can result in an oops. So flush the workqueue on which it is run after detaching and before destroying the device. Reported-by: Heinz Mauelshagen Signed-off-by: NeilBrown Cc: stable@vger.kernel.org (2.6.38+ please delay 2 weeks after -final release) Fixes: 9d09e663d550 ("dm: raid456 basic support") --- drivers/md/md.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index df92d30ca054..5025b3ec13cd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5382,6 +5382,8 @@ static void __md_stop(struct mddev *mddev) { struct md_personality *pers = mddev->pers; mddev_detach(mddev); + /* Ensure ->event_work is done */ + flush_workqueue(md_misc_wq); spin_lock(&mddev->lock); mddev->ready = 0; mddev->pers = NULL; From d3b178adb3a3adf54ecf77758138b654c3ee7f09 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 22 Jul 2015 12:09:17 -0500 Subject: [PATCH 04/10] md: Skip cluster setup for dm-raid There is a bug that the bitmap superblock isn't initialised properly for dm-raid, so a new field can have garbage in new fields. (dm-raid does initialisation in the kernel - md initialised the superblock in mdadm). This means that for dm-raid we cannot currently trust the new ->nodes field. So: - use __GFP_ZERO to initialise the superblock properly for all new arrays - initialise all fields in bitmap_info in bitmap_new_disk_sb - ignore ->nodes for dm arrays (yes, this is a hack) This bug exposes dm-raid to bug in the (still experimental) md-cluster code, so it is suitable for -stable. It does cause crashes. References: https://bugzilla.kernel.org/show_bug.cgi?id=100491 Cc: stable@vger.kernel.org (v4.1) Signed-off-By: Goldwyn Rodrigues Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 135a0907e9de..c90118e90708 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -494,7 +494,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) bitmap_super_t *sb; unsigned long chunksize, daemon_sleep, write_behind; - bitmap->storage.sb_page = alloc_page(GFP_KERNEL); + bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (bitmap->storage.sb_page == NULL) return -ENOMEM; bitmap->storage.sb_page->index = 0; @@ -541,6 +541,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) sb->state = cpu_to_le32(bitmap->flags); bitmap->events_cleared = bitmap->mddev->events; sb->events_cleared = cpu_to_le64(bitmap->mddev->events); + bitmap->mddev->bitmap_info.nodes = 0; kunmap_atomic(sb); @@ -611,8 +612,16 @@ static int bitmap_read_sb(struct bitmap *bitmap) daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; write_behind = le32_to_cpu(sb->write_behind); sectors_reserved = le32_to_cpu(sb->sectors_reserved); - nodes = le32_to_cpu(sb->nodes); - strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64); + /* XXX: This is a hack to ensure that we don't use clustering + * in case: + * - dm-raid is in use and + * - the nodes written in bitmap_sb is erroneous. + */ + if (!bitmap->mddev->sync_super) { + nodes = le32_to_cpu(sb->nodes); + strlcpy(bitmap->mddev->bitmap_info.cluster_name, + sb->cluster_name, 64); + } /* verify that the bitmap-specific fields are valid */ if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) From 34cab6f42003cb06f48f86a86652984dec338ae9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 24 Jul 2015 09:22:16 +1000 Subject: [PATCH 05/10] md/raid1: fix test for 'was read error from last working device'. When we get a read error from the last working device, we don't try to repair it, and don't fail the device. We simple report a read error to the caller. However the current test for 'is this the last working device' is wrong. When there is only one fully working device, it assumes that a non-faulty device is that device. However a spare which is rebuilding would be non-faulty but so not the only working device. So change the test from "!Faulty" to "In_sync". If ->degraded says there is only one fully working device and this device is in_sync, this must be the one. This bug has existed since we allowed read_balance to read from a recovering spare in v3.0 Reported-and-tested-by: Alexander Lyakas Fixes: 76073054c95b ("md/raid1: clean up read_balance.") Cc: stable@vger.kernel.org (v3.0+) Signed-off-by: NeilBrown --- drivers/md/raid1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f80f1af61ce7..50cf0c893b16 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -336,7 +336,7 @@ static void raid1_end_read_request(struct bio *bio, int error) spin_lock_irqsave(&conf->device_lock, flags); if (r1_bio->mddev->degraded == conf->raid_disks || (r1_bio->mddev->degraded == conf->raid_disks-1 && - !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags))) + test_bit(In_sync, &conf->mirrors[mirror].rdev->flags))) uptodate = 1; spin_unlock_irqrestore(&conf->device_lock, flags); } From f7357273198adc86fe11c2a7be8a0816f44103bb Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 22 Jul 2015 12:09:16 -0500 Subject: [PATCH 06/10] md: Skip cluster setup in case of error while reading bitmap If the bitmap read fails, the error code set is -EINVAL. However, we don't check for errors and go ahead with cluster_setup. Skip the cluster setup in case of error. Signed-off-by: Goldwyn Rodrigues Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index c90118e90708..a4ab51350f40 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -680,7 +680,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) kunmap_atomic(sb); /* Assiging chunksize is required for "re_read" */ bitmap->mddev->bitmap_info.chunksize = chunksize; - if (nodes && (bitmap->cluster_slot < 0)) { + if (err == 0 && nodes && (bitmap->cluster_slot < 0)) { err = md_setup_cluster(bitmap->mddev, nodes); if (err) { pr_err("%s: Could not setup cluster service (%d)\n", From b0c26a79d6993b280931f8e2b406ca4b220bb58f Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 22 Jul 2015 12:09:15 -0500 Subject: [PATCH 07/10] md: Return error if request_module fails and returns positive value request_module() can return 256 (process exited) in some cases, which is not as specified in the documentation before the request_module() definition. Convert the error to -ENOENT. The positive error number results in bitmap_create() returning a value that is meant to be an error but doesn't look like one, so it is dereferenced as a point and causes a crash. (not needed for stable as this is "experimental" code) Fixes: edb39c9deda8 ("Introduce md_cluster_operations to handle cluster functions") Signed-off-By: Goldwyn Rodrigues Signed-off-by: NeilBrown --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 5025b3ec13cd..0fcf6de2415a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7439,7 +7439,7 @@ int md_setup_cluster(struct mddev *mddev, int nodes) err = request_module("md-cluster"); if (err) { pr_err("md-cluster module not found.\n"); - return err; + return -ENOENT; } spin_lock(&pers_lock); From 33e38ac6887d975fe2635c7fcaefb6d5495cb2e1 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 1 Jul 2015 12:19:56 +1000 Subject: [PATCH 08/10] md-cluster: fix bitmap sub-offset in bitmap_read_sb bitmap_read_sb is modifying mddev->bitmap_info.offset. This works for the first bitmap read. However, when multiple bitmaps need to be opened by the same node, it ends up corrupting the offset. Fix it by using a local variable. Also, bitmap_read_sb is not required in bitmap_copy_from_slot since it is called in bitmap_create. Remove bitmap_read_sb(). Signed-off-by: Goldwyn Rodrigues Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index a4ab51350f40..1b9c2df718c6 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -559,6 +559,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) unsigned long sectors_reserved = 0; int err = -EINVAL; struct page *sb_page; + loff_t offset = bitmap->mddev->bitmap_info.offset; if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) { chunksize = 128 * 1024 * 1024; @@ -585,9 +586,9 @@ static int bitmap_read_sb(struct bitmap *bitmap) bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t); /* to 4k blocks */ bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096); - bitmap->mddev->bitmap_info.offset += bitmap->cluster_slot * (bm_blocks << 3); + offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3)); pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, - bitmap->cluster_slot, (unsigned long long)bitmap->mddev->bitmap_info.offset); + bitmap->cluster_slot, offset); } if (bitmap->storage.file) { @@ -598,7 +599,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) bitmap, bytes, sb_page); } else { err = read_sb_page(bitmap->mddev, - bitmap->mddev->bitmap_info.offset, + offset, sb_page, 0, sizeof(bitmap_super_t)); } @@ -1875,10 +1876,6 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot, if (IS_ERR(bitmap)) return PTR_ERR(bitmap); - rv = bitmap_read_sb(bitmap); - if (rv) - goto err; - rv = bitmap_init_from_disk(bitmap, 0); if (rv) goto err; From 90382ed9afeafd42ef193f0eadc6b2a252d6c24d Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 24 Jun 2015 09:30:32 -0500 Subject: [PATCH 09/10] Fix read-balancing during node failure During a node failure, We need to suspend read balancing so that the reads are directed to the first device and stale data is not read. Suspending writes is not required because these would be recorded and synced eventually. A new flag MD_CLUSTER_SUSPEND_READ_BALANCING is set in recover_prep(). area_resyncing() will respond true for the entire devices if this flag is set and the request type is READ. The flag is cleared in recover_done(). Signed-off-by: Goldwyn Rodrigues Reported-By: David Teigland Signed-off-by: NeilBrown --- drivers/md/md-cluster.c | 12 +++++++++++- drivers/md/md-cluster.h | 2 +- drivers/md/raid1.c | 7 ++++--- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index fcfc4b9b2672..0072190515e0 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -44,6 +44,7 @@ struct resync_info { /* md_cluster_info flags */ #define MD_CLUSTER_WAITING_FOR_NEWDISK 1 +#define MD_CLUSTER_SUSPEND_READ_BALANCING 2 struct md_cluster_info { @@ -275,6 +276,9 @@ static void recover_bitmaps(struct md_thread *thread) static void recover_prep(void *arg) { + struct mddev *mddev = arg; + struct md_cluster_info *cinfo = mddev->cluster_info; + set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state); } static void recover_slot(void *arg, struct dlm_slot *slot) @@ -307,6 +311,7 @@ static void recover_done(void *arg, struct dlm_slot *slots, cinfo->slot_number = our_slot; complete(&cinfo->completion); + clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state); } static const struct dlm_lockspace_ops md_ls_ops = { @@ -816,12 +821,17 @@ static void resync_finish(struct mddev *mddev) resync_send(mddev, RESYNCING, 0, 0); } -static int area_resyncing(struct mddev *mddev, sector_t lo, sector_t hi) +static int area_resyncing(struct mddev *mddev, int direction, + sector_t lo, sector_t hi) { struct md_cluster_info *cinfo = mddev->cluster_info; int ret = 0; struct suspend_info *s; + if ((direction == READ) && + test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state)) + return 1; + spin_lock_irq(&cinfo->suspend_lock); if (list_empty(&cinfo->suspend_list)) goto out; diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index 6817ee00e053..00defe2badbc 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -18,7 +18,7 @@ struct md_cluster_operations { int (*metadata_update_start)(struct mddev *mddev); int (*metadata_update_finish)(struct mddev *mddev); int (*metadata_update_cancel)(struct mddev *mddev); - int (*area_resyncing)(struct mddev *mddev, sector_t lo, sector_t hi); + int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi); int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev); int (*add_new_disk_finish)(struct mddev *mddev); int (*new_disk_ack)(struct mddev *mddev, bool ack); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 50cf0c893b16..94f5b55069e0 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -541,7 +541,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect if ((conf->mddev->recovery_cp < this_sector + sectors) || (mddev_is_clustered(conf->mddev) && - md_cluster_ops->area_resyncing(conf->mddev, this_sector, + md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector, this_sector + sectors))) choose_first = 1; else @@ -1111,7 +1111,8 @@ static void make_request(struct mddev *mddev, struct bio * bio) ((bio_end_sector(bio) > mddev->suspend_lo && bio->bi_iter.bi_sector < mddev->suspend_hi) || (mddev_is_clustered(mddev) && - md_cluster_ops->area_resyncing(mddev, bio->bi_iter.bi_sector, bio_end_sector(bio))))) { + md_cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, bio_end_sector(bio))))) { /* As the suspend_* range is controlled by * userspace, we want an interruptible * wait. @@ -1124,7 +1125,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) if (bio_end_sector(bio) <= mddev->suspend_lo || bio->bi_iter.bi_sector >= mddev->suspend_hi || (mddev_is_clustered(mddev) && - !md_cluster_ops->area_resyncing(mddev, + !md_cluster_ops->area_resyncing(mddev, WRITE, bio->bi_iter.bi_sector, bio_end_sector(bio)))) break; schedule(); From e6030cb06c40e4ab4e8c712f13f494a09638ed2c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 17 Jul 2015 13:26:23 +1000 Subject: [PATCH 10/10] md/raid5: clear R5_NeedReplace when no longer needed. This flag is currently never cleared, which can in rare cases trigger a warn-on if it is still set but the block isn't InSync. So clear it when it isn't need, which includes if the replacement device has failed. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 38300ee3c4e1..643d217bfa13 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4066,8 +4066,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) &first_bad, &bad_sectors)) set_bit(R5_ReadRepl, &dev->flags); else { - if (rdev) + if (rdev && !test_bit(Faulty, &rdev->flags)) set_bit(R5_NeedReplace, &dev->flags); + else + clear_bit(R5_NeedReplace, &dev->flags); rdev = rcu_dereference(conf->disks[i].rdev); clear_bit(R5_ReadRepl, &dev->flags); }