diff --git a/drivers/md/md-linear.h b/drivers/md/md-linear.h index 8381d651d4ed..24e97db50ebb 100644 --- a/drivers/md/md-linear.h +++ b/drivers/md/md-linear.h @@ -12,6 +12,6 @@ struct linear_conf struct rcu_head rcu; sector_t array_sectors; int raid_disks; /* a copy of mddev->raid_disks */ - struct dev_info disks[0]; + struct dev_info disks[]; }; #endif diff --git a/drivers/md/md.c b/drivers/md/md.c index 271e8a587354..f567f536b529 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -89,6 +89,7 @@ static struct module *md_cluster_mod; static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; static struct workqueue_struct *md_misc_wq; +static struct workqueue_struct *md_rdev_misc_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); @@ -227,13 +228,13 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, goto abort; if (mddev->serial_info_pool == NULL) { - unsigned int noio_flag; - - noio_flag = memalloc_noio_save(); + /* + * already in memalloc noio context by + * mddev_suspend() + */ mddev->serial_info_pool = mempool_create_kmalloc_pool(NR_SERIAL_INFOS, sizeof(struct serial_info)); - memalloc_noio_restore(noio_flag); if (!mddev->serial_info_pool) { rdevs_uninit_serial(mddev); pr_err("can't alloc memory pool for serialization\n"); @@ -466,7 +467,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) { const int rw = bio_data_dir(bio); const int sgrp = op_stat_group(bio_op(bio)); - struct mddev *mddev = q->queuedata; + struct mddev *mddev = bio->bi_disk->private_data; unsigned int sectors; if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { @@ -527,11 +528,15 @@ void mddev_suspend(struct mddev *mddev) wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags)); del_timer_sync(&mddev->safemode_timer); + /* restrict memory reclaim I/O while the raid array is suspended */ + mddev->noio_flag = memalloc_noio_save(); } 
EXPORT_SYMBOL_GPL(mddev_suspend); void mddev_resume(struct mddev *mddev) { + /* entered the memalloc scope from mddev_suspend() */ + memalloc_noio_restore(mddev->noio_flag); lockdep_assert_held(&mddev->reconfig_mutex); if (--mddev->suspended) return; @@ -2454,7 +2459,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) return err; } -static void md_delayed_delete(struct work_struct *ws) +static void rdev_delayed_delete(struct work_struct *ws) { struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); kobject_del(&rdev->kobj); @@ -2479,9 +2484,9 @@ static void unbind_rdev_from_array(struct md_rdev *rdev) * to delay it due to rcu usage. */ synchronize_rcu(); - INIT_WORK(&rdev->del_work, md_delayed_delete); + INIT_WORK(&rdev->del_work, rdev_delayed_delete); kobject_get(&rdev->kobj); - queue_work(md_misc_wq, &rdev->del_work); + queue_work(md_rdev_misc_wq, &rdev->del_work); } /* @@ -3191,8 +3196,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) rdev->saved_raid_disk = -1; clear_bit(In_sync, &rdev->flags); clear_bit(Bitmap_sync, &rdev->flags); - err = rdev->mddev->pers-> - hot_add_disk(rdev->mddev, rdev); + err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev); if (err) { rdev->raid_disk = -1; return err; @@ -4514,6 +4518,20 @@ null_show(struct mddev *mddev, char *page) return -EINVAL; } +/* need to ensure rdev_delayed_delete() has completed */ +static void flush_rdev_wq(struct mddev *mddev) +{ + struct md_rdev *rdev; + bool pending = false; + + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + pending |= work_pending(&rdev->del_work); + rcu_read_unlock(); + if (pending) /* flush may sleep: do it outside the RCU section */ + flush_workqueue(md_rdev_misc_wq); +} + static ssize_t new_dev_store(struct mddev *mddev, const char *buf, size_t len) { @@ -4541,8 +4559,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) minor != MINOR(dev)) return -EOVERFLOW; - flush_workqueue(md_misc_wq); - + flush_rdev_wq(mddev); err = mddev_lock(mddev); if (err) return err; @@ -4780,7 +4797,8 @@ 
action_store(struct mddev *mddev, const char *page, size_t len) clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && mddev_lock(mddev) == 0) { - flush_workqueue(md_misc_wq); + if (work_pending(&mddev->del_work)) + flush_workqueue(md_misc_wq); if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_reap_sync_thread(mddev); @@ -5626,7 +5644,6 @@ static int md_alloc(dev_t dev, char *name) mddev->queue = blk_alloc_queue(md_make_request, NUMA_NO_NODE); if (!mddev->queue) goto abort; - mddev->queue->queuedata = mddev; blk_set_stacking_limits(&mddev->queue->limits); @@ -6147,7 +6164,8 @@ static void md_clean(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev) { set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - flush_workqueue(md_misc_wq); + if (work_pending(&mddev->del_work)) + flush_workqueue(md_misc_wq); if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_reap_sync_thread(mddev); @@ -6200,7 +6218,8 @@ static void __md_stop(struct mddev *mddev) md_bitmap_destroy(mddev); mddev_detach(mddev); /* Ensure ->event_work is done */ - flush_workqueue(md_misc_wq); + if (mddev->event_work.func) + flush_workqueue(md_misc_wq); spin_lock(&mddev->lock); mddev->pers = NULL; spin_unlock(&mddev->lock); @@ -7495,9 +7514,8 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, } - if (cmd == ADD_NEW_DISK) - /* need to ensure md_delayed_delete() has completed */ - flush_workqueue(md_misc_wq); + if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK) + flush_rdev_wq(mddev); if (cmd == HOT_REMOVE_DISK) /* need to ensure recovery thread has run */ @@ -7752,7 +7770,8 @@ static int md_open(struct block_device *bdev, fmode_t mode) */ mddev_put(mddev); /* Wait until bdev->bd_disk is definitely gone */ - flush_workqueue(md_misc_wq); + if (work_pending(&mddev->del_work)) + flush_workqueue(md_misc_wq); /* Then retry the open from the top */ return -ERESTARTSYS; } @@ -9040,8 +9059,7 @@ static 
int remove_and_add_spares(struct mddev *mddev, rdev->recovery_offset = 0; } - if (mddev->pers-> - hot_add_disk(mddev, rdev) == 0) { + if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { if (sysfs_link_rdev(mddev, rdev)) /* failure here is OK */; if (!test_bit(Journal, &rdev->flags)) @@ -9469,6 +9487,10 @@ static int __init md_init(void) if (!md_misc_wq) goto err_misc_wq; + md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0); + if (!md_rdev_misc_wq) + goto err_rdev_misc_wq; + if ((ret = register_blkdev(MD_MAJOR, "md")) < 0) goto err_md; @@ -9490,6 +9512,8 @@ static int __init md_init(void) err_mdp: unregister_blkdev(MD_MAJOR, "md"); err_md: + destroy_workqueue(md_rdev_misc_wq); +err_rdev_misc_wq: destroy_workqueue(md_misc_wq); err_misc_wq: destroy_workqueue(md_wq); @@ -9776,6 +9800,7 @@ static __exit void md_exit(void) * destroy_workqueue() below will wait for that to complete. */ } + destroy_workqueue(md_rdev_misc_wq); destroy_workqueue(md_misc_wq); destroy_workqueue(md_wq); } @@ -9785,7 +9810,7 @@ module_exit(md_exit) static int get_ro(char *buffer, const struct kernel_param *kp) { - return sprintf(buffer, "%d", start_readonly); + return sprintf(buffer, "%d\n", start_readonly); } static int set_ro(const char *val, const struct kernel_param *kp) { diff --git a/drivers/md/md.h b/drivers/md/md.h index acd681939112..612814d07d35 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -497,6 +497,7 @@ struct mddev { void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; unsigned int good_device_nr; /* good device num within cluster raid */ + unsigned int noio_flag; /* for memalloc scope API */ bool has_superblocks:1; bool fail_last_dev:1; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index cd810e195086..dcd27f3da84e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -296,22 +296,17 @@ static void reschedule_retry(struct r1bio *r1_bio) static void call_bio_endio(struct r1bio *r1_bio) { struct bio *bio = 
r1_bio->master_bio; - struct r1conf *conf = r1_bio->mddev->private; if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) bio->bi_status = BLK_STS_IOERR; bio_endio(bio); - /* - * Wake up any possible resync thread that waits for the device - * to go idle. - */ - allow_barrier(conf, r1_bio->sector); } static void raid_end_bio_io(struct r1bio *r1_bio) { struct bio *bio = r1_bio->master_bio; + struct r1conf *conf = r1_bio->mddev->private; /* if nobody has done the final endio yet, do it now */ if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { @@ -322,6 +317,12 @@ static void raid_end_bio_io(struct r1bio *r1_bio) call_bio_endio(r1_bio); } + /* + * Wake up any possible resync thread that waits for the device + * to go idle. All I/Os, even write-behind writes, are done. + */ + allow_barrier(conf, r1_bio->sector); + free_r1bio(r1_bio); } diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index e7ccad898736..b7eb09e8c025 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -180,7 +180,7 @@ struct r1bio { * if the IO is in WRITE direction, then multiple bios are used. * We choose the number when they are allocated. 
*/ - struct bio *bios[0]; + struct bio *bios[]; /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ }; diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index d3eaaf3eb1bc..79cd2b7d3128 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -153,7 +153,7 @@ struct r10bio { }; sector_t addr; int devnum; - } devs[0]; + } devs[]; }; /* bits for r10bio.state */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ba00e9877f02..ab8067f9ce8c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2215,10 +2215,13 @@ static int grow_stripes(struct r5conf *conf, int num) } /** - * scribble_len - return the required size of the scribble region + * scribble_alloc - allocate percpu scribble buffer for required size + * of the scribble region + * @percpu - from for_each_present_cpu() of the caller * @num - total number of disks in the array + * @cnt - scribble objs count for required size of the scribble region * - * The size must be enough to contain: + * The scribble buffer size must be enough to contain: * 1/ a struct page pointer for each device in the array +2 * 2/ room to convert each entry in (1) to its corresponding dma * (dma_map_page()) or page (page_address()) address. @@ -2228,14 +2231,19 @@ static int grow_stripes(struct r5conf *conf, int num) * of the P and Q blocks. */ static int scribble_alloc(struct raid5_percpu *percpu, - int num, int cnt, gfp_t flags) + int num, int cnt) { size_t obj_size = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); void *scribble; - scribble = kvmalloc_array(cnt, obj_size, flags); + /* + * If this runs while the raid array is suspended, we are already in + * memalloc noio context, so GFP_KERNEL cannot cause recursive memory + * reclaim I/O here. 
+ */ + scribble = kvmalloc_array(cnt, obj_size, GFP_KERNEL); if (!scribble) return -ENOMEM; @@ -2267,8 +2275,7 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) percpu = per_cpu_ptr(conf->percpu, cpu); err = scribble_alloc(percpu, new_disks, - new_sectors / STRIPE_SECTORS, - GFP_NOIO); + new_sectors / STRIPE_SECTORS); if (err) break; } @@ -6759,8 +6766,7 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu conf->previous_raid_disks), max(conf->chunk_sectors, conf->prev_chunk_sectors) - / STRIPE_SECTORS, - GFP_KERNEL)) { + / STRIPE_SECTORS)) { free_scratch_buffer(conf, percpu); return -ENOMEM; }