Merge branch 'md-next' of https://github.com/liu-song-6/linux into for-5.3/block

Pull MD changes from Song.

* 'md-next' of https://github.com/liu-song-6/linux:
  md: add bitmap_abort label in md_run
  md-bitmap: create and destroy wb_info_pool with the change of bitmap
  md-bitmap: create and destroy wb_info_pool with the change of backlog
  md: introduce mddev_create/destroy_wb_pool for the change of member device
  md/raid1: fix potential data inconsistency issue with write behind device
Author: Jens Axboe
Date:   2019-06-20 23:37:20 -06:00
Commit: 8d54094ef4

4 changed files with 218 additions and 9 deletions
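The raid1 fix at the bottom of this series serializes overlapping write-behind bios per member device. Its collision test (check_and_add_wb() in raid1.c below) treats each in-flight IO as a half-open sector range [lo, hi); two ranges collide exactly when each one starts before the other ends. A minimal standalone sketch of that predicate (hypothetical demo code, not part of the commit):

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long long sector_t;	/* stand-in for the kernel type */

/* Half-open ranges [lo, hi) overlap iff each starts before the other ends. */
static bool wb_ranges_overlap(sector_t lo, sector_t hi,
			      sector_t wi_lo, sector_t wi_hi)
{
	return hi > wi_lo && lo < wi_hi;
}

int main(void)
{
	/* [0, 8) vs [8, 16): touching end points, no collision */
	printf("%d\n", wb_ranges_overlap(0, 8, 8, 16));
	/* [0, 8) vs [4, 12): overlap, the second writer must wait */
	printf("%d\n", wb_ranges_overlap(0, 8, 4, 12));
	return 0;
}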

drivers/md/md-bitmap.c

@@ -1790,6 +1790,8 @@ void md_bitmap_destroy(struct mddev *mddev)
return;
md_bitmap_wait_behind_writes(mddev);
mempool_destroy(mddev->wb_info_pool);
mddev->wb_info_pool = NULL;
mutex_lock(&mddev->bitmap_info.mutex);
spin_lock(&mddev->lock);
@@ -1900,10 +1902,14 @@ int md_bitmap_load(struct mddev *mddev)
sector_t start = 0;
sector_t sector = 0;
struct bitmap *bitmap = mddev->bitmap;
struct md_rdev *rdev;
if (!bitmap)
goto out;
rdev_for_each(rdev, mddev)
mddev_create_wb_pool(mddev, rdev, true);
if (mddev_is_clustered(mddev))
md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
@@ -2462,12 +2468,26 @@ static ssize_t
backlog_store(struct mddev *mddev, const char *buf, size_t len)
{
unsigned long backlog;
unsigned long old_mwb = mddev->bitmap_info.max_write_behind;
int rv = kstrtoul(buf, 10, &backlog);
if (rv)
return rv;
if (backlog > COUNTER_MAX)
return -EINVAL;
mddev->bitmap_info.max_write_behind = backlog;
if (!backlog && mddev->wb_info_pool) {
/* wb_info_pool is not needed if backlog is zero */
mempool_destroy(mddev->wb_info_pool);
mddev->wb_info_pool = NULL;
} else if (backlog && !mddev->wb_info_pool) {
/* wb_info_pool is needed since backlog is not zero */
struct md_rdev *rdev;
rdev_for_each(rdev, mddev)
mddev_create_wb_pool(mddev, rdev, false);
}
if (old_mwb != backlog)
md_bitmap_update_sb(mddev->bitmap);
return len;
}
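backlog_store() above is reached through the array's sysfs attribute (e.g. echo 0 > /sys/block/md0/md/bitmap/backlog), so the pool can now be created and torn down on a live array as the backlog value crosses zero; nonzero values remain capped at COUNTER_MAX as before.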

drivers/md/md.c

@@ -37,6 +37,7 @@
*/
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
@@ -124,6 +125,77 @@ static inline int speed_max(struct mddev *mddev)
mddev->sync_speed_max : sysctl_speed_limit_max;
}
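/*
 * Collision tracking below is only armed for multi-queue devices, where
 * write-behind bios issued to different hardware queues can complete out
 * of order; single-queue rdevs return 0 from rdev_init_wb() and skip the
 * bookkeeping entirely.
 */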
static int rdev_init_wb(struct md_rdev *rdev)
{
if (rdev->bdev->bd_queue->nr_hw_queues == 1)
return 0;
spin_lock_init(&rdev->wb_list_lock);
INIT_LIST_HEAD(&rdev->wb_list);
init_waitqueue_head(&rdev->wb_io_wait);
set_bit(WBCollisionCheck, &rdev->flags);
return 1;
}
/*
* Create wb_info_pool if rdev is the first multi-queue device flagged
* WriteMostly and write-behind mode is enabled. Callers that already
* run with the array suspended pass is_suspend == true to skip the
* mddev_suspend()/mddev_resume() bracket around pool creation.
*/
void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
bool is_suspend)
{
if (mddev->bitmap_info.max_write_behind == 0)
return;
if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_wb(rdev))
return;
if (mddev->wb_info_pool == NULL) {
unsigned int noio_flag;
if (!is_suspend)
mddev_suspend(mddev);
noio_flag = memalloc_noio_save();
mddev->wb_info_pool = mempool_create_kmalloc_pool(NR_WB_INFOS,
sizeof(struct wb_info));
memalloc_noio_restore(noio_flag);
if (!mddev->wb_info_pool)
pr_err("can't alloc memory pool for writemostly\n");
if (!is_suspend)
mddev_resume(mddev);
}
}
EXPORT_SYMBOL_GPL(mddev_create_wb_pool);
/*
* Destroy wb_info_pool if rdev is the last device flagged with WBCollisionCheck.
*/
static void mddev_destroy_wb_pool(struct mddev *mddev, struct md_rdev *rdev)
{
if (!test_and_clear_bit(WBCollisionCheck, &rdev->flags))
return;
if (mddev->wb_info_pool) {
struct md_rdev *temp;
int num = 0;
/*
* Check if other rdevs need wb_info_pool.
*/
rdev_for_each(temp, mddev)
if (temp != rdev &&
test_bit(WBCollisionCheck, &temp->flags))
num++;
if (!num) {
mddev_suspend(rdev->mddev);
mempool_destroy(mddev->wb_info_pool);
mddev->wb_info_pool = NULL;
mddev_resume(rdev->mddev);
}
}
}
static struct ctl_table_header *raid_table_header;
static struct ctl_table raid_table[] = {
@@ -2210,6 +2282,9 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
rdev->mddev = mddev;
pr_debug("md: bind<%s>\n", b);
if (mddev->raid_disks)
mddev_create_wb_pool(mddev, rdev, false);
if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
goto fail;
@@ -2246,6 +2321,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev)
bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
list_del_rcu(&rdev->same_set);
pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
mddev_destroy_wb_pool(rdev->mddev, rdev);
rdev->mddev = NULL;
sysfs_remove_link(&rdev->kobj, "block");
sysfs_put(rdev->sysfs_state);
@@ -2758,8 +2834,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
}
} else if (cmd_match(buf, "writemostly")) {
set_bit(WriteMostly, &rdev->flags);
mddev_create_wb_pool(rdev->mddev, rdev, false);
err = 0;
} else if (cmd_match(buf, "-writemostly")) {
mddev_destroy_wb_pool(rdev->mddev, rdev);
clear_bit(WriteMostly, &rdev->flags);
err = 0;
} else if (cmd_match(buf, "blocked")) {
@@ -5588,15 +5666,28 @@ int md_run(struct mddev *mddev)
mddev->bitmap = bitmap;
}
if (err)
goto bitmap_abort;
if (mddev->bitmap_info.max_write_behind > 0) {
bool create_pool = false;
rdev_for_each(rdev, mddev) {
if (test_bit(WriteMostly, &rdev->flags) &&
rdev_init_wb(rdev))
create_pool = true;
}
if (create_pool && mddev->wb_info_pool == NULL) {
mddev->wb_info_pool =
mempool_create_kmalloc_pool(NR_WB_INFOS,
sizeof(struct wb_info));
if (!mddev->wb_info_pool) {
err = -ENOMEM;
goto bitmap_abort;
}
}
}
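/*
 * Note: unlike the hot-add path, md_run() open-codes the pool creation
 * rather than calling mddev_create_wb_pool(), since the array is not
 * live yet and needs no suspend/resume bracket.
 */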
if (mddev->queue) {
bool nonrot = true;
@@ -5657,6 +5748,13 @@ int md_run(struct mddev *mddev)
sysfs_notify(&mddev->kobj, NULL, "degraded");
return 0;
bitmap_abort:
mddev_detach(mddev);
if (mddev->private)
pers->free(mddev, mddev->private);
mddev->private = NULL;
module_put(pers->owner);
md_bitmap_destroy(mddev);
abort:
bioset_exit(&mddev->bio_set);
bioset_exit(&mddev->sync_set);
@@ -5825,6 +5923,8 @@ static void __md_stop_writes(struct mddev *mddev)
mddev->in_sync = 1;
md_update_sb(mddev, 1);
}
mempool_destroy(mddev->wb_info_pool);
mddev->wb_info_pool = NULL;
}
void md_stop_writes(struct mddev *mddev)

drivers/md/md.h

@@ -109,6 +109,14 @@ struct md_rdev {
* for reporting to userspace and storing
* in superblock.
*/
/*
* Members used to check for collisions between write-behind IOs.
*/
struct list_head wb_list;
spinlock_t wb_list_lock;
wait_queue_head_t wb_io_wait;
struct work_struct del_work; /* used for delayed sysfs removal */
struct kernfs_node *sysfs_state; /* handle for 'state'
@@ -193,6 +201,10 @@ enum flag_bits {
* it didn't fail, so don't use FailFast
* any more for metadata
*/
WBCollisionCheck, /*
* multiqueue device should check if there
* is a collision between write-behind bios.
*/
};
static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
@@ -245,6 +257,14 @@ enum mddev_sb_flags {
MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
};
#define NR_WB_INFOS 8
/* record current range of write behind IOs */
struct wb_info {
sector_t lo;
sector_t hi;
struct list_head list;
};
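/*
 * Sizing note: the mempool built over this struct preallocates
 * NR_WB_INFOS (8) entries, so the GFP_NOIO mempool_alloc() in the
 * raid1 write path can always make forward progress under memory
 * pressure.
 */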
struct mddev {
void *private;
struct md_personality *pers;
@@ -461,6 +481,7 @@
*/
struct work_struct flush_work;
struct work_struct event_work; /* used by dm to report failure event */
mempool_t *wb_info_pool;
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
struct md_cluster_info *cluster_info;
unsigned int good_device_nr; /* good device num within cluster raid */
@@ -709,6 +730,8 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
extern void md_kick_rdev_from_array(struct md_rdev * rdev);
extern void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
bool is_suspend);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);

drivers/md/raid1.c

@@ -50,6 +50,57 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
#include "raid1-10.c"
static int check_and_add_wb(struct md_rdev *rdev, sector_t lo, sector_t hi)
{
struct wb_info *wi, *temp_wi;
unsigned long flags;
int ret = 0;
struct mddev *mddev = rdev->mddev;
wi = mempool_alloc(mddev->wb_info_pool, GFP_NOIO);
spin_lock_irqsave(&rdev->wb_list_lock, flags);
list_for_each_entry(temp_wi, &rdev->wb_list, list) {
/* collision happened */
if (hi > temp_wi->lo && lo < temp_wi->hi) {
ret = -EBUSY;
break;
}
}
if (!ret) {
wi->lo = lo;
wi->hi = hi;
list_add(&wi->list, &rdev->wb_list);
} else
mempool_free(wi, mddev->wb_info_pool);
spin_unlock_irqrestore(&rdev->wb_list_lock, flags);
return ret;
}
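/*
 * Drop the [lo, hi) range recorded by check_and_add_wb() and wake any
 * writer waiting in raid1_write_request() for the collision to clear.
 */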
static void remove_wb(struct md_rdev *rdev, sector_t lo, sector_t hi)
{
struct wb_info *wi;
unsigned long flags;
int found = 0;
struct mddev *mddev = rdev->mddev;
spin_lock_irqsave(&rdev->wb_list_lock, flags);
list_for_each_entry(wi, &rdev->wb_list, list)
if (hi == wi->hi && lo == wi->lo) {
list_del(&wi->list);
mempool_free(wi, mddev->wb_info_pool);
found = 1;
break;
}
if (!found)
WARN(1, "The write behind IO is not recorded\n");
spin_unlock_irqrestore(&rdev->wb_list_lock, flags);
wake_up(&rdev->wb_io_wait);
}
/*
* for resync bio, r1bio pointer can be retrieved from the per-bio
* 'struct resync_pages'.
@@ -446,6 +497,12 @@ static void raid1_end_write_request(struct bio *bio)
}
if (behind) {
if (test_bit(WBCollisionCheck, &rdev->flags)) {
sector_t lo = r1_bio->sector;
sector_t hi = r1_bio->sector + r1_bio->sectors;
remove_wb(rdev, lo, hi);
}
if (test_bit(WriteMostly, &rdev->flags))
atomic_dec(&r1_bio->behind_remaining);
@@ -1443,7 +1500,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
if (r1_bio->behind_master_bio) {
struct md_rdev *rdev = conf->mirrors[i].rdev;
if (test_bit(WBCollisionCheck, &rdev->flags)) {
sector_t lo = r1_bio->sector;
sector_t hi = r1_bio->sector + r1_bio->sectors;
wait_event(rdev->wb_io_wait,
check_and_add_wb(rdev, lo, hi) == 0);
}
if (test_bit(WriteMostly, &rdev->flags))
atomic_inc(&r1_bio->behind_remaining);
}
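Taken together, the write path blocks in wait_event() until check_and_add_wb() accepts the range, and the completion path releases it in remove_wb(), waking the waiters. For readers who want to experiment with the handshake outside the kernel, here is a userspace analogue sketched with pthreads (hypothetical names; a fixed-size table stands in for the kernel's mempool-backed list):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_RANGES 8

struct range { unsigned long lo, hi; bool used; };

static struct range ranges[MAX_RANGES];
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wb_io_wait = PTHREAD_COND_INITIALIZER;

/* Mirrors check_and_add_wb(): fail if [lo, hi) overlaps a recorded range. */
static bool try_add(unsigned long lo, unsigned long hi)
{
	for (int i = 0; i < MAX_RANGES; i++)
		if (ranges[i].used && hi > ranges[i].lo && lo < ranges[i].hi)
			return false;	/* collision happened */
	for (int i = 0; i < MAX_RANGES; i++)
		if (!ranges[i].used) {
			ranges[i] = (struct range){ lo, hi, true };
			return true;
		}
	return false;			/* no free slot: wait as well */
}

/* Mirrors wait_event(rdev->wb_io_wait, check_and_add_wb(...) == 0). */
static void add_range(unsigned long lo, unsigned long hi)
{
	pthread_mutex_lock(&lock);
	while (!try_add(lo, hi))
		pthread_cond_wait(&wb_io_wait, &lock);
	pthread_mutex_unlock(&lock);
}

/* Mirrors remove_wb(): drop the range and wake the waiters. */
static void remove_range(unsigned long lo, unsigned long hi)
{
	pthread_mutex_lock(&lock);
	for (int i = 0; i < MAX_RANGES; i++)
		if (ranges[i].used && ranges[i].lo == lo && ranges[i].hi == hi) {
			ranges[i].used = false;
			break;
		}
	pthread_cond_broadcast(&wb_io_wait);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	add_range(0, 8);
	remove_range(0, 8);
	add_range(4, 12);	/* would block if [0, 8) were still in flight */
	remove_range(4, 12);
	puts("ok");
	return 0;
}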