From 8031c3ddc70ab93099e7d1814382dba39f57b43e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 17 Aug 2017 10:35:11 -0700 Subject: [PATCH 01/12] md/bitmap: copy correct data for bitmap super The raid5 cache could write the bitmap superblock before the bitmap superblock is initialized. The bitmap superblock is less than 512B. The current code only copies the superblock to a new page and writes the whole 512B, which zeroes the data after the superblock. Unfortunately the data after the superblock could include the bitmap, which we should preserve. This patch makes the superblock read fetch a 4k chunk and always copies the full 4k of data to the new page, so the superblock write puts the old data back on disk and the bitmap is left unchanged. Reported-by: Song Liu Reviewed-by: Song Liu Cc: stable@vger.kernel.org (4.10+) Signed-off-by: Shaohua Li --- drivers/md/bitmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 40f3cd7eab0f..67e992185a24 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -625,7 +625,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) err = read_sb_page(bitmap->mddev, offset, sb_page, - 0, sizeof(bitmap_super_t)); + 0, PAGE_SIZE); } if (err) return err; @@ -2118,7 +2118,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, if (store.sb_page && bitmap->storage.sb_page) memcpy(page_address(store.sb_page), page_address(bitmap->storage.sb_page), - sizeof(bitmap_super_t)); + PAGE_SIZE); bitmap_file_unmap(&bitmap->storage); bitmap->storage = store;
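To make the failure mode concrete, here is a minimal userspace sketch of the bug class (the stand-in struct sb and the 512B sector are illustrative only, not the kernel's types):

#include <stdio.h>
#include <string.h>

#define SECTOR 512

/* hypothetical stand-in for the less-than-512B bitmap superblock */
struct sb { unsigned int magic; unsigned int events; };

int main(void)
{
	unsigned char disk[SECTOR];        /* one on-disk sector            */
	unsigned char buf[SECTOR] = { 0 }; /* freshly zeroed in-memory page */

	memset(disk, 0xff, SECTOR);        /* 0xff: live bitmap bits stored
					    * right after the superblock   */

	/* buggy: copy only the superblock, then write back the whole sector */
	memcpy(buf, disk, sizeof(struct sb));
	memcpy(disk, buf, SECTOR);
	printf("after buggy write: 0x%02x\n", disk[SECTOR - 1]); /* 0x00, bitmap lost */

	/* fixed: copy the whole chunk, so the write restores the old data */
	memset(disk + sizeof(struct sb), 0xff, SECTOR - sizeof(struct sb));
	memcpy(buf, disk, SECTOR);
	memcpy(disk, buf, SECTOR);
	printf("after fixed write: 0x%02x\n", disk[SECTOR - 1]); /* 0xff, preserved */
	return 0;
}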
From 9c72a18e46ebe0f09484cce8ebf847abdab58498 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 24 Aug 2017 09:53:59 -0700 Subject: [PATCH 02/12] md/raid5: release/flush io in raid5_do_work() In raid5, there are scenarios where some IOs are deferred to a later time, and some IOs need a flush to complete. To make sure we make progress with these IOs, we need to call the following functions: flush_deferred_bios(conf); r5l_flush_stripe_to_raid(conf->log); Both of these functions are called in raid5d(), but are missing from raid5_do_work(). As a result, these functions are not called when multi-threading (group_thread_cnt > 0) is enabled. This patch adds calls to these functions in raid5_do_work(). Note for stable branches: r5l_flush_stripe_to_raid(conf->log) is needed for 4.4+; flush_deferred_bios(conf) is only needed for 4.11+. Cc: stable@vger.kernel.org (4.4+) Signed-off-by: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 0fc2748aaf95..e13a8ce7f589 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6235,6 +6235,10 @@ static void raid5_do_work(struct work_struct *work) spin_unlock_irq(&conf->device_lock); + flush_deferred_bios(conf); + + r5l_flush_stripe_to_raid(conf->log); + async_tx_issue_pending_all(); blk_finish_plug(&plug); From 208410b546207cfc4c832635fa46419cfa86b4cd Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 24 Aug 2017 17:50:40 -0700 Subject: [PATCH 03/12] md/raid1/10: reset bio allocated from mempool Data allocated from a mempool doesn't always get initialized; this happens when the data is reused instead of freshly allocated. In the raid1/10 case, we must reinitialize the bios. Reported-by: Jonathan G. Underwood Fixes: f0250618361d ("md: raid10: don't use bio's vec table to manage resync pages") Fixes: 98d30c5812c3 ("md: raid1: don't use bio's vec table to manage resync pages") Cc: stable@vger.kernel.org (4.12+) Cc: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 19 ++++++++++++++++++- drivers/md/raid10.c | 35 ++++++++++++++++++++++++++++++++--- 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f50958ded9f0..79474f47eeef 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2564,6 +2564,23 @@ static int init_resync(struct r1conf *conf) return 0; } +static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf) +{ + struct r1bio *r1bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); + struct resync_pages *rps; + struct bio *bio; + int i; + + for (i = conf->poolinfo->raid_disks; i--; ) { + bio = r1bio->bios[i]; + rps = bio->bi_private; + bio_reset(bio); + bio->bi_private = rps; + } + r1bio->master_bio = NULL; + return r1bio; +} + /* * perform a "sync" on one "block" * @@ -2649,7 +2666,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, bitmap_cond_end_sync(mddev->bitmap, sector_nr, mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); - r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); + r1_bio = raid1_alloc_init_r1buf(conf); raise_barrier(conf, sector_nr); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f55d4cc085f6..d51ac02e98ef 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2798,6 +2798,35 @@ static int init_resync(struct r10conf *conf) return 0; } +static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf) +{ + struct r10bio *r10bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + struct resync_pages *rp; + struct bio *bio; + int nalloc; + int i; + + if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery)) + nalloc = conf->copies; /* resync */ + else + nalloc = 2; /* recovery */ + + for (i = 0; i < nalloc; i++) { + bio = r10bio->devs[i].bio; + rp = bio->bi_private; + bio_reset(bio); + bio->bi_private = rp; + bio = r10bio->devs[i].repl_bio; + if (bio) { + rp = bio->bi_private; + bio_reset(bio); + bio->bi_private = rp; + } + } + return r10bio; +} + /* * perform a "sync" on one "block" * @@ -3027,7 +3056,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, atomic_inc(&mreplace->nr_pending); rcu_read_unlock(); - r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + r10_bio = raid10_alloc_init_r10buf(conf); r10_bio->state = 0; raise_barrier(conf, rb2 != NULL); atomic_set(&r10_bio->remaining, 0); @@ -3236,7 +3265,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } if (sync_blocks < max_sync) max_sync = sync_blocks; - r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + r10_bio = raid10_alloc_init_r10buf(conf); r10_bio->state = 0; r10_bio->mddev = mddev; @@ -4360,7 +4389,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, read_more: /* Now schedule reads for blocks from sector_nr to last */ - r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + r10_bio = raid10_alloc_init_r10buf(conf); r10_bio->state = 0; raise_barrier(conf, sectors_done != 0); atomic_set(&r10_bio->remaining, 0);
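The pitfall both helpers above guard against is general to object pools: memory handed back by a pool still holds whatever its previous user left in it. A small userspace sketch of the same reuse hazard and the reset-but-preserve fix (the one-slot pool and struct fake_bio are hypothetical stand-ins, not the kernel mempool or bio API):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct fake_bio { int bi_flags; void *bi_private; };

static void *cached; /* one-slot free list standing in for a mempool */

static void *pool_alloc(size_t sz)
{
	void *p = cached ? cached : malloc(sz);
	cached = NULL;
	return p; /* a recycled object keeps its old contents */
}

static void pool_free(void *p)
{
	cached = p;
}

int main(void)
{
	struct fake_bio *b = pool_alloc(sizeof(*b));
	b->bi_flags = 0xdead;       /* first user dirties the object */
	b->bi_private = &cached;
	pool_free(b);

	b = pool_alloc(sizeof(*b)); /* reused, not fresh: state is stale */
	printf("stale flags: 0x%x\n", b->bi_flags);

	/* the fix, mirroring bio_reset() plus the bi_private save/restore
	 * in raid1_alloc_init_r1buf(): wipe everything, then put back the
	 * fields that must survive the reset */
	void *rp = b->bi_private;
	memset(b, 0, sizeof(*b));
	b->bi_private = rp;
	printf("after reset: 0x%x\n", b->bi_flags);

	free(b);
	return 0;
}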
From 5492c46e94b57b5137e3ba2c9ae22b2dcc3fc2cc Mon Sep 17 00:00:00 2001 From: Alexey Obitotskiy Date: Fri, 28 Jul 2017 15:49:25 +0200 Subject: [PATCH 04/12] md: notify about new spare disk in the container In the case of external metadata arrays, spare disks are added to containers first. mdadm keeps monitoring /proc/mdstat output, and when a spare disk is available, it moves it from the container to the array. The problem is that there is no notification of a new spare disk in the container, so mdadm waits a long time (until a timeout) before it takes action. Signed-off-by: Alexey Obitotskiy Signed-off-by: Shaohua Li --- drivers/md/md.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index b01e458d31e9..95c0b644a5b9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4283,6 +4283,8 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) if (err) export_rdev(rdev); mddev_unlock(mddev); + if (!err) + md_new_event(mddev); return err ? err : len; } From 26e13043b7ea14c0b038f3f5a4af0d5cdda9e95f Mon Sep 17 00:00:00 2001 From: Cihangir Akturk Date: Sat, 29 Jul 2017 19:52:45 +0300 Subject: [PATCH 05/12] md: replace seq_release_private with seq_release Since commit f15146380d28 ("fs: seq_file - add event counter to simplify poll() support"), md.c has no longer used the private field of struct seq_file, but seq_release_private() has continued to be used to release the allocated seq_file. This seems to have been forgotten. So convert it to use seq_release() instead of seq_release_private(). Signed-off-by: Cihangir Akturk Signed-off-by: Shaohua Li --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 95c0b644a5b9..a74dc9963822 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7838,7 +7838,7 @@ static const struct file_operations md_seq_fops = { .open = md_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release, .poll = mdstat_poll, };
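For context: the only thing seq_release_private() does beyond seq_release() is free seq->private first. Its upstream definition in fs/seq_file.c looks essentially like this:

int seq_release_private(struct inode *inode, struct file *file)
{
	struct seq_file *seq = file->private_data;

	kfree(seq->private);
	seq->private = NULL;
	return seq_release(inode, file);
}

Since md no longer stores anything in seq->private, that kfree() was always kfree(NULL), which is harmless but misleading, so plain seq_release() is the accurate pairing for md_seq_open().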
From a72cbf83b0cb2368568d4e78cf1687079aad4d25 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 8 Aug 2017 22:56:52 -0700 Subject: [PATCH 06/12] md/r5cache: call mddev_lock/unlock() in r5c_journal_mode_show In r5c_journal_mode_show(), it is necessary to call mddev_lock() before accessing conf and conf->log. Otherwise, conf->log may change (and become NULL). Signed-off-by: Song Liu Reported-by: Stephen Rothwell Reported-by: kbuild test robot Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 2dcbafa8e66c..e357ed367140 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -2529,11 +2529,18 @@ static void r5l_write_super(struct r5l_log *log, sector_t cp) static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; int ret; - if (!conf->log) + ret = mddev_lock(mddev); + if (ret) + return ret; + + conf = mddev->private; + if (!conf || !conf->log) { + mddev_unlock(mddev); return 0; + } switch (conf->log->r5c_journal_mode) { case R5C_JOURNAL_MODE_WRITE_THROUGH: @@ -2551,6 +2558,7 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) default: ret = 0; } + mddev_unlock(mddev); return ret; } From 27a4ff8f49a9b912be76f36f7e198824cf0aecd9 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 10 Aug 2017 16:12:17 +0800 Subject: [PATCH 07/12] raid5: remove raid5_build_block Now raid5_build_block is only called to set the sector of the r5dev, so raid5_compute_blocknr can be used directly instead. Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e13a8ce7f589..6af57c6c0533 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -494,7 +494,6 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp) return 0; } -static void raid5_build_block(struct stripe_head *sh, int i, int previous); static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, struct stripe_head *sh); @@ -530,7 +529,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) WARN_ON(1); } dev->flags = 0; - raid5_build_block(sh, i, previous); + dev->sector = raid5_compute_blocknr(sh, i, previous); } if (read_seqcount_retry(&conf->gen_lock, seq)) goto retry; @@ -2662,14 +2661,6 @@ static void raid5_end_write_request(struct bio *bi) raid5_release_stripe(sh->batch_head); } -static void raid5_build_block(struct stripe_head *sh, int i, int previous) -{ - struct r5dev *dev = &sh->dev[i]; - - dev->flags = 0; - dev->sector = raid5_compute_blocknr(sh, i, previous); -} - static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; From b5e0fff19bef3a36631bce742e363d3a99590c3a Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 12 Aug 2017 19:43:46 +0200 Subject: [PATCH 08/12] lib/raid6: align AVX512 constants to 512 bits, not bytes Signed-off-by: Denys Vlasenko Cc: H. Peter Anvin Cc: mingo@redhat.com Cc: Jim Kukunas Cc: Fenghua Yu Cc: Megha Dey Cc: Gayatri Kammela Cc: x86@kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Shaohua Li --- lib/raid6/avx512.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/raid6/avx512.c b/lib/raid6/avx512.c index f524a7972006..46df7977b971 100644 --- a/lib/raid6/avx512.c +++ b/lib/raid6/avx512.c @@ -29,7 +29,7 @@ static const struct raid6_avx512_constants { u64 x1d[8]; -} raid6_avx512_constants __aligned(512) = { +} raid6_avx512_constants __aligned(512/8) = { { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
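The subtlety behind this one-liner: __aligned() (the compiler's aligned attribute) counts bytes, while AVX-512 names its register width in bits, so the constants only need 64-byte alignment for 512-bit ZMM loads. __aligned(512) was not a correctness bug, since stricter alignment still satisfies the looser requirement, but it padded the constants out to a 512-byte boundary for no benefit. A standalone C11 snippet pinning down the arithmetic (an illustration, not the kernel build):

#include <assert.h> /* static_assert (C11) */

struct avx512_constants {
	unsigned long long x1d[8]; /* 8 x 64 bits: one 512-bit ZMM register */
} __attribute__((aligned(512 / 8)));

static_assert(_Alignof(struct avx512_constants) == 64,
	      "512 bits is 64 bytes");
static_assert(sizeof(struct avx512_constants) == 64,
	      "the x1d block fills exactly one ZMM register");

int main(void) { return 0; }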
From 8a8e6f84ad10e7a10bde438b42926da0e9bc820f Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 18 Aug 2017 10:27:59 -0700 Subject: [PATCH 09/12] md/raid0: attach correct cgroup info in bio The discard bio doesn't attach the original bio's cgroup info. A normal bio is cloned, so it is fine. Signed-off-by: Shaohua Li --- block/bio.c | 2 +- drivers/md/raid0.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index 9a63597aaacc..51189a1d9009 100644 --- a/block/bio.c +++ b/block/bio.c @@ -2085,7 +2085,7 @@ void bio_clone_blkcg_association(struct bio *dst, struct bio *src) if (src->bi_css) WARN_ON(bio_associate_blkcg(dst, src->bi_css)); } - +EXPORT_SYMBOL_GPL(bio_clone_blkcg_association); #endif /* CONFIG_BLK_CGROUP */ static void __init biovec_init_slabs(void) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 94d9ae9b0fd0..6fb81704aff4 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -539,6 +539,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) !discard_bio) continue; bio_chain(discard_bio, bio); + bio_clone_blkcg_association(discard_bio, bio); if (mddev->gendisk) trace_block_bio_remap(bdev_get_queue(rdev->bdev), discard_bio, disk_devt(mddev->gendisk), From ddc088238cd6988bb4ac3776f403d7ff9d3c7a63 Mon Sep 17 00:00:00 2001 From: Pawel Baldysiak Date: Wed, 16 Aug 2017 17:13:45 +0200 Subject: [PATCH 10/12] md: Runtime support for multiple ppls Increase the PPL area to 1MB and use it as a circular buffer to store PPLs. The entry with the highest generation number is the latest one. If the PPL to be written is larger than the space left in the buffer, rewind the buffer to the start (don't wrap it). Signed-off-by: Pawel Baldysiak Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/md.c | 16 ++++++++++--- drivers/md/md.h | 1 + drivers/md/raid0.c | 3 ++- drivers/md/raid1.c | 3 ++- drivers/md/raid5-ppl.c | 43 +++++++++++++++++++++++++++++++--- drivers/md/raid5.c | 1 + include/uapi/linux/raid/md_p.h | 4 +++- 7 files changed, 62 insertions(+), 9 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index a74dc9963822..a7876237de10 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1536,7 +1536,8 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ } else if (sb->bblog_offset != 0) rdev->badblocks.shift = 0; - if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) { + if ((le32_to_cpu(sb->feature_map) & + (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) { rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); rdev->ppl.size = le16_to_cpu(sb->ppl.size); rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; @@ -1655,10 +1656,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) set_bit(MD_HAS_JOURNAL, &mddev->flags); - if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) { + if (le32_to_cpu(sb->feature_map) & (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { if (le32_to_cpu(sb->feature_map) & (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) return -EINVAL; + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) && + (le32_to_cpu(sb->feature_map) & + MD_FEATURE_MULTIPLE_PPLS)) + return -EINVAL; set_bit(MD_HAS_PPL, &mddev->flags); } else if (mddev->pers == NULL) { @@ -1875,7 +1881,11 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); if (test_bit(MD_HAS_PPL, &mddev->flags)) { - sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); + if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) + sb->feature_map |= + cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); + else + sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); sb->ppl.size = cpu_to_le16(rdev->ppl.size); } diff --git a/drivers/md/md.h
b/drivers/md/md.h index 09db03455801..d4bdfa5c223b 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -236,6 +236,7 @@ enum mddev_flags { * never cause the array to become failed. */ MD_HAS_PPL, /* The raid array has PPL feature set */ + MD_HAS_MULTIPLE_PPLS, /* The raid array has multiple PPLs feature set */ }; enum mddev_sb_flags { diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 6fb81704aff4..fd5e8e5efbef 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -30,7 +30,8 @@ ((1L << MD_HAS_JOURNAL) | \ (1L << MD_JOURNAL_CLEAN) | \ (1L << MD_FAILFAST_SUPPORTED) |\ - (1L << MD_HAS_PPL)) + (1L << MD_HAS_PPL) | \ + (1L << MD_HAS_MULTIPLE_PPLS)) static int raid0_congested(struct mddev *mddev, int bits) { diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 79474f47eeef..1f5bd9475dc1 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -48,7 +48,8 @@ #define UNSUPPORTED_MDDEV_FLAGS \ ((1L << MD_HAS_JOURNAL) | \ (1L << MD_JOURNAL_CLEAN) | \ - (1L << MD_HAS_PPL)) + (1L << MD_HAS_PPL) | \ + (1L << MD_HAS_MULTIPLE_PPLS)) /* * Number of guaranteed r1bios in case of extreme VM load: diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 44ad5baf3206..b313f17a6260 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -87,6 +87,8 @@ * The current io_unit accepting new stripes is always at the end of the list. */ +#define PPL_SPACE_SIZE (128 * 1024) + struct ppl_conf { struct mddev *mddev; @@ -122,6 +124,10 @@ struct ppl_log { * always at the end of io_list */ spinlock_t io_list_lock; struct list_head io_list; /* all io_units of this log */ + + sector_t next_io_sector; + unsigned int entry_space; + bool use_multippl; }; #define PPL_IO_INLINE_BVECS 32 @@ -264,13 +270,12 @@ static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh) int i; sector_t data_sector = 0; int data_disks = 0; - unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE; struct r5conf *conf = sh->raid_conf; pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector); /* check if current io_unit is full */ - if (io && (io->pp_size == entry_space || + if (io && (io->pp_size == log->entry_space || io->entries_count == PPL_HDR_MAX_ENTRIES)) { pr_debug("%s: add io_unit blocked by seq: %llu\n", __func__, io->seq); @@ -451,12 +456,25 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) pplhdr->entries_count = cpu_to_le32(io->entries_count); pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE)); + /* Rewind the buffer if current PPL is larger than remaining space */ + if (log->use_multippl && + log->rdev->ppl.sector + log->rdev->ppl.size - log->next_io_sector < + (PPL_HEADER_SIZE + io->pp_size) >> 9) + log->next_io_sector = log->rdev->ppl.sector; + + bio->bi_end_io = ppl_log_endio; bio->bi_opf = REQ_OP_WRITE | REQ_FUA; bio->bi_bdev = log->rdev->bdev; - bio->bi_iter.bi_sector = log->rdev->ppl.sector; + bio->bi_iter.bi_sector = log->next_io_sector; bio_add_page(bio, io->header_page, PAGE_SIZE, 0); + pr_debug("%s: log->next_io_sector: %llu\n", __func__, + (unsigned long long)log->next_io_sector); + + if (log->use_multippl) + log->next_io_sector += (PPL_HEADER_SIZE + io->pp_size) >> 9; + list_for_each_entry(sh, &io->stripe_list, log_list) { /* entries for full stripe writes have no partial parity */ if (test_bit(STRIPE_FULL_WRITE, &sh->state)) @@ -1031,6 +1049,7 @@ static int ppl_load(struct ppl_conf *ppl_conf) static void __ppl_exit_log(struct ppl_conf *ppl_conf) { clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags); +
clear_bit(MD_HAS_MULTIPLE_PPLS, &ppl_conf->mddev->flags); kfree(ppl_conf->child_logs); @@ -1099,6 +1118,22 @@ static int ppl_validate_rdev(struct md_rdev *rdev) return 0; } +static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev) +{ + if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE + + PPL_HEADER_SIZE) * 2) { + log->use_multippl = true; + set_bit(MD_HAS_MULTIPLE_PPLS, + &log->ppl_conf->mddev->flags); + log->entry_space = PPL_SPACE_SIZE; + } else { + log->use_multippl = false; + log->entry_space = (log->rdev->ppl.size << 9) - + PPL_HEADER_SIZE; + } + log->next_io_sector = rdev->ppl.sector; +} + int ppl_init_log(struct r5conf *conf) { struct ppl_conf *ppl_conf; @@ -1196,6 +1231,7 @@ int ppl_init_log(struct r5conf *conf) q = bdev_get_queue(rdev->bdev); if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) need_cache_flush = true; + ppl_init_child_log(log, rdev); } } @@ -1261,6 +1297,7 @@ int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add) if (!ret) { log->rdev = rdev; ret = ppl_write_empty_header(log); + ppl_init_child_log(log, rdev); } } else { log->rdev = NULL; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6af57c6c0533..049a958d3c1e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7236,6 +7236,7 @@ static int raid5_run(struct mddev *mddev) pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n", mdname(mddev)); clear_bit(MD_HAS_PPL, &mddev->flags); + clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags); } if (mddev->private == NULL) diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index d500bd224979..b9197976b660 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -324,9 +324,10 @@ struct mdp_superblock_1 { #define MD_FEATURE_RECOVERY_BITMAP 128 /* recovery that is happening * is guided by bitmap. */ -#define MD_FEATURE_CLUSTERED 256 /* clustered MD */ +#define MD_FEATURE_CLUSTERED 256 /* clustered MD */ #define MD_FEATURE_JOURNAL 512 /* support write cache */ #define MD_FEATURE_PPL 1024 /* support PPL */ +#define MD_FEATURE_MULTIPLE_PPLS 2048 /* support for multiple PPLs */ #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ |MD_FEATURE_RECOVERY_OFFSET \ |MD_FEATURE_RESHAPE_ACTIVE \ @@ -338,6 +339,7 @@ struct mdp_superblock_1 { |MD_FEATURE_CLUSTERED \ |MD_FEATURE_JOURNAL \ |MD_FEATURE_PPL \ + |MD_FEATURE_MULTIPLE_PPLS \ ) struct r5l_payload_header { From 675dc2ccc27c02449da45e1a03234104c2449f68 Mon Sep 17 00:00:00 2001 From: Pawel Baldysiak Date: Wed, 16 Aug 2017 17:13:46 +0200 Subject: [PATCH 11/12] raid5-ppl: Recovery support for multiple partial parity logs Search the PPL area to find the latest PPL header (the one with the largest generation number) and use it for recovery. The PPL entry format and recovery algorithm are the same as in the single-PPL approach.
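Together with the previous patch, this gives the following on-disk scheme: the writer appends header-plus-data records and rewinds to the start of the area when the next record would not fit, and recovery walks the records from offset 0, keeping the header with the highest generation number. A compact userspace simulation of that logic (AREA, HDR and struct rec are made-up illustration values; the kernel operates on 4k headers and real sectors):

#include <stdio.h>

#define AREA 64 /* log area size, in "sectors" */
#define HDR   2 /* header size, in "sectors"   */

struct rec { int valid; unsigned long long gen; int pp_sectors; };

static struct rec area[AREA];
static int next_io; /* writer cursor, like log->next_io_sector */

static void ppl_write(unsigned long long gen, int pp_sectors)
{
	if (AREA - next_io < HDR + pp_sectors)
		next_io = 0; /* rewind to the start, don't wrap */
	area[next_io] = (struct rec){ 1, gen, pp_sectors };
	next_io += HDR + pp_sectors;
}

static int find_latest(void)
{
	int off = 0, best = -1;

	while (off <= AREA - HDR && area[off].valid) {
		if (best >= 0 && area[best].gen > area[off].gen)
			break; /* generation dropped: previous was newest */
		best = off;
		off += HDR + area[off].pp_sectors;
	}
	return best;
}

int main(void)
{
	for (unsigned long long g = 1; g <= 9; g++)
		ppl_write(g, 11); /* record size 13 forces two rewinds */

	int off = find_latest();
	printf("latest PPL at offset %d, generation %llu\n",
	       off, area[off].gen);
	return 0;
}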
Signed-off-by: Pawel Baldysiak Signed-off-by: Shaohua Li --- drivers/md/raid5-ppl.c | 132 ++++++++++++++++++++++++++++------------- 1 file changed, 92 insertions(+), 40 deletions(-) diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index b313f17a6260..a98ef172f8e8 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -831,12 +831,14 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, return ret; } -static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr) +static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr, + sector_t offset) { struct ppl_conf *ppl_conf = log->ppl_conf; struct md_rdev *rdev = log->rdev; struct mddev *mddev = rdev->mddev; - sector_t ppl_sector = rdev->ppl.sector + (PPL_HEADER_SIZE >> 9); + sector_t ppl_sector = rdev->ppl.sector + offset + + (PPL_HEADER_SIZE >> 9); struct page *page; int i; int ret = 0; @@ -920,6 +922,9 @@ static int ppl_write_empty_header(struct ppl_log *log) return -ENOMEM; pplhdr = page_address(page); + /* zero out PPL space to avoid collision with old PPLs */ + blkdev_issue_zeroout(rdev->bdev, rdev->ppl.sector, + log->rdev->ppl.size, GFP_NOIO, 0); memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED); pplhdr->signature = cpu_to_le32(log->ppl_conf->signature); pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE)); @@ -940,63 +945,110 @@ static int ppl_load_distributed(struct ppl_log *log) struct ppl_conf *ppl_conf = log->ppl_conf; struct md_rdev *rdev = log->rdev; struct mddev *mddev = rdev->mddev; - struct page *page; - struct ppl_header *pplhdr; + struct page *page, *page2, *tmp; + struct ppl_header *pplhdr = NULL, *prev_pplhdr = NULL; u32 crc, crc_stored; u32 signature; - int ret = 0; + int ret = 0, i; + sector_t pplhdr_offset = 0, prev_pplhdr_offset = 0; pr_debug("%s: disk: %d\n", __func__, rdev->raid_disk); - - /* read PPL header */ + /* read PPL headers, find the recent one */ page = alloc_page(GFP_KERNEL); if (!page) return -ENOMEM; - if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset, - PAGE_SIZE, page, REQ_OP_READ, 0, false)) { - md_error(mddev, rdev); - ret = -EIO; - goto out; + page2 = alloc_page(GFP_KERNEL); + if (!page2) { + __free_page(page); + return -ENOMEM; } - pplhdr = page_address(page); - /* check header validity */ - crc_stored = le32_to_cpu(pplhdr->checksum); - pplhdr->checksum = 0; - crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE); + /* searching ppl area for latest ppl */ + while (pplhdr_offset < rdev->ppl.size - (PPL_HEADER_SIZE >> 9)) { + if (!sync_page_io(rdev, + rdev->ppl.sector - rdev->data_offset + + pplhdr_offset, PAGE_SIZE, page, REQ_OP_READ, + 0, false)) { + md_error(mddev, rdev); + ret = -EIO; + /* if not able to read - don't recover any PPL */ + pplhdr = NULL; + break; + } + pplhdr = page_address(page); - if (crc_stored != crc) { - pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x\n", - __func__, crc_stored, crc); + /* check header validity */ + crc_stored = le32_to_cpu(pplhdr->checksum); + pplhdr->checksum = 0; + crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE); + + if (crc_stored != crc) { + pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x (offset: %llu)\n", + __func__, crc_stored, crc, + (unsigned long long)pplhdr_offset); + pplhdr = prev_pplhdr; + pplhdr_offset = prev_pplhdr_offset; + break; + } + + signature = le32_to_cpu(pplhdr->signature); + + if (mddev->external) { + /* + * For external metadata the header signature is set and + * validated in userspace. 
+ */ + ppl_conf->signature = signature; + } else if (ppl_conf->signature != signature) { + pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x (offset: %llu)\n", + __func__, signature, ppl_conf->signature, + (unsigned long long)pplhdr_offset); + pplhdr = prev_pplhdr; + pplhdr_offset = prev_pplhdr_offset; + break; + } + + if (prev_pplhdr && le64_to_cpu(prev_pplhdr->generation) > + le64_to_cpu(pplhdr->generation)) { + /* previous was newest */ + pplhdr = prev_pplhdr; + pplhdr_offset = prev_pplhdr_offset; + break; + } + + prev_pplhdr_offset = pplhdr_offset; + prev_pplhdr = pplhdr; + + tmp = page; + page = page2; + page2 = tmp; + + /* calculate next potential ppl offset */ + for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++) + pplhdr_offset += + le32_to_cpu(pplhdr->entries[i].pp_size) >> 9; + pplhdr_offset += PPL_HEADER_SIZE >> 9; + } + + /* no valid ppl found */ + if (!pplhdr) ppl_conf->mismatch_count++; - goto out; - } - - signature = le32_to_cpu(pplhdr->signature); - - if (mddev->external) { - /* - * For external metadata the header signature is set and - * validated in userspace. - */ - ppl_conf->signature = signature; - } else if (ppl_conf->signature != signature) { - pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x\n", - __func__, signature, ppl_conf->signature); - ppl_conf->mismatch_count++; - goto out; - } + else + pr_debug("%s: latest PPL found at offset: %llu, with generation: %llu\n", + __func__, (unsigned long long)pplhdr_offset, + le64_to_cpu(pplhdr->generation)); /* attempt to recover from log if we are starting a dirty array */ - if (!mddev->pers && mddev->recovery_cp != MaxSector) - ret = ppl_recover(log, pplhdr); -out: + if (pplhdr && !mddev->pers && mddev->recovery_cp != MaxSector) + ret = ppl_recover(log, pplhdr, pplhdr_offset); + /* write empty header if we are starting the array */ if (!ret && !mddev->pers) ret = ppl_write_empty_header(log); __free_page(page); + __free_page(page2); pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n", __func__, ret, ppl_conf->mismatch_count, From e8a27f836f165c26f867ece7f31eb5c811692319 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 31 Aug 2017 10:23:25 +1000 Subject: [PATCH 12/12] md/bitmap: disable bitmap_resize for file-backed bitmaps. bitmap_resize() does not work for file-backed bitmaps. The buffer_heads are allocated and initialized when the bitmap is read from the file, but resize doesn't read from the file; it loads from the internal bitmap. When it comes time to write the new bitmap, the bh is non-existent and we crash. The common case when growing an array involves making the array larger, and that normally means making the bitmap larger. Doing that inside the kernel is possible, but would need more code. It is probably easier to require people who use file-backed bitmaps to remove them and re-add after a reshape. So this patch disables the resizing of arrays which have file-backed bitmaps. This is better than crashing. Reported-by: Zhilong Liu Fixes: d60b479d177a ("md/bitmap: add bitmap_resize function to allow bitmap resizing.") Cc: stable@vger.kernel.org (v3.5+).
Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/bitmap.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 67e992185a24..d2121637b4ab 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -2058,6 +2058,11 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, long pages; struct bitmap_page *new_bp; + if (bitmap->storage.file && !init) { + pr_info("md: cannot resize file-based bitmap\n"); + return -EINVAL; + } + if (chunksize == 0) { /* If there is enough space, leave the chunk size unchanged, * else increase by factor of two until there is enough space.