From 8031c3ddc70ab93099e7d1814382dba39f57b43e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 17 Aug 2017 10:35:11 -0700 Subject: [PATCH 01/12] md/bitmap: copy correct data for bitmap super The raid5 cache could write the bitmap superblock before the bitmap superblock is initialized. The bitmap superblock is less than 512B. The current code only copies the superblock to a new page and writes the whole 512B, which zeroes the data after the superblock. Unfortunately the data after the superblock could include the bitmap, which we should preserve. This patch makes the superblock read fetch a 4k chunk and always copies the full 4k of data to the new page, so the superblock write puts the old data back on disk and the bitmap is left unchanged. Reported-by: Song Liu Reviewed-by: Song Liu Cc: stable@vger.kernel.org (4.10+) Signed-off-by: Shaohua Li --- drivers/md/bitmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 40f3cd7eab0f..67e992185a24 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -625,7 +625,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) err = read_sb_page(bitmap->mddev, offset, sb_page, - 0, sizeof(bitmap_super_t)); + 0, PAGE_SIZE); } if (err) return err; @@ -2118,7 +2118,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, if (store.sb_page && bitmap->storage.sb_page) memcpy(page_address(store.sb_page), page_address(bitmap->storage.sb_page), - sizeof(bitmap_super_t)); + PAGE_SIZE); bitmap_file_unmap(&bitmap->storage); bitmap->storage = store;
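To make the failure mode concrete, here is a minimal userspace sketch of the bug class (the stand-in struct sb and the 512B sector are illustrative only, not the kernel's types):

#include <stdio.h>
#include <string.h>

#define SECTOR 512

/* hypothetical stand-in for the less-than-512B bitmap superblock */
struct sb { unsigned int magic; unsigned int events; };

int main(void)
{
	unsigned char disk[SECTOR];        /* one on-disk sector            */
	unsigned char buf[SECTOR] = { 0 }; /* freshly zeroed in-memory page */

	memset(disk, 0xff, SECTOR);        /* 0xff: live bitmap bits stored
					    * right after the superblock   */

	/* buggy: copy only the superblock, then write back the whole sector */
	memcpy(buf, disk, sizeof(struct sb));
	memcpy(disk, buf, SECTOR);
	printf("after buggy write: 0x%02x\n", disk[SECTOR - 1]); /* 0x00, bitmap lost */

	/* fixed: copy the whole chunk, so the write restores the old data */
	memset(disk + sizeof(struct sb), 0xff, SECTOR - sizeof(struct sb));
	memcpy(buf, disk, SECTOR);
	memcpy(disk, buf, SECTOR);
	printf("after fixed write: 0x%02x\n", disk[SECTOR - 1]); /* 0xff, preserved */
	return 0;
}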
From 9c72a18e46ebe0f09484cce8ebf847abdab58498 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 24 Aug 2017 09:53:59 -0700 Subject: [PATCH 02/12] md/raid5: release/flush io in raid5_do_work() In raid5, there are scenarios where some IOs are deferred to a later time, and some IOs need a flush to complete. To make sure we make progress with these IOs, we need to call the following functions: flush_deferred_bios(conf); r5l_flush_stripe_to_raid(conf->log); Both of these functions are called in raid5d(), but are missing from raid5_do_work(). As a result, these functions are not called when multi-threading (group_thread_cnt > 0) is enabled. This patch adds calls to these functions in raid5_do_work(). Note for stable branches: r5l_flush_stripe_to_raid(conf->log) is needed for 4.4+; flush_deferred_bios(conf) is only needed for 4.11+. Cc: stable@vger.kernel.org (4.4+) Signed-off-by: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 0fc2748aaf95..e13a8ce7f589 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6235,6 +6235,10 @@ static void raid5_do_work(struct work_struct *work) spin_unlock_irq(&conf->device_lock); + flush_deferred_bios(conf); + + r5l_flush_stripe_to_raid(conf->log); + async_tx_issue_pending_all(); blk_finish_plug(&plug); From 208410b546207cfc4c832635fa46419cfa86b4cd Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 24 Aug 2017 17:50:40 -0700 Subject: [PATCH 03/12] md/raid1/10: reset bio allocated from mempool Data allocated from a mempool doesn't always get initialized; this happens when the data is reused instead of freshly allocated. In the raid1/10 case, we must reinitialize the bios. Reported-by: Jonathan G. Underwood Fixes: f0250618361d ("md: raid10: don't use bio's vec table to manage resync pages") Fixes: 98d30c5812c3 ("md: raid1: don't use bio's vec table to manage resync pages") Cc: stable@vger.kernel.org (4.12+) Cc: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 19 ++++++++++++++++++- drivers/md/raid10.c | 35 ++++++++++++++++++++++++++++++++--- 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f50958ded9f0..79474f47eeef 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2564,6 +2564,23 @@ static int init_resync(struct r1conf *conf) return 0; } +static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf) +{ + struct r1bio *r1bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); + struct resync_pages *rps; + struct bio *bio; + int i; + + for (i = conf->poolinfo->raid_disks; i--; ) { + bio = r1bio->bios[i]; + rps = bio->bi_private; + bio_reset(bio); + bio->bi_private = rps; + } + r1bio->master_bio = NULL; + return r1bio; +} + /* * perform a "sync" on one "block" * @@ -2649,7 +2666,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, bitmap_cond_end_sync(mddev->bitmap, sector_nr, mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); - r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); + r1_bio = raid1_alloc_init_r1buf(conf); raise_barrier(conf, sector_nr); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f55d4cc085f6..d51ac02e98ef 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2798,6 +2798,35 @@ static int init_resync(struct r10conf *conf) return 0; } +static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf) +{ + struct r10bio *r10bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + struct resync_pages *rp; + struct bio *bio; + int nalloc; + int i; + + if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery)) + nalloc = conf->copies; /* resync */ + else + nalloc = 2; /* recovery */ + + for (i = 0; i < nalloc; i++) { + bio = r10bio->devs[i].bio; + rp = bio->bi_private; + bio_reset(bio); + bio->bi_private = rp; + bio = r10bio->devs[i].repl_bio; + if (bio) { + rp = bio->bi_private; + bio_reset(bio); + bio->bi_private = rp; + } + } + return r10bio; +} + /* * perform a "sync" on one "block" * @@ -3027,7 +3056,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, atomic_inc(&mreplace->nr_pending); rcu_read_unlock(); - r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + r10_bio = raid10_alloc_init_r10buf(conf); r10_bio->state = 0; raise_barrier(conf, rb2 != NULL); atomic_set(&r10_bio->remaining, 0); @@ -3236,7 +3265,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } if (sync_blocks < max_sync) max_sync = sync_blocks; - r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + r10_bio = raid10_alloc_init_r10buf(conf); r10_bio->state = 0; r10_bio->mddev = mddev; @@ -4360,7 +4389,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, read_more: /* Now schedule reads for blocks from sector_nr to last */ - r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + r10_bio = raid10_alloc_init_r10buf(conf); r10_bio->state = 0; raise_barrier(conf, sectors_done != 0); atomic_set(&r10_bio->remaining, 0);
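The pitfall both helpers above guard against is general to object pools: memory handed back by a pool still holds whatever its previous user left in it. A small userspace sketch of the same reuse hazard and the reset-but-preserve fix (the one-slot pool and struct fake_bio are hypothetical stand-ins, not the kernel mempool or bio API):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct fake_bio { int bi_flags; void *bi_private; };

static void *cached; /* one-slot free list standing in for a mempool */

static void *pool_alloc(size_t sz)
{
	void *p = cached ? cached : malloc(sz);
	cached = NULL;
	return p; /* a recycled object keeps its old contents */
}

static void pool_free(void *p)
{
	cached = p;
}

int main(void)
{
	struct fake_bio *b = pool_alloc(sizeof(*b));
	b->bi_flags = 0xdead;       /* first user dirties the object */
	b->bi_private = &cached;
	pool_free(b);

	b = pool_alloc(sizeof(*b)); /* reused, not fresh: state is stale */
	printf("stale flags: 0x%x\n", b->bi_flags);

	/* the fix, mirroring bio_reset() plus the bi_private save/restore
	 * in raid1_alloc_init_r1buf(): wipe everything, then put back the
	 * fields that must survive the reset */
	void *rp = b->bi_private;
	memset(b, 0, sizeof(*b));
	b->bi_private = rp;
	printf("after reset: 0x%x\n", b->bi_flags);

	free(b);
	return 0;
}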
From 5492c46e94b57b5137e3ba2c9ae22b2dcc3fc2cc Mon Sep 17 00:00:00 2001 From: Alexey Obitotskiy Date: Fri, 28 Jul 2017 15:49:25 +0200 Subject: [PATCH 04/12] md: notify about new spare disk in the container In the case of external metadata arrays, spare disks are added to containers first. mdadm keeps monitoring /proc/mdstat output, and when a spare disk is available, it moves it from the container to the array. The problem is that there is no notification of a new spare disk in the container, so mdadm waits a long time (until a timeout) before it takes action. Signed-off-by: Alexey Obitotskiy Signed-off-by: Shaohua Li --- drivers/md/md.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index b01e458d31e9..95c0b644a5b9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4283,6 +4283,8 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) if (err) export_rdev(rdev); mddev_unlock(mddev); + if (!err) + md_new_event(mddev); return err ? err : len; } From 26e13043b7ea14c0b038f3f5a4af0d5cdda9e95f Mon Sep 17 00:00:00 2001 From: Cihangir Akturk Date: Sat, 29 Jul 2017 19:52:45 +0300 Subject: [PATCH 05/12] md: replace seq_release_private with seq_release Since commit f15146380d28 ("fs: seq_file - add event counter to simplify poll() support"), md.c has no longer used the private field of struct seq_file, but seq_release_private() has continued to be used to release the allocated seq_file. This seems to have been forgotten. So convert it to use seq_release() instead of seq_release_private(). Signed-off-by: Cihangir Akturk Signed-off-by: Shaohua Li --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 95c0b644a5b9..a74dc9963822 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7838,7 +7838,7 @@ static const struct file_operations md_seq_fops = { .open = md_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release, .poll = mdstat_poll, };
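For context: the only thing seq_release_private() does beyond seq_release() is free seq->private first. Its upstream definition in fs/seq_file.c looks essentially like this:

int seq_release_private(struct inode *inode, struct file *file)
{
	struct seq_file *seq = file->private_data;

	kfree(seq->private);
	seq->private = NULL;
	return seq_release(inode, file);
}

Since md no longer stores anything in seq->private, that kfree() was always kfree(NULL), which is harmless but misleading, so plain seq_release() is the accurate pairing for md_seq_open().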
From a72cbf83b0cb2368568d4e78cf1687079aad4d25 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 8 Aug 2017 22:56:52 -0700 Subject: [PATCH 06/12] md/r5cache: call mddev_lock/unlock() in r5c_journal_mode_show In r5c_journal_mode_show(), it is necessary to call mddev_lock() before accessing conf and conf->log. Otherwise, conf->log may change (and become NULL). Signed-off-by: Song Liu Reported-by: Stephen Rothwell Reported-by: kbuild test robot Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 2dcbafa8e66c..e357ed367140 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -2529,11 +2529,18 @@ static void r5l_write_super(struct r5l_log *log, sector_t cp) static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; int ret; - if (!conf->log) + ret = mddev_lock(mddev); + if (ret) + return ret; + + conf = mddev->private; + if (!conf || !conf->log) { + mddev_unlock(mddev); return 0; + } switch (conf->log->r5c_journal_mode) { case R5C_JOURNAL_MODE_WRITE_THROUGH: @@ -2551,6 +2558,7 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) default: ret = 0; } + mddev_unlock(mddev); return ret; } From 27a4ff8f49a9b912be76f36f7e198824cf0aecd9 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 10 Aug 2017 16:12:17 +0800 Subject: [PATCH 07/12] raid5: remove raid5_build_block Now raid5_build_block is only called to set the sector of the r5dev, so raid5_compute_blocknr can be used directly instead. Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e13a8ce7f589..6af57c6c0533 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -494,7 +494,6 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp) return 0; } -static void raid5_build_block(struct stripe_head *sh, int i, int previous); static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, struct stripe_head *sh); @@ -530,7 +529,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) WARN_ON(1); } dev->flags = 0; - raid5_build_block(sh, i, previous); + dev->sector = raid5_compute_blocknr(sh, i, previous); } if (read_seqcount_retry(&conf->gen_lock, seq)) goto retry; @@ -2662,14 +2661,6 @@ static void raid5_end_write_request(struct bio *bi) raid5_release_stripe(sh->batch_head); } -static void raid5_build_block(struct stripe_head *sh, int i, int previous) -{ - struct r5dev *dev = &sh->dev[i]; - - dev->flags = 0; - dev->sector = raid5_compute_blocknr(sh, i, previous); -} - static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; From b5e0fff19bef3a36631bce742e363d3a99590c3a Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 12 Aug 2017 19:43:46 +0200 Subject: [PATCH 08/12] lib/raid6: align AVX512 constants to 512 bits, not bytes Signed-off-by: Denys Vlasenko Cc: H. Peter Anvin Cc: mingo@redhat.com Cc: Jim Kukunas Cc: Fenghua Yu Cc: Megha Dey Cc: Gayatri Kammela Cc: x86@kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Shaohua Li --- lib/raid6/avx512.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/raid6/avx512.c b/lib/raid6/avx512.c index f524a7972006..46df7977b971 100644 --- a/lib/raid6/avx512.c +++ b/lib/raid6/avx512.c @@ -29,7 +29,7 @@ static const struct raid6_avx512_constants { u64 x1d[8]; -} raid6_avx512_constants __aligned(512) = { +} raid6_avx512_constants __aligned(512/8) = { { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
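The subtlety behind this one-liner: __aligned() (the compiler's aligned attribute) counts bytes, while AVX-512 names its register width in bits, so the constants only need 64-byte alignment for 512-bit ZMM loads. __aligned(512) was not a correctness bug, since stricter alignment still satisfies the looser requirement, but it padded the constants out to a 512-byte boundary for no benefit. A standalone C11 snippet pinning down the arithmetic (an illustration, not the kernel build):

#include <assert.h> /* static_assert (C11) */

struct avx512_constants {
	unsigned long long x1d[8]; /* 8 x 64 bits: one 512-bit ZMM register */
} __attribute__((aligned(512 / 8)));

static_assert(_Alignof(struct avx512_constants) == 64,
	      "512 bits is 64 bytes");
static_assert(sizeof(struct avx512_constants) == 64,
	      "the x1d block fills exactly one ZMM register");

int main(void) { return 0; }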
From 8a8e6f84ad10e7a10bde438b42926da0e9bc820f Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 18 Aug 2017 10:27:59 -0700 Subject: [PATCH 09/12] md/raid0: attach correct cgroup info in bio The discard bio doesn't attach the original bio's cgroup info. A normal bio is cloned, so it is fine. Signed-off-by: Shaohua Li --- block/bio.c | 2 +- drivers/md/raid0.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index 9a63597aaacc..51189a1d9009 100644 --- a/block/bio.c +++ b/block/bio.c @@ -2085,7 +2085,7 @@ void bio_clone_blkcg_association(struct bio *dst, struct bio *src) if (src->bi_css) WARN_ON(bio_associate_blkcg(dst, src->bi_css)); } - +EXPORT_SYMBOL_GPL(bio_clone_blkcg_association); #endif /* CONFIG_BLK_CGROUP */ static void __init biovec_init_slabs(void) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 94d9ae9b0fd0..6fb81704aff4 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -539,6 +539,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) !discard_bio) continue; bio_chain(discard_bio, bio); + bio_clone_blkcg_association(discard_bio, bio); if (mddev->gendisk) trace_block_bio_remap(bdev_get_queue(rdev->bdev), discard_bio, disk_devt(mddev->gendisk), From ddc088238cd6988bb4ac3776f403d7ff9d3c7a63 Mon Sep 17 00:00:00 2001 From: Pawel Baldysiak Date: Wed, 16 Aug 2017 17:13:45 +0200 Subject: [PATCH 10/12] md: Runtime support for multiple ppls Increase the PPL area to 1MB and use it as a circular buffer to store PPLs. The entry with the highest generation number is the latest one. If the PPL to be written is larger than the space left in the buffer, rewind the buffer to the start (don't wrap it). Signed-off-by: Pawel Baldysiak Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/md.c | 16 ++++++++++--- drivers/md/md.h | 1 + drivers/md/raid0.c | 3 ++- drivers/md/raid1.c | 3 ++- drivers/md/raid5-ppl.c | 43 +++++++++++++++++++++++++++++++--- drivers/md/raid5.c | 1 + include/uapi/linux/raid/md_p.h | 4 +++- 7 files changed, 62 insertions(+), 9 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index a74dc9963822..a7876237de10 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1536,7 +1536,8 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ } else if (sb->bblog_offset != 0) rdev->badblocks.shift = 0; - if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) { + if ((le32_to_cpu(sb->feature_map) & + (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) { rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); rdev->ppl.size = le16_to_cpu(sb->ppl.size); rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; @@ -1655,10 +1656,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) set_bit(MD_HAS_JOURNAL, &mddev->flags); - if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) { + if (le32_to_cpu(sb->feature_map) & (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { if (le32_to_cpu(sb->feature_map) & (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) return -EINVAL; + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) && + (le32_to_cpu(sb->feature_map) & + MD_FEATURE_MULTIPLE_PPLS)) + return -EINVAL; set_bit(MD_HAS_PPL, &mddev->flags); } else if (mddev->pers == NULL) { @@ -1875,7 +1881,11 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); if (test_bit(MD_HAS_PPL, &mddev->flags)) { - sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); + if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) + sb->feature_map |= + cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); + else + sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); sb->ppl.size = cpu_to_le16(rdev->ppl.size); } diff --git a/drivers/md/md.h
b/drivers/md/md.h index 09db03455801..d4bdfa5c223b 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -236,6 +236,7 @@ enum mddev_flags { * never cause the array to become failed. */ MD_HAS_PPL, /* The raid array has PPL feature set */ + MD_HAS_MULTIPLE_PPLS, /* The raid array has multiple PPLs feature set */ }; enum mddev_sb_flags { diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 6fb81704aff4..fd5e8e5efbef 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -30,7 +30,8 @@ ((1L << MD_HAS_JOURNAL) | \ (1L << MD_JOURNAL_CLEAN) | \ (1L << MD_FAILFAST_SUPPORTED) |\ - (1L << MD_HAS_PPL)) + (1L << MD_HAS_PPL) | \ + (1L << MD_HAS_MULTIPLE_PPLS)) static int raid0_congested(struct mddev *mddev, int bits) { diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 79474f47eeef..1f5bd9475dc1 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -48,7 +48,8 @@ #define UNSUPPORTED_MDDEV_FLAGS \ ((1L << MD_HAS_JOURNAL) | \ (1L << MD_JOURNAL_CLEAN) | \ - (1L << MD_HAS_PPL)) + (1L << MD_HAS_PPL) | \ + (1L << MD_HAS_MULTIPLE_PPLS)) /* * Number of guaranteed r1bios in case of extreme VM load: diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 44ad5baf3206..b313f17a6260 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -87,6 +87,8 @@ * The current io_unit accepting new stripes is always at the end of the list. */ +#define PPL_SPACE_SIZE (128 * 1024) + struct ppl_conf { struct mddev *mddev; @@ -122,6 +124,10 @@ struct ppl_log { * always at the end of io_list */ spinlock_t io_list_lock; struct list_head io_list; /* all io_units of this log */ + + sector_t next_io_sector; + unsigned int entry_space; + bool use_multippl; }; #define PPL_IO_INLINE_BVECS 32 @@ -264,13 +270,12 @@ static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh) int i; sector_t data_sector = 0; int data_disks = 0; - unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE; struct r5conf *conf = sh->raid_conf; pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector); /* check if current io_unit is full */ - if (io && (io->pp_size == entry_space || + if (io && (io->pp_size == log->entry_space || io->entries_count == PPL_HDR_MAX_ENTRIES)) { pr_debug("%s: add io_unit blocked by seq: %llu\n", __func__, io->seq); @@ -451,12 +456,25 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) pplhdr->entries_count = cpu_to_le32(io->entries_count); pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE)); + /* Rewind the buffer if current PPL is larger than remaining space */ + if (log->use_multippl && + log->rdev->ppl.sector + log->rdev->ppl.size - log->next_io_sector < + (PPL_HEADER_SIZE + io->pp_size) >> 9) + log->next_io_sector = log->rdev->ppl.sector; + + bio->bi_end_io = ppl_log_endio; bio->bi_opf = REQ_OP_WRITE | REQ_FUA; bio->bi_bdev = log->rdev->bdev; - bio->bi_iter.bi_sector = log->rdev->ppl.sector; + bio->bi_iter.bi_sector = log->next_io_sector; bio_add_page(bio, io->header_page, PAGE_SIZE, 0); + pr_debug("%s: log->next_io_sector: %llu\n", __func__, + (unsigned long long)log->next_io_sector); + + if (log->use_multippl) + log->next_io_sector += (PPL_HEADER_SIZE + io->pp_size) >> 9; + list_for_each_entry(sh, &io->stripe_list, log_list) { /* entries for full stripe writes have no partial parity */ if (test_bit(STRIPE_FULL_WRITE, &sh->state)) @@ -1031,6 +1049,7 @@ static int ppl_load(struct ppl_conf *ppl_conf) static void __ppl_exit_log(struct ppl_conf *ppl_conf) { clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags); +
clear_bit(MD_HAS_MULTIPLE_PPLS, &ppl_conf->mddev->flags); kfree(ppl_conf->child_logs); @@ -1099,6 +1118,22 @@ static int ppl_validate_rdev(struct md_rdev *rdev) return 0; } +static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev) +{ + if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE + + PPL_HEADER_SIZE) * 2) { + log->use_multippl = true; + set_bit(MD_HAS_MULTIPLE_PPLS, + &log->ppl_conf->mddev->flags); + log->entry_space = PPL_SPACE_SIZE; + } else { + log->use_multippl = false; + log->entry_space = (log->rdev->ppl.size << 9) - + PPL_HEADER_SIZE; + } + log->next_io_sector = rdev->ppl.sector; +} + int ppl_init_log(struct r5conf *conf) { struct ppl_conf *ppl_conf; @@ -1196,6 +1231,7 @@ int ppl_init_log(struct r5conf *conf) q = bdev_get_queue(rdev->bdev); if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) need_cache_flush = true; + ppl_init_child_log(log, rdev); } } @@ -1261,6 +1297,7 @@ int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add) if (!ret) { log->rdev = rdev; ret = ppl_write_empty_header(log); + ppl_init_child_log(log, rdev); } } else { log->rdev = NULL; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6af57c6c0533..049a958d3c1e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7236,6 +7236,7 @@ static int raid5_run(struct mddev *mddev) pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n", mdname(mddev)); clear_bit(MD_HAS_PPL, &mddev->flags); + clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags); } if (mddev->private == NULL) diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index d500bd224979..b9197976b660 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -324,9 +324,10 @@ struct mdp_superblock_1 { #define MD_FEATURE_RECOVERY_BITMAP 128 /* recovery that is happening * is guided by bitmap. */ -#define MD_FEATURE_CLUSTERED 256 /* clustered MD */ +#define MD_FEATURE_CLUSTERED 256 /* clustered MD */ #define MD_FEATURE_JOURNAL 512 /* support write cache */ #define MD_FEATURE_PPL 1024 /* support PPL */ +#define MD_FEATURE_MULTIPLE_PPLS 2048 /* support for multiple PPLs */ #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ |MD_FEATURE_RECOVERY_OFFSET \ |MD_FEATURE_RESHAPE_ACTIVE \ @@ -338,6 +339,7 @@ struct mdp_superblock_1 { |MD_FEATURE_CLUSTERED \ |MD_FEATURE_JOURNAL \ |MD_FEATURE_PPL \ + |MD_FEATURE_MULTIPLE_PPLS \ ) struct r5l_payload_header { From 675dc2ccc27c02449da45e1a03234104c2449f68 Mon Sep 17 00:00:00 2001 From: Pawel Baldysiak Date: Wed, 16 Aug 2017 17:13:46 +0200 Subject: [PATCH 11/12] raid5-ppl: Recovery support for multiple partial parity logs Search the PPL area to find the latest PPL header (the one with the largest generation number) and use it for recovery. The PPL entry format and recovery algorithm are the same as in the single-PPL approach.
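Together with the previous patch, this gives the following on-disk scheme: the writer appends header-plus-data records and rewinds to the start of the area when the next record would not fit, and recovery walks the records from offset 0, keeping the header with the highest generation number. A compact userspace simulation of that logic (AREA, HDR and struct rec are made-up illustration values; the kernel operates on 4k headers and real sectors):

#include <stdio.h>

#define AREA 64 /* log area size, in "sectors" */
#define HDR   2 /* header size, in "sectors"   */

struct rec { int valid; unsigned long long gen; int pp_sectors; };

static struct rec area[AREA];
static int next_io; /* writer cursor, like log->next_io_sector */

static void ppl_write(unsigned long long gen, int pp_sectors)
{
	if (AREA - next_io < HDR + pp_sectors)
		next_io = 0; /* rewind to the start, don't wrap */
	area[next_io] = (struct rec){ 1, gen, pp_sectors };
	next_io += HDR + pp_sectors;
}

static int find_latest(void)
{
	int off = 0, best = -1;

	while (off <= AREA - HDR && area[off].valid) {
		if (best >= 0 && area[best].gen > area[off].gen)
			break; /* generation dropped: previous was newest */
		best = off;
		off += HDR + area[off].pp_sectors;
	}
	return best;
}

int main(void)
{
	for (unsigned long long g = 1; g <= 9; g++)
		ppl_write(g, 11); /* record size 13 forces two rewinds */

	int off = find_latest();
	printf("latest PPL at offset %d, generation %llu\n",
	       off, area[off].gen);
	return 0;
}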
Signed-off-by: Pawel Baldysiak Signed-off-by: Shaohua Li --- drivers/md/raid5-ppl.c | 132 ++++++++++++++++++++++++++++------------- 1 file changed, 92 insertions(+), 40 deletions(-) diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index b313f17a6260..a98ef172f8e8 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -831,12 +831,14 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, return ret; } -static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr) +static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr, + sector_t offset) { struct ppl_conf *ppl_conf = log->ppl_conf; struct md_rdev *rdev = log->rdev; struct mddev *mddev = rdev->mddev; - sector_t ppl_sector = rdev->ppl.sector + (PPL_HEADER_SIZE >> 9); + sector_t ppl_sector = rdev->ppl.sector + offset + + (PPL_HEADER_SIZE >> 9); struct page *page; int i; int ret = 0; @@ -920,6 +922,9 @@ static int ppl_write_empty_header(struct ppl_log *log) return -ENOMEM; pplhdr = page_address(page); + /* zero out PPL space to avoid collision with old PPLs */ + blkdev_issue_zeroout(rdev->bdev, rdev->ppl.sector, + log->rdev->ppl.size, GFP_NOIO, 0); memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED); pplhdr->signature = cpu_to_le32(log->ppl_conf->signature); pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE)); @@ -940,63 +945,110 @@ static int ppl_load_distributed(struct ppl_log *log) struct ppl_conf *ppl_conf = log->ppl_conf; struct md_rdev *rdev = log->rdev; struct mddev *mddev = rdev->mddev; - struct page *page; - struct ppl_header *pplhdr; + struct page *page, *page2, *tmp; + struct ppl_header *pplhdr = NULL, *prev_pplhdr = NULL; u32 crc, crc_stored; u32 signature; - int ret = 0; + int ret = 0, i; + sector_t pplhdr_offset = 0, prev_pplhdr_offset = 0; pr_debug("%s: disk: %d\n", __func__, rdev->raid_disk); - - /* read PPL header */ + /* read PPL headers, find the recent one */ page = alloc_page(GFP_KERNEL); if (!page) return -ENOMEM; - if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset, - PAGE_SIZE, page, REQ_OP_READ, 0, false)) { - md_error(mddev, rdev); - ret = -EIO; - goto out; + page2 = alloc_page(GFP_KERNEL); + if (!page2) { + __free_page(page); + return -ENOMEM; } - pplhdr = page_address(page); - /* check header validity */ - crc_stored = le32_to_cpu(pplhdr->checksum); - pplhdr->checksum = 0; - crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE); + /* searching ppl area for latest ppl */ + while (pplhdr_offset < rdev->ppl.size - (PPL_HEADER_SIZE >> 9)) { + if (!sync_page_io(rdev, + rdev->ppl.sector - rdev->data_offset + + pplhdr_offset, PAGE_SIZE, page, REQ_OP_READ, + 0, false)) { + md_error(mddev, rdev); + ret = -EIO; + /* if not able to read - don't recover any PPL */ + pplhdr = NULL; + break; + } + pplhdr = page_address(page); - if (crc_stored != crc) { - pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x\n", - __func__, crc_stored, crc); + /* check header validity */ + crc_stored = le32_to_cpu(pplhdr->checksum); + pplhdr->checksum = 0; + crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE); + + if (crc_stored != crc) { + pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x (offset: %llu)\n", + __func__, crc_stored, crc, + (unsigned long long)pplhdr_offset); + pplhdr = prev_pplhdr; + pplhdr_offset = prev_pplhdr_offset; + break; + } + + signature = le32_to_cpu(pplhdr->signature); + + if (mddev->external) { + /* + * For external metadata the header signature is set and + * validated in userspace. 
+ */ + ppl_conf->signature = signature; + } else if (ppl_conf->signature != signature) { + pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x (offset: %llu)\n", + __func__, signature, ppl_conf->signature, + (unsigned long long)pplhdr_offset); + pplhdr = prev_pplhdr; + pplhdr_offset = prev_pplhdr_offset; + break; + } + + if (prev_pplhdr && le64_to_cpu(prev_pplhdr->generation) > + le64_to_cpu(pplhdr->generation)) { + /* previous was newest */ + pplhdr = prev_pplhdr; + pplhdr_offset = prev_pplhdr_offset; + break; + } + + prev_pplhdr_offset = pplhdr_offset; + prev_pplhdr = pplhdr; + + tmp = page; + page = page2; + page2 = tmp; + + /* calculate next potential ppl offset */ + for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++) + pplhdr_offset += + le32_to_cpu(pplhdr->entries[i].pp_size) >> 9; + pplhdr_offset += PPL_HEADER_SIZE >> 9; + } + + /* no valid ppl found */ + if (!pplhdr) ppl_conf->mismatch_count++; - goto out; - } - - signature = le32_to_cpu(pplhdr->signature); - - if (mddev->external) { - /* - * For external metadata the header signature is set and - * validated in userspace. - */ - ppl_conf->signature = signature; - } else if (ppl_conf->signature != signature) { - pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x\n", - __func__, signature, ppl_conf->signature); - ppl_conf->mismatch_count++; - goto out; - } + else + pr_debug("%s: latest PPL found at offset: %llu, with generation: %llu\n", + __func__, (unsigned long long)pplhdr_offset, + le64_to_cpu(pplhdr->generation)); /* attempt to recover from log if we are starting a dirty array */ - if (!mddev->pers && mddev->recovery_cp != MaxSector) - ret = ppl_recover(log, pplhdr); -out: + if (pplhdr && !mddev->pers && mddev->recovery_cp != MaxSector) + ret = ppl_recover(log, pplhdr, pplhdr_offset); + /* write empty header if we are starting the array */ if (!ret && !mddev->pers) ret = ppl_write_empty_header(log); __free_page(page); + __free_page(page2); pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n", __func__, ret, ppl_conf->mismatch_count, From e8a27f836f165c26f867ece7f31eb5c811692319 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 31 Aug 2017 10:23:25 +1000 Subject: [PATCH 12/12] md/bitmap: disable bitmap_resize for file-backed bitmaps. bitmap_resize() does not work for file-backed bitmaps. The buffer_heads are allocated and initialized when the bitmap is read from the file, but resize doesn't read from the file; it loads from the internal bitmap. When it comes time to write the new bitmap, the bh is non-existent and we crash. The common case when growing an array involves making the array larger, and that normally means making the bitmap larger. Doing that inside the kernel is possible, but would need more code. It is probably easier to require people who use file-backed bitmaps to remove them and re-add after a reshape. So this patch disables the resizing of arrays which have file-backed bitmaps. This is better than crashing. Reported-by: Zhilong Liu Fixes: d60b479d177a ("md/bitmap: add bitmap_resize function to allow bitmap resizing.") Cc: stable@vger.kernel.org (v3.5+).
Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/bitmap.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 67e992185a24..d2121637b4ab 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -2058,6 +2058,11 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, long pages; struct bitmap_page *new_bp; + if (bitmap->storage.file && !init) { + pr_info("md: cannot resize file-based bitmap\n"); + return -EINVAL; + } + if (chunksize == 0) { /* If there is enough space, leave the chunk size unchanged, * else increase by factor of two until there is enough space.