From aa5cbd103887011b4830355f88fb055f9ad2d556 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 14 Dec 2009 12:49:46 +1100
Subject: [PATCH 01/27] md/bitmap: protect against bitmap removal while being
 updated.

A write intent bitmap can be removed from an array while the
array is active.
When this happens, all IO is suspended and flushed before the
bitmap is removed.
However it is possible that bitmap_daemon_work is still running to
clear old bits from the bitmap.  If it is, it can dereference the
bitmap after it has been freed.

So introduce a new mutex to protect bitmap_daemon_work and get it
before destroying a bitmap.

This is suitable for any current -stable kernel.

Signed-off-by: NeilBrown <neilb@suse.de>
Cc: stable@kernel.org
---
 drivers/md/bitmap.c | 24 ++++++++++++++++++------
 drivers/md/bitmap.h |  2 +-
 drivers/md/md.c     |  3 ++-
 drivers/md/md.h     |  1 +
 4 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 60e2b322db11..a5e5f2fbf963 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1078,23 +1078,31 @@ static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
  *			out to disk
  */
 
-void bitmap_daemon_work(struct bitmap *bitmap)
+void bitmap_daemon_work(mddev_t *mddev)
 {
+	struct bitmap *bitmap;
 	unsigned long j;
 	unsigned long flags;
 	struct page *page = NULL, *lastpage = NULL;
 	int blocks;
 	void *paddr;
 
-	if (bitmap == NULL)
+	/* Use a mutex to guard daemon_work against
+	 * bitmap_destroy.
+	 */
+	mutex_lock(&mddev->bitmap_mutex);
+	bitmap = mddev->bitmap;
+	if (bitmap == NULL) {
+		mutex_unlock(&mddev->bitmap_mutex);
 		return;
+	}
 	if (time_before(jiffies, bitmap->daemon_lastrun + bitmap->daemon_sleep*HZ))
 		goto done;
 
 	bitmap->daemon_lastrun = jiffies;
 	if (bitmap->allclean) {
 		bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
-		return;
+		goto done;
 	}
 	bitmap->allclean = 1;
 
@@ -1203,6 +1211,7 @@ void bitmap_daemon_work(struct bitmap *bitmap)
  done:
 	if (bitmap->allclean == 0)
 		bitmap->mddev->thread->timeout = bitmap->daemon_sleep * HZ;
+	mutex_unlock(&mddev->bitmap_mutex);
 }
 
 static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
@@ -1541,9 +1550,9 @@ void bitmap_flush(mddev_t *mddev)
 	 */
 	sleep = bitmap->daemon_sleep;
 	bitmap->daemon_sleep = 0;
-	bitmap_daemon_work(bitmap);
-	bitmap_daemon_work(bitmap);
-	bitmap_daemon_work(bitmap);
+	bitmap_daemon_work(mddev);
+	bitmap_daemon_work(mddev);
+	bitmap_daemon_work(mddev);
 	bitmap->daemon_sleep = sleep;
 	bitmap_update_sb(bitmap);
 }
@@ -1574,6 +1583,7 @@ static void bitmap_free(struct bitmap *bitmap)
 	kfree(bp);
 	kfree(bitmap);
 }
+
 void bitmap_destroy(mddev_t *mddev)
 {
 	struct bitmap *bitmap = mddev->bitmap;
@@ -1581,7 +1591,9 @@ void bitmap_destroy(mddev_t *mddev)
 	if (!bitmap) /* there was no bitmap */
 		return;
 
+	mutex_lock(&mddev->bitmap_mutex);
 	mddev->bitmap = NULL; /* disconnect from the md device */
+	mutex_unlock(&mddev->bitmap_mutex);
 	if (mddev->thread)
 		mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
 
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index e98900671ca9..7e38d13ddcac 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -282,7 +282,7 @@ void bitmap_close_sync(struct bitmap *bitmap);
 void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
 
 void bitmap_unplug(struct bitmap *bitmap);
-void bitmap_daemon_work(struct bitmap *bitmap);
+void bitmap_daemon_work(mddev_t *mddev);
 #endif
 
 #endif
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5f154ef1e4be..1be7a16a7a53 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -363,6 +363,7 @@ static mddev_t * mddev_find(dev_t unit)
 
 	mutex_init(&new->open_mutex);
 	mutex_init(&new->reconfig_mutex);
+	mutex_init(&new->bitmap_mutex);
 	INIT_LIST_HEAD(&new->disks);
 	INIT_LIST_HEAD(&new->all_mddevs);
 	init_timer(&new->safemode_timer);
@@ -6625,7 +6626,7 @@ void md_check_recovery(mddev_t *mddev)
 
 
 	if (mddev->bitmap)
-		bitmap_daemon_work(mddev->bitmap);
+		bitmap_daemon_work(mddev);
 
 	if (mddev->ro)
 		return;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index f184b69ef337..87430fea2875 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -289,6 +289,7 @@ struct mddev_s
 								* hot-adding a bitmap.  It should
 								* eventually be settable by sysfs.
 								*/
+	struct mutex			bitmap_mutex;
 
 	struct list_head		all_mddevs;
 };

From 8553fe7ec731e4f997245e14319572cb15118018 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 14 Dec 2009 12:49:47 +1100
Subject: [PATCH 02/27] md/raid5: remove some sparse warnings.

qd_idx is previously declared and given exactly the same value!

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d29215d966da..34cb065f6d66 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3358,7 +3358,6 @@ static void handle_stripe6(struct stripe_head *sh)
 	 * completed
 	 */
 	if (sh->reconstruct_state == reconstruct_state_drain_result) {
-		int qd_idx = sh->qd_idx;
 
 		sh->reconstruct_state = reconstruct_state_idle;
 		/* All the 'written' buffers and the parity blocks are ready to
@@ -3741,7 +3740,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
 {
 	mddev_t *mddev = q->queuedata;
 	raid5_conf_t *conf = mddev->private;
-	unsigned int dd_idx;
+	int dd_idx;
 	struct bio* align_bi;
 	mdk_rdev_t *rdev;
 

From 7820f9e1dddcfebae2698fb2a245d04ce3aa6e74 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 14 Dec 2009 12:49:47 +1100
Subject: [PATCH 03/27] md: remove sparse warning:symbol XXX was not declared.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid6algos.c | 19 -------------------
 include/linux/raid/pq.h | 19 +++++++++++++++++++
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c
index 866215ac7f25..8ce102cc2e7b 100644
--- a/drivers/md/raid6algos.c
+++ b/drivers/md/raid6algos.c
@@ -31,25 +31,6 @@ EXPORT_SYMBOL(raid6_empty_zero_page);
 struct raid6_calls raid6_call;
 EXPORT_SYMBOL_GPL(raid6_call);
 
-/* Various routine sets */
-extern const struct raid6_calls raid6_intx1;
-extern const struct raid6_calls raid6_intx2;
-extern const struct raid6_calls raid6_intx4;
-extern const struct raid6_calls raid6_intx8;
-extern const struct raid6_calls raid6_intx16;
-extern const struct raid6_calls raid6_intx32;
-extern const struct raid6_calls raid6_mmxx1;
-extern const struct raid6_calls raid6_mmxx2;
-extern const struct raid6_calls raid6_sse1x1;
-extern const struct raid6_calls raid6_sse1x2;
-extern const struct raid6_calls raid6_sse2x1;
-extern const struct raid6_calls raid6_sse2x2;
-extern const struct raid6_calls raid6_sse2x4;
-extern const struct raid6_calls raid6_altivec1;
-extern const struct raid6_calls raid6_altivec2;
-extern const struct raid6_calls raid6_altivec4;
-extern const struct raid6_calls raid6_altivec8;
-
 const struct raid6_calls * const raid6_algos[] = {
 	&raid6_intx1,
 	&raid6_intx2,
diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index d92480f8285c..1cbbd2c11aa9 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -78,6 +78,25 @@ struct raid6_calls {
 /* Selected algorithm */
 extern struct raid6_calls raid6_call;
 
+/* Various routine sets */
+extern const struct raid6_calls raid6_intx1;
+extern const struct raid6_calls raid6_intx2;
+extern const struct raid6_calls raid6_intx4;
+extern const struct raid6_calls raid6_intx8;
+extern const struct raid6_calls raid6_intx16;
+extern const struct raid6_calls raid6_intx32;
+extern const struct raid6_calls raid6_mmxx1;
+extern const struct raid6_calls raid6_mmxx2;
+extern const struct raid6_calls raid6_sse1x1;
+extern const struct raid6_calls raid6_sse1x2;
+extern const struct raid6_calls raid6_sse2x1;
+extern const struct raid6_calls raid6_sse2x2;
+extern const struct raid6_calls raid6_sse2x4;
+extern const struct raid6_calls raid6_altivec1;
+extern const struct raid6_calls raid6_altivec2;
+extern const struct raid6_calls raid6_altivec4;
+extern const struct raid6_calls raid6_altivec8;
+
 /* Algorithm list */
 extern const struct raid6_calls * const raid6_algos[];
 int raid6_select_algo(void);

From c07b70ad32ed0a5ec9735cafb1aa10b3a2298b7d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 14 Dec 2009 12:49:48 +1100
Subject: [PATCH 04/27] md: adjust resync_min usefully when resync aborts.

When a 'check' or 'repair' finished we should clear resync_min
so that a future check/repair will cover the whole array (by default).
However if it is interrupted, we should update resync_min to
where we got up to, so that when the check/repair continues it
just does the remainder of the array.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1be7a16a7a53..a08aa582925b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6524,11 +6524,15 @@ void md_do_sync(mddev_t *mddev)
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 
  skip:
+	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+		/* We completed so min/max setting can be forgotten if used. */
+		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+			mddev->resync_min = 0;
+		mddev->resync_max = MaxSector;
+	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+		mddev->resync_min = mddev->curr_resync_completed;
 	mddev->curr_resync = 0;
 	mddev->curr_resync_completed = 0;
-	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
-		/* We completed so max setting can be forgotten. */
-		mddev->resync_max = MaxSector;
 	sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 	wake_up(&resync_wait);
 	set_bit(MD_RECOVERY_DONE, &mddev->recovery);

From efa593390e70b0e3c39f6b2dca8876b6b1461e41 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 14 Dec 2009 12:49:49 +1100
Subject: [PATCH 05/27] md: don't reset curr_resync_completed after an
 interrupted resync

If a resync/recovery/check/repair is interrupted for some reason, it
can be useful to know exactly where it got up to.
So in that case, do not clear curr_resync_completed.
Initialise it when starting a resync/recovery/... instead.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index a08aa582925b..d79a40649799 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6381,6 +6381,7 @@ void md_do_sync(mddev_t *mddev)
 		       desc, mdname(mddev));
 		mddev->curr_resync = j;
 	}
+	mddev->curr_resync_completed = mddev->curr_resync;
 
 	while (j < max_sectors) {
 		sector_t sectors;
@@ -6532,7 +6533,8 @@ void md_do_sync(mddev_t *mddev)
 	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
 		mddev->resync_min = mddev->curr_resync_completed;
 	mddev->curr_resync = 0;
-	mddev->curr_resync_completed = 0;
+	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+		mddev->curr_resync_completed = 0;
 	sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 	wake_up(&resync_wait);
 	set_bit(MD_RECOVERY_DONE, &mddev->recovery);

From a2826aa92e2e14db372eda01d333267258944033 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 14 Dec 2009 12:49:49 +1100
Subject: [PATCH 06/27] md: support barrier requests on all personalities.

Previously barriers were only supported on RAID1.  This is because
other levels requires synchronisation across all devices and so needed
a different approach.
Here is that approach.

When a barrier arrives, we send a zero-length barrier to every active
device.  When that completes - and if the original request was not
empty -  we submit the barrier request itself (with the barrier flag
cleared) and then submit a fresh load of zero length barriers.

The barrier request itself is asynchronous, but any subsequent
request will block until the barrier completes.

The reason for clearing the barrier flag is that a barrier request is
allowed to fail.  If we pass a non-empty barrier through a striping
raid level it is conceivable that part of it could succeed and part
could fail.  That would be way too hard to deal with.
So if the first run of zero length barriers succeed, we assume all is
sufficiently well that we send the request and ignore errors in the
second run of barriers.

RAID5 needs extra care as write requests may not have been submitted
to the underlying devices yet.  So we flush the stripe cache before
proceeding with the barrier.

Note that the second set of zero-length barriers are submitted
immediately after the original request is submitted.  Thus when
a personality finds mddev->barrier to be set during make_request,
it should not return from make_request until the corresponding
per-device request(s) have been queued.

That will be done in later patches.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Andre Noll <maan@systemlinux.org>
---
 drivers/md/linear.c    |   2 +-
 drivers/md/md.c        | 105 ++++++++++++++++++++++++++++++++++++++++-
 drivers/md/md.h        |  12 +++++
 drivers/md/multipath.c |   2 +-
 drivers/md/raid0.c     |   2 +-
 drivers/md/raid10.c    |   2 +-
 drivers/md/raid5.c     |   8 +++-
 7 files changed, 126 insertions(+), 7 deletions(-)

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 1ceceb334d5e..3b3f77c4f249 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -292,7 +292,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
 	int cpu;
 
 	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
-		bio_endio(bio, -EOPNOTSUPP);
+		md_barrier_request(mddev, bio);
 		return 0;
 	}
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d79a40649799..569f25183db6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -213,12 +213,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
 		return 0;
 	}
 	rcu_read_lock();
-	if (mddev->suspended) {
+	if (mddev->suspended || mddev->barrier) {
 		DEFINE_WAIT(__wait);
 		for (;;) {
 			prepare_to_wait(&mddev->sb_wait, &__wait,
 					TASK_UNINTERRUPTIBLE);
-			if (!mddev->suspended)
+			if (!mddev->suspended && !mddev->barrier)
 				break;
 			rcu_read_unlock();
 			schedule();
@@ -260,10 +260,110 @@ static void mddev_resume(mddev_t *mddev)
 
 int mddev_congested(mddev_t *mddev, int bits)
 {
+	if (mddev->barrier)
+		return 1;
 	return mddev->suspended;
 }
 EXPORT_SYMBOL(mddev_congested);
 
+/*
+ * Generic barrier handling for md
+ */
+
+#define POST_REQUEST_BARRIER ((void*)1)
+
+static void md_end_barrier(struct bio *bio, int err)
+{
+	mdk_rdev_t *rdev = bio->bi_private;
+	mddev_t *mddev = rdev->mddev;
+	if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
+		set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
+
+	rdev_dec_pending(rdev, mddev);
+
+	if (atomic_dec_and_test(&mddev->flush_pending)) {
+		if (mddev->barrier == POST_REQUEST_BARRIER) {
+			/* This was a post-request barrier */
+			mddev->barrier = NULL;
+			wake_up(&mddev->sb_wait);
+		} else
+			/* The pre-request barrier has finished */
+			schedule_work(&mddev->barrier_work);
+	}
+	bio_put(bio);
+}
+
+static void submit_barriers(mddev_t *mddev)
+{
+	mdk_rdev_t *rdev;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
+		if (rdev->raid_disk >= 0 &&
+		    !test_bit(Faulty, &rdev->flags)) {
+			/* Take two references, one is dropped
+			 * when request finishes, one after
+			 * we reclaim rcu_read_lock
+			 */
+			struct bio *bi;
+			atomic_inc(&rdev->nr_pending);
+			atomic_inc(&rdev->nr_pending);
+			rcu_read_unlock();
+			bi = bio_alloc(GFP_KERNEL, 0);
+			bi->bi_end_io = md_end_barrier;
+			bi->bi_private = rdev;
+			bi->bi_bdev = rdev->bdev;
+			atomic_inc(&mddev->flush_pending);
+			submit_bio(WRITE_BARRIER, bi);
+			rcu_read_lock();
+			rdev_dec_pending(rdev, mddev);
+		}
+	rcu_read_unlock();
+}
+
+static void md_submit_barrier(struct work_struct *ws)
+{
+	mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
+	struct bio *bio = mddev->barrier;
+
+	atomic_set(&mddev->flush_pending, 1);
+
+	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
+		bio_endio(bio, -EOPNOTSUPP);
+	else if (bio->bi_size == 0)
+		/* an empty barrier - all done */
+		bio_endio(bio, 0);
+	else {
+		bio->bi_rw &= ~(1<<BIO_RW_BARRIER);
+		if (mddev->pers->make_request(mddev->queue, bio))
+			generic_make_request(bio);
+		mddev->barrier = POST_REQUEST_BARRIER;
+		submit_barriers(mddev);
+	}
+	if (atomic_dec_and_test(&mddev->flush_pending)) {
+		mddev->barrier = NULL;
+		wake_up(&mddev->sb_wait);
+	}
+}
+
+void md_barrier_request(mddev_t *mddev, struct bio *bio)
+{
+	spin_lock_irq(&mddev->write_lock);
+	wait_event_lock_irq(mddev->sb_wait,
+			    !mddev->barrier,
+			    mddev->write_lock, /*nothing*/);
+	mddev->barrier = bio;
+	spin_unlock_irq(&mddev->write_lock);
+
+	atomic_set(&mddev->flush_pending, 1);
+	INIT_WORK(&mddev->barrier_work, md_submit_barrier);
+
+	submit_barriers(mddev);
+
+	if (atomic_dec_and_test(&mddev->flush_pending))
+		schedule_work(&mddev->barrier_work);
+}
+EXPORT_SYMBOL(md_barrier_request);
 
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
@@ -371,6 +471,7 @@ static mddev_t * mddev_find(dev_t unit)
 	atomic_set(&new->openers, 0);
 	atomic_set(&new->active_io, 0);
 	spin_lock_init(&new->write_lock);
+	atomic_set(&new->flush_pending, 0);
 	init_waitqueue_head(&new->sb_wait);
 	init_waitqueue_head(&new->recovery_wait);
 	new->reshape_position = MaxSector;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 87430fea2875..cb036868a9e9 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -292,6 +292,17 @@ struct mddev_s
 	struct mutex			bitmap_mutex;
 
 	struct list_head		all_mddevs;
+
+	/* Generic barrier handling.
+	 * If there is a pending barrier request, all other
+	 * writes are blocked while the devices are flushed.
+	 * The last to finish a flush schedules a worker to
+	 * submit the barrier request (without the barrier flag),
+	 * then submit more flush requests.
+	 */
+	struct bio *barrier;
+	atomic_t flush_pending;
+	struct work_struct barrier_work;
 };
 
 
@@ -432,6 +443,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
 extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
 
 extern int mddev_congested(mddev_t *mddev, int bits);
+extern void md_barrier_request(mddev_t *mddev, struct bio *bio);
 extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 			   sector_t sector, int size, struct page *page);
 extern void md_super_wait(mddev_t *mddev);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index ee7646f974a0..cbc0a99f3796 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -145,7 +145,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
 	int cpu;
 
 	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
-		bio_endio(bio, -EOPNOTSUPP);
+		md_barrier_request(mddev, bio);
 		return 0;
 	}
 
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index d3a4ce06015a..122d07af5b54 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -453,7 +453,7 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio)
 	int cpu;
 
 	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
-		bio_endio(bio, -EOPNOTSUPP);
+		md_barrier_request(mddev, bio);
 		return 0;
 	}
 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c2cb7b87b440..2fbf867f8b30 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -804,7 +804,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	mdk_rdev_t *blocked_rdev;
 
 	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
-		bio_endio(bio, -EOPNOTSUPP);
+		md_barrier_request(mddev, bio);
 		return 0;
 	}
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 34cb065f6d66..8c9395f2028f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3865,7 +3865,13 @@ static int make_request(struct request_queue *q, struct bio * bi)
 	int cpu, remaining;
 
 	if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
-		bio_endio(bi, -EOPNOTSUPP);
+		/* Drain all pending writes.  We only really need
+		 * to ensure they have been submitted, but this is
+		 * easier.
+		 */
+		mddev->pers->quiesce(mddev, 1);
+		mddev->pers->quiesce(mddev, 0);
+		md_barrier_request(mddev, bi);
 		return 0;
 	}
 

From 729a18663a30a9c8076e3adc2b3e4c866974f935 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 14 Dec 2009 12:49:50 +1100
Subject: [PATCH 07/27] md/raid5: don't complete make_request on barrier until
 writes are scheduled

The post-barrier-flush is sent by md as soon as make_request on the
barrier write completes.  For raid5, the data might not be in the
per-device queues yet.  So for barrier requests, wait for any
pre-reading to be done so that the request will be in the per-device
queues.

We use the 'preread_active' count to check that nothing is still in
the preread phase, and delay the decrement of this count until after
write requests have been submitted to the underlying devices.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 51 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 12 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8c9395f2028f..c78d4835b0d6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2947,6 +2947,7 @@ static void handle_stripe5(struct stripe_head *sh)
 	struct r5dev *dev;
 	mdk_rdev_t *blocked_rdev = NULL;
 	int prexor;
+	int dec_preread_active = 0;
 
 	memset(&s, 0, sizeof(s));
 	pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
@@ -3096,12 +3097,8 @@ static void handle_stripe5(struct stripe_head *sh)
 					set_bit(STRIPE_INSYNC, &sh->state);
 			}
 		}
-		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-			atomic_dec(&conf->preread_active_stripes);
-			if (atomic_read(&conf->preread_active_stripes) <
-				IO_THRESHOLD)
-				md_wakeup_thread(conf->mddev->thread);
-		}
+		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+			dec_preread_active = 1;
 	}
 
 	/* Now to consider new write requests and what else, if anything
@@ -3208,6 +3205,16 @@ static void handle_stripe5(struct stripe_head *sh)
 
 	ops_run_io(sh, &s);
 
+	if (dec_preread_active) {
+		/* We delay this until after ops_run_io so that if make_request
+		 * is waiting on a barrier, it won't continue until the writes
+		 * have actually been submitted.
+		 */
+		atomic_dec(&conf->preread_active_stripes);
+		if (atomic_read(&conf->preread_active_stripes) <
+		    IO_THRESHOLD)
+			md_wakeup_thread(conf->mddev->thread);
+	}
 	return_io(return_bi);
 }
 
@@ -3221,6 +3228,7 @@ static void handle_stripe6(struct stripe_head *sh)
 	struct r6_state r6s;
 	struct r5dev *dev, *pdev, *qdev;
 	mdk_rdev_t *blocked_rdev = NULL;
+	int dec_preread_active = 0;
 
 	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
 		"pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
@@ -3379,12 +3387,8 @@ static void handle_stripe6(struct stripe_head *sh)
 					set_bit(STRIPE_INSYNC, &sh->state);
 			}
 		}
-		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-			atomic_dec(&conf->preread_active_stripes);
-			if (atomic_read(&conf->preread_active_stripes) <
-				IO_THRESHOLD)
-				md_wakeup_thread(conf->mddev->thread);
-		}
+		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+			dec_preread_active = 1;
 	}
 
 	/* Now to consider new write requests and what else, if anything
@@ -3493,6 +3497,18 @@ static void handle_stripe6(struct stripe_head *sh)
 
 	ops_run_io(sh, &s);
 
+
+	if (dec_preread_active) {
+		/* We delay this until after ops_run_io so that if make_request
+		 * is waiting on a barrier, it won't continue until the writes
+		 * have actually been submitted.
+		 */
+		atomic_dec(&conf->preread_active_stripes);
+		if (atomic_read(&conf->preread_active_stripes) <
+		    IO_THRESHOLD)
+			md_wakeup_thread(conf->mddev->thread);
+	}
+
 	return_io(return_bi);
 }
 
@@ -3995,6 +4011,9 @@ static int make_request(struct request_queue *q, struct bio * bi)
 			finish_wait(&conf->wait_for_overlap, &w);
 			set_bit(STRIPE_HANDLE, &sh->state);
 			clear_bit(STRIPE_DELAYED, &sh->state);
+			if (mddev->barrier && 
+			    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+				atomic_inc(&conf->preread_active_stripes);
 			release_stripe(sh);
 		} else {
 			/* cannot get stripe for read-ahead, just give-up */
@@ -4014,6 +4033,14 @@ static int make_request(struct request_queue *q, struct bio * bi)
 
 		bio_endio(bi, 0);
 	}
+
+	if (mddev->barrier) {
+		/* We need to wait for the stripes to all be handled.
+		 * So: wait for preread_active_stripes to drop to 0.
+		 */
+		wait_event(mddev->thread->wqueue,
+			   atomic_read(&conf->preread_active_stripes) == 0);
+	}
 	return 0;
 }
 

From 6eef4b21ffc9207b78fdd718154f5b004644b97d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 14 Dec 2009 12:49:51 +1100
Subject: [PATCH 08/27] md: add honouring of suspend_{lo,hi} to raid1.

This will allow us to stop writeout to portions of the array
while  they are resynced by someone else - e.g. another node in
a cluster.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid1.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e07ce2e033a9..35b2d8646ae9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -801,6 +801,25 @@ static int make_request(struct request_queue *q, struct bio * bio)
 
 	md_write_start(mddev, bio); /* wait on superblock update early */
 
+	if (bio_data_dir(bio) == WRITE &&
+	    bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo &&
+	    bio->bi_sector < mddev->suspend_hi) {
+		/* As the suspend_* range is controlled by
+		 * userspace, we want an interruptible
+		 * wait.
+		 */
+		DEFINE_WAIT(w);
+		for (;;) {
+			flush_signals(current);
+			prepare_to_wait(&conf->wait_barrier,
+					&w, TASK_INTERRUPTIBLE);
+			if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo ||
+			    bio->bi_sector >= mddev->suspend_hi)
+				break;
+			schedule();
+		}
+		finish_wait(&conf->wait_barrier, &w);
+	}
 	if (unlikely(!mddev->barriers_work &&
 		     bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		if (rw == WRITE)
@@ -2271,6 +2290,9 @@ static void raid1_quiesce(mddev_t *mddev, int state)
 	conf_t *conf = mddev->private;
 
 	switch(state) {
+	case 2: /* wake for suspend */
+		wake_up(&conf->wait_barrier);
+		break;
 	case 1:
 		raise_barrier(conf);
 		break;

From 709ae4879ae33628ded276ce7da8cd5acfec476b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 14 Dec 2009 12:49:51 +1100
Subject: [PATCH 09/27] md/raid1: add takeover support for raid5->raid1

A 2-device raid5 array can now be converted to raid1.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid1.c | 195 +++++++++++++++++++++++++++------------------
 drivers/md/raid1.h |   5 ++
 2 files changed, 122 insertions(+), 78 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 35b2d8646ae9..7549b0bad326 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -677,6 +677,7 @@ static void raise_barrier(conf_t *conf)
 static void lower_barrier(conf_t *conf)
 {
 	unsigned long flags;
+	BUG_ON(conf->barrier <= 0);
 	spin_lock_irqsave(&conf->resync_lock, flags);
 	conf->barrier--;
 	spin_unlock_irqrestore(&conf->resync_lock, flags);
@@ -1960,74 +1961,48 @@ static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 	return mddev->dev_sectors;
 }
 
-static int run(mddev_t *mddev)
+static conf_t *setup_conf(mddev_t *mddev)
 {
 	conf_t *conf;
-	int i, j, disk_idx;
+	int i;
 	mirror_info_t *disk;
 	mdk_rdev_t *rdev;
+	int err = -ENOMEM;
 
-	if (mddev->level != 1) {
-		printk("raid1: %s: raid level not set to mirroring (%d)\n",
-		       mdname(mddev), mddev->level);
-		goto out;
-	}
-	if (mddev->reshape_position != MaxSector) {
-		printk("raid1: %s: reshape_position set but not supported\n",
-		       mdname(mddev));
-		goto out;
-	}
-	/*
-	 * copy the already verified devices into our private RAID1
-	 * bookkeeping area. [whatever we allocate in run(),
-	 * should be freed in stop()]
-	 */
 	conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
-	mddev->private = conf;
 	if (!conf)
-		goto out_no_mem;
+		goto abort;
 
 	conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
 				 GFP_KERNEL);
 	if (!conf->mirrors)
-		goto out_no_mem;
+		goto abort;
 
 	conf->tmppage = alloc_page(GFP_KERNEL);
 	if (!conf->tmppage)
-		goto out_no_mem;
+		goto abort;
 
-	conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
+	conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
 	if (!conf->poolinfo)
-		goto out_no_mem;
-	conf->poolinfo->mddev = NULL;
+		goto abort;
 	conf->poolinfo->raid_disks = mddev->raid_disks;
 	conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
 					  r1bio_pool_free,
 					  conf->poolinfo);
 	if (!conf->r1bio_pool)
-		goto out_no_mem;
+		goto abort;
+
 	conf->poolinfo->mddev = mddev;
 
 	spin_lock_init(&conf->device_lock);
-	mddev->queue->queue_lock = &conf->device_lock;
-
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
-		disk_idx = rdev->raid_disk;
+		int disk_idx = rdev->raid_disk;
 		if (disk_idx >= mddev->raid_disks
 		    || disk_idx < 0)
 			continue;
 		disk = conf->mirrors + disk_idx;
 
 		disk->rdev = rdev;
-		disk_stack_limits(mddev->gendisk, rdev->bdev,
-				  rdev->data_offset << 9);
-		/* as we don't honour merge_bvec_fn, we must never risk
-		 * violating it, so limit ->max_sector to one PAGE, as
-		 * a one page request is never in violation.
-		 */
-		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-		    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
-			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 		disk->head_position = 0;
 	}
@@ -2041,8 +2016,7 @@ static int run(mddev_t *mddev)
 	bio_list_init(&conf->pending_bio_list);
 	bio_list_init(&conf->flushing_bio_list);
 
-
-	mddev->degraded = 0;
+	conf->last_used = -1;
 	for (i = 0; i < conf->raid_disks; i++) {
 
 		disk = conf->mirrors + i;
@@ -2050,38 +2024,97 @@ static int run(mddev_t *mddev)
 		if (!disk->rdev ||
 		    !test_bit(In_sync, &disk->rdev->flags)) {
 			disk->head_position = 0;
-			mddev->degraded++;
 			if (disk->rdev)
 				conf->fullsync = 1;
-		}
+		} else if (conf->last_used < 0)
+			/*
+			 * The first working device is used as a
+			 * starting point to read balancing.
+			 */
+			conf->last_used = i;
 	}
-	if (mddev->degraded == conf->raid_disks) {
+
+	err = -EIO;
+	if (conf->last_used < 0) {
 		printk(KERN_ERR "raid1: no operational mirrors for %s\n",
-			mdname(mddev));
-		goto out_free_conf;
+		       mdname(mddev));
+		goto abort;
 	}
-	if (conf->raid_disks - mddev->degraded == 1)
-		mddev->recovery_cp = MaxSector;
-
-	/*
-	 * find the first working one and use it as a starting point
-	 * to read balancing.
-	 */
-	for (j = 0; j < conf->raid_disks &&
-		     (!conf->mirrors[j].rdev ||
-		      !test_bit(In_sync, &conf->mirrors[j].rdev->flags)) ; j++)
-		/* nothing */;
-	conf->last_used = j;
-
-
-	mddev->thread = md_register_thread(raid1d, mddev, NULL);
-	if (!mddev->thread) {
+	err = -ENOMEM;
+	conf->thread = md_register_thread(raid1d, mddev, NULL);
+	if (!conf->thread) {
 		printk(KERN_ERR
 		       "raid1: couldn't allocate thread for %s\n",
 		       mdname(mddev));
-		goto out_free_conf;
+		goto abort;
 	}
 
+	return conf;
+
+ abort:
+	if (conf) {
+		if (conf->r1bio_pool)
+			mempool_destroy(conf->r1bio_pool);
+		kfree(conf->mirrors);
+		safe_put_page(conf->tmppage);
+		kfree(conf->poolinfo);
+		kfree(conf);
+	}
+	return ERR_PTR(err);
+}
+
+static int run(mddev_t *mddev)
+{
+	conf_t *conf;
+	int i;
+	mdk_rdev_t *rdev;
+
+	if (mddev->level != 1) {
+		printk("raid1: %s: raid level not set to mirroring (%d)\n",
+		       mdname(mddev), mddev->level);
+		return -EIO;
+	}
+	if (mddev->reshape_position != MaxSector) {
+		printk("raid1: %s: reshape_position set but not supported\n",
+		       mdname(mddev));
+		return -EIO;
+	}
+	/*
+	 * copy the already verified devices into our private RAID1
+	 * bookkeeping area. [whatever we allocate in run(),
+	 * should be freed in stop()]
+	 */
+	if (mddev->private == NULL)
+		conf = setup_conf(mddev);
+	else
+		conf = mddev->private;
+
+	if (IS_ERR(conf))
+		return PTR_ERR(conf);
+
+	mddev->queue->queue_lock = &conf->device_lock;
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		disk_stack_limits(mddev->gendisk, rdev->bdev,
+				  rdev->data_offset << 9);
+		/* as we don't honour merge_bvec_fn, we must never risk
+		 * violating it, so limit ->max_sector to one PAGE, as
+		 * a one page request is never in violation.
+		 */
+		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
+		    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
+			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
+	}
+
+	mddev->degraded = 0;
+	for (i=0; i < conf->raid_disks; i++)
+		if (conf->mirrors[i].rdev == NULL ||
+		    !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
+		    test_bit(Faulty, &conf->mirrors[i].rdev->flags))
+			mddev->degraded++;
+
+	if (conf->raid_disks - mddev->degraded == 1)
+		mddev->recovery_cp = MaxSector;
+
 	if (mddev->recovery_cp != MaxSector)
 		printk(KERN_NOTICE "raid1: %s is not clean"
 		       " -- starting background reconstruction\n",
@@ -2090,9 +2123,14 @@ static int run(mddev_t *mddev)
 		"raid1: raid set %s active with %d out of %d mirrors\n",
 		mdname(mddev), mddev->raid_disks - mddev->degraded, 
 		mddev->raid_disks);
+
 	/*
 	 * Ok, everything is just fine now
 	 */
+	mddev->thread = conf->thread;
+	conf->thread = NULL;
+	mddev->private = conf;
+
 	md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
 
 	mddev->queue->unplug_fn = raid1_unplug;
@@ -2100,23 +2138,6 @@ static int run(mddev_t *mddev)
 	mddev->queue->backing_dev_info.congested_data = mddev;
 	md_integrity_register(mddev);
 	return 0;
-
-out_no_mem:
-	printk(KERN_ERR "raid1: couldn't allocate memory for %s\n",
-	       mdname(mddev));
-
-out_free_conf:
-	if (conf) {
-		if (conf->r1bio_pool)
-			mempool_destroy(conf->r1bio_pool);
-		kfree(conf->mirrors);
-		safe_put_page(conf->tmppage);
-		kfree(conf->poolinfo);
-		kfree(conf);
-		mddev->private = NULL;
-	}
-out:
-	return -EIO;
 }
 
 static int stop(mddev_t *mddev)
@@ -2302,6 +2323,23 @@ static void raid1_quiesce(mddev_t *mddev, int state)
 	}
 }
 
+static void *raid1_takeover(mddev_t *mddev)
+{
+	/* raid1 can take over:
+	 *  raid5 with 2 devices, any layout or chunk size
+	 */
+	if (mddev->level == 5 && mddev->raid_disks == 2) {
+		conf_t *conf;
+		mddev->new_level = 1;
+		mddev->new_layout = 0;
+		mddev->new_chunk_sectors = 0;
+		conf = setup_conf(mddev);
+		if (!IS_ERR(conf))
+			conf->barrier = 1;
+		return conf;
+	}
+	return ERR_PTR(-EINVAL);
+}
 
 static struct mdk_personality raid1_personality =
 {
@@ -2321,6 +2359,7 @@ static struct mdk_personality raid1_personality =
 	.size		= raid1_size,
 	.check_reshape	= raid1_reshape,
 	.quiesce	= raid1_quiesce,
+	.takeover	= raid1_takeover,
 };
 
 static int __init raid_init(void)
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index e87b84deff68..5f2d443ae28a 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -59,6 +59,11 @@ struct r1_private_data_s {
 
 	mempool_t *r1bio_pool;
 	mempool_t *r1buf_pool;
+
+	/* When taking over an array from a different personality, we store
+	 * the new thread here until we fully activate the array.
+	 */
+	struct mdk_thread_s	*thread;
 };
 
 typedef struct r1_private_data_s conf_t;

From c3d9714e88c8685cf9bc837c3241fc005f95fb82 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 14 Dec 2009 12:49:52 +1100
Subject: [PATCH 10/27] md: collect bitmap-specific fields into one structure.

In preparation for making bitmap fields configurable via sysfs,
start tidying up by making a single structure to contain the
configuration fields.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 18 +++++------
 drivers/md/md.c     | 79 ++++++++++++++++++++++++---------------------
 drivers/md/md.h     | 22 +++++++------
 3 files changed, 63 insertions(+), 56 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index a5e5f2fbf963..c9270f2c9ecd 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1090,10 +1090,10 @@ void bitmap_daemon_work(mddev_t *mddev)
 	/* Use a mutex to guard daemon_work against
 	 * bitmap_destroy.
 	 */
-	mutex_lock(&mddev->bitmap_mutex);
+	mutex_lock(&mddev->bitmap_info.mutex);
 	bitmap = mddev->bitmap;
 	if (bitmap == NULL) {
-		mutex_unlock(&mddev->bitmap_mutex);
+		mutex_unlock(&mddev->bitmap_info.mutex);
 		return;
 	}
 	if (time_before(jiffies, bitmap->daemon_lastrun + bitmap->daemon_sleep*HZ))
@@ -1211,7 +1211,7 @@ void bitmap_daemon_work(mddev_t *mddev)
  done:
 	if (bitmap->allclean == 0)
 		bitmap->mddev->thread->timeout = bitmap->daemon_sleep * HZ;
-	mutex_unlock(&mddev->bitmap_mutex);
+	mutex_unlock(&mddev->bitmap_info.mutex);
 }
 
 static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
@@ -1591,9 +1591,9 @@ void bitmap_destroy(mddev_t *mddev)
 	if (!bitmap) /* there was no bitmap */
 		return;
 
-	mutex_lock(&mddev->bitmap_mutex);
+	mutex_lock(&mddev->bitmap_info.mutex);
 	mddev->bitmap = NULL; /* disconnect from the md device */
-	mutex_unlock(&mddev->bitmap_mutex);
+	mutex_unlock(&mddev->bitmap_info.mutex);
 	if (mddev->thread)
 		mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
 
@@ -1610,16 +1610,16 @@ int bitmap_create(mddev_t *mddev)
 	sector_t blocks = mddev->resync_max_sectors;
 	unsigned long chunks;
 	unsigned long pages;
-	struct file *file = mddev->bitmap_file;
+	struct file *file = mddev->bitmap_info.file;
 	int err;
 	sector_t start;
 
 	BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
 
-	if (!file && !mddev->bitmap_offset) /* bitmap disabled, nothing to do */
+	if (!file && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */
 		return 0;
 
-	BUG_ON(file && mddev->bitmap_offset);
+	BUG_ON(file && mddev->bitmap_info.offset);
 
 	bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
 	if (!bitmap)
@@ -1633,7 +1633,7 @@ int bitmap_create(mddev_t *mddev)
 	bitmap->mddev = mddev;
 
 	bitmap->file = file;
-	bitmap->offset = mddev->bitmap_offset;
+	bitmap->offset = mddev->bitmap_info.offset;
 	if (file) {
 		get_file(file);
 		/* As future accesses to this file will use bmap,
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 569f25183db6..63da0a51852d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -463,7 +463,7 @@ static mddev_t * mddev_find(dev_t unit)
 
 	mutex_init(&new->open_mutex);
 	mutex_init(&new->reconfig_mutex);
-	mutex_init(&new->bitmap_mutex);
+	mutex_init(&new->bitmap_info.mutex);
 	INIT_LIST_HEAD(&new->disks);
 	INIT_LIST_HEAD(&new->all_mddevs);
 	init_timer(&new->safemode_timer);
@@ -850,7 +850,7 @@ struct super_type  {
  */
 int md_check_no_bitmap(mddev_t *mddev)
 {
-	if (!mddev->bitmap_file && !mddev->bitmap_offset)
+	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
 		return 0;
 	printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
 		mdname(mddev), mddev->pers->name);
@@ -978,8 +978,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->raid_disks = sb->raid_disks;
 		mddev->dev_sectors = sb->size * 2;
 		mddev->events = ev1;
-		mddev->bitmap_offset = 0;
-		mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
+		mddev->bitmap_info.offset = 0;
+		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
 
 		if (mddev->minor_version >= 91) {
 			mddev->reshape_position = sb->reshape_position;
@@ -1013,8 +1013,9 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->max_disks = MD_SB_DISKS;
 
 		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
-		    mddev->bitmap_file == NULL)
-			mddev->bitmap_offset = mddev->default_bitmap_offset;
+		    mddev->bitmap_info.file == NULL)
+			mddev->bitmap_info.offset =
+				mddev->bitmap_info.default_offset;
 
 	} else if (mddev->pers == NULL) {
 		/* Insist on good event counter while assembling */
@@ -1131,7 +1132,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	sb->layout = mddev->layout;
 	sb->chunk_size = mddev->chunk_sectors << 9;
 
-	if (mddev->bitmap && mddev->bitmap_file == NULL)
+	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
 		sb->state |= (1<<MD_SB_BITMAP_PRESENT);
 
 	sb->disks[0].state = (1<<MD_DISK_REMOVED);
@@ -1209,7 +1210,7 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 {
 	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
 		return 0; /* component must fit device */
-	if (rdev->mddev->bitmap_offset)
+	if (rdev->mddev->bitmap_info.offset)
 		return 0; /* can't move bitmap */
 	rdev->sb_start = calc_dev_sboffset(rdev->bdev);
 	if (!num_sectors || num_sectors > rdev->sb_start)
@@ -1388,8 +1389,8 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
 		mddev->dev_sectors = le64_to_cpu(sb->size);
 		mddev->events = ev1;
-		mddev->bitmap_offset = 0;
-		mddev->default_bitmap_offset = 1024 >> 9;
+		mddev->bitmap_info.offset = 0;
+		mddev->bitmap_info.default_offset = 1024 >> 9;
 		
 		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
 		memcpy(mddev->uuid, sb->set_uuid, 16);
@@ -1397,8 +1398,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->max_disks =  (4096-256)/2;
 
 		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
-		    mddev->bitmap_file == NULL )
-			mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
+		    mddev->bitmap_info.file == NULL )
+			mddev->bitmap_info.offset =
+				(__s32)le32_to_cpu(sb->bitmap_offset);
 
 		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
 			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
@@ -1492,8 +1494,8 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	sb->level = cpu_to_le32(mddev->level);
 	sb->layout = cpu_to_le32(mddev->layout);
 
-	if (mddev->bitmap && mddev->bitmap_file == NULL) {
-		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
+	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
+		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
 		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
 	}
 
@@ -1560,7 +1562,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 		max_sectors -= rdev->data_offset;
 		if (!num_sectors || num_sectors > max_sectors)
 			num_sectors = max_sectors;
-	} else if (rdev->mddev->bitmap_offset) {
+	} else if (rdev->mddev->bitmap_info.offset) {
 		/* minor version 0 with bitmap we can't move */
 		return 0;
 	} else {
@@ -4507,12 +4509,12 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
 
 		bitmap_destroy(mddev);
-		if (mddev->bitmap_file) {
-			restore_bitmap_write_access(mddev->bitmap_file);
-			fput(mddev->bitmap_file);
-			mddev->bitmap_file = NULL;
+		if (mddev->bitmap_info.file) {
+			restore_bitmap_write_access(mddev->bitmap_info.file);
+			fput(mddev->bitmap_info.file);
+			mddev->bitmap_info.file = NULL;
 		}
-		mddev->bitmap_offset = 0;
+		mddev->bitmap_info.offset = 0;
 
 		/* make sure all md_delayed_delete calls have finished */
 		flush_scheduled_work();
@@ -4553,6 +4555,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		mddev->degraded = 0;
 		mddev->barriers_work = 0;
 		mddev->safemode = 0;
+		mddev->bitmap_info.offset = 0;
+		mddev->bitmap_info.default_offset = 0;
 		kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
 		if (mddev->hold_active == UNTIL_STOP)
 			mddev->hold_active = 0;
@@ -4738,7 +4742,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
 	info.state         = 0;
 	if (mddev->in_sync)
 		info.state = (1<<MD_SB_CLEAN);
-	if (mddev->bitmap && mddev->bitmap_offset)
+	if (mddev->bitmap && mddev->bitmap_info.offset)
 		info.state = (1<<MD_SB_BITMAP_PRESENT);
 	info.active_disks  = insync;
 	info.working_disks = working;
@@ -5096,23 +5100,23 @@ static int set_bitmap_file(mddev_t *mddev, int fd)
 	if (fd >= 0) {
 		if (mddev->bitmap)
 			return -EEXIST; /* cannot add when bitmap is present */
-		mddev->bitmap_file = fget(fd);
+		mddev->bitmap_info.file = fget(fd);
 
-		if (mddev->bitmap_file == NULL) {
+		if (mddev->bitmap_info.file == NULL) {
 			printk(KERN_ERR "%s: error: failed to get bitmap file\n",
 			       mdname(mddev));
 			return -EBADF;
 		}
 
-		err = deny_bitmap_write_access(mddev->bitmap_file);
+		err = deny_bitmap_write_access(mddev->bitmap_info.file);
 		if (err) {
 			printk(KERN_ERR "%s: error: bitmap file is already in use\n",
 			       mdname(mddev));
-			fput(mddev->bitmap_file);
-			mddev->bitmap_file = NULL;
+			fput(mddev->bitmap_info.file);
+			mddev->bitmap_info.file = NULL;
 			return err;
 		}
-		mddev->bitmap_offset = 0; /* file overrides offset */
+		mddev->bitmap_info.offset = 0; /* file overrides offset */
 	} else if (mddev->bitmap == NULL)
 		return -ENOENT; /* cannot remove what isn't there */
 	err = 0;
@@ -5127,11 +5131,11 @@ static int set_bitmap_file(mddev_t *mddev, int fd)
 		mddev->pers->quiesce(mddev, 0);
 	}
 	if (fd < 0) {
-		if (mddev->bitmap_file) {
-			restore_bitmap_write_access(mddev->bitmap_file);
-			fput(mddev->bitmap_file);
+		if (mddev->bitmap_info.file) {
+			restore_bitmap_write_access(mddev->bitmap_info.file);
+			fput(mddev->bitmap_info.file);
 		}
-		mddev->bitmap_file = NULL;
+		mddev->bitmap_info.file = NULL;
 	}
 
 	return err;
@@ -5198,8 +5202,8 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 		mddev->flags         = 0;
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 
-	mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
-	mddev->bitmap_offset = 0;
+	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
+	mddev->bitmap_info.offset = 0;
 
 	mddev->reshape_position = MaxSector;
 
@@ -5299,7 +5303,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
 	int state = 0;
 
 	/* calculate expected state,ignoring low bits */
-	if (mddev->bitmap && mddev->bitmap_offset)
+	if (mddev->bitmap && mddev->bitmap_info.offset)
 		state |= (1 << MD_SB_BITMAP_PRESENT);
 
 	if (mddev->major_version != info->major_version ||
@@ -5358,9 +5362,10 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
 			/* add the bitmap */
 			if (mddev->bitmap)
 				return -EEXIST;
-			if (mddev->default_bitmap_offset == 0)
+			if (mddev->bitmap_info.default_offset == 0)
 				return -EINVAL;
-			mddev->bitmap_offset = mddev->default_bitmap_offset;
+			mddev->bitmap_info.offset =
+				mddev->bitmap_info.default_offset;
 			mddev->pers->quiesce(mddev, 1);
 			rv = bitmap_create(mddev);
 			if (rv)
@@ -5375,7 +5380,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
 			mddev->pers->quiesce(mddev, 1);
 			bitmap_destroy(mddev);
 			mddev->pers->quiesce(mddev, 0);
-			mddev->bitmap_offset = 0;
+			mddev->bitmap_info.offset = 0;
 		}
 	}
 	md_update_sb(mddev, 1);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index cb036868a9e9..50e62ef32e9d 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -280,16 +280,18 @@ struct mddev_s
 	unsigned int                    max_write_behind; /* 0 = sync */
 
 	struct bitmap                   *bitmap; /* the bitmap for the device */
-	struct file			*bitmap_file; /* the bitmap file */
-	long				bitmap_offset; /* offset from superblock of
-							* start of bitmap. May be
-							* negative, but not '0'
-							*/
-	long				default_bitmap_offset; /* this is the offset to use when
-								* hot-adding a bitmap.  It should
-								* eventually be settable by sysfs.
-								*/
-	struct mutex			bitmap_mutex;
+	struct {
+		struct file		*file; /* the bitmap file */
+		long			offset; /* offset from superblock of
+						 * start of bitmap. May be
+						 * negative, but not '0'
+						 */
+		long			default_offset; /* this is the offset to use when
+							 * hot-adding a bitmap.  It should
+							 * eventually be settable by sysfs.
+							 */
+		struct mutex		mutex;
+	} bitmap_info;
 
 	struct list_head		all_mddevs;
 

From 42a04b5078ce73a32f85762551d5703c5bd646a1 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 14 Dec 2009 12:49:53 +1100
Subject: [PATCH 11/27] md: move offset, daemon_sleep and chunksize out of
 bitmap structure

... and into bitmap_info.  These are all configuration parameters
that need to be set before the bitmap is created.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 51 +++++++++++++++++++++++++--------------------
 drivers/md/bitmap.h |  6 +-----
 drivers/md/md.c     |  7 +++++--
 drivers/md/md.h     |  3 +++
 drivers/md/raid1.c  |  3 ++-
 drivers/md/raid10.c |  2 +-
 6 files changed, 40 insertions(+), 32 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index c9270f2c9ecd..1a5ada084487 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -287,27 +287,28 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 
 	while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
 			int size = PAGE_SIZE;
+			long offset = mddev->bitmap_info.offset;
 			if (page->index == bitmap->file_pages-1)
 				size = roundup(bitmap->last_page_size,
 					       bdev_logical_block_size(rdev->bdev));
 			/* Just make sure we aren't corrupting data or
 			 * metadata
 			 */
-			if (bitmap->offset < 0) {
+			if (offset < 0) {
 				/* DATA  BITMAP METADATA  */
-				if (bitmap->offset
+				if (offset
 				    + (long)(page->index * (PAGE_SIZE/512))
 				    + size/512 > 0)
 					/* bitmap runs in to metadata */
 					goto bad_alignment;
 				if (rdev->data_offset + mddev->dev_sectors
-				    > rdev->sb_start + bitmap->offset)
+				    > rdev->sb_start + offset)
 					/* data runs in to bitmap */
 					goto bad_alignment;
 			} else if (rdev->sb_start < rdev->data_offset) {
 				/* METADATA BITMAP DATA */
 				if (rdev->sb_start
-				    + bitmap->offset
+				    + offset
 				    + page->index*(PAGE_SIZE/512) + size/512
 				    > rdev->data_offset)
 					/* bitmap runs in to data */
@@ -316,7 +317,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 				/* DATA METADATA BITMAP - no problems */
 			}
 			md_super_write(mddev, rdev,
-				       rdev->sb_start + bitmap->offset
+				       rdev->sb_start + offset
 				       + page->index * (PAGE_SIZE/512),
 				       size,
 				       page);
@@ -550,7 +551,8 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 
 		bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes);
 	} else {
-		bitmap->sb_page = read_sb_page(bitmap->mddev, bitmap->offset,
+		bitmap->sb_page = read_sb_page(bitmap->mddev,
+					       bitmap->mddev->bitmap_info.offset,
 					       NULL,
 					       0, sizeof(bitmap_super_t));
 	}
@@ -610,10 +612,10 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	}
 success:
 	/* assign fields using values from superblock */
-	bitmap->chunksize = chunksize;
-	bitmap->daemon_sleep = daemon_sleep;
+	bitmap->mddev->bitmap_info.chunksize = chunksize;
+	bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
 	bitmap->daemon_lastrun = jiffies;
-	bitmap->max_write_behind = write_behind;
+	bitmap->mddev->bitmap_info.max_write_behind = write_behind;
 	bitmap->flags |= le32_to_cpu(sb->state);
 	if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
 		bitmap->flags |= BITMAP_HOSTENDIAN;
@@ -907,7 +909,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 	chunks = bitmap->chunks;
 	file = bitmap->file;
 
-	BUG_ON(!file && !bitmap->offset);
+	BUG_ON(!file && !bitmap->mddev->bitmap_info.offset);
 
 #ifdef INJECT_FAULTS_3
 	outofdate = 1;
@@ -967,14 +969,15 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 				offset = sizeof(bitmap_super_t);
 				if (!file)
 					read_sb_page(bitmap->mddev,
-						     bitmap->offset,
+						     bitmap->mddev->bitmap_info.offset,
 						     page,
 						     index, count);
 			} else if (file) {
 				page = read_page(file, index, bitmap, count);
 				offset = 0;
 			} else {
-				page = read_sb_page(bitmap->mddev, bitmap->offset,
+				page = read_sb_page(bitmap->mddev,
+						    bitmap->mddev->bitmap_info.offset,
 						    NULL,
 						    index, count);
 				offset = 0;
@@ -1096,7 +1099,8 @@ void bitmap_daemon_work(mddev_t *mddev)
 		mutex_unlock(&mddev->bitmap_info.mutex);
 		return;
 	}
-	if (time_before(jiffies, bitmap->daemon_lastrun + bitmap->daemon_sleep*HZ))
+	if (time_before(jiffies, bitmap->daemon_lastrun
+			+ bitmap->mddev->bitmap_info.daemon_sleep*HZ))
 		goto done;
 
 	bitmap->daemon_lastrun = jiffies;
@@ -1210,7 +1214,8 @@ void bitmap_daemon_work(mddev_t *mddev)
 
  done:
 	if (bitmap->allclean == 0)
-		bitmap->mddev->thread->timeout = bitmap->daemon_sleep * HZ;
+		bitmap->mddev->thread->timeout = 
+			bitmap->mddev->bitmap_info.daemon_sleep * HZ;
 	mutex_unlock(&mddev->bitmap_info.mutex);
 }
 
@@ -1479,7 +1484,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
 		return;
 	}
 	if (time_before(jiffies, (bitmap->last_end_sync
-				  + bitmap->daemon_sleep * HZ)))
+				  + bitmap->mddev->bitmap_info.daemon_sleep * HZ)))
 		return;
 	wait_event(bitmap->mddev->recovery_wait,
 		   atomic_read(&bitmap->mddev->recovery_active) == 0);
@@ -1540,7 +1545,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
 void bitmap_flush(mddev_t *mddev)
 {
 	struct bitmap *bitmap = mddev->bitmap;
-	int sleep;
+	long sleep;
 
 	if (!bitmap) /* there was no bitmap */
 		return;
@@ -1548,12 +1553,13 @@ void bitmap_flush(mddev_t *mddev)
 	/* run the daemon_work three time to ensure everything is flushed
 	 * that can be
 	 */
-	sleep = bitmap->daemon_sleep;
-	bitmap->daemon_sleep = 0;
+	sleep = mddev->bitmap_info.daemon_sleep * HZ * 2;
+	bitmap->daemon_lastrun -= sleep;
 	bitmap_daemon_work(mddev);
+	bitmap->daemon_lastrun -= sleep;
 	bitmap_daemon_work(mddev);
+	bitmap->daemon_lastrun -= sleep;
 	bitmap_daemon_work(mddev);
-	bitmap->daemon_sleep = sleep;
 	bitmap_update_sb(bitmap);
 }
 
@@ -1633,7 +1639,6 @@ int bitmap_create(mddev_t *mddev)
 	bitmap->mddev = mddev;
 
 	bitmap->file = file;
-	bitmap->offset = mddev->bitmap_info.offset;
 	if (file) {
 		get_file(file);
 		/* As future accesses to this file will use bmap,
@@ -1642,12 +1647,12 @@ int bitmap_create(mddev_t *mddev)
 		 */
 		vfs_fsync(file, file->f_dentry, 1);
 	}
-	/* read superblock from bitmap file (this sets bitmap->chunksize) */
+	/* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */
 	err = bitmap_read_sb(bitmap);
 	if (err)
 		goto error;
 
-	bitmap->chunkshift = ffz(~bitmap->chunksize);
+	bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize);
 
 	/* now that chunksize and chunkshift are set, we can use these macros */
  	chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >>
@@ -1689,7 +1694,7 @@ int bitmap_create(mddev_t *mddev)
 
 	mddev->bitmap = bitmap;
 
-	mddev->thread->timeout = bitmap->daemon_sleep * HZ;
+	mddev->thread->timeout = mddev->bitmap_info.daemon_sleep * HZ;
 
 	bitmap_update_sb(bitmap);
 
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 7e38d13ddcac..50ee4240f5db 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -106,7 +106,7 @@ typedef __u16 bitmap_counter_t;
 #define BITMAP_BLOCK_SHIFT 9
 
 /* how many blocks per chunk? (this is variable) */
-#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT)
+#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT)
 #define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
 #define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
 
@@ -209,7 +209,6 @@ struct bitmap {
 	int counter_bits; /* how many bits per block counter */
 
 	/* bitmap chunksize -- how much data does each bit represent? */
-	unsigned long chunksize;
 	unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
 	unsigned long chunks; /* total number of data chunks for the array */
 
@@ -226,7 +225,6 @@ struct bitmap {
 	/* bitmap spinlock */
 	spinlock_t lock;
 
-	long offset; /* offset from superblock if file is NULL */
 	struct file *file; /* backing disk file */
 	struct page *sb_page; /* cached copy of the bitmap file superblock */
 	struct page **filemap; /* list of cache pages for the file */
@@ -238,7 +236,6 @@ struct bitmap {
 
 	int allclean;
 
-	unsigned long max_write_behind; /* write-behind mode */
 	atomic_t behind_writes;
 
 	/*
@@ -246,7 +243,6 @@ struct bitmap {
 	 * file, cleaning up bits and flushing out pages to disk as necessary
 	 */
 	unsigned long daemon_lastrun; /* jiffies of last run */
-	unsigned long daemon_sleep; /* how many seconds between updates? */
 	unsigned long last_end_sync; /* when we lasted called end_sync to
 				      * update bitmap with resync progress */
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 63da0a51852d..c56c64d13075 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4557,6 +4557,9 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		mddev->safemode = 0;
 		mddev->bitmap_info.offset = 0;
 		mddev->bitmap_info.default_offset = 0;
+		mddev->bitmap_info.chunksize = 0;
+		mddev->bitmap_info.daemon_sleep = 0;
+		mddev->bitmap_info.max_write_behind = 0;
 		kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
 		if (mddev->hold_active == UNTIL_STOP)
 			mddev->hold_active = 0;
@@ -6089,14 +6092,14 @@ static int md_seq_show(struct seq_file *seq, void *v)
 			unsigned long chunk_kb;
 			unsigned long flags;
 			spin_lock_irqsave(&bitmap->lock, flags);
-			chunk_kb = bitmap->chunksize >> 10;
+			chunk_kb = mddev->bitmap_info.chunksize >> 10;
 			seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
 				"%lu%s chunk",
 				bitmap->pages - bitmap->missing_pages,
 				bitmap->pages,
 				(bitmap->pages - bitmap->missing_pages)
 					<< (PAGE_SHIFT - 10),
-				chunk_kb ? chunk_kb : bitmap->chunksize,
+				chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
 				chunk_kb ? "KB" : "B");
 			if (bitmap->file) {
 				seq_printf(seq, ", file: ");
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 50e62ef32e9d..4b07e0ab3841 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -291,6 +291,9 @@ struct mddev_s
 							 * eventually be settable by sysfs.
 							 */
 		struct mutex		mutex;
+		unsigned long		chunksize;
+		unsigned long		daemon_sleep; /* how many seconds between updates? */
+		unsigned long		max_write_behind; /* write-behind mode */
 	} bitmap_info;
 
 	struct list_head		all_mddevs;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7549b0bad326..f0949e2bcd6e 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -943,7 +943,8 @@ static int make_request(struct request_queue *q, struct bio * bio)
 
 	/* do behind I/O ? */
 	if (bitmap &&
-	    atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind &&
+	    (atomic_read(&bitmap->behind_writes)
+	     < mddev->bitmap_info.max_write_behind) &&
 	    (behind_pages = alloc_behind_pages(bio)) != NULL)
 		set_bit(R1BIO_BehindIO, &r1_bio->state);
 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 2fbf867f8b30..2255e33a3ecb 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2277,7 +2277,7 @@ static void raid10_quiesce(mddev_t *mddev, int state)
 	}
 	if (mddev->thread) {
 		if (mddev->bitmap)
-			mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+			mddev->thread->timeout = mddev->bitmap_info.daemon_sleep * HZ;
 		else
 			mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
 		md_wakeup_thread(mddev->thread);

From 1b04be96f6910ee415287bf0d5309c7d4c94bd2b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 14 Dec 2009 12:49:53 +1100
Subject: [PATCH 12/27] md: change daemon_sleep to be in 'jiffies' rather than
 'seconds'.

This removes a lot of multiplications by HZ.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 14 +++++++-------
 drivers/md/raid10.c |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 1a5ada084487..c1f28a87ea59 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -565,7 +565,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
 
 	chunksize = le32_to_cpu(sb->chunksize);
-	daemon_sleep = le32_to_cpu(sb->daemon_sleep);
+	daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
 	write_behind = le32_to_cpu(sb->write_behind);
 
 	/* verify that the bitmap-specific fields are valid */
@@ -578,7 +578,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 		reason = "bitmap chunksize too small";
 	else if ((1 << ffz(~chunksize)) != chunksize)
 		reason = "bitmap chunksize not a power of 2";
-	else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ)
+	else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT)
 		reason = "daemon sleep period out of range";
 	else if (write_behind > COUNTER_MAX)
 		reason = "write-behind limit out of range (0 - 16383)";
@@ -1100,7 +1100,7 @@ void bitmap_daemon_work(mddev_t *mddev)
 		return;
 	}
 	if (time_before(jiffies, bitmap->daemon_lastrun
-			+ bitmap->mddev->bitmap_info.daemon_sleep*HZ))
+			+ bitmap->mddev->bitmap_info.daemon_sleep))
 		goto done;
 
 	bitmap->daemon_lastrun = jiffies;
@@ -1215,7 +1215,7 @@ void bitmap_daemon_work(mddev_t *mddev)
  done:
 	if (bitmap->allclean == 0)
 		bitmap->mddev->thread->timeout = 
-			bitmap->mddev->bitmap_info.daemon_sleep * HZ;
+			bitmap->mddev->bitmap_info.daemon_sleep;
 	mutex_unlock(&mddev->bitmap_info.mutex);
 }
 
@@ -1484,7 +1484,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
 		return;
 	}
 	if (time_before(jiffies, (bitmap->last_end_sync
-				  + bitmap->mddev->bitmap_info.daemon_sleep * HZ)))
+				  + bitmap->mddev->bitmap_info.daemon_sleep)))
 		return;
 	wait_event(bitmap->mddev->recovery_wait,
 		   atomic_read(&bitmap->mddev->recovery_active) == 0);
@@ -1553,7 +1553,7 @@ void bitmap_flush(mddev_t *mddev)
 	/* run the daemon_work three time to ensure everything is flushed
 	 * that can be
 	 */
-	sleep = mddev->bitmap_info.daemon_sleep * HZ * 2;
+	sleep = mddev->bitmap_info.daemon_sleep * 2;
 	bitmap->daemon_lastrun -= sleep;
 	bitmap_daemon_work(mddev);
 	bitmap->daemon_lastrun -= sleep;
@@ -1694,7 +1694,7 @@ int bitmap_create(mddev_t *mddev)
 
 	mddev->bitmap = bitmap;
 
-	mddev->thread->timeout = mddev->bitmap_info.daemon_sleep * HZ;
+	mddev->thread->timeout = mddev->bitmap_info.daemon_sleep;
 
 	bitmap_update_sb(bitmap);
 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 2255e33a3ecb..064c2bb5ee63 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2277,7 +2277,7 @@ static void raid10_quiesce(mddev_t *mddev, int state)
 	}
 	if (mddev->thread) {
 		if (mddev->bitmap)
-			mddev->thread->timeout = mddev->bitmap_info.daemon_sleep * HZ;
+			mddev->thread->timeout = mddev->bitmap_info.daemon_sleep;
 		else
 			mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
 		md_wakeup_thread(mddev->thread);

From 9cd30fdc33cde9ae4ac55a1ccbbb89f3f7b9b2f2 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 14 Dec 2009 12:49:54 +1100
Subject: [PATCH 13/27] md: remove needless setting of thread->timeout in
 raid10_quiesce

As bitmap_create and bitmap_destroy already set thread->timeout
as appropriate, there is no need to do it in raid10_quiesce.
There is a possible need to wake the thread after the timeout
has been set low, but it is better to do that where the timeout
is actually set low, in bitmap_create.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 1 +
 drivers/md/raid10.c | 7 -------
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index c1f28a87ea59..b1bcd36ca963 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1695,6 +1695,7 @@ int bitmap_create(mddev_t *mddev)
 	mddev->bitmap = bitmap;
 
 	mddev->thread->timeout = mddev->bitmap_info.daemon_sleep;
+	md_wakeup_thread(mddev->thread);
 
 	bitmap_update_sb(bitmap);
 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 064c2bb5ee63..d9e28a65af58 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2275,13 +2275,6 @@ static void raid10_quiesce(mddev_t *mddev, int state)
 		lower_barrier(conf);
 		break;
 	}
-	if (mddev->thread) {
-		if (mddev->bitmap)
-			mddev->thread->timeout = mddev->bitmap_info.daemon_sleep;
-		else
-			mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
-		md_wakeup_thread(mddev->thread);
-	}
 }
 
 static struct mdk_personality raid10_personality =

From f6af949c5672115313cc3c976d85b0533f607d7e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 14 Dec 2009 12:49:54 +1100
Subject: [PATCH 14/27] md: support bitmap offset appropriate for
 external-metadata arrays.

For md arrays were metadata is managed externally, the kernel does not
know about a superblock so the superblock offset is 0.
If we want to have a write-intent-bitmap near the end of the
devices of such an array, we should support sector_t sized offset.
We need offset be possibly negative for when the bitmap is before
the metadata, so use loff_t instead.

Also add sanity check that bitmap does not overlap with data.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 14 +++++++++++---
 drivers/md/md.h     |  6 ++++--
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index b1bcd36ca963..958865445f06 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -212,7 +212,7 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page)
  */
 
 /* IO operations when bitmap is stored near all superblocks */
-static struct page *read_sb_page(mddev_t *mddev, long offset,
+static struct page *read_sb_page(mddev_t *mddev, loff_t offset,
 				 struct page *page,
 				 unsigned long index, int size)
 {
@@ -287,14 +287,22 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 
 	while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
 			int size = PAGE_SIZE;
-			long offset = mddev->bitmap_info.offset;
+			loff_t offset = mddev->bitmap_info.offset;
 			if (page->index == bitmap->file_pages-1)
 				size = roundup(bitmap->last_page_size,
 					       bdev_logical_block_size(rdev->bdev));
 			/* Just make sure we aren't corrupting data or
 			 * metadata
 			 */
-			if (offset < 0) {
+			if (mddev->external) {
+				/* Bitmap could be anywhere. */
+				if (rdev->sb_start + offset + (page->index *(PAGE_SIZE/512)) >
+				    rdev->data_offset &&
+				    rdev->sb_start + offset < 
+				    rdev->data_offset + mddev->dev_sectors +
+				    (PAGE_SIZE/512))
+					goto bad_alignment;
+			} else if (offset < 0) {
 				/* DATA  BITMAP METADATA  */
 				if (offset
 				    + (long)(page->index * (PAGE_SIZE/512))
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 4b07e0ab3841..df692953a12f 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -282,11 +282,13 @@ struct mddev_s
 	struct bitmap                   *bitmap; /* the bitmap for the device */
 	struct {
 		struct file		*file; /* the bitmap file */
-		long			offset; /* offset from superblock of
+		loff_t			offset; /* offset from superblock of
 						 * start of bitmap. May be
 						 * negative, but not '0'
+						 * For external metadata, offset
+						 * from start of device. 
 						 */
-		long			default_offset; /* this is the offset to use when
+		loff_t			default_offset; /* this is the offset to use when
 							 * hot-adding a bitmap.  It should
 							 * eventually be settable by sysfs.
 							 */

From 72e02075a33f739e21430262f71da8e82db9dbb3 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 14 Dec 2009 12:49:55 +1100
Subject: [PATCH 15/27] md: factor out parsing of fixed-point numbers

safe_delay_store can parse fixed point numbers (for fractions
of a second).  We will want to do that for another sysfs
file soon, so factor out the code.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 64 ++++++++++++++++++++++++++++++++-----------------
 drivers/md/md.h |  3 ++-
 2 files changed, 44 insertions(+), 23 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index c56c64d13075..93287f88f1f4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2763,6 +2763,47 @@ static void analyze_sbs(mddev_t * mddev)
 	}
 }
 
+/* Read a fixed-point number.
+ * Numbers in sysfs attributes should be in "standard" units where
+ * possible, so time should be in seconds.
+ * However we internally use a a much smaller unit such as 
+ * milliseconds or jiffies.
+ * This function takes a decimal number with a possible fractional
+ * component, and produces an integer which is the result of
+ * multiplying that number by 10^'scale'.
+ * all without any floating-point arithmetic.
+ */
+int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
+{
+	unsigned long result = 0;
+	long decimals = -1;
+	while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
+		if (*cp == '.')
+			decimals = 0;
+		else if (decimals < scale) {
+			unsigned int value;
+			value = *cp - '0';
+			result = result * 10 + value;
+			if (decimals >= 0)
+				decimals++;
+		}
+		cp++;
+	}
+	if (*cp == '\n')
+		cp++;
+	if (*cp)
+		return -EINVAL;
+	if (decimals < 0)
+		decimals = 0;
+	while (decimals < scale) {
+		result *= 10;
+		decimals ++;
+	}
+	*res = result;
+	return 0;
+}
+
+
 static void md_safemode_timeout(unsigned long data);
 
 static ssize_t
@@ -2774,31 +2815,10 @@ safe_delay_show(mddev_t *mddev, char *page)
 static ssize_t
 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
 {
-	int scale=1;
-	int dot=0;
-	int i;
 	unsigned long msec;
-	char buf[30];
 
-	/* remove a period, and count digits after it */
-	if (len >= sizeof(buf))
+	if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
 		return -EINVAL;
-	strlcpy(buf, cbuf, sizeof(buf));
-	for (i=0; i<len; i++) {
-		if (dot) {
-			if (isdigit(buf[i])) {
-				buf[i-1] = buf[i];
-				scale *= 10;
-			}
-			buf[i] = 0;
-		} else if (buf[i] == '.') {
-			dot=1;
-			buf[i] = 0;
-		}
-	}
-	if (strict_strtoul(buf, 10, &msec) < 0)
-		return -EINVAL;
-	msec = (msec * 1000) / scale;
 	if (msec == 0)
 		mddev->safemode_delay = 0;
 	else {
diff --git a/drivers/md/md.h b/drivers/md/md.h
index df692953a12f..b5c306925d94 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -463,6 +463,7 @@ extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
 extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
 extern int md_check_no_bitmap(mddev_t *mddev);
 extern int md_integrity_register(mddev_t *mddev);
-void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
+extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
+extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
 
 #endif /* _MD_MD_H */

From 43a705076e51c5af21ec4260a35699775ea298f5 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 14 Dec 2009 12:49:55 +1100
Subject: [PATCH 16/27] md: support updating bitmap parameters via sysfs.

A new attribute directory 'bitmap' in 'md' is created which
contains files for configuring the bitmap.
'location' identifies where the bitmap is, either 'none',
or 'file' or 'sector offset from metadata'.
Writing 'location' can create or remove a bitmap.
Adding a 'file' bitmap this way is not yet supported.
'chunksize' and 'time_base' must be set before 'location'
can be set.

'chunksize' can be set before creating a bitmap, but is
currently always over-ridden by the bitmap superblock.

'time_base' and 'backlog' can be updated at any time.


Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Andre Noll <maan@systemlinux.org>
---
 Documentation/md.txt |  29 ++++++
 drivers/md/bitmap.c  | 205 +++++++++++++++++++++++++++++++++++++++++++
 drivers/md/md.c      |   5 +-
 drivers/md/md.h      |   3 +-
 4 files changed, 240 insertions(+), 2 deletions(-)

diff --git a/Documentation/md.txt b/Documentation/md.txt
index 4edd39ec7db9..18fad6876228 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -296,6 +296,35 @@ All md devices contain:
      active-idle
          like active, but no writes have been seen for a while (safe_mode_delay).
 
+  bitmap/location
+     This indicates where the write-intent bitmap for the array is
+     stored.
+     It can be one of "none", "file" or "[+-]N".
+     "file" may later be extended to "file:/file/name"
+     "[+-]N" means that many sectors from the start of the metadata.
+       This is replicated on all devices.  For arrays with externally
+       managed metadata, the offset is from the beginning of the
+       device.
+  bitmap/chunksize
+     The size, in bytes, of the chunk which will be represented by a
+     single bit.  For RAID456, it is a portion of an individual
+     device. For RAID10, it is a portion of the array.  For RAID1, it
+     is both (they come to the same thing).
+  bitmap/time_base
+     The time, in seconds, between looking for bits in the bitmap to
+     be cleared. In the current implementation, a bit will be cleared
+     between 2 and 3 times "time_base" after all the covered blocks
+     are known to be in-sync.
+  bitmap/backlog
+     When write-mostly devices are active in a RAID1, write requests
+     to those devices proceed in the background - the filesystem (or
+     other user of the device) does not have to wait for them.
+     'backlog' sets a limit on the number of concurrent background
+     writes.  If there are more than this, new writes will by
+     synchronous.
+     
+     
+     
 
 As component devices are added to an md array, they appear in the 'md'
 directory as new directories named
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 958865445f06..24fff75b2191 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -510,6 +510,9 @@ void bitmap_update_sb(struct bitmap *bitmap)
 		bitmap->events_cleared = bitmap->mddev->events;
 		sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
 	}
+	/* Just in case these have been changed via sysfs: */
+	sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
+	sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
 	kunmap_atomic(sb, KM_USER0);
 	write_page(bitmap, bitmap->sb_page, 1);
 }
@@ -1714,6 +1717,208 @@ int bitmap_create(mddev_t *mddev)
 	return err;
 }
 
+static ssize_t
+location_show(mddev_t *mddev, char *page)
+{
+	ssize_t len;
+	if (mddev->bitmap_info.file) {
+		len = sprintf(page, "file");
+	} else if (mddev->bitmap_info.offset) {
+		len = sprintf(page, "%+lld", (long long)mddev->bitmap_info.offset);
+	} else
+		len = sprintf(page, "none");
+	len += sprintf(page+len, "\n");
+	return len;
+}
+
+static ssize_t
+location_store(mddev_t *mddev, const char *buf, size_t len)
+{
+
+	if (mddev->pers) {
+		if (!mddev->pers->quiesce)
+			return -EBUSY;
+		if (mddev->recovery || mddev->sync_thread)
+			return -EBUSY;
+	}
+
+	if (mddev->bitmap || mddev->bitmap_info.file ||
+	    mddev->bitmap_info.offset) {
+		/* bitmap already configured.  Only option is to clear it */
+		if (strncmp(buf, "none", 4) != 0)
+			return -EBUSY;
+		if (mddev->pers) {
+			mddev->pers->quiesce(mddev, 1);
+			bitmap_destroy(mddev);
+			mddev->pers->quiesce(mddev, 0);
+		}
+		mddev->bitmap_info.offset = 0;
+		if (mddev->bitmap_info.file) {
+			struct file *f = mddev->bitmap_info.file;
+			mddev->bitmap_info.file = NULL;
+			restore_bitmap_write_access(f);
+			fput(f);
+		}
+	} else {
+		/* No bitmap, OK to set a location */
+		long long offset;
+		if (strncmp(buf, "none", 4) == 0)
+			/* nothing to be done */;
+		else if (strncmp(buf, "file:", 5) == 0) {
+			/* Not supported yet */
+			return -EINVAL;
+		} else {
+			int rv;
+			if (buf[0] == '+')
+				rv = strict_strtoll(buf+1, 10, &offset);
+			else
+				rv = strict_strtoll(buf, 10, &offset);
+			if (rv)
+				return rv;
+			if (offset == 0)
+				return -EINVAL;
+			if (mddev->major_version == 0 &&
+			    offset != mddev->bitmap_info.default_offset)
+				return -EINVAL;
+			mddev->bitmap_info.offset = offset;
+			if (mddev->pers) {
+				mddev->pers->quiesce(mddev, 1);
+				rv = bitmap_create(mddev);
+				if (rv) {
+					bitmap_destroy(mddev);
+					mddev->bitmap_info.offset = 0;
+				}
+				mddev->pers->quiesce(mddev, 0);
+				if (rv)
+					return rv;
+			}
+		}
+	}
+	if (!mddev->external) {
+		/* Ensure new bitmap info is stored in
+		 * metadata promptly.
+		 */
+		set_bit(MD_CHANGE_DEVS, &mddev->flags);
+		md_wakeup_thread(mddev->thread);
+	}
+	return len;
+}
+
+static struct md_sysfs_entry bitmap_location =
+__ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store);
+
+static ssize_t
+timeout_show(mddev_t *mddev, char *page)
+{
+	ssize_t len;
+	unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ;
+	unsigned long jifs = mddev->bitmap_info.daemon_sleep % HZ;
+	
+	len = sprintf(page, "%lu", secs);
+	if (jifs)
+		len += sprintf(page+len, ".%03u", jiffies_to_msecs(jifs));
+	len += sprintf(page+len, "\n");
+	return len;
+}
+
+static ssize_t
+timeout_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	/* timeout can be set at any time */
+	unsigned long timeout;
+	int rv = strict_strtoul_scaled(buf, &timeout, 4);
+	if (rv)
+		return rv;
+
+	/* just to make sure we don't overflow... */
+	if (timeout >= LONG_MAX / HZ)
+		return -EINVAL;
+
+	timeout = timeout * HZ / 10000;
+
+	if (timeout >= MAX_SCHEDULE_TIMEOUT)
+		timeout = MAX_SCHEDULE_TIMEOUT-1;
+	if (timeout < 1)
+		timeout = 1;
+	mddev->bitmap_info.daemon_sleep = timeout;
+	if (mddev->thread) {
+		/* if thread->timeout is MAX_SCHEDULE_TIMEOUT, then
+		 * the bitmap is all clean and we don't need to
+		 * adjust the timeout right now
+		 */
+		if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) {
+			mddev->thread->timeout = timeout;
+			md_wakeup_thread(mddev->thread);
+		}
+	}
+	return len;
+}
+
+static struct md_sysfs_entry bitmap_timeout =
+__ATTR(time_base, S_IRUGO|S_IWUSR, timeout_show, timeout_store);
+
+static ssize_t
+backlog_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%lu\n", mddev->bitmap_info.max_write_behind);
+}
+
+static ssize_t
+backlog_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	unsigned long backlog;
+	int rv = strict_strtoul(buf, 10, &backlog);
+	if (rv)
+		return rv;
+	if (backlog > COUNTER_MAX)
+		return -EINVAL;
+	mddev->bitmap_info.max_write_behind = backlog;
+	return len;
+}
+
+static struct md_sysfs_entry bitmap_backlog =
+__ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store);
+
+static ssize_t
+chunksize_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%lu\n", mddev->bitmap_info.chunksize);
+}
+
+static ssize_t
+chunksize_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	/* Can only be changed when no bitmap is active */
+	int rv;
+	unsigned long csize;
+	if (mddev->bitmap)
+		return -EBUSY;
+	rv = strict_strtoul(buf, 10, &csize);
+	if (rv)
+		return rv;
+	if (csize < 512 ||
+	    !is_power_of_2(csize))
+		return -EINVAL;
+	mddev->bitmap_info.chunksize = csize;
+	return len;
+}
+
+static struct md_sysfs_entry bitmap_chunksize =
+__ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store);
+
+static struct attribute *md_bitmap_attrs[] = {
+	&bitmap_location.attr,
+	&bitmap_timeout.attr,
+	&bitmap_backlog.attr,
+	&bitmap_chunksize.attr,
+	NULL
+};
+struct attribute_group md_bitmap_group = {
+	.name = "bitmap",
+	.attrs = md_bitmap_attrs,
+};
+
+
 /* the bitmap API -- for raid personalities */
 EXPORT_SYMBOL(bitmap_startwrite);
 EXPORT_SYMBOL(bitmap_endwrite);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 93287f88f1f4..859edbf8c9b0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4018,6 +4018,7 @@ static void mddev_delayed_delete(struct work_struct *ws)
 		mddev->sysfs_action = NULL;
 		mddev->private = NULL;
 	}
+	sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
 	kobject_del(&mddev->kobj);
 	kobject_put(&mddev->kobj);
 }
@@ -4109,6 +4110,8 @@ static int md_alloc(dev_t dev, char *name)
 		       disk->disk_name);
 		error = 0;
 	}
+	if (sysfs_create_group(&mddev->kobj, &md_bitmap_group))
+		printk(KERN_DEBUG "pointless warning\n");
  abort:
 	mutex_unlock(&disks_mutex);
 	if (!error) {
@@ -4434,7 +4437,7 @@ static int deny_bitmap_write_access(struct file * file)
 	return 0;
 }
 
-static void restore_bitmap_write_access(struct file *file)
+void restore_bitmap_write_access(struct file *file)
 {
 	struct inode *inode = file->f_mapping->host;
 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index b5c306925d94..fce02073f1a4 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -372,7 +372,7 @@ struct md_sysfs_entry {
 	ssize_t (*show)(mddev_t *, char *);
 	ssize_t (*store)(mddev_t *, const char *, size_t);
 };
-
+extern struct attribute_group md_bitmap_group;
 
 static inline char * mdname (mddev_t * mddev)
 {
@@ -465,5 +465,6 @@ extern int md_check_no_bitmap(mddev_t *mddev);
 extern int md_integrity_register(mddev_t *mddev);
 extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
 extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
+extern void restore_bitmap_write_access(struct file *file);
 
 #endif /* _MD_MD_H */

From 624ce4f5658fa3e0303c1217bba2706142fe7568 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 14 Dec 2009 12:49:56 +1100
Subject: [PATCH 17/27] md/bitmap: move setting of daemon_lastrun out of
 bitmap_read_sb

Setting daemon_lastrun really has nothing to do with reading
the bitmap superblock, it just happens to be needed at the same time.
bitmap_read_sb is about to become options, so move that code out
to after the call to bitmap_read_sb.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 24fff75b2191..62958491f329 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -625,7 +625,6 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	/* assign fields using values from superblock */
 	bitmap->mddev->bitmap_info.chunksize = chunksize;
 	bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
-	bitmap->daemon_lastrun = jiffies;
 	bitmap->mddev->bitmap_info.max_write_behind = write_behind;
 	bitmap->flags |= le32_to_cpu(sb->state);
 	if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
@@ -1663,6 +1662,7 @@ int bitmap_create(mddev_t *mddev)
 	if (err)
 		goto error;
 
+	bitmap->daemon_lastrun = jiffies;
 	bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize);
 
 	/* now that chunksize and chunkshift are set, we can use these macros */

From ece5cff0da9e696c360fff592cb5f51b6419e4d6 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 14 Dec 2009 12:49:56 +1100
Subject: [PATCH 18/27] md: Support write-intent bitmaps with externally
 managed metadata.

In this case, the metadata needs to not be in the same
sector as the bitmap.
md will not read/write any bitmap metadata.  Config must be
done via sysfs and when a recovery makes the array non-degraded
again, writing 'true' to 'bitmap/can_clear' will allow bits in
the bitmap to be cleared again.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 Documentation/md.txt |  16 +++++
 drivers/md/bitmap.c  | 142 ++++++++++++++++++++++++++++++++++++-------
 drivers/md/bitmap.h  |  11 +---
 drivers/md/md.h      |   1 +
 4 files changed, 137 insertions(+), 33 deletions(-)

diff --git a/Documentation/md.txt b/Documentation/md.txt
index 18fad6876228..21d26fb5d02b 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -322,6 +322,22 @@ All md devices contain:
      'backlog' sets a limit on the number of concurrent background
      writes.  If there are more than this, new writes will by
      synchronous.
+  bitmap/metadata
+     This can be either 'internal' or 'external'.
+     'internal' is the default and means the metadata for the bitmap
+     is stored in the first 256 bytes of the allocated space and is
+     managed by the md module.
+     'external' means that bitmap metadata is managed externally to
+     the kernel (i.e. by some userspace program)
+  bitmap/can_clear
+     This is either 'true' or 'false'.  If 'true', then bits in the
+     bitmap will be cleared when the corresponding blocks are thought
+     to be in-sync.  If 'false', bits will never be cleared.
+     This is automatically set to 'false' if a write happens on a
+     degraded array, or if the array becomes degraded during a write.
+     When metadata is managed externally, it should be set to true
+     once the array becomes non-degraded, and this fact has been
+     recorded in the metadata.
      
      
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 62958491f329..de5c42df8d17 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -497,6 +497,8 @@ void bitmap_update_sb(struct bitmap *bitmap)
 
 	if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
 		return;
+	if (bitmap->mddev->bitmap_info.external)
+		return;
 	spin_lock_irqsave(&bitmap->lock, flags);
 	if (!bitmap->sb_page) { /* no superblock */
 		spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -676,16 +678,26 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
  * general bitmap file operations
  */
 
+/*
+ * on-disk bitmap:
+ *
+ * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
+ * file a page at a time. There's a superblock at the start of the file.
+ */
 /* calculate the index of the page that contains this bit */
-static inline unsigned long file_page_index(unsigned long chunk)
+static inline unsigned long file_page_index(struct bitmap *bitmap, unsigned long chunk)
 {
-	return CHUNK_BIT_OFFSET(chunk) >> PAGE_BIT_SHIFT;
+	if (!bitmap->mddev->bitmap_info.external)
+		chunk += sizeof(bitmap_super_t) << 3;
+	return chunk >> PAGE_BIT_SHIFT;
 }
 
 /* calculate the (bit) offset of this bit within a page */
-static inline unsigned long file_page_offset(unsigned long chunk)
+static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned long chunk)
 {
-	return CHUNK_BIT_OFFSET(chunk) & (PAGE_BITS - 1);
+	if (!bitmap->mddev->bitmap_info.external)
+		chunk += sizeof(bitmap_super_t) << 3;
+	return chunk & (PAGE_BITS - 1);
 }
 
 /*
@@ -698,8 +710,9 @@ static inline unsigned long file_page_offset(unsigned long chunk)
 static inline struct page *filemap_get_page(struct bitmap *bitmap,
 					unsigned long chunk)
 {
-	if (file_page_index(chunk) >= bitmap->file_pages) return NULL;
-	return bitmap->filemap[file_page_index(chunk) - file_page_index(0)];
+	if (file_page_index(bitmap, chunk) >= bitmap->file_pages) return NULL;
+	return bitmap->filemap[file_page_index(bitmap, chunk)
+			       - file_page_index(bitmap, 0)];
 }
 
 
@@ -722,7 +735,7 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 
 	while (pages--)
-		if (map[pages]->index != 0) /* 0 is sb_page, release it below */
+		if (map[pages] != sb_page) /* 0 is sb_page, release it below */
 			free_buffers(map[pages]);
 	kfree(map);
 	kfree(attr);
@@ -833,7 +846,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 
 	page = filemap_get_page(bitmap, chunk);
 	if (!page) return;
-	bit = file_page_offset(chunk);
+	bit = file_page_offset(bitmap, chunk);
 
  	/* set the bit */
 	kaddr = kmap_atomic(page, KM_USER0);
@@ -931,14 +944,17 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 			"recovery\n", bmname(bitmap));
 
 	bytes = (chunks + 7) / 8;
+	if (!bitmap->mddev->bitmap_info.external)
+		bytes += sizeof(bitmap_super_t);
 
-	num_pages = (bytes + sizeof(bitmap_super_t) + PAGE_SIZE - 1) / PAGE_SIZE;
+	
+	num_pages = (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
 
-	if (file && i_size_read(file->f_mapping->host) < bytes + sizeof(bitmap_super_t)) {
+	if (file && i_size_read(file->f_mapping->host) < bytes) {
 		printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n",
 			bmname(bitmap),
 			(unsigned long) i_size_read(file->f_mapping->host),
-			bytes + sizeof(bitmap_super_t));
+			bytes);
 		goto err;
 	}
 
@@ -959,17 +975,16 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 
 	for (i = 0; i < chunks; i++) {
 		int b;
-		index = file_page_index(i);
-		bit = file_page_offset(i);
+		index = file_page_index(bitmap, i);
+		bit = file_page_offset(bitmap, i);
 		if (index != oldindex) { /* this is a new page, read it in */
 			int count;
 			/* unmap the old page, we're done with it */
 			if (index == num_pages-1)
-				count = bytes + sizeof(bitmap_super_t)
-					- index * PAGE_SIZE;
+				count = bytes - index * PAGE_SIZE;
 			else
 				count = PAGE_SIZE;
-			if (index == 0) {
+			if (index == 0 && bitmap->sb_page) {
 				/*
 				 * if we're here then the superblock page
 				 * contains some bits (PAGE_SIZE != sizeof sb)
@@ -1164,7 +1179,8 @@ void bitmap_daemon_work(mddev_t *mddev)
 			/* We are possibly going to clear some bits, so make
 			 * sure that events_cleared is up-to-date.
 			 */
-			if (bitmap->need_sync) {
+			if (bitmap->need_sync &&
+			    bitmap->mddev->bitmap_info.external == 0) {
 				bitmap_super_t *sb;
 				bitmap->need_sync = 0;
 				sb = kmap_atomic(bitmap->sb_page, KM_USER0);
@@ -1174,7 +1190,8 @@ void bitmap_daemon_work(mddev_t *mddev)
 				write_page(bitmap, bitmap->sb_page, 1);
 			}
 			spin_lock_irqsave(&bitmap->lock, flags);
-			clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
+			if (!bitmap->need_sync)
+				clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
 		}
 		bmc = bitmap_get_counter(bitmap,
 					 (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
@@ -1189,7 +1206,7 @@ void bitmap_daemon_work(mddev_t *mddev)
 			if (*bmc == 2) {
 				*bmc=1; /* maybe clear the bit next time */
 				set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
-			} else if (*bmc == 1) {
+			} else if (*bmc == 1 && !bitmap->need_sync) {
 				/* we can clear the bit */
 				*bmc = 0;
 				bitmap_count_page(bitmap,
@@ -1199,9 +1216,11 @@ void bitmap_daemon_work(mddev_t *mddev)
 				/* clear the bit */
 				paddr = kmap_atomic(page, KM_USER0);
 				if (bitmap->flags & BITMAP_HOSTENDIAN)
-					clear_bit(file_page_offset(j), paddr);
+					clear_bit(file_page_offset(bitmap, j),
+						  paddr);
 				else
-					ext2_clear_bit(file_page_offset(j), paddr);
+					ext2_clear_bit(file_page_offset(bitmap, j),
+						       paddr);
 				kunmap_atomic(paddr, KM_USER0);
 			}
 		} else
@@ -1356,6 +1375,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
 		    bitmap->events_cleared < bitmap->mddev->events) {
 			bitmap->events_cleared = bitmap->mddev->events;
 			bitmap->need_sync = 1;
+			sysfs_notify_dirent(bitmap->sysfs_can_clear);
 		}
 
 		if (!success && ! (*bmc & NEEDED_MASK))
@@ -1613,6 +1633,9 @@ void bitmap_destroy(mddev_t *mddev)
 	if (mddev->thread)
 		mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
 
+	if (bitmap->sysfs_can_clear)
+		sysfs_put(bitmap->sysfs_can_clear);
+
 	bitmap_free(bitmap);
 }
 
@@ -1629,6 +1652,7 @@ int bitmap_create(mddev_t *mddev)
 	struct file *file = mddev->bitmap_info.file;
 	int err;
 	sector_t start;
+	struct sysfs_dirent *bm;
 
 	BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
 
@@ -1648,6 +1672,13 @@ int bitmap_create(mddev_t *mddev)
 
 	bitmap->mddev = mddev;
 
+	bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap");
+	if (bm) {
+		bitmap->sysfs_can_clear = sysfs_get_dirent(bm, "can_clear");
+		sysfs_put(bm);
+	} else
+		bitmap->sysfs_can_clear = NULL;
+
 	bitmap->file = file;
 	if (file) {
 		get_file(file);
@@ -1658,7 +1689,16 @@ int bitmap_create(mddev_t *mddev)
 		vfs_fsync(file, file->f_dentry, 1);
 	}
 	/* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */
-	err = bitmap_read_sb(bitmap);
+	if (!mddev->bitmap_info.external)
+		err = bitmap_read_sb(bitmap);
+	else {
+		err = 0;
+		if (mddev->bitmap_info.chunksize == 0 ||
+		    mddev->bitmap_info.daemon_sleep == 0)
+			/* chunksize and time_base need to be
+			 * set first. */
+			err = -EINVAL;
+	}
 	if (err)
 		goto error;
 
@@ -1777,7 +1817,8 @@ location_store(mddev_t *mddev, const char *buf, size_t len)
 				return rv;
 			if (offset == 0)
 				return -EINVAL;
-			if (mddev->major_version == 0 &&
+			if (mddev->bitmap_info.external == 0 &&
+			    mddev->major_version == 0 &&
 			    offset != mddev->bitmap_info.default_offset)
 				return -EINVAL;
 			mddev->bitmap_info.offset = offset;
@@ -1906,11 +1947,66 @@ chunksize_store(mddev_t *mddev, const char *buf, size_t len)
 static struct md_sysfs_entry bitmap_chunksize =
 __ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store);
 
+static ssize_t metadata_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%s\n", (mddev->bitmap_info.external
+				      ? "external" : "internal"));
+}
+
+static ssize_t metadata_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	if (mddev->bitmap ||
+	    mddev->bitmap_info.file ||
+	    mddev->bitmap_info.offset)
+		return -EBUSY;
+	if (strncmp(buf, "external", 8) == 0)
+		mddev->bitmap_info.external = 1;
+	else if (strncmp(buf, "internal", 8) == 0)
+		mddev->bitmap_info.external = 0;
+	else
+		return -EINVAL;
+	return len;
+}
+
+static struct md_sysfs_entry bitmap_metadata =
+__ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
+
+static ssize_t can_clear_show(mddev_t *mddev, char *page)
+{
+	int len;
+	if (mddev->bitmap)
+		len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ?
+					     "false" : "true"));
+	else
+		len = sprintf(page, "\n");
+	return len;
+}
+
+static ssize_t can_clear_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	if (mddev->bitmap == NULL)
+		return -ENOENT;
+	if (strncmp(buf, "false", 5) == 0)
+		mddev->bitmap->need_sync = 1;
+	else if (strncmp(buf, "true", 4) == 0) {
+		if (mddev->degraded)
+			return -EBUSY;
+		mddev->bitmap->need_sync = 0;
+	} else
+		return -EINVAL;
+	return len;
+}
+
+static struct md_sysfs_entry bitmap_can_clear =
+__ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store);
+
 static struct attribute *md_bitmap_attrs[] = {
 	&bitmap_location.attr,
 	&bitmap_timeout.attr,
 	&bitmap_backlog.attr,
 	&bitmap_chunksize.attr,
+	&bitmap_metadata.attr,
+	&bitmap_can_clear.attr,
 	NULL
 };
 struct attribute_group md_bitmap_group = {
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 50ee4240f5db..cb821d76d1b4 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -118,16 +118,6 @@ typedef __u16 bitmap_counter_t;
 			(CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
 #define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
 
-/*
- * on-disk bitmap:
- *
- * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
- * file a page at a time. There's a superblock at the start of the file.
- */
-
-/* map chunks (bits) to file pages - offset by the size of the superblock */
-#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3))
-
 #endif
 
 /*
@@ -250,6 +240,7 @@ struct bitmap {
 	wait_queue_head_t write_wait;
 	wait_queue_head_t overflow_wait;
 
+	struct sysfs_dirent *sysfs_can_clear;
 };
 
 /* the bitmap API */
diff --git a/drivers/md/md.h b/drivers/md/md.h
index fce02073f1a4..d9138885b87f 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -296,6 +296,7 @@ struct mddev_s
 		unsigned long		chunksize;
 		unsigned long		daemon_sleep; /* how many seconds between updates? */
 		unsigned long		max_write_behind; /* write-behind mode */
+		int			external;
 	} bitmap_info;
 
 	struct list_head		all_mddevs;

From ffa23322b1596bb1a115e767d4251b62842adc23 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 14 Dec 2009 12:49:56 +1100
Subject: [PATCH 19/27] md/bitmap: update dirty flag when bitmap bits are
 explicitly set.

There is a sysfs file which allows bits in the write-intent
bitmap to be explicit set - indicating that the block is thought
to be 'dirty'.
When this happens we should really set recovery_cp backwards
to include the block to reflect this dirtiness.

In particular, a 'resync' process will refuse to start if
recovery_cp is beyond the end of the array, so this is needed
to allow a resync to be triggered.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index de5c42df8d17..26ac8aad0b19 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1566,6 +1566,12 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
 		sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap);
 		bitmap_set_memory_bits(bitmap, sec, 1);
 		bitmap_file_set_bit(bitmap, sec);
+		if (sec < bitmap->mddev->recovery_cp)
+			/* We are asserting that the array is dirty,
+			 * so move the recovery_cp address back so
+			 * that it is obvious that it is dirty
+			 */
+			bitmap->mddev->recovery_cp = sec;
 	}
 }
 

From 67b8dc4b06b0e97df55fd76e209f34f9a52e820e Mon Sep 17 00:00:00 2001
From: Robert Becker <Rob.Becker@riverbed.com>
Date: Mon, 14 Dec 2009 12:49:57 +1100
Subject: [PATCH 20/27] md/raid10: print more useful messages on device
 failure.

When we get a read error on a device in a RAID10, and attempting to
repair the error fails, print more useful messages about why it
failed.

Signed-off-by: Robert Becker <Rob.Becker@riverbed.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d9e28a65af58..670449f7411f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1488,6 +1488,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 		/* write it back and re-read */
 		rcu_read_lock();
 		while (sl != r10_bio->read_slot) {
+			char b[BDEVNAME_SIZE];
 			int d;
 			if (sl==0)
 				sl = conf->copies;
@@ -1503,9 +1504,21 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 						 r10_bio->devs[sl].addr +
 						 sect + rdev->data_offset,
 						 s<<9, conf->tmppage, WRITE)
-				    == 0)
+				    == 0) {
 					/* Well, this device is dead */
+					printk(KERN_NOTICE
+					       "raid10:%s: read correction "
+					       "write failed"
+					       " (%d sectors at %llu on %s)\n",
+					       mdname(mddev), s,
+					       (unsigned long long)(sect+
+					       rdev->data_offset),
+					       bdevname(rdev->bdev, b));
+					printk(KERN_NOTICE "raid10:%s: failing "
+					       "drive\n",
+					       bdevname(rdev->bdev, b));
 					md_error(mddev, rdev);
+				}
 				rdev_dec_pending(rdev, mddev);
 				rcu_read_lock();
 			}
@@ -1526,10 +1539,22 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 				if (sync_page_io(rdev->bdev,
 						 r10_bio->devs[sl].addr +
 						 sect + rdev->data_offset,
-						 s<<9, conf->tmppage, READ) == 0)
+						 s<<9, conf->tmppage,
+						 READ) == 0) {
 					/* Well, this device is dead */
+					printk(KERN_NOTICE
+					       "raid10:%s: unable to read back "
+					       "corrected sectors"
+					       " (%d sectors at %llu on %s)\n",
+					       mdname(mddev), s,
+					       (unsigned long long)(sect+
+						    rdev->data_offset),
+					       bdevname(rdev->bdev, b));
+					printk(KERN_NOTICE "raid10:%s: failing drive\n",
+					       bdevname(rdev->bdev, b));
+
 					md_error(mddev, rdev);
-				else
+				} else {
 					printk(KERN_INFO
 					       "raid10:%s: read error corrected"
 					       " (%d sectors at %llu on %s)\n",
@@ -1537,6 +1562,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 					       (unsigned long long)(sect+
 					            rdev->data_offset),
 					       bdevname(rdev->bdev, b));
+				}
 
 				rdev_dec_pending(rdev, mddev);
 				rcu_read_lock();

From 1e50915fe0bbf7a46db0fa7e1e604d3fc95f057d Mon Sep 17 00:00:00 2001
From: Robert Becker <Rob.Becker@riverbed.com>
Date: Mon, 14 Dec 2009 12:49:58 +1100
Subject: [PATCH 21/27] raid: improve MD/raid10 handling of correctable read
 errors.

We've noticed severe lasting performance degradation of our raid
arrays when we have drives that yield large amounts of media errors.
The raid10 module will queue each failed read for retry, and also
will attempt call fix_read_error() to perform the read recovery.
Read recovery is performed while the array is frozen, so repeated
recovery attempts can degrade the performance of the array for
extended periods of time.

With this patch I propose adding a per md device max number of
corrected read attempts.  Each rdev will maintain a count of
read correction attempts in the rdev->read_errors field (not
used currently for raid10). When we enter fix_read_error()
we'll check to see when the last read error occurred, and
divide the read error count by 2 for every hour since the
last read error. If at that point our read error count
exceeds the read error threshold, we'll fail the raid device.

In addition in this patch I add sysfs nodes (get/set) for
the per md max_read_errors attribute, the rdev->read_errors
attribute, and added some printk's to indicate when
fix_read_error fails to repair an rdev.

For testing I used debugfs->fail_make_request to inject
IO errors to the rdev while doing IO to the raid array.

Signed-off-by: Robert Becker <Rob.Becker@riverbed.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c     | 34 +++++++++++++++++++++
 drivers/md/md.h     |  4 +++
 drivers/md/raid10.c | 74 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 112 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 859edbf8c9b0..f1b905a20133 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -67,6 +67,12 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
 
 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
 
+/*
+ * Default number of read corrections we'll attempt on an rdev
+ * before ejecting it from the array. We divide the read error
+ * count by 2 for every hour elapsed between read errors.
+ */
+#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
 /*
  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
  * is 1000 KB/sec, so the extra system load does not show up that much.
@@ -2653,6 +2659,8 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
 	rdev->flags = 0;
 	rdev->data_offset = 0;
 	rdev->sb_events = 0;
+	rdev->last_read_error.tv_sec  = 0;
+	rdev->last_read_error.tv_nsec = 0;
 	atomic_set(&rdev->nr_pending, 0);
 	atomic_set(&rdev->read_errors, 0);
 	atomic_set(&rdev->corrected_errors, 0);
@@ -3289,6 +3297,29 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
 static struct md_sysfs_entry md_array_state =
 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
 
+static ssize_t
+max_corrected_read_errors_show(mddev_t *mddev, char *page) {
+	return sprintf(page, "%d\n",
+		       atomic_read(&mddev->max_corr_read_errors));
+}
+
+static ssize_t
+max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	char *e;
+	unsigned long n = simple_strtoul(buf, &e, 10);
+
+	if (*buf && (*e == 0 || *e == '\n')) {
+		atomic_set(&mddev->max_corr_read_errors, n);
+		return len;
+	}
+	return -EINVAL;
+}
+
+static struct md_sysfs_entry max_corr_read_errors =
+__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
+	max_corrected_read_errors_store);
+
 static ssize_t
 null_show(mddev_t *mddev, char *page)
 {
@@ -3914,6 +3945,7 @@ static struct attribute *md_default_attrs[] = {
 	&md_array_state.attr,
 	&md_reshape_position.attr,
 	&md_array_size.attr,
+	&max_corr_read_errors.attr,
 	NULL,
 };
 
@@ -4333,6 +4365,8 @@ static int do_md_run(mddev_t * mddev)
 		mddev->ro = 0;
 
  	atomic_set(&mddev->writes_pending,0);
+	atomic_set(&mddev->max_corr_read_errors,
+		   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
 	mddev->safemode = 0;
 	mddev->safemode_timer.function = md_safemode_timeout;
 	mddev->safemode_timer.data = (unsigned long) mddev;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index d9138885b87f..8e4c75c00d46 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -97,6 +97,9 @@ struct mdk_rdev_s
 	atomic_t	read_errors;	/* number of consecutive read errors that
 					 * we have tried to ignore.
 					 */
+	struct timespec last_read_error;	/* monotonic time since our
+						 * last read error
+						 */
 	atomic_t	corrected_errors; /* number of corrected read errors,
 					   * for reporting to userspace and storing
 					   * in superblock.
@@ -299,6 +302,7 @@ struct mddev_s
 		int			external;
 	} bitmap_info;
 
+	atomic_t 			max_corr_read_errors; /* max read retries */
 	struct list_head		all_mddevs;
 
 	/* Generic barrier handling.
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 670449f7411f..5c71a462c120 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1431,6 +1431,43 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
 }
 
 
+/*
+ * Used by fix_read_error() to decay the per rdev read_errors.
+ * We halve the read error count for every hour that has elapsed
+ * since the last recorded read error.
+ *
+ */
+static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	struct timespec cur_time_mon;
+	unsigned long hours_since_last;
+	unsigned int read_errors = atomic_read(&rdev->read_errors);
+
+	ktime_get_ts(&cur_time_mon);
+
+	if (rdev->last_read_error.tv_sec == 0 &&
+	    rdev->last_read_error.tv_nsec == 0) {
+		/* first time we've seen a read error */
+		rdev->last_read_error = cur_time_mon;
+		return;
+	}
+
+	hours_since_last = (cur_time_mon.tv_sec -
+			    rdev->last_read_error.tv_sec) / 3600;
+
+	rdev->last_read_error = cur_time_mon;
+
+	/*
+	 * if hours_since_last is > the number of bits in read_errors
+	 * just set read errors to 0. We do this to avoid
+	 * overflowing the shift of read_errors by hours_since_last.
+	 */
+	if (hours_since_last >= 8 * sizeof(read_errors))
+		atomic_set(&rdev->read_errors, 0);
+	else
+		atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
+}
+
 /*
  * This is a kernel thread which:
  *
@@ -1444,6 +1481,43 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 	int sect = 0; /* Offset from r10_bio->sector */
 	int sectors = r10_bio->sectors;
 	mdk_rdev_t*rdev;
+	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
+
+	rcu_read_lock();
+	{
+		int d = r10_bio->devs[r10_bio->read_slot].devnum;
+		char b[BDEVNAME_SIZE];
+		int cur_read_error_count = 0;
+
+		rdev = rcu_dereference(conf->mirrors[d].rdev);
+		bdevname(rdev->bdev, b);
+
+		if (test_bit(Faulty, &rdev->flags)) {
+			rcu_read_unlock();
+			/* drive has already been failed, just ignore any
+			   more fix_read_error() attempts */
+			return;
+		}
+
+		check_decay_read_errors(mddev, rdev);
+		atomic_inc(&rdev->read_errors);
+		cur_read_error_count = atomic_read(&rdev->read_errors);
+		if (cur_read_error_count > max_read_errors) {
+			rcu_read_unlock();
+			printk(KERN_NOTICE
+			       "raid10: %s: Raid device exceeded "
+			       "read_error threshold "
+			       "[cur %d:max %d]\n",
+			       b, cur_read_error_count, max_read_errors);
+			printk(KERN_NOTICE
+			       "raid10: %s: Failing raid "
+			       "device\n", b);
+			md_error(mddev, conf->mirrors[d].rdev);
+			return;
+		}
+	}
+	rcu_read_unlock();
+
 	while(sectors) {
 		int s = sectors;
 		int sl = r10_bio->read_slot;

From 0efb9e6191e1d3d34c1db90b829b742bc36d532e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 14 Dec 2009 12:49:58 +1100
Subject: [PATCH 22/27] md: add MODULE_DESCRIPTION for all md related modules.

Suggested by  Oren Held <orenhe@il.ibm.com>

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/faulty.c     | 1 +
 drivers/md/linear.c     | 1 +
 drivers/md/md.c         | 1 +
 drivers/md/multipath.c  | 1 +
 drivers/md/raid0.c      | 1 +
 drivers/md/raid1.c      | 1 +
 drivers/md/raid10.c     | 1 +
 drivers/md/raid5.c      | 1 +
 drivers/md/raid6algos.c | 1 +
 9 files changed, 9 insertions(+)

diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 87d88dbb667f..713acd02ab39 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -360,6 +360,7 @@ static void raid_exit(void)
 module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Fault injection personality for MD");
 MODULE_ALIAS("md-personality-10"); /* faulty */
 MODULE_ALIAS("md-faulty");
 MODULE_ALIAS("md-level--5");
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 3b3f77c4f249..00435bd20699 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -383,6 +383,7 @@ static void linear_exit (void)
 module_init(linear_init);
 module_exit(linear_exit);
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Linear device concatenation personality for MD");
 MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
 MODULE_ALIAS("md-linear");
 MODULE_ALIAS("md-level--1");
diff --git a/drivers/md/md.c b/drivers/md/md.c
index f1b905a20133..bcc66680952e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7168,5 +7168,6 @@ EXPORT_SYMBOL(md_unregister_thread);
 EXPORT_SYMBOL(md_wakeup_thread);
 EXPORT_SYMBOL(md_check_recovery);
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MD RAID framework");
 MODULE_ALIAS("md");
 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index cbc0a99f3796..32a662fc55c9 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -581,6 +581,7 @@ static void __exit multipath_exit (void)
 module_init(multipath_init);
 module_exit(multipath_exit);
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("simple multi-path personality for MD");
 MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
 MODULE_ALIAS("md-multipath");
 MODULE_ALIAS("md-level--4");
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 122d07af5b54..77605cdceaf1 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -567,6 +567,7 @@ static void raid0_exit (void)
 module_init(raid0_init);
 module_exit(raid0_exit);
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAID0 (striping) personality for MD");
 MODULE_ALIAS("md-personality-2"); /* RAID0 */
 MODULE_ALIAS("md-raid0");
 MODULE_ALIAS("md-level-0");
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f0949e2bcd6e..859bd3ffe435 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2376,6 +2376,7 @@ static void raid_exit(void)
 module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
 MODULE_ALIAS("md-personality-3"); /* RAID1 */
 MODULE_ALIAS("md-raid1");
 MODULE_ALIAS("md-level-1");
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 5c71a462c120..d119b7b75e71 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2408,6 +2408,7 @@ static void raid_exit(void)
 module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
 MODULE_ALIAS("md-personality-9"); /* RAID10 */
 MODULE_ALIAS("md-raid10");
 MODULE_ALIAS("md-level-10");
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c78d4835b0d6..e84204eb12df 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5892,6 +5892,7 @@ static void raid5_exit(void)
 module_init(raid5_init);
 module_exit(raid5_exit);
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
 MODULE_ALIAS("md-personality-4"); /* RAID5 */
 MODULE_ALIAS("md-raid5");
 MODULE_ALIAS("md-raid4");
diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c
index 8ce102cc2e7b..bffc61bff5ab 100644
--- a/drivers/md/raid6algos.c
+++ b/drivers/md/raid6algos.c
@@ -150,3 +150,4 @@ static void raid6_exit(void)
 subsys_initcall(raid6_select_algo);
 module_exit(raid6_exit);
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAID6 Q-syndrome calculations");

From 93bd89a6d5f268acbdb8d775e40d3a280b07b5bd Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 14 Dec 2009 12:49:59 +1100
Subject: [PATCH 23/27] md: revise Kconfig help for MD_MULTIPATH

Make it clear in the config message that MD_MULTIPATH is not under
active development.

Cc: Oren Held <orenhe@il.ibm.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/Kconfig | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 2158377a1359..acb3a4e404ff 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -185,11 +185,10 @@ config MD_MULTIPATH
 	tristate "Multipath I/O support"
 	depends on BLK_DEV_MD
 	help
-	  Multipath-IO is the ability of certain devices to address the same
-	  physical disk over multiple 'IO paths'. The code ensures that such
-	  paths can be defined and handled at runtime, and ensures that a
-	  transparent failover to the backup path(s) happens if a IO errors
-	  arrives on the primary path.
+	  MD_MULTIPATH provides a simple multi-path personality for use
+	  the MD framework.  It is not under active development.  New
+	  projects should consider using DM_MULTIPATH which has more
+	  features and more testing.
 
 	  If unsure, say N.
 

From aa98aa31987ad9831711ae71ea2270228ab62532 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 14 Dec 2009 12:50:05 +1100
Subject: [PATCH 24/27] md: move compat_ioctl handling into md.c

The RAID ioctls are only implemented in md.c, so the
handling for them should also be moved there from
fs/compat_ioctl.c.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Neil Brown <neilb@suse.de>
Cc: Andre Noll <maan@systemlinux.org>
Cc: linux-raid@vger.kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c   | 23 +++++++++++++++++++++++
 fs/compat_ioctl.c | 18 ------------------
 2 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index bcc66680952e..08564375d6b5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -44,6 +44,7 @@
 #include <linux/random.h>
 #include <linux/reboot.h>
 #include <linux/file.h>
+#include <linux/compat.h>
 #include <linux/delay.h>
 #include <linux/raid/md_p.h>
 #include <linux/raid/md_u.h>
@@ -5691,6 +5692,25 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 abort:
 	return err;
 }
+#ifdef CONFIG_COMPAT
+static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
+		    unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+	case HOT_REMOVE_DISK:
+	case HOT_ADD_DISK:
+	case SET_DISK_FAULTY:
+	case SET_BITMAP_FILE:
+		/* These take in integer arg, do not convert */
+		break;
+	default:
+		arg = (unsigned long)compat_ptr(arg);
+		break;
+	}
+
+	return md_ioctl(bdev, mode, cmd, arg);
+}
+#endif /* CONFIG_COMPAT */
 
 static int md_open(struct block_device *bdev, fmode_t mode)
 {
@@ -5756,6 +5776,9 @@ static const struct block_device_operations md_fops =
 	.open		= md_open,
 	.release	= md_release,
 	.ioctl		= md_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= md_compat_ioctl,
+#endif
 	.getgeo		= md_getgeo,
 	.media_changed	= md_media_changed,
 	.revalidate_disk= md_revalidate,
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 278020d2449c..14cbc831422a 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -979,24 +979,6 @@ COMPATIBLE_IOCTL(FIGETBSZ)
 /* 'X' - originally XFS but some now in the VFS */
 COMPATIBLE_IOCTL(FIFREEZE)
 COMPATIBLE_IOCTL(FITHAW)
-/* RAID */
-COMPATIBLE_IOCTL(RAID_VERSION)
-COMPATIBLE_IOCTL(GET_ARRAY_INFO)
-COMPATIBLE_IOCTL(GET_DISK_INFO)
-COMPATIBLE_IOCTL(PRINT_RAID_DEBUG)
-COMPATIBLE_IOCTL(RAID_AUTORUN)
-COMPATIBLE_IOCTL(CLEAR_ARRAY)
-COMPATIBLE_IOCTL(ADD_NEW_DISK)
-COMPATIBLE_IOCTL(SET_ARRAY_INFO)
-COMPATIBLE_IOCTL(SET_DISK_INFO)
-COMPATIBLE_IOCTL(WRITE_RAID_INFO)
-COMPATIBLE_IOCTL(UNPROTECT_ARRAY)
-COMPATIBLE_IOCTL(PROTECT_ARRAY)
-COMPATIBLE_IOCTL(RUN_ARRAY)
-COMPATIBLE_IOCTL(STOP_ARRAY)
-COMPATIBLE_IOCTL(STOP_ARRAY_RO)
-COMPATIBLE_IOCTL(RESTART_ARRAY_RW)
-COMPATIBLE_IOCTL(GET_BITMAP_FILE)
 COMPATIBLE_IOCTL(KDGETKEYCODE)
 COMPATIBLE_IOCTL(KDSETKEYCODE)
 COMPATIBLE_IOCTL(KDGKBTYPE)

From 93be75ffde6dfd1ad17cc3ff1dbd135bd711baf4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 14 Dec 2009 12:50:06 +1100
Subject: [PATCH 25/27] md: integrate spares into array at earliest
 opportunity.

As v1.x metadata can record that a member of the array is
not completely recovered, it make sense to record that a
spare has become a regular member of the array at the earliest
opportunity.
So remove the tests on "recovery_offset > 0" in super_1_sync
as they really aren't needed, and schedule a metadata update
immediately after adding spares to a degraded array.

This means that if a crash happens immediately after a recovery
starts, the new device will be included in the array and recovery will
continue from wherever it was up to.  Previously this didn't happen
unless recovery was at least 1/16 of the way through.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 08564375d6b5..68a8a29a9012 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1508,12 +1508,10 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
 	if (rdev->raid_disk >= 0 &&
 	    !test_bit(In_sync, &rdev->flags)) {
-		if (rdev->recovery_offset > 0) {
-			sb->feature_map |=
-				cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
-			sb->recovery_offset =
-				cpu_to_le64(rdev->recovery_offset);
-		}
+		sb->feature_map |=
+			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
+		sb->recovery_offset =
+			cpu_to_le64(rdev->recovery_offset);
 	}
 
 	if (mddev->reshape_position != MaxSector) {
@@ -1547,7 +1545,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 			sb->dev_roles[i] = cpu_to_le16(0xfffe);
 		else if (test_bit(In_sync, &rdev2->flags))
 			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
-		else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
+		else if (rdev2->raid_disk >= 0)
 			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
 		else
 			sb->dev_roles[i] = cpu_to_le16(0xffff);
@@ -6786,6 +6784,7 @@ static int remove_and_add_spares(mddev_t *mddev)
 						       nm, mdname(mddev));
 					spares++;
 					md_new_event(mddev);
+					set_bit(MD_CHANGE_DEVS, &mddev->flags);
 				} else
 					break;
 			}

From 4e59ca7da05f0d5d3ad40365c502c8b0fd24c7e3 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 12 Dec 2009 21:17:06 -0700
Subject: [PATCH 26/27] md: rcu_read_lock() walk of mddev->disks in
 md_do_sync()

Other walks of this list are either under rcu_read_lock() or the list
mutation lock (mddev_lock()).  This protects against the improbable case of a
disk being removed from the array at the start of md_do_sync().

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/md/md.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 68a8a29a9012..ea64a68e9c75 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6526,12 +6526,14 @@ void md_do_sync(mddev_t *mddev)
 		/* recovery follows the physical size of devices */
 		max_sectors = mddev->dev_sectors;
 		j = MaxSector;
-		list_for_each_entry(rdev, &mddev->disks, same_set)
+		rcu_read_lock();
+		list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
 			if (rdev->raid_disk >= 0 &&
 			    !test_bit(Faulty, &rdev->flags) &&
 			    !test_bit(In_sync, &rdev->flags) &&
 			    rdev->recovery_offset < j)
 				j = rdev->recovery_offset;
+		rcu_read_unlock();
 	}
 
 	printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
@@ -6701,12 +6703,14 @@ void md_do_sync(mddev_t *mddev)
 		} else {
 			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
 				mddev->curr_resync = MaxSector;
-			list_for_each_entry(rdev, &mddev->disks, same_set)
+			rcu_read_lock();
+			list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
 				if (rdev->raid_disk >= 0 &&
 				    !test_bit(Faulty, &rdev->flags) &&
 				    !test_bit(In_sync, &rdev->flags) &&
 				    rdev->recovery_offset < mddev->curr_resync)
 					rdev->recovery_offset = mddev->curr_resync;
+			rcu_read_unlock();
 		}
 	}
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);

From 06e3c817b750c131a20e82eed57a17841ea88ed2 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 12 Dec 2009 21:17:12 -0700
Subject: [PATCH 27/27] md: add 'recovery_start' per-device sysfs attribute

Enable external metadata arrays to manage rebuild checkpointing via a
md/dev-XXX/recovery_start attribute which reflects rdev->recovery_offset

Also update resync_start_store to allow 'none' to be written, for
consistency.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 Documentation/md.txt | 27 +++++++++++++++++++++++----
 drivers/md/md.c      | 41 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 63 insertions(+), 5 deletions(-)

diff --git a/Documentation/md.txt b/Documentation/md.txt
index 21d26fb5d02b..188f4768f1d5 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -233,9 +233,9 @@ All md devices contain:
 
   resync_start
      The point at which resync should start.  If no resync is needed,
-     this will be a very large number.  At array creation it will
-     default to 0, though starting the array as 'clean' will
-     set it much larger.
+     this will be a very large number (or 'none' since 2.6.30-rc1).  At
+     array creation it will default to 0, though starting the array as
+     'clean' will set it much larger.
 
    new_dev
      This file can be written but not read.  The value written should
@@ -379,8 +379,9 @@ Each directory contains:
 	Writing "writemostly" sets the writemostly flag.
 	Writing "-writemostly" clears the writemostly flag.
 	Writing "blocked" sets the "blocked" flag.
-	Writing "-blocked" clear the "blocked" flag and allows writes
+	Writing "-blocked" clears the "blocked" flag and allows writes
 		to complete.
+	Writing "in_sync" sets the in_sync flag.
 
 	This file responds to select/poll. Any change to 'faulty'
 	or 'blocked' causes an event.
@@ -417,6 +418,24 @@ Each directory contains:
         array.  If a value less than the current component_size is
         written, it will be rejected.
 
+      recovery_start
+
+        When the device is not 'in_sync', this records the number of
+	sectors from the start of the device which are known to be
+	correct.  This is normally zero, but during a recovery
+	operation is will steadily increase, and if the recovery is
+	interrupted, restoring this value can cause recovery to
+	avoid repeating the earlier blocks.  With v1.x metadata, this
+	value is saved and restored automatically.
+
+	This can be set whenever the device is not an active member of
+	the array, either before the array is activated, or before
+	the 'slot' is set.
+
+	Setting this to 'none' is equivalent to setting 'in_sync'.
+	Setting to any other value also clears the 'in_sync' flag.
+	
+
 
 An active md device will also contain and entry for each active device
 in the array.  These are named
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ea64a68e9c75..e1f3c1715cca 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2551,12 +2551,49 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 static struct rdev_sysfs_entry rdev_size =
 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
 
+
+static ssize_t recovery_start_show(mdk_rdev_t *rdev, char *page)
+{
+	unsigned long long recovery_start = rdev->recovery_offset;
+
+	if (test_bit(In_sync, &rdev->flags) ||
+	    recovery_start == MaxSector)
+		return sprintf(page, "none\n");
+
+	return sprintf(page, "%llu\n", recovery_start);
+}
+
+static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+	unsigned long long recovery_start;
+
+	if (cmd_match(buf, "none"))
+		recovery_start = MaxSector;
+	else if (strict_strtoull(buf, 10, &recovery_start))
+		return -EINVAL;
+
+	if (rdev->mddev->pers &&
+	    rdev->raid_disk >= 0)
+		return -EBUSY;
+
+	rdev->recovery_offset = recovery_start;
+	if (recovery_start == MaxSector)
+		set_bit(In_sync, &rdev->flags);
+	else
+		clear_bit(In_sync, &rdev->flags);
+	return len;
+}
+
+static struct rdev_sysfs_entry rdev_recovery_start =
+__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
+
 static struct attribute *rdev_default_attrs[] = {
 	&rdev_state.attr,
 	&rdev_errors.attr,
 	&rdev_slot.attr,
 	&rdev_offset.attr,
 	&rdev_size.attr,
+	&rdev_recovery_start.attr,
 	NULL,
 };
 static ssize_t
@@ -3101,7 +3138,9 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len)
 
 	if (mddev->pers)
 		return -EBUSY;
-	if (!*buf || (*e && *e != '\n'))
+	if (cmd_match(buf, "none"))
+		n = MaxSector;
+	else if (!*buf || (*e && *e != '\n'))
 		return -EINVAL;
 
 	mddev->recovery_cp = n;