From cbc4e3c1350beb47beab8f34ad9be3d34a20c705 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Mon, 27 Apr 2015 16:37:50 -0400
Subject: [PATCH 01/39] dm: do not allocate any mempools for blk-mq
 request-based DM

Do not allocate the io_pool mempool for blk-mq request-based DM
(DM_TYPE_MQ_REQUEST_BASED) in dm_alloc_rq_mempools().

Also refine __bind_mempools() to have more precise awareness of which
mempools each type of DM device uses -- avoids mempool churn when
reloading DM tables (particularly for DM_TYPE_REQUEST_BASED).

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-table.c |  4 +--
 drivers/md/dm.c       | 69 ++++++++++++++++++++++++-------------------
 2 files changed, 40 insertions(+), 33 deletions(-)

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index a5f94125ad01..85e1d39e9a38 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -964,8 +964,8 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
 		return -EINVAL;
 	}
 
-	if (!t->mempools)
-		return -ENOMEM;
+	if (IS_ERR(t->mempools))
+		return PTR_ERR(t->mempools);
 
 	return 0;
 }
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4d6f089a0e9e..916f6015981c 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2323,39 +2323,52 @@ static void free_dev(struct mapped_device *md)
 	kfree(md);
 }
 
+static unsigned filter_md_type(unsigned type, struct mapped_device *md)
+{
+	if (type == DM_TYPE_BIO_BASED)
+		return type;
+
+	return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
+}
+
 static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 {
 	struct dm_md_mempools *p = dm_table_get_md_mempools(t);
 
-	if (md->bs) {
-		/* The md already has necessary mempools. */
-		if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
+	switch (filter_md_type(dm_table_get_type(t), md)) {
+	case DM_TYPE_BIO_BASED:
+		if (md->bs && md->io_pool) {
 			/*
+			 * This bio-based md already has necessary mempools.
 			 * Reload bioset because front_pad may have changed
 			 * because a different table was loaded.
 			 */
 			bioset_free(md->bs);
 			md->bs = p->bs;
 			p->bs = NULL;
+			goto out;
 		}
-		/*
-		 * There's no need to reload with request-based dm
-		 * because the size of front_pad doesn't change.
-		 * Note for future: If you are to reload bioset,
-		 * prep-ed requests in the queue may refer
-		 * to bio from the old bioset, so you must walk
-		 * through the queue to unprep.
-		 */
-		goto out;
+		break;
+	case DM_TYPE_REQUEST_BASED:
+		if (md->rq_pool && md->io_pool)
+			/*
+			 * This request-based md already has necessary mempools.
+			 */
+			goto out;
+		break;
+	case DM_TYPE_MQ_REQUEST_BASED:
+		BUG_ON(p); /* No mempools needed */
+		return;
 	}
 
+	BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
+
 	md->io_pool = p->io_pool;
 	p->io_pool = NULL;
 	md->rq_pool = p->rq_pool;
 	p->rq_pool = NULL;
 	md->bs = p->bs;
 	p->bs = NULL;
-
 out:
 	/* mempool bind completed, no longer need any mempools in the table */
 	dm_table_free_md_mempools(t);
@@ -2734,14 +2747,6 @@ static int dm_init_request_based_blk_mq_queue(struct mapped_device *md)
 	return err;
 }
 
-static unsigned filter_md_type(unsigned type, struct mapped_device *md)
-{
-	if (type == DM_TYPE_BIO_BASED)
-		return type;
-
-	return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
-}
-
 /*
  * Setup the DM device's queue based on md's type
  */
@@ -3463,7 +3468,7 @@ struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
 
 	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
 	if (!pools)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) +
 		offsetof(struct dm_target_io, clone);
@@ -3482,24 +3487,26 @@ struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
 	return pools;
 out:
 	dm_free_md_mempools(pools);
-	return NULL;
+	return ERR_PTR(-ENOMEM);
 }
 
 struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md,
 					    unsigned type)
 {
-	unsigned int pool_size = dm_get_reserved_rq_based_ios();
+	unsigned int pool_size;
 	struct dm_md_mempools *pools;
 
+	if (filter_md_type(type, md) == DM_TYPE_MQ_REQUEST_BASED)
+		return NULL; /* No mempools needed */
+
+	pool_size = dm_get_reserved_rq_based_ios();
 	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
 	if (!pools)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
-	if (filter_md_type(type, md) == DM_TYPE_REQUEST_BASED) {
-		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
-		if (!pools->rq_pool)
-			goto out;
-	}
+	pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
+	if (!pools->rq_pool)
+		goto out;
 
 	pools->io_pool = mempool_create_slab_pool(pool_size, _rq_tio_cache);
 	if (!pools->io_pool)
@@ -3508,7 +3515,7 @@ struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md,
 	return pools;
 out:
 	dm_free_md_mempools(pools);
-	return NULL;
+	return ERR_PTR(-ENOMEM);
 }
 
 void dm_free_md_mempools(struct dm_md_mempools *pools)

From 2d76fff18fd12284493456b01c998e540b140c23 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 29 Apr 2015 12:07:12 -0400
Subject: [PATCH 02/39] dm: cleanup methods that requeue requests

More often than not a request that is requeued _is_ mapped (meaning the
clone request is allocated and clone->q is initialized).  Rename
dm_requeue_unmapped_original_request() to avoid potential confusion due
to function name containing "unmapped".

Also, remove dm_requeue_unmapped_request() since callers can easily call
the dm_requeue_original_request() directly.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 916f6015981c..4b6cb1220182 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1113,8 +1113,8 @@ static void old_requeue_request(struct request *rq)
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-static void dm_requeue_unmapped_original_request(struct mapped_device *md,
-						 struct request *rq)
+static void dm_requeue_original_request(struct mapped_device *md,
+					struct request *rq)
 {
 	int rw = rq_data_dir(rq);
 
@@ -1130,13 +1130,6 @@ static void dm_requeue_unmapped_original_request(struct mapped_device *md,
 	rq_completed(md, rw, false);
 }
 
-static void dm_requeue_unmapped_request(struct request *clone)
-{
-	struct dm_rq_target_io *tio = clone->end_io_data;
-
-	dm_requeue_unmapped_original_request(tio->md, tio->orig);
-}
-
 static void old_stop_queue(struct request_queue *q)
 {
 	unsigned long flags;
@@ -1200,7 +1193,7 @@ static void dm_done(struct request *clone, int error, bool mapped)
 		return;
 	else if (r == DM_ENDIO_REQUEUE)
 		/* The target wants to requeue the I/O */
-		dm_requeue_unmapped_request(clone);
+		dm_requeue_original_request(tio->md, tio->orig);
 	else {
 		DMWARN("unimplemented target endio return value: %d", r);
 		BUG();
@@ -1910,7 +1903,7 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
 		break;
 	case DM_MAPIO_REQUEUE:
 		/* The target wants to requeue the I/O */
-		dm_requeue_unmapped_request(clone);
+		dm_requeue_original_request(md, tio->orig);
 		break;
 	default:
 		if (r > 0) {
@@ -1933,7 +1926,7 @@ static void map_tio_request(struct kthread_work *work)
 	struct mapped_device *md = tio->md;
 
 	if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
-		dm_requeue_unmapped_original_request(md, rq);
+		dm_requeue_original_request(md, rq);
 }
 
 static void dm_start_request(struct mapped_device *md, struct request *orig)

From 0f20972f7bf6922df49ef7ce7a6df802347d2c52 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 28 Apr 2015 11:50:29 -0400
Subject: [PATCH 03/39] dm: factor out a common cleanup_mapped_device()

Introduce a single common method for cleaning up a DM device's
mapped_device.  No functional change, just eliminates duplication of
delicate mapped_device cleanup code.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 80 +++++++++++++++++++++++++++----------------------
 1 file changed, 44 insertions(+), 36 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4b6cb1220182..767bce906588 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2166,6 +2166,40 @@ static void dm_init_old_md_queue(struct mapped_device *md)
 	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
 }
 
+static void cleanup_mapped_device(struct mapped_device *md)
+{
+	cleanup_srcu_struct(&md->io_barrier);
+
+	if (md->wq)
+		destroy_workqueue(md->wq);
+	if (md->kworker_task)
+		kthread_stop(md->kworker_task);
+	if (md->io_pool)
+		mempool_destroy(md->io_pool);
+	if (md->rq_pool)
+		mempool_destroy(md->rq_pool);
+	if (md->bs)
+		bioset_free(md->bs);
+
+	if (md->disk) {
+		spin_lock(&_minor_lock);
+		md->disk->private_data = NULL;
+		spin_unlock(&_minor_lock);
+		if (blk_get_integrity(md->disk))
+			blk_integrity_unregister(md->disk);
+		del_gendisk(md->disk);
+		put_disk(md->disk);
+	}
+
+	if (md->queue)
+		blk_cleanup_queue(md->queue);
+
+	if (md->bdev) {
+		bdput(md->bdev);
+		md->bdev = NULL;
+	}
+}
+
 /*
  * Allocate and initialise a blank device with a given minor.
  */
@@ -2211,13 +2245,13 @@ static struct mapped_device *alloc_dev(int minor)
 
 	md->queue = blk_alloc_queue(GFP_KERNEL);
 	if (!md->queue)
-		goto bad_queue;
+		goto bad;
 
 	dm_init_md_queue(md);
 
 	md->disk = alloc_disk(1);
 	if (!md->disk)
-		goto bad_disk;
+		goto bad;
 
 	atomic_set(&md->pending[0], 0);
 	atomic_set(&md->pending[1], 0);
@@ -2238,11 +2272,11 @@ static struct mapped_device *alloc_dev(int minor)
 
 	md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
 	if (!md->wq)
-		goto bad_thread;
+		goto bad;
 
 	md->bdev = bdget_disk(md->disk, 0);
 	if (!md->bdev)
-		goto bad_bdev;
+		goto bad;
 
 	bio_init(&md->flush_bio);
 	md->flush_bio.bi_bdev = md->bdev;
@@ -2259,15 +2293,8 @@ static struct mapped_device *alloc_dev(int minor)
 
 	return md;
 
-bad_bdev:
-	destroy_workqueue(md->wq);
-bad_thread:
-	del_gendisk(md->disk);
-	put_disk(md->disk);
-bad_disk:
-	blk_cleanup_queue(md->queue);
-bad_queue:
-	cleanup_srcu_struct(&md->io_barrier);
+bad:
+	cleanup_mapped_device(md);
 bad_io_barrier:
 	free_minor(minor);
 bad_minor:
@@ -2284,32 +2311,13 @@ static void free_dev(struct mapped_device *md)
 	int minor = MINOR(disk_devt(md->disk));
 
 	unlock_fs(md);
-	destroy_workqueue(md->wq);
 
-	if (md->kworker_task)
-		kthread_stop(md->kworker_task);
-	if (md->io_pool)
-		mempool_destroy(md->io_pool);
-	if (md->rq_pool)
-		mempool_destroy(md->rq_pool);
-	if (md->bs)
-		bioset_free(md->bs);
-
-	cleanup_srcu_struct(&md->io_barrier);
-	free_table_devices(&md->table_devices);
-	dm_stats_cleanup(&md->stats);
-
-	spin_lock(&_minor_lock);
-	md->disk->private_data = NULL;
-	spin_unlock(&_minor_lock);
-	if (blk_get_integrity(md->disk))
-		blk_integrity_unregister(md->disk);
-	del_gendisk(md->disk);
-	put_disk(md->disk);
-	blk_cleanup_queue(md->queue);
+	cleanup_mapped_device(md);
 	if (md->use_blk_mq)
 		blk_mq_free_tag_set(&md->tag_set);
-	bdput(md->bdev);
+
+	free_table_devices(&md->table_devices);
+	dm_stats_cleanup(&md->stats);
 	free_minor(minor);
 
 	module_put(THIS_MODULE);

From 8b908f8e94540296de95682640281a95ee5d320c Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 13 May 2015 17:53:13 -0400
Subject: [PATCH 04/39] dm thin: cleanup overwrite's endio restore to be
 centralized

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index e852602c0091..c0fbb6c306b2 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -705,6 +705,8 @@ static void overwrite_endio(struct bio *bio, int err)
 	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 	struct dm_thin_new_mapping *m = h->overwrite_mapping;
 
+	bio->bi_end_io = m->saved_bi_end_io;
+
 	m->err = err;
 	complete_mapping_preparation(m);
 }
@@ -793,9 +795,6 @@ static void inc_remap_and_issue_cell(struct thin_c *tc,
 
 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 {
-	if (m->bio)
-		m->bio->bi_end_io = m->saved_bi_end_io;
-
 	cell_error(m->tc->pool, m->cell);
 	list_del(&m->list);
 	mempool_free(m, m->tc->pool->mapping_pool);
@@ -805,13 +804,9 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 {
 	struct thin_c *tc = m->tc;
 	struct pool *pool = tc->pool;
-	struct bio *bio;
+	struct bio *bio = m->bio;
 	int r;
 
-	bio = m->bio;
-	if (bio)
-		bio->bi_end_io = m->saved_bi_end_io;
-
 	if (m->err) {
 		cell_error(pool, m->cell);
 		goto out;

From f8ae75253e4174089ffe3046715c679183f1919f Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 14 May 2015 11:28:37 -0400
Subject: [PATCH 05/39] dm thin: cleanup schedule_zero() to read more logically

The overwrite has only ever been about optimizing away the need to zero a
block if the entire block was being overwritten.  As such it is only
relevant when zeroing is enabled.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Joe Thornber <ejt@redhat.com>
---
 drivers/md/dm-thin.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index c0fbb6c306b2..c552df7b3420 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -1086,16 +1086,14 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 	 * zeroing pre-existing data, we can issue the bio immediately.
 	 * Otherwise we use kcopyd to zero the data first.
 	 */
-	if (!pool->pf.zero_new_blocks)
+	if (pool->pf.zero_new_blocks) {
+		if (io_overwrites_block(pool, bio))
+			remap_and_issue_overwrite(tc, bio, data_block, m);
+		else
+			ll_zero(tc, m, data_block * pool->sectors_per_block,
+				(data_block + 1) * pool->sectors_per_block);
+	} else
 		process_prepared_mapping(m);
-
-	else if (io_overwrites_block(pool, bio))
-		remap_and_issue_overwrite(tc, bio, data_block, m);
-
-	else
-		ll_zero(tc, m,
-			data_block * pool->sectors_per_block,
-			(data_block + 1) * pool->sectors_per_block);
 }
 
 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,

From 49f154c7327139c275c9392b5fd69cad82f106bc Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 23 Apr 2015 15:06:27 -0400
Subject: [PATCH 06/39] dm thin metadata: remove in-core 'read_only' flag

Leverage the block manager's read_only flag instead of duplicating it;
access with new dm_bm_is_read_only() method.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin-metadata.c                 | 6 +-----
 drivers/md/persistent-data/dm-block-manager.c | 6 ++++++
 drivers/md/persistent-data/dm-block-manager.h | 1 +
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 79f694120ddf..cb6dd055053d 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -184,7 +184,6 @@ struct dm_pool_metadata {
 	uint64_t trans_id;
 	unsigned long flags;
 	sector_t data_block_size;
-	bool read_only:1;
 
 	/*
 	 * Set if a transaction has to be aborted but the attempt to roll back
@@ -836,7 +835,6 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
 	init_rwsem(&pmd->root_lock);
 	pmd->time = 0;
 	INIT_LIST_HEAD(&pmd->thin_devices);
-	pmd->read_only = false;
 	pmd->fail_io = false;
 	pmd->bdev = bdev;
 	pmd->data_block_size = data_block_size;
@@ -880,7 +878,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
 		return -EBUSY;
 	}
 
-	if (!pmd->read_only && !pmd->fail_io) {
+	if (!dm_bm_is_read_only(pmd->bm) && !pmd->fail_io) {
 		r = __commit_transaction(pmd);
 		if (r < 0)
 			DMWARN("%s: __commit_transaction() failed, error = %d",
@@ -1739,7 +1737,6 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_cou
 void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
 {
 	down_write(&pmd->root_lock);
-	pmd->read_only = true;
 	dm_bm_set_read_only(pmd->bm);
 	up_write(&pmd->root_lock);
 }
@@ -1747,7 +1744,6 @@ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
 void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
 {
 	down_write(&pmd->root_lock);
-	pmd->read_only = false;
 	dm_bm_set_read_write(pmd->bm);
 	up_write(&pmd->root_lock);
 }
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 087411c95ffc..4d6c9b689eaa 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -609,6 +609,12 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b)
 	dm_bufio_prefetch(bm->bufio, b, 1);
 }
 
+bool dm_bm_is_read_only(struct dm_block_manager *bm)
+{
+	return bm->read_only;
+}
+EXPORT_SYMBOL_GPL(dm_bm_is_read_only);
+
 void dm_bm_set_read_only(struct dm_block_manager *bm)
 {
 	bm->read_only = true;
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
index 1b95dfc17786..84330f59886d 100644
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -123,6 +123,7 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
  * Additionally you should not use dm_bm_unlock_move, however no error will
  * be returned if you do.
  */
+bool dm_bm_is_read_only(struct dm_block_manager *bm);
 void dm_bm_set_read_only(struct dm_block_manager *bm);
 void dm_bm_set_read_write(struct dm_block_manager *bm);
 

From 0f4106b32f36165a4f40b6aad0372e02ff14cf34 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Wed, 29 Apr 2015 14:03:07 +0200
Subject: [PATCH 07/39] dm raid: fixup documentation for discard support

Remove comment above parse_raid_params() that claims
"devices_handle_discard_safely" is a table line argument when it
actually is a module parameter.

Also, backfill dm-raid target version 1.6.0 documentation.

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Reviewed-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 Documentation/device-mapper/dm-raid.txt | 1 +
 drivers/md/dm-raid.c                    | 2 --
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index ef8ba9fa58c4..85b74b15fa9c 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -224,3 +224,4 @@ Version History
 	New status (STATUSTYPE_INFO) fields: sync_action and mismatch_cnt.
 1.5.1   Add ability to restore transiently failed devices on resume.
 1.5.2   'mismatch_cnt' is zero unless [last_]sync_action is "check".
+1.6.0   Add discard support (and devices_handle_discard_safely module param).
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 88e4c7f24986..423e42e9a1ad 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -477,8 +477,6 @@ static int validate_raid_redundancy(struct raid_set *rs)
  *                                      will form the "stripe"
  *    [[no]sync]			Force or prevent recovery of the
  *                                      entire array
- *    [devices_handle_discard_safely]	Allow discards on RAID4/5/6; useful if RAID
- *					member device(s) properly support TRIM/UNMAP
  *    [rebuild <idx>]			Rebuild the drive indicated by the index
  *    [daemon_sleep <ms>]		Time between bitmap daemon work to
  *                                      clear bits

From c76d53f43ec4f9b9f200f031d303f21bdf6927d0 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Wed, 29 Apr 2015 14:03:00 +0200
Subject: [PATCH 08/39] dm raid: a few cleanups

- ensure maximum device limit in superblock
- rename DMPF_* (print flags) to CTR_FLAG_* (constructor flags)
  and their respective struct raid_set member
- use strcasecmp() in raid10_format_to_md_layout() as in the constructor

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Reviewed-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 91 ++++++++++++++++++++++----------------------
 1 file changed, 46 insertions(+), 45 deletions(-)

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 423e42e9a1ad..af49ddebaa62 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -17,6 +17,7 @@
 #include <linux/device-mapper.h>
 
 #define DM_MSG_PREFIX "raid"
+#define	MAX_RAID_DEVICES	253 /* raid4/5/6 limit */
 
 static bool devices_handle_discard_safely = false;
 
@@ -45,25 +46,25 @@ struct raid_dev {
 };
 
 /*
- * Flags for rs->print_flags field.
+ * Flags for rs->ctr_flags field.
  */
-#define DMPF_SYNC              0x1
-#define DMPF_NOSYNC            0x2
-#define DMPF_REBUILD           0x4
-#define DMPF_DAEMON_SLEEP      0x8
-#define DMPF_MIN_RECOVERY_RATE 0x10
-#define DMPF_MAX_RECOVERY_RATE 0x20
-#define DMPF_MAX_WRITE_BEHIND  0x40
-#define DMPF_STRIPE_CACHE      0x80
-#define DMPF_REGION_SIZE       0x100
-#define DMPF_RAID10_COPIES     0x200
-#define DMPF_RAID10_FORMAT     0x400
+#define CTR_FLAG_SYNC              0x1
+#define CTR_FLAG_NOSYNC            0x2
+#define CTR_FLAG_REBUILD           0x4
+#define CTR_FLAG_DAEMON_SLEEP      0x8
+#define CTR_FLAG_MIN_RECOVERY_RATE 0x10
+#define CTR_FLAG_MAX_RECOVERY_RATE 0x20
+#define CTR_FLAG_MAX_WRITE_BEHIND  0x40
+#define CTR_FLAG_STRIPE_CACHE      0x80
+#define CTR_FLAG_REGION_SIZE       0x100
+#define CTR_FLAG_RAID10_COPIES     0x200
+#define CTR_FLAG_RAID10_FORMAT     0x400
 
 struct raid_set {
 	struct dm_target *ti;
 
 	uint32_t bitmap_loaded;
-	uint32_t print_flags;
+	uint32_t ctr_flags;
 
 	struct mddev md;
 	struct raid_type *raid_type;
@@ -119,15 +120,15 @@ static int raid10_format_to_md_layout(char *format, unsigned copies)
 {
 	unsigned n = 1, f = 1;
 
-	if (!strcmp("near", format))
+	if (!strcasecmp("near", format))
 		n = copies;
 	else
 		f = copies;
 
-	if (!strcmp("offset", format))
+	if (!strcasecmp("offset", format))
 		return 0x30000 | (f << 8) | n;
 
-	if (!strcmp("far", format))
+	if (!strcasecmp("far", format))
 		return 0x20000 | (f << 8) | n;
 
 	return (f << 8) | n;
@@ -553,12 +554,12 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 	for (i = 0; i < num_raid_params; i++) {
 		if (!strcasecmp(argv[i], "nosync")) {
 			rs->md.recovery_cp = MaxSector;
-			rs->print_flags |= DMPF_NOSYNC;
+			rs->ctr_flags |= CTR_FLAG_NOSYNC;
 			continue;
 		}
 		if (!strcasecmp(argv[i], "sync")) {
 			rs->md.recovery_cp = 0;
-			rs->print_flags |= DMPF_SYNC;
+			rs->ctr_flags |= CTR_FLAG_SYNC;
 			continue;
 		}
 
@@ -583,7 +584,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				return -EINVAL;
 			}
 			raid10_format = argv[i];
-			rs->print_flags |= DMPF_RAID10_FORMAT;
+			rs->ctr_flags |= CTR_FLAG_RAID10_FORMAT;
 			continue;
 		}
 
@@ -600,7 +601,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 			}
 			clear_bit(In_sync, &rs->dev[value].rdev.flags);
 			rs->dev[value].rdev.recovery_offset = 0;
-			rs->print_flags |= DMPF_REBUILD;
+			rs->ctr_flags |= CTR_FLAG_REBUILD;
 		} else if (!strcasecmp(key, "write_mostly")) {
 			if (rs->raid_type->level != 1) {
 				rs->ti->error = "write_mostly option is only valid for RAID1";
@@ -616,7 +617,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				rs->ti->error = "max_write_behind option is only valid for RAID1";
 				return -EINVAL;
 			}
-			rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
+			rs->ctr_flags |= CTR_FLAG_MAX_WRITE_BEHIND;
 
 			/*
 			 * In device-mapper, we specify things in sectors, but
@@ -629,14 +630,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 			}
 			rs->md.bitmap_info.max_write_behind = value;
 		} else if (!strcasecmp(key, "daemon_sleep")) {
-			rs->print_flags |= DMPF_DAEMON_SLEEP;
+			rs->ctr_flags |= CTR_FLAG_DAEMON_SLEEP;
 			if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
 				rs->ti->error = "daemon sleep period out of range";
 				return -EINVAL;
 			}
 			rs->md.bitmap_info.daemon_sleep = value;
 		} else if (!strcasecmp(key, "stripe_cache")) {
-			rs->print_flags |= DMPF_STRIPE_CACHE;
+			rs->ctr_flags |= CTR_FLAG_STRIPE_CACHE;
 
 			/*
 			 * In device-mapper, we specify things in sectors, but
@@ -654,21 +655,21 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				return -EINVAL;
 			}
 		} else if (!strcasecmp(key, "min_recovery_rate")) {
-			rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
+			rs->ctr_flags |= CTR_FLAG_MIN_RECOVERY_RATE;
 			if (value > INT_MAX) {
 				rs->ti->error = "min_recovery_rate out of range";
 				return -EINVAL;
 			}
 			rs->md.sync_speed_min = (int)value;
 		} else if (!strcasecmp(key, "max_recovery_rate")) {
-			rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
+			rs->ctr_flags |= CTR_FLAG_MAX_RECOVERY_RATE;
 			if (value > INT_MAX) {
 				rs->ti->error = "max_recovery_rate out of range";
 				return -EINVAL;
 			}
 			rs->md.sync_speed_max = (int)value;
 		} else if (!strcasecmp(key, "region_size")) {
-			rs->print_flags |= DMPF_REGION_SIZE;
+			rs->ctr_flags |= CTR_FLAG_REGION_SIZE;
 			region_size = value;
 		} else if (!strcasecmp(key, "raid10_copies") &&
 			   (rs->raid_type->level == 10)) {
@@ -676,7 +677,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				rs->ti->error = "Bad value for 'raid10_copies'";
 				return -EINVAL;
 			}
-			rs->print_flags |= DMPF_RAID10_COPIES;
+			rs->ctr_flags |= CTR_FLAG_RAID10_COPIES;
 			raid10_copies = value;
 		} else {
 			DMERR("Unable to parse RAID parameter: %s", key);
@@ -945,7 +946,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
 		return -EINVAL;
 	}
 
-	if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)))
+	if (!(rs->ctr_flags & (CTR_FLAG_SYNC | CTR_FLAG_NOSYNC)))
 		mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
 
 	/*
@@ -1071,7 +1072,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 	freshest = NULL;
 	rdev_for_each_safe(rdev, tmp, mddev) {
 		/*
-		 * Skipping super_load due to DMPF_SYNC will cause
+		 * Skipping super_load due to CTR_FLAG_SYNC will cause
 		 * the array to undergo initialization again as
 		 * though it were new.  This is the intended effect
 		 * of the "sync" directive.
@@ -1080,7 +1081,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 		 * that the "sync" directive is disallowed during the
 		 * reshape.
 		 */
-		if (rs->print_flags & DMPF_SYNC)
+		if (rs->ctr_flags & CTR_FLAG_SYNC)
 			continue;
 
 		if (!rdev->meta_bdev)
@@ -1241,7 +1242,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	}
 
 	if ((kstrtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
-	    (num_raid_devs >= INT_MAX)) {
+	    (num_raid_devs > MAX_RAID_DEVICES)) {
 		ti->error = "Cannot understand number of raid devices";
 		return -EINVAL;
 	}
@@ -1444,7 +1445,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 	case STATUSTYPE_TABLE:
 		/* The string you would use to construct this array */
 		for (i = 0; i < rs->md.raid_disks; i++) {
-			if ((rs->print_flags & DMPF_REBUILD) &&
+			if ((rs->ctr_flags & CTR_FLAG_REBUILD) &&
 			    rs->dev[i].data_dev &&
 			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
 				raid_param_cnt += 2; /* for rebuilds */
@@ -1453,33 +1454,33 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 				raid_param_cnt += 2;
 		}
 
-		raid_param_cnt += (hweight32(rs->print_flags & ~DMPF_REBUILD) * 2);
-		if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
+		raid_param_cnt += (hweight32(rs->ctr_flags & ~CTR_FLAG_REBUILD) * 2);
+		if (rs->ctr_flags & (CTR_FLAG_SYNC | CTR_FLAG_NOSYNC))
 			raid_param_cnt--;
 
 		DMEMIT("%s %u %u", rs->raid_type->name,
 		       raid_param_cnt, rs->md.chunk_sectors);
 
-		if ((rs->print_flags & DMPF_SYNC) &&
+		if ((rs->ctr_flags & CTR_FLAG_SYNC) &&
 		    (rs->md.recovery_cp == MaxSector))
 			DMEMIT(" sync");
-		if (rs->print_flags & DMPF_NOSYNC)
+		if (rs->ctr_flags & CTR_FLAG_NOSYNC)
 			DMEMIT(" nosync");
 
 		for (i = 0; i < rs->md.raid_disks; i++)
-			if ((rs->print_flags & DMPF_REBUILD) &&
+			if ((rs->ctr_flags & CTR_FLAG_REBUILD) &&
 			    rs->dev[i].data_dev &&
 			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
 				DMEMIT(" rebuild %u", i);
 
-		if (rs->print_flags & DMPF_DAEMON_SLEEP)
+		if (rs->ctr_flags & CTR_FLAG_DAEMON_SLEEP)
 			DMEMIT(" daemon_sleep %lu",
 			       rs->md.bitmap_info.daemon_sleep);
 
-		if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
+		if (rs->ctr_flags & CTR_FLAG_MIN_RECOVERY_RATE)
 			DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);
 
-		if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
+		if (rs->ctr_flags & CTR_FLAG_MAX_RECOVERY_RATE)
 			DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
 
 		for (i = 0; i < rs->md.raid_disks; i++)
@@ -1487,11 +1488,11 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 			    test_bit(WriteMostly, &rs->dev[i].rdev.flags))
 				DMEMIT(" write_mostly %u", i);
 
-		if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
+		if (rs->ctr_flags & CTR_FLAG_MAX_WRITE_BEHIND)
 			DMEMIT(" max_write_behind %lu",
 			       rs->md.bitmap_info.max_write_behind);
 
-		if (rs->print_flags & DMPF_STRIPE_CACHE) {
+		if (rs->ctr_flags & CTR_FLAG_STRIPE_CACHE) {
 			struct r5conf *conf = rs->md.private;
 
 			/* convert from kiB to sectors */
@@ -1499,15 +1500,15 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 			       conf ? conf->max_nr_stripes * 2 : 0);
 		}
 
-		if (rs->print_flags & DMPF_REGION_SIZE)
+		if (rs->ctr_flags & CTR_FLAG_REGION_SIZE)
 			DMEMIT(" region_size %lu",
 			       rs->md.bitmap_info.chunksize >> 9);
 
-		if (rs->print_flags & DMPF_RAID10_COPIES)
+		if (rs->ctr_flags & CTR_FLAG_RAID10_COPIES)
 			DMEMIT(" raid10_copies %u",
 			       raid10_md_layout_to_copies(rs->md.layout));
 
-		if (rs->print_flags & DMPF_RAID10_FORMAT)
+		if (rs->ctr_flags & CTR_FLAG_RAID10_FORMAT)
 			DMEMIT(" raid10_format %s",
 			       raid10_md_layout_to_format(rs->md.layout));
 

From 0cf4503174c12025ac7ea61048cb7c1d4d1ed85c Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Wed, 29 Apr 2015 14:03:04 +0200
Subject: [PATCH 09/39] dm raid: add support for the MD RAID0 personality

Add dm-raid access to the MD RAID0 personality to enable single zone
striping.

The following changes enable that access:
- add type definition to raid_types array
- make bitmap creation conditional in super_validate(), because
  bitmaps are not allowed in raid0
- set rdev->sectors to the data image size in super_validate()
  to allow the raid0 personality to calculate the MD array
  size properly
- use mddev_(un)lock() functions instead of direct mutex_(un)lock()
  (wrapped in here because it's a trivial change)
- enhance raid_status() to always report full sync for raid0
  so that userspace checks for 100% sync will succeed and allow
  for resize (and takeover/reshape once added in future patches)
- enhance raid_resume() to not load bitmap in case of raid0
- add merge function to avoid data corruption (seen with readahead)
  that resulted from bio payloads that grew too large.  This problem
  did not occur with the other raid levels because it either did not
  apply without striping (raid1) or was avoided via stripe caching.
- raise version to 1.7.0 because of the raid0 API change

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Reviewed-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 Documentation/device-mapper/dm-raid.txt |   1 +
 drivers/md/dm-raid.c                    | 130 +++++++++++++++---------
 2 files changed, 84 insertions(+), 47 deletions(-)

diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index 85b74b15fa9c..cb12af3b51c2 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -225,3 +225,4 @@ Version History
 1.5.1   Add ability to restore transiently failed devices on resume.
 1.5.2   'mismatch_cnt' is zero unless [last_]sync_action is "check".
 1.6.0   Add discard support (and devices_handle_discard_safely module param).
+1.7.0   Add support for MD RAID0 mappings.
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index af49ddebaa62..2daa67793511 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2010-2011 Neil Brown
- * Copyright (C) 2010-2014 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2010-2015 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the GPL.
  */
@@ -82,6 +82,7 @@ static struct raid_type {
 	const unsigned level;		/* RAID level. */
 	const unsigned algorithm;	/* RAID algorithm. */
 } raid_types[] = {
+	{"raid0",    "RAID0 (striping)",                0, 2, 0, 0 /* NONE */},
 	{"raid1",    "RAID1 (mirroring)",               0, 2, 1, 0 /* NONE */},
 	{"raid10",   "RAID10 (striped mirrors)",        0, 2, 10, UINT_MAX /* Varies */},
 	{"raid4",    "RAID4 (dedicated parity disk)",	1, 2, 5, ALGORITHM_PARITY_0},
@@ -719,7 +720,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 		rs->md.layout = raid10_format_to_md_layout(raid10_format,
 							   raid10_copies);
 		rs->md.new_layout = rs->md.layout;
-	} else if ((rs->raid_type->level > 1) &&
+	} else if ((!rs->raid_type->level || rs->raid_type->level > 1) &&
 		   sector_div(sectors_per_dev,
 			      (rs->md.raid_disks - rs->raid_type->parity_devs))) {
 		rs->ti->error = "Target length not divisible by number of data devices";
@@ -1025,8 +1026,9 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
 	return 0;
 }
 
-static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
+static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
 {
+	struct mddev *mddev = &rs->md;
 	struct dm_raid_superblock *sb = page_address(rdev->sb_page);
 
 	/*
@@ -1036,8 +1038,10 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
 	if (!mddev->events && super_init_validation(mddev, rdev))
 		return -EINVAL;
 
-	mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
-	rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
+	/* Enable bitmap creation for RAID levels != 0 */
+	mddev->bitmap_info.offset = (rs->raid_type->level) ? to_sector(4096) : 0;
+	rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
+
 	if (!test_bit(FirstUse, &rdev->flags)) {
 		rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
 		if (rdev->recovery_offset != MaxSector)
@@ -1081,6 +1085,8 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 		 * that the "sync" directive is disallowed during the
 		 * reshape.
 		 */
+		rdev->sectors = to_sector(i_size_read(rdev->bdev->bd_inode));
+
 		if (rs->ctr_flags & CTR_FLAG_SYNC)
 			continue;
 
@@ -1139,11 +1145,11 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 	 * validation for the remaining devices.
 	 */
 	ti->error = "Unable to assemble array: Invalid superblocks";
-	if (super_validate(mddev, freshest))
+	if (super_validate(rs, freshest))
 		return -EINVAL;
 
 	rdev_for_each(rdev, mddev)
-		if ((rdev != freshest) && super_validate(mddev, rdev))
+		if ((rdev != freshest) && super_validate(rs, rdev))
 			return -EINVAL;
 
 	return 0;
@@ -1281,10 +1287,11 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	 */
 	configure_discard_support(ti, rs);
 
-	mutex_lock(&rs->md.reconfig_mutex);
+	/* Has to be held on running the array */
+	mddev_lock_nointr(&rs->md);
 	ret = md_run(&rs->md);
 	rs->md.in_sync = 0; /* Assume already marked dirty */
-	mutex_unlock(&rs->md.reconfig_mutex);
+	mddev_unlock(&rs->md);
 
 	if (ret) {
 		ti->error = "Fail to run raid array";
@@ -1367,34 +1374,40 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 	case STATUSTYPE_INFO:
 		DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
 
-		if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
-			sync = rs->md.curr_resync_completed;
-		else
-			sync = rs->md.recovery_cp;
+		if (rs->raid_type->level) {
+			if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
+				sync = rs->md.curr_resync_completed;
+			else
+				sync = rs->md.recovery_cp;
 
-		if (sync >= rs->md.resync_max_sectors) {
-			/*
-			 * Sync complete.
-			 */
+			if (sync >= rs->md.resync_max_sectors) {
+				/*
+				 * Sync complete.
+				 */
+				array_in_sync = 1;
+				sync = rs->md.resync_max_sectors;
+			} else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) {
+				/*
+				 * If "check" or "repair" is occurring, the array has
+				 * undergone and initial sync and the health characters
+				 * should not be 'a' anymore.
+				 */
+				array_in_sync = 1;
+			} else {
+				/*
+				 * The array may be doing an initial sync, or it may
+				 * be rebuilding individual components.  If all the
+				 * devices are In_sync, then it is the array that is
+				 * being initialized.
+				 */
+				for (i = 0; i < rs->md.raid_disks; i++)
+					if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
+						array_in_sync = 1;
+			}
+		} else {
+			/* RAID0 */
 			array_in_sync = 1;
 			sync = rs->md.resync_max_sectors;
-		} else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) {
-			/*
-			 * If "check" or "repair" is occurring, the array has
-			 * undergone and initial sync and the health characters
-			 * should not be 'a' anymore.
-			 */
-			array_in_sync = 1;
-		} else {
-			/*
-			 * The array may be doing an initial sync, or it may
-			 * be rebuilding individual components.  If all the
-			 * devices are In_sync, then it is the array that is
-			 * being initialized.
-			 */
-			for (i = 0; i < rs->md.raid_disks; i++)
-				if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
-					array_in_sync = 1;
 		}
 
 		/*
@@ -1683,26 +1696,48 @@ static void raid_resume(struct dm_target *ti)
 {
 	struct raid_set *rs = ti->private;
 
-	set_bit(MD_CHANGE_DEVS, &rs->md.flags);
-	if (!rs->bitmap_loaded) {
-		bitmap_load(&rs->md);
-		rs->bitmap_loaded = 1;
-	} else {
-		/*
-		 * A secondary resume while the device is active.
-		 * Take this opportunity to check whether any failed
-		 * devices are reachable again.
-		 */
-		attempt_restore_of_faulty_devices(rs);
+	if (rs->raid_type->level) {
+		set_bit(MD_CHANGE_DEVS, &rs->md.flags);
+
+		if (!rs->bitmap_loaded) {
+			bitmap_load(&rs->md);
+			rs->bitmap_loaded = 1;
+		} else {
+			/*
+			 * A secondary resume while the device is active.
+			 * Take this opportunity to check whether any failed
+			 * devices are reachable again.
+			 */
+			attempt_restore_of_faulty_devices(rs);
+		}
+
+		clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
 	}
 
-	clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
 	mddev_resume(&rs->md);
 }
 
+static int raid_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+		      struct bio_vec *biovec, int max_size)
+{
+	struct raid_set *rs = ti->private;
+	struct md_personality *pers = rs->md.pers;
+
+	if (pers && pers->mergeable_bvec)
+		return min(max_size, pers->mergeable_bvec(&rs->md, bvm, biovec));
+
+	/*
+	 * In case we can't request the personality because
+	 * the raid set is not running yet
+	 *
+	 * -> return safe minimum
+	 */
+	return rs->md.chunk_sectors;
+}
+
 static struct target_type raid_target = {
 	.name = "raid",
-	.version = {1, 6, 0},
+	.version = {1, 7, 0},
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,
@@ -1714,6 +1749,7 @@ static struct target_type raid_target = {
 	.presuspend = raid_presuspend,
 	.postsuspend = raid_postsuspend,
 	.resume = raid_resume,
+	.merge = raid_merge,
 };
 
 static int __init dm_raid_init(void)

From e223e1de4f8a586662c4917f6f673126574960dd Mon Sep 17 00:00:00 2001
From: Luis Henriques <luis.henriques@canonical.com>
Date: Mon, 27 Apr 2015 21:29:36 +0100
Subject: [PATCH 10/39] dm stripe: drop useless exit point from
 dm_stripe_init()

Signed-off-by: Luis Henriques <luis.henriques@canonical.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-stripe.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index f8b37d4c05d8..a672a1502c14 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -451,10 +451,8 @@ int __init dm_stripe_init(void)
 	int r;
 
 	r = dm_register_target(&stripe_target);
-	if (r < 0) {
+	if (r < 0)
 		DMWARN("target registration failed");
-		return r;
-	}
 
 	return r;
 }

From f4ad317aedf836e7bb67108a119bfec7f3d97ef1 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Sun, 19 Apr 2015 00:07:30 +0200
Subject: [PATCH 11/39] dm log writes: use ULL suffix for 64-bit constants
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On 32-bit:
drivers/md/dm-log-writes.c: In function ‘log_super’:
drivers/md/dm-log-writes.c:323: warning: integer constant is too large for ‘long’ type

Add a ULL suffix to WRITE_LOG_MAGIC to fix this.
Also add a ULL suffix to WRITE_LOG_VERSION as it's stored in a __le64
field.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-log-writes.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 93e08446a87d..ad1b049ae2ab 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -55,8 +55,8 @@
 #define LOG_DISCARD_FLAG (1 << 2)
 #define LOG_MARK_FLAG (1 << 3)
 
-#define WRITE_LOG_VERSION 1
-#define WRITE_LOG_MAGIC 0x6a736677736872
+#define WRITE_LOG_VERSION 1ULL
+#define WRITE_LOG_MAGIC 0x6a736677736872ULL
 
 /*
  * The disk format for this is braindead simple.

From ed63287dd670f8e9d2412a913de7fdc50a689831 Mon Sep 17 00:00:00 2001
From: Lidong Zhong <lzhong@suse.com>
Date: Wed, 13 May 2015 14:04:10 +0800
Subject: [PATCH 12/39] dm raid1: keep issuing IO after leg failure

Currently if there is a leg failure, the bio will be put into the hold
list until userspace does a remove/replace on the leg.  Doing so in a
cluster config (clvmd) is problematic because there may be a temporary
path failure that results in cluster raid1 remove/replace.  Such
recovery takes a long time due to a full resync.

Update dm-raid1 to optionally ignore these failures so bios continue
being issued without interruption.  To enable this feature userspace
must pass "keep_log" when creating the dm-raid1 device.

Signed-off-by: Lidong Zhong <lzhong@suse.com>
Tested-by: Liuhua Wang <lwang@suse.com>
Acked-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid1.c | 75 +++++++++++++++++++++++++++++++++----------
 1 file changed, 58 insertions(+), 17 deletions(-)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 743fa9bbae9e..d83696bf403b 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -23,8 +23,10 @@
 
 #define MAX_RECOVERY 1	/* Maximum number of regions recovered in parallel. */
 
-#define DM_RAID1_HANDLE_ERRORS 0x01
+#define DM_RAID1_HANDLE_ERRORS	0x01
+#define DM_RAID1_KEEP_LOG	0x02
 #define errors_handled(p)	((p)->features & DM_RAID1_HANDLE_ERRORS)
+#define keep_log(p)		((p)->features & DM_RAID1_KEEP_LOG)
 
 static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
 
@@ -229,7 +231,7 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
 	if (m != get_default_mirror(ms))
 		goto out;
 
-	if (!ms->in_sync) {
+	if (!ms->in_sync && !keep_log(ms)) {
 		/*
 		 * Better to issue requests to same failing device
 		 * than to risk returning corrupt data.
@@ -370,6 +372,17 @@ static int recover(struct mirror_set *ms, struct dm_region *reg)
 	return r;
 }
 
+static void reset_ms_flags(struct mirror_set *ms)
+{
+	unsigned int m;
+
+	ms->leg_failure = 0;
+	for (m = 0; m < ms->nr_mirrors; m++) {
+		atomic_set(&(ms->mirror[m].error_count), 0);
+		ms->mirror[m].error_type = 0;
+	}
+}
+
 static void do_recovery(struct mirror_set *ms)
 {
 	struct dm_region *reg;
@@ -398,6 +411,7 @@ static void do_recovery(struct mirror_set *ms)
 		/* the sync is complete */
 		dm_table_event(ms->ti->table);
 		ms->in_sync = 1;
+		reset_ms_flags(ms);
 	}
 }
 
@@ -759,7 +773,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 		dm_rh_delay(ms->rh, bio);
 
 	while ((bio = bio_list_pop(&nosync))) {
-		if (unlikely(ms->leg_failure) && errors_handled(ms)) {
+		if (unlikely(ms->leg_failure) && errors_handled(ms) && !keep_log(ms)) {
 			spin_lock_irq(&ms->lock);
 			bio_list_add(&ms->failures, bio);
 			spin_unlock_irq(&ms->lock);
@@ -803,15 +817,21 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
 
 		/*
 		 * If all the legs are dead, fail the I/O.
-		 * If we have been told to handle errors, hold the bio
-		 * and wait for userspace to deal with the problem.
+		 * If the device has failed and keep_log is enabled,
+		 * fail the I/O.
+		 *
+		 * If we have been told to handle errors, and keep_log
+		 * isn't enabled, hold the bio and wait for userspace to
+		 * deal with the problem.
+		 *
 		 * Otherwise pretend that the I/O succeeded. (This would
 		 * be wrong if the failed leg returned after reboot and
 		 * got replicated back to the good legs.)
 		 */
-		if (!get_valid_mirror(ms))
+
+		if (unlikely(!get_valid_mirror(ms) || (keep_log(ms) && ms->log_failure)))
 			bio_endio(bio, -EIO);
-		else if (errors_handled(ms))
+		else if (errors_handled(ms) && !keep_log(ms))
 			hold_bio(ms, bio);
 		else
 			bio_endio(bio, 0);
@@ -987,6 +1007,7 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
 	unsigned num_features;
 	struct dm_target *ti = ms->ti;
 	char dummy;
+	int i;
 
 	*args_used = 0;
 
@@ -1007,15 +1028,25 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
 		return -EINVAL;
 	}
 
-	if (!strcmp("handle_errors", argv[0]))
-		ms->features |= DM_RAID1_HANDLE_ERRORS;
-	else {
-		ti->error = "Unrecognised feature requested";
+	for (i = 0; i < num_features; i++) {
+		if (!strcmp("handle_errors", argv[0]))
+			ms->features |= DM_RAID1_HANDLE_ERRORS;
+		else if (!strcmp("keep_log", argv[0]))
+			ms->features |= DM_RAID1_KEEP_LOG;
+		else {
+			ti->error = "Unrecognised feature requested";
+			return -EINVAL;
+		}
+
+		argc--;
+		argv++;
+		(*args_used)++;
+	}
+	if (!errors_handled(ms) && keep_log(ms)) {
+		ti->error = "keep_log feature requires the handle_errors feature";
 		return -EINVAL;
 	}
 
-	(*args_used)++;
-
 	return 0;
 }
 
@@ -1029,7 +1060,7 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
  * log_type is "core" or "disk"
  * #log_params is between 1 and 3
  *
- * If present, features must be "handle_errors".
+ * If present, supported features are "handle_errors" and "keep_log".
  */
 static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
@@ -1363,6 +1394,7 @@ static void mirror_status(struct dm_target *ti, status_type_t type,
 			  unsigned status_flags, char *result, unsigned maxlen)
 {
 	unsigned int m, sz = 0;
+	int num_feature_args = 0;
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
 	struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
 	char buffer[ms->nr_mirrors + 1];
@@ -1392,8 +1424,17 @@ static void mirror_status(struct dm_target *ti, status_type_t type,
 			DMEMIT(" %s %llu", ms->mirror[m].dev->name,
 			       (unsigned long long)ms->mirror[m].offset);
 
-		if (ms->features & DM_RAID1_HANDLE_ERRORS)
-			DMEMIT(" 1 handle_errors");
+		num_feature_args += !!errors_handled(ms);
+		num_feature_args += !!keep_log(ms);
+		if (num_feature_args) {
+			DMEMIT(" %d", num_feature_args);
+			if (errors_handled(ms))
+				DMEMIT(" handle_errors");
+			if (keep_log(ms))
+				DMEMIT(" keep_log");
+		}
+
+		break;
 	}
 }
 
@@ -1413,7 +1454,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
 
 static struct target_type mirror_target = {
 	.name	 = "mirror",
-	.version = {1, 13, 2},
+	.version = {1, 14, 0},
 	.module	 = THIS_MODULE,
 	.ctr	 = mirror_ctr,
 	.dtr	 = mirror_dtr,

From 54cea3f6681ad9360814e2926d1f723bbd0f74ed Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 15 May 2015 17:00:25 +0200
Subject: [PATCH 13/39] dm crypt: add comments to better describe crypto
 processing logic

A crypto driver can process requests synchronously or asynchronously
and can use an internal driver queue to backlog requests.
Add some comments to clarify internal logic and completion return codes.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 5503e43e5f28..0f48fed44a17 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (C) 2003 Jana Saout <jana@saout.de>
  * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
- * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2006-2015 Red Hat, Inc. All rights reserved.
  * Copyright (C) 2013 Milan Broz <gmazyland@gmail.com>
  *
  * This file is released under the GPL.
@@ -891,6 +891,11 @@ static void crypt_alloc_req(struct crypt_config *cc,
 		ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO);
 
 	ablkcipher_request_set_tfm(ctx->req, cc->tfms[key_index]);
+
+	/*
+	 * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs
+	 * requests if driver request queue is full.
+	 */
 	ablkcipher_request_set_callback(ctx->req,
 	    CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
 	    kcryptd_async_done, dmreq_of_req(cc, ctx->req));
@@ -924,24 +929,32 @@ static int crypt_convert(struct crypt_config *cc,
 		r = crypt_convert_block(cc, ctx, ctx->req);
 
 		switch (r) {
-		/* async */
+		/*
+		 * The request was queued by a crypto driver
+		 * but the driver request queue is full, let's wait.
+		 */
 		case -EBUSY:
 			wait_for_completion(&ctx->restart);
 			reinit_completion(&ctx->restart);
-			/* fall through*/
+			/* fall through */
+		/*
+		 * The request is queued and processed asynchronously,
+		 * completion function kcryptd_async_done() will be called.
+		 */
 		case -EINPROGRESS:
 			ctx->req = NULL;
 			ctx->cc_sector++;
 			continue;
-
-		/* sync */
+		/*
+		 * The request was already processed (synchronously).
+		 */
 		case 0:
 			atomic_dec(&ctx->cc_pending);
 			ctx->cc_sector++;
 			cond_resched();
 			continue;
 
-		/* error */
+		/* There was an error while processing the request. */
 		default:
 			atomic_dec(&ctx->cc_pending);
 			return r;
@@ -1346,6 +1359,11 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 	struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
 	struct crypt_config *cc = io->cc;
 
+	/*
+	 * A request from crypto driver backlog is going to be processed now,
+	 * finish the completion and continue in crypt_convert().
+	 * (Callback will be called for the second time for this request.)
+	 */
 	if (error == -EINPROGRESS) {
 		complete(&ctx->restart);
 		return;

From fb4100ae7f312c3d614b37621c2b17b3b7cf65f8 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Wed, 20 May 2015 10:30:32 +0100
Subject: [PATCH 14/39] dm cache: fix race when issuing a POLICY_REPLACE
 operation

There is a race between a policy deciding to replace a cache entry,
the core target writing back any dirty data from this block, and other
IO threads doing IO to the same block.

This sort of problem is avoided most of the time by the core target
grabbing a bio prison cell before making the request to the policy.
But for a demotion the core target doesn't know which block will be
demoted, so can't do this in advance.

Fix this demotion race by introducing a callback to the policy interface
that allows the policy to grab the cell on behalf of the core target.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
---
 drivers/md/dm-cache-policy-cleaner.c  |  3 +-
 drivers/md/dm-cache-policy-internal.h |  5 ++-
 drivers/md/dm-cache-policy-mq.c       | 41 +++++++++++++------
 drivers/md/dm-cache-policy.h          | 15 ++++++-
 drivers/md/dm-cache-target.c          | 58 ++++++++++++++++++---------
 5 files changed, 85 insertions(+), 37 deletions(-)

diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c
index b04d1f904d07..004e463c9423 100644
--- a/drivers/md/dm-cache-policy-cleaner.c
+++ b/drivers/md/dm-cache-policy-cleaner.c
@@ -171,7 +171,8 @@ static void remove_cache_hash_entry(struct wb_cache_entry *e)
 /* Public interface (see dm-cache-policy.h */
 static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock,
 		  bool can_block, bool can_migrate, bool discarded_oblock,
-		  struct bio *bio, struct policy_result *result)
+		  struct bio *bio, struct policy_locker *locker,
+		  struct policy_result *result)
 {
 	struct policy *p = to_policy(pe);
 	struct wb_cache_entry *e;
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index 2256a1f24f73..c198e6defb9c 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -16,9 +16,10 @@
  */
 static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock,
 			     bool can_block, bool can_migrate, bool discarded_oblock,
-			     struct bio *bio, struct policy_result *result)
+			     struct bio *bio, struct policy_locker *locker,
+			     struct policy_result *result)
 {
-	return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, result);
+	return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, locker, result);
 }
 
 static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 3ddd1162334d..515d44bf24d3 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -693,9 +693,10 @@ static void requeue(struct mq_policy *mq, struct entry *e)
  * - set the hit count to a hard coded value other than 1, eg, is it better
  *   if it goes in at level 2?
  */
-static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
+static int demote_cblock(struct mq_policy *mq,
+			 struct policy_locker *locker, dm_oblock_t *oblock)
 {
-	struct entry *demoted = pop(mq, &mq->cache_clean);
+	struct entry *demoted = peek(&mq->cache_clean);
 
 	if (!demoted)
 		/*
@@ -707,6 +708,13 @@ static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
 		 */
 		return -ENOSPC;
 
+	if (locker->fn(locker, demoted->oblock))
+		/*
+		 * We couldn't lock the demoted block.
+		 */
+		return -EBUSY;
+
+	del(mq, demoted);
 	*oblock = demoted->oblock;
 	free_entry(&mq->cache_pool, demoted);
 
@@ -795,6 +803,7 @@ static int cache_entry_found(struct mq_policy *mq,
  * finding which cache block to use.
  */
 static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
+			      struct policy_locker *locker,
 			      struct policy_result *result)
 {
 	int r;
@@ -803,11 +812,12 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
 	/* Ensure there's a free cblock in the cache */
 	if (epool_empty(&mq->cache_pool)) {
 		result->op = POLICY_REPLACE;
-		r = demote_cblock(mq, &result->old_oblock);
+		r = demote_cblock(mq, locker, &result->old_oblock);
 		if (r) {
 			result->op = POLICY_MISS;
 			return 0;
 		}
+
 	} else
 		result->op = POLICY_NEW;
 
@@ -829,7 +839,8 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
 
 static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
 				 bool can_migrate, bool discarded_oblock,
-				 int data_dir, struct policy_result *result)
+				 int data_dir, struct policy_locker *locker,
+				 struct policy_result *result)
 {
 	int r = 0;
 
@@ -842,7 +853,7 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
 
 	else {
 		requeue(mq, e);
-		r = pre_cache_to_cache(mq, e, result);
+		r = pre_cache_to_cache(mq, e, locker, result);
 	}
 
 	return r;
@@ -872,6 +883,7 @@ static void insert_in_pre_cache(struct mq_policy *mq,
 }
 
 static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
+			    struct policy_locker *locker,
 			    struct policy_result *result)
 {
 	int r;
@@ -879,7 +891,7 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
 
 	if (epool_empty(&mq->cache_pool)) {
 		result->op = POLICY_REPLACE;
-		r = demote_cblock(mq, &result->old_oblock);
+		r = demote_cblock(mq, locker, &result->old_oblock);
 		if (unlikely(r)) {
 			result->op = POLICY_MISS;
 			insert_in_pre_cache(mq, oblock);
@@ -907,11 +919,12 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
 
 static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
 			  bool can_migrate, bool discarded_oblock,
-			  int data_dir, struct policy_result *result)
+			  int data_dir, struct policy_locker *locker,
+			  struct policy_result *result)
 {
 	if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) {
 		if (can_migrate)
-			insert_in_cache(mq, oblock, result);
+			insert_in_cache(mq, oblock, locker, result);
 		else
 			return -EWOULDBLOCK;
 	} else {
@@ -928,7 +941,8 @@ static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
  */
 static int map(struct mq_policy *mq, dm_oblock_t oblock,
 	       bool can_migrate, bool discarded_oblock,
-	       int data_dir, struct policy_result *result)
+	       int data_dir, struct policy_locker *locker,
+	       struct policy_result *result)
 {
 	int r = 0;
 	struct entry *e = hash_lookup(mq, oblock);
@@ -942,11 +956,11 @@ static int map(struct mq_policy *mq, dm_oblock_t oblock,
 
 	else if (e)
 		r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
-					  data_dir, result);
+					  data_dir, locker, result);
 
 	else
 		r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
-				   data_dir, result);
+				   data_dir, locker, result);
 
 	if (r == -EWOULDBLOCK)
 		result->op = POLICY_MISS;
@@ -1012,7 +1026,8 @@ static void copy_tick(struct mq_policy *mq)
 
 static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
 		  bool can_block, bool can_migrate, bool discarded_oblock,
-		  struct bio *bio, struct policy_result *result)
+		  struct bio *bio, struct policy_locker *locker,
+		  struct policy_result *result)
 {
 	int r;
 	struct mq_policy *mq = to_mq_policy(p);
@@ -1028,7 +1043,7 @@ static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
 
 	iot_examine_bio(&mq->tracker, bio);
 	r = map(mq, oblock, can_migrate, discarded_oblock,
-		bio_data_dir(bio), result);
+		bio_data_dir(bio), locker, result);
 
 	mutex_unlock(&mq->lock);
 
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index f50fe360c546..5524e21e4836 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -69,6 +69,18 @@ enum policy_operation {
 	POLICY_REPLACE
 };
 
+/*
+ * When issuing a POLICY_REPLACE the policy needs to make a callback to
+ * lock the block being demoted.  This doesn't need to occur during a
+ * writeback operation since the block remains in the cache.
+ */
+struct policy_locker;
+typedef int (*policy_lock_fn)(struct policy_locker *l, dm_oblock_t oblock);
+
+struct policy_locker {
+	policy_lock_fn fn;
+};
+
 /*
  * This is the instruction passed back to the core target.
  */
@@ -122,7 +134,8 @@ struct dm_cache_policy {
 	 */
 	int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock,
 		   bool can_block, bool can_migrate, bool discarded_oblock,
-		   struct bio *bio, struct policy_result *result);
+		   struct bio *bio, struct policy_locker *locker,
+		   struct policy_result *result);
 
 	/*
 	 * Sometimes we want to see if a block is in the cache, without
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 41b2594a80c6..d5982480630b 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -1439,16 +1439,43 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
 		   &cache->stats.read_miss : &cache->stats.write_miss);
 }
 
+/*----------------------------------------------------------------*/
+
+struct old_oblock_lock {
+	struct policy_locker locker;
+	struct cache *cache;
+	struct prealloc *structs;
+	struct dm_bio_prison_cell *cell;
+};
+
+static int null_locker(struct policy_locker *locker, dm_oblock_t b)
+{
+	/* This should never be called */
+	BUG();
+	return 0;
+}
+
+static int cell_locker(struct policy_locker *locker, dm_oblock_t b)
+{
+	struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker);
+	struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs);
+
+	return bio_detain(l->cache, b, NULL, cell_prealloc,
+			  (cell_free_fn) prealloc_put_cell,
+			  l->structs, &l->cell);
+}
+
 static void process_bio(struct cache *cache, struct prealloc *structs,
 			struct bio *bio)
 {
 	int r;
 	bool release_cell = true;
 	dm_oblock_t block = get_bio_block(cache, bio);
-	struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
+	struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
 	struct policy_result lookup_result;
 	bool passthrough = passthrough_mode(&cache->features);
 	bool discarded_block, can_migrate;
+	struct old_oblock_lock ool;
 
 	/*
 	 * Check to see if that block is currently migrating.
@@ -1463,8 +1490,12 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
 	discarded_block = is_discarded_oblock(cache, block);
 	can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
 
+	ool.locker.fn = cell_locker;
+	ool.cache = cache;
+	ool.structs = structs;
+	ool.cell = NULL;
 	r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
-		       bio, &lookup_result);
+		       bio, &ool.locker, &lookup_result);
 
 	if (r == -EWOULDBLOCK)
 		/* migration has been denied */
@@ -1521,27 +1552,11 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
 		break;
 
 	case POLICY_REPLACE:
-		cell_prealloc = prealloc_get_cell(structs);
-		r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
-			       (cell_free_fn) prealloc_put_cell,
-			       structs, &old_ocell);
-		if (r > 0) {
-			/*
-			 * We have to be careful to avoid lock inversion of
-			 * the cells.  So we back off, and wait for the
-			 * old_ocell to become free.
-			 */
-			policy_force_mapping(cache->policy, block,
-					     lookup_result.old_oblock);
-			atomic_inc(&cache->stats.cache_cell_clash);
-			break;
-		}
 		atomic_inc(&cache->stats.demotion);
 		atomic_inc(&cache->stats.promotion);
-
 		demote_then_promote(cache, structs, lookup_result.old_oblock,
 				    block, lookup_result.cblock,
-				    old_ocell, new_ocell);
+				    ool.cell, new_ocell);
 		release_cell = false;
 		break;
 
@@ -2589,6 +2604,9 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
 	bool discarded_block;
 	struct policy_result lookup_result;
 	struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
+	struct old_oblock_lock ool;
+
+	ool.locker.fn = null_locker;
 
 	if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
 		/*
@@ -2627,7 +2645,7 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
 	discarded_block = is_discarded_oblock(cache, block);
 
 	r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
-		       bio, &lookup_result);
+		       bio, &ool.locker, &lookup_result);
 	if (r == -EWOULDBLOCK) {
 		cell_defer(cache, *cell, true);
 		return DM_MAPIO_SUBMITTED;

From 77289d32073c4eac57fcca2abe6caefc6f3dc7d6 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 15 May 2015 13:45:30 +0100
Subject: [PATCH 15/39] dm cache: add io_tracker

A little class that keeps track of the volume of io that is in flight,
and the length of time that a device has been idle for.

FIXME: rather than jiffies, may be best to use ktime_t (to support faster
devices).

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-target.c | 73 ++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index d5982480630b..6f9bdd1bf7c4 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -25,6 +25,79 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
 
 /*----------------------------------------------------------------*/
 
+#define IOT_RESOLUTION 4
+
+struct io_tracker {
+	spinlock_t lock;
+
+	/*
+	 * Sectors of in-flight IO.
+	 */
+	sector_t in_flight;
+
+	/*
+	 * The time, in jiffies, when this device became idle (if it is
+	 * indeed idle).
+	 */
+	unsigned long idle_time;
+	unsigned long last_update_time;
+};
+
+static void iot_init(struct io_tracker *iot)
+{
+	spin_lock_init(&iot->lock);
+	iot->in_flight = 0ul;
+	iot->idle_time = 0ul;
+	iot->last_update_time = jiffies;
+}
+
+static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
+{
+	if (iot->in_flight)
+		return false;
+
+	return time_after(jiffies, iot->idle_time + jifs);
+}
+
+static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
+{
+	bool r;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iot->lock, flags);
+	r = __iot_idle_for(iot, jifs);
+	spin_unlock_irqrestore(&iot->lock, flags);
+
+	return r;
+}
+
+static void iot_io_begin(struct io_tracker *iot, sector_t len)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&iot->lock, flags);
+	iot->in_flight += len;
+	spin_unlock_irqrestore(&iot->lock, flags);
+}
+
+static void __iot_io_end(struct io_tracker *iot, sector_t len)
+{
+	iot->in_flight -= len;
+	if (!iot->in_flight)
+		iot->idle_time = jiffies;
+}
+
+static void iot_io_end(struct io_tracker *iot, sector_t len)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&iot->lock, flags);
+	__iot_io_end(iot, len);
+	spin_unlock_irqrestore(&iot->lock, flags);
+}
+
+/*----------------------------------------------------------------*/
+
 /*
  * Glossary:
  *

From 066dbaa386c751164c39ab025e5e8803b4a4d691 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 15 May 2015 15:18:01 +0100
Subject: [PATCH 16/39] dm cache: track IO to the origin device using
 io_tracker

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-target.c | 56 +++++++++++++++++++++++++++++++-----
 1 file changed, 49 insertions(+), 7 deletions(-)

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 6f9bdd1bf7c4..940c7b2b5ab4 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -355,6 +355,8 @@ struct cache {
 	 */
 	spinlock_t invalidation_lock;
 	struct list_head invalidation_requests;
+
+	struct io_tracker origin_tracker;
 };
 
 struct per_bio_data {
@@ -362,6 +364,7 @@ struct per_bio_data {
 	unsigned req_nr:2;
 	struct dm_deferred_entry *all_io_entry;
 	struct dm_hook_info hook_info;
+	sector_t len;
 
 	/*
 	 * writethrough fields.  These MUST remain at the end of this
@@ -768,6 +771,7 @@ static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
 	pb->tick = false;
 	pb->req_nr = dm_bio_get_target_bio_nr(bio);
 	pb->all_io_entry = NULL;
+	pb->len = 0;
 
 	return pb;
 }
@@ -865,12 +869,43 @@ static void inc_ds(struct cache *cache, struct bio *bio,
 	pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
 }
 
+static bool accountable_bio(struct cache *cache, struct bio *bio)
+{
+	return ((bio->bi_bdev == cache->origin_dev->bdev) &&
+		!(bio->bi_rw & REQ_DISCARD));
+}
+
+static void accounted_begin(struct cache *cache, struct bio *bio)
+{
+	size_t pb_data_size = get_per_bio_data_size(cache);
+	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+	if (accountable_bio(cache, bio)) {
+		pb->len = bio_sectors(bio);
+		iot_io_begin(&cache->origin_tracker, pb->len);
+	}
+}
+
+static void accounted_complete(struct cache *cache, struct bio *bio)
+{
+	size_t pb_data_size = get_per_bio_data_size(cache);
+	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+	iot_io_end(&cache->origin_tracker, pb->len);
+}
+
+static void accounted_request(struct cache *cache, struct bio *bio)
+{
+	accounted_begin(cache, bio);
+	generic_make_request(bio);
+}
+
 static void issue(struct cache *cache, struct bio *bio)
 {
 	unsigned long flags;
 
 	if (!bio_triggers_commit(cache, bio)) {
-		generic_make_request(bio);
+		accounted_request(cache, bio);
 		return;
 	}
 
@@ -1166,7 +1201,7 @@ static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
 	 * No need to inc_ds() here, since the cell will be held for the
 	 * duration of the io.
 	 */
-	generic_make_request(bio);
+	accounted_request(mg->cache, bio);
 }
 
 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
@@ -1722,7 +1757,7 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
 	 * These bios have already been through inc_ds()
 	 */
 	while ((bio = bio_list_pop(&bios)))
-		submit_bios ? generic_make_request(bio) : bio_io_error(bio);
+		submit_bios ? accounted_request(cache, bio) : bio_io_error(bio);
 }
 
 static void process_deferred_writethrough_bios(struct cache *cache)
@@ -1742,7 +1777,7 @@ static void process_deferred_writethrough_bios(struct cache *cache)
 	 * These bios have already been through inc_ds()
 	 */
 	while ((bio = bio_list_pop(&bios)))
-		generic_make_request(bio);
+		accounted_request(cache, bio);
 }
 
 static void writeback_some_dirty_blocks(struct cache *cache)
@@ -2602,6 +2637,8 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 	spin_lock_init(&cache->invalidation_lock);
 	INIT_LIST_HEAD(&cache->invalidation_requests);
 
+	iot_init(&cache->origin_tracker);
+
 	*result = cache;
 	return 0;
 
@@ -2791,9 +2828,13 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 	struct cache *cache = ti->private;
 
 	r = __cache_map(cache, bio, &cell);
-	if (r == DM_MAPIO_REMAPPED && cell) {
-		inc_ds(cache, bio, cell);
-		cell_defer(cache, cell, false);
+	if (r == DM_MAPIO_REMAPPED) {
+		accounted_begin(cache, bio);
+
+		if (cell) {
+			inc_ds(cache, bio, cell);
+			cell_defer(cache, cell, false);
+		}
 	}
 
 	return r;
@@ -2815,6 +2856,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
 	}
 
 	check_for_quiesced_migrations(cache, pb);
+	accounted_complete(cache, bio);
 
 	return 0;
 }

From 20f6814b94fff4a98b123f1c2b691e936be27aaf Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 15 May 2015 15:20:09 +0100
Subject: [PATCH 17/39] dm cache: pass a new 'critical' flag to the policies
 when requesting writeback work

We only allow non-critical writeback if the origin is idle.  It is up
to the policy to decide what writeback work is critical.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-policy-cleaner.c  | 3 ++-
 drivers/md/dm-cache-policy-internal.h | 5 +++--
 drivers/md/dm-cache-policy-mq.c       | 2 +-
 drivers/md/dm-cache-policy.h          | 7 +++++--
 drivers/md/dm-cache-target.c          | 3 ++-
 5 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c
index 004e463c9423..240c9f0e85e7 100644
--- a/drivers/md/dm-cache-policy-cleaner.c
+++ b/drivers/md/dm-cache-policy-cleaner.c
@@ -359,7 +359,8 @@ static struct wb_cache_entry *get_next_dirty_entry(struct policy *p)
 
 static int wb_writeback_work(struct dm_cache_policy *pe,
 			     dm_oblock_t *oblock,
-			     dm_cblock_t *cblock)
+			     dm_cblock_t *cblock,
+			     bool critical_only)
 {
 	int r = -ENOENT;
 	struct policy *p = to_policy(pe);
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index c198e6defb9c..776c685167e6 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -55,9 +55,10 @@ static inline int policy_walk_mappings(struct dm_cache_policy *p,
 
 static inline int policy_writeback_work(struct dm_cache_policy *p,
 					dm_oblock_t *oblock,
-					dm_cblock_t *cblock)
+					dm_cblock_t *cblock,
+					bool critical_only)
 {
-	return p->writeback_work ? p->writeback_work(p, oblock, cblock) : -ENOENT;
+	return p->writeback_work ? p->writeback_work(p, oblock, cblock, critical_only) : -ENOENT;
 }
 
 static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 515d44bf24d3..7cbae125879c 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -1236,7 +1236,7 @@ static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
 }
 
 static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
-			     dm_cblock_t *cblock)
+			     dm_cblock_t *cblock, bool critical_only)
 {
 	int r;
 	struct mq_policy *mq = to_mq_policy(p);
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index 5524e21e4836..6106ca3aa350 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -178,7 +178,9 @@ struct dm_cache_policy {
 	int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock);
 
 	/*
-	 * Provide a dirty block to be written back by the core target.
+	 * Provide a dirty block to be written back by the core target.  If
+	 * critical_only is set then the policy should only provide work if
+	 * it urgently needs it.
 	 *
 	 * Returns:
 	 *
@@ -186,7 +188,8 @@ struct dm_cache_policy {
 	 *
 	 * -ENODATA: no dirty blocks available
 	 */
-	int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock);
+	int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock,
+			      bool critical_only);
 
 	/*
 	 * How full is the cache?
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 940c7b2b5ab4..5a9cd2c5a359 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -1787,6 +1787,7 @@ static void writeback_some_dirty_blocks(struct cache *cache)
 	dm_cblock_t cblock;
 	struct prealloc structs;
 	struct dm_bio_prison_cell *old_ocell;
+	bool busy = !iot_idle_for(&cache->origin_tracker, HZ);
 
 	memset(&structs, 0, sizeof(structs));
 
@@ -1794,7 +1795,7 @@ static void writeback_some_dirty_blocks(struct cache *cache)
 		if (prealloc_data_structs(cache, &structs))
 			break;
 
-		r = policy_writeback_work(cache->policy, &oblock, &cblock);
+		r = policy_writeback_work(cache->policy, &oblock, &cblock, busy);
 		if (r)
 			break;
 

From 451b9e0071b2833744db7f518115bc085bc7b23c Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 15 May 2015 15:22:02 +0100
Subject: [PATCH 18/39] dm cache: pull out some bitset utility functions for
 reuse

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-policy-internal.h | 28 +++++++++++++++++++++++++++
 drivers/md/dm-cache-target.c          | 24 -----------------------
 2 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index 776c685167e6..9dc05a52369e 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -7,6 +7,7 @@
 #ifndef DM_CACHE_POLICY_INTERNAL_H
 #define DM_CACHE_POLICY_INTERNAL_H
 
+#include <linux/vmalloc.h>
 #include "dm-cache-policy.h"
 
 /*----------------------------------------------------------------*/
@@ -106,6 +107,33 @@ static inline int policy_set_config_value(struct dm_cache_policy *p,
 
 /*----------------------------------------------------------------*/
 
+/*
+ * Some utility functions commonly used by policies and the core target.
+ */
+static inline size_t bitset_size_in_bytes(unsigned nr_entries)
+{
+	return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
+}
+
+static inline unsigned long *alloc_bitset(unsigned nr_entries)
+{
+	size_t s = bitset_size_in_bytes(nr_entries);
+	return vzalloc(s);
+}
+
+static inline void clear_bitset(void *bitset, unsigned nr_entries)
+{
+	size_t s = bitset_size_in_bytes(nr_entries);
+	memset(bitset, 0, s);
+}
+
+static inline void free_bitset(unsigned long *bits)
+{
+	vfree(bits);
+}
+
+/*----------------------------------------------------------------*/
+
 /*
  * Creates a new cache policy given a policy name, a cache size, an origin size and the block size.
  */
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 5a9cd2c5a359..5d3b20b91ba3 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -111,30 +111,6 @@ static void iot_io_end(struct io_tracker *iot, sector_t len)
 
 /*----------------------------------------------------------------*/
 
-static size_t bitset_size_in_bytes(unsigned nr_entries)
-{
-	return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
-}
-
-static unsigned long *alloc_bitset(unsigned nr_entries)
-{
-	size_t s = bitset_size_in_bytes(nr_entries);
-	return vzalloc(s);
-}
-
-static void clear_bitset(void *bitset, unsigned nr_entries)
-{
-	size_t s = bitset_size_in_bytes(nr_entries);
-	memset(bitset, 0, s);
-}
-
-static void free_bitset(unsigned long *bits)
-{
-	vfree(bits);
-}
-
-/*----------------------------------------------------------------*/
-
 /*
  * There are a couple of places where we let a bio run, but want to do some
  * work before calling its endio function.  We do this by temporarily

From 3cdf93f9d85979b22b6abfd4ab19350860e4dfac Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 15 May 2015 15:23:35 +0100
Subject: [PATCH 19/39] dm bio prison: add dm_cell_promote_or_release()

Rather than always releasing the prisoners in a cell, the client may
want to promote one of them to be the new holder.  There is a race here
though between releasing an empty cell, and other threads adding new
inmates.  So this function makes the decision with its lock held.

This function can have two outcomes:
i)  An inmate is promoted to be the holder of the cell (return value of 0).
ii) The cell has no inmate for promotion and is released (return value of 1).

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-bio-prison.c | 26 ++++++++++++++++++++++++++
 drivers/md/dm-bio-prison.h | 13 +++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
index be065300e93c..cd6d1d21e057 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison.c
@@ -255,6 +255,32 @@ void dm_cell_visit_release(struct dm_bio_prison *prison,
 }
 EXPORT_SYMBOL_GPL(dm_cell_visit_release);
 
+static int __promote_or_release(struct dm_bio_prison *prison,
+				struct dm_bio_prison_cell *cell)
+{
+	if (bio_list_empty(&cell->bios)) {
+		rb_erase(&cell->node, &prison->cells);
+		return 1;
+	}
+
+	cell->holder = bio_list_pop(&cell->bios);
+	return 0;
+}
+
+int dm_cell_promote_or_release(struct dm_bio_prison *prison,
+			       struct dm_bio_prison_cell *cell)
+{
+	int r;
+	unsigned long flags;
+
+	spin_lock_irqsave(&prison->lock, flags);
+	r = __promote_or_release(prison, cell);
+	spin_unlock_irqrestore(&prison->lock, flags);
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(dm_cell_promote_or_release);
+
 /*----------------------------------------------------------------*/
 
 #define DEFERRED_SET_SIZE 64
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h
index 74cf01144b1f..54352f009bfd 100644
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison.h
@@ -101,6 +101,19 @@ void dm_cell_visit_release(struct dm_bio_prison *prison,
 			   void (*visit_fn)(void *, struct dm_bio_prison_cell *),
 			   void *context, struct dm_bio_prison_cell *cell);
 
+/*
+ * Rather than always releasing the prisoners in a cell, the client may
+ * want to promote one of them to be the new holder.  There is a race here
+ * though between releasing an empty cell, and other threads adding new
+ * inmates.  So this function makes the decision with its lock held.
+ *
+ * This function can have two outcomes:
+ * i) An inmate is promoted to be the holder of the cell (return value of 0).
+ * ii) The cell has no inmate for promotion and is released (return value of 1).
+ */
+int dm_cell_promote_or_release(struct dm_bio_prison *prison,
+			       struct dm_bio_prison_cell *cell);
+
 /*----------------------------------------------------------------*/
 
 /*

From 651f5fa2a3959ff5db60c09a84efd66309fe4035 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 15 May 2015 15:26:08 +0100
Subject: [PATCH 20/39] dm cache: defer whole cells

Currently individual bios are deferred to the worker thread if they
cannot be processed immediately (eg, a block is in the process of
being moved to the fast device).

This patch passes whole cells across to the worker.  This saves
reacquiring the cell, and also collects bios destined for the same block
together, which allows them to be mapped with a single lookup to the
policy.  This reduces the overhead of using dm-cache.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-target.c | 325 ++++++++++++++++++++++++++++-------
 1 file changed, 262 insertions(+), 63 deletions(-)

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 5d3b20b91ba3..d2d91c164420 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -257,6 +257,7 @@ struct cache {
 	int sectors_per_block_shift;
 
 	spinlock_t lock;
+	struct list_head deferred_cells;
 	struct bio_list deferred_bios;
 	struct bio_list deferred_flush_bios;
 	struct bio_list deferred_writethrough_bios;
@@ -969,26 +970,63 @@ static void dec_io_migrations(struct cache *cache)
 	atomic_dec(&cache->nr_io_migrations);
 }
 
-static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
-			 bool holder)
+static void __cell_release(struct cache *cache, struct dm_bio_prison_cell *cell,
+			   bool holder, struct bio_list *bios)
 {
 	(holder ? dm_cell_release : dm_cell_release_no_holder)
-		(cache->prison, cell, &cache->deferred_bios);
+		(cache->prison, cell, bios);
 	free_prison_cell(cache, cell);
 }
 
-static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
-		       bool holder)
+static bool discard_or_flush(struct bio *bio)
+{
+	return bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD);
+}
+
+static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
+{
+	if (discard_or_flush(cell->holder))
+		/*
+		 * We have to handle these bios
+		 * individually.
+		 */
+		__cell_release(cache, cell, true, &cache->deferred_bios);
+
+	else
+		list_add_tail(&cell->user_list, &cache->deferred_cells);
+}
+
+static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder)
 {
 	unsigned long flags;
 
+	if (!holder && dm_cell_promote_or_release(cache->prison, cell)) {
+		/*
+		 * There was no prisoner to promote to holder, the
+		 * cell has been released.
+		 */
+		free_prison_cell(cache, cell);
+		return;
+	}
+
 	spin_lock_irqsave(&cache->lock, flags);
-	__cell_defer(cache, cell, holder);
+	__cell_defer(cache, cell);
 	spin_unlock_irqrestore(&cache->lock, flags);
 
 	wake_worker(cache);
 }
 
+static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err)
+{
+	dm_cell_error(cache->prison, cell, err);
+	dm_bio_prison_free_cell(cache->prison, cell);
+}
+
+static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell)
+{
+	cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE);
+}
+
 static void free_io_migration(struct dm_cache_migration *mg)
 {
 	dec_io_migrations(mg->cache);
@@ -1525,6 +1563,107 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
 
 /*----------------------------------------------------------------*/
 
+struct inc_detail {
+	struct cache *cache;
+	struct bio_list bios_for_issue;
+	struct bio_list unhandled_bios;
+	bool any_writes;
+};
+
+static void inc_fn(void *context, struct dm_bio_prison_cell *cell)
+{
+	struct bio *bio;
+	struct inc_detail *detail = context;
+	struct cache *cache = detail->cache;
+
+	inc_ds(cache, cell->holder, cell);
+	if (bio_data_dir(cell->holder) == WRITE)
+		detail->any_writes = true;
+
+	while ((bio = bio_list_pop(&cell->bios))) {
+		if (discard_or_flush(bio)) {
+			bio_list_add(&detail->unhandled_bios, bio);
+			continue;
+		}
+
+		if (bio_data_dir(bio) == WRITE)
+			detail->any_writes = true;
+
+		bio_list_add(&detail->bios_for_issue, bio);
+		inc_ds(cache, bio, cell);
+	}
+}
+
+// FIXME: refactor these two
+static void remap_cell_to_origin_clear_discard(struct cache *cache,
+					       struct dm_bio_prison_cell *cell,
+					       dm_oblock_t oblock, bool issue_holder)
+{
+	struct bio *bio;
+	unsigned long flags;
+	struct inc_detail detail;
+
+	detail.cache = cache;
+	bio_list_init(&detail.bios_for_issue);
+	bio_list_init(&detail.unhandled_bios);
+	detail.any_writes = false;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
+	bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	remap_to_origin(cache, cell->holder);
+	if (issue_holder)
+		issue(cache, cell->holder);
+	else
+		accounted_begin(cache, cell->holder);
+
+	if (detail.any_writes)
+		clear_discard(cache, oblock_to_dblock(cache, oblock));
+
+	while ((bio = bio_list_pop(&detail.bios_for_issue))) {
+		remap_to_origin(cache, bio);
+		issue(cache, bio);
+	}
+}
+
+static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell,
+				      dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder)
+{
+	struct bio *bio;
+	unsigned long flags;
+	struct inc_detail detail;
+
+	detail.cache = cache;
+	bio_list_init(&detail.bios_for_issue);
+	bio_list_init(&detail.unhandled_bios);
+	detail.any_writes = false;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
+	bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	remap_to_cache(cache, cell->holder, cblock);
+	if (issue_holder)
+		issue(cache, cell->holder);
+	else
+		accounted_begin(cache, cell->holder);
+
+	if (detail.any_writes) {
+		set_dirty(cache, oblock, cblock);
+		clear_discard(cache, oblock_to_dblock(cache, oblock));
+	}
+
+	while ((bio = bio_list_pop(&detail.bios_for_issue))) {
+		remap_to_cache(cache, bio, cblock);
+		issue(cache, bio);
+	}
+}
+
+/*----------------------------------------------------------------*/
+
 struct old_oblock_lock {
 	struct policy_locker locker;
 	struct cache *cache;
@@ -1549,28 +1688,18 @@ static int cell_locker(struct policy_locker *locker, dm_oblock_t b)
 			  l->structs, &l->cell);
 }
 
-static void process_bio(struct cache *cache, struct prealloc *structs,
-			struct bio *bio)
+static void process_cell(struct cache *cache, struct prealloc *structs,
+			 struct dm_bio_prison_cell *new_ocell)
 {
 	int r;
 	bool release_cell = true;
+	struct bio *bio = new_ocell->holder;
 	dm_oblock_t block = get_bio_block(cache, bio);
-	struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
 	struct policy_result lookup_result;
 	bool passthrough = passthrough_mode(&cache->features);
 	bool discarded_block, can_migrate;
 	struct old_oblock_lock ool;
 
-	/*
-	 * Check to see if that block is currently migrating.
-	 */
-	cell_prealloc = prealloc_get_cell(structs);
-	r = bio_detain(cache, block, bio, cell_prealloc,
-		       (cell_free_fn) prealloc_put_cell,
-		       structs, &new_ocell);
-	if (r > 0)
-		return;
-
 	discarded_block = is_discarded_oblock(cache, block);
 	can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
 
@@ -1615,9 +1744,9 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
 				remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
 				inc_and_issue(cache, bio, new_ocell);
 
-			} else  {
-				remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
-				inc_and_issue(cache, bio, new_ocell);
+			} else {
+				remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true);
+				release_cell = false;
 			}
 		}
 
@@ -1625,8 +1754,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
 
 	case POLICY_MISS:
 		inc_miss_counter(cache, bio);
-		remap_to_origin_clear_discard(cache, bio, block);
-		inc_and_issue(cache, bio, new_ocell);
+		remap_cell_to_origin_clear_discard(cache, new_ocell, block, true);
+		release_cell = false;
 		break;
 
 	case POLICY_NEW:
@@ -1654,10 +1783,30 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
 		cell_defer(cache, new_ocell, false);
 }
 
+static void process_bio(struct cache *cache, struct prealloc *structs,
+			struct bio *bio)
+{
+	int r;
+	dm_oblock_t block = get_bio_block(cache, bio);
+	struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
+
+	/*
+	 * Check to see if that block is currently migrating.
+	 */
+	cell_prealloc = prealloc_get_cell(structs);
+	r = bio_detain(cache, block, bio, cell_prealloc,
+		       (cell_free_fn) prealloc_put_cell,
+		       structs, &new_ocell);
+	if (r > 0)
+		return;
+
+	process_cell(cache, structs, new_ocell);
+}
+
 static int need_commit_due_to_time(struct cache *cache)
 {
-	return !time_in_range(jiffies, cache->last_commit_jiffies,
-			      cache->last_commit_jiffies + COMMIT_PERIOD);
+	return jiffies < cache->last_commit_jiffies ||
+	       jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
 }
 
 static int commit_if_needed(struct cache *cache)
@@ -1716,6 +1865,40 @@ static void process_deferred_bios(struct cache *cache)
 	prealloc_free_structs(cache, &structs);
 }
 
+static void process_deferred_cells(struct cache *cache)
+{
+	unsigned long flags;
+	struct dm_bio_prison_cell *cell, *tmp;
+	struct list_head cells;
+	struct prealloc structs;
+
+	memset(&structs, 0, sizeof(structs));
+
+	INIT_LIST_HEAD(&cells);
+
+	spin_lock_irqsave(&cache->lock, flags);
+	list_splice_init(&cache->deferred_cells, &cells);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	list_for_each_entry_safe(cell, tmp, &cells, user_list) {
+		/*
+		 * If we've got no free migration structs, and processing
+		 * this bio might require one, we pause until there are some
+		 * prepared mappings to process.
+		 */
+		if (prealloc_data_structs(cache, &structs)) {
+			spin_lock_irqsave(&cache->lock, flags);
+			list_splice(&cells, &cache->deferred_cells);
+			spin_unlock_irqrestore(&cache->lock, flags);
+			break;
+		}
+
+		process_cell(cache, &structs, cell);
+	}
+
+	prealloc_free_structs(cache, &structs);
+}
+
 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
 {
 	unsigned long flags;
@@ -1883,7 +2066,22 @@ static void stop_worker(struct cache *cache)
 	flush_workqueue(cache->wq);
 }
 
-static void requeue_deferred_io(struct cache *cache)
+static void requeue_deferred_cells(struct cache *cache)
+{
+	unsigned long flags;
+	struct list_head cells;
+	struct dm_bio_prison_cell *cell, *tmp;
+
+	INIT_LIST_HEAD(&cells);
+	spin_lock_irqsave(&cache->lock, flags);
+	list_splice_init(&cache->deferred_cells, &cells);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	list_for_each_entry_safe(cell, tmp, &cells, user_list)
+		cell_requeue(cache, cell);
+}
+
+static void requeue_deferred_bios(struct cache *cache)
 {
 	struct bio *bio;
 	struct bio_list bios;
@@ -1904,6 +2102,7 @@ static int more_work(struct cache *cache)
 			!list_empty(&cache->need_commit_migrations);
 	else
 		return !bio_list_empty(&cache->deferred_bios) ||
+			!list_empty(&cache->deferred_cells) ||
 			!bio_list_empty(&cache->deferred_flush_bios) ||
 			!bio_list_empty(&cache->deferred_writethrough_bios) ||
 			!list_empty(&cache->quiesced_migrations) ||
@@ -1921,6 +2120,7 @@ static void do_worker(struct work_struct *ws)
 			writeback_some_dirty_blocks(cache);
 			process_deferred_writethrough_bios(cache);
 			process_deferred_bios(cache);
+			process_deferred_cells(cache);
 			process_invalidation_requests(cache);
 		}
 
@@ -1935,6 +2135,7 @@ static void do_worker(struct work_struct *ws)
 			 * FIXME: rollback metadata or just go into a
 			 * failure mode and error everything
 			 */
+
 		} else {
 			process_deferred_flush_bios(cache, true);
 			process_migrations(cache, &cache->need_commit_migrations,
@@ -2525,6 +2726,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 	}
 
 	spin_lock_init(&cache->lock);
+	INIT_LIST_HEAD(&cache->deferred_cells);
 	bio_list_init(&cache->deferred_bios);
 	bio_list_init(&cache->deferred_flush_bios);
 	bio_list_init(&cache->deferred_writethrough_bios);
@@ -2682,9 +2884,14 @@ static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	return r;
 }
 
-static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell **cell)
+/*----------------------------------------------------------------*/
+
+static int cache_map(struct dm_target *ti, struct bio *bio)
 {
+	struct cache *cache = ti->private;
+
 	int r;
+	struct dm_bio_prison_cell *cell = NULL;
 	dm_oblock_t block = get_bio_block(cache, bio);
 	size_t pb_data_size = get_per_bio_data_size(cache);
 	bool can_migrate = false;
@@ -2702,10 +2909,11 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
 		 * Just remap to the origin and carry on.
 		 */
 		remap_to_origin(cache, bio);
+		accounted_begin(cache, bio);
 		return DM_MAPIO_REMAPPED;
 	}
 
-	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
+	if (discard_or_flush(bio)) {
 		defer_bio(cache, bio);
 		return DM_MAPIO_SUBMITTED;
 	}
@@ -2713,15 +2921,15 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
 	/*
 	 * Check to see if that block is currently migrating.
 	 */
-	*cell = alloc_prison_cell(cache);
-	if (!*cell) {
+	cell = alloc_prison_cell(cache);
+	if (!cell) {
 		defer_bio(cache, bio);
 		return DM_MAPIO_SUBMITTED;
 	}
 
-	r = bio_detain(cache, block, bio, *cell,
+	r = bio_detain(cache, block, bio, cell,
 		       (cell_free_fn) free_prison_cell,
-		       cache, cell);
+		       cache, &cell);
 	if (r) {
 		if (r < 0)
 			defer_bio(cache, bio);
@@ -2734,12 +2942,12 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
 	r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
 		       bio, &ool.locker, &lookup_result);
 	if (r == -EWOULDBLOCK) {
-		cell_defer(cache, *cell, true);
+		cell_defer(cache, cell, true);
 		return DM_MAPIO_SUBMITTED;
 
 	} else if (r) {
 		DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
-		cell_defer(cache, *cell, false);
+		cell_defer(cache, cell, false);
 		bio_io_error(bio);
 		return DM_MAPIO_SUBMITTED;
 	}
@@ -2753,21 +2961,30 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
 				 * We need to invalidate this block, so
 				 * defer for the worker thread.
 				 */
-				cell_defer(cache, *cell, true);
+				cell_defer(cache, cell, true);
 				r = DM_MAPIO_SUBMITTED;
 
 			} else {
 				inc_miss_counter(cache, bio);
 				remap_to_origin_clear_discard(cache, bio, block);
+				accounted_begin(cache, bio);
+				inc_ds(cache, bio, cell);
+				// FIXME: we want to remap hits or misses straight
+				// away rather than passing over to the worker.
+				cell_defer(cache, cell, false);
 			}
 
 		} else {
 			inc_hit_counter(cache, bio);
 			if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
-			    !is_dirty(cache, lookup_result.cblock))
+			    !is_dirty(cache, lookup_result.cblock)) {
 				remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
-			else
-				remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
+				accounted_begin(cache, bio);
+				inc_ds(cache, bio, cell);
+				cell_defer(cache, cell, false);
+
+			} else
+				remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false);
 		}
 		break;
 
@@ -2779,18 +2996,18 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
 			 * longer needed because the block has been demoted.
 			 */
 			bio_endio(bio, 0);
-			cell_defer(cache, *cell, false);
+			// FIXME: remap everything as a miss
+			cell_defer(cache, cell, false);
 			r = DM_MAPIO_SUBMITTED;
 
 		} else
-			remap_to_origin_clear_discard(cache, bio, block);
-
+			remap_cell_to_origin_clear_discard(cache, cell, block, false);
 		break;
 
 	default:
 		DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
 			    (unsigned) lookup_result.op);
-		cell_defer(cache, *cell, false);
+		cell_defer(cache, cell, false);
 		bio_io_error(bio);
 		r = DM_MAPIO_SUBMITTED;
 	}
@@ -2798,25 +3015,6 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
 	return r;
 }
 
-static int cache_map(struct dm_target *ti, struct bio *bio)
-{
-	int r;
-	struct dm_bio_prison_cell *cell = NULL;
-	struct cache *cache = ti->private;
-
-	r = __cache_map(cache, bio, &cell);
-	if (r == DM_MAPIO_REMAPPED) {
-		accounted_begin(cache, bio);
-
-		if (cell) {
-			inc_ds(cache, bio, cell);
-			cell_defer(cache, cell, false);
-		}
-	}
-
-	return r;
-}
-
 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
 {
 	struct cache *cache = ti->private;
@@ -2913,7 +3111,8 @@ static void cache_postsuspend(struct dm_target *ti)
 	start_quiescing(cache);
 	wait_for_migrations(cache);
 	stop_worker(cache);
-	requeue_deferred_io(cache);
+	requeue_deferred_bios(cache);
+	requeue_deferred_cells(cache);
 	stop_quiescing(cache);
 
 	(void) sync_metadata(cache);

From 40775257b97e27305cf5c2425be7acaa6edee4ea Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 15 May 2015 15:29:58 +0100
Subject: [PATCH 21/39] dm cache: boost promotion of blocks that will be
 overwritten

When considering whether to move a block to the cache we already give
preferential treatment to discarded blocks, since they are cheap to
promote (no read of the origin required since the data is junk).

The same is true of blocks that are about to be completely
overwritten, so we likewise boost their promotion chances.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-target.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index d2d91c164420..7829d947ef01 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -1697,17 +1697,17 @@ static void process_cell(struct cache *cache, struct prealloc *structs,
 	dm_oblock_t block = get_bio_block(cache, bio);
 	struct policy_result lookup_result;
 	bool passthrough = passthrough_mode(&cache->features);
-	bool discarded_block, can_migrate;
+	bool fast_promotion, can_migrate;
 	struct old_oblock_lock ool;
 
-	discarded_block = is_discarded_oblock(cache, block);
-	can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
+	fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
+	can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache));
 
 	ool.locker.fn = cell_locker;
 	ool.cache = cache;
 	ool.structs = structs;
 	ool.cell = NULL;
-	r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
+	r = policy_map(cache->policy, block, true, can_migrate, fast_promotion,
 		       bio, &ool.locker, &lookup_result);
 
 	if (r == -EWOULDBLOCK)
@@ -2895,7 +2895,7 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 	dm_oblock_t block = get_bio_block(cache, bio);
 	size_t pb_data_size = get_per_bio_data_size(cache);
 	bool can_migrate = false;
-	bool discarded_block;
+	bool fast_promotion;
 	struct policy_result lookup_result;
 	struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
 	struct old_oblock_lock ool;
@@ -2937,9 +2937,9 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 		return DM_MAPIO_SUBMITTED;
 	}
 
-	discarded_block = is_discarded_oblock(cache, block);
+	fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
 
-	r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
+	r = policy_map(cache->policy, block, false, can_migrate, fast_promotion,
 		       bio, &ool.locker, &lookup_result);
 	if (r == -EWOULDBLOCK) {
 		cell_defer(cache, cell, true);

From 66a636356647a9be8885c2ce2948de126577698a Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 15 May 2015 15:33:34 +0100
Subject: [PATCH 22/39] dm cache: add stochastic-multi-queue (smq) policy

The stochastic-multi-queue (smq) policy addresses some of the problems
with the current multiqueue (mq) policy.

Memory usage
------------

The mq policy uses a lot of memory; 88 bytes per cache block on a 64
bit machine.

SMQ uses 28-bit indexes to implement its data structures rather than
pointers.  It avoids storing an explicit hit count for each block.  It
has a 'hotspot' queue rather than a pre cache which uses a quarter of
the entries (each hotspot block covers a larger area than a single
cache block).

All of this means smq uses ~25 bytes per cache block.  Still a lot of
memory, but a substantial improvement nonetheless.

Level balancing
---------------

MQ places entries in different levels of the multiqueue structures
based on their hit count (~ln(hit count)).  This means the bottom
levels generally have the most entries, and the top ones have very
few.  Having unbalanced levels like this reduces the efficacy of the
multiqueue.

SMQ does not maintain a hit count, instead it swaps hit entries with
the least recently used entry from the level above.  The overall
ordering is a side effect of this stochastic process.  With this
scheme we can decide how many entries occupy each multiqueue level,
resulting in better promotion/demotion decisions.

Adaptability
------------

The MQ policy maintains a hit count for each cache block.  For a
different block to get promoted to the cache its hit count has to
exceed the lowest currently in the cache.  This means it can take a
long time for the cache to adapt between varying IO patterns.
Periodically degrading the hit counts could help with this, but I
haven't found a nice general solution.

SMQ doesn't maintain hit counts, so a lot of this problem just goes
away.  In addition it tracks performance of the hotspot queue, which
is used to decide which blocks to promote.  If the hotspot queue is
performing badly then it starts moving entries more quickly between
levels.  This lets it adapt to new IO patterns very quickly.

Performance
-----------

In my tests SMQ shows substantially better performance than MQ.  Once
this matures a bit more I'm sure it'll become the default policy.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/Kconfig               |   12 +
 drivers/md/Makefile              |    2 +
 drivers/md/dm-cache-policy-smq.c | 1768 ++++++++++++++++++++++++++++++
 3 files changed, 1782 insertions(+)
 create mode 100644 drivers/md/dm-cache-policy-smq.c

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index edcf4ab66e00..b59727309072 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -304,6 +304,18 @@ config DM_CACHE_MQ
          This is meant to be a general purpose policy.  It prioritises
          reads over writes.
 
+config DM_CACHE_SMQ
+       tristate "Stochastic MQ Cache Policy (EXPERIMENTAL)"
+       depends on DM_CACHE
+       default y
+       ---help---
+         A cache policy that uses a multiqueue ordered by recent hits
+         to select which blocks should be promoted and demoted.
+         This is meant to be a general purpose policy.  It prioritises
+         reads over writes.  This SMQ policy (vs MQ) offers the promise
+         of less memory utilization, improved performance and increased
+         adaptability in the face of changing workloads.
+
 config DM_CACHE_CLEANER
        tristate "Cleaner Cache Policy (EXPERIMENTAL)"
        depends on DM_CACHE
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index dba4db5985fb..462f443a4f85 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -13,6 +13,7 @@ dm-log-userspace-y \
 dm-thin-pool-y	+= dm-thin.o dm-thin-metadata.o
 dm-cache-y	+= dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
 dm-cache-mq-y   += dm-cache-policy-mq.o
+dm-cache-smq-y   += dm-cache-policy-smq.o
 dm-cache-cleaner-y += dm-cache-policy-cleaner.o
 dm-era-y	+= dm-era-target.o
 md-mod-y	+= md.o bitmap.o
@@ -54,6 +55,7 @@ obj-$(CONFIG_DM_THIN_PROVISIONING)	+= dm-thin-pool.o
 obj-$(CONFIG_DM_VERITY)		+= dm-verity.o
 obj-$(CONFIG_DM_CACHE)		+= dm-cache.o
 obj-$(CONFIG_DM_CACHE_MQ)	+= dm-cache-mq.o
+obj-$(CONFIG_DM_CACHE_SMQ)	+= dm-cache-smq.o
 obj-$(CONFIG_DM_CACHE_CLEANER)	+= dm-cache-cleaner.o
 obj-$(CONFIG_DM_ERA)		+= dm-era.o
 obj-$(CONFIG_DM_LOG_WRITES)	+= dm-log-writes.o
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
new file mode 100644
index 000000000000..55a657f78f00
--- /dev/null
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -0,0 +1,1768 @@
+/*
+ * Copyright (C) 2015 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-cache-policy.h"
+#include "dm-cache-policy-internal.h"
+#include "dm.h"
+
+#include <linux/hash.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/vmalloc.h>
+#include <linux/math64.h>
+
+#define DM_MSG_PREFIX "cache-policy-smq"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Safe division functions that return zero on divide by zero.
+ */
+static unsigned safe_div(unsigned n, unsigned d)
+{
+	return d ? n / d : 0u;
+}
+
+static unsigned safe_mod(unsigned n, unsigned d)
+{
+	return d ? n % d : 0u;
+}
+
+/*----------------------------------------------------------------*/
+
+struct entry {
+	unsigned hash_next:28;
+	unsigned prev:28;
+	unsigned next:28;
+	unsigned level:7;
+	bool dirty:1;
+	bool allocated:1;
+	bool sentinel:1;
+
+	dm_oblock_t oblock;
+};
+
+/*----------------------------------------------------------------*/
+
+#define INDEXER_NULL ((1u << 28u) - 1u)
+
+/*
+ * An entry_space manages a set of entries that we use for the queues.
+ * The clean and dirty queues share entries, so this object is separate
+ * from the queue itself.
+ */
+struct entry_space {
+	struct entry *begin;
+	struct entry *end;
+};
+
+static int space_init(struct entry_space *es, unsigned nr_entries)
+{
+	if (!nr_entries) {
+		es->begin = es->end = NULL;
+		return 0;
+	}
+
+	es->begin = vzalloc(sizeof(struct entry) * nr_entries);
+	if (!es->begin)
+		return -ENOMEM;
+
+	es->end = es->begin + nr_entries;
+	return 0;
+}
+
+static void space_exit(struct entry_space *es)
+{
+	vfree(es->begin);
+}
+
+static struct entry *__get_entry(struct entry_space *es, unsigned block)
+{
+	struct entry *e;
+
+	e = es->begin + block;
+	BUG_ON(e >= es->end);
+
+	return e;
+}
+
+static unsigned to_index(struct entry_space *es, struct entry *e)
+{
+	BUG_ON(e < es->begin || e >= es->end);
+	return e - es->begin;
+}
+
+static struct entry *to_entry(struct entry_space *es, unsigned block)
+{
+	if (block == INDEXER_NULL)
+		return NULL;
+
+	return __get_entry(es, block);
+}
+
+/*----------------------------------------------------------------*/
+
+struct ilist {
+	unsigned nr_elts;	/* excluding sentinel entries */
+	unsigned head, tail;
+};
+
+static void l_init(struct ilist *l)
+{
+	l->nr_elts = 0;
+	l->head = l->tail = INDEXER_NULL;
+}
+
+static struct entry *l_head(struct entry_space *es, struct ilist *l)
+{
+	return to_entry(es, l->head);
+}
+
+static struct entry *l_tail(struct entry_space *es, struct ilist *l)
+{
+	return to_entry(es, l->tail);
+}
+
+static struct entry *l_next(struct entry_space *es, struct entry *e)
+{
+	return to_entry(es, e->next);
+}
+
+static struct entry *l_prev(struct entry_space *es, struct entry *e)
+{
+	return to_entry(es, e->prev);
+}
+
+static bool l_empty(struct ilist *l)
+{
+	return l->head == INDEXER_NULL;
+}
+
+static void l_add_head(struct entry_space *es, struct ilist *l, struct entry *e)
+{
+	struct entry *head = l_head(es, l);
+
+	e->next = l->head;
+	e->prev = INDEXER_NULL;
+
+	if (head)
+		head->prev = l->head = to_index(es, e);
+	else
+		l->head = l->tail = to_index(es, e);
+
+	if (!e->sentinel)
+		l->nr_elts++;
+}
+
+static void l_add_tail(struct entry_space *es, struct ilist *l, struct entry *e)
+{
+	struct entry *tail = l_tail(es, l);
+
+	e->next = INDEXER_NULL;
+	e->prev = l->tail;
+
+	if (tail)
+		tail->next = l->tail = to_index(es, e);
+	else
+		l->head = l->tail = to_index(es, e);
+
+	if (!e->sentinel)
+		l->nr_elts++;
+}
+
+static void l_add_before(struct entry_space *es, struct ilist *l,
+			 struct entry *old, struct entry *e)
+{
+	struct entry *prev = l_prev(es, old);
+
+	if (!prev)
+		l_add_head(es, l, e);
+
+	else {
+		e->prev = old->prev;
+		e->next = to_index(es, old);
+		prev->next = old->prev = to_index(es, e);
+
+		if (!e->sentinel)
+			l->nr_elts++;
+	}
+}
+
+static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
+{
+	struct entry *prev = l_prev(es, e);
+	struct entry *next = l_next(es, e);
+
+	if (prev)
+		prev->next = e->next;
+	else
+		l->head = e->next;
+
+	if (next)
+		next->prev = e->prev;
+	else
+		l->tail = e->prev;
+
+	if (!e->sentinel)
+		l->nr_elts--;
+}
+
+static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
+{
+	struct entry *e;
+
+	for (e = l_tail(es, l); e; e = l_prev(es, e))
+		if (!e->sentinel) {
+			l_del(es, l, e);
+			return e;
+		}
+
+	return NULL;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * The stochastic-multi-queue is a set of lru lists stacked into levels.
+ * Entries are moved up levels when they are used, which loosely orders the
+ * most accessed entries in the top levels and least in the bottom.  This
+ * structure is *much* better than a single lru list.
+ */
+#define MAX_LEVELS 64u
+
+struct queue {
+	struct entry_space *es;
+
+	unsigned nr_elts;
+	unsigned nr_levels;
+	struct ilist qs[MAX_LEVELS];
+
+	/*
+	 * We maintain a count of the number of entries we would like in each
+	 * level.
+	 */
+	unsigned last_target_nr_elts;
+	unsigned nr_top_levels;
+	unsigned nr_in_top_levels;
+	unsigned target_count[MAX_LEVELS];
+};
+
+static void q_init(struct queue *q, struct entry_space *es, unsigned nr_levels)
+{
+	unsigned i;
+
+	q->es = es;
+	q->nr_elts = 0;
+	q->nr_levels = nr_levels;
+
+	for (i = 0; i < q->nr_levels; i++) {
+		l_init(q->qs + i);
+		q->target_count[i] = 0u;
+	}
+
+	q->last_target_nr_elts = 0u;
+	q->nr_top_levels = 0u;
+	q->nr_in_top_levels = 0u;
+}
+
+static unsigned q_size(struct queue *q)
+{
+	return q->nr_elts;
+}
+
+/*
+ * Insert an entry to the back of the given level.
+ */
+static void q_push(struct queue *q, struct entry *e)
+{
+	if (!e->sentinel)
+		q->nr_elts++;
+
+	l_add_tail(q->es, q->qs + e->level, e);
+}
+
+static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
+{
+	if (!e->sentinel)
+		q->nr_elts++;
+
+	l_add_before(q->es, q->qs + e->level, old, e);
+}
+
+static void q_del(struct queue *q, struct entry *e)
+{
+	l_del(q->es, q->qs + e->level, e);
+	if (!e->sentinel)
+		q->nr_elts--;
+}
+
+/*
+ * Return the oldest entry of the lowest populated level.
+ */
+static struct entry *q_peek(struct queue *q, unsigned max_level, bool can_cross_sentinel)
+{
+	unsigned level;
+	struct entry *e;
+
+	max_level = min(max_level, q->nr_levels);
+
+	for (level = 0; level < max_level; level++)
+		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
+			if (e->sentinel) {
+				if (can_cross_sentinel)
+					continue;
+				else
+					break;
+			}
+
+			return e;
+		}
+
+	return NULL;
+}
+
+static struct entry *q_pop(struct queue *q)
+{
+	struct entry *e = q_peek(q, q->nr_levels, true);
+
+	if (e)
+		q_del(q, e);
+
+	return e;
+}
+
+/*
+ * Pops an entry from a level that is not past a sentinel.
+ */
+static struct entry *q_pop_old(struct queue *q, unsigned max_level)
+{
+	struct entry *e = q_peek(q, max_level, false);
+
+	if (e)
+		q_del(q, e);
+
+	return e;
+}
+
+/*
+ * This function assumes there is a non-sentinel entry to pop.  It's only
+ * used by redistribute, so we know this is true.  It also doesn't adjust
+ * the q->nr_elts count.
+ */
+static struct entry *__redist_pop_from(struct queue *q, unsigned level)
+{
+	struct entry *e;
+
+	for (; level < q->nr_levels; level++)
+		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e))
+			if (!e->sentinel) {
+				l_del(q->es, q->qs + e->level, e);
+				return e;
+			}
+
+	return NULL;
+}
+
+static void q_set_targets_subrange_(struct queue *q, unsigned nr_elts, unsigned lbegin, unsigned lend)
+{
+	unsigned level, nr_levels, entries_per_level, remainder;
+
+	BUG_ON(lbegin > lend);
+	BUG_ON(lend > q->nr_levels);
+	nr_levels = lend - lbegin;
+	entries_per_level = safe_div(nr_elts, nr_levels);
+	remainder = safe_mod(nr_elts, nr_levels);
+
+	for (level = lbegin; level < lend; level++)
+		q->target_count[level] =
+			(level < (lbegin + remainder)) ? entries_per_level + 1u : entries_per_level;
+}
+
+/*
+ * Typically we have fewer elements in the top few levels which allows us
+ * to adjust the promote threshold nicely.
+ */
+static void q_set_targets(struct queue *q)
+{
+	if (q->last_target_nr_elts == q->nr_elts)
+		return;
+
+	q->last_target_nr_elts = q->nr_elts;
+
+	if (q->nr_top_levels > q->nr_levels)
+		q_set_targets_subrange_(q, q->nr_elts, 0, q->nr_levels);
+
+	else {
+		q_set_targets_subrange_(q, q->nr_in_top_levels,
+					q->nr_levels - q->nr_top_levels, q->nr_levels);
+
+		if (q->nr_in_top_levels < q->nr_elts)
+			q_set_targets_subrange_(q, q->nr_elts - q->nr_in_top_levels,
+						0, q->nr_levels - q->nr_top_levels);
+		else
+			q_set_targets_subrange_(q, 0, 0, q->nr_levels - q->nr_top_levels);
+	}
+}
+
+static void q_redistribute(struct queue *q)
+{
+	unsigned target, level;
+	struct ilist *l, *l_above;
+	struct entry *e;
+
+	q_set_targets(q);
+
+	for (level = 0u; level < q->nr_levels - 1u; level++) {
+		l = q->qs + level;
+		target = q->target_count[level];
+
+		/*
+		 * Pull down some entries from the level above.
+		 */
+		while (l->nr_elts < target) {
+			e = __redist_pop_from(q, level + 1u);
+			if (!e) {
+				/* bug in nr_elts */
+				break;
+			}
+
+			e->level = level;
+			l_add_tail(q->es, l, e);
+		}
+
+		/*
+		 * Push some entries up.
+		 */
+		l_above = q->qs + level + 1u;
+		while (l->nr_elts > target) {
+			e = l_pop_tail(q->es, l);
+
+			if (!e)
+				/* bug in nr_elts */
+				break;
+
+			e->level = level + 1u;
+			l_add_head(q->es, l_above, e);
+		}
+	}
+}
+
+static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels)
+{
+	struct entry *de;
+	unsigned new_level;
+
+	q_del(q, e);
+
+	if (extra_levels && (e->level < q->nr_levels - 1u)) {
+		new_level = min(q->nr_levels - 1u, e->level + extra_levels);
+		for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) {
+			if (de->sentinel)
+				continue;
+
+			q_del(q, de);
+			de->level = e->level;
+
+			if (dest)
+				q_push_before(q, dest, de);
+			else
+				q_push(q, de);
+			break;
+		}
+
+		e->level = new_level;
+	}
+
+	q_push(q, e);
+}
+
+static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels)
+{
+	q_requeue_before(q, NULL, e, extra_levels);
+}
+
+/*----------------------------------------------------------------*/
+
+#define FP_SHIFT 8
+#define SIXTEENTH (1u << (FP_SHIFT - 4u))
+#define EIGHTH (1u << (FP_SHIFT - 3u))
+
+struct stats {
+	unsigned hit_threshold;
+	unsigned hits;
+	unsigned misses;
+};
+
+enum performance {
+	Q_POOR,
+	Q_FAIR,
+	Q_WELL
+};
+
+static void stats_init(struct stats *s, unsigned nr_levels)
+{
+	s->hit_threshold = (nr_levels * 3u) / 4u;
+	s->hits = 0u;
+	s->misses = 0u;
+}
+
+static void stats_reset(struct stats *s)
+{
+	s->hits = s->misses = 0u;
+}
+
+static void stats_level_accessed(struct stats *s, unsigned level)
+{
+	if (level >= s->hit_threshold)
+		s->hits++;
+	else
+		s->misses++;
+}
+
+static void stats_miss(struct stats *s)
+{
+	s->misses++;
+}
+
+/*
+ * There are times when we don't have any confidence in the hotspot queue.
+ * Such as when a fresh cache is created and the blocks have been spread
+ * out across the levels, or if an io load changes.  We detect this by
+ * seeing how often a lookup is in the top levels of the hotspot queue.
+ */
+static enum performance stats_assess(struct stats *s)
+{
+	unsigned confidence = safe_div(s->hits << FP_SHIFT, s->hits + s->misses);
+
+	if (confidence < SIXTEENTH)
+		return Q_POOR;
+
+	else if (confidence < EIGHTH)
+		return Q_FAIR;
+
+	else
+		return Q_WELL;
+}
+
+/*----------------------------------------------------------------*/
+
+struct hash_table {
+	struct entry_space *es;
+	unsigned long long hash_bits;
+	unsigned *buckets;
+};
+
+/*
+ * All cache entries are stored in a chained hash table.  To save space we
+ * use indexing again, and only store indexes to the next entry.
+ */
+static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries)
+{
+	unsigned i, nr_buckets;
+
+	ht->es = es;
+	nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
+	ht->hash_bits = ffs(nr_buckets) - 1;
+
+	ht->buckets = vmalloc(sizeof(*ht->buckets) * nr_buckets);
+	if (!ht->buckets)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_buckets; i++)
+		ht->buckets[i] = INDEXER_NULL;
+
+	return 0;
+}
+
+static void h_exit(struct hash_table *ht)
+{
+	vfree(ht->buckets);
+}
+
+static struct entry *h_head(struct hash_table *ht, unsigned bucket)
+{
+	return to_entry(ht->es, ht->buckets[bucket]);
+}
+
+static struct entry *h_next(struct hash_table *ht, struct entry *e)
+{
+	return to_entry(ht->es, e->hash_next);
+}
+
+static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e)
+{
+	e->hash_next = ht->buckets[bucket];
+	ht->buckets[bucket] = to_index(ht->es, e);
+}
+
+static void h_insert(struct hash_table *ht, struct entry *e)
+{
+	unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
+	__h_insert(ht, h, e);
+}
+
+static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t oblock,
+				struct entry **prev)
+{
+	struct entry *e;
+
+	*prev = NULL;
+	for (e = h_head(ht, h); e; e = h_next(ht, e)) {
+		if (e->oblock == oblock)
+			return e;
+
+		*prev = e;
+	}
+
+	return NULL;
+}
+
+static void __h_unlink(struct hash_table *ht, unsigned h,
+		       struct entry *e, struct entry *prev)
+{
+	if (prev)
+		prev->hash_next = e->hash_next;
+	else
+		ht->buckets[h] = e->hash_next;
+}
+
+/*
+ * Also moves the entry that was found to the front of its bucket.
+ */
+static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock)
+{
+	struct entry *e, *prev;
+	unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);
+
+	e = __h_lookup(ht, h, oblock, &prev);
+	if (e && prev) {
+		/*
+		 * Move to the front because this entry is likely
+		 * to be hit again.
+		 */
+		__h_unlink(ht, h, e, prev);
+		__h_insert(ht, h, e);
+	}
+
+	return e;
+}
+
+static void h_remove(struct hash_table *ht, struct entry *e)
+{
+	unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
+	struct entry *prev;
+
+	/*
+	 * The down side of using a singly linked list is we have to
+	 * iterate the bucket to remove an item.
+	 */
+	e = __h_lookup(ht, h, e->oblock, &prev);
+	if (e)
+		__h_unlink(ht, h, e, prev);
+}
+
+/*----------------------------------------------------------------*/
+
+struct entry_alloc {
+	struct entry_space *es;
+	unsigned begin;
+
+	unsigned nr_allocated;
+	struct ilist free;
+};
+
+static void init_allocator(struct entry_alloc *ea, struct entry_space *es,
+			   unsigned begin, unsigned end)
+{
+	unsigned i;
+
+	ea->es = es;
+	ea->nr_allocated = 0u;
+	ea->begin = begin;
+
+	l_init(&ea->free);
+	for (i = begin; i != end; i++)
+		l_add_tail(ea->es, &ea->free, __get_entry(ea->es, i));
+}
+
+static void init_entry(struct entry *e)
+{
+	/*
+	 * We can't memset because that would clear the hotspot and
+	 * sentinel bits which remain constant.
+	 */
+	e->hash_next = INDEXER_NULL;
+	e->next = INDEXER_NULL;
+	e->prev = INDEXER_NULL;
+	e->level = 0u;
+	e->allocated = true;
+}
+
+static struct entry *alloc_entry(struct entry_alloc *ea)
+{
+	struct entry *e;
+
+	if (l_empty(&ea->free))
+		return NULL;
+
+	e = l_pop_tail(ea->es, &ea->free);
+	init_entry(e);
+	ea->nr_allocated++;
+
+	return e;
+}
+
+/*
+ * This assumes the cblock hasn't already been allocated.
+ */
+static struct entry *alloc_particular_entry(struct entry_alloc *ea, unsigned i)
+{
+	struct entry *e = __get_entry(ea->es, ea->begin + i);
+
+	BUG_ON(e->allocated);
+
+	l_del(ea->es, &ea->free, e);
+	init_entry(e);
+	ea->nr_allocated++;
+
+	return e;
+}
+
+static void free_entry(struct entry_alloc *ea, struct entry *e)
+{
+	BUG_ON(!ea->nr_allocated);
+	BUG_ON(!e->allocated);
+
+	ea->nr_allocated--;
+	e->allocated = false;
+	l_add_tail(ea->es, &ea->free, e);
+}
+
+static bool allocator_empty(struct entry_alloc *ea)
+{
+	return l_empty(&ea->free);
+}
+
+static unsigned get_index(struct entry_alloc *ea, struct entry *e)
+{
+	return to_index(ea->es, e) - ea->begin;
+}
+
+static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
+{
+	return __get_entry(ea->es, ea->begin + index);
+}
+
+/*----------------------------------------------------------------*/
+
+#define NR_HOTSPOT_LEVELS 64u
+#define NR_CACHE_LEVELS 64u
+
+#define WRITEBACK_PERIOD (10 * HZ)
+#define DEMOTE_PERIOD (60 * HZ)
+
+#define HOTSPOT_UPDATE_PERIOD (HZ)
+#define CACHE_UPDATE_PERIOD (10u * HZ)
+
+struct smq_policy {
+	struct dm_cache_policy policy;
+
+	/* protects everything */
+	struct mutex lock;
+	dm_cblock_t cache_size;
+	sector_t cache_block_size;
+
+	sector_t hotspot_block_size;
+	unsigned nr_hotspot_blocks;
+	unsigned cache_blocks_per_hotspot_block;
+	unsigned hotspot_level_jump;
+
+	struct entry_space es;
+	struct entry_alloc writeback_sentinel_alloc;
+	struct entry_alloc demote_sentinel_alloc;
+	struct entry_alloc hotspot_alloc;
+	struct entry_alloc cache_alloc;
+
+	unsigned long *hotspot_hit_bits;
+	unsigned long *cache_hit_bits;
+
+	/*
+	 * We maintain three queues of entries.  The cache proper,
+	 * consisting of a clean and dirty queue, containing the currently
+	 * active mappings.  The hotspot queue uses a larger block size to
+	 * track blocks that are being hit frequently and potential
+	 * candidates for promotion to the cache.
+	 */
+	struct queue hotspot;
+	struct queue clean;
+	struct queue dirty;
+
+	struct stats hotspot_stats;
+	struct stats cache_stats;
+
+	/*
+	 * Keeps track of time, incremented by the core.  We use this to
+	 * avoid attributing multiple hits within the same tick.
+	 *
+	 * Access to tick_protected should be done with the spin lock held.
+	 * It's copied to tick at the start of the map function (within the
+	 * mutex).
+	 */
+	spinlock_t tick_lock;
+	unsigned tick_protected;
+	unsigned tick;
+
+	/*
+	 * The hash tables allow us to quickly find an entry by origin
+	 * block.
+	 */
+	struct hash_table table;
+	struct hash_table hotspot_table;
+
+	bool current_writeback_sentinels;
+	unsigned long next_writeback_period;
+
+	bool current_demote_sentinels;
+	unsigned long next_demote_period;
+
+	unsigned write_promote_level;
+	unsigned read_promote_level;
+
+	unsigned long next_hotspot_period;
+	unsigned long next_cache_period;
+};
+
+/*----------------------------------------------------------------*/
+
+static struct entry *get_sentinel(struct entry_alloc *ea, unsigned level, bool which)
+{
+	return get_entry(ea, which ? level : NR_CACHE_LEVELS + level);
+}
+
+static struct entry *writeback_sentinel(struct smq_policy *mq, unsigned level)
+{
+	return get_sentinel(&mq->writeback_sentinel_alloc, level, mq->current_writeback_sentinels);
+}
+
+static struct entry *demote_sentinel(struct smq_policy *mq, unsigned level)
+{
+	return get_sentinel(&mq->demote_sentinel_alloc, level, mq->current_demote_sentinels);
+}
+
+static void __update_writeback_sentinels(struct smq_policy *mq)
+{
+	unsigned level;
+	struct queue *q = &mq->dirty;
+	struct entry *sentinel;
+
+	for (level = 0; level < q->nr_levels; level++) {
+		sentinel = writeback_sentinel(mq, level);
+		q_del(q, sentinel);
+		q_push(q, sentinel);
+	}
+}
+
+static void __update_demote_sentinels(struct smq_policy *mq)
+{
+	unsigned level;
+	struct queue *q = &mq->clean;
+	struct entry *sentinel;
+
+	for (level = 0; level < q->nr_levels; level++) {
+		sentinel = demote_sentinel(mq, level);
+		q_del(q, sentinel);
+		q_push(q, sentinel);
+	}
+}
+
+static void update_sentinels(struct smq_policy *mq)
+{
+	if (time_after(jiffies, mq->next_writeback_period)) {
+		__update_writeback_sentinels(mq);
+		mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
+		mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
+	}
+
+	if (time_after(jiffies, mq->next_demote_period)) {
+		__update_demote_sentinels(mq);
+		mq->next_demote_period = jiffies + DEMOTE_PERIOD;
+		mq->current_demote_sentinels = !mq->current_demote_sentinels;
+	}
+}
+
+static void __sentinels_init(struct smq_policy *mq)
+{
+	unsigned level;
+	struct entry *sentinel;
+
+	for (level = 0; level < NR_CACHE_LEVELS; level++) {
+		sentinel = writeback_sentinel(mq, level);
+		sentinel->level = level;
+		q_push(&mq->dirty, sentinel);
+
+		sentinel = demote_sentinel(mq, level);
+		sentinel->level = level;
+		q_push(&mq->clean, sentinel);
+	}
+}
+
+static void sentinels_init(struct smq_policy *mq)
+{
+	mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
+	mq->next_demote_period = jiffies + DEMOTE_PERIOD;
+
+	mq->current_writeback_sentinels = false;
+	mq->current_demote_sentinels = false;
+	__sentinels_init(mq);
+
+	mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
+	mq->current_demote_sentinels = !mq->current_demote_sentinels;
+	__sentinels_init(mq);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * These methods tie together the dirty queue, clean queue and hash table.
+ */
+static void push_new(struct smq_policy *mq, struct entry *e)
+{
+	struct queue *q = e->dirty ? &mq->dirty : &mq->clean;
+	h_insert(&mq->table, e);
+	q_push(q, e);
+}
+
+static void push(struct smq_policy *mq, struct entry *e)
+{
+	struct entry *sentinel;
+
+	h_insert(&mq->table, e);
+
+	/*
+	 * Punch this into the queue just in front of the sentinel, to
+	 * ensure it's cleaned straight away.
+	 */
+	if (e->dirty) {
+		sentinel = writeback_sentinel(mq, e->level);
+		q_push_before(&mq->dirty, sentinel, e);
+	} else {
+		sentinel = demote_sentinel(mq, e->level);
+		q_push_before(&mq->clean, sentinel, e);
+	}
+}
+
+/*
+ * Removes an entry from its queue and from the hash table.
+ */
+static void __del(struct smq_policy *mq, struct queue *q, struct entry *e)
+{
+	q_del(q, e);
+	h_remove(&mq->table, e);
+}
+
+static void del(struct smq_policy *mq, struct entry *e)
+{
+	__del(mq, e->dirty ? &mq->dirty : &mq->clean, e);
+}
+
+static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level)
+{
+	struct entry *e = q_pop_old(q, max_level);
+	if (e)
+		h_remove(&mq->table, e);
+	return e;
+}
+
+static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
+{
+	return to_cblock(get_index(&mq->cache_alloc, e));
+}
+
+static void requeue(struct smq_policy *mq, struct entry *e)
+{
+	struct entry *sentinel;
+
+	if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
+		if (e->dirty) {
+			sentinel = writeback_sentinel(mq, e->level);
+			q_requeue_before(&mq->dirty, sentinel, e, 1u);
+		} else {
+			sentinel = demote_sentinel(mq, e->level);
+			q_requeue_before(&mq->clean, sentinel, e, 1u);
+		}
+	}
+}
+
+static unsigned default_promote_level(struct smq_policy *mq)
+{
+	/*
+	 * The promote level depends on the current performance of the
+	 * cache.
+	 *
+	 * If the cache is performing badly, then we can't afford
+	 * to promote much without causing performance to drop below that
+	 * of the origin device.
+	 *
+	 * If the cache is performing well, then we don't need to promote
+	 * much.  If it isn't broken, don't fix it.
+	 *
+	 * If the cache is middling then we promote more.
+	 *
+	 * This scheme reminds me of a graph of entropy vs probability of a
+	 * binary variable.
+	 */
+	static const unsigned table[] = {1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1};
+
+	unsigned hits = mq->cache_stats.hits;
+	unsigned misses = mq->cache_stats.misses;
+	unsigned index = safe_div(hits << 4u, hits + misses);
+	return table[index];
+}
+
+static void update_promote_levels(struct smq_policy *mq)
+{
+	/*
+	 * If there are unused cache entries then we want to be really
+	 * eager to promote.
+	 */
+	unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
+		default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);
+
+	/*
+	 * If the hotspot queue is performing badly then we have little
+	 * confidence that we know which blocks to promote.  So we cut down
+	 * the amount of promotions.
+	 */
+	switch (stats_assess(&mq->hotspot_stats)) {
+	case Q_POOR:
+		threshold_level /= 4u;
+		break;
+
+	case Q_FAIR:
+		threshold_level /= 2u;
+		break;
+
+	case Q_WELL:
+		break;
+	}
+
+	mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
+	mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u;
+}
+
+/*
+ * If the hotspot queue is performing badly, then we try and move entries
+ * around more quickly.
+ */
+static void update_level_jump(struct smq_policy *mq)
+{
+	switch (stats_assess(&mq->hotspot_stats)) {
+	case Q_POOR:
+		mq->hotspot_level_jump = 4u;
+		break;
+
+	case Q_FAIR:
+		mq->hotspot_level_jump = 2u;
+		break;
+
+	case Q_WELL:
+		mq->hotspot_level_jump = 1u;
+		break;
+	}
+}
+
+static void end_hotspot_period(struct smq_policy *mq)
+{
+	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
+	update_promote_levels(mq);
+
+	if (time_after(jiffies, mq->next_hotspot_period)) {
+		update_level_jump(mq);
+		q_redistribute(&mq->hotspot);
+		stats_reset(&mq->hotspot_stats);
+		mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD;
+	}
+}
+
+static void end_cache_period(struct smq_policy *mq)
+{
+	if (time_after(jiffies, mq->next_cache_period)) {
+		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
+
+		q_redistribute(&mq->dirty);
+		q_redistribute(&mq->clean);
+		stats_reset(&mq->cache_stats);
+
+		mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD;
+	}
+}
+
+static int demote_cblock(struct smq_policy *mq,
+			 struct policy_locker *locker,
+			 dm_oblock_t *oblock)
+{
+	struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false);
+	if (!demoted)
+		/*
+		 * We could get a block from mq->dirty, but that
+		 * would add extra latency to the triggering bio as it
+		 * waits for the writeback.  Better to not promote this
+		 * time and hope there's a clean block next time this block
+		 * is hit.
+		 */
+		return -ENOSPC;
+
+	if (locker->fn(locker, demoted->oblock))
+		/*
+		 * We couldn't lock this block.
+		 */
+		return -EBUSY;
+
+	del(mq, demoted);
+	*oblock = demoted->oblock;
+	free_entry(&mq->cache_alloc, demoted);
+
+	return 0;
+}
+
+enum promote_result {
+	PROMOTE_NOT,
+	PROMOTE_TEMPORARY,
+	PROMOTE_PERMANENT
+};
+
+/*
+ * Converts a boolean into a promote result.
+ */
+static enum promote_result maybe_promote(bool promote)
+{
+	return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
+}
+
+static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio,
+					  bool fast_promote)
+{
+	if (bio_data_dir(bio) == WRITE) {
+		if (!allocator_empty(&mq->cache_alloc) && fast_promote)
+			return PROMOTE_TEMPORARY;
+
+		else
+			return maybe_promote(hs_e->level >= mq->write_promote_level);
+	} else
+		return maybe_promote(hs_e->level >= mq->read_promote_level);
+}
+
+static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock,
+			    struct policy_locker *locker,
+			    struct policy_result *result, enum promote_result pr)
+{
+	int r;
+	struct entry *e;
+
+	if (allocator_empty(&mq->cache_alloc)) {
+		result->op = POLICY_REPLACE;
+		r = demote_cblock(mq, locker, &result->old_oblock);
+		if (r) {
+			result->op = POLICY_MISS;
+			return;
+		}
+
+	} else
+		result->op = POLICY_NEW;
+
+	e = alloc_entry(&mq->cache_alloc);
+	BUG_ON(!e);
+	e->oblock = oblock;
+
+	if (pr == PROMOTE_TEMPORARY)
+		push(mq, e);
+	else
+		push_new(mq, e);
+
+	result->cblock = infer_cblock(mq, e);
+}
+
+static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
+{
+	sector_t r = from_oblock(b);
+	(void) sector_div(r, mq->cache_blocks_per_hotspot_block);
+	return to_oblock(r);
+}
+
+static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio)
+{
+	unsigned hi;
+	dm_oblock_t hb = to_hblock(mq, b);
+	struct entry *e = h_lookup(&mq->hotspot_table, hb);
+
+	if (e) {
+		stats_level_accessed(&mq->hotspot_stats, e->level);
+
+		hi = get_index(&mq->hotspot_alloc, e);
+		q_requeue(&mq->hotspot, e,
+			  test_and_set_bit(hi, mq->hotspot_hit_bits) ?
+			  0u : mq->hotspot_level_jump);
+
+	} else {
+		stats_miss(&mq->hotspot_stats);
+
+		e = alloc_entry(&mq->hotspot_alloc);
+		if (!e) {
+			e = q_pop(&mq->hotspot);
+			if (e) {
+				h_remove(&mq->hotspot_table, e);
+				hi = get_index(&mq->hotspot_alloc, e);
+				clear_bit(hi, mq->hotspot_hit_bits);
+			}
+
+		}
+
+		if (e) {
+			e->oblock = hb;
+			q_push(&mq->hotspot, e);
+			h_insert(&mq->hotspot_table, e);
+		}
+	}
+
+	return e;
+}
+
+/*
+ * Looks the oblock up in the hash table, then decides whether to promote
+ * it into the cache.
+ */
+static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock,
+	       bool can_migrate, bool fast_promote,
+	       struct policy_locker *locker, struct policy_result *result)
+{
+	struct entry *e, *hs_e;
+	enum promote_result pr;
+
+	hs_e = update_hotspot_queue(mq, oblock, bio);
+
+	e = h_lookup(&mq->table, oblock);
+	if (e) {
+		stats_level_accessed(&mq->cache_stats, e->level);
+
+		requeue(mq, e);
+		result->op = POLICY_HIT;
+		result->cblock = infer_cblock(mq, e);
+
+	} else {
+		stats_miss(&mq->cache_stats);
+
+		pr = should_promote(mq, hs_e, bio, fast_promote);
+		if (pr == PROMOTE_NOT)
+			result->op = POLICY_MISS;
+
+		else {
+			if (!can_migrate) {
+				result->op = POLICY_MISS;
+				return -EWOULDBLOCK;
+			}
+
+			insert_in_cache(mq, oblock, locker, result, pr);
+		}
+	}
+
+	return 0;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Public interface, via the policy struct.  See dm-cache-policy.h for a
+ * description of these.
+ */
+
+static struct smq_policy *to_smq_policy(struct dm_cache_policy *p)
+{
+	return container_of(p, struct smq_policy, policy);
+}
+
+static void smq_destroy(struct dm_cache_policy *p)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+
+	h_exit(&mq->hotspot_table);
+	h_exit(&mq->table);
+	free_bitset(mq->hotspot_hit_bits);
+	free_bitset(mq->cache_hit_bits);
+	space_exit(&mq->es);
+	kfree(mq);
+}
+
+static void copy_tick(struct smq_policy *mq)
+{
+	unsigned long flags, tick;
+
+	spin_lock_irqsave(&mq->tick_lock, flags);
+	tick = mq->tick_protected;
+	if (tick != mq->tick) {
+		update_sentinels(mq);
+		end_hotspot_period(mq);
+		end_cache_period(mq);
+		mq->tick = tick;
+	}
+	spin_unlock_irqrestore(&mq->tick_lock, flags);
+}
+
+static bool maybe_lock(struct smq_policy *mq, bool can_block)
+{
+	if (can_block) {
+		mutex_lock(&mq->lock);
+		return true;
+	} else
+		return mutex_trylock(&mq->lock);
+}
+
+static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
+		   bool can_block, bool can_migrate, bool fast_promote,
+		   struct bio *bio, struct policy_locker *locker,
+		   struct policy_result *result)
+{
+	int r;
+	struct smq_policy *mq = to_smq_policy(p);
+
+	result->op = POLICY_MISS;
+
+	if (!maybe_lock(mq, can_block))
+		return -EWOULDBLOCK;
+
+	copy_tick(mq);
+	r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result);
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
+{
+	int r;
+	struct smq_policy *mq = to_smq_policy(p);
+	struct entry *e;
+
+	if (!mutex_trylock(&mq->lock))
+		return -EWOULDBLOCK;
+
+	e = h_lookup(&mq->table, oblock);
+	if (e) {
+		*cblock = infer_cblock(mq, e);
+		r = 0;
+	} else
+		r = -ENOENT;
+
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set)
+{
+	struct entry *e;
+
+	e = h_lookup(&mq->table, oblock);
+	BUG_ON(!e);
+
+	del(mq, e);
+	e->dirty = set;
+	push(mq, e);
+}
+
+static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+
+	mutex_lock(&mq->lock);
+	__smq_set_clear_dirty(mq, oblock, true);
+	mutex_unlock(&mq->lock);
+}
+
+static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+
+	mutex_lock(&mq->lock);
+	__smq_set_clear_dirty(mq, oblock, false);
+	mutex_unlock(&mq->lock);
+}
+
+static int smq_load_mapping(struct dm_cache_policy *p,
+			    dm_oblock_t oblock, dm_cblock_t cblock,
+			    uint32_t hint, bool hint_valid)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+	struct entry *e;
+
+	e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
+	e->oblock = oblock;
+	e->dirty = false;	/* this gets corrected in a minute */
+	e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : 1;
+	push(mq, e);
+
+	return 0;
+}
+
+static int smq_save_hints(struct smq_policy *mq, struct queue *q,
+			  policy_walk_fn fn, void *context)
+{
+	int r;
+	unsigned level;
+	struct entry *e;
+
+	for (level = 0; level < q->nr_levels; level++)
+		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
+			if (!e->sentinel) {
+				r = fn(context, infer_cblock(mq, e),
+				       e->oblock, e->level);
+				if (r)
+					return r;
+			}
+		}
+
+	return 0;
+}
+
+static int smq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
+			     void *context)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+	int r = 0;
+
+	mutex_lock(&mq->lock);
+
+	r = smq_save_hints(mq, &mq->clean, fn, context);
+	if (!r)
+		r = smq_save_hints(mq, &mq->dirty, fn, context);
+
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock)
+{
+	struct entry *e;
+
+	e = h_lookup(&mq->table, oblock);
+	BUG_ON(!e);
+
+	del(mq, e);
+	free_entry(&mq->cache_alloc, e);
+}
+
+static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+
+	mutex_lock(&mq->lock);
+	__remove_mapping(mq, oblock);
+	mutex_unlock(&mq->lock);
+}
+
+static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock)
+{
+	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
+
+	if (!e || !e->allocated)
+		return -ENODATA;
+
+	del(mq, e);
+	free_entry(&mq->cache_alloc, e);
+
+	return 0;
+}
+
+static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
+{
+	int r;
+	struct smq_policy *mq = to_smq_policy(p);
+
+	mutex_lock(&mq->lock);
+	r = __remove_cblock(mq, cblock);
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+
+#define CLEAN_TARGET_CRITICAL 5u /* percent */
+
+static bool clean_target_met(struct smq_policy *mq, bool critical)
+{
+	if (critical) {
+		/*
+		 * Cache entries may not be populated.  So we cannot rely on the
+		 * size of the clean queue.
+		 */
+		unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
+		unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u;
+
+		return nr_clean >= target;
+	} else
+		return !q_size(&mq->dirty);
+}
+
+static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock,
+				dm_cblock_t *cblock, bool critical_only)
+{
+	struct entry *e = NULL;
+	bool target_met = clean_target_met(mq, critical_only);
+
+	if (critical_only)
+		/*
+		 * Always try and keep the bottom level clean.
+		 */
+		e = pop_old(mq, &mq->dirty, target_met ? 1u : mq->dirty.nr_levels);
+
+	else
+		e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels);
+
+	if (!e)
+		return -ENODATA;
+
+	*oblock = e->oblock;
+	*cblock = infer_cblock(mq, e);
+	e->dirty = false;
+	push_new(mq, e);
+
+	return 0;
+}
+
+static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
+			      dm_cblock_t *cblock, bool critical_only)
+{
+	int r;
+	struct smq_policy *mq = to_smq_policy(p);
+
+	mutex_lock(&mq->lock);
+	r = __smq_writeback_work(mq, oblock, cblock, critical_only);
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+static void __force_mapping(struct smq_policy *mq,
+			    dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+{
+	struct entry *e = h_lookup(&mq->table, current_oblock);
+
+	if (e) {
+		del(mq, e);
+		e->oblock = new_oblock;
+		e->dirty = true;
+		push(mq, e);
+	}
+}
+
+static void smq_force_mapping(struct dm_cache_policy *p,
+			      dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+
+	mutex_lock(&mq->lock);
+	__force_mapping(mq, current_oblock, new_oblock);
+	mutex_unlock(&mq->lock);
+}
+
+static dm_cblock_t smq_residency(struct dm_cache_policy *p)
+{
+	dm_cblock_t r;
+	struct smq_policy *mq = to_smq_policy(p);
+
+	mutex_lock(&mq->lock);
+	r = to_cblock(mq->cache_alloc.nr_allocated);
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+static void smq_tick(struct dm_cache_policy *p)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+	unsigned long flags;
+
+	spin_lock_irqsave(&mq->tick_lock, flags);
+	mq->tick_protected++;
+	spin_unlock_irqrestore(&mq->tick_lock, flags);
+}
+
+/* Init the policy plugin interface function pointers. */
+static void init_policy_functions(struct smq_policy *mq)
+{
+	mq->policy.destroy = smq_destroy;
+	mq->policy.map = smq_map;
+	mq->policy.lookup = smq_lookup;
+	mq->policy.set_dirty = smq_set_dirty;
+	mq->policy.clear_dirty = smq_clear_dirty;
+	mq->policy.load_mapping = smq_load_mapping;
+	mq->policy.walk_mappings = smq_walk_mappings;
+	mq->policy.remove_mapping = smq_remove_mapping;
+	mq->policy.remove_cblock = smq_remove_cblock;
+	mq->policy.writeback_work = smq_writeback_work;
+	mq->policy.force_mapping = smq_force_mapping;
+	mq->policy.residency = smq_residency;
+	mq->policy.tick = smq_tick;
+}
+
+static bool too_many_hotspot_blocks(sector_t origin_size,
+				    sector_t hotspot_block_size,
+				    unsigned nr_hotspot_blocks)
+{
+	return (hotspot_block_size * nr_hotspot_blocks) > origin_size;
+}
+
+static void calc_hotspot_params(sector_t origin_size,
+				sector_t cache_block_size,
+				unsigned nr_cache_blocks,
+				sector_t *hotspot_block_size,
+				unsigned *nr_hotspot_blocks)
+{
+	*hotspot_block_size = cache_block_size * 16u;
+	*nr_hotspot_blocks = max(nr_cache_blocks / 4u, 1024u);
+
+	while ((*hotspot_block_size > cache_block_size) &&
+	       too_many_hotspot_blocks(origin_size, *hotspot_block_size, *nr_hotspot_blocks))
+		*hotspot_block_size /= 2u;
+}
+
+static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
+					  sector_t origin_size,
+					  sector_t cache_block_size)
+{
+	unsigned i;
+	unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
+	unsigned total_sentinels = 2u * nr_sentinels_per_queue;
+	struct smq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
+
+	if (!mq)
+		return NULL;
+
+	init_policy_functions(mq);
+	mq->cache_size = cache_size;
+	mq->cache_block_size = cache_block_size;
+
+	calc_hotspot_params(origin_size, cache_block_size, from_cblock(cache_size),
+			    &mq->hotspot_block_size, &mq->nr_hotspot_blocks);
+
+	mq->cache_blocks_per_hotspot_block = div64_u64(mq->hotspot_block_size, mq->cache_block_size);
+	mq->hotspot_level_jump = 1u;
+	if (space_init(&mq->es, total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size))) {
+		DMERR("couldn't initialize entry space");
+		goto bad_pool_init;
+	}
+
+	init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
+	for (i = 0; i < nr_sentinels_per_queue; i++)
+		get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;
+
+	init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
+	for (i = 0; i < nr_sentinels_per_queue; i++)
+		get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;
+
+	init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
+		       total_sentinels + mq->nr_hotspot_blocks);
+
+	init_allocator(&mq->cache_alloc, &mq->es,
+		       total_sentinels + mq->nr_hotspot_blocks,
+		       total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size));
+
+	mq->hotspot_hit_bits = alloc_bitset(mq->nr_hotspot_blocks);
+	if (!mq->hotspot_hit_bits) {
+		DMERR("couldn't allocate hotspot hit bitset");
+		goto bad_hotspot_hit_bits;
+	}
+	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
+
+	if (from_cblock(cache_size)) {
+		mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
+		if (!mq->cache_hit_bits) {
+			DMERR("couldn't allocate cache hit bitset");
+			goto bad_cache_hit_bits;
+		}
+		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
+	} else
+		mq->cache_hit_bits = NULL;
+
+	mq->tick_protected = 0;
+	mq->tick = 0;
+	mutex_init(&mq->lock);
+	spin_lock_init(&mq->tick_lock);
+
+	q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS);
+	mq->hotspot.nr_top_levels = 8;
+	mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS,
+					   from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block);
+
+	q_init(&mq->clean, &mq->es, NR_CACHE_LEVELS);
+	q_init(&mq->dirty, &mq->es, NR_CACHE_LEVELS);
+
+	stats_init(&mq->hotspot_stats, NR_HOTSPOT_LEVELS);
+	stats_init(&mq->cache_stats, NR_CACHE_LEVELS);
+
+	if (h_init(&mq->table, &mq->es, from_cblock(cache_size)))
+		goto bad_alloc_table;
+
+	if (h_init(&mq->hotspot_table, &mq->es, mq->nr_hotspot_blocks))
+		goto bad_alloc_hotspot_table;
+
+	sentinels_init(mq);
+	mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS;
+
+	mq->next_hotspot_period = jiffies;
+	mq->next_cache_period = jiffies;
+
+	return &mq->policy;
+
+bad_alloc_hotspot_table:
+	h_exit(&mq->table);
+bad_alloc_table:
+	free_bitset(mq->cache_hit_bits);
+bad_cache_hit_bits:
+	free_bitset(mq->hotspot_hit_bits);
+bad_hotspot_hit_bits:
+	space_exit(&mq->es);
+bad_pool_init:
+	kfree(mq);
+
+	return NULL;
+}
+
+/*----------------------------------------------------------------*/
+
+static struct dm_cache_policy_type smq_policy_type = {
+	.name = "smq",
+	.version = {1, 0, 0},
+	.hint_size = 4,
+	.owner = THIS_MODULE,
+	.create = smq_create
+};
+
+static int __init smq_init(void)
+{
+	int r;
+
+	r = dm_cache_policy_register(&smq_policy_type);
+	if (r) {
+		DMERR("register failed %d", r);
+		return r;
+	}
+
+	return 0;
+}
+
+static void __exit smq_exit(void)
+{
+	dm_cache_policy_unregister(&smq_policy_type);
+}
+
+module_init(smq_init);
+module_exit(smq_exit);
+
+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("smq cache policy");

From 88bf5184fa5861e766e39fd34fc6d21557ac7be8 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Wed, 27 May 2015 15:39:45 +0100
Subject: [PATCH 23/39] dm cache: wake the worker thread every time we free a
 migration object

When the cache is idle, writeback work was only being issued every
second.  With this change outstanding writebacks are streamed
constantly.  This offers a writeback performance improvement.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-target.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 7829d947ef01..6d36ed3c46a0 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -418,10 +418,13 @@ static struct dm_cache_migration *alloc_migration(struct cache *cache)
 
 static void free_migration(struct dm_cache_migration *mg)
 {
-	if (atomic_dec_and_test(&mg->cache->nr_allocated_migrations))
-		wake_up(&mg->cache->migration_wait);
+	struct cache *cache = mg->cache;
 
-	mempool_free(mg, mg->cache->migration_pool);
+	if (atomic_dec_and_test(&cache->nr_allocated_migrations))
+		wake_up(&cache->migration_wait);
+
+	mempool_free(mg, cache->migration_pool);
+	wake_worker(cache);
 }
 
 static int prealloc_data_structs(struct cache *cache, struct prealloc *p)

From 028ae9f76f2935e8cf9974bff9a4587e3a995ff3 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Wed, 22 Apr 2015 16:42:35 -0400
Subject: [PATCH 24/39] dm cache: add fail io mode and needs_check flag

If a cache metadata operation fails (e.g. transaction commit) the
cache's metadata device will abort the current transaction, set a new
needs_check flag, and the cache will transition to "read-only" mode.  If
aborting the transaction or setting the needs_check flag fails the cache
will transition to "fail-io" mode.

Once needs_check is set the cache device will not be allowed to
activate.  Activation requires write access to metadata.  Future work is
needed to add proper support for running the cache in read-only mode.

Once in fail-io mode the cache will report a status of "Fail".

Also, add commit() wrapper that will disallow commits if in read_only or
fail mode.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 Documentation/device-mapper/cache.txt |   9 +-
 drivers/md/dm-cache-metadata.c        | 133 ++++++++++++++---
 drivers/md/dm-cache-metadata.h        |  10 ++
 drivers/md/dm-cache-policy-internal.h |  10 +-
 drivers/md/dm-cache-policy-mq.c       |   8 +-
 drivers/md/dm-cache-policy.h          |   4 +-
 drivers/md/dm-cache-target.c          | 204 ++++++++++++++++++++++----
 7 files changed, 320 insertions(+), 58 deletions(-)

diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt
index 68c0f517c60e..82960cffbad3 100644
--- a/Documentation/device-mapper/cache.txt
+++ b/Documentation/device-mapper/cache.txt
@@ -221,6 +221,7 @@ Status
 <#read hits> <#read misses> <#write hits> <#write misses>
 <#demotions> <#promotions> <#dirty> <#features> <features>*
 <#core args> <core args>* <policy name> <#policy args> <policy args>*
+<cache metadata mode>
 
 metadata block size	 : Fixed block size for each metadata block in
 			     sectors
@@ -251,8 +252,12 @@ core args		 : Key/value pairs for tuning the core
 			     e.g. migration_threshold
 policy name		 : Name of the policy
 #policy args		 : Number of policy arguments to follow (must be even)
-policy args		 : Key/value pairs
-			     e.g. sequential_threshold
+policy args		 : Key/value pairs e.g. sequential_threshold
+cache metadata mode      : ro if read-only, rw if read-write
+	In serious cases where even a read-only mode is deemed unsafe
+	no further I/O will be permitted and the status will just
+	contain the string 'Fail'.  The userspace recovery tools
+	should then be used.
 
 Messages
 --------
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index c1c010498a21..20cc36b01b77 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -39,6 +39,8 @@
 enum superblock_flag_bits {
 	/* for spotting crashes that would invalidate the dirty bitset */
 	CLEAN_SHUTDOWN,
+	/* metadata must be checked using the tools */
+	NEEDS_CHECK,
 };
 
 /*
@@ -107,6 +109,7 @@ struct dm_cache_metadata {
 	struct dm_disk_bitset discard_info;
 
 	struct rw_semaphore root_lock;
+	unsigned long flags;
 	dm_block_t root;
 	dm_block_t hint_root;
 	dm_block_t discard_root;
@@ -129,6 +132,14 @@ struct dm_cache_metadata {
 	 * buffer before the superblock is locked and updated.
 	 */
 	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
+
+	/*
+	 * Set if a transaction has to be aborted but the attempt to roll
+	 * back to the previous (good) transaction failed.  The only
+	 * metadata operation permissible in this state is the closing of
+	 * the device.
+	 */
+	bool fail_io:1;
 };
 
 /*-------------------------------------------------------------------
@@ -527,6 +538,7 @@ static unsigned long clear_clean_shutdown(unsigned long flags)
 static void read_superblock_fields(struct dm_cache_metadata *cmd,
 				   struct cache_disk_superblock *disk_super)
 {
+	cmd->flags = le32_to_cpu(disk_super->flags);
 	cmd->root = le64_to_cpu(disk_super->mapping_root);
 	cmd->hint_root = le64_to_cpu(disk_super->hint_root);
 	cmd->discard_root = le64_to_cpu(disk_super->discard_root);
@@ -625,6 +637,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
 	if (mutator)
 		update_flags(disk_super, mutator);
 
+	disk_super->flags = cpu_to_le32(cmd->flags);
 	disk_super->mapping_root = cpu_to_le64(cmd->root);
 	disk_super->hint_root = cpu_to_le64(cmd->hint_root);
 	disk_super->discard_root = cpu_to_le64(cmd->discard_root);
@@ -693,6 +706,7 @@ static struct dm_cache_metadata *metadata_open(struct block_device *bdev,
 	cmd->cache_blocks = 0;
 	cmd->policy_hint_size = policy_hint_size;
 	cmd->changed = true;
+	cmd->fail_io = false;
 
 	r = __create_persistent_data_objects(cmd, may_format_device);
 	if (r) {
@@ -796,7 +810,8 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
 		list_del(&cmd->list);
 		mutex_unlock(&table_lock);
 
-		__destroy_persistent_data_objects(cmd);
+		if (!cmd->fail_io)
+			__destroy_persistent_data_objects(cmd);
 		kfree(cmd);
 	}
 }
@@ -848,13 +863,26 @@ static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
 	return 0;
 }
 
+#define WRITE_LOCK(cmd) \
+	if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) \
+		return -EINVAL; \
+	down_write(&cmd->root_lock)
+
+#define WRITE_LOCK_VOID(cmd) \
+	if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) \
+		return; \
+	down_write(&cmd->root_lock)
+
+#define WRITE_UNLOCK(cmd) \
+	up_write(&cmd->root_lock)
+
 int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
 {
 	int r;
 	bool clean;
 	__le64 null_mapping = pack_value(0, 0);
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	__dm_bless_for_disk(&null_mapping);
 
 	if (from_cblock(new_cache_size) < from_cblock(cmd->cache_blocks)) {
@@ -880,7 +908,7 @@ int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
 	cmd->changed = true;
 
 out:
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 
 	return r;
 }
@@ -891,7 +919,7 @@ int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
 {
 	int r;
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	r = dm_bitset_resize(&cmd->discard_info,
 			     cmd->discard_root,
 			     from_dblock(cmd->discard_nr_blocks),
@@ -903,7 +931,7 @@ int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
 	}
 
 	cmd->changed = true;
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 
 	return r;
 }
@@ -946,9 +974,9 @@ int dm_cache_set_discard(struct dm_cache_metadata *cmd,
 {
 	int r;
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	r = __discard(cmd, dblock, discard);
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 
 	return r;
 }
@@ -1020,9 +1048,9 @@ int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock)
 {
 	int r;
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	r = __remove(cmd, cblock);
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 
 	return r;
 }
@@ -1048,9 +1076,9 @@ int dm_cache_insert_mapping(struct dm_cache_metadata *cmd,
 {
 	int r;
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	r = __insert(cmd, cblock, oblock);
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 
 	return r;
 }
@@ -1234,9 +1262,9 @@ int dm_cache_set_dirty(struct dm_cache_metadata *cmd,
 {
 	int r;
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	r = __dirty(cmd, cblock, dirty);
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 
 	return r;
 }
@@ -1252,9 +1280,9 @@ void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
 void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
 				 struct dm_cache_statistics *stats)
 {
-	down_write(&cmd->root_lock);
+	WRITE_LOCK_VOID(cmd);
 	cmd->stats = *stats;
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 }
 
 int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
@@ -1263,7 +1291,7 @@ int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
 	flags_mutator mutator = (clean_shutdown ? set_clean_shutdown :
 				 clear_clean_shutdown);
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	r = __commit_transaction(cmd, mutator);
 	if (r)
 		goto out;
@@ -1271,7 +1299,7 @@ int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
 	r = __begin_transaction(cmd);
 
 out:
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 	return r;
 }
 
@@ -1376,9 +1404,9 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *
 {
 	int r;
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	r = write_hints(cmd, policy);
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 
 	return r;
 }
@@ -1387,3 +1415,70 @@ int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result)
 {
 	return blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result);
 }
+
+void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd)
+{
+	WRITE_LOCK_VOID(cmd);
+	dm_bm_set_read_only(cmd->bm);
+	WRITE_UNLOCK(cmd);
+}
+
+void dm_cache_metadata_set_read_write(struct dm_cache_metadata *cmd)
+{
+	WRITE_LOCK_VOID(cmd);
+	dm_bm_set_read_write(cmd->bm);
+	WRITE_UNLOCK(cmd);
+}
+
+int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd)
+{
+	int r;
+	struct dm_block *sblock;
+	struct cache_disk_superblock *disk_super;
+
+	/*
+	 * We ignore fail_io for this function.
+	 */
+	down_write(&cmd->root_lock);
+	set_bit(NEEDS_CHECK, &cmd->flags);
+
+	r = superblock_lock(cmd, &sblock);
+	if (r) {
+		DMERR("couldn't read superblock");
+		goto out;
+	}
+
+	disk_super = dm_block_data(sblock);
+	disk_super->flags = cpu_to_le32(cmd->flags);
+
+	dm_bm_unlock(sblock);
+
+out:
+	up_write(&cmd->root_lock);
+	return r;
+}
+
+bool dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd)
+{
+	bool needs_check;
+
+	down_read(&cmd->root_lock);
+	needs_check = !!test_bit(NEEDS_CHECK, &cmd->flags);
+	up_read(&cmd->root_lock);
+
+	return needs_check;
+}
+
+int dm_cache_metadata_abort(struct dm_cache_metadata *cmd)
+{
+	int r;
+
+	WRITE_LOCK(cmd);
+	__destroy_persistent_data_objects(cmd);
+	r = __create_persistent_data_objects(cmd, false);
+	if (r)
+		cmd->fail_io = true;
+	WRITE_UNLOCK(cmd);
+
+	return r;
+}
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index 4ecc403be283..2ffee21f318d 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -102,6 +102,10 @@ struct dm_cache_statistics {
 
 void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
 				 struct dm_cache_statistics *stats);
+
+/*
+ * 'void' because it's no big deal if it fails.
+ */
 void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
 				 struct dm_cache_statistics *stats);
 
@@ -133,6 +137,12 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *
  */
 int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result);
 
+bool dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd);
+int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd);
+void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd);
+void dm_cache_metadata_set_read_write(struct dm_cache_metadata *cmd);
+int dm_cache_metadata_abort(struct dm_cache_metadata *cmd);
+
 /*----------------------------------------------------------------*/
 
 #endif /* DM_CACHE_METADATA_H */
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index 9dc05a52369e..ccbe852d5362 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -89,13 +89,15 @@ static inline void policy_tick(struct dm_cache_policy *p)
 		return p->tick(p);
 }
 
-static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
+static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result,
+					    unsigned maxlen, ssize_t *sz_ptr)
 {
-	ssize_t sz = 0;
+	ssize_t sz = *sz_ptr;
 	if (p->emit_config_values)
-		return p->emit_config_values(p, result, maxlen);
+		return p->emit_config_values(p, result, maxlen, sz_ptr);
 
-	DMEMIT("0");
+	DMEMIT("0 ");
+	*sz_ptr = sz;
 	return 0;
 }
 
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 7cbae125879c..084eec653321 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -1323,22 +1323,24 @@ static int mq_set_config_value(struct dm_cache_policy *p,
 	return 0;
 }
 
-static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
+static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
+				 unsigned maxlen, ssize_t *sz_ptr)
 {
-	ssize_t sz = 0;
+	ssize_t sz = *sz_ptr;
 	struct mq_policy *mq = to_mq_policy(p);
 
 	DMEMIT("10 random_threshold %u "
 	       "sequential_threshold %u "
 	       "discard_promote_adjustment %u "
 	       "read_promote_adjustment %u "
-	       "write_promote_adjustment %u",
+	       "write_promote_adjustment %u ",
 	       mq->tracker.thresholds[PATTERN_RANDOM],
 	       mq->tracker.thresholds[PATTERN_SEQUENTIAL],
 	       mq->discard_promote_adjustment,
 	       mq->read_promote_adjustment,
 	       mq->write_promote_adjustment);
 
+	*sz_ptr = sz;
 	return 0;
 }
 
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index 6106ca3aa350..74709129d856 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -208,8 +208,8 @@ struct dm_cache_policy {
 	/*
 	 * Configuration.
 	 */
-	int (*emit_config_values)(struct dm_cache_policy *p,
-				  char *result, unsigned maxlen);
+	int (*emit_config_values)(struct dm_cache_policy *p, char *result,
+				  unsigned maxlen, ssize_t *sz_ptr);
 	int (*set_config_value)(struct dm_cache_policy *p,
 				const char *key, const char *value);
 
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 6d36ed3c46a0..dae0321ebfa9 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -150,12 +150,10 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
 
-/*
- * FIXME: the cache is read/write for the time being.
- */
 enum cache_metadata_mode {
 	CM_WRITE,		/* metadata may be changed */
 	CM_READ_ONLY,		/* metadata may not be changed */
+	CM_FAIL
 };
 
 enum cache_io_mode {
@@ -385,6 +383,8 @@ struct prealloc {
 	struct dm_bio_prison_cell *cell2;
 };
 
+static enum cache_metadata_mode get_cache_mode(struct cache *cache);
+
 static void wake_worker(struct cache *cache)
 {
 	queue_work(cache->wq, &cache->worker);
@@ -699,6 +699,9 @@ static void save_stats(struct cache *cache)
 {
 	struct dm_cache_statistics stats;
 
+	if (get_cache_mode(cache) >= CM_READ_ONLY)
+		return;
+
 	stats.read_hits = atomic_read(&cache->stats.read_hit);
 	stats.read_misses = atomic_read(&cache->stats.read_miss);
 	stats.write_hits = atomic_read(&cache->stats.write_hit);
@@ -957,6 +960,84 @@ static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
 	remap_to_origin_clear_discard(pb->cache, bio, oblock);
 }
 
+/*----------------------------------------------------------------
+ * Failure modes
+ *--------------------------------------------------------------*/
+static enum cache_metadata_mode get_cache_mode(struct cache *cache)
+{
+	return cache->features.mode;
+}
+
+static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
+{
+	const char *descs[] = {
+		"write",
+		"read-only",
+		"fail"
+	};
+
+	dm_table_event(cache->ti->table);
+	DMINFO("switching cache to %s mode", descs[(int)mode]);
+}
+
+static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
+{
+	bool needs_check = dm_cache_metadata_needs_check(cache->cmd);
+	enum cache_metadata_mode old_mode = get_cache_mode(cache);
+
+	if (new_mode == CM_WRITE && needs_check) {
+		DMERR("unable to switch cache to write mode until repaired.");
+		if (old_mode != new_mode)
+			new_mode = old_mode;
+		else
+			new_mode = CM_READ_ONLY;
+	}
+
+	/* Never move out of fail mode */
+	if (old_mode == CM_FAIL)
+		new_mode = CM_FAIL;
+
+	switch (new_mode) {
+	case CM_FAIL:
+	case CM_READ_ONLY:
+		dm_cache_metadata_set_read_only(cache->cmd);
+		break;
+
+	case CM_WRITE:
+		dm_cache_metadata_set_read_write(cache->cmd);
+		break;
+	}
+
+	cache->features.mode = new_mode;
+
+	if (new_mode != old_mode)
+		notify_mode_switch(cache, new_mode);
+}
+
+static void abort_transaction(struct cache *cache)
+{
+	if (get_cache_mode(cache) >= CM_READ_ONLY)
+		return;
+
+	if (dm_cache_metadata_set_needs_check(cache->cmd)) {
+		DMERR("failed to set 'needs_check' flag in metadata");
+		set_cache_mode(cache, CM_FAIL);
+	}
+
+	DMERR_LIMIT("aborting current metadata transaction");
+	if (dm_cache_metadata_abort(cache->cmd)) {
+		DMERR("failed to abort metadata transaction");
+		set_cache_mode(cache, CM_FAIL);
+	}
+}
+
+static void metadata_operation_failed(struct cache *cache, const char *op, int r)
+{
+	DMERR_LIMIT("metadata operation '%s' failed: error = %d", op, r);
+	abort_transaction(cache);
+	set_cache_mode(cache, CM_READ_ONLY);
+}
+
 /*----------------------------------------------------------------
  * Migration processing
  *
@@ -1063,6 +1144,7 @@ static void migration_failure(struct dm_cache_migration *mg)
 
 static void migration_success_pre_commit(struct dm_cache_migration *mg)
 {
+	int r;
 	unsigned long flags;
 	struct cache *cache = mg->cache;
 
@@ -1073,8 +1155,10 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg)
 		return;
 
 	} else if (mg->demote) {
-		if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
+		r = dm_cache_remove_mapping(cache->cmd, mg->cblock);
+		if (r) {
 			DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
+			metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
 			policy_force_mapping(cache->policy, mg->new_oblock,
 					     mg->old_oblock);
 			if (mg->promote)
@@ -1083,8 +1167,10 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg)
 			return;
 		}
 	} else {
-		if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
+		r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock);
+		if (r) {
 			DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
+			metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
 			policy_remove_mapping(cache->policy, mg->new_oblock);
 			free_io_migration(mg);
 			return;
@@ -1812,15 +1898,32 @@ static int need_commit_due_to_time(struct cache *cache)
 	       jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
 }
 
+/*
+ * A non-zero return indicates read_only or fail_io mode.
+ */
+static int commit(struct cache *cache, bool clean_shutdown)
+{
+	int r;
+
+	if (get_cache_mode(cache) >= CM_READ_ONLY)
+		return -EINVAL;
+
+	atomic_inc(&cache->stats.commit_count);
+	r = dm_cache_commit(cache->cmd, clean_shutdown);
+	if (r)
+		metadata_operation_failed(cache, "dm_cache_commit", r);
+
+	return r;
+}
+
 static int commit_if_needed(struct cache *cache)
 {
 	int r = 0;
 
 	if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
 	    dm_cache_changed_this_transaction(cache->cmd)) {
-		atomic_inc(&cache->stats.commit_count);
+		r = commit(cache, false);
 		cache->commit_requested = false;
-		r = dm_cache_commit(cache->cmd, false);
 		cache->last_commit_jiffies = jiffies;
 	}
 
@@ -1988,8 +2091,10 @@ static void process_invalidation_request(struct cache *cache, struct invalidatio
 		r = policy_remove_cblock(cache->policy, to_cblock(begin));
 		if (!r) {
 			r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
-			if (r)
+			if (r) {
+				metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
 				break;
+			}
 
 		} else if (r == -ENODATA) {
 			/* harmless, already unmapped */
@@ -2133,12 +2238,6 @@ static void do_worker(struct work_struct *ws)
 		if (commit_if_needed(cache)) {
 			process_deferred_flush_bios(cache, false);
 			process_migrations(cache, &cache->need_commit_migrations, migration_failure);
-
-			/*
-			 * FIXME: rollback metadata or just go into a
-			 * failure mode and error everything
-			 */
-
 		} else {
 			process_deferred_flush_bios(cache, true);
 			process_migrations(cache, &cache->need_commit_migrations,
@@ -2711,6 +2810,12 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 		goto bad;
 	}
 	cache->cmd = cmd;
+	set_cache_mode(cache, CM_WRITE);
+	if (get_cache_mode(cache) != CM_WRITE) {
+		*error = "Unable to get write access to metadata, please check/repair metadata.";
+		r = -EINVAL;
+		goto bad;
+	}
 
 	if (passthrough_mode(&cache->features)) {
 		bool all_clean;
@@ -3043,11 +3148,16 @@ static int write_dirty_bitset(struct cache *cache)
 {
 	unsigned i, r;
 
+	if (get_cache_mode(cache) >= CM_READ_ONLY)
+		return -EINVAL;
+
 	for (i = 0; i < from_cblock(cache->cache_size); i++) {
 		r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
 				       is_dirty(cache, to_cblock(i)));
-		if (r)
+		if (r) {
+			metadata_operation_failed(cache, "dm_cache_set_dirty", r);
 			return r;
+		}
 	}
 
 	return 0;
@@ -3057,18 +3167,40 @@ static int write_discard_bitset(struct cache *cache)
 {
 	unsigned i, r;
 
+	if (get_cache_mode(cache) >= CM_READ_ONLY)
+		return -EINVAL;
+
 	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
 					   cache->discard_nr_blocks);
 	if (r) {
 		DMERR("could not resize on-disk discard bitset");
+		metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r);
 		return r;
 	}
 
 	for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
 		r = dm_cache_set_discard(cache->cmd, to_dblock(i),
 					 is_discarded(cache, to_dblock(i)));
-		if (r)
+		if (r) {
+			metadata_operation_failed(cache, "dm_cache_set_discard", r);
 			return r;
+		}
+	}
+
+	return 0;
+}
+
+static int write_hints(struct cache *cache)
+{
+	int r;
+
+	if (get_cache_mode(cache) >= CM_READ_ONLY)
+		return -EINVAL;
+
+	r = dm_cache_write_hints(cache->cmd, cache->policy);
+	if (r) {
+		metadata_operation_failed(cache, "dm_cache_write_hints", r);
+		return r;
 	}
 
 	return 0;
@@ -3091,7 +3223,7 @@ static bool sync_metadata(struct cache *cache)
 
 	save_stats(cache);
 
-	r3 = dm_cache_write_hints(cache->cmd, cache->policy);
+	r3 = write_hints(cache);
 	if (r3)
 		DMERR("could not write hints");
 
@@ -3100,9 +3232,9 @@ static bool sync_metadata(struct cache *cache)
 	 * set the clean shutdown flag.  This will effectively force every
 	 * dirty bit to be set on reload.
 	 */
-	r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
+	r4 = commit(cache, !r1 && !r2 && !r3);
 	if (r4)
-		DMERR("could not write cache metadata.  Data loss may occur.");
+		DMERR("could not write cache metadata.");
 
 	return !r1 && !r2 && !r3 && !r4;
 }
@@ -3118,7 +3250,8 @@ static void cache_postsuspend(struct dm_target *ti)
 	requeue_deferred_cells(cache);
 	stop_quiescing(cache);
 
-	(void) sync_metadata(cache);
+	if (get_cache_mode(cache) == CM_WRITE)
+		(void) sync_metadata(cache);
 }
 
 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
@@ -3257,6 +3390,7 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
 	r = dm_cache_resize(cache->cmd, new_size);
 	if (r) {
 		DMERR("could not resize cache metadata");
+		metadata_operation_failed(cache, "dm_cache_resize", r);
 		return r;
 	}
 
@@ -3295,6 +3429,7 @@ static int cache_preresume(struct dm_target *ti)
 					   load_mapping, cache);
 		if (r) {
 			DMERR("could not load cache mappings");
+			metadata_operation_failed(cache, "dm_cache_load_mappings", r);
 			return r;
 		}
 
@@ -3315,6 +3450,7 @@ static int cache_preresume(struct dm_target *ti)
 		r = dm_cache_load_discards(cache->cmd, load_discard, &li);
 		if (r) {
 			DMERR("could not load origin discards");
+			metadata_operation_failed(cache, "dm_cache_load_discards", r);
 			return r;
 		}
 		set_discard_range(&li);
@@ -3342,7 +3478,7 @@ static void cache_resume(struct dm_target *ti)
  * <#demotions> <#promotions> <#dirty>
  * <#features> <features>*
  * <#core args> <core args>
- * <policy name> <#policy args> <policy args>*
+ * <policy name> <#policy args> <policy args>* <cache metadata mode>
  */
 static void cache_status(struct dm_target *ti, status_type_t type,
 			 unsigned status_flags, char *result, unsigned maxlen)
@@ -3358,13 +3494,15 @@ static void cache_status(struct dm_target *ti, status_type_t type,
 
 	switch (type) {
 	case STATUSTYPE_INFO:
-		/* Commit to ensure statistics aren't out-of-date */
-		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
-			r = dm_cache_commit(cache->cmd, false);
-			if (r)
-				DMERR("could not commit metadata for accurate status");
+		if (get_cache_mode(cache) == CM_FAIL) {
+			DMEMIT("Fail");
+			break;
 		}
 
+		/* Commit to ensure statistics aren't out-of-date */
+		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
+			(void) commit(cache, false);
+
 		r = dm_cache_get_free_metadata_block_count(cache->cmd,
 							   &nr_free_blocks_metadata);
 		if (r) {
@@ -3413,11 +3551,16 @@ static void cache_status(struct dm_target *ti, status_type_t type,
 
 		DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
 		if (sz < maxlen) {
-			r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
+			r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
 			if (r)
 				DMERR("policy_emit_config_values returned %d", r);
 		}
 
+		if (get_cache_mode(cache) == CM_READ_ONLY)
+			DMEMIT("ro ");
+		else
+			DMEMIT("rw ");
+
 		break;
 
 	case STATUSTYPE_TABLE:
@@ -3573,6 +3716,11 @@ static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
 	if (!argc)
 		return -EINVAL;
 
+	if (get_cache_mode(cache) >= CM_READ_ONLY) {
+		DMERR("unable to service cache target messages in READ_ONLY or FAIL mode");
+		return -EOPNOTSUPP;
+	}
+
 	if (!strcasecmp(argv[0], "invalidate_cblocks"))
 		return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
 
@@ -3646,7 +3794,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 static struct target_type cache_target = {
 	.name = "cache",
-	.version = {1, 6, 0},
+	.version = {1, 7, 0},
 	.module = THIS_MODULE,
 	.ctr = cache_ctr,
 	.dtr = cache_dtr,

From b61d9509628fea995196a96b4c1713fa67dade88 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 22 Apr 2015 17:25:56 -0400
Subject: [PATCH 25/39] dm cache: prefix all DMERR and DMINFO messages with
 cache device name

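Having the DM device name associated with the ERR or INFO message is
very helpful.

While at it, raise the migration failure messages from DMWARN_LIMIT to
DMERR_LIMIT and rate-limit the "writeback unexpectedly triggered commit"
warning.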

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-target.c | 102 ++++++++++++++++++++++-------------
 1 file changed, 64 insertions(+), 38 deletions(-)

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index dae0321ebfa9..5aad875b822c 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -968,6 +968,11 @@ static enum cache_metadata_mode get_cache_mode(struct cache *cache)
 	return cache->features.mode;
 }
 
+static const char *cache_device_name(struct cache *cache)
+{
+	return dm_device_name(dm_table_get_md(cache->ti->table));
+}
+
 static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
 {
 	const char *descs[] = {
@@ -977,7 +982,8 @@ static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mod
 	};
 
 	dm_table_event(cache->ti->table);
-	DMINFO("switching cache to %s mode", descs[(int)mode]);
+	DMINFO("%s: switching cache to %s mode",
+	       cache_device_name(cache), descs[(int)mode]);
 }
 
 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
@@ -986,7 +992,8 @@ static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mod
 	enum cache_metadata_mode old_mode = get_cache_mode(cache);
 
 	if (new_mode == CM_WRITE && needs_check) {
-		DMERR("unable to switch cache to write mode until repaired.");
+		DMERR("%s: unable to switch cache to write mode until repaired.",
+		      cache_device_name(cache));
 		if (old_mode != new_mode)
 			new_mode = old_mode;
 		else
@@ -1016,24 +1023,27 @@ static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mod
 
 static void abort_transaction(struct cache *cache)
 {
+	const char *dev_name = cache_device_name(cache);
+
 	if (get_cache_mode(cache) >= CM_READ_ONLY)
 		return;
 
 	if (dm_cache_metadata_set_needs_check(cache->cmd)) {
-		DMERR("failed to set 'needs_check' flag in metadata");
+		DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
 		set_cache_mode(cache, CM_FAIL);
 	}
 
-	DMERR_LIMIT("aborting current metadata transaction");
+	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
 	if (dm_cache_metadata_abort(cache->cmd)) {
-		DMERR("failed to abort metadata transaction");
+		DMERR("%s: failed to abort metadata transaction", dev_name);
 		set_cache_mode(cache, CM_FAIL);
 	}
 }
 
 static void metadata_operation_failed(struct cache *cache, const char *op, int r)
 {
-	DMERR_LIMIT("metadata operation '%s' failed: error = %d", op, r);
+	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
+		    cache_device_name(cache), op, r);
 	abort_transaction(cache);
 	set_cache_mode(cache, CM_READ_ONLY);
 }
@@ -1120,21 +1130,22 @@ static void free_io_migration(struct dm_cache_migration *mg)
 static void migration_failure(struct dm_cache_migration *mg)
 {
 	struct cache *cache = mg->cache;
+	const char *dev_name = cache_device_name(cache);
 
 	if (mg->writeback) {
-		DMWARN_LIMIT("writeback failed; couldn't copy block");
+		DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name);
 		set_dirty(cache, mg->old_oblock, mg->cblock);
 		cell_defer(cache, mg->old_ocell, false);
 
 	} else if (mg->demote) {
-		DMWARN_LIMIT("demotion failed; couldn't copy block");
+		DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name);
 		policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
 
 		cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
 		if (mg->promote)
 			cell_defer(cache, mg->new_ocell, true);
 	} else {
-		DMWARN_LIMIT("promotion failed; couldn't copy block");
+		DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name);
 		policy_remove_mapping(cache->policy, mg->new_oblock);
 		cell_defer(cache, mg->new_ocell, true);
 	}
@@ -1157,7 +1168,8 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg)
 	} else if (mg->demote) {
 		r = dm_cache_remove_mapping(cache->cmd, mg->cblock);
 		if (r) {
-			DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
+			DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata",
+				    cache_device_name(cache));
 			metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
 			policy_force_mapping(cache->policy, mg->new_oblock,
 					     mg->old_oblock);
@@ -1169,7 +1181,8 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg)
 	} else {
 		r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock);
 		if (r) {
-			DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
+			DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata",
+				    cache_device_name(cache));
 			metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
 			policy_remove_mapping(cache->policy, mg->new_oblock);
 			free_io_migration(mg);
@@ -1189,7 +1202,8 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
 	struct cache *cache = mg->cache;
 
 	if (mg->writeback) {
-		DMWARN("writeback unexpectedly triggered commit");
+		DMWARN_LIMIT("%s: writeback unexpectedly triggered commit",
+			     cache_device_name(cache));
 		return;
 
 	} else if (mg->demote) {
@@ -1265,7 +1279,7 @@ static void issue_copy(struct dm_cache_migration *mg)
 	}
 
 	if (r < 0) {
-		DMERR_LIMIT("issuing migration failed");
+		DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache));
 		migration_failure(mg);
 	}
 }
@@ -1863,7 +1877,8 @@ static void process_cell(struct cache *cache, struct prealloc *structs,
 		break;
 
 	default:
-		DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
+		DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u",
+			    cache_device_name(cache), __func__,
 			    (unsigned) lookup_result.op);
 		bio_io_error(bio);
 	}
@@ -2101,7 +2116,7 @@ static void process_invalidation_request(struct cache *cache, struct invalidatio
 			r = 0;
 
 		} else {
-			DMERR("policy_remove_cblock failed");
+			DMERR("%s: policy_remove_cblock failed", cache_device_name(cache));
 			break;
 		}
 
@@ -3054,7 +3069,8 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 		return DM_MAPIO_SUBMITTED;
 
 	} else if (r) {
-		DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
+		DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d",
+			    cache_device_name(cache), r);
 		cell_defer(cache, cell, false);
 		bio_io_error(bio);
 		return DM_MAPIO_SUBMITTED;
@@ -3113,7 +3129,8 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 		break;
 
 	default:
-		DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
+		DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u",
+			    cache_device_name(cache), __func__,
 			    (unsigned) lookup_result.op);
 		cell_defer(cache, cell, false);
 		bio_io_error(bio);
@@ -3173,7 +3190,7 @@ static int write_discard_bitset(struct cache *cache)
 	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
 					   cache->discard_nr_blocks);
 	if (r) {
-		DMERR("could not resize on-disk discard bitset");
+		DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache));
 		metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r);
 		return r;
 	}
@@ -3215,17 +3232,17 @@ static bool sync_metadata(struct cache *cache)
 
 	r1 = write_dirty_bitset(cache);
 	if (r1)
-		DMERR("could not write dirty bitset");
+		DMERR("%s: could not write dirty bitset", cache_device_name(cache));
 
 	r2 = write_discard_bitset(cache);
 	if (r2)
-		DMERR("could not write discard bitset");
+		DMERR("%s: could not write discard bitset", cache_device_name(cache));
 
 	save_stats(cache);
 
 	r3 = write_hints(cache);
 	if (r3)
-		DMERR("could not write hints");
+		DMERR("%s: could not write hints", cache_device_name(cache));
 
 	/*
 	 * If writing the above metadata failed, we still commit, but don't
@@ -3234,7 +3251,7 @@ static bool sync_metadata(struct cache *cache)
 	 */
 	r4 = commit(cache, !r1 && !r2 && !r3);
 	if (r4)
-		DMERR("could not write cache metadata.");
+		DMERR("%s: could not write cache metadata", cache_device_name(cache));
 
 	return !r1 && !r2 && !r3 && !r4;
 }
@@ -3374,7 +3391,8 @@ static bool can_resize(struct cache *cache, dm_cblock_t new_size)
 	while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
 		new_size = to_cblock(from_cblock(new_size) + 1);
 		if (is_dirty(cache, new_size)) {
-			DMERR("unable to shrink cache; cache block %llu is dirty",
+			DMERR("%s: unable to shrink cache; cache block %llu is dirty",
+			      cache_device_name(cache),
 			      (unsigned long long) from_cblock(new_size));
 			return false;
 		}
@@ -3389,7 +3407,7 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
 
 	r = dm_cache_resize(cache->cmd, new_size);
 	if (r) {
-		DMERR("could not resize cache metadata");
+		DMERR("%s: could not resize cache metadata", cache_device_name(cache));
 		metadata_operation_failed(cache, "dm_cache_resize", r);
 		return r;
 	}
@@ -3428,7 +3446,7 @@ static int cache_preresume(struct dm_target *ti)
 		r = dm_cache_load_mappings(cache->cmd, cache->policy,
 					   load_mapping, cache);
 		if (r) {
-			DMERR("could not load cache mappings");
+			DMERR("%s: could not load cache mappings", cache_device_name(cache));
 			metadata_operation_failed(cache, "dm_cache_load_mappings", r);
 			return r;
 		}
@@ -3449,7 +3467,7 @@ static int cache_preresume(struct dm_target *ti)
 		discard_load_info_init(cache, &li);
 		r = dm_cache_load_discards(cache->cmd, load_discard, &li);
 		if (r) {
-			DMERR("could not load origin discards");
+			DMERR("%s: could not load origin discards", cache_device_name(cache));
 			metadata_operation_failed(cache, "dm_cache_load_discards", r);
 			return r;
 		}
@@ -3503,16 +3521,17 @@ static void cache_status(struct dm_target *ti, status_type_t type,
 		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
 			(void) commit(cache, false);
 
-		r = dm_cache_get_free_metadata_block_count(cache->cmd,
-							   &nr_free_blocks_metadata);
+		r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata);
 		if (r) {
-			DMERR("could not get metadata free block count");
+			DMERR("%s: dm_cache_get_free_metadata_block_count returned %d",
+			      cache_device_name(cache), r);
 			goto err;
 		}
 
 		r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
 		if (r) {
-			DMERR("could not get metadata device size");
+			DMERR("%s: dm_cache_get_metadata_dev_size returned %d",
+			      cache_device_name(cache), r);
 			goto err;
 		}
 
@@ -3543,7 +3562,8 @@ static void cache_status(struct dm_target *ti, status_type_t type,
 			DMEMIT("1 writeback ");
 
 		else {
-			DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode);
+			DMERR("%s: internal error: unknown io mode: %d",
+			      cache_device_name(cache), (int) cache->features.io_mode);
 			goto err;
 		}
 
@@ -3553,7 +3573,8 @@ static void cache_status(struct dm_target *ti, status_type_t type,
 		if (sz < maxlen) {
 			r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
 			if (r)
-				DMERR("policy_emit_config_values returned %d", r);
+				DMERR("%s: policy_emit_config_values returned %d",
+				      cache_device_name(cache), r);
 		}
 
 		if (get_cache_mode(cache) == CM_READ_ONLY)
@@ -3622,7 +3643,7 @@ static int parse_cblock_range(struct cache *cache, const char *str,
 		return 0;
 	}
 
-	DMERR("invalid cblock range '%s'", str);
+	DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
 	return -EINVAL;
 }
 
@@ -3633,17 +3654,20 @@ static int validate_cblock_range(struct cache *cache, struct cblock_range *range
 	uint64_t n = from_cblock(cache->cache_size);
 
 	if (b >= n) {
-		DMERR("begin cblock out of range: %llu >= %llu", b, n);
+		DMERR("%s: begin cblock out of range: %llu >= %llu",
+		      cache_device_name(cache), b, n);
 		return -EINVAL;
 	}
 
 	if (e > n) {
-		DMERR("end cblock out of range: %llu > %llu", e, n);
+		DMERR("%s: end cblock out of range: %llu > %llu",
+		      cache_device_name(cache), e, n);
 		return -EINVAL;
 	}
 
 	if (b >= e) {
-		DMERR("invalid cblock range: %llu >= %llu", b, e);
+		DMERR("%s: invalid cblock range: %llu >= %llu",
+		      cache_device_name(cache), b, e);
 		return -EINVAL;
 	}
 
@@ -3677,7 +3701,8 @@ static int process_invalidate_cblocks_message(struct cache *cache, unsigned coun
 	struct cblock_range range;
 
 	if (!passthrough_mode(&cache->features)) {
-		DMERR("cache has to be in passthrough mode for invalidation");
+		DMERR("%s: cache has to be in passthrough mode for invalidation",
+		      cache_device_name(cache));
 		return -EPERM;
 	}
 
@@ -3717,7 +3742,8 @@ static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
 		return -EINVAL;
 
 	if (get_cache_mode(cache) >= CM_READ_ONLY) {
-		DMERR("unable to service cache target messages in READ_ONLY or FAIL mode");
+		DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
+		      cache_device_name(cache));
 		return -EOPNOTSUPP;
 	}
 

From fba10109a45d864bab98ae90dd63bcc2789352b3 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 29 May 2015 10:20:56 +0100
Subject: [PATCH 26/39] dm cache: age and write back cache entries even without
 active IO

The policy tick() method is normally called from interrupt context.
Both the mq and smq policies do some bottom half work for the tick
method in their map functions.  However, if no IO is going through the
cache, then that bottom half work doesn't occur.  With these policies,
this means recently hit entries do not age and do not get written
back as early as we'd like.

Fix this by introducing a new 'can_block' parameter to the tick()
method.  When this is set the bottom half work occurs immediately.
'can_block' is set when the tick method is called every second by the
core target (not in interrupt context).

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-policy-internal.h | 4 ++--
 drivers/md/dm-cache-policy-mq.c       | 8 +++++++-
 drivers/md/dm-cache-policy-smq.c      | 8 +++++++-
 drivers/md/dm-cache-policy.h          | 4 ++--
 drivers/md/dm-cache-target.c          | 4 ++--
 5 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index ccbe852d5362..2816018faa7f 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -83,10 +83,10 @@ static inline dm_cblock_t policy_residency(struct dm_cache_policy *p)
 	return p->residency(p);
 }
 
-static inline void policy_tick(struct dm_cache_policy *p)
+static inline void policy_tick(struct dm_cache_policy *p, bool can_block)
 {
 	if (p->tick)
-		return p->tick(p);
+		return p->tick(p, can_block);
 }
 
 static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result,
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 084eec653321..838665bb495a 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -1283,7 +1283,7 @@ static dm_cblock_t mq_residency(struct dm_cache_policy *p)
 	return r;
 }
 
-static void mq_tick(struct dm_cache_policy *p)
+static void mq_tick(struct dm_cache_policy *p, bool can_block)
 {
 	struct mq_policy *mq = to_mq_policy(p);
 	unsigned long flags;
@@ -1291,6 +1291,12 @@ static void mq_tick(struct dm_cache_policy *p)
 	spin_lock_irqsave(&mq->tick_lock, flags);
 	mq->tick_protected++;
 	spin_unlock_irqrestore(&mq->tick_lock, flags);
+
+	if (can_block) {
+		mutex_lock(&mq->lock);
+		copy_tick(mq);
+		mutex_unlock(&mq->lock);
+	}
 }
 
 static int mq_set_config_value(struct dm_cache_policy *p,
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index 55a657f78f00..66feb307e697 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -1581,7 +1581,7 @@ static dm_cblock_t smq_residency(struct dm_cache_policy *p)
 	return r;
 }
 
-static void smq_tick(struct dm_cache_policy *p)
+static void smq_tick(struct dm_cache_policy *p, bool can_block)
 {
 	struct smq_policy *mq = to_smq_policy(p);
 	unsigned long flags;
@@ -1589,6 +1589,12 @@ static void smq_tick(struct dm_cache_policy *p)
 	spin_lock_irqsave(&mq->tick_lock, flags);
 	mq->tick_protected++;
 	spin_unlock_irqrestore(&mq->tick_lock, flags);
+
+	if (can_block) {
+		mutex_lock(&mq->lock);
+		copy_tick(mq);
+		mutex_unlock(&mq->lock);
+	}
 }
 
 /* Init the policy plugin interface function pointers. */
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index 74709129d856..05db56eedb6a 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -200,10 +200,10 @@ struct dm_cache_policy {
 	 * Because of where we sit in the block layer, we can be asked to
 	 * map a lot of little bios that are all in the same block (no
 	 * queue merging has occurred).  To stop the policy being fooled by
-	 * these the core target sends regular tick() calls to the policy.
+	 * these, the core target sends regular tick() calls to the policy.
 	 * The policy should only count an entry as hit once per tick.
 	 */
-	void (*tick)(struct dm_cache_policy *p);
+	void (*tick)(struct dm_cache_policy *p, bool can_block);
 
 	/*
 	 * Configuration.
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 5aad875b822c..1b4e1756b169 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2271,7 +2271,7 @@ static void do_worker(struct work_struct *ws)
 static void do_waker(struct work_struct *ws)
 {
 	struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
-	policy_tick(cache->policy);
+	policy_tick(cache->policy, true);
 	wake_worker(cache);
 	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
 }
@@ -3148,7 +3148,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
 	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 
 	if (pb->tick) {
-		policy_tick(cache->policy);
+		policy_tick(cache->policy, false);
 
 		spin_lock_irqsave(&cache->lock, flags);
 		cache->need_tick_bio = true;

From 0f24b79b52730e15d9e3386ce27da2110eb4597d Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@kernel.org>
Date: Fri, 15 May 2015 21:35:21 +0300
Subject: [PATCH 27/39] dm stats: Use kvfree() in dm_kvfree()

Use kvfree() instead of open-coding it.

Signed-off-by: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-stats.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index f478a4c96d2f..492fe6a5ebf2 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -160,10 +160,7 @@ static void dm_kvfree(void *ptr, size_t alloc_size)
 
 	free_shared_memory(alloc_size);
 
-	if (is_vmalloc_addr(ptr))
-		vfree(ptr);
-	else
-		kfree(ptr);
+	kvfree(ptr);
 }
 
 static void dm_stat_free(struct rcu_head *head)

From 4ec331c3ea7ec94f28aa1c62a279cfa1cfe3c91b Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Mon, 13 Apr 2015 09:41:44 +0100
Subject: [PATCH 28/39] dm btree: add dm_btree_remove_leaves()

Removes a range of leaf values from the tree.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/persistent-data/dm-btree-remove.c | 127 +++++++++++++++++++
 drivers/md/persistent-data/dm-btree.h        |   9 ++
 2 files changed, 136 insertions(+)

diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index b88757cd0d1d..e04cfd2d60ef 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -590,3 +590,130 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
 	return r;
 }
 EXPORT_SYMBOL_GPL(dm_btree_remove);
+
+/*----------------------------------------------------------------*/
+
+static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info,
+			  struct dm_btree_value_type *vt, dm_block_t root,
+			  uint64_t key, int *index)
+{
+	int i = *index, r;
+	struct btree_node *n;
+
+	for (;;) {
+		r = shadow_step(s, root, vt);
+		if (r < 0)
+			break;
+
+		/*
+		 * We have to patch up the parent node, ugly, but I don't
+		 * see a way to do this automatically as part of the spine
+		 * op.
+		 */
+		if (shadow_has_parent(s)) {
+			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
+			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
+			       &location, sizeof(__le64));
+		}
+
+		n = dm_block_data(shadow_current(s));
+
+		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
+			*index = lower_bound(n, key);
+			return 0;
+		}
+
+		r = rebalance_children(s, info, vt, key);
+		if (r)
+			break;
+
+		n = dm_block_data(shadow_current(s));
+		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
+			*index = lower_bound(n, key);
+			return 0;
+		}
+
+		i = lower_bound(n, key);
+
+		/*
+		 * We know the key is present, or else
+		 * rebalance_children would have returned
+		 * -ENODATA
+		 */
+		root = value64(n, i);
+	}
+
+	return r;
+}
+
+static int remove_one(struct dm_btree_info *info, dm_block_t root,
+		      uint64_t *keys, uint64_t end_key,
+		      dm_block_t *new_root, unsigned *nr_removed)
+{
+	unsigned level, last_level = info->levels - 1;
+	int index = 0, r = 0;
+	struct shadow_spine spine;
+	struct btree_node *n;
+	uint64_t k;
+
+	init_shadow_spine(&spine, info);
+	for (level = 0; level < last_level; level++) {
+		r = remove_raw(&spine, info, &le64_type,
+			       root, keys[level], (unsigned *) &index);
+		if (r < 0)
+			goto out;
+
+		n = dm_block_data(shadow_current(&spine));
+		root = value64(n, index);
+	}
+
+	r = remove_nearest(&spine, info, &info->value_type,
+			   root, keys[last_level], &index);
+	if (r < 0)
+		goto out;
+
+	n = dm_block_data(shadow_current(&spine));
+
+	if (index < 0)
+		index = 0;
+
+	if (index >= le32_to_cpu(n->header.nr_entries)) {
+		r = -ENODATA;
+		goto out;
+	}
+
+	k = le64_to_cpu(n->keys[index]);
+	if (k >= keys[last_level] && k < end_key) {
+		if (info->value_type.dec)
+			info->value_type.dec(info->value_type.context,
+					     value_ptr(n, index));
+
+		delete_at(n, index);
+
+	} else
+		r = -ENODATA;
+
+out:
+	*new_root = shadow_root(&spine);
+	exit_shadow_spine(&spine);
+
+	return r;
+}
+
+int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
+			   uint64_t *first_key, uint64_t end_key,
+			   dm_block_t *new_root, unsigned *nr_removed)
+{
+	int r;
+
+	*nr_removed = 0;
+	do {
+		r = remove_one(info, root, first_key, end_key, &root, nr_removed);
+		if (!r)
+			(*nr_removed)++;
+	} while (!r);
+
+	*new_root = root;
+	return r == -ENODATA ? 0 : r;
+}
+EXPORT_SYMBOL_GPL(dm_btree_remove_leaves);
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
index dacfc34180b4..11d8cf78621d 100644
--- a/drivers/md/persistent-data/dm-btree.h
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -134,6 +134,15 @@ int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
 int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
 		    uint64_t *keys, dm_block_t *new_root);
 
+/*
+ * Removes values between 'keys' and keys2, where keys2 is keys with the
+ * final key replaced with 'end_key'.  'end_key' is the one-past-the-end
+ * value.  'keys' may be altered.
+ */
+int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
+			   uint64_t *keys, uint64_t end_key,
+			   dm_block_t *new_root, unsigned *nr_removed);
+
 /*
  * Returns < 0 on failure.  Otherwise the number of key entries that have
  * been filled out.  Remember trees can have zero entries, and as such have

From a5d895a90bf57e5fe87edf48dd1852e7292d570d Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Thu, 16 Apr 2015 12:47:21 +0100
Subject: [PATCH 29/39] dm thin metadata: add dm_thin_find_mapped_range()

Retrieve the next run of contiguously mapped blocks.  Useful for working
out where to break up IO.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin-metadata.c | 57 +++++++++++++++++++++++++++++++++++
 drivers/md/dm-thin-metadata.h |  9 ++++++
 2 files changed, 66 insertions(+)

diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index cb6dd055053d..94cf0db8a22e 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1417,6 +1417,63 @@ int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
 	return r;
 }
 
+/* FIXME: write a more efficient one in btree */
+int dm_thin_find_mapped_range(struct dm_thin_device *td,
+			      dm_block_t begin, dm_block_t end,
+			      dm_block_t *thin_begin, dm_block_t *thin_end,
+			      dm_block_t *pool_begin, bool *maybe_shared)
+{
+	int r;
+	dm_block_t pool_end;
+	struct dm_thin_lookup_result lookup;
+
+	if (end < begin)
+		return -ENODATA;
+
+	/*
+	 * Find first mapped block.
+	 */
+	while (begin < end) {
+		r = dm_thin_find_block(td, begin, true, &lookup);
+		if (r) {
+			if (r != -ENODATA)
+				return r;
+		} else
+			break;
+
+		begin++;
+	}
+
+	if (begin == end)
+		return -ENODATA;
+
+	*thin_begin = begin;
+	*pool_begin = lookup.block;
+	*maybe_shared = lookup.shared;
+
+	begin++;
+	pool_end = *pool_begin + 1;
+	while (begin != end) {
+		r = dm_thin_find_block(td, begin, true, &lookup);
+		if (r) {
+			if (r == -ENODATA)
+				break;
+			else
+				return r;
+		}
+
+		if ((lookup.block != pool_end) ||
+		    (lookup.shared != *maybe_shared))
+			break;
+
+		pool_end++;
+		begin++;
+	}
+
+	*thin_end = begin;
+	return 0;
+}
+
 static int __insert(struct dm_thin_device *td, dm_block_t block,
 		    dm_block_t data_block)
 {
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index fac01a96d303..f11f14095b93 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -146,6 +146,15 @@ struct dm_thin_lookup_result {
 int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
 		       int can_issue_io, struct dm_thin_lookup_result *result);
 
+/*
+ * Retrieve the next run of contiguously mapped blocks.  Useful for working
+ * out where to break up IO.  Returns 0 on success, < 0 on error.
+ */
+int dm_thin_find_mapped_range(struct dm_thin_device *td,
+			      dm_block_t begin, dm_block_t end,
+			      dm_block_t *thin_begin, dm_block_t *thin_end,
+			      dm_block_t *pool_begin, bool *maybe_shared);
+
 /*
  * Obtain an unused block.
  */

From 6550f075f5087459f64c1af71298fc50b102af11 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Mon, 13 Apr 2015 09:45:25 +0100
Subject: [PATCH 30/39] dm thin metadata: add dm_thin_remove_range()

Removes a range of blocks from the btree.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin-metadata.c | 54 +++++++++++++++++++++++++++++++++++
 drivers/md/dm-thin-metadata.h |  2 ++
 2 files changed, 56 insertions(+)

diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 94cf0db8a22e..8b521e3e1e1b 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1526,6 +1526,47 @@ static int __remove(struct dm_thin_device *td, dm_block_t block)
 	return 0;
 }
 
+static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_t end)
+{
+	int r;
+	unsigned count;
+	struct dm_pool_metadata *pmd = td->pmd;
+	dm_block_t keys[1] = { td->id };
+	__le64 value;
+	dm_block_t mapping_root;
+
+	/*
+	 * Find the mapping tree
+	 */
+	r = dm_btree_lookup(&pmd->tl_info, pmd->root, keys, &value);
+	if (r)
+		return r;
+
+	/*
+	 * Remove from the mapping tree, taking care to inc the
+	 * ref count so it doesn't get deleted.
+	 */
+	mapping_root = le64_to_cpu(value);
+	dm_tm_inc(pmd->tm, mapping_root);
+	r = dm_btree_remove(&pmd->tl_info, pmd->root, keys, &pmd->root);
+	if (r)
+		return r;
+
+	r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count);
+	if (r)
+		return r;
+
+	td->mapped_blocks -= count;
+	td->changed = 1;
+
+	/*
+	 * Reinsert the mapping tree.
+	 */
+	value = cpu_to_le64(mapping_root);
+	__dm_bless_for_disk(&value);
+	return dm_btree_insert(&pmd->tl_info, pmd->root, keys, &value, &pmd->root);
+}
+
 int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
 {
 	int r = -EINVAL;
@@ -1538,6 +1579,19 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
 	return r;
 }
 
+int dm_thin_remove_range(struct dm_thin_device *td,
+			 dm_block_t begin, dm_block_t end)
+{
+	int r = -EINVAL;
+
+	down_write(&td->pmd->root_lock);
+	if (!td->pmd->fail_io)
+		r = __remove_range(td, begin, end);
+	up_write(&td->pmd->root_lock);
+
+	return r;
+}
+
 int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
 {
 	int r;
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index f11f14095b93..a938babe4258 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -167,6 +167,8 @@ int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
 			 dm_block_t data_block);
 
 int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block);
+int dm_thin_remove_range(struct dm_thin_device *td,
+			 dm_block_t begin, dm_block_t end);
 
 /*
  * Queries.

From 34fbcf6257eb3f39a5b78a4f51b40f881b82033b Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Thu, 16 Apr 2015 12:58:35 +0100
Subject: [PATCH 31/39] dm thin: range discard support

Previously REQ_DISCARD bios have been split into block sized chunks
before submission to the thin target.  There are a couple of issues with
this:

 - If the block size is small, a large discard request can
   get broken up into a great many bios which is both slow and causes
   a lot of memory pressure.

 - The thin pool block size and the discard granularity for the
   underlying data device need to be compatible if we want to passdown
   the discard.

This patch relaxes the block size granularity for thin devices.  It
makes use of the recent range locking added to the bio_prison to
quiesce a whole range of thin blocks before unmapping them.  Once a
thin range has been unmapped the discard can then be passed down to
the data device for those sub ranges where the data blocks are no
longer used (ie. they weren't shared in the first place).

This patch also doesn't make any apologies about open-coding portions
of block core as a means of supporting async discard completions in the
near term -- if/when late bio splitting lands it'll all get cleaned up.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin.c | 607 +++++++++++++++++++++++++++++++------------
 1 file changed, 446 insertions(+), 161 deletions(-)

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index c552df7b3420..99daf2e332f4 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -111,22 +111,30 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
 /*
  * Key building.
  */
-static void build_data_key(struct dm_thin_device *td,
-			   dm_block_t b, struct dm_cell_key *key)
+enum lock_space {
+	VIRTUAL,
+	PHYSICAL
+};
+
+static void build_key(struct dm_thin_device *td, enum lock_space ls,
+		      dm_block_t b, dm_block_t e, struct dm_cell_key *key)
 {
-	key->virtual = 0;
+	key->virtual = (ls == VIRTUAL);
 	key->dev = dm_thin_dev_id(td);
 	key->block_begin = b;
-	key->block_end = b + 1ULL;
+	key->block_end = e;
+}
+
+static void build_data_key(struct dm_thin_device *td, dm_block_t b,
+			   struct dm_cell_key *key)
+{
+	build_key(td, PHYSICAL, b, b + 1llu, key);
 }
 
 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
 			      struct dm_cell_key *key)
 {
-	key->virtual = 1;
-	key->dev = dm_thin_dev_id(td);
-	key->block_begin = b;
-	key->block_end = b + 1ULL;
+	build_key(td, VIRTUAL, b, b + 1llu, key);
 }
 
 /*----------------------------------------------------------------*/
@@ -312,6 +320,138 @@ struct thin_c {
 
 /*----------------------------------------------------------------*/
 
+/**
+ * __blkdev_issue_discard_async - queue a discard with async completion
+ * @bdev:	blockdev to issue discard for
+ * @sector:	start sector
+ * @nr_sects:	number of sectors to discard
+ * @gfp_mask:	memory allocation flags (for bio_alloc)
+ * @flags:	BLKDEV_IFL_* flags to control behaviour
+ * @parent_bio: parent discard bio that all sub discards get chained to
+ *
+ * Description:
+ *    Asynchronously issue a discard request for the sectors in question.
+ *    NOTE: this variant of blk-core's blkdev_issue_discard() is a stop-gap
+ *    that is being kept local to DM thinp until the block changes to allow
+ *    late bio splitting land upstream.
+ */
+static int __blkdev_issue_discard_async(struct block_device *bdev, sector_t sector,
+					sector_t nr_sects, gfp_t gfp_mask, unsigned long flags,
+					struct bio *parent_bio)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+	int type = REQ_WRITE | REQ_DISCARD;
+	unsigned int max_discard_sectors, granularity;
+	int alignment;
+	struct bio *bio;
+	int ret = 0;
+	struct blk_plug plug;
+
+	if (!q)
+		return -ENXIO;
+
+	if (!blk_queue_discard(q))
+		return -EOPNOTSUPP;
+
+	/* Zero-sector (unknown) and one-sector granularities are the same.  */
+	granularity = max(q->limits.discard_granularity >> 9, 1U);
+	alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
+
+	/*
+	 * Ensure that max_discard_sectors is of the proper
+	 * granularity, so that requests stay aligned after a split.
+	 */
+	max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
+	max_discard_sectors -= max_discard_sectors % granularity;
+	if (unlikely(!max_discard_sectors)) {
+		/* Avoid infinite loop below. Being cautious never hurts. */
+		return -EOPNOTSUPP;
+	}
+
+	if (flags & BLKDEV_DISCARD_SECURE) {
+		if (!blk_queue_secdiscard(q))
+			return -EOPNOTSUPP;
+		type |= REQ_SECURE;
+	}
+
+	blk_start_plug(&plug);
+	while (nr_sects) {
+		unsigned int req_sects;
+		sector_t end_sect, tmp;
+
+		/*
+		 * Required bio_put occurs in bio_endio thanks to bio_chain below
+		 */
+		bio = bio_alloc(gfp_mask, 1);
+		if (!bio) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		req_sects = min_t(sector_t, nr_sects, max_discard_sectors);
+
+		/*
+		 * If splitting a request, and the next starting sector would be
+		 * misaligned, stop the discard at the previous aligned sector.
+		 */
+		end_sect = sector + req_sects;
+		tmp = end_sect;
+		if (req_sects < nr_sects &&
+		    sector_div(tmp, granularity) != alignment) {
+			end_sect = end_sect - alignment;
+			sector_div(end_sect, granularity);
+			end_sect = end_sect * granularity + alignment;
+			req_sects = end_sect - sector;
+		}
+
+		bio_chain(bio, parent_bio);
+
+		bio->bi_iter.bi_sector = sector;
+		bio->bi_bdev = bdev;
+
+		bio->bi_iter.bi_size = req_sects << 9;
+		nr_sects -= req_sects;
+		sector = end_sect;
+
+		submit_bio(type, bio);
+
+		/*
+		 * We can loop for a long time in here, if someone does
+		 * full device discards (like mkfs). Be nice and allow
+		 * us to schedule out to avoid softlocking if preempt
+		 * is disabled.
+		 */
+		cond_resched();
+	}
+	blk_finish_plug(&plug);
+
+	return ret;
+}
+
+static bool block_size_is_power_of_two(struct pool *pool)
+{
+	return pool->sectors_per_block_shift >= 0;
+}
+
+static sector_t block_to_sectors(struct pool *pool, dm_block_t b)
+{
+	return block_size_is_power_of_two(pool) ?
+		(b << pool->sectors_per_block_shift) :
+		(b * pool->sectors_per_block);
+}
+
+static int issue_discard(struct thin_c *tc, dm_block_t data_b, dm_block_t data_e,
+			 struct bio *parent_bio)
+{
+	sector_t s = block_to_sectors(tc->pool, data_b);
+	sector_t len = block_to_sectors(tc->pool, data_e - data_b);
+
+	return __blkdev_issue_discard_async(tc->pool_dev->bdev, s, len,
+					    GFP_NOWAIT, 0, parent_bio);
+}
+
+/*----------------------------------------------------------------*/
+
 /*
  * wake_worker() is used when new work is queued and when pool_resume is
  * ready to continue deferred IO processing.
@@ -461,6 +601,7 @@ struct dm_thin_endio_hook {
 	struct dm_deferred_entry *all_io_entry;
 	struct dm_thin_new_mapping *overwrite_mapping;
 	struct rb_node rb_node;
+	struct dm_bio_prison_cell *cell;
 };
 
 static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
@@ -541,11 +682,6 @@ static void error_retry_list(struct pool *pool)
  * target.
  */
 
-static bool block_size_is_power_of_two(struct pool *pool)
-{
-	return pool->sectors_per_block_shift >= 0;
-}
-
 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
 {
 	struct pool *pool = tc->pool;
@@ -559,6 +695,34 @@ static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
 	return block_nr;
 }
 
+/*
+ * Returns the _complete_ blocks that this bio covers.
+ */
+static void get_bio_block_range(struct thin_c *tc, struct bio *bio,
+				dm_block_t *begin, dm_block_t *end)
+{
+	struct pool *pool = tc->pool;
+	sector_t b = bio->bi_iter.bi_sector;
+	sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT);
+
+	b += pool->sectors_per_block - 1ull; /* so we round up */
+
+	if (block_size_is_power_of_two(pool)) {
+		b >>= pool->sectors_per_block_shift;
+		e >>= pool->sectors_per_block_shift;
+	} else {
+		(void) sector_div(b, pool->sectors_per_block);
+		(void) sector_div(e, pool->sectors_per_block);
+	}
+
+	if (e < b)
+		/* Can happen if the bio is within a single block. */
+		e = b;
+
+	*begin = b;
+	*end = e;
+}
+
 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
 {
 	struct pool *pool = tc->pool;
@@ -647,7 +811,7 @@ struct dm_thin_new_mapping {
 	struct list_head list;
 
 	bool pass_discard:1;
-	bool definitely_not_shared:1;
+	bool maybe_shared:1;
 
 	/*
 	 * Track quiescing, copying and zeroing preparation actions.  When this
@@ -658,9 +822,9 @@ struct dm_thin_new_mapping {
 
 	int err;
 	struct thin_c *tc;
-	dm_block_t virt_block;
+	dm_block_t virt_begin, virt_end;
 	dm_block_t data_block;
-	struct dm_bio_prison_cell *cell, *cell2;
+	struct dm_bio_prison_cell *cell;
 
 	/*
 	 * If the bio covers the whole area of a block then we can avoid
@@ -817,7 +981,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 	 * Any I/O for this block arriving after this point will get
 	 * remapped to it directly.
 	 */
-	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
+	r = dm_thin_insert_block(tc->td, m->virt_begin, m->data_block);
 	if (r) {
 		metadata_operation_failed(pool, "dm_thin_insert_block", r);
 		cell_error(pool, m->cell);
@@ -844,50 +1008,112 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 	mempool_free(m, pool->mapping_pool);
 }
 
+/*----------------------------------------------------------------*/
+
+static void free_discard_mapping(struct dm_thin_new_mapping *m)
+{
+	struct thin_c *tc = m->tc;
+	if (m->cell)
+		cell_defer_no_holder(tc, m->cell);
+	mempool_free(m, tc->pool->mapping_pool);
+}
+
 static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
 {
-	struct thin_c *tc = m->tc;
-
 	bio_io_error(m->bio);
-	cell_defer_no_holder(tc, m->cell);
-	cell_defer_no_holder(tc, m->cell2);
-	mempool_free(m, tc->pool->mapping_pool);
+	free_discard_mapping(m);
 }
 
-static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
+static void process_prepared_discard_success(struct dm_thin_new_mapping *m)
 {
-	struct thin_c *tc = m->tc;
-
-	inc_all_io_entry(tc->pool, m->bio);
-	cell_defer_no_holder(tc, m->cell);
-	cell_defer_no_holder(tc, m->cell2);
-
-	if (m->pass_discard)
-		if (m->definitely_not_shared)
-			remap_and_issue(tc, m->bio, m->data_block);
-		else {
-			bool used = false;
-			if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
-				bio_endio(m->bio, 0);
-			else
-				remap_and_issue(tc, m->bio, m->data_block);
-		}
-	else
-		bio_endio(m->bio, 0);
-
-	mempool_free(m, tc->pool->mapping_pool);
+	bio_endio(m->bio, 0);
+	free_discard_mapping(m);
 }
 
-static void process_prepared_discard(struct dm_thin_new_mapping *m)
+static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
 {
 	int r;
 	struct thin_c *tc = m->tc;
 
-	r = dm_thin_remove_block(tc->td, m->virt_block);
-	if (r)
-		DMERR_LIMIT("dm_thin_remove_block() failed");
+	r = dm_thin_remove_range(tc->td, m->cell->key.block_begin, m->cell->key.block_end);
+	if (r) {
+		metadata_operation_failed(tc->pool, "dm_thin_remove_range", r);
+		bio_io_error(m->bio);
+	} else
+		bio_endio(m->bio, 0);
 
-	process_prepared_discard_passdown(m);
+	cell_defer_no_holder(tc, m->cell);
+	mempool_free(m, tc->pool->mapping_pool);
+}
+
+static int passdown_double_checking_shared_status(struct dm_thin_new_mapping *m)
+{
+	/*
+	 * We've already unmapped this range of blocks, but before we
+	 * passdown we have to check that these blocks are now unused.
+	 */
+	int r;
+	bool used = true;
+	struct thin_c *tc = m->tc;
+	struct pool *pool = tc->pool;
+	dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
+
+	while (b != end) {
+		/* find start of unmapped run */
+		for (; b < end; b++) {
+			r = dm_pool_block_is_used(pool->pmd, b, &used);
+			if (r)
+				return r;
+
+			if (!used)
+				break;
+		}
+
+		if (b == end)
+			break;
+
+		/* find end of run */
+		for (e = b + 1; e != end; e++) {
+			r = dm_pool_block_is_used(pool->pmd, e, &used);
+			if (r)
+				return r;
+
+			if (used)
+				break;
+		}
+
+		r = issue_discard(tc, b, e, m->bio);
+		if (r)
+			return r;
+
+		b = e;
+	}
+
+	return 0;
+}
+
+static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
+{
+	int r;
+	struct thin_c *tc = m->tc;
+	struct pool *pool = tc->pool;
+
+	r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
+	if (r)
+		metadata_operation_failed(pool, "dm_thin_remove_range", r);
+
+	else if (m->maybe_shared)
+		r = passdown_double_checking_shared_status(m);
+	else
+		r = issue_discard(tc, m->data_block, m->data_block + (m->virt_end - m->virt_begin), m->bio);
+
+	/*
+	 * Even if r is set, there could be sub discards in flight that we
+	 * need to wait for.
+	 */
+	bio_endio(m->bio, r);
+	cell_defer_no_holder(tc, m->cell);
+	mempool_free(m, pool->mapping_pool);
 }
 
 static void process_prepared(struct pool *pool, struct list_head *head,
@@ -971,7 +1197,7 @@ static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
 }
 
 static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
-				      dm_block_t data_block,
+				      dm_block_t data_begin,
 				      struct dm_thin_new_mapping *m)
 {
 	struct pool *pool = tc->pool;
@@ -981,7 +1207,7 @@ static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
 	m->bio = bio;
 	save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
 	inc_all_io_entry(pool, bio);
-	remap_and_issue(tc, bio, data_block);
+	remap_and_issue(tc, bio, data_begin);
 }
 
 /*
@@ -998,7 +1224,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
 	struct dm_thin_new_mapping *m = get_next_mapping(pool);
 
 	m->tc = tc;
-	m->virt_block = virt_block;
+	m->virt_begin = virt_block;
+	m->virt_end = virt_block + 1u;
 	m->data_block = data_dest;
 	m->cell = cell;
 
@@ -1077,7 +1304,8 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 
 	atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
 	m->tc = tc;
-	m->virt_block = virt_block;
+	m->virt_begin = virt_block;
+	m->virt_end = virt_block + 1u;
 	m->data_block = data_block;
 	m->cell = cell;
 
@@ -1284,99 +1512,149 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
 		retry_on_resume(bio);
 }
 
-static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+static void process_discard_cell_no_passdown(struct thin_c *tc,
+					     struct dm_bio_prison_cell *virt_cell)
 {
-	int r;
-	struct bio *bio = cell->holder;
 	struct pool *pool = tc->pool;
-	struct dm_bio_prison_cell *cell2;
-	struct dm_cell_key key2;
-	dm_block_t block = get_bio_block(tc, bio);
-	struct dm_thin_lookup_result lookup_result;
+	struct dm_thin_new_mapping *m = get_next_mapping(pool);
+
+	/*
+	 * We don't need to lock the data blocks, since there's no
+	 * passdown.  We only lock data blocks for allocation and breaking sharing.
+	 */
+	m->tc = tc;
+	m->virt_begin = virt_cell->key.block_begin;
+	m->virt_end = virt_cell->key.block_end;
+	m->cell = virt_cell;
+	m->bio = virt_cell->holder;
+
+	if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+		pool->process_prepared_discard(m);
+}
+
+/*
+ * FIXME: DM local hack to defer the parent bio's end_io until we
+ * _know_ all chained sub range discard bios have completed.
+ * Will go away once late bio splitting lands upstream!
+ */
+static inline void __bio_inc_remaining(struct bio *bio)
+{
+	bio->bi_flags |= (1 << BIO_CHAIN);
+	smp_mb__before_atomic();
+	atomic_inc(&bio->__bi_remaining);
+}
+
+static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end,
+				 struct bio *bio)
+{
+	struct pool *pool = tc->pool;
+
+	int r;
+	bool maybe_shared;
+	struct dm_cell_key data_key;
+	struct dm_bio_prison_cell *data_cell;
 	struct dm_thin_new_mapping *m;
+	dm_block_t virt_begin, virt_end, data_begin;
 
-	if (tc->requeue_mode) {
-		cell_requeue(pool, cell);
-		return;
-	}
+	while (begin != end) {
+		r = ensure_next_mapping(pool);
+		if (r)
+			/* we did our best */
+			return;
 
-	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
-	switch (r) {
-	case 0:
-		/*
-		 * Check nobody is fiddling with this pool block.  This can
-		 * happen if someone's in the process of breaking sharing
-		 * on this block.
-		 */
-		build_data_key(tc->td, lookup_result.block, &key2);
-		if (bio_detain(tc->pool, &key2, bio, &cell2)) {
-			cell_defer_no_holder(tc, cell);
+		r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
+					      &data_begin, &maybe_shared);
+		if (r)
+			/*
+			 * Silently fail, letting any mappings we've
+			 * created complete.
+			 */
 			break;
+
+		build_key(tc->td, PHYSICAL, data_begin, data_begin + (virt_end - virt_begin), &data_key);
+		if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
+			/* contention, we'll give up with this range */
+			begin = virt_end;
+			continue;
 		}
 
-		if (io_overlaps_block(pool, bio)) {
-			/*
-			 * IO may still be going to the destination block.  We must
-			 * quiesce before we can do the removal.
-			 */
-			m = get_next_mapping(pool);
-			m->tc = tc;
-			m->pass_discard = pool->pf.discard_passdown;
-			m->definitely_not_shared = !lookup_result.shared;
-			m->virt_block = block;
-			m->data_block = lookup_result.block;
-			m->cell = cell;
-			m->cell2 = cell2;
-			m->bio = bio;
-
-			if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
-				pool->process_prepared_discard(m);
-
-		} else {
-			inc_all_io_entry(pool, bio);
-			cell_defer_no_holder(tc, cell);
-			cell_defer_no_holder(tc, cell2);
-
-			/*
-			 * The DM core makes sure that the discard doesn't span
-			 * a block boundary.  So we submit the discard of a
-			 * partial block appropriately.
-			 */
-			if ((!lookup_result.shared) && pool->pf.discard_passdown)
-				remap_and_issue(tc, bio, lookup_result.block);
-			else
-				bio_endio(bio, 0);
-		}
-		break;
-
-	case -ENODATA:
 		/*
-		 * It isn't provisioned, just forget it.
+		 * IO may still be going to the destination block.  We must
+		 * quiesce before we can do the removal.
 		 */
-		cell_defer_no_holder(tc, cell);
-		bio_endio(bio, 0);
-		break;
+		m = get_next_mapping(pool);
+		m->tc = tc;
+		m->maybe_shared = maybe_shared;
+		m->virt_begin = virt_begin;
+		m->virt_end = virt_end;
+		m->data_block = data_begin;
+		m->cell = data_cell;
+		m->bio = bio;
 
-	default:
-		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
-			    __func__, r);
-		cell_defer_no_holder(tc, cell);
-		bio_io_error(bio);
-		break;
+		/*
+		 * The parent bio must not complete before sub discard bios are
+		 * chained to it (see __blkdev_issue_discard_async's bio_chain)!
+		 *
+		 * This per-mapping bi_remaining increment is paired with
+		 * the implicit decrement that occurs via bio_endio() in
+		 * process_prepared_discard_{passdown,no_passdown}.
+		 */
+		__bio_inc_remaining(bio);
+		if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+			pool->process_prepared_discard(m);
+
+		begin = virt_end;
 	}
 }
 
+static void process_discard_cell_passdown(struct thin_c *tc, struct dm_bio_prison_cell *virt_cell)
+{
+	struct bio *bio = virt_cell->holder;
+	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+
+	/*
+	 * The virt_cell will only get freed once the origin bio completes.
+	 * This means it will remain locked while all the individual
+	 * passdown bios are in flight.
+	 */
+	h->cell = virt_cell;
+	break_up_discard_bio(tc, virt_cell->key.block_begin, virt_cell->key.block_end, bio);
+
+	/*
+	 * We complete the bio now, knowing that the bi_remaining field
+	 * will prevent completion until the sub range discards have
+	 * completed.
+	 */
+	bio_endio(bio, 0);
+}
+
 static void process_discard_bio(struct thin_c *tc, struct bio *bio)
 {
-	struct dm_bio_prison_cell *cell;
-	struct dm_cell_key key;
-	dm_block_t block = get_bio_block(tc, bio);
+	dm_block_t begin, end;
+	struct dm_cell_key virt_key;
+	struct dm_bio_prison_cell *virt_cell;
 
-	build_virtual_key(tc->td, block, &key);
-	if (bio_detain(tc->pool, &key, bio, &cell))
+	get_bio_block_range(tc, bio, &begin, &end);
+	if (begin == end) {
+		/*
+		 * The discard covers less than a block.
+		 */
+		bio_endio(bio, 0);
+		return;
+	}
+
+	build_key(tc->td, VIRTUAL, begin, end, &virt_key);
+	if (bio_detain(tc->pool, &virt_key, bio, &virt_cell))
+		/*
+		 * Potential starvation issue: We're relying on the
+		 * fs/application being well behaved, and not trying to
+		 * send IO to a region at the same time as discarding it.
+		 * If they do this persistently then it's possible this
+		 * cell will never be granted.
+		 */
 		return;
 
-	process_discard_cell(tc, cell);
+	tc->pool->process_discard_cell(tc, virt_cell);
 }
 
 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
@@ -2092,6 +2370,24 @@ static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
 	       dm_device_name(pool->pool_md), new_mode);
 }
 
+static bool passdown_enabled(struct pool_c *pt)
+{
+	return pt->adjusted_pf.discard_passdown;
+}
+
+static void set_discard_callbacks(struct pool *pool)
+{
+	struct pool_c *pt = pool->ti->private;
+
+	if (passdown_enabled(pt)) {
+		pool->process_discard_cell = process_discard_cell_passdown;
+		pool->process_prepared_discard = process_prepared_discard_passdown;
+	} else {
+		pool->process_discard_cell = process_discard_cell_no_passdown;
+		pool->process_prepared_discard = process_prepared_discard_no_passdown;
+	}
+}
+
 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 {
 	struct pool_c *pt = pool->ti->private;
@@ -2143,7 +2439,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 		pool->process_cell = process_cell_read_only;
 		pool->process_discard_cell = process_cell_success;
 		pool->process_prepared_mapping = process_prepared_mapping_fail;
-		pool->process_prepared_discard = process_prepared_discard_passdown;
+		pool->process_prepared_discard = process_prepared_discard_success;
 
 		error_retry_list(pool);
 		break;
@@ -2162,9 +2458,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 		pool->process_bio = process_bio_read_only;
 		pool->process_discard = process_discard_bio;
 		pool->process_cell = process_cell_read_only;
-		pool->process_discard_cell = process_discard_cell;
 		pool->process_prepared_mapping = process_prepared_mapping;
-		pool->process_prepared_discard = process_prepared_discard;
+		set_discard_callbacks(pool);
 
 		if (!pool->pf.error_if_no_space && no_space_timeout)
 			queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
@@ -2177,9 +2472,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 		pool->process_bio = process_bio;
 		pool->process_discard = process_discard_bio;
 		pool->process_cell = process_cell;
-		pool->process_discard_cell = process_discard_cell;
 		pool->process_prepared_mapping = process_prepared_mapping;
-		pool->process_prepared_discard = process_prepared_discard;
+		set_discard_callbacks(pool);
 		break;
 	}
 
@@ -2268,6 +2562,7 @@ static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
 	h->shared_read_entry = NULL;
 	h->all_io_entry = NULL;
 	h->overwrite_mapping = NULL;
+	h->cell = NULL;
 }
 
 /*
@@ -2415,7 +2710,6 @@ static void disable_passdown_if_not_supported(struct pool_c *pt)
 	struct pool *pool = pt->pool;
 	struct block_device *data_bdev = pt->data_dev->bdev;
 	struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
-	sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
 	const char *reason = NULL;
 	char buf[BDEVNAME_SIZE];
 
@@ -2428,12 +2722,6 @@ static void disable_passdown_if_not_supported(struct pool_c *pt)
 	else if (data_limits->max_discard_sectors < pool->sectors_per_block)
 		reason = "max discard sectors smaller than a block";
 
-	else if (data_limits->discard_granularity > block_size)
-		reason = "discard granularity larger than a block";
-
-	else if (!is_factor(block_size, data_limits->discard_granularity))
-		reason = "discard granularity not a factor of block size";
-
 	if (reason) {
 		DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
 		pt->adjusted_pf.discard_passdown = false;
@@ -3566,24 +3854,6 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
 }
 
-static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
-{
-	struct pool *pool = pt->pool;
-	struct queue_limits *data_limits;
-
-	limits->max_discard_sectors = pool->sectors_per_block;
-
-	/*
-	 * discard_granularity is just a hint, and not enforced.
-	 */
-	if (pt->adjusted_pf.discard_passdown) {
-		data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
-		limits->discard_granularity = max(data_limits->discard_granularity,
-						  pool->sectors_per_block << SECTOR_SHIFT);
-	} else
-		limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
-}
-
 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
 	struct pool_c *pt = ti->private;
@@ -3638,14 +3908,17 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 	disable_passdown_if_not_supported(pt);
 
-	set_discard_limits(pt, limits);
+	/*
+	 * The pool uses the same discard limits as the underlying data
+	 * device.  DM core has already set this up.
+	 */
 }
 
 static struct target_type pool_target = {
 	.name = "thin-pool",
 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
 		    DM_TARGET_IMMUTABLE,
-	.version = {1, 14, 0},
+	.version = {1, 15, 0},
 	.module = THIS_MODULE,
 	.ctr = pool_ctr,
 	.dtr = pool_dtr,
@@ -3804,8 +4077,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	if (tc->pool->pf.discard_enabled) {
 		ti->discards_supported = true;
 		ti->num_discard_bios = 1;
-		/* Discard bios must be split on a block boundary */
-		ti->split_discard_bios = true;
+		ti->split_discard_bios = false;
 	}
 
 	mutex_unlock(&dm_thin_pool_table.mutex);
@@ -3892,6 +4164,9 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
 		}
 	}
 
+	if (h->cell)
+		cell_defer_no_holder(h->tc, h->cell);
+
 	return 0;
 }
 
@@ -4019,9 +4294,18 @@ static int thin_iterate_devices(struct dm_target *ti,
 	return 0;
 }
 
+static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+	struct thin_c *tc = ti->private;
+	struct pool *pool = tc->pool;
+
+	limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
+	limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
+}
+
 static struct target_type thin_target = {
 	.name = "thin",
-	.version = {1, 14, 0},
+	.version = {1, 15, 0},
 	.module	= THIS_MODULE,
 	.ctr = thin_ctr,
 	.dtr = thin_dtr,
@@ -4033,6 +4317,7 @@ static struct target_type thin_target = {
 	.status = thin_status,
 	.merge = thin_merge,
 	.iterate_devices = thin_iterate_devices,
+	.io_hints = thin_io_hints,
 };
 
 /*----------------------------------------------------------------*/

From fd467696e8beb542144cd005ff96cd35fff41354 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 9 Jun 2015 12:31:26 -0400
Subject: [PATCH 32/39] dm thin: fail messages with EOPNOTSUPP when pool cannot
 handle messages

Use EOPNOTSUPP, rather than EINVAL, error code when user attempts to
send the pool a message.  Otherwise userspace is led to believe the
message failed due to invalid argument.

Reported-by: Zdenek Kabelac <zkabelac@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 99daf2e332f4..c33f61a4cc28 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -3656,7 +3656,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
 	if (get_pool_mode(pool) >= PM_READ_ONLY) {
 		DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
 		      dm_device_name(pool->pool_md));
-		return -EINVAL;
+		return -EOPNOTSUPP;
 	}
 
 	if (!strcasecmp(argv[0], "create_thin"))

From b1f11aff04cc86daa0757ada5deb669a92a8f0fb Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Thu, 11 Jun 2015 17:11:48 +0100
Subject: [PATCH 33/39] dm thin metadata: fix a race when entering fail mode

In dm_thin_find_block() the ->fail_io flag was checked outside the
metadata device's root_lock, causing dm_thin_find_block() to race with
the setting of this flag.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin-metadata.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 8b521e3e1e1b..48dfe3c4d6aa 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1390,10 +1390,11 @@ int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
 	dm_block_t keys[2] = { td->id, block };
 	struct dm_btree_info *info;
 
-	if (pmd->fail_io)
-		return -EINVAL;
-
 	down_read(&pmd->root_lock);
+	if (pmd->fail_io) {
+		up_read(&pmd->root_lock);
+		return -EINVAL;
+	}
 
 	if (can_issue_io) {
 		info = &pmd->info;

From 6096d91af0b65a3967139b32d5adbb3647858a26 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Wed, 17 Jun 2015 13:35:19 +0100
Subject: [PATCH 34/39] dm space map metadata: fix occasional leak of a
 metadata block on resize

The metadata space map has a simplified 'bootstrap' mode that is
operational when extending the space maps.  Whilst in this mode it's
possible for some refcount decrement operations to become queued (eg, as
a result of shadowing one of the bitmap indexes).  These decrements were
not being applied when switching out of bootstrap mode.

The effect of this bug was the leaking of a 4k metadata block.  This is
detected by the latest version of thin_check as a non fatal error.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
---
 .../persistent-data/dm-space-map-metadata.c   | 50 +++++++++++++------
 1 file changed, 35 insertions(+), 15 deletions(-)

diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index e8a904298887..53091295fce9 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -204,6 +204,27 @@ static void in(struct sm_metadata *smm)
 	smm->recursion_count++;
 }
 
+static int apply_bops(struct sm_metadata *smm)
+{
+	int r = 0;
+
+	while (!brb_empty(&smm->uncommitted)) {
+		struct block_op bop;
+
+		r = brb_pop(&smm->uncommitted, &bop);
+		if (r) {
+			DMERR("bug in bop ring buffer");
+			break;
+		}
+
+		r = commit_bop(smm, &bop);
+		if (r)
+			break;
+	}
+
+	return r;
+}
+
 static int out(struct sm_metadata *smm)
 {
 	int r = 0;
@@ -216,21 +237,8 @@ static int out(struct sm_metadata *smm)
 		return -ENOMEM;
 	}
 
-	if (smm->recursion_count == 1) {
-		while (!brb_empty(&smm->uncommitted)) {
-			struct block_op bop;
-
-			r = brb_pop(&smm->uncommitted, &bop);
-			if (r) {
-				DMERR("bug in bop ring buffer");
-				break;
-			}
-
-			r = commit_bop(smm, &bop);
-			if (r)
-				break;
-		}
-	}
+	if (smm->recursion_count == 1)
+		r = apply_bops(smm);
 
 	smm->recursion_count--;
 
@@ -704,6 +712,12 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
 		}
 		old_len = smm->begin;
 
+		r = apply_bops(smm);
+		if (r) {
+			DMERR("%s: apply_bops failed", __func__);
+			goto out;
+		}
+
 		r = sm_ll_commit(&smm->ll);
 		if (r)
 			goto out;
@@ -773,6 +787,12 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
 	if (r)
 		return r;
 
+	r = apply_bops(smm);
+	if (r) {
+		DMERR("%s: apply_bops failed", __func__);
+		return r;
+	}
+
 	return sm_metadata_commit(sm);
 }
 

From bccab6a01afc26f53d91762d78153513cad10b29 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 17 Jun 2015 11:43:38 -0400
Subject: [PATCH 35/39] dm cache: switch the "default" cache replacement policy
 from mq to smq

The Stochastic multiqueue (SMQ) policy (vs MQ) offers the promise of
less memory utilization, improved performance and increased adaptability
in the face of changing workloads.  SMQ also does not have any
cumbersome tuning knobs.

Users may switch from "mq" to "smq" simply by appropriately reloading a
DM table that is using the cache target.  Doing so will cause all of the
mq policy's hints to be dropped.  Also, performance of the cache may
degrade slightly until smq recalculates the origin device's hotspots
that should be cached.

In the future the "mq" policy will just silently make use of "smq" and
the mq code will be removed.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Joe Thornber <ejt@redhat.com>
---
 .../device-mapper/cache-policies.txt          | 67 ++++++++++++++++++-
 drivers/md/dm-cache-policy-mq.c               | 34 ++--------
 drivers/md/dm-cache-policy-smq.c              | 17 +++++
 3 files changed, 86 insertions(+), 32 deletions(-)

diff --git a/Documentation/device-mapper/cache-policies.txt b/Documentation/device-mapper/cache-policies.txt
index 0d124a971801..d9246a32e673 100644
--- a/Documentation/device-mapper/cache-policies.txt
+++ b/Documentation/device-mapper/cache-policies.txt
@@ -25,10 +25,10 @@ trying to see when the io scheduler has let the ios run.
 Overview of supplied cache replacement policies
 ===============================================
 
-multiqueue
-----------
+multiqueue (mq)
+---------------
 
-This policy is the default.
+This policy has been deprecated in favor of the smq policy (see below).
 
 The multiqueue policy has three sets of 16 queues: one set for entries
 waiting for the cache and another two for those in the cache (a set for
@@ -73,6 +73,67 @@ If you're trying to quickly warm a new cache device you may wish to
 reduce these to encourage promotion.  Remember to switch them back to
 their defaults after the cache fills though.
 
+Stochastic multiqueue (smq)
+---------------------------
+
+This policy is the default.
+
+The stochastic multi-queue (smq) policy addresses some of the problems
+with the multiqueue (mq) policy.
+
+The smq policy (vs mq) offers the promise of less memory utilization,
+improved performance and increased adaptability in the face of changing
+workloads.  SMQ also does not have any cumbersome tuning knobs.
+
+Users may switch from "mq" to "smq" simply by appropriately reloading a
+DM table that is using the cache target.  Doing so will cause all of the
+mq policy's hints to be dropped.  Also, performance of the cache may
+degrade slightly until smq recalculates the origin device's hotspots
+that should be cached.
+
+Memory usage:
+The mq policy uses a lot of memory; 88 bytes per cache block on a 64
+bit machine.
+
+SMQ uses 28-bit indexes to implement its data structures rather than
+pointers.  It avoids storing an explicit hit count for each block.  It
+has a 'hotspot' queue rather than a pre cache which uses a quarter of
+the entries (each hotspot block covers a larger area than a single
+cache block).
+
+All these mean smq uses ~25 bytes per cache block.  Still a lot of
+memory, but a substantial improvement nonetheless.
+
+Level balancing:
+MQ places entries in different levels of the multiqueue structures
+based on their hit count (~ln(hit count)).  This means the bottom
+levels generally have the most entries, and the top ones have very
+few.  Having unbalanced levels like this reduces the efficacy of the
+multiqueue.
+
+SMQ does not maintain a hit count, instead it swaps hit entries with
+the least recently used entry from the level above.  The overall
+ordering is a side effect of this stochastic process.  With this
+scheme we can decide how many entries occupy each multiqueue level,
+resulting in better promotion/demotion decisions.
+
+Adaptability:
+The MQ policy maintains a hit count for each cache block.  For a
+different block to get promoted to the cache its hit count has to
+exceed the lowest currently in the cache.  This means it can take a
+long time for the cache to adapt between varying IO patterns.
+Periodically degrading the hit counts could help with this, but I
+haven't found a nice general solution.
+
+SMQ doesn't maintain hit counts, so a lot of this problem just goes
+away.  In addition it tracks performance of the hotspot queue, which
+is used to decide which blocks to promote.  If the hotspot queue is
+performing badly then it starts moving entries more quickly between
+levels.  This lets it adapt to new IO patterns very quickly.
+
+Performance:
+Testing SMQ shows substantially better performance than MQ.
+
 cleaner
 -------
 
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 838665bb495a..32814371b8d3 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -1431,21 +1431,12 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
 
 static struct dm_cache_policy_type mq_policy_type = {
 	.name = "mq",
-	.version = {1, 3, 0},
+	.version = {1, 4, 0},
 	.hint_size = 4,
 	.owner = THIS_MODULE,
 	.create = mq_create
 };
 
-static struct dm_cache_policy_type default_policy_type = {
-	.name = "default",
-	.version = {1, 3, 0},
-	.hint_size = 4,
-	.owner = THIS_MODULE,
-	.create = mq_create,
-	.real = &mq_policy_type
-};
-
 static int __init mq_init(void)
 {
 	int r;
@@ -1455,36 +1446,21 @@ static int __init mq_init(void)
 					   __alignof__(struct entry),
 					   0, NULL);
 	if (!mq_entry_cache)
-		goto bad;
+		return -ENOMEM;
 
 	r = dm_cache_policy_register(&mq_policy_type);
 	if (r) {
 		DMERR("register failed %d", r);
-		goto bad_register_mq;
+		kmem_cache_destroy(mq_entry_cache);
+		return -ENOMEM;
 	}
 
-	r = dm_cache_policy_register(&default_policy_type);
-	if (!r) {
-		DMINFO("version %u.%u.%u loaded",
-		       mq_policy_type.version[0],
-		       mq_policy_type.version[1],
-		       mq_policy_type.version[2]);
-		return 0;
-	}
-
-	DMERR("register failed (as default) %d", r);
-
-	dm_cache_policy_unregister(&mq_policy_type);
-bad_register_mq:
-	kmem_cache_destroy(mq_entry_cache);
-bad:
-	return -ENOMEM;
+	return 0;
 }
 
 static void __exit mq_exit(void)
 {
 	dm_cache_policy_unregister(&mq_policy_type);
-	dm_cache_policy_unregister(&default_policy_type);
 
 	kmem_cache_destroy(mq_entry_cache);
 }
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index 66feb307e697..80f02d3330e2 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -1748,6 +1748,15 @@ static struct dm_cache_policy_type smq_policy_type = {
 	.create = smq_create
 };
 
+static struct dm_cache_policy_type default_policy_type = {
+	.name = "default",
+	.version = {1, 0, 0},
+	.hint_size = 4,
+	.owner = THIS_MODULE,
+	.create = smq_create,
+	.real = &smq_policy_type
+};
+
 static int __init smq_init(void)
 {
 	int r;
@@ -1758,12 +1767,20 @@ static int __init smq_init(void)
 		return -ENOMEM;
 	}
 
+	r = dm_cache_policy_register(&default_policy_type);
+	if (r) {
+		DMERR("register failed (as default) %d", r);
+		dm_cache_policy_unregister(&smq_policy_type);
+		return -ENOMEM;
+	}
+
 	return 0;
 }
 
 static void __exit smq_exit(void)
 {
 	dm_cache_policy_unregister(&smq_policy_type);
+	dm_cache_policy_unregister(&default_policy_type);
 }
 
 module_init(smq_init);

From dd4c1b7d0c95be1c9245118a3accc41a16f1db67 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 5 Jun 2015 09:50:42 -0400
Subject: [PATCH 36/39] dm stats: fix divide by zero if 'number_of_areas' arg
 is zero

If the number_of_areas argument was zero the kernel would crash on
div-by-zero.  Add better input validation.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org # v3.12+
---
 drivers/md/dm-stats.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 492fe6a5ebf2..d1fd31a6dd1a 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -792,6 +792,8 @@ static int message_stats_create(struct mapped_device *md,
 		return -EINVAL;
 
 	if (sscanf(argv[2], "/%u%c", &divisor, &dummy) == 1) {
+		if (!divisor)
+			return -EINVAL;
 		step = end - start;
 		if (do_div(step, divisor))
 			step++;

From c96aec344de0de857ef3d7fba53992c7ba311e1e Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 9 Jun 2015 17:21:39 -0400
Subject: [PATCH 37/39] dm stats: support precise timestamps

Make it possible to use precise timestamps with nanosecond granularity
in dm statistics.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 Documentation/device-mapper/statistics.txt |  28 ++++-
 drivers/md/dm-stats.c                      | 138 +++++++++++++++------
 drivers/md/dm-stats.h                      |   4 +-
 3 files changed, 127 insertions(+), 43 deletions(-)

diff --git a/Documentation/device-mapper/statistics.txt b/Documentation/device-mapper/statistics.txt
index 2a1673adc200..ff6baeaa71f7 100644
--- a/Documentation/device-mapper/statistics.txt
+++ b/Documentation/device-mapper/statistics.txt
@@ -13,9 +13,13 @@ the range specified.
 The I/O statistics counters for each step-sized area of a region are
 in the same format as /sys/block/*/stat or /proc/diskstats (see:
 Documentation/iostats.txt).  But two extra counters (12 and 13) are
-provided: total time spent reading and writing in milliseconds.	 All
-these counters may be accessed by sending the @stats_print message to
-the appropriate DM device via dmsetup.
+provided: total time spent reading and writing.  All these counters may
+be accessed by sending the @stats_print message to the appropriate DM
+device via dmsetup.
+
+The reported times are in milliseconds and the granularity depends on
+the kernel ticks.  When the option precise_timestamps is used, the
+reported times are in nanoseconds.
 
 Each region has a corresponding unique identifier, which we call a
 region_id, that is assigned when the region is created.	 The region_id
@@ -33,7 +37,9 @@ memory is used by reading
 Messages
 ========
 
-    @stats_create <range> <step> [<program_id> [<aux_data>]]
+    @stats_create <range> <step>
+		[<number_of_optional_arguments> <optional_arguments>...]
+		[<program_id> [<aux_data>]]
 
 	Create a new region and return the region_id.
 
@@ -48,6 +54,17 @@ Messages
 	  "/<number_of_areas>" - the range is subdivided into the specified
 				 number of areas.
 
+	<number_of_optional_arguments>
+	  The number of optional arguments
+
+	<optional_arguments>
+	  The following optional arguments are supported
+	  precise_timestamps - use precise timer with nanosecond resolution
+		instead of the "jiffies" variable.  When this argument is
+		used, the resulting times are in nanoseconds instead of
+		milliseconds.  Precise timestamps are a little bit slower
+		to obtain than jiffies-based timestamps.
+
 	<program_id>
 	  An optional parameter.  A name that uniquely identifies
 	  the userspace owner of the range.  This groups ranges together
@@ -55,6 +72,9 @@ Messages
 	  created and ignore those created by others.
 	  The kernel returns this string back in the output of
 	  @stats_list message, but it doesn't use it for anything else.
+	  If we omit the number of optional arguments, program id must not
+	  be a number, otherwise it would be interpreted as the number of
+	  optional arguments.
 
 	<aux_data>
 	  An optional parameter.  A word that provides auxiliary data
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index d1fd31a6dd1a..4bfd84ab1d4a 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -33,13 +33,14 @@ struct dm_stat_percpu {
 
 struct dm_stat_shared {
 	atomic_t in_flight[2];
-	unsigned long stamp;
+	unsigned long long stamp;
 	struct dm_stat_percpu tmp;
 };
 
 struct dm_stat {
 	struct list_head list_entry;
 	int id;
+	unsigned stat_flags;
 	size_t n_entries;
 	sector_t start;
 	sector_t end;
@@ -53,6 +54,8 @@ struct dm_stat {
 	struct dm_stat_shared stat_shared[0];
 };
 
+#define STAT_PRECISE_TIMESTAMPS		1
+
 struct dm_stats_last_position {
 	sector_t last_sector;
 	unsigned last_rw;
@@ -224,7 +227,8 @@ void dm_stats_cleanup(struct dm_stats *stats)
 }
 
 static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
-			   sector_t step, const char *program_id, const char *aux_data,
+			   sector_t step, unsigned stat_flags,
+			   const char *program_id, const char *aux_data,
 			   void (*suspend_callback)(struct mapped_device *),
 			   void (*resume_callback)(struct mapped_device *),
 			   struct mapped_device *md)
@@ -265,6 +269,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
 	if (!s)
 		return -ENOMEM;
 
+	s->stat_flags = stat_flags;
 	s->n_entries = n_entries;
 	s->start = start;
 	s->end = end;
@@ -414,18 +419,24 @@ static int dm_stats_list(struct dm_stats *stats, const char *program,
 	return 1;
 }
 
-static void dm_stat_round(struct dm_stat_shared *shared, struct dm_stat_percpu *p)
+static void dm_stat_round(struct dm_stat *s, struct dm_stat_shared *shared,
+			  struct dm_stat_percpu *p)
 {
 	/*
 	 * This is racy, but so is part_round_stats_single.
 	 */
-	unsigned long now = jiffies;
-	unsigned in_flight_read;
-	unsigned in_flight_write;
-	unsigned long difference = now - shared->stamp;
+	unsigned long long now, difference;
+	unsigned in_flight_read, in_flight_write;
 
+	if (likely(!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)))
+		now = jiffies;
+	else
+		now = ktime_to_ns(ktime_get());
+
+	difference = now - shared->stamp;
 	if (!difference)
 		return;
+
 	in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]);
 	in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]);
 	if (in_flight_read)
@@ -440,8 +451,9 @@ static void dm_stat_round(struct dm_stat_shared *shared, struct dm_stat_percpu *
 }
 
 static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
-			      unsigned long bi_rw, sector_t len, bool merged,
-			      bool end, unsigned long duration)
+			      unsigned long bi_rw, sector_t len,
+			      struct dm_stats_aux *stats_aux, bool end,
+			      unsigned long duration_jiffies)
 {
 	unsigned long idx = bi_rw & REQ_WRITE;
 	struct dm_stat_shared *shared = &s->stat_shared[entry];
@@ -471,15 +483,18 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
 	p = &s->stat_percpu[smp_processor_id()][entry];
 
 	if (!end) {
-		dm_stat_round(shared, p);
+		dm_stat_round(s, shared, p);
 		atomic_inc(&shared->in_flight[idx]);
 	} else {
-		dm_stat_round(shared, p);
+		dm_stat_round(s, shared, p);
 		atomic_dec(&shared->in_flight[idx]);
 		p->sectors[idx] += len;
 		p->ios[idx] += 1;
-		p->merges[idx] += merged;
-		p->ticks[idx] += duration;
+		p->merges[idx] += stats_aux->merged;
+		if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS))
+			p->ticks[idx] += duration_jiffies;
+		else
+			p->ticks[idx] += stats_aux->duration_ns;
 	}
 
 #if BITS_PER_LONG == 32
@@ -491,7 +506,7 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
 
 static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
 			  sector_t bi_sector, sector_t end_sector,
-			  bool end, unsigned long duration,
+			  bool end, unsigned long duration_jiffies,
 			  struct dm_stats_aux *stats_aux)
 {
 	sector_t rel_sector, offset, todo, fragment_len;
@@ -520,7 +535,7 @@ static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
 		if (fragment_len > s->step - offset)
 			fragment_len = s->step - offset;
 		dm_stat_for_entry(s, entry, bi_rw, fragment_len,
-				  stats_aux->merged, end, duration);
+				  stats_aux, end, duration_jiffies);
 		todo -= fragment_len;
 		entry++;
 		offset = 0;
@@ -529,11 +544,13 @@ static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
 
 void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
 			 sector_t bi_sector, unsigned bi_sectors, bool end,
-			 unsigned long duration, struct dm_stats_aux *stats_aux)
+			 unsigned long duration_jiffies,
+			 struct dm_stats_aux *stats_aux)
 {
 	struct dm_stat *s;
 	sector_t end_sector;
 	struct dm_stats_last_position *last;
+	bool got_precise_time;
 
 	if (unlikely(!bi_sectors))
 		return;
@@ -557,8 +574,17 @@ void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
 
 	rcu_read_lock();
 
-	list_for_each_entry_rcu(s, &stats->list, list_entry)
-		__dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration, stats_aux);
+	got_precise_time = false;
+	list_for_each_entry_rcu(s, &stats->list, list_entry) {
+		if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) {
+			if (!end)
+				stats_aux->duration_ns = ktime_to_ns(ktime_get());
+			else
+				stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns;
+			got_precise_time = true;
+		}
+		__dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration_jiffies, stats_aux);
+	}
 
 	rcu_read_unlock();
 }
@@ -571,7 +597,7 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
 
 	local_irq_disable();
 	p = &s->stat_percpu[smp_processor_id()][x];
-	dm_stat_round(shared, p);
+	dm_stat_round(s, shared, p);
 	local_irq_enable();
 
 	memset(&shared->tmp, 0, sizeof(shared->tmp));
@@ -643,11 +669,15 @@ static int dm_stats_clear(struct dm_stats *stats, int id)
 /*
  * This is like jiffies_to_msec, but works for 64-bit values.
  */
-static unsigned long long dm_jiffies_to_msec64(unsigned long long j)
+static unsigned long long dm_jiffies_to_msec64(struct dm_stat *s, unsigned long long j)
 {
-	unsigned long long result = 0;
+	unsigned long long result;
 	unsigned mult;
 
+	if (s->stat_flags & STAT_PRECISE_TIMESTAMPS)
+		return j;
+
+	result = 0;
 	if (j)
 		result = jiffies_to_msecs(j & 0x3fffff);
 	if (j >= 1 << 22) {
@@ -709,16 +739,16 @@ static int dm_stats_print(struct dm_stats *stats, int id,
 		       shared->tmp.ios[READ],
 		       shared->tmp.merges[READ],
 		       shared->tmp.sectors[READ],
-		       dm_jiffies_to_msec64(shared->tmp.ticks[READ]),
+		       dm_jiffies_to_msec64(s, shared->tmp.ticks[READ]),
 		       shared->tmp.ios[WRITE],
 		       shared->tmp.merges[WRITE],
 		       shared->tmp.sectors[WRITE],
-		       dm_jiffies_to_msec64(shared->tmp.ticks[WRITE]),
+		       dm_jiffies_to_msec64(s, shared->tmp.ticks[WRITE]),
 		       dm_stat_in_flight(shared),
-		       dm_jiffies_to_msec64(shared->tmp.io_ticks_total),
-		       dm_jiffies_to_msec64(shared->tmp.time_in_queue),
-		       dm_jiffies_to_msec64(shared->tmp.io_ticks[READ]),
-		       dm_jiffies_to_msec64(shared->tmp.io_ticks[WRITE]));
+		       dm_jiffies_to_msec64(s, shared->tmp.io_ticks_total),
+		       dm_jiffies_to_msec64(s, shared->tmp.time_in_queue),
+		       dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]),
+		       dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE]));
 
 		if (unlikely(sz + 1 >= maxlen))
 			goto buffer_overflow;
@@ -769,21 +799,31 @@ static int message_stats_create(struct mapped_device *md,
 	unsigned long long start, end, len, step;
 	unsigned divisor;
 	const char *program_id, *aux_data;
+	unsigned stat_flags = 0;
+
+	struct dm_arg_set as, as_backup;
+	const char *a;
+	unsigned feature_args;
 
 	/*
 	 * Input format:
-	 *   <range> <step> [<program_id> [<aux_data>]]
+	 *   <range> <step> [<extra_parameters> <parameters>] [<program_id> [<aux_data>]]
 	 */
 
-	if (argc < 3 || argc > 5)
+	if (argc < 3)
 		return -EINVAL;
 
-	if (!strcmp(argv[1], "-")) {
+	as.argc = argc;
+	as.argv = argv;
+	dm_consume_args(&as, 1);
+
+	a = dm_shift_arg(&as);
+	if (!strcmp(a, "-")) {
 		start = 0;
 		len = dm_get_size(md);
 		if (!len)
 			len = 1;
-	} else if (sscanf(argv[1], "%llu+%llu%c", &start, &len, &dummy) != 2 ||
+	} else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 ||
 		   start != (sector_t)start || len != (sector_t)len)
 		return -EINVAL;
 
@@ -791,7 +831,8 @@ static int message_stats_create(struct mapped_device *md,
 	if (start >= end)
 		return -EINVAL;
 
-	if (sscanf(argv[2], "/%u%c", &divisor, &dummy) == 1) {
+	a = dm_shift_arg(&as);
+	if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) {
 		if (!divisor)
 			return -EINVAL;
 		step = end - start;
@@ -799,18 +840,39 @@ static int message_stats_create(struct mapped_device *md,
 			step++;
 		if (!step)
 			step = 1;
-	} else if (sscanf(argv[2], "%llu%c", &step, &dummy) != 1 ||
+	} else if (sscanf(a, "%llu%c", &step, &dummy) != 1 ||
 		   step != (sector_t)step || !step)
 		return -EINVAL;
 
+	as_backup = as;
+	a = dm_shift_arg(&as);
+	if (a && sscanf(a, "%u%c", &feature_args, &dummy) == 1) {
+		while (feature_args--) {
+			a = dm_shift_arg(&as);
+			if (!a)
+				return -EINVAL;
+			if (!strcasecmp(a, "precise_timestamps"))
+				stat_flags |= STAT_PRECISE_TIMESTAMPS;
+			else
+				return -EINVAL;
+		}
+	} else {
+		as = as_backup;
+	}
+
 	program_id = "-";
 	aux_data = "-";
 
-	if (argc > 3)
-		program_id = argv[3];
+	a = dm_shift_arg(&as);
+	if (a)
+		program_id = a;
 
-	if (argc > 4)
-		aux_data = argv[4];
+	a = dm_shift_arg(&as);
+	if (a)
+		aux_data = a;
+
+	if (as.argc)
+		return -EINVAL;
 
 	/*
 	 * If a buffer overflow happens after we created the region,
@@ -822,7 +884,7 @@ static int message_stats_create(struct mapped_device *md,
 	if (dm_message_test_buffer_overflow(result, maxlen))
 		return 1;
 
-	id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data,
+	id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags, program_id, aux_data,
 			     dm_internal_suspend_fast, dm_internal_resume_fast, md);
 	if (id < 0)
 		return id;
diff --git a/drivers/md/dm-stats.h b/drivers/md/dm-stats.h
index e7c4984bf235..f1c0956e3843 100644
--- a/drivers/md/dm-stats.h
+++ b/drivers/md/dm-stats.h
@@ -18,6 +18,7 @@ struct dm_stats {
 
 struct dm_stats_aux {
 	bool merged;
+	unsigned long long duration_ns;
 };
 
 void dm_stats_init(struct dm_stats *st);
@@ -30,7 +31,8 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
 
 void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
 			 sector_t bi_sector, unsigned bi_sectors, bool end,
-			 unsigned long duration, struct dm_stats_aux *aux);
+			 unsigned long duration_jiffies,
+			 struct dm_stats_aux *aux);
 
 static inline bool dm_stats_used(struct dm_stats *st)
 {

From dfcfac3e4cd94abef779297fab6adfd2dbcf52fa Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 9 Jun 2015 17:22:05 -0400
Subject: [PATCH 38/39] dm stats: collect and report histogram of IO latencies

Add an option to dm statistics to collect and report a histogram of
IO latencies.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 Documentation/device-mapper/statistics.txt |  19 +-
 drivers/md/dm-stats.c                      | 205 ++++++++++++++++++---
 2 files changed, 199 insertions(+), 25 deletions(-)

diff --git a/Documentation/device-mapper/statistics.txt b/Documentation/device-mapper/statistics.txt
index ff6baeaa71f7..4919b2dfd1b3 100644
--- a/Documentation/device-mapper/statistics.txt
+++ b/Documentation/device-mapper/statistics.txt
@@ -13,9 +13,10 @@ the range specified.
 The I/O statistics counters for each step-sized area of a region are
 in the same format as /sys/block/*/stat or /proc/diskstats (see:
 Documentation/iostats.txt).  But two extra counters (12 and 13) are
-provided: total time spent reading and writing.  All these counters may
-be accessed by sending the @stats_print message to the appropriate DM
-device via dmsetup.
+provided: total time spent reading and writing.  When the histogram
+argument is used, a 14th parameter is also reported that represents the
+histogram of latencies.  All these counters may be accessed by sending
+the @stats_print message to the appropriate DM device via dmsetup.
 
 The reported times are in milliseconds and the granularity depends on
 the kernel ticks.  When the option precise_timestamps is used, the
@@ -64,6 +65,18 @@ Messages
 		used, the resulting times are in nanoseconds instead of
 		milliseconds.  Precise timestamps are a little bit slower
 		to obtain than jiffies-based timestamps.
+	  histogram:n1,n2,n3,n4,... - collect histogram of latencies.  The
+		numbers n1, n2, etc. are times that represent the boundaries
+		of the histogram; they must be non-zero and strictly
+		increasing.  If precise_timestamps is not used, the times
+		are in milliseconds, otherwise they are in nanoseconds.  For
+		each range, the kernel will report the number of requests
+		that completed within this range.  For example, if we use
+		"histogram:10,20,30", the kernel will report four numbers
+		a:b:c:d.  a is the number of requests that took 0-10 ms to
+		complete, b the number that took 10-20 ms, c the number that
+		took 20-30 ms and d the number that took 30 ms or more (a
+		time equal to a boundary falls in the range starting there).
 
 	<program_id>
 	  An optional parameter.  A name that uniquely identifies
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 4bfd84ab1d4a..faf1071ef631 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -29,6 +29,7 @@ struct dm_stat_percpu {
 	unsigned long long io_ticks[2];
 	unsigned long long io_ticks_total;
 	unsigned long long time_in_queue;
+	unsigned long long *histogram;
 };
 
 struct dm_stat_shared {
@@ -45,11 +46,14 @@ struct dm_stat {
 	sector_t start;
 	sector_t end;
 	sector_t step;
+	unsigned n_histogram_entries;
+	unsigned long long *histogram_boundaries;
 	const char *program_id;
 	const char *aux_data;
 	struct rcu_head rcu_head;
 	size_t shared_alloc_size;
 	size_t percpu_alloc_size;
+	size_t histogram_alloc_size;
 	struct dm_stat_percpu *stat_percpu[NR_CPUS];
 	struct dm_stat_shared stat_shared[0];
 };
@@ -173,8 +177,11 @@ static void dm_stat_free(struct rcu_head *head)
 
 	kfree(s->program_id);
 	kfree(s->aux_data);
-	for_each_possible_cpu(cpu)
+	for_each_possible_cpu(cpu) {
+		dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size);
 		dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size);
+	}
+	dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size);
 	dm_kvfree(s, s->shared_alloc_size);
 }
 
@@ -228,6 +235,8 @@ void dm_stats_cleanup(struct dm_stats *stats)
 
 static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
 			   sector_t step, unsigned stat_flags,
+			   unsigned n_histogram_entries,
+			   unsigned long long *histogram_boundaries,
 			   const char *program_id, const char *aux_data,
 			   void (*suspend_callback)(struct mapped_device *),
 			   void (*resume_callback)(struct mapped_device *),
@@ -239,6 +248,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
 	size_t ni;
 	size_t shared_alloc_size;
 	size_t percpu_alloc_size;
+	size_t histogram_alloc_size;
 	struct dm_stat_percpu *p;
 	int cpu;
 	int ret_id;
@@ -262,7 +272,12 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
 	if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries)
 		return -EOVERFLOW;
 
-	if (!check_shared_memory(shared_alloc_size + num_possible_cpus() * percpu_alloc_size))
+	histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long);
+	if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long))
+		return -EOVERFLOW;
+
+	if (!check_shared_memory(shared_alloc_size + histogram_alloc_size +
+				 num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size)))
 		return -ENOMEM;
 
 	s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE);
@@ -276,6 +291,15 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
 	s->step = step;
 	s->shared_alloc_size = shared_alloc_size;
 	s->percpu_alloc_size = percpu_alloc_size;
+	s->histogram_alloc_size = histogram_alloc_size;
+
+	s->n_histogram_entries = n_histogram_entries;
+	s->histogram_boundaries = kmemdup(histogram_boundaries,
+					  s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
+	if (!s->histogram_boundaries) {
+		r = -ENOMEM;
+		goto out;
+	}
 
 	s->program_id = kstrdup(program_id, GFP_KERNEL);
 	if (!s->program_id) {
@@ -293,6 +317,19 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
 		atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
 	}
 
+	if (s->n_histogram_entries) {
+		unsigned long long *hi;
+		hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE);
+		if (!hi) {
+			r = -ENOMEM;
+			goto out;
+		}
+		for (ni = 0; ni < n_entries; ni++) {
+			s->stat_shared[ni].tmp.histogram = hi;
+			hi += s->n_histogram_entries + 1;
+		}
+	}
+
 	for_each_possible_cpu(cpu) {
 		p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu));
 		if (!p) {
@@ -300,6 +337,18 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
 			goto out;
 		}
 		s->stat_percpu[cpu] = p;
+		if (s->n_histogram_entries) {
+			unsigned long long *hi;
+			hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu));
+			if (!hi) {
+				r = -ENOMEM;
+				goto out;
+			}
+			for (ni = 0; ni < n_entries; ni++) {
+				p[ni].histogram = hi;
+				hi += s->n_histogram_entries + 1;
+			}
+		}
 	}
 
 	/*
@@ -377,9 +426,11 @@ static int dm_stats_delete(struct dm_stats *stats, int id)
 	 * vfree can't be called from RCU callback
 	 */
 	for_each_possible_cpu(cpu)
-		if (is_vmalloc_addr(s->stat_percpu))
+		if (is_vmalloc_addr(s->stat_percpu) ||
+		    is_vmalloc_addr(s->stat_percpu[cpu][0].histogram))
 			goto do_sync_free;
-	if (is_vmalloc_addr(s)) {
+	if (is_vmalloc_addr(s) ||
+	    is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) {
 do_sync_free:
 		synchronize_rcu_expedited();
 		dm_stat_free(&s->rcu_head);
@@ -486,15 +537,32 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
 		dm_stat_round(s, shared, p);
 		atomic_inc(&shared->in_flight[idx]);
 	} else {
+		unsigned long long duration;
 		dm_stat_round(s, shared, p);
 		atomic_dec(&shared->in_flight[idx]);
 		p->sectors[idx] += len;
 		p->ios[idx] += 1;
 		p->merges[idx] += stats_aux->merged;
-		if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS))
+		if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) {
 			p->ticks[idx] += duration_jiffies;
-		else
+			duration = jiffies_to_msecs(duration_jiffies);
+		} else {
 			p->ticks[idx] += stats_aux->duration_ns;
+			duration = stats_aux->duration_ns;
+		}
+		if (s->n_histogram_entries) {
+			unsigned lo = 0, hi = s->n_histogram_entries + 1;
+			while (lo + 1 < hi) {
+				unsigned mid = (lo + hi) / 2;
+				if (s->histogram_boundaries[mid - 1] > duration) {
+					hi = mid;
+				} else {
+					lo = mid;
+				}
+
+			}
+			p->histogram[lo]++;
+		}
 	}
 
 #if BITS_PER_LONG == 32
@@ -600,7 +668,22 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
 	dm_stat_round(s, shared, p);
 	local_irq_enable();
 
-	memset(&shared->tmp, 0, sizeof(shared->tmp));
+	shared->tmp.sectors[READ] = 0;
+	shared->tmp.sectors[WRITE] = 0;
+	shared->tmp.ios[READ] = 0;
+	shared->tmp.ios[WRITE] = 0;
+	shared->tmp.merges[READ] = 0;
+	shared->tmp.merges[WRITE] = 0;
+	shared->tmp.ticks[READ] = 0;
+	shared->tmp.ticks[WRITE] = 0;
+	shared->tmp.io_ticks[READ] = 0;
+	shared->tmp.io_ticks[WRITE] = 0;
+	shared->tmp.io_ticks_total = 0;
+	shared->tmp.time_in_queue = 0;
+
+	if (s->n_histogram_entries)
+		memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long));
+
 	for_each_possible_cpu(cpu) {
 		p = &s->stat_percpu[cpu][x];
 		shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]);
@@ -615,6 +698,11 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
 		shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]);
 		shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total);
 		shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue);
+		if (s->n_histogram_entries) {
+			unsigned i;
+			for (i = 0; i < s->n_histogram_entries + 1; i++)
+				shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]);
+		}
 	}
 }
 
@@ -644,6 +732,15 @@ static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end,
 		p->io_ticks_total -= shared->tmp.io_ticks_total;
 		p->time_in_queue -= shared->tmp.time_in_queue;
 		local_irq_enable();
+		if (s->n_histogram_entries) {
+			unsigned i;
+			for (i = 0; i < s->n_histogram_entries + 1; i++) {
+				local_irq_disable();
+				p = &s->stat_percpu[smp_processor_id()][x];
+				p->histogram[i] -= shared->tmp.histogram[i];
+				local_irq_enable();
+			}
+		}
 	}
 }
 
@@ -733,7 +830,7 @@ static int dm_stats_print(struct dm_stats *stats, int id,
 
 		__dm_stat_init_temporary_percpu_totals(shared, s, x);
 
-		DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu\n",
+		DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu",
 		       (unsigned long long)start,
 		       (unsigned long long)step,
 		       shared->tmp.ios[READ],
@@ -749,6 +846,13 @@ static int dm_stats_print(struct dm_stats *stats, int id,
 		       dm_jiffies_to_msec64(s, shared->tmp.time_in_queue),
 		       dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]),
 		       dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE]));
+		if (s->n_histogram_entries) {
+			unsigned i;
+			for (i = 0; i < s->n_histogram_entries + 1; i++) {
+				DMEMIT("%s%llu", !i ? " " : ":", shared->tmp.histogram[i]);
+			}
+		}
+		DMEMIT("\n");
 
 		if (unlikely(sz + 1 >= maxlen))
 			goto buffer_overflow;
@@ -790,10 +894,47 @@ static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data
 	return 0;
 }
 
+static int parse_histogram(const char *h, unsigned *n_histogram_entries,
+			   unsigned long long **histogram_boundaries)
+{
+	const char *q;
+	unsigned n;
+	unsigned long long last;
+
+	*n_histogram_entries = 1;
+	for (q = h; *q; q++)
+		if (*q == ',')
+			(*n_histogram_entries)++;
+
+	*histogram_boundaries = kmalloc(*n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
+	if (!*histogram_boundaries)
+		return -ENOMEM;
+
+	n = 0;
+	last = 0;
+	while (1) {
+		unsigned long long hi;
+		int s;
+		char ch;
+		s = sscanf(h, "%llu%c", &hi, &ch);
+		if (!s || (s == 2 && ch != ','))
+			return -EINVAL;
+		if (hi <= last)
+			return -EINVAL;
+		last = hi;
+		(*histogram_boundaries)[n] = hi;
+		if (s == 1)
+			return 0;
+		h = strchr(h, ',') + 1;
+		n++;
+	}
+}
+
 static int message_stats_create(struct mapped_device *md,
 				unsigned argc, char **argv,
 				char *result, unsigned maxlen)
 {
+	int r;
 	int id;
 	char dummy;
 	unsigned long long start, end, len, step;
@@ -801,6 +942,9 @@ static int message_stats_create(struct mapped_device *md,
 	const char *program_id, *aux_data;
 	unsigned stat_flags = 0;
 
+	unsigned n_histogram_entries = 0;
+	unsigned long long *histogram_boundaries = NULL;
+
 	struct dm_arg_set as, as_backup;
 	const char *a;
 	unsigned feature_args;
@@ -811,7 +955,7 @@ static int message_stats_create(struct mapped_device *md,
 	 */
 
 	if (argc < 3)
-		return -EINVAL;
+		goto ret_einval;
 
 	as.argc = argc;
 	as.argv = argv;
@@ -825,11 +969,11 @@ static int message_stats_create(struct mapped_device *md,
 			len = 1;
 	} else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 ||
 		   start != (sector_t)start || len != (sector_t)len)
-		return -EINVAL;
+		goto ret_einval;
 
 	end = start + len;
 	if (start >= end)
-		return -EINVAL;
+		goto ret_einval;
 
 	a = dm_shift_arg(&as);
 	if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) {
@@ -842,7 +986,7 @@ static int message_stats_create(struct mapped_device *md,
 			step = 1;
 	} else if (sscanf(a, "%llu%c", &step, &dummy) != 1 ||
 		   step != (sector_t)step || !step)
-		return -EINVAL;
+		goto ret_einval;
 
 	as_backup = as;
 	a = dm_shift_arg(&as);
@@ -850,11 +994,16 @@ static int message_stats_create(struct mapped_device *md,
 		while (feature_args--) {
 			a = dm_shift_arg(&as);
 			if (!a)
-				return -EINVAL;
+				goto ret_einval;
 			if (!strcasecmp(a, "precise_timestamps"))
 				stat_flags |= STAT_PRECISE_TIMESTAMPS;
-			else
-				return -EINVAL;
+			else if (!strncasecmp(a, "histogram:", 10)) {
+				if (n_histogram_entries)
+					goto ret_einval;
+				if ((r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries)))
+					goto ret;
+			} else
+				goto ret_einval;
 		}
 	} else {
 		as = as_backup;
@@ -872,7 +1021,7 @@ static int message_stats_create(struct mapped_device *md,
 		aux_data = a;
 
 	if (as.argc)
-		return -EINVAL;
+		goto ret_einval;
 
 	/*
 	 * If a buffer overflow happens after we created the region,
@@ -881,17 +1030,29 @@ static int message_stats_create(struct mapped_device *md,
 	 * leaked).  So we must detect buffer overflow in advance.
 	 */
 	snprintf(result, maxlen, "%d", INT_MAX);
-	if (dm_message_test_buffer_overflow(result, maxlen))
-		return 1;
+	if (dm_message_test_buffer_overflow(result, maxlen)) {
+		r = 1;
+		goto ret;
+	}
 
-	id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags, program_id, aux_data,
+	id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags,
+			     n_histogram_entries, histogram_boundaries, program_id, aux_data,
 			     dm_internal_suspend_fast, dm_internal_resume_fast, md);
-	if (id < 0)
-		return id;
+	if (id < 0) {
+		r = id;
+		goto ret;
+	}
 
 	snprintf(result, maxlen, "%d", id);
 
-	return 1;
+	r = 1;
+	goto ret;
+
+ret_einval:
+	r = -EINVAL;
+ret:
+	kfree(histogram_boundaries);
+	return r;
 }
 
 static int message_stats_delete(struct mapped_device *md,

From e262f34741522e0d821642e5449c6eeb512723fc Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 9 Jun 2015 17:22:49 -0400
Subject: [PATCH 39/39] dm stats: add support for request-based DM devices

This makes it possible to use dm stats with DM multipath.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-stats.c |  5 -----
 drivers/md/dm.c       | 26 ++++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index faf1071ef631..8a8b48fa901a 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -1155,11 +1155,6 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
 {
 	int r;
 
-	if (dm_request_based(md)) {
-		DMWARN("Statistics are only supported for bio-based devices");
-		return -EOPNOTSUPP;
-	}
-
 	/* All messages here must start with '@' */
 	if (!strcasecmp(argv[0], "@stats_create"))
 		r = message_stats_create(md, argc, argv, result, maxlen);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 767bce906588..90dc49e3c78f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -86,6 +86,9 @@ struct dm_rq_target_io {
 	struct kthread_work work;
 	int error;
 	union map_info info;
+	struct dm_stats_aux stats_aux;
+	unsigned long duration_jiffies;
+	unsigned n_sectors;
 };
 
 /*
@@ -995,6 +998,17 @@ static struct dm_rq_target_io *tio_from_request(struct request *rq)
 	return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
 }
 
+static void rq_end_stats(struct mapped_device *md, struct request *orig)
+{
+	if (unlikely(dm_stats_used(&md->stats))) {
+		struct dm_rq_target_io *tio = tio_from_request(orig);
+		tio->duration_jiffies = jiffies - tio->duration_jiffies;
+		dm_stats_account_io(&md->stats, orig->cmd_flags, blk_rq_pos(orig),
+				    tio->n_sectors, true, tio->duration_jiffies,
+				    &tio->stats_aux);
+	}
+}
+
 /*
  * Don't touch any member of the md after calling this function because
  * the md may be freed in dm_put() at the end of this function.
@@ -1078,6 +1092,7 @@ static void dm_end_request(struct request *clone, int error)
 	}
 
 	free_rq_clone(clone);
+	rq_end_stats(md, rq);
 	if (!rq->q->mq_ops)
 		blk_end_request_all(rq, error);
 	else
@@ -1120,6 +1135,7 @@ static void dm_requeue_original_request(struct mapped_device *md,
 
 	dm_unprep_request(rq);
 
+	rq_end_stats(md, rq);
 	if (!rq->q->mq_ops)
 		old_requeue_request(rq);
 	else {
@@ -1211,6 +1227,7 @@ static void dm_softirq_done(struct request *rq)
 	int rw;
 
 	if (!clone) {
+		rq_end_stats(tio->md, rq);
 		rw = rq_data_dir(rq);
 		if (!rq->q->mq_ops) {
 			blk_end_request_all(rq, tio->error);
@@ -1943,6 +1960,14 @@ static void dm_start_request(struct mapped_device *md, struct request *orig)
 		md->last_rq_start_time = ktime_get();
 	}
 
+	if (unlikely(dm_stats_used(&md->stats))) {
+		struct dm_rq_target_io *tio = tio_from_request(orig);
+		tio->duration_jiffies = jiffies;
+		tio->n_sectors = blk_rq_sectors(orig);
+		dm_stats_account_io(&md->stats, orig->cmd_flags, blk_rq_pos(orig),
+				    tio->n_sectors, false, 0, &tio->stats_aux);
+	}
+
 	/*
 	 * Hold the md reference here for the in-flight I/O.
 	 * We can't rely on the reference count by device opener,
@@ -2689,6 +2714,7 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 		/* Direct call is fine since .queue_rq allows allocations */
 		if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
 			/* Undo dm_start_request() before requeuing */
+			rq_end_stats(md, rq);
 			rq_completed(md, rq_data_dir(rq), false);
 			return BLK_MQ_RQ_QUEUE_BUSY;
 		}