From 60b0ea120c80cba1cf5fe5ae82a35b1179263de3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 4 Apr 2014 21:26:04 -0600
Subject: [PATCH 1/8] percpu_counter: fix bad counter state during suspend

I got a bug report yesterday from Laszlo Ersek <lersek@xxxxxxxxxx>, in
which he states that his kvm instance fails to suspend. He Laszlo
bisected it down to this commit:

commit 1cf7e9c68fe84248174e998922b39e508375e7c1
Author: Jens Axboe <axboe@xxxxxxxxx>
Date: Fri Nov 1 10:52:52 2013 -0600

virtio_blk: blk-mq support

where virtio-blk is converted to use the blk-mq infrastructure. After
digging a bit, it became clear that the issue was with the queue drain.
blk-mq tracks queue usage in a percpu counter, which is incremented on
request alloc and decremented when the request is freed. The initial
hunt was for an inconsistency in blk-mq, but everything seemed fine. In
fact, the counter only returned crazy values when suspend was in
progress. When a CPU is unplugged, the percpu counters merges that CPU
state with the general state. blk-mq takes care to register a hotcpu
notifier with the appropriate priority, so we know it runs after the
percpu counter notifier. However, the percpu counter notifier only
merges the state when the CPU is fully gone. This leaves a state
transition where the CPU going away is no longer in the online mask, yet
it still holds private values. This means that in this state,
percpu_counter_sum() returns invalid results, and the suspend then hangs
waiting for abs(dead-cpu-value) requests to complete which of course
will never happen.

Fix this by clearing the state earlier, so we never have a case where
the CPU isn't in online mask but still holds private state. This bug has
been there since forever, I guess we don't have a lot of users where
percpu counters needs to be reliable during the suspend cycle.

Reported-by: <lersek@redhat.com>
Tested-by: Laszlo Ersek <lersek@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 lib/percpu_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 8280a5dd1727..7dd33577b905 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -169,7 +169,7 @@ static int percpu_counter_hotcpu_callback(struct notifier_block *nb,
 	struct percpu_counter *fbc;
 
 	compute_batch_value();
-	if (action != CPU_DEAD)
+	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
 		return NOTIFY_OK;
 
 	cpu = (unsigned long)hcpu;

From bccb5f7c8bdfe460d95f986c6edf2e75d8052897 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 4 Apr 2014 21:34:48 -0600
Subject: [PATCH 2/8] blk-mq: fix potential stall during CPU unplug with IO
 pending

When a CPU is unplugged, we move the blk_mq_ctx request entries
to the current queue. The current code forgets to remap the
blk_mq_hw_ctx before marking the software context pending,
which breaks if old-cpu and new-cpu don't map to the same
hardware queue.

Additionally, if we mark entries as pending in the new
hardware queue, then make sure we schedule it for running.
Otherwise request could be sitting there until someone else
queues IO for that hardware queue.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index b1bcc619d0ea..1d2a9bdbee57 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -956,6 +956,7 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
 			       unsigned int cpu)
 {
 	struct blk_mq_hw_ctx *hctx = data;
+	struct request_queue *q = hctx->queue;
 	struct blk_mq_ctx *ctx;
 	LIST_HEAD(tmp);
 
@@ -965,7 +966,7 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
 	/*
 	 * Move ctx entries to new CPU, if this one is going away.
 	 */
-	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
+	ctx = __blk_mq_get_ctx(q, cpu);
 
 	spin_lock(&ctx->lock);
 	if (!list_empty(&ctx->rq_list)) {
@@ -977,7 +978,7 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
 	if (list_empty(&tmp))
 		return;
 
-	ctx = blk_mq_get_ctx(hctx->queue);
+	ctx = blk_mq_get_ctx(q);
 	spin_lock(&ctx->lock);
 
 	while (!list_empty(&tmp)) {
@@ -988,10 +989,13 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
 		list_move_tail(&rq->queuelist, &ctx->rq_list);
 	}
 
+	hctx = q->mq_ops->map_queue(q, ctx->cpu);
 	blk_mq_hctx_mark_pending(hctx, ctx);
 
 	spin_unlock(&ctx->lock);
 	blk_mq_put_ctx(ctx);
+
+	blk_mq_run_hw_queue(hctx, true);
 }
 
 static int blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,

From 44bd70c347c466616e430b044c49d48fac29789d Mon Sep 17 00:00:00 2001
From: Mike Galbraith <bitbucket@online.de>
Date: Tue, 8 Apr 2014 13:43:52 -0700
Subject: [PATCH 3/8] drivers/block/loop.c: ratelimit error messages

Metric tons of high speed spew is not helpful when things go pear shaped.
systemd lost its mind, forgot how to stop services it insists on being
sole manager of, massive printk() flood ensued, box eventually died.

[16206.684000] loop: Write error at byte offset 11412291584, length 4096.
[16206.684000] systemd-journald[1758]: /dev/kmsg buffer overrun, some messages lost.
[16206.684000] loop: Write error at byte offset 13155434496, length 4096.
[16206.684000] loop: Write error at byte offset 13155438592, length 4096.
[16206.684000] loop: Write error at byte offset 13155442688, length 4096.
[16206.684000] loop: Write error at byte offset 13960736768, length 4096.
[16206.684000] loop: Write error at byte offset 14229172224, length 4096.
[16206.684000] systemd-journald[1758]: /dev/kmsg buffer overrun, some messages lost.
[16206.684000] loop: Write error at byte offset 14766043136, length 4096.
[16206.684000] loop: Write error at byte offset 15034478592, length 4096.
[16206.684000] systemd-journald[1758]: /dev/kmsg buffer overrun, some messages lost.

Signed-off-by: Mike Galbraith <bitbucket@online.de>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/loop.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 66e8c3b94ef3..f70a230a2945 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -237,7 +237,7 @@ static int __do_lo_send_write(struct file *file,
 	file_end_write(file);
 	if (likely(bw == len))
 		return 0;
-	printk(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n",
+	printk_ratelimited(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n",
 			(unsigned long long)pos, len);
 	if (bw >= 0)
 		bw = -EIO;
@@ -277,7 +277,7 @@ static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec,
 		return __do_lo_send_write(lo->lo_backing_file,
 				page_address(page), bvec->bv_len,
 				pos);
-	printk(KERN_ERR "loop: Transfer error at byte offset %llu, "
+	printk_ratelimited(KERN_ERR "loop: Transfer error at byte offset %llu, "
 			"length %i.\n", (unsigned long long)pos, bvec->bv_len);
 	if (ret > 0)
 		ret = -EIO;
@@ -316,7 +316,7 @@ static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos)
 out:
 	return ret;
 fail:
-	printk(KERN_ERR "loop: Failed to allocate temporary page for write.\n");
+	printk_ratelimited(KERN_ERR "loop: Failed to allocate temporary page for write.\n");
 	ret = -ENOMEM;
 	goto out;
 }
@@ -345,7 +345,7 @@ lo_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 		size = p->bsize;
 
 	if (lo_do_transfer(lo, READ, page, buf->offset, p->page, p->offset, size, IV)) {
-		printk(KERN_ERR "loop: transfer error block %ld\n",
+		printk_ratelimited(KERN_ERR "loop: transfer error block %ld\n",
 		       page->index);
 		size = -EINVAL;
 	}

From b7aa84d9cb9f26da1a9312c3e39dbd1a3c25a426 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Tue, 8 Apr 2014 22:43:43 -0400
Subject: [PATCH 4/8] block: Fix for_each_bvec()

Commit 4550dd6c6b062 introduced for_each_bvec() which iterates over each
bvec attached to a bio or bip. However, the macro fails to check bi_size
before dereferencing which can lead to crashes while counting/mapping
integrity scatterlist segments.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Kent Overstreet <kmo@daterainc.com>
Cc: Nicholas Bellinger <nab@linux-iscsi.org>
Cc: <stable@vger.kernel.org> # v3.14+
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/bio.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 5a4d39b4686b..5aa372a7380c 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -216,9 +216,9 @@ static inline void bvec_iter_advance(struct bio_vec *bv, struct bvec_iter *iter,
 }
 
 #define for_each_bvec(bvl, bio_vec, iter, start)			\
-	for ((iter) = start;						\
-	     (bvl) = bvec_iter_bvec((bio_vec), (iter)),			\
-		(iter).bi_size;						\
+	for (iter = (start);						\
+	     (iter).bi_size &&						\
+		((bvl = bvec_iter_bvec((bio_vec), (iter))), 1);	\
 	     bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len))
 
 

From 0bc69973066d62f70ce60bd6762b5cb75a699159 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Tue, 8 Apr 2014 22:43:44 -0400
Subject: [PATCH 5/8] block: Fix integrity verification

Commit bf36f9cfa6d3d caused a regression by effectively reverting Nic's
fix from 5837c80e870b that ensures we traverse the full bio_vec list
upon completion.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Nicholas Bellinger <nab@linux-iscsi.org>
Cc: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/bio-integrity.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 29696b78d1f4..b355b98dbf1b 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -309,10 +309,9 @@ static int bio_integrity_generate_verify(struct bio *bio, int operate)
 {
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
 	struct blk_integrity_exchg bix;
-	struct bio_vec bv;
-	struct bvec_iter iter;
+	struct bio_vec *bv;
 	sector_t sector;
-	unsigned int sectors, ret = 0;
+	unsigned int sectors, ret = 0, i;
 	void *prot_buf = bio->bi_integrity->bip_buf;
 
 	if (operate)
@@ -323,16 +322,16 @@ static int bio_integrity_generate_verify(struct bio *bio, int operate)
 	bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
 	bix.sector_size = bi->sector_size;
 
-	bio_for_each_segment(bv, bio, iter) {
-		void *kaddr = kmap_atomic(bv.bv_page);
-		bix.data_buf = kaddr + bv.bv_offset;
-		bix.data_size = bv.bv_len;
+	bio_for_each_segment_all(bv, bio, i) {
+		void *kaddr = kmap_atomic(bv->bv_page);
+		bix.data_buf = kaddr + bv->bv_offset;
+		bix.data_size = bv->bv_len;
 		bix.prot_buf = prot_buf;
 		bix.sector = sector;
 
-		if (operate) {
+		if (operate)
 			bi->generate_fn(&bix);
-		} else {
+		else {
 			ret = bi->verify_fn(&bix);
 			if (ret) {
 				kunmap_atomic(kaddr);
@@ -340,7 +339,7 @@ static int bio_integrity_generate_verify(struct bio *bio, int operate)
 			}
 		}
 
-		sectors = bv.bv_len / bi->sector_size;
+		sectors = bv->bv_len / bi->sector_size;
 		sector += sectors;
 		prot_buf += sectors * bi->tuple_size;
 

From e69f18f06b97ed29645d020500222bfcec2b42b2 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Tue, 8 Apr 2014 22:59:31 -0400
Subject: [PATCH 6/8] block: Ensure we only enable integrity metadata for reads
 and writes

We'd occasionally attempt to generate protection information for flushes
and other requests with a zero payload. Make sure we only attempt to
enable integrity for reads and writes.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/bio-integrity.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index b355b98dbf1b..1c2ce0c87711 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -182,6 +182,9 @@ static int bdev_integrity_enabled(struct block_device *bdev, int rw)
  */
 int bio_integrity_enabled(struct bio *bio)
 {
+	if (!bio_is_rw(bio))
+		return 0;
+
 	/* Already protected? */
 	if (bio_integrity(bio))
 		return 0;

From 2bfad21ecc6f837de29743f4419f47dee3fac9e2 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Wed, 9 Apr 2014 22:20:48 -0400
Subject: [PATCH 7/8] scsi: Make sure cmd_flags are 64-bit

cmd_flags in struct request is now 64 bits wide but the scsi_execute
functions truncated arguments passed to int leading to errors. Make sure
the flags parameters are u64.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Jens Axboe <axboe@fb.com>
CC: Jan Kara <jack@suse.cz>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/scsi/scsi_lib.c    | 4 ++--
 include/scsi/scsi_device.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 5681c05ac506..65a123d9c676 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -184,7 +184,7 @@ void scsi_queue_insert(struct scsi_cmnd *cmd, int reason)
  */
 int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 		 int data_direction, void *buffer, unsigned bufflen,
-		 unsigned char *sense, int timeout, int retries, int flags,
+		 unsigned char *sense, int timeout, int retries, u64 flags,
 		 int *resid)
 {
 	struct request *req;
@@ -235,7 +235,7 @@ EXPORT_SYMBOL(scsi_execute);
 int scsi_execute_req_flags(struct scsi_device *sdev, const unsigned char *cmd,
 		     int data_direction, void *buffer, unsigned bufflen,
 		     struct scsi_sense_hdr *sshdr, int timeout, int retries,
-		     int *resid, int flags)
+		     int *resid, u64 flags)
 {
 	char *sense = NULL;
 	int result;
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index 4e845b80efd3..5853c913d2b0 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -423,11 +423,11 @@ extern int scsi_is_target_device(const struct device *);
 extern int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 			int data_direction, void *buffer, unsigned bufflen,
 			unsigned char *sense, int timeout, int retries,
-			int flag, int *resid);
+			u64 flags, int *resid);
 extern int scsi_execute_req_flags(struct scsi_device *sdev,
 	const unsigned char *cmd, int data_direction, void *buffer,
 	unsigned bufflen, struct scsi_sense_hdr *sshdr, int timeout,
-	int retries, int *resid, int flags);
+	int retries, int *resid, u64 flags);
 static inline int scsi_execute_req(struct scsi_device *sdev,
 	const unsigned char *cmd, int data_direction, void *buffer,
 	unsigned bufflen, struct scsi_sense_hdr *sshdr, int timeout,

From 360f92c2443073143467a0088daffec96a17910b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 9 Apr 2014 20:27:01 -0600
Subject: [PATCH 8/8] block: fix regression with block enabled tagging

Martin reported that his test system would not boot with
current git, it oopsed with this:

BUG: unable to handle kernel paging request at ffff88046c6c9e80
IP: [<ffffffff812971e0>] blk_queue_start_tag+0x90/0x150
PGD 1ddf067 PUD 1de2067 PMD 47fc7d067 PTE 800000046c6c9060
Oops: 0002 [#1] SMP DEBUG_PAGEALLOC
Modules linked in: sd_mod lpfc(+) scsi_transport_fc scsi_tgt oracleasm
rpcsec_gss_krb5 ipv6 igb dca i2c_algo_bit i2c_core hwmon
CPU: 3 PID: 87 Comm: kworker/u17:1 Not tainted 3.14.0+ #246
Hardware name: Supermicro X9DRX+-F/X9DRX+-F, BIOS 3.00 07/09/2013
Workqueue: events_unbound async_run_entry_fn
task: ffff8802743c2150 ti: ffff880273d02000 task.ti: ffff880273d02000
RIP: 0010:[<ffffffff812971e0>]  [<ffffffff812971e0>]
blk_queue_start_tag+0x90/0x150
RSP: 0018:ffff880273d03a58  EFLAGS: 00010092
RAX: ffff88046c6c9e78 RBX: ffff880077208e78 RCX: 00000000fffc8da6
RDX: 00000000fffc186d RSI: 0000000000000009 RDI: 00000000fffc8d9d
RBP: ffff880273d03a88 R08: 0000000000000001 R09: ffff8800021c2410
R10: 0000000000000005 R11: 0000000000015b30 R12: ffff88046c5bb8a0
R13: ffff88046c5c0890 R14: 000000000000001e R15: 000000000000001e
FS:  0000000000000000(0000) GS:ffff880277b00000(0000)
knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffff88046c6c9e80 CR3: 00000000018f6000 CR4: 00000000000407e0
Stack:
 ffff880273d03a98 ffff880474b18800 0000000000000000 ffff880474157000
 ffff88046c5c0890 ffff880077208e78 ffff880273d03ae8 ffffffff813b9e62
 ffff880200000010 ffff880474b18968 ffff880474b18848 ffff88046c5c0cd8
Call Trace:
 [<ffffffff813b9e62>] scsi_request_fn+0xf2/0x510
 [<ffffffff81293167>] __blk_run_queue+0x37/0x50
 [<ffffffff8129ac43>] blk_execute_rq_nowait+0xb3/0x130
 [<ffffffff8129ad24>] blk_execute_rq+0x64/0xf0
 [<ffffffff8108d2b0>] ? bit_waitqueue+0xd0/0xd0
 [<ffffffff813bba35>] scsi_execute+0xe5/0x180
 [<ffffffff813bbe4a>] scsi_execute_req_flags+0x9a/0x110
 [<ffffffffa01b1304>] sd_spinup_disk+0x94/0x460 [sd_mod]
 [<ffffffff81160000>] ? __unmap_hugepage_range+0x200/0x2f0
 [<ffffffffa01b2b9a>] sd_revalidate_disk+0xaa/0x3f0 [sd_mod]
 [<ffffffffa01b2fb8>] sd_probe_async+0xd8/0x200 [sd_mod]
 [<ffffffff8107703f>] async_run_entry_fn+0x3f/0x140
 [<ffffffff8106a1c5>] process_one_work+0x175/0x410
 [<ffffffff8106b373>] worker_thread+0x123/0x400
 [<ffffffff8106b250>] ? manage_workers+0x160/0x160
 [<ffffffff8107104e>] kthread+0xce/0xf0
 [<ffffffff81070f80>] ? kthread_freezable_should_stop+0x70/0x70
 [<ffffffff815f0bac>] ret_from_fork+0x7c/0xb0
 [<ffffffff81070f80>] ? kthread_freezable_should_stop+0x70/0x70
Code: 48 0f ab 11 72 db 48 81 4b 40 00 00 10 00 89 83 08 01 00 00 48 89
df 49 8b 04 24 48 89 1c d0 e8 f7 a8 ff ff 49 8b 85 28 05 00 00 <48> 89
58 08 48 89 03 49 8d 85 28 05 00 00 48 89 43 08 49 89 9d
RIP  [<ffffffff812971e0>] blk_queue_start_tag+0x90/0x150
 RSP <ffff880273d03a58>
CR2: ffff88046c6c9e80

Martin bisected and found this to be the problem patch;

	commit 6d113398dcf4dfcd9787a4ead738b186f7b7ff0f
	Author: Jan Kara <jack@suse.cz>
	Date:   Mon Feb 24 16:39:54 2014 +0100

	    block: Stop abusing rq->csd.list in blk-softirq

and the problem was immediately apparent. The patch states that
it is safe to reuse queuelist at completion time, since it is
no longer used. However, that is not true if a device is using
block enabled tagging. If that is the case, then the queuelist
is reused to keep track of busy tags. If a device also ended
up using softirq completions, we'd reuse ->queuelist for the
IPI handling while block tagging was still using it. Boom.

Fix this by adding a new ipi_list list head, and share the
memory used with the request hash table. The hash table is
never used after the request is moved to the dispatch list,
which happens long before any potential completion of the
request. Add a new request bit for this, so we don't have
cases that check rq->hash while it could potentially have
been reused for the IPI completion.

Reported-by: Martin K. Petersen <martin.petersen@oracle.com>
Tested-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c          |  2 +-
 block/blk-softirq.c       | 17 ++++++-----------
 block/blk.h               |  2 +-
 block/elevator.c          |  2 ++
 include/linux/blk_types.h |  2 ++
 include/linux/blkdev.h    | 13 ++++++++++++-
 6 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 34d7c196338b..a0e3096c4bb5 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1307,7 +1307,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
 		struct request_list *rl = blk_rq_rl(req);
 
 		BUG_ON(!list_empty(&req->queuelist));
-		BUG_ON(!hlist_unhashed(&req->hash));
+		BUG_ON(ELV_ON_HASH(req));
 
 		blk_free_request(rl, req);
 		freed_request(rl, flags);
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index ebd6b6f1bdeb..53b1737e978d 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -30,8 +30,8 @@ static void blk_done_softirq(struct softirq_action *h)
 	while (!list_empty(&local_list)) {
 		struct request *rq;
 
-		rq = list_entry(local_list.next, struct request, queuelist);
-		list_del_init(&rq->queuelist);
+		rq = list_entry(local_list.next, struct request, ipi_list);
+		list_del_init(&rq->ipi_list);
 		rq->q->softirq_done_fn(rq);
 	}
 }
@@ -45,14 +45,9 @@ static void trigger_softirq(void *data)
 
 	local_irq_save(flags);
 	list = this_cpu_ptr(&blk_cpu_done);
-	/*
-	 * We reuse queuelist for a list of requests to process. Since the
-	 * queuelist is used by the block layer only for requests waiting to be
-	 * submitted to the device it is unused now.
-	 */
-	list_add_tail(&rq->queuelist, list);
+	list_add_tail(&rq->ipi_list, list);
 
-	if (list->next == &rq->queuelist)
+	if (list->next == &rq->ipi_list)
 		raise_softirq_irqoff(BLOCK_SOFTIRQ);
 
 	local_irq_restore(flags);
@@ -141,7 +136,7 @@ void __blk_complete_request(struct request *req)
 		struct list_head *list;
 do_local:
 		list = this_cpu_ptr(&blk_cpu_done);
-		list_add_tail(&req->queuelist, list);
+		list_add_tail(&req->ipi_list, list);
 
 		/*
 		 * if the list only contains our just added request,
@@ -149,7 +144,7 @@ void __blk_complete_request(struct request *req)
 		 * entries there, someone already raised the irq but it
 		 * hasn't run yet.
 		 */
-		if (list->next == &req->queuelist)
+		if (list->next == &req->ipi_list)
 			raise_softirq_irqoff(BLOCK_SOFTIRQ);
 	} else if (raise_blk_irq(ccpu, req))
 		goto do_local;
diff --git a/block/blk.h b/block/blk.h
index d23b415b8a28..1d880f1f957f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -78,7 +78,7 @@ static inline void blk_clear_rq_complete(struct request *rq)
 /*
  * Internal elevator interface
  */
-#define ELV_ON_HASH(rq) hash_hashed(&(rq)->hash)
+#define ELV_ON_HASH(rq) ((rq)->cmd_flags & REQ_HASHED)
 
 void blk_insert_flush(struct request *rq);
 void blk_abort_flushes(struct request_queue *q);
diff --git a/block/elevator.c b/block/elevator.c
index 42c45a7d6714..1e01b66a0b92 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -247,6 +247,7 @@ EXPORT_SYMBOL(elevator_exit);
 static inline void __elv_rqhash_del(struct request *rq)
 {
 	hash_del(&rq->hash);
+	rq->cmd_flags &= ~REQ_HASHED;
 }
 
 static void elv_rqhash_del(struct request_queue *q, struct request *rq)
@@ -261,6 +262,7 @@ static void elv_rqhash_add(struct request_queue *q, struct request *rq)
 
 	BUG_ON(ELV_ON_HASH(rq));
 	hash_add(e->hash, &rq->hash, rq_hash_key(rq));
+	rq->cmd_flags |= REQ_HASHED;
 }
 
 static void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index bbc3a6c88fce..aa0eaa2d0bd8 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -189,6 +189,7 @@ enum rq_flag_bits {
 	__REQ_KERNEL, 		/* direct IO to kernel pages */
 	__REQ_PM,		/* runtime pm request */
 	__REQ_END,		/* last of chain of requests */
+	__REQ_HASHED,		/* on IO scheduler merge hash */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -241,5 +242,6 @@ enum rq_flag_bits {
 #define REQ_KERNEL		(1ULL << __REQ_KERNEL)
 #define REQ_PM			(1ULL << __REQ_PM)
 #define REQ_END			(1ULL << __REQ_END)
+#define REQ_HASHED		(1ULL << __REQ_HASHED)
 
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1e1fa3f93d5f..99617cf7dd1a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -118,7 +118,18 @@ struct request {
 	struct bio *bio;
 	struct bio *biotail;
 
-	struct hlist_node hash;	/* merge hash */
+	/*
+	 * The hash is used inside the scheduler, and killed once the
+	 * request reaches the dispatch list. The ipi_list is only used
+	 * to queue the request for softirq completion, which is long
+	 * after the request has been unhashed (and even removed from
+	 * the dispatch list).
+	 */
+	union {
+		struct hlist_node hash;	/* merge hash */
+		struct list_head ipi_list;
+	};
+
 	/*
 	 * The rb_node is only used inside the io scheduler, requests
 	 * are pruned when moved to the dispatch queue. So let the