From e9ef46369f5107e634a93b7fc4e62a1f53343197 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Tue, 24 Jul 2012 15:01:04 -0600
Subject: [PATCH 01/44] NVMe: Set request queue logical block size

Sets the request queue logical block size with the block size of the
namespace.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
---
 drivers/block/nvme.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index f4996b0e4b1a..38b9c73f6706 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -1344,6 +1344,7 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid,
 	ns->disk = disk;
 	lbaf = id->flbas & 0xf;
 	ns->lba_shift = id->lbaf[lbaf].ds;
+	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
 
 	disk->major = nvme_major;
 	disk->minors = NVME_MINORS;

From 5c42ea1643a630060f9e71e06d3933d244970967 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Wed, 25 Jul 2012 16:05:18 -0600
Subject: [PATCH 02/44] NVMe: Fix nvme module init when nvme_major is set

register_blkdev returns 0 when given a valid major number.

Reported-by:Ross Zwisler <ross.zwisler@intel.com>
Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
---
 drivers/block/nvme.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index 38b9c73f6706..46e33eec6298 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -1716,9 +1716,11 @@ static int __init nvme_init(void)
 	if (IS_ERR(nvme_thread))
 		return PTR_ERR(nvme_thread);
 
-	nvme_major = register_blkdev(nvme_major, "nvme");
-	if (nvme_major <= 0)
+	result = register_blkdev(nvme_major, "nvme");
+	if (result < 0)
 		goto kill_kthread;
+	else if (result > 0)
+	    nvme_major = result;
 
 	result = pci_register_driver(&nvme_driver);
 	if (result)

From 50af8baec46a99a9b81a4600c0374f83a5a590a9 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Wed, 25 Jul 2012 16:07:55 -0600
Subject: [PATCH 03/44] NVMe: replace nvme_ns with nvme_dev for user admin

The function nvme_user_admin_command does not require a namespace to
proceed.  Replace with the nvme_dev structure so that it can be called
from contexts that do not have a namespace.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
---
 drivers/block/nvme.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index 46e33eec6298..89935853cb85 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -1151,10 +1151,9 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 	return status;
 }
 
-static int nvme_user_admin_cmd(struct nvme_ns *ns,
+static int nvme_user_admin_cmd(struct nvme_dev *dev,
 					struct nvme_admin_cmd __user *ucmd)
 {
-	struct nvme_dev *dev = ns->dev;
 	struct nvme_admin_cmd cmd;
 	struct nvme_command c;
 	int status, length;
@@ -1209,7 +1208,7 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
 	case NVME_IOCTL_ID:
 		return ns->ns_id;
 	case NVME_IOCTL_ADMIN_CMD:
-		return nvme_user_admin_cmd(ns, (void __user *)arg);
+		return nvme_user_admin_cmd(ns->dev, (void __user *)arg);
 	case NVME_IOCTL_SUBMIT_IO:
 		return nvme_submit_io(ns, (void __user *)arg);
 	default:

From a42ceccef0c43b46ff6bc1b12a7c1076ef243df1 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Wed, 25 Jul 2012 16:06:38 -0600
Subject: [PATCH 04/44] NVMe: use namespace id for nvme_get_features

The specification does not provide a use for command dword11 in the NVMe
Get Features command, but does use the NSID for some features.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
---
 drivers/block/nvme.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index 89935853cb85..7bcd88205a41 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -840,15 +840,15 @@ static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns,
 }
 
 static int nvme_get_features(struct nvme_dev *dev, unsigned fid,
-				unsigned dword11, dma_addr_t dma_addr)
+				unsigned nsid, dma_addr_t dma_addr)
 {
 	struct nvme_command c;
 
 	memset(&c, 0, sizeof(c));
 	c.features.opcode = nvme_admin_get_features;
+	c.features.nsid = cpu_to_le32(nsid);
 	c.features.prp1 = cpu_to_le64(dma_addr);
 	c.features.fid = cpu_to_le32(fid);
-	c.features.dword11 = cpu_to_le32(dword11);
 
 	return nvme_submit_admin_cmd(dev, &c, NULL);
 }

From 8fc23e032debd682f5ba9fc524a5846c10d2c522 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Thu, 26 Jul 2012 11:29:57 -0600
Subject: [PATCH 05/44] NVMe: Set block queue max sectors

Set the max hw sectors in a namespace's request queue if the nvme device
has a max data transfer size.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
---
 drivers/block/nvme.c | 7 +++++++
 include/linux/nvme.h | 1 +
 2 files changed, 8 insertions(+)

diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index 7bcd88205a41..11951fa11a90 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -78,6 +78,7 @@ struct nvme_dev {
 	char serial[20];
 	char model[40];
 	char firmware_rev[8];
+	u32 max_hw_sectors;
 };
 
 /*
@@ -1344,6 +1345,8 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid,
 	lbaf = id->flbas & 0xf;
 	ns->lba_shift = id->lbaf[lbaf].ds;
 	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
+	if (dev->max_hw_sectors)
+		blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
 
 	disk->major = nvme_major;
 	disk->minors = NVME_MINORS;
@@ -1485,6 +1488,10 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev)
 	memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
 	memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
 	memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
+	if (ctrl->mdts) {
+		int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
+		dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
+	}
 
 	id_ns = mem;
 	for (i = 1; i <= nn; i++) {
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 9490a00529f4..8c71d2004c6d 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -37,6 +37,7 @@ struct nvme_bar {
 
 #define NVME_CAP_TIMEOUT(cap)	(((cap) >> 24) & 0xff)
 #define NVME_CAP_STRIDE(cap)	(((cap) >> 32) & 0xf)
+#define NVME_CAP_MPSMIN(cap)	(((cap) >> 48) & 0xf)
 
 enum {
 	NVME_CC_ENABLE		= 1 << 0,

From a0cadb85b8b758608ae0759151e29de7581c6731 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Fri, 27 Jul 2012 13:57:23 -0400
Subject: [PATCH 06/44] NVMe: Do not set IO queue depth beyond device max

Set the depth for IO queues to the device's maximum supported queue
entries if the requested depth exceeds the device's capabilities.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
---
 drivers/block/nvme.c | 10 ++++++----
 include/linux/nvme.h |  1 +
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index 11951fa11a90..af1ef39bd6b4 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -893,7 +893,8 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 							int depth, int vector)
 {
 	struct device *dmadev = &dev->pci_dev->dev;
-	unsigned extra = (depth / 8) + (depth * sizeof(struct nvme_cmd_info));
+	unsigned extra = DIV_ROUND_UP(depth, 8) + (depth *
+						sizeof(struct nvme_cmd_info));
 	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
 	if (!nvmeq)
 		return NULL;
@@ -1391,7 +1392,7 @@ static int set_queue_count(struct nvme_dev *dev, int count)
 
 static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
 {
-	int result, cpu, i, nr_io_queues, db_bar_size;
+	int result, cpu, i, nr_io_queues, db_bar_size, q_depth;
 
 	nr_io_queues = num_online_cpus();
 	result = set_queue_count(dev, nr_io_queues);
@@ -1437,9 +1438,10 @@ static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
 		cpu = cpumask_next(cpu, cpu_online_mask);
 	}
 
+	q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1,
+								NVME_Q_DEPTH);
 	for (i = 0; i < nr_io_queues; i++) {
-		dev->queues[i + 1] = nvme_create_queue(dev, i + 1,
-							NVME_Q_DEPTH, i);
+		dev->queues[i + 1] = nvme_create_queue(dev, i + 1, q_depth, i);
 		if (IS_ERR(dev->queues[i + 1]))
 			return PTR_ERR(dev->queues[i + 1]);
 		dev->queue_count++;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 8c71d2004c6d..c25cccaa555a 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -35,6 +35,7 @@ struct nvme_bar {
 	__u64			acq;	/* Admin CQ Base Address */
 };
 
+#define NVME_CAP_MQES(cap)	((cap) & 0xffff)
 #define NVME_CAP_TIMEOUT(cap)	(((cap) >> 24) & 0xff)
 #define NVME_CAP_STRIDE(cap)	(((cap) >> 32) & 0xf)
 #define NVME_CAP_MPSMIN(cap)	(((cap) >> 48) & 0xf)

From c7d36ab8fa04c213328119a9c0d66985fe204ee5 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Fri, 27 Jul 2012 11:53:28 -0600
Subject: [PATCH 07/44] NVMe: Fix uninitialized iod compiler warning

Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
---
 drivers/block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index af1ef39bd6b4..6c0eb768562f 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -1159,7 +1159,7 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev,
 	struct nvme_admin_cmd cmd;
 	struct nvme_command c;
 	int status, length;
-	struct nvme_iod *iod;
+	struct nvme_iod *uninitialized_var(iod);
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;

From 22fff826e715e9727d3c7a69f15e602a9801b673 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Fri, 20 Jan 2012 07:55:30 -0500
Subject: [PATCH 08/44] NVMe: handle allocation failure in
 nvme_map_user_pages()

We should return here and avoid a NULL dereference.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
---
 drivers/block/nvme.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index 6c0eb768562f..064e86a6bb4e 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -1044,6 +1044,8 @@ static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
 	offset = offset_in_page(addr);
 	count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
 	pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
 
 	err = get_user_pages_fast(addr, count, 1, pages);
 	if (err < count) {

From 0ac13140d796eb1e2f8956aea97a6e5e4ebcf981 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew.r.wilcox@intel.com>
Date: Tue, 31 Jul 2012 13:31:15 -0400
Subject: [PATCH 09/44] NVMe: Fix whitespace damage in nvme_init

Commit 5c42ea1643 used spaces instead of tabs.
Also remove the unnecessary initialisation of the 'result' variable.

Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
---
 drivers/block/nvme.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index 064e86a6bb4e..0ba6b7cb344b 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -1720,7 +1720,7 @@ static struct pci_driver nvme_driver = {
 
 static int __init nvme_init(void)
 {
-	int result = -EBUSY;
+	int result;
 
 	nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
 	if (IS_ERR(nvme_thread))
@@ -1730,7 +1730,7 @@ static int __init nvme_init(void)
 	if (result < 0)
 		goto kill_kthread;
 	else if (result > 0)
-	    nvme_major = result;
+		nvme_major = result;
 
 	result = pci_register_driver(&nvme_driver);
 	if (result)

From cd58ad7d188c643cf572b038909c2f7dd96fdafe Mon Sep 17 00:00:00 2001
From: Quoc-Son Anh <quoc-sonx.anh@intel.com>
Date: Tue, 21 Feb 2012 16:50:53 -0700
Subject: [PATCH 10/44] NVMe: Use ida for nvme device instance

Signed-off-by: Quoc-Son Anh <quoc-sonx.anh@intel.com>
Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
---
 drivers/block/nvme.c | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index 0ba6b7cb344b..3278fbdb8dc0 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -1576,15 +1576,33 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
 	dma_pool_destroy(dev->prp_small_pool);
 }
 
-/* XXX: Use an ida or something to let remove / add work correctly */
-static void nvme_set_instance(struct nvme_dev *dev)
+static DEFINE_IDA(nvme_instance_ida);
+
+static int nvme_set_instance(struct nvme_dev *dev)
 {
-	static int instance;
-	dev->instance = instance++;
+	int instance, error;
+
+	do {
+		if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
+			return -ENODEV;
+
+		spin_lock(&dev_list_lock);
+		error = ida_get_new(&nvme_instance_ida, &instance);
+		spin_unlock(&dev_list_lock);
+	} while (error == -EAGAIN);
+
+	if (error)
+		return -ENODEV;
+
+	dev->instance = instance;
+	return 0;
 }
 
 static void nvme_release_instance(struct nvme_dev *dev)
 {
+	spin_lock(&dev_list_lock);
+	ida_remove(&nvme_instance_ida, dev->instance);
+	spin_unlock(&dev_list_lock);
 }
 
 static int __devinit nvme_probe(struct pci_dev *pdev,
@@ -1617,7 +1635,10 @@ static int __devinit nvme_probe(struct pci_dev *pdev,
 	pci_set_drvdata(pdev, dev);
 	dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
 	dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
-	nvme_set_instance(dev);
+	result = nvme_set_instance(dev);
+	if (result)
+		goto disable;
+
 	dev->entry[0].vector = pdev->irq;
 
 	result = nvme_setup_prp_pools(dev);

From 9e866774aab5d2654b0fa8f97890f68913f05700 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew.r.wilcox@intel.com>
Date: Fri, 3 Aug 2012 13:55:56 -0400
Subject: [PATCH 11/44] NVMe: Free admin queue memory on initialisation failure

If the adapter fails initialisation, the memory allocated for the
admin queue may not be freed.  Split the memory freeing part of
nvme_free_queue() into nvme_free_queue_mem() and call it in the case of
initialisation failure.

Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
Reported-by: Vishal Verma <vishal.l.verma@intel.com>
---
 drivers/block/nvme.c | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index 3278fbdb8dc0..214037055e2a 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -868,6 +868,15 @@ static int nvme_set_features(struct nvme_dev *dev, unsigned fid,
 	return nvme_submit_admin_cmd(dev, &c, result);
 }
 
+static void nvme_free_queue_mem(struct nvme_queue *nvmeq)
+{
+	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
+				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
+	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
+					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
+	kfree(nvmeq);
+}
+
 static void nvme_free_queue(struct nvme_dev *dev, int qid)
 {
 	struct nvme_queue *nvmeq = dev->queues[qid];
@@ -882,11 +891,7 @@ static void nvme_free_queue(struct nvme_dev *dev, int qid)
 		adapter_delete_cq(dev, qid);
 	}
 
-	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
-				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
-	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
-					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
-	kfree(nvmeq);
+	nvme_free_queue_mem(nvmeq);
 }
 
 static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
@@ -982,7 +987,7 @@ static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev,
 
 static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
 {
-	int result;
+	int result = 0;
 	u32 aqa;
 	u64 cap;
 	unsigned long timeout;
@@ -1012,17 +1017,22 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
 	timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
 	dev->db_stride = NVME_CAP_STRIDE(cap);
 
-	while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) {
+	while (!result && !(readl(&dev->bar->csts) & NVME_CSTS_RDY)) {
 		msleep(100);
 		if (fatal_signal_pending(current))
-			return -EINTR;
+			result = -EINTR;
 		if (time_after(jiffies, timeout)) {
 			dev_err(&dev->pci_dev->dev,
 				"Device not ready; aborting initialisation\n");
-			return -ENODEV;
+			result = -ENODEV;
 		}
 	}
 
+	if (result) {
+		nvme_free_queue_mem(nvmeq);
+		return result;
+	}
+
 	result = queue_request_irq(dev, nvmeq, "nvme admin");
 	dev->queues[0] = nvmeq;
 	return result;

From a09115b23e2002bb35b7bfd337683f00875671ec Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew.r.wilcox@intel.com>
Date: Tue, 7 Aug 2012 15:56:23 -0400
Subject: [PATCH 12/44] NVMe: Cancel outstanding IOs on queue deletion

If the device is hot-unplugged while there are active commands, we should
time out the I/Os so that upper layers don't just see the I/Os disappear.

Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
---
 drivers/block/nvme.c | 55 ++++++++++++++++++++++++++------------------
 1 file changed, 32 insertions(+), 23 deletions(-)

diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index 214037055e2a..f9ad514c9227 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -868,6 +868,33 @@ static int nvme_set_features(struct nvme_dev *dev, unsigned fid,
 	return nvme_submit_admin_cmd(dev, &c, result);
 }
 
+/**
+ * nvme_cancel_ios - Cancel outstanding I/Os
+ * @queue: The queue to cancel I/Os on
+ * @timeout: True to only cancel I/Os which have timed out
+ */
+static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
+{
+	int depth = nvmeq->q_depth - 1;
+	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
+	unsigned long now = jiffies;
+	int cmdid;
+
+	for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
+		void *ctx;
+		nvme_completion_fn fn;
+		static struct nvme_completion cqe = {
+			.status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1,
+		};
+
+		if (timeout && !time_after(now, info[cmdid].timeout))
+			continue;
+		dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid);
+		ctx = cancel_cmdid(nvmeq, cmdid, &fn);
+		fn(nvmeq->dev, ctx, &cqe);
+	}
+}
+
 static void nvme_free_queue_mem(struct nvme_queue *nvmeq)
 {
 	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
@@ -882,6 +909,10 @@ static void nvme_free_queue(struct nvme_dev *dev, int qid)
 	struct nvme_queue *nvmeq = dev->queues[qid];
 	int vector = dev->entry[nvmeq->cq_vector].vector;
 
+	spin_lock_irq(&nvmeq->q_lock);
+	nvme_cancel_ios(nvmeq, false);
+	spin_unlock_irq(&nvmeq->q_lock);
+
 	irq_set_affinity_hint(vector, NULL);
 	free_irq(vector, nvmeq);
 
@@ -1236,26 +1267,6 @@ static const struct block_device_operations nvme_fops = {
 	.compat_ioctl	= nvme_ioctl,
 };
 
-static void nvme_timeout_ios(struct nvme_queue *nvmeq)
-{
-	int depth = nvmeq->q_depth - 1;
-	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
-	unsigned long now = jiffies;
-	int cmdid;
-
-	for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
-		void *ctx;
-		nvme_completion_fn fn;
-		static struct nvme_completion cqe = { .status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1, };
-
-		if (!time_after(now, info[cmdid].timeout))
-			continue;
-		dev_warn(nvmeq->q_dmadev, "Timing out I/O %d\n", cmdid);
-		ctx = cancel_cmdid(nvmeq, cmdid, &fn);
-		fn(nvmeq->dev, ctx, &cqe);
-	}
-}
-
 static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
 {
 	while (bio_list_peek(&nvmeq->sq_cong)) {
@@ -1287,7 +1298,7 @@ static int nvme_kthread(void *data)
 				spin_lock_irq(&nvmeq->q_lock);
 				if (nvme_process_cq(nvmeq))
 					printk("process_cq did something\n");
-				nvme_timeout_ios(nvmeq);
+				nvme_cancel_ios(nvmeq, true);
 				nvme_resubmit_bios(nvmeq);
 				spin_unlock_irq(&nvmeq->q_lock);
 			}
@@ -1549,8 +1560,6 @@ static int nvme_dev_remove(struct nvme_dev *dev)
 	list_del(&dev->node);
 	spin_unlock(&dev_list_lock);
 
-	/* TODO: wait all I/O finished or cancel them */
-
 	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
 		list_del(&ns->list);
 		del_gendisk(ns->disk);

From 156cacb1d0d36b0d0582d9e798e58e0044f516b3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 21 Sep 2012 08:19:02 -0400
Subject: [PATCH 13/44] do_add_mount()/umount -l races

normally we deal with lock_mount()/umount races by checking that
mountpoint to be is still in our namespace after lock_mount() has
been done.  However, do_add_mount() skips that check when called
with MNT_SHRINKABLE in flags (i.e. from finish_automount()).  The
reason is that ->mnt_ns may be a temporary namespace created exactly
to contain automounts a-la NFS4 referral handling.  It's not the
namespace of the caller, though, so check_mnt() would fail here.
We still need to check that ->mnt_ns is non-NULL in that case,
though.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 4d31f73e2561..7bdf7907413f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1886,8 +1886,14 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
 		return err;
 
 	err = -EINVAL;
-	if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(real_mount(path->mnt)))
-		goto unlock;
+	if (unlikely(!check_mnt(real_mount(path->mnt)))) {
+		/* that's acceptable only for automounts done in private ns */
+		if (!(mnt_flags & MNT_SHRINKABLE))
+			goto unlock;
+		/* ... and for those we'd better have mountpoint still alive */
+		if (!real_mount(path->mnt)->mnt_ns)
+			goto unlock;
+	}
 
 	/* Refuse the same filesystem on the same mount point */
 	err = -EBUSY;

From c5aa1e554a20fb3542c62688ae46049c9225a965 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 29 Aug 2012 09:00:01 -0400
Subject: [PATCH 14/44] close the race in nlmsvc_free_block()

we need to grab mutex before the reference counter reaches 0

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/lockd/svclock.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index fb1a2bedbe97..8d80c990dffd 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -289,7 +289,6 @@ static void nlmsvc_free_block(struct kref *kref)
 	dprintk("lockd: freeing block %p...\n", block);
 
 	/* Remove block from file's list of blocks */
-	mutex_lock(&file->f_mutex);
 	list_del_init(&block->b_flist);
 	mutex_unlock(&file->f_mutex);
 
@@ -303,7 +302,7 @@ static void nlmsvc_free_block(struct kref *kref)
 static void nlmsvc_release_block(struct nlm_block *block)
 {
 	if (block != NULL)
-		kref_put(&block->b_count, nlmsvc_free_block);
+		kref_put_mutex(&block->b_count, nlmsvc_free_block, &block->b_file->f_mutex);
 }
 
 /*

From bef83de5a0e3031ff6ff9584b458611a7ac01b85 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Mon, 24 Sep 2012 21:23:25 -0600
Subject: [PATCH 15/44] iommu: static inline iommu group stub functions

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 include/linux/iommu.h | 42 ++++++++++++++++++++++++------------------
 1 file changed, 24 insertions(+), 18 deletions(-)

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 7e83370e6fd2..f3b99e1c1042 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -256,72 +256,78 @@ static inline void iommu_set_fault_handler(struct iommu_domain *domain,
 {
 }
 
-int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group)
+static inline int iommu_attach_group(struct iommu_domain *domain,
+				     struct iommu_group *group)
 {
 	return -ENODEV;
 }
 
-void iommu_detach_group(struct iommu_domain *domain, struct iommu_group *group)
+static inline void iommu_detach_group(struct iommu_domain *domain,
+				      struct iommu_group *group)
 {
 }
 
-struct iommu_group *iommu_group_alloc(void)
+static inline struct iommu_group *iommu_group_alloc(void)
 {
 	return ERR_PTR(-ENODEV);
 }
 
-void *iommu_group_get_iommudata(struct iommu_group *group)
+static inline void *iommu_group_get_iommudata(struct iommu_group *group)
 {
 	return NULL;
 }
 
-void iommu_group_set_iommudata(struct iommu_group *group, void *iommu_data,
-			       void (*release)(void *iommu_data))
+static inline void iommu_group_set_iommudata(struct iommu_group *group,
+					     void *iommu_data,
+					     void (*release)(void *iommu_data))
 {
 }
 
-int iommu_group_set_name(struct iommu_group *group, const char *name)
+static inline int iommu_group_set_name(struct iommu_group *group,
+				       const char *name)
 {
 	return -ENODEV;
 }
 
-int iommu_group_add_device(struct iommu_group *group, struct device *dev)
+static inline int iommu_group_add_device(struct iommu_group *group,
+					 struct device *dev)
 {
 	return -ENODEV;
 }
 
-void iommu_group_remove_device(struct device *dev)
+static inline void iommu_group_remove_device(struct device *dev)
 {
 }
 
-int iommu_group_for_each_dev(struct iommu_group *group, void *data,
-			     int (*fn)(struct device *, void *))
+static inline int iommu_group_for_each_dev(struct iommu_group *group,
+					   void *data,
+					   int (*fn)(struct device *, void *))
 {
 	return -ENODEV;
 }
 
-struct iommu_group *iommu_group_get(struct device *dev)
+static inline struct iommu_group *iommu_group_get(struct device *dev)
 {
 	return NULL;
 }
 
-void iommu_group_put(struct iommu_group *group)
+static inline void iommu_group_put(struct iommu_group *group)
 {
 }
 
-int iommu_group_register_notifier(struct iommu_group *group,
-				  struct notifier_block *nb)
+static inline int iommu_group_register_notifier(struct iommu_group *group,
+						struct notifier_block *nb)
 {
 	return -ENODEV;
 }
 
-int iommu_group_unregister_notifier(struct iommu_group *group,
-				    struct notifier_block *nb)
+static inline int iommu_group_unregister_notifier(struct iommu_group *group,
+						  struct notifier_block *nb)
 {
 	return 0;
 }
 
-int iommu_group_id(struct iommu_group *group)
+static inline int iommu_group_id(struct iommu_group *group)
 {
 	return -ENODEV;
 }

From 82e6bfe2fbc4d48852114c4f979137cd5bf1d1a8 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@inai.de>
Date: Fri, 21 Sep 2012 22:26:52 +0000
Subject: [PATCH 16/44] netfilter: xt_limit: have r->cost != 0 case work

Commit v2.6.19-rc1~1272^2~41 tells us that r->cost != 0 can happen when
a running state is saved to userspace and then reinstated from there.

Make sure that private xt_limit area is initialized with correct values.
Otherwise, random matchings due to use of uninitialized memory.

Signed-off-by: Jan Engelhardt <jengelh@inai.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_limit.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index 5c22ce8ab309..a4c1e4528cac 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -117,11 +117,11 @@ static int limit_mt_check(const struct xt_mtchk_param *par)
 
 	/* For SMP, we only want to use one set of state. */
 	r->master = priv;
+	/* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies *
+	   128. */
+	priv->prev = jiffies;
+	priv->credit = user2credits(r->avg * r->burst); /* Credits full. */
 	if (r->cost == 0) {
-		/* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies *
-		   128. */
-		priv->prev = jiffies;
-		priv->credit = user2credits(r->avg * r->burst); /* Credits full. */
 		r->credit_cap = priv->credit; /* Credits full. */
 		r->cost = user2credits(r->avg);
 	}

From 307615a26e95406c42c95916a66ba50434567e0f Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 26 Sep 2012 23:45:39 +0100
Subject: [PATCH 17/44] dm thin: do not set discard_zeroes_data

The dm thin pool target claims to support the zeroing of discarded
data areas.  This turns out to be incorrect when processing discards
that do not exactly cover a complete number of blocks, so the target
must always set discard_zeroes_data_unsupported.

The thin pool target will zero blocks when they are allocated if the
skip_block_zeroing feature is not specified.  The block layer
may send a discard that only partly covers a block.  If a thin pool
block is partially discarded then there is no guarantee that the
discarded data will get zeroed before it is accessed again.
Due to this, thin devices cannot claim discards will always zero data.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Joe Thornber <ejt@redhat.com>
Cc: stable@vger.kernel.org # 3.4+
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-thin.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index af1fc3b2c2ad..d4209231069d 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -2274,6 +2274,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		 * thin devices' discard limits consistent).
 		 */
 		ti->discards_supported = true;
+		ti->discard_zeroes_data_unsupported = true;
 	}
 	ti->private = pt;
 
@@ -2745,7 +2746,6 @@ static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
 	 * boundary is not sent to this target.
 	 */
 	limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
-	limits->discard_zeroes_data = pool->pf.zero_new_blocks;
 }
 
 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)

From 7ba10aa6fbac7158a50bec142132b04bc480bb29 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 26 Sep 2012 23:45:41 +0100
Subject: [PATCH 18/44] dm mpath: only retry ioctl when no paths if
 queue_if_no_path set

When there are no paths and multipath receives an ioctl, it waits until
a path becomes available.  This behaviour is incorrect if the
"queue_if_no_path" setting was not specified, as then the ioctl should
be rejected immediately, which this patch now does.

commit 35991652b ("dm mpath: allow ioctls to trigger pg init") should
have checked if queue_if_no_path was configured before queueing IO.

Checking for the queue_if_no_path feature, like is done in map_io(),
allows the following table load to work without blocking in the
multipath_ioctl retry loop:

  echo "0 1024 multipath 0 0 0 0" | dmsetup create mpath_nodevs

Without this fix the multipath_ioctl will block with the following stack
trace:

  blkid           D 0000000000000002     0 23936      1 0x00000000
   ffff8802b89e5cd8 0000000000000082 ffff8802b89e5fd8 0000000000012440
   ffff8802b89e4010 0000000000012440 0000000000012440 0000000000012440
   ffff8802b89e5fd8 0000000000012440 ffff88030c2aab30 ffff880325794040
  Call Trace:
   [<ffffffff814ce099>] schedule+0x29/0x70
   [<ffffffff814cc312>] schedule_timeout+0x182/0x2e0
   [<ffffffff8104dee0>] ? lock_timer_base+0x70/0x70
   [<ffffffff814cc48e>] schedule_timeout_uninterruptible+0x1e/0x20
   [<ffffffff8104f840>] msleep+0x20/0x30
   [<ffffffffa0000839>] multipath_ioctl+0x109/0x170 [dm_multipath]
   [<ffffffffa06bfb9c>] dm_blk_ioctl+0xbc/0xd0 [dm_mod]
   [<ffffffff8122a408>] __blkdev_driver_ioctl+0x28/0x30
   [<ffffffff8122a79e>] blkdev_ioctl+0xce/0x730
   [<ffffffff811970ac>] block_ioctl+0x3c/0x40
   [<ffffffff8117321c>] do_vfs_ioctl+0x8c/0x340
   [<ffffffff81166293>] ? sys_newfstat+0x33/0x40
   [<ffffffff81173571>] sys_ioctl+0xa1/0xb0
   [<ffffffff814d70a9>] system_call_fastpath+0x16/0x1b

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org # 3.5+
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-mpath.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index d8abb90a6c2f..034233eefc82 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1555,6 +1555,7 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
 			   unsigned long arg)
 {
 	struct multipath *m = ti->private;
+	struct pgpath *pgpath;
 	struct block_device *bdev;
 	fmode_t mode;
 	unsigned long flags;
@@ -1570,12 +1571,14 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
 	if (!m->current_pgpath)
 		__choose_pgpath(m, 0);
 
-	if (m->current_pgpath) {
-		bdev = m->current_pgpath->path.dev->bdev;
-		mode = m->current_pgpath->path.dev->mode;
+	pgpath = m->current_pgpath;
+
+	if (pgpath) {
+		bdev = pgpath->path.dev->bdev;
+		mode = pgpath->path.dev->mode;
 	}
 
-	if (m->queue_io)
+	if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path))
 		r = -EAGAIN;
 	else if (!bdev)
 		r = -EIO;

From ba1cbad93dd47223b1f3b8edd50dd9ef2abcb2ed Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 26 Sep 2012 23:45:42 +0100
Subject: [PATCH 19/44] dm: handle requests beyond end of device instead of
 using BUG_ON

The access beyond the end of device BUG_ON that was introduced to
dm_request_fn via commit 29e4013de7ad950280e4b2208 ("dm: implement
REQ_FLUSH/FUA support for request-based dm") was an overly
drastic (but simple) response to this situation.

I have received a report that this BUG_ON was hit and now think
it would be better to use dm_kill_unmapped_request() to fail the clone
and original request with -EIO.

map_request() will assign the valid target returned by
dm_table_find_target to tio->ti.  But when the target
isn't valid tio->ti is never assigned (because map_request isn't
called); so add a check for tio->ti != NULL to dm_done().

Reported-by: Mike Christie <michaelc@cs.wisc.edu>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: stable@vger.kernel.org # v2.6.37+
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 56 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 38 insertions(+), 18 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4e09b6ff5b49..6748e0c4df1f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -865,10 +865,14 @@ static void dm_done(struct request *clone, int error, bool mapped)
 {
 	int r = error;
 	struct dm_rq_target_io *tio = clone->end_io_data;
-	dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
+	dm_request_endio_fn rq_end_io = NULL;
 
-	if (mapped && rq_end_io)
-		r = rq_end_io(tio->ti, clone, error, &tio->info);
+	if (tio->ti) {
+		rq_end_io = tio->ti->type->rq_end_io;
+
+		if (mapped && rq_end_io)
+			r = rq_end_io(tio->ti, clone, error, &tio->info);
+	}
 
 	if (r <= 0)
 		/* The target wants to complete the I/O */
@@ -1588,15 +1592,6 @@ static int map_request(struct dm_target *ti, struct request *clone,
 	int r, requeued = 0;
 	struct dm_rq_target_io *tio = clone->end_io_data;
 
-	/*
-	 * Hold the md reference here for the in-flight I/O.
-	 * We can't rely on the reference count by device opener,
-	 * because the device may be closed during the request completion
-	 * when all bios are completed.
-	 * See the comment in rq_completed() too.
-	 */
-	dm_get(md);
-
 	tio->ti = ti;
 	r = ti->type->map_rq(ti, clone, &tio->info);
 	switch (r) {
@@ -1628,6 +1623,26 @@ static int map_request(struct dm_target *ti, struct request *clone,
 	return requeued;
 }
 
+static struct request *dm_start_request(struct mapped_device *md, struct request *orig)
+{
+	struct request *clone;
+
+	blk_start_request(orig);
+	clone = orig->special;
+	atomic_inc(&md->pending[rq_data_dir(clone)]);
+
+	/*
+	 * Hold the md reference here for the in-flight I/O.
+	 * We can't rely on the reference count by device opener,
+	 * because the device may be closed during the request completion
+	 * when all bios are completed.
+	 * See the comment in rq_completed() too.
+	 */
+	dm_get(md);
+
+	return clone;
+}
+
 /*
  * q->request_fn for request-based dm.
  * Called with the queue lock held.
@@ -1657,14 +1672,21 @@ static void dm_request_fn(struct request_queue *q)
 			pos = blk_rq_pos(rq);
 
 		ti = dm_table_find_target(map, pos);
-		BUG_ON(!dm_target_is_valid(ti));
+		if (!dm_target_is_valid(ti)) {
+			/*
+			 * Must perform setup, that dm_done() requires,
+			 * before calling dm_kill_unmapped_request
+			 */
+			DMERR_LIMIT("request attempted access beyond the end of device");
+			clone = dm_start_request(md, rq);
+			dm_kill_unmapped_request(clone, -EIO);
+			continue;
+		}
 
 		if (ti->type->busy && ti->type->busy(ti))
 			goto delay_and_out;
 
-		blk_start_request(rq);
-		clone = rq->special;
-		atomic_inc(&md->pending[rq_data_dir(clone)]);
+		clone = dm_start_request(md, rq);
 
 		spin_unlock(q->queue_lock);
 		if (map_request(ti, clone, md))
@@ -1684,8 +1706,6 @@ static void dm_request_fn(struct request_queue *q)
 	blk_delay_queue(q, HZ / 10);
 out:
 	dm_table_put(map);
-
-	return;
 }
 
 int dm_underlying_device_busy(struct request_queue *q)

From c3c4555edd10dbc0b388a0125b9c50de5e79af05 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Wed, 26 Sep 2012 23:45:43 +0100
Subject: [PATCH 20/44] dm table: clear add_random unless all devices have it
 set

Always clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not
have it set. Otherwise devices with predictable characteristics may
contribute entropy.

QUEUE_FLAG_ADD_RANDOM specifies whether or not queue IO timings
contribute to the random pool.

For bio-based targets this flag is always 0 because such devices have no
real queue.

For request-based devices this flag was always set to 1 by default.

Now set it according to the flags on underlying devices. If there is at
least one device which should not contribute, set the flag to zero: If a
device, such as fast SSD storage, is not suitable for supplying entropy,
a request-based queue stacked over it will not be either.

Because the checking logic is exactly same as for the rotational flag,
share the iteration function with device_is_nonrot().

Signed-off-by: Milan Broz <mbroz@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index f90069029aae..77b90ae66991 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1354,17 +1354,25 @@ static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
 	return q && blk_queue_nonrot(q);
 }
 
-static bool dm_table_is_nonrot(struct dm_table *t)
+static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
+			     sector_t start, sector_t len, void *data)
+{
+	struct request_queue *q = bdev_get_queue(dev->bdev);
+
+	return q && !blk_queue_add_random(q);
+}
+
+static bool dm_table_all_devices_attribute(struct dm_table *t,
+					   iterate_devices_callout_fn func)
 {
 	struct dm_target *ti;
 	unsigned i = 0;
 
-	/* Ensure that all underlying device are non-rotational. */
 	while (i < dm_table_get_num_targets(t)) {
 		ti = dm_table_get_target(t, i++);
 
 		if (!ti->type->iterate_devices ||
-		    !ti->type->iterate_devices(ti, device_is_nonrot, NULL))
+		    !ti->type->iterate_devices(ti, func, NULL))
 			return 0;
 	}
 
@@ -1396,13 +1404,23 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	if (!dm_table_discard_zeroes_data(t))
 		q->limits.discard_zeroes_data = 0;
 
-	if (dm_table_is_nonrot(t))
+	/* Ensure that all underlying devices are non-rotational. */
+	if (dm_table_all_devices_attribute(t, device_is_nonrot))
 		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
 	else
 		queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q);
 
 	dm_table_set_integrity(t);
 
+	/*
+	 * Determine whether or not this queue's I/O timings contribute
+	 * to the entropy pool, Only request-based targets use this.
+	 * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not
+	 * have it set.
+	 */
+	if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random))
+		queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q);
+
 	/*
 	 * QUEUE_FLAG_STACKABLE must be set after all queue settings are
 	 * visible to other CPUs because, once the flag is set, incoming bios

From 3ae706561637331aa578e52bb89ecbba5edcb7a9 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 26 Sep 2012 23:45:45 +0100
Subject: [PATCH 21/44] dm: retain table limits when swapping to new table with
 no devices

Add a safety net that will re-use the DM device's existing limits in the
event that DM device has a temporary table that doesn't have any
component devices.  This is to reduce the chance that requests not
respecting the hardware limits will reach the device.

DM recalculates queue limits based only on devices which currently exist
in the table.  This creates a problem in the event all devices are
temporarily removed such as all paths being lost in multipath.  DM will
reset the limits to the maximum permissible, which can then assemble
requests which exceed the limits of the paths when the paths are
restored.  The request will fail the blk_rq_check_limits() test when
sent to a path with lower limits, and will be retried without end by
multipath.  This became a much bigger issue after v3.6 commit fe86cdcef
("block: do not artificially constrain max_sectors for stacking
drivers").

Reported-by: David Jeffery <djeffery@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c | 35 +++++++++++++++++++++++++++++++++++
 drivers/md/dm.c       | 15 ++++++++++++++-
 drivers/md/dm.h       |  1 +
 3 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 77b90ae66991..100368eb7991 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1212,6 +1212,41 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
 	return &t->targets[(KEYS_PER_NODE * n) + k];
 }
 
+static int count_device(struct dm_target *ti, struct dm_dev *dev,
+			sector_t start, sector_t len, void *data)
+{
+	unsigned *num_devices = data;
+
+	(*num_devices)++;
+
+	return 0;
+}
+
+/*
+ * Check whether a table has no data devices attached using each
+ * target's iterate_devices method.
+ * Returns false if the result is unknown because a target doesn't
+ * support iterate_devices.
+ */
+bool dm_table_has_no_data_devices(struct dm_table *table)
+{
+	struct dm_target *uninitialized_var(ti);
+	unsigned i = 0, num_devices = 0;
+
+	while (i < dm_table_get_num_targets(table)) {
+		ti = dm_table_get_target(table, i++);
+
+		if (!ti->type->iterate_devices)
+			return false;
+
+		ti->type->iterate_devices(ti, count_device, &num_devices);
+		if (num_devices)
+			return false;
+	}
+
+	return true;
+}
+
 /*
  * Establish the new table's queue_limits and validate them.
  */
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 6748e0c4df1f..67ffa391edcf 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2429,7 +2429,7 @@ static void dm_queue_flush(struct mapped_device *md)
  */
 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
 {
-	struct dm_table *map = ERR_PTR(-EINVAL);
+	struct dm_table *live_map, *map = ERR_PTR(-EINVAL);
 	struct queue_limits limits;
 	int r;
 
@@ -2439,6 +2439,19 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
 	if (!dm_suspended_md(md))
 		goto out;
 
+	/*
+	 * If the new table has no data devices, retain the existing limits.
+	 * This helps multipath with queue_if_no_path if all paths disappear,
+	 * then new I/O is queued based on these limits, and then some paths
+	 * reappear.
+	 */
+	if (dm_table_has_no_data_devices(table)) {
+		live_map = dm_get_live_table(md);
+		if (live_map)
+			limits = md->queue->limits;
+		dm_table_put(live_map);
+	}
+
 	r = dm_calculate_queue_limits(table, &limits);
 	if (r) {
 		map = ERR_PTR(r);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 52eef493d266..6a99fefaa743 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -54,6 +54,7 @@ void dm_table_event_callback(struct dm_table *t,
 			     void (*fn)(void *), void *context);
 struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
 struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
+bool dm_table_has_no_data_devices(struct dm_table *table);
 int dm_calculate_queue_limits(struct dm_table *table,
 			      struct queue_limits *limits);
 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,

From 9bc142dd755d360c08a91ecb107d218787a2e9db Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 26 Sep 2012 23:45:46 +0100
Subject: [PATCH 22/44] dm thin: tidy discard support

A little thin discard code refactoring to make the next patch (dm thin:
fix discard support for data devices) more readable.
Pull out a couple of functions (and uses bools instead of unsigned for
features).

No functional changes.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-thin.c | 64 +++++++++++++++++++++++++++-----------------
 1 file changed, 39 insertions(+), 25 deletions(-)

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index d4209231069d..e99f4134dbd7 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -509,9 +509,9 @@ enum pool_mode {
 struct pool_features {
 	enum pool_mode mode;
 
-	unsigned zero_new_blocks:1;
-	unsigned discard_enabled:1;
-	unsigned discard_passdown:1;
+	bool zero_new_blocks:1;
+	bool discard_enabled:1;
+	bool discard_passdown:1;
 };
 
 struct thin_c;
@@ -1839,6 +1839,32 @@ static void __requeue_bios(struct pool *pool)
 /*----------------------------------------------------------------
  * Binding of control targets to a pool object
  *--------------------------------------------------------------*/
+static bool data_dev_supports_discard(struct pool_c *pt)
+{
+	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
+
+	return q && blk_queue_discard(q);
+}
+
+/*
+ * If discard_passdown was enabled verify that the data device
+ * supports discards.  Disable discard_passdown if not; otherwise
+ * -EOPNOTSUPP will be returned.
+ */
+static void disable_passdown_if_not_supported(struct pool_c *pt,
+					      struct pool_features *pf)
+{
+	char buf[BDEVNAME_SIZE];
+
+	if (!pf->discard_passdown || data_dev_supports_discard(pt))
+		return;
+
+	DMWARN("Discard unsupported by data device (%s): Disabling discard passdown.",
+	       bdevname(pt->data_dev->bdev, buf));
+
+	pf->discard_passdown = false;
+}
+
 static int bind_control_target(struct pool *pool, struct dm_target *ti)
 {
 	struct pool_c *pt = ti->private;
@@ -1855,23 +1881,9 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
 	pool->ti = ti;
 	pool->low_water_blocks = pt->low_water_blocks;
 	pool->pf = pt->pf;
-	set_pool_mode(pool, new_mode);
 
-	/*
-	 * If discard_passdown was enabled verify that the data device
-	 * supports discards.  Disable discard_passdown if not; otherwise
-	 * -EOPNOTSUPP will be returned.
-	 */
-	/* FIXME: pull this out into a sep fn. */
-	if (pt->pf.discard_passdown) {
-		struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
-		if (!q || !blk_queue_discard(q)) {
-			char buf[BDEVNAME_SIZE];
-			DMWARN("Discard unsupported by data device (%s): Disabling discard passdown.",
-			       bdevname(pt->data_dev->bdev, buf));
-			pool->pf.discard_passdown = 0;
-		}
-	}
+	disable_passdown_if_not_supported(pt, &pool->pf);
+	set_pool_mode(pool, new_mode);
 
 	return 0;
 }
@@ -1889,9 +1901,9 @@ static void unbind_control_target(struct pool *pool, struct dm_target *ti)
 static void pool_features_init(struct pool_features *pf)
 {
 	pf->mode = PM_WRITE;
-	pf->zero_new_blocks = 1;
-	pf->discard_enabled = 1;
-	pf->discard_passdown = 1;
+	pf->zero_new_blocks = true;
+	pf->discard_enabled = true;
+	pf->discard_passdown = true;
 }
 
 static void __pool_destroy(struct pool *pool)
@@ -2119,13 +2131,13 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
 		argc--;
 
 		if (!strcasecmp(arg_name, "skip_block_zeroing"))
-			pf->zero_new_blocks = 0;
+			pf->zero_new_blocks = false;
 
 		else if (!strcasecmp(arg_name, "ignore_discard"))
-			pf->discard_enabled = 0;
+			pf->discard_enabled = false;
 
 		else if (!strcasecmp(arg_name, "no_discard_passdown"))
-			pf->discard_passdown = 0;
+			pf->discard_passdown = false;
 
 		else if (!strcasecmp(arg_name, "read_only"))
 			pf->mode = PM_READ_ONLY;
@@ -2261,6 +2273,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	pt->low_water_blocks = low_water_blocks;
 	pt->pf = pf;
 	ti->num_flush_requests = 1;
+
 	/*
 	 * Only need to enable discards if the pool should pass
 	 * them down to the data device.  The thin device's discard
@@ -2268,6 +2281,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	 */
 	if (pf.discard_enabled && pf.discard_passdown) {
 		ti->num_discard_requests = 1;
+
 		/*
 		 * Setting 'discards_supported' circumvents the normal
 		 * stacking of discard limits (this keeps the pool and

From 0424caa14508f19ca8093d36c15250e0331a3a0a Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 26 Sep 2012 23:45:47 +0100
Subject: [PATCH 23/44] dm thin: fix discard support for data devices

The discard limits that get established for a thin-pool or thin device
may be incompatible with the pool's data device.  Avoid this by checking
the discard limits of the pool's data device.  If an incompatibility is
found then the pool's 'discard passdown' feature is disabled.

Change thin_io_hints to ensure that a thin device always uses the same
queue limits as its pool device.

Introduce requested_pf to track whether or not the table line originally
contained the no_discard_passdown flag and use this directly for table
output.  We prepare the correct setting for discard_passdown directly in
bind_control_target (called from pool_io_hints) and store it in
adjusted_pf rather than waiting until we have access to pool->pf in
pool_preresume.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-thin.c | 87 +++++++++++++++++++++++++++++---------------
 1 file changed, 57 insertions(+), 30 deletions(-)

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index e99f4134dbd7..c29410af1e22 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -580,7 +580,8 @@ struct pool_c {
 	struct dm_target_callbacks callbacks;
 
 	dm_block_t low_water_blocks;
-	struct pool_features pf;
+	struct pool_features requested_pf; /* Features requested during table load */
+	struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
 };
 
 /*
@@ -1848,21 +1849,36 @@ static bool data_dev_supports_discard(struct pool_c *pt)
 
 /*
  * If discard_passdown was enabled verify that the data device
- * supports discards.  Disable discard_passdown if not; otherwise
- * -EOPNOTSUPP will be returned.
+ * supports discards.  Disable discard_passdown if not.
  */
-static void disable_passdown_if_not_supported(struct pool_c *pt,
-					      struct pool_features *pf)
+static void disable_passdown_if_not_supported(struct pool_c *pt)
 {
+	struct pool *pool = pt->pool;
+	struct block_device *data_bdev = pt->data_dev->bdev;
+	struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
+	sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
+	const char *reason = NULL;
 	char buf[BDEVNAME_SIZE];
 
-	if (!pf->discard_passdown || data_dev_supports_discard(pt))
+	if (!pt->adjusted_pf.discard_passdown)
 		return;
 
-	DMWARN("Discard unsupported by data device (%s): Disabling discard passdown.",
-	       bdevname(pt->data_dev->bdev, buf));
+	if (!data_dev_supports_discard(pt))
+		reason = "discard unsupported";
 
-	pf->discard_passdown = false;
+	else if (data_limits->max_discard_sectors < pool->sectors_per_block)
+		reason = "max discard sectors smaller than a block";
+
+	else if (data_limits->discard_granularity > block_size)
+		reason = "discard granularity larger than a block";
+
+	else if (block_size & (data_limits->discard_granularity - 1))
+		reason = "discard granularity not a factor of block size";
+
+	if (reason) {
+		DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
+		pt->adjusted_pf.discard_passdown = false;
+	}
 }
 
 static int bind_control_target(struct pool *pool, struct dm_target *ti)
@@ -1873,16 +1889,15 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
 	 * We want to make sure that degraded pools are never upgraded.
 	 */
 	enum pool_mode old_mode = pool->pf.mode;
-	enum pool_mode new_mode = pt->pf.mode;
+	enum pool_mode new_mode = pt->adjusted_pf.mode;
 
 	if (old_mode > new_mode)
 		new_mode = old_mode;
 
 	pool->ti = ti;
 	pool->low_water_blocks = pt->low_water_blocks;
-	pool->pf = pt->pf;
+	pool->pf = pt->adjusted_pf;
 
-	disable_passdown_if_not_supported(pt, &pool->pf);
 	set_pool_mode(pool, new_mode);
 
 	return 0;
@@ -2271,7 +2286,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	pt->metadata_dev = metadata_dev;
 	pt->data_dev = data_dev;
 	pt->low_water_blocks = low_water_blocks;
-	pt->pf = pf;
+	pt->adjusted_pf = pt->requested_pf = pf;
 	ti->num_flush_requests = 1;
 
 	/*
@@ -2718,7 +2733,7 @@ static int pool_status(struct dm_target *ti, status_type_t type,
 		       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
 		       (unsigned long)pool->sectors_per_block,
 		       (unsigned long long)pt->low_water_blocks);
-		emit_flags(&pt->pf, result, sz, maxlen);
+		emit_flags(&pt->requested_pf, result, sz, maxlen);
 		break;
 	}
 
@@ -2747,19 +2762,21 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
 }
 
-static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
+static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
 {
-	/*
-	 * FIXME: these limits may be incompatible with the pool's data device
-	 */
+	struct pool *pool = pt->pool;
+	struct queue_limits *data_limits;
+
 	limits->max_discard_sectors = pool->sectors_per_block;
 
 	/*
-	 * This is just a hint, and not enforced.  We have to cope with
-	 * bios that cover a block partially.  A discard that spans a block
-	 * boundary is not sent to this target.
+	 * discard_granularity is just a hint, and not enforced.
 	 */
-	limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
+	if (pt->adjusted_pf.discard_passdown) {
+		data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
+		limits->discard_granularity = data_limits->discard_granularity;
+	} else
+		limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
 }
 
 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
@@ -2769,15 +2786,25 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 	blk_limits_io_min(limits, 0);
 	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
-	if (pool->pf.discard_enabled)
-		set_discard_limits(pool, limits);
+
+	/*
+	 * pt->adjusted_pf is a staging area for the actual features to use.
+	 * They get transferred to the live pool in bind_control_target()
+	 * called from pool_preresume().
+	 */
+	if (!pt->adjusted_pf.discard_enabled)
+		return;
+
+	disable_passdown_if_not_supported(pt);
+
+	set_discard_limits(pt, limits);
 }
 
 static struct target_type pool_target = {
 	.name = "thin-pool",
 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
 		    DM_TARGET_IMMUTABLE,
-	.version = {1, 3, 0},
+	.version = {1, 4, 0},
 	.module = THIS_MODULE,
 	.ctr = pool_ctr,
 	.dtr = pool_dtr,
@@ -3056,19 +3083,19 @@ static int thin_iterate_devices(struct dm_target *ti,
 	return 0;
 }
 
+/*
+ * A thin device always inherits its queue limits from its pool.
+ */
 static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
 	struct thin_c *tc = ti->private;
-	struct pool *pool = tc->pool;
 
-	blk_limits_io_min(limits, 0);
-	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
-	set_discard_limits(pool, limits);
+	*limits = bdev_get_queue(tc->pool_dev->bdev)->limits;
 }
 
 static struct target_type thin_target = {
 	.name = "thin",
-	.version = {1, 3, 0},
+	.version = {1, 4, 0},
 	.module	= THIS_MODULE,
 	.ctr = thin_ctr,
 	.dtr = thin_dtr,

From 1d55f6bcc0331d744cd5b56c4ee79e3809438161 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Wed, 26 Sep 2012 23:45:48 +0100
Subject: [PATCH 24/44] dm verity: fix overflow check

This patch fixes sector_t overflow checking in dm-verity.

Without this patch, the code checks for overflow only if sector_t is
smaller than long long, not if sector_t and long long have the same size.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-verity.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 254d19268ad2..892ae2766aa6 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -718,8 +718,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	v->hash_dev_block_bits = ffs(num) - 1;
 
 	if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 ||
-	    num_ll << (v->data_dev_block_bits - SECTOR_SHIFT) !=
-	    (sector_t)num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) {
+	    (sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT))
+	    >> (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll) {
 		ti->error = "Invalid data blocks";
 		r = -EINVAL;
 		goto bad;
@@ -733,8 +733,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	}
 
 	if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 ||
-	    num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT) !=
-	    (sector_t)num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT)) {
+	    (sector_t)(num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT))
+	    >> (v->hash_dev_block_bits - SECTOR_SHIFT) != num_ll) {
 		ti->error = "Invalid hash start";
 		r = -EINVAL;
 		goto bad;

From 424597921ac1fc2a475a656ea5895cbe78f25d0d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 3 Sep 2012 03:24:18 -0400
Subject: [PATCH 25/44] um: take cleaning singlestep to start_thread()

... assuming it's needed to be done at all

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/um/kernel/exec.c | 26 ++++++--------------------
 1 file changed, 6 insertions(+), 20 deletions(-)

diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c
index 6cade9366364..4cab0c78684b 100644
--- a/arch/um/kernel/exec.c
+++ b/arch/um/kernel/exec.c
@@ -41,32 +41,18 @@ void start_thread(struct pt_regs *regs, unsigned long eip, unsigned long esp)
 {
 	PT_REGS_IP(regs) = eip;
 	PT_REGS_SP(regs) = esp;
+	current->ptrace &= ~PT_DTRACE;
+#ifdef SUBARCH_EXECVE1
+	SUBARCH_EXECVE1(regs->regs);
+#endif
 }
 EXPORT_SYMBOL(start_thread);
 
-static long execve1(const char *file,
-		    const char __user *const __user *argv,
-		    const char __user *const __user *env)
-{
-	long error;
-
-	error = do_execve(file, argv, env, &current->thread.regs);
-	if (error == 0) {
-		task_lock(current);
-		current->ptrace &= ~PT_DTRACE;
-#ifdef SUBARCH_EXECVE1
-		SUBARCH_EXECVE1(&current->thread.regs.regs);
-#endif
-		task_unlock(current);
-	}
-	return error;
-}
-
 long um_execve(const char *file, const char __user *const __user *argv, const char __user *const __user *env)
 {
 	long err;
 
-	err = execve1(file, argv, env);
+	err = do_execve(file, argv, env, &current->thread.regs);
 	if (!err)
 		UML_LONGJMP(current->thread.exec_buf, 1);
 	return err;
@@ -81,7 +67,7 @@ long sys_execve(const char __user *file, const char __user *const __user *argv,
 	filename = getname(file);
 	error = PTR_ERR(filename);
 	if (IS_ERR(filename)) goto out;
-	error = execve1(filename, argv, env);
+	error = do_execve(filename, argv, env, &current->thread.regs);
 	putname(filename);
  out:
 	return error;

From bf56d57638452a8ee9241c9ca082d59cd2ca4cc8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 5 Sep 2012 23:20:33 -0400
Subject: [PATCH 26/44] um: don't leak floating point state and segment
 registers on execve()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/um/kernel/exec.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c
index 4cab0c78684b..8c82786da823 100644
--- a/arch/um/kernel/exec.c
+++ b/arch/um/kernel/exec.c
@@ -39,6 +39,7 @@ void flush_thread(void)
 
 void start_thread(struct pt_regs *regs, unsigned long eip, unsigned long esp)
 {
+	get_safe_registers(regs->regs.gp, regs->regs.fp);
 	PT_REGS_IP(regs) = eip;
 	PT_REGS_SP(regs) = esp;
 	current->ptrace &= ~PT_DTRACE;

From f9a38eace4498a5e9f6d2cdfc879d5444edc3a5f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 6 Sep 2012 13:39:47 -0400
Subject: [PATCH 27/44] um: let signal_delivered() do SIGTRAP on singlestepping
 into handler

... rather than duplicating that in sigframe setup code (and doing that
inconsistently, at that)

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/um/kernel/signal.c | 6 +++++-
 arch/x86/um/signal.c    | 6 ------
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c
index 7362d58efc29..cc9c2350e417 100644
--- a/arch/um/kernel/signal.c
+++ b/arch/um/kernel/signal.c
@@ -22,9 +22,13 @@ static void handle_signal(struct pt_regs *regs, unsigned long signr,
 			 struct k_sigaction *ka, siginfo_t *info)
 {
 	sigset_t *oldset = sigmask_to_save();
+	int singlestep = 0;
 	unsigned long sp;
 	int err;
 
+	if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED))
+		singlestep = 1;
+
 	/* Did we come from a system call? */
 	if (PT_REGS_SYSCALL_NR(regs) >= 0) {
 		/* If so, check system call restarting.. */
@@ -61,7 +65,7 @@ static void handle_signal(struct pt_regs *regs, unsigned long signr,
 	if (err)
 		force_sigsegv(signr, current);
 	else
-		signal_delivered(signr, info, ka, regs, 0);
+		signal_delivered(signr, info, ka, regs, singlestep);
 }
 
 static int kern_do_signal(struct pt_regs *regs)
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index a508cea13503..ba7363ecf896 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -416,9 +416,6 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig,
 	PT_REGS_AX(regs) = (unsigned long) sig;
 	PT_REGS_DX(regs) = (unsigned long) 0;
 	PT_REGS_CX(regs) = (unsigned long) 0;
-
-	if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED))
-		ptrace_notify(SIGTRAP);
 	return 0;
 }
 
@@ -466,9 +463,6 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
 	PT_REGS_AX(regs) = (unsigned long) sig;
 	PT_REGS_DX(regs) = (unsigned long) &frame->info;
 	PT_REGS_CX(regs) = (unsigned long) &frame->uc;
-
-	if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED))
-		ptrace_notify(SIGTRAP);
 	return 0;
 }
 

From d2ce4e92fa4f79a5fdb4cc912b411280afe21697 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 20 Sep 2012 09:28:25 -0400
Subject: [PATCH 28/44] um: kill thread->forking

we only use that to tell copy_thread() done by syscall from that
done by kernel_thread().  However, it's easier to do simply by
checking PF_KTHREAD in thread flags.

Merge sys_clone() guts for 32bit and 64bit, while we are at it...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/um/include/asm/processor-generic.h |  9 ---------
 arch/um/kernel/process.c                |  8 ++++----
 arch/um/kernel/syscall.c                | 24 +++++++++++-----------
 arch/x86/um/shared/sysdep/syscalls.h    |  2 ++
 arch/x86/um/sys_call_table_32.c         |  2 +-
 arch/x86/um/syscalls_32.c               | 27 +++++++------------------
 arch/x86/um/syscalls_64.c               | 23 +++------------------
 7 files changed, 29 insertions(+), 66 deletions(-)

diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h
index 69f1c57a8d0d..33a6a2423bd2 100644
--- a/arch/um/include/asm/processor-generic.h
+++ b/arch/um/include/asm/processor-generic.h
@@ -20,14 +20,6 @@ struct mm_struct;
 
 struct thread_struct {
 	struct task_struct *saved_task;
-	/*
-	 * This flag is set to 1 before calling do_fork (and analyzed in
-	 * copy_thread) to mark that we are begin called from userspace (fork /
-	 * vfork / clone), and reset to 0 after. It is left to 0 when called
-	 * from kernelspace (i.e. kernel_thread() or fork_idle(),
-	 * as of 2.6.11).
-	 */
-	int forking;
 	struct pt_regs regs;
 	int singlestep_syscall;
 	void *fault_addr;
@@ -58,7 +50,6 @@ struct thread_struct {
 
 #define INIT_THREAD \
 { \
-	.forking		= 0, \
 	.regs		   	= EMPTY_REGS,	\
 	.fault_addr		= NULL, \
 	.prev_sched		= NULL, \
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 57fc7028714a..c5f5afa50745 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -181,11 +181,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 		struct pt_regs *regs)
 {
 	void (*handler)(void);
+	int kthread = current->flags & PF_KTHREAD;
 	int ret = 0;
 
 	p->thread = (struct thread_struct) INIT_THREAD;
 
-	if (current->thread.forking) {
+	if (!kthread) {
 	  	memcpy(&p->thread.regs.regs, &regs->regs,
 		       sizeof(p->thread.regs.regs));
 		PT_REGS_SET_SYSCALL_RETURN(&p->thread.regs, 0);
@@ -195,8 +196,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 		handler = fork_handler;
 
 		arch_copy_thread(&current->thread.arch, &p->thread.arch);
-	}
-	else {
+	} else {
 		get_safe_registers(p->thread.regs.regs.gp, p->thread.regs.regs.fp);
 		p->thread.request.u.thread = current->thread.request.u.thread;
 		handler = new_thread_handler;
@@ -204,7 +204,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 
 	new_thread(task_stack_page(p), &p->thread.switch_buf, handler);
 
-	if (current->thread.forking) {
+	if (!kthread) {
 		clear_flushed_tls(p);
 
 		/*
diff --git a/arch/um/kernel/syscall.c b/arch/um/kernel/syscall.c
index f958cb876ee3..a4c6d8eee74c 100644
--- a/arch/um/kernel/syscall.c
+++ b/arch/um/kernel/syscall.c
@@ -17,25 +17,25 @@
 
 long sys_fork(void)
 {
-	long ret;
-
-	current->thread.forking = 1;
-	ret = do_fork(SIGCHLD, UPT_SP(&current->thread.regs.regs),
+	return do_fork(SIGCHLD, UPT_SP(&current->thread.regs.regs),
 		      &current->thread.regs, 0, NULL, NULL);
-	current->thread.forking = 0;
-	return ret;
 }
 
 long sys_vfork(void)
 {
-	long ret;
-
-	current->thread.forking = 1;
-	ret = do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD,
+	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD,
 		      UPT_SP(&current->thread.regs.regs),
 		      &current->thread.regs, 0, NULL, NULL);
-	current->thread.forking = 0;
-	return ret;
+}
+
+long sys_clone(unsigned long clone_flags, unsigned long newsp,
+	       void __user *parent_tid, void __user *child_tid)
+{
+	if (!newsp)
+		newsp = UPT_SP(&current->thread.regs.regs);
+
+	return do_fork(clone_flags, newsp, &current->thread.regs, 0, parent_tid,
+		      child_tid);
 }
 
 long old_mmap(unsigned long addr, unsigned long len,
diff --git a/arch/x86/um/shared/sysdep/syscalls.h b/arch/x86/um/shared/sysdep/syscalls.h
index bd9a89b67e41..ca255a805ed9 100644
--- a/arch/x86/um/shared/sysdep/syscalls.h
+++ b/arch/x86/um/shared/sysdep/syscalls.h
@@ -1,3 +1,5 @@
+extern long sys_clone(unsigned long clone_flags, unsigned long newsp,
+	       void __user *parent_tid, void __user *child_tid);
 #ifdef __i386__
 #include "syscalls_32.h"
 #else
diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
index 68d1dc91b37b..b5408cecac6c 100644
--- a/arch/x86/um/sys_call_table_32.c
+++ b/arch/x86/um/sys_call_table_32.c
@@ -28,7 +28,7 @@
 #define ptregs_execve sys_execve
 #define ptregs_iopl sys_iopl
 #define ptregs_vm86old sys_vm86old
-#define ptregs_clone sys_clone
+#define ptregs_clone i386_clone
 #define ptregs_vm86 sys_vm86
 #define ptregs_sigaltstack sys_sigaltstack
 #define ptregs_vfork sys_vfork
diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c
index b853e8600b9d..db444c7218fe 100644
--- a/arch/x86/um/syscalls_32.c
+++ b/arch/x86/um/syscalls_32.c
@@ -3,37 +3,24 @@
  * Licensed under the GPL
  */
 
-#include "linux/sched.h"
-#include "linux/shm.h"
-#include "linux/ipc.h"
-#include "linux/syscalls.h"
-#include "asm/mman.h"
-#include "asm/uaccess.h"
-#include "asm/unistd.h"
+#include <linux/syscalls.h>
+#include <sysdep/syscalls.h>
 
 /*
  * The prototype on i386 is:
  *
- *     int clone(int flags, void * child_stack, int * parent_tidptr, struct user_desc * newtls, int * child_tidptr)
+ *     int clone(int flags, void * child_stack, int * parent_tidptr, struct user_desc * newtls
  *
  * and the "newtls" arg. on i386 is read by copy_thread directly from the
  * register saved on the stack.
  */
-long sys_clone(unsigned long clone_flags, unsigned long newsp,
-	       int __user *parent_tid, void *newtls, int __user *child_tid)
+long i386_clone(unsigned long clone_flags, unsigned long newsp,
+		int __user *parent_tid, void *newtls, int __user *child_tid)
 {
-	long ret;
-
-	if (!newsp)
-		newsp = UPT_SP(&current->thread.regs.regs);
-
-	current->thread.forking = 1;
-	ret = do_fork(clone_flags, newsp, &current->thread.regs, 0, parent_tid,
-		      child_tid);
-	current->thread.forking = 0;
-	return ret;
+	return sys_clone(clone_flags, newsp, parent_tid, child_tid);
 }
 
+
 long sys_sigaction(int sig, const struct old_sigaction __user *act,
 			 struct old_sigaction __user *oact)
 {
diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c
index f3d82bb6e15a..adb08eb5c22a 100644
--- a/arch/x86/um/syscalls_64.c
+++ b/arch/x86/um/syscalls_64.c
@@ -5,12 +5,9 @@
  * Licensed under the GPL
  */
 
-#include "linux/linkage.h"
-#include "linux/personality.h"
-#include "linux/utsname.h"
-#include "asm/prctl.h" /* XXX This should get the constants from libc */
-#include "asm/uaccess.h"
-#include "os.h"
+#include <linux/sched.h>
+#include <asm/prctl.h> /* XXX This should get the constants from libc */
+#include <os.h>
 
 long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr)
 {
@@ -79,20 +76,6 @@ long sys_arch_prctl(int code, unsigned long addr)
 	return arch_prctl(current, code, (unsigned long __user *) addr);
 }
 
-long sys_clone(unsigned long clone_flags, unsigned long newsp,
-	       void __user *parent_tid, void __user *child_tid)
-{
-	long ret;
-
-	if (!newsp)
-		newsp = UPT_SP(&current->thread.regs.regs);
-	current->thread.forking = 1;
-	ret = do_fork(clone_flags, newsp, &current->thread.regs, 0, parent_tid,
-		      child_tid);
-	current->thread.forking = 0;
-	return ret;
-}
-
 void arch_switch_to(struct task_struct *to)
 {
 	if ((to->thread.arch.fs == 0) || (to->mm == NULL))

From 7f8436a1269eaaf2d0b1054a325eddf4e14cb80d Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Mon, 24 Sep 2012 18:29:01 +0000
Subject: [PATCH 29/44] l2tp: fix return value check

In case of error, the function genlmsg_put() returns NULL pointer
not ERR_PTR(). The IS_ERR() test in the return value check should
be replaced with NULL test.

dpatch engine is used to auto generate this patch.
(https://github.com/weiyj/dpatch)

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_netlink.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index d71cd9229a47..6f936358d664 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -80,8 +80,8 @@ static int l2tp_nl_cmd_noop(struct sk_buff *skb, struct genl_info *info)
 
 	hdr = genlmsg_put(msg, info->snd_pid, info->snd_seq,
 			  &l2tp_nl_family, 0, L2TP_CMD_NOOP);
-	if (IS_ERR(hdr)) {
-		ret = PTR_ERR(hdr);
+	if (!hdr) {
+		ret = -EMSGSIZE;
 		goto err_out;
 	}
 
@@ -250,8 +250,8 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 pid, u32 seq, int flags,
 
 	hdr = genlmsg_put(skb, pid, seq, &l2tp_nl_family, flags,
 			  L2TP_CMD_TUNNEL_GET);
-	if (IS_ERR(hdr))
-		return PTR_ERR(hdr);
+	if (!hdr)
+		return -EMSGSIZE;
 
 	if (nla_put_u8(skb, L2TP_ATTR_PROTO_VERSION, tunnel->version) ||
 	    nla_put_u32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id) ||
@@ -617,8 +617,8 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 pid, u32 seq, int flags
 	sk = tunnel->sock;
 
 	hdr = genlmsg_put(skb, pid, seq, &l2tp_nl_family, flags, L2TP_CMD_SESSION_GET);
-	if (IS_ERR(hdr))
-		return PTR_ERR(hdr);
+	if (!hdr)
+		return -EMSGSIZE;
 
 	if (nla_put_u32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id) ||
 	    nla_put_u32(skb, L2TP_ATTR_SESSION_ID, session->session_id) ||

From a326e6dd315ee38c280b7ecf8a0d777952e384cd Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Mon, 24 Sep 2012 18:29:35 +0000
Subject: [PATCH 30/44] team: fix return value check

In case of error, the function genlmsg_put() returns NULL pointer
not ERR_PTR(). The IS_ERR() test in the return value check should
be replaced with NULL test.

dpatch engine is used to auto generate this patch.
(https://github.com/weiyj/dpatch)

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/team/team.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 3ffe8a6e3c8b..f8cd61f449a4 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1653,8 +1653,8 @@ static int team_nl_cmd_noop(struct sk_buff *skb, struct genl_info *info)
 
 	hdr = genlmsg_put(msg, info->snd_pid, info->snd_seq,
 			  &team_nl_family, 0, TEAM_CMD_NOOP);
-	if (IS_ERR(hdr)) {
-		err = PTR_ERR(hdr);
+	if (!hdr) {
+		err = -EMSGSIZE;
 		goto err_msg_put;
 	}
 
@@ -1848,8 +1848,8 @@ static int team_nl_send_options_get(struct team *team, u32 pid, u32 seq,
 
 	hdr = genlmsg_put(skb, pid, seq, &team_nl_family, flags | NLM_F_MULTI,
 			  TEAM_CMD_OPTIONS_GET);
-	if (IS_ERR(hdr))
-		return PTR_ERR(hdr);
+	if (!hdr)
+		return -EMSGSIZE;
 
 	if (nla_put_u32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex))
 		goto nla_put_failure;
@@ -2068,8 +2068,8 @@ static int team_nl_fill_port_list_get(struct sk_buff *skb,
 
 	hdr = genlmsg_put(skb, pid, seq, &team_nl_family, flags,
 			  TEAM_CMD_PORT_LIST_GET);
-	if (IS_ERR(hdr))
-		return PTR_ERR(hdr);
+	if (!hdr)
+		return -EMSGSIZE;
 
 	if (nla_put_u32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex))
 		goto nla_put_failure;

From beb5ac20b3f90ffabac1eecd3c00205255df0728 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Wed, 26 Sep 2012 19:51:58 +0000
Subject: [PATCH 31/44] netdev: pasemi: fix return value check in
 pasemi_mac_phy_init()

In case of error, the function of_phy_connect() returns NULL
pointer not ERR_PTR(). The IS_ERR() test in the return value
check should be replaced with NULL test.

dpatch engine is used to auto generate this patch.
(https://github.com/weiyj/dpatch)

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/pasemi/pasemi_mac.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/pasemi/pasemi_mac.c b/drivers/net/ethernet/pasemi/pasemi_mac.c
index e559dfa06d6a..6fa74d530e44 100644
--- a/drivers/net/ethernet/pasemi/pasemi_mac.c
+++ b/drivers/net/ethernet/pasemi/pasemi_mac.c
@@ -1101,9 +1101,9 @@ static int pasemi_mac_phy_init(struct net_device *dev)
 	phydev = of_phy_connect(dev, phy_dn, &pasemi_adjust_link, 0,
 				PHY_INTERFACE_MODE_SGMII);
 
-	if (IS_ERR(phydev)) {
+	if (!phydev) {
 		printk(KERN_ERR "%s: Could not attach to phy\n", dev->name);
-		return PTR_ERR(phydev);
+		return -ENODEV;
 	}
 
 	mac->phydev = phydev;

From bbb35efcda41d589cfff5e2b08c5fb457791117c Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Thu, 27 Sep 2012 20:10:57 +0200
Subject: [PATCH 32/44] um: Fix IPC on um
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit c1d7e01d (ipc: use Kconfig options for __ARCH_WANT_[COMPAT_]IPC_PARSE_VERSION)
forgot UML and broke IPC on it.
Also UML has to select ARCH_WANT_IPC_PARSE_VERSION usin Kconfig.

Reported-and-tested-by: <Toralf Förster toralf.foerster@gmx.de>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 arch/x86/um/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index 9926e11a772d..aeaff8bef2f1 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -21,6 +21,7 @@ config 64BIT
 config X86_32
 	def_bool !64BIT
 	select HAVE_AOUT
+	select ARCH_WANT_IPC_PARSE_VERSION
 
 config X86_64
 	def_bool 64BIT

From 9429ec96c2718c0d1e3317cf60a87a0405223814 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Thu, 16 Aug 2012 20:15:05 +0200
Subject: [PATCH 33/44] um: Preinclude include/linux/kern_levels.h

The userspace part of UML uses the asm-offsets.h generator mechanism to
create definitions for UM_KERN_<LEVEL> that match the in-kernel
KERN_<LEVEL> constant definitions.

As of commit 04d2c8c83d0e3ac5f78aeede51babb3236200112 ("printk: convert
the format for KERN_<LEVEL> to a 2 byte pattern"), KERN_<LEVEL> is no
longer expanded to the literal '"<LEVEL>"', but to '"\001" "LEVEL"', i.e.
it contains two parts.

However, the combo of DEFINE_STR() in
arch/x86/um/shared/sysdep/kernel-offsets.h and sed-y in Kbuild doesn't
support string literals consisting of multiple parts. Hence for all
UM_KERN_<LEVEL> definitions, only the SOH character is retained in the actual
definition, while the remainder ends up in the comment. E.g. in
include/generated/asm-offsets.h we get

    #define UM_KERN_INFO "\001" /* "6" KERN_INFO */

instead of

    #define UM_KERN_INFO "\001" "6" /* KERN_INFO */

This causes spurious '^A' output in some kernel messages:

    Calibrating delay loop... 4640.76 BogoMIPS (lpj=23203840)
    pid_max: default: 32768 minimum: 301
    Mount-cache hash table entries: 256
    ^AChecking that host ptys support output SIGIO...Yes
    ^AChecking that host ptys support SIGIO on close...No, enabling workaround
    ^AUsing 2.6 host AIO
    NET: Registered protocol family 16
    bio: create slab <bio-0> at 0
    Switching to clocksource itimer

To fix this:
  - Move the mapping from UM_KERN_<LEVEL> to KERN_<LEVEL> from
    arch/um/include/shared/common-offsets.h to
    arch/um/include/shared/user.h, which is preincluded for all userspace
    parts,
  - Preinclude include/linux/kern_levels.h for all userspace parts, to
    obtain the in-kernel KERN_<LEVEL> constant definitions. This doesn't
    violate the kernel/userspace separation, as include/linux/kern_levels.h
    is self-contained and doesn't expose any other kernel internals.
  - Remove the now unused STR() and DEFINE_STR() macros.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 arch/um/include/shared/common-offsets.h    | 10 ----------
 arch/um/include/shared/user.h              | 11 +++++++++++
 arch/um/scripts/Makefile.rules             |  2 +-
 arch/x86/um/shared/sysdep/kernel-offsets.h |  3 ---
 4 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/arch/um/include/shared/common-offsets.h b/arch/um/include/shared/common-offsets.h
index 40db8f71deae..2df313b6a586 100644
--- a/arch/um/include/shared/common-offsets.h
+++ b/arch/um/include/shared/common-offsets.h
@@ -7,16 +7,6 @@ DEFINE(UM_KERN_PAGE_MASK, PAGE_MASK);
 DEFINE(UM_KERN_PAGE_SHIFT, PAGE_SHIFT);
 DEFINE(UM_NSEC_PER_SEC, NSEC_PER_SEC);
 
-DEFINE_STR(UM_KERN_EMERG, KERN_EMERG);
-DEFINE_STR(UM_KERN_ALERT, KERN_ALERT);
-DEFINE_STR(UM_KERN_CRIT, KERN_CRIT);
-DEFINE_STR(UM_KERN_ERR, KERN_ERR);
-DEFINE_STR(UM_KERN_WARNING, KERN_WARNING);
-DEFINE_STR(UM_KERN_NOTICE, KERN_NOTICE);
-DEFINE_STR(UM_KERN_INFO, KERN_INFO);
-DEFINE_STR(UM_KERN_DEBUG, KERN_DEBUG);
-DEFINE_STR(UM_KERN_CONT, KERN_CONT);
-
 DEFINE(UM_ELF_CLASS, ELF_CLASS);
 DEFINE(UM_ELFCLASS32, ELFCLASS32);
 DEFINE(UM_ELFCLASS64, ELFCLASS64);
diff --git a/arch/um/include/shared/user.h b/arch/um/include/shared/user.h
index 4fa82c055aab..cef068563336 100644
--- a/arch/um/include/shared/user.h
+++ b/arch/um/include/shared/user.h
@@ -26,6 +26,17 @@
 extern void panic(const char *fmt, ...)
 	__attribute__ ((format (printf, 1, 2)));
 
+/* Requires preincluding include/linux/kern_levels.h */
+#define UM_KERN_EMERG	KERN_EMERG
+#define UM_KERN_ALERT	KERN_ALERT
+#define UM_KERN_CRIT	KERN_CRIT
+#define UM_KERN_ERR	KERN_ERR
+#define UM_KERN_WARNING	KERN_WARNING
+#define UM_KERN_NOTICE	KERN_NOTICE
+#define UM_KERN_INFO	KERN_INFO
+#define UM_KERN_DEBUG	KERN_DEBUG
+#define UM_KERN_CONT	KERN_CONT
+
 #ifdef UML_CONFIG_PRINTK
 extern int printk(const char *fmt, ...)
 	__attribute__ ((format (printf, 1, 2)));
diff --git a/arch/um/scripts/Makefile.rules b/arch/um/scripts/Makefile.rules
index d50270d26b42..15889df9b466 100644
--- a/arch/um/scripts/Makefile.rules
+++ b/arch/um/scripts/Makefile.rules
@@ -8,7 +8,7 @@ USER_OBJS += $(filter %_user.o,$(obj-y) $(obj-m)  $(USER_SINGLE_OBJS))
 USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file))
 
 $(USER_OBJS:.o=.%): \
-	c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) -include user.h $(CFLAGS_$(basetarget).o)
+	c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) -include $(srctree)/include/linux/kern_levels.h -include user.h $(CFLAGS_$(basetarget).o)
 
 # These are like USER_OBJS but filter USER_CFLAGS through unprofile instead of
 # using it directly.
diff --git a/arch/x86/um/shared/sysdep/kernel-offsets.h b/arch/x86/um/shared/sysdep/kernel-offsets.h
index 5868526b5eef..46a9df99f3c5 100644
--- a/arch/x86/um/shared/sysdep/kernel-offsets.h
+++ b/arch/x86/um/shared/sysdep/kernel-offsets.h
@@ -7,9 +7,6 @@
 #define DEFINE(sym, val) \
 	asm volatile("\n->" #sym " %0 " #val : : "i" (val))
 
-#define STR(x) #x
-#define DEFINE_STR(sym, val) asm volatile("\n->" #sym " " STR(val) " " #val: : )
-
 #define BLANK() asm volatile("\n->" : : )
 
 #define OFFSET(sym, str, mem) \

From a3cff128c970da8400875c8893d68ac735e783fb Mon Sep 17 00:00:00 2001
From: Steve Glendinning <steve.glendinning@shawell.net>
Date: Mon, 24 Sep 2012 04:42:59 +0000
Subject: [PATCH 34/44] smsc75xx: fix resume after device reset

On some systems this device fails to properly resume after suspend,
this patch fixes it by running the usbnet_resume handler.

I suspect this also fixes this bug:

http://code.google.com/p/chromium-os/issues/detail?id=31871

Signed-off-by: Steve Glendinning <steve.glendinning@shawell.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/smsc75xx.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/usb/smsc75xx.c b/drivers/net/usb/smsc75xx.c
index f5ab6e613ec8..376143e8a1aa 100644
--- a/drivers/net/usb/smsc75xx.c
+++ b/drivers/net/usb/smsc75xx.c
@@ -1253,6 +1253,7 @@ static struct usb_driver smsc75xx_driver = {
 	.probe		= usbnet_probe,
 	.suspend	= usbnet_suspend,
 	.resume		= usbnet_resume,
+	.reset_resume	= usbnet_resume,
 	.disconnect	= usbnet_disconnect,
 	.disable_hub_initiated_lpm = 1,
 };

From b629820d18fa65cc598390e4b9712fd5f83ee693 Mon Sep 17 00:00:00 2001
From: Marek Vasut <marex@denx.de>
Date: Tue, 25 Sep 2012 10:17:42 +0000
Subject: [PATCH 35/44] net: phy: smsc: Implement PHY config_init for LAN87xx

The LAN8710/LAN8720 chips do have broken the "FlexPWR" smart power-saving
capability. Enabling it leads to the PHY not being able to detect Link when
cold-started without cable connected. Thus, make sure this is disabled.

Signed-off-by: Marek Vasut <marex@denx.de>
Cc: Christian Hohnstaedt <chohnstaedt@innominate.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Fabio Estevam <fabio.estevam@freescale.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Otavio Salvador <otavio@ossystems.com.br>
Acked-by: Otavio Salvador <otavio@ossystems.com.br>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/smsc.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/drivers/net/phy/smsc.c b/drivers/net/phy/smsc.c
index 6d6192316b30..88e3991464e7 100644
--- a/drivers/net/phy/smsc.c
+++ b/drivers/net/phy/smsc.c
@@ -56,6 +56,32 @@ static int smsc_phy_config_init(struct phy_device *phydev)
 	return smsc_phy_ack_interrupt (phydev);
 }
 
+static int lan87xx_config_init(struct phy_device *phydev)
+{
+	/*
+	 * Make sure the EDPWRDOWN bit is NOT set. Setting this bit on
+	 * LAN8710/LAN8720 PHY causes the PHY to misbehave, likely due
+	 * to a bug on the chip.
+	 *
+	 * When the system is powered on with the network cable being
+	 * disconnected all the way until after ifconfig ethX up is
+	 * issued for the LAN port with this PHY, connecting the cable
+	 * afterwards does not cause LINK change detection, while the
+	 * expected behavior is the Link UP being detected.
+	 */
+	int rc = phy_read(phydev, MII_LAN83C185_CTRL_STATUS);
+	if (rc < 0)
+		return rc;
+
+	rc &= ~MII_LAN83C185_EDPWRDOWN;
+
+	rc = phy_write(phydev, MII_LAN83C185_CTRL_STATUS, rc);
+	if (rc < 0)
+		return rc;
+
+	return smsc_phy_ack_interrupt(phydev);
+}
+
 static int lan911x_config_init(struct phy_device *phydev)
 {
 	return smsc_phy_ack_interrupt(phydev);
@@ -162,7 +188,7 @@ static struct phy_driver smsc_phy_driver[] = {
 	/* basic functions */
 	.config_aneg	= genphy_config_aneg,
 	.read_status	= genphy_read_status,
-	.config_init	= smsc_phy_config_init,
+	.config_init	= lan87xx_config_init,
 
 	/* IRQ related */
 	.ack_interrupt	= smsc_phy_ack_interrupt,

From 64f605c77408ad3aaff025e849d2e42cad9687e3 Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Wed, 26 Sep 2012 07:22:02 +0000
Subject: [PATCH 36/44] bnx2: Clean up remaining iounmap

commit c0357e975afdbbedab5c662d19bef865f02adc17 modified bnx2 to switch from
using ioremap/iounmap to pci_iomap/pci_iounmap.  They missed a spot in the error
path of bnx2_init_one though.  This patch just cleans that up.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Michael Chan <mcan@broadcom.com>
CC: "David S. Miller" <davem@davemloft.net>
Acked-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnx2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c
index 79cebd8525ce..e48312f2305d 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -8564,7 +8564,7 @@ bnx2_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	return 0;
 
 error:
-	iounmap(bp->regview);
+	pci_iounmap(pdev, bp->regview);
 	pci_release_regions(pdev);
 	pci_disable_device(pdev);
 	pci_set_drvdata(pdev, NULL);

From 68b3f28c11bf896be32ad2a2e151aaa5f4a7c98e Mon Sep 17 00:00:00 2001
From: Narendra K <narendra_k@dell.com>
Date: Tue, 25 Sep 2012 07:53:19 +0000
Subject: [PATCH 37/44] qlcnic: Fix scheduling while atomic bug

In the device close path, 'qlcnic_fw_destroy_ctx' and
'qlcnic_poll_rsp' call msleep. But  'qlcnic_fw_destroy_ctx' and
'qlcnic_poll_rsp' are called with 'adapter->tx_clean_lock' spin lock
held resulting in scheduling while atomic bug causing the following
trace.

I observed that the commit 012dc19a45b2b9cc2ebd14aaa401cf782c2abba4
from John Fastabend addresses a similar issue in ixgbevf driver.
Adopting the same approach used in the commit, this patch uses mdelay
to address the issue.

[79884.999115] BUG: scheduling while atomic: ip/30846/0x00000002
[79885.005562] INFO: lockdep is turned off.
[79885.009958] Modules linked in: qlcnic fuse nf_conntrack_netbios_ns nf_conntrack_broadcast ipt_MASQUERADE bnep bluetooth rfkill ip6table_mangle ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_filter ip6_tables iptable_nat nf_nat iptable_mangle ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack iptable_filter ip_tables dcdbas coretemp kvm_intel kvm iTCO_wdt ixgbe iTCO_vendor_support crc32c_intel ghash_clmulni_intel nfsd microcode sb_edac pcspkr edac_core dca bnx2x shpchp auth_rpcgss nfs_acl lpc_ich mfd_core mdio lockd libcrc32c wmi acpi_pad acpi_power_meter sunrpc uinput sd_mod sr_mod cdrom crc_t10dif ahci libahci libata megaraid_sas usb_storage dm_mirror dm_region_hash dm_log dm_mod [last unloaded: qlcnic]
[79885.083608] Pid: 30846, comm: ip Tainted: G        W  O 3.6.0-rc7+ #1
[79885.090805] Call Trace:
[79885.093569]  [<ffffffff816764d8>] __schedule_bug+0x68/0x76
[79885.099699]  [<ffffffff8168358e>] __schedule+0x99e/0xa00
[79885.105634]  [<ffffffff81683929>] schedule+0x29/0x70
[79885.111186]  [<ffffffff81680def>] schedule_timeout+0x16f/0x350
[79885.117724]  [<ffffffff811afb7a>] ? init_object+0x4a/0x90
[79885.123770]  [<ffffffff8107c190>] ? __internal_add_timer+0x140/0x140
[79885.130873]  [<ffffffff81680fee>] schedule_timeout_uninterruptible+0x1e/0x20
[79885.138773]  [<ffffffff8107e830>] msleep+0x20/0x30
[79885.144159]  [<ffffffffa04c7fbf>] qlcnic_issue_cmd+0xef/0x290 [qlcnic]
[79885.151478]  [<ffffffffa04c8265>] qlcnic_fw_cmd_destroy_rx_ctx+0x55/0x90 [qlcnic]
[79885.159868]  [<ffffffffa04c92fd>] qlcnic_fw_destroy_ctx+0x2d/0xa0 [qlcnic]
[79885.167576]  [<ffffffffa04bf2ed>] __qlcnic_down+0x11d/0x180 [qlcnic]
[79885.174708]  [<ffffffffa04bf6f8>] qlcnic_close+0x18/0x20 [qlcnic]
[79885.181547]  [<ffffffff8153b4c5>] __dev_close_many+0x95/0xe0
[79885.187899]  [<ffffffff8153b548>] __dev_close+0x38/0x50
[79885.193761]  [<ffffffff81545101>] __dev_change_flags+0xa1/0x180
[79885.200419]  [<ffffffff81545298>] dev_change_flags+0x28/0x70
[79885.206779]  [<ffffffff815531b8>] do_setlink+0x378/0xa00
[79885.212731]  [<ffffffff81354fe1>] ? nla_parse+0x31/0xe0
[79885.218612]  [<ffffffff815558ee>] rtnl_newlink+0x37e/0x560
[79885.224768]  [<ffffffff812cfa19>] ? selinux_capable+0x39/0x50
[79885.231217]  [<ffffffff812cbf98>] ? security_capable+0x18/0x20
[79885.237765]  [<ffffffff81555114>] rtnetlink_rcv_msg+0x114/0x2f0
[79885.244412]  [<ffffffff81551f87>] ? rtnl_lock+0x17/0x20
[79885.250280]  [<ffffffff81551f87>] ? rtnl_lock+0x17/0x20
[79885.256148]  [<ffffffff81555000>] ? __rtnl_unlock+0x20/0x20
[79885.262413]  [<ffffffff81570fc1>] netlink_rcv_skb+0xa1/0xb0
[79885.268661]  [<ffffffff81551fb5>] rtnetlink_rcv+0x25/0x40
[79885.274727]  [<ffffffff815708bd>] netlink_unicast+0x19d/0x220
[79885.281146]  [<ffffffff81570c45>] netlink_sendmsg+0x305/0x3f0
[79885.287595]  [<ffffffff8152b188>] ? sock_update_classid+0x148/0x2e0
[79885.294650]  [<ffffffff81525c2c>] sock_sendmsg+0xbc/0xf0
[79885.300600]  [<ffffffff8152600c>] __sys_sendmsg+0x3ac/0x3c0
[79885.306853]  [<ffffffff8109be23>] ? up_read+0x23/0x40
[79885.312510]  [<ffffffff816896cc>] ? do_page_fault+0x2bc/0x570
[79885.318968]  [<ffffffff81191854>] ? sys_brk+0x44/0x150
[79885.324715]  [<ffffffff811c458c>] ? fget_light+0x24c/0x520
[79885.330875]  [<ffffffff815286f9>] sys_sendmsg+0x49/0x90
[79885.336707]  [<ffffffff8168e429>] system_call_fastpath+0x16/0x1b

Signed-off-by: Narendra K <narendra_k@dell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c
index b8ead696141e..2a179d087207 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c
@@ -15,7 +15,7 @@ qlcnic_poll_rsp(struct qlcnic_adapter *adapter)
 
 	do {
 		/* give atleast 1ms for firmware to respond */
-		msleep(1);
+		mdelay(1);
 
 		if (++timeout > QLCNIC_OS_CRB_RETRY_COUNT)
 			return QLCNIC_CDRP_RSP_TIMEOUT;
@@ -601,7 +601,7 @@ void qlcnic_fw_destroy_ctx(struct qlcnic_adapter *adapter)
 		qlcnic_fw_cmd_destroy_tx_ctx(adapter);
 
 		/* Allow dma queues to drain after context reset */
-		msleep(20);
+		mdelay(20);
 	}
 }
 

From bc9259a8bae9e814fc1f775a1b3effa13e6ad330 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Thu, 27 Sep 2012 04:11:00 +0000
Subject: [PATCH 38/44] inetpeer: fix token initialization

When jiffies wraps around (for example, 5 minutes after the boot, see
INITIAL_JIFFIES) and peer has just been created, now - peer->rate_last can be
< XRLIM_BURST_FACTOR * timeout, so token is not set to the maximum value, thus
some icmp packets can be unexpectedly dropped.

Fix this case by initializing last_rate to 60 seconds in the past.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inetpeer.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index e1e0a4e8fd34..c7527f6b9ad9 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -510,7 +510,10 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base,
 					secure_ipv6_id(daddr->addr.a6));
 		p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
 		p->rate_tokens = 0;
-		p->rate_last = 0;
+		/* 60*HZ is arbitrary, but chosen enough high so that the first
+		 * calculation of tokens is at its maximum.
+		 */
+		p->rate_last = jiffies - 60*HZ;
 		INIT_LIST_HEAD(&p->gc_list);
 
 		/* Link the node. */

From df555b665367f9de6c04826acc482096f17c243d Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Thu, 27 Sep 2012 19:04:21 +0000
Subject: [PATCH 39/44] netdev: octeon: fix return value check in
 octeon_mgmt_init_phy()

In case of error, the function of_phy_connect() returns NULL
pointer not ERR_PTR(). The IS_ERR() test in the return value
check should be replaced with NULL test.

dpatch engine is used to auto generate this patch.
(https://github.com/weiyj/dpatch)

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/octeon/octeon_mgmt.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/octeon/octeon_mgmt.c b/drivers/net/ethernet/octeon/octeon_mgmt.c
index c42bbb16cdae..a688a2ddcfd6 100644
--- a/drivers/net/ethernet/octeon/octeon_mgmt.c
+++ b/drivers/net/ethernet/octeon/octeon_mgmt.c
@@ -722,10 +722,8 @@ static int octeon_mgmt_init_phy(struct net_device *netdev)
 				   octeon_mgmt_adjust_link, 0,
 				   PHY_INTERFACE_MODE_MII);
 
-	if (IS_ERR(p->phydev)) {
-		p->phydev = NULL;
+	if (!p->phydev)
 		return -1;
-	}
 
 	phy_start_aneg(p->phydev);
 

From 0774e392555a128cff7a94929b9ce957927fef49 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 28 Sep 2012 16:14:44 +0200
Subject: [PATCH 40/44] iommu/amd: Fix wrong assumption in iommu-group specific
 code

The new IOMMU groups code in the AMD IOMMU driver makes the
assumption that there is a pci_dev struct available for all
device-ids listed in the IVRS ACPI table. Unfortunatly this
assumption is not true and so this code causes a NULL
pointer dereference at boot on some systems.

Fix it by making sure the given pointer is never NULL when
passed to the group specific code. The real fix is larger
and will be queued for v3.7.

Reported-by: Florian Dazinger <florian@dazinger.net>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/iommu/amd_iommu.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index b64502dfa9f4..e89daf1b21b4 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -266,7 +266,7 @@ static void swap_pci_ref(struct pci_dev **from, struct pci_dev *to)
 
 static int iommu_init_device(struct device *dev)
 {
-	struct pci_dev *dma_pdev, *pdev = to_pci_dev(dev);
+	struct pci_dev *dma_pdev = NULL, *pdev = to_pci_dev(dev);
 	struct iommu_dev_data *dev_data;
 	struct iommu_group *group;
 	u16 alias;
@@ -293,7 +293,9 @@ static int iommu_init_device(struct device *dev)
 		dev_data->alias_data = alias_data;
 
 		dma_pdev = pci_get_bus_and_slot(alias >> 8, alias & 0xff);
-	} else
+	}
+
+	if (dma_pdev == NULL)
 		dma_pdev = pci_dev_get(pdev);
 
 	/* Account for quirked devices */

From 99a1300e1d84709f419182bb5189760e78234882 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Fri, 28 Sep 2012 14:35:31 +0200
Subject: [PATCH 41/44] thp: avoid VM_BUG_ON page_count(page) false positives
 in __collapse_huge_page_copy

Speculative cache pagecache lookups can elevate the refcount from
under us, so avoid the false positive. If the refcount is < 2 we'll be
notified by a VM_BUG_ON in put_page_testzero as there are two
put_page(src_page) in a row before returning from this function.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Petr Holasek <pholasek@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 57c4b9309015..141dbb695097 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1811,7 +1811,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 			src_page = pte_page(pteval);
 			copy_user_highpage(page, src_page, address, vma);
 			VM_BUG_ON(page_mapcount(src_page) != 1);
-			VM_BUG_ON(page_count(src_page) != 2);
 			release_pte_page(src_page);
 			/*
 			 * ptl mostly unnecessary, but preempt has to

From 9c603e53d380459fb62fec7cd085acb0b74ac18f Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 8 Sep 2012 12:57:30 -0700
Subject: [PATCH 42/44] mtdchar: fix offset overflow detection

Sasha Levin has been running trinity in a KVM tools guest, and was able
to trigger the BUG_ON() at arch/x86/mm/pat.c:279 (verifying the range of
the memory type).  The call trace showed that it was mtdchar_mmap() that
created an invalid remap_pfn_range().

The problem is that mtdchar_mmap() does various really odd and subtle
things with the vma page offset etc, and uses the wrong types (and the
wrong overflow) detection for it.

For example, the page offset may well be 32-bit on a 32-bit
architecture, but after shifting it up by PAGE_SHIFT, we need to use a
potentially 64-bit resource_size_t to correctly hold the full value.

Also, we need to check that the vma length plus offset doesn't overflow
before we check that it is smaller than the length of the mtdmap region.

This fixes things up and tries to make the code a bit easier to read.

Reported-and-tested-by: Sasha Levin <levinsasha928@gmail.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Artem Bityutskiy <dedekind1@gmail.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: linux-mtd@lists.infradead.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mtd/mtdchar.c | 48 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 42 insertions(+), 6 deletions(-)

diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index f2f482bec573..a6e74514e662 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -1123,6 +1123,33 @@ static unsigned long mtdchar_get_unmapped_area(struct file *file,
 }
 #endif
 
+static inline unsigned long get_vm_size(struct vm_area_struct *vma)
+{
+	return vma->vm_end - vma->vm_start;
+}
+
+static inline resource_size_t get_vm_offset(struct vm_area_struct *vma)
+{
+	return (resource_size_t) vma->vm_pgoff << PAGE_SHIFT;
+}
+
+/*
+ * Set a new vm offset.
+ *
+ * Verify that the incoming offset really works as a page offset,
+ * and that the offset and size fit in a resource_size_t.
+ */
+static inline int set_vm_offset(struct vm_area_struct *vma, resource_size_t off)
+{
+	pgoff_t pgoff = off >> PAGE_SHIFT;
+	if (off != (resource_size_t) pgoff << PAGE_SHIFT)
+		return -EINVAL;
+	if (off + get_vm_size(vma) - 1 < off)
+		return -EINVAL;
+	vma->vm_pgoff = pgoff;
+	return 0;
+}
+
 /*
  * set up a mapping for shared memory segments
  */
@@ -1132,20 +1159,29 @@ static int mtdchar_mmap(struct file *file, struct vm_area_struct *vma)
 	struct mtd_file_info *mfi = file->private_data;
 	struct mtd_info *mtd = mfi->mtd;
 	struct map_info *map = mtd->priv;
-	unsigned long start;
-	unsigned long off;
-	u32 len;
+	resource_size_t start, off;
+	unsigned long len, vma_len;
 
 	if (mtd->type == MTD_RAM || mtd->type == MTD_ROM) {
-		off = vma->vm_pgoff << PAGE_SHIFT;
+		off = get_vm_offset(vma);
 		start = map->phys;
 		len = PAGE_ALIGN((start & ~PAGE_MASK) + map->size);
 		start &= PAGE_MASK;
-		if ((vma->vm_end - vma->vm_start + off) > len)
+		vma_len = get_vm_size(vma);
+
+		/* Overflow in off+len? */
+		if (vma_len + off < off)
+			return -EINVAL;
+		/* Does it fit in the mapping? */
+		if (vma_len + off > len)
 			return -EINVAL;
 
 		off += start;
-		vma->vm_pgoff = off >> PAGE_SHIFT;
+		/* Did that overflow? */
+		if (off < start)
+			return -EINVAL;
+		if (set_vm_offset(vma, off) < 0)
+			return -EINVAL;
 		vma->vm_flags |= VM_IO | VM_RESERVED;
 
 #ifdef pgprot_noncached

From 8110e16d42d587997bcaee0c864179e6d93603fe Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Mon, 17 Sep 2012 22:23:30 +0200
Subject: [PATCH 43/44] vfs: dcache: fix deadlock in tree traversal

IBM reported a deadlock in select_parent().  This was found to be caused
by taking rename_lock when already locked when restarting the tree
traversal.

There are two cases when the traversal needs to be restarted:

 1) concurrent d_move(); this can only happen when not already locked,
    since taking rename_lock protects against concurrent d_move().

 2) racing with final d_put() on child just at the moment of ascending
    to parent; rename_lock doesn't protect against this rare race, so it
    can happen when already locked.

Because of case 2, we need to be able to handle restarting the traversal
when rename_lock is already held.  This patch fixes all three callers of
try_to_ascend().

IBM reported that the deadlock is gone with this patch.

[ I rewrote the patch to be smaller and just do the "goto again" if the
  lock was already held, but credit goes to Miklos for the real work.
   - Linus ]

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dcache.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/dcache.c b/fs/dcache.c
index 0364af2311f4..693f95bf1cae 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1134,6 +1134,8 @@ int have_submounts(struct dentry *parent)
 	return 1;
 
 rename_retry:
+	if (locked)
+		goto again;
 	locked = 1;
 	write_seqlock(&rename_lock);
 	goto again;
@@ -1236,6 +1238,8 @@ static int select_parent(struct dentry *parent, struct list_head *dispose)
 rename_retry:
 	if (found)
 		return found;
+	if (locked)
+		goto again;
 	locked = 1;
 	write_seqlock(&rename_lock);
 	goto again;
@@ -3035,6 +3039,8 @@ void d_genocide(struct dentry *root)
 	return;
 
 rename_retry:
+	if (locked)
+		goto again;
 	locked = 1;
 	write_seqlock(&rename_lock);
 	goto again;

From a0d271cbfed1dd50278c6b06bead3d00ba0a88f9 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 30 Sep 2012 16:47:46 -0700
Subject: [PATCH 44/44] Linux 3.6

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index a3c11d589681..bb9fff26f078 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 3
 PATCHLEVEL = 6
 SUBLEVEL = 0
-EXTRAVERSION = -rc7
+EXTRAVERSION =
 NAME = Terrified Chipmunk
 
 # *DOCUMENTATION*