From e9ef46369f5107e634a93b7fc4e62a1f53343197 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 24 Jul 2012 15:01:04 -0600 Subject: [PATCH 01/44] NVMe: Set request queue logical block size Sets the request queue logical block size with the block size of the namespace. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index f4996b0e4b1a..38b9c73f6706 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1344,6 +1344,7 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, ns->disk = disk; lbaf = id->flbas & 0xf; ns->lba_shift = id->lbaf[lbaf].ds; + blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); disk->major = nvme_major; disk->minors = NVME_MINORS; From 5c42ea1643a630060f9e71e06d3933d244970967 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 25 Jul 2012 16:05:18 -0600 Subject: [PATCH 02/44] NVMe: Fix nvme module init when nvme_major is set register_blkdev returns 0 when given a valid major number. Reported-by:Ross Zwisler Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 38b9c73f6706..46e33eec6298 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1716,9 +1716,11 @@ static int __init nvme_init(void) if (IS_ERR(nvme_thread)) return PTR_ERR(nvme_thread); - nvme_major = register_blkdev(nvme_major, "nvme"); - if (nvme_major <= 0) + result = register_blkdev(nvme_major, "nvme"); + if (result < 0) goto kill_kthread; + else if (result > 0) + nvme_major = result; result = pci_register_driver(&nvme_driver); if (result) From 50af8baec46a99a9b81a4600c0374f83a5a590a9 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 25 Jul 2012 16:07:55 -0600 Subject: [PATCH 03/44] NVMe: replace nvme_ns with nvme_dev for user admin The function nvme_user_admin_command does not require a namespace to proceed. Replace with the nvme_dev structure so that it can be called from contexts that do not have a namespace. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 46e33eec6298..89935853cb85 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1151,10 +1151,9 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return status; } -static int nvme_user_admin_cmd(struct nvme_ns *ns, +static int nvme_user_admin_cmd(struct nvme_dev *dev, struct nvme_admin_cmd __user *ucmd) { - struct nvme_dev *dev = ns->dev; struct nvme_admin_cmd cmd; struct nvme_command c; int status, length; @@ -1209,7 +1208,7 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, case NVME_IOCTL_ID: return ns->ns_id; case NVME_IOCTL_ADMIN_CMD: - return nvme_user_admin_cmd(ns, (void __user *)arg); + return nvme_user_admin_cmd(ns->dev, (void __user *)arg); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); default: From a42ceccef0c43b46ff6bc1b12a7c1076ef243df1 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 25 Jul 2012 16:06:38 -0600 Subject: [PATCH 04/44] NVMe: use namespace id for nvme_get_features The specification does not provide a use for command dword11 in the NVMe Get Features command, but does use the NSID for some features. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 89935853cb85..7bcd88205a41 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -840,15 +840,15 @@ static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, } static int nvme_get_features(struct nvme_dev *dev, unsigned fid, - unsigned dword11, dma_addr_t dma_addr) + unsigned nsid, dma_addr_t dma_addr) { struct nvme_command c; memset(&c, 0, sizeof(c)); c.features.opcode = nvme_admin_get_features; + c.features.nsid = cpu_to_le32(nsid); c.features.prp1 = cpu_to_le64(dma_addr); c.features.fid = cpu_to_le32(fid); - c.features.dword11 = cpu_to_le32(dword11); return nvme_submit_admin_cmd(dev, &c, NULL); } From 8fc23e032debd682f5ba9fc524a5846c10d2c522 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 26 Jul 2012 11:29:57 -0600 Subject: [PATCH 05/44] NVMe: Set block queue max sectors Set the max hw sectors in a namespace's request queue if the nvme device has a max data transfer size. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 7 +++++++ include/linux/nvme.h | 1 + 2 files changed, 8 insertions(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 7bcd88205a41..11951fa11a90 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -78,6 +78,7 @@ struct nvme_dev { char serial[20]; char model[40]; char firmware_rev[8]; + u32 max_hw_sectors; }; /* @@ -1344,6 +1345,8 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, lbaf = id->flbas & 0xf; ns->lba_shift = id->lbaf[lbaf].ds; blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); + if (dev->max_hw_sectors) + blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); disk->major = nvme_major; disk->minors = NVME_MINORS; @@ -1485,6 +1488,10 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev) memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); + if (ctrl->mdts) { + int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; + dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); + } id_ns = mem; for (i = 1; i <= nn; i++) { diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 9490a00529f4..8c71d2004c6d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -37,6 +37,7 @@ struct nvme_bar { #define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff) #define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf) +#define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf) enum { NVME_CC_ENABLE = 1 << 0, From a0cadb85b8b758608ae0759151e29de7581c6731 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 27 Jul 2012 13:57:23 -0400 Subject: [PATCH 06/44] NVMe: Do not set IO queue depth beyond device max Set the depth for IO queues to the device's maximum supported queue entries if the requested depth exceeds the device's capabilities. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 10 ++++++---- include/linux/nvme.h | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 11951fa11a90..af1ef39bd6b4 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -893,7 +893,8 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth, int vector) { struct device *dmadev = &dev->pci_dev->dev; - unsigned extra = (depth / 8) + (depth * sizeof(struct nvme_cmd_info)); + unsigned extra = DIV_ROUND_UP(depth, 8) + (depth * + sizeof(struct nvme_cmd_info)); struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL); if (!nvmeq) return NULL; @@ -1391,7 +1392,7 @@ static int set_queue_count(struct nvme_dev *dev, int count) static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) { - int result, cpu, i, nr_io_queues, db_bar_size; + int result, cpu, i, nr_io_queues, db_bar_size, q_depth; nr_io_queues = num_online_cpus(); result = set_queue_count(dev, nr_io_queues); @@ -1437,9 +1438,10 @@ static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) cpu = cpumask_next(cpu, cpu_online_mask); } + q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1, + NVME_Q_DEPTH); for (i = 0; i < nr_io_queues; i++) { - dev->queues[i + 1] = nvme_create_queue(dev, i + 1, - NVME_Q_DEPTH, i); + dev->queues[i + 1] = nvme_create_queue(dev, i + 1, q_depth, i); if (IS_ERR(dev->queues[i + 1])) return PTR_ERR(dev->queues[i + 1]); dev->queue_count++; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 8c71d2004c6d..c25cccaa555a 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -35,6 +35,7 @@ struct nvme_bar { __u64 acq; /* Admin CQ Base Address */ }; +#define NVME_CAP_MQES(cap) ((cap) & 0xffff) #define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff) #define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf) #define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf) From c7d36ab8fa04c213328119a9c0d66985fe204ee5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 27 Jul 2012 11:53:28 -0600 Subject: [PATCH 07/44] NVMe: Fix uninitialized iod compiler warning Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index af1ef39bd6b4..6c0eb768562f 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1159,7 +1159,7 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev, struct nvme_admin_cmd cmd; struct nvme_command c; int status, length; - struct nvme_iod *iod; + struct nvme_iod *uninitialized_var(iod); if (!capable(CAP_SYS_ADMIN)) return -EACCES; From 22fff826e715e9727d3c7a69f15e602a9801b673 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 20 Jan 2012 07:55:30 -0500 Subject: [PATCH 08/44] NVMe: handle allocation failure in nvme_map_user_pages() We should return here and avoid a NULL dereference. Signed-off-by: Dan Carpenter Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 6c0eb768562f..064e86a6bb4e 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1044,6 +1044,8 @@ static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, offset = offset_in_page(addr); count = DIV_ROUND_UP(offset + length, PAGE_SIZE); pages = kcalloc(count, sizeof(*pages), GFP_KERNEL); + if (!pages) + return ERR_PTR(-ENOMEM); err = get_user_pages_fast(addr, count, 1, pages); if (err < count) { From 0ac13140d796eb1e2f8956aea97a6e5e4ebcf981 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 31 Jul 2012 13:31:15 -0400 Subject: [PATCH 09/44] NVMe: Fix whitespace damage in nvme_init Commit 5c42ea1643 used spaces instead of tabs. Also remove the unnecessary initialisation of the 'result' variable. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 064e86a6bb4e..0ba6b7cb344b 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1720,7 +1720,7 @@ static struct pci_driver nvme_driver = { static int __init nvme_init(void) { - int result = -EBUSY; + int result; nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); if (IS_ERR(nvme_thread)) @@ -1730,7 +1730,7 @@ static int __init nvme_init(void) if (result < 0) goto kill_kthread; else if (result > 0) - nvme_major = result; + nvme_major = result; result = pci_register_driver(&nvme_driver); if (result) From cd58ad7d188c643cf572b038909c2f7dd96fdafe Mon Sep 17 00:00:00 2001 From: Quoc-Son Anh Date: Tue, 21 Feb 2012 16:50:53 -0700 Subject: [PATCH 10/44] NVMe: Use ida for nvme device instance Signed-off-by: Quoc-Son Anh Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 0ba6b7cb344b..3278fbdb8dc0 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1576,15 +1576,33 @@ static void nvme_release_prp_pools(struct nvme_dev *dev) dma_pool_destroy(dev->prp_small_pool); } -/* XXX: Use an ida or something to let remove / add work correctly */ -static void nvme_set_instance(struct nvme_dev *dev) +static DEFINE_IDA(nvme_instance_ida); + +static int nvme_set_instance(struct nvme_dev *dev) { - static int instance; - dev->instance = instance++; + int instance, error; + + do { + if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) + return -ENODEV; + + spin_lock(&dev_list_lock); + error = ida_get_new(&nvme_instance_ida, &instance); + spin_unlock(&dev_list_lock); + } while (error == -EAGAIN); + + if (error) + return -ENODEV; + + dev->instance = instance; + return 0; } static void nvme_release_instance(struct nvme_dev *dev) { + spin_lock(&dev_list_lock); + ida_remove(&nvme_instance_ida, dev->instance); + spin_unlock(&dev_list_lock); } static int __devinit nvme_probe(struct pci_dev *pdev, @@ -1617,7 +1635,10 @@ static int __devinit nvme_probe(struct pci_dev *pdev, pci_set_drvdata(pdev, dev); dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)); dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64)); - nvme_set_instance(dev); + result = nvme_set_instance(dev); + if (result) + goto disable; + dev->entry[0].vector = pdev->irq; result = nvme_setup_prp_pools(dev); From 9e866774aab5d2654b0fa8f97890f68913f05700 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 3 Aug 2012 13:55:56 -0400 Subject: [PATCH 11/44] NVMe: Free admin queue memory on initialisation failure If the adapter fails initialisation, the memory allocated for the admin queue may not be freed. Split the memory freeing part of nvme_free_queue() into nvme_free_queue_mem() and call it in the case of initialisation failure. Signed-off-by: Matthew Wilcox Reported-by: Vishal Verma --- drivers/block/nvme.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 3278fbdb8dc0..214037055e2a 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -868,6 +868,15 @@ static int nvme_set_features(struct nvme_dev *dev, unsigned fid, return nvme_submit_admin_cmd(dev, &c, result); } +static void nvme_free_queue_mem(struct nvme_queue *nvmeq) +{ + dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), + (void *)nvmeq->cqes, nvmeq->cq_dma_addr); + dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), + nvmeq->sq_cmds, nvmeq->sq_dma_addr); + kfree(nvmeq); +} + static void nvme_free_queue(struct nvme_dev *dev, int qid) { struct nvme_queue *nvmeq = dev->queues[qid]; @@ -882,11 +891,7 @@ static void nvme_free_queue(struct nvme_dev *dev, int qid) adapter_delete_cq(dev, qid); } - dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), - (void *)nvmeq->cqes, nvmeq->cq_dma_addr); - dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), - nvmeq->sq_cmds, nvmeq->sq_dma_addr); - kfree(nvmeq); + nvme_free_queue_mem(nvmeq); } static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, @@ -982,7 +987,7 @@ static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) { - int result; + int result = 0; u32 aqa; u64 cap; unsigned long timeout; @@ -1012,17 +1017,22 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; dev->db_stride = NVME_CAP_STRIDE(cap); - while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) { + while (!result && !(readl(&dev->bar->csts) & NVME_CSTS_RDY)) { msleep(100); if (fatal_signal_pending(current)) - return -EINTR; + result = -EINTR; if (time_after(jiffies, timeout)) { dev_err(&dev->pci_dev->dev, "Device not ready; aborting initialisation\n"); - return -ENODEV; + result = -ENODEV; } } + if (result) { + nvme_free_queue_mem(nvmeq); + return result; + } + result = queue_request_irq(dev, nvmeq, "nvme admin"); dev->queues[0] = nvmeq; return result; From a09115b23e2002bb35b7bfd337683f00875671ec Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 7 Aug 2012 15:56:23 -0400 Subject: [PATCH 12/44] NVMe: Cancel outstanding IOs on queue deletion If the device is hot-unplugged while there are active commands, we should time out the I/Os so that upper layers don't just see the I/Os disappear. Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 55 ++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 214037055e2a..f9ad514c9227 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -868,6 +868,33 @@ static int nvme_set_features(struct nvme_dev *dev, unsigned fid, return nvme_submit_admin_cmd(dev, &c, result); } +/** + * nvme_cancel_ios - Cancel outstanding I/Os + * @queue: The queue to cancel I/Os on + * @timeout: True to only cancel I/Os which have timed out + */ +static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) +{ + int depth = nvmeq->q_depth - 1; + struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); + unsigned long now = jiffies; + int cmdid; + + for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) { + void *ctx; + nvme_completion_fn fn; + static struct nvme_completion cqe = { + .status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1, + }; + + if (timeout && !time_after(now, info[cmdid].timeout)) + continue; + dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid); + ctx = cancel_cmdid(nvmeq, cmdid, &fn); + fn(nvmeq->dev, ctx, &cqe); + } +} + static void nvme_free_queue_mem(struct nvme_queue *nvmeq) { dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), @@ -882,6 +909,10 @@ static void nvme_free_queue(struct nvme_dev *dev, int qid) struct nvme_queue *nvmeq = dev->queues[qid]; int vector = dev->entry[nvmeq->cq_vector].vector; + spin_lock_irq(&nvmeq->q_lock); + nvme_cancel_ios(nvmeq, false); + spin_unlock_irq(&nvmeq->q_lock); + irq_set_affinity_hint(vector, NULL); free_irq(vector, nvmeq); @@ -1236,26 +1267,6 @@ static const struct block_device_operations nvme_fops = { .compat_ioctl = nvme_ioctl, }; -static void nvme_timeout_ios(struct nvme_queue *nvmeq) -{ - int depth = nvmeq->q_depth - 1; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - unsigned long now = jiffies; - int cmdid; - - for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) { - void *ctx; - nvme_completion_fn fn; - static struct nvme_completion cqe = { .status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1, }; - - if (!time_after(now, info[cmdid].timeout)) - continue; - dev_warn(nvmeq->q_dmadev, "Timing out I/O %d\n", cmdid); - ctx = cancel_cmdid(nvmeq, cmdid, &fn); - fn(nvmeq->dev, ctx, &cqe); - } -} - static void nvme_resubmit_bios(struct nvme_queue *nvmeq) { while (bio_list_peek(&nvmeq->sq_cong)) { @@ -1287,7 +1298,7 @@ static int nvme_kthread(void *data) spin_lock_irq(&nvmeq->q_lock); if (nvme_process_cq(nvmeq)) printk("process_cq did something\n"); - nvme_timeout_ios(nvmeq); + nvme_cancel_ios(nvmeq, true); nvme_resubmit_bios(nvmeq); spin_unlock_irq(&nvmeq->q_lock); } @@ -1549,8 +1560,6 @@ static int nvme_dev_remove(struct nvme_dev *dev) list_del(&dev->node); spin_unlock(&dev_list_lock); - /* TODO: wait all I/O finished or cancel them */ - list_for_each_entry_safe(ns, next, &dev->namespaces, list) { list_del(&ns->list); del_gendisk(ns->disk); From 156cacb1d0d36b0d0582d9e798e58e0044f516b3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Sep 2012 08:19:02 -0400 Subject: [PATCH 13/44] do_add_mount()/umount -l races normally we deal with lock_mount()/umount races by checking that mountpoint to be is still in our namespace after lock_mount() has been done. However, do_add_mount() skips that check when called with MNT_SHRINKABLE in flags (i.e. from finish_automount()). The reason is that ->mnt_ns may be a temporary namespace created exactly to contain automounts a-la NFS4 referral handling. It's not the namespace of the caller, though, so check_mnt() would fail here. We still need to check that ->mnt_ns is non-NULL in that case, though. Signed-off-by: Al Viro --- fs/namespace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 4d31f73e2561..7bdf7907413f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1886,8 +1886,14 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) return err; err = -EINVAL; - if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(real_mount(path->mnt))) - goto unlock; + if (unlikely(!check_mnt(real_mount(path->mnt)))) { + /* that's acceptable only for automounts done in private ns */ + if (!(mnt_flags & MNT_SHRINKABLE)) + goto unlock; + /* ... and for those we'd better have mountpoint still alive */ + if (!real_mount(path->mnt)->mnt_ns) + goto unlock; + } /* Refuse the same filesystem on the same mount point */ err = -EBUSY; From c5aa1e554a20fb3542c62688ae46049c9225a965 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 29 Aug 2012 09:00:01 -0400 Subject: [PATCH 14/44] close the race in nlmsvc_free_block() we need to grab mutex before the reference counter reaches 0 Signed-off-by: Al Viro --- fs/lockd/svclock.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index fb1a2bedbe97..8d80c990dffd 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -289,7 +289,6 @@ static void nlmsvc_free_block(struct kref *kref) dprintk("lockd: freeing block %p...\n", block); /* Remove block from file's list of blocks */ - mutex_lock(&file->f_mutex); list_del_init(&block->b_flist); mutex_unlock(&file->f_mutex); @@ -303,7 +302,7 @@ static void nlmsvc_free_block(struct kref *kref) static void nlmsvc_release_block(struct nlm_block *block) { if (block != NULL) - kref_put(&block->b_count, nlmsvc_free_block); + kref_put_mutex(&block->b_count, nlmsvc_free_block, &block->b_file->f_mutex); } /* From bef83de5a0e3031ff6ff9584b458611a7ac01b85 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 24 Sep 2012 21:23:25 -0600 Subject: [PATCH 15/44] iommu: static inline iommu group stub functions Signed-off-by: Alex Williamson Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7e83370e6fd2..f3b99e1c1042 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -256,72 +256,78 @@ static inline void iommu_set_fault_handler(struct iommu_domain *domain, { } -int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) +static inline int iommu_attach_group(struct iommu_domain *domain, + struct iommu_group *group) { return -ENODEV; } -void iommu_detach_group(struct iommu_domain *domain, struct iommu_group *group) +static inline void iommu_detach_group(struct iommu_domain *domain, + struct iommu_group *group) { } -struct iommu_group *iommu_group_alloc(void) +static inline struct iommu_group *iommu_group_alloc(void) { return ERR_PTR(-ENODEV); } -void *iommu_group_get_iommudata(struct iommu_group *group) +static inline void *iommu_group_get_iommudata(struct iommu_group *group) { return NULL; } -void iommu_group_set_iommudata(struct iommu_group *group, void *iommu_data, - void (*release)(void *iommu_data)) +static inline void iommu_group_set_iommudata(struct iommu_group *group, + void *iommu_data, + void (*release)(void *iommu_data)) { } -int iommu_group_set_name(struct iommu_group *group, const char *name) +static inline int iommu_group_set_name(struct iommu_group *group, + const char *name) { return -ENODEV; } -int iommu_group_add_device(struct iommu_group *group, struct device *dev) +static inline int iommu_group_add_device(struct iommu_group *group, + struct device *dev) { return -ENODEV; } -void iommu_group_remove_device(struct device *dev) +static inline void iommu_group_remove_device(struct device *dev) { } -int iommu_group_for_each_dev(struct iommu_group *group, void *data, - int (*fn)(struct device *, void *)) +static inline int iommu_group_for_each_dev(struct iommu_group *group, + void *data, + int (*fn)(struct device *, void *)) { return -ENODEV; } -struct iommu_group *iommu_group_get(struct device *dev) +static inline struct iommu_group *iommu_group_get(struct device *dev) { return NULL; } -void iommu_group_put(struct iommu_group *group) +static inline void iommu_group_put(struct iommu_group *group) { } -int iommu_group_register_notifier(struct iommu_group *group, - struct notifier_block *nb) +static inline int iommu_group_register_notifier(struct iommu_group *group, + struct notifier_block *nb) { return -ENODEV; } -int iommu_group_unregister_notifier(struct iommu_group *group, - struct notifier_block *nb) +static inline int iommu_group_unregister_notifier(struct iommu_group *group, + struct notifier_block *nb) { return 0; } -int iommu_group_id(struct iommu_group *group) +static inline int iommu_group_id(struct iommu_group *group) { return -ENODEV; } From 82e6bfe2fbc4d48852114c4f979137cd5bf1d1a8 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Fri, 21 Sep 2012 22:26:52 +0000 Subject: [PATCH 16/44] netfilter: xt_limit: have r->cost != 0 case work Commit v2.6.19-rc1~1272^2~41 tells us that r->cost != 0 can happen when a running state is saved to userspace and then reinstated from there. Make sure that private xt_limit area is initialized with correct values. Otherwise, random matchings due to use of uninitialized memory. Signed-off-by: Jan Engelhardt Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_limit.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index 5c22ce8ab309..a4c1e4528cac 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -117,11 +117,11 @@ static int limit_mt_check(const struct xt_mtchk_param *par) /* For SMP, we only want to use one set of state. */ r->master = priv; + /* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies * + 128. */ + priv->prev = jiffies; + priv->credit = user2credits(r->avg * r->burst); /* Credits full. */ if (r->cost == 0) { - /* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies * - 128. */ - priv->prev = jiffies; - priv->credit = user2credits(r->avg * r->burst); /* Credits full. */ r->credit_cap = priv->credit; /* Credits full. */ r->cost = user2credits(r->avg); } From 307615a26e95406c42c95916a66ba50434567e0f Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 26 Sep 2012 23:45:39 +0100 Subject: [PATCH 17/44] dm thin: do not set discard_zeroes_data The dm thin pool target claims to support the zeroing of discarded data areas. This turns out to be incorrect when processing discards that do not exactly cover a complete number of blocks, so the target must always set discard_zeroes_data_unsupported. The thin pool target will zero blocks when they are allocated if the skip_block_zeroing feature is not specified. The block layer may send a discard that only partly covers a block. If a thin pool block is partially discarded then there is no guarantee that the discarded data will get zeroed before it is accessed again. Due to this, thin devices cannot claim discards will always zero data. Signed-off-by: Mike Snitzer Signed-off-by: Joe Thornber Cc: stable@vger.kernel.org # 3.4+ Signed-off-by: Alasdair G Kergon --- drivers/md/dm-thin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index af1fc3b2c2ad..d4209231069d 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2274,6 +2274,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) * thin devices' discard limits consistent). */ ti->discards_supported = true; + ti->discard_zeroes_data_unsupported = true; } ti->private = pt; @@ -2745,7 +2746,6 @@ static void set_discard_limits(struct pool *pool, struct queue_limits *limits) * boundary is not sent to this target. */ limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; - limits->discard_zeroes_data = pool->pf.zero_new_blocks; } static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) From 7ba10aa6fbac7158a50bec142132b04bc480bb29 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 26 Sep 2012 23:45:41 +0100 Subject: [PATCH 18/44] dm mpath: only retry ioctl when no paths if queue_if_no_path set When there are no paths and multipath receives an ioctl, it waits until a path becomes available. This behaviour is incorrect if the "queue_if_no_path" setting was not specified, as then the ioctl should be rejected immediately, which this patch now does. commit 35991652b ("dm mpath: allow ioctls to trigger pg init") should have checked if queue_if_no_path was configured before queueing IO. Checking for the queue_if_no_path feature, like is done in map_io(), allows the following table load to work without blocking in the multipath_ioctl retry loop: echo "0 1024 multipath 0 0 0 0" | dmsetup create mpath_nodevs Without this fix the multipath_ioctl will block with the following stack trace: blkid D 0000000000000002 0 23936 1 0x00000000 ffff8802b89e5cd8 0000000000000082 ffff8802b89e5fd8 0000000000012440 ffff8802b89e4010 0000000000012440 0000000000012440 0000000000012440 ffff8802b89e5fd8 0000000000012440 ffff88030c2aab30 ffff880325794040 Call Trace: [] schedule+0x29/0x70 [] schedule_timeout+0x182/0x2e0 [] ? lock_timer_base+0x70/0x70 [] schedule_timeout_uninterruptible+0x1e/0x20 [] msleep+0x20/0x30 [] multipath_ioctl+0x109/0x170 [dm_multipath] [] dm_blk_ioctl+0xbc/0xd0 [dm_mod] [] __blkdev_driver_ioctl+0x28/0x30 [] blkdev_ioctl+0xce/0x730 [] block_ioctl+0x3c/0x40 [] do_vfs_ioctl+0x8c/0x340 [] ? sys_newfstat+0x33/0x40 [] sys_ioctl+0xa1/0xb0 [] system_call_fastpath+0x16/0x1b Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org # 3.5+ Acked-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-mpath.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index d8abb90a6c2f..034233eefc82 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1555,6 +1555,7 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg) { struct multipath *m = ti->private; + struct pgpath *pgpath; struct block_device *bdev; fmode_t mode; unsigned long flags; @@ -1570,12 +1571,14 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, if (!m->current_pgpath) __choose_pgpath(m, 0); - if (m->current_pgpath) { - bdev = m->current_pgpath->path.dev->bdev; - mode = m->current_pgpath->path.dev->mode; + pgpath = m->current_pgpath; + + if (pgpath) { + bdev = pgpath->path.dev->bdev; + mode = pgpath->path.dev->mode; } - if (m->queue_io) + if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path)) r = -EAGAIN; else if (!bdev) r = -EIO; From ba1cbad93dd47223b1f3b8edd50dd9ef2abcb2ed Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 26 Sep 2012 23:45:42 +0100 Subject: [PATCH 19/44] dm: handle requests beyond end of device instead of using BUG_ON The access beyond the end of device BUG_ON that was introduced to dm_request_fn via commit 29e4013de7ad950280e4b2208 ("dm: implement REQ_FLUSH/FUA support for request-based dm") was an overly drastic (but simple) response to this situation. I have received a report that this BUG_ON was hit and now think it would be better to use dm_kill_unmapped_request() to fail the clone and original request with -EIO. map_request() will assign the valid target returned by dm_table_find_target to tio->ti. But when the target isn't valid tio->ti is never assigned (because map_request isn't called); so add a check for tio->ti != NULL to dm_done(). Reported-by: Mike Christie Signed-off-by: Mike Snitzer Signed-off-by: Jun'ichi Nomura Cc: stable@vger.kernel.org # v2.6.37+ Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 56 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 4e09b6ff5b49..6748e0c4df1f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -865,10 +865,14 @@ static void dm_done(struct request *clone, int error, bool mapped) { int r = error; struct dm_rq_target_io *tio = clone->end_io_data; - dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; + dm_request_endio_fn rq_end_io = NULL; - if (mapped && rq_end_io) - r = rq_end_io(tio->ti, clone, error, &tio->info); + if (tio->ti) { + rq_end_io = tio->ti->type->rq_end_io; + + if (mapped && rq_end_io) + r = rq_end_io(tio->ti, clone, error, &tio->info); + } if (r <= 0) /* The target wants to complete the I/O */ @@ -1588,15 +1592,6 @@ static int map_request(struct dm_target *ti, struct request *clone, int r, requeued = 0; struct dm_rq_target_io *tio = clone->end_io_data; - /* - * Hold the md reference here for the in-flight I/O. - * We can't rely on the reference count by device opener, - * because the device may be closed during the request completion - * when all bios are completed. - * See the comment in rq_completed() too. - */ - dm_get(md); - tio->ti = ti; r = ti->type->map_rq(ti, clone, &tio->info); switch (r) { @@ -1628,6 +1623,26 @@ static int map_request(struct dm_target *ti, struct request *clone, return requeued; } +static struct request *dm_start_request(struct mapped_device *md, struct request *orig) +{ + struct request *clone; + + blk_start_request(orig); + clone = orig->special; + atomic_inc(&md->pending[rq_data_dir(clone)]); + + /* + * Hold the md reference here for the in-flight I/O. + * We can't rely on the reference count by device opener, + * because the device may be closed during the request completion + * when all bios are completed. + * See the comment in rq_completed() too. + */ + dm_get(md); + + return clone; +} + /* * q->request_fn for request-based dm. * Called with the queue lock held. @@ -1657,14 +1672,21 @@ static void dm_request_fn(struct request_queue *q) pos = blk_rq_pos(rq); ti = dm_table_find_target(map, pos); - BUG_ON(!dm_target_is_valid(ti)); + if (!dm_target_is_valid(ti)) { + /* + * Must perform setup, that dm_done() requires, + * before calling dm_kill_unmapped_request + */ + DMERR_LIMIT("request attempted access beyond the end of device"); + clone = dm_start_request(md, rq); + dm_kill_unmapped_request(clone, -EIO); + continue; + } if (ti->type->busy && ti->type->busy(ti)) goto delay_and_out; - blk_start_request(rq); - clone = rq->special; - atomic_inc(&md->pending[rq_data_dir(clone)]); + clone = dm_start_request(md, rq); spin_unlock(q->queue_lock); if (map_request(ti, clone, md)) @@ -1684,8 +1706,6 @@ static void dm_request_fn(struct request_queue *q) blk_delay_queue(q, HZ / 10); out: dm_table_put(map); - - return; } int dm_underlying_device_busy(struct request_queue *q) From c3c4555edd10dbc0b388a0125b9c50de5e79af05 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Wed, 26 Sep 2012 23:45:43 +0100 Subject: [PATCH 20/44] dm table: clear add_random unless all devices have it set Always clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not have it set. Otherwise devices with predictable characteristics may contribute entropy. QUEUE_FLAG_ADD_RANDOM specifies whether or not queue IO timings contribute to the random pool. For bio-based targets this flag is always 0 because such devices have no real queue. For request-based devices this flag was always set to 1 by default. Now set it according to the flags on underlying devices. If there is at least one device which should not contribute, set the flag to zero: If a device, such as fast SSD storage, is not suitable for supplying entropy, a request-based queue stacked over it will not be either. Because the checking logic is exactly same as for the rotational flag, share the iteration function with device_is_nonrot(). Signed-off-by: Milan Broz Cc: stable@vger.kernel.org Signed-off-by: Alasdair G Kergon --- drivers/md/dm-table.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index f90069029aae..77b90ae66991 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1354,17 +1354,25 @@ static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev, return q && blk_queue_nonrot(q); } -static bool dm_table_is_nonrot(struct dm_table *t) +static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && !blk_queue_add_random(q); +} + +static bool dm_table_all_devices_attribute(struct dm_table *t, + iterate_devices_callout_fn func) { struct dm_target *ti; unsigned i = 0; - /* Ensure that all underlying device are non-rotational. */ while (i < dm_table_get_num_targets(t)) { ti = dm_table_get_target(t, i++); if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, device_is_nonrot, NULL)) + !ti->type->iterate_devices(ti, func, NULL)) return 0; } @@ -1396,13 +1404,23 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (!dm_table_discard_zeroes_data(t)) q->limits.discard_zeroes_data = 0; - if (dm_table_is_nonrot(t)) + /* Ensure that all underlying devices are non-rotational. */ + if (dm_table_all_devices_attribute(t, device_is_nonrot)) queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); else queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q); dm_table_set_integrity(t); + /* + * Determine whether or not this queue's I/O timings contribute + * to the entropy pool, Only request-based targets use this. + * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not + * have it set. + */ + if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random)) + queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); + /* * QUEUE_FLAG_STACKABLE must be set after all queue settings are * visible to other CPUs because, once the flag is set, incoming bios From 3ae706561637331aa578e52bb89ecbba5edcb7a9 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 26 Sep 2012 23:45:45 +0100 Subject: [PATCH 21/44] dm: retain table limits when swapping to new table with no devices Add a safety net that will re-use the DM device's existing limits in the event that DM device has a temporary table that doesn't have any component devices. This is to reduce the chance that requests not respecting the hardware limits will reach the device. DM recalculates queue limits based only on devices which currently exist in the table. This creates a problem in the event all devices are temporarily removed such as all paths being lost in multipath. DM will reset the limits to the maximum permissible, which can then assemble requests which exceed the limits of the paths when the paths are restored. The request will fail the blk_rq_check_limits() test when sent to a path with lower limits, and will be retried without end by multipath. This became a much bigger issue after v3.6 commit fe86cdcef ("block: do not artificially constrain max_sectors for stacking drivers"). Reported-by: David Jeffery Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-table.c | 35 +++++++++++++++++++++++++++++++++++ drivers/md/dm.c | 15 ++++++++++++++- drivers/md/dm.h | 1 + 3 files changed, 50 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 77b90ae66991..100368eb7991 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1212,6 +1212,41 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) return &t->targets[(KEYS_PER_NODE * n) + k]; } +static int count_device(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + unsigned *num_devices = data; + + (*num_devices)++; + + return 0; +} + +/* + * Check whether a table has no data devices attached using each + * target's iterate_devices method. + * Returns false if the result is unknown because a target doesn't + * support iterate_devices. + */ +bool dm_table_has_no_data_devices(struct dm_table *table) +{ + struct dm_target *uninitialized_var(ti); + unsigned i = 0, num_devices = 0; + + while (i < dm_table_get_num_targets(table)) { + ti = dm_table_get_target(table, i++); + + if (!ti->type->iterate_devices) + return false; + + ti->type->iterate_devices(ti, count_device, &num_devices); + if (num_devices) + return false; + } + + return true; +} + /* * Establish the new table's queue_limits and validate them. */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 6748e0c4df1f..67ffa391edcf 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2429,7 +2429,7 @@ static void dm_queue_flush(struct mapped_device *md) */ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) { - struct dm_table *map = ERR_PTR(-EINVAL); + struct dm_table *live_map, *map = ERR_PTR(-EINVAL); struct queue_limits limits; int r; @@ -2439,6 +2439,19 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) if (!dm_suspended_md(md)) goto out; + /* + * If the new table has no data devices, retain the existing limits. + * This helps multipath with queue_if_no_path if all paths disappear, + * then new I/O is queued based on these limits, and then some paths + * reappear. + */ + if (dm_table_has_no_data_devices(table)) { + live_map = dm_get_live_table(md); + if (live_map) + limits = md->queue->limits; + dm_table_put(live_map); + } + r = dm_calculate_queue_limits(table, &limits); if (r) { map = ERR_PTR(r); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 52eef493d266..6a99fefaa743 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -54,6 +54,7 @@ void dm_table_event_callback(struct dm_table *t, void (*fn)(void *), void *context); struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); +bool dm_table_has_no_data_devices(struct dm_table *table); int dm_calculate_queue_limits(struct dm_table *table, struct queue_limits *limits); void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, From 9bc142dd755d360c08a91ecb107d218787a2e9db Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 26 Sep 2012 23:45:46 +0100 Subject: [PATCH 22/44] dm thin: tidy discard support A little thin discard code refactoring to make the next patch (dm thin: fix discard support for data devices) more readable. Pull out a couple of functions (and uses bools instead of unsigned for features). No functional changes. Signed-off-by: Mike Snitzer Signed-off-by: Joe Thornber Signed-off-by: Alasdair G Kergon --- drivers/md/dm-thin.c | 64 +++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 25 deletions(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index d4209231069d..e99f4134dbd7 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -509,9 +509,9 @@ enum pool_mode { struct pool_features { enum pool_mode mode; - unsigned zero_new_blocks:1; - unsigned discard_enabled:1; - unsigned discard_passdown:1; + bool zero_new_blocks:1; + bool discard_enabled:1; + bool discard_passdown:1; }; struct thin_c; @@ -1839,6 +1839,32 @@ static void __requeue_bios(struct pool *pool) /*---------------------------------------------------------------- * Binding of control targets to a pool object *--------------------------------------------------------------*/ +static bool data_dev_supports_discard(struct pool_c *pt) +{ + struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); + + return q && blk_queue_discard(q); +} + +/* + * If discard_passdown was enabled verify that the data device + * supports discards. Disable discard_passdown if not; otherwise + * -EOPNOTSUPP will be returned. + */ +static void disable_passdown_if_not_supported(struct pool_c *pt, + struct pool_features *pf) +{ + char buf[BDEVNAME_SIZE]; + + if (!pf->discard_passdown || data_dev_supports_discard(pt)) + return; + + DMWARN("Discard unsupported by data device (%s): Disabling discard passdown.", + bdevname(pt->data_dev->bdev, buf)); + + pf->discard_passdown = false; +} + static int bind_control_target(struct pool *pool, struct dm_target *ti) { struct pool_c *pt = ti->private; @@ -1855,23 +1881,9 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti) pool->ti = ti; pool->low_water_blocks = pt->low_water_blocks; pool->pf = pt->pf; - set_pool_mode(pool, new_mode); - /* - * If discard_passdown was enabled verify that the data device - * supports discards. Disable discard_passdown if not; otherwise - * -EOPNOTSUPP will be returned. - */ - /* FIXME: pull this out into a sep fn. */ - if (pt->pf.discard_passdown) { - struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); - if (!q || !blk_queue_discard(q)) { - char buf[BDEVNAME_SIZE]; - DMWARN("Discard unsupported by data device (%s): Disabling discard passdown.", - bdevname(pt->data_dev->bdev, buf)); - pool->pf.discard_passdown = 0; - } - } + disable_passdown_if_not_supported(pt, &pool->pf); + set_pool_mode(pool, new_mode); return 0; } @@ -1889,9 +1901,9 @@ static void unbind_control_target(struct pool *pool, struct dm_target *ti) static void pool_features_init(struct pool_features *pf) { pf->mode = PM_WRITE; - pf->zero_new_blocks = 1; - pf->discard_enabled = 1; - pf->discard_passdown = 1; + pf->zero_new_blocks = true; + pf->discard_enabled = true; + pf->discard_passdown = true; } static void __pool_destroy(struct pool *pool) @@ -2119,13 +2131,13 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, argc--; if (!strcasecmp(arg_name, "skip_block_zeroing")) - pf->zero_new_blocks = 0; + pf->zero_new_blocks = false; else if (!strcasecmp(arg_name, "ignore_discard")) - pf->discard_enabled = 0; + pf->discard_enabled = false; else if (!strcasecmp(arg_name, "no_discard_passdown")) - pf->discard_passdown = 0; + pf->discard_passdown = false; else if (!strcasecmp(arg_name, "read_only")) pf->mode = PM_READ_ONLY; @@ -2261,6 +2273,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) pt->low_water_blocks = low_water_blocks; pt->pf = pf; ti->num_flush_requests = 1; + /* * Only need to enable discards if the pool should pass * them down to the data device. The thin device's discard @@ -2268,6 +2281,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) */ if (pf.discard_enabled && pf.discard_passdown) { ti->num_discard_requests = 1; + /* * Setting 'discards_supported' circumvents the normal * stacking of discard limits (this keeps the pool and From 0424caa14508f19ca8093d36c15250e0331a3a0a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 26 Sep 2012 23:45:47 +0100 Subject: [PATCH 23/44] dm thin: fix discard support for data devices The discard limits that get established for a thin-pool or thin device may be incompatible with the pool's data device. Avoid this by checking the discard limits of the pool's data device. If an incompatibility is found then the pool's 'discard passdown' feature is disabled. Change thin_io_hints to ensure that a thin device always uses the same queue limits as its pool device. Introduce requested_pf to track whether or not the table line originally contained the no_discard_passdown flag and use this directly for table output. We prepare the correct setting for discard_passdown directly in bind_control_target (called from pool_io_hints) and store it in adjusted_pf rather than waiting until we have access to pool->pf in pool_preresume. Signed-off-by: Mike Snitzer Signed-off-by: Joe Thornber Signed-off-by: Alasdair G Kergon --- drivers/md/dm-thin.c | 87 +++++++++++++++++++++++++++++--------------- 1 file changed, 57 insertions(+), 30 deletions(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index e99f4134dbd7..c29410af1e22 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -580,7 +580,8 @@ struct pool_c { struct dm_target_callbacks callbacks; dm_block_t low_water_blocks; - struct pool_features pf; + struct pool_features requested_pf; /* Features requested during table load */ + struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */ }; /* @@ -1848,21 +1849,36 @@ static bool data_dev_supports_discard(struct pool_c *pt) /* * If discard_passdown was enabled verify that the data device - * supports discards. Disable discard_passdown if not; otherwise - * -EOPNOTSUPP will be returned. + * supports discards. Disable discard_passdown if not. */ -static void disable_passdown_if_not_supported(struct pool_c *pt, - struct pool_features *pf) +static void disable_passdown_if_not_supported(struct pool_c *pt) { + struct pool *pool = pt->pool; + struct block_device *data_bdev = pt->data_dev->bdev; + struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits; + sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT; + const char *reason = NULL; char buf[BDEVNAME_SIZE]; - if (!pf->discard_passdown || data_dev_supports_discard(pt)) + if (!pt->adjusted_pf.discard_passdown) return; - DMWARN("Discard unsupported by data device (%s): Disabling discard passdown.", - bdevname(pt->data_dev->bdev, buf)); + if (!data_dev_supports_discard(pt)) + reason = "discard unsupported"; - pf->discard_passdown = false; + else if (data_limits->max_discard_sectors < pool->sectors_per_block) + reason = "max discard sectors smaller than a block"; + + else if (data_limits->discard_granularity > block_size) + reason = "discard granularity larger than a block"; + + else if (block_size & (data_limits->discard_granularity - 1)) + reason = "discard granularity not a factor of block size"; + + if (reason) { + DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason); + pt->adjusted_pf.discard_passdown = false; + } } static int bind_control_target(struct pool *pool, struct dm_target *ti) @@ -1873,16 +1889,15 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti) * We want to make sure that degraded pools are never upgraded. */ enum pool_mode old_mode = pool->pf.mode; - enum pool_mode new_mode = pt->pf.mode; + enum pool_mode new_mode = pt->adjusted_pf.mode; if (old_mode > new_mode) new_mode = old_mode; pool->ti = ti; pool->low_water_blocks = pt->low_water_blocks; - pool->pf = pt->pf; + pool->pf = pt->adjusted_pf; - disable_passdown_if_not_supported(pt, &pool->pf); set_pool_mode(pool, new_mode); return 0; @@ -2271,7 +2286,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) pt->metadata_dev = metadata_dev; pt->data_dev = data_dev; pt->low_water_blocks = low_water_blocks; - pt->pf = pf; + pt->adjusted_pf = pt->requested_pf = pf; ti->num_flush_requests = 1; /* @@ -2718,7 +2733,7 @@ static int pool_status(struct dm_target *ti, status_type_t type, format_dev_t(buf2, pt->data_dev->bdev->bd_dev), (unsigned long)pool->sectors_per_block, (unsigned long long)pt->low_water_blocks); - emit_flags(&pt->pf, result, sz, maxlen); + emit_flags(&pt->requested_pf, result, sz, maxlen); break; } @@ -2747,19 +2762,21 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm, return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } -static void set_discard_limits(struct pool *pool, struct queue_limits *limits) +static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits) { - /* - * FIXME: these limits may be incompatible with the pool's data device - */ + struct pool *pool = pt->pool; + struct queue_limits *data_limits; + limits->max_discard_sectors = pool->sectors_per_block; /* - * This is just a hint, and not enforced. We have to cope with - * bios that cover a block partially. A discard that spans a block - * boundary is not sent to this target. + * discard_granularity is just a hint, and not enforced. */ - limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; + if (pt->adjusted_pf.discard_passdown) { + data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits; + limits->discard_granularity = data_limits->discard_granularity; + } else + limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; } static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) @@ -2769,15 +2786,25 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_min(limits, 0); blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); - if (pool->pf.discard_enabled) - set_discard_limits(pool, limits); + + /* + * pt->adjusted_pf is a staging area for the actual features to use. + * They get transferred to the live pool in bind_control_target() + * called from pool_preresume(). + */ + if (!pt->adjusted_pf.discard_enabled) + return; + + disable_passdown_if_not_supported(pt); + + set_discard_limits(pt, limits); } static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 3, 0}, + .version = {1, 4, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -3056,19 +3083,19 @@ static int thin_iterate_devices(struct dm_target *ti, return 0; } +/* + * A thin device always inherits its queue limits from its pool. + */ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct thin_c *tc = ti->private; - struct pool *pool = tc->pool; - blk_limits_io_min(limits, 0); - blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); - set_discard_limits(pool, limits); + *limits = bdev_get_queue(tc->pool_dev->bdev)->limits; } static struct target_type thin_target = { .name = "thin", - .version = {1, 3, 0}, + .version = {1, 4, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, From 1d55f6bcc0331d744cd5b56c4ee79e3809438161 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 26 Sep 2012 23:45:48 +0100 Subject: [PATCH 24/44] dm verity: fix overflow check This patch fixes sector_t overflow checking in dm-verity. Without this patch, the code checks for overflow only if sector_t is smaller than long long, not if sector_t and long long have the same size. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Alasdair G Kergon --- drivers/md/dm-verity.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 254d19268ad2..892ae2766aa6 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -718,8 +718,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) v->hash_dev_block_bits = ffs(num) - 1; if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 || - num_ll << (v->data_dev_block_bits - SECTOR_SHIFT) != - (sector_t)num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) { + (sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) + >> (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll) { ti->error = "Invalid data blocks"; r = -EINVAL; goto bad; @@ -733,8 +733,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) } if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 || - num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT) != - (sector_t)num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT)) { + (sector_t)(num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT)) + >> (v->hash_dev_block_bits - SECTOR_SHIFT) != num_ll) { ti->error = "Invalid hash start"; r = -EINVAL; goto bad; From 424597921ac1fc2a475a656ea5895cbe78f25d0d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 3 Sep 2012 03:24:18 -0400 Subject: [PATCH 25/44] um: take cleaning singlestep to start_thread() ... assuming it's needed to be done at all Signed-off-by: Al Viro --- arch/um/kernel/exec.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c index 6cade9366364..4cab0c78684b 100644 --- a/arch/um/kernel/exec.c +++ b/arch/um/kernel/exec.c @@ -41,32 +41,18 @@ void start_thread(struct pt_regs *regs, unsigned long eip, unsigned long esp) { PT_REGS_IP(regs) = eip; PT_REGS_SP(regs) = esp; + current->ptrace &= ~PT_DTRACE; +#ifdef SUBARCH_EXECVE1 + SUBARCH_EXECVE1(regs->regs); +#endif } EXPORT_SYMBOL(start_thread); -static long execve1(const char *file, - const char __user *const __user *argv, - const char __user *const __user *env) -{ - long error; - - error = do_execve(file, argv, env, ¤t->thread.regs); - if (error == 0) { - task_lock(current); - current->ptrace &= ~PT_DTRACE; -#ifdef SUBARCH_EXECVE1 - SUBARCH_EXECVE1(¤t->thread.regs.regs); -#endif - task_unlock(current); - } - return error; -} - long um_execve(const char *file, const char __user *const __user *argv, const char __user *const __user *env) { long err; - err = execve1(file, argv, env); + err = do_execve(file, argv, env, ¤t->thread.regs); if (!err) UML_LONGJMP(current->thread.exec_buf, 1); return err; @@ -81,7 +67,7 @@ long sys_execve(const char __user *file, const char __user *const __user *argv, filename = getname(file); error = PTR_ERR(filename); if (IS_ERR(filename)) goto out; - error = execve1(filename, argv, env); + error = do_execve(filename, argv, env, ¤t->thread.regs); putname(filename); out: return error; From bf56d57638452a8ee9241c9ca082d59cd2ca4cc8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 5 Sep 2012 23:20:33 -0400 Subject: [PATCH 26/44] um: don't leak floating point state and segment registers on execve() Signed-off-by: Al Viro --- arch/um/kernel/exec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c index 4cab0c78684b..8c82786da823 100644 --- a/arch/um/kernel/exec.c +++ b/arch/um/kernel/exec.c @@ -39,6 +39,7 @@ void flush_thread(void) void start_thread(struct pt_regs *regs, unsigned long eip, unsigned long esp) { + get_safe_registers(regs->regs.gp, regs->regs.fp); PT_REGS_IP(regs) = eip; PT_REGS_SP(regs) = esp; current->ptrace &= ~PT_DTRACE; From f9a38eace4498a5e9f6d2cdfc879d5444edc3a5f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 6 Sep 2012 13:39:47 -0400 Subject: [PATCH 27/44] um: let signal_delivered() do SIGTRAP on singlestepping into handler ... rather than duplicating that in sigframe setup code (and doing that inconsistently, at that) Signed-off-by: Al Viro --- arch/um/kernel/signal.c | 6 +++++- arch/x86/um/signal.c | 6 ------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c index 7362d58efc29..cc9c2350e417 100644 --- a/arch/um/kernel/signal.c +++ b/arch/um/kernel/signal.c @@ -22,9 +22,13 @@ static void handle_signal(struct pt_regs *regs, unsigned long signr, struct k_sigaction *ka, siginfo_t *info) { sigset_t *oldset = sigmask_to_save(); + int singlestep = 0; unsigned long sp; int err; + if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) + singlestep = 1; + /* Did we come from a system call? */ if (PT_REGS_SYSCALL_NR(regs) >= 0) { /* If so, check system call restarting.. */ @@ -61,7 +65,7 @@ static void handle_signal(struct pt_regs *regs, unsigned long signr, if (err) force_sigsegv(signr, current); else - signal_delivered(signr, info, ka, regs, 0); + signal_delivered(signr, info, ka, regs, singlestep); } static int kern_do_signal(struct pt_regs *regs) diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index a508cea13503..ba7363ecf896 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -416,9 +416,6 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig, PT_REGS_AX(regs) = (unsigned long) sig; PT_REGS_DX(regs) = (unsigned long) 0; PT_REGS_CX(regs) = (unsigned long) 0; - - if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) - ptrace_notify(SIGTRAP); return 0; } @@ -466,9 +463,6 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, PT_REGS_AX(regs) = (unsigned long) sig; PT_REGS_DX(regs) = (unsigned long) &frame->info; PT_REGS_CX(regs) = (unsigned long) &frame->uc; - - if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) - ptrace_notify(SIGTRAP); return 0; } From d2ce4e92fa4f79a5fdb4cc912b411280afe21697 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 20 Sep 2012 09:28:25 -0400 Subject: [PATCH 28/44] um: kill thread->forking we only use that to tell copy_thread() done by syscall from that done by kernel_thread(). However, it's easier to do simply by checking PF_KTHREAD in thread flags. Merge sys_clone() guts for 32bit and 64bit, while we are at it... Signed-off-by: Al Viro --- arch/um/include/asm/processor-generic.h | 9 --------- arch/um/kernel/process.c | 8 ++++---- arch/um/kernel/syscall.c | 24 +++++++++++----------- arch/x86/um/shared/sysdep/syscalls.h | 2 ++ arch/x86/um/sys_call_table_32.c | 2 +- arch/x86/um/syscalls_32.c | 27 +++++++------------------ arch/x86/um/syscalls_64.c | 23 +++------------------ 7 files changed, 29 insertions(+), 66 deletions(-) diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h index 69f1c57a8d0d..33a6a2423bd2 100644 --- a/arch/um/include/asm/processor-generic.h +++ b/arch/um/include/asm/processor-generic.h @@ -20,14 +20,6 @@ struct mm_struct; struct thread_struct { struct task_struct *saved_task; - /* - * This flag is set to 1 before calling do_fork (and analyzed in - * copy_thread) to mark that we are begin called from userspace (fork / - * vfork / clone), and reset to 0 after. It is left to 0 when called - * from kernelspace (i.e. kernel_thread() or fork_idle(), - * as of 2.6.11). - */ - int forking; struct pt_regs regs; int singlestep_syscall; void *fault_addr; @@ -58,7 +50,6 @@ struct thread_struct { #define INIT_THREAD \ { \ - .forking = 0, \ .regs = EMPTY_REGS, \ .fault_addr = NULL, \ .prev_sched = NULL, \ diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 57fc7028714a..c5f5afa50745 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -181,11 +181,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, struct pt_regs *regs) { void (*handler)(void); + int kthread = current->flags & PF_KTHREAD; int ret = 0; p->thread = (struct thread_struct) INIT_THREAD; - if (current->thread.forking) { + if (!kthread) { memcpy(&p->thread.regs.regs, ®s->regs, sizeof(p->thread.regs.regs)); PT_REGS_SET_SYSCALL_RETURN(&p->thread.regs, 0); @@ -195,8 +196,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, handler = fork_handler; arch_copy_thread(¤t->thread.arch, &p->thread.arch); - } - else { + } else { get_safe_registers(p->thread.regs.regs.gp, p->thread.regs.regs.fp); p->thread.request.u.thread = current->thread.request.u.thread; handler = new_thread_handler; @@ -204,7 +204,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, new_thread(task_stack_page(p), &p->thread.switch_buf, handler); - if (current->thread.forking) { + if (!kthread) { clear_flushed_tls(p); /* diff --git a/arch/um/kernel/syscall.c b/arch/um/kernel/syscall.c index f958cb876ee3..a4c6d8eee74c 100644 --- a/arch/um/kernel/syscall.c +++ b/arch/um/kernel/syscall.c @@ -17,25 +17,25 @@ long sys_fork(void) { - long ret; - - current->thread.forking = 1; - ret = do_fork(SIGCHLD, UPT_SP(¤t->thread.regs.regs), + return do_fork(SIGCHLD, UPT_SP(¤t->thread.regs.regs), ¤t->thread.regs, 0, NULL, NULL); - current->thread.forking = 0; - return ret; } long sys_vfork(void) { - long ret; - - current->thread.forking = 1; - ret = do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, UPT_SP(¤t->thread.regs.regs), ¤t->thread.regs, 0, NULL, NULL); - current->thread.forking = 0; - return ret; +} + +long sys_clone(unsigned long clone_flags, unsigned long newsp, + void __user *parent_tid, void __user *child_tid) +{ + if (!newsp) + newsp = UPT_SP(¤t->thread.regs.regs); + + return do_fork(clone_flags, newsp, ¤t->thread.regs, 0, parent_tid, + child_tid); } long old_mmap(unsigned long addr, unsigned long len, diff --git a/arch/x86/um/shared/sysdep/syscalls.h b/arch/x86/um/shared/sysdep/syscalls.h index bd9a89b67e41..ca255a805ed9 100644 --- a/arch/x86/um/shared/sysdep/syscalls.h +++ b/arch/x86/um/shared/sysdep/syscalls.h @@ -1,3 +1,5 @@ +extern long sys_clone(unsigned long clone_flags, unsigned long newsp, + void __user *parent_tid, void __user *child_tid); #ifdef __i386__ #include "syscalls_32.h" #else diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index 68d1dc91b37b..b5408cecac6c 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -28,7 +28,7 @@ #define ptregs_execve sys_execve #define ptregs_iopl sys_iopl #define ptregs_vm86old sys_vm86old -#define ptregs_clone sys_clone +#define ptregs_clone i386_clone #define ptregs_vm86 sys_vm86 #define ptregs_sigaltstack sys_sigaltstack #define ptregs_vfork sys_vfork diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c index b853e8600b9d..db444c7218fe 100644 --- a/arch/x86/um/syscalls_32.c +++ b/arch/x86/um/syscalls_32.c @@ -3,37 +3,24 @@ * Licensed under the GPL */ -#include "linux/sched.h" -#include "linux/shm.h" -#include "linux/ipc.h" -#include "linux/syscalls.h" -#include "asm/mman.h" -#include "asm/uaccess.h" -#include "asm/unistd.h" +#include +#include /* * The prototype on i386 is: * - * int clone(int flags, void * child_stack, int * parent_tidptr, struct user_desc * newtls, int * child_tidptr) + * int clone(int flags, void * child_stack, int * parent_tidptr, struct user_desc * newtls * * and the "newtls" arg. on i386 is read by copy_thread directly from the * register saved on the stack. */ -long sys_clone(unsigned long clone_flags, unsigned long newsp, - int __user *parent_tid, void *newtls, int __user *child_tid) +long i386_clone(unsigned long clone_flags, unsigned long newsp, + int __user *parent_tid, void *newtls, int __user *child_tid) { - long ret; - - if (!newsp) - newsp = UPT_SP(¤t->thread.regs.regs); - - current->thread.forking = 1; - ret = do_fork(clone_flags, newsp, ¤t->thread.regs, 0, parent_tid, - child_tid); - current->thread.forking = 0; - return ret; + return sys_clone(clone_flags, newsp, parent_tid, child_tid); } + long sys_sigaction(int sig, const struct old_sigaction __user *act, struct old_sigaction __user *oact) { diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c index f3d82bb6e15a..adb08eb5c22a 100644 --- a/arch/x86/um/syscalls_64.c +++ b/arch/x86/um/syscalls_64.c @@ -5,12 +5,9 @@ * Licensed under the GPL */ -#include "linux/linkage.h" -#include "linux/personality.h" -#include "linux/utsname.h" -#include "asm/prctl.h" /* XXX This should get the constants from libc */ -#include "asm/uaccess.h" -#include "os.h" +#include +#include /* XXX This should get the constants from libc */ +#include long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr) { @@ -79,20 +76,6 @@ long sys_arch_prctl(int code, unsigned long addr) return arch_prctl(current, code, (unsigned long __user *) addr); } -long sys_clone(unsigned long clone_flags, unsigned long newsp, - void __user *parent_tid, void __user *child_tid) -{ - long ret; - - if (!newsp) - newsp = UPT_SP(¤t->thread.regs.regs); - current->thread.forking = 1; - ret = do_fork(clone_flags, newsp, ¤t->thread.regs, 0, parent_tid, - child_tid); - current->thread.forking = 0; - return ret; -} - void arch_switch_to(struct task_struct *to) { if ((to->thread.arch.fs == 0) || (to->mm == NULL)) From 7f8436a1269eaaf2d0b1054a325eddf4e14cb80d Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 24 Sep 2012 18:29:01 +0000 Subject: [PATCH 29/44] l2tp: fix return value check In case of error, the function genlmsg_put() returns NULL pointer not ERR_PTR(). The IS_ERR() test in the return value check should be replaced with NULL test. dpatch engine is used to auto generate this patch. (https://github.com/weiyj/dpatch) Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- net/l2tp/l2tp_netlink.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index d71cd9229a47..6f936358d664 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -80,8 +80,8 @@ static int l2tp_nl_cmd_noop(struct sk_buff *skb, struct genl_info *info) hdr = genlmsg_put(msg, info->snd_pid, info->snd_seq, &l2tp_nl_family, 0, L2TP_CMD_NOOP); - if (IS_ERR(hdr)) { - ret = PTR_ERR(hdr); + if (!hdr) { + ret = -EMSGSIZE; goto err_out; } @@ -250,8 +250,8 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 pid, u32 seq, int flags, hdr = genlmsg_put(skb, pid, seq, &l2tp_nl_family, flags, L2TP_CMD_TUNNEL_GET); - if (IS_ERR(hdr)) - return PTR_ERR(hdr); + if (!hdr) + return -EMSGSIZE; if (nla_put_u8(skb, L2TP_ATTR_PROTO_VERSION, tunnel->version) || nla_put_u32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id) || @@ -617,8 +617,8 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 pid, u32 seq, int flags sk = tunnel->sock; hdr = genlmsg_put(skb, pid, seq, &l2tp_nl_family, flags, L2TP_CMD_SESSION_GET); - if (IS_ERR(hdr)) - return PTR_ERR(hdr); + if (!hdr) + return -EMSGSIZE; if (nla_put_u32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id) || nla_put_u32(skb, L2TP_ATTR_SESSION_ID, session->session_id) || From a326e6dd315ee38c280b7ecf8a0d777952e384cd Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 24 Sep 2012 18:29:35 +0000 Subject: [PATCH 30/44] team: fix return value check In case of error, the function genlmsg_put() returns NULL pointer not ERR_PTR(). The IS_ERR() test in the return value check should be replaced with NULL test. dpatch engine is used to auto generate this patch. (https://github.com/weiyj/dpatch) Signed-off-by: Wei Yongjun Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/team/team.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 3ffe8a6e3c8b..f8cd61f449a4 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1653,8 +1653,8 @@ static int team_nl_cmd_noop(struct sk_buff *skb, struct genl_info *info) hdr = genlmsg_put(msg, info->snd_pid, info->snd_seq, &team_nl_family, 0, TEAM_CMD_NOOP); - if (IS_ERR(hdr)) { - err = PTR_ERR(hdr); + if (!hdr) { + err = -EMSGSIZE; goto err_msg_put; } @@ -1848,8 +1848,8 @@ static int team_nl_send_options_get(struct team *team, u32 pid, u32 seq, hdr = genlmsg_put(skb, pid, seq, &team_nl_family, flags | NLM_F_MULTI, TEAM_CMD_OPTIONS_GET); - if (IS_ERR(hdr)) - return PTR_ERR(hdr); + if (!hdr) + return -EMSGSIZE; if (nla_put_u32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex)) goto nla_put_failure; @@ -2068,8 +2068,8 @@ static int team_nl_fill_port_list_get(struct sk_buff *skb, hdr = genlmsg_put(skb, pid, seq, &team_nl_family, flags, TEAM_CMD_PORT_LIST_GET); - if (IS_ERR(hdr)) - return PTR_ERR(hdr); + if (!hdr) + return -EMSGSIZE; if (nla_put_u32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex)) goto nla_put_failure; From beb5ac20b3f90ffabac1eecd3c00205255df0728 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 26 Sep 2012 19:51:58 +0000 Subject: [PATCH 31/44] netdev: pasemi: fix return value check in pasemi_mac_phy_init() In case of error, the function of_phy_connect() returns NULL pointer not ERR_PTR(). The IS_ERR() test in the return value check should be replaced with NULL test. dpatch engine is used to auto generate this patch. (https://github.com/weiyj/dpatch) Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- drivers/net/ethernet/pasemi/pasemi_mac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/pasemi/pasemi_mac.c b/drivers/net/ethernet/pasemi/pasemi_mac.c index e559dfa06d6a..6fa74d530e44 100644 --- a/drivers/net/ethernet/pasemi/pasemi_mac.c +++ b/drivers/net/ethernet/pasemi/pasemi_mac.c @@ -1101,9 +1101,9 @@ static int pasemi_mac_phy_init(struct net_device *dev) phydev = of_phy_connect(dev, phy_dn, &pasemi_adjust_link, 0, PHY_INTERFACE_MODE_SGMII); - if (IS_ERR(phydev)) { + if (!phydev) { printk(KERN_ERR "%s: Could not attach to phy\n", dev->name); - return PTR_ERR(phydev); + return -ENODEV; } mac->phydev = phydev; From bbb35efcda41d589cfff5e2b08c5fb457791117c Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Thu, 27 Sep 2012 20:10:57 +0200 Subject: [PATCH 32/44] um: Fix IPC on um MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit c1d7e01d (ipc: use Kconfig options for __ARCH_WANT_[COMPAT_]IPC_PARSE_VERSION) forgot UML and broke IPC on it. Also UML has to select ARCH_WANT_IPC_PARSE_VERSION usin Kconfig. Reported-and-tested-by: Signed-off-by: Richard Weinberger --- arch/x86/um/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index 9926e11a772d..aeaff8bef2f1 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -21,6 +21,7 @@ config 64BIT config X86_32 def_bool !64BIT select HAVE_AOUT + select ARCH_WANT_IPC_PARSE_VERSION config X86_64 def_bool 64BIT From 9429ec96c2718c0d1e3317cf60a87a0405223814 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 16 Aug 2012 20:15:05 +0200 Subject: [PATCH 33/44] um: Preinclude include/linux/kern_levels.h The userspace part of UML uses the asm-offsets.h generator mechanism to create definitions for UM_KERN_ that match the in-kernel KERN_ constant definitions. As of commit 04d2c8c83d0e3ac5f78aeede51babb3236200112 ("printk: convert the format for KERN_ to a 2 byte pattern"), KERN_ is no longer expanded to the literal '""', but to '"\001" "LEVEL"', i.e. it contains two parts. However, the combo of DEFINE_STR() in arch/x86/um/shared/sysdep/kernel-offsets.h and sed-y in Kbuild doesn't support string literals consisting of multiple parts. Hence for all UM_KERN_ definitions, only the SOH character is retained in the actual definition, while the remainder ends up in the comment. E.g. in include/generated/asm-offsets.h we get #define UM_KERN_INFO "\001" /* "6" KERN_INFO */ instead of #define UM_KERN_INFO "\001" "6" /* KERN_INFO */ This causes spurious '^A' output in some kernel messages: Calibrating delay loop... 4640.76 BogoMIPS (lpj=23203840) pid_max: default: 32768 minimum: 301 Mount-cache hash table entries: 256 ^AChecking that host ptys support output SIGIO...Yes ^AChecking that host ptys support SIGIO on close...No, enabling workaround ^AUsing 2.6 host AIO NET: Registered protocol family 16 bio: create slab at 0 Switching to clocksource itimer To fix this: - Move the mapping from UM_KERN_ to KERN_ from arch/um/include/shared/common-offsets.h to arch/um/include/shared/user.h, which is preincluded for all userspace parts, - Preinclude include/linux/kern_levels.h for all userspace parts, to obtain the in-kernel KERN_ constant definitions. This doesn't violate the kernel/userspace separation, as include/linux/kern_levels.h is self-contained and doesn't expose any other kernel internals. - Remove the now unused STR() and DEFINE_STR() macros. Signed-off-by: Geert Uytterhoeven Signed-off-by: Richard Weinberger --- arch/um/include/shared/common-offsets.h | 10 ---------- arch/um/include/shared/user.h | 11 +++++++++++ arch/um/scripts/Makefile.rules | 2 +- arch/x86/um/shared/sysdep/kernel-offsets.h | 3 --- 4 files changed, 12 insertions(+), 14 deletions(-) diff --git a/arch/um/include/shared/common-offsets.h b/arch/um/include/shared/common-offsets.h index 40db8f71deae..2df313b6a586 100644 --- a/arch/um/include/shared/common-offsets.h +++ b/arch/um/include/shared/common-offsets.h @@ -7,16 +7,6 @@ DEFINE(UM_KERN_PAGE_MASK, PAGE_MASK); DEFINE(UM_KERN_PAGE_SHIFT, PAGE_SHIFT); DEFINE(UM_NSEC_PER_SEC, NSEC_PER_SEC); -DEFINE_STR(UM_KERN_EMERG, KERN_EMERG); -DEFINE_STR(UM_KERN_ALERT, KERN_ALERT); -DEFINE_STR(UM_KERN_CRIT, KERN_CRIT); -DEFINE_STR(UM_KERN_ERR, KERN_ERR); -DEFINE_STR(UM_KERN_WARNING, KERN_WARNING); -DEFINE_STR(UM_KERN_NOTICE, KERN_NOTICE); -DEFINE_STR(UM_KERN_INFO, KERN_INFO); -DEFINE_STR(UM_KERN_DEBUG, KERN_DEBUG); -DEFINE_STR(UM_KERN_CONT, KERN_CONT); - DEFINE(UM_ELF_CLASS, ELF_CLASS); DEFINE(UM_ELFCLASS32, ELFCLASS32); DEFINE(UM_ELFCLASS64, ELFCLASS64); diff --git a/arch/um/include/shared/user.h b/arch/um/include/shared/user.h index 4fa82c055aab..cef068563336 100644 --- a/arch/um/include/shared/user.h +++ b/arch/um/include/shared/user.h @@ -26,6 +26,17 @@ extern void panic(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); +/* Requires preincluding include/linux/kern_levels.h */ +#define UM_KERN_EMERG KERN_EMERG +#define UM_KERN_ALERT KERN_ALERT +#define UM_KERN_CRIT KERN_CRIT +#define UM_KERN_ERR KERN_ERR +#define UM_KERN_WARNING KERN_WARNING +#define UM_KERN_NOTICE KERN_NOTICE +#define UM_KERN_INFO KERN_INFO +#define UM_KERN_DEBUG KERN_DEBUG +#define UM_KERN_CONT KERN_CONT + #ifdef UML_CONFIG_PRINTK extern int printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); diff --git a/arch/um/scripts/Makefile.rules b/arch/um/scripts/Makefile.rules index d50270d26b42..15889df9b466 100644 --- a/arch/um/scripts/Makefile.rules +++ b/arch/um/scripts/Makefile.rules @@ -8,7 +8,7 @@ USER_OBJS += $(filter %_user.o,$(obj-y) $(obj-m) $(USER_SINGLE_OBJS)) USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file)) $(USER_OBJS:.o=.%): \ - c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) -include user.h $(CFLAGS_$(basetarget).o) + c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) -include $(srctree)/include/linux/kern_levels.h -include user.h $(CFLAGS_$(basetarget).o) # These are like USER_OBJS but filter USER_CFLAGS through unprofile instead of # using it directly. diff --git a/arch/x86/um/shared/sysdep/kernel-offsets.h b/arch/x86/um/shared/sysdep/kernel-offsets.h index 5868526b5eef..46a9df99f3c5 100644 --- a/arch/x86/um/shared/sysdep/kernel-offsets.h +++ b/arch/x86/um/shared/sysdep/kernel-offsets.h @@ -7,9 +7,6 @@ #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) -#define STR(x) #x -#define DEFINE_STR(sym, val) asm volatile("\n->" #sym " " STR(val) " " #val: : ) - #define BLANK() asm volatile("\n->" : : ) #define OFFSET(sym, str, mem) \ From a3cff128c970da8400875c8893d68ac735e783fb Mon Sep 17 00:00:00 2001 From: Steve Glendinning Date: Mon, 24 Sep 2012 04:42:59 +0000 Subject: [PATCH 34/44] smsc75xx: fix resume after device reset On some systems this device fails to properly resume after suspend, this patch fixes it by running the usbnet_resume handler. I suspect this also fixes this bug: http://code.google.com/p/chromium-os/issues/detail?id=31871 Signed-off-by: Steve Glendinning Signed-off-by: David S. Miller --- drivers/net/usb/smsc75xx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/smsc75xx.c b/drivers/net/usb/smsc75xx.c index f5ab6e613ec8..376143e8a1aa 100644 --- a/drivers/net/usb/smsc75xx.c +++ b/drivers/net/usb/smsc75xx.c @@ -1253,6 +1253,7 @@ static struct usb_driver smsc75xx_driver = { .probe = usbnet_probe, .suspend = usbnet_suspend, .resume = usbnet_resume, + .reset_resume = usbnet_resume, .disconnect = usbnet_disconnect, .disable_hub_initiated_lpm = 1, }; From b629820d18fa65cc598390e4b9712fd5f83ee693 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Tue, 25 Sep 2012 10:17:42 +0000 Subject: [PATCH 35/44] net: phy: smsc: Implement PHY config_init for LAN87xx The LAN8710/LAN8720 chips do have broken the "FlexPWR" smart power-saving capability. Enabling it leads to the PHY not being able to detect Link when cold-started without cable connected. Thus, make sure this is disabled. Signed-off-by: Marek Vasut Cc: Christian Hohnstaedt Cc: David S. Miller Cc: Fabio Estevam Cc: Giuseppe Cavallaro Cc: Otavio Salvador Acked-by: Otavio Salvador Signed-off-by: David S. Miller --- drivers/net/phy/smsc.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/smsc.c b/drivers/net/phy/smsc.c index 6d6192316b30..88e3991464e7 100644 --- a/drivers/net/phy/smsc.c +++ b/drivers/net/phy/smsc.c @@ -56,6 +56,32 @@ static int smsc_phy_config_init(struct phy_device *phydev) return smsc_phy_ack_interrupt (phydev); } +static int lan87xx_config_init(struct phy_device *phydev) +{ + /* + * Make sure the EDPWRDOWN bit is NOT set. Setting this bit on + * LAN8710/LAN8720 PHY causes the PHY to misbehave, likely due + * to a bug on the chip. + * + * When the system is powered on with the network cable being + * disconnected all the way until after ifconfig ethX up is + * issued for the LAN port with this PHY, connecting the cable + * afterwards does not cause LINK change detection, while the + * expected behavior is the Link UP being detected. + */ + int rc = phy_read(phydev, MII_LAN83C185_CTRL_STATUS); + if (rc < 0) + return rc; + + rc &= ~MII_LAN83C185_EDPWRDOWN; + + rc = phy_write(phydev, MII_LAN83C185_CTRL_STATUS, rc); + if (rc < 0) + return rc; + + return smsc_phy_ack_interrupt(phydev); +} + static int lan911x_config_init(struct phy_device *phydev) { return smsc_phy_ack_interrupt(phydev); @@ -162,7 +188,7 @@ static struct phy_driver smsc_phy_driver[] = { /* basic functions */ .config_aneg = genphy_config_aneg, .read_status = genphy_read_status, - .config_init = smsc_phy_config_init, + .config_init = lan87xx_config_init, /* IRQ related */ .ack_interrupt = smsc_phy_ack_interrupt, From 64f605c77408ad3aaff025e849d2e42cad9687e3 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 26 Sep 2012 07:22:02 +0000 Subject: [PATCH 36/44] bnx2: Clean up remaining iounmap commit c0357e975afdbbedab5c662d19bef865f02adc17 modified bnx2 to switch from using ioremap/iounmap to pci_iomap/pci_iounmap. They missed a spot in the error path of bnx2_init_one though. This patch just cleans that up. Signed-off-by: Neil Horman CC: Michael Chan CC: "David S. Miller" Acked-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c index 79cebd8525ce..e48312f2305d 100644 --- a/drivers/net/ethernet/broadcom/bnx2.c +++ b/drivers/net/ethernet/broadcom/bnx2.c @@ -8564,7 +8564,7 @@ bnx2_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) return 0; error: - iounmap(bp->regview); + pci_iounmap(pdev, bp->regview); pci_release_regions(pdev); pci_disable_device(pdev); pci_set_drvdata(pdev, NULL); From 68b3f28c11bf896be32ad2a2e151aaa5f4a7c98e Mon Sep 17 00:00:00 2001 From: Narendra K Date: Tue, 25 Sep 2012 07:53:19 +0000 Subject: [PATCH 37/44] qlcnic: Fix scheduling while atomic bug In the device close path, 'qlcnic_fw_destroy_ctx' and 'qlcnic_poll_rsp' call msleep. But 'qlcnic_fw_destroy_ctx' and 'qlcnic_poll_rsp' are called with 'adapter->tx_clean_lock' spin lock held resulting in scheduling while atomic bug causing the following trace. I observed that the commit 012dc19a45b2b9cc2ebd14aaa401cf782c2abba4 from John Fastabend addresses a similar issue in ixgbevf driver. Adopting the same approach used in the commit, this patch uses mdelay to address the issue. [79884.999115] BUG: scheduling while atomic: ip/30846/0x00000002 [79885.005562] INFO: lockdep is turned off. [79885.009958] Modules linked in: qlcnic fuse nf_conntrack_netbios_ns nf_conntrack_broadcast ipt_MASQUERADE bnep bluetooth rfkill ip6table_mangle ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_filter ip6_tables iptable_nat nf_nat iptable_mangle ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack iptable_filter ip_tables dcdbas coretemp kvm_intel kvm iTCO_wdt ixgbe iTCO_vendor_support crc32c_intel ghash_clmulni_intel nfsd microcode sb_edac pcspkr edac_core dca bnx2x shpchp auth_rpcgss nfs_acl lpc_ich mfd_core mdio lockd libcrc32c wmi acpi_pad acpi_power_meter sunrpc uinput sd_mod sr_mod cdrom crc_t10dif ahci libahci libata megaraid_sas usb_storage dm_mirror dm_region_hash dm_log dm_mod [last unloaded: qlcnic] [79885.083608] Pid: 30846, comm: ip Tainted: G W O 3.6.0-rc7+ #1 [79885.090805] Call Trace: [79885.093569] [] __schedule_bug+0x68/0x76 [79885.099699] [] __schedule+0x99e/0xa00 [79885.105634] [] schedule+0x29/0x70 [79885.111186] [] schedule_timeout+0x16f/0x350 [79885.117724] [] ? init_object+0x4a/0x90 [79885.123770] [] ? __internal_add_timer+0x140/0x140 [79885.130873] [] schedule_timeout_uninterruptible+0x1e/0x20 [79885.138773] [] msleep+0x20/0x30 [79885.144159] [] qlcnic_issue_cmd+0xef/0x290 [qlcnic] [79885.151478] [] qlcnic_fw_cmd_destroy_rx_ctx+0x55/0x90 [qlcnic] [79885.159868] [] qlcnic_fw_destroy_ctx+0x2d/0xa0 [qlcnic] [79885.167576] [] __qlcnic_down+0x11d/0x180 [qlcnic] [79885.174708] [] qlcnic_close+0x18/0x20 [qlcnic] [79885.181547] [] __dev_close_many+0x95/0xe0 [79885.187899] [] __dev_close+0x38/0x50 [79885.193761] [] __dev_change_flags+0xa1/0x180 [79885.200419] [] dev_change_flags+0x28/0x70 [79885.206779] [] do_setlink+0x378/0xa00 [79885.212731] [] ? nla_parse+0x31/0xe0 [79885.218612] [] rtnl_newlink+0x37e/0x560 [79885.224768] [] ? selinux_capable+0x39/0x50 [79885.231217] [] ? security_capable+0x18/0x20 [79885.237765] [] rtnetlink_rcv_msg+0x114/0x2f0 [79885.244412] [] ? rtnl_lock+0x17/0x20 [79885.250280] [] ? rtnl_lock+0x17/0x20 [79885.256148] [] ? __rtnl_unlock+0x20/0x20 [79885.262413] [] netlink_rcv_skb+0xa1/0xb0 [79885.268661] [] rtnetlink_rcv+0x25/0x40 [79885.274727] [] netlink_unicast+0x19d/0x220 [79885.281146] [] netlink_sendmsg+0x305/0x3f0 [79885.287595] [] ? sock_update_classid+0x148/0x2e0 [79885.294650] [] sock_sendmsg+0xbc/0xf0 [79885.300600] [] __sys_sendmsg+0x3ac/0x3c0 [79885.306853] [] ? up_read+0x23/0x40 [79885.312510] [] ? do_page_fault+0x2bc/0x570 [79885.318968] [] ? sys_brk+0x44/0x150 [79885.324715] [] ? fget_light+0x24c/0x520 [79885.330875] [] sys_sendmsg+0x49/0x90 [79885.336707] [] system_call_fastpath+0x16/0x1b Signed-off-by: Narendra K Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c index b8ead696141e..2a179d087207 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c @@ -15,7 +15,7 @@ qlcnic_poll_rsp(struct qlcnic_adapter *adapter) do { /* give atleast 1ms for firmware to respond */ - msleep(1); + mdelay(1); if (++timeout > QLCNIC_OS_CRB_RETRY_COUNT) return QLCNIC_CDRP_RSP_TIMEOUT; @@ -601,7 +601,7 @@ void qlcnic_fw_destroy_ctx(struct qlcnic_adapter *adapter) qlcnic_fw_cmd_destroy_tx_ctx(adapter); /* Allow dma queues to drain after context reset */ - msleep(20); + mdelay(20); } } From bc9259a8bae9e814fc1f775a1b3effa13e6ad330 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 27 Sep 2012 04:11:00 +0000 Subject: [PATCH 38/44] inetpeer: fix token initialization When jiffies wraps around (for example, 5 minutes after the boot, see INITIAL_JIFFIES) and peer has just been created, now - peer->rate_last can be < XRLIM_BURST_FACTOR * timeout, so token is not set to the maximum value, thus some icmp packets can be unexpectedly dropped. Fix this case by initializing last_rate to 60 seconds in the past. Signed-off-by: Nicolas Dichtel Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inetpeer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index e1e0a4e8fd34..c7527f6b9ad9 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -510,7 +510,10 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, secure_ipv6_id(daddr->addr.a6)); p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; - p->rate_last = 0; + /* 60*HZ is arbitrary, but chosen enough high so that the first + * calculation of tokens is at its maximum. + */ + p->rate_last = jiffies - 60*HZ; INIT_LIST_HEAD(&p->gc_list); /* Link the node. */ From df555b665367f9de6c04826acc482096f17c243d Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 27 Sep 2012 19:04:21 +0000 Subject: [PATCH 39/44] netdev: octeon: fix return value check in octeon_mgmt_init_phy() In case of error, the function of_phy_connect() returns NULL pointer not ERR_PTR(). The IS_ERR() test in the return value check should be replaced with NULL test. dpatch engine is used to auto generate this patch. (https://github.com/weiyj/dpatch) Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- drivers/net/ethernet/octeon/octeon_mgmt.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/octeon/octeon_mgmt.c b/drivers/net/ethernet/octeon/octeon_mgmt.c index c42bbb16cdae..a688a2ddcfd6 100644 --- a/drivers/net/ethernet/octeon/octeon_mgmt.c +++ b/drivers/net/ethernet/octeon/octeon_mgmt.c @@ -722,10 +722,8 @@ static int octeon_mgmt_init_phy(struct net_device *netdev) octeon_mgmt_adjust_link, 0, PHY_INTERFACE_MODE_MII); - if (IS_ERR(p->phydev)) { - p->phydev = NULL; + if (!p->phydev) return -1; - } phy_start_aneg(p->phydev); From 0774e392555a128cff7a94929b9ce957927fef49 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 28 Sep 2012 16:14:44 +0200 Subject: [PATCH 40/44] iommu/amd: Fix wrong assumption in iommu-group specific code The new IOMMU groups code in the AMD IOMMU driver makes the assumption that there is a pci_dev struct available for all device-ids listed in the IVRS ACPI table. Unfortunatly this assumption is not true and so this code causes a NULL pointer dereference at boot on some systems. Fix it by making sure the given pointer is never NULL when passed to the group specific code. The real fix is larger and will be queued for v3.7. Reported-by: Florian Dazinger Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index b64502dfa9f4..e89daf1b21b4 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -266,7 +266,7 @@ static void swap_pci_ref(struct pci_dev **from, struct pci_dev *to) static int iommu_init_device(struct device *dev) { - struct pci_dev *dma_pdev, *pdev = to_pci_dev(dev); + struct pci_dev *dma_pdev = NULL, *pdev = to_pci_dev(dev); struct iommu_dev_data *dev_data; struct iommu_group *group; u16 alias; @@ -293,7 +293,9 @@ static int iommu_init_device(struct device *dev) dev_data->alias_data = alias_data; dma_pdev = pci_get_bus_and_slot(alias >> 8, alias & 0xff); - } else + } + + if (dma_pdev == NULL) dma_pdev = pci_dev_get(pdev); /* Account for quirked devices */ From 99a1300e1d84709f419182bb5189760e78234882 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 28 Sep 2012 14:35:31 +0200 Subject: [PATCH 41/44] thp: avoid VM_BUG_ON page_count(page) false positives in __collapse_huge_page_copy Speculative cache pagecache lookups can elevate the refcount from under us, so avoid the false positive. If the refcount is < 2 we'll be notified by a VM_BUG_ON in put_page_testzero as there are two put_page(src_page) in a row before returning from this function. Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Reviewed-by: Johannes Weiner Cc: Hugh Dickins Cc: Mel Gorman Cc: Petr Holasek Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 57c4b9309015..141dbb695097 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1811,7 +1811,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, src_page = pte_page(pteval); copy_user_highpage(page, src_page, address, vma); VM_BUG_ON(page_mapcount(src_page) != 1); - VM_BUG_ON(page_count(src_page) != 2); release_pte_page(src_page); /* * ptl mostly unnecessary, but preempt has to From 9c603e53d380459fb62fec7cd085acb0b74ac18f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 8 Sep 2012 12:57:30 -0700 Subject: [PATCH 42/44] mtdchar: fix offset overflow detection Sasha Levin has been running trinity in a KVM tools guest, and was able to trigger the BUG_ON() at arch/x86/mm/pat.c:279 (verifying the range of the memory type). The call trace showed that it was mtdchar_mmap() that created an invalid remap_pfn_range(). The problem is that mtdchar_mmap() does various really odd and subtle things with the vma page offset etc, and uses the wrong types (and the wrong overflow) detection for it. For example, the page offset may well be 32-bit on a 32-bit architecture, but after shifting it up by PAGE_SHIFT, we need to use a potentially 64-bit resource_size_t to correctly hold the full value. Also, we need to check that the vma length plus offset doesn't overflow before we check that it is smaller than the length of the mtdmap region. This fixes things up and tries to make the code a bit easier to read. Reported-and-tested-by: Sasha Levin Acked-by: Suresh Siddha Acked-by: Artem Bityutskiy Cc: David Woodhouse Cc: linux-mtd@lists.infradead.org Signed-off-by: Linus Torvalds --- drivers/mtd/mtdchar.c | 48 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index f2f482bec573..a6e74514e662 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -1123,6 +1123,33 @@ static unsigned long mtdchar_get_unmapped_area(struct file *file, } #endif +static inline unsigned long get_vm_size(struct vm_area_struct *vma) +{ + return vma->vm_end - vma->vm_start; +} + +static inline resource_size_t get_vm_offset(struct vm_area_struct *vma) +{ + return (resource_size_t) vma->vm_pgoff << PAGE_SHIFT; +} + +/* + * Set a new vm offset. + * + * Verify that the incoming offset really works as a page offset, + * and that the offset and size fit in a resource_size_t. + */ +static inline int set_vm_offset(struct vm_area_struct *vma, resource_size_t off) +{ + pgoff_t pgoff = off >> PAGE_SHIFT; + if (off != (resource_size_t) pgoff << PAGE_SHIFT) + return -EINVAL; + if (off + get_vm_size(vma) - 1 < off) + return -EINVAL; + vma->vm_pgoff = pgoff; + return 0; +} + /* * set up a mapping for shared memory segments */ @@ -1132,20 +1159,29 @@ static int mtdchar_mmap(struct file *file, struct vm_area_struct *vma) struct mtd_file_info *mfi = file->private_data; struct mtd_info *mtd = mfi->mtd; struct map_info *map = mtd->priv; - unsigned long start; - unsigned long off; - u32 len; + resource_size_t start, off; + unsigned long len, vma_len; if (mtd->type == MTD_RAM || mtd->type == MTD_ROM) { - off = vma->vm_pgoff << PAGE_SHIFT; + off = get_vm_offset(vma); start = map->phys; len = PAGE_ALIGN((start & ~PAGE_MASK) + map->size); start &= PAGE_MASK; - if ((vma->vm_end - vma->vm_start + off) > len) + vma_len = get_vm_size(vma); + + /* Overflow in off+len? */ + if (vma_len + off < off) + return -EINVAL; + /* Does it fit in the mapping? */ + if (vma_len + off > len) return -EINVAL; off += start; - vma->vm_pgoff = off >> PAGE_SHIFT; + /* Did that overflow? */ + if (off < start) + return -EINVAL; + if (set_vm_offset(vma, off) < 0) + return -EINVAL; vma->vm_flags |= VM_IO | VM_RESERVED; #ifdef pgprot_noncached From 8110e16d42d587997bcaee0c864179e6d93603fe Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 17 Sep 2012 22:23:30 +0200 Subject: [PATCH 43/44] vfs: dcache: fix deadlock in tree traversal IBM reported a deadlock in select_parent(). This was found to be caused by taking rename_lock when already locked when restarting the tree traversal. There are two cases when the traversal needs to be restarted: 1) concurrent d_move(); this can only happen when not already locked, since taking rename_lock protects against concurrent d_move(). 2) racing with final d_put() on child just at the moment of ascending to parent; rename_lock doesn't protect against this rare race, so it can happen when already locked. Because of case 2, we need to be able to handle restarting the traversal when rename_lock is already held. This patch fixes all three callers of try_to_ascend(). IBM reported that the deadlock is gone with this patch. [ I rewrote the patch to be smaller and just do the "goto again" if the lock was already held, but credit goes to Miklos for the real work. - Linus ] Signed-off-by: Miklos Szeredi Cc: Al Viro Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/dcache.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/dcache.c b/fs/dcache.c index 0364af2311f4..693f95bf1cae 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1134,6 +1134,8 @@ int have_submounts(struct dentry *parent) return 1; rename_retry: + if (locked) + goto again; locked = 1; write_seqlock(&rename_lock); goto again; @@ -1236,6 +1238,8 @@ static int select_parent(struct dentry *parent, struct list_head *dispose) rename_retry: if (found) return found; + if (locked) + goto again; locked = 1; write_seqlock(&rename_lock); goto again; @@ -3035,6 +3039,8 @@ void d_genocide(struct dentry *root) return; rename_retry: + if (locked) + goto again; locked = 1; write_seqlock(&rename_lock); goto again; From a0d271cbfed1dd50278c6b06bead3d00ba0a88f9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 30 Sep 2012 16:47:46 -0700 Subject: [PATCH 44/44] Linux 3.6 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a3c11d589681..bb9fff26f078 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 3 PATCHLEVEL = 6 SUBLEVEL = 0 -EXTRAVERSION = -rc7 +EXTRAVERSION = NAME = Terrified Chipmunk # *DOCUMENTATION*