From 0d23ba6034b9cf48b8918404367506da3e4b3ee5 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 3 Sep 2018 18:54:14 +0200 Subject: [PATCH 1/8] RDMA/ucma: check fd type in ucma_migrate_id() The current code grabs the private_data of whatever file descriptor userspace has supplied and implicitly casts it to a `struct ucma_file *`, potentially causing a type confusion. This is probably fine in practice because the pointer is only used for comparisons, it is never actually dereferenced; and even in the comparisons, it is unlikely that a file from another filesystem would have a ->private_data pointer that happens to also be valid in this context. But ->private_data is not always guaranteed to be a valid pointer to an object owned by the file's filesystem; for example, some filesystems just cram numbers in there. Check the type of the supplied file descriptor to be safe, analogous to how other places in the kernel do it. Fixes: 88314e4dda1e ("RDMA/cma: add support for rdma_migrate_id()") Signed-off-by: Jann Horn Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/ucma.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index ec8fb289621f..5f437d1570fb 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -124,6 +124,8 @@ static DEFINE_MUTEX(mut); static DEFINE_IDR(ctx_idr); static DEFINE_IDR(multicast_idr); +static const struct file_operations ucma_fops; + static inline struct ucma_context *_ucma_find_context(int id, struct ucma_file *file) { @@ -1581,6 +1583,10 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file, f = fdget(cmd.fd); if (!f.file) return -ENOENT; + if (f.file->f_op != &ucma_fops) { + ret = -EINVAL; + goto file_put; + } /* Validate current fd and prevent destruction of id. */ ctx = ucma_get_ctx(f.file->private_data, cmd.id); From e4ff3d22c11dd505353896cdcad0ee8f3251be68 Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Tue, 28 Aug 2018 14:40:32 +0300 Subject: [PATCH 2/8] IB/core: Release object lock if destroy failed The object lock was supposed to always be released during destroy, but when the destruction retry series was integrated with the destroy series it created a failure path that missed the unlock. Keep with convention, if destroy fails the caller must undo all locking. Fixes: 87ad80abc70d ("IB/uverbs: Consolidate uobject destruction") Signed-off-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/rdma_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 6eb64c6f0802..c4118bcd5103 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -882,6 +882,8 @@ static int __uverbs_cleanup_ufile(struct ib_uverbs_file *ufile, WARN_ON(uverbs_try_lock_object(obj, UVERBS_LOOKUP_WRITE)); if (!uverbs_destroy_uobject(obj, reason)) ret = 0; + else + atomic_set(&obj->usecnt, 0); } return ret; } From 308aa2b8f7b7db3332a7d41099fd37851fb793b2 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 31 Aug 2018 07:15:56 -0700 Subject: [PATCH 3/8] iw_cxgb4: only allow 1 flush on user qps Once the qp has been flushed, it cannot be flushed again. The user qp flush logic wasn't enforcing it however. The bug can cause touch-after-free crashes like: Unable to handle kernel paging request for data at address 0x000001ec Faulting instruction address: 0xc008000016069100 Oops: Kernel access of bad area, sig: 11 [#1] ... NIP [c008000016069100] flush_qp+0x80/0x480 [iw_cxgb4] LR [c00800001606cd6c] c4iw_modify_qp+0x71c/0x11d0 [iw_cxgb4] Call Trace: [c00800001606cd6c] c4iw_modify_qp+0x71c/0x11d0 [iw_cxgb4] [c00800001606e868] c4iw_ib_modify_qp+0x118/0x200 [iw_cxgb4] [c0080000119eae80] ib_security_modify_qp+0xd0/0x3d0 [ib_core] [c0080000119c4e24] ib_modify_qp+0xc4/0x2c0 [ib_core] [c008000011df0284] iwcm_modify_qp_err+0x44/0x70 [iw_cm] [c008000011df0fec] destroy_cm_id+0xcc/0x370 [iw_cm] [c008000011ed4358] rdma_destroy_id+0x3c8/0x520 [rdma_cm] [c0080000134b0540] ucma_close+0x90/0x1b0 [rdma_ucm] [c000000000444da4] __fput+0xe4/0x2f0 So fix flush_qp() to only flush the wq once. Cc: stable@vger.kernel.org Signed-off-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/qp.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index b3203afa3b1d..347fe18b1a41 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -1685,6 +1685,12 @@ static void flush_qp(struct c4iw_qp *qhp) schp = to_c4iw_cq(qhp->ibqp.send_cq); if (qhp->ibqp.uobject) { + + /* for user qps, qhp->wq.flushed is protected by qhp->mutex */ + if (qhp->wq.flushed) + return; + + qhp->wq.flushed = 1; t4_set_wq_in_error(&qhp->wq, 0); t4_set_cq_in_error(&rchp->cq); spin_lock_irqsave(&rchp->comp_handler_lock, flag); From 816e846c2eb9129a3e0afa5f920c8bbc71efecaa Mon Sep 17 00:00:00 2001 From: Aaron Knister Date: Fri, 24 Aug 2018 08:42:46 -0400 Subject: [PATCH 4/8] IB/ipoib: Avoid a race condition between start_xmit and cm_rep_handler Inside of start_xmit() the call to check if the connection is up and the queueing of the packets for later transmission is not atomic which leaves a window where cm_rep_handler can run, set the connection up, dequeue pending packets and leave the subsequently queued packets by start_xmit() sitting on neigh->queue until they're dropped when the connection is torn down. This only applies to connected mode. These dropped packets can really upset TCP, for example, and cause multi-minute delays in transmission for open connections. Here's the code in start_xmit where we check to see if the connection is up: if (ipoib_cm_get(neigh)) { if (ipoib_cm_up(neigh)) { ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); goto unref; } } The race occurs if cm_rep_handler execution occurs after the above connection check (specifically if it gets to the point where it acquires priv->lock to dequeue pending skb's) but before the below code snippet in start_xmit where packets are queued. if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { push_pseudo_header(skb, phdr->hwaddr); spin_lock_irqsave(&priv->lock, flags); __skb_queue_tail(&neigh->queue, skb); spin_unlock_irqrestore(&priv->lock, flags); } else { ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); } The patch acquires the netif tx lock in cm_rep_handler for the section where it sets the connection up and dequeues and retransmits deferred skb's. Fixes: 839fcaba355a ("IPoIB: Connected mode experimental support") Cc: stable@vger.kernel.org Signed-off-by: Aaron Knister Tested-by: Ira Weiny Reviewed-by: Ira Weiny Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index ea01b8dd2be6..3d5424f335cb 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -1027,12 +1027,14 @@ static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, skb_queue_head_init(&skqueue); + netif_tx_lock_bh(p->dev); spin_lock_irq(&priv->lock); set_bit(IPOIB_FLAG_OPER_UP, &p->flags); if (p->neigh) while ((skb = __skb_dequeue(&p->neigh->queue))) __skb_queue_tail(&skqueue, skb); spin_unlock_irq(&priv->lock); + netif_tx_unlock_bh(p->dev); while ((skb = __skb_dequeue(&skqueue))) { skb->dev = p->dev; From f40f299bbe806a2e2c8b0d7cdda822fa3bdd171b Mon Sep 17 00:00:00 2001 From: Somnath Kotur Date: Wed, 5 Sep 2018 13:20:34 +0530 Subject: [PATCH 5/8] bnxt_re: Fix couple of memory leaks that could lead to IOMMU call traces 1. DMA-able memory allocated for Shadow QP was not being freed. 2. bnxt_qplib_alloc_qp_hdr_buf() had a bug wherein the SQ pointer was erroneously pointing to the RQ. But since the corresponding free_qp_hdr_buf() was correct, memory being free was less than what was allocated. Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Signed-off-by: Somnath Kotur Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 2 ++ drivers/infiniband/hw/bnxt_re/qplib_fp.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index bbfb86eb2d24..bc2b9e038439 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -833,6 +833,8 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp) "Failed to destroy Shadow QP"); return rc; } + bnxt_qplib_free_qp_res(&rdev->qplib_res, + &rdev->qp1_sqp->qplib_qp); mutex_lock(&rdev->qp_lock); list_del(&rdev->qp1_sqp->list); atomic_dec(&rdev->qp_count); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index e426b990c1dd..6ad0d46ab879 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -196,7 +196,7 @@ static int bnxt_qplib_alloc_qp_hdr_buf(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) { struct bnxt_qplib_q *rq = &qp->rq; - struct bnxt_qplib_q *sq = &qp->rq; + struct bnxt_qplib_q *sq = &qp->sq; int rc = 0; if (qp->sq_hdr_buf_size && sq->hwq.max_elements) { From 08e74be103051861eb2c1ee52a2dcf119cde264f Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 09:47:57 +0300 Subject: [PATCH 6/8] RDMA/uverbs: Fix error cleanup path of ib_uverbs_add_one() If ib_uverbs_create_uapi() fails, dev_num should be freed from the bitmap. Fixes: 7d96c9b17636 ("IB/uverbs: Have the core code create the uverbs_root_spec") Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 823beca448e1..6d974e2363df 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -1050,7 +1050,7 @@ static void ib_uverbs_add_one(struct ib_device *device) uverbs_dev->num_comp_vectors = device->num_comp_vectors; if (ib_uverbs_create_uapi(device, uverbs_dev)) - goto err; + goto err_uapi; cdev_init(&uverbs_dev->cdev, NULL); uverbs_dev->cdev.owner = THIS_MODULE; @@ -1077,11 +1077,10 @@ static void ib_uverbs_add_one(struct ib_device *device) err_class: device_destroy(uverbs_class, uverbs_dev->cdev.dev); - err_cdev: cdev_del(&uverbs_dev->cdev); +err_uapi: clear_bit(devnum, dev_map); - err: if (atomic_dec_and_test(&uverbs_dev->refcount)) ib_uverbs_comp_dev(uverbs_dev); From 954a8e3aea87e896e320cf648c1a5bbe47de443e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 30 Aug 2018 08:35:19 +0300 Subject: [PATCH 7/8] RDMA/cma: Protect cma dev list with lock When AF_IB addresses are used during rdma_resolve_addr() a lock is not held. A cma device can get removed while list traversal is in progress which may lead to crash. ie CPU0 CPU1 ==== ==== rdma_resolve_addr() cma_resolve_ib_dev() list_for_each() cma_remove_one() cur_dev->device mutex_lock(&lock) list_del(); mutex_unlock(&lock); cma_process_remove(); Therefore, hold a lock while traversing the list which avoids such situation. Cc: # 3.10 Fixes: f17df3b0dede ("RDMA/cma: Add support for AF_IB to rdma_resolve_addr()") Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index f72677291b69..a36c94930c31 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -724,6 +724,7 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) dgid = (union ib_gid *) &addr->sib_addr; pkey = ntohs(addr->sib_pkey); + mutex_lock(&lock); list_for_each_entry(cur_dev, &dev_list, list) { for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) { if (!rdma_cap_af_ib(cur_dev->device, p)) @@ -750,18 +751,19 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) cma_dev = cur_dev; sgid = gid; id_priv->id.port_num = p; + goto found; } } } } - - if (!cma_dev) - return -ENODEV; + mutex_unlock(&lock); + return -ENODEV; found: cma_attach_to_dev(id_priv, cma_dev); - addr = (struct sockaddr_ib *) cma_src_addr(id_priv); - memcpy(&addr->sib_addr, &sgid, sizeof sgid); + mutex_unlock(&lock); + addr = (struct sockaddr_ib *)cma_src_addr(id_priv); + memcpy(&addr->sib_addr, &sgid, sizeof(sgid)); cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr); return 0; } From 8f28b178f71cc56eccf2a6e2c0ace17c82f900d7 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 3 Sep 2018 09:11:14 +0300 Subject: [PATCH 8/8] RDMA/mlx4: Ensure that maximal send/receive SGE less than supported by HW In calculating the global maximum number of the Scatter/Gather elements supported, the following four maximum parameters must be taken into consideration: max_sg_rq, max_sg_sq, max_desc_sz_rq and max_desc_sz_sq. However instead of bringing this complexity to query_device, which still won't be sufficient anyway (the calculations are dependent on QP type), the safer approach will be to restore old code, which will give us 32 SGEs. Fixes: 33023fb85a42 ("IB/core: add max_send_sge and max_recv_sge attributes") Reported-by: Chuck Lever Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/main.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index ca0f1ee26091..0bbeaaae47e0 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -517,9 +517,11 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->page_size_cap = dev->dev->caps.page_size_cap; props->max_qp = dev->dev->quotas.qp; props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; - props->max_send_sge = dev->dev->caps.max_sq_sg; - props->max_recv_sge = dev->dev->caps.max_rq_sg; - props->max_sge_rd = MLX4_MAX_SGE_RD; + props->max_send_sge = + min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); + props->max_recv_sge = + min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); + props->max_sge_rd = MLX4_MAX_SGE_RD; props->max_cq = dev->dev->quotas.cq; props->max_cqe = dev->dev->caps.max_cqes; props->max_mr = dev->dev->quotas.mpt;