From f39cc34df7ed919c5c7630a8a7a701939356c37c Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Tue, 12 Apr 2016 10:45:51 -0700 Subject: [PATCH 01/46] IB/rdmavt: Fix adaptive pio hang The RVT_S_WAIT_PIO_DRAIN flag was missing from the set of flags indicating a qp is waiting on a resource. This caused the sleep/wakeup for adaptive pio drain to lose a wakeup "hanging" a QP. Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Doug Ledford --- include/rdma/rdmavt_qp.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 497e59065c2c..0e1ff2abfe92 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -117,8 +117,9 @@ /* * Wait flags that would prevent any packet type from being sent. */ -#define RVT_S_ANY_WAIT_IO (RVT_S_WAIT_PIO | RVT_S_WAIT_TX | \ - RVT_S_WAIT_DMA_DESC | RVT_S_WAIT_KMEM) +#define RVT_S_ANY_WAIT_IO \ + (RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN | RVT_S_WAIT_TX | \ + RVT_S_WAIT_DMA_DESC | RVT_S_WAIT_KMEM) /* * Wait flags that would prevent send work requests from making progress. From 747f4d7a9d1bc07e3f9f22c84201ffb0abee1634 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Tue, 12 Apr 2016 10:46:10 -0700 Subject: [PATCH 02/46] IB/qib, IB/hfi1: Fix up UD loopback use of irq flags The dual lock patch moved locking around and missed an issue with handling irq flags when processing UD loopback packets. This issue was revealed by smatch. Fix for both qib and hfi1 to pass the saved flags to the UD request builder and handle the changes correctly. Fixes: 46a80d62e6e0 ("IB/qib, staging/rdma/hfi1: add s_hlock for use in post send") Reported-by: Dan Carpenter Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Doug Ledford --- drivers/infiniband/hw/qib/qib_rc.c | 2 +- drivers/infiniband/hw/qib/qib_ruc.c | 4 ++-- drivers/infiniband/hw/qib/qib_uc.c | 2 +- drivers/infiniband/hw/qib/qib_ud.c | 10 +++++----- drivers/infiniband/hw/qib/qib_verbs.h | 6 +++--- drivers/staging/rdma/hfi1/ruc.c | 20 +++++++++++--------- drivers/staging/rdma/hfi1/ud.c | 8 ++++---- drivers/staging/rdma/hfi1/verbs.h | 1 + 8 files changed, 28 insertions(+), 25 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index 9088e26d3ac8..444028a3582a 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -230,7 +230,7 @@ static int qib_make_rc_ack(struct qib_ibdev *dev, struct rvt_qp *qp, * * Return 1 if constructed; otherwise, return 0. 
*/ -int qib_make_rc_req(struct rvt_qp *qp) +int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags) { struct qib_qp_priv *priv = qp->priv; struct qib_ibdev *dev = to_idev(qp->ibqp.device); diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index a5f07a64b228..b67779256297 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -739,7 +739,7 @@ void qib_do_send(struct rvt_qp *qp) struct qib_qp_priv *priv = qp->priv; struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); struct qib_pportdata *ppd = ppd_from_ibp(ibp); - int (*make_req)(struct rvt_qp *qp); + int (*make_req)(struct rvt_qp *qp, unsigned long *flags); unsigned long flags; if ((qp->ibqp.qp_type == IB_QPT_RC || @@ -781,7 +781,7 @@ void qib_do_send(struct rvt_qp *qp) qp->s_hdrwords = 0; spin_lock_irqsave(&qp->s_lock, flags); } - } while (make_req(qp)); + } while (make_req(qp, &flags)); spin_unlock_irqrestore(&qp->s_lock, flags); } diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c index 7bdbc79ceaa3..1d61bd04f449 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -45,7 +45,7 @@ * * Return 1 if constructed; otherwise, return 0. */ -int qib_make_uc_req(struct rvt_qp *qp) +int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags) { struct qib_qp_priv *priv = qp->priv; struct qib_other_headers *ohdr; diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index d9502137de62..846e6c726df7 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -238,7 +238,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) * * Return 1 if constructed; otherwise, return 0. 
*/ -int qib_make_ud_req(struct rvt_qp *qp) +int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) { struct qib_qp_priv *priv = qp->priv; struct qib_other_headers *ohdr; @@ -294,7 +294,7 @@ int qib_make_ud_req(struct rvt_qp *qp) this_cpu_inc(ibp->pmastats->n_unicast_xmit); lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1); if (unlikely(lid == ppd->lid)) { - unsigned long flags; + unsigned long tflags = *flags; /* * If DMAs are in progress, we can't generate * a completion for the loopback packet since @@ -307,10 +307,10 @@ int qib_make_ud_req(struct rvt_qp *qp) goto bail; } qp->s_cur = next_cur; - local_irq_save(flags); - spin_unlock_irqrestore(&qp->s_lock, flags); + spin_unlock_irqrestore(&qp->s_lock, tflags); qib_ud_loopback(qp, wqe); - spin_lock_irqsave(&qp->s_lock, flags); + spin_lock_irqsave(&qp->s_lock, tflags); + *flags = tflags; qib_send_complete(qp, wqe, IB_WC_SUCCESS); goto done; } diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 4b76a8d59337..6888f03c6d61 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -430,11 +430,11 @@ void qib_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, void qib_send_rc_ack(struct rvt_qp *qp); -int qib_make_rc_req(struct rvt_qp *qp); +int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags); -int qib_make_uc_req(struct rvt_qp *qp); +int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags); -int qib_make_ud_req(struct rvt_qp *qp); +int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags); int qib_register_ib_device(struct qib_devdata *); diff --git a/drivers/staging/rdma/hfi1/ruc.c b/drivers/staging/rdma/hfi1/ruc.c index 08813cdbd475..a659aec3c3c6 100644 --- a/drivers/staging/rdma/hfi1/ruc.c +++ b/drivers/staging/rdma/hfi1/ruc.c @@ -831,7 +831,6 @@ void hfi1_do_send(struct rvt_qp *qp) struct hfi1_pkt_state ps; struct hfi1_qp_priv *priv = qp->priv; int (*make_req)(struct rvt_qp *qp, struct hfi1_pkt_state *ps); - unsigned long flags; unsigned long timeout; unsigned long timeout_int; int cpu; @@ -866,11 +865,11 @@ void hfi1_do_send(struct rvt_qp *qp) timeout_int = SEND_RESCHED_TIMEOUT; } - spin_lock_irqsave(&qp->s_lock, flags); + spin_lock_irqsave(&qp->s_lock, ps.flags); /* Return if we are already busy processing a work request. */ if (!hfi1_send_ok(qp)) { - spin_unlock_irqrestore(&qp->s_lock, flags); + spin_unlock_irqrestore(&qp->s_lock, ps.flags); return; } @@ -884,7 +883,7 @@ void hfi1_do_send(struct rvt_qp *qp) do { /* Check for a constructed packet to be sent. */ if (qp->s_hdrwords != 0) { - spin_unlock_irqrestore(&qp->s_lock, flags); + spin_unlock_irqrestore(&qp->s_lock, ps.flags); /* * If the packet cannot be sent now, return and * the send tasklet will be woken up later. 
@@ -897,11 +896,14 @@ void hfi1_do_send(struct rvt_qp *qp) if (unlikely(time_after(jiffies, timeout))) { if (workqueue_congested(cpu, ps.ppd->hfi1_wq)) { - spin_lock_irqsave(&qp->s_lock, flags); + spin_lock_irqsave( + &qp->s_lock, + ps.flags); qp->s_flags &= ~RVT_S_BUSY; hfi1_schedule_send(qp); - spin_unlock_irqrestore(&qp->s_lock, - flags); + spin_unlock_irqrestore( + &qp->s_lock, + ps.flags); this_cpu_inc( *ps.ppd->dd->send_schedule); return; @@ -913,11 +915,11 @@ void hfi1_do_send(struct rvt_qp *qp) } timeout = jiffies + (timeout_int) / 8; } - spin_lock_irqsave(&qp->s_lock, flags); + spin_lock_irqsave(&qp->s_lock, ps.flags); } } while (make_req(qp, &ps)); - spin_unlock_irqrestore(&qp->s_lock, flags); + spin_unlock_irqrestore(&qp->s_lock, ps.flags); } /* diff --git a/drivers/staging/rdma/hfi1/ud.c b/drivers/staging/rdma/hfi1/ud.c index ae8a70f703eb..1e503ad0bebb 100644 --- a/drivers/staging/rdma/hfi1/ud.c +++ b/drivers/staging/rdma/hfi1/ud.c @@ -322,7 +322,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) (lid == ppd->lid || (lid == be16_to_cpu(IB_LID_PERMISSIVE) && qp->ibqp.qp_type == IB_QPT_GSI)))) { - unsigned long flags; + unsigned long tflags = ps->flags; /* * If DMAs are in progress, we can't generate * a completion for the loopback packet since @@ -335,10 +335,10 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) goto bail; } qp->s_cur = next_cur; - local_irq_save(flags); - spin_unlock_irqrestore(&qp->s_lock, flags); + spin_unlock_irqrestore(&qp->s_lock, tflags); ud_loopback(qp, wqe); - spin_lock_irqsave(&qp->s_lock, flags); + spin_lock_irqsave(&qp->s_lock, tflags); + ps->flags = tflags; hfi1_send_complete(qp, wqe, IB_WC_SUCCESS); goto done_free_tx; } diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/staging/rdma/hfi1/verbs.h index 6c4670fffdbb..2ba1373f4fb4 100644 --- a/drivers/staging/rdma/hfi1/verbs.h +++ b/drivers/staging/rdma/hfi1/verbs.h @@ -215,6 +215,7 @@ struct hfi1_pkt_state { struct hfi1_ibport *ibp; struct hfi1_pportdata *ppd; struct verbs_txreq *s_txreq; + unsigned long flags; }; #define HFI1_PSN_CREDIT 16 From 4787bc5e1783e94f6b9518664609f3034dc799eb Mon Sep 17 00:00:00 2001 From: Mitko Haralanov Date: Tue, 12 Apr 2016 10:46:23 -0700 Subject: [PATCH 03/46] IB/hfi1: Don't remove list entries if they are not in a list The SDMA cache logic maintains an eviction list which is ordered by most recently used user buffers. Upon errors or buffer freeing, the list nodes were unconditionally being deleted. This would lead to list corruption warnings if the nodes were never inserted in the eviction list to begin with. This commit prevents this by checking that the nodes are already part of the eviction list. 
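For reference, the guard can be modeled outside the driver with a toy
reimplementation of the kernel's list primitives. This is only an
illustrative userspace sketch (none of these names or values are the
driver's): a self-linked node reads as "not on any list", so the
list_empty() check makes deletion a safe no-op for a node that was
never inserted.

#include <stdio.h>

struct list_head { struct list_head *prev, *next; };

static void INIT_LIST_HEAD(struct list_head *h) { h->prev = h->next = h; }
static int list_empty(const struct list_head *h) { return h->next == h; }

static void list_add(struct list_head *n, struct list_head *h)
{
	n->next = h->next;
	n->prev = h;
	h->next->prev = n;
	h->next = n;
}

static void list_del(struct list_head *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

int main(void)
{
	struct list_head evict, node;

	INIT_LIST_HEAD(&evict);
	INIT_LIST_HEAD(&node);	/* self-linked: "not on any list" */

	/* The guard makes deletion a no-op for a never-inserted node;
	 * an unconditional list_del() on an uninitialized node would
	 * scribble over whatever its pointers happen to contain. */
	if (!list_empty(&node))
		list_del(&node);	/* skipped: never inserted */

	list_add(&node, &evict);
	if (!list_empty(&node))
		list_del(&node);	/* actually unlinks this time */

	printf("evict empty again: %d\n", list_empty(&evict));
	return 0;
}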
Reviewed-by: Dean Luick 
Signed-off-by: Mitko Haralanov 
Signed-off-by: Dennis Dalessandro 
Signed-off-by: Doug Ledford 
---
 drivers/staging/rdma/hfi1/user_sdma.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/staging/rdma/hfi1/user_sdma.c
index d53a659548e0..032949bac801 100644
--- a/drivers/staging/rdma/hfi1/user_sdma.c
+++ b/drivers/staging/rdma/hfi1/user_sdma.c
@@ -1135,7 +1135,8 @@ static int pin_vector_pages(struct user_sdma_request *req,
 		ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb);
 		if (ret) {
 			spin_lock(&pq->evict_lock);
-			list_del(&node->list);
+			if (!list_empty(&node->list))
+				list_del(&node->list);
 			pq->n_locked -= node->npages;
 			spin_unlock(&pq->evict_lock);
 			ret = 0;
@@ -1558,7 +1559,8 @@ static void sdma_rb_remove(struct rb_root *root, struct mmu_rb_node *mnode,
 		container_of(mnode, struct sdma_mmu_node, rb);

 	spin_lock(&node->pq->evict_lock);
-	list_del(&node->list);
+	if (!list_empty(&node->list))
+		list_del(&node->list);
 	node->pq->n_locked -= node->npages;
 	spin_unlock(&node->pq->evict_lock);

From 0ad2d3d05b4a597d1f2e239dcc6ab537cc3eeacb Mon Sep 17 00:00:00 2001
From: Mitko Haralanov 
Date: Tue, 12 Apr 2016 10:46:29 -0700
Subject: [PATCH 04/46] IB/hfi1: Fix memory leak in user ExpRcv and SDMA

The driver had two memory leaks - one in the user expected receive
code and one in the SDMA buffer cache.

The leak in the expected receive code only showed up when the
user/admin had set ulimit sufficiently low and the driver did not have
enough room in the cache before hitting the limit of allowed cacheable
memory. When this condition occurred, the driver returned early,
signaling userland that it needed to free some buffers to free up room
in the cache. The bug was that the driver was not cleaning up
allocated memory prior to returning early.

The leak in the SDMA buffer cache could occur (even though it never
did) when the insertion of a buffer node in the interval RB tree
failed. In this case, the driver failed to unpin the pages of the node
and instead erroneously returned success.

Reviewed-by: Dean Luick 
Signed-off-by: Mitko Haralanov 
Signed-off-by: Doug Ledford 
---
 drivers/staging/rdma/hfi1/user_exp_rcv.c | 7 +++++--
 drivers/staging/rdma/hfi1/user_sdma.c | 3 ++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.c b/drivers/staging/rdma/hfi1/user_exp_rcv.c
index 8bd56d5c783d..1b640a35b3fe 100644
--- a/drivers/staging/rdma/hfi1/user_exp_rcv.c
+++ b/drivers/staging/rdma/hfi1/user_exp_rcv.c
@@ -399,8 +399,11 @@ int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo)
 	 * pages, accept the amount pinned so far and program only that.
 	 * User space knows how to deal with partially programmed buffers.
	 */
-	if (!hfi1_can_pin_pages(dd, fd->tid_n_pinned, npages))
-		return -ENOMEM;
+	if (!hfi1_can_pin_pages(dd, fd->tid_n_pinned, npages)) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
 	pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages);
 	if (pinned <= 0) {
 		ret = pinned;
diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/staging/rdma/hfi1/user_sdma.c
index 032949bac801..044d33777fba 100644
--- a/drivers/staging/rdma/hfi1/user_sdma.c
+++ b/drivers/staging/rdma/hfi1/user_sdma.c
@@ -1139,7 +1139,8 @@ static int pin_vector_pages(struct user_sdma_request *req,
 				list_del(&node->list);
 			pq->n_locked -= node->npages;
 			spin_unlock(&pq->evict_lock);
-			ret = 0;
+			unpin_vector_pages(current->mm, node->pages, 0,
+					   node->npages);
 			goto bail;
 		}
 	} else {
From 782f6697d20d180843565ac93726f5ec6c7ec8ce Mon Sep 17 00:00:00 2001
From: Mitko Haralanov 
Date: Tue, 12 Apr 2016 10:46:35 -0700
Subject: [PATCH 05/46] IB/hfi1: Protect the interval RB tree when cleaning up

The current implementation of the clean up function for the interval
RB trees has two flaws which may cause problems when the function and
the MMU notifier run concurrently. The flaws were due to the fact that
deregistration of the MMU callbacks was done after the tree was
emptied and, furthermore, the tree was not being locked.

This commit fixes both of these flaws by, first, switching the order
of operations and, second, locking the tree while traversing it to
prevent any other concurrent operations.

Reviewed-by: Dean Luick 
Signed-off-by: Mitko Haralanov 
Signed-off-by: Doug Ledford 
---
 drivers/staging/rdma/hfi1/mmu_rb.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/rdma/hfi1/mmu_rb.c b/drivers/staging/rdma/hfi1/mmu_rb.c
index b3f0682a36c9..72b6d70ae2b5 100644
--- a/drivers/staging/rdma/hfi1/mmu_rb.c
+++ b/drivers/staging/rdma/hfi1/mmu_rb.c
@@ -126,10 +126,15 @@ void hfi1_mmu_rb_unregister(struct rb_root *root)
 	if (!handler)
 		return;

+	/* Unregister first so we don't get any more notifications. */
+	if (current->mm)
+		mmu_notifier_unregister(&handler->mn, current->mm);
+
 	spin_lock_irqsave(&mmu_rb_lock, flags);
 	list_del(&handler->list);
 	spin_unlock_irqrestore(&mmu_rb_lock, flags);

+	spin_lock_irqsave(&handler->lock, flags);
 	if (!RB_EMPTY_ROOT(root)) {
 		struct rb_node *node;
 		struct mmu_rb_node *rbnode;
@@ -141,9 +146,8 @@ void hfi1_mmu_rb_unregister(struct rb_root *root)
 			handler->ops->remove(root, rbnode, NULL);
 		}
 	}
+	spin_unlock_irqrestore(&handler->lock, flags);

-	if (current->mm)
-		mmu_notifier_unregister(&handler->mn, current->mm);
 	kfree(handler);
 }

From de79093b284888faedb826d8ecd326e5b6843d88 Mon Sep 17 00:00:00 2001
From: Mitko Haralanov 
Date: Tue, 12 Apr 2016 10:46:41 -0700
Subject: [PATCH 06/46] IB/hfi1: Correctly compute node interval

The computation of the interval of an interval RB node was incorrect,
leading to data corruption due to the RB search algorithm not properly
finding all the RB nodes in an MMU invalidation interval. The problem
stemmed from the fact that the beginning address of the node's range
was being aligned to a page boundary. For certain buffer sizes, this
would lead to an end address calculation that was off by one page.

An important aspect of keeping the RB tree sane is also updating the
node's range in the case it is being extended.
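A worked example of the off-by-one, as a standalone program assuming a
4KB page size and a made-up buffer address (both illustrative):
aligning the start down before computing the last byte drops the page
that the tail of the buffer spills into.

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
	/* hypothetical buffer: starts 0x100 bytes into a page, 4KB long;
	 * it touches pages 0x10000 and 0x11000 (last byte is 0x110ff) */
	unsigned long addr = 0x10100, len = 4096;

	unsigned long old_last = PAGE_ALIGN((addr & PAGE_MASK) + len) - 1;
	unsigned long new_last = PAGE_ALIGN(addr + len) - 1;

	/* old: 0x10fff - the interval misses the second page entirely */
	/* new: 0x11fff - covers both pages the buffer actually uses  */
	printf("old last=0x%lx new last=0x%lx\n", old_last, new_last);
	return 0;
}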
Reviewed-by: Dean Luick 
Signed-off-by: Mitko Haralanov 
Signed-off-by: Doug Ledford 
---
 drivers/staging/rdma/hfi1/mmu_rb.c | 2 +-
 drivers/staging/rdma/hfi1/user_sdma.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/rdma/hfi1/mmu_rb.c b/drivers/staging/rdma/hfi1/mmu_rb.c
index 72b6d70ae2b5..a1aaaeac4475 100644
--- a/drivers/staging/rdma/hfi1/mmu_rb.c
+++ b/drivers/staging/rdma/hfi1/mmu_rb.c
@@ -91,7 +91,7 @@ static unsigned long mmu_node_start(struct mmu_rb_node *node)

 static unsigned long mmu_node_last(struct mmu_rb_node *node)
 {
-	return PAGE_ALIGN((node->addr & PAGE_MASK) + node->len) - 1;
+	return PAGE_ALIGN(node->addr + node->len) - 1;
 }

 int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops)
diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/staging/rdma/hfi1/user_sdma.c
index 044d33777fba..d1645d98a43d 100644
--- a/drivers/staging/rdma/hfi1/user_sdma.c
+++ b/drivers/staging/rdma/hfi1/user_sdma.c
@@ -1076,7 +1076,6 @@ static int pin_vector_pages(struct user_sdma_request *req,
 		return -ENOMEM;

 	node->rb.addr = (unsigned long)iovec->iov.iov_base;
-	node->rb.len = iovec->iov.iov_len;
 	node->pq = pq;
 	atomic_set(&node->refcount, 0);
 	INIT_LIST_HEAD(&node->list);
@@ -1117,6 +1116,7 @@ static int pin_vector_pages(struct user_sdma_request *req,
 			goto bail;
 		}
 		kfree(node->pages);
+		node->rb.len = iovec->iov.iov_len;
 		node->pages = pages;
 		node->npages += pinned;
 		npages = node->npages;
From f53af85e4735514a572996fb551968da78657639 Mon Sep 17 00:00:00 2001
From: Mitko Haralanov 
Date: Tue, 12 Apr 2016 10:46:47 -0700
Subject: [PATCH 07/46] IB/hfi1: Extract and reinsert MMU RB node on lookup

The page pinning function, which also maintains the pin cache, behaves
one of two ways when an exact buffer match is not found:
  1. If no node is found (a buffer with the same starting address is
     not in the cache), a new node is created, the buffer pages are
     pinned, and the node is inserted into the RB tree, or
  2. If a node is found but the buffer in that node is a subset of the
     new user buffer, the node is extended with the new buffer pages.

Both modes of operation require (re-)insertion into the interval RB
tree.

When the node being inserted is a new node, the operations are pretty
simple. However, when the node already exists and is being extended,
special care must be taken.

First, we want to guard against an asynchronous attempt to delete the
node by the MMU invalidation notifier. The simplest way to do this is
to remove the node from the RB tree, preventing the search algorithm
from finding it.

Second, the node needs to be re-inserted so it lands in the proper
place in the tree and the tree is correctly re-balanced. This also
requires the node to be removed from the RB tree.

This commit adds the hfi1_mmu_rb_extract() function, which will search
for a node in the interval RB tree matching an address and length and
remove it from the RB tree if found. This allows both of the above
special cases to be handled in a single step.
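As a sketch of the extract idiom (userspace model: a plain linked list
stands in for the interval RB tree and a pthread mutex for the handler
lock; all names are illustrative, not the driver's), the key point is
that lookup and unlink happen under a single lock acquisition, so a
concurrent invalidation can never observe the node between the two
steps:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { unsigned long addr, len; struct node *next; };

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

/* Find a node matching addr whose range covers len, and unlink it
 * atomically with respect to tree_lock. */
static struct node *extract(unsigned long addr, unsigned long len)
{
	struct node **pp, *n = NULL;

	pthread_mutex_lock(&tree_lock);
	for (pp = &head; *pp; pp = &(*pp)->next) {
		if ((*pp)->addr == addr && (*pp)->len >= len) {
			n = *pp;
			*pp = n->next;	/* unlink while still locked */
			break;
		}
	}
	pthread_mutex_unlock(&tree_lock);
	return n;	/* caller now owns the node exclusively */
}

int main(void)
{
	struct node *n = malloc(sizeof(*n));

	if (!n)
		return 1;
	n->addr = 0x1000;
	n->len = 4096;
	n->next = NULL;
	head = n;

	free(extract(0x1000, 4096));	/* safe: nobody else can find it */
	printf("head after extract: %p\n", (void *)head);
	return 0;
}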
Reviewed-by: Dean Luick Signed-off-by: Mitko Haralanov Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/mmu_rb.c | 19 +++++++++++++++ drivers/staging/rdma/hfi1/mmu_rb.h | 2 ++ drivers/staging/rdma/hfi1/user_sdma.c | 33 ++++++++++++--------------- 3 files changed, 35 insertions(+), 19 deletions(-) diff --git a/drivers/staging/rdma/hfi1/mmu_rb.c b/drivers/staging/rdma/hfi1/mmu_rb.c index a1aaaeac4475..03f360accacd 100644 --- a/drivers/staging/rdma/hfi1/mmu_rb.c +++ b/drivers/staging/rdma/hfi1/mmu_rb.c @@ -239,6 +239,25 @@ struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *root, unsigned long addr, return node; } +struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *root, + unsigned long addr, unsigned long len) +{ + struct mmu_rb_handler *handler = find_mmu_handler(root); + struct mmu_rb_node *node; + unsigned long flags; + + if (!handler) + return ERR_PTR(-EINVAL); + + spin_lock_irqsave(&handler->lock, flags); + node = __mmu_rb_search(handler, addr, len); + if (node) + __mmu_int_rb_remove(node, handler->root); + spin_unlock_irqrestore(&handler->lock, flags); + + return node; +} + void hfi1_mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node) { struct mmu_rb_handler *handler = find_mmu_handler(root); diff --git a/drivers/staging/rdma/hfi1/mmu_rb.h b/drivers/staging/rdma/hfi1/mmu_rb.h index 19a306e83c7d..7a57b9c49d27 100644 --- a/drivers/staging/rdma/hfi1/mmu_rb.h +++ b/drivers/staging/rdma/hfi1/mmu_rb.h @@ -70,5 +70,7 @@ int hfi1_mmu_rb_insert(struct rb_root *, struct mmu_rb_node *); void hfi1_mmu_rb_remove(struct rb_root *, struct mmu_rb_node *); struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *, unsigned long, unsigned long); +struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *, unsigned long, + unsigned long); #endif /* _HFI1_MMU_RB_H */ diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/staging/rdma/hfi1/user_sdma.c index d1645d98a43d..f7e2fe7ed9db 100644 --- a/drivers/staging/rdma/hfi1/user_sdma.c +++ b/drivers/staging/rdma/hfi1/user_sdma.c @@ -1062,9 +1062,9 @@ static int pin_vector_pages(struct user_sdma_request *req, struct sdma_mmu_node *node = NULL; struct mmu_rb_node *rb_node; - rb_node = hfi1_mmu_rb_search(&pq->sdma_rb_root, - (unsigned long)iovec->iov.iov_base, - iovec->iov.iov_len); + rb_node = hfi1_mmu_rb_extract(&pq->sdma_rb_root, + (unsigned long)iovec->iov.iov_base, + iovec->iov.iov_len); if (rb_node && !IS_ERR(rb_node)) node = container_of(rb_node, struct sdma_mmu_node, rb); else @@ -1131,25 +1131,20 @@ static int pin_vector_pages(struct user_sdma_request *req, iovec->pages = node->pages; iovec->npages = npages; - if (!rb_node) { - ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb); - if (ret) { - spin_lock(&pq->evict_lock); - if (!list_empty(&node->list)) - list_del(&node->list); - pq->n_locked -= node->npages; - spin_unlock(&pq->evict_lock); - unpin_vector_pages(current->mm, node->pages, 0, - node->npages); - goto bail; - } - } else { - atomic_inc(&node->refcount); + ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb); + if (ret) { + spin_lock(&pq->evict_lock); + if (!list_empty(&node->list)) + list_del(&node->list); + pq->n_locked -= node->npages; + spin_unlock(&pq->evict_lock); + goto bail; } return 0; bail: - if (!rb_node) - kfree(node); + if (rb_node) + unpin_vector_pages(current->mm, node->pages, 0, node->npages); + kfree(node); return ret; } From e88c9271d9f8db79d8104f7f7bd14cb8d88cc187 Mon Sep 17 00:00:00 2001 From: Mitko Haralanov Date: Tue, 12 Apr 2016 10:46:53 -0700 
Subject: [PATCH 08/46] IB/hfi1: Fix buffer cache races which may cause corruption

There are two possible causes for node/memory corruption, both of
which are related to the cache eviction algorithm.

One way to cause corruption is due to the asynchronous nature of the
MMU invalidation and the locking used when invalidating a node. The
MMU invalidation routine would temporarily release the RB tree lock to
avoid a deadlock. However, this would allow the eviction function to
take the lock, resulting in the removal of cache nodes. If the node
being removed by the eviction code is the same as the node being
invalidated, the result is a use after free. The same is true in the
other direction due to the temporary release of the eviction list lock
in the eviction loop.

Another corner case exists when dealing with the SDMA buffer cache
that could corrupt kernel memory. The most common way in which this
corruption exhibits itself is linked list node corruption. In that
case, the kernel will complain that a node with poisoned pointers is
being removed. The fact that the pointers are already poisoned means
that the node has already been removed from the list.

The root cause of this corruption was a mishandling of the eviction
list maintained by the driver. In order for this to happen, four
conditions need to be satisfied:

  1. A node describing a user buffer already exists in the interval
     RB tree,
  2. The current user buffer starts at the same address as that
     node's buffer but is bigger. This will cause the node to be
     extended.
  3. The amount of cached buffers is close to or at the limit of the
     buffer cache size.
  4. The node has dropped close to the end of the eviction list. This
     will cause the node to be considered for eviction.

If all of the above conditions have been satisfied, it is possible for
the eviction algorithm to evict the current node, which will free the
node without the driver knowing.

To solve both issues described above:
  - the locking around the MMU invalidation loop and cache eviction
    loop has been improved so locks are not released in the loop body
    (a userspace sketch of this pattern follows below),
  - a new RB function is introduced which will "atomically" find and
    remove the matching node from the RB tree, preventing the MMU
    invalidation loop from touching it, and
  - the node being extended by the pin_vector_pages() function is
    removed from the eviction list prior to calling the eviction
    function.
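The collect-under-lock pattern from the first bullet, modeled as
standalone userspace C (illustrative types and a pthread mutex; the
driver's actual code is in the diff below): candidates are moved to a
private list while the lock is held, and the expensive teardown runs
on that private list after the lock is dropped, so the lock is never
released inside the traversal.

#include <pthread.h>
#include <stdio.h>

struct node { int busy, npages; struct node *next; };

static pthread_mutex_t evict_lock = PTHREAD_MUTEX_INITIALIZER;

static int evict(struct node **shared, int want)
{
	struct node *priv = NULL, *n, **pp;
	int cleared = 0;

	pthread_mutex_lock(&evict_lock);
	for (pp = shared; (n = *pp) != NULL && cleared < want;) {
		if (!n->busy) {		/* refcount == 0 in the driver */
			*pp = n->next;	/* move to the private list */
			n->next = priv;
			priv = n;
			cleared += n->npages;
		} else {
			pp = &n->next;
		}
	}
	pthread_mutex_unlock(&evict_lock);

	/* Heavy work (RB removal, unpinning) happens lock-free here,
	 * on nodes no other thread can reach any more. */
	for (n = priv; n; n = n->next)
		printf("evicting %d pages\n", n->npages);
	return cleared;
}

int main(void)
{
	struct node c = { 1, 8, NULL }, b = { 0, 4, &c }, a = { 0, 4, &b };
	struct node *list = &a;

	printf("cleared %d\n", evict(&list, 8));
	return 0;
}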
Reviewed-by: Dean Luick Signed-off-by: Mitko Haralanov Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/mmu_rb.c | 6 +-- drivers/staging/rdma/hfi1/user_sdma.c | 56 ++++++++++++++++++++------- 2 files changed, 45 insertions(+), 17 deletions(-) diff --git a/drivers/staging/rdma/hfi1/mmu_rb.c b/drivers/staging/rdma/hfi1/mmu_rb.c index 03f360accacd..2b0e91d3093d 100644 --- a/drivers/staging/rdma/hfi1/mmu_rb.c +++ b/drivers/staging/rdma/hfi1/mmu_rb.c @@ -316,9 +316,9 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn, hfi1_cdbg(MMU, "Invalidating node addr 0x%llx, len %u", node->addr, node->len); if (handler->ops->invalidate(root, node)) { - spin_unlock_irqrestore(&handler->lock, flags); - __mmu_rb_remove(handler, node, mm); - spin_lock_irqsave(&handler->lock, flags); + __mmu_int_rb_remove(node, root); + if (handler->ops->remove) + handler->ops->remove(root, node, mm); } } spin_unlock_irqrestore(&handler->lock, flags); diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/staging/rdma/hfi1/user_sdma.c index f7e2fe7ed9db..635ddf8b406d 100644 --- a/drivers/staging/rdma/hfi1/user_sdma.c +++ b/drivers/staging/rdma/hfi1/user_sdma.c @@ -180,6 +180,8 @@ struct user_sdma_iovec { u64 offset; }; +#define SDMA_CACHE_NODE_EVICT BIT(0) + struct sdma_mmu_node { struct mmu_rb_node rb; struct list_head list; @@ -187,6 +189,7 @@ struct sdma_mmu_node { atomic_t refcount; struct page **pages; unsigned npages; + unsigned long flags; }; struct user_sdma_request { @@ -1030,27 +1033,29 @@ static inline int num_user_pages(const struct iovec *iov) return 1 + ((epage - spage) >> PAGE_SHIFT); } -/* Caller must hold pq->evict_lock */ static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages) { u32 cleared = 0; struct sdma_mmu_node *node, *ptr; + struct list_head to_evict = LIST_HEAD_INIT(to_evict); + spin_lock(&pq->evict_lock); list_for_each_entry_safe_reverse(node, ptr, &pq->evict, list) { /* Make sure that no one is still using the node. */ if (!atomic_read(&node->refcount)) { - /* - * Need to use the page count now as the remove callback - * will free the node. - */ + set_bit(SDMA_CACHE_NODE_EVICT, &node->flags); + list_del_init(&node->list); + list_add(&node->list, &to_evict); cleared += node->npages; - spin_unlock(&pq->evict_lock); - hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb); - spin_lock(&pq->evict_lock); if (cleared >= npages) break; } } + spin_unlock(&pq->evict_lock); + + list_for_each_entry_safe(node, ptr, &to_evict, list) + hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb); + return cleared; } @@ -1092,11 +1097,25 @@ static int pin_vector_pages(struct user_sdma_request *req, memcpy(pages, node->pages, node->npages * sizeof(*pages)); npages -= node->npages; + + /* + * If rb_node is NULL, it means that this is brand new node + * and, therefore not on the eviction list. + * If, however, the rb_node is non-NULL, it means that the + * node is already in RB tree and, therefore on the eviction + * list (nodes are unconditionally inserted in the eviction + * list). In that case, we have to remove the node prior to + * calling the eviction function in order to prevent it from + * freeing this node. 
+	 */
+	if (rb_node) {
+		spin_lock(&pq->evict_lock);
+		list_del_init(&node->list);
+		spin_unlock(&pq->evict_lock);
+	}
 retry:
 	if (!hfi1_can_pin_pages(pq->dd, pq->n_locked, npages)) {
-		spin_lock(&pq->evict_lock);
 		cleared = sdma_cache_evict(pq, npages);
-		spin_unlock(&pq->evict_lock);
 		if (cleared >= npages)
 			goto retry;
 	}
@@ -1121,10 +1140,7 @@ static int pin_vector_pages(struct user_sdma_request *req,
 		node->npages += pinned;
 		npages = node->npages;
 		spin_lock(&pq->evict_lock);
-		if (!rb_node)
-			list_add(&node->list, &pq->evict);
-		else
-			list_move(&node->list, &pq->evict);
+		list_add(&node->list, &pq->evict);
 		pq->n_locked += pinned;
 		spin_unlock(&pq->evict_lock);
 	}
@@ -1555,6 +1571,18 @@ static void sdma_rb_remove(struct rb_root *root, struct mmu_rb_node *mnode,
 		container_of(mnode, struct sdma_mmu_node, rb);

 	spin_lock(&node->pq->evict_lock);
+	/*
+	 * We've been called by the MMU notifier but this node has been
+	 * scheduled for eviction. The eviction function will take care
+	 * of freeing this node.
+	 * We have to take the above lock first because we are racing
+	 * against the setting of the bit in the eviction function.
+	 */
+	if (mm && test_bit(SDMA_CACHE_NODE_EVICT, &node->flags)) {
+		spin_unlock(&node->pq->evict_lock);
+		return;
+	}
+
 	if (!list_empty(&node->list))
 		list_del(&node->list);
 	node->pq->n_locked -= node->npages;
From ea0e4ce3bcccef360e1aa69d17a210d1221ab80c Mon Sep 17 00:00:00 2001
From: Jubin John 
Date: Wed, 20 Apr 2016 06:05:24 -0700
Subject: [PATCH 09/46] IB/rdmavt,hfi1,qib: Fix memory leak

rdi->ports has memory allocated in rvt_alloc_device(), but does not
get freed because the hfi1 and qib drivers call ib_dealloc_device()
directly instead of going through rdmavt. Add a rvt_dealloc_device()
that frees rdi->ports and then calls ib_dealloc_device(). Switch hfi1
and qib drivers to calling rvt_dealloc_device() instead of
ib_dealloc_device() directly.
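The shape of the fix, as a minimal standalone sketch (generic names,
not the rdmavt structures): whatever the alloc helper allocates - the
container plus its sub-allocation - the matching dealloc helper frees,
so callers never release the container directly.

#include <stdlib.h>

struct port { int id; };
struct dev { struct port *ports; };

static struct dev *dev_alloc(int nports)
{
	struct dev *d = calloc(1, sizeof(*d));

	if (!d)
		return NULL;
	d->ports = calloc(nports, sizeof(*d->ports));
	if (!d->ports) {
		free(d);
		return NULL;
	}
	return d;
}

static void dev_dealloc(struct dev *d)
{
	free(d->ports);	/* free the sub-allocation first... */
	free(d);	/* ...then the container itself */
}

int main(void)
{
	struct dev *d = dev_alloc(2);

	if (d)
		dev_dealloc(d);	/* never plain free(d) from callers */
	return 0;
}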
Reviewed-by: Dennis Dalessandro Reviewed-by: Brian Welty Signed-off-by: Jubin John Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/qib/qib_init.c | 4 ++-- drivers/infiniband/sw/rdmavt/vt.c | 13 +++++++++++++ drivers/staging/rdma/hfi1/init.c | 4 ++-- include/rdma/rdma_vt.h | 1 + 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 3f062f0dd9d8..f253111e682e 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -1090,7 +1090,7 @@ void qib_free_devdata(struct qib_devdata *dd) qib_dbg_ibdev_exit(&dd->verbs_dev); #endif free_percpu(dd->int_counter); - ib_dealloc_device(&dd->verbs_dev.rdi.ibdev); + rvt_dealloc_device(&dd->verbs_dev.rdi); } u64 qib_int_counter(struct qib_devdata *dd) @@ -1183,7 +1183,7 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra) bail: if (!list_empty(&dd->list)) list_del_init(&dd->list); - ib_dealloc_device(&dd->verbs_dev.rdi.ibdev); + rvt_dealloc_device(&dd->verbs_dev.rdi); return ERR_PTR(ret); } diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 6caf5272ba1f..e1cc2cc42f25 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -106,6 +106,19 @@ struct rvt_dev_info *rvt_alloc_device(size_t size, int nports) } EXPORT_SYMBOL(rvt_alloc_device); +/** + * rvt_dealloc_device - deallocate rdi + * @rdi: structure to free + * + * Free a structure allocated with rvt_alloc_device() + */ +void rvt_dealloc_device(struct rvt_dev_info *rdi) +{ + kfree(rdi->ports); + ib_dealloc_device(&rdi->ibdev); +} +EXPORT_SYMBOL(rvt_dealloc_device); + static int rvt_query_device(struct ib_device *ibdev, struct ib_device_attr *props, struct ib_udata *uhw) diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/staging/rdma/hfi1/init.c index cfcdc16b41c3..00edd500a69a 100644 --- a/drivers/staging/rdma/hfi1/init.c +++ b/drivers/staging/rdma/hfi1/init.c @@ -1007,7 +1007,7 @@ void hfi1_free_devdata(struct hfi1_devdata *dd) free_percpu(dd->rcv_limit); hfi1_dev_affinity_free(dd); free_percpu(dd->send_schedule); - ib_dealloc_device(&dd->verbs_dev.rdi.ibdev); + rvt_dealloc_device(&dd->verbs_dev.rdi); } /* @@ -1110,7 +1110,7 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) bail: if (!list_empty(&dd->list)) list_del_init(&dd->list); - ib_dealloc_device(&dd->verbs_dev.rdi.ibdev); + rvt_dealloc_device(&dd->verbs_dev.rdi); return ERR_PTR(ret); } diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index a8696551abb1..d57ceee90d26 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -467,6 +467,7 @@ static inline struct rvt_qp *rvt_lookup_qpn(struct rvt_dev_info *rdi, } struct rvt_dev_info *rvt_alloc_device(size_t size, int nports); +void rvt_dealloc_device(struct rvt_dev_info *rdi); int rvt_register_device(struct rvt_dev_info *rvd); void rvt_unregister_device(struct rvt_dev_info *rvd); int rvt_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr); From 4ee1585972c0772a5126d7242a906b18acee7b0f Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Tue, 12 Apr 2016 10:49:58 -0700 Subject: [PATCH 10/46] IB/hfi1: Fix sysfs file offset usage Two sysfs files do not pay attention to the file offset when reading data. Fix that. 
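What an offset-aware binary read has to do, sketched as standalone C
with an illustrative signature (the real sysfs handlers also validate
count against the object size, as the diff below shows):

#include <stdio.h>
#include <string.h>
#include <sys/types.h>

/* Copy 'count' bytes starting at 'pos', not always from the start. */
static ssize_t read_blob(const void *blob, size_t blob_len,
			 char *buf, size_t pos, size_t count)
{
	if (pos >= blob_len)
		return 0;		/* past the end: EOF */
	if (count > blob_len - pos)
		count = blob_len - pos;	/* clamp to what remains */
	memcpy(buf, (const char *)blob + pos, count);
	return count;
}

int main(void)
{
	char out[4] = { 0 };

	/* A continued read at pos=4 must see "4567", not "0123" again. */
	read_blob("01234567", 8, out, 4, 4);
	printf("%.4s\n", out);
	return 0;
}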
Reviewed-by: Dennis Dalessandro 
Signed-off-by: Dean Luick 
Signed-off-by: Jubin John 
Signed-off-by: Doug Ledford 
---
 drivers/staging/rdma/hfi1/sysfs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/rdma/hfi1/sysfs.c b/drivers/staging/rdma/hfi1/sysfs.c
index c7f1271190af..8cd6df8634ad 100644
--- a/drivers/staging/rdma/hfi1/sysfs.c
+++ b/drivers/staging/rdma/hfi1/sysfs.c
@@ -84,7 +84,7 @@ static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj,
 		rcu_read_unlock();
 		return -EINVAL;
 	}
-	memcpy(buf, &cc_state->cct, count);
+	memcpy(buf, (void *)&cc_state->cct + pos, count);
 	rcu_read_unlock();

 	return count;
@@ -131,7 +131,7 @@ static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj,
 		rcu_read_unlock();
 		return -EINVAL;
 	}
-	memcpy(buf, &cc_state->cong_setting, count);
+	memcpy(buf, (void *)&cc_state->cong_setting + pos, count);
 	rcu_read_unlock();

 	return count;
From f9c82a0b75a120ca3e9542a7a6e60244d04c915a Mon Sep 17 00:00:00 2001
From: Dean Luick 
Date: Tue, 12 Apr 2016 10:50:04 -0700
Subject: [PATCH 11/46] IB/hfi1: Fix i2c resource reservation checks

The i2c and qsfp read/write routines should check for the resource
reservation of the incoming argument target rather than the implicit
target of the hardware HFI.

Reviewed-by: Easwar Hariharan 
Signed-off-by: Dean Luick 
Signed-off-by: Jubin John 
Signed-off-by: Doug Ledford 
---
 drivers/staging/rdma/hfi1/qsfp.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/staging/rdma/hfi1/qsfp.c b/drivers/staging/rdma/hfi1/qsfp.c
index 9ed1963010fe..ac03d80d27cc 100644
--- a/drivers/staging/rdma/hfi1/qsfp.c
+++ b/drivers/staging/rdma/hfi1/qsfp.c
@@ -96,7 +96,7 @@ int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
 {
 	int ret;

-	if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__))
+	if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
 		return -EACCES;

 	/* make sure the TWSI bus is in a sane state */
@@ -162,7 +162,7 @@ int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
 {
 	int ret;

-	if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__))
+	if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
 		return -EACCES;

 	/* make sure the TWSI bus is in a sane state */
@@ -192,7 +192,7 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
 	int ret;
 	u8 page;

-	if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__))
+	if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
 		return -EACCES;

 	/* make sure the TWSI bus is in a sane state */
@@ -276,7 +276,7 @@ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
 	int ret;
 	u8 page;

-	if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__))
+	if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
 		return -EACCES;

 	/* make sure the TWSI bus is in a sane state */
From 153d58cd8e2871ec99d3a6570dd26b6f569a8b5e Mon Sep 17 00:00:00 2001
From: Dean Luick 
Date: Tue, 12 Apr 2016 10:50:10 -0700
Subject: [PATCH 12/46] IB/hfi1: Fix QOS num_vl bit width

The bit width for num_vls, n, needs to be calculated from the number
of VLs rounded up to the next power of 2. Otherwise, num_vls values of
3, 5, 6, and 7 will have misplaced QOS RSM map entries.
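A worked example, as a standalone program with toy reimplementations
of the kernel's ilog2()/__roundup_pow_of_two() helpers, showing the
old width coming up one bit short for 3, 5, 6, and 7 VLs:

#include <stdio.h>

static unsigned ilog2(unsigned v)
{
	unsigned n = 0;

	while (v >>= 1)
		n++;
	return n;
}

static unsigned roundup_pow_of_two(unsigned v)
{
	unsigned p = 1;

	while (p < v)
		p <<= 1;
	return p;
}

int main(void)
{
	/* e.g. 3 VLs need 2 bits to encode values 0..2, but
	 * ilog2(3) == 1; rounding up to 4 first gives the right 2. */
	for (unsigned vls = 1; vls <= 8; vls++)
		printf("num_vls=%u old n=%u new n=%u\n", vls,
		       ilog2(vls), ilog2(roundup_pow_of_two(vls)));
	return 0;
}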
Reviewed-by: Mike Marciniszyn 
Signed-off-by: Dean Luick 
Signed-off-by: Doug Ledford 
---
 drivers/staging/rdma/hfi1/chip.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c
index 16eb653903e0..f1efbe56536a 100644
--- a/drivers/staging/rdma/hfi1/chip.c
+++ b/drivers/staging/rdma/hfi1/chip.c
@@ -13508,7 +13508,7 @@ static void init_qos(struct hfi1_devdata *dd, u32 first_ctxt)
 		goto bail;
 	qpns_per_vl = __roundup_pow_of_two(max_by_vl);
 	/* determine bits vl */
-	n = ilog2(num_vls);
+	n = ilog2(__roundup_pow_of_two(num_vls));
 	/* determine bits for qpn */
 	m = ilog2(qpns_per_vl);
 	if ((m + n) > 7)
From 35969d9b949a479bebc72cf8049425cd05b3b28e Mon Sep 17 00:00:00 2001
From: Dean Luick 
Date: Tue, 12 Apr 2016 10:50:16 -0700
Subject: [PATCH 13/46] IB/hfi1: Remove invalid QOS check

Remove an invalid compare of the number of QOS RSM map table entries
against the number of physical receive contexts. The RSM map table has
its own size and has no relation to the number of physical receive
contexts.

Reviewed-by: Mike Marciniszyn 
Signed-off-by: Dean Luick 
Signed-off-by: Doug Ledford 
---
 drivers/staging/rdma/hfi1/chip.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c
index f1efbe56536a..8b54d66cfc50 100644
--- a/drivers/staging/rdma/hfi1/chip.c
+++ b/drivers/staging/rdma/hfi1/chip.c
@@ -13513,8 +13513,6 @@ static void init_qos(struct hfi1_devdata *dd, u32 first_ctxt)
 	m = ilog2(qpns_per_vl);
 	if ((m + n) > 7)
 		goto bail;
-	if (num_vls * qpns_per_vl > dd->chip_rcv_contexts)
-		goto bail;
 	rsmmap = kmalloc_array(NUM_MAP_REGS, sizeof(u64), GFP_KERNEL);
 	if (!rsmmap)
 		goto bail;
From 33a9eb527172cb46711705c858ccd605e32c82f9 Mon Sep 17 00:00:00 2001
From: Dean Luick 
Date: Tue, 12 Apr 2016 10:50:22 -0700
Subject: [PATCH 14/46] IB/hfi1: Fix QOS rule mappings

The QOS RSM rule mappings are off by one, referencing a kernel receive
context that does not exist. Correctly start the QOS RSM map entries
at FIRST_KERNEL_KCTXT rather than MIN_KERNEL_KCTXTS. Remove the cruft
that hid this. Change the QP map table so all traffic not caught by
QOS RSM goes to the control context rather than the first QOS context.
Correct comments to match the actual code operation and intent.

Reviewed-by: Mike Marciniszyn 
Signed-off-by: Dean Luick 
Signed-off-by: Doug Ledford 
---
 drivers/staging/rdma/hfi1/chip.c | 48 ++++++++++++++------------------
 1 file changed, 21 insertions(+), 27 deletions(-)

diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c
index 8b54d66cfc50..79dc29ae6e68 100644
--- a/drivers/staging/rdma/hfi1/chip.c
+++ b/drivers/staging/rdma/hfi1/chip.c
@@ -12675,20 +12675,20 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
 	unsigned ngroups;

 	/*
-	 * Kernel contexts: (to be fixed later):
-	 * - min or 2 or 1 context/numa
+	 * Kernel receive contexts:
+	 * - min of 2 or 1 context/numa (excluding control context)
 	 * - Context 0 - control context (VL15/multicast/error)
-	 * - Context 1 - default context
+	 * - Context 1 - first kernel context
+	 * - Context 2 - second kernel context
+	 * ...
 	 */
 	if (n_krcvqs)
 		/*
-		 * Don't count context 0 in n_krcvqs since
-		 * is isn't used for normal verbs traffic.
-		 *
-		 * krcvqs will reflect number of kernel
-		 * receive contexts above 0.
+		 * n_krcvqs is the sum of module parameter kernel receive
+		 * contexts, krcvqs[]. It does not include the control
+		 * context, so add that.
*/ - num_kernel_contexts = n_krcvqs + MIN_KERNEL_KCTXTS - 1; + num_kernel_contexts = n_krcvqs + 1; else num_kernel_contexts = num_online_nodes() + 1; num_kernel_contexts = @@ -13473,22 +13473,17 @@ static void init_qpmap_table(struct hfi1_devdata *dd, /** * init_qos - init RX qos * @dd - device data - * @first_context * - * This routine initializes Rule 0 and the - * RSM map table to implement qos. + * This routine initializes Rule 0 and the RSM map table to implement + * quality of service (qos). * - * If all of the limit tests succeed, - * qos is applied based on the array - * interpretation of krcvqs where - * entry 0 is VL0. - * - * The number of vl bits (n) and the number of qpn - * bits (m) are computed to feed both the RSM map table - * and the single rule. + * If all of the limit tests succeed, qos is applied based on the array + * interpretation of krcvqs where entry 0 is VL0. * + * The number of vl bits (n) and the number of qpn bits (m) are computed to + * feed both the RSM map table and the single rule. */ -static void init_qos(struct hfi1_devdata *dd, u32 first_ctxt) +static void init_qos(struct hfi1_devdata *dd) { u8 max_by_vl = 0; unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m; @@ -13518,7 +13513,7 @@ static void init_qos(struct hfi1_devdata *dd, u32 first_ctxt) goto bail; memset(rsmmap, rxcontext, NUM_MAP_REGS * sizeof(u64)); /* init the local copy of the table */ - for (i = 0, ctxt = first_ctxt; i < num_vls; i++) { + for (i = 0, ctxt = FIRST_KERNEL_KCTXT; i < num_vls; i++) { unsigned tctxt; for (qpn = 0, tctxt = ctxt; @@ -13546,7 +13541,7 @@ static void init_qos(struct hfi1_devdata *dd, u32 first_ctxt) /* add rule0 */ write_csr(dd, RCV_RSM_CFG /* + (8 * 0) */, RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK << - RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT | + RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT | 2ull << RCV_RSM_CFG_PACKET_TYPE_SHIFT); write_csr(dd, RCV_RSM_SELECT /* + (8 * 0) */, LRH_BTH_MATCH_OFFSET << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT | @@ -13563,8 +13558,8 @@ static void init_qos(struct hfi1_devdata *dd, u32 first_ctxt) /* Enable RSM */ add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK); kfree(rsmmap); - /* map everything else to first context */ - init_qpmap_table(dd, FIRST_KERNEL_KCTXT, MIN_KERNEL_KCTXTS - 1); + /* map everything else to the mcast/err/vl15 context */ + init_qpmap_table(dd, HFI1_CTRL_CTXT, HFI1_CTRL_CTXT); dd->qos_shift = n + 1; return; bail: @@ -13577,8 +13572,7 @@ static void init_rxe(struct hfi1_devdata *dd) /* enable all receive errors */ write_csr(dd, RCV_ERR_MASK, ~0ull); /* setup QPN map table - start where VL15 context leaves off */ - init_qos(dd, dd->n_krcv_queues > MIN_KERNEL_KCTXTS ? - MIN_KERNEL_KCTXTS : 0); + init_qos(dd); /* * make sure RcvCtrl.RcvWcb <= PCIe Device Control * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config From de882ff5b8640c34d5f1d5dfd2f22f405465aa19 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Tue, 12 Apr 2016 10:50:28 -0700 Subject: [PATCH 15/46] IB/hfi1: Correctly obtain the full service class The function hdr2sc was using an unshifted mask to obtain the 5th bit of the service class. Correct the issue by using the shifted mask. 
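To see the difference, a standalone example with a made-up bit
position (the real RHF field position lives in the hardware headers;
the shift below is only illustrative): the unshifted mask tests the
wrong bit of the register, the shifted mask tests the bit where the
field actually lives.

#include <stdio.h>
#include <stdint.h>

/* Illustrative 1-bit field at an assumed bit position 16. MASK is the
 * field mask before shifting; SMASK is shifted into place. */
#define DC_INFO_SHIFT 16
#define DC_INFO_MASK  0x1ull
#define DC_INFO_SMASK (DC_INFO_MASK << DC_INFO_SHIFT)

int main(void)
{
	uint64_t rhf = DC_INFO_SMASK;	/* field is set in the register */

	/* wrong: tests bit 0 of rhf, not the DC_INFO bit -> prints 0 */
	printf("unshifted: %d\n", !!(rhf & DC_INFO_MASK));
	/* right: tests the field's actual position -> prints 1 */
	printf("shifted:   %d\n", !!(rhf & DC_INFO_SMASK));
	return 0;
}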
Reviewed-by: Dennis Dalessandro Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/hfi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/staging/rdma/hfi1/hfi.h index 16cbdc4073e0..ac553f1ed730 100644 --- a/drivers/staging/rdma/hfi1/hfi.h +++ b/drivers/staging/rdma/hfi1/hfi.h @@ -1258,7 +1258,7 @@ void receive_interrupt_work(struct work_struct *work); static inline int hdr2sc(struct hfi1_message_header *hdr, u64 rhf) { return ((be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf) | - ((!!(rhf & RHF_DC_INFO_MASK)) << 4); + ((!!(rhf & RHF_DC_INFO_SMASK)) << 4); } static inline u16 generate_jkey(kuid_t uid) From 60d585ad6e64bac87a676df4d941c823360f6ae1 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Tue, 12 Apr 2016 10:50:35 -0700 Subject: [PATCH 16/46] IB/hfi1: Simplify init_qpmap_table() Make init_qpmap_table() easier to understand by simplifying the loop indexing and writing each register when it is "full", removing the need for a follow-on register write. Reviewed-by: Dennis Dalessandro Reviewed-by: Mike Marciniszyn Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/chip.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c index 79dc29ae6e68..56753c65b614 100644 --- a/drivers/staging/rdma/hfi1/chip.c +++ b/drivers/staging/rdma/hfi1/chip.c @@ -13451,20 +13451,17 @@ static void init_qpmap_table(struct hfi1_devdata *dd, int i; u64 ctxt = first_ctxt; - for (i = 0; i < 256;) { + for (i = 0; i < 256; i++) { reg |= ctxt << (8 * (i % 8)); - i++; ctxt++; if (ctxt > last_ctxt) ctxt = first_ctxt; - if (i % 8 == 0) { + if (i % 8 == 7) { write_csr(dd, regno, reg); reg = 0; regno += 8; } } - if (i % 8) - write_csr(dd, regno, reg); add_rcvctrl(dd, RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK | RCV_CTRL_RCV_BYPASS_ENABLE_SMASK); From ef699e849c081d8123d574b3aa279f4e550de4ad Mon Sep 17 00:00:00 2001 From: Sebastian Sanchez Date: Tue, 12 Apr 2016 11:17:09 -0700 Subject: [PATCH 17/46] IB/hfi1: Adjust default MTU to be 10KB Increasing the default MTU size to 10KB improves performance for PSM. Change the default MTU to 10KB but constrain Verbs MTU to 8KB. Also update default MTU module parameter description to be HFI1_DEFAULT_MAX_MTU. 
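A sketch of the verbs clamp as a standalone program; the enum values
are assumed to mirror the OPA MTU encoding (an extension of the IB MTU
enum), and opa_mtu_enum_to_int() is reduced to the two cases of
interest, so treat this as illustrative rather than the driver's
actual tables:

#include <stdio.h>

enum { IB_MTU_4096 = 5, OPA_MTU_8192 = 6, OPA_MTU_10240 = 7 };

static int opa_mtu_enum_to_int(int mtu)
{
	switch (mtu) {
	case OPA_MTU_8192:
		return 8192;
	case OPA_MTU_10240:
		return 10240;
	default:
		return -1;
	}
}

static int verbs_mtu_enum_to_int(int mtu)
{
	/* constrain 10KB packets to 8KB for verbs QPs */
	if (mtu == OPA_MTU_10240)
		mtu = OPA_MTU_8192;
	return opa_mtu_enum_to_int(mtu);
}

int main(void)
{
	printf("%d\n", verbs_mtu_enum_to_int(OPA_MTU_10240)); /* 8192 */
	return 0;
}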
Reviewed-by: Dean Luick 
Reviewed-by: Mitko Haralanov 
Reviewed-by: Mike Marciniszyn 
Reviewed-by: Jubin John 
Signed-off-by: Sebastian Sanchez 
Signed-off-by: Dennis Dalessandro 
Signed-off-by: Doug Ledford 
---
 drivers/staging/rdma/hfi1/driver.c | 3 ++-
 drivers/staging/rdma/hfi1/hfi.h | 4 ++--
 drivers/staging/rdma/hfi1/qp.c | 6 +++++-
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/staging/rdma/hfi1/driver.c b/drivers/staging/rdma/hfi1/driver.c
index 34511e5df1d5..700c6fa3a633 100644
--- a/drivers/staging/rdma/hfi1/driver.c
+++ b/drivers/staging/rdma/hfi1/driver.c
@@ -75,7 +75,8 @@ DEFINE_MUTEX(hfi1_mutex);	/* general driver use */

 unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
 module_param_named(max_mtu, hfi1_max_mtu, uint, S_IRUGO);
-MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is 8192");
+MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is " __stringify(
+		 HFI1_DEFAULT_MAX_MTU));

 unsigned int hfi1_cu = 1;
 module_param_named(cu, hfi1_cu, uint, S_IRUGO);
diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/staging/rdma/hfi1/hfi.h
index ac553f1ed730..ff04593eabe2 100644
--- a/drivers/staging/rdma/hfi1/hfi.h
+++ b/drivers/staging/rdma/hfi1/hfi.h
@@ -455,9 +455,9 @@ struct rvt_sge_state;
 #define HLS_UP (HLS_UP_INIT | HLS_UP_ARMED | HLS_UP_ACTIVE)

 /* use this MTU size if none other is given */
-#define HFI1_DEFAULT_ACTIVE_MTU 8192
+#define HFI1_DEFAULT_ACTIVE_MTU 10240
 /* use this MTU size as the default maximum */
-#define HFI1_DEFAULT_MAX_MTU 8192
+#define HFI1_DEFAULT_MAX_MTU 10240
 /* default partition key */
 #define DEFAULT_PKEY 0xffff

diff --git a/drivers/staging/rdma/hfi1/qp.c b/drivers/staging/rdma/hfi1/qp.c
index dc9119e1b458..91eb42316df9 100644
--- a/drivers/staging/rdma/hfi1/qp.c
+++ b/drivers/staging/rdma/hfi1/qp.c
@@ -167,8 +167,12 @@ static inline int opa_mtu_enum_to_int(int mtu)
 */
 static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu)
 {
-	int val = opa_mtu_enum_to_int((int)mtu);
+	int val;

+	/* Constraining 10KB packets to 8KB packets */
+	if (mtu == (enum ib_mtu)OPA_MTU_10240)
+		mtu = OPA_MTU_8192;
+	val = opa_mtu_enum_to_int((int)mtu);
 	if (val > 0)
 		return val;
 	return ib_mtu_enum_to_int(mtu);
From e38d1e4f5099d533a833afee89f439853c94f272 Mon Sep 17 00:00:00 2001
From: Sebastian Sanchez 
Date: Tue, 12 Apr 2016 11:22:21 -0700
Subject: [PATCH 18/46] IB/hfi1: Check P_KEY for all sent packets from user mode

Add the P_KEY check for the user-context mechanism for both PIO and
SDMA. For PIO, the SendCtxtCheckEnable.DisallowKDETHPackets is set by
default. When the P_KEY is set,
SendCtxtCheckEnable.DisallowKDETHPackets is cleared. For SDMA, a
software check was included. This change requires user processes to
set the P_KEY before sending any packets; otherwise, the sent packets
will fail. The original submission didn't have this check but it's
required.
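The egress matching rule that both the PIO and SDMA paths rely on
(OPAv1 spec., section 9.11.7), modeled as standalone C with
illustrative pkey values: the low 15 bits must match, and a
full-member pkey (bit 15 set) may only match a full-member table
entry.

#include <stdio.h>
#include <stdint.h>

#define PKEY_LOW_15_MASK 0x7fff

static int pkey_matches_entry(uint16_t pkey, uint16_t ent)
{
	if ((pkey & PKEY_LOW_15_MASK) != (ent & PKEY_LOW_15_MASK))
		return 0;
	if (pkey & 0x8000)		/* full member pkey... */
		return !!(ent & 0x8000);/* ...needs full member entry */
	return 1;			/* limited member matches either */
}

int main(void)
{
	printf("%d %d %d\n",
	       pkey_matches_entry(0x8001, 0x8001),	/* 1: exact */
	       pkey_matches_entry(0x0001, 0x8001),	/* 1: limited vs full */
	       pkey_matches_entry(0x8001, 0x0001));	/* 0: full vs limited */
	return 0;
}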
Reviewed-by: Dean Luick 
Reviewed-by: Dennis Dalessandro 
Reviewed-by: Mitko Haralanov 
Signed-off-by: Sebastian Sanchez 
Signed-off-by: Doug Ledford 
---
 drivers/staging/rdma/hfi1/chip.c | 1 +
 drivers/staging/rdma/hfi1/hfi.h | 4 +
 drivers/staging/rdma/hfi1/user_sdma.c | 7 ++
 drivers/staging/rdma/hfi1/verbs.c | 104 ++++++++++++++++----------
 4 files changed, 76 insertions(+), 40 deletions(-)

diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c
index 56753c65b614..b0a0a0d2cbad 100644
--- a/drivers/staging/rdma/hfi1/chip.c
+++ b/drivers/staging/rdma/hfi1/chip.c
@@ -13751,6 +13751,7 @@ int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey)
 	write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg);
 	reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
 	reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
+	reg &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK;
 	write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
 done:
 	return ret;
diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/staging/rdma/hfi1/hfi.h
index ff04593eabe2..b1d4f605c14d 100644
--- a/drivers/staging/rdma/hfi1/hfi.h
+++ b/drivers/staging/rdma/hfi1/hfi.h
@@ -1333,6 +1333,9 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
 void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn,
 		u32 pkey, u32 slid, u32 dlid, u8 sc5,
 		const struct ib_grh *old_grh);
+#define PKEY_CHECK_INVALID -1
+int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth,
+		      u8 sc5, int8_t s_pkey_index);

 #define PACKET_EGRESS_TIMEOUT 350
 static inline void pause_for_credit_return(struct hfi1_devdata *dd)
@@ -1776,6 +1779,7 @@ extern struct mutex hfi1_mutex;

 #define HFI1_PKT_USER_SC_INTEGRITY \
 	(SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK \
+	| SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK \
 	| SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK \
 	| SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK)

diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/staging/rdma/hfi1/user_sdma.c
index 635ddf8b406d..0014c9c0e967 100644
--- a/drivers/staging/rdma/hfi1/user_sdma.c
+++ b/drivers/staging/rdma/hfi1/user_sdma.c
@@ -600,6 +600,13 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
 		goto free_req;
 	}

+	/* Checking P_KEY for requests from user-space */
+	if (egress_pkey_check(dd->pport, req->hdr.lrh, req->hdr.bth, sc,
+			      PKEY_CHECK_INVALID)) {
+		ret = -EINVAL;
+		goto free_req;
+	}
+
 	/*
 	 * Also should check the BTH.lnh. If it says the next header is GRH then
 	 * the RXE parsing will be off and will land in the middle of the KDETH
diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/staging/rdma/hfi1/verbs.c
index 89f2aad45c1b..c56c0cb0de80 100644
--- a/drivers/staging/rdma/hfi1/verbs.c
+++ b/drivers/staging/rdma/hfi1/verbs.c
@@ -1089,16 +1089,16 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,

 /*
  * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent
- * being an entry from the ingress partition key table), return 0
+ * being an entry from the partition key table), return 0
  * otherwise. Use the matching criteria for egress partition keys
  * specified in the OPAv1 spec., section 9.1l.7.
*/ static inline int egress_pkey_matches_entry(u16 pkey, u16 ent) { u16 mkey = pkey & PKEY_LOW_15_MASK; - u16 ment = ent & PKEY_LOW_15_MASK; + u16 mentry = ent & PKEY_LOW_15_MASK; - if (mkey == ment) { + if (mkey == mentry) { /* * If pkey[15] is set (full partition member), * is bit 15 in the corresponding table element @@ -1111,32 +1111,32 @@ static inline int egress_pkey_matches_entry(u16 pkey, u16 ent) return 0; } -/* - * egress_pkey_check - return 0 if hdr's pkey matches according to the - * criteria in the OPAv1 spec., section 9.11.7. +/** + * egress_pkey_check - check P_KEY of a packet + * @ppd: Physical IB port data + * @lrh: Local route header + * @bth: Base transport header + * @sc5: SC for packet + * @s_pkey_index: It will be used for look up optimization for kernel contexts + * only. If it is negative value, then it means user contexts is calling this + * function. + * + * It checks if hdr's pkey is valid. + * + * Return: 0 on success, otherwise, 1 */ -static inline int egress_pkey_check(struct hfi1_pportdata *ppd, - struct hfi1_ib_header *hdr, - struct rvt_qp *qp) +int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth, + u8 sc5, int8_t s_pkey_index) { - struct hfi1_qp_priv *priv = qp->priv; - struct hfi1_other_headers *ohdr; struct hfi1_devdata *dd; - int i = 0; + int i; u16 pkey; - u8 lnh, sc5 = priv->s_sc; + int is_user_ctxt_mechanism = (s_pkey_index < 0); if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT)) return 0; - /* locate the pkey within the headers */ - lnh = be16_to_cpu(hdr->lrh[0]) & 3; - if (lnh == HFI1_LRH_GRH) - ohdr = &hdr->u.l.oth; - else - ohdr = &hdr->u.oth; - - pkey = (u16)be32_to_cpu(ohdr->bth[0]); + pkey = (u16)be32_to_cpu(bth[0]); /* If SC15, pkey[0:14] must be 0x7fff */ if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK)) @@ -1146,28 +1146,37 @@ static inline int egress_pkey_check(struct hfi1_pportdata *ppd, if ((pkey & PKEY_LOW_15_MASK) == 0) goto bad; - /* The most likely matching pkey has index qp->s_pkey_index */ - if (unlikely(!egress_pkey_matches_entry(pkey, - ppd->pkeys - [qp->s_pkey_index]))) { - /* no match - try the entire table */ - for (; i < MAX_PKEY_VALUES; i++) { - if (egress_pkey_matches_entry(pkey, ppd->pkeys[i])) - break; - } + /* + * For the kernel contexts only, if a qp is passed into the function, + * the most likely matching pkey has index qp->s_pkey_index + */ + if (!is_user_ctxt_mechanism && + egress_pkey_matches_entry(pkey, ppd->pkeys[s_pkey_index])) { + return 0; } - if (i < MAX_PKEY_VALUES) - return 0; + for (i = 0; i < MAX_PKEY_VALUES; i++) { + if (egress_pkey_matches_entry(pkey, ppd->pkeys[i])) + return 0; + } bad: - incr_cntr64(&ppd->port_xmit_constraint_errors); - dd = ppd->dd; - if (!(dd->err_info_xmit_constraint.status & OPA_EI_STATUS_SMASK)) { - u16 slid = be16_to_cpu(hdr->lrh[3]); + /* + * For the user-context mechanism, the P_KEY check would only happen + * once per SDMA request, not once per packet. Therefore, there's no + * need to increment the counter for the user-context mechanism. 
+ */ + if (!is_user_ctxt_mechanism) { + incr_cntr64(&ppd->port_xmit_constraint_errors); + dd = ppd->dd; + if (!(dd->err_info_xmit_constraint.status & + OPA_EI_STATUS_SMASK)) { + u16 slid = be16_to_cpu(lrh[3]); - dd->err_info_xmit_constraint.status |= OPA_EI_STATUS_SMASK; - dd->err_info_xmit_constraint.slid = slid; - dd->err_info_xmit_constraint.pkey = pkey; + dd->err_info_xmit_constraint.status |= + OPA_EI_STATUS_SMASK; + dd->err_info_xmit_constraint.slid = slid; + dd->err_info_xmit_constraint.pkey = pkey; + } } return 1; } @@ -1227,11 +1236,26 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps) { struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_other_headers *ohdr; + struct hfi1_ib_header *hdr; send_routine sr; int ret; + u8 lnh; + + hdr = &ps->s_txreq->phdr.hdr; + /* locate the pkey within the headers */ + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + if (lnh == HFI1_LRH_GRH) + ohdr = &hdr->u.l.oth; + else + ohdr = &hdr->u.oth; sr = get_send_routine(qp, ps->s_txreq); - ret = egress_pkey_check(dd->pport, &ps->s_txreq->phdr.hdr, qp); + ret = egress_pkey_check(dd->pport, + hdr->lrh, + ohdr->bth, + priv->s_sc, + qp->s_pkey_index); if (unlikely(ret)) { /* * The value we are returning here does not get propagated to From 145dd2b3995830ab4bf14f1dc7d44dc5d2b7cb10 Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Tue, 12 Apr 2016 11:25:31 -0700 Subject: [PATCH 19/46] IB/hfi1: Always turn on CDRs for low power QSFP modules Clock and data recovery mechanisms (CDRs) in active QSFP modules can be turned on or off to improve the bit error rate observed on the channel. Signal integrity and bit error rate requirements require us to always turn on any CDRs present in low power cables (power dissipation 2.5W or lower). However, we adhere to the platform designer's settings (provided in the platform configuration) for higher power cables (dissipation 3.5W or higher) if the platform designer has determined that the platform requires the CDRs to be turned on (or off) and is capable of supplying and cooling the higher power modules. This patch also introduces the get_qsfp_power_class function to centralize the bit twiddling required to determine the QSFP power class across the code. Reusing this function improves the readability of code that depends on knowing the power class of the cable, such as the active and optical channel tuning algorithm. Reviewed-by: Dean Luick Reviewed-by: Dennis Dalessandro Signed-off-by: Easwar Hariharan Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/chip.c | 49 +++----------------------- drivers/staging/rdma/hfi1/chip.h | 1 - drivers/staging/rdma/hfi1/hfi.h | 1 - drivers/staging/rdma/hfi1/init.c | 1 - drivers/staging/rdma/hfi1/platform.c | 51 +++++++++++++++------------- drivers/staging/rdma/hfi1/qsfp.c | 38 +++++++++++++++++++-- drivers/staging/rdma/hfi1/qsfp.h | 15 ++++---- 7 files changed, 74 insertions(+), 82 deletions(-) diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c index b0a0a0d2cbad..962cca0fa858 100644 --- a/drivers/staging/rdma/hfi1/chip.c +++ b/drivers/staging/rdma/hfi1/chip.c @@ -6199,18 +6199,13 @@ static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data) /* * Handle host requests from the 8051. - * - * This is a work-queue function outside of the interrupt. 
*/ -void handle_8051_request(struct work_struct *work) +static void handle_8051_request(struct hfi1_pportdata *ppd) { - struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, - dc_host_req_work); struct hfi1_devdata *dd = ppd->dd; u64 reg; u16 data = 0; - u8 type, i, lanes, *cache = ppd->qsfp_info.cache; - u8 cdr_ctrl_byte = cache[QSFP_CDR_CTRL_BYTE_OFFS]; + u8 type; reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1); if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0) @@ -6231,46 +6226,11 @@ void handle_8051_request(struct work_struct *work) case HREQ_READ_CONFIG: case HREQ_SET_TX_EQ_ABS: case HREQ_SET_TX_EQ_REL: + case HREQ_ENABLE: dd_dev_info(dd, "8051 request: request 0x%x not supported\n", type); hreq_response(dd, HREQ_NOT_SUPPORTED, 0); break; - - case HREQ_ENABLE: - lanes = data & 0xF; - for (i = 0; lanes; lanes >>= 1, i++) { - if (!(lanes & 1)) - continue; - if (data & 0x200) { - /* enable TX CDR */ - if (cache[QSFP_MOD_PWR_OFFS] & 0x8 && - cache[QSFP_CDR_INFO_OFFS] & 0x80) - cdr_ctrl_byte |= (1 << (i + 4)); - } else { - /* disable TX CDR */ - if (cache[QSFP_MOD_PWR_OFFS] & 0x8 && - cache[QSFP_CDR_INFO_OFFS] & 0x80) - cdr_ctrl_byte &= ~(1 << (i + 4)); - } - - if (data & 0x800) { - /* enable RX CDR */ - if (cache[QSFP_MOD_PWR_OFFS] & 0x4 && - cache[QSFP_CDR_INFO_OFFS] & 0x40) - cdr_ctrl_byte |= (1 << i); - } else { - /* disable RX CDR */ - if (cache[QSFP_MOD_PWR_OFFS] & 0x4 && - cache[QSFP_CDR_INFO_OFFS] & 0x40) - cdr_ctrl_byte &= ~(1 << i); - } - } - one_qsfp_write(ppd, dd->hfi1_id, QSFP_CDR_CTRL_BYTE_OFFS, - &cdr_ctrl_byte, 1); - hreq_response(dd, HREQ_SUCCESS, data); - refresh_qsfp_cache(ppd, &ppd->qsfp_info); - break; - case HREQ_CONFIG_DONE: hreq_response(dd, HREQ_SUCCESS, 0); break; @@ -6278,7 +6238,6 @@ void handle_8051_request(struct work_struct *work) case HREQ_INTERFACE_TEST: hreq_response(dd, HREQ_SUCCESS, data); break; - default: dd_dev_err(dd, "8051 request: unknown request 0x%x\n", type); hreq_response(dd, HREQ_NOT_SUPPORTED, 0); @@ -7534,7 +7493,7 @@ static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg) host_msg &= ~(u64)LINKUP_ACHIEVED; } if (host_msg & EXT_DEVICE_CFG_REQ) { - queue_work(ppd->hfi1_wq, &ppd->dc_host_req_work); + handle_8051_request(ppd); host_msg &= ~(u64)EXT_DEVICE_CFG_REQ; } if (host_msg & VERIFY_CAP_FRAME) { diff --git a/drivers/staging/rdma/hfi1/chip.h b/drivers/staging/rdma/hfi1/chip.h index 4f3b878e43eb..e02e00619adc 100644 --- a/drivers/staging/rdma/hfi1/chip.h +++ b/drivers/staging/rdma/hfi1/chip.h @@ -691,7 +691,6 @@ void handle_verify_cap(struct work_struct *work); void handle_freeze(struct work_struct *work); void handle_link_up(struct work_struct *work); void handle_link_down(struct work_struct *work); -void handle_8051_request(struct work_struct *work); void handle_link_downgrade(struct work_struct *work); void handle_link_bounce(struct work_struct *work); void handle_sma_message(struct work_struct *work); diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/staging/rdma/hfi1/hfi.h index b1d4f605c14d..7b78d56de7f5 100644 --- a/drivers/staging/rdma/hfi1/hfi.h +++ b/drivers/staging/rdma/hfi1/hfi.h @@ -606,7 +606,6 @@ struct hfi1_pportdata { struct work_struct link_vc_work; struct work_struct link_up_work; struct work_struct link_down_work; - struct work_struct dc_host_req_work; struct work_struct sma_message_work; struct work_struct freeze_work; struct work_struct link_downgrade_work; diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/staging/rdma/hfi1/init.c index 00edd500a69a..b1582b5a169e 
100644 --- a/drivers/staging/rdma/hfi1/init.c +++ b/drivers/staging/rdma/hfi1/init.c @@ -496,7 +496,6 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, INIT_WORK(&ppd->link_vc_work, handle_verify_cap); INIT_WORK(&ppd->link_up_work, handle_link_up); INIT_WORK(&ppd->link_down_work, handle_link_down); - INIT_WORK(&ppd->dc_host_req_work, handle_8051_request); INIT_WORK(&ppd->freeze_work, handle_freeze); INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade); INIT_WORK(&ppd->sma_message_work, handle_sma_message); diff --git a/drivers/staging/rdma/hfi1/platform.c b/drivers/staging/rdma/hfi1/platform.c index 0a1d074583e4..19d0223eb0cf 100644 --- a/drivers/staging/rdma/hfi1/platform.c +++ b/drivers/staging/rdma/hfi1/platform.c @@ -114,21 +114,11 @@ static int qual_power(struct hfi1_pportdata *ppd) if (ret) return ret; - if (QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]) != 4) - cable_power_class = QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]); - else - cable_power_class = QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]); + cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); - if (cable_power_class <= 3 && cable_power_class > (power_class_max - 1)) + if (cable_power_class > power_class_max) ppd->offline_disabled_reason = HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY); - else if (cable_power_class > 4 && cable_power_class > (power_class_max)) - ppd->offline_disabled_reason = - HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY); - /* - * cable_power_class will never have value 4 as this simply - * means the high power settings are unused - */ if (ppd->offline_disabled_reason == HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY)) { @@ -173,12 +163,9 @@ static int set_qsfp_high_power(struct hfi1_pportdata *ppd) u8 *cache = ppd->qsfp_info.cache; int ret; - if (QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]) != 4) - cable_power_class = QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]); - else - cable_power_class = QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]); + cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); - if (cable_power_class) { + if (cable_power_class > QSFP_POWER_CLASS_1) { power_ctrl_byte = cache[QSFP_PWR_CTRL_BYTE_OFFS]; power_ctrl_byte |= 1; @@ -190,8 +177,7 @@ static int set_qsfp_high_power(struct hfi1_pportdata *ppd) if (ret != 1) return -EIO; - if (cable_power_class > 3) { - /* > power class 4*/ + if (cable_power_class > QSFP_POWER_CLASS_4) { power_ctrl_byte |= (1 << 2); ret = qsfp_write(ppd, ppd->dd->hfi1_id, QSFP_PWR_CTRL_BYTE_OFFS, @@ -212,12 +198,21 @@ static void apply_rx_cdr(struct hfi1_pportdata *ppd, { u32 rx_preset; u8 *cache = ppd->qsfp_info.cache; + int cable_power_class; if (!((cache[QSFP_MOD_PWR_OFFS] & 0x4) && (cache[QSFP_CDR_INFO_OFFS] & 0x40))) return; - /* rx_preset preset to zero to catch error */ + /* RX CDR present, bypass supported */ + cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); + + if (cable_power_class <= QSFP_POWER_CLASS_3) { + /* Power class <= 3, ignore config & turn RX CDR on */ + *cdr_ctrl_byte |= 0xF; + return; + } + get_platform_config_field( ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE, rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR_APPLY, @@ -250,15 +245,25 @@ static void apply_rx_cdr(struct hfi1_pportdata *ppd, static void apply_tx_cdr(struct hfi1_pportdata *ppd, u32 tx_preset_index, - u8 *ctr_ctrl_byte) + u8 *cdr_ctrl_byte) { u32 tx_preset; u8 *cache = ppd->qsfp_info.cache; + int cable_power_class; if (!((cache[QSFP_MOD_PWR_OFFS] & 0x8) && (cache[QSFP_CDR_INFO_OFFS] & 0x80))) return; + /* TX CDR present, bypass supported */ + 
cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); + + if (cable_power_class <= QSFP_POWER_CLASS_3) { + /* Power class <= 3, ignore config & turn TX CDR on */ + *cdr_ctrl_byte |= 0xF0; + return; + } + get_platform_config_field( ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index, @@ -282,10 +287,10 @@ static void apply_tx_cdr(struct hfi1_pportdata *ppd, (tx_preset << 2) | (tx_preset << 3)); if (tx_preset) - *ctr_ctrl_byte |= (tx_preset << 4); + *cdr_ctrl_byte |= (tx_preset << 4); else /* Preserve current/determined RX CDR status */ - *ctr_ctrl_byte &= ((tx_preset << 4) | 0xF); + *cdr_ctrl_byte &= ((tx_preset << 4) | 0xF); } static void apply_cdr_settings( diff --git a/drivers/staging/rdma/hfi1/qsfp.c b/drivers/staging/rdma/hfi1/qsfp.c index ac03d80d27cc..dc5d1864dcd4 100644 --- a/drivers/staging/rdma/hfi1/qsfp.c +++ b/drivers/staging/rdma/hfi1/qsfp.c @@ -466,7 +466,28 @@ const char * const hfi1_qsfp_devtech[16] = { #define QSFP_DUMP_CHUNK 16 /* Holds longest string */ #define QSFP_DEFAULT_HDR_CNT 224 -static const char *pwr_codes = "1.5W2.0W2.5W3.5W"; +#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3) +#define QSFP_HIGH_PWR(pbyte) ((pbyte) & 3) +/* For use with QSFP_HIGH_PWR macro */ +#define QSFP_HIGH_PWR_UNUSED 0 /* Bits [1:0] = 00 implies low power module */ + +/* + * Takes power class byte [Page 00 Byte 129] in SFF 8636 + * Returns power class as integer (1 through 7, per SFF 8636 rev 2.4) + */ +int get_qsfp_power_class(u8 power_byte) +{ + if (QSFP_HIGH_PWR(power_byte) == QSFP_HIGH_PWR_UNUSED) + /* power classes count from 1, their bit encodings from 0 */ + return (QSFP_PWR(power_byte) + 1); + /* + * 00 in the high power classes stands for unused, bringing + * balance to the off-by-1 offset above, we add 4 here to + * account for the difference between the low and high power + * groups + */ + return (QSFP_HIGH_PWR(power_byte) + 4); +} int qsfp_mod_present(struct hfi1_pportdata *ppd) { @@ -537,6 +558,16 @@ int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr, u32 len, return ret; } +static const char *pwr_codes[8] = {"N/AW", + "1.5W", + "2.0W", + "2.5W", + "3.5W", + "4.0W", + "4.5W", + "5.0W" + }; + int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len) { u8 *cache = &ppd->qsfp_info.cache[0]; @@ -546,6 +577,7 @@ int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len) int bidx = 0; u8 *atten = &cache[QSFP_ATTEN_OFFS]; u8 *vendor_oui = &cache[QSFP_VOUI_OFFS]; + u8 power_byte = 0; sofar = 0; lenstr[0] = ' '; @@ -555,9 +587,9 @@ int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len) if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS])) sprintf(lenstr, "%dM ", cache[QSFP_MOD_LEN_OFFS]); + power_byte = cache[QSFP_MOD_PWR_OFFS]; sofar += scnprintf(buf + sofar, len - sofar, "PWR:%.3sW\n", - pwr_codes + - (QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]) * 4)); + pwr_codes[get_qsfp_power_class(power_byte)]); sofar += scnprintf(buf + sofar, len - sofar, "TECH:%s%s\n", lenstr, diff --git a/drivers/staging/rdma/hfi1/qsfp.h b/drivers/staging/rdma/hfi1/qsfp.h index 831fe4cf1345..dadc66c442b9 100644 --- a/drivers/staging/rdma/hfi1/qsfp.h +++ b/drivers/staging/rdma/hfi1/qsfp.h @@ -82,8 +82,9 @@ /* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */ #define QSFP_MOD_ID_OFFS 128 /* - * Byte 129 is "Extended Identifier". We only care about D7,D6: Power class - * 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W + * Byte 129 is "Extended Identifier". 
+ * For bits [7:6]: 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W + * For bits [1:0]: 0:Unused, 1:4W, 2:4.5W, 3:5W */ #define QSFP_MOD_PWR_OFFS 129 /* Byte 130 is Connector type. Not Intel req'd */ @@ -190,6 +191,9 @@ extern const char *const hfi1_qsfp_devtech[16]; #define QSFP_HIGH_BIAS_WARNING 0x22 #define QSFP_LOW_BIAS_WARNING 0x11 +#define QSFP_ATTEN_SDR(attenarray) (attenarray[0]) +#define QSFP_ATTEN_DDR(attenarray) (attenarray[1]) + /* * struct qsfp_data encapsulates state of QSFP device for one port. * it will be part of port-specific data if a board supports QSFP. @@ -201,12 +205,6 @@ extern const char *const hfi1_qsfp_devtech[16]; * and let the qsfp_lock arbitrate access to common resources. * */ - -#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3) -#define QSFP_HIGH_PWR(pbyte) (((pbyte) & 3) | 4) -#define QSFP_ATTEN_SDR(attenarray) (attenarray[0]) -#define QSFP_ATTEN_DDR(attenarray) (attenarray[1]) - struct qsfp_data { /* Helps to find our way */ struct hfi1_pportdata *ppd; @@ -223,6 +221,7 @@ struct qsfp_data { int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp); +int get_qsfp_power_class(u8 power_byte); int qsfp_mod_present(struct hfi1_pportdata *ppd); int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr, u32 len, u8 *data); From 623bba2d92a32572e21758a17718eeec65594f7d Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Tue, 12 Apr 2016 11:25:57 -0700 Subject: [PATCH 20/46] IB/hfi1: Remove module presence check outside pre-LNI checks The pre-LNI SerDes and channel tuning algorithm already checks for module presence assertion for the relevant port types. The extraneous check removed in this patch blocks link up for port types for which the module presence assertion is not relevant. Reviewed-by: Dean Luick Signed-off-by: Easwar Hariharan Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/chip.c | 20 ++++++-------------- drivers/staging/rdma/hfi1/platform.c | 8 ++++++-- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c index 962cca0fa858..38c0e9726cee 100644 --- a/drivers/staging/rdma/hfi1/chip.c +++ b/drivers/staging/rdma/hfi1/chip.c @@ -6849,7 +6849,7 @@ void handle_link_down(struct work_struct *work) * If there is no cable attached, turn the DC off. Otherwise, * start the link bring up. */ - if (!qsfp_mod_present(ppd)) { + if (ppd->port_type == PORT_TYPE_QSFP && !qsfp_mod_present(ppd)) { dc_shutdown(ppd->dd); } else { tune_serdes(ppd); @@ -9008,9 +9008,9 @@ static int set_local_link_attributes(struct hfi1_pportdata *ppd) } /* - * Call this to start the link. Schedule a retry if the cable is not - * present or if unable to start polling. Do not do anything if the - * link is disabled. Returns 0 if link is disabled or moved to polling + * Call this to start the link. + * Do not do anything if the link is disabled. + * Returns 0 if link is disabled, moved to polling, or the driver is not ready. 
*/ int start_link(struct hfi1_pportdata *ppd) { @@ -9027,15 +9027,7 @@ int start_link(struct hfi1_pportdata *ppd) return 0; } - if (qsfp_mod_present(ppd) || loopback == LOOPBACK_SERDES || - loopback == LOOPBACK_LCB || - ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR) - return set_link_state(ppd, HLS_DN_POLL); - - dd_dev_info(ppd->dd, - "%s: stopping link start because no cable is present\n", - __func__); - return -EAGAIN; + return set_link_state(ppd, HLS_DN_POLL); } static void wait_for_qsfp_init(struct hfi1_pportdata *ppd) @@ -9206,7 +9198,7 @@ static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd, return 0; } -/* This routine will only be scheduled if the QSFP module is present */ +/* This routine will only be scheduled if the QSFP module present is asserted */ void qsfp_event(struct work_struct *work) { struct qsfp_data *qd; diff --git a/drivers/staging/rdma/hfi1/platform.c b/drivers/staging/rdma/hfi1/platform.c index 19d0223eb0cf..b97027d4ddc6 100644 --- a/drivers/staging/rdma/hfi1/platform.c +++ b/drivers/staging/rdma/hfi1/platform.c @@ -838,9 +838,11 @@ void tune_serdes(struct hfi1_pportdata *ppd) total_atten = platform_atten + remote_atten; tuning_method = OPA_PASSIVE_TUNING; - } else + } else { ppd->offline_disabled_reason = HFI1_ODR_MASK(OPA_LINKDOWN_REASON_CHASSIS_CONFIG); + goto bail; + } break; case PORT_TYPE_QSFP: if (qsfp_mod_present(ppd)) { @@ -869,10 +871,12 @@ void tune_serdes(struct hfi1_pportdata *ppd) __func__); goto bail; } - } else + } else { ppd->offline_disabled_reason = HFI1_ODR_MASK( OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED); + goto bail; + } break; default: dd_dev_info(ppd->dd, "%s: Unknown port type\n", __func__); From 90315ad86abfe2fe43a564d705a342d823c1589c Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Tue, 12 Apr 2016 11:26:21 -0700 Subject: [PATCH 21/46] IB/hfi1: Guard against concurrent I2C access across all chains The discrete ASIC board design makes the two I2C chains not independent of each other. That is, only one chain can safely be accessed at a time. For discrete ASIC devices, adjust the resource locking so that access to one I2C chain will lock both of the chains. Reviewed-by: Easwar Hariharan Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/firmware.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/staging/rdma/hfi1/firmware.c b/drivers/staging/rdma/hfi1/firmware.c index 3040162cb326..ed680fda611d 100644 --- a/drivers/staging/rdma/hfi1/firmware.c +++ b/drivers/staging/rdma/hfi1/firmware.c @@ -1413,8 +1413,15 @@ static int __acquire_chip_resource(struct hfi1_devdata *dd, u32 resource) if (resource & CR_DYN_MASK) { /* a dynamic resource is in use if either HFI has set the bit */ - all_bits = resource_mask(0, resource) | + if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0 && + (resource & (CR_I2C1 | CR_I2C2))) { + /* discrete devices must serialize across both chains */ + all_bits = resource_mask(0, CR_I2C1 | CR_I2C2) | + resource_mask(1, CR_I2C1 | CR_I2C2); + } else { + all_bits = resource_mask(0, resource) | resource_mask(1, resource); + } my_bit = resource_mask(dd->hfi1_id, resource); } else { /* non-dynamic resources are not split between HFIs */ From e4e0e39c8d41d5f6cb664a34ac7b2c6388b1b523 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Tue, 12 Apr 2016 11:28:36 -0700 Subject: [PATCH 22/46] IB/hfi1: Fix double QSFP resource acquire on cache refresh The function refresh_qsfp_cache() acquires the i2c chain resource, but one caller already holds the resource. 
Change the acquire so all calls to refresh_qsfp_cache() are covered by the acquire and remove the acquire within refresh_qsfp_cache(). Reviewed-by: Easwar Hariharan Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/platform.c | 40 ++++++++++++++-------------- drivers/staging/rdma/hfi1/qsfp.c | 12 +++------ 2 files changed, 23 insertions(+), 29 deletions(-) diff --git a/drivers/staging/rdma/hfi1/platform.c b/drivers/staging/rdma/hfi1/platform.c index b97027d4ddc6..8fe8a205b5bb 100644 --- a/drivers/staging/rdma/hfi1/platform.c +++ b/drivers/staging/rdma/hfi1/platform.c @@ -603,6 +603,7 @@ static void apply_tunings( "Applying TX settings"); } +/* Must be holding the QSFP i2c resource */ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, u32 *ptr_rx_preset, u32 *ptr_total_atten) { @@ -610,26 +611,19 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled; u8 *cache = ppd->qsfp_info.cache; - ret = acquire_chip_resource(ppd->dd, qsfp_resource(ppd->dd), QSFP_WAIT); - if (ret) { - dd_dev_err(ppd->dd, "%s: hfi%d: cannot lock i2c chain\n", - __func__, (int)ppd->dd->hfi1_id); - return ret; - } - ppd->qsfp_info.limiting_active = 1; ret = set_qsfp_tx(ppd, 0); if (ret) - goto bail_unlock; + return ret; ret = qual_power(ppd); if (ret) - goto bail_unlock; + return ret; ret = qual_bitrate(ppd); if (ret) - goto bail_unlock; + return ret; if (ppd->qsfp_info.reset_needed) { reset_qsfp(ppd); @@ -641,7 +635,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, ret = set_qsfp_high_power(ppd); if (ret) - goto bail_unlock; + return ret; if (cache[QSFP_EQ_INFO_OFFS] & 0x4) { ret = get_platform_config_field( @@ -651,7 +645,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, ptr_tx_preset, 4); if (ret) { *ptr_tx_preset = OPA_INVALID_INDEX; - goto bail_unlock; + return ret; } } else { ret = get_platform_config_field( @@ -661,7 +655,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, ptr_tx_preset, 4); if (ret) { *ptr_tx_preset = OPA_INVALID_INDEX; - goto bail_unlock; + return ret; } } @@ -670,7 +664,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, PORT_TABLE_RX_PRESET_IDX, ptr_rx_preset, 4); if (ret) { *ptr_rx_preset = OPA_INVALID_INDEX; - goto bail_unlock; + return ret; } if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G)) @@ -690,8 +684,6 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, ret = set_qsfp_tx(ppd, 1); -bail_unlock: - release_chip_resource(ppd->dd, qsfp_resource(ppd->dd)); return ret; } @@ -846,6 +838,14 @@ void tune_serdes(struct hfi1_pportdata *ppd) break; case PORT_TYPE_QSFP: if (qsfp_mod_present(ppd)) { + ret = acquire_chip_resource(ppd->dd, + qsfp_resource(ppd->dd), + QSFP_WAIT); + if (ret) { + dd_dev_err(ppd->dd, "%s: hfi%d: cannot lock i2c chain\n", + __func__, (int)ppd->dd->hfi1_id); + goto bail; + } refresh_qsfp_cache(ppd, &ppd->qsfp_info); if (ppd->qsfp_info.cache_valid) { @@ -860,17 +860,17 @@ void tune_serdes(struct hfi1_pportdata *ppd) * update the cache to reflect the changes */ refresh_qsfp_cache(ppd, &ppd->qsfp_info); - if (ret) - goto bail; - limiting_active = ppd->qsfp_info.limiting_active; } else { dd_dev_err(dd, "%s: Reading QSFP memory failed\n", __func__); - goto bail; + ret = -EINVAL; /* a fail indication */ } + release_chip_resource(ppd->dd, qsfp_resource(ppd->dd)); + if (ret) + 
goto bail; } else { ppd->offline_disabled_reason = HFI1_ODR_MASK( diff --git a/drivers/staging/rdma/hfi1/qsfp.c b/drivers/staging/rdma/hfi1/qsfp.c index dc5d1864dcd4..2441669f0817 100644 --- a/drivers/staging/rdma/hfi1/qsfp.c +++ b/drivers/staging/rdma/hfi1/qsfp.c @@ -355,6 +355,8 @@ int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, * The calls to qsfp_{read,write} in this function correctly handle the * address map difference between this mapping and the mapping implemented * by those functions + * + * The caller must be holding the QSFP i2c chain resource. */ int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp) { @@ -371,13 +373,9 @@ int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp) if (!qsfp_mod_present(ppd)) { ret = -ENODEV; - goto bail_no_release; + goto bail; } - ret = acquire_chip_resource(ppd->dd, qsfp_resource(ppd->dd), QSFP_WAIT); - if (ret) - goto bail_no_release; - ret = qsfp_read(ppd, target, 0, cache, QSFP_PAGESIZE); if (ret != QSFP_PAGESIZE) { dd_dev_info(ppd->dd, @@ -440,8 +438,6 @@ int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp) } } - release_chip_resource(ppd->dd, qsfp_resource(ppd->dd)); - spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); ppd->qsfp_info.cache_valid = 1; ppd->qsfp_info.cache_refresh_required = 0; @@ -450,8 +446,6 @@ int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp) return 0; bail: - release_chip_resource(ppd->dd, qsfp_resource(ppd->dd)); -bail_no_release: memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128)); return ret; } From 87717f0a75432e078bb894ba230dd236be7a31a7 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Tue, 12 Apr 2016 11:28:56 -0700 Subject: [PATCH 23/46] IB/hfi1: Remove unreachable code Remove unreachable code from RC ack handling to fix an smatch error. Fixes: 633d27399514 ("staging/rdma/hfi1: use mod_timer when appropriate") Reported-by: Dan Carpenter Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/rc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/rdma/hfi1/rc.c b/drivers/staging/rdma/hfi1/rc.c index 0d7e1017f3cb..c6147793bf4b 100644 --- a/drivers/staging/rdma/hfi1/rc.c +++ b/drivers/staging/rdma/hfi1/rc.c @@ -1497,7 +1497,7 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, /* Ignore reserved NAK codes. */ goto bail_stop; } - return ret; + /* cannot be reached */ bail_stop: hfi1_stop_rc_timers(qp); return ret; From b218f786adc215509e806fe4eb98725e33e8d784 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Tue, 12 Apr 2016 11:29:20 -0700 Subject: [PATCH 24/46] IB/hfi1: Use global defines for upper bits in opcode The awkward coding for setting the allowed_ops field was tripping an smatch warning. This patch uses the more appropriate defines from include/rdma to avoid the issue. As part of the patch remove a mask that was duplicated in rdmavt include files and use that mask as appropriate. 
Fixes: 8bea6b1cfe6f ("IB/rdmavt: Add create queue pair functionality") Reported-by: Dan Carpenter Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/qp.c | 6 +++--- drivers/staging/rdma/hfi1/verbs.c | 2 +- drivers/staging/rdma/hfi1/verbs.h | 3 --- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index a9e3bcc522c4..0f12c211c385 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -829,13 +829,13 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, case IB_QPT_SMI: case IB_QPT_GSI: case IB_QPT_UD: - qp->allowed_ops = IB_OPCODE_UD_SEND_ONLY & RVT_OPCODE_QP_MASK; + qp->allowed_ops = IB_OPCODE_UD; break; case IB_QPT_RC: - qp->allowed_ops = IB_OPCODE_RC_SEND_ONLY & RVT_OPCODE_QP_MASK; + qp->allowed_ops = IB_OPCODE_RC; break; case IB_QPT_UC: - qp->allowed_ops = IB_OPCODE_UC_SEND_ONLY & RVT_OPCODE_QP_MASK; + qp->allowed_ops = IB_OPCODE_UC; break; default: ret = ERR_PTR(-EINVAL); diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/staging/rdma/hfi1/verbs.c index c56c0cb0de80..9cdc85fa366f 100644 --- a/drivers/staging/rdma/hfi1/verbs.c +++ b/drivers/staging/rdma/hfi1/verbs.c @@ -545,7 +545,7 @@ static inline int qp_ok(int opcode, struct hfi1_packet *packet) if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK)) goto dropit; - if (((opcode & OPCODE_QP_MASK) == packet->qp->allowed_ops) || + if (((opcode & RVT_OPCODE_QP_MASK) == packet->qp->allowed_ops) || (opcode == IB_OPCODE_CNP)) return 1; dropit: diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/staging/rdma/hfi1/verbs.h index 2ba1373f4fb4..3ee223983b20 100644 --- a/drivers/staging/rdma/hfi1/verbs.h +++ b/drivers/staging/rdma/hfi1/verbs.h @@ -335,9 +335,6 @@ int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port, #endif #define PSN_MODIFY_MASK 0xFFFFFF -/* Number of bits to pay attention to in the opcode for checking qp type */ -#define OPCODE_QP_MASK 0xE0 - /* * Compare the lower 24 bits of the msn values. * Returns an integer <, ==, or > than zero. From 0852d241f48463d80764f841f2e06c0f3c4da923 Mon Sep 17 00:00:00 2001 From: Jubin John Date: Tue, 12 Apr 2016 11:30:08 -0700 Subject: [PATCH 25/46] IB/hfi1: Change default number of user contexts Change the default number of user contexts to the number of real (non-HT) cpu cores in order to reduce the division of hfi1 hardware contexts in the case of high core counts with hyper-threading enabled. 
Reviewed-by: Dean Luick Reviewed-by: Dennis Dalessandro Reviewed-by: Mitko Haralanov Signed-off-by: Jubin John Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/affinity.c | 93 ++++++++++++++-------------- drivers/staging/rdma/hfi1/affinity.h | 19 +++++- drivers/staging/rdma/hfi1/chip.c | 26 +++++--- 3 files changed, 84 insertions(+), 54 deletions(-) diff --git a/drivers/staging/rdma/hfi1/affinity.c b/drivers/staging/rdma/hfi1/affinity.c index 2cb8ca77f876..6e7050ab9e16 100644 --- a/drivers/staging/rdma/hfi1/affinity.c +++ b/drivers/staging/rdma/hfi1/affinity.c @@ -53,20 +53,6 @@ #include "sdma.h" #include "trace.h" -struct cpu_mask_set { - struct cpumask mask; - struct cpumask used; - uint gen; -}; - -struct hfi1_affinity { - struct cpu_mask_set def_intr; - struct cpu_mask_set rcv_intr; - struct cpu_mask_set proc; - /* spin lock to protect affinity struct */ - spinlock_t lock; -}; - /* Name of IRQ types, indexed by enum irq_type */ static const char * const irq_type_names[] = { "SDMA", @@ -82,6 +68,48 @@ static inline void init_cpu_mask_set(struct cpu_mask_set *set) set->gen = 0; } +/* Initialize non-HT cpu cores mask */ +int init_real_cpu_mask(struct hfi1_devdata *dd) +{ + struct hfi1_affinity *info; + int possible, curr_cpu, i, ht; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + cpumask_clear(&info->real_cpu_mask); + + /* Start with cpu online mask as the real cpu mask */ + cpumask_copy(&info->real_cpu_mask, cpu_online_mask); + + /* + * Remove HT cores from the real cpu mask. Do this in two steps below. + */ + possible = cpumask_weight(&info->real_cpu_mask); + ht = cpumask_weight(topology_sibling_cpumask( + cpumask_first(&info->real_cpu_mask))); + /* + * Step 1. Skip over the first N HT siblings and use them as the + * "real" cores. Assumes that HT cores are not enumerated in + * succession (except in the single core case). + */ + curr_cpu = cpumask_first(&info->real_cpu_mask); + for (i = 0; i < possible / ht; i++) + curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask); + /* + * Step 2. Remove the remaining HT siblings. Use cpumask_next() to + * skip any gaps. + */ + for (; i < possible; i++) { + cpumask_clear_cpu(curr_cpu, &info->real_cpu_mask); + curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask); + } + + dd->affinity = info; + return 0; +} + /* * Interrupt affinity. * @@ -93,20 +121,17 @@ static inline void init_cpu_mask_set(struct cpu_mask_set *set) * to the node relative 1 as necessary. * */ -int hfi1_dev_affinity_init(struct hfi1_devdata *dd) +void hfi1_dev_affinity_init(struct hfi1_devdata *dd) { int node = pcibus_to_node(dd->pcidev->bus); - struct hfi1_affinity *info; + struct hfi1_affinity *info = dd->affinity; const struct cpumask *local_mask; - int curr_cpu, possible, i, ht; + int curr_cpu, possible, i; if (node < 0) node = numa_node_id(); dd->node = node; - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) - return -ENOMEM; spin_lock_init(&info->lock); init_cpu_mask_set(&info->def_intr); @@ -116,30 +141,8 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd) local_mask = cpumask_of_node(dd->node); if (cpumask_first(local_mask) >= nr_cpu_ids) local_mask = topology_core_cpumask(0); - /* use local mask as default */ - cpumask_copy(&info->def_intr.mask, local_mask); - /* - * Remove HT cores from the default mask. Do this in two steps below. - */ - possible = cpumask_weight(&info->def_intr.mask); - ht = cpumask_weight(topology_sibling_cpumask( - cpumask_first(&info->def_intr.mask))); - /* - * Step 1. 
Skip over the first N HT siblings and use them as the - * "real" cores. Assumes that HT cores are not enumerated in - * succession (except in the single core case). - */ - curr_cpu = cpumask_first(&info->def_intr.mask); - for (i = 0; i < possible / ht; i++) - curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask); - /* - * Step 2. Remove the remaining HT siblings. Use cpumask_next() to - * skip any gaps. - */ - for (; i < possible; i++) { - cpumask_clear_cpu(curr_cpu, &info->def_intr.mask); - curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask); - } + /* Use the "real" cpu mask of this node as the default */ + cpumask_and(&info->def_intr.mask, &info->real_cpu_mask, local_mask); /* fill in the receive list */ possible = cpumask_weight(&info->def_intr.mask); @@ -167,8 +170,6 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd) } cpumask_copy(&info->proc.mask, cpu_online_mask); - dd->affinity = info; - return 0; } void hfi1_dev_affinity_free(struct hfi1_devdata *dd) diff --git a/drivers/staging/rdma/hfi1/affinity.h b/drivers/staging/rdma/hfi1/affinity.h index b287e4963024..20f52fe74091 100644 --- a/drivers/staging/rdma/hfi1/affinity.h +++ b/drivers/staging/rdma/hfi1/affinity.h @@ -64,10 +64,27 @@ enum affinity_flags { AFF_IRQ_LOCAL }; +struct cpu_mask_set { + struct cpumask mask; + struct cpumask used; + uint gen; +}; + +struct hfi1_affinity { + struct cpu_mask_set def_intr; + struct cpu_mask_set rcv_intr; + struct cpu_mask_set proc; + struct cpumask real_cpu_mask; + /* spin lock to protect affinity struct */ + spinlock_t lock; +}; + struct hfi1_msix_entry; +/* Initialize non-HT cpu cores mask */ +int init_real_cpu_mask(struct hfi1_devdata *); /* Initialize driver affinity data */ -int hfi1_dev_affinity_init(struct hfi1_devdata *); +void hfi1_dev_affinity_init(struct hfi1_devdata *); /* Free driver affinity data */ void hfi1_dev_affinity_free(struct hfi1_devdata *); /* diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c index 38c0e9726cee..1984454ff116 100644 --- a/drivers/staging/rdma/hfi1/chip.c +++ b/drivers/staging/rdma/hfi1/chip.c @@ -12656,12 +12656,13 @@ static int set_up_context_variables(struct hfi1_devdata *dd) num_kernel_contexts = dd->chip_send_contexts - num_vls - 1; } /* - * User contexts: (to be fixed later) - * - default to 1 user context per CPU if num_user_contexts is - * negative + * User contexts: + * - default to 1 user context per real (non-HT) CPU core if + * num_user_contexts is negative */ if (num_user_contexts < 0) - num_user_contexts = num_online_cpus(); + num_user_contexts = + cpumask_weight(&dd->affinity->real_cpu_mask); total_contexts = num_kernel_contexts + num_user_contexts; @@ -14089,6 +14090,19 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, (dd->revision >> CCE_REVISION_SW_SHIFT) & CCE_REVISION_SW_MASK); + /* + * The real cpu mask is part of the affinity struct but has to be + * initialized earlier than the rest of the affinity struct because it + * is needed to calculate the number of user contexts in + * set_up_context_variables(). However, hfi1_dev_affinity_init(), + * which initializes the rest of the affinity struct members, + * depends on set_up_context_variables() for the number of kernel + * contexts, so it cannot be called before set_up_context_variables(). 
+ */ + ret = init_real_cpu_mask(dd); + if (ret) + goto bail_cleanup; + ret = set_up_context_variables(dd); if (ret) goto bail_cleanup; @@ -14102,9 +14116,7 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, /* set up KDETH QP prefix in both RX and TX CSRs */ init_kdeth_qp(dd); - ret = hfi1_dev_affinity_init(dd); - if (ret) - goto bail_cleanup; + hfi1_dev_affinity_init(dd); /* send contexts must be set up before receive contexts */ ret = init_send_contexts(dd); From 44306f15f0575bff67a923c28aff6e7b2d33021f Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Tue, 12 Apr 2016 11:30:28 -0700 Subject: [PATCH 26/46] IB/hfi1: Reduce kernel context pio buffer allocation The pio buffers were pooled evenly among all kernel contexts and user contexts. However, the demand from kernel contexts is much lower than user contexts. This patch reduces the allocation for kernel contexts and thus makes more credits available for PSM, helping performance. This is especially useful on high core-count systems where large numbers of contexts are used. A new context type SC_VL15 is added to distinguish the context used for VL15 from other kernel contexts. The reason is that VL15 needs to support 2KB sized packet while other kernel contexts need only support packets up to the size determined by "piothreshold", which has a default value of 256. The new allocation method allows triple buffering of largest pio packets configured for these contexts. This is sufficient to maintain verbs performance. The largest pio packet size is 2048B for VL15 and "piothreshold" for other kernel contexts. A cap is applied to "piothreshold" to avoid excessive buffer allocation. The special case that SDMA is disable is handled differently. In that case, the original pooling allocation is used to better support the much higher pio traffic. Notice that if adaptive pio is disabled (piothreshold==0), the pio buffer size doesn't matter for non-VL15 kernel send contexts when SDMA is enabled because pio is not used at all on these contexts and thus the new allocation is still valid. If SDMA is disabled then pooling allocation is used as mentioned in previous paragraph. Adjustment is also made to the calculation of the credit return threshold for the kernel contexts. Instead of purely based on the MTU size, a percentage based threshold is also considered and the smaller one of the two is chosen. This is necessary to ensure that with the reduced buffer allocation credits are returned in time to avoid unnecessary stall in the send path. 
Reviewed-by: Mike Marciniszyn Reviewed-by: Dean Luick Reviewed-by: Dennis Dalessandro Reviewed-by: Mark Debbage Reviewed-by: Jubin John Signed-off-by: Jianxin Xiong Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/chip.c | 27 +++++++++-------- drivers/staging/rdma/hfi1/diag.c | 3 +- drivers/staging/rdma/hfi1/pio.c | 52 +++++++++++++++++++++++++------- drivers/staging/rdma/hfi1/pio.h | 4 ++- 4 files changed, 61 insertions(+), 25 deletions(-) diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c index 1984454ff116..96badb49da9e 100644 --- a/drivers/staging/rdma/hfi1/chip.c +++ b/drivers/staging/rdma/hfi1/chip.c @@ -5661,7 +5661,7 @@ static int sc_to_vl(struct hfi1_devdata *dd, int sw_index) sci = &dd->send_contexts[sw_index]; /* there is no information for user (PSM) and ack contexts */ - if (sci->type != SC_KERNEL) + if ((sci->type != SC_KERNEL) && (sci->type != SC_VL15)) return -1; sc = sci->sc; @@ -9627,6 +9627,7 @@ static void set_send_length(struct hfi1_pportdata *ppd) & SEND_LEN_CHECK1_LEN_VL15_MASK) << SEND_LEN_CHECK1_LEN_VL15_SHIFT; int i; + u32 thres; for (i = 0; i < ppd->vls_supported; i++) { if (dd->vld[i].mtu > maxvlmtu) @@ -9645,16 +9646,17 @@ static void set_send_length(struct hfi1_pportdata *ppd) /* adjust kernel credit return thresholds based on new MTUs */ /* all kernel receive contexts have the same hdrqentsize */ for (i = 0; i < ppd->vls_supported; i++) { - sc_set_cr_threshold(dd->vld[i].sc, - sc_mtu_to_threshold(dd->vld[i].sc, - dd->vld[i].mtu, - dd->rcd[0]-> - rcvhdrqentsize)); - } - sc_set_cr_threshold(dd->vld[15].sc, - sc_mtu_to_threshold(dd->vld[15].sc, - dd->vld[15].mtu, + thres = min(sc_percent_to_threshold(dd->vld[i].sc, 50), + sc_mtu_to_threshold(dd->vld[i].sc, + dd->vld[i].mtu, dd->rcd[0]->rcvhdrqentsize)); + sc_set_cr_threshold(dd->vld[i].sc, thres); + } + thres = min(sc_percent_to_threshold(dd->vld[15].sc, 50), + sc_mtu_to_threshold(dd->vld[15].sc, + dd->vld[15].mtu, + dd->rcd[0]->rcvhdrqentsize)); + sc_set_cr_threshold(dd->vld[15].sc, thres); /* Adjust maximum MTU for the port in DC */ dcmtu = maxvlmtu == 10240 ? 
DCC_CFG_PORT_MTU_CAP_10240 : @@ -12728,12 +12730,13 @@ static int set_up_context_variables(struct hfi1_devdata *dd) dd->num_send_contexts = ret; dd_dev_info( dd, - "send contexts: chip %d, used %d (kernel %d, ack %d, user %d)\n", + "send contexts: chip %d, used %d (kernel %d, ack %d, user %d, vl15 %d)\n", dd->chip_send_contexts, dd->num_send_contexts, dd->sc_sizes[SC_KERNEL].count, dd->sc_sizes[SC_ACK].count, - dd->sc_sizes[SC_USER].count); + dd->sc_sizes[SC_USER].count, + dd->sc_sizes[SC_VL15].count); ret = 0; /* success */ } diff --git a/drivers/staging/rdma/hfi1/diag.c b/drivers/staging/rdma/hfi1/diag.c index c5b520bf610e..bb2409ad891a 100644 --- a/drivers/staging/rdma/hfi1/diag.c +++ b/drivers/staging/rdma/hfi1/diag.c @@ -413,7 +413,8 @@ static ssize_t diagpkt_send(struct diag_pkt *dp) goto bail; } /* can only use kernel contexts */ - if (dd->send_contexts[dp->sw_index].type != SC_KERNEL) { + if (dd->send_contexts[dp->sw_index].type != SC_KERNEL && + dd->send_contexts[dp->sw_index].type != SC_VL15) { ret = -EINVAL; goto bail; } diff --git a/drivers/staging/rdma/hfi1/pio.c b/drivers/staging/rdma/hfi1/pio.c index c6849ce9e5eb..c67b9ad3fcf4 100644 --- a/drivers/staging/rdma/hfi1/pio.c +++ b/drivers/staging/rdma/hfi1/pio.c @@ -139,23 +139,30 @@ void pio_send_control(struct hfi1_devdata *dd, int op) /* Send Context Size (SCS) wildcards */ #define SCS_POOL_0 -1 #define SCS_POOL_1 -2 + /* Send Context Count (SCC) wildcards */ #define SCC_PER_VL -1 #define SCC_PER_CPU -2 - #define SCC_PER_KRCVQ -3 -#define SCC_ACK_CREDITS 32 + +/* Send Context Size (SCS) constants */ +#define SCS_ACK_CREDITS 32 +#define SCS_VL15_CREDITS 102 /* 3 pkts of 2048B data + 128B header */ + +#define PIO_THRESHOLD_CEILING 4096 #define PIO_WAIT_BATCH_SIZE 5 /* default send context sizes */ static struct sc_config_sizes sc_config_sizes[SC_MAX] = { [SC_KERNEL] = { .size = SCS_POOL_0, /* even divide, pool 0 */ - .count = SCC_PER_VL },/* one per NUMA */ - [SC_ACK] = { .size = SCC_ACK_CREDITS, + .count = SCC_PER_VL }, /* one per NUMA */ + [SC_ACK] = { .size = SCS_ACK_CREDITS, .count = SCC_PER_KRCVQ }, [SC_USER] = { .size = SCS_POOL_0, /* even divide, pool 0 */ .count = SCC_PER_CPU }, /* one per CPU */ + [SC_VL15] = { .size = SCS_VL15_CREDITS, + .count = 1 }, }; @@ -202,7 +209,8 @@ static int wildcard_to_pool(int wc) static const char *sc_type_names[SC_MAX] = { "kernel", "ack", - "user" + "user", + "vl15" }; static const char *sc_type_name(int index) @@ -230,6 +238,22 @@ int init_sc_pools_and_sizes(struct hfi1_devdata *dd) int extra; int i; + /* + * When SDMA is enabled, kernel context pio packet size is capped by + * "piothreshold". Reduce pio buffer allocation for kernel context by + * setting it to a fixed size. The allocation allows 3-deep buffering + * of the largest pio packets plus up to 128 bytes header, sufficient + * to maintain verbs performance. + * + * When SDMA is disabled, keep the default pooling allocation. + */ + if (HFI1_CAP_IS_KSET(SDMA)) { + u16 max_pkt_size = (piothreshold < PIO_THRESHOLD_CEILING) ? 
+ piothreshold : PIO_THRESHOLD_CEILING; + sc_config_sizes[SC_KERNEL].size = + 3 * (max_pkt_size + 128) / PIO_BLOCK_SIZE; + } + /* * Step 0: * - copy the centipercents/absolute sizes from the pool config @@ -311,7 +335,7 @@ int init_sc_pools_and_sizes(struct hfi1_devdata *dd) if (i == SC_ACK) { count = dd->n_krcv_queues; } else if (i == SC_KERNEL) { - count = (INIT_SC_PER_VL * num_vls) + 1 /* VL15 */; + count = INIT_SC_PER_VL * num_vls; } else if (count == SCC_PER_CPU) { count = dd->num_rcv_contexts - dd->n_krcv_queues; } else if (count < 0) { @@ -596,7 +620,7 @@ u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize) * Return value is what to write into the CSR: trigger return when * unreturned credits pass this count. */ -static u32 sc_percent_to_threshold(struct send_context *sc, u32 percent) +u32 sc_percent_to_threshold(struct send_context *sc, u32 percent) { return (sc->credits * percent) / 100; } @@ -790,7 +814,10 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type, * For Ack contexts, set a threshold for half the credits. * For User contexts use the given percentage. This has been * sanitized on driver start-up. - * For Kernel contexts, use the default MTU plus a header. + * For Kernel contexts, use the default MTU plus a header + * or half the credits, whichever is smaller. This should + * work for both the 3-deep buffering allocation and the + * pooling allocation. */ if (type == SC_ACK) { thresh = sc_percent_to_threshold(sc, 50); @@ -798,7 +825,9 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type, thresh = sc_percent_to_threshold(sc, user_credit_return_threshold); } else { /* kernel */ - thresh = sc_mtu_to_threshold(sc, hfi1_max_mtu, hdrqentsize); + thresh = min(sc_percent_to_threshold(sc, 50), + sc_mtu_to_threshold(sc, hfi1_max_mtu, + hdrqentsize)); } reg = thresh << SC(CREDIT_CTRL_THRESHOLD_SHIFT); /* add in early return */ @@ -1531,7 +1560,8 @@ static void sc_piobufavail(struct send_context *sc) unsigned long flags; unsigned i, n = 0; - if (dd->send_contexts[sc->sw_index].type != SC_KERNEL) + if (dd->send_contexts[sc->sw_index].type != SC_KERNEL && + dd->send_contexts[sc->sw_index].type != SC_VL15) return; list = &sc->piowait; /* @@ -1900,7 +1930,7 @@ int init_pervl_scs(struct hfi1_devdata *dd) u32 ctxt; struct hfi1_pportdata *ppd = dd->pport; - dd->vld[15].sc = sc_alloc(dd, SC_KERNEL, + dd->vld[15].sc = sc_alloc(dd, SC_VL15, dd->rcd[0]->rcvhdrqentsize, dd->node); if (!dd->vld[15].sc) goto nomem; diff --git a/drivers/staging/rdma/hfi1/pio.h b/drivers/staging/rdma/hfi1/pio.h index 0026976ce4f6..53a08edb7f64 100644 --- a/drivers/staging/rdma/hfi1/pio.h +++ b/drivers/staging/rdma/hfi1/pio.h @@ -51,7 +51,8 @@ #define SC_KERNEL 0 #define SC_ACK 1 #define SC_USER 2 -#define SC_MAX 3 +#define SC_VL15 3 +#define SC_MAX 4 /* invalid send context index */ #define INVALID_SCI 0xff @@ -293,6 +294,7 @@ void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context); void sc_add_credit_return_intr(struct send_context *sc); void sc_del_credit_return_intr(struct send_context *sc); void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold); +u32 sc_percent_to_threshold(struct send_context *sc, u32 percent); u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize); void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint); void sc_wait(struct hfi1_devdata *dd); From 372cc85a13c97d6c743cb8df25b52fa5e93d73f6 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Tue, 12 Apr 2016 11:30:51 -0700 Subject: [PATCH 
27/46] IB/hfi1: Extract RSM map table init from QOS Refactor the allocation, tracking, and writing of the RSM map table into its own set of routines. This will allow the map table to be passed to multiple users to fill in as needed. Start with the original user, QOS. Reviewed-by: Dennis Dalessandro Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/chip.c | 91 +++++++++++++++++----- drivers/staging/rdma/hfi1/chip_registers.h | 1 + 2 files changed, 71 insertions(+), 21 deletions(-) diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c index 96badb49da9e..234c0784b97b 100644 --- a/drivers/staging/rdma/hfi1/chip.c +++ b/drivers/staging/rdma/hfi1/chip.c @@ -123,6 +123,8 @@ struct flag_table { #define MIN_KERNEL_KCTXTS 2 #define FIRST_KERNEL_KCTXT 1 +/* sizes for both the QP and RSM map tables */ +#define NUM_MAP_ENTRIES 256 #define NUM_MAP_REGS 32 /* Bit offset into the GUID which carries HFI id information */ @@ -13422,9 +13424,52 @@ static void init_qpmap_table(struct hfi1_devdata *dd, | RCV_CTRL_RCV_BYPASS_ENABLE_SMASK); } +struct rsm_map_table { + u64 map[NUM_MAP_REGS]; + unsigned int used; +}; + +/* + * Return an initialized RMT map table for users to fill in. OK if it + * returns NULL, indicating no table. + */ +static struct rsm_map_table *alloc_rsm_map_table(struct hfi1_devdata *dd) +{ + struct rsm_map_table *rmt; + u8 rxcontext = is_ax(dd) ? 0 : 0xff; /* 0 is default if a0 ver. */ + + rmt = kmalloc(sizeof(*rmt), GFP_KERNEL); + if (rmt) { + memset(rmt->map, rxcontext, sizeof(rmt->map)); + rmt->used = 0; + } + + return rmt; +} + +/* + * Write the final RMT map table to the chip and free the table. OK if + * table is NULL. + */ +static void complete_rsm_map_table(struct hfi1_devdata *dd, + struct rsm_map_table *rmt) +{ + int i; + + if (rmt) { + /* write table to chip */ + for (i = 0; i < NUM_MAP_REGS; i++) + write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rmt->map[i]); + + /* enable RSM */ + add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK); + } +} + /** * init_qos - init RX qos * @dd - device data + * @rmt - RSM map table * * This routine initializes Rule 0 and the RSM map table to implement * quality of service (qos). @@ -13435,16 +13480,16 @@ static void init_qpmap_table(struct hfi1_devdata *dd, * The number of vl bits (n) and the number of qpn bits (m) are computed to * feed both the RSM map table and the single rule. */ -static void init_qos(struct hfi1_devdata *dd) +static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt) { u8 max_by_vl = 0; unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m; - u64 *rsmmap; + unsigned int rmt_entries; u64 reg; - u8 rxcontext = is_ax(dd) ? 0 : 0xff; /* 0 is default if a0 ver. */ /* validate */ - if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS || + if (!rmt || + dd->n_krcv_queues <= MIN_KERNEL_KCTXTS || num_vls == 1 || krcvqsset <= 1) goto bail; @@ -13460,11 +13505,11 @@ static void init_qos(struct hfi1_devdata *dd) m = ilog2(qpns_per_vl); if ((m + n) > 7) goto bail; - rsmmap = kmalloc_array(NUM_MAP_REGS, sizeof(u64), GFP_KERNEL); - if (!rsmmap) + /* enough room in the map table? 
*/ + rmt_entries = 1 << (m + n); + if (rmt->used + rmt_entries >= NUM_MAP_ENTRIES) goto bail; - memset(rsmmap, rxcontext, NUM_MAP_REGS * sizeof(u64)); - /* init the local copy of the table */ + /* add qos entries to the the RSM map table */ for (i = 0, ctxt = FIRST_KERNEL_KCTXT; i < num_vls; i++) { unsigned tctxt; @@ -13472,26 +13517,24 @@ static void init_qos(struct hfi1_devdata *dd) krcvqs[i] && qpn < qpns_per_vl; qpn++) { unsigned idx, regoff, regidx; - /* generate index <= 128 */ - idx = (qpn << n) ^ i; + /* generate the index the hardware will produce */ + idx = rmt->used + ((qpn << n) ^ i); regoff = (idx % 8) * 8; regidx = idx / 8; - reg = rsmmap[regidx]; - /* replace 0xff with context number */ + /* replace default with context number */ + reg = rmt->map[regidx]; reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK << regoff); reg |= (u64)(tctxt++) << regoff; - rsmmap[regidx] = reg; + rmt->map[regidx] = reg; if (tctxt == ctxt + krcvqs[i]) tctxt = ctxt; } ctxt += krcvqs[i]; } - /* flush cached copies to chip */ - for (i = 0; i < NUM_MAP_REGS; i++) - write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rsmmap[i]); /* add rule0 */ write_csr(dd, RCV_RSM_CFG /* + (8 * 0) */, + (u64)rmt->used << RCV_RSM_CFG_OFFSET_SHIFT | RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK << RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT | 2ull << RCV_RSM_CFG_PACKET_TYPE_SHIFT); @@ -13507,9 +13550,8 @@ static void init_qos(struct hfi1_devdata *dd) LRH_BTH_VALUE << RCV_RSM_MATCH_VALUE1_SHIFT | LRH_SC_MASK << RCV_RSM_MATCH_MASK2_SHIFT | LRH_SC_VALUE << RCV_RSM_MATCH_VALUE2_SHIFT); - /* Enable RSM */ - add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK); - kfree(rsmmap); + /* mark RSM map entries as used */ + rmt->used += rmt_entries; /* map everything else to the mcast/err/vl15 context */ init_qpmap_table(dd, HFI1_CTRL_CTXT, HFI1_CTRL_CTXT); dd->qos_shift = n + 1; @@ -13521,10 +13563,17 @@ static void init_qos(struct hfi1_devdata *dd) static void init_rxe(struct hfi1_devdata *dd) { + struct rsm_map_table *rmt; + /* enable all receive errors */ write_csr(dd, RCV_ERR_MASK, ~0ull); - /* setup QPN map table - start where VL15 context leaves off */ - init_qos(dd); + + rmt = alloc_rsm_map_table(dd); + /* set up QOS, including the QPN map table */ + init_qos(dd, rmt); + complete_rsm_map_table(dd, rmt); + kfree(rmt); + /* * make sure RcvCtrl.RcvWcb <= PCIe Device Control * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config diff --git a/drivers/staging/rdma/hfi1/chip_registers.h b/drivers/staging/rdma/hfi1/chip_registers.h index 770f05c9b8de..8744de6667c2 100644 --- a/drivers/staging/rdma/hfi1/chip_registers.h +++ b/drivers/staging/rdma/hfi1/chip_registers.h @@ -771,6 +771,7 @@ #define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK 0x1ull #define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT 0 #define RCV_RSM_CFG_PACKET_TYPE_SHIFT 60 +#define RCV_RSM_CFG_OFFSET_SHIFT 32 #define RCV_RSM_MAP_TABLE (RXE + 0x000000000900) #define RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK 0xFFull #define RCV_RSM_MATCH (RXE + 0x000000000800) From 4a818bedf7b6087f4bbf875847b11f1946775934 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Tue, 12 Apr 2016 11:31:11 -0700 Subject: [PATCH 28/46] IB/hfi1: Move QOS decision logic into its own function The decision to use QOS affects other resource allocation. Move the QOS decision logic into its own function so it can be called by other interested parties. 
Reviewed-by: Dennis Dalessandro Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/chip.c | 67 ++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 17 deletions(-) diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c index 234c0784b97b..b2c037ce451e 100644 --- a/drivers/staging/rdma/hfi1/chip.c +++ b/drivers/staging/rdma/hfi1/chip.c @@ -13466,6 +13466,50 @@ static void complete_rsm_map_table(struct hfi1_devdata *dd, } } +/* return the number of RSM map table entries that will be used for QOS */ +static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp, + unsigned int *np) +{ + int i; + unsigned int m, n; + u8 max_by_vl = 0; + + /* is QOS active at all? */ + if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS || + num_vls == 1 || + krcvqsset <= 1) + goto no_qos; + + /* determine bits for qpn */ + for (i = 0; i < min_t(unsigned int, num_vls, krcvqsset); i++) + if (krcvqs[i] > max_by_vl) + max_by_vl = krcvqs[i]; + if (max_by_vl > 32) + goto no_qos; + m = ilog2(__roundup_pow_of_two(max_by_vl)); + + /* determine bits for vl */ + n = ilog2(__roundup_pow_of_two(num_vls)); + + /* reject if too much is used */ + if ((m + n) > 7) + goto no_qos; + + if (mp) + *mp = m; + if (np) + *np = n; + + return 1 << (m + n); + +no_qos: + if (mp) + *mp = 0; + if (np) + *np = 0; + return 0; +} + /** * init_qos - init RX qos * @dd - device data @@ -13482,33 +13526,22 @@ static void complete_rsm_map_table(struct hfi1_devdata *dd, */ static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt) { - u8 max_by_vl = 0; unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m; unsigned int rmt_entries; u64 reg; - /* validate */ - if (!rmt || - dd->n_krcv_queues <= MIN_KERNEL_KCTXTS || - num_vls == 1 || - krcvqsset <= 1) + if (!rmt) goto bail; - for (i = 0; i < min_t(unsigned, num_vls, krcvqsset); i++) - if (krcvqs[i] > max_by_vl) - max_by_vl = krcvqs[i]; - if (max_by_vl > 32) - goto bail; - qpns_per_vl = __roundup_pow_of_two(max_by_vl); - /* determine bits vl */ - n = ilog2(__roundup_pow_of_two(num_vls)); - /* determine bits for qpn */ - m = ilog2(qpns_per_vl); - if ((m + n) > 7) + rmt_entries = qos_rmt_entries(dd, &m, &n); + if (rmt_entries == 0) goto bail; + qpns_per_vl = 1 << m; + /* enough room in the map table? */ rmt_entries = 1 << (m + n); if (rmt->used + rmt_entries >= NUM_MAP_ENTRIES) goto bail; + /* add qos entries to the the RSM map table */ for (i = 0, ctxt = FIRST_KERNEL_KCTXT; i < num_vls; i++) { unsigned tctxt; From b12349ae13e13b9d07dfda4c1484f91c44c4b469 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Tue, 12 Apr 2016 11:31:33 -0700 Subject: [PATCH 29/46] IB/hfi1: Create a routine to set a receive side mapping rule Move the rule setting code into its own routine for improved searchability and reuse. 
Reviewed-by: Dennis Dalessandro Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/chip.c | 75 ++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 18 deletions(-) diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c index b2c037ce451e..b5edc3ac3c17 100644 --- a/drivers/staging/rdma/hfi1/chip.c +++ b/drivers/staging/rdma/hfi1/chip.c @@ -13429,6 +13429,21 @@ struct rsm_map_table { unsigned int used; }; +struct rsm_rule_data { + u8 offset; + u8 pkt_type; + u32 field1_off; + u32 field2_off; + u32 index1_off; + u32 index1_width; + u32 index2_off; + u32 index2_width; + u32 mask1; + u32 value1; + u32 mask2; + u32 value2; +}; + /* * Return an initialized RMT map table for users to fill in. OK if it * returns NULL, indicating no table. @@ -13466,6 +13481,30 @@ static void complete_rsm_map_table(struct hfi1_devdata *dd, } } +/* + * Add a receive side mapping rule. + */ +static void add_rsm_rule(struct hfi1_devdata *dd, u8 rule_index, + struct rsm_rule_data *rrd) +{ + write_csr(dd, RCV_RSM_CFG + (8 * rule_index), + (u64)rrd->offset << RCV_RSM_CFG_OFFSET_SHIFT | + 1ull << rule_index | /* enable bit */ + (u64)rrd->pkt_type << RCV_RSM_CFG_PACKET_TYPE_SHIFT); + write_csr(dd, RCV_RSM_SELECT + (8 * rule_index), + (u64)rrd->field1_off << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT | + (u64)rrd->field2_off << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT | + (u64)rrd->index1_off << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT | + (u64)rrd->index1_width << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT | + (u64)rrd->index2_off << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT | + (u64)rrd->index2_width << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT); + write_csr(dd, RCV_RSM_MATCH + (8 * rule_index), + (u64)rrd->mask1 << RCV_RSM_MATCH_MASK1_SHIFT | + (u64)rrd->value1 << RCV_RSM_MATCH_VALUE1_SHIFT | + (u64)rrd->mask2 << RCV_RSM_MATCH_MASK2_SHIFT | + (u64)rrd->value2 << RCV_RSM_MATCH_VALUE2_SHIFT); +} + /* return the number of RSM map table entries that will be used for QOS */ static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp, unsigned int *np) @@ -13526,6 +13565,7 @@ static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp, */ static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt) { + struct rsm_rule_data rrd; unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m; unsigned int rmt_entries; u64 reg; @@ -13565,24 +13605,23 @@ static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt) } ctxt += krcvqs[i]; } - /* add rule0 */ - write_csr(dd, RCV_RSM_CFG /* + (8 * 0) */, - (u64)rmt->used << RCV_RSM_CFG_OFFSET_SHIFT | - RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK << - RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT | - 2ull << RCV_RSM_CFG_PACKET_TYPE_SHIFT); - write_csr(dd, RCV_RSM_SELECT /* + (8 * 0) */, - LRH_BTH_MATCH_OFFSET << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT | - LRH_SC_MATCH_OFFSET << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT | - LRH_SC_SELECT_OFFSET << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT | - ((u64)n) << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT | - QPN_SELECT_OFFSET << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT | - ((u64)m + (u64)n) << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT); - write_csr(dd, RCV_RSM_MATCH /* + (8 * 0) */, - LRH_BTH_MASK << RCV_RSM_MATCH_MASK1_SHIFT | - LRH_BTH_VALUE << RCV_RSM_MATCH_VALUE1_SHIFT | - LRH_SC_MASK << RCV_RSM_MATCH_MASK2_SHIFT | - LRH_SC_VALUE << RCV_RSM_MATCH_VALUE2_SHIFT); + + rrd.offset = rmt->used; + rrd.pkt_type = 2; + rrd.field1_off = LRH_BTH_MATCH_OFFSET; + rrd.field2_off = LRH_SC_MATCH_OFFSET; + rrd.index1_off = LRH_SC_SELECT_OFFSET; + 
rrd.index1_width = n; + rrd.index2_off = QPN_SELECT_OFFSET; + rrd.index2_width = m + n; + rrd.mask1 = LRH_BTH_MASK; + rrd.value1 = LRH_BTH_VALUE; + rrd.mask2 = LRH_SC_MASK; + rrd.value2 = LRH_SC_VALUE; + + /* add rule 0 */ + add_rsm_rule(dd, 0, &rrd); + /* mark RSM map entries as used */ rmt->used += rmt_entries; /* map everything else to the mcast/err/vl15 context */ From 8f000f7f6e0ad8973f76cc64ecbc4d84099ff959 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Tue, 12 Apr 2016 11:32:06 -0700 Subject: [PATCH 30/46] IB/hfi1: Add RSM rule for user FECN handling Add a receive side mapping rule to extract expected user packets with the FECN bit set and place them in an eager buffer. This will allow user libraries to recognize that a FECN was sent when using header suppression and respond appropriately. Reviewed-by: Dennis Dalessandro Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/chip.c | 84 ++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c index b5edc3ac3c17..386309c67efd 100644 --- a/drivers/staging/rdma/hfi1/chip.c +++ b/drivers/staging/rdma/hfi1/chip.c @@ -1034,6 +1034,8 @@ static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc); static void handle_temp_err(struct hfi1_devdata *); static void dc_shutdown(struct hfi1_devdata *); static void dc_start(struct hfi1_devdata *); +static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp, + unsigned int *np); /* * Error interrupt table entry. This is used as input to the interrupt @@ -12628,6 +12630,8 @@ static int set_up_context_variables(struct hfi1_devdata *dd) int total_contexts; int ret; unsigned ngroups; + int qos_rmt_count; + int user_rmt_reduced; /* * Kernel receive contexts: @@ -12683,6 +12687,19 @@ static int set_up_context_variables(struct hfi1_devdata *dd) total_contexts = num_kernel_contexts + num_user_contexts; } + /* each user context requires an entry in the RMT */ + qos_rmt_count = qos_rmt_entries(dd, NULL, NULL); + if (qos_rmt_count + num_user_contexts > NUM_MAP_ENTRIES) { + user_rmt_reduced = NUM_MAP_ENTRIES - qos_rmt_count; + dd_dev_err(dd, + "RMT size is reducing the number of user receive contexts from %d to %d\n", + (int)num_user_contexts, + user_rmt_reduced); + /* recalculate */ + num_user_contexts = user_rmt_reduced; + total_contexts = num_kernel_contexts + num_user_contexts; + } + /* the first N are kernel contexts, the rest are user contexts */ dd->num_rcv_contexts = total_contexts; dd->n_krcv_queues = num_kernel_contexts; @@ -13633,6 +13650,72 @@ static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt) init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1); } +static void init_user_fecn_handling(struct hfi1_devdata *dd, + struct rsm_map_table *rmt) +{ + struct rsm_rule_data rrd; + u64 reg; + int i, idx, regoff, regidx; + u8 offset; + + /* there needs to be enough room in the map table */ + if (rmt->used + dd->num_user_contexts >= NUM_MAP_ENTRIES) { + dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n"); + return; + } + + /* + * RSM will extract the destination context as an index into the + * map table. The destination contexts are a sequential block + * in the range first_user_ctxt...num_rcv_contexts-1 (inclusive). + * Map entries are accessed as offset + extracted value. 
Adjust + * the added offset so this sequence can be placed anywhere in + * the table - as long as the entries themselves do not wrap. + * There are only enough bits in offset for the table size, so + * start with that to allow for a "negative" offset. + */ + offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used - + (int)dd->first_user_ctxt); + + for (i = dd->first_user_ctxt, idx = rmt->used; + i < dd->num_rcv_contexts; i++, idx++) { + /* replace with identity mapping */ + regoff = (idx % 8) * 8; + regidx = idx / 8; + reg = rmt->map[regidx]; + reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK << regoff); + reg |= (u64)i << regoff; + rmt->map[regidx] = reg; + } + + /* + * For RSM intercept of Expected FECN packets: + * o packet type 0 - expected + * o match on F (bit 95), using select/match 1, and + * o match on SH (bit 133), using select/match 2. + * + * Use index 1 to extract the 8-bit receive context from DestQP + * (start at bit 64). Use that as the RSM map table index. + */ + rrd.offset = offset; + rrd.pkt_type = 0; + rrd.field1_off = 95; + rrd.field2_off = 133; + rrd.index1_off = 64; + rrd.index1_width = 8; + rrd.index2_off = 0; + rrd.index2_width = 0; + rrd.mask1 = 1; + rrd.value1 = 1; + rrd.mask2 = 1; + rrd.value2 = 1; + + /* add rule 1 */ + add_rsm_rule(dd, 1, &rrd); + + rmt->used += dd->num_user_contexts; +} + static void init_rxe(struct hfi1_devdata *dd) { struct rsm_map_table *rmt; @@ -13643,6 +13726,7 @@ static void init_rxe(struct hfi1_devdata *dd) rmt = alloc_rsm_map_table(dd); /* set up QOS, including the QPN map table */ init_qos(dd, rmt); + init_user_fecn_handling(dd, rmt); complete_rsm_map_table(dd, rmt); kfree(rmt); From f9b5635cbe7bcacb94870e44387df133dd851b10 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 14 Apr 2016 08:31:30 -0700 Subject: [PATCH 31/46] IB/hfi1: Ignore link downgrade with 0 lanes Versions of the 8051 firmware < 0.38 may report a link failure as a link downgrade with a width of 0 followed by a link down notification. Ignore the zero width downgrade notification - the driver should follow the link down path. Reviewed-by: Easwar Hariharan Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/chip.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c index 386309c67efd..f50f1b132af9 100644 --- a/drivers/staging/rdma/hfi1/chip.c +++ b/drivers/staging/rdma/hfi1/chip.c @@ -7336,7 +7336,11 @@ void apply_link_downgrade_policy(struct hfi1_pportdata *ppd, int refresh_widths) ppd->link_width_downgrade_rx_active = rx; } - if (lwde == 0) { + if (ppd->link_width_downgrade_tx_active == 0 || + ppd->link_width_downgrade_rx_active == 0) { + /* the 8051 reported a dead link as a downgrade */ + dd_dev_err(ppd->dd, "Link downgrade is really a link down, ignoring\n"); + } else if (lwde == 0) { /* downgrade is disabled */ /* bounce if not at starting active width */ From feb831ddf2f2cfbc4d26d9df3a982790f6fbbf6a Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 14 Apr 2016 08:31:36 -0700 Subject: [PATCH 32/46] IB/hfi1: Use the neighbor link down reason only when valid The 8051 uses a link down reason to inform the driver why the link went down. The neighbor planned link down reason code is only valid when a link down idle message is received by the 8051. Enhance the explanation on why the link went down. 
Reviewed-by: Easwar Hariharan Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/chip.c | 133 ++++++++++++++++++++++++++++--- drivers/staging/rdma/hfi1/chip.h | 6 ++ 2 files changed, 130 insertions(+), 9 deletions(-) diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c index f50f1b132af9..45ff8aea9132 100644 --- a/drivers/staging/rdma/hfi1/chip.c +++ b/drivers/staging/rdma/hfi1/chip.c @@ -1031,6 +1031,7 @@ static int thermal_init(struct hfi1_devdata *dd); static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state, int msecs); static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc); +static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr); static void handle_temp_err(struct hfi1_devdata *); static void dc_shutdown(struct hfi1_devdata *); static void dc_start(struct hfi1_devdata *); @@ -6812,6 +6813,75 @@ static void reset_neighbor_info(struct hfi1_pportdata *ppd) ppd->neighbor_fm_security = 0; } +static const char * const link_down_reason_strs[] = { + [OPA_LINKDOWN_REASON_NONE] = "None", + [OPA_LINKDOWN_REASON_RCV_ERROR_0] = "Recive error 0", + [OPA_LINKDOWN_REASON_BAD_PKT_LEN] = "Bad packet length", + [OPA_LINKDOWN_REASON_PKT_TOO_LONG] = "Packet too long", + [OPA_LINKDOWN_REASON_PKT_TOO_SHORT] = "Packet too short", + [OPA_LINKDOWN_REASON_BAD_SLID] = "Bad SLID", + [OPA_LINKDOWN_REASON_BAD_DLID] = "Bad DLID", + [OPA_LINKDOWN_REASON_BAD_L2] = "Bad L2", + [OPA_LINKDOWN_REASON_BAD_SC] = "Bad SC", + [OPA_LINKDOWN_REASON_RCV_ERROR_8] = "Receive error 8", + [OPA_LINKDOWN_REASON_BAD_MID_TAIL] = "Bad mid tail", + [OPA_LINKDOWN_REASON_RCV_ERROR_10] = "Receive error 10", + [OPA_LINKDOWN_REASON_PREEMPT_ERROR] = "Preempt error", + [OPA_LINKDOWN_REASON_PREEMPT_VL15] = "Preempt vl15", + [OPA_LINKDOWN_REASON_BAD_VL_MARKER] = "Bad VL marker", + [OPA_LINKDOWN_REASON_RCV_ERROR_14] = "Receive error 14", + [OPA_LINKDOWN_REASON_RCV_ERROR_15] = "Receive error 15", + [OPA_LINKDOWN_REASON_BAD_HEAD_DIST] = "Bad head distance", + [OPA_LINKDOWN_REASON_BAD_TAIL_DIST] = "Bad tail distance", + [OPA_LINKDOWN_REASON_BAD_CTRL_DIST] = "Bad control distance", + [OPA_LINKDOWN_REASON_BAD_CREDIT_ACK] = "Bad credit ack", + [OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER] = "Unsupported VL marker", + [OPA_LINKDOWN_REASON_BAD_PREEMPT] = "Bad preempt", + [OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT] = "Bad control flit", + [OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT] = "Exceed multicast limit", + [OPA_LINKDOWN_REASON_RCV_ERROR_24] = "Receive error 24", + [OPA_LINKDOWN_REASON_RCV_ERROR_25] = "Receive error 25", + [OPA_LINKDOWN_REASON_RCV_ERROR_26] = "Receive error 26", + [OPA_LINKDOWN_REASON_RCV_ERROR_27] = "Receive error 27", + [OPA_LINKDOWN_REASON_RCV_ERROR_28] = "Receive error 28", + [OPA_LINKDOWN_REASON_RCV_ERROR_29] = "Receive error 29", + [OPA_LINKDOWN_REASON_RCV_ERROR_30] = "Receive error 30", + [OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN] = + "Excessive buffer overrun", + [OPA_LINKDOWN_REASON_UNKNOWN] = "Unknown", + [OPA_LINKDOWN_REASON_REBOOT] = "Reboot", + [OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN] = "Neighbor unknown", + [OPA_LINKDOWN_REASON_FM_BOUNCE] = "FM bounce", + [OPA_LINKDOWN_REASON_SPEED_POLICY] = "Speed policy", + [OPA_LINKDOWN_REASON_WIDTH_POLICY] = "Width policy", + [OPA_LINKDOWN_REASON_DISCONNECTED] = "Disconnected", + [OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED] = + "Local media not installed", + [OPA_LINKDOWN_REASON_NOT_INSTALLED] = "Not installed", + [OPA_LINKDOWN_REASON_CHASSIS_CONFIG] = "Chassis config", + 
[OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED] = + "End to end not installed", + [OPA_LINKDOWN_REASON_POWER_POLICY] = "Power policy", + [OPA_LINKDOWN_REASON_LINKSPEED_POLICY] = "Link speed policy", + [OPA_LINKDOWN_REASON_LINKWIDTH_POLICY] = "Link width policy", + [OPA_LINKDOWN_REASON_SWITCH_MGMT] = "Switch management", + [OPA_LINKDOWN_REASON_SMA_DISABLED] = "SMA disabled", + [OPA_LINKDOWN_REASON_TRANSIENT] = "Transient" +}; + +/* return the neighbor link down reason string */ +static const char *link_down_reason_str(u8 reason) +{ + const char *str = NULL; + + if (reason < ARRAY_SIZE(link_down_reason_strs)) + str = link_down_reason_strs[reason]; + if (!str) + str = "(invalid)"; + + return str; +} + /* * Handle a link down interrupt from the 8051. * @@ -6820,8 +6890,11 @@ static void reset_neighbor_info(struct hfi1_pportdata *ppd) void handle_link_down(struct work_struct *work) { u8 lcl_reason, neigh_reason = 0; + u8 link_down_reason; struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, - link_down_work); + link_down_work); + int was_up; + static const char ldr_str[] = "Link down reason: "; if ((ppd->host_link_state & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) && @@ -6830,17 +6903,51 @@ void handle_link_down(struct work_struct *work) HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NOT_INSTALLED); /* Go offline first, then deal with reading/writing through 8051 */ + was_up = !!(ppd->host_link_state & HLS_UP); set_link_state(ppd, HLS_DN_OFFLINE); - lcl_reason = 0; - read_planned_down_reason_code(ppd->dd, &neigh_reason); + if (was_up) { + lcl_reason = 0; + /* link down reason is only valid if the link was up */ + read_link_down_reason(ppd->dd, &link_down_reason); + switch (link_down_reason) { + case LDR_LINK_TRANSFER_ACTIVE_LOW: + /* the link went down, no idle message reason */ + dd_dev_info(ppd->dd, "%sUnexpected link down\n", + ldr_str); + break; + case LDR_RECEIVED_LINKDOWN_IDLE_MSG: + /* + * The neighbor reason is only valid if an idle message + * was received for it. + */ + read_planned_down_reason_code(ppd->dd, &neigh_reason); + dd_dev_info(ppd->dd, + "%sNeighbor link down message %d, %s\n", + ldr_str, neigh_reason, + link_down_reason_str(neigh_reason)); + break; + case LDR_RECEIVED_HOST_OFFLINE_REQ: + dd_dev_info(ppd->dd, + "%sHost requested link to go offline\n", + ldr_str); + break; + default: + dd_dev_info(ppd->dd, "%sUnknown reason 0x%x\n", + ldr_str, link_down_reason); + break; + } - /* - * If no reason, assume peer-initiated but missed - * LinkGoingDown idle flits. - */ - if (neigh_reason == 0) - lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN; + /* + * If no reason, assume peer-initiated but missed + * LinkGoingDown idle flits. 
+ */ + if (neigh_reason == 0) + lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN; + } else { + /* went down while polling or going up */ + lcl_reason = OPA_LINKDOWN_REASON_TRANSIENT; + } set_link_down_reason(ppd, lcl_reason, neigh_reason, 0); @@ -8627,6 +8734,14 @@ static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc) *pdrrc = (frame >> DOWN_REMOTE_REASON_SHIFT) & DOWN_REMOTE_REASON_MASK; } +static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr) +{ + u32 frame; + + read_8051_config(dd, LINK_DOWN_REASON, GENERAL_CONFIG, &frame); + *ldr = (frame & 0xff); +} + static int read_tx_settings(struct hfi1_devdata *dd, u8 *enable_lane_tx, u8 *tx_polarity_inversion, diff --git a/drivers/staging/rdma/hfi1/chip.h b/drivers/staging/rdma/hfi1/chip.h index e02e00619adc..1948706fff1a 100644 --- a/drivers/staging/rdma/hfi1/chip.h +++ b/drivers/staging/rdma/hfi1/chip.h @@ -389,6 +389,7 @@ #define LAST_REMOTE_STATE_COMPLETE 0x13 #define LINK_QUALITY_INFO 0x14 #define REMOTE_DEVICE_ID 0x15 +#define LINK_DOWN_REASON 0x16 /* 8051 lane specific register field IDs */ #define TX_EQ_SETTINGS 0x00 @@ -497,6 +498,11 @@ #define PWRM_BER_CONTROL 0x1 #define PWRM_BANDWIDTH_CONTROL 0x2 +/* 8051 link down reasons */ +#define LDR_LINK_TRANSFER_ACTIVE_LOW 0xa +#define LDR_RECEIVED_LINKDOWN_IDLE_MSG 0xb +#define LDR_RECEIVED_HOST_OFFLINE_REQ 0xc + /* verify capability fabric CRC size bits */ enum { CAP_CRC_14B = (1 << 0), /* 14b CRC */ From 015e91fbc9c6ab48596aa38fc0f8c8aab44036f9 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 14 Apr 2016 08:31:42 -0700 Subject: [PATCH 33/46] IB/hfi1: Correctly report neighbor link down reason The code to save the link down reason for reporting to the SMA was in a location before the actual reason was read. Move the SMA link down reason assignment to a better location. Reviewed-by: Easwar Hariharan Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/chip.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c index 45ff8aea9132..dcae8e723f98 100644 --- a/drivers/staging/rdma/hfi1/chip.c +++ b/drivers/staging/rdma/hfi1/chip.c @@ -6951,6 +6951,15 @@ void handle_link_down(struct work_struct *work) set_link_down_reason(ppd, lcl_reason, neigh_reason, 0); + /* inform the SMA when the link transitions from up to down */ + if (was_up && ppd->local_link_down_reason.sma == 0 && + ppd->neigh_link_down_reason.sma == 0) { + ppd->local_link_down_reason.sma = + ppd->local_link_down_reason.latest; + ppd->neigh_link_down_reason.sma = + ppd->neigh_link_down_reason.latest; + } + reset_neighbor_info(ppd); /* disable the port */ @@ -10106,7 +10115,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) struct hfi1_devdata *dd = ppd->dd; struct ib_event event = {.device = NULL}; int ret1, ret = 0; - int was_up, is_down; int orig_new_state, poll_bounce; mutex_lock(&ppd->hls_lock); @@ -10125,8 +10133,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) poll_bounce ? 
"(bounce) " : "", link_state_reason_name(ppd, state)); - was_up = !!(ppd->host_link_state & HLS_UP); - /* * If we're going to a (HLS_*) link state that implies the logical * link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then @@ -10337,17 +10343,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) break; } - is_down = !!(ppd->host_link_state & (HLS_DN_POLL | - HLS_DN_DISABLE | HLS_DN_OFFLINE)); - - if (was_up && is_down && ppd->local_link_down_reason.sma == 0 && - ppd->neigh_link_down_reason.sma == 0) { - ppd->local_link_down_reason.sma = - ppd->local_link_down_reason.latest; - ppd->neigh_link_down_reason.sma = - ppd->neigh_link_down_reason.latest; - } - goto done; unexpected: From 1cbaa670355e4a4e339ac97167fb8ecf536045d3 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 14 Apr 2016 08:31:48 -0700 Subject: [PATCH 34/46] IB/hfi1: Fix MAD port poll for active cables A MAD directive to start polling must go through the normal link tuning and start steps in order to correctly handle active cables. Reviewed-by: Easwar Hariharan Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/mad.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/staging/rdma/hfi1/mad.c b/drivers/staging/rdma/hfi1/mad.c index d1e7f4d7cf6f..ed58cf21e790 100644 --- a/drivers/staging/rdma/hfi1/mad.c +++ b/drivers/staging/rdma/hfi1/mad.c @@ -999,7 +999,21 @@ static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp, break; } - set_link_state(ppd, link_state); + if ((link_state == HLS_DN_POLL || + link_state == HLS_DN_DOWNDEF)) { + /* + * Going to poll. No matter what the current state, + * always move offline first, then tune and start the + * link. This correctly handles a FM link bounce and + * a link enable. Going offline is a no-op if already + * offline. + */ + set_link_state(ppd, HLS_DN_OFFLINE); + tune_serdes(ppd); + start_link(ppd); + } else { + set_link_state(ppd, link_state); + } if (link_state == HLS_DN_DISABLE && (ppd->offline_disabled_reason > HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED) || From d35cf74492c5ba0d8e1c08755c78be4ef3af650e Mon Sep 17 00:00:00 2001 From: Jubin John Date: Thu, 14 Apr 2016 08:31:53 -0700 Subject: [PATCH 35/46] IB/hfi1: Serialize hrtimer function calls hrtimer functions do not guarantee serialization, so we extend the cca_timer_lock to cover the hrtimer_forward_now() in the hrtimer callback handler and the hrtimer_start() in process_becn(). This prevents races between these 2 functions to update the hrtimer state leading to problems such as: kernel BUG at kernel/hrtimer.c:1282! encountered during validation of the CCA feature. 
Reviewed-by: Mike Marciniszyn Signed-off-by: Jubin John Signed-off-by: Doug Ledford --- drivers/staging/rdma/hfi1/init.c | 20 +++++++++----------- drivers/staging/rdma/hfi1/rc.c | 7 +++---- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/staging/rdma/hfi1/init.c index b1582b5a169e..502b7cf4647d 100644 --- a/drivers/staging/rdma/hfi1/init.c +++ b/drivers/staging/rdma/hfi1/init.c @@ -422,9 +422,10 @@ static enum hrtimer_restart cca_timer_fn(struct hrtimer *t) struct cca_timer *cca_timer; struct hfi1_pportdata *ppd; int sl; - u16 ccti, ccti_timer, ccti_min; + u16 ccti_timer, ccti_min; struct cc_state *cc_state; unsigned long flags; + enum hrtimer_restart ret = HRTIMER_NORESTART; cca_timer = container_of(t, struct cca_timer, hrtimer); ppd = cca_timer->ppd; @@ -450,24 +451,21 @@ static enum hrtimer_restart cca_timer_fn(struct hrtimer *t) spin_lock_irqsave(&ppd->cca_timer_lock, flags); - ccti = cca_timer->ccti; - - if (ccti > ccti_min) { + if (cca_timer->ccti > ccti_min) { cca_timer->ccti--; set_link_ipg(ppd); } - spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); - - rcu_read_unlock(); - - if (ccti > ccti_min) { + if (cca_timer->ccti > ccti_min) { unsigned long nsec = 1024 * ccti_timer; /* ccti_timer is in units of 1.024 usec */ hrtimer_forward_now(t, ns_to_ktime(nsec)); - return HRTIMER_RESTART; + ret = HRTIMER_RESTART; } - return HRTIMER_NORESTART; + + spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); + rcu_read_unlock(); + return ret; } /* diff --git a/drivers/staging/rdma/hfi1/rc.c b/drivers/staging/rdma/hfi1/rc.c index c6147793bf4b..792f15eb8efe 100644 --- a/drivers/staging/rdma/hfi1/rc.c +++ b/drivers/staging/rdma/hfi1/rc.c @@ -2021,8 +2021,6 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, if (sl >= OPA_MAX_SLS) return; - cca_timer = &ppd->cca_timer[sl]; - cc_state = get_cc_state(ppd); if (!cc_state) @@ -2041,6 +2039,7 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, spin_lock_irqsave(&ppd->cca_timer_lock, flags); + cca_timer = &ppd->cca_timer[sl]; if (cca_timer->ccti < ccti_limit) { if (cca_timer->ccti + ccti_incr <= ccti_limit) cca_timer->ccti += ccti_incr; @@ -2049,8 +2048,6 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, set_link_ipg(ppd); } - spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); - ccti = cca_timer->ccti; if (!hrtimer_active(&cca_timer->hrtimer)) { @@ -2061,6 +2058,8 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, HRTIMER_MODE_REL); } + spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); + if ((trigger_threshold != 0) && (ccti >= trigger_threshold)) log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type); } From 4c8bb95921e9ac01b9dd0c3abbaf6514ce88af92 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 24 Apr 2016 22:18:59 +0200 Subject: [PATCH 36/46] RDMA/nes: don't leak skb if carrier down Alternatively one could free the skb, OTOH I don't think this test is useful so just remove it. 
Cc: Signed-off-by: Florian Westphal Signed-off-by: Doug Ledford --- drivers/infiniband/hw/nes/nes_nic.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c index 3ea9e055fdd3..92914539edc7 100644 --- a/drivers/infiniband/hw/nes/nes_nic.c +++ b/drivers/infiniband/hw/nes/nes_nic.c @@ -500,9 +500,6 @@ static int nes_netdev_start_xmit(struct sk_buff *skb, struct net_device *netdev) * skb_shinfo(skb)->nr_frags, skb_is_gso(skb)); */ - if (!netif_carrier_ok(netdev)) - return NETDEV_TX_OK; - if (netif_queue_stopped(netdev)) return NETDEV_TX_BUSY; From 9c674815d346305068b27bf03b5e86b659a1b111 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Apr 2016 17:06:28 -0400 Subject: [PATCH 37/46] IB/iser: Fix max_sectors calculation iSER currently has a couple places that set max_sectors in either the host template or SCSI host, and all of them get it wrong. This patch instead uses a single assignment that (hopefully) gets it right: the max_sectors value must be derived from the number of segments in the FR or FMR structure, but actually be one lower than the page size multiplied by the number of sectors, as it has to handle the case of non-aligned I/O. Without this I get trivial to reproduce hangs when running xfstests (on XFS) over iSER to Linux targets. Signed-off-by: Christoph Hellwig Reviewed-by: Max Gurtovoy Acked-by: Sagi Grimberg Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/iser/iscsi_iser.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 80b6bedc172f..64b3d11dcf1e 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -612,6 +612,7 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, struct Scsi_Host *shost; struct iser_conn *iser_conn = NULL; struct ib_conn *ib_conn; + u32 max_fr_sectors; u16 max_cmds; shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0); @@ -632,7 +633,6 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, iser_conn = ep->dd_data; max_cmds = iser_conn->max_cmds; shost->sg_tablesize = iser_conn->scsi_sg_tablesize; - shost->max_sectors = iser_conn->scsi_max_sectors; mutex_lock(&iser_conn->state_mutex); if (iser_conn->state != ISER_CONN_UP) { @@ -657,8 +657,6 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, */ shost->sg_tablesize = min_t(unsigned short, shost->sg_tablesize, ib_conn->device->ib_device->attrs.max_fast_reg_page_list_len); - shost->max_sectors = min_t(unsigned int, - 1024, (shost->sg_tablesize * PAGE_SIZE) >> 9); if (iscsi_host_add(shost, ib_conn->device->ib_device->dma_device)) { @@ -672,6 +670,15 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, goto free_host; } + /* + * FRs or FMRs can only map up to a (device) page per entry, but if the + * first entry is misaligned we'll end up using using two entries + * (head and tail) for a single page worth data, so we have to drop + * one segment from the calculation. 
+ */ + max_fr_sectors = ((shost->sg_tablesize - 1) * PAGE_SIZE) >> 9; + shost->max_sectors = min(iser_max_sectors, max_fr_sectors); + if (cmds_max > max_cmds) { iser_info("cmds_max changed from %u to %u\n", cmds_max, max_cmds); @@ -989,7 +996,6 @@ static struct scsi_host_template iscsi_iser_sht = { .queuecommand = iscsi_queuecommand, .change_queue_depth = scsi_change_queue_depth, .sg_tablesize = ISCSI_ISER_DEF_SG_TABLESIZE, - .max_sectors = ISER_DEF_MAX_SECTORS, .cmd_per_lun = ISER_DEF_CMD_PER_LUN, .eh_abort_handler = iscsi_eh_abort, .eh_device_reset_handler= iscsi_eh_device_reset, From 9dec900c20d95ef1f3c40bc5d5901499f5d63381 Mon Sep 17 00:00:00 2001 From: Hariprasad S Date: Thu, 5 May 2016 01:27:29 +0530 Subject: [PATCH 38/46] RDMA/iw_cxgb4: release ep resources on accept arp failure If ARP fails before the CPL_PASS_ACCEPT_RPL is seen by hardware, the tid will be stuck in SYN_PEND and never released. So create an arp failure handler specifically for this message to release the endpoint resources. In pass_accept_rpl_arp_failure(), put the parent endpoint so it will be freed when destroyed. Also we don't need to call release_tid() here because _c4iw_free_ep() calls cxgb4_remove_tid() which releases the hwtid. If we get an ABORT_REQ_RSS instead of a PASS_ESTABLISH (because the peer's ACK to our SYN is never received), then put the parent as well in peer_abort(). Treat accept_cr() failures just like arp failures: put the parent ep and release the ep resources destroying the tid The ARP failure handlers are called in an atomic context, so we need to schedule some of the processing which might block. Namely _c4iw_free_ep() which needs a mutex. So create a "special" CPL opcode and handler and schedule it via sched() to be run by process_work() in a blockable context. Also rework the active open arp failure handler to make use of release_ep_resources(). This allows both the active and passive arp failure handlers to use the same deferred cleanup function. 
Signed-off-by: Steve Wise Signed-off-by: Hariprasad Shenai Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/cm.c | 82 +++++++++++++++++++++++++------- 1 file changed, 66 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 651711370d55..49784a40d1d1 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -145,6 +145,7 @@ static struct sk_buff_head rxq; static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp); static void ep_timeout(unsigned long arg); static void connect_reply_upcall(struct c4iw_ep *ep, int status); +static int sched(struct c4iw_dev *dev, struct sk_buff *skb); static LIST_HEAD(timeout_list); static spinlock_t timeout_lock; @@ -295,7 +296,7 @@ void _c4iw_free_ep(struct kref *kref) struct c4iw_ep *ep; ep = container_of(kref, struct c4iw_ep, com.kref); - PDBG("%s ep %p state %s\n", __func__, ep, states[state_read(&ep->com)]); + PDBG("%s ep %p state %s\n", __func__, ep, states[ep->com.state]); if (test_bit(QP_REFERENCED, &ep->com.flags)) deref_qp(ep); if (test_bit(RELEASE_RESOURCES, &ep->com.flags)) { @@ -432,10 +433,57 @@ static struct dst_entry *find_route(struct c4iw_dev *dev, __be32 local_ip, static void arp_failure_discard(void *handle, struct sk_buff *skb) { - PDBG("%s c4iw_dev %p\n", __func__, handle); + pr_err(MOD "ARP failure\n"); kfree_skb(skb); } +enum { + NUM_FAKE_CPLS = 1, + FAKE_CPL_PUT_EP_SAFE = NUM_CPL_CMDS + 0, +}; + +static int _put_ep_safe(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + + ep = *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *))); + release_ep_resources(ep); + return 0; +} + +/* + * Fake up a special CPL opcode and call sched() so process_work() will call + * _put_ep_safe() in a safe context to free the ep resources. This is needed + * because ARP error handlers are called in an ATOMIC context, and + * _c4iw_free_ep() needs to block. + */ +static void queue_arp_failure_cpl(struct c4iw_ep *ep, struct sk_buff *skb) +{ + struct cpl_act_establish *rpl = cplhdr(skb); + + /* Set our special ARP_FAILURE opcode */ + rpl->ot.opcode = FAKE_CPL_PUT_EP_SAFE; + + /* + * Save ep in the skb->cb area, after where sched() will save the dev + * ptr. + */ + *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *))) = ep; + sched(ep->com.dev, skb); +} + +/* Handle an ARP failure for an accept */ +static void pass_accept_rpl_arp_failure(void *handle, struct sk_buff *skb) +{ + struct c4iw_ep *ep = handle; + + pr_err(MOD "ARP failure during accept - tid %u -dropping connection\n", + ep->hwtid); + + __state_set(&ep->com, DEAD); + queue_arp_failure_cpl(ep, skb); +} + /* * Handle an ARP failure for an active open. 
*/ @@ -444,9 +492,8 @@ static void act_open_req_arp_failure(void *handle, struct sk_buff *skb) struct c4iw_ep *ep = handle; printk(KERN_ERR MOD "ARP failure during connect\n"); - kfree_skb(skb); connect_reply_upcall(ep, -EHOSTUNREACH); - state_set(&ep->com, DEAD); + __state_set(&ep->com, DEAD); if (ep->com.remote_addr.ss_family == AF_INET6) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ep->com.local_addr; @@ -455,9 +502,7 @@ static void act_open_req_arp_failure(void *handle, struct sk_buff *skb) } remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid); cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); - dst_release(ep->dst); - cxgb4_l2t_release(ep->l2t); - c4iw_put_ep(&ep->com); + queue_arp_failure_cpl(ep, skb); } /* @@ -2198,8 +2243,8 @@ static int close_listsrv_rpl(struct c4iw_dev *dev, struct sk_buff *skb) return 0; } -static void accept_cr(struct c4iw_ep *ep, struct sk_buff *skb, - struct cpl_pass_accept_req *req) +static int accept_cr(struct c4iw_ep *ep, struct sk_buff *skb, + struct cpl_pass_accept_req *req) { struct cpl_pass_accept_rpl *rpl; unsigned int mtu_idx; @@ -2287,10 +2332,9 @@ static void accept_cr(struct c4iw_ep *ep, struct sk_buff *skb, rpl->opt0 = cpu_to_be64(opt0); rpl->opt2 = cpu_to_be32(opt2); set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx); - t4_set_arp_err_handler(skb, NULL, arp_failure_discard); - c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); + t4_set_arp_err_handler(skb, ep, pass_accept_rpl_arp_failure); - return; + return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); } static void reject_cr(struct c4iw_dev *dev, u32 hwtid, struct sk_buff *skb) @@ -2469,8 +2513,12 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) init_timer(&child_ep->timer); cxgb4_insert_tid(t, child_ep, hwtid); insert_handle(dev, &dev->hwtid_idr, child_ep, child_ep->hwtid); - accept_cr(child_ep, skb, req); - set_bit(PASS_ACCEPT_REQ, &child_ep->com.history); + if (accept_cr(child_ep, skb, req)) { + c4iw_put_ep(&parent_ep->com); + release_ep_resources(child_ep); + } else { + set_bit(PASS_ACCEPT_REQ, &child_ep->com.history); + } if (iptype == 6) { sin6 = (struct sockaddr_in6 *)&child_ep->com.local_addr; cxgb4_clip_get(child_ep->com.dev->rdev.lldi.ports[0], @@ -2633,6 +2681,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) mutex_lock(&ep->com.mutex); switch (ep->com.state) { case CONNECTING: + c4iw_put_ep(&ep->parent_ep->com); break; case MPA_REQ_WAIT: (void)stop_ep_timer(ep); @@ -3809,7 +3858,7 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb) * These are the real handlers that are called from a * work queue. 
*/ -static c4iw_handler_func work_handlers[NUM_CPL_CMDS] = { +static c4iw_handler_func work_handlers[NUM_CPL_CMDS + NUM_FAKE_CPLS] = { [CPL_ACT_ESTABLISH] = act_establish, [CPL_ACT_OPEN_RPL] = act_open_rpl, [CPL_RX_DATA] = rx_data, @@ -3825,7 +3874,8 @@ static c4iw_handler_func work_handlers[NUM_CPL_CMDS] = { [CPL_RDMA_TERMINATE] = terminate, [CPL_FW4_ACK] = fw4_ack, [CPL_FW6_MSG] = deferred_fw6_msg, - [CPL_RX_PKT] = rx_pkt + [CPL_RX_PKT] = rx_pkt, + [FAKE_CPL_PUT_EP_SAFE] = _put_ep_safe }; static void process_timeout(struct c4iw_ep *ep) From 88bc230dc614b8e19000022d0ae2c1dfd578a0b0 Mon Sep 17 00:00:00 2001 From: Hariprasad S Date: Thu, 5 May 2016 01:27:30 +0530 Subject: [PATCH 39/46] RDMA/iw_cxgb4: stop ep timer on close failure In c4iw_ep_disconnect(), if we start the ep timer to begin a close, but send_halfclose() fails, we need to stop the timer and send a CLOSE event up to the IWCM before releasing the resources. Otherwise, we can crash when the ep timer fires if the ep is referencing a previous instance of the device. This can happen as part of adapter reset/recovery, for instance. Signed-off-by: Steve Wise Signed-off-by: Hariprasad Shenai Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/cm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 49784a40d1d1..cc9836e46800 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -3467,8 +3467,13 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) set_bit(EP_DISC_CLOSE, &ep->com.history); ret = send_halfclose(ep, gfp); } - if (ret) + if (ret) { + if (!abrupt) { + stop_ep_timer(ep); + close_complete_upcall(ep, -EIO); + } fatal = 1; + } } mutex_unlock(&ep->com.mutex); if (fatal) From 6e410d8f7175caf2316c515f1ea0bf80d33b3158 Mon Sep 17 00:00:00 2001 From: Hariprasad S Date: Thu, 5 May 2016 01:27:31 +0530 Subject: [PATCH 40/46] RDMA/iw_cxgb4: ensure eps don't get freed while the mutex is held In rx_data(), with the ep in FPDU_MODE, refcnt=2, if we get unexpected streaming data, we call c4iw_modify_rc_qp() and move the qp from RTS -> TERMINATE. In c4iw_modify_rc_qp(), if rdma_fini() returns an error, the ep will be dereferenced (refcnt=1). Then rx_data() calls c4iw_ep_disconnect() which starts the close operation. But if send_halfclose() fails in c4iw_ep_disconnect(), we will call release_ep_resources() derefing the ep which reduces the refcnt to 0 and and frees the ep. However we still has the ep mutex at that point, so we have a touch-after-free bug. There is a similar issue where peer_close() calls c4iw_ep_disconnect(). The solution is to add a reference to the ep in c4iw_ep_disconnect() after acquiring the mutex, and release it after releasing the mutex. Signed-off-by: Steve Wise Signed-off-by: Hariprasad Shenai Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/cm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index cc9836e46800..12eac98661c1 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -3416,6 +3416,12 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) PDBG("%s ep %p state %s, abrupt %d\n", __func__, ep, states[ep->com.state], abrupt); + /* + * Ref the ep here in case we have fatal errors causing the + * ep to be released and freed. 
+ */ + c4iw_get_ep(&ep->com); + rdev = &ep->com.dev->rdev; if (c4iw_fatal_error(rdev)) { fatal = 1; @@ -3476,6 +3482,7 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) } } mutex_unlock(&ep->com.mutex); + c4iw_put_ep(&ep->com); if (fatal) release_ep_resources(ep); return ret; From f8e1e1d13773e1bcad127cbb5be964d00ee1f682 Mon Sep 17 00:00:00 2001 From: Hariprasad S Date: Thu, 5 May 2016 01:27:32 +0530 Subject: [PATCH 41/46] RDMA/iw_cxgb4: remove connection abort from process_mpa_reply Instead, have the caller, rx_data() handle the close/abort like it does for process_mpa_request(). This is part of getting rid of abort_connection() altogether so we properly clean up on send_abort() failures. Signed-off-by: Steve Wise Signed-off-by: Hariprasad Shenai Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/cm.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 12eac98661c1..c4ce707d210b 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -1346,6 +1346,18 @@ static int update_rx_credits(struct c4iw_ep *ep, u32 credits) #define RELAXED_IRD_NEGOTIATION 1 +/* + * process_mpa_reply - process streaming mode MPA reply + * + * Returns: + * + * 0 upon success indicating a connect request was delivered to the ULP + * or the mpa request is incomplete but valid so far. + * + * 1 if a failure requires the caller to close the connection. + * + * 2 if a failure requires the caller to abort the connection. + */ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) { struct mpa_message *mpa; @@ -1575,8 +1587,7 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) } goto out; err: - __state_set(&ep->com, ABORTING); - send_abort(ep, skb, GFP_KERNEL); + disconnect = 2; out: connect_reply_upcall(ep, err); return disconnect; From fef4422d00c135da4300d7d58e62cd0afe2af730 Mon Sep 17 00:00:00 2001 From: Hariprasad S Date: Thu, 5 May 2016 01:27:33 +0530 Subject: [PATCH 42/46] RDMA/iw_cxgb4: free resources when send_flowc() fails Signed-off-by: Steve Wise Signed-off-by: Hariprasad Shenai Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/cm.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index c4ce707d210b..864da9dec9f6 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -519,7 +519,7 @@ static void abort_arp_failure(void *handle, struct sk_buff *skb) c4iw_ofld_send(rdev, skb); } -static void send_flowc(struct c4iw_ep *ep, struct sk_buff *skb) +static int send_flowc(struct c4iw_ep *ep, struct sk_buff *skb) { unsigned int flowclen = 80; struct fw_flowc_wr *flowc; @@ -575,7 +575,7 @@ static void send_flowc(struct c4iw_ep *ep, struct sk_buff *skb) } set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); - c4iw_ofld_send(&ep->com.dev->rdev, skb); + return c4iw_ofld_send(&ep->com.dev->rdev, skb); } static int send_halfclose(struct c4iw_ep *ep, gfp_t gfp) @@ -1119,6 +1119,7 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb) unsigned int tid = GET_TID(req); unsigned int atid = TID_TID_G(ntohl(req->tos_atid)); struct tid_info *t = dev->rdev.lldi.tids; + int ret; ep = lookup_atid(t, atid); @@ -1144,13 +1145,20 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb) set_bit(ACT_ESTAB, &ep->com.history); /* start MPA negotiation */ - send_flowc(ep, 
NULL); + ret = send_flowc(ep, NULL); + if (ret) + goto err; if (ep->retry_with_mpa_v1) send_mpa_req(ep, skb, 1); else send_mpa_req(ep, skb, mpa_rev); mutex_unlock(&ep->com.mutex); return 0; +err: + mutex_unlock(&ep->com.mutex); + connect_reply_upcall(ep, -ENOMEM); + c4iw_ep_disconnect(ep, 0, GFP_KERNEL); + return 0; } static void close_complete_upcall(struct c4iw_ep *ep, int status) @@ -2548,6 +2556,7 @@ static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb) struct cpl_pass_establish *req = cplhdr(skb); struct tid_info *t = dev->rdev.lldi.tids; unsigned int tid = GET_TID(req); + int ret; ep = lookup_tid(t, tid); PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); @@ -2560,10 +2569,14 @@ static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb) set_emss(ep, ntohs(req->tcp_opt)); dst_confirm(ep->dst); - state_set(&ep->com, MPA_REQ_WAIT); + mutex_lock(&ep->com.mutex); + ep->com.state = MPA_REQ_WAIT; start_ep_timer(ep); - send_flowc(ep, skb); set_bit(PASS_ESTAB, &ep->com.history); + ret = send_flowc(ep, skb); + mutex_unlock(&ep->com.mutex); + if (ret) + c4iw_ep_disconnect(ep, 1, GFP_KERNEL); return 0; } From eaf4c6d46a6948302b64be2b7149cce22131ee0d Mon Sep 17 00:00:00 2001 From: Hariprasad S Date: Thu, 5 May 2016 01:27:34 +0530 Subject: [PATCH 43/46] RDMA/iw_cxgb4: remove abort_connection() usage from accept/reject Use c4iw_ep_disconnect() instead. This is part of getting rid of abort_connection() altogether so we properly clean up on send_abort() failures. Signed-off-by: Steve Wise Signed-off-by: Hariprasad Shenai Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/cm.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 864da9dec9f6..d862369b5dd7 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -2922,14 +2922,14 @@ int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) set_bit(ULP_REJECT, &ep->com.history); BUG_ON(ep->com.state != MPA_REQ_RCVD); if (mpa_rev == 0) - abort_connection(ep, NULL, GFP_KERNEL); + disconnect = 2; else { err = send_mpa_reject(ep, pdata, pdata_len); disconnect = 1; } mutex_unlock(&ep->com.mutex); if (disconnect) - err = c4iw_ep_disconnect(ep, 0, GFP_KERNEL); + err = c4iw_ep_disconnect(ep, disconnect == 2, GFP_KERNEL); c4iw_put_ep(&ep->com); return 0; } @@ -2942,13 +2942,14 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct c4iw_ep *ep = to_ep(cm_id); struct c4iw_dev *h = to_c4iw_dev(cm_id->device); struct c4iw_qp *qp = get_qhp(h, conn_param->qpn); + int abort = 0; PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); mutex_lock(&ep->com.mutex); if (ep->com.state == DEAD) { err = -ECONNRESET; - goto err; + goto err_out; } BUG_ON(ep->com.state != MPA_REQ_RCVD); @@ -2957,9 +2958,8 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) set_bit(ULP_ACCEPT, &ep->com.history); if ((conn_param->ord > cur_max_read_depth(ep->com.dev)) || (conn_param->ird > cur_max_read_depth(ep->com.dev))) { - abort_connection(ep, NULL, GFP_KERNEL); err = -EINVAL; - goto err; + goto err_abort; } if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { @@ -2971,9 +2971,8 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) ep->ord = conn_param->ord; send_mpa_reject(ep, conn_param->private_data, conn_param->private_data_len); - abort_connection(ep, NULL, GFP_KERNEL); err = -ENOMEM; - 
goto err; + goto err_abort; } } if (conn_param->ird < ep->ord) { @@ -2981,9 +2980,8 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) ep->ord <= h->rdev.lldi.max_ordird_qp) { conn_param->ird = ep->ord; } else { - abort_connection(ep, NULL, GFP_KERNEL); err = -ENOMEM; - goto err; + goto err_abort; } } } @@ -3024,23 +3022,26 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, mask, &attrs, 1); if (err) - goto err1; + goto err_deref_cm_id; err = send_mpa_reply(ep, conn_param->private_data, conn_param->private_data_len); if (err) - goto err1; + goto err_deref_cm_id; __state_set(&ep->com, FPDU_MODE); established_upcall(ep); mutex_unlock(&ep->com.mutex); c4iw_put_ep(&ep->com); return 0; -err1: +err_deref_cm_id: ep->com.cm_id = NULL; - abort_connection(ep, NULL, GFP_KERNEL); cm_id->rem_ref(cm_id); -err: +err_abort: + abort = 1; +err_out: mutex_unlock(&ep->com.mutex); + if (abort) + c4iw_ep_disconnect(ep, 1, GFP_KERNEL); c4iw_put_ep(&ep->com); return err; } From fd6aabe48c8f76d31aacb55fc6c90af770632ae2 Mon Sep 17 00:00:00 2001 From: Hariprasad S Date: Thu, 5 May 2016 01:27:35 +0530 Subject: [PATCH 44/46] RDMA/iw_cxgb4: don't use abort_connection in process_mpa_request() Instead return whether the caller needs to disconnect. This is part of getting rid of abort_connection() altogether so we properly clean up on send_abort() failures. Signed-off-by: Steve Wise Signed-off-by: Hariprasad Shenai Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/cm.c | 64 +++++++++++++++++--------------- 1 file changed, 35 insertions(+), 29 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index d862369b5dd7..44e0bc409d59 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -1601,7 +1601,19 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) return disconnect; } -static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) +/* + * process_mpa_request - process streaming mode MPA request + * + * Returns: + * + * 0 upon success indicating a connect request was delivered to the ULP + * or the mpa request is incomplete but valid so far. + * + * 1 if a failure requires the caller to close the connection. + * + * 2 if a failure requires the caller to abort the connection. + */ +static int process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) { struct mpa_message *mpa; struct mpa_v2_conn_params *mpa_v2_params; @@ -1613,11 +1625,8 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) * If we get more than the supported amount of private data * then we must fail this connection. */ - if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) { - (void)stop_ep_timer(ep); - abort_connection(ep, skb, GFP_KERNEL); - return; - } + if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) + goto err_stop_timer; PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__); @@ -1633,7 +1642,7 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) * We'll continue process when more data arrives. */ if (ep->mpa_pkt_len < sizeof(*mpa)) - return; + return 0; PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__); mpa = (struct mpa_message *) ep->mpa_pkt; @@ -1644,43 +1653,32 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) if (mpa->revision > mpa_rev) { printk(KERN_ERR MOD "%s MPA version mismatch. 
Local = %d," " Received = %d\n", __func__, mpa_rev, mpa->revision); - (void)stop_ep_timer(ep); - abort_connection(ep, skb, GFP_KERNEL); - return; + goto err_stop_timer; } - if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) { - (void)stop_ep_timer(ep); - abort_connection(ep, skb, GFP_KERNEL); - return; - } + if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) + goto err_stop_timer; plen = ntohs(mpa->private_data_size); /* * Fail if there's too much private data. */ - if (plen > MPA_MAX_PRIVATE_DATA) { - (void)stop_ep_timer(ep); - abort_connection(ep, skb, GFP_KERNEL); - return; - } + if (plen > MPA_MAX_PRIVATE_DATA) + goto err_stop_timer; /* * If plen does not account for pkt size */ - if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { - (void)stop_ep_timer(ep); - abort_connection(ep, skb, GFP_KERNEL); - return; - } + if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) + goto err_stop_timer; ep->plen = (u8) plen; /* * If we don't have all the pdata yet, then bail. */ if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) - return; + return 0; /* * If we get here we have accumulated the entire mpa @@ -1742,13 +1740,21 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) SINGLE_DEPTH_NESTING); if (ep->parent_ep->com.state != DEAD) { if (connect_request_upcall(ep)) - abort_connection(ep, skb, GFP_KERNEL); + goto err_unlock_parent; } else { - abort_connection(ep, skb, GFP_KERNEL); + goto err_unlock_parent; } mutex_unlock(&ep->parent_ep->com.mutex); } - return; + return 0; + +err_unlock_parent: + mutex_unlock(&ep->parent_ep->com.mutex); + goto err_out; +err_stop_timer: + (void)stop_ep_timer(ep); +err_out: + return 2; } static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb) From c00dcbafac39760f567350ce0c1cef1e4bb28a64 Mon Sep 17 00:00:00 2001 From: Hariprasad S Date: Thu, 5 May 2016 01:27:36 +0530 Subject: [PATCH 45/46] RDMA/iw_cxgb4: move QP -> ERROR on fatal disconnect errors In c4iw_ep_disconnect(), if we fail to initiate a close operation, then move the qp to ERROR to disassociate the ep from the qp. Failure to do this will leak the ep resources. Signed-off-by: Steve Wise Signed-off-by: Hariprasad Shenai Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/cm.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 44e0bc409d59..aea69ca495f3 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -3509,6 +3509,19 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) stop_ep_timer(ep); close_complete_upcall(ep, -EIO); } + if (ep->com.qp) { + struct c4iw_qp_attributes attrs; + + attrs.next_state = C4IW_QP_STATE_ERROR; + ret = c4iw_modify_qp(ep->com.qp->rhp, + ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, + &attrs, 1); + if (ret) + pr_err(MOD + "%s - qp <- error failed!\n", + __func__); + } fatal = 1; } } From 6973627968acbdf7d6f45a4c4813d46bf8e2a66a Mon Sep 17 00:00:00 2001 From: Hariprasad S Date: Thu, 5 May 2016 01:27:37 +0530 Subject: [PATCH 46/46] RDMA/iw_cxgb4: remove abort_connection() usage from ep_timeout() Use c4iw_ep_disconnect() instead. This is part of getting rid of abort_connection() altogether so we properly clean up on send_abort() failures. This is the last user of abort_connection(), so remove it too. 
Signed-off-by: Steve Wise Signed-off-by: Hariprasad Shenai Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/cm.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index aea69ca495f3..d7f7ab34eeba 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -1179,14 +1179,6 @@ static void close_complete_upcall(struct c4iw_ep *ep, int status) } } -static int abort_connection(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp) -{ - PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); - __state_set(&ep->com, ABORTING); - set_bit(ABORT_CONN, &ep->com.history); - return send_abort(ep, skb, gfp); -} - static void peer_close_upcall(struct c4iw_ep *ep) { struct iw_cm_event event; @@ -3977,9 +3969,9 @@ static void process_timeout(struct c4iw_ep *ep) __func__, ep, ep->hwtid, ep->com.state); abort = 0; } - if (abort) - abort_connection(ep, NULL, GFP_KERNEL); mutex_unlock(&ep->com.mutex); + if (abort) + c4iw_ep_disconnect(ep, 1, GFP_KERNEL); c4iw_put_ep(&ep->com); }