Merge branches 'hfi1' and 'iw_cxgb4' into k.o/for-4.7

2016-05-05 16:42:09 -04:00 · 2016-05-05 16:42:09 -04:00 · 94d7f1a255
parent ccea5f0f01 d35cf74492 6973627968
commit 94d7f1a255
40 changed files with 1140 additions and 548 deletions
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@ -145,6 +145,7 @@ static struct sk_buff_head rxq;
 static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp);
 static void ep_timeout(unsigned long arg);
 static void connect_reply_upcall(struct c4iw_ep *ep, int status);
+static int sched(struct c4iw_dev *dev, struct sk_buff *skb);

 static LIST_HEAD(timeout_list);
 static spinlock_t timeout_lock;
@ -295,7 +296,7 @@ void _c4iw_free_ep(struct kref *kref)
 	struct c4iw_ep *ep;

 	ep = container_of(kref, struct c4iw_ep, com.kref);
-	PDBG("%s ep %p state %s\n", __func__, ep, states[state_read(&ep->com)]);
+	PDBG("%s ep %p state %s\n", __func__, ep, states[ep->com.state]);
 	if (test_bit(QP_REFERENCED, &ep->com.flags))
 		deref_qp(ep);
 	if (test_bit(RELEASE_RESOURCES, &ep->com.flags)) {
@ -432,10 +433,57 @@ static struct dst_entry *find_route(struct c4iw_dev *dev, __be32 local_ip,

 static void arp_failure_discard(void *handle, struct sk_buff *skb)
 {
-	PDBG("%s c4iw_dev %p\n", __func__, handle);
+	pr_err(MOD "ARP failure\n");
 	kfree_skb(skb);
 }

+enum {
+	NUM_FAKE_CPLS = 1,
+	FAKE_CPL_PUT_EP_SAFE = NUM_CPL_CMDS + 0,
+};
+
+static int _put_ep_safe(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct c4iw_ep *ep;
+
+	ep = *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *)));
+	release_ep_resources(ep);
+	return 0;
+}
+
+/*
+ * Fake up a special CPL opcode and call sched() so process_work() will call
+ * _put_ep_safe() in a safe context to free the ep resources.  This is needed
+ * because ARP error handlers are called in an ATOMIC context, and
+ * _c4iw_free_ep() needs to block.
+ */
+static void queue_arp_failure_cpl(struct c4iw_ep *ep, struct sk_buff *skb)
+{
+	struct cpl_act_establish *rpl = cplhdr(skb);
+
+	/* Set our special ARP_FAILURE opcode */
+	rpl->ot.opcode = FAKE_CPL_PUT_EP_SAFE;
+
+	/*
+	 * Save ep in the skb->cb area, after where sched() will save the dev
+	 * ptr.
+	 */
+	*((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *))) = ep;
+	sched(ep->com.dev, skb);
+}
+
+/* Handle an ARP failure for an accept */
+static void pass_accept_rpl_arp_failure(void *handle, struct sk_buff *skb)
+{
+	struct c4iw_ep *ep = handle;
+
+	pr_err(MOD "ARP failure during accept - tid %u -dropping connection\n",
+	       ep->hwtid);
+
+	__state_set(&ep->com, DEAD);
+	queue_arp_failure_cpl(ep, skb);
+}
+
 /*
 * Handle an ARP failure for an active open.
 */
@ -444,9 +492,8 @@ static void act_open_req_arp_failure(void *handle, struct sk_buff *skb)
 	struct c4iw_ep *ep = handle;

 	printk(KERN_ERR MOD "ARP failure during connect\n");
-	kfree_skb(skb);
 	connect_reply_upcall(ep, -EHOSTUNREACH);
-	state_set(&ep->com, DEAD);
+	__state_set(&ep->com, DEAD);
 	if (ep->com.remote_addr.ss_family == AF_INET6) {
 		struct sockaddr_in6 *sin6 =
 			(struct sockaddr_in6 *)&ep->com.local_addr;
@ -455,9 +502,7 @@ static void act_open_req_arp_failure(void *handle, struct sk_buff *skb)
 	}
 	remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
 	cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
-	dst_release(ep->dst);
-	cxgb4_l2t_release(ep->l2t);
-	c4iw_put_ep(&ep->com);
+	queue_arp_failure_cpl(ep, skb);
 }

 /*
@ -474,7 +519,7 @@ static void abort_arp_failure(void *handle, struct sk_buff *skb)
 	c4iw_ofld_send(rdev, skb);
 }

-static void send_flowc(struct c4iw_ep *ep, struct sk_buff *skb)
+static int send_flowc(struct c4iw_ep *ep, struct sk_buff *skb)
 {
 	unsigned int flowclen = 80;
 	struct fw_flowc_wr *flowc;
@ -530,7 +575,7 @@ static void send_flowc(struct c4iw_ep *ep, struct sk_buff *skb)
 	}

 	set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
-	c4iw_ofld_send(&ep->com.dev->rdev, skb);
+	return c4iw_ofld_send(&ep->com.dev->rdev, skb);
 }

 static int send_halfclose(struct c4iw_ep *ep, gfp_t gfp)
@ -1074,6 +1119,7 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb)
 	unsigned int tid = GET_TID(req);
 	unsigned int atid = TID_TID_G(ntohl(req->tos_atid));
 	struct tid_info *t = dev->rdev.lldi.tids;
+	int ret;

 	ep = lookup_atid(t, atid);

@ -1099,13 +1145,20 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb)
 	set_bit(ACT_ESTAB, &ep->com.history);

 	/* start MPA negotiation */
-	send_flowc(ep, NULL);
+	ret = send_flowc(ep, NULL);
+	if (ret)
+		goto err;
 	if (ep->retry_with_mpa_v1)
 		send_mpa_req(ep, skb, 1);
 	else
 		send_mpa_req(ep, skb, mpa_rev);
 	mutex_unlock(&ep->com.mutex);
 	return 0;
+err:
+	mutex_unlock(&ep->com.mutex);
+	connect_reply_upcall(ep, -ENOMEM);
+	c4iw_ep_disconnect(ep, 0, GFP_KERNEL);
+	return 0;
 }

 static void close_complete_upcall(struct c4iw_ep *ep, int status)
@ -1126,14 +1179,6 @@ static void close_complete_upcall(struct c4iw_ep *ep, int status)
 	}
 }

-static int abort_connection(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp)
-{
-	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
-	__state_set(&ep->com, ABORTING);
-	set_bit(ABORT_CONN, &ep->com.history);
-	return send_abort(ep, skb, gfp);
-}
-
 static void peer_close_upcall(struct c4iw_ep *ep)
 {
 	struct iw_cm_event event;
@ -1301,6 +1346,18 @@ static int update_rx_credits(struct c4iw_ep *ep, u32 credits)

 #define RELAXED_IRD_NEGOTIATION 1

+/*
+ * process_mpa_reply - process streaming mode MPA reply
+ *
+ * Returns:
+ *
+ * 0 upon success indicating a connect request was delivered to the ULP
+ * or the mpa request is incomplete but valid so far.
+ *
+ * 1 if a failure requires the caller to close the connection.
+ *
+ * 2 if a failure requires the caller to abort the connection.
+ */
 static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
 {
 	struct mpa_message *mpa;
@ -1530,14 +1587,25 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
 	}
 	goto out;
 err:
-	__state_set(&ep->com, ABORTING);
-	send_abort(ep, skb, GFP_KERNEL);
+	disconnect = 2;
 out:
 	connect_reply_upcall(ep, err);
 	return disconnect;
 }

-static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
+/*
+ * process_mpa_request - process streaming mode MPA request
+ *
+ * Returns:
+ *
+ * 0 upon success indicating a connect request was delivered to the ULP
+ * or the mpa request is incomplete but valid so far.
+ *
+ * 1 if a failure requires the caller to close the connection.
+ *
+ * 2 if a failure requires the caller to abort the connection.
+ */
+static int process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
 {
 	struct mpa_message *mpa;
 	struct mpa_v2_conn_params *mpa_v2_params;
@ -1549,11 +1617,8 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
 	 * If we get more than the supported amount of private data
 	 * then we must fail this connection.
 	 */
-	if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
-		(void)stop_ep_timer(ep);
-		abort_connection(ep, skb, GFP_KERNEL);
-		return;
-	}
+	if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt))
+		goto err_stop_timer;

 	PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__);

@ -1569,7 +1634,7 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
 	 * We'll continue process when more data arrives.
 	 */
 	if (ep->mpa_pkt_len < sizeof(*mpa))
-		return;
+		return 0;

 	PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__);
 	mpa = (struct mpa_message *) ep->mpa_pkt;
@ -1580,43 +1645,32 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
 	if (mpa->revision > mpa_rev) {
 		printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d,"
 		       " Received = %d\n", __func__, mpa_rev, mpa->revision);
-		(void)stop_ep_timer(ep);
-		abort_connection(ep, skb, GFP_KERNEL);
-		return;
+		goto err_stop_timer;
 	}

-	if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) {
-		(void)stop_ep_timer(ep);
-		abort_connection(ep, skb, GFP_KERNEL);
-		return;
-	}
+	if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key)))
+		goto err_stop_timer;

 	plen = ntohs(mpa->private_data_size);

 	/*
 	 * Fail if there's too much private data.
 	 */
-	if (plen > MPA_MAX_PRIVATE_DATA) {
-		(void)stop_ep_timer(ep);
-		abort_connection(ep, skb, GFP_KERNEL);
-		return;
-	}
+	if (plen > MPA_MAX_PRIVATE_DATA)
+		goto err_stop_timer;

 	/*
 	 * If plen does not account for pkt size
 	 */
-	if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
-		(void)stop_ep_timer(ep);
-		abort_connection(ep, skb, GFP_KERNEL);
-		return;
-	}
+	if (ep->mpa_pkt_len > (sizeof(*mpa) + plen))
+		goto err_stop_timer;
 	ep->plen = (u8) plen;

 	/*
 	 * If we don't have all the pdata yet, then bail.
 	 */
 	if (ep->mpa_pkt_len < (sizeof(*mpa) + plen))
-		return;
+		return 0;

 	/*
 	 * If we get here we have accumulated the entire mpa
@ -1678,13 +1732,21 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
 				  SINGLE_DEPTH_NESTING);
 		if (ep->parent_ep->com.state != DEAD) {
 			if (connect_request_upcall(ep))
-				abort_connection(ep, skb, GFP_KERNEL);
+				goto err_unlock_parent;
 		} else {
-			abort_connection(ep, skb, GFP_KERNEL);
+			goto err_unlock_parent;
 		}
 		mutex_unlock(&ep->parent_ep->com.mutex);
 	}
-	return;
+	return 0;
+
+err_unlock_parent:
+	mutex_unlock(&ep->parent_ep->com.mutex);
+	goto err_out;
+err_stop_timer:
+	(void)stop_ep_timer(ep);
+err_out:
+	return 2;
 }

 static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb)
@ -2198,8 +2260,8 @@ static int close_listsrv_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
 	return 0;
 }

-static void accept_cr(struct c4iw_ep *ep, struct sk_buff *skb,
-		      struct cpl_pass_accept_req *req)
+static int accept_cr(struct c4iw_ep *ep, struct sk_buff *skb,
+		     struct cpl_pass_accept_req *req)
 {
 	struct cpl_pass_accept_rpl *rpl;
 	unsigned int mtu_idx;
@ -2287,10 +2349,9 @@ static void accept_cr(struct c4iw_ep *ep, struct sk_buff *skb,
 	rpl->opt0 = cpu_to_be64(opt0);
 	rpl->opt2 = cpu_to_be32(opt2);
 	set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx);
-	t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
-	c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+	t4_set_arp_err_handler(skb, ep, pass_accept_rpl_arp_failure);

-	return;
+	return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
 }

 static void reject_cr(struct c4iw_dev *dev, u32 hwtid, struct sk_buff *skb)
@ -2469,8 +2530,12 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
 	init_timer(&child_ep->timer);
 	cxgb4_insert_tid(t, child_ep, hwtid);
 	insert_handle(dev, &dev->hwtid_idr, child_ep, child_ep->hwtid);
-	accept_cr(child_ep, skb, req);
-	set_bit(PASS_ACCEPT_REQ, &child_ep->com.history);
+	if (accept_cr(child_ep, skb, req)) {
+		c4iw_put_ep(&parent_ep->com);
+		release_ep_resources(child_ep);
+	} else {
+		set_bit(PASS_ACCEPT_REQ, &child_ep->com.history);
+	}
 	if (iptype == 6) {
 		sin6 = (struct sockaddr_in6 *)&child_ep->com.local_addr;
 		cxgb4_clip_get(child_ep->com.dev->rdev.lldi.ports[0],
@ -2489,6 +2554,7 @@ static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb)
 	struct cpl_pass_establish *req = cplhdr(skb);
 	struct tid_info *t = dev->rdev.lldi.tids;
 	unsigned int tid = GET_TID(req);
+	int ret;

 	ep = lookup_tid(t, tid);
 	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
@ -2501,10 +2567,14 @@ static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb)
 	set_emss(ep, ntohs(req->tcp_opt));

 	dst_confirm(ep->dst);
-	state_set(&ep->com, MPA_REQ_WAIT);
+	mutex_lock(&ep->com.mutex);
+	ep->com.state = MPA_REQ_WAIT;
 	start_ep_timer(ep);
-	send_flowc(ep, skb);
 	set_bit(PASS_ESTAB, &ep->com.history);
+	ret = send_flowc(ep, skb);
+	mutex_unlock(&ep->com.mutex);
+	if (ret)
+		c4iw_ep_disconnect(ep, 1, GFP_KERNEL);

 	return 0;
 }
@ -2633,6 +2703,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
 	mutex_lock(&ep->com.mutex);
 	switch (ep->com.state) {
 	case CONNECTING:
+		c4iw_put_ep(&ep->parent_ep->com);
 		break;
 	case MPA_REQ_WAIT:
 		(void)stop_ep_timer(ep);
@ -2849,14 +2920,14 @@ int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
 	set_bit(ULP_REJECT, &ep->com.history);
 	BUG_ON(ep->com.state != MPA_REQ_RCVD);
 	if (mpa_rev == 0)
-		abort_connection(ep, NULL, GFP_KERNEL);
+		disconnect = 2;
 	else {
 		err = send_mpa_reject(ep, pdata, pdata_len);
 		disconnect = 1;
 	}
 	mutex_unlock(&ep->com.mutex);
 	if (disconnect)
-		err = c4iw_ep_disconnect(ep, 0, GFP_KERNEL);
+		err = c4iw_ep_disconnect(ep, disconnect == 2, GFP_KERNEL);
 	c4iw_put_ep(&ep->com);
 	return 0;
 }
@ -2869,13 +2940,14 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 	struct c4iw_ep *ep = to_ep(cm_id);
 	struct c4iw_dev *h = to_c4iw_dev(cm_id->device);
 	struct c4iw_qp *qp = get_qhp(h, conn_param->qpn);
+	int abort = 0;

 	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);

 	mutex_lock(&ep->com.mutex);
 	if (ep->com.state == DEAD) {
 		err = -ECONNRESET;
-		goto err;
+		goto err_out;
 	}

 	BUG_ON(ep->com.state != MPA_REQ_RCVD);
@ -2884,9 +2956,8 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 	set_bit(ULP_ACCEPT, &ep->com.history);
 	if ((conn_param->ord > cur_max_read_depth(ep->com.dev)) ||
 	    (conn_param->ird > cur_max_read_depth(ep->com.dev))) {
-		abort_connection(ep, NULL, GFP_KERNEL);
 		err = -EINVAL;
-		goto err;
+		goto err_abort;
 	}

 	if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) {
@ -2898,9 +2969,8 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 				ep->ord = conn_param->ord;
 				send_mpa_reject(ep, conn_param->private_data,
 						conn_param->private_data_len);
-				abort_connection(ep, NULL, GFP_KERNEL);
 				err = -ENOMEM;
-				goto err;
+				goto err_abort;
 			}
 		}
 		if (conn_param->ird < ep->ord) {
@ -2908,9 +2978,8 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 			    ep->ord <= h->rdev.lldi.max_ordird_qp) {
 				conn_param->ird = ep->ord;
 			} else {
-				abort_connection(ep, NULL, GFP_KERNEL);
 				err = -ENOMEM;
-				goto err;
+				goto err_abort;
 			}
 		}
 	}
@ -2951,23 +3020,26 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 	err = c4iw_modify_qp(ep->com.qp->rhp,
 			     ep->com.qp, mask, &attrs, 1);
 	if (err)
-		goto err1;
+		goto err_deref_cm_id;
 	err = send_mpa_reply(ep, conn_param->private_data,
 			     conn_param->private_data_len);
 	if (err)
-		goto err1;
+		goto err_deref_cm_id;

 	__state_set(&ep->com, FPDU_MODE);
 	established_upcall(ep);
 	mutex_unlock(&ep->com.mutex);
 	c4iw_put_ep(&ep->com);
 	return 0;
-err1:
+err_deref_cm_id:
 	ep->com.cm_id = NULL;
-	abort_connection(ep, NULL, GFP_KERNEL);
 	cm_id->rem_ref(cm_id);
-err:
+err_abort:
+	abort = 1;
+err_out:
 	mutex_unlock(&ep->com.mutex);
+	if (abort)
+		c4iw_ep_disconnect(ep, 1, GFP_KERNEL);
 	c4iw_put_ep(&ep->com);
 	return err;
 }
@ -3367,6 +3439,12 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
 	PDBG("%s ep %p state %s, abrupt %d\n", __func__, ep,
 	     states[ep->com.state], abrupt);

+	/*
+	 * Ref the ep here in case we have fatal errors causing the
+	 * ep to be released and freed.
+	 */
+	c4iw_get_ep(&ep->com);
+
 	rdev = &ep->com.dev->rdev;
 	if (c4iw_fatal_error(rdev)) {
 		fatal = 1;
@ -3418,10 +3496,29 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
 			set_bit(EP_DISC_CLOSE, &ep->com.history);
 			ret = send_halfclose(ep, gfp);
 		}
-		if (ret)
+		if (ret) {
+			if (!abrupt) {
+				stop_ep_timer(ep);
+				close_complete_upcall(ep, -EIO);
+			}
+			if (ep->com.qp) {
+				struct c4iw_qp_attributes attrs;
+
+				attrs.next_state = C4IW_QP_STATE_ERROR;
+				ret = c4iw_modify_qp(ep->com.qp->rhp,
+						     ep->com.qp,
+						     C4IW_QP_ATTR_NEXT_STATE,
+						     &attrs, 1);
+				if (ret)
+					pr_err(MOD
+					       "%s - qp <- error failed!\n",
+					       __func__);
+			}
 			fatal = 1;
+		}
 	}
 	mutex_unlock(&ep->com.mutex);
+	c4iw_put_ep(&ep->com);
 	if (fatal)
 		release_ep_resources(ep);
 	return ret;
@ -3809,7 +3906,7 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb)
 * These are the real handlers that are called from a
 * work queue.
 */
-static c4iw_handler_func work_handlers[NUM_CPL_CMDS] = {
+static c4iw_handler_func work_handlers[NUM_CPL_CMDS + NUM_FAKE_CPLS] = {
 	[CPL_ACT_ESTABLISH] = act_establish,
 	[CPL_ACT_OPEN_RPL] = act_open_rpl,
 	[CPL_RX_DATA] = rx_data,
@ -3825,7 +3922,8 @@ static c4iw_handler_func work_handlers[NUM_CPL_CMDS] = {
 	[CPL_RDMA_TERMINATE] = terminate,
 	[CPL_FW4_ACK] = fw4_ack,
 	[CPL_FW6_MSG] = deferred_fw6_msg,
-	[CPL_RX_PKT] = rx_pkt
+	[CPL_RX_PKT] = rx_pkt,
+	[FAKE_CPL_PUT_EP_SAFE] = _put_ep_safe
 };

 static void process_timeout(struct c4iw_ep *ep)
@ -3871,9 +3969,9 @@ static void process_timeout(struct c4iw_ep *ep)
 			__func__, ep, ep->hwtid, ep->com.state);
 		abort = 0;
 	}
-	if (abort)
-		abort_connection(ep, NULL, GFP_KERNEL);
 	mutex_unlock(&ep->com.mutex);
+	if (abort)
+		c4iw_ep_disconnect(ep, 1, GFP_KERNEL);
 	c4iw_put_ep(&ep->com);
 }

--- a/drivers/infiniband/hw/nes/nes_nic.c
+++ b/drivers/infiniband/hw/nes/nes_nic.c
@ -500,9 +500,6 @@ static int nes_netdev_start_xmit(struct sk_buff *skb, struct net_device *netdev)
 	 *		skb_shinfo(skb)->nr_frags, skb_is_gso(skb));
 	 */

-	if (!netif_carrier_ok(netdev))
-		return NETDEV_TX_OK;
-
 	if (netif_queue_stopped(netdev))
 		return NETDEV_TX_BUSY;

--- a/drivers/infiniband/hw/qib/qib_init.c
+++ b/drivers/infiniband/hw/qib/qib_init.c
@ -1090,7 +1090,7 @@ void qib_free_devdata(struct qib_devdata *dd)
 	qib_dbg_ibdev_exit(&dd->verbs_dev);
 #endif
 	free_percpu(dd->int_counter);
-	ib_dealloc_device(&dd->verbs_dev.rdi.ibdev);
+	rvt_dealloc_device(&dd->verbs_dev.rdi);
 }

 u64 qib_int_counter(struct qib_devdata *dd)
@ -1183,7 +1183,7 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra)
 bail:
 	if (!list_empty(&dd->list))
 		list_del_init(&dd->list);
-	ib_dealloc_device(&dd->verbs_dev.rdi.ibdev);
+	rvt_dealloc_device(&dd->verbs_dev.rdi);
 	return ERR_PTR(ret);
 }

--- a/drivers/infiniband/hw/qib/qib_rc.c
+++ b/drivers/infiniband/hw/qib/qib_rc.c
@ -230,7 +230,7 @@ static int qib_make_rc_ack(struct qib_ibdev *dev, struct rvt_qp *qp,
 *
 * Return 1 if constructed; otherwise, return 0.
 */
-int qib_make_rc_req(struct rvt_qp *qp)
+int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags)
 {
 	struct qib_qp_priv *priv = qp->priv;
 	struct qib_ibdev *dev = to_idev(qp->ibqp.device);
--- a/drivers/infiniband/hw/qib/qib_ruc.c
+++ b/drivers/infiniband/hw/qib/qib_ruc.c
@ -739,7 +739,7 @@ void qib_do_send(struct rvt_qp *qp)
 	struct qib_qp_priv *priv = qp->priv;
 	struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
 	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
-	int (*make_req)(struct rvt_qp *qp);
+	int (*make_req)(struct rvt_qp *qp, unsigned long *flags);
 	unsigned long flags;

 	if ((qp->ibqp.qp_type == IB_QPT_RC ||
@ -781,7 +781,7 @@ void qib_do_send(struct rvt_qp *qp)
 			qp->s_hdrwords = 0;
 			spin_lock_irqsave(&qp->s_lock, flags);
 		}
-	} while (make_req(qp));
+	} while (make_req(qp, &flags));

 	spin_unlock_irqrestore(&qp->s_lock, flags);
 }
--- a/drivers/infiniband/hw/qib/qib_uc.c
+++ b/drivers/infiniband/hw/qib/qib_uc.c
@ -45,7 +45,7 @@
 *
 * Return 1 if constructed; otherwise, return 0.
 */
-int qib_make_uc_req(struct rvt_qp *qp)
+int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags)
 {
 	struct qib_qp_priv *priv = qp->priv;
 	struct qib_other_headers *ohdr;
--- a/drivers/infiniband/hw/qib/qib_ud.c
+++ b/drivers/infiniband/hw/qib/qib_ud.c
@ -238,7 +238,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 *
 * Return 1 if constructed; otherwise, return 0.
 */
-int qib_make_ud_req(struct rvt_qp *qp)
+int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags)
 {
 	struct qib_qp_priv *priv = qp->priv;
 	struct qib_other_headers *ohdr;
@ -294,7 +294,7 @@ int qib_make_ud_req(struct rvt_qp *qp)
 		this_cpu_inc(ibp->pmastats->n_unicast_xmit);
 		lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1);
 		if (unlikely(lid == ppd->lid)) {
-			unsigned long flags;
+			unsigned long tflags = *flags;
 			/*
 			 * If DMAs are in progress, we can't generate
 			 * a completion for the loopback packet since
@ -307,10 +307,10 @@ int qib_make_ud_req(struct rvt_qp *qp)
 				goto bail;
 			}
 			qp->s_cur = next_cur;
-			local_irq_save(flags);
-			spin_unlock_irqrestore(&qp->s_lock, flags);
+			spin_unlock_irqrestore(&qp->s_lock, tflags);
 			qib_ud_loopback(qp, wqe);
-			spin_lock_irqsave(&qp->s_lock, flags);
+			spin_lock_irqsave(&qp->s_lock, tflags);
+			*flags = tflags;
 			qib_send_complete(qp, wqe, IB_WC_SUCCESS);
 			goto done;
 		}
--- a/drivers/infiniband/hw/qib/qib_verbs.h
+++ b/drivers/infiniband/hw/qib/qib_verbs.h
@ -430,11 +430,11 @@ void qib_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,

 void qib_send_rc_ack(struct rvt_qp *qp);

-int qib_make_rc_req(struct rvt_qp *qp);
+int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags);

-int qib_make_uc_req(struct rvt_qp *qp);
+int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags);

-int qib_make_ud_req(struct rvt_qp *qp);
+int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags);

 int qib_register_ib_device(struct qib_devdata *);

--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@ -829,13 +829,13 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
 	case IB_QPT_SMI:
 	case IB_QPT_GSI:
 	case IB_QPT_UD:
-		qp->allowed_ops = IB_OPCODE_UD_SEND_ONLY & RVT_OPCODE_QP_MASK;
+		qp->allowed_ops = IB_OPCODE_UD;
 		break;
 	case IB_QPT_RC:
-		qp->allowed_ops = IB_OPCODE_RC_SEND_ONLY & RVT_OPCODE_QP_MASK;
+		qp->allowed_ops = IB_OPCODE_RC;
 		break;
 	case IB_QPT_UC:
-		qp->allowed_ops = IB_OPCODE_UC_SEND_ONLY & RVT_OPCODE_QP_MASK;
+		qp->allowed_ops = IB_OPCODE_UC;
 		break;
 	default:
 		ret = ERR_PTR(-EINVAL);
--- a/drivers/infiniband/sw/rdmavt/vt.c
+++ b/drivers/infiniband/sw/rdmavt/vt.c
@ -106,6 +106,19 @@ struct rvt_dev_info *rvt_alloc_device(size_t size, int nports)
 }
 EXPORT_SYMBOL(rvt_alloc_device);

+/**
+ * rvt_dealloc_device - deallocate rdi
+ * @rdi: structure to free
+ *
+ * Free a structure allocated with rvt_alloc_device()
+ */
+void rvt_dealloc_device(struct rvt_dev_info *rdi)
+{
+	kfree(rdi->ports);
+	ib_dealloc_device(&rdi->ibdev);
+}
+EXPORT_SYMBOL(rvt_dealloc_device);
+
 static int rvt_query_device(struct ib_device *ibdev,
 			    struct ib_device_attr *props,
 			    struct ib_udata *uhw)
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@ -612,6 +612,7 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
 	struct Scsi_Host *shost;
 	struct iser_conn *iser_conn = NULL;
 	struct ib_conn *ib_conn;
+	u32 max_fr_sectors;
 	u16 max_cmds;

 	shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0);
@ -632,7 +633,6 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
 		iser_conn = ep->dd_data;
 		max_cmds = iser_conn->max_cmds;
 		shost->sg_tablesize = iser_conn->scsi_sg_tablesize;
-		shost->max_sectors = iser_conn->scsi_max_sectors;

 		mutex_lock(&iser_conn->state_mutex);
 		if (iser_conn->state != ISER_CONN_UP) {
@ -657,8 +657,6 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
 		 */
 		shost->sg_tablesize = min_t(unsigned short, shost->sg_tablesize,
 			ib_conn->device->ib_device->attrs.max_fast_reg_page_list_len);
-		shost->max_sectors = min_t(unsigned int,
-			1024, (shost->sg_tablesize * PAGE_SIZE) >> 9);

 		if (iscsi_host_add(shost,
 				   ib_conn->device->ib_device->dma_device)) {
@ -672,6 +670,15 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
 			goto free_host;
 	}

+	/*
+	 * FRs or FMRs can only map up to a (device) page per entry, but if the
+	 * first entry is misaligned we'll end up using using two entries
+	 * (head and tail) for a single page worth data, so we have to drop
+	 * one segment from the calculation.
+	 */
+	max_fr_sectors = ((shost->sg_tablesize - 1) * PAGE_SIZE) >> 9;
+	shost->max_sectors = min(iser_max_sectors, max_fr_sectors);
+
 	if (cmds_max > max_cmds) {
 		iser_info("cmds_max changed from %u to %u\n",
 			  cmds_max, max_cmds);
@ -989,7 +996,6 @@ static struct scsi_host_template iscsi_iser_sht = {
 	.queuecommand           = iscsi_queuecommand,
 	.change_queue_depth	= scsi_change_queue_depth,
 	.sg_tablesize           = ISCSI_ISER_DEF_SG_TABLESIZE,
-	.max_sectors            = ISER_DEF_MAX_SECTORS,
 	.cmd_per_lun            = ISER_DEF_CMD_PER_LUN,
 	.eh_abort_handler       = iscsi_eh_abort,
 	.eh_device_reset_handler= iscsi_eh_device_reset,
--- a/drivers/staging/rdma/hfi1/affinity.c
+++ b/drivers/staging/rdma/hfi1/affinity.c
@ -53,20 +53,6 @@
 #include "sdma.h"
 #include "trace.h"

-struct cpu_mask_set {
-	struct cpumask mask;
-	struct cpumask used;
-	uint gen;
-};
-
-struct hfi1_affinity {
-	struct cpu_mask_set def_intr;
-	struct cpu_mask_set rcv_intr;
-	struct cpu_mask_set proc;
-	/* spin lock to protect affinity struct */
-	spinlock_t lock;
-};
-
 /* Name of IRQ types, indexed by enum irq_type */
 static const char * const irq_type_names[] = {
 	"SDMA",
@ -82,6 +68,48 @@ static inline void init_cpu_mask_set(struct cpu_mask_set *set)
 	set->gen = 0;
 }

+/* Initialize non-HT cpu cores mask */
+int init_real_cpu_mask(struct hfi1_devdata *dd)
+{
+	struct hfi1_affinity *info;
+	int possible, curr_cpu, i, ht;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	cpumask_clear(&info->real_cpu_mask);
+
+	/* Start with cpu online mask as the real cpu mask */
+	cpumask_copy(&info->real_cpu_mask, cpu_online_mask);
+
+	/*
+	 * Remove HT cores from the real cpu mask.  Do this in two steps below.
+	 */
+	possible = cpumask_weight(&info->real_cpu_mask);
+	ht = cpumask_weight(topology_sibling_cpumask(
+					cpumask_first(&info->real_cpu_mask)));
+	/*
+	 * Step 1.  Skip over the first N HT siblings and use them as the
+	 * "real" cores.  Assumes that HT cores are not enumerated in
+	 * succession (except in the single core case).
+	 */
+	curr_cpu = cpumask_first(&info->real_cpu_mask);
+	for (i = 0; i < possible / ht; i++)
+		curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
+	/*
+	 * Step 2.  Remove the remaining HT siblings.  Use cpumask_next() to
+	 * skip any gaps.
+	 */
+	for (; i < possible; i++) {
+		cpumask_clear_cpu(curr_cpu, &info->real_cpu_mask);
+		curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
+	}
+
+	dd->affinity = info;
+	return 0;
+}
+
 /*
 * Interrupt affinity.
 *
@ -93,20 +121,17 @@ static inline void init_cpu_mask_set(struct cpu_mask_set *set)
 * to the node relative 1 as necessary.
 *
 */
-int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
+void hfi1_dev_affinity_init(struct hfi1_devdata *dd)
 {
 	int node = pcibus_to_node(dd->pcidev->bus);
-	struct hfi1_affinity *info;
+	struct hfi1_affinity *info = dd->affinity;
 	const struct cpumask *local_mask;
-	int curr_cpu, possible, i, ht;
+	int curr_cpu, possible, i;

 	if (node < 0)
 		node = numa_node_id();
 	dd->node = node;

-	info = kzalloc(sizeof(*info), GFP_KERNEL);
-	if (!info)
-		return -ENOMEM;
 	spin_lock_init(&info->lock);

 	init_cpu_mask_set(&info->def_intr);
@ -116,30 +141,8 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
 	local_mask = cpumask_of_node(dd->node);
 	if (cpumask_first(local_mask) >= nr_cpu_ids)
 		local_mask = topology_core_cpumask(0);
-	/* use local mask as default */
-	cpumask_copy(&info->def_intr.mask, local_mask);
-	/*
-	 * Remove HT cores from the default mask.  Do this in two steps below.
-	 */
-	possible = cpumask_weight(&info->def_intr.mask);
-	ht = cpumask_weight(topology_sibling_cpumask(
-					cpumask_first(&info->def_intr.mask)));
-	/*
-	 * Step 1.  Skip over the first N HT siblings and use them as the
-	 * "real" cores.  Assumes that HT cores are not enumerated in
-	 * succession (except in the single core case).
-	 */
-	curr_cpu = cpumask_first(&info->def_intr.mask);
-	for (i = 0; i < possible / ht; i++)
-		curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
-	/*
-	 * Step 2.  Remove the remaining HT siblings.  Use cpumask_next() to
-	 * skip any gaps.
-	 */
-	for (; i < possible; i++) {
-		cpumask_clear_cpu(curr_cpu, &info->def_intr.mask);
-		curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
-	}
+	/* Use the "real" cpu mask of this node as the default */
+	cpumask_and(&info->def_intr.mask, &info->real_cpu_mask, local_mask);

 	/*  fill in the receive list */
 	possible = cpumask_weight(&info->def_intr.mask);
@ -167,8 +170,6 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
 	}

 	cpumask_copy(&info->proc.mask, cpu_online_mask);
-	dd->affinity = info;
-	return 0;
 }

 void hfi1_dev_affinity_free(struct hfi1_devdata *dd)
--- a/drivers/staging/rdma/hfi1/affinity.h
+++ b/drivers/staging/rdma/hfi1/affinity.h
@ -64,10 +64,27 @@ enum affinity_flags {
 	AFF_IRQ_LOCAL
 };

+struct cpu_mask_set {
+	struct cpumask mask;
+	struct cpumask used;
+	uint gen;
+};
+
+struct hfi1_affinity {
+	struct cpu_mask_set def_intr;
+	struct cpu_mask_set rcv_intr;
+	struct cpu_mask_set proc;
+	struct cpumask real_cpu_mask;
+	/* spin lock to protect affinity struct */
+	spinlock_t lock;
+};
+
 struct hfi1_msix_entry;

+/* Initialize non-HT cpu cores mask */
+int init_real_cpu_mask(struct hfi1_devdata *);
 /* Initialize driver affinity data */
-int hfi1_dev_affinity_init(struct hfi1_devdata *);
+void hfi1_dev_affinity_init(struct hfi1_devdata *);
 /* Free driver affinity data */
 void hfi1_dev_affinity_free(struct hfi1_devdata *);
 /*
--- a/drivers/staging/rdma/hfi1/chip.c
+++ b/drivers/staging/rdma/hfi1/chip.c
@ -123,6 +123,8 @@ struct flag_table {

 #define MIN_KERNEL_KCTXTS         2
 #define FIRST_KERNEL_KCTXT        1
+/* sizes for both the QP and RSM map tables */
+#define NUM_MAP_ENTRIES		256
 #define NUM_MAP_REGS             32

 /* Bit offset into the GUID which carries HFI id information */
@ -1029,9 +1031,12 @@ static int thermal_init(struct hfi1_devdata *dd);
 static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
 				  int msecs);
 static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
+static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr);
 static void handle_temp_err(struct hfi1_devdata *);
 static void dc_shutdown(struct hfi1_devdata *);
 static void dc_start(struct hfi1_devdata *);
+static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
+			   unsigned int *np);

 /*
 * Error interrupt table entry.  This is used as input to the interrupt
@ -5661,7 +5666,7 @@ static int sc_to_vl(struct hfi1_devdata *dd, int sw_index)
 	sci = &dd->send_contexts[sw_index];

 	/* there is no information for user (PSM) and ack contexts */
-	if (sci->type != SC_KERNEL)
+	if ((sci->type != SC_KERNEL) && (sci->type != SC_VL15))
 		return -1;

 	sc = sci->sc;
@ -6199,18 +6204,13 @@ static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data)

 /*
 * Handle host requests from the 8051.
- *
- * This is a work-queue function outside of the interrupt.
 */
-void handle_8051_request(struct work_struct *work)
+static void handle_8051_request(struct hfi1_pportdata *ppd)
 {
-	struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
-							dc_host_req_work);
 	struct hfi1_devdata *dd = ppd->dd;
 	u64 reg;
 	u16 data = 0;
-	u8 type, i, lanes, *cache = ppd->qsfp_info.cache;
-	u8 cdr_ctrl_byte = cache[QSFP_CDR_CTRL_BYTE_OFFS];
+	u8 type;

 	reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1);
 	if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0)
@ -6231,46 +6231,11 @@ void handle_8051_request(struct work_struct *work)
 	case HREQ_READ_CONFIG:
 	case HREQ_SET_TX_EQ_ABS:
 	case HREQ_SET_TX_EQ_REL:
+	case HREQ_ENABLE:
 		dd_dev_info(dd, "8051 request: request 0x%x not supported\n",
 			    type);
 		hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
 		break;
-
-	case HREQ_ENABLE:
-		lanes = data & 0xF;
-		for (i = 0; lanes; lanes >>= 1, i++) {
-			if (!(lanes & 1))
-				continue;
-			if (data & 0x200) {
-				/* enable TX CDR */
-				if (cache[QSFP_MOD_PWR_OFFS] & 0x8 &&
-				    cache[QSFP_CDR_INFO_OFFS] & 0x80)
-					cdr_ctrl_byte |= (1 << (i + 4));
-			} else {
-				/* disable TX CDR */
-				if (cache[QSFP_MOD_PWR_OFFS] & 0x8 &&
-				    cache[QSFP_CDR_INFO_OFFS] & 0x80)
-					cdr_ctrl_byte &= ~(1 << (i + 4));
-			}
-
-			if (data & 0x800) {
-				/* enable RX CDR */
-				if (cache[QSFP_MOD_PWR_OFFS] & 0x4 &&
-				    cache[QSFP_CDR_INFO_OFFS] & 0x40)
-					cdr_ctrl_byte |= (1 << i);
-			} else {
-				/* disable RX CDR */
-				if (cache[QSFP_MOD_PWR_OFFS] & 0x4 &&
-				    cache[QSFP_CDR_INFO_OFFS] & 0x40)
-					cdr_ctrl_byte &= ~(1 << i);
-			}
-		}
-		one_qsfp_write(ppd, dd->hfi1_id, QSFP_CDR_CTRL_BYTE_OFFS,
-			       &cdr_ctrl_byte, 1);
-		hreq_response(dd, HREQ_SUCCESS, data);
-		refresh_qsfp_cache(ppd, &ppd->qsfp_info);
-		break;
-
 	case HREQ_CONFIG_DONE:
 		hreq_response(dd, HREQ_SUCCESS, 0);
 		break;
@ -6278,7 +6243,6 @@ void handle_8051_request(struct work_struct *work)
 	case HREQ_INTERFACE_TEST:
 		hreq_response(dd, HREQ_SUCCESS, data);
 		break;
-
 	default:
 		dd_dev_err(dd, "8051 request: unknown request 0x%x\n", type);
 		hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
@ -6849,6 +6813,75 @@ static void reset_neighbor_info(struct hfi1_pportdata *ppd)
 	ppd->neighbor_fm_security = 0;
 }

+static const char * const link_down_reason_strs[] = {
+	[OPA_LINKDOWN_REASON_NONE] = "None",
+	[OPA_LINKDOWN_REASON_RCV_ERROR_0] = "Recive error 0",
+	[OPA_LINKDOWN_REASON_BAD_PKT_LEN] = "Bad packet length",
+	[OPA_LINKDOWN_REASON_PKT_TOO_LONG] = "Packet too long",
+	[OPA_LINKDOWN_REASON_PKT_TOO_SHORT] = "Packet too short",
+	[OPA_LINKDOWN_REASON_BAD_SLID] = "Bad SLID",
+	[OPA_LINKDOWN_REASON_BAD_DLID] = "Bad DLID",
+	[OPA_LINKDOWN_REASON_BAD_L2] = "Bad L2",
+	[OPA_LINKDOWN_REASON_BAD_SC] = "Bad SC",
+	[OPA_LINKDOWN_REASON_RCV_ERROR_8] = "Receive error 8",
+	[OPA_LINKDOWN_REASON_BAD_MID_TAIL] = "Bad mid tail",
+	[OPA_LINKDOWN_REASON_RCV_ERROR_10] = "Receive error 10",
+	[OPA_LINKDOWN_REASON_PREEMPT_ERROR] = "Preempt error",
+	[OPA_LINKDOWN_REASON_PREEMPT_VL15] = "Preempt vl15",
+	[OPA_LINKDOWN_REASON_BAD_VL_MARKER] = "Bad VL marker",
+	[OPA_LINKDOWN_REASON_RCV_ERROR_14] = "Receive error 14",
+	[OPA_LINKDOWN_REASON_RCV_ERROR_15] = "Receive error 15",
+	[OPA_LINKDOWN_REASON_BAD_HEAD_DIST] = "Bad head distance",
+	[OPA_LINKDOWN_REASON_BAD_TAIL_DIST] = "Bad tail distance",
+	[OPA_LINKDOWN_REASON_BAD_CTRL_DIST] = "Bad control distance",
+	[OPA_LINKDOWN_REASON_BAD_CREDIT_ACK] = "Bad credit ack",
+	[OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER] = "Unsupported VL marker",
+	[OPA_LINKDOWN_REASON_BAD_PREEMPT] = "Bad preempt",
+	[OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT] = "Bad control flit",
+	[OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT] = "Exceed multicast limit",
+	[OPA_LINKDOWN_REASON_RCV_ERROR_24] = "Receive error 24",
+	[OPA_LINKDOWN_REASON_RCV_ERROR_25] = "Receive error 25",
+	[OPA_LINKDOWN_REASON_RCV_ERROR_26] = "Receive error 26",
+	[OPA_LINKDOWN_REASON_RCV_ERROR_27] = "Receive error 27",
+	[OPA_LINKDOWN_REASON_RCV_ERROR_28] = "Receive error 28",
+	[OPA_LINKDOWN_REASON_RCV_ERROR_29] = "Receive error 29",
+	[OPA_LINKDOWN_REASON_RCV_ERROR_30] = "Receive error 30",
+	[OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN] =
+					"Excessive buffer overrun",
+	[OPA_LINKDOWN_REASON_UNKNOWN] = "Unknown",
+	[OPA_LINKDOWN_REASON_REBOOT] = "Reboot",
+	[OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN] = "Neighbor unknown",
+	[OPA_LINKDOWN_REASON_FM_BOUNCE] = "FM bounce",
+	[OPA_LINKDOWN_REASON_SPEED_POLICY] = "Speed policy",
+	[OPA_LINKDOWN_REASON_WIDTH_POLICY] = "Width policy",
+	[OPA_LINKDOWN_REASON_DISCONNECTED] = "Disconnected",
+	[OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED] =
+					"Local media not installed",
+	[OPA_LINKDOWN_REASON_NOT_INSTALLED] = "Not installed",
+	[OPA_LINKDOWN_REASON_CHASSIS_CONFIG] = "Chassis config",
+	[OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED] =
+					"End to end not installed",
+	[OPA_LINKDOWN_REASON_POWER_POLICY] = "Power policy",
+	[OPA_LINKDOWN_REASON_LINKSPEED_POLICY] = "Link speed policy",
+	[OPA_LINKDOWN_REASON_LINKWIDTH_POLICY] = "Link width policy",
+	[OPA_LINKDOWN_REASON_SWITCH_MGMT] = "Switch management",
+	[OPA_LINKDOWN_REASON_SMA_DISABLED] = "SMA disabled",
+	[OPA_LINKDOWN_REASON_TRANSIENT] = "Transient"
+};
+
+/* return the neighbor link down reason string */
+static const char *link_down_reason_str(u8 reason)
+{
+	const char *str = NULL;
+
+	if (reason < ARRAY_SIZE(link_down_reason_strs))
+		str = link_down_reason_strs[reason];
+	if (!str)
+		str = "(invalid)";
+
+	return str;
+}
+
 /*
 * Handle a link down interrupt from the 8051.
 *
@ -6857,8 +6890,11 @@ static void reset_neighbor_info(struct hfi1_pportdata *ppd)
 void handle_link_down(struct work_struct *work)
 {
 	u8 lcl_reason, neigh_reason = 0;
+	u8 link_down_reason;
 	struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
-								link_down_work);
+						  link_down_work);
+	int was_up;
+	static const char ldr_str[] = "Link down reason: ";

 	if ((ppd->host_link_state &
 	     (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) &&
@ -6867,20 +6903,63 @@ void handle_link_down(struct work_struct *work)
 			HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NOT_INSTALLED);

 	/* Go offline first, then deal with reading/writing through 8051 */
+	was_up = !!(ppd->host_link_state & HLS_UP);
 	set_link_state(ppd, HLS_DN_OFFLINE);

-	lcl_reason = 0;
-	read_planned_down_reason_code(ppd->dd, &neigh_reason);
+	if (was_up) {
+		lcl_reason = 0;
+		/* link down reason is only valid if the link was up */
+		read_link_down_reason(ppd->dd, &link_down_reason);
+		switch (link_down_reason) {
+		case LDR_LINK_TRANSFER_ACTIVE_LOW:
+			/* the link went down, no idle message reason */
+			dd_dev_info(ppd->dd, "%sUnexpected link down\n",
+				    ldr_str);
+			break;
+		case LDR_RECEIVED_LINKDOWN_IDLE_MSG:
+			/*
+			 * The neighbor reason is only valid if an idle message
+			 * was received for it.
+			 */
+			read_planned_down_reason_code(ppd->dd, &neigh_reason);
+			dd_dev_info(ppd->dd,
+				    "%sNeighbor link down message %d, %s\n",
+				    ldr_str, neigh_reason,
+				    link_down_reason_str(neigh_reason));
+			break;
+		case LDR_RECEIVED_HOST_OFFLINE_REQ:
+			dd_dev_info(ppd->dd,
+				    "%sHost requested link to go offline\n",
+				    ldr_str);
+			break;
+		default:
+			dd_dev_info(ppd->dd, "%sUnknown reason 0x%x\n",
+				    ldr_str, link_down_reason);
+			break;
+		}

-	/*
-	 * If no reason, assume peer-initiated but missed
-	 * LinkGoingDown idle flits.
-	 */
-	if (neigh_reason == 0)
-		lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN;
+		/*
+		 * If no reason, assume peer-initiated but missed
+		 * LinkGoingDown idle flits.
+		 */
+		if (neigh_reason == 0)
+			lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN;
+	} else {
+		/* went down while polling or going up */
+		lcl_reason = OPA_LINKDOWN_REASON_TRANSIENT;
+	}

 	set_link_down_reason(ppd, lcl_reason, neigh_reason, 0);

+	/* inform the SMA when the link transitions from up to down */
+	if (was_up && ppd->local_link_down_reason.sma == 0 &&
+	    ppd->neigh_link_down_reason.sma == 0) {
+		ppd->local_link_down_reason.sma =
+					ppd->local_link_down_reason.latest;
+		ppd->neigh_link_down_reason.sma =
+					ppd->neigh_link_down_reason.latest;
+	}
+
 	reset_neighbor_info(ppd);

 	/* disable the port */
@ -6890,7 +6969,7 @@ void handle_link_down(struct work_struct *work)
 	 * If there is no cable attached, turn the DC off. Otherwise,
 	 * start the link bring up.
 	 */
-	if (!qsfp_mod_present(ppd)) {
+	if (ppd->port_type == PORT_TYPE_QSFP && !qsfp_mod_present(ppd)) {
 		dc_shutdown(ppd->dd);
 	} else {
 		tune_serdes(ppd);
@ -7373,7 +7452,11 @@ void apply_link_downgrade_policy(struct hfi1_pportdata *ppd, int refresh_widths)
 		ppd->link_width_downgrade_rx_active = rx;
 	}

-	if (lwde == 0) {
+	if (ppd->link_width_downgrade_tx_active == 0 ||
+	    ppd->link_width_downgrade_rx_active == 0) {
+		/* the 8051 reported a dead link as a downgrade */
+		dd_dev_err(ppd->dd, "Link downgrade is really a link down, ignoring\n");
+	} else if (lwde == 0) {
 		/* downgrade is disabled */

 		/* bounce if not at starting active width */
@ -7534,7 +7617,7 @@ static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg)
 			host_msg &= ~(u64)LINKUP_ACHIEVED;
 		}
 		if (host_msg & EXT_DEVICE_CFG_REQ) {
-			queue_work(ppd->hfi1_wq, &ppd->dc_host_req_work);
+			handle_8051_request(ppd);
 			host_msg &= ~(u64)EXT_DEVICE_CFG_REQ;
 		}
 		if (host_msg & VERIFY_CAP_FRAME) {
@ -8660,6 +8743,14 @@ static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc)
 	*pdrrc = (frame >> DOWN_REMOTE_REASON_SHIFT) & DOWN_REMOTE_REASON_MASK;
 }

+static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr)
+{
+	u32 frame;
+
+	read_8051_config(dd, LINK_DOWN_REASON, GENERAL_CONFIG, &frame);
+	*ldr = (frame & 0xff);
+}
+
 static int read_tx_settings(struct hfi1_devdata *dd,
 			    u8 *enable_lane_tx,
 			    u8 *tx_polarity_inversion,
@ -9049,9 +9140,9 @@ static int set_local_link_attributes(struct hfi1_pportdata *ppd)
 }

 /*
- * Call this to start the link.  Schedule a retry if the cable is not
- * present or if unable to start polling.  Do not do anything if the
- * link is disabled.  Returns 0 if link is disabled or moved to polling
+ * Call this to start the link.
+ * Do not do anything if the link is disabled.
+ * Returns 0 if link is disabled, moved to polling, or the driver is not ready.
 */
 int start_link(struct hfi1_pportdata *ppd)
 {
@ -9068,15 +9159,7 @@ int start_link(struct hfi1_pportdata *ppd)
 		return 0;
 	}

-	if (qsfp_mod_present(ppd) || loopback == LOOPBACK_SERDES ||
-	    loopback == LOOPBACK_LCB ||
-	    ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
-		return set_link_state(ppd, HLS_DN_POLL);
-
-	dd_dev_info(ppd->dd,
-		    "%s: stopping link start because no cable is present\n",
-		    __func__);
-	return -EAGAIN;
+	return set_link_state(ppd, HLS_DN_POLL);
 }

 static void wait_for_qsfp_init(struct hfi1_pportdata *ppd)
@ -9247,7 +9330,7 @@ static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd,
 	return 0;
 }

-/* This routine will only be scheduled if the QSFP module is present */
+/* This routine will only be scheduled if the QSFP module present is asserted */
 void qsfp_event(struct work_struct *work)
 {
 	struct qsfp_data *qd;
@ -9676,6 +9759,7 @@ static void set_send_length(struct hfi1_pportdata *ppd)
 			      & SEND_LEN_CHECK1_LEN_VL15_MASK) <<
 		SEND_LEN_CHECK1_LEN_VL15_SHIFT;
 	int i;
+	u32 thres;

 	for (i = 0; i < ppd->vls_supported; i++) {
 		if (dd->vld[i].mtu > maxvlmtu)
@ -9694,16 +9778,17 @@ static void set_send_length(struct hfi1_pportdata *ppd)
 	/* adjust kernel credit return thresholds based on new MTUs */
 	/* all kernel receive contexts have the same hdrqentsize */
 	for (i = 0; i < ppd->vls_supported; i++) {
-		sc_set_cr_threshold(dd->vld[i].sc,
-				    sc_mtu_to_threshold(dd->vld[i].sc,
-							dd->vld[i].mtu,
-							dd->rcd[0]->
-							rcvhdrqentsize));
-	}
-	sc_set_cr_threshold(dd->vld[15].sc,
-			    sc_mtu_to_threshold(dd->vld[15].sc,
-						dd->vld[15].mtu,
+		thres = min(sc_percent_to_threshold(dd->vld[i].sc, 50),
+			    sc_mtu_to_threshold(dd->vld[i].sc,
+						dd->vld[i].mtu,
 						dd->rcd[0]->rcvhdrqentsize));
+		sc_set_cr_threshold(dd->vld[i].sc, thres);
+	}
+	thres = min(sc_percent_to_threshold(dd->vld[15].sc, 50),
+		    sc_mtu_to_threshold(dd->vld[15].sc,
+					dd->vld[15].mtu,
+					dd->rcd[0]->rcvhdrqentsize));
+	sc_set_cr_threshold(dd->vld[15].sc, thres);

 	/* Adjust maximum MTU for the port in DC */
 	dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 :
@ -10030,7 +10115,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
 	struct hfi1_devdata *dd = ppd->dd;
 	struct ib_event event = {.device = NULL};
 	int ret1, ret = 0;
-	int was_up, is_down;
 	int orig_new_state, poll_bounce;

 	mutex_lock(&ppd->hls_lock);
@ -10049,8 +10133,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
 		    poll_bounce ? "(bounce) " : "",
 		    link_state_reason_name(ppd, state));

-	was_up = !!(ppd->host_link_state & HLS_UP);
-
 	/*
 	 * If we're going to a (HLS_*) link state that implies the logical
 	 * link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then
@ -10261,17 +10343,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
 		break;
 	}

-	is_down = !!(ppd->host_link_state & (HLS_DN_POLL |
-			HLS_DN_DISABLE | HLS_DN_OFFLINE));
-
-	if (was_up && is_down && ppd->local_link_down_reason.sma == 0 &&
-	    ppd->neigh_link_down_reason.sma == 0) {
-		ppd->local_link_down_reason.sma =
-		  ppd->local_link_down_reason.latest;
-		ppd->neigh_link_down_reason.sma =
-		  ppd->neigh_link_down_reason.latest;
-	}
-
 	goto done;

 unexpected:
@ -12673,22 +12744,24 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
 	int total_contexts;
 	int ret;
 	unsigned ngroups;
+	int qos_rmt_count;
+	int user_rmt_reduced;

 	/*
-	 * Kernel contexts: (to be fixed later):
-	 * - min or 2 or 1 context/numa
+	 * Kernel receive contexts:
+	 * - min of 2 or 1 context/numa (excluding control context)
 	 * - Context 0 - control context (VL15/multicast/error)
-	 * - Context 1 - default context
+	 * - Context 1 - first kernel context
+	 * - Context 2 - second kernel context
+	 * ...
 	 */
 	if (n_krcvqs)
 		/*
-		 * Don't count context 0 in n_krcvqs since
-		 * is isn't used for normal verbs traffic.
-		 *
-		 * krcvqs will reflect number of kernel
-		 * receive contexts above 0.
+		 * n_krcvqs is the sum of module parameter kernel receive
+		 * contexts, krcvqs[].  It does not include the control
+		 * context, so add that.
 		 */
-		num_kernel_contexts = n_krcvqs + MIN_KERNEL_KCTXTS - 1;
+		num_kernel_contexts = n_krcvqs + 1;
 	else
 		num_kernel_contexts = num_online_nodes() + 1;
 	num_kernel_contexts =
@ -12705,12 +12778,13 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
 		num_kernel_contexts = dd->chip_send_contexts - num_vls - 1;
 	}
 	/*
-	 * User contexts: (to be fixed later)
-	 *	- default to 1 user context per CPU if num_user_contexts is
-	 *	  negative
+	 * User contexts:
+	 *	- default to 1 user context per real (non-HT) CPU core if
+	 *	  num_user_contexts is negative
 	 */
 	if (num_user_contexts < 0)
-		num_user_contexts = num_online_cpus();
+		num_user_contexts =
+			cpumask_weight(&dd->affinity->real_cpu_mask);

 	total_contexts = num_kernel_contexts + num_user_contexts;

@ -12727,6 +12801,19 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
 		total_contexts = num_kernel_contexts + num_user_contexts;
 	}

+	/* each user context requires an entry in the RMT */
+	qos_rmt_count = qos_rmt_entries(dd, NULL, NULL);
+	if (qos_rmt_count + num_user_contexts > NUM_MAP_ENTRIES) {
+		user_rmt_reduced = NUM_MAP_ENTRIES - qos_rmt_count;
+		dd_dev_err(dd,
+			   "RMT size is reducing the number of user receive contexts from %d to %d\n",
+			   (int)num_user_contexts,
+			   user_rmt_reduced);
+		/* recalculate */
+		num_user_contexts = user_rmt_reduced;
+		total_contexts = num_kernel_contexts + num_user_contexts;
+	}
+
 	/* the first N are kernel contexts, the rest are user contexts */
 	dd->num_rcv_contexts = total_contexts;
 	dd->n_krcv_queues = num_kernel_contexts;
@ -12776,12 +12863,13 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
 		dd->num_send_contexts = ret;
 		dd_dev_info(
 			dd,
-			"send contexts: chip %d, used %d (kernel %d, ack %d, user %d)\n",
+			"send contexts: chip %d, used %d (kernel %d, ack %d, user %d, vl15 %d)\n",
 			dd->chip_send_contexts,
 			dd->num_send_contexts,
 			dd->sc_sizes[SC_KERNEL].count,
 			dd->sc_sizes[SC_ACK].count,
-			dd->sc_sizes[SC_USER].count);
+			dd->sc_sizes[SC_USER].count,
+			dd->sc_sizes[SC_VL15].count);
 		ret = 0;	/* success */
 	}

@ -13451,122 +13539,224 @@ static void init_qpmap_table(struct hfi1_devdata *dd,
 	int i;
 	u64 ctxt = first_ctxt;

-	for (i = 0; i < 256;) {
+	for (i = 0; i < 256; i++) {
 		reg |= ctxt << (8 * (i % 8));
-		i++;
 		ctxt++;
 		if (ctxt > last_ctxt)
 			ctxt = first_ctxt;
-		if (i % 8 == 0) {
+		if (i % 8 == 7) {
 			write_csr(dd, regno, reg);
 			reg = 0;
 			regno += 8;
 		}
 	}
-	if (i % 8)
-		write_csr(dd, regno, reg);

 	add_rcvctrl(dd, RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK
 			| RCV_CTRL_RCV_BYPASS_ENABLE_SMASK);
 }

-/**
- * init_qos - init RX qos
- * @dd - device data
- * @first_context
- *
- * This routine initializes Rule 0 and the
- * RSM map table to implement qos.
- *
- * If all of the limit tests succeed,
- * qos is applied based on the array
- * interpretation of krcvqs where
- * entry 0 is VL0.
- *
- * The number of vl bits (n) and the number of qpn
- * bits (m) are computed to feed both the RSM map table
- * and the single rule.
- *
- */
-static void init_qos(struct hfi1_devdata *dd, u32 first_ctxt)
-{
-	u8 max_by_vl = 0;
-	unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m;
-	u64 *rsmmap;
-	u64 reg;
-	u8  rxcontext = is_ax(dd) ? 0 : 0xff;  /* 0 is default if a0 ver. */
+struct rsm_map_table {
+	u64 map[NUM_MAP_REGS];
+	unsigned int used;
+};

-	/* validate */
+struct rsm_rule_data {
+	u8 offset;
+	u8 pkt_type;
+	u32 field1_off;
+	u32 field2_off;
+	u32 index1_off;
+	u32 index1_width;
+	u32 index2_off;
+	u32 index2_width;
+	u32 mask1;
+	u32 value1;
+	u32 mask2;
+	u32 value2;
+};
+
+/*
+ * Return an initialized RMT map table for users to fill in.  OK if it
+ * returns NULL, indicating no table.
+ */
+static struct rsm_map_table *alloc_rsm_map_table(struct hfi1_devdata *dd)
+{
+	struct rsm_map_table *rmt;
+	u8 rxcontext = is_ax(dd) ? 0 : 0xff;  /* 0 is default if a0 ver. */
+
+	rmt = kmalloc(sizeof(*rmt), GFP_KERNEL);
+	if (rmt) {
+		memset(rmt->map, rxcontext, sizeof(rmt->map));
+		rmt->used = 0;
+	}
+
+	return rmt;
+}
+
+/*
+ * Write the final RMT map table to the chip and free the table.  OK if
+ * table is NULL.
+ */
+static void complete_rsm_map_table(struct hfi1_devdata *dd,
+				   struct rsm_map_table *rmt)
+{
+	int i;
+
+	if (rmt) {
+		/* write table to chip */
+		for (i = 0; i < NUM_MAP_REGS; i++)
+			write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rmt->map[i]);
+
+		/* enable RSM */
+		add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
+	}
+}
+
+/*
+ * Add a receive side mapping rule.
+ */
+static void add_rsm_rule(struct hfi1_devdata *dd, u8 rule_index,
+			 struct rsm_rule_data *rrd)
+{
+	write_csr(dd, RCV_RSM_CFG + (8 * rule_index),
+		  (u64)rrd->offset << RCV_RSM_CFG_OFFSET_SHIFT |
+		  1ull << rule_index | /* enable bit */
+		  (u64)rrd->pkt_type << RCV_RSM_CFG_PACKET_TYPE_SHIFT);
+	write_csr(dd, RCV_RSM_SELECT + (8 * rule_index),
+		  (u64)rrd->field1_off << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT |
+		  (u64)rrd->field2_off << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT |
+		  (u64)rrd->index1_off << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT |
+		  (u64)rrd->index1_width << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT |
+		  (u64)rrd->index2_off << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT |
+		  (u64)rrd->index2_width << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT);
+	write_csr(dd, RCV_RSM_MATCH + (8 * rule_index),
+		  (u64)rrd->mask1 << RCV_RSM_MATCH_MASK1_SHIFT |
+		  (u64)rrd->value1 << RCV_RSM_MATCH_VALUE1_SHIFT |
+		  (u64)rrd->mask2 << RCV_RSM_MATCH_MASK2_SHIFT |
+		  (u64)rrd->value2 << RCV_RSM_MATCH_VALUE2_SHIFT);
+}
+
+/* return the number of RSM map table entries that will be used for QOS */
+static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
+			   unsigned int *np)
+{
+	int i;
+	unsigned int m, n;
+	u8 max_by_vl = 0;
+
+	/* is QOS active at all? */
 	if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS ||
 	    num_vls == 1 ||
 	    krcvqsset <= 1)
-		goto bail;
-	for (i = 0; i < min_t(unsigned, num_vls, krcvqsset); i++)
+		goto no_qos;
+
+	/* determine bits for qpn */
+	for (i = 0; i < min_t(unsigned int, num_vls, krcvqsset); i++)
 		if (krcvqs[i] > max_by_vl)
 			max_by_vl = krcvqs[i];
 	if (max_by_vl > 32)
-		goto bail;
-	qpns_per_vl = __roundup_pow_of_two(max_by_vl);
-	/* determine bits vl */
-	n = ilog2(num_vls);
-	/* determine bits for qpn */
-	m = ilog2(qpns_per_vl);
+		goto no_qos;
+	m = ilog2(__roundup_pow_of_two(max_by_vl));
+
+	/* determine bits for vl */
+	n = ilog2(__roundup_pow_of_two(num_vls));
+
+	/* reject if too much is used */
 	if ((m + n) > 7)
+		goto no_qos;
+
+	if (mp)
+		*mp = m;
+	if (np)
+		*np = n;
+
+	return 1 << (m + n);
+
+no_qos:
+	if (mp)
+		*mp = 0;
+	if (np)
+		*np = 0;
+	return 0;
+}
+
+/**
+ * init_qos - init RX qos
+ * @dd - device data
+ * @rmt - RSM map table
+ *
+ * This routine initializes Rule 0 and the RSM map table to implement
+ * quality of service (qos).
+ *
+ * If all of the limit tests succeed, qos is applied based on the array
+ * interpretation of krcvqs where entry 0 is VL0.
+ *
+ * The number of vl bits (n) and the number of qpn bits (m) are computed to
+ * feed both the RSM map table and the single rule.
+ */
+static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt)
+{
+	struct rsm_rule_data rrd;
+	unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m;
+	unsigned int rmt_entries;
+	u64 reg;
+
+	if (!rmt)
 		goto bail;
-	if (num_vls * qpns_per_vl > dd->chip_rcv_contexts)
+	rmt_entries = qos_rmt_entries(dd, &m, &n);
+	if (rmt_entries == 0)
 		goto bail;
-	rsmmap = kmalloc_array(NUM_MAP_REGS, sizeof(u64), GFP_KERNEL);
-	if (!rsmmap)
+	qpns_per_vl = 1 << m;
+
+	/* enough room in the map table? */
+	rmt_entries = 1 << (m + n);
+	if (rmt->used + rmt_entries >= NUM_MAP_ENTRIES)
 		goto bail;
-	memset(rsmmap, rxcontext, NUM_MAP_REGS * sizeof(u64));
-	/* init the local copy of the table */
-	for (i = 0, ctxt = first_ctxt; i < num_vls; i++) {
+
+	/* add qos entries to the the RSM map table */
+	for (i = 0, ctxt = FIRST_KERNEL_KCTXT; i < num_vls; i++) {
 		unsigned tctxt;

 		for (qpn = 0, tctxt = ctxt;
 		     krcvqs[i] && qpn < qpns_per_vl; qpn++) {
 			unsigned idx, regoff, regidx;

-			/* generate index <= 128 */
-			idx = (qpn << n) ^ i;
+			/* generate the index the hardware will produce */
+			idx = rmt->used + ((qpn << n) ^ i);
 			regoff = (idx % 8) * 8;
 			regidx = idx / 8;
-			reg = rsmmap[regidx];
-			/* replace 0xff with context number */
+			/* replace default with context number */
+			reg = rmt->map[regidx];
 			reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK
 				<< regoff);
 			reg |= (u64)(tctxt++) << regoff;
-			rsmmap[regidx] = reg;
+			rmt->map[regidx] = reg;
 			if (tctxt == ctxt + krcvqs[i])
 				tctxt = ctxt;
 		}
 		ctxt += krcvqs[i];
 	}
-	/* flush cached copies to chip */
-	for (i = 0; i < NUM_MAP_REGS; i++)
-		write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rsmmap[i]);
-	/* add rule0 */
-	write_csr(dd, RCV_RSM_CFG /* + (8 * 0) */,
-		  RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK <<
-		  RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT |
-		  2ull << RCV_RSM_CFG_PACKET_TYPE_SHIFT);
-	write_csr(dd, RCV_RSM_SELECT /* + (8 * 0) */,
-		  LRH_BTH_MATCH_OFFSET << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT |
-		  LRH_SC_MATCH_OFFSET << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT |
-		  LRH_SC_SELECT_OFFSET << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT |
-		  ((u64)n) << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT |
-		  QPN_SELECT_OFFSET << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT |
-		  ((u64)m + (u64)n) << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT);
-	write_csr(dd, RCV_RSM_MATCH /* + (8 * 0) */,
-		  LRH_BTH_MASK << RCV_RSM_MATCH_MASK1_SHIFT |
-		  LRH_BTH_VALUE << RCV_RSM_MATCH_VALUE1_SHIFT |
-		  LRH_SC_MASK << RCV_RSM_MATCH_MASK2_SHIFT |
-		  LRH_SC_VALUE << RCV_RSM_MATCH_VALUE2_SHIFT);
-	/* Enable RSM */
-	add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
-	kfree(rsmmap);
-	/* map everything else to first context */
-	init_qpmap_table(dd, FIRST_KERNEL_KCTXT, MIN_KERNEL_KCTXTS - 1);
+
+	rrd.offset = rmt->used;
+	rrd.pkt_type = 2;
+	rrd.field1_off = LRH_BTH_MATCH_OFFSET;
+	rrd.field2_off = LRH_SC_MATCH_OFFSET;
+	rrd.index1_off = LRH_SC_SELECT_OFFSET;
+	rrd.index1_width = n;
+	rrd.index2_off = QPN_SELECT_OFFSET;
+	rrd.index2_width = m + n;
+	rrd.mask1 = LRH_BTH_MASK;
+	rrd.value1 = LRH_BTH_VALUE;
+	rrd.mask2 = LRH_SC_MASK;
+	rrd.value2 = LRH_SC_VALUE;
+
+	/* add rule 0 */
+	add_rsm_rule(dd, 0, &rrd);
+
+	/* mark RSM map entries as used */
+	rmt->used += rmt_entries;
+	/* map everything else to the mcast/err/vl15 context */
+	init_qpmap_table(dd, HFI1_CTRL_CTXT, HFI1_CTRL_CTXT);
 	dd->qos_shift = n + 1;
 	return;
 bail:
@ -13574,13 +13764,86 @@ static void init_qos(struct hfi1_devdata *dd, u32 first_ctxt)
 	init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1);
 }

+static void init_user_fecn_handling(struct hfi1_devdata *dd,
+				    struct rsm_map_table *rmt)
+{
+	struct rsm_rule_data rrd;
+	u64 reg;
+	int i, idx, regoff, regidx;
+	u8 offset;
+
+	/* there needs to be enough room in the map table */
+	if (rmt->used + dd->num_user_contexts >= NUM_MAP_ENTRIES) {
+		dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n");
+		return;
+	}
+
+	/*
+	 * RSM will extract the destination context as an index into the
+	 * map table.  The destination contexts are a sequential block
+	 * in the range first_user_ctxt...num_rcv_contexts-1 (inclusive).
+	 * Map entries are accessed as offset + extracted value.  Adjust
+	 * the added offset so this sequence can be placed anywhere in
+	 * the table - as long as the entries themselves do not wrap.
+	 * There are only enough bits in offset for the table size, so
+	 * start with that to allow for a "negative" offset.
+	 */
+	offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used -
+						(int)dd->first_user_ctxt);
+
+	for (i = dd->first_user_ctxt, idx = rmt->used;
+				i < dd->num_rcv_contexts; i++, idx++) {
+		/* replace with identity mapping */
+		regoff = (idx % 8) * 8;
+		regidx = idx / 8;
+		reg = rmt->map[regidx];
+		reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK << regoff);
+		reg |= (u64)i << regoff;
+		rmt->map[regidx] = reg;
+	}
+
+	/*
+	 * For RSM intercept of Expected FECN packets:
+	 * o packet type 0 - expected
+	 * o match on F (bit 95), using select/match 1, and
+	 * o match on SH (bit 133), using select/match 2.
+	 *
+	 * Use index 1 to extract the 8-bit receive context from DestQP
+	 * (start at bit 64).  Use that as the RSM map table index.
+	 */
+	rrd.offset = offset;
+	rrd.pkt_type = 0;
+	rrd.field1_off = 95;
+	rrd.field2_off = 133;
+	rrd.index1_off = 64;
+	rrd.index1_width = 8;
+	rrd.index2_off = 0;
+	rrd.index2_width = 0;
+	rrd.mask1 = 1;
+	rrd.value1 = 1;
+	rrd.mask2 = 1;
+	rrd.value2 = 1;
+
+	/* add rule 1 */
+	add_rsm_rule(dd, 1, &rrd);
+
+	rmt->used += dd->num_user_contexts;
+}
+
 static void init_rxe(struct hfi1_devdata *dd)
 {
+	struct rsm_map_table *rmt;
+
 	/* enable all receive errors */
 	write_csr(dd, RCV_ERR_MASK, ~0ull);
-	/* setup QPN map table - start where VL15 context leaves off */
-	init_qos(dd, dd->n_krcv_queues > MIN_KERNEL_KCTXTS ?
-		 MIN_KERNEL_KCTXTS : 0);
+
+	rmt = alloc_rsm_map_table(dd);
+	/* set up QOS, including the QPN map table */
+	init_qos(dd, rmt);
+	init_user_fecn_handling(dd, rmt);
+	complete_rsm_map_table(dd, rmt);
+	kfree(rmt);
+
 	/*
 	 * make sure RcvCtrl.RcvWcb <= PCIe Device Control
 	 * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config
@ -13762,6 +14025,7 @@ int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey)
 	write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg);
 	reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
 	reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
+	reg &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK;
 	write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
 done:
 	return ret;
@ -14148,6 +14412,19 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
 		 (dd->revision >> CCE_REVISION_SW_SHIFT)
 		    & CCE_REVISION_SW_MASK);

+	/*
+	 * The real cpu mask is part of the affinity struct but has to be
+	 * initialized earlier than the rest of the affinity struct because it
+	 * is needed to calculate the number of user contexts in
+	 * set_up_context_variables(). However, hfi1_dev_affinity_init(),
+	 * which initializes the rest of the affinity struct members,
+	 * depends on set_up_context_variables() for the number of kernel
+	 * contexts, so it cannot be called before set_up_context_variables().
+	 */
+	ret = init_real_cpu_mask(dd);
+	if (ret)
+		goto bail_cleanup;
+
 	ret = set_up_context_variables(dd);
 	if (ret)
 		goto bail_cleanup;
@ -14161,9 +14438,7 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
 	/* set up KDETH QP prefix in both RX and TX CSRs */
 	init_kdeth_qp(dd);

-	ret = hfi1_dev_affinity_init(dd);
-	if (ret)
-		goto bail_cleanup;
+	hfi1_dev_affinity_init(dd);

 	/* send contexts must be set up before receive contexts */
 	ret = init_send_contexts(dd);
--- a/drivers/staging/rdma/hfi1/chip.h
+++ b/drivers/staging/rdma/hfi1/chip.h
@ -389,6 +389,7 @@
 #define LAST_REMOTE_STATE_COMPLETE   0x13
 #define LINK_QUALITY_INFO            0x14
 #define REMOTE_DEVICE_ID	     0x15
+#define LINK_DOWN_REASON	     0x16

 /* 8051 lane specific register field IDs */
 #define TX_EQ_SETTINGS		0x00
@ -497,6 +498,11 @@
 #define PWRM_BER_CONTROL	0x1
 #define PWRM_BANDWIDTH_CONTROL	0x2

+/* 8051 link down reasons */
+#define LDR_LINK_TRANSFER_ACTIVE_LOW   0xa
+#define LDR_RECEIVED_LINKDOWN_IDLE_MSG 0xb
+#define LDR_RECEIVED_HOST_OFFLINE_REQ  0xc
+
 /* verify capability fabric CRC size bits */
 enum {
 	CAP_CRC_14B = (1 << 0), /* 14b CRC */
@ -691,7 +697,6 @@ void handle_verify_cap(struct work_struct *work);
 void handle_freeze(struct work_struct *work);
 void handle_link_up(struct work_struct *work);
 void handle_link_down(struct work_struct *work);
-void handle_8051_request(struct work_struct *work);
 void handle_link_downgrade(struct work_struct *work);
 void handle_link_bounce(struct work_struct *work);
 void handle_sma_message(struct work_struct *work);
--- a/drivers/staging/rdma/hfi1/chip_registers.h
+++ b/drivers/staging/rdma/hfi1/chip_registers.h
@ -771,6 +771,7 @@
 #define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK 0x1ull
 #define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT 0
 #define RCV_RSM_CFG_PACKET_TYPE_SHIFT 60
+#define RCV_RSM_CFG_OFFSET_SHIFT 32
 #define RCV_RSM_MAP_TABLE (RXE + 0x000000000900)
 #define RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK 0xFFull
 #define RCV_RSM_MATCH (RXE + 0x000000000800)
--- a/drivers/staging/rdma/hfi1/diag.c
+++ b/drivers/staging/rdma/hfi1/diag.c
@ -413,7 +413,8 @@ static ssize_t diagpkt_send(struct diag_pkt *dp)
 		goto bail;
 	}
 	/* can only use kernel contexts */
-	if (dd->send_contexts[dp->sw_index].type != SC_KERNEL) {
+	if (dd->send_contexts[dp->sw_index].type != SC_KERNEL &&
+	    dd->send_contexts[dp->sw_index].type != SC_VL15) {
 		ret = -EINVAL;
 		goto bail;
 	}
--- a/drivers/staging/rdma/hfi1/driver.c
+++ b/drivers/staging/rdma/hfi1/driver.c
@ -75,7 +75,8 @@ DEFINE_MUTEX(hfi1_mutex);	/* general driver use */

 unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
 module_param_named(max_mtu, hfi1_max_mtu, uint, S_IRUGO);
-MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is 8192");
+MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is " __stringify(
+		 HFI1_DEFAULT_MAX_MTU));

 unsigned int hfi1_cu = 1;
 module_param_named(cu, hfi1_cu, uint, S_IRUGO);
--- a/drivers/staging/rdma/hfi1/firmware.c
+++ b/drivers/staging/rdma/hfi1/firmware.c
@ -1413,8 +1413,15 @@ static int __acquire_chip_resource(struct hfi1_devdata *dd, u32 resource)

 	if (resource & CR_DYN_MASK) {
 		/* a dynamic resource is in use if either HFI has set the bit */
-		all_bits = resource_mask(0, resource) |
+		if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0 &&
+		    (resource & (CR_I2C1 | CR_I2C2))) {
+			/* discrete devices must serialize across both chains */
+			all_bits = resource_mask(0, CR_I2C1 | CR_I2C2) |
+					resource_mask(1, CR_I2C1 | CR_I2C2);
+		} else {
+			all_bits = resource_mask(0, resource) |
 						resource_mask(1, resource);
+		}
 		my_bit = resource_mask(dd->hfi1_id, resource);
 	} else {
 		/* non-dynamic resources are not split between HFIs */
--- a/drivers/staging/rdma/hfi1/hfi.h
+++ b/drivers/staging/rdma/hfi1/hfi.h
@ -455,9 +455,9 @@ struct rvt_sge_state;
 #define HLS_UP (HLS_UP_INIT | HLS_UP_ARMED | HLS_UP_ACTIVE)

 /* use this MTU size if none other is given */
-#define HFI1_DEFAULT_ACTIVE_MTU 8192
+#define HFI1_DEFAULT_ACTIVE_MTU 10240
 /* use this MTU size as the default maximum */
-#define HFI1_DEFAULT_MAX_MTU 8192
+#define HFI1_DEFAULT_MAX_MTU 10240
 /* default partition key */
 #define DEFAULT_PKEY 0xffff

@ -606,7 +606,6 @@ struct hfi1_pportdata {
 	struct work_struct link_vc_work;
 	struct work_struct link_up_work;
 	struct work_struct link_down_work;
-	struct work_struct dc_host_req_work;
 	struct work_struct sma_message_work;
 	struct work_struct freeze_work;
 	struct work_struct link_downgrade_work;
@ -1258,7 +1257,7 @@ void receive_interrupt_work(struct work_struct *work);
 static inline int hdr2sc(struct hfi1_message_header *hdr, u64 rhf)
 {
 	return ((be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf) |
-	       ((!!(rhf & RHF_DC_INFO_MASK)) << 4);
+	       ((!!(rhf & RHF_DC_INFO_SMASK)) << 4);
 }

 static inline u16 generate_jkey(kuid_t uid)
@ -1333,6 +1332,9 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl,  u16 rlid, u32 lqpn,
 void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn,
 		u32 pkey, u32 slid, u32 dlid, u8 sc5,
 		const struct ib_grh *old_grh);
+#define PKEY_CHECK_INVALID -1
+int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth,
+		      u8 sc5, int8_t s_pkey_index);

 #define PACKET_EGRESS_TIMEOUT 350
 static inline void pause_for_credit_return(struct hfi1_devdata *dd)
@ -1776,6 +1778,7 @@ extern struct mutex hfi1_mutex;

 #define HFI1_PKT_USER_SC_INTEGRITY					    \
 	(SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK	    \
+	| SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK		\
 	| SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK		    \
 	| SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK)

--- a/drivers/staging/rdma/hfi1/init.c
+++ b/drivers/staging/rdma/hfi1/init.c
@ -422,9 +422,10 @@ static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
 	struct cca_timer *cca_timer;
 	struct hfi1_pportdata *ppd;
 	int sl;
-	u16 ccti, ccti_timer, ccti_min;
+	u16 ccti_timer, ccti_min;
 	struct cc_state *cc_state;
 	unsigned long flags;
+	enum hrtimer_restart ret = HRTIMER_NORESTART;

 	cca_timer = container_of(t, struct cca_timer, hrtimer);
 	ppd = cca_timer->ppd;
@ -450,24 +451,21 @@ static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)

 	spin_lock_irqsave(&ppd->cca_timer_lock, flags);

-	ccti = cca_timer->ccti;
-
-	if (ccti > ccti_min) {
+	if (cca_timer->ccti > ccti_min) {
 		cca_timer->ccti--;
 		set_link_ipg(ppd);
 	}

-	spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
-
-	rcu_read_unlock();
-
-	if (ccti > ccti_min) {
+	if (cca_timer->ccti > ccti_min) {
 		unsigned long nsec = 1024 * ccti_timer;
 		/* ccti_timer is in units of 1.024 usec */
 		hrtimer_forward_now(t, ns_to_ktime(nsec));
-		return HRTIMER_RESTART;
+		ret = HRTIMER_RESTART;
 	}
-	return HRTIMER_NORESTART;
+
+	spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
+	rcu_read_unlock();
+	return ret;
 }

 /*
@ -496,7 +494,6 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
 	INIT_WORK(&ppd->link_vc_work, handle_verify_cap);
 	INIT_WORK(&ppd->link_up_work, handle_link_up);
 	INIT_WORK(&ppd->link_down_work, handle_link_down);
-	INIT_WORK(&ppd->dc_host_req_work, handle_8051_request);
 	INIT_WORK(&ppd->freeze_work, handle_freeze);
 	INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade);
 	INIT_WORK(&ppd->sma_message_work, handle_sma_message);
@ -1007,7 +1004,7 @@ void hfi1_free_devdata(struct hfi1_devdata *dd)
 	free_percpu(dd->rcv_limit);
 	hfi1_dev_affinity_free(dd);
 	free_percpu(dd->send_schedule);
-	ib_dealloc_device(&dd->verbs_dev.rdi.ibdev);
+	rvt_dealloc_device(&dd->verbs_dev.rdi);
 }

 /*
@ -1110,7 +1107,7 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
 bail:
 	if (!list_empty(&dd->list))
 		list_del_init(&dd->list);
-	ib_dealloc_device(&dd->verbs_dev.rdi.ibdev);
+	rvt_dealloc_device(&dd->verbs_dev.rdi);
 	return ERR_PTR(ret);
 }

--- a/drivers/staging/rdma/hfi1/mad.c
+++ b/drivers/staging/rdma/hfi1/mad.c
@ -999,7 +999,21 @@ static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp,
 			break;
 		}

-		set_link_state(ppd, link_state);
+		if ((link_state == HLS_DN_POLL ||
+		     link_state == HLS_DN_DOWNDEF)) {
+			/*
+			 * Going to poll.  No matter what the current state,
+			 * always move offline first, then tune and start the
+			 * link.  This correctly handles a FM link bounce and
+			 * a link enable.  Going offline is a no-op if already
+			 * offline.
+			 */
+			set_link_state(ppd, HLS_DN_OFFLINE);
+			tune_serdes(ppd);
+			start_link(ppd);
+		} else {
+			set_link_state(ppd, link_state);
+		}
 		if (link_state == HLS_DN_DISABLE &&
 		    (ppd->offline_disabled_reason >
 		     HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED) ||
--- a/drivers/staging/rdma/hfi1/mmu_rb.c
+++ b/drivers/staging/rdma/hfi1/mmu_rb.c
@ -91,7 +91,7 @@ static unsigned long mmu_node_start(struct mmu_rb_node *node)

 static unsigned long mmu_node_last(struct mmu_rb_node *node)
 {
-	return PAGE_ALIGN((node->addr & PAGE_MASK) + node->len) - 1;
+	return PAGE_ALIGN(node->addr + node->len) - 1;
 }

 int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops)
@ -126,10 +126,15 @@ void hfi1_mmu_rb_unregister(struct rb_root *root)
 	if (!handler)
 		return;

+	/* Unregister first so we don't get any more notifications. */
+	if (current->mm)
+		mmu_notifier_unregister(&handler->mn, current->mm);
+
 	spin_lock_irqsave(&mmu_rb_lock, flags);
 	list_del(&handler->list);
 	spin_unlock_irqrestore(&mmu_rb_lock, flags);

+	spin_lock_irqsave(&handler->lock, flags);
 	if (!RB_EMPTY_ROOT(root)) {
 		struct rb_node *node;
 		struct mmu_rb_node *rbnode;
@ -141,9 +146,8 @@ void hfi1_mmu_rb_unregister(struct rb_root *root)
 				handler->ops->remove(root, rbnode, NULL);
 		}
 	}
+	spin_unlock_irqrestore(&handler->lock, flags);

-	if (current->mm)
-		mmu_notifier_unregister(&handler->mn, current->mm);
 	kfree(handler);
 }

@ -235,6 +239,25 @@ struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *root, unsigned long addr,
 	return node;
 }

+struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *root,
+					unsigned long addr, unsigned long len)
+{
+	struct mmu_rb_handler *handler = find_mmu_handler(root);
+	struct mmu_rb_node *node;
+	unsigned long flags;
+
+	if (!handler)
+		return ERR_PTR(-EINVAL);
+
+	spin_lock_irqsave(&handler->lock, flags);
+	node = __mmu_rb_search(handler, addr, len);
+	if (node)
+		__mmu_int_rb_remove(node, handler->root);
+	spin_unlock_irqrestore(&handler->lock, flags);
+
+	return node;
+}
+
 void hfi1_mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node)
 {
 	struct mmu_rb_handler *handler = find_mmu_handler(root);
@ -293,9 +316,9 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn,
 		hfi1_cdbg(MMU, "Invalidating node addr 0x%llx, len %u",
 			  node->addr, node->len);
 		if (handler->ops->invalidate(root, node)) {
-			spin_unlock_irqrestore(&handler->lock, flags);
-			__mmu_rb_remove(handler, node, mm);
-			spin_lock_irqsave(&handler->lock, flags);
+			__mmu_int_rb_remove(node, root);
+			if (handler->ops->remove)
+				handler->ops->remove(root, node, mm);
 		}
 	}
 	spin_unlock_irqrestore(&handler->lock, flags);
--- a/drivers/staging/rdma/hfi1/mmu_rb.h
+++ b/drivers/staging/rdma/hfi1/mmu_rb.h
@ -70,5 +70,7 @@ int hfi1_mmu_rb_insert(struct rb_root *, struct mmu_rb_node *);
 void hfi1_mmu_rb_remove(struct rb_root *, struct mmu_rb_node *);
 struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *, unsigned long,
 				       unsigned long);
+struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *, unsigned long,
+					unsigned long);

 #endif /* _HFI1_MMU_RB_H */
--- a/drivers/staging/rdma/hfi1/pio.c
+++ b/drivers/staging/rdma/hfi1/pio.c
@ -139,23 +139,30 @@ void pio_send_control(struct hfi1_devdata *dd, int op)
 /* Send Context Size (SCS) wildcards */
 #define SCS_POOL_0 -1
 #define SCS_POOL_1 -2
+
 /* Send Context Count (SCC) wildcards */
 #define SCC_PER_VL -1
 #define SCC_PER_CPU  -2
-
 #define SCC_PER_KRCVQ  -3
-#define SCC_ACK_CREDITS  32
+
+/* Send Context Size (SCS) constants */
+#define SCS_ACK_CREDITS  32
+#define SCS_VL15_CREDITS 102	/* 3 pkts of 2048B data + 128B header */
+
+#define PIO_THRESHOLD_CEILING 4096

 #define PIO_WAIT_BATCH_SIZE 5

 /* default send context sizes */
 static struct sc_config_sizes sc_config_sizes[SC_MAX] = {
 	[SC_KERNEL] = { .size  = SCS_POOL_0,	/* even divide, pool 0 */
-			.count = SCC_PER_VL },/* one per NUMA */
-	[SC_ACK]    = { .size  = SCC_ACK_CREDITS,
+			.count = SCC_PER_VL },	/* one per NUMA */
+	[SC_ACK]    = { .size  = SCS_ACK_CREDITS,
 			.count = SCC_PER_KRCVQ },
 	[SC_USER]   = { .size  = SCS_POOL_0,	/* even divide, pool 0 */
 			.count = SCC_PER_CPU },	/* one per CPU */
+	[SC_VL15]   = { .size  = SCS_VL15_CREDITS,
+			.count = 1 },

 };

@ -202,7 +209,8 @@ static int wildcard_to_pool(int wc)
 static const char *sc_type_names[SC_MAX] = {
 	"kernel",
 	"ack",
-	"user"
+	"user",
+	"vl15"
 };

 static const char *sc_type_name(int index)
@ -230,6 +238,22 @@ int init_sc_pools_and_sizes(struct hfi1_devdata *dd)
 	int extra;
 	int i;

+	/*
+	 * When SDMA is enabled, kernel context pio packet size is capped by
+	 * "piothreshold". Reduce pio buffer allocation for kernel context by
+	 * setting it to a fixed size. The allocation allows 3-deep buffering
+	 * of the largest pio packets plus up to 128 bytes header, sufficient
+	 * to maintain verbs performance.
+	 *
+	 * When SDMA is disabled, keep the default pooling allocation.
+	 */
+	if (HFI1_CAP_IS_KSET(SDMA)) {
+		u16 max_pkt_size = (piothreshold < PIO_THRESHOLD_CEILING) ?
+					 piothreshold : PIO_THRESHOLD_CEILING;
+		sc_config_sizes[SC_KERNEL].size =
+			3 * (max_pkt_size + 128) / PIO_BLOCK_SIZE;
+	}
+
 	/*
 	 * Step 0:
 	 *	- copy the centipercents/absolute sizes from the pool config
@ -311,7 +335,7 @@ int init_sc_pools_and_sizes(struct hfi1_devdata *dd)
 		if (i == SC_ACK) {
 			count = dd->n_krcv_queues;
 		} else if (i == SC_KERNEL) {
-			count = (INIT_SC_PER_VL * num_vls) + 1 /* VL15 */;
+			count = INIT_SC_PER_VL * num_vls;
 		} else if (count == SCC_PER_CPU) {
 			count = dd->num_rcv_contexts - dd->n_krcv_queues;
 		} else if (count < 0) {
@ -596,7 +620,7 @@ u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize)
 * Return value is what to write into the CSR: trigger return when
 * unreturned credits pass this count.
 */
-static u32 sc_percent_to_threshold(struct send_context *sc, u32 percent)
+u32 sc_percent_to_threshold(struct send_context *sc, u32 percent)
 {
 	return (sc->credits * percent) / 100;
 }
@ -790,7 +814,10 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
 	 * For Ack contexts, set a threshold for half the credits.
 	 * For User contexts use the given percentage.  This has been
 	 * sanitized on driver start-up.
-	 * For Kernel contexts, use the default MTU plus a header.
+	 * For Kernel contexts, use the default MTU plus a header
+	 * or half the credits, whichever is smaller. This should
+	 * work for both the 3-deep buffering allocation and the
+	 * pooling allocation.
 	 */
 	if (type == SC_ACK) {
 		thresh = sc_percent_to_threshold(sc, 50);
@ -798,7 +825,9 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
 		thresh = sc_percent_to_threshold(sc,
 						 user_credit_return_threshold);
 	} else { /* kernel */
-		thresh = sc_mtu_to_threshold(sc, hfi1_max_mtu, hdrqentsize);
+		thresh = min(sc_percent_to_threshold(sc, 50),
+			     sc_mtu_to_threshold(sc, hfi1_max_mtu,
+						 hdrqentsize));
 	}
 	reg = thresh << SC(CREDIT_CTRL_THRESHOLD_SHIFT);
 	/* add in early return */
@ -1531,7 +1560,8 @@ static void sc_piobufavail(struct send_context *sc)
 	unsigned long flags;
 	unsigned i, n = 0;

-	if (dd->send_contexts[sc->sw_index].type != SC_KERNEL)
+	if (dd->send_contexts[sc->sw_index].type != SC_KERNEL &&
+	    dd->send_contexts[sc->sw_index].type != SC_VL15)
 		return;
 	list = &sc->piowait;
 	/*
@ -1900,7 +1930,7 @@ int init_pervl_scs(struct hfi1_devdata *dd)
 	u32 ctxt;
 	struct hfi1_pportdata *ppd = dd->pport;

-	dd->vld[15].sc = sc_alloc(dd, SC_KERNEL,
+	dd->vld[15].sc = sc_alloc(dd, SC_VL15,
 				  dd->rcd[0]->rcvhdrqentsize, dd->node);
 	if (!dd->vld[15].sc)
 		goto nomem;
--- a/drivers/staging/rdma/hfi1/pio.h
+++ b/drivers/staging/rdma/hfi1/pio.h
@ -51,7 +51,8 @@
 #define SC_KERNEL 0
 #define SC_ACK    1
 #define SC_USER   2
-#define SC_MAX    3
+#define SC_VL15   3
+#define SC_MAX    4

 /* invalid send context index */
 #define INVALID_SCI 0xff
@ -293,6 +294,7 @@ void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context);
 void sc_add_credit_return_intr(struct send_context *sc);
 void sc_del_credit_return_intr(struct send_context *sc);
 void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold);
+u32 sc_percent_to_threshold(struct send_context *sc, u32 percent);
 u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize);
 void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint);
 void sc_wait(struct hfi1_devdata *dd);
--- a/drivers/staging/rdma/hfi1/platform.c
+++ b/drivers/staging/rdma/hfi1/platform.c
@ -114,21 +114,11 @@ static int qual_power(struct hfi1_pportdata *ppd)
 	if (ret)
 		return ret;

-	if (QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]) != 4)
-		cable_power_class = QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]);
-	else
-		cable_power_class = QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]);
+	cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);

-	if (cable_power_class <= 3 && cable_power_class > (power_class_max - 1))
+	if (cable_power_class > power_class_max)
 		ppd->offline_disabled_reason =
 			HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY);
-	else if (cable_power_class > 4 && cable_power_class > (power_class_max))
-		ppd->offline_disabled_reason =
-			HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY);
-	/*
-	 * cable_power_class will never have value 4 as this simply
-	 * means the high power settings are unused
-	 */

 	if (ppd->offline_disabled_reason ==
 			HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY)) {
@ -173,12 +163,9 @@ static int set_qsfp_high_power(struct hfi1_pportdata *ppd)
 	u8 *cache = ppd->qsfp_info.cache;
 	int ret;

-	if (QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]) != 4)
-		cable_power_class = QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]);
-	else
-		cable_power_class = QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]);
+	cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);

-	if (cable_power_class) {
+	if (cable_power_class > QSFP_POWER_CLASS_1) {
 		power_ctrl_byte = cache[QSFP_PWR_CTRL_BYTE_OFFS];

 		power_ctrl_byte |= 1;
@ -190,8 +177,7 @@ static int set_qsfp_high_power(struct hfi1_pportdata *ppd)
 		if (ret != 1)
 			return -EIO;

-		if (cable_power_class > 3) {
-			/* > power class 4*/
+		if (cable_power_class > QSFP_POWER_CLASS_4) {
 			power_ctrl_byte |= (1 << 2);
 			ret = qsfp_write(ppd, ppd->dd->hfi1_id,
 					 QSFP_PWR_CTRL_BYTE_OFFS,
@ -212,12 +198,21 @@ static void apply_rx_cdr(struct hfi1_pportdata *ppd,
 {
 	u32 rx_preset;
 	u8 *cache = ppd->qsfp_info.cache;
+	int cable_power_class;

 	if (!((cache[QSFP_MOD_PWR_OFFS] & 0x4) &&
 	      (cache[QSFP_CDR_INFO_OFFS] & 0x40)))
 		return;

-	/* rx_preset preset to zero to catch error */
+	/* RX CDR present, bypass supported */
+	cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
+
+	if (cable_power_class <= QSFP_POWER_CLASS_3) {
+		/* Power class <= 3, ignore config & turn RX CDR on */
+		*cdr_ctrl_byte |= 0xF;
+		return;
+	}
+
 	get_platform_config_field(
 		ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
 		rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR_APPLY,
@ -250,15 +245,25 @@ static void apply_rx_cdr(struct hfi1_pportdata *ppd,

 static void apply_tx_cdr(struct hfi1_pportdata *ppd,
 			 u32 tx_preset_index,
-			 u8 *ctr_ctrl_byte)
+			 u8 *cdr_ctrl_byte)
 {
 	u32 tx_preset;
 	u8 *cache = ppd->qsfp_info.cache;
+	int cable_power_class;

 	if (!((cache[QSFP_MOD_PWR_OFFS] & 0x8) &&
 	      (cache[QSFP_CDR_INFO_OFFS] & 0x80)))
 		return;

+	/* TX CDR present, bypass supported */
+	cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
+
+	if (cable_power_class <= QSFP_POWER_CLASS_3) {
+		/* Power class <= 3, ignore config & turn TX CDR on */
+		*cdr_ctrl_byte |= 0xF0;
+		return;
+	}
+
 	get_platform_config_field(
 		ppd->dd,
 		PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index,
@ -282,10 +287,10 @@ static void apply_tx_cdr(struct hfi1_pportdata *ppd,
 			(tx_preset << 2) | (tx_preset << 3));

 	if (tx_preset)
-		*ctr_ctrl_byte |= (tx_preset << 4);
+		*cdr_ctrl_byte |= (tx_preset << 4);
 	else
 		/* Preserve current/determined RX CDR status */
-		*ctr_ctrl_byte &= ((tx_preset << 4) | 0xF);
+		*cdr_ctrl_byte &= ((tx_preset << 4) | 0xF);
 }

 static void apply_cdr_settings(
@ -598,6 +603,7 @@ static void apply_tunings(
 		       "Applying TX settings");
 }

+/* Must be holding the QSFP i2c resource */
 static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
 			    u32 *ptr_rx_preset, u32 *ptr_total_atten)
 {
@ -605,26 +611,19 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
 	u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled;
 	u8 *cache = ppd->qsfp_info.cache;

-	ret = acquire_chip_resource(ppd->dd, qsfp_resource(ppd->dd), QSFP_WAIT);
-	if (ret) {
-		dd_dev_err(ppd->dd, "%s: hfi%d: cannot lock i2c chain\n",
-			   __func__, (int)ppd->dd->hfi1_id);
-		return ret;
-	}
-
 	ppd->qsfp_info.limiting_active = 1;

 	ret = set_qsfp_tx(ppd, 0);
 	if (ret)
-		goto bail_unlock;
+		return ret;

 	ret = qual_power(ppd);
 	if (ret)
-		goto bail_unlock;
+		return ret;

 	ret = qual_bitrate(ppd);
 	if (ret)
-		goto bail_unlock;
+		return ret;

 	if (ppd->qsfp_info.reset_needed) {
 		reset_qsfp(ppd);
@ -636,7 +635,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,

 	ret = set_qsfp_high_power(ppd);
 	if (ret)
-		goto bail_unlock;
+		return ret;

 	if (cache[QSFP_EQ_INFO_OFFS] & 0x4) {
 		ret = get_platform_config_field(
@ -646,7 +645,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
 			ptr_tx_preset, 4);
 		if (ret) {
 			*ptr_tx_preset = OPA_INVALID_INDEX;
-			goto bail_unlock;
+			return ret;
 		}
 	} else {
 		ret = get_platform_config_field(
@ -656,7 +655,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
 			ptr_tx_preset, 4);
 		if (ret) {
 			*ptr_tx_preset = OPA_INVALID_INDEX;
-			goto bail_unlock;
+			return ret;
 		}
 	}

@ -665,7 +664,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
 		PORT_TABLE_RX_PRESET_IDX, ptr_rx_preset, 4);
 	if (ret) {
 		*ptr_rx_preset = OPA_INVALID_INDEX;
-		goto bail_unlock;
+		return ret;
 	}

 	if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G))
@ -685,8 +684,6 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,

 	ret = set_qsfp_tx(ppd, 1);

-bail_unlock:
-	release_chip_resource(ppd->dd, qsfp_resource(ppd->dd));
 	return ret;
 }

@ -833,12 +830,22 @@ void tune_serdes(struct hfi1_pportdata *ppd)
 			total_atten = platform_atten + remote_atten;

 			tuning_method = OPA_PASSIVE_TUNING;
-		} else
+		} else {
 			ppd->offline_disabled_reason =
 			     HFI1_ODR_MASK(OPA_LINKDOWN_REASON_CHASSIS_CONFIG);
+			goto bail;
+		}
 		break;
 	case PORT_TYPE_QSFP:
 		if (qsfp_mod_present(ppd)) {
+			ret = acquire_chip_resource(ppd->dd,
+						    qsfp_resource(ppd->dd),
+						    QSFP_WAIT);
+			if (ret) {
+				dd_dev_err(ppd->dd, "%s: hfi%d: cannot lock i2c chain\n",
+					   __func__, (int)ppd->dd->hfi1_id);
+				goto bail;
+			}
 			refresh_qsfp_cache(ppd, &ppd->qsfp_info);

 			if (ppd->qsfp_info.cache_valid) {
@ -853,21 +860,23 @@ void tune_serdes(struct hfi1_pportdata *ppd)
 				 * update the cache to reflect the changes
 				 */
 				refresh_qsfp_cache(ppd, &ppd->qsfp_info);
-				if (ret)
-					goto bail;
-
 				limiting_active =
 						ppd->qsfp_info.limiting_active;
 			} else {
 				dd_dev_err(dd,
 					   "%s: Reading QSFP memory failed\n",
 					   __func__);
-				goto bail;
+				ret = -EINVAL; /* a fail indication */
 			}
-		} else
+			release_chip_resource(ppd->dd, qsfp_resource(ppd->dd));
+			if (ret)
+				goto bail;
+		} else {
 			ppd->offline_disabled_reason =
 			   HFI1_ODR_MASK(
 				OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED);
+			goto bail;
+		}
 		break;
 	default:
 		dd_dev_info(ppd->dd, "%s: Unknown port type\n", __func__);
--- a/drivers/staging/rdma/hfi1/qp.c
+++ b/drivers/staging/rdma/hfi1/qp.c
@ -167,8 +167,12 @@ static inline int opa_mtu_enum_to_int(int mtu)
 */
 static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu)
 {
-	int val = opa_mtu_enum_to_int((int)mtu);
+	int val;

+	/* Constraining 10KB packets to 8KB packets */
+	if (mtu == (enum ib_mtu)OPA_MTU_10240)
+		mtu = OPA_MTU_8192;
+	val = opa_mtu_enum_to_int((int)mtu);
 	if (val > 0)
 		return val;
 	return ib_mtu_enum_to_int(mtu);
--- a/drivers/staging/rdma/hfi1/qsfp.c
+++ b/drivers/staging/rdma/hfi1/qsfp.c
@ -96,7 +96,7 @@ int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
 {
 	int ret;

-	if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__))
+	if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
 		return -EACCES;

 	/* make sure the TWSI bus is in a sane state */
@ -162,7 +162,7 @@ int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
 {
 	int ret;

-	if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__))
+	if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
 		return -EACCES;

 	/* make sure the TWSI bus is in a sane state */
@ -192,7 +192,7 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
 	int ret;
 	u8 page;

-	if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__))
+	if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
 		return -EACCES;

 	/* make sure the TWSI bus is in a sane state */
@ -276,7 +276,7 @@ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
 	int ret;
 	u8 page;

-	if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__))
+	if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
 		return -EACCES;

 	/* make sure the TWSI bus is in a sane state */
@ -355,6 +355,8 @@ int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
 * The calls to qsfp_{read,write} in this function correctly handle the
 * address map difference between this mapping and the mapping implemented
 * by those functions
+ *
+ * The caller must be holding the QSFP i2c chain resource.
 */
 int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)
 {
@ -371,13 +373,9 @@ int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)

 	if (!qsfp_mod_present(ppd)) {
 		ret = -ENODEV;
-		goto bail_no_release;
+		goto bail;
 	}

-	ret = acquire_chip_resource(ppd->dd, qsfp_resource(ppd->dd), QSFP_WAIT);
-	if (ret)
-		goto bail_no_release;
-
 	ret = qsfp_read(ppd, target, 0, cache, QSFP_PAGESIZE);
 	if (ret != QSFP_PAGESIZE) {
 		dd_dev_info(ppd->dd,
@ -440,8 +438,6 @@ int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)
 		}
 	}

-	release_chip_resource(ppd->dd, qsfp_resource(ppd->dd));
-
 	spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
 	ppd->qsfp_info.cache_valid = 1;
 	ppd->qsfp_info.cache_refresh_required = 0;
@ -450,8 +446,6 @@ int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)
 	return 0;

 bail:
-	release_chip_resource(ppd->dd, qsfp_resource(ppd->dd));
-bail_no_release:
 	memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128));
 	return ret;
 }
@ -466,7 +460,28 @@ const char * const hfi1_qsfp_devtech[16] = {
 #define QSFP_DUMP_CHUNK 16 /* Holds longest string */
 #define QSFP_DEFAULT_HDR_CNT 224

-static const char *pwr_codes = "1.5W2.0W2.5W3.5W";
+#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3)
+#define QSFP_HIGH_PWR(pbyte) ((pbyte) & 3)
+/* For use with QSFP_HIGH_PWR macro */
+#define QSFP_HIGH_PWR_UNUSED	0 /* Bits [1:0] = 00 implies low power module */
+
+/*
+ * Takes power class byte [Page 00 Byte 129] in SFF 8636
+ * Returns power class as integer (1 through 7, per SFF 8636 rev 2.4)
+ */
+int get_qsfp_power_class(u8 power_byte)
+{
+	if (QSFP_HIGH_PWR(power_byte) == QSFP_HIGH_PWR_UNUSED)
+		/* power classes count from 1, their bit encodings from 0 */
+		return (QSFP_PWR(power_byte) + 1);
+	/*
+	 * 00 in the high power classes stands for unused, bringing
+	 * balance to the off-by-1 offset above, we add 4 here to
+	 * account for the difference between the low and high power
+	 * groups
+	 */
+	return (QSFP_HIGH_PWR(power_byte) + 4);
+}

 int qsfp_mod_present(struct hfi1_pportdata *ppd)
 {
@ -537,6 +552,16 @@ int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr, u32 len,
 	return ret;
 }

+static const char *pwr_codes[8] = {"N/AW",
+				  "1.5W",
+				  "2.0W",
+				  "2.5W",
+				  "3.5W",
+				  "4.0W",
+				  "4.5W",
+				  "5.0W"
+				 };
+
 int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len)
 {
 	u8 *cache = &ppd->qsfp_info.cache[0];
@ -546,6 +571,7 @@ int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len)
 	int bidx = 0;
 	u8 *atten = &cache[QSFP_ATTEN_OFFS];
 	u8 *vendor_oui = &cache[QSFP_VOUI_OFFS];
+	u8 power_byte = 0;

 	sofar = 0;
 	lenstr[0] = ' ';
@ -555,9 +581,9 @@ int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len)
 		if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS]))
 			sprintf(lenstr, "%dM ", cache[QSFP_MOD_LEN_OFFS]);

+		power_byte = cache[QSFP_MOD_PWR_OFFS];
 		sofar += scnprintf(buf + sofar, len - sofar, "PWR:%.3sW\n",
-				pwr_codes +
-				(QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]) * 4));
+				pwr_codes[get_qsfp_power_class(power_byte)]);

 		sofar += scnprintf(buf + sofar, len - sofar, "TECH:%s%s\n",
 				lenstr,
--- a/drivers/staging/rdma/hfi1/qsfp.h
+++ b/drivers/staging/rdma/hfi1/qsfp.h
@ -82,8 +82,9 @@
 /* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */
 #define QSFP_MOD_ID_OFFS 128
 /*
- * Byte 129 is "Extended Identifier". We only care about D7,D6: Power class
- *  0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W
+ * Byte 129 is "Extended Identifier".
+ * For bits [7:6]: 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W
+ * For bits [1:0]: 0:Unused, 1:4W, 2:4.5W, 3:5W
 */
 #define QSFP_MOD_PWR_OFFS 129
 /* Byte 130 is Connector type. Not Intel req'd */
@ -190,6 +191,9 @@ extern const char *const hfi1_qsfp_devtech[16];
 #define QSFP_HIGH_BIAS_WARNING		0x22
 #define QSFP_LOW_BIAS_WARNING		0x11

+#define QSFP_ATTEN_SDR(attenarray) (attenarray[0])
+#define QSFP_ATTEN_DDR(attenarray) (attenarray[1])
+
 /*
 * struct qsfp_data encapsulates state of QSFP device for one port.
 * it will be part of port-specific data if a board supports QSFP.
@ -201,12 +205,6 @@ extern const char *const hfi1_qsfp_devtech[16];
 * and let the qsfp_lock arbitrate access to common resources.
 *
 */
-
-#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3)
-#define QSFP_HIGH_PWR(pbyte) (((pbyte) & 3) | 4)
-#define QSFP_ATTEN_SDR(attenarray) (attenarray[0])
-#define QSFP_ATTEN_DDR(attenarray) (attenarray[1])
-
 struct qsfp_data {
 	/* Helps to find our way */
 	struct hfi1_pportdata *ppd;
@ -223,6 +221,7 @@ struct qsfp_data {

 int refresh_qsfp_cache(struct hfi1_pportdata *ppd,
 		       struct qsfp_data *cp);
+int get_qsfp_power_class(u8 power_byte);
 int qsfp_mod_present(struct hfi1_pportdata *ppd);
 int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr,
 		   u32 len, u8 *data);
--- a/drivers/staging/rdma/hfi1/rc.c
+++ b/drivers/staging/rdma/hfi1/rc.c
@ -1497,7 +1497,7 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
 		/* Ignore reserved NAK codes. */
 		goto bail_stop;
 	}
-	return ret;
+	/* cannot be reached  */
 bail_stop:
 	hfi1_stop_rc_timers(qp);
 	return ret;
@ -2021,8 +2021,6 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
 	if (sl >= OPA_MAX_SLS)
 		return;

-	cca_timer = &ppd->cca_timer[sl];
-
 	cc_state = get_cc_state(ppd);

 	if (!cc_state)
@ -2041,6 +2039,7 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,

 	spin_lock_irqsave(&ppd->cca_timer_lock, flags);

+	cca_timer = &ppd->cca_timer[sl];
 	if (cca_timer->ccti < ccti_limit) {
 		if (cca_timer->ccti + ccti_incr <= ccti_limit)
 			cca_timer->ccti += ccti_incr;
@ -2049,8 +2048,6 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
 		set_link_ipg(ppd);
 	}

-	spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
-
 	ccti = cca_timer->ccti;

 	if (!hrtimer_active(&cca_timer->hrtimer)) {
@ -2061,6 +2058,8 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
 			      HRTIMER_MODE_REL);
 	}

+	spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
+
 	if ((trigger_threshold != 0) && (ccti >= trigger_threshold))
 		log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type);
 }
--- a/drivers/staging/rdma/hfi1/ruc.c
+++ b/drivers/staging/rdma/hfi1/ruc.c
@ -831,7 +831,6 @@ void hfi1_do_send(struct rvt_qp *qp)
 	struct hfi1_pkt_state ps;
 	struct hfi1_qp_priv *priv = qp->priv;
 	int (*make_req)(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
-	unsigned long flags;
 	unsigned long timeout;
 	unsigned long timeout_int;
 	int cpu;
@ -866,11 +865,11 @@ void hfi1_do_send(struct rvt_qp *qp)
 		timeout_int = SEND_RESCHED_TIMEOUT;
 	}

-	spin_lock_irqsave(&qp->s_lock, flags);
+	spin_lock_irqsave(&qp->s_lock, ps.flags);

 	/* Return if we are already busy processing a work request. */
 	if (!hfi1_send_ok(qp)) {
-		spin_unlock_irqrestore(&qp->s_lock, flags);
+		spin_unlock_irqrestore(&qp->s_lock, ps.flags);
 		return;
 	}

@ -884,7 +883,7 @@ void hfi1_do_send(struct rvt_qp *qp)
 	do {
 		/* Check for a constructed packet to be sent. */
 		if (qp->s_hdrwords != 0) {
-			spin_unlock_irqrestore(&qp->s_lock, flags);
+			spin_unlock_irqrestore(&qp->s_lock, ps.flags);
 			/*
 			 * If the packet cannot be sent now, return and
 			 * the send tasklet will be woken up later.
@ -897,11 +896,14 @@ void hfi1_do_send(struct rvt_qp *qp)
 			if (unlikely(time_after(jiffies, timeout))) {
 				if (workqueue_congested(cpu,
 							ps.ppd->hfi1_wq)) {
-					spin_lock_irqsave(&qp->s_lock, flags);
+					spin_lock_irqsave(
+						&qp->s_lock,
+						ps.flags);
 					qp->s_flags &= ~RVT_S_BUSY;
 					hfi1_schedule_send(qp);
-					spin_unlock_irqrestore(&qp->s_lock,
-							       flags);
+					spin_unlock_irqrestore(
+						&qp->s_lock,
+						ps.flags);
 					this_cpu_inc(
 						*ps.ppd->dd->send_schedule);
 					return;
@ -913,11 +915,11 @@ void hfi1_do_send(struct rvt_qp *qp)
 				}
 				timeout = jiffies + (timeout_int) / 8;
 			}
-			spin_lock_irqsave(&qp->s_lock, flags);
+			spin_lock_irqsave(&qp->s_lock, ps.flags);
 		}
 	} while (make_req(qp, &ps));

-	spin_unlock_irqrestore(&qp->s_lock, flags);
+	spin_unlock_irqrestore(&qp->s_lock, ps.flags);
 }

 /*
--- a/drivers/staging/rdma/hfi1/sysfs.c
+++ b/drivers/staging/rdma/hfi1/sysfs.c
@ -84,7 +84,7 @@ static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj,
 		rcu_read_unlock();
 		return -EINVAL;
 	}
-	memcpy(buf, &cc_state->cct, count);
+	memcpy(buf, (void *)&cc_state->cct + pos, count);
 	rcu_read_unlock();

 	return count;
@ -131,7 +131,7 @@ static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj,
 		rcu_read_unlock();
 		return -EINVAL;
 	}
-	memcpy(buf, &cc_state->cong_setting, count);
+	memcpy(buf, (void *)&cc_state->cong_setting + pos, count);
 	rcu_read_unlock();

 	return count;
--- a/drivers/staging/rdma/hfi1/ud.c
+++ b/drivers/staging/rdma/hfi1/ud.c
@ -322,7 +322,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 			     (lid == ppd->lid ||
 			      (lid == be16_to_cpu(IB_LID_PERMISSIVE) &&
 			      qp->ibqp.qp_type == IB_QPT_GSI)))) {
-			unsigned long flags;
+			unsigned long tflags = ps->flags;
 			/*
 			 * If DMAs are in progress, we can't generate
 			 * a completion for the loopback packet since
@ -335,10 +335,10 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 				goto bail;
 			}
 			qp->s_cur = next_cur;
-			local_irq_save(flags);
-			spin_unlock_irqrestore(&qp->s_lock, flags);
+			spin_unlock_irqrestore(&qp->s_lock, tflags);
 			ud_loopback(qp, wqe);
-			spin_lock_irqsave(&qp->s_lock, flags);
+			spin_lock_irqsave(&qp->s_lock, tflags);
+			ps->flags = tflags;
 			hfi1_send_complete(qp, wqe, IB_WC_SUCCESS);
 			goto done_free_tx;
 		}
--- a/drivers/staging/rdma/hfi1/user_exp_rcv.c
+++ b/drivers/staging/rdma/hfi1/user_exp_rcv.c
@ -399,8 +399,11 @@ int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo)
 	 * pages, accept the amount pinned so far and program only that.
 	 * User space knows how to deal with partially programmed buffers.
 	 */
-	if (!hfi1_can_pin_pages(dd, fd->tid_n_pinned, npages))
-		return -ENOMEM;
+	if (!hfi1_can_pin_pages(dd, fd->tid_n_pinned, npages)) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
 	pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages);
 	if (pinned <= 0) {
 		ret = pinned;
--- a/drivers/staging/rdma/hfi1/user_sdma.c
+++ b/drivers/staging/rdma/hfi1/user_sdma.c
@ -180,6 +180,8 @@ struct user_sdma_iovec {
 	u64 offset;
 };

+#define SDMA_CACHE_NODE_EVICT BIT(0)
+
 struct sdma_mmu_node {
 	struct mmu_rb_node rb;
 	struct list_head list;
@ -187,6 +189,7 @@ struct sdma_mmu_node {
 	atomic_t refcount;
 	struct page **pages;
 	unsigned npages;
+	unsigned long flags;
 };

 struct user_sdma_request {
@ -597,6 +600,13 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
 		goto free_req;
 	}

+	/* Checking P_KEY for requests from user-space */
+	if (egress_pkey_check(dd->pport, req->hdr.lrh, req->hdr.bth, sc,
+			      PKEY_CHECK_INVALID)) {
+		ret = -EINVAL;
+		goto free_req;
+	}
+
 	/*
 	 * Also should check the BTH.lnh. If it says the next header is GRH then
 	 * the RXE parsing will be off and will land in the middle of the KDETH
@ -1030,27 +1040,29 @@ static inline int num_user_pages(const struct iovec *iov)
 	return 1 + ((epage - spage) >> PAGE_SHIFT);
 }

-/* Caller must hold pq->evict_lock */
 static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
 {
 	u32 cleared = 0;
 	struct sdma_mmu_node *node, *ptr;
+	struct list_head to_evict = LIST_HEAD_INIT(to_evict);

+	spin_lock(&pq->evict_lock);
 	list_for_each_entry_safe_reverse(node, ptr, &pq->evict, list) {
 		/* Make sure that no one is still using the node. */
 		if (!atomic_read(&node->refcount)) {
-			/*
-			 * Need to use the page count now as the remove callback
-			 * will free the node.
-			 */
+			set_bit(SDMA_CACHE_NODE_EVICT, &node->flags);
+			list_del_init(&node->list);
+			list_add(&node->list, &to_evict);
 			cleared += node->npages;
-			spin_unlock(&pq->evict_lock);
-			hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb);
-			spin_lock(&pq->evict_lock);
 			if (cleared >= npages)
 				break;
 		}
 	}
+	spin_unlock(&pq->evict_lock);
+
+	list_for_each_entry_safe(node, ptr, &to_evict, list)
+		hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb);
+
 	return cleared;
 }

@ -1062,9 +1074,9 @@ static int pin_vector_pages(struct user_sdma_request *req,
 	struct sdma_mmu_node *node = NULL;
 	struct mmu_rb_node *rb_node;

-	rb_node = hfi1_mmu_rb_search(&pq->sdma_rb_root,
-				     (unsigned long)iovec->iov.iov_base,
-				     iovec->iov.iov_len);
+	rb_node = hfi1_mmu_rb_extract(&pq->sdma_rb_root,
+				      (unsigned long)iovec->iov.iov_base,
+				      iovec->iov.iov_len);
 	if (rb_node && !IS_ERR(rb_node))
 		node = container_of(rb_node, struct sdma_mmu_node, rb);
 	else
@ -1076,7 +1088,6 @@ static int pin_vector_pages(struct user_sdma_request *req,
 			return -ENOMEM;

 		node->rb.addr = (unsigned long)iovec->iov.iov_base;
-		node->rb.len = iovec->iov.iov_len;
 		node->pq = pq;
 		atomic_set(&node->refcount, 0);
 		INIT_LIST_HEAD(&node->list);
@ -1093,11 +1104,25 @@ static int pin_vector_pages(struct user_sdma_request *req,
 		memcpy(pages, node->pages, node->npages * sizeof(*pages));

 		npages -= node->npages;
+
+		/*
+		 * If rb_node is NULL, it means that this is brand new node
+		 * and, therefore not on the eviction list.
+		 * If, however, the rb_node is non-NULL, it means that the
+		 * node is already in RB tree and, therefore on the eviction
+		 * list (nodes are unconditionally inserted in the eviction
+		 * list). In that case, we have to remove the node prior to
+		 * calling the eviction function in order to prevent it from
+		 * freeing this node.
+		 */
+		if (rb_node) {
+			spin_lock(&pq->evict_lock);
+			list_del_init(&node->list);
+			spin_unlock(&pq->evict_lock);
+		}
 retry:
 		if (!hfi1_can_pin_pages(pq->dd, pq->n_locked, npages)) {
-			spin_lock(&pq->evict_lock);
 			cleared = sdma_cache_evict(pq, npages);
-			spin_unlock(&pq->evict_lock);
 			if (cleared >= npages)
 				goto retry;
 		}
@ -1117,37 +1142,32 @@ static int pin_vector_pages(struct user_sdma_request *req,
 			goto bail;
 		}
 		kfree(node->pages);
+		node->rb.len = iovec->iov.iov_len;
 		node->pages = pages;
 		node->npages += pinned;
 		npages = node->npages;
 		spin_lock(&pq->evict_lock);
-		if (!rb_node)
-			list_add(&node->list, &pq->evict);
-		else
-			list_move(&node->list, &pq->evict);
+		list_add(&node->list, &pq->evict);
 		pq->n_locked += pinned;
 		spin_unlock(&pq->evict_lock);
 	}
 	iovec->pages = node->pages;
 	iovec->npages = npages;

-	if (!rb_node) {
-		ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb);
-		if (ret) {
-			spin_lock(&pq->evict_lock);
+	ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb);
+	if (ret) {
+		spin_lock(&pq->evict_lock);
+		if (!list_empty(&node->list))
 			list_del(&node->list);
-			pq->n_locked -= node->npages;
-			spin_unlock(&pq->evict_lock);
-			ret = 0;
-			goto bail;
-		}
-	} else {
-		atomic_inc(&node->refcount);
+		pq->n_locked -= node->npages;
+		spin_unlock(&pq->evict_lock);
+		goto bail;
 	}
 	return 0;
 bail:
-	if (!rb_node)
-		kfree(node);
+	if (rb_node)
+		unpin_vector_pages(current->mm, node->pages, 0, node->npages);
+	kfree(node);
 	return ret;
 }

@ -1558,7 +1578,20 @@ static void sdma_rb_remove(struct rb_root *root, struct mmu_rb_node *mnode,
 		container_of(mnode, struct sdma_mmu_node, rb);

 	spin_lock(&node->pq->evict_lock);
-	list_del(&node->list);
+	/*
+	 * We've been called by the MMU notifier but this node has been
+	 * scheduled for eviction. The eviction function will take care
+	 * of freeing this node.
+	 * We have to take the above lock first because we are racing
+	 * against the setting of the bit in the eviction function.
+	 */
+	if (mm && test_bit(SDMA_CACHE_NODE_EVICT, &node->flags)) {
+		spin_unlock(&node->pq->evict_lock);
+		return;
+	}
+
+	if (!list_empty(&node->list))
+		list_del(&node->list);
 	node->pq->n_locked -= node->npages;
 	spin_unlock(&node->pq->evict_lock);

--- a/drivers/staging/rdma/hfi1/verbs.c
+++ b/drivers/staging/rdma/hfi1/verbs.c
@ -545,7 +545,7 @@ static inline int qp_ok(int opcode, struct hfi1_packet *packet)

 	if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
 		goto dropit;
-	if (((opcode & OPCODE_QP_MASK) == packet->qp->allowed_ops) ||
+	if (((opcode & RVT_OPCODE_QP_MASK) == packet->qp->allowed_ops) ||
 	    (opcode == IB_OPCODE_CNP))
 		return 1;
 dropit:
@ -1089,16 +1089,16 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,

 /*
 * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent
- * being an entry from the ingress partition key table), return 0
+ * being an entry from the partition key table), return 0
 * otherwise. Use the matching criteria for egress partition keys
 * specified in the OPAv1 spec., section 9.1l.7.
 */
 static inline int egress_pkey_matches_entry(u16 pkey, u16 ent)
 {
 	u16 mkey = pkey & PKEY_LOW_15_MASK;
-	u16 ment = ent & PKEY_LOW_15_MASK;
+	u16 mentry = ent & PKEY_LOW_15_MASK;

-	if (mkey == ment) {
+	if (mkey == mentry) {
 		/*
 		 * If pkey[15] is set (full partition member),
 		 * is bit 15 in the corresponding table element
@ -1111,32 +1111,32 @@ static inline int egress_pkey_matches_entry(u16 pkey, u16 ent)
 	return 0;
 }

-/*
- * egress_pkey_check - return 0 if hdr's pkey matches according to the
- * criteria in the OPAv1 spec., section 9.11.7.
+/**
+ * egress_pkey_check - check P_KEY of a packet
+ * @ppd:    Physical IB port data
+ * @lrh: Local route header
+ * @bth: Base transport header
+ * @sc5:    SC for packet
+ * @s_pkey_index: It will be used for look up optimization for kernel contexts
+ * only. If it is negative value, then it means user contexts is calling this
+ * function.
+ *
+ * It checks if hdr's pkey is valid.
+ *
+ * Return: 0 on success, otherwise, 1
 */
-static inline int egress_pkey_check(struct hfi1_pportdata *ppd,
-				    struct hfi1_ib_header *hdr,
-				    struct rvt_qp *qp)
+int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth,
+		      u8 sc5, int8_t s_pkey_index)
 {
-	struct hfi1_qp_priv *priv = qp->priv;
-	struct hfi1_other_headers *ohdr;
 	struct hfi1_devdata *dd;
-	int i = 0;
+	int i;
 	u16 pkey;
-	u8 lnh, sc5 = priv->s_sc;
+	int is_user_ctxt_mechanism = (s_pkey_index < 0);

 	if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT))
 		return 0;

-	/* locate the pkey within the headers */
-	lnh = be16_to_cpu(hdr->lrh[0]) & 3;
-	if (lnh == HFI1_LRH_GRH)
-		ohdr = &hdr->u.l.oth;
-	else
-		ohdr = &hdr->u.oth;
-
-	pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+	pkey = (u16)be32_to_cpu(bth[0]);

 	/* If SC15, pkey[0:14] must be 0x7fff */
 	if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
@ -1146,28 +1146,37 @@ static inline int egress_pkey_check(struct hfi1_pportdata *ppd,
 	if ((pkey & PKEY_LOW_15_MASK) == 0)
 		goto bad;

-	/* The most likely matching pkey has index qp->s_pkey_index */
-	if (unlikely(!egress_pkey_matches_entry(pkey,
-						ppd->pkeys
-						[qp->s_pkey_index]))) {
-		/* no match - try the entire table */
-		for (; i < MAX_PKEY_VALUES; i++) {
-			if (egress_pkey_matches_entry(pkey, ppd->pkeys[i]))
-				break;
-		}
+	/*
+	 * For the kernel contexts only, if a qp is passed into the function,
+	 * the most likely matching pkey has index qp->s_pkey_index
+	 */
+	if (!is_user_ctxt_mechanism &&
+	    egress_pkey_matches_entry(pkey, ppd->pkeys[s_pkey_index])) {
+		return 0;
 	}

-	if (i < MAX_PKEY_VALUES)
-		return 0;
+	for (i = 0; i < MAX_PKEY_VALUES; i++) {
+		if (egress_pkey_matches_entry(pkey, ppd->pkeys[i]))
+			return 0;
+	}
 bad:
-	incr_cntr64(&ppd->port_xmit_constraint_errors);
-	dd = ppd->dd;
-	if (!(dd->err_info_xmit_constraint.status & OPA_EI_STATUS_SMASK)) {
-		u16 slid = be16_to_cpu(hdr->lrh[3]);
+	/*
+	 * For the user-context mechanism, the P_KEY check would only happen
+	 * once per SDMA request, not once per packet.  Therefore, there's no
+	 * need to increment the counter for the user-context mechanism.
+	 */
+	if (!is_user_ctxt_mechanism) {
+		incr_cntr64(&ppd->port_xmit_constraint_errors);
+		dd = ppd->dd;
+		if (!(dd->err_info_xmit_constraint.status &
+		      OPA_EI_STATUS_SMASK)) {
+			u16 slid = be16_to_cpu(lrh[3]);

-		dd->err_info_xmit_constraint.status |= OPA_EI_STATUS_SMASK;
-		dd->err_info_xmit_constraint.slid = slid;
-		dd->err_info_xmit_constraint.pkey = pkey;
+			dd->err_info_xmit_constraint.status |=
+				OPA_EI_STATUS_SMASK;
+			dd->err_info_xmit_constraint.slid = slid;
+			dd->err_info_xmit_constraint.pkey = pkey;
+		}
 	}
 	return 1;
 }
@ -1227,11 +1236,26 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 {
 	struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
 	struct hfi1_qp_priv *priv = qp->priv;
+	struct hfi1_other_headers *ohdr;
+	struct hfi1_ib_header *hdr;
 	send_routine sr;
 	int ret;
+	u8 lnh;
+
+	hdr = &ps->s_txreq->phdr.hdr;
+	/* locate the pkey within the headers */
+	lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+	if (lnh == HFI1_LRH_GRH)
+		ohdr = &hdr->u.l.oth;
+	else
+		ohdr = &hdr->u.oth;

 	sr = get_send_routine(qp, ps->s_txreq);
-	ret = egress_pkey_check(dd->pport, &ps->s_txreq->phdr.hdr, qp);
+	ret = egress_pkey_check(dd->pport,
+				hdr->lrh,
+				ohdr->bth,
+				priv->s_sc,
+				qp->s_pkey_index);
 	if (unlikely(ret)) {
 		/*
 		 * The value we are returning here does not get propagated to
--- a/drivers/staging/rdma/hfi1/verbs.h
+++ b/drivers/staging/rdma/hfi1/verbs.h
@ -215,6 +215,7 @@ struct hfi1_pkt_state {
 	struct hfi1_ibport *ibp;
 	struct hfi1_pportdata *ppd;
 	struct verbs_txreq *s_txreq;
+	unsigned long flags;
 };

 #define HFI1_PSN_CREDIT  16
@ -334,9 +335,6 @@ int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
 #endif
 #define PSN_MODIFY_MASK 0xFFFFFF

-/* Number of bits to pay attention to in the opcode for checking qp type */
-#define OPCODE_QP_MASK 0xE0
-
 /*
 * Compare the lower 24 bits of the msn values.
 * Returns an integer <, ==, or > than zero.
--- a/include/rdma/rdma_vt.h
+++ b/include/rdma/rdma_vt.h
@ -467,6 +467,7 @@ static inline struct rvt_qp *rvt_lookup_qpn(struct rvt_dev_info *rdi,
 }

 struct rvt_dev_info *rvt_alloc_device(size_t size, int nports);
+void rvt_dealloc_device(struct rvt_dev_info *rdi);
 int rvt_register_device(struct rvt_dev_info *rvd);
 void rvt_unregister_device(struct rvt_dev_info *rvd);
 int rvt_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr);
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@ -117,8 +117,9 @@
 /*
 * Wait flags that would prevent any packet type from being sent.
 */
-#define RVT_S_ANY_WAIT_IO (RVT_S_WAIT_PIO | RVT_S_WAIT_TX | \
-	RVT_S_WAIT_DMA_DESC | RVT_S_WAIT_KMEM)
+#define RVT_S_ANY_WAIT_IO \
+	(RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN | RVT_S_WAIT_TX | \
+	 RVT_S_WAIT_DMA_DESC | RVT_S_WAIT_KMEM)

 /*
 * Wait flags that would prevent send work requests from making progress.