diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 506b5a59ded5..56c8c10b5a85 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -6,11 +6,27 @@ #include "hfi.h" #include "qp.h" +#include "rc.h" #include "verbs.h" #include "tid_rdma.h" #include "exp_rcv.h" #include "trace.h" +/** + * DOC: TID RDMA READ protocol + * + * This is an end-to-end protocol at the hfi1 level between two nodes that + * improves performance by avoiding data copy on the requester side. It + * converts a qualified RDMA READ request into a TID RDMA READ request on + * the requester side and thereafter handles the request and response + * differently. To be qualified, the RDMA READ request should meet the + * following: + * -- The total data length should be greater than 256K; + * -- The total data length should be a multiple of 4K page size; + * -- Each local scatter-gather entry should be 4K page aligned; + * -- Each local scatter-gather entry should be a multiple of 4K page size; + */ + #define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32) #define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK BIT_ULL(33) #define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK BIT_ULL(34) @@ -18,6 +34,9 @@ #define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK BIT_ULL(37) #define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK BIT_ULL(38) +/* Maximum number of packets within a flow generation. 
*/ +#define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT) + #define GENERATION_MASK 0xFFFFF static u32 mask_generation(u32 a) @@ -45,6 +64,9 @@ static u32 mask_generation(u32 a) #define MAX_EXPECTED_PAGES (MAX_EXPECTED_BUFFER / PAGE_SIZE) +#define TID_RDMA_DESTQP_FLOW_SHIFT 11 +#define TID_RDMA_DESTQP_FLOW_MASK 0x1f + #define TID_OPFN_QP_CTXT_MASK 0xff #define TID_OPFN_QP_CTXT_SHIFT 56 #define TID_OPFN_QP_KDETH_MASK 0xff @@ -1597,3 +1619,181 @@ u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry, return dd->verbs_dev.n_tidwait; } + +/* TID RDMA READ functions */ +u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, u32 *bth1, + u32 *bth2, u32 *len) +{ + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct tid_rdma_flow *flow = &req->flows[req->flow_idx]; + struct rvt_qp *qp = req->qp; + struct hfi1_qp_priv *qpriv = qp->priv; + struct hfi1_swqe_priv *wpriv = wqe->priv; + struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req; + struct tid_rdma_params *remote; + u32 req_len = 0; + void *req_addr = NULL; + + /* This is the IB psn used to send the request */ + *bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt); + + /* TID Entries for TID RDMA READ payload */ + req_addr = &flow->tid_entry[flow->tid_idx]; + req_len = sizeof(*flow->tid_entry) * + (flow->tidcnt - flow->tid_idx); + + memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req)); + wpriv->ss.sge.vaddr = req_addr; + wpriv->ss.sge.sge_length = req_len; + wpriv->ss.sge.length = wpriv->ss.sge.sge_length; + /* + * We can safely zero these out. Since the first SGE covers the + * entire packet, nothing else should even look at the MR. 
+ */ + wpriv->ss.sge.mr = NULL; + wpriv->ss.sge.m = 0; + wpriv->ss.sge.n = 0; + + wpriv->ss.sg_list = NULL; + wpriv->ss.total_len = wpriv->ss.sge.sge_length; + wpriv->ss.num_sge = 1; + + /* Construct the TID RDMA READ REQ packet header */ + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + + KDETH_RESET(rreq->kdeth0, KVER, 0x1); + KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey); + rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr + + req->cur_seg * req->seg_len + flow->sent); + rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey); + rreq->reth.length = cpu_to_be32(*len); + rreq->tid_flow_psn = + cpu_to_be32((flow->flow_state.generation << + HFI1_KDETH_BTH_SEQ_SHIFT) | + ((flow->flow_state.spsn + flow->pkt) & + HFI1_KDETH_BTH_SEQ_MASK)); + rreq->tid_flow_qp = + cpu_to_be32(qpriv->tid_rdma.local.qp | + ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << + TID_RDMA_DESTQP_FLOW_SHIFT) | + qpriv->rcd->ctxt); + rreq->verbs_qp = cpu_to_be32(qp->remote_qpn); + *bth1 &= ~RVT_QPN_MASK; + *bth1 |= remote->qp; + *bth2 |= IB_BTH_REQ_ACK; + rcu_read_unlock(); + + /* We are done with this segment */ + flow->sent += *len; + req->cur_seg++; + qp->s_state = TID_OP(READ_REQ); + req->ack_pending++; + req->flow_idx = (req->flow_idx + 1) & (MAX_FLOWS - 1); + qpriv->pending_tid_r_segs++; + qp->s_num_rd_atomic++; + + /* Set the TID RDMA READ request payload size */ + *len = req_len; + + return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32); +} + +/* + * @len: contains the data length to read upon entry and the read request + * payload length upon exit. 
+ */ +u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, u32 *bth1, + u32 *bth2, u32 *len) + __must_hold(&qp->s_lock) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct tid_rdma_flow *flow = NULL; + u32 hdwords = 0; + bool last; + bool retry = true; + u32 npkts = rvt_div_round_up_mtu(qp, *len); + + /* + * Check sync conditions. Make sure that there are no pending + * segments before freeing the flow. + */ +sync_check: + if (req->state == TID_REQUEST_SYNC) { + if (qpriv->pending_tid_r_segs) + goto done; + + hfi1_kern_clear_hw_flow(req->rcd, qp); + req->state = TID_REQUEST_ACTIVE; + } + + /* + * If the request for this segment is resent, the tid resources should + * have been allocated before. In this case, req->flow_idx should + * fall behind req->setup_head. + */ + if (req->flow_idx == req->setup_head) { + retry = false; + if (req->state == TID_REQUEST_RESEND) { + /* + * This is the first new segment for a request whose + * earlier segments have been re-sent. We need to + * set up the sge pointer correctly. + */ + restart_sge(&qp->s_sge, wqe, req->s_next_psn, + qp->pmtu); + req->isge = 0; + req->state = TID_REQUEST_ACTIVE; + } + + /* + * Check sync. The last PSN of each generation is reserved for + * RESYNC. + */ + if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) { + req->state = TID_REQUEST_SYNC; + goto sync_check; + } + + /* Allocate the flow if not yet */ + if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp)) + goto done; + + /* + * The following call will advance req->setup_head after + * allocating the tid entries. + */ + if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) { + req->state = TID_REQUEST_QUEUED; + + /* + * We don't have resources for this segment. The QP has + * already been queued. 
+ */ + goto done; + } + } + + /* req->flow_idx should only be one slot behind req->setup_head */ + flow = &req->flows[req->flow_idx]; + flow->pkt = 0; + flow->tid_idx = 0; + flow->sent = 0; + if (!retry) { + /* Set the first and last IB PSN for the flow in use.*/ + flow->flow_state.ib_spsn = req->s_next_psn; + flow->flow_state.ib_lpsn = + flow->flow_state.ib_spsn + flow->npkts - 1; + } + + /* Calculate the next segment start psn.*/ + req->s_next_psn += flow->npkts; + + /* Build the packet header */ + hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len); +done: + return hdwords; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 3dbeaa8cb5b3..f692f3ff9419 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -45,6 +45,19 @@ struct tid_flow_state { u8 flags; }; +enum tid_rdma_req_state { + TID_REQUEST_INACTIVE = 0, + TID_REQUEST_INIT, + TID_REQUEST_INIT_RESEND, + TID_REQUEST_ACTIVE, + TID_REQUEST_RESEND, + TID_REQUEST_RESEND_ACTIVE, + TID_REQUEST_QUEUED, + TID_REQUEST_SYNC, + TID_REQUEST_RNR_NAK, + TID_REQUEST_COMPLETE, +}; + struct tid_rdma_request { struct rvt_qp *qp; struct hfi1_ctxtdata *rcd; @@ -60,8 +73,13 @@ struct tid_rdma_request { u16 flow_idx; /* flow index most recently set up */ u32 seg_len; + u32 s_next_psn; /* IB PSN of next segment start for read */ + u32 cur_seg; /* index of current segment */ u32 isge; /* index of "current" sge */ + u32 ack_pending; /* num acks pending for this request */ + + enum tid_rdma_req_state state; }; /* @@ -77,6 +95,10 @@ struct flow_state { u32 spsn; /* starting PSN in TID space */ u32 lpsn; /* last PSN in TID space */ u32 r_next_psn; /* next PSN to be received (in TID space) */ + + /* For tid rdma read */ + u32 ib_spsn; /* starting PSN in Verbs space */ + u32 ib_lpsn; /* last PSn in Verbs space */ }; struct tid_rdma_pageset { @@ -110,11 +132,14 @@ struct tid_rdma_flow { struct flow_state flow_state; struct 
tid_rdma_request *req; u32 length; + u32 sent; u8 tnode_cnt; u8 tidcnt; + u8 tid_idx; u8 idx; u8 npagesets; u8 npkts; + u8 pkt; struct kern_tid_node tnode[TID_RDMA_MAX_PAGES]; struct tid_rdma_pageset pagesets[TID_RDMA_MAX_PAGES]; u32 tid_entry[TID_RDMA_MAX_PAGES]; @@ -159,4 +184,11 @@ struct cntr_entry; u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry, void *context, int vl, int mode, u64 data); +u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u32 *bth1, u32 *bth2, u32 *len); +u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, u32 *bth1, + u32 *bth2, u32 *len); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 20729454f181..2965b0957855 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -170,12 +170,16 @@ struct hfi1_qp_priv { struct rvt_qp *owner; u8 hdr_type; /* 9B or 16B */ unsigned long tid_timer_timeout_jiffies; + + /* For TID RDMA READ */ + u32 pending_tid_r_segs; /* Num of pending tid read segments */ u16 pkts_ps; /* packets per segment */ u8 timeout_shift; /* account for number of packets per segment */ }; struct hfi1_swqe_priv { struct tid_rdma_request tid_req; + struct rvt_sge_state ss; /* Used for TID RDMA READ Request */ }; struct hfi1_ack_priv { @@ -331,6 +335,11 @@ static inline u32 delta_psn(u32 a, u32 b) return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT; } +static inline struct tid_rdma_request *wqe_to_tid_req(struct rvt_swqe *wqe) +{ + return &((struct hfi1_swqe_priv *)wqe->priv)->tid_req; +} + /* * Look through all the active flows for a TID RDMA request and find * the one (if it exists) that contains the specified PSN. 
diff --git a/include/rdma/ib_hdrs.h b/include/rdma/ib_hdrs.h index 6e35416170a3..58a0a0f99e7f 100644 --- a/include/rdma/ib_hdrs.h +++ b/include/rdma/ib_hdrs.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -100,6 +100,8 @@ struct ib_atomic_eth { __be64 compare_data; /* potentially unaligned */ } __packed; +#include <rdma/tid_rdma_defs.h> + union ib_ehdrs { struct { __be32 deth[2]; @@ -117,6 +119,11 @@ union ib_ehdrs { __be32 aeth; __be32 ieth; struct ib_atomic_eth atomic_eth; + /* TID RDMA headers */ + union { + struct tid_rdma_read_req r_req; + struct tid_rdma_read_resp r_rsp; + } tid_rdma; } __packed; struct ib_other_headers { diff --git a/include/rdma/tid_rdma_defs.h b/include/rdma/tid_rdma_defs.h new file mode 100644 index 000000000000..1c431ea32b52 --- /dev/null +++ b/include/rdma/tid_rdma_defs.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation. + * + */ + +#ifndef TID_RDMA_DEFS_H +#define TID_RDMA_DEFS_H + +#include <rdma/ib_pack.h> + +struct tid_rdma_read_req { + __le32 kdeth0; + __le32 kdeth1; + struct ib_reth reth; + __be32 tid_flow_psn; + __be32 tid_flow_qp; + __be32 verbs_qp; +}; + +struct tid_rdma_read_resp { + __le32 kdeth0; + __le32 kdeth1; + __be32 aeth; + __be32 reserved[4]; + __be32 verbs_psn; + __be32 verbs_qp; +}; + +/* + * TID RDMA Opcodes + */ +#define IB_OPCODE_TID_RDMA 0xe0 +enum { + IB_OPCODE_READ_REQ = 0x4, + IB_OPCODE_READ_RESP = 0x5, + + IB_OPCODE(TID_RDMA, READ_REQ), + IB_OPCODE(TID_RDMA, READ_RESP), +}; + +#define TID_OP(x) IB_OPCODE_TID_RDMA_##x + +/* + * Define TID RDMA specific WR opcodes. The ib_wr_opcode + * enum already provides some reserved values for use by + * low level drivers. Two of those are used but renamed + * to be more descriptive.
+ */ +#define IB_WR_TID_RDMA_READ IB_WR_RESERVED2 + +#endif /* TID_RDMA_DEFS_H */