linux/drivers/infiniband/hw/hfi1/driver.c

1813 lines
48 KiB
C
Raw Normal View History

/*
IB/hfi1: Rework fault injection machinery The packet fault injection code present in the HFI1 driver had some issues which not only fragment the code but also created user confusion. Furthermore, it suffered from the following issues: 1. The fault_packet method only worked for received packets. This meant that the only fault injection mode available for sent packets is fault_opcode, which did not allow for random packet drops on all egressing packets. 2. The mask available for the fault_opcode mode did not really work due to the fact that the opcode values are not bits in a bitmask but rather sequential integer values. Creating a opcode/mask pair that would successfully capture a set of packets was nearly impossible. 3. The code was fragmented and used too many debugfs entries to operate and control. This was confusing to users. 4. It did not allow filtering fault injection on a per direction basis - egress vs. ingress. In order to improve or fix the above issues, the following changes have been made: 1. The fault injection methods have been combined into a single fault injection facility. As such, the fault injection has been plugged into both the send and receive code paths. Regardless of method used the fault injection will operate on both egress and ingress packets. 2. The type of fault injection - by packet or by opcode - is now controlled by changing the boolean value of the file "opcode_mode". When the value is set to True, fault injection is done by opcode. Otherwise, by packet. 2. The masking ability has been removed in favor of a bitmap that holds opcodes of interest (one bit per opcode, a total of 256 bits). This works in tandem with the "opcode_mode" value. When the value of "opcode_mode" is False, this bitmap is ignored. When the value is True, the bitmap lists all opcodes to be considered for fault injection. By default, the bitmap is empty. 
When the user wants to filter by opcode, the user sets the corresponding bit in the bitmap by echo'ing the bit position into the 'opcodes' file. This gets around the issue that the set of opcodes does not lend itself to effective masks and allow for extremely fine-grained filtering by opcode. 4. fault_packet and fault_opcode methods have been combined. Hence, there is only one debugfs directory controlling the entire operation of the fault injection machinery. This reduces the number of debugfs entries and provides a more unified user experience. 5. A new control files - "direction" - is provided to allow the user to control the direction of packets, which are subject to fault injection. 6. A new control file - "skip_usec" - is added that would allow the user to specify a "timeout" during which no fault injection will occur. In addition, the following bug fixes have been applied: 1. The fault injection code has been split into its own header and source files. This was done to better organize the code and support conditional compilation without littering the code with #ifdef's. 2. The method by which the TX PIO packets were being marked for drop conflicted with the way send contexts were being setup. As a result, the send context was repeatedly being reset. 3. The fault injection only makes sense when the user can control it through the debugfs entries. However, a kernel configuration can enable fault injection but keep fault injection debugfs entries disabled. Therefore, it makes sense that the HFI fault injection code depends on both. 4. Error suppression did not take into account the method by which PIO packets were being dropped. Therefore, even with error suppression turned on, errors would still be displayed to the screen. A larger enough packet drop percentage would case the kernel to crash because the driver would be stuck printing errors. 
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Reviewed-by: Don Hiatt <don.hiatt@intel.com> Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-02 21:43:24 +08:00
* Copyright(c) 2015-2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* BSD LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* - Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/io.h>
#include <linux/delay.h>
#include <linux/netdevice.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/prefetch.h>
#include <rdma/ib_verbs.h>
#include "hfi.h"
#include "trace.h"
#include "qp.h"
#include "sdma.h"
#include "debugfs.h"
#include "vnic.h"
IB/hfi1: Rework fault injection machinery The packet fault injection code present in the HFI1 driver had some issues which not only fragment the code but also created user confusion. Furthermore, it suffered from the following issues: 1. The fault_packet method only worked for received packets. This meant that the only fault injection mode available for sent packets is fault_opcode, which did not allow for random packet drops on all egressing packets. 2. The mask available for the fault_opcode mode did not really work due to the fact that the opcode values are not bits in a bitmask but rather sequential integer values. Creating a opcode/mask pair that would successfully capture a set of packets was nearly impossible. 3. The code was fragmented and used too many debugfs entries to operate and control. This was confusing to users. 4. It did not allow filtering fault injection on a per direction basis - egress vs. ingress. In order to improve or fix the above issues, the following changes have been made: 1. The fault injection methods have been combined into a single fault injection facility. As such, the fault injection has been plugged into both the send and receive code paths. Regardless of method used the fault injection will operate on both egress and ingress packets. 2. The type of fault injection - by packet or by opcode - is now controlled by changing the boolean value of the file "opcode_mode". When the value is set to True, fault injection is done by opcode. Otherwise, by packet. 2. The masking ability has been removed in favor of a bitmap that holds opcodes of interest (one bit per opcode, a total of 256 bits). This works in tandem with the "opcode_mode" value. When the value of "opcode_mode" is False, this bitmap is ignored. When the value is True, the bitmap lists all opcodes to be considered for fault injection. By default, the bitmap is empty. 
When the user wants to filter by opcode, the user sets the corresponding bit in the bitmap by echo'ing the bit position into the 'opcodes' file. This gets around the issue that the set of opcodes does not lend itself to effective masks and allow for extremely fine-grained filtering by opcode. 4. fault_packet and fault_opcode methods have been combined. Hence, there is only one debugfs directory controlling the entire operation of the fault injection machinery. This reduces the number of debugfs entries and provides a more unified user experience. 5. A new control files - "direction" - is provided to allow the user to control the direction of packets, which are subject to fault injection. 6. A new control file - "skip_usec" - is added that would allow the user to specify a "timeout" during which no fault injection will occur. In addition, the following bug fixes have been applied: 1. The fault injection code has been split into its own header and source files. This was done to better organize the code and support conditional compilation without littering the code with #ifdef's. 2. The method by which the TX PIO packets were being marked for drop conflicted with the way send contexts were being setup. As a result, the send context was repeatedly being reset. 3. The fault injection only makes sense when the user can control it through the debugfs entries. However, a kernel configuration can enable fault injection but keep fault injection debugfs entries disabled. Therefore, it makes sense that the HFI fault injection code depends on both. 4. Error suppression did not take into account the method by which PIO packets were being dropped. Therefore, even with error suppression turned on, errors would still be displayed to the screen. A larger enough packet drop percentage would case the kernel to crash because the driver would be stuck printing errors. 
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Reviewed-by: Don Hiatt <don.hiatt@intel.com> Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-02 21:43:24 +08:00
#include "fault.h"
/* Prefix every pr_*() message issued from this file with the driver name. */
#undef pr_fmt
#define pr_fmt(fmt) DRIVER_NAME ": " fmt
/*
 * Driver version string.
 *
 * The size has to be longer than this string, so we can append
 * board/chip information to it in the initialization code.
 *
 * NOTE(review): the array is declared const and sized by its initializer,
 * so the statement above about appending looks stale - confirm.
 */
const char ib_hfi1_version[] = HFI1_DRIVER_VERSION "\n";
DEFINE_MUTEX(hfi1_mutex); /* general driver use */
/* "max_mtu" module parameter; read-only through sysfs (S_IRUGO). */
unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
module_param_named(max_mtu, hfi1_max_mtu, uint, S_IRUGO);
MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is " __stringify(
HFI1_DEFAULT_MAX_MTU));
/* "cu" module parameter (credit return units); read-only through sysfs. */
unsigned int hfi1_cu = 1;
module_param_named(cu, hfi1_cu, uint, S_IRUGO);
MODULE_PARM_DESC(cu, "Credit return units");
/*
 * "cap_mask" module parameter.  It is root-writable (S_IWUSR), and reads
 * and writes are routed through hfi1_caps_get()/hfi1_caps_set() rather
 * than the generic integer handlers.
 */
unsigned long hfi1_cap_mask = HFI1_CAP_MASK_DEFAULT;
static int hfi1_caps_set(const char *val, const struct kernel_param *kp);
static int hfi1_caps_get(char *buffer, const struct kernel_param *kp);
static const struct kernel_param_ops cap_ops = {
.set = hfi1_caps_set,
.get = hfi1_caps_get
};
module_param_cb(cap_mask, &cap_ops, &hfi1_cap_mask, S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(cap_mask, "Bit mask of enabled/disabled HW features");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("Intel Omni-Path Architecture driver");
/*
 * MAX_PKT_RECV is the max # of packets processed per receive interrupt.
 * check_max_packet() tests it as a mask (numpkt & (MAX_PKT_RECV - 1)),
 * so the value must stay a power of two.
 */
#define MAX_PKT_RECV 64
/*
 * MAX_PKT_RECV_THREAD is the max # of packets processed before
 * the qp_wait_list queue is flushed.  max_packet_exceeded() also tests
 * it as a mask, so it must stay a power of two as well.
 */
#define MAX_PKT_RECV_THREAD (MAX_PKT_RECV * 4)
#define EGR_HEAD_UPDATE_THRESHOLD 16
/* Driver-wide statistics object (non-static, so visible to other files). */
struct hfi1_ib_stats hfi1_stats;
/*
 * hfi1_caps_set - "set" handler for the cap_mask module parameter
 * @val: textual value written by the user (any base accepted by kstrtoul)
 * @kp:  kernel parameter descriptor; kp->arg points at the stored mask
 *
 * Merges the requested value into the stored capability mask.  Bits that
 * are not writable are ignored (with a warning) once HFI1_CAP_LOCKED()
 * reports true, reserved bits are never changed, and any user-side bit in
 * the HFI1_CAP_MUST_HAVE_KERN group that disagrees with its kernel-side
 * counterpart is cleared.
 *
 * Returns 0 on success or the kstrtoul() error when @val cannot be parsed.
 */
static int hfi1_caps_set(const char *val, const struct kernel_param *kp)
{
	unsigned long *stored = (unsigned long *)kp->arg;
	unsigned long caps = *stored;
	unsigned long writable =
		((HFI1_CAP_WRITABLE_MASK << HFI1_CAP_USER_SHIFT) |
		 HFI1_CAP_WRITABLE_MASK);
	unsigned long requested, changed, mismatch;
	int err;

	err = kstrtoul(val, 0, &requested);
	if (err) {
		pr_warn("Invalid module parameter value for 'cap_mask'\n");
		return err;
	}

	/* Bits that differ from the stored value, locked bit excluded. */
	changed = requested ^ (caps & ~HFI1_CAP_LOCKED_SMASK);

	/* After driver load only the writable bits may still change. */
	if (HFI1_CAP_LOCKED() && (changed & ~writable)) {
		pr_warn("Ignoring non-writable capability bits %#lx\n",
			changed & ~writable);
		changed &= writable;
	}

	/* Reserved bits are never touched. */
	changed &= ~HFI1_CAP_RESERVED_MASK;

	/* Replace every changing bit with the requested state. */
	caps = (caps & ~changed) | (requested & changed);

	/* Drop user bits that disagree with their kernel counterpart. */
	mismatch = (caps & (HFI1_CAP_MUST_HAVE_KERN << HFI1_CAP_USER_SHIFT)) ^
		   ((caps & HFI1_CAP_MUST_HAVE_KERN) << HFI1_CAP_USER_SHIFT);
	caps &= ~mismatch;

	*stored = caps;
	return 0;
}
/*
 * hfi1_caps_get - "get" handler for the cap_mask module parameter
 * @buffer: output buffer (formatted with a PAGE_SIZE bound)
 * @kp:     kernel parameter descriptor; kp->arg points at the stored mask
 *
 * Reports the stored mask with the locked bit hidden and with the
 * HFI1_CAP_K2U kernel bits mirrored into the user half of the mask.
 *
 * Returns the value returned by scnprintf().
 */
static int hfi1_caps_get(char *buffer, const struct kernel_param *kp)
{
	unsigned long shown = *(unsigned long *)kp->arg;

	shown &= ~HFI1_CAP_LOCKED_SMASK;
	shown |= ((shown & HFI1_CAP_K2U) << HFI1_CAP_USER_SHIFT);

	return scnprintf(buffer, PAGE_SIZE, "0x%lx", shown);
}
/*
 * get_pci_dev - map an rdmavt device info structure to its PCI device
 * @rdi: rvt_dev_info embedded in a struct hfi1_ibdev, which in turn is
 *       embedded as verbs_dev in a struct hfi1_devdata
 *
 * Returns the pcidev pointer stored in the enclosing hfi1_devdata.
 */
struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi)
{
	struct hfi1_ibdev *verbs = container_of(rdi, struct hfi1_ibdev, rdi);

	return container_of(verbs, struct hfi1_devdata, verbs_dev)->pcidev;
}
/*
 * Return count of units with at least one port ACTIVE.
 *
 * A unit is only considered when it is flagged HFI1_PRESENT and has
 * kregbase1 set.  A port counts as active when it has a non-zero LID and
 * linkup is set.  Each unit contributes at most one to the total.  The
 * device table is walked under its xarray lock with interrupts disabled.
 */
int hfi1_count_active_units(void)
{
	struct hfi1_devdata *dd;
	struct hfi1_pportdata *ppd;
	unsigned long index, flags;
	int pidx, nunits_active = 0;

	xa_lock_irqsave(&hfi1_dev_table, flags);
	xa_for_each(&hfi1_dev_table, index, dd) {
		if (!(dd->flags & HFI1_PRESENT) || !dd->kregbase1)
			continue;
		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
			ppd = dd->pport + pidx;
			if (ppd->lid && ppd->linkup) {
				/* one active port is enough for this unit */
				nunits_active++;
				break;
			}
		}
	}
	xa_unlock_irqrestore(&hfi1_dev_table, flags);
	return nunits_active;
}
/*
 * Get address of eager buffer from its index (allocated in chunks, not
 * contiguous).
 *
 * The eager index and the offset within that buffer are both decoded from
 * @rhf.  *@update is set (never cleared) when the offset is zero and the
 * index has none of the (egrbufs.threshold - 1) bits set.
 * NOTE(review): that mask test presumably relies on threshold being a
 * power of two - confirm.
 */
static inline void *get_egrbuf(const struct hfi1_ctxtdata *rcd, u64 rhf,
			       u8 *update)
{
	u32 idx = rhf_egr_index(rhf);
	u32 offset = rhf_egr_buf_offset(rhf);
	u64 base = (u64)(rcd->egrbufs.rcvtids[idx].addr);

	if (!(idx & (rcd->egrbufs.threshold - 1)) && !offset)
		*update |= 1;

	/* the offset is expressed in RCV_BUF_BLOCK_SIZE units */
	return (void *)(base + (offset * RCV_BUF_BLOCK_SIZE));
}
/*
 * Locate the packet header inside a receive header queue entry.
 *
 * @rhf_addr points at the RHF of the entry.  Stepping back by
 * rcd->rhf_offset and forward by the header offset encoded in the RHF
 * gives the header address.  Both steps are in units of __le32.
 */
static inline void *hfi1_get_header(struct hfi1_ctxtdata *rcd,
				    __le32 *rhf_addr)
{
	u32 hdr_off = rhf_hdrq_offset(rhf_to_cpu(rhf_addr));
	__le32 *hdr = rhf_addr - rcd->rhf_offset + hdr_off;

	return (void *)hdr;
}
/* Same lookup as hfi1_get_header(), typed as a 9B struct ib_header. */
static inline struct ib_header *hfi1_get_msgheader(struct hfi1_ctxtdata *rcd,
						   __le32 *rhf_addr)
{
	void *raw = hfi1_get_header(rcd, rhf_addr);

	return (struct ib_header *)raw;
}
/* Same lookup as hfi1_get_header(), typed as a struct hfi1_16b_header. */
static inline struct hfi1_16b_header
*hfi1_get_16B_header(struct hfi1_ctxtdata *rcd,
		     __le32 *rhf_addr)
{
	void *raw = hfi1_get_header(rcd, rhf_addr);

	return (struct hfi1_16b_header *)raw;
}
/*
 * Validate and optionally encode a given RcvArray buffer size.
 *
 * @size:    buffer size in bytes
 * @type:    PT_EAGER selects the MAX_EAGER_BUFFER limit; any other value
 *           selects MAX_EXPECTED_BUFFER
 * @encoded: when non-NULL, receives ilog2(size / PAGE_SIZE) + 1
 *
 * Returns 1 when @size is page aligned, at least MIN_EAGER_BUFFER and not
 * above the limit for @type; returns 0 otherwise (and leaves *@encoded
 * untouched).
 */
int hfi1_rcvbuf_validate(u32 size, u8 type, u16 *encoded)
{
	if (unlikely(!PAGE_ALIGNED(size)) || unlikely(size < MIN_EAGER_BUFFER))
		return 0;

	if (type == PT_EAGER) {
		if (size > MAX_EAGER_BUFFER)
			return 0;
	} else if (size > MAX_EXPECTED_BUFFER) {
		return 0;
	}

	if (encoded)
		*encoded = ilog2(size / PAGE_SIZE) + 1;

	return 1;
}
/*
 * rcv_hdrerr - handle a received packet whose RHF carries error flags
 * @rcd:    receive context the packet arrived on
 * @ppd:    port data; supplies the device and is passed to process_becn()
 * @packet: packet being processed; ohdr/grh are filled in here and the
 *          receive-type error bits in packet->rhf may be cleared
 *
 * Nothing is done for DC errors that hfi1_dbg_fault_suppress_err() asks to
 * suppress, for ICRC errors, for bypass packets, or for headers whose LNH
 * is neither BTH nor GRH.  For a TID error on a unicast DLID the
 * destination QP is looked up and, for RC QPs only, hfi1_rc_hdrerr() is
 * called under the QP's r_lock.  For an opcode receive-type error a CNP
 * opcode is turned into a process_becn() call, and the receive-type error
 * bits are cleared from packet->rhf.
 */
static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd,
struct hfi1_packet *packet)
{
struct ib_header *rhdr = packet->hdr;
u32 rte = rhf_rcv_type_err(packet->rhf);
u32 mlid_base;
struct hfi1_ibport *ibp = rcd_to_iport(rcd);
struct hfi1_devdata *dd = ppd->dd;
struct hfi1_ibdev *verbs_dev = &dd->verbs_dev;
struct rvt_dev_info *rdi = &verbs_dev->rdi;
/* DC error that the fault-injection debug hook wants kept quiet */
if ((packet->rhf & RHF_DC_ERR) &&
hfi1_dbg_fault_suppress_err(verbs_dev))
return;
if (packet->rhf & RHF_ICRC_ERR)
return;
if (packet->etype == RHF_RCV_TYPE_BYPASS) {
/* bypass packets are not examined any further here */
goto drop;
} else {
u8 lnh = ib_get_lnh(rhdr);
mlid_base = be16_to_cpu(IB_MULTICAST_LID_BASE);
/* locate the transport header according to LNH */
if (lnh == HFI1_LRH_BTH) {
packet->ohdr = &rhdr->u.oth;
} else if (lnh == HFI1_LRH_GRH) {
packet->ohdr = &rhdr->u.l.oth;
packet->grh = &rhdr->u.l.grh;
} else {
goto drop;
}
}
if (packet->rhf & RHF_TID_ERR) {
/* For TIDERR and RC QPs preemptively schedule a NAK */
u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */
u32 dlid = ib_get_dlid(rhdr);
u32 qp_num;
/* Sanity check packet */
if (tlen < 24)
goto drop;
/* Check for GRH */
if (packet->grh) {
u32 vtf;
struct ib_grh *grh = packet->grh;
if (grh->next_hdr != IB_GRH_NEXT_HDR)
goto drop;
vtf = be32_to_cpu(grh->version_tclass_flow);
if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
goto drop;
}
/* Get the destination QP number. */
qp_num = ib_bth_get_qpn(packet->ohdr);
/* only DLIDs below the multicast base are handled */
if (dlid < mlid_base) {
struct rvt_qp *qp;
unsigned long flags;
/* the QP lookup and every use of qp stay inside the RCU section */
rcu_read_lock();
qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
if (!qp) {
rcu_read_unlock();
goto drop;
}
/*
* Handle only RC QPs - for other QP types drop error
* packet.
*/
spin_lock_irqsave(&qp->r_lock, flags);
/* Check for valid receive state. */
if (!(ib_rvt_state_ops[qp->state] &
RVT_PROCESS_RECV_OK)) {
/* counted as a drop; processing still continues below */
ibp->rvp.n_pkt_drops++;
}
switch (qp->ibqp.qp_type) {
case IB_QPT_RC:
hfi1_rc_hdrerr(rcd, packet, qp);
break;
default:
/* For now don't handle any other QP types */
break;
}
spin_unlock_irqrestore(&qp->r_lock, flags);
rcu_read_unlock();
} /* Unicast QP */
} /* Valid packet with TIDErr */
/* handle "RcvTypeErr" flags */
switch (rte) {
case RHF_RTE_ERROR_OP_CODE_ERR:
{
void *ebuf = NULL;
u8 opcode;
if (rhf_use_egr_bfr(packet->rhf))
ebuf = packet->ebuf;
if (!ebuf)
goto drop; /* this should never happen */
opcode = ib_bth_get_opcode(packet->ohdr);
if (opcode == IB_OPCODE_CNP) {
/*
* Only in pre-B0 h/w is the CNP_OPCODE handled
* via this code path.
*/
struct rvt_qp *qp = NULL;
u32 lqpn, rqpn;
u16 rlid;
u8 svc_type, sl, sc5;
sc5 = hfi1_9B_get_sc5(rhdr, packet->rhf);
sl = ibp->sc_to_sl[sc5];
lqpn = ib_bth_get_qpn(packet->ohdr);
rcu_read_lock();
qp = rvt_lookup_qpn(rdi, &ibp->rvp, lqpn);
if (!qp) {
rcu_read_unlock();
goto drop;
}
/* only UD and UC QPs are handled; anything else is dropped */
switch (qp->ibqp.qp_type) {
case IB_QPT_UD:
rlid = 0;
rqpn = 0;
svc_type = IB_CC_SVCTYPE_UD;
break;
case IB_QPT_UC:
rlid = ib_get_slid(rhdr);
rqpn = qp->remote_qpn;
svc_type = IB_CC_SVCTYPE_UC;
break;
default:
rcu_read_unlock();
goto drop;
}
process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
rcu_read_unlock();
}
/* clear the receive-type error bits now that the error was handled */
packet->rhf &= ~RHF_RCV_TYPE_ERR_SMASK;
break;
}
default:
break;
}
drop:
return;
}
/*
 * init_packet - prepare @packet for one pass over @rcd's header queue
 *
 * Resets the per-pass counters, records the queue geometry and loads the
 * RHF of the entry at the context's current head.
 */
static inline void init_packet(struct hfi1_ctxtdata *rcd,
			       struct hfi1_packet *packet)
{
	/* per-pass bookkeeping starts from scratch */
	packet->rcd = rcd;
	packet->numpkt = 0;
	packet->updegr = 0;
	packet->etail = -1;

	/* queue geometry, both values in words */
	packet->rsize = rcd->rcvhdrqentsize;
	packet->maxcnt = rcd->rcvhdrq_cnt * packet->rsize;

	/* begin at the current head and fetch that entry's RHF */
	packet->rhqoff = rcd->head;
	packet->rhf_addr = get_rhf_addr(rcd);
	packet->rhf = rhf_to_cpu(packet->rhf_addr);
}
/*
 * We support only two types - 9B and 16B for now.
 *
 * CNP handlers indexed by packet header type; hfi1_process_ecn_slowpath()
 * selects the entry using HFI1_PKT_TYPE_9B or HFI1_PKT_TYPE_16B.
 */
static const hfi1_handle_cnp hfi1_handle_cnp_tbl[2] = {
[HFI1_PKT_TYPE_9B] = &return_cnp,
[HFI1_PKT_TYPE_16B] = &return_cnp_16B
};
/**
 * hfi1_process_ecn_slowpath - Process FECN or BECN bits
 * @qp: The packet's destination QP
 * @pkt: The packet itself.
 * @prescan: Is the caller the RXQ prescan
 *
 * Process the packet's FECN or BECN bits. By now, the packet
 * has already been evaluated whether processing of those bits should
 * be done.
 * The significance of the @prescan argument is that if the caller
 * is the RXQ prescan, a CNP will be sent out instead of waiting for the
 * normal packet processing to send an ACK with BECN set (or a CNP).
 *
 * Return: true when the packet carried a FECN that was not ignored
 * (FECN is ignored for multicast DLIDs and for CNP and RC ACKNOWLEDGE
 * opcodes); false otherwise, including for unsupported QP types.
 */
bool hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
			       bool prescan)
{
	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
	struct ib_other_headers *ohdr = pkt->ohdr;
	struct ib_grh *grh = pkt->grh;
	u32 rqpn = 0;
	u16 pkey;
	u32 rlid, slid, dlid = 0;
	u8 hdr_type, sc, svc_type, opcode;
	bool is_mcast = false, ignore_fecn = false, do_cnp = false,
		fecn, becn;

	/* can be called from prescan */
	if (pkt->etype == RHF_RCV_TYPE_BYPASS) {
		pkey = hfi1_16B_get_pkey(pkt->hdr);
		sc = hfi1_16B_get_sc(pkt->hdr);
		dlid = hfi1_16B_get_dlid(pkt->hdr);
		slid = hfi1_16B_get_slid(pkt->hdr);
		is_mcast = hfi1_is_16B_mcast(dlid);
		opcode = ib_bth_get_opcode(ohdr);
		hdr_type = HFI1_PKT_TYPE_16B;
		fecn = hfi1_16B_get_fecn(pkt->hdr);
		becn = hfi1_16B_get_becn(pkt->hdr);
	} else {
		pkey = ib_bth_get_pkey(ohdr);
		sc = hfi1_9B_get_sc5(pkt->hdr, pkt->rhf);
		dlid = qp->ibqp.qp_type != IB_QPT_UD ? ib_get_dlid(pkt->hdr) :
			ppd->lid;
		slid = ib_get_slid(pkt->hdr);
		/*
		 * The multicast range starts AT the base LID, so the base
		 * itself must be included (rcv_hdrerr() likewise treats only
		 * dlid < base as unicast).  The permissive LID is excluded.
		 */
		is_mcast = (dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
			(dlid != be16_to_cpu(IB_LID_PERMISSIVE));
		opcode = ib_bth_get_opcode(ohdr);
		hdr_type = HFI1_PKT_TYPE_9B;
		fecn = ib_bth_get_fecn(ohdr);
		becn = ib_bth_get_becn(ohdr);
	}

	switch (qp->ibqp.qp_type) {
	case IB_QPT_UD:
	case IB_QPT_SMI:
	case IB_QPT_GSI:
		/* datagram QPs: the remote end is taken from the packet */
		rlid = slid;
		rqpn = ib_get_sqpn(pkt->ohdr);
		svc_type = IB_CC_SVCTYPE_UD;
		break;
	case IB_QPT_UC:
		rlid = rdma_ah_get_dlid(&qp->remote_ah_attr);
		rqpn = qp->remote_qpn;
		svc_type = IB_CC_SVCTYPE_UC;
		break;
	case IB_QPT_RC:
		rlid = rdma_ah_get_dlid(&qp->remote_ah_attr);
		rqpn = qp->remote_qpn;
		svc_type = IB_CC_SVCTYPE_RC;
		break;
	default:
		return false;
	}

	ignore_fecn = is_mcast || (opcode == IB_OPCODE_CNP) ||
		(opcode == IB_OPCODE_RC_ACKNOWLEDGE);
	/*
	 * ACKNOWLEDGE packets do not get a CNP but this will be
	 * guarded by ignore_fecn above.
	 */
	do_cnp = prescan ||
		(opcode >= IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST &&
		 opcode <= IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE) ||
		opcode == TID_OP(READ_RESP) ||
		opcode == TID_OP(ACK);

	/* Call appropriate CNP handler */
	if (!ignore_fecn && do_cnp && fecn)
		hfi1_handle_cnp_tbl[hdr_type](ibp, qp, rqpn, pkey,
					      dlid, rlid, sc, grh);

	if (becn) {
		u32 lqpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
		u8 sl = ibp->sc_to_sl[sc];

		process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
	}
	return !ignore_fecn && fecn;
}
/*
 * Cursor state for the prescan walk over a receive header queue.
 * Filled in by init_ps_mdata() and advanced by update_ps_mdata().
 */
struct ps_mdata {
struct hfi1_ctxtdata *rcd; /* context being scanned */
u32 rsize; /* copy of packet->rsize; step added to ps_head per entry */
u32 maxcnt; /* copy of packet->maxcnt; ps_head wraps to 0 on reaching it */
u32 ps_head; /* current scan position, starts at packet->rhqoff */
u32 ps_tail; /* tail from get_rcvhdrtail(); set only with DMA_RTAIL */
u32 ps_seq; /* expected RHF sequence number where seq counting is used */
};
/*
 * init_ps_mdata - seed the prescan cursor from the packet state
 *
 * Without DMA_RTAIL the scan is driven by the RHF sequence number, so the
 * tail is unused.  With DMA_RTAIL the tail comes from get_rcvhdrtail(),
 * and only the control context additionally tracks the sequence number.
 */
static inline void init_ps_mdata(struct ps_mdata *mdata,
				 struct hfi1_packet *packet)
{
	struct hfi1_ctxtdata *rcd = packet->rcd;

	mdata->rcd = rcd;
	mdata->rsize = packet->rsize;
	mdata->maxcnt = packet->maxcnt;
	mdata->ps_head = packet->rhqoff;

	if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
		mdata->ps_tail = 0; /* used only with DMA_RTAIL */
		mdata->ps_seq = rcd->seq_cnt;
		return;
	}

	mdata->ps_tail = get_rcvhdrtail(rcd);
	/* ps_seq is not used with DMA_RTAIL outside the control context */
	mdata->ps_seq = (rcd->ctxt == HFI1_CTRL_CTXT) ? rcd->seq_cnt : 0;
}
/*
 * ps_done - has the prescan reached the end of the valid entries?
 *
 * With DMA_RTAIL the scan ends when head meets tail; otherwise it ends at
 * the first entry whose RHF sequence number is not the expected one.
 */
static inline int ps_done(struct ps_mdata *mdata, u64 rhf,
			  struct hfi1_ctxtdata *rcd)
{
	int finished;

	if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL))
		finished = (mdata->ps_head == mdata->ps_tail);
	else
		finished = (mdata->ps_seq != rhf_rcv_seq(rhf));

	return finished;
}
/*
 * ps_skip - should the prescan skip the current entry?
 *
 * Control context can potentially receive an invalid rhf; such entries
 * are recognised by an unexpected sequence number and skipped.  Entries
 * on any other context, or with head equal to tail, are never skipped.
 */
static inline int ps_skip(struct ps_mdata *mdata, u64 rhf,
			  struct hfi1_ctxtdata *rcd)
{
	if ((rcd->ctxt != HFI1_CTRL_CTXT) || (mdata->ps_head == mdata->ps_tail))
		return 0;

	return mdata->ps_seq != rhf_rcv_seq(rhf);
}
static inline void update_ps_mdata(struct ps_mdata *mdata,
struct hfi1_ctxtdata *rcd)
{
mdata->ps_head += mdata->rsize;
if (mdata->ps_head >= mdata->maxcnt)
mdata->ps_head = 0;
/* Control context must do seq counting */
if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ||
(rcd->ctxt == HFI1_CTRL_CTXT)) {
if (++mdata->ps_seq > 13)
mdata->ps_seq = 1;
}
}
/*
 * prescan_rxq - search through the receive queue looking for packets
 * containing Explicit Congestion Notifications (FECNs, or BECNs).
 * When an ECN is found, process the Congestion Notification, and toggle
 * it off.
 * This is declared as a macro to allow quick checking of the port to avoid
 * the overhead of a function call if not enabled.
 *
 * The scan itself (__prescan_rxq()) runs only when rcd->ppd->cc_prescan
 * is set.
 */
#define prescan_rxq(rcd, packet) \
do { \
if (rcd->ppd->cc_prescan) \
__prescan_rxq(packet); \
} while (0)
/*
 * __prescan_rxq - walk the pending receive header queue entries ahead of
 * normal processing and handle congestion notifications early
 *
 * Only RHF_RCV_TYPE_IB entries with a BTH or GRH header, and for which
 * hfi1_may_ecn() reports true, are examined.  When the destination QP is
 * found, hfi1_process_ecn_slowpath() is called in prescan mode and the
 * FECN/BECN bits are then cleared in the header's bth[1].
 * packet->hdr, packet->ohdr and packet->grh are overwritten while scanning.
 */
static void __prescan_rxq(struct hfi1_packet *packet)
{
	struct hfi1_ctxtdata *rcd = packet->rcd;
	struct ps_mdata mdata;

	init_ps_mdata(&mdata, packet);

	/* every "continue" below still advances the cursor */
	for (;; update_ps_mdata(&mdata, rcd)) {
		struct hfi1_ibport *ibp = rcd_to_iport(rcd);
		__le32 *rhf_addr = (__le32 *)rcd->rcvhdrq + mdata.ps_head +
				   rcd->rhf_offset;
		struct rvt_dev_info *rdi = &rcd->dd->verbs_dev.rdi;
		u64 rhf = rhf_to_cpu(rhf_addr);
		u32 etype = rhf_rcv_type(rhf), qpn, bth1;
		struct ib_header *hdr;
		struct rvt_qp *qp;
		u8 lnh;

		if (ps_done(&mdata, rhf, rcd))
			break;

		if (ps_skip(&mdata, rhf, rcd) || etype != RHF_RCV_TYPE_IB)
			continue;

		hdr = hfi1_get_msgheader(rcd, rhf_addr);
		packet->hdr = hdr;

		lnh = ib_get_lnh(hdr);
		if (lnh == HFI1_LRH_BTH) {
			packet->ohdr = &hdr->u.oth;
			packet->grh = NULL;
		} else if (lnh == HFI1_LRH_GRH) {
			packet->ohdr = &hdr->u.l.oth;
			packet->grh = &hdr->u.l.grh;
		} else {
			continue; /* just in case */
		}

		if (!hfi1_may_ecn(packet))
			continue;

		bth1 = be32_to_cpu(packet->ohdr->bth[1]);
		qpn = bth1 & RVT_QPN_MASK;

		rcu_read_lock();
		qp = rvt_lookup_qpn(rdi, &ibp->rvp, qpn);
		if (!qp) {
			rcu_read_unlock();
			continue;
		}
		hfi1_process_ecn_slowpath(qp, packet, true);
		rcu_read_unlock();

		/* turn off BECN, FECN */
		bth1 &= ~(IB_FECN_SMASK | IB_BECN_SMASK);
		packet->ohdr->bth[1] = cpu_to_be32(bth1);
	}
}
/*
 * process_rcv_qp_work - run the deferred responses queued on the context
 *
 * Every QP on rcd->qp_wait_list is removed from the list, sent an RC ACK
 * if RVT_R_RSP_NAK is pending, has its send engine scheduled if
 * RVT_R_RSP_SEND is pending and its state allows sending or flushing, and
 * then has its reference dropped with rvt_put_qp().
 */
static void process_rcv_qp_work(struct hfi1_packet *packet)
{
	struct hfi1_ctxtdata *rcd = packet->rcd;
	struct rvt_qp *cur, *tmp;
	unsigned long irqflags;

	/*
	 * Iterate over all QPs waiting to respond.
	 * The list won't change since the IRQ is only run on one CPU.
	 */
	list_for_each_entry_safe(cur, tmp, &rcd->qp_wait_list, rspwait) {
		list_del_init(&cur->rspwait);

		if (cur->r_flags & RVT_R_RSP_NAK) {
			cur->r_flags &= ~RVT_R_RSP_NAK;
			packet->qp = cur;
			hfi1_send_rc_ack(packet, 0);
		}

		if (cur->r_flags & RVT_R_RSP_SEND) {
			cur->r_flags &= ~RVT_R_RSP_SEND;
			spin_lock_irqsave(&cur->s_lock, irqflags);
			if (ib_rvt_state_ops[cur->state] &
			    RVT_PROCESS_OR_FLUSH_SEND)
				hfi1_schedule_send(cur);
			spin_unlock_irqrestore(&cur->s_lock, irqflags);
		}

		rvt_put_qp(cur);
	}
}
/*
 * max_packet_exceeded - slow path taken when the per-pass packet budget
 * boundary is hit
 *
 * In interrupt mode (@thread == 0) the per-CPU rcv_limit counter is bumped
 * and RCV_PKT_LIMIT is returned.  In thread mode the deferred QP work is
 * run on every MAX_PKT_RECV_THREAD-th packet, the CPU is offered to the
 * scheduler, and RCV_PKT_OK is returned.
 */
static noinline int max_packet_exceeded(struct hfi1_packet *packet, int thread)
{
	if (!thread) {
		this_cpu_inc(*packet->rcd->dd->rcv_limit);
		return RCV_PKT_LIMIT;
	}

	/* allow deferred processing */
	if (!(packet->numpkt & (MAX_PKT_RECV_THREAD - 1)))
		process_rcv_qp_work(packet);

	cond_resched();
	return RCV_PKT_OK;
}
/*
 * check_max_packet - cheap per-packet budget check
 *
 * Returns RCV_PKT_OK unless packet->numpkt is a multiple of MAX_PKT_RECV,
 * in which case the decision is delegated to max_packet_exceeded().
 */
static inline int check_max_packet(struct hfi1_packet *packet, int thread)
{
	if (unlikely(!(packet->numpkt & (MAX_PKT_RECV - 1))))
		return max_packet_exceeded(packet, thread);

	return RCV_PKT_OK;
}
/*
 * skip_rcv_packet - step past the current queue entry without handing it
 * to a receive handler
 *
 * The entry is still counted in numpkt and still subject to the packet
 * budget check.  On return packet->rhf_addr/rhf describe the next entry.
 */
static noinline int skip_rcv_packet(struct hfi1_packet *packet, int thread)
{
	int status;

	/* Set up for the next packet */
	packet->rhqoff += packet->rsize;
	if (packet->rhqoff >= packet->maxcnt)
		packet->rhqoff = 0;

	packet->numpkt++;
	status = check_max_packet(packet, thread);

	packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff +
			   packet->rcd->rhf_offset;
	packet->rhf = rhf_to_cpu(packet->rhf_addr);

	return status;
}
static inline int process_rcv_packet(struct hfi1_packet *packet, int thread)
{
int ret;
packet->etype = rhf_rcv_type(packet->rhf);
/* total length */
packet->tlen = rhf_pkt_len(packet->rhf); /* in bytes */
/* retrieve eager buffer details */
packet->ebuf = NULL;
if (rhf_use_egr_bfr(packet->rhf)) {
packet->etail = rhf_egr_index(packet->rhf);
packet->ebuf = get_egrbuf(packet->rcd, packet->rhf,
&packet->updegr);
/*
* Prefetch the contents of the eager buffer. It is
* OK to send a negative length to prefetch_range().
* The +2 is the size of the RHF.
*/
prefetch_range(packet->ebuf,
packet->tlen - ((packet->rcd->rcvhdrqentsize -
(rhf_hdrq_offset(packet->rhf)
+ 2)) * 4));
}
/*
* Call a type specific handler for the packet. We
* should be able to trust that etype won't be beyond
* the range of valid indexes. If so something is really
* wrong and we can probably just let things come
* crashing down. There is no need to eat another
* comparison in this performance critical code.
*/
packet->rcd->rhf_rcv_function_map[packet->etype](packet);
packet->numpkt++;
/* Set up for the next packet */
packet->rhqoff += packet->rsize;
if (packet->rhqoff >= packet->maxcnt)
packet->rhqoff = 0;
ret = check_max_packet(packet, thread);
packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff +
packet->rcd->rhf_offset;
packet->rhf = rhf_to_cpu(packet->rhf_addr);
return ret;
}
/*
 * process_rcv_update - periodic head update between packets
 *
 * Every 16 packets, unless this was the last packet of the pass, the head
 * registers are updated to help prevent rcvhdrq overflows when many
 * packets are processed and the queue is nearly full.  These intermediate
 * updates do not request an interrupt.  packet->grh is always reset.
 */
static inline void process_rcv_update(int last, struct hfi1_packet *packet)
{
	if (!(last || (packet->numpkt & 0xf))) {
		update_usrhead(packet->rcd, packet->rhqoff, packet->updegr,
			       packet->etail, 0, 0);
		packet->updegr = 0;
	}

	packet->grh = NULL;
}
static inline void finish_packet(struct hfi1_packet *packet)
{
/*
* Nothing we need to free for the packet.
*
* The only thing we need to do is a final update and call for an
* interrupt
*/
update_usrhead(packet->rcd, packet->rcd->head, packet->updegr,
packet->etail, rcv_intr_dynamic, packet->numpkt);
}
/*
 * Handle receive interrupts when using the no dma rtail option.
 *
 * Valid entries are recognised by the RHF sequence number, which cycles
 * through 1..13 and is tracked in rcd->seq_cnt.  Returns RCV_PKT_DONE
 * when the queue was drained (or was empty), otherwise the status that
 * stopped the loop.
 */
int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread)
{
	struct hfi1_packet packet;
	int last = RCV_PKT_DONE;
	u32 seq;

	init_packet(rcd, &packet);
	seq = rhf_rcv_seq(packet.rhf);

	if (seq == rcd->seq_cnt) {
		prescan_rxq(rcd, &packet);

		do {
			last = process_rcv_packet(&packet, thread);
			seq = rhf_rcv_seq(packet.rhf);
			if (++rcd->seq_cnt > 13)
				rcd->seq_cnt = 1;
			/* next entry is not ours yet: queue drained */
			if (seq != rcd->seq_cnt)
				last = RCV_PKT_DONE;
			process_rcv_update(last, &packet);
		} while (last == RCV_PKT_OK);

		process_rcv_qp_work(&packet);
		rcd->head = packet.rhqoff;
	}

	finish_packet(&packet);
	return last;
}
/*
 * Handle receive interrupts when the header queue tail is DMA'ed.
 *
 * Packets are processed until the running header queue offset reaches
 * the tail value sampled at entry, or process_rcv_packet() reports a
 * status other than RCV_PKT_OK.
 *
 * Returns the final RCV_PKT_* status of the pass.
 */
int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread)
{
	struct hfi1_packet packet;
	u32 tail;
	int last;

	init_packet(rcd, &packet);

	tail = get_rcvhdrtail(rcd);
	if (packet.rhqoff == tail) {
		/* Head already equals tail: nothing to do */
		last = RCV_PKT_DONE;
		goto bail;
	}
	smp_rmb();  /* prevent speculative reads of dma'ed hdrq */

	prescan_rxq(rcd, &packet);

	/* At least one packet is known to be present at this point */
	do {
		last = process_rcv_packet(&packet, thread);
		if (packet.rhqoff == tail)
			last = RCV_PKT_DONE;
		process_rcv_update(last, &packet);
	} while (last == RCV_PKT_OK);

	process_rcv_qp_work(&packet);
	rcd->head = packet.rhqoff;
bail:
	finish_packet(&packet);
	return last;
}
/*
 * Install handle_receive_interrupt_nodma_rtail() as the interrupt handler.
 *
 * For dynamically allocated kernel contexts (like vnic) only the handler
 * of that one context is switched.  Otherwise the handler is switched for
 * all statically allocated kernel contexts above HFI1_CTRL_CTXT.
 */
static inline void set_nodma_rtail(struct hfi1_devdata *dd, u16 ctxt)
{
	struct hfi1_ctxtdata *rcd;
	u16 idx;

	if (ctxt < dd->first_dyn_alloc_ctxt) {
		for (idx = HFI1_CTRL_CTXT + 1;
		     idx < dd->first_dyn_alloc_ctxt; idx++) {
			rcd = hfi1_rcd_get_by_index(dd, idx);
			if (rcd)
				rcd->do_interrupt =
					&handle_receive_interrupt_nodma_rtail;
			/* put is issued even for a failed lookup, as before */
			hfi1_rcd_put(rcd);
		}
		return;
	}

	/* Dynamically allocated context: touch only this one */
	rcd = hfi1_rcd_get_by_index_safe(dd, ctxt);
	if (!rcd)
		return;

	rcd->do_interrupt = &handle_receive_interrupt_nodma_rtail;
	hfi1_rcd_put(rcd);
}
/*
 * Install handle_receive_interrupt_dma_rtail() as the interrupt handler.
 *
 * For dynamically allocated kernel contexts (like vnic) only the handler
 * of that one context is switched.  Otherwise the handler is switched for
 * all statically allocated kernel contexts above HFI1_CTRL_CTXT.
 */
static inline void set_dma_rtail(struct hfi1_devdata *dd, u16 ctxt)
{
	struct hfi1_ctxtdata *rcd;
	u16 idx;

	if (ctxt < dd->first_dyn_alloc_ctxt) {
		for (idx = HFI1_CTRL_CTXT + 1;
		     idx < dd->first_dyn_alloc_ctxt; idx++) {
			rcd = hfi1_rcd_get_by_index(dd, idx);
			if (rcd)
				rcd->do_interrupt =
					&handle_receive_interrupt_dma_rtail;
			/* put is issued even for a failed lookup, as before */
			hfi1_rcd_put(rcd);
		}
		return;
	}

	/* Dynamically allocated context: touch only this one */
	rcd = hfi1_rcd_get_by_index_safe(dd, ctxt);
	if (!rcd)
		return;

	rcd->do_interrupt = &handle_receive_interrupt_dma_rtail;
	hfi1_rcd_put(rcd);
}
/*
 * Point receive contexts back at the slow path handler,
 * handle_receive_interrupt().
 *
 * Only contexts above HFI1_CTRL_CTXT are visited (the control context
 * must always use the slow path handler), and of those only statically
 * allocated contexts and vnic contexts are switched.
 */
void set_all_slowpath(struct hfi1_devdata *dd)
{
	u16 idx;

	for (idx = HFI1_CTRL_CTXT + 1; idx < dd->num_rcv_contexts; idx++) {
		struct hfi1_ctxtdata *rcd = hfi1_rcd_get_by_index(dd, idx);

		if (rcd) {
			if (idx < dd->first_dyn_alloc_ctxt || rcd->is_vnic)
				rcd->do_interrupt = &handle_receive_interrupt;
			hfi1_rcd_put(rcd);
		}
	}
}
/*
 * Decide whether the packet at the head of the queue should move the link
 * to ACTIVE, and queue the link-state work if so.
 *
 * The SC is extracted for IB (9B) and bypass (16B) packets; any other
 * receive type is treated as SC15.  A non-SC15 packet queues
 * linkstate_active_work, but only when driver_lstate() already reports
 * IB_PORT_ACTIVE; otherwise the unexpected state is logged.
 *
 * Returns 1 when the work item was queued, 0 otherwise.
 */
static inline int set_armed_to_active(struct hfi1_ctxtdata *rcd,
				      struct hfi1_packet *packet,
				      struct hfi1_devdata *dd)
{
	struct work_struct *lsaw = &rcd->ppd->linkstate_active_work;
	u8 etype = rhf_rcv_type(packet->rhf);
	u8 sc = SC15_PACKET;
	int hwstate;

	if (etype == RHF_RCV_TYPE_IB) {
		struct ib_header *hdr9 = hfi1_get_msgheader(packet->rcd,
							    packet->rhf_addr);

		sc = hfi1_9B_get_sc5(hdr9, packet->rhf);
	} else if (etype == RHF_RCV_TYPE_BYPASS) {
		struct hfi1_16b_header *hdr16 = hfi1_get_16B_header(
							packet->rcd,
							packet->rhf_addr);

		sc = hfi1_16B_get_sc(hdr16);
	}

	/* SC15 traffic never activates the link */
	if (sc == SC15_PACKET)
		return 0;

	hwstate = driver_lstate(rcd->ppd);
	if (hwstate != IB_PORT_ACTIVE) {
		dd_dev_info(dd,
			    "Unexpected link state %s\n",
			    opa_lstate_name(hwstate));
		return 0;
	}

	queue_work(rcd->ppd->link_wq, lsaw);
	return 1;
}
/*
 * handle_receive_interrupt - receive a packet
 * @rcd: the context
 * @thread: passed through unchanged to process_rcv_packet() and
 *          skip_rcv_packet()
 *
 * Called from interrupt handler for errors or receive interrupt.
 * This is the slow path interrupt handler.
 *
 * Besides processing packets, this handler:
 *  - honours a pending one-shot packet drop request (dd->do_drop /
 *    dd->drop_packet),
 *  - skips entries on the control context whose RHF sequence number does
 *    not match the expected one,
 *  - hands off to the link-state work when a non-SC15 packet arrives
 *    while the link is ARMED,
 *  - after the first loop iteration, switches non-control contexts to the
 *    handler matching the DMA_RTAIL capability.
 *
 * Return: the final RCV_PKT_* status of the pass.
 */
int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread)
{
struct hfi1_devdata *dd = rcd->dd;
u32 hdrqtail;
int needset, last = RCV_PKT_OK;
struct hfi1_packet packet;
int skip_pkt = 0;
/* Control context will always use the slow path interrupt handler */
needset = (rcd->ctxt == HFI1_CTRL_CTXT) ? 0 : 1;
init_packet(rcd, &packet);
if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
/* No DMA'ed tail: the RHF sequence number tells us if data is present */
u32 seq = rhf_rcv_seq(packet.rhf);
if (seq != rcd->seq_cnt) {
last = RCV_PKT_DONE;
goto bail;
}
/* hdrqtail is only compared against in the DMA_RTAIL branches below */
hdrqtail = 0;
} else {
hdrqtail = get_rcvhdrtail(rcd);
if (packet.rhqoff == hdrqtail) {
last = RCV_PKT_DONE;
goto bail;
}
smp_rmb(); /* prevent speculative reads of dma'ed hdrq */
/*
 * Control context can potentially receive an invalid
 * rhf. Drop such packets.
 */
if (rcd->ctxt == HFI1_CTRL_CTXT) {
u32 seq = rhf_rcv_seq(packet.rhf);
if (seq != rcd->seq_cnt)
skip_pkt = 1;
}
}
prescan_rxq(rcd, &packet);
while (last == RCV_PKT_OK) {
/*
 * One-shot drop: taken only when do_drop is set and drop_packet was
 * DROP_PACKET_ON (it is atomically reset to DROP_PACKET_OFF here).
 * NOTE(review): unlike process_rcv_packet(), this path does not wrap
 * rhqoff at maxcnt and does not touch numpkt - confirm intended.
 */
if (unlikely(dd->do_drop &&
atomic_xchg(&dd->drop_packet, DROP_PACKET_OFF) ==
DROP_PACKET_ON)) {
dd->do_drop = 0;
/* On to the next packet */
packet.rhqoff += packet.rsize;
packet.rhf_addr = (__le32 *)rcd->rcvhdrq +
packet.rhqoff +
rcd->rhf_offset;
packet.rhf = rhf_to_cpu(packet.rhf_addr);
} else if (skip_pkt) {
/* Entry flagged as invalid on the previous check: skip it */
last = skip_rcv_packet(&packet, thread);
skip_pkt = 0;
} else {
/*
 * Auto activate link on non-SC15 packet receive.
 * When the work is queued we bail without processing the packet and
 * without updating rcd->head; finish_packet() still runs.
 */
if (unlikely(rcd->ppd->host_link_state ==
HLS_UP_ARMED) &&
set_armed_to_active(rcd, &packet, dd))
goto bail;
last = process_rcv_packet(&packet, thread);
}
if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
u32 seq = rhf_rcv_seq(packet.rhf);
/* Expected sequence count cycles 1..13 */
if (++rcd->seq_cnt > 13)
rcd->seq_cnt = 1;
if (seq != rcd->seq_cnt)
last = RCV_PKT_DONE;
/* First iteration on a non-control context: install the fast handler */
if (needset) {
dd_dev_info(dd, "Switching to NO_DMA_RTAIL\n");
set_nodma_rtail(dd, rcd->ctxt);
needset = 0;
}
} else {
if (packet.rhqoff == hdrqtail)
last = RCV_PKT_DONE;
/*
 * Control context can potentially receive an invalid
 * rhf. Drop such packets.
 */
if (rcd->ctxt == HFI1_CTRL_CTXT) {
u32 seq = rhf_rcv_seq(packet.rhf);
if (++rcd->seq_cnt > 13)
rcd->seq_cnt = 1;
if (!last && (seq != rcd->seq_cnt))
skip_pkt = 1;
}
/* First iteration on a non-control context: install the fast handler */
if (needset) {
dd_dev_info(dd,
"Switching to DMA_RTAIL\n");
set_dma_rtail(dd, rcd->ctxt);
needset = 0;
}
}
process_rcv_update(last, &packet);
}
process_rcv_qp_work(&packet);
rcd->head = packet.rhqoff;
bail:
/*
 * Always write head at end, and setup rcv interrupt, even
 * if no packets were processed.
 */
finish_packet(&packet);
return last;
}
/*
 * Work item queued when the interrupt path discovers that the hardware
 * link state has changed from ARMED to ACTIVE (due to the arrival of a
 * non-SC15 packet) and the driver's notion of the link state has to catch
 * up.  set_link_state cannot be run from interrupt context, hence the
 * workqueue.
 *
 * Regular interrupt processing is delayed until after the state change so
 * that the link is in the correct state by the time any application we
 * wake up attempts to reply to a message it received.  (Subsequent
 * receive interrupts may possibly force the wakeup before the link state
 * is updated.)
 *
 * The rcd is freed in hfi1_free_ctxtdata after hfi1_postinit_cleanup
 * invokes dd->f_cleanup(dd) to disable the interrupt handler and flush
 * workqueues, so there is no use-after-free of the rcd here.
 */
void receive_interrupt_work(struct work_struct *work)
{
	struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
						  linkstate_active_work);
	struct hfi1_devdata *dd = ppd->dd;
	u16 ctxt;

	/* Received non-SC15 packet implies neighbor_normal */
	ppd->neighbor_normal = 1;
	set_link_state(ppd, HLS_UP_ACTIVE);

	/*
	 * Interrupt all statically allocated kernel contexts that could
	 * have had an interrupt during auto activation.
	 */
	for (ctxt = HFI1_CTRL_CTXT; ctxt < dd->first_dyn_alloc_ctxt; ctxt++) {
		struct hfi1_ctxtdata *rcd = hfi1_rcd_get_by_index(dd, ctxt);

		if (rcd)
			force_recv_intr(rcd);
		/* put is issued even for a failed lookup, as before */
		hfi1_rcd_put(rcd);
	}
}
/*
 * Convert a given MTU size in bytes to the on-wire MAD packet enumeration.
 *
 * Returns the matching OPA_MTU_* value, or default_if_bad when the size
 * is not one of the supported values listed below.
 */
int mtu_to_enum(u32 mtu, int default_if_bad)
{
	int mtu_enum = default_if_bad;

	switch (mtu) {
	case 0:
		mtu_enum = OPA_MTU_0;
		break;
	case 256:
		mtu_enum = OPA_MTU_256;
		break;
	case 512:
		mtu_enum = OPA_MTU_512;
		break;
	case 1024:
		mtu_enum = OPA_MTU_1024;
		break;
	case 2048:
		mtu_enum = OPA_MTU_2048;
		break;
	case 4096:
		mtu_enum = OPA_MTU_4096;
		break;
	case 8192:
		mtu_enum = OPA_MTU_8192;
		break;
	case 10240:
		mtu_enum = OPA_MTU_10240;
		break;
	default:
		break;
	}

	return mtu_enum;
}
/*
 * Convert an OPA_MTU_* enumeration value to the MTU size in bytes.
 *
 * Returns 0xffff when the value is not one of the known enumerations.
 */
u16 enum_to_mtu(int mtu)
{
	u16 bytes = 0xffff;

	switch (mtu) {
	case OPA_MTU_0:
		bytes = 0;
		break;
	case OPA_MTU_256:
		bytes = 256;
		break;
	case OPA_MTU_512:
		bytes = 512;
		break;
	case OPA_MTU_1024:
		bytes = 1024;
		break;
	case OPA_MTU_2048:
		bytes = 2048;
		break;
	case OPA_MTU_4096:
		bytes = 4096;
		break;
	case OPA_MTU_8192:
		bytes = 8192;
		break;
	case OPA_MTU_10240:
		bytes = 10240;
		break;
	default:
		break;
	}

	return bytes;
}
/*
 * set_mtu - set the MTU
 * @ppd: the per port data
 *
 * We can handle "any" incoming size, the issue here is whether we
 * need to restrict our outgoing size.  We do not deal with what happens
 * to programs that are already running when the size changes.
 *
 * The port MTU becomes the largest per-VL MTU.  Returns 0 on success or
 * the error from stop_drain_data_vls(), in which case the MTU
 * configuration is not written to the hardware.
 */
int set_mtu(struct hfi1_pportdata *ppd)
{
	struct hfi1_devdata *dd = ppd->dd;
	int vl, drain, ret = 0, is_up = 0;

	ppd->ibmtu = 0;
	for (vl = 0; vl < ppd->vls_supported; vl++) {
		if (dd->vld[vl].mtu > ppd->ibmtu)
			ppd->ibmtu = dd->vld[vl].mtu;
	}
	ppd->ibmaxlen = ppd->ibmtu + lrh_max_header_bytes(ppd->dd);

	mutex_lock(&ppd->hls_lock);

	if (ppd->host_link_state == HLS_UP_INIT ||
	    ppd->host_link_state == HLS_UP_ARMED ||
	    ppd->host_link_state == HLS_UP_ACTIVE)
		is_up = 1;

	drain = !is_ax(dd) && is_up;

	if (drain) {
		/*
		 * MTU is specified per-VL. To ensure that no packet gets
		 * stuck (due, e.g., to the MTU for the packet's VL being
		 * reduced), empty the per-VL FIFOs before adjusting MTU.
		 */
		ret = stop_drain_data_vls(dd);
	}

	if (ret) {
		dd_dev_err(dd, "%s: cannot stop/drain VLs - refusing to change per-VL MTUs\n",
			   __func__);
	} else {
		hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_MTU, 0);

		if (drain)
			open_fill_data_vls(dd); /* reopen all VLs */
	}

	mutex_unlock(&ppd->hls_lock);

	return ret;
}
/*
 * Record a new LID/LMC pair for the port, push it to the hardware
 * configuration and log the assignment.  Always returns 0.
 */
int hfi1_set_lid(struct hfi1_pportdata *ppd, u32 lid, u8 lmc)
{
	struct hfi1_devdata *dd = ppd->dd;

	ppd->lmc = lmc;
	ppd->lid = lid;

	hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LIDLMC, 0);

	dd_dev_info(dd, "port %u: got a lid: 0x%x\n", ppd->port, lid);

	return 0;
}
/*
 * Stop custom LED blinking on a port.
 *
 * If the override timer is marked active it is deleted synchronously and
 * the active flag is cleared; in all cases DCC_CFG_LED_CNTRL is then
 * written with 0.
 */
void shutdown_led_override(struct hfi1_pportdata *ppd)
{
struct hfi1_devdata *dd = ppd->dd;
/*
 * This pairs with the memory barrier in hfi1_start_led_override to
 * ensure that we read the correct state of LED beaconing represented
 * by led_override_timer_active
 */
smp_rmb();
if (atomic_read(&ppd->led_override_timer_active)) {
/* Timer is stopped before the flag is cleared */
del_timer_sync(&ppd->led_override_timer);
atomic_set(&ppd->led_override_timer_active, 0);
/* Ensure the atomic_set is visible to all CPUs */
smp_wmb();
}
/* Hand control of the LED to the DC for normal operation */
write_csr(dd, DCC_CFG_LED_CNTRL, 0);
}
/*
 * Timer callback driving the LED override blink pattern.
 *
 * Applies the current phase to the LED, flips the phase for next time and
 * re-arms the timer with the duration stored for the phase just applied.
 * Does nothing (and does not re-arm) when the device is not HFI1_INITTED.
 */
static void run_led_override(struct timer_list *t)
{
	struct hfi1_pportdata *ppd = from_timer(ppd, t, led_override_timer);
	struct hfi1_devdata *dd = ppd->dd;
	unsigned long hold;
	int phase;

	if (!(dd->flags & HFI1_INITTED))
		return;

	phase = ppd->led_override_phase & 1;
	hold = ppd->led_override_vals[phase];

	setextled(dd, phase);

	/* Set up for next phase */
	ppd->led_override_phase = !ppd->led_override_phase;

	mod_timer(&ppd->led_override_timer, jiffies + hold);
}
/*
 * To have the LED blink in a particular pattern, provide timeon and timeoff
 * in milliseconds.
 * To turn off custom blinking and return to normal operation, use
 * shutdown_led_override()
 *
 * @ppd:     the per port data
 * @timeon:  duration of the "on" phase, in milliseconds
 * @timeoff: duration of the "off" phase, in milliseconds
 *
 * Does nothing when the device is not HFI1_INITTED.
 */
void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon,
unsigned int timeoff)
{
if (!(ppd->dd->flags & HFI1_INITTED))
return;
/*
 * Convert to jiffies for direct use in timer.
 * Slot 0 holds the off duration and slot 1 the on duration;
 * run_led_override() indexes this array with (phase & 1).
 */
ppd->led_override_vals[0] = msecs_to_jiffies(timeoff);
ppd->led_override_vals[1] = msecs_to_jiffies(timeon);
/* Arbitrarily start from LED on phase */
ppd->led_override_phase = 1;
/*
 * If the timer has not already been started, do so. Use a "quick"
 * timeout so the handler will be called soon to look at our request.
 */
if (!timer_pending(&ppd->led_override_timer)) {
timer_setup(&ppd->led_override_timer, run_led_override, 0);
ppd->led_override_timer.expires = jiffies + 1;
add_timer(&ppd->led_override_timer);
/* Mark beaconing active; shutdown_led_override() reads this flag */
atomic_set(&ppd->led_override_timer_active, 1);
/* Ensure the atomic_set is visible to all CPUs */
smp_wmb();
}
}
/**
* hfi1_reset_device - reset the chip if possible
* @unit: the device to reset
*
* Whether or not reset is successful, we attempt to re-initialize the chip
* (that is, much like a driver unload/reload). We clear the INITTED flag
* so that the various entry points will fail until we reinitialize. For
* now, we only allow this if no user contexts are open that use chip resources
*/
int hfi1_reset_device(int unit)
{
int ret;
struct hfi1_devdata *dd = hfi1_lookup(unit);
struct hfi1_pportdata *ppd;
int pidx;
if (!dd) {
ret = -ENODEV;
goto bail;
}
dd_dev_info(dd, "Reset on unit %u requested\n", unit);
IB/hfi1: Fix bar0 mapping to use write combining When the debugpat kernel boot flag is turned on the following traces are printed: [ 1884.793168] x86/PAT: Overlap at 0x90000000-0x92000000 [ 1884.803510] x86/PAT: reserve_memtype added [mem 0x91200000-0x9127ffff], track uncached-minus, req write-combining, ret uncached-minus [ 1884.818167] hfi1 0000:05:00.0: hfi1_0: WC Remapped RcvArray: ffffc9000a980000 The ioremap_wc() clearly is not returning a write combining mapping due to an overlap where the RcvArray is mapped in a uncached mapping prior to creating the proposed write combining mapping. The patch replaces the single base register for uncached CSRs that used to overlap the RcvArray with two mappings. One, kregbase1, from the bar0 up to the RcvArray and another, kregbase2, from the end of the RcvArray to the pio send buffer space. A new dd field, base2_start, is used to convert the zero-based offset in the CSR routines to the correct kregbase1/kregbase2 mapping. A single direct write of the RcvArray CSRs is replaced with hfi1_put_tid() to insure correct access using the new disjoint mapping. Additionally, the kregend field is deleted since it is only ever written. patdebug now shows the RcvArray as write combining: [ 35.688990] x86/PAT: reserve_memtype added [mem 0x91200000-0x9127ffff], track write-combining, req write-combining, ret write-combining To insulate from any potential issues with write combining, all writeq are now flushed in hfi1_put_tid() and rcv_array_wc_fill(). Reviewed-by: Mitko Haralanov <mitko.haralanov@intel.com> Reviewed-by: Ashutosh Dixit <ashutosh.dixit@intel.com> Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-07-24 22:45:31 +08:00
if (!dd->kregbase1 || !(dd->flags & HFI1_PRESENT)) {
dd_dev_info(dd,
"Invalid unit number %u or not initialized or not present\n",
unit);
ret = -ENXIO;
goto bail;
}
/* If there are any user/vnic contexts, we cannot reset */
mutex_lock(&hfi1_mutex);
if (dd->rcd)
if (hfi1_stats.sps_ctxts) {
mutex_unlock(&hfi1_mutex);
ret = -EBUSY;
goto bail;
}
mutex_unlock(&hfi1_mutex);
for (pidx = 0; pidx < dd->num_pports; ++pidx) {
ppd = dd->pport + pidx;
shutdown_led_override(ppd);
}
if (dd->flags & HFI1_HAS_SEND_DMA)
sdma_exit(dd);
hfi1_reset_cpu_counters(dd);
ret = hfi1_init(dd, 1);
if (ret)
dd_dev_err(dd,
"Reinitialize unit %u after reset failed with %d\n",
unit, ret);
else
dd_dev_info(dd, "Reinitialized unit %u after resetting\n",
unit);
bail:
return ret;
}
static inline void hfi1_setup_ib_header(struct hfi1_packet *packet)
{
packet->hdr = (struct hfi1_ib_message_header *)
hfi1_get_msgheader(packet->rcd,
packet->rhf_addr);
packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr;
}
/*
 * Sanity-check the addressing fields of a received 16B bypass packet.
 *
 * Returns 0 when the packet is acceptable, -EINVAL when:
 *  - the SLID or DLID is 0,
 *  - a unicast, non-permissive DLID (with the LMC bits masked off) does
 *    not match the port LID,
 *  - a multicast DLID is carried on SC15, or
 *  - a permissive DLID is carried on an SC other than SC15.
 */
static int hfi1_bypass_ingress_pkt_check(struct hfi1_packet *packet)
{
	struct hfi1_pportdata *ppd = packet->rcd->ppd;
	int is_mcast, is_permissive;

	/* slid and dlid cannot be 0 */
	if (!packet->slid || !packet->dlid)
		return -EINVAL;

	is_mcast = hfi1_is_16B_mcast(packet->dlid);
	is_permissive = (packet->dlid ==
			 opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B));

	/* Compare port lid with incoming packet dlid */
	if (!is_mcast && !is_permissive &&
	    (packet->dlid & ~((1 << ppd->lmc) - 1)) != ppd->lid)
		return -EINVAL;

	/* No multicast packets with SC15 */
	if (is_mcast && packet->sc == 0xF)
		return -EINVAL;

	/* Packets with permissive DLID always on SC15 */
	if (is_permissive && packet->sc != 0xF)
		return -EINVAL;

	return 0;
}
/*
 * Parse the headers of a 9B (IB) packet and cache the commonly used
 * fields in the packet structure.
 *
 * Returns 0 on success.  When the LNH is neither BTH nor GRH, or the GRH
 * next-header/version fields are not the expected values, the port's
 * n_pkt_drops counter is incremented and -EINVAL is returned.
 */
static int hfi1_setup_9B_packet(struct hfi1_packet *packet)
{
	struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd);
	struct ib_header *hdr;
	u8 lnh;

	hfi1_setup_ib_header(packet);
	hdr = packet->hdr;
	lnh = ib_get_lnh(hdr);

	if (lnh != HFI1_LRH_BTH && lnh != HFI1_LRH_GRH)
		goto drop;

	if (lnh == HFI1_LRH_BTH) {
		packet->ohdr = &hdr->u.oth;
		packet->grh = NULL;
	} else {
		u32 vtf;

		packet->ohdr = &hdr->u.l.oth;
		packet->grh = &hdr->u.l.grh;

		/* Validate the GRH before trusting the rest of the header */
		if (packet->grh->next_hdr != IB_GRH_NEXT_HDR)
			goto drop;

		vtf = be32_to_cpu(packet->grh->version_tclass_flow);
		if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
			goto drop;
	}

	/* Query commonly used fields from packet header */
	packet->payload = packet->ebuf;
	packet->opcode = ib_bth_get_opcode(packet->ohdr);
	packet->slid = ib_get_slid(hdr);
	packet->dlid = ib_get_dlid(hdr);

	/*
	 * DLIDs at or above the IB multicast base (other than the
	 * permissive LID) are rebased onto the OPA multicast base.
	 */
	if (unlikely((packet->dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
		     (packet->dlid != be16_to_cpu(IB_LID_PERMISSIVE))))
		packet->dlid += opa_get_mcast_base(OPA_MCAST_NR) -
				be16_to_cpu(IB_MULTICAST_LID_BASE);

	packet->sl = ib_get_sl(hdr);
	packet->sc = hfi1_9B_get_sc5(hdr, packet->rhf);
	packet->pad = ib_bth_get_pad(packet->ohdr);
	packet->extra_byte = 0;
	packet->pkey = ib_bth_get_pkey(packet->ohdr);
	packet->migrated = ib_bth_is_migration(packet->ohdr);

	return 0;

drop:
	ibp->rvp.n_pkt_drops++;
	return -EINVAL;
}
/*
 * Parse the headers of a 16B bypass packet and cache the commonly used
 * fields in the packet structure.
 *
 * Three L4 types are handled: IB local (BTH only), IB global (GRH + BTH)
 * and FM (management).  Any other L4 type, a GRH with unexpected
 * next-header/version fields, or a failed hfi1_bypass_ingress_pkt_check()
 * causes the packet to be dropped.
 *
 * Returns 0 on success; on drop, increments the port's n_pkt_drops
 * counter and returns -EINVAL.
 */
static int hfi1_setup_bypass_packet(struct hfi1_packet *packet)
{
/*
 * Bypass packets have a different header/payload split
 * compared to an IB packet.
 * Current split is set such that 16 bytes of the actual
 * header is in the header buffer and the remaining is in
 * the eager buffer. We chose 16 since hfi1 driver only
 * supports 16B bypass packets and we will be able to
 * receive the entire LRH with such a split.
 */
struct hfi1_ctxtdata *rcd = packet->rcd;
struct hfi1_pportdata *ppd = rcd->ppd;
struct hfi1_ibport *ibp = &ppd->ibport_data;
u8 l4;
packet->hdr = (struct hfi1_16b_header *)
hfi1_get_16B_header(packet->rcd,
packet->rhf_addr);
l4 = hfi1_16B_get_l4(packet->hdr);
if (l4 == OPA_16B_L4_IB_LOCAL) {
/* BTH starts at the beginning of the eager buffer */
packet->ohdr = packet->ebuf;
packet->grh = NULL;
packet->opcode = ib_bth_get_opcode(packet->ohdr);
packet->pad = hfi1_16B_bth_get_pad(packet->ohdr);
/* hdr_len_by_opcode already has an IB LRH factored in */
packet->hlen = hdr_len_by_opcode[packet->opcode] +
(LRH_16B_BYTES - LRH_9B_BYTES);
packet->migrated = opa_bth_is_migration(packet->ohdr);
} else if (l4 == OPA_16B_L4_IB_GLOBAL) {
u32 vtf;
u8 grh_len = sizeof(struct ib_grh);
/* GRH comes first in the eager buffer, followed by the BTH */
packet->ohdr = packet->ebuf + grh_len;
packet->grh = packet->ebuf;
packet->opcode = ib_bth_get_opcode(packet->ohdr);
packet->pad = hfi1_16B_bth_get_pad(packet->ohdr);
/* hdr_len_by_opcode already has an IB LRH factored in */
packet->hlen = hdr_len_by_opcode[packet->opcode] +
(LRH_16B_BYTES - LRH_9B_BYTES) + grh_len;
packet->migrated = opa_bth_is_migration(packet->ohdr);
if (packet->grh->next_hdr != IB_GRH_NEXT_HDR)
goto drop;
vtf = be32_to_cpu(packet->grh->version_tclass_flow);
if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
goto drop;
} else if (l4 == OPA_16B_L4_FM) {
/* Management packet: no BTH/GRH; opcode, pad and hlen are fixed values */
packet->mgmt = packet->ebuf;
packet->ohdr = NULL;
packet->grh = NULL;
packet->opcode = IB_OPCODE_UD_SEND_ONLY;
packet->pad = OPA_16B_L4_FM_PAD;
packet->hlen = OPA_16B_L4_FM_HLEN;
packet->migrated = false;
} else {
goto drop;
}
/* Query commonly used fields from packet header */
packet->payload = packet->ebuf + packet->hlen - LRH_16B_BYTES;
packet->slid = hfi1_16B_get_slid(packet->hdr);
packet->dlid = hfi1_16B_get_dlid(packet->hdr);
/* Rebase 16B multicast DLIDs onto the OPA multicast base */
if (unlikely(hfi1_is_16B_mcast(packet->dlid)))
packet->dlid += opa_get_mcast_base(OPA_MCAST_NR) -
opa_get_lid(opa_get_mcast_base(OPA_MCAST_NR),
16B);
packet->sc = hfi1_16B_get_sc(packet->hdr);
packet->sl = ibp->sc_to_sl[packet->sc];
packet->extra_byte = SIZE_OF_LT;
packet->pkey = hfi1_16B_get_pkey(packet->hdr);
if (hfi1_bypass_ingress_pkt_check(packet))
goto drop;
return 0;
drop:
hfi1_cdbg(PKT, "%s: packet dropped\n", __func__);
ibp->rvp.n_pkt_drops++;
return -EINVAL;
}
/*
 * Log the RHF of a packet together with a readable list of the error
 * flags set in it and the receive type error code.
 */
static void show_eflags_errs(struct hfi1_packet *packet)
{
	struct hfi1_ctxtdata *rcd = packet->rcd;
	u32 rte = rhf_rcv_type_err(packet->rhf);
	const char *k_hdr_len =
		packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "";
	const char *dc_unc = packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "";
	const char *dc = packet->rhf & RHF_DC_ERR ? "dc " : "";
	const char *tid = packet->rhf & RHF_TID_ERR ? "tid " : "";
	const char *len = packet->rhf & RHF_LEN_ERR ? "len " : "";
	const char *ecc = packet->rhf & RHF_ECC_ERR ? "ecc " : "";
	const char *icrc = packet->rhf & RHF_ICRC_ERR ? "icrc " : "";

	dd_dev_err(rcd->dd,
		   "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s] rte 0x%x\n",
		   rcd->ctxt, packet->rhf,
		   k_hdr_len, dc_unc, dc, tid, len, ecc, icrc,
		   rte);
}
/*
 * Handle a packet whose RHF carries error flags: run the header error
 * handler, then log the flags if any are set.
 */
void handle_eflags(struct hfi1_packet *packet)
{
	struct hfi1_ctxtdata *rcd = packet->rcd;

	rcv_hdrerr(rcd, rcd->ppd, packet);

	if (!rhf_err_flags(packet->rhf))
		return;

	show_eflags_errs(packet);
}
/*
* The following functions are called by the interrupt handler. They are type
* specific handlers for each packet type.
*/
static int process_receive_ib(struct hfi1_packet *packet)
{
IB/hfi1: Rework fault injection machinery The packet fault injection code present in the HFI1 driver had some issues which not only fragment the code but also created user confusion. Furthermore, it suffered from the following issues: 1. The fault_packet method only worked for received packets. This meant that the only fault injection mode available for sent packets is fault_opcode, which did not allow for random packet drops on all egressing packets. 2. The mask available for the fault_opcode mode did not really work due to the fact that the opcode values are not bits in a bitmask but rather sequential integer values. Creating a opcode/mask pair that would successfully capture a set of packets was nearly impossible. 3. The code was fragmented and used too many debugfs entries to operate and control. This was confusing to users. 4. It did not allow filtering fault injection on a per direction basis - egress vs. ingress. In order to improve or fix the above issues, the following changes have been made: 1. The fault injection methods have been combined into a single fault injection facility. As such, the fault injection has been plugged into both the send and receive code paths. Regardless of method used the fault injection will operate on both egress and ingress packets. 2. The type of fault injection - by packet or by opcode - is now controlled by changing the boolean value of the file "opcode_mode". When the value is set to True, fault injection is done by opcode. Otherwise, by packet. 2. The masking ability has been removed in favor of a bitmap that holds opcodes of interest (one bit per opcode, a total of 256 bits). This works in tandem with the "opcode_mode" value. When the value of "opcode_mode" is False, this bitmap is ignored. When the value is True, the bitmap lists all opcodes to be considered for fault injection. By default, the bitmap is empty. 
When the user wants to filter by opcode, the user sets the corresponding bit in the bitmap by echo'ing the bit position into the 'opcodes' file. This gets around the issue that the set of opcodes does not lend itself to effective masks and allow for extremely fine-grained filtering by opcode. 4. fault_packet and fault_opcode methods have been combined. Hence, there is only one debugfs directory controlling the entire operation of the fault injection machinery. This reduces the number of debugfs entries and provides a more unified user experience. 5. A new control files - "direction" - is provided to allow the user to control the direction of packets, which are subject to fault injection. 6. A new control file - "skip_usec" - is added that would allow the user to specify a "timeout" during which no fault injection will occur. In addition, the following bug fixes have been applied: 1. The fault injection code has been split into its own header and source files. This was done to better organize the code and support conditional compilation without littering the code with #ifdef's. 2. The method by which the TX PIO packets were being marked for drop conflicted with the way send contexts were being setup. As a result, the send context was repeatedly being reset. 3. The fault injection only makes sense when the user can control it through the debugfs entries. However, a kernel configuration can enable fault injection but keep fault injection debugfs entries disabled. Therefore, it makes sense that the HFI fault injection code depends on both. 4. Error suppression did not take into account the method by which PIO packets were being dropped. Therefore, even with error suppression turned on, errors would still be displayed to the screen. A larger enough packet drop percentage would case the kernel to crash because the driver would be stuck printing errors. 
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Reviewed-by: Don Hiatt <don.hiatt@intel.com> Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-02 21:43:24 +08:00
if (hfi1_setup_9B_packet(packet))
return RHF_RCV_CONTINUE;
IB/hfi1: Rework fault injection machinery The packet fault injection code present in the HFI1 driver had some issues which not only fragment the code but also created user confusion. Furthermore, it suffered from the following issues: 1. The fault_packet method only worked for received packets. This meant that the only fault injection mode available for sent packets is fault_opcode, which did not allow for random packet drops on all egressing packets. 2. The mask available for the fault_opcode mode did not really work due to the fact that the opcode values are not bits in a bitmask but rather sequential integer values. Creating a opcode/mask pair that would successfully capture a set of packets was nearly impossible. 3. The code was fragmented and used too many debugfs entries to operate and control. This was confusing to users. 4. It did not allow filtering fault injection on a per direction basis - egress vs. ingress. In order to improve or fix the above issues, the following changes have been made: 1. The fault injection methods have been combined into a single fault injection facility. As such, the fault injection has been plugged into both the send and receive code paths. Regardless of method used the fault injection will operate on both egress and ingress packets. 2. The type of fault injection - by packet or by opcode - is now controlled by changing the boolean value of the file "opcode_mode". When the value is set to True, fault injection is done by opcode. Otherwise, by packet. 2. The masking ability has been removed in favor of a bitmap that holds opcodes of interest (one bit per opcode, a total of 256 bits). This works in tandem with the "opcode_mode" value. When the value of "opcode_mode" is False, this bitmap is ignored. When the value is True, the bitmap lists all opcodes to be considered for fault injection. By default, the bitmap is empty. 
When the user wants to filter by opcode, the user sets the corresponding bit in the bitmap by echo'ing the bit position into the 'opcodes' file. This gets around the issue that the set of opcodes does not lend itself to effective masks and allow for extremely fine-grained filtering by opcode. 4. fault_packet and fault_opcode methods have been combined. Hence, there is only one debugfs directory controlling the entire operation of the fault injection machinery. This reduces the number of debugfs entries and provides a more unified user experience. 5. A new control files - "direction" - is provided to allow the user to control the direction of packets, which are subject to fault injection. 6. A new control file - "skip_usec" - is added that would allow the user to specify a "timeout" during which no fault injection will occur. In addition, the following bug fixes have been applied: 1. The fault injection code has been split into its own header and source files. This was done to better organize the code and support conditional compilation without littering the code with #ifdef's. 2. The method by which the TX PIO packets were being marked for drop conflicted with the way send contexts were being setup. As a result, the send context was repeatedly being reset. 3. The fault injection only makes sense when the user can control it through the debugfs entries. However, a kernel configuration can enable fault injection but keep fault injection debugfs entries disabled. Therefore, it makes sense that the HFI fault injection code depends on both. 4. Error suppression did not take into account the method by which PIO packets were being dropped. Therefore, even with error suppression turned on, errors would still be displayed to the screen. A larger enough packet drop percentage would case the kernel to crash because the driver would be stuck printing errors. 
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Reviewed-by: Don Hiatt <don.hiatt@intel.com> Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-02 21:43:24 +08:00
if (unlikely(hfi1_dbg_should_fault_rx(packet)))
return RHF_RCV_CONTINUE;
trace_hfi1_rcvhdr(packet);
if (unlikely(rhf_err_flags(packet->rhf))) {
handle_eflags(packet);
return RHF_RCV_CONTINUE;
}
hfi1_ib_rcv(packet);
return RHF_RCV_CONTINUE;
}
/*
 * Tell whether a bypass packet belongs to VNIC.
 *
 * True when the packet arrived on a VNIC context (via RSM), or when the
 * header at the start of the eager buffer has L2 type OPA_16B_L2_TYPE
 * and L4 type OPA_16B_L4_ETHR.
 */
static inline bool hfi1_is_vnic_packet(struct hfi1_packet *packet)
{
	return packet->rcd->is_vnic ||
	       (hfi1_16B_get_l2(packet->ebuf) == OPA_16B_L2_TYPE &&
		hfi1_16B_get_l4(packet->ebuf) == OPA_16B_L4_ETHR);
}
/*
 * Handler for bypass packets.
 *
 * VNIC packets are handed to hfi1_vnic_bypass_rcv().  Other packets are
 * parsed as 16B; those with RHF error flags go to handle_eflags() and
 * those with L2 type 0x2 go to hfi1_16B_rcv().  Anything else is logged
 * and counted as a bypass packet error, and, if no receive-port error
 * info is currently latched, the first flits of the packet are captured
 * together with the BAD_L2_ERR code.
 *
 * Always returns RHF_RCV_CONTINUE.
 */
static int process_receive_bypass(struct hfi1_packet *packet)
{
	struct hfi1_devdata *dd = packet->rcd->dd;
	u64 *flits;

	if (hfi1_is_vnic_packet(packet)) {
		hfi1_vnic_bypass_rcv(packet);
		return RHF_RCV_CONTINUE;
	}

	if (hfi1_setup_bypass_packet(packet))
		return RHF_RCV_CONTINUE;

	trace_hfi1_rcvhdr(packet);

	if (unlikely(rhf_err_flags(packet->rhf))) {
		handle_eflags(packet);
		return RHF_RCV_CONTINUE;
	}

	if (hfi1_16B_get_l2(packet->hdr) == 0x2) {
		hfi1_16B_rcv(packet);
		return RHF_RCV_CONTINUE;
	}

	dd_dev_err(dd,
		   "Bypass packets other than 16B are not supported in normal operation. Dropping\n");
	incr_cntr64(&dd->sw_rcv_bypass_packet_errors);

	/* Error info already latched: leave it untouched */
	if (dd->err_info_rcvport.status_and_code & OPA_EI_STATUS_SMASK)
		return RHF_RCV_CONTINUE;

	flits = packet->ebuf;
	if (flits && !(packet->rhf & RHF_LEN_ERR)) {
		dd->err_info_rcvport.packet_flit1 = flits[0];
		/* Second flit is only read when tlen exceeds one flit */
		if (packet->tlen > sizeof(flits[0]))
			dd->err_info_rcvport.packet_flit2 = flits[1];
		else
			dd->err_info_rcvport.packet_flit2 = 0;
	}
	dd->err_info_rcvport.status_and_code |=
		(OPA_EI_STATUS_SMASK | BAD_L2_ERR);

	return RHF_RCV_CONTINUE;
}
static int process_receive_error(struct hfi1_packet *packet)
{
/* KHdrHCRCErr -- KDETH packet with a bad HCRC */
if (unlikely(
hfi1_dbg_fault_suppress_err(&packet->rcd->dd->verbs_dev) &&
IB/hfi1: Rework fault injection machinery The packet fault injection code present in the HFI1 driver had some issues which not only fragment the code but also created user confusion. Furthermore, it suffered from the following issues: 1. The fault_packet method only worked for received packets. This meant that the only fault injection mode available for sent packets is fault_opcode, which did not allow for random packet drops on all egressing packets. 2. The mask available for the fault_opcode mode did not really work due to the fact that the opcode values are not bits in a bitmask but rather sequential integer values. Creating a opcode/mask pair that would successfully capture a set of packets was nearly impossible. 3. The code was fragmented and used too many debugfs entries to operate and control. This was confusing to users. 4. It did not allow filtering fault injection on a per direction basis - egress vs. ingress. In order to improve or fix the above issues, the following changes have been made: 1. The fault injection methods have been combined into a single fault injection facility. As such, the fault injection has been plugged into both the send and receive code paths. Regardless of method used the fault injection will operate on both egress and ingress packets. 2. The type of fault injection - by packet or by opcode - is now controlled by changing the boolean value of the file "opcode_mode". When the value is set to True, fault injection is done by opcode. Otherwise, by packet. 2. The masking ability has been removed in favor of a bitmap that holds opcodes of interest (one bit per opcode, a total of 256 bits). This works in tandem with the "opcode_mode" value. When the value of "opcode_mode" is False, this bitmap is ignored. When the value is True, the bitmap lists all opcodes to be considered for fault injection. By default, the bitmap is empty. 
When the user wants to filter by opcode, the user sets the corresponding bit in the bitmap by echo'ing the bit position into the 'opcodes' file. This gets around the issue that the set of opcodes does not lend itself to effective masks and allow for extremely fine-grained filtering by opcode. 4. fault_packet and fault_opcode methods have been combined. Hence, there is only one debugfs directory controlling the entire operation of the fault injection machinery. This reduces the number of debugfs entries and provides a more unified user experience. 5. A new control files - "direction" - is provided to allow the user to control the direction of packets, which are subject to fault injection. 6. A new control file - "skip_usec" - is added that would allow the user to specify a "timeout" during which no fault injection will occur. In addition, the following bug fixes have been applied: 1. The fault injection code has been split into its own header and source files. This was done to better organize the code and support conditional compilation without littering the code with #ifdef's. 2. The method by which the TX PIO packets were being marked for drop conflicted with the way send contexts were being setup. As a result, the send context was repeatedly being reset. 3. The fault injection only makes sense when the user can control it through the debugfs entries. However, a kernel configuration can enable fault injection but keep fault injection debugfs entries disabled. Therefore, it makes sense that the HFI fault injection code depends on both. 4. Error suppression did not take into account the method by which PIO packets were being dropped. Therefore, even with error suppression turned on, errors would still be displayed to the screen. A larger enough packet drop percentage would case the kernel to crash because the driver would be stuck printing errors. 
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Reviewed-by: Don Hiatt <don.hiatt@intel.com> Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-02 21:43:24 +08:00
(rhf_rcv_type_err(packet->rhf) == RHF_RCV_TYPE_ERROR ||
packet->rhf & RHF_DC_ERR)))
return RHF_RCV_CONTINUE;
hfi1_setup_ib_header(packet);
handle_eflags(packet);
if (unlikely(rhf_err_flags(packet->rhf)))
dd_dev_err(packet->rcd->dd,
"Unhandled error packet received. Dropping.\n");
return RHF_RCV_CONTINUE;
}
static int kdeth_process_expected(struct hfi1_packet *packet)
{
IB/hfi1: Rework fault injection machinery The packet fault injection code present in the HFI1 driver had some issues which not only fragment the code but also created user confusion. Furthermore, it suffered from the following issues: 1. The fault_packet method only worked for received packets. This meant that the only fault injection mode available for sent packets is fault_opcode, which did not allow for random packet drops on all egressing packets. 2. The mask available for the fault_opcode mode did not really work due to the fact that the opcode values are not bits in a bitmask but rather sequential integer values. Creating a opcode/mask pair that would successfully capture a set of packets was nearly impossible. 3. The code was fragmented and used too many debugfs entries to operate and control. This was confusing to users. 4. It did not allow filtering fault injection on a per direction basis - egress vs. ingress. In order to improve or fix the above issues, the following changes have been made: 1. The fault injection methods have been combined into a single fault injection facility. As such, the fault injection has been plugged into both the send and receive code paths. Regardless of method used the fault injection will operate on both egress and ingress packets. 2. The type of fault injection - by packet or by opcode - is now controlled by changing the boolean value of the file "opcode_mode". When the value is set to True, fault injection is done by opcode. Otherwise, by packet. 2. The masking ability has been removed in favor of a bitmap that holds opcodes of interest (one bit per opcode, a total of 256 bits). This works in tandem with the "opcode_mode" value. When the value of "opcode_mode" is False, this bitmap is ignored. When the value is True, the bitmap lists all opcodes to be considered for fault injection. By default, the bitmap is empty. 
When the user wants to filter by opcode, the user sets the corresponding bit in the bitmap by echo'ing the bit position into the 'opcodes' file. This gets around the issue that the set of opcodes does not lend itself to effective masks and allow for extremely fine-grained filtering by opcode. 4. fault_packet and fault_opcode methods have been combined. Hence, there is only one debugfs directory controlling the entire operation of the fault injection machinery. This reduces the number of debugfs entries and provides a more unified user experience. 5. A new control files - "direction" - is provided to allow the user to control the direction of packets, which are subject to fault injection. 6. A new control file - "skip_usec" - is added that would allow the user to specify a "timeout" during which no fault injection will occur. In addition, the following bug fixes have been applied: 1. The fault injection code has been split into its own header and source files. This was done to better organize the code and support conditional compilation without littering the code with #ifdef's. 2. The method by which the TX PIO packets were being marked for drop conflicted with the way send contexts were being setup. As a result, the send context was repeatedly being reset. 3. The fault injection only makes sense when the user can control it through the debugfs entries. However, a kernel configuration can enable fault injection but keep fault injection debugfs entries disabled. Therefore, it makes sense that the HFI fault injection code depends on both. 4. Error suppression did not take into account the method by which PIO packets were being dropped. Therefore, even with error suppression turned on, errors would still be displayed to the screen. A larger enough packet drop percentage would case the kernel to crash because the driver would be stuck printing errors. 
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Reviewed-by: Don Hiatt <don.hiatt@intel.com> Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-02 21:43:24 +08:00
hfi1_setup_9B_packet(packet);
if (unlikely(hfi1_dbg_should_fault_rx(packet)))
return RHF_RCV_CONTINUE;
if (unlikely(rhf_err_flags(packet->rhf))) {
struct hfi1_ctxtdata *rcd = packet->rcd;
if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet))
return RHF_RCV_CONTINUE;
}
hfi1_kdeth_expected_rcv(packet);
return RHF_RCV_CONTINUE;
}
static int kdeth_process_eager(struct hfi1_packet *packet)
{
IB/hfi1: Rework fault injection machinery The packet fault injection code present in the HFI1 driver had some issues which not only fragment the code but also created user confusion. Furthermore, it suffered from the following issues: 1. The fault_packet method only worked for received packets. This meant that the only fault injection mode available for sent packets is fault_opcode, which did not allow for random packet drops on all egressing packets. 2. The mask available for the fault_opcode mode did not really work due to the fact that the opcode values are not bits in a bitmask but rather sequential integer values. Creating a opcode/mask pair that would successfully capture a set of packets was nearly impossible. 3. The code was fragmented and used too many debugfs entries to operate and control. This was confusing to users. 4. It did not allow filtering fault injection on a per direction basis - egress vs. ingress. In order to improve or fix the above issues, the following changes have been made: 1. The fault injection methods have been combined into a single fault injection facility. As such, the fault injection has been plugged into both the send and receive code paths. Regardless of method used the fault injection will operate on both egress and ingress packets. 2. The type of fault injection - by packet or by opcode - is now controlled by changing the boolean value of the file "opcode_mode". When the value is set to True, fault injection is done by opcode. Otherwise, by packet. 2. The masking ability has been removed in favor of a bitmap that holds opcodes of interest (one bit per opcode, a total of 256 bits). This works in tandem with the "opcode_mode" value. When the value of "opcode_mode" is False, this bitmap is ignored. When the value is True, the bitmap lists all opcodes to be considered for fault injection. By default, the bitmap is empty. 
When the user wants to filter by opcode, the user sets the corresponding bit in the bitmap by echo'ing the bit position into the 'opcodes' file. This gets around the issue that the set of opcodes does not lend itself to effective masks and allow for extremely fine-grained filtering by opcode. 4. fault_packet and fault_opcode methods have been combined. Hence, there is only one debugfs directory controlling the entire operation of the fault injection machinery. This reduces the number of debugfs entries and provides a more unified user experience. 5. A new control files - "direction" - is provided to allow the user to control the direction of packets, which are subject to fault injection. 6. A new control file - "skip_usec" - is added that would allow the user to specify a "timeout" during which no fault injection will occur. In addition, the following bug fixes have been applied: 1. The fault injection code has been split into its own header and source files. This was done to better organize the code and support conditional compilation without littering the code with #ifdef's. 2. The method by which the TX PIO packets were being marked for drop conflicted with the way send contexts were being setup. As a result, the send context was repeatedly being reset. 3. The fault injection only makes sense when the user can control it through the debugfs entries. However, a kernel configuration can enable fault injection but keep fault injection debugfs entries disabled. Therefore, it makes sense that the HFI fault injection code depends on both. 4. Error suppression did not take into account the method by which PIO packets were being dropped. Therefore, even with error suppression turned on, errors would still be displayed to the screen. A larger enough packet drop percentage would case the kernel to crash because the driver would be stuck printing errors. 
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Reviewed-by: Don Hiatt <don.hiatt@intel.com> Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-02 21:43:24 +08:00
hfi1_setup_9B_packet(packet);
if (unlikely(hfi1_dbg_should_fault_rx(packet)))
return RHF_RCV_CONTINUE;
trace_hfi1_rcvhdr(packet);
if (unlikely(rhf_err_flags(packet->rhf))) {
struct hfi1_ctxtdata *rcd = packet->rcd;
show_eflags_errs(packet);
if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet))
return RHF_RCV_CONTINUE;
}
hfi1_kdeth_eager_rcv(packet);
return RHF_RCV_CONTINUE;
}
/*
 * Receive handler for the invalid receive types: log the type found in the
 * RHF and do nothing else with the packet.
 *
 * Always returns RHF_RCV_CONTINUE.
 */
static int process_receive_invalid(struct hfi1_packet *packet)
{
	struct hfi1_devdata *dd = packet->rcd->dd;

	dd_dev_err(dd, "Invalid packet type %d. Dropping\n",
		   rhf_rcv_type(packet->rhf));

	return RHF_RCV_CONTINUE;
}
/*
 * Dump a receive context's header queue into a seq_file.
 *
 * Prints a summary line (context number, header queue geometry, rtail mode
 * and the head/tail CSR values), then walks the header queue and prints
 * opcode, QPN and PSN for each entry that is not skipped, has a receive
 * type no greater than RHF_RCV_TYPE_IB and has a BTH or GRH link next
 * header.
 *
 * @s:   seq_file to print into
 * @rcd: receive context to dump
 */
void seqfile_dump_rcd(struct seq_file *s, struct hfi1_ctxtdata *rcd)
{
	struct hfi1_packet packet;
	struct ps_mdata mdata;
	const char *rtail_mode;

	rtail_mode = HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ?
		     "dma_rtail" : "nodma_rtail";

	seq_printf(s, "Rcd %u: RcvHdr cnt %u entsize %u %s head %llu tail %llu\n",
		   rcd->ctxt, rcd->rcvhdrq_cnt, rcd->rcvhdrqentsize,
		   rtail_mode,
		   read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_HEAD) &
		   RCV_HDR_HEAD_HEAD_MASK,
		   read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL));

	init_packet(rcd, &packet);
	init_ps_mdata(&mdata, &packet);

	/* every pass, including skipped entries, ends in update_ps_mdata() */
	for (;; update_ps_mdata(&mdata, rcd)) {
		__le32 *rhf_addr = (__le32 *)rcd->rcvhdrq + mdata.ps_head +
				   rcd->rhf_offset;
		u64 rhf = rhf_to_cpu(rhf_addr);
		u32 etype = rhf_rcv_type(rhf);
		struct ib_header *hdr;
		u32 qpn, psn;
		u8 opcode, lnh;

		if (ps_done(&mdata, rhf, rcd))
			break;

		if (ps_skip(&mdata, rhf, rcd) || etype > RHF_RCV_TYPE_IB)
			continue;

		packet.hdr = hfi1_get_msgheader(rcd, rhf_addr);
		hdr = packet.hdr;

		/* the low two bits of the first LRH word select the next header */
		lnh = be16_to_cpu(hdr->lrh[0]) & 3;
		if (lnh == HFI1_LRH_BTH)
			packet.ohdr = &hdr->u.oth;
		else if (lnh == HFI1_LRH_GRH)
			packet.ohdr = &hdr->u.l.oth;
		else
			continue; /* just in case */

		opcode = (be32_to_cpu(packet.ohdr->bth[0]) >> 24);
		qpn = be32_to_cpu(packet.ohdr->bth[1]) & RVT_QPN_MASK;
		psn = mask_psn(be32_to_cpu(packet.ohdr->bth[2]));

		seq_printf(s, "\tEnt %u: opcode 0x%x, qpn 0x%x, psn 0x%x\n",
			   mdata.ps_head, opcode, qpn, psn);
	}
}
/*
 * Receive handlers indexed by RHF_RCV_TYPE_* value. The three invalid
 * receive types share process_receive_invalid().
 */
const rhf_rcv_function_ptr normal_rhf_rcv_functions[] = {
	[RHF_RCV_TYPE_EXPECTED]	= kdeth_process_expected,
	[RHF_RCV_TYPE_EAGER]	= kdeth_process_eager,
	[RHF_RCV_TYPE_IB]	= process_receive_ib,
	[RHF_RCV_TYPE_ERROR]	= process_receive_error,
	[RHF_RCV_TYPE_BYPASS]	= process_receive_bypass,
	[RHF_RCV_TYPE_INVALID5]	= process_receive_invalid,
	[RHF_RCV_TYPE_INVALID6]	= process_receive_invalid,
	[RHF_RCV_TYPE_INVALID7]	= process_receive_invalid,
};