From 43a440c4007b28c473afba966e8410459db4975f Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 22 Mar 2021 03:08:39 -0400 Subject: [PATCH 1/7] bnxt_en: Improve the status_reliable flag in bp->fw_health. In order to read the firmware health status, we first need to determine the register location and then the register may need to be mapped. There are 2 code paths to do this. The first one is done early as a best effort attempt by the function bnxt_try_map_fw_health_reg(). The second one is done later in the function bnxt_map_fw_health_regs() after establishing communications with the firmware. We currently only set fw_health->status_reliable if we can successfully set up the health register in the first code path. Improve the scheme by setting the fw_health->status_reliable flag if either (or both) code paths can successfully set up the health register. This flag is relied upon during run-time when we need to check the health status. So this will make it work better. During ifdown, if the health register is mapped, we need to invalidate the health register mapping because a potential fw reset will reset the mapping. Similarly, we need to do the same after firmware reset during recovery. We'll remap it during ifup. Reviewed-by: Edwin Peer Reviewed-by: Vasundhara Volam Signed-off-by: Michael Chan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 31 +++++++++++++++++++---- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index b53a0d87371a..16cf18eb7b3d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7540,6 +7540,19 @@ static void __bnxt_map_fw_health_reg(struct bnxt *bp, u32 reg) BNXT_FW_HEALTH_WIN_MAP_OFF); } +static void bnxt_inv_fw_health_reg(struct bnxt *bp) +{ + struct bnxt_fw_health *fw_health = bp->fw_health; + u32 reg_type; + + if (!fw_health || !fw_health->status_reliable) + return; + + reg_type = BNXT_FW_HEALTH_REG_TYPE(fw_health->regs[BNXT_FW_HEALTH_REG]); + if (reg_type == BNXT_FW_HEALTH_REG_TYPE_GRC) + fw_health->status_reliable = false; +} + static void bnxt_try_map_fw_health_reg(struct bnxt *bp) { void __iomem *hs; @@ -7547,6 +7560,9 @@ static void bnxt_try_map_fw_health_reg(struct bnxt *bp) u32 reg_type; u32 sig; + if (bp->fw_health) + bp->fw_health->status_reliable = false; + __bnxt_map_fw_health_reg(bp, HCOMM_STATUS_STRUCT_LOC); hs = bp->bar0 + BNXT_FW_HEALTH_WIN_OFF(HCOMM_STATUS_STRUCT_LOC); @@ -7558,11 +7574,9 @@ static void bnxt_try_map_fw_health_reg(struct bnxt *bp) BNXT_FW_HEALTH_WIN_BASE + BNXT_GRC_REG_CHIP_NUM); } - if (!BNXT_CHIP_P5(bp)) { - if (bp->fw_health) - bp->fw_health->status_reliable = false; + if (!BNXT_CHIP_P5(bp)) return; - } + status_loc = BNXT_GRC_REG_STATUS_P5 | BNXT_FW_HEALTH_REG_TYPE_BAR0; } else { @@ -7592,6 +7606,7 @@ static int bnxt_map_fw_health_regs(struct bnxt *bp) u32 reg_base = 0xffffffff; int i; + bp->fw_health->status_reliable = false; /* Only pre-map the monitoring GRC registers using window 3 */ for (i = 0; i < 4; i++) { u32 reg = fw_health->regs[i]; @@ -7604,6 +7619,7 @@ static int bnxt_map_fw_health_regs(struct bnxt *bp) return -ERANGE; fw_health->mapped_regs[i] = BNXT_FW_HEALTH_WIN_OFF(reg); } + bp->fw_health->status_reliable = true; if 
(reg_base == 0xffffffff) return 0; @@ -9556,13 +9572,17 @@ static int bnxt_hwrm_if_change(struct bnxt *bp, bool up) if (rc) return rc; - if (!up) + if (!up) { + bnxt_inv_fw_health_reg(bp); return 0; + } if (flags & FUNC_DRV_IF_CHANGE_RESP_FLAGS_RESC_CHANGE) resc_reinit = true; if (flags & FUNC_DRV_IF_CHANGE_RESP_FLAGS_HOT_FW_RESET_DONE) fw_reset = true; + else if (bp->fw_health && !bp->fw_health->status_reliable) + bnxt_try_map_fw_health_reg(bp); if (test_bit(BNXT_STATE_IN_FW_RESET, &bp->state) && !fw_reset) { netdev_err(bp->dev, "RESET_DONE not set during FW reset.\n"); @@ -11723,6 +11743,7 @@ static void bnxt_fw_reset_task(struct work_struct *work) bnxt_queue_fw_reset_work(bp, bp->fw_reset_min_dsecs * HZ / 10); return; case BNXT_FW_RESET_STATE_ENABLE_DEV: + bnxt_inv_fw_health_reg(bp); if (test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state)) { u32 val; From 80a9641f09f890a27a57e8ad30472553e0f769a6 Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Mon, 22 Mar 2021 03:08:40 -0400 Subject: [PATCH 2/7] bnxt_en: Improve wait for firmware commands completion In situations where FW has crashed, the bnxt_hwrm_do_send_msg() call will have to wait until timeout for each firmware message. This generally takes about half a second for each firmware message. If we try to unload the driver in this state, the unload sequence will take a long time to complete. Improve this by checking the health register if it is available and abort the wait for the firmware response if the register shows that firmware is not healthy. The very first message HWRM_VER_GET is excluded from this check because that message is used to poll for firmware to come out of reset during error recovery. Signed-off-by: Pavan Chebbi Signed-off-by: Michael Chan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 28 +++++++++++++++++++---- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 5 ++++ 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 16cf18eb7b3d..deba552465f6 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -4500,12 +4500,15 @@ static int bnxt_hwrm_do_send_msg(struct bnxt *bp, void *msg, u32 msg_len, if (test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state)) return -EBUSY; /* on first few passes, just barely sleep */ - if (i < HWRM_SHORT_TIMEOUT_COUNTER) + if (i < HWRM_SHORT_TIMEOUT_COUNTER) { usleep_range(HWRM_SHORT_MIN_TIMEOUT, HWRM_SHORT_MAX_TIMEOUT); - else + } else { + if (HWRM_WAIT_MUST_ABORT(bp, req)) + break; usleep_range(HWRM_MIN_TIMEOUT, HWRM_MAX_TIMEOUT); + } } if (bp->hwrm_intr_seq_id != (u16)~seq_id) { @@ -4530,15 +4533,19 @@ static int bnxt_hwrm_do_send_msg(struct bnxt *bp, void *msg, u32 msg_len, if (len) break; /* on first few passes, just barely sleep */ - if (i < HWRM_SHORT_TIMEOUT_COUNTER) + if (i < HWRM_SHORT_TIMEOUT_COUNTER) { usleep_range(HWRM_SHORT_MIN_TIMEOUT, HWRM_SHORT_MAX_TIMEOUT); - else + } else { + if (HWRM_WAIT_MUST_ABORT(bp, req)) + goto timeout_abort; usleep_range(HWRM_MIN_TIMEOUT, HWRM_MAX_TIMEOUT); + } } if (i >= tmo_count) { +timeout_abort: if (!silent) netdev_err(bp->dev, "Error (timeout: %d) msg {0x%x 0x%x} len:%d\n", HWRM_TOTAL_TIMEOUT(i), @@ -7540,6 +7547,19 @@ static void __bnxt_map_fw_health_reg(struct bnxt *bp, u32 reg) BNXT_FW_HEALTH_WIN_MAP_OFF); } +bool bnxt_is_fw_healthy(struct bnxt *bp) +{ + if (bp->fw_health && bp->fw_health->status_reliable) { + u32 fw_status; + + fw_status = bnxt_fw_health_readl(bp, BNXT_FW_HEALTH_REG); + if (fw_status && !BNXT_FW_IS_HEALTHY(fw_status)) + return false; + } + + return true; +} + static void bnxt_inv_fw_health_reg(struct bnxt *bp) { struct bnxt_fw_health *fw_health = 
bp->fw_health; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 1259e68cba2a..e77d60712954 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -671,6 +671,10 @@ struct nqe_cn { #define HWRM_MIN_TIMEOUT 25 #define HWRM_MAX_TIMEOUT 40 +#define HWRM_WAIT_MUST_ABORT(bp, req) \ + (le16_to_cpu((req)->req_type) != HWRM_VER_GET && \ + !bnxt_is_fw_healthy(bp)) + #define HWRM_TOTAL_TIMEOUT(n) (((n) <= HWRM_SHORT_TIMEOUT_COUNTER) ? \ ((n) * HWRM_SHORT_MIN_TIMEOUT) : \ (HWRM_SHORT_TIMEOUT_COUNTER * HWRM_SHORT_MIN_TIMEOUT + \ @@ -2228,6 +2232,7 @@ int bnxt_hwrm_set_link_setting(struct bnxt *, bool, bool); int bnxt_hwrm_alloc_wol_fltr(struct bnxt *bp); int bnxt_hwrm_free_wol_fltr(struct bnxt *bp); int bnxt_hwrm_func_resc_qcaps(struct bnxt *bp, bool all); +bool bnxt_is_fw_healthy(struct bnxt *bp); int bnxt_hwrm_fw_set_time(struct bnxt *); int bnxt_open_nic(struct bnxt *, bool, bool); int bnxt_half_open_nic(struct bnxt *bp); From a2f3835cc68a2222d0ab97862187ed98e65fe682 Mon Sep 17 00:00:00 2001 From: Edwin Peer Date: Mon, 22 Mar 2021 03:08:41 -0400 Subject: [PATCH 3/7] bnxt_en: don't fake firmware response success when PCI is disabled The original intent here is to allow commands during reset to succeed without error when the device is disabled, to ensure that cleanup completes normally during NIC close, where firmware is not necessarily expected to respond. The problem with faking success during reset's PCI disablement is that unrelated ULP commands will also see inadvertent success during reset when failure would otherwise be appropriate. It is better to return a different error result such that reset related code can detect this unique condition and ignore as appropriate. 
Note, the pci_disable_device() when firmware is fatally wounded in bnxt_fw_reset_close() does not need to be addressed, as subsequent commands are already expected to fail due to the BNXT_NO_FW_ACCESS() check in bnxt_hwrm_do_send_msg(). Reviewed-by: Scott Branden Signed-off-by: Edwin Peer Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index deba552465f6..3624e79667b6 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -4470,7 +4470,7 @@ static int bnxt_hwrm_do_send_msg(struct bnxt *bp, void *msg, u32 msg_len, writel(1, bp->bar0 + doorbell_offset); if (!pci_is_enabled(bp->pdev)) - return 0; + return -ENODEV; if (!timeout) timeout = DFLT_HWRM_CMD_TIMEOUT; @@ -11680,7 +11680,7 @@ static void bnxt_reset_all(struct bnxt *bp) req.selfrst_status = FW_RESET_REQ_SELFRST_STATUS_SELFRSTASAP; req.flags = FW_RESET_REQ_FLAGS_RESET_GRACEFUL; rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); - if (rc) + if (rc != -ENODEV) netdev_warn(bp->dev, "Unable to reset FW rc=%d\n", rc); } bp->fw_reset_timestamp = jiffies; From 15a7deb895497e4c9496b98367e4a0671add03f1 Mon Sep 17 00:00:00 2001 From: Scott Branden Date: Mon, 22 Mar 2021 03:08:42 -0400 Subject: [PATCH 4/7] bnxt_en: check return value of bnxt_hwrm_func_resc_qcaps Check return value of call to bnxt_hwrm_func_resc_qcaps in bnxt_hwrm_if_change and return failure on error. Reviewed-by: Edwin Peer Signed-off-by: Scott Branden Signed-off-by: Michael Chan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 3624e79667b6..7f40dd7d847d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -9634,6 +9634,9 @@ static int bnxt_hwrm_if_change(struct bnxt *bp, bool up) struct bnxt_hw_resc *hw_resc = &bp->hw_resc; rc = bnxt_hwrm_func_resc_qcaps(bp, true); + if (rc) + netdev_err(bp->dev, "resc_qcaps failed\n"); + hw_resc->resv_cp_rings = 0; hw_resc->resv_stat_ctxs = 0; hw_resc->resv_irqs = 0; @@ -9647,7 +9650,7 @@ static int bnxt_hwrm_if_change(struct bnxt *bp, bool up) } } } - return 0; + return rc; } static int bnxt_hwrm_port_led_qcaps(struct bnxt *bp) From 2924ad95cb51673ed3544cf371cafc66e2c76cc8 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 22 Mar 2021 03:08:43 -0400 Subject: [PATCH 5/7] bnxt_en: Set BNXT_STATE_FW_RESET_DET flag earlier for the RDMA driver. During ifup, if the driver detects that firmware has gone through a reset, it will go through a re-probe sequence. If the RDMA driver is loaded, the re-probe sequence includes calling the RDMA driver to stop. We need to set the BNXT_STATE_FW_RESET_DET flag earlier so that it is visible to the RDMA driver. The RDMA driver's stop sequence is different if firmware has gone through a reset. Reviewed-by: Pavan Chebbi Reviewed-by: P B S Naresh Kumar Signed-off-by: Michael Chan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 7f40dd7d847d..edbe5982cf41 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -9611,6 +9611,7 @@ static int bnxt_hwrm_if_change(struct bnxt *bp, bool up) } if (resc_reinit || fw_reset) { if (fw_reset) { + set_bit(BNXT_STATE_FW_RESET_DET, &bp->state); if (!test_bit(BNXT_STATE_IN_FW_RESET, &bp->state)) bnxt_ulp_stop(bp); bnxt_free_ctx_mem(bp); @@ -9619,16 +9620,17 @@ static int bnxt_hwrm_if_change(struct bnxt *bp, bool up) bnxt_dcb_free(bp); rc = bnxt_fw_init_one(bp); if (rc) { + clear_bit(BNXT_STATE_FW_RESET_DET, &bp->state); set_bit(BNXT_STATE_ABORT_ERR, &bp->state); return rc; } bnxt_clear_int_mode(bp); rc = bnxt_init_int_mode(bp); if (rc) { + clear_bit(BNXT_STATE_FW_RESET_DET, &bp->state); netdev_err(bp->dev, "init int mode failed\n"); return rc; } - set_bit(BNXT_STATE_FW_RESET_DET, &bp->state); } if (BNXT_NEW_RM(bp)) { struct bnxt_hw_resc *hw_resc = &bp->hw_resc; From bae8a00379f4c1327c8e38a768083460b5ad5b12 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Mon, 22 Mar 2021 03:08:44 -0400 Subject: [PATCH 6/7] bnxt_en: Remove the read of BNXT_FW_RESET_INPROG_REG after firmware reset. Once the chip goes through reset, the register mapping may be lost and any read of the mapped health registers may return garbage values until the registers are mapped again in the init path. Reading BNXT_FW_RESET_INPROG_REG after firmware reset will likely return a garbage value for the above reason. Reading this register is for informational purposes only, so remove it. Reviewed-by: Edwin Peer Signed-off-by: Vasundhara Volam Signed-off-by: Michael Chan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 29 ++++++++--------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index edbe5982cf41..6db5e927a473 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -11769,28 +11769,19 @@ static void bnxt_fw_reset_task(struct work_struct *work) return; case BNXT_FW_RESET_STATE_ENABLE_DEV: bnxt_inv_fw_health_reg(bp); - if (test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state)) { - u32 val; + if (test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state) && + !bp->fw_reset_min_dsecs) { + u16 val; - if (!bp->fw_reset_min_dsecs) { - u16 val; - - pci_read_config_word(bp->pdev, PCI_SUBSYSTEM_ID, - &val); - if (val == 0xffff) { - if (bnxt_fw_reset_timeout(bp)) { - netdev_err(bp->dev, "Firmware reset aborted, PCI config space invalid\n"); - goto fw_reset_abort; - } - bnxt_queue_fw_reset_work(bp, HZ / 1000); - return; + pci_read_config_word(bp->pdev, PCI_SUBSYSTEM_ID, &val); + if (val == 0xffff) { + if (bnxt_fw_reset_timeout(bp)) { + netdev_err(bp->dev, "Firmware reset aborted, PCI config space invalid\n"); + goto fw_reset_abort; } + bnxt_queue_fw_reset_work(bp, HZ / 1000); + return; } - val = bnxt_fw_health_readl(bp, - BNXT_FW_RESET_INPROG_REG); - if (val) - netdev_warn(bp->dev, "FW reset inprog %x after min wait time.\n", - val); } clear_bit(BNXT_STATE_FW_FATAL_COND, &bp->state); if (pci_enable_device(bp->pdev)) { From 861aae786f2f7e1cab7926f7bb7783cb893e7edb Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 22 Mar 2021 03:08:45 -0400 Subject: [PATCH 7/7] bnxt_en: Enhance retry of the first message to the firmware. Two enhancements: 1. Read the health status first before sending the first HWRM_VER_GET message to firmware instead of the other way around. This guarantees we got the accurate health status before we attempt to send the message. 2. 
We currently only retry sending the first HWRM_VER_GET message to the firmware if the firmware is in the process of booting. If the firmware is in error state and is doing core dump for example, the driver should also retry if the health register has the RECOVERING flag set. This flag indicates the firmware will undergo recovery soon. Modify the retry logic to retry for this case as well. Reviewed-by: Edwin Peer Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 +++-- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 4 ++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 6db5e927a473..6f13642121c4 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -9530,9 +9530,10 @@ static int bnxt_try_recover_fw(struct bnxt *bp) mutex_lock(&bp->hwrm_cmd_lock); do { - rc = __bnxt_hwrm_ver_get(bp, true); sts = bnxt_fw_health_readl(bp, BNXT_FW_HEALTH_REG); - if (!sts || !BNXT_FW_IS_BOOTING(sts)) + rc = __bnxt_hwrm_ver_get(bp, true); + if (!sts || (!BNXT_FW_IS_BOOTING(sts) && + !BNXT_FW_IS_RECOVERING(sts))) break; retry++; } while (rc == -EBUSY && retry < BNXT_FW_RETRY); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index e77d60712954..29061c577baa 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1564,6 +1564,7 @@ struct bnxt_fw_reporter_ctx { #define BNXT_FW_STATUS_HEALTH_MSK 0xffff #define BNXT_FW_STATUS_HEALTHY 0x8000 #define BNXT_FW_STATUS_SHUTDOWN 0x100000 +#define BNXT_FW_STATUS_RECOVERING 0x400000 #define BNXT_FW_IS_HEALTHY(sts) (((sts) & BNXT_FW_STATUS_HEALTH_MSK) ==\ BNXT_FW_STATUS_HEALTHY) @@ -1574,6 +1575,9 @@ struct bnxt_fw_reporter_ctx { #define BNXT_FW_IS_ERR(sts) (((sts) & BNXT_FW_STATUS_HEALTH_MSK) > \ BNXT_FW_STATUS_HEALTHY) +#define 
BNXT_FW_IS_RECOVERING(sts) (BNXT_FW_IS_ERR(sts) && \ + ((sts) & BNXT_FW_STATUS_RECOVERING)) + #define BNXT_FW_RETRY 5 #define BNXT_FW_IF_RETRY 10