Merge branch 'hns3-next'

Salil Mehta says:

====================
Adds support of RAS Error Handling in HNS3 Driver

This patch-set adds support for RAS error handling to the HNS3
Ethernet PF driver. Errors that occur in the HNS3 hardware are
reported to the driver through the PCIe AER interface. The received
error information is then used to classify the errors and
decide the appropriate recovery action depending on the type
of error.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2018-10-22 19:31:14 -07:00
commit 8df591f35e
8 changed files with 1276 additions and 8 deletions

View File

@ -402,7 +402,7 @@ struct hnae3_ae_ops {
int (*set_vf_vlan_filter)(struct hnae3_handle *handle, int vfid,
u16 vlan, u8 qos, __be16 proto);
int (*enable_hw_strip_rxvtag)(struct hnae3_handle *handle, bool enable);
void (*reset_event)(struct hnae3_handle *handle);
void (*reset_event)(struct pci_dev *pdev, struct hnae3_handle *handle);
void (*get_channels)(struct hnae3_handle *handle,
struct ethtool_channels *ch);
void (*get_tqps_and_rss_info)(struct hnae3_handle *h,
@ -429,6 +429,7 @@ struct hnae3_ae_ops {
struct ethtool_rxnfc *cmd, u32 *rule_locs);
int (*restore_fd_rules)(struct hnae3_handle *handle);
void (*enable_fd)(struct hnae3_handle *handle, bool enable);
pci_ers_result_t (*process_hw_error)(struct hnae3_ae_dev *ae_dev);
};
struct hnae3_dcb_ops {

View File

@ -9,6 +9,7 @@
#include <linux/ipv6.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/aer.h>
#include <linux/skbuff.h>
#include <linux/sctp.h>
#include <linux/vermagic.h>
@ -1613,7 +1614,7 @@ static void hns3_nic_net_timeout(struct net_device *ndev)
/* request the reset */
if (h->ae_algo->ops->reset_event)
h->ae_algo->ops->reset_event(h);
h->ae_algo->ops->reset_event(h->pdev, h);
}
static const struct net_device_ops hns3_nic_netdev_ops = {
@ -1771,6 +1772,52 @@ static void hns3_shutdown(struct pci_dev *pdev)
pci_set_power_state(pdev, PCI_D3hot);
}
/* PCI error-recovery callback: called when an error is detected on @pdev.
 *
 * @pdev:  PCI device the error was reported for
 * @state: channel state reported together with the error
 *
 * Returns PCI_ERS_RESULT_DISCONNECT on a permanent channel failure,
 * PCI_ERS_RESULT_NONE when the ae device (or its ops table) is missing or
 * offers no process_hw_error hook, and otherwise the result of
 * ops->process_hw_error().
 */
static pci_ers_result_t hns3_error_detected(struct pci_dev *pdev,
					    pci_channel_state_t state)
{
	struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev);

	dev_info(&pdev->dev, "PCI error detected, state(=%d)!!\n", state);

	if (state == pci_channel_io_perm_failure)
		return PCI_ERS_RESULT_DISCONNECT;

	/* ae_dev->ops is dereferenced below, so it must be checked as well as
	 * ae_dev itself; presumably it can still be unset if the error arrives
	 * before the ae algo has been attached - TODO confirm.
	 */
	if (!ae_dev || !ae_dev->ops) {
		dev_err(&pdev->dev,
			"Can't recover - error happened during device init\n");
		return PCI_ERS_RESULT_NONE;
	}

	if (!ae_dev->ops->process_hw_error)
		return PCI_ERS_RESULT_NONE;

	return ae_dev->ops->process_hw_error(ae_dev);
}
static pci_ers_result_t hns3_slot_reset(struct pci_dev *pdev)
{
struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev);
struct device *dev = &pdev->dev;
dev_info(dev, "requesting reset due to PCI error\n");
/* request the reset */
if (ae_dev->ops->reset_event) {
ae_dev->ops->reset_event(pdev, NULL);
return PCI_ERS_RESULT_RECOVERED;
}
return PCI_ERS_RESULT_DISCONNECT;
}
/* PCI error-recovery callbacks of this driver; referenced from hns3_driver
 * through its .err_handler member.
 */
static const struct pci_error_handlers hns3_err_handler = {
.error_detected = hns3_error_detected,
.slot_reset = hns3_slot_reset,
};
static struct pci_driver hns3_driver = {
.name = hns3_driver_name,
.id_table = hns3_pci_tbl,
@ -1778,6 +1825,7 @@ static struct pci_driver hns3_driver = {
.remove = hns3_remove,
.shutdown = hns3_shutdown,
.sriov_configure = hns3_pci_sriov_configure,
.err_handler = &hns3_err_handler,
};
/* set default feature to hns3 */

View File

@ -6,6 +6,6 @@
ccflags-y := -Idrivers/net/ethernet/hisilicon/hns3
obj-$(CONFIG_HNS3_HCLGE) += hclge.o
hclge-objs = hclge_main.o hclge_cmd.o hclge_mdio.o hclge_tm.o hclge_mbx.o
hclge-objs = hclge_main.o hclge_cmd.o hclge_mdio.o hclge_tm.o hclge_mbx.o hclge_err.o
hclge-$(CONFIG_HNS3_DCB) += hclge_dcb.o

View File

@ -209,6 +209,28 @@ enum hclge_opcode_type {
/* Led command */
HCLGE_OPC_LED_STATUS_CFG = 0xB000,
/* Error INT commands */
HCLGE_TM_SCH_ECC_INT_EN = 0x0829,
HCLGE_TM_SCH_ECC_ERR_RINT_CMD = 0x082d,
HCLGE_TM_SCH_ECC_ERR_RINT_CE = 0x082f,
HCLGE_TM_SCH_ECC_ERR_RINT_NFE = 0x0830,
HCLGE_TM_SCH_ECC_ERR_RINT_FE = 0x0831,
HCLGE_TM_SCH_MBIT_ECC_INFO_CMD = 0x0833,
HCLGE_COMMON_ECC_INT_CFG = 0x1505,
HCLGE_IGU_EGU_TNL_INT_QUERY = 0x1802,
HCLGE_IGU_EGU_TNL_INT_EN = 0x1803,
HCLGE_IGU_EGU_TNL_INT_CLR = 0x1804,
HCLGE_IGU_COMMON_INT_QUERY = 0x1805,
HCLGE_IGU_COMMON_INT_EN = 0x1806,
HCLGE_IGU_COMMON_INT_CLR = 0x1807,
HCLGE_TM_QCN_MEM_INT_CFG = 0x1A14,
HCLGE_TM_QCN_MEM_INT_INFO_CMD = 0x1A17,
HCLGE_PPP_CMD0_INT_CMD = 0x2100,
HCLGE_PPP_CMD1_INT_CMD = 0x2101,
HCLGE_NCSI_INT_QUERY = 0x2400,
HCLGE_NCSI_INT_EN = 0x2401,
HCLGE_NCSI_INT_CLR = 0x2402,
};
#define HCLGE_TQP_REG_OFFSET 0x80000

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,83 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/* Copyright (c) 2016-2017 Hisilicon Limited. */
#ifndef __HCLGE_ERR_H
#define __HCLGE_ERR_H
#include "hclge_main.h"
#define HCLGE_RAS_PF_OTHER_INT_STS_REG 0x20B00
#define HCLGE_RAS_REG_FE_MASK 0xFF
#define HCLGE_RAS_REG_NFE_MASK 0xFF00
#define HCLGE_RAS_REG_NFE_SHIFT 8
#define HCLGE_IMP_TCM_ECC_ERR_INT_EN 0xFFFF0000
#define HCLGE_IMP_TCM_ECC_ERR_INT_EN_MASK 0xFFFF0000
#define HCLGE_IMP_ITCM4_ECC_ERR_INT_EN 0x300
#define HCLGE_IMP_ITCM4_ECC_ERR_INT_EN_MASK 0x300
#define HCLGE_CMDQ_NIC_ECC_ERR_INT_EN 0xFFFF
#define HCLGE_CMDQ_NIC_ECC_ERR_INT_EN_MASK 0xFFFF
#define HCLGE_CMDQ_ROCEE_ECC_ERR_INT_EN 0xFFFF0000
#define HCLGE_CMDQ_ROCEE_ECC_ERR_INT_EN_MASK 0xFFFF0000
#define HCLGE_IMP_RD_POISON_ERR_INT_EN 0x0100
#define HCLGE_IMP_RD_POISON_ERR_INT_EN_MASK 0x0100
#define HCLGE_TQP_ECC_ERR_INT_EN 0x0FFF
#define HCLGE_TQP_ECC_ERR_INT_EN_MASK 0x0FFF
#define HCLGE_IGU_ERR_INT_EN 0x0000066F
#define HCLGE_IGU_ERR_INT_EN_MASK 0x000F
#define HCLGE_IGU_TNL_ERR_INT_EN 0x0002AABF
#define HCLGE_IGU_TNL_ERR_INT_EN_MASK 0x003F
#define HCLGE_PPP_MPF_ECC_ERR_INT0_EN 0xFFFFFFFF
#define HCLGE_PPP_MPF_ECC_ERR_INT0_EN_MASK 0xFFFFFFFF
#define HCLGE_PPP_MPF_ECC_ERR_INT1_EN 0xFFFFFFFF
#define HCLGE_PPP_MPF_ECC_ERR_INT1_EN_MASK 0xFFFFFFFF
#define HCLGE_PPP_PF_ERR_INT_EN 0x0003
#define HCLGE_PPP_PF_ERR_INT_EN_MASK 0x0003
#define HCLGE_PPP_MPF_ECC_ERR_INT2_EN 0x003F
#define HCLGE_PPP_MPF_ECC_ERR_INT2_EN_MASK 0x003F
#define HCLGE_PPP_MPF_ECC_ERR_INT3_EN 0x003F
#define HCLGE_PPP_MPF_ECC_ERR_INT3_EN_MASK 0x003F
#define HCLGE_TM_SCH_ECC_ERR_INT_EN 0x3
#define HCLGE_TM_QCN_MEM_ERR_INT_EN 0xFFFFFF
#define HCLGE_NCSI_ERR_INT_EN 0x3
#define HCLGE_NCSI_ERR_INT_TYPE 0x9
#define HCLGE_IMP_TCM_ECC_INT_MASK 0xFFFF
#define HCLGE_IMP_ITCM4_ECC_INT_MASK 0x3
#define HCLGE_CMDQ_ECC_INT_MASK 0xFFFF
#define HCLGE_CMDQ_ROC_ECC_INT_SHIFT 16
#define HCLGE_TQP_ECC_INT_MASK 0xFFF
#define HCLGE_TQP_ECC_INT_SHIFT 16
#define HCLGE_IMP_TCM_ECC_CLR_MASK 0xFFFF
#define HCLGE_IMP_ITCM4_ECC_CLR_MASK 0x3
#define HCLGE_CMDQ_NIC_ECC_CLR_MASK 0xFFFF
#define HCLGE_CMDQ_ROCEE_ECC_CLR_MASK 0xFFFF0000
#define HCLGE_TQP_IMP_ERR_CLR_MASK 0x0FFF0001
#define HCLGE_IGU_COM_INT_MASK 0xF
#define HCLGE_IGU_EGU_TNL_INT_MASK 0x3F
#define HCLGE_PPP_PF_INT_MASK 0x100
/* Kind of hardware error interrupt; passed as the type argument of the
 * process_error callback in struct hclge_hw_blk.
 * NOTE(review): CE/NFE/FE presumably mean correctable, non-fatal and fatal
 * RAS errors - confirm against the hardware documentation.
 */
enum hclge_err_int_type {
HCLGE_ERR_INT_MSIX = 0,
HCLGE_ERR_INT_RAS_CE = 1,
HCLGE_ERR_INT_RAS_NFE = 2,
HCLGE_ERR_INT_RAS_FE = 3,
};
/* Descriptor of one hardware block that takes part in error handling.
 * NOTE(review): the code using this type is not visible here; the field
 * notes below are read off the names and signatures - confirm in hclge_err.c.
 */
struct hclge_hw_blk {
/* bit mask associated with the block - presumably selects its status bits */
u32 msk;
/* name of the block as a string */
const char *name;
/* callback taking the device and an enable flag; returns an int status */
int (*enable_error)(struct hclge_dev *hdev, bool en);
/* callback taking the device and the kind of error interrupt to process */
void (*process_error)(struct hclge_dev *hdev,
enum hclge_err_int_type type);
};
/* Pairs an interrupt bit mask with a message string.
 * NOTE(review): presumably used to describe a set error bit when it is
 * reported - confirm against the users in hclge_err.c.
 */
struct hclge_hw_error {
/* interrupt bit mask */
u32 int_msk;
/* message string that goes with int_msk */
const char *msg;
};
int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state);
int hclge_enable_tm_hw_error(struct hclge_dev *hdev, bool en);
pci_ers_result_t hclge_process_ras_hw_error(struct hnae3_ae_dev *ae_dev);
#endif

View File

@ -19,6 +19,7 @@
#include "hclge_mbx.h"
#include "hclge_mdio.h"
#include "hclge_tm.h"
#include "hclge_err.h"
#include "hnae3.h"
#define HCLGE_NAME "hclge"
@ -2488,12 +2489,18 @@ static void hclge_reset(struct hclge_dev *hdev)
ae_dev->reset_type = HNAE3_NONE_RESET;
}
static void hclge_reset_event(struct hnae3_handle *handle)
static void hclge_reset_event(struct pci_dev *pdev, struct hnae3_handle *handle)
{
struct hclge_vport *vport = hclge_get_vport(handle);
struct hclge_dev *hdev = vport->back;
struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev);
struct hclge_dev *hdev = ae_dev->priv;
/* check if this is a new reset request and we are not here just because
/* We might end up getting called broadly because of 2 below cases:
* 1. Recoverable error was conveyed through APEI and only way to bring
* normalcy is to reset.
* 2. A new reset request from the stack due to timeout
*
* For the first case, the error event might not have an ae handle available.
* check if this is a new reset request and we are not here just because
* last reset attempt did not succeed and watchdog hit us again. We will
* know this if last reset request did not occur very recently (watchdog
* timer = 5*HZ, let us check after sufficiently large time, say 4*5*Hz)
@ -2502,6 +2509,9 @@ static void hclge_reset_event(struct hnae3_handle *handle)
* want to make sure we throttle the reset request. Therefore, we will
* not allow it again before 3*HZ times.
*/
if (!handle)
handle = &hdev->vport[0].nic;
if (time_before(jiffies, (handle->last_reset_time + 3 * HZ)))
return;
else if (time_after(jiffies, (handle->last_reset_time + 4 * 5 * HZ)))
@ -6749,6 +6759,13 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
goto err_mdiobus_unreg;
}
ret = hclge_hw_error_set_state(hdev, true);
if (ret) {
dev_err(&pdev->dev,
"hw error interrupts enable failed, ret =%d\n", ret);
goto err_mdiobus_unreg;
}
hclge_dcb_ops_set(hdev);
timer_setup(&hdev->service_timer, hclge_service_timer, 0);
@ -6864,6 +6881,12 @@ static int hclge_reset_ae_dev(struct hnae3_ae_dev *ae_dev)
return ret;
}
/* Re-enable the TM hw error interrupts because
* they get disabled on core/global reset.
*/
if (hclge_enable_tm_hw_error(hdev, true))
dev_err(&pdev->dev, "failed to enable TM hw error interrupts\n");
dev_info(&pdev->dev, "Reset done, %s driver initialization finished.\n",
HCLGE_DRIVER_NAME);
@ -6886,6 +6909,7 @@ static void hclge_uninit_ae_dev(struct hnae3_ae_dev *ae_dev)
hclge_enable_vector(&hdev->misc_vector, false);
synchronize_irq(hdev->misc_vector.vector_irq);
hclge_hw_error_set_state(hdev, false);
hclge_destroy_cmd_queue(&hdev->hw);
hclge_misc_irq_uninit(hdev);
hclge_pci_uninit(hdev);
@ -7312,6 +7336,7 @@ static const struct hnae3_ae_ops hclge_ops = {
.get_fd_all_rules = hclge_get_all_rules,
.restore_fd_rules = hclge_restore_fd_entries,
.enable_fd = hclge_enable_fd,
.process_hw_error = hclge_process_ras_hw_error,
};
static struct hnae3_ae_algo ae_algo = {

View File

@ -1214,7 +1214,8 @@ static int hclgevf_do_reset(struct hclgevf_dev *hdev)
return status;
}
static void hclgevf_reset_event(struct hnae3_handle *handle)
static void hclgevf_reset_event(struct pci_dev *pdev,
struct hnae3_handle *handle)
{
struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle);