myri10ge: improve parity error detection and recovery

Improve myri10ge parity error detection and recovery:
1) Don't restore PCI config space to a rebooted NIC until AFTER the
   host is quiescent.
2) Let myri10ge_close() know the NIC is dead, so it won't waste time
   waiting for a dead nic to respond to MXGEFW_CMD_ETHERNET_DOWN
3) When the NIC is quiet (link down, or otherwise idle link) use
   a pci config space read to detect a rebooted NIC.  Otherwise
   we might never notice that a NIC rebooted

Signed-off-by: Andrew Gallatin <gallatin@myri.com>
Signed-off-by: Brice Goglin <brice@myri.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Brice Goglin 2009-08-07 10:44:22 +00:00 committed by David S. Miller
parent c9145a2df0
commit d02342151c
1 changed files with 46 additions and 17 deletions

View File

@ -75,7 +75,7 @@
#include "myri10ge_mcp.h"
#include "myri10ge_mcp_gen_header.h"
#define MYRI10GE_VERSION_STR "1.5.0-1.418"
#define MYRI10GE_VERSION_STR "1.5.0-1.432"
MODULE_DESCRIPTION("Myricom 10G driver (10GbE)");
MODULE_AUTHOR("Maintainer: help@myri.com");
@ -188,6 +188,7 @@ struct myri10ge_slice_state {
dma_addr_t fw_stats_bus;
int watchdog_tx_done;
int watchdog_tx_req;
int watchdog_rx_done;
#ifdef CONFIG_MYRI10GE_DCA
int cached_dca_tag;
int cpu;
@ -256,6 +257,7 @@ struct myri10ge_priv {
u32 link_changes;
u32 msg_enable;
unsigned int board_number;
int rebooted;
};
static char *myri10ge_fw_unaligned = "myri10ge_ethp_z8e.dat";
@ -2552,17 +2554,22 @@ static int myri10ge_close(struct net_device *dev)
netif_carrier_off(dev);
netif_tx_stop_all_queues(dev);
old_down_cnt = mgp->down_cnt;
mb();
status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd, 0);
if (status)
printk(KERN_ERR "myri10ge: %s: Couldn't bring down link\n",
dev->name);
wait_event_timeout(mgp->down_wq, old_down_cnt != mgp->down_cnt, HZ);
if (old_down_cnt == mgp->down_cnt)
printk(KERN_ERR "myri10ge: %s never got down irq\n", dev->name);
if (mgp->rebooted == 0) {
old_down_cnt = mgp->down_cnt;
mb();
status =
myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd, 0);
if (status)
printk(KERN_ERR
"myri10ge: %s: Couldn't bring down link\n",
dev->name);
wait_event_timeout(mgp->down_wq, old_down_cnt != mgp->down_cnt,
HZ);
if (old_down_cnt == mgp->down_cnt)
printk(KERN_ERR "myri10ge: %s never got down irq\n",
dev->name);
}
netif_tx_disable(dev);
myri10ge_free_irq(mgp);
for (i = 0; i < mgp->num_slices; i++)
@ -3427,12 +3434,13 @@ static void myri10ge_watchdog(struct work_struct *work)
container_of(work, struct myri10ge_priv, watchdog_work);
struct myri10ge_tx_buf *tx;
u32 reboot;
int status;
int status, rebooted;
int i;
u16 cmd, vendor;
mgp->watchdog_resets++;
pci_read_config_word(mgp->pdev, PCI_COMMAND, &cmd);
rebooted = 0;
if ((cmd & PCI_COMMAND_MASTER) == 0) {
/* Bus master DMA disabled? Check to see
* if the card rebooted due to a parity error
@ -3444,9 +3452,12 @@ static void myri10ge_watchdog(struct work_struct *work)
myri10ge_reset_recover ? " " : " not");
if (myri10ge_reset_recover == 0)
return;
rtnl_lock();
mgp->rebooted = 1;
rebooted = 1;
myri10ge_close(mgp->dev);
myri10ge_reset_recover--;
mgp->rebooted = 0;
/*
* A rebooted nic will come back with config space as
* it was after power was applied to PCIe bus.
@ -3494,8 +3505,10 @@ static void myri10ge_watchdog(struct work_struct *work)
}
}
rtnl_lock();
myri10ge_close(mgp->dev);
if (!rebooted) {
rtnl_lock();
myri10ge_close(mgp->dev);
}
status = myri10ge_load_firmware(mgp, 1);
if (status != 0)
printk(KERN_ERR "myri10ge: %s: failed to load firmware\n",
@ -3516,12 +3529,14 @@ static void myri10ge_watchdog_timer(unsigned long arg)
{
struct myri10ge_priv *mgp;
struct myri10ge_slice_state *ss;
int i, reset_needed;
int i, reset_needed, busy_slice_cnt;
u32 rx_pause_cnt;
u16 cmd;
mgp = (struct myri10ge_priv *)arg;
rx_pause_cnt = ntohl(mgp->ss[0].fw_stats->dropped_pause);
busy_slice_cnt = 0;
for (i = 0, reset_needed = 0;
i < mgp->num_slices && reset_needed == 0; ++i) {
@ -3559,8 +3574,22 @@ static void myri10ge_watchdog_timer(unsigned long arg)
reset_needed = 1;
}
}
if (ss->watchdog_tx_done != ss->tx.done ||
ss->watchdog_rx_done != ss->rx_done.cnt) {
busy_slice_cnt++;
}
ss->watchdog_tx_done = ss->tx.done;
ss->watchdog_tx_req = ss->tx.req;
ss->watchdog_rx_done = ss->rx_done.cnt;
}
/* if we've sent or received no traffic, poll the NIC to
* ensure it is still there. Otherwise, we risk not noticing
* an error in a timely fashion */
if (busy_slice_cnt == 0) {
pci_read_config_word(mgp->pdev, PCI_COMMAND, &cmd);
if ((cmd & PCI_COMMAND_MASTER) == 0) {
reset_needed = 1;
}
}
mgp->watchdog_pause = rx_pause_cnt;