linux/drivers/net/ethernet/intel/fm10k/fm10k_pci.c

2479 lines
68 KiB
C
Raw Normal View History

/* Intel(R) Ethernet Switch Host Interface Driver
fm10k: use a BITMAP for flags to avoid race conditions Replace bitwise operators and #defines with a BITMAP and enumeration values. This is similar to how we handle the "state" values as well. This has two distinct advantages over the old method. First, we ensure correctness of operations which are currently problematic due to race conditions. Suppose that two kernel threads are running, such as the watchdog and an ethtool ioctl, and both modify flags. We'll say that the watchdog is CPU A, and the ethtool ioctl is CPU B. CPU A sets FLAG_1, which can be seen as CPU A read FLAGS CPU A write FLAGS | FLAG_1 CPU B sets FLAG_2, which can be seen as CPU B read FLAGS CPU A write FLAGS | FLAG_2 However, "|=" and "&=" operators are not actually atomic. So this could be ordered like the following: CPU A read FLAGS -> variable CPU B read FLAGS -> variable CPU A write FLAGS (variable | FLAG_1) CPU B write FLAGS (variable | FLAG_2) Notice how the 2nd write from CPU B could actually undo the write from CPU A because it isn't guaranteed that the |= operation is atomic. In practice the race windows for most flag writes is incredibly narrow so it is not easy to isolate issues. However, the more flags we have, the more likely they will cause problems. Additionally, if such a problem were to arise, it would be incredibly difficult to track down. Second, there is an additional advantage beyond code correctness. We can now automatically size the BITMAP if more flags were added, so that we do not need to remember that flags is u32 and thus if we added too many flags we would over-run the variable. This is not a likely occurrence for fm10k driver, but this patch can serve as an example for other drivers which have many more flags. This particular change does have a bit of trouble converting some of the idioms previously used with the #defines for flags. Specifically, when converting FM10K_FLAG_RSS_FIELD_IPV[46]_UDP flags. This whole operation was actually quite problematic, because we actually stored flags separately. This could more easily show the problem of the above re-ordering issue. This is really difficult to test whether atomics make a difference in practical scenarios, but you can ensure that basic functionality remains the same. This patch has a lot of code coverage, but most of it is relatively simple. While we are modifying these files, update their copyright year. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-01-13 07:59:38 +08:00
* Copyright(c) 2013 - 2017 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* The full GNU General Public License is included in this distribution in
* the file called "COPYING".
*
* Contact Information:
* e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
* Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
*/
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/aer.h>
#include "fm10k.h"
static const struct fm10k_info *fm10k_info_tbl[] = {
[fm10k_device_pf] = &fm10k_pf_info,
[fm10k_device_vf] = &fm10k_vf_info,
};
/**
* fm10k_pci_tbl - PCI Device ID Table
*
* Wildcard entries (PCI_ANY_ID) should come last
* Last entry must be all 0s
*
* { Vendor ID, Device ID, SubVendor ID, SubDevice ID,
* Class, Class Mask, private data (not used) }
*/
static const struct pci_device_id fm10k_pci_tbl[] = {
{ PCI_VDEVICE(INTEL, FM10K_DEV_ID_PF), fm10k_device_pf },
{ PCI_VDEVICE(INTEL, FM10K_DEV_ID_VF), fm10k_device_vf },
/* required last entry */
{ 0, }
};
MODULE_DEVICE_TABLE(pci, fm10k_pci_tbl);
u16 fm10k_read_pci_cfg_word(struct fm10k_hw *hw, u32 reg)
{
struct fm10k_intfc *interface = hw->back;
u16 value = 0;
if (FM10K_REMOVED(hw->hw_addr))
return ~value;
pci_read_config_word(interface->pdev, reg, &value);
if (value == 0xFFFF)
fm10k_write_flush(hw);
return value;
}
u32 fm10k_read_reg(struct fm10k_hw *hw, int reg)
{
u32 __iomem *hw_addr = READ_ONCE(hw->hw_addr);
u32 value = 0;
if (FM10K_REMOVED(hw_addr))
return ~value;
value = readl(&hw_addr[reg]);
if (!(~value) && (!reg || !(~readl(hw_addr)))) {
struct fm10k_intfc *interface = hw->back;
struct net_device *netdev = interface->netdev;
hw->hw_addr = NULL;
netif_device_detach(netdev);
netdev_err(netdev, "PCIe link lost, device now detached\n");
}
return value;
}
static int fm10k_hw_ready(struct fm10k_intfc *interface)
{
struct fm10k_hw *hw = &interface->hw;
fm10k_write_flush(hw);
return FM10K_REMOVED(hw->hw_addr) ? -ENODEV : 0;
}
void fm10k_service_event_schedule(struct fm10k_intfc *interface)
{
if (!test_bit(__FM10K_SERVICE_DISABLE, interface->state) &&
fm10k: allow service task to reschedule itself If some code path executes fm10k_service_event_schedule(), it is guaranteed that we only queue the service task once, since we use __FM10K_SERVICE_SCHED flag. Unfortunately this has a side effect that if a service request occurs while we are currently running the watchdog, it is possible that we will fail to notice the request and ignore it until the next time the request occurs. This can cause problems with pf/vf mailbox communication and other service event tasks. To avoid this, introduce a FM10K_SERVICE_REQUEST bit. When we successfully schedule (and set the _SCHED bit) the service task, we will clear this bit. However, if we are unable to currently schedule the service event, we just set the new SERVICE_REQUEST bit. Finally, after the service event completes, we will re-schedule if the request bit has been set. This should ensure that we do not miss any service event schedules, since we will re-schedule it once the currently running task finishes. This means that for each request, we will always schedule the service task to run at least once in full after the request came in. This will avoid timing issues that can occur with the service event scheduling. We do pay a cost in re-running many tasks, but all the service event tasks use either flags to avoid duplicate work, or are tolerant of being run multiple times. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-01-13 07:59:40 +08:00
!test_and_set_bit(__FM10K_SERVICE_SCHED, interface->state)) {
clear_bit(__FM10K_SERVICE_REQUEST, interface->state);
queue_work(fm10k_workqueue, &interface->service_task);
fm10k: allow service task to reschedule itself If some code path executes fm10k_service_event_schedule(), it is guaranteed that we only queue the service task once, since we use __FM10K_SERVICE_SCHED flag. Unfortunately this has a side effect that if a service request occurs while we are currently running the watchdog, it is possible that we will fail to notice the request and ignore it until the next time the request occurs. This can cause problems with pf/vf mailbox communication and other service event tasks. To avoid this, introduce a FM10K_SERVICE_REQUEST bit. When we successfully schedule (and set the _SCHED bit) the service task, we will clear this bit. However, if we are unable to currently schedule the service event, we just set the new SERVICE_REQUEST bit. Finally, after the service event completes, we will re-schedule if the request bit has been set. This should ensure that we do not miss any service event schedules, since we will re-schedule it once the currently running task finishes. This means that for each request, we will always schedule the service task to run at least once in full after the request came in. This will avoid timing issues that can occur with the service event scheduling. We do pay a cost in re-running many tasks, but all the service event tasks use either flags to avoid duplicate work, or are tolerant of being run multiple times. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-01-13 07:59:40 +08:00
} else {
set_bit(__FM10K_SERVICE_REQUEST, interface->state);
}
}
static void fm10k_service_event_complete(struct fm10k_intfc *interface)
{
WARN_ON(!test_bit(__FM10K_SERVICE_SCHED, interface->state));
/* flush memory to make sure state is correct before next watchog */
smp_mb__before_atomic();
clear_bit(__FM10K_SERVICE_SCHED, interface->state);
fm10k: allow service task to reschedule itself If some code path executes fm10k_service_event_schedule(), it is guaranteed that we only queue the service task once, since we use __FM10K_SERVICE_SCHED flag. Unfortunately this has a side effect that if a service request occurs while we are currently running the watchdog, it is possible that we will fail to notice the request and ignore it until the next time the request occurs. This can cause problems with pf/vf mailbox communication and other service event tasks. To avoid this, introduce a FM10K_SERVICE_REQUEST bit. When we successfully schedule (and set the _SCHED bit) the service task, we will clear this bit. However, if we are unable to currently schedule the service event, we just set the new SERVICE_REQUEST bit. Finally, after the service event completes, we will re-schedule if the request bit has been set. This should ensure that we do not miss any service event schedules, since we will re-schedule it once the currently running task finishes. This means that for each request, we will always schedule the service task to run at least once in full after the request came in. This will avoid timing issues that can occur with the service event scheduling. We do pay a cost in re-running many tasks, but all the service event tasks use either flags to avoid duplicate work, or are tolerant of being run multiple times. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-01-13 07:59:40 +08:00
/* If a service event was requested since we started, immediately
* re-schedule now. This ensures we don't drop a request until the
* next timer event.
*/
if (test_bit(__FM10K_SERVICE_REQUEST, interface->state))
fm10k_service_event_schedule(interface);
}
static void fm10k_stop_service_event(struct fm10k_intfc *interface)
{
set_bit(__FM10K_SERVICE_DISABLE, interface->state);
cancel_work_sync(&interface->service_task);
/* It's possible that cancel_work_sync stopped the service task from
* running before it could actually start. In this case the
* __FM10K_SERVICE_SCHED bit will never be cleared. Since we know that
* the service task cannot be running at this point, we need to clear
* the scheduled bit, as otherwise the service task may never be
* restarted.
*/
clear_bit(__FM10K_SERVICE_SCHED, interface->state);
}
static void fm10k_start_service_event(struct fm10k_intfc *interface)
{
clear_bit(__FM10K_SERVICE_DISABLE, interface->state);
fm10k_service_event_schedule(interface);
}
/**
* fm10k_service_timer - Timer Call-back
* @data: pointer to interface cast into an unsigned long
**/
static void fm10k_service_timer(unsigned long data)
{
struct fm10k_intfc *interface = (struct fm10k_intfc *)data;
/* Reset the timer */
mod_timer(&interface->service_timer, (HZ * 2) + jiffies);
fm10k_service_event_schedule(interface);
}
fm10k: prepare_for_reset() when we lose PCIe Link If we lose PCIe link, such as when an unannounced PFLR event occurs, or when a device is surprise removed, we currently detach the device and close the netdev. This unfortunately leaves a lot of things still active, such as the msix_mbx_pf IRQ, and Tx/Rx resources. This can cause problems because the register reads will return potentially invalid values which may result in unknown driver behavior. Begin the process of resetting using fm10k_prepare_for_reset(), much in the same way as the suspend and resume cycle does. This will attempt to shutdown as much as possible, in order to prevent possible issues. A naive implementation for this has issues, because there are now multiple flows calling the reset logic and setting a reset bit. This would cause problems, because the "re-attach" routine might call fm10k_handle_reset() prior to the reset actually finishing. Instead, we'll add state bits to indicate which flow actually initiated the reset. For the general reset flow, we'll assume that if someone else is resetting that we do not need to handle it at all, so it does not need its own state bit. For the suspend case, we will simply issue a warning indicating that we are attempting to recover from this case when resuming. For the detached subtask, we'll simply refuse to re-attach until we've actually initiated a reset as part of that flow. Finally, we'll stop attempting to manage the mailbox subtask when we're detached, since there's nothing we can do if we don't have a PCIe address. Overall this produces a much cleaner shutdown and recovery cycle for a PCIe surprise remove event. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-07-11 04:23:14 +08:00
/**
* fm10k_prepare_for_reset - Prepare the driver and device for a pending reset
* @interface: fm10k private data structure
*
* This function prepares for a device reset by shutting as much down as we
* can. It does nothing and returns false if __FM10K_RESETTING was already set
* prior to calling this function. It returns true if it actually did work.
*/
static bool fm10k_prepare_for_reset(struct fm10k_intfc *interface)
{
struct net_device *netdev = interface->netdev;
WARN_ON(in_interrupt());
/* put off any impending NetWatchDogTimeout */
netif_trans_update(netdev);
fm10k: prepare_for_reset() when we lose PCIe Link If we lose PCIe link, such as when an unannounced PFLR event occurs, or when a device is surprise removed, we currently detach the device and close the netdev. This unfortunately leaves a lot of things still active, such as the msix_mbx_pf IRQ, and Tx/Rx resources. This can cause problems because the register reads will return potentially invalid values which may result in unknown driver behavior. Begin the process of resetting using fm10k_prepare_for_reset(), much in the same way as the suspend and resume cycle does. This will attempt to shutdown as much as possible, in order to prevent possible issues. A naive implementation for this has issues, because there are now multiple flows calling the reset logic and setting a reset bit. This would cause problems, because the "re-attach" routine might call fm10k_handle_reset() prior to the reset actually finishing. Instead, we'll add state bits to indicate which flow actually initiated the reset. For the general reset flow, we'll assume that if someone else is resetting that we do not need to handle it at all, so it does not need its own state bit. For the suspend case, we will simply issue a warning indicating that we are attempting to recover from this case when resuming. For the detached subtask, we'll simply refuse to re-attach until we've actually initiated a reset as part of that flow. Finally, we'll stop attempting to manage the mailbox subtask when we're detached, since there's nothing we can do if we don't have a PCIe address. Overall this produces a much cleaner shutdown and recovery cycle for a PCIe surprise remove event. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-07-11 04:23:14 +08:00
/* Nothing to do if a reset is already in progress */
if (test_and_set_bit(__FM10K_RESETTING, interface->state))
return false;
rtnl_lock();
fm10k_iov_suspend(interface->pdev);
if (netif_running(netdev))
fm10k_close(netdev);
fm10k_mbx_free_irq(interface);
/* free interrupts */
fm10k_clear_queueing_scheme(interface);
/* delay any future reset requests */
interface->last_reset = jiffies + (10 * HZ);
rtnl_unlock();
fm10k: prepare_for_reset() when we lose PCIe Link If we lose PCIe link, such as when an unannounced PFLR event occurs, or when a device is surprise removed, we currently detach the device and close the netdev. This unfortunately leaves a lot of things still active, such as the msix_mbx_pf IRQ, and Tx/Rx resources. This can cause problems because the register reads will return potentially invalid values which may result in unknown driver behavior. Begin the process of resetting using fm10k_prepare_for_reset(), much in the same way as the suspend and resume cycle does. This will attempt to shutdown as much as possible, in order to prevent possible issues. A naive implementation for this has issues, because there are now multiple flows calling the reset logic and setting a reset bit. This would cause problems, because the "re-attach" routine might call fm10k_handle_reset() prior to the reset actually finishing. Instead, we'll add state bits to indicate which flow actually initiated the reset. For the general reset flow, we'll assume that if someone else is resetting that we do not need to handle it at all, so it does not need its own state bit. For the suspend case, we will simply issue a warning indicating that we are attempting to recover from this case when resuming. For the detached subtask, we'll simply refuse to re-attach until we've actually initiated a reset as part of that flow. Finally, we'll stop attempting to manage the mailbox subtask when we're detached, since there's nothing we can do if we don't have a PCIe address. Overall this produces a much cleaner shutdown and recovery cycle for a PCIe surprise remove event. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-07-11 04:23:14 +08:00
return true;
}
static int fm10k_handle_reset(struct fm10k_intfc *interface)
{
struct net_device *netdev = interface->netdev;
struct fm10k_hw *hw = &interface->hw;
int err;
fm10k: prepare_for_reset() when we lose PCIe Link If we lose PCIe link, such as when an unannounced PFLR event occurs, or when a device is surprise removed, we currently detach the device and close the netdev. This unfortunately leaves a lot of things still active, such as the msix_mbx_pf IRQ, and Tx/Rx resources. This can cause problems because the register reads will return potentially invalid values which may result in unknown driver behavior. Begin the process of resetting using fm10k_prepare_for_reset(), much in the same way as the suspend and resume cycle does. This will attempt to shutdown as much as possible, in order to prevent possible issues. A naive implementation for this has issues, because there are now multiple flows calling the reset logic and setting a reset bit. This would cause problems, because the "re-attach" routine might call fm10k_handle_reset() prior to the reset actually finishing. Instead, we'll add state bits to indicate which flow actually initiated the reset. For the general reset flow, we'll assume that if someone else is resetting that we do not need to handle it at all, so it does not need its own state bit. For the suspend case, we will simply issue a warning indicating that we are attempting to recover from this case when resuming. For the detached subtask, we'll simply refuse to re-attach until we've actually initiated a reset as part of that flow. Finally, we'll stop attempting to manage the mailbox subtask when we're detached, since there's nothing we can do if we don't have a PCIe address. Overall this produces a much cleaner shutdown and recovery cycle for a PCIe surprise remove event. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-07-11 04:23:14 +08:00
WARN_ON(!test_bit(__FM10K_RESETTING, interface->state));
rtnl_lock();
pci_set_master(interface->pdev);
/* reset and initialize the hardware so it is in a known state */
err = hw->mac.ops.reset_hw(hw);
if (err) {
dev_err(&interface->pdev->dev, "reset_hw failed: %d\n", err);
goto reinit_err;
}
err = hw->mac.ops.init_hw(hw);
if (err) {
dev_err(&interface->pdev->dev, "init_hw failed: %d\n", err);
goto reinit_err;
}
err = fm10k_init_queueing_scheme(interface);
if (err) {
dev_err(&interface->pdev->dev,
"init_queueing_scheme failed: %d\n", err);
goto reinit_err;
}
/* re-associate interrupts */
err = fm10k_mbx_request_irq(interface);
if (err)
goto err_mbx_irq;
err = fm10k_hw_ready(interface);
if (err)
goto err_open;
/* update hardware address for VFs if perm_addr has changed */
if (hw->mac.type == fm10k_mac_vf) {
if (is_valid_ether_addr(hw->mac.perm_addr)) {
ether_addr_copy(hw->mac.addr, hw->mac.perm_addr);
ether_addr_copy(netdev->perm_addr, hw->mac.perm_addr);
ether_addr_copy(netdev->dev_addr, hw->mac.perm_addr);
netdev->addr_assign_type &= ~NET_ADDR_RANDOM;
}
if (hw->mac.vlan_override)
netdev->features &= ~NETIF_F_HW_VLAN_CTAG_RX;
else
netdev->features |= NETIF_F_HW_VLAN_CTAG_RX;
}
err = netif_running(netdev) ? fm10k_open(netdev) : 0;
if (err)
goto err_open;
fm10k_iov_resume(interface->pdev);
rtnl_unlock();
clear_bit(__FM10K_RESETTING, interface->state);
return err;
err_open:
fm10k_mbx_free_irq(interface);
err_mbx_irq:
fm10k_clear_queueing_scheme(interface);
reinit_err:
netif_device_detach(netdev);
rtnl_unlock();
clear_bit(__FM10K_RESETTING, interface->state);
return err;
}
static void fm10k_detach_subtask(struct fm10k_intfc *interface)
{
struct net_device *netdev = interface->netdev;
u32 __iomem *hw_addr;
u32 value;
fm10k: prepare_for_reset() when we lose PCIe Link If we lose PCIe link, such as when an unannounced PFLR event occurs, or when a device is surprise removed, we currently detach the device and close the netdev. This unfortunately leaves a lot of things still active, such as the msix_mbx_pf IRQ, and Tx/Rx resources. This can cause problems because the register reads will return potentially invalid values which may result in unknown driver behavior. Begin the process of resetting using fm10k_prepare_for_reset(), much in the same way as the suspend and resume cycle does. This will attempt to shutdown as much as possible, in order to prevent possible issues. A naive implementation for this has issues, because there are now multiple flows calling the reset logic and setting a reset bit. This would cause problems, because the "re-attach" routine might call fm10k_handle_reset() prior to the reset actually finishing. Instead, we'll add state bits to indicate which flow actually initiated the reset. For the general reset flow, we'll assume that if someone else is resetting that we do not need to handle it at all, so it does not need its own state bit. For the suspend case, we will simply issue a warning indicating that we are attempting to recover from this case when resuming. For the detached subtask, we'll simply refuse to re-attach until we've actually initiated a reset as part of that flow. Finally, we'll stop attempting to manage the mailbox subtask when we're detached, since there's nothing we can do if we don't have a PCIe address. Overall this produces a much cleaner shutdown and recovery cycle for a PCIe surprise remove event. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-07-11 04:23:14 +08:00
int err;
fm10k: prepare_for_reset() when we lose PCIe Link If we lose PCIe link, such as when an unannounced PFLR event occurs, or when a device is surprise removed, we currently detach the device and close the netdev. This unfortunately leaves a lot of things still active, such as the msix_mbx_pf IRQ, and Tx/Rx resources. This can cause problems because the register reads will return potentially invalid values which may result in unknown driver behavior. Begin the process of resetting using fm10k_prepare_for_reset(), much in the same way as the suspend and resume cycle does. This will attempt to shutdown as much as possible, in order to prevent possible issues. A naive implementation for this has issues, because there are now multiple flows calling the reset logic and setting a reset bit. This would cause problems, because the "re-attach" routine might call fm10k_handle_reset() prior to the reset actually finishing. Instead, we'll add state bits to indicate which flow actually initiated the reset. For the general reset flow, we'll assume that if someone else is resetting that we do not need to handle it at all, so it does not need its own state bit. For the suspend case, we will simply issue a warning indicating that we are attempting to recover from this case when resuming. For the detached subtask, we'll simply refuse to re-attach until we've actually initiated a reset as part of that flow. Finally, we'll stop attempting to manage the mailbox subtask when we're detached, since there's nothing we can do if we don't have a PCIe address. Overall this produces a much cleaner shutdown and recovery cycle for a PCIe surprise remove event. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-07-11 04:23:14 +08:00
/* do nothing if netdev is still present or hw_addr is set */
if (netif_device_present(netdev) || interface->hw.hw_addr)
return;
fm10k: prepare_for_reset() when we lose PCIe Link If we lose PCIe link, such as when an unannounced PFLR event occurs, or when a device is surprise removed, we currently detach the device and close the netdev. This unfortunately leaves a lot of things still active, such as the msix_mbx_pf IRQ, and Tx/Rx resources. This can cause problems because the register reads will return potentially invalid values which may result in unknown driver behavior. Begin the process of resetting using fm10k_prepare_for_reset(), much in the same way as the suspend and resume cycle does. This will attempt to shutdown as much as possible, in order to prevent possible issues. A naive implementation for this has issues, because there are now multiple flows calling the reset logic and setting a reset bit. This would cause problems, because the "re-attach" routine might call fm10k_handle_reset() prior to the reset actually finishing. Instead, we'll add state bits to indicate which flow actually initiated the reset. For the general reset flow, we'll assume that if someone else is resetting that we do not need to handle it at all, so it does not need its own state bit. For the suspend case, we will simply issue a warning indicating that we are attempting to recover from this case when resuming. For the detached subtask, we'll simply refuse to re-attach until we've actually initiated a reset as part of that flow. Finally, we'll stop attempting to manage the mailbox subtask when we're detached, since there's nothing we can do if we don't have a PCIe address. Overall this produces a much cleaner shutdown and recovery cycle for a PCIe surprise remove event. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-07-11 04:23:14 +08:00
/* We've lost the PCIe register space, and can no longer access the
* device. Shut everything except the detach subtask down and prepare
* to reset the device in case we recover. If we actually prepare for
* reset, indicate that we're detached.
*/
if (fm10k_prepare_for_reset(interface))
set_bit(__FM10K_RESET_DETACHED, interface->state);
/* check the real address space to see if we've recovered */
hw_addr = READ_ONCE(interface->uc_addr);
value = readl(hw_addr);
if (~value) {
fm10k: prepare_for_reset() when we lose PCIe Link If we lose PCIe link, such as when an unannounced PFLR event occurs, or when a device is surprise removed, we currently detach the device and close the netdev. This unfortunately leaves a lot of things still active, such as the msix_mbx_pf IRQ, and Tx/Rx resources. This can cause problems because the register reads will return potentially invalid values which may result in unknown driver behavior. Begin the process of resetting using fm10k_prepare_for_reset(), much in the same way as the suspend and resume cycle does. This will attempt to shutdown as much as possible, in order to prevent possible issues. A naive implementation for this has issues, because there are now multiple flows calling the reset logic and setting a reset bit. This would cause problems, because the "re-attach" routine might call fm10k_handle_reset() prior to the reset actually finishing. Instead, we'll add state bits to indicate which flow actually initiated the reset. For the general reset flow, we'll assume that if someone else is resetting that we do not need to handle it at all, so it does not need its own state bit. For the suspend case, we will simply issue a warning indicating that we are attempting to recover from this case when resuming. For the detached subtask, we'll simply refuse to re-attach until we've actually initiated a reset as part of that flow. Finally, we'll stop attempting to manage the mailbox subtask when we're detached, since there's nothing we can do if we don't have a PCIe address. Overall this produces a much cleaner shutdown and recovery cycle for a PCIe surprise remove event. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-07-11 04:23:14 +08:00
/* Make sure the reset was initiated because we detached,
* otherwise we might race with a different reset flow.
*/
if (!test_and_clear_bit(__FM10K_RESET_DETACHED,
interface->state))
return;
/* Restore the hardware address */
interface->hw.hw_addr = interface->uc_addr;
fm10k: prepare_for_reset() when we lose PCIe Link If we lose PCIe link, such as when an unannounced PFLR event occurs, or when a device is surprise removed, we currently detach the device and close the netdev. This unfortunately leaves a lot of things still active, such as the msix_mbx_pf IRQ, and Tx/Rx resources. This can cause problems because the register reads will return potentially invalid values which may result in unknown driver behavior. Begin the process of resetting using fm10k_prepare_for_reset(), much in the same way as the suspend and resume cycle does. This will attempt to shutdown as much as possible, in order to prevent possible issues. A naive implementation for this has issues, because there are now multiple flows calling the reset logic and setting a reset bit. This would cause problems, because the "re-attach" routine might call fm10k_handle_reset() prior to the reset actually finishing. Instead, we'll add state bits to indicate which flow actually initiated the reset. For the general reset flow, we'll assume that if someone else is resetting that we do not need to handle it at all, so it does not need its own state bit. For the suspend case, we will simply issue a warning indicating that we are attempting to recover from this case when resuming. For the detached subtask, we'll simply refuse to re-attach until we've actually initiated a reset as part of that flow. Finally, we'll stop attempting to manage the mailbox subtask when we're detached, since there's nothing we can do if we don't have a PCIe address. Overall this produces a much cleaner shutdown and recovery cycle for a PCIe surprise remove event. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-07-11 04:23:14 +08:00
/* PCIe link has been restored, and the device is active
* again. Restore everything and reset the device.
*/
err = fm10k_handle_reset(interface);
if (err) {
netdev_err(netdev, "Unable to reset device: %d\n", err);
interface->hw.hw_addr = NULL;
return;
}
/* Re-attach the netdev */
netif_device_attach(netdev);
netdev_warn(netdev, "PCIe link restored, device now attached\n");
return;
}
}
fm10k: prepare_for_reset() when we lose PCIe Link If we lose PCIe link, such as when an unannounced PFLR event occurs, or when a device is surprise removed, we currently detach the device and close the netdev. This unfortunately leaves a lot of things still active, such as the msix_mbx_pf IRQ, and Tx/Rx resources. This can cause problems because the register reads will return potentially invalid values which may result in unknown driver behavior. Begin the process of resetting using fm10k_prepare_for_reset(), much in the same way as the suspend and resume cycle does. This will attempt to shutdown as much as possible, in order to prevent possible issues. A naive implementation for this has issues, because there are now multiple flows calling the reset logic and setting a reset bit. This would cause problems, because the "re-attach" routine might call fm10k_handle_reset() prior to the reset actually finishing. Instead, we'll add state bits to indicate which flow actually initiated the reset. For the general reset flow, we'll assume that if someone else is resetting that we do not need to handle it at all, so it does not need its own state bit. For the suspend case, we will simply issue a warning indicating that we are attempting to recover from this case when resuming. For the detached subtask, we'll simply refuse to re-attach until we've actually initiated a reset as part of that flow. Finally, we'll stop attempting to manage the mailbox subtask when we're detached, since there's nothing we can do if we don't have a PCIe address. Overall this produces a much cleaner shutdown and recovery cycle for a PCIe surprise remove event. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-07-11 04:23:14 +08:00
static void fm10k_reset_subtask(struct fm10k_intfc *interface)
{
int err;
fm10k: use a BITMAP for flags to avoid race conditions Replace bitwise operators and #defines with a BITMAP and enumeration values. This is similar to how we handle the "state" values as well. This has two distinct advantages over the old method. First, we ensure correctness of operations which are currently problematic due to race conditions. Suppose that two kernel threads are running, such as the watchdog and an ethtool ioctl, and both modify flags. We'll say that the watchdog is CPU A, and the ethtool ioctl is CPU B. CPU A sets FLAG_1, which can be seen as CPU A read FLAGS CPU A write FLAGS | FLAG_1 CPU B sets FLAG_2, which can be seen as CPU B read FLAGS CPU A write FLAGS | FLAG_2 However, "|=" and "&=" operators are not actually atomic. So this could be ordered like the following: CPU A read FLAGS -> variable CPU B read FLAGS -> variable CPU A write FLAGS (variable | FLAG_1) CPU B write FLAGS (variable | FLAG_2) Notice how the 2nd write from CPU B could actually undo the write from CPU A because it isn't guaranteed that the |= operation is atomic. In practice the race windows for most flag writes is incredibly narrow so it is not easy to isolate issues. However, the more flags we have, the more likely they will cause problems. Additionally, if such a problem were to arise, it would be incredibly difficult to track down. Second, there is an additional advantage beyond code correctness. We can now automatically size the BITMAP if more flags were added, so that we do not need to remember that flags is u32 and thus if we added too many flags we would over-run the variable. This is not a likely occurrence for fm10k driver, but this patch can serve as an example for other drivers which have many more flags. This particular change does have a bit of trouble converting some of the idioms previously used with the #defines for flags. Specifically, when converting FM10K_FLAG_RSS_FIELD_IPV[46]_UDP flags. This whole operation was actually quite problematic, because we actually stored flags separately. This could more easily show the problem of the above re-ordering issue. This is really difficult to test whether atomics make a difference in practical scenarios, but you can ensure that basic functionality remains the same. This patch has a lot of code coverage, but most of it is relatively simple. While we are modifying these files, update their copyright year. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-01-13 07:59:38 +08:00
if (!test_and_clear_bit(FM10K_FLAG_RESET_REQUESTED,
interface->flags))
return;
fm10k: prepare_for_reset() when we lose PCIe Link If we lose PCIe link, such as when an unannounced PFLR event occurs, or when a device is surprise removed, we currently detach the device and close the netdev. This unfortunately leaves a lot of things still active, such as the msix_mbx_pf IRQ, and Tx/Rx resources. This can cause problems because the register reads will return potentially invalid values which may result in unknown driver behavior. Begin the process of resetting using fm10k_prepare_for_reset(), much in the same way as the suspend and resume cycle does. This will attempt to shutdown as much as possible, in order to prevent possible issues. A naive implementation for this has issues, because there are now multiple flows calling the reset logic and setting a reset bit. This would cause problems, because the "re-attach" routine might call fm10k_handle_reset() prior to the reset actually finishing. Instead, we'll add state bits to indicate which flow actually initiated the reset. For the general reset flow, we'll assume that if someone else is resetting that we do not need to handle it at all, so it does not need its own state bit. For the suspend case, we will simply issue a warning indicating that we are attempting to recover from this case when resuming. For the detached subtask, we'll simply refuse to re-attach until we've actually initiated a reset as part of that flow. Finally, we'll stop attempting to manage the mailbox subtask when we're detached, since there's nothing we can do if we don't have a PCIe address. Overall this produces a much cleaner shutdown and recovery cycle for a PCIe surprise remove event. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-07-11 04:23:14 +08:00
/* If another thread has already prepared to reset the device, we
* should not attempt to handle a reset here, since we'd race with
* that thread. This may happen if we suspend the device or if the
* PCIe link is lost. In this case, we'll just ignore the RESET
* request, as it will (eventually) be taken care of when the thread
* which actually started the reset is finished.
*/
if (!fm10k_prepare_for_reset(interface))
return;
netdev_err(interface->netdev, "Reset interface\n");
fm10k: prepare_for_reset() when we lose PCIe Link If we lose PCIe link, such as when an unannounced PFLR event occurs, or when a device is surprise removed, we currently detach the device and close the netdev. This unfortunately leaves a lot of things still active, such as the msix_mbx_pf IRQ, and Tx/Rx resources. This can cause problems because the register reads will return potentially invalid values which may result in unknown driver behavior. Begin the process of resetting using fm10k_prepare_for_reset(), much in the same way as the suspend and resume cycle does. This will attempt to shutdown as much as possible, in order to prevent possible issues. A naive implementation for this has issues, because there are now multiple flows calling the reset logic and setting a reset bit. This would cause problems, because the "re-attach" routine might call fm10k_handle_reset() prior to the reset actually finishing. Instead, we'll add state bits to indicate which flow actually initiated the reset. For the general reset flow, we'll assume that if someone else is resetting that we do not need to handle it at all, so it does not need its own state bit. For the suspend case, we will simply issue a warning indicating that we are attempting to recover from this case when resuming. For the detached subtask, we'll simply refuse to re-attach until we've actually initiated a reset as part of that flow. Finally, we'll stop attempting to manage the mailbox subtask when we're detached, since there's nothing we can do if we don't have a PCIe address. Overall this produces a much cleaner shutdown and recovery cycle for a PCIe surprise remove event. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-07-11 04:23:14 +08:00
err = fm10k_handle_reset(interface);
if (err)
dev_err(&interface->pdev->dev,
"fm10k_handle_reset failed: %d\n", err);
}
/**
* fm10k_configure_swpri_map - Configure Receive SWPRI to PC mapping
* @interface: board private structure
*
* Configure the SWPRI to PC mapping for the port.
**/
static void fm10k_configure_swpri_map(struct fm10k_intfc *interface)
{
struct net_device *netdev = interface->netdev;
struct fm10k_hw *hw = &interface->hw;
int i;
/* clear flag indicating update is needed */
fm10k: use a BITMAP for flags to avoid race conditions Replace bitwise operators and #defines with a BITMAP and enumeration values. This is similar to how we handle the "state" values as well. This has two distinct advantages over the old method. First, we ensure correctness of operations which are currently problematic due to race conditions. Suppose that two kernel threads are running, such as the watchdog and an ethtool ioctl, and both modify flags. We'll say that the watchdog is CPU A, and the ethtool ioctl is CPU B. CPU A sets FLAG_1, which can be seen as CPU A read FLAGS CPU A write FLAGS | FLAG_1 CPU B sets FLAG_2, which can be seen as CPU B read FLAGS CPU A write FLAGS | FLAG_2 However, "|=" and "&=" operators are not actually atomic. So this could be ordered like the following: CPU A read FLAGS -> variable CPU B read FLAGS -> variable CPU A write FLAGS (variable | FLAG_1) CPU B write FLAGS (variable | FLAG_2) Notice how the 2nd write from CPU B could actually undo the write from CPU A because it isn't guaranteed that the |= operation is atomic. In practice the race windows for most flag writes is incredibly narrow so it is not easy to isolate issues. However, the more flags we have, the more likely they will cause problems. Additionally, if such a problem were to arise, it would be incredibly difficult to track down. Second, there is an additional advantage beyond code correctness. We can now automatically size the BITMAP if more flags were added, so that we do not need to remember that flags is u32 and thus if we added too many flags we would over-run the variable. This is not a likely occurrence for fm10k driver, but this patch can serve as an example for other drivers which have many more flags. This particular change does have a bit of trouble converting some of the idioms previously used with the #defines for flags. Specifically, when converting FM10K_FLAG_RSS_FIELD_IPV[46]_UDP flags. This whole operation was actually quite problematic, because we actually stored flags separately. This could more easily show the problem of the above re-ordering issue. This is really difficult to test whether atomics make a difference in practical scenarios, but you can ensure that basic functionality remains the same. This patch has a lot of code coverage, but most of it is relatively simple. While we are modifying these files, update their copyright year. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-01-13 07:59:38 +08:00
clear_bit(FM10K_FLAG_SWPRI_CONFIG, interface->flags);
/* these registers are only available on the PF */
if (hw->mac.type != fm10k_mac_pf)
return;
/* configure SWPRI to PC map */
for (i = 0; i < FM10K_SWPRI_MAX; i++)
fm10k_write_reg(hw, FM10K_SWPRI_MAP(i),
netdev_get_prio_tc_map(netdev, i));
}
/**
* fm10k_watchdog_update_host_state - Update the link status based on host.
* @interface: board private structure
**/
static void fm10k_watchdog_update_host_state(struct fm10k_intfc *interface)
{
struct fm10k_hw *hw = &interface->hw;
s32 err;
if (test_bit(__FM10K_LINK_DOWN, interface->state)) {
interface->host_ready = false;
if (time_is_after_jiffies(interface->link_down_event))
return;
clear_bit(__FM10K_LINK_DOWN, interface->state);
}
fm10k: use a BITMAP for flags to avoid race conditions Replace bitwise operators and #defines with a BITMAP and enumeration values. This is similar to how we handle the "state" values as well. This has two distinct advantages over the old method. First, we ensure correctness of operations which are currently problematic due to race conditions. Suppose that two kernel threads are running, such as the watchdog and an ethtool ioctl, and both modify flags. We'll say that the watchdog is CPU A, and the ethtool ioctl is CPU B. CPU A sets FLAG_1, which can be seen as CPU A read FLAGS CPU A write FLAGS | FLAG_1 CPU B sets FLAG_2, which can be seen as CPU B read FLAGS CPU A write FLAGS | FLAG_2 However, "|=" and "&=" operators are not actually atomic. So this could be ordered like the following: CPU A read FLAGS -> variable CPU B read FLAGS -> variable CPU A write FLAGS (variable | FLAG_1) CPU B write FLAGS (variable | FLAG_2) Notice how the 2nd write from CPU B could actually undo the write from CPU A because it isn't guaranteed that the |= operation is atomic. In practice the race windows for most flag writes is incredibly narrow so it is not easy to isolate issues. However, the more flags we have, the more likely they will cause problems. Additionally, if such a problem were to arise, it would be incredibly difficult to track down. Second, there is an additional advantage beyond code correctness. We can now automatically size the BITMAP if more flags were added, so that we do not need to remember that flags is u32 and thus if we added too many flags we would over-run the variable. This is not a likely occurrence for fm10k driver, but this patch can serve as an example for other drivers which have many more flags. This particular change does have a bit of trouble converting some of the idioms previously used with the #defines for flags. Specifically, when converting FM10K_FLAG_RSS_FIELD_IPV[46]_UDP flags. This whole operation was actually quite problematic, because we actually stored flags separately. This could more easily show the problem of the above re-ordering issue. This is really difficult to test whether atomics make a difference in practical scenarios, but you can ensure that basic functionality remains the same. This patch has a lot of code coverage, but most of it is relatively simple. While we are modifying these files, update their copyright year. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-01-13 07:59:38 +08:00
if (test_bit(FM10K_FLAG_SWPRI_CONFIG, interface->flags)) {
if (rtnl_trylock()) {
fm10k_configure_swpri_map(interface);
rtnl_unlock();
}
}
/* lock the mailbox for transmit and receive */
fm10k_mbx_lock(interface);
err = hw->mac.ops.get_host_state(hw, &interface->host_ready);
if (err && time_is_before_jiffies(interface->last_reset))
fm10k: use a BITMAP for flags to avoid race conditions Replace bitwise operators and #defines with a BITMAP and enumeration values. This is similar to how we handle the "state" values as well. This has two distinct advantages over the old method. First, we ensure correctness of operations which are currently problematic due to race conditions. Suppose that two kernel threads are running, such as the watchdog and an ethtool ioctl, and both modify flags. We'll say that the watchdog is CPU A, and the ethtool ioctl is CPU B. CPU A sets FLAG_1, which can be seen as CPU A read FLAGS CPU A write FLAGS | FLAG_1 CPU B sets FLAG_2, which can be seen as CPU B read FLAGS CPU A write FLAGS | FLAG_2 However, "|=" and "&=" operators are not actually atomic. So this could be ordered like the following: CPU A read FLAGS -> variable CPU B read FLAGS -> variable CPU A write FLAGS (variable | FLAG_1) CPU B write FLAGS (variable | FLAG_2) Notice how the 2nd write from CPU B could actually undo the write from CPU A because it isn't guaranteed that the |= operation is atomic. In practice the race windows for most flag writes is incredibly narrow so it is not easy to isolate issues. However, the more flags we have, the more likely they will cause problems. Additionally, if such a problem were to arise, it would be incredibly difficult to track down. Second, there is an additional advantage beyond code correctness. We can now automatically size the BITMAP if more flags were added, so that we do not need to remember that flags is u32 and thus if we added too many flags we would over-run the variable. This is not a likely occurrence for fm10k driver, but this patch can serve as an example for other drivers which have many more flags. This particular change does have a bit of trouble converting some of the idioms previously used with the #defines for flags. Specifically, when converting FM10K_FLAG_RSS_FIELD_IPV[46]_UDP flags. This whole operation was actually quite problematic, because we actually stored flags separately. This could more easily show the problem of the above re-ordering issue. This is really difficult to test whether atomics make a difference in practical scenarios, but you can ensure that basic functionality remains the same. This patch has a lot of code coverage, but most of it is relatively simple. While we are modifying these files, update their copyright year. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-01-13 07:59:38 +08:00
set_bit(FM10K_FLAG_RESET_REQUESTED, interface->flags);
/* free the lock */
fm10k_mbx_unlock(interface);
}
/**
* fm10k_mbx_subtask - Process upstream and downstream mailboxes
* @interface: board private structure
*
* This function will process both the upstream and downstream mailboxes.
**/
static void fm10k_mbx_subtask(struct fm10k_intfc *interface)
{
fm10k: prepare_for_reset() when we lose PCIe Link If we lose PCIe link, such as when an unannounced PFLR event occurs, or when a device is surprise removed, we currently detach the device and close the netdev. This unfortunately leaves a lot of things still active, such as the msix_mbx_pf IRQ, and Tx/Rx resources. This can cause problems because the register reads will return potentially invalid values which may result in unknown driver behavior. Begin the process of resetting using fm10k_prepare_for_reset(), much in the same way as the suspend and resume cycle does. This will attempt to shutdown as much as possible, in order to prevent possible issues. A naive implementation for this has issues, because there are now multiple flows calling the reset logic and setting a reset bit. This would cause problems, because the "re-attach" routine might call fm10k_handle_reset() prior to the reset actually finishing. Instead, we'll add state bits to indicate which flow actually initiated the reset. For the general reset flow, we'll assume that if someone else is resetting that we do not need to handle it at all, so it does not need its own state bit. For the suspend case, we will simply issue a warning indicating that we are attempting to recover from this case when resuming. For the detached subtask, we'll simply refuse to re-attach until we've actually initiated a reset as part of that flow. Finally, we'll stop attempting to manage the mailbox subtask when we're detached, since there's nothing we can do if we don't have a PCIe address. Overall this produces a much cleaner shutdown and recovery cycle for a PCIe surprise remove event. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-07-11 04:23:14 +08:00
/* If we're resetting, bail out */
if (test_bit(__FM10K_RESETTING, interface->state))
return;
/* process upstream mailbox and update device state */
fm10k_watchdog_update_host_state(interface);
/* process downstream mailboxes */
fm10k_iov_mbx(interface);
}
/**
* fm10k_watchdog_host_is_ready - Update netdev status based on host ready
* @interface: board private structure
**/
static void fm10k_watchdog_host_is_ready(struct fm10k_intfc *interface)
{
struct net_device *netdev = interface->netdev;
/* only continue if link state is currently down */
if (netif_carrier_ok(netdev))
return;
netif_info(interface, drv, netdev, "NIC Link is up\n");
netif_carrier_on(netdev);
netif_tx_wake_all_queues(netdev);
}
/**
* fm10k_watchdog_host_not_ready - Update netdev status based on host not ready
* @interface: board private structure
**/
static void fm10k_watchdog_host_not_ready(struct fm10k_intfc *interface)
{
struct net_device *netdev = interface->netdev;
/* only continue if link state is currently up */
if (!netif_carrier_ok(netdev))
return;
netif_info(interface, drv, netdev, "NIC Link is down\n");
netif_carrier_off(netdev);
netif_tx_stop_all_queues(netdev);
}
/**
* fm10k_update_stats - Update the board statistics counters.
* @interface: board private structure
**/
void fm10k_update_stats(struct fm10k_intfc *interface)
{
struct net_device_stats *net_stats = &interface->netdev->stats;
struct fm10k_hw *hw = &interface->hw;
u64 hw_csum_tx_good = 0, hw_csum_rx_good = 0, rx_length_errors = 0;
u64 rx_switch_errors = 0, rx_drops = 0, rx_pp_errors = 0;
u64 rx_link_errors = 0;
u64 rx_errors = 0, rx_csum_errors = 0, tx_csum_errors = 0;
u64 restart_queue = 0, tx_busy = 0, alloc_failed = 0;
u64 rx_bytes_nic = 0, rx_pkts_nic = 0, rx_drops_nic = 0;
u64 tx_bytes_nic = 0, tx_pkts_nic = 0;
u64 bytes, pkts;
int i;
/* ensure only one thread updates stats at a time */
if (test_and_set_bit(__FM10K_UPDATING_STATS, interface->state))
return;
/* do not allow stats update via service task for next second */
interface->next_stats_update = jiffies + HZ;
/* gather some stats to the interface struct that are per queue */
for (bytes = 0, pkts = 0, i = 0; i < interface->num_tx_queues; i++) {
struct fm10k_ring *tx_ring = READ_ONCE(interface->tx_ring[i]);
if (!tx_ring)
continue;
restart_queue += tx_ring->tx_stats.restart_queue;
tx_busy += tx_ring->tx_stats.tx_busy;
tx_csum_errors += tx_ring->tx_stats.csum_err;
bytes += tx_ring->stats.bytes;
pkts += tx_ring->stats.packets;
hw_csum_tx_good += tx_ring->tx_stats.csum_good;
}
interface->restart_queue = restart_queue;
interface->tx_busy = tx_busy;
net_stats->tx_bytes = bytes;
net_stats->tx_packets = pkts;
interface->tx_csum_errors = tx_csum_errors;
interface->hw_csum_tx_good = hw_csum_tx_good;
/* gather some stats to the interface struct that are per queue */
for (bytes = 0, pkts = 0, i = 0; i < interface->num_rx_queues; i++) {
struct fm10k_ring *rx_ring = READ_ONCE(interface->rx_ring[i]);
if (!rx_ring)
continue;
bytes += rx_ring->stats.bytes;
pkts += rx_ring->stats.packets;
alloc_failed += rx_ring->rx_stats.alloc_failed;
rx_csum_errors += rx_ring->rx_stats.csum_err;
rx_errors += rx_ring->rx_stats.errors;
hw_csum_rx_good += rx_ring->rx_stats.csum_good;
rx_switch_errors += rx_ring->rx_stats.switch_errors;
rx_drops += rx_ring->rx_stats.drops;
rx_pp_errors += rx_ring->rx_stats.pp_errors;
rx_link_errors += rx_ring->rx_stats.link_errors;
rx_length_errors += rx_ring->rx_stats.length_errors;
}
net_stats->rx_bytes = bytes;
net_stats->rx_packets = pkts;
interface->alloc_failed = alloc_failed;
interface->rx_csum_errors = rx_csum_errors;
interface->hw_csum_rx_good = hw_csum_rx_good;
interface->rx_switch_errors = rx_switch_errors;
interface->rx_drops = rx_drops;
interface->rx_pp_errors = rx_pp_errors;
interface->rx_link_errors = rx_link_errors;
interface->rx_length_errors = rx_length_errors;
hw->mac.ops.update_hw_stats(hw, &interface->stats);
for (i = 0; i < hw->mac.max_queues; i++) {
struct fm10k_hw_stats_q *q = &interface->stats.q[i];
tx_bytes_nic += q->tx_bytes.count;
tx_pkts_nic += q->tx_packets.count;
rx_bytes_nic += q->rx_bytes.count;
rx_pkts_nic += q->rx_packets.count;
rx_drops_nic += q->rx_drops.count;
}
interface->tx_bytes_nic = tx_bytes_nic;
interface->tx_packets_nic = tx_pkts_nic;
interface->rx_bytes_nic = rx_bytes_nic;
interface->rx_packets_nic = rx_pkts_nic;
interface->rx_drops_nic = rx_drops_nic;
/* Fill out the OS statistics structure */
net_stats->rx_errors = rx_errors;
net_stats->rx_dropped = interface->stats.nodesc_drop.count;
clear_bit(__FM10K_UPDATING_STATS, interface->state);
}
/**
* fm10k_watchdog_flush_tx - flush queues on host not ready
* @interface - pointer to the device interface structure
**/
static void fm10k_watchdog_flush_tx(struct fm10k_intfc *interface)
{
int some_tx_pending = 0;
int i;
/* nothing to do if carrier is up */
if (netif_carrier_ok(interface->netdev))
return;
for (i = 0; i < interface->num_tx_queues; i++) {
struct fm10k_ring *tx_ring = interface->tx_ring[i];
if (tx_ring->next_to_use != tx_ring->next_to_clean) {
some_tx_pending = 1;
break;
}
}
/* We've lost link, so the controller stops DMA, but we've got
* queued Tx work that's never going to get done, so reset
* controller to flush Tx.
*/
if (some_tx_pending)
fm10k: use a BITMAP for flags to avoid race conditions Replace bitwise operators and #defines with a BITMAP and enumeration values. This is similar to how we handle the "state" values as well. This has two distinct advantages over the old method. First, we ensure correctness of operations which are currently problematic due to race conditions. Suppose that two kernel threads are running, such as the watchdog and an ethtool ioctl, and both modify flags. We'll say that the watchdog is CPU A, and the ethtool ioctl is CPU B. CPU A sets FLAG_1, which can be seen as CPU A read FLAGS CPU A write FLAGS | FLAG_1 CPU B sets FLAG_2, which can be seen as CPU B read FLAGS CPU A write FLAGS | FLAG_2 However, "|=" and "&=" operators are not actually atomic. So this could be ordered like the following: CPU A read FLAGS -> variable CPU B read FLAGS -> variable CPU A write FLAGS (variable | FLAG_1) CPU B write FLAGS (variable | FLAG_2) Notice how the 2nd write from CPU B could actually undo the write from CPU A because it isn't guaranteed that the |= operation is atomic. In practice the race windows for most flag writes is incredibly narrow so it is not easy to isolate issues. However, the more flags we have, the more likely they will cause problems. Additionally, if such a problem were to arise, it would be incredibly difficult to track down. Second, there is an additional advantage beyond code correctness. We can now automatically size the BITMAP if more flags were added, so that we do not need to remember that flags is u32 and thus if we added too many flags we would over-run the variable. This is not a likely occurrence for fm10k driver, but this patch can serve as an example for other drivers which have many more flags. This particular change does have a bit of trouble converting some of the idioms previously used with the #defines for flags. Specifically, when converting FM10K_FLAG_RSS_FIELD_IPV[46]_UDP flags. This whole operation was actually quite problematic, because we actually stored flags separately. This could more easily show the problem of the above re-ordering issue. This is really difficult to test whether atomics make a difference in practical scenarios, but you can ensure that basic functionality remains the same. This patch has a lot of code coverage, but most of it is relatively simple. While we are modifying these files, update their copyright year. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-01-13 07:59:38 +08:00
set_bit(FM10K_FLAG_RESET_REQUESTED, interface->flags);
}
/**
* fm10k_watchdog_subtask - check and bring link up
* @interface - pointer to the device interface structure
**/
static void fm10k_watchdog_subtask(struct fm10k_intfc *interface)
{
/* if interface is down do nothing */
if (test_bit(__FM10K_DOWN, interface->state) ||
test_bit(__FM10K_RESETTING, interface->state))
return;
if (interface->host_ready)
fm10k_watchdog_host_is_ready(interface);
else
fm10k_watchdog_host_not_ready(interface);
/* update stats only once every second */
if (time_is_before_jiffies(interface->next_stats_update))
fm10k_update_stats(interface);
/* flush any uncompleted work */
fm10k_watchdog_flush_tx(interface);
}
/**
* fm10k_check_hang_subtask - check for hung queues and dropped interrupts
* @interface - pointer to the device interface structure
*
* This function serves two purposes. First it strobes the interrupt lines
* in order to make certain interrupts are occurring. Secondly it sets the
* bits needed to check for TX hangs. As a result we should immediately
* determine if a hang has occurred.
*/
static void fm10k_check_hang_subtask(struct fm10k_intfc *interface)
{
int i;
/* If we're down or resetting, just bail */
if (test_bit(__FM10K_DOWN, interface->state) ||
test_bit(__FM10K_RESETTING, interface->state))
return;
/* rate limit tx hang checks to only once every 2 seconds */
if (time_is_after_eq_jiffies(interface->next_tx_hang_check))
return;
interface->next_tx_hang_check = jiffies + (2 * HZ);
if (netif_carrier_ok(interface->netdev)) {
/* Force detection of hung controller */
for (i = 0; i < interface->num_tx_queues; i++)
set_check_for_tx_hang(interface->tx_ring[i]);
/* Rearm all in-use q_vectors for immediate firing */
for (i = 0; i < interface->num_q_vectors; i++) {
struct fm10k_q_vector *qv = interface->q_vector[i];
if (!qv->tx.count && !qv->rx.count)
continue;
writel(FM10K_ITR_ENABLE | FM10K_ITR_PENDING2, qv->itr);
}
}
}
/**
* fm10k_service_task - manages and runs subtasks
* @work: pointer to work_struct containing our data
**/
static void fm10k_service_task(struct work_struct *work)
{
struct fm10k_intfc *interface;
interface = container_of(work, struct fm10k_intfc, service_task);
fm10k: prepare_for_reset() when we lose PCIe Link If we lose PCIe link, such as when an unannounced PFLR event occurs, or when a device is surprise removed, we currently detach the device and close the netdev. This unfortunately leaves a lot of things still active, such as the msix_mbx_pf IRQ, and Tx/Rx resources. This can cause problems because the register reads will return potentially invalid values which may result in unknown driver behavior. Begin the process of resetting using fm10k_prepare_for_reset(), much in the same way as the suspend and resume cycle does. This will attempt to shutdown as much as possible, in order to prevent possible issues. A naive implementation for this has issues, because there are now multiple flows calling the reset logic and setting a reset bit. This would cause problems, because the "re-attach" routine might call fm10k_handle_reset() prior to the reset actually finishing. Instead, we'll add state bits to indicate which flow actually initiated the reset. For the general reset flow, we'll assume that if someone else is resetting that we do not need to handle it at all, so it does not need its own state bit. For the suspend case, we will simply issue a warning indicating that we are attempting to recover from this case when resuming. For the detached subtask, we'll simply refuse to re-attach until we've actually initiated a reset as part of that flow. Finally, we'll stop attempting to manage the mailbox subtask when we're detached, since there's nothing we can do if we don't have a PCIe address. Overall this produces a much cleaner shutdown and recovery cycle for a PCIe surprise remove event. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-07-11 04:23:14 +08:00
/* Check whether we're detached first */
fm10k_detach_subtask(interface);
/* tasks run even when interface is down */
fm10k_mbx_subtask(interface);
fm10k_reset_subtask(interface);
/* tasks only run when interface is up */
fm10k_watchdog_subtask(interface);
fm10k_check_hang_subtask(interface);
/* release lock on service events to allow scheduling next event */
fm10k_service_event_complete(interface);
}
/**
* fm10k_configure_tx_ring - Configure Tx ring after Reset
* @interface: board private structure
* @ring: structure containing ring specific data
*
* Configure the Tx descriptor ring after a reset.
**/
static void fm10k_configure_tx_ring(struct fm10k_intfc *interface,
struct fm10k_ring *ring)
{
struct fm10k_hw *hw = &interface->hw;
u64 tdba = ring->dma;
u32 size = ring->count * sizeof(struct fm10k_tx_desc);
u32 txint = FM10K_INT_MAP_DISABLE;
u32 txdctl = BIT(FM10K_TXDCTL_MAX_TIME_SHIFT) | FM10K_TXDCTL_ENABLE;
u8 reg_idx = ring->reg_idx;
/* disable queue to avoid issues while updating state */
fm10k_write_reg(hw, FM10K_TXDCTL(reg_idx), 0);
fm10k_write_flush(hw);
/* possible poll here to verify ring resources have been cleaned */
/* set location and size for descriptor ring */
fm10k_write_reg(hw, FM10K_TDBAL(reg_idx), tdba & DMA_BIT_MASK(32));
fm10k_write_reg(hw, FM10K_TDBAH(reg_idx), tdba >> 32);
fm10k_write_reg(hw, FM10K_TDLEN(reg_idx), size);
/* reset head and tail pointers */
fm10k_write_reg(hw, FM10K_TDH(reg_idx), 0);
fm10k_write_reg(hw, FM10K_TDT(reg_idx), 0);
/* store tail pointer */
ring->tail = &interface->uc_addr[FM10K_TDT(reg_idx)];
/* reset ntu and ntc to place SW in sync with hardware */
ring->next_to_clean = 0;
ring->next_to_use = 0;
/* Map interrupt */
if (ring->q_vector) {
txint = ring->q_vector->v_idx + NON_Q_VECTORS(hw);
txint |= FM10K_INT_MAP_TIMER0;
}
fm10k_write_reg(hw, FM10K_TXINT(reg_idx), txint);
/* enable use of FTAG bit in Tx descriptor, register is RO for VF */
fm10k_write_reg(hw, FM10K_PFVTCTL(reg_idx),
FM10K_PFVTCTL_FTAG_DESC_ENABLE);
/* Initialize XPS */
if (!test_and_set_bit(__FM10K_TX_XPS_INIT_DONE, ring->state) &&
ring->q_vector)
netif_set_xps_queue(ring->netdev,
&ring->q_vector->affinity_mask,
ring->queue_index);
/* enable queue */
fm10k_write_reg(hw, FM10K_TXDCTL(reg_idx), txdctl);
}
/**
* fm10k_enable_tx_ring - Verify Tx ring is enabled after configuration
* @interface: board private structure
* @ring: structure containing ring specific data
*
* Verify the Tx descriptor ring is ready for transmit.
**/
static void fm10k_enable_tx_ring(struct fm10k_intfc *interface,
struct fm10k_ring *ring)
{
struct fm10k_hw *hw = &interface->hw;
int wait_loop = 10;
u32 txdctl;
u8 reg_idx = ring->reg_idx;
/* if we are already enabled just exit */
if (fm10k_read_reg(hw, FM10K_TXDCTL(reg_idx)) & FM10K_TXDCTL_ENABLE)
return;
/* poll to verify queue is enabled */
do {
usleep_range(1000, 2000);
txdctl = fm10k_read_reg(hw, FM10K_TXDCTL(reg_idx));
} while (!(txdctl & FM10K_TXDCTL_ENABLE) && --wait_loop);
if (!wait_loop)
netif_err(interface, drv, interface->netdev,
"Could not enable Tx Queue %d\n", reg_idx);
}
/**
* fm10k_configure_tx - Configure Transmit Unit after Reset
* @interface: board private structure
*
* Configure the Tx unit of the MAC after a reset.
**/
static void fm10k_configure_tx(struct fm10k_intfc *interface)
{
int i;
/* Setup the HW Tx Head and Tail descriptor pointers */
for (i = 0; i < interface->num_tx_queues; i++)
fm10k_configure_tx_ring(interface, interface->tx_ring[i]);
/* poll here to verify that Tx rings are now enabled */
for (i = 0; i < interface->num_tx_queues; i++)
fm10k_enable_tx_ring(interface, interface->tx_ring[i]);
}
/**
* fm10k_configure_rx_ring - Configure Rx ring after Reset
* @interface: board private structure
* @ring: structure containing ring specific data
*
* Configure the Rx descriptor ring after a reset.
**/
static void fm10k_configure_rx_ring(struct fm10k_intfc *interface,
struct fm10k_ring *ring)
{
u64 rdba = ring->dma;
struct fm10k_hw *hw = &interface->hw;
u32 size = ring->count * sizeof(union fm10k_rx_desc);
u32 rxqctl, rxdctl = FM10K_RXDCTL_WRITE_BACK_MIN_DELAY;
u32 srrctl = FM10K_SRRCTL_BUFFER_CHAINING_EN;
u32 rxint = FM10K_INT_MAP_DISABLE;
u8 rx_pause = interface->rx_pause;
u8 reg_idx = ring->reg_idx;
/* disable queue to avoid issues while updating state */
rxqctl = fm10k_read_reg(hw, FM10K_RXQCTL(reg_idx));
rxqctl &= ~FM10K_RXQCTL_ENABLE;
fm10k_write_reg(hw, FM10K_RXQCTL(reg_idx), rxqctl);
fm10k_write_flush(hw);
/* possible poll here to verify ring resources have been cleaned */
/* set location and size for descriptor ring */
fm10k_write_reg(hw, FM10K_RDBAL(reg_idx), rdba & DMA_BIT_MASK(32));
fm10k_write_reg(hw, FM10K_RDBAH(reg_idx), rdba >> 32);
fm10k_write_reg(hw, FM10K_RDLEN(reg_idx), size);
/* reset head and tail pointers */
fm10k_write_reg(hw, FM10K_RDH(reg_idx), 0);
fm10k_write_reg(hw, FM10K_RDT(reg_idx), 0);
/* store tail pointer */
ring->tail = &interface->uc_addr[FM10K_RDT(reg_idx)];
/* reset ntu and ntc to place SW in sync with hardware */
ring->next_to_clean = 0;
ring->next_to_use = 0;
ring->next_to_alloc = 0;
/* Configure the Rx buffer size for one buff without split */
srrctl |= FM10K_RX_BUFSZ >> FM10K_SRRCTL_BSIZEPKT_SHIFT;
/* Configure the Rx ring to suppress loopback packets */
srrctl |= FM10K_SRRCTL_LOOPBACK_SUPPRESS;
fm10k_write_reg(hw, FM10K_SRRCTL(reg_idx), srrctl);
/* Enable drop on empty */
#ifdef CONFIG_DCB
if (interface->pfc_en)
rx_pause = interface->pfc_en;
#endif
if (!(rx_pause & BIT(ring->qos_pc)))
rxdctl |= FM10K_RXDCTL_DROP_ON_EMPTY;
fm10k_write_reg(hw, FM10K_RXDCTL(reg_idx), rxdctl);
/* assign default VLAN to queue */
ring->vid = hw->mac.default_vid;
/* if we have an active VLAN, disable default VLAN ID */
if (test_bit(hw->mac.default_vid, interface->active_vlans))
ring->vid |= FM10K_VLAN_CLEAR;
/* Map interrupt */
if (ring->q_vector) {
rxint = ring->q_vector->v_idx + NON_Q_VECTORS(hw);
rxint |= FM10K_INT_MAP_TIMER1;
}
fm10k_write_reg(hw, FM10K_RXINT(reg_idx), rxint);
/* enable queue */
rxqctl = fm10k_read_reg(hw, FM10K_RXQCTL(reg_idx));
rxqctl |= FM10K_RXQCTL_ENABLE;
fm10k_write_reg(hw, FM10K_RXQCTL(reg_idx), rxqctl);
/* place buffers on ring for receive data */
fm10k_alloc_rx_buffers(ring, fm10k_desc_unused(ring));
}
/**
* fm10k_update_rx_drop_en - Configures the drop enable bits for Rx rings
* @interface: board private structure
*
* Configure the drop enable bits for the Rx rings.
**/
void fm10k_update_rx_drop_en(struct fm10k_intfc *interface)
{
struct fm10k_hw *hw = &interface->hw;
u8 rx_pause = interface->rx_pause;
int i;
#ifdef CONFIG_DCB
if (interface->pfc_en)
rx_pause = interface->pfc_en;
#endif
for (i = 0; i < interface->num_rx_queues; i++) {
struct fm10k_ring *ring = interface->rx_ring[i];
u32 rxdctl = FM10K_RXDCTL_WRITE_BACK_MIN_DELAY;
u8 reg_idx = ring->reg_idx;
if (!(rx_pause & BIT(ring->qos_pc)))
rxdctl |= FM10K_RXDCTL_DROP_ON_EMPTY;
fm10k_write_reg(hw, FM10K_RXDCTL(reg_idx), rxdctl);
}
}
/**
* fm10k_configure_dglort - Configure Receive DGLORT after reset
* @interface: board private structure
*
* Configure the DGLORT description and RSS tables.
**/
static void fm10k_configure_dglort(struct fm10k_intfc *interface)
{
struct fm10k_dglort_cfg dglort = { 0 };
struct fm10k_hw *hw = &interface->hw;
int i;
u32 mrqc;
/* Fill out hash function seeds */
for (i = 0; i < FM10K_RSSRK_SIZE; i++)
fm10k_write_reg(hw, FM10K_RSSRK(0, i), interface->rssrk[i]);
/* Write RETA table to hardware */
for (i = 0; i < FM10K_RETA_SIZE; i++)
fm10k_write_reg(hw, FM10K_RETA(0, i), interface->reta[i]);
/* Generate RSS hash based on packet types, TCP/UDP
* port numbers and/or IPv4/v6 src and dst addresses
*/
mrqc = FM10K_MRQC_IPV4 |
FM10K_MRQC_TCP_IPV4 |
FM10K_MRQC_IPV6 |
FM10K_MRQC_TCP_IPV6;
fm10k: use a BITMAP for flags to avoid race conditions Replace bitwise operators and #defines with a BITMAP and enumeration values. This is similar to how we handle the "state" values as well. This has two distinct advantages over the old method. First, we ensure correctness of operations which are currently problematic due to race conditions. Suppose that two kernel threads are running, such as the watchdog and an ethtool ioctl, and both modify flags. We'll say that the watchdog is CPU A, and the ethtool ioctl is CPU B. CPU A sets FLAG_1, which can be seen as CPU A read FLAGS CPU A write FLAGS | FLAG_1 CPU B sets FLAG_2, which can be seen as CPU B read FLAGS CPU A write FLAGS | FLAG_2 However, "|=" and "&=" operators are not actually atomic. So this could be ordered like the following: CPU A read FLAGS -> variable CPU B read FLAGS -> variable CPU A write FLAGS (variable | FLAG_1) CPU B write FLAGS (variable | FLAG_2) Notice how the 2nd write from CPU B could actually undo the write from CPU A because it isn't guaranteed that the |= operation is atomic. In practice the race windows for most flag writes is incredibly narrow so it is not easy to isolate issues. However, the more flags we have, the more likely they will cause problems. Additionally, if such a problem were to arise, it would be incredibly difficult to track down. Second, there is an additional advantage beyond code correctness. We can now automatically size the BITMAP if more flags were added, so that we do not need to remember that flags is u32 and thus if we added too many flags we would over-run the variable. This is not a likely occurrence for fm10k driver, but this patch can serve as an example for other drivers which have many more flags. This particular change does have a bit of trouble converting some of the idioms previously used with the #defines for flags. Specifically, when converting FM10K_FLAG_RSS_FIELD_IPV[46]_UDP flags. This whole operation was actually quite problematic, because we actually stored flags separately. This could more easily show the problem of the above re-ordering issue. This is really difficult to test whether atomics make a difference in practical scenarios, but you can ensure that basic functionality remains the same. This patch has a lot of code coverage, but most of it is relatively simple. While we are modifying these files, update their copyright year. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-01-13 07:59:38 +08:00
if (test_bit(FM10K_FLAG_RSS_FIELD_IPV4_UDP, interface->flags))
mrqc |= FM10K_MRQC_UDP_IPV4;
fm10k: use a BITMAP for flags to avoid race conditions Replace bitwise operators and #defines with a BITMAP and enumeration values. This is similar to how we handle the "state" values as well. This has two distinct advantages over the old method. First, we ensure correctness of operations which are currently problematic due to race conditions. Suppose that two kernel threads are running, such as the watchdog and an ethtool ioctl, and both modify flags. We'll say that the watchdog is CPU A, and the ethtool ioctl is CPU B. CPU A sets FLAG_1, which can be seen as CPU A read FLAGS CPU A write FLAGS | FLAG_1 CPU B sets FLAG_2, which can be seen as CPU B read FLAGS CPU A write FLAGS | FLAG_2 However, "|=" and "&=" operators are not actually atomic. So this could be ordered like the following: CPU A read FLAGS -> variable CPU B read FLAGS -> variable CPU A write FLAGS (variable | FLAG_1) CPU B write FLAGS (variable | FLAG_2) Notice how the 2nd write from CPU B could actually undo the write from CPU A because it isn't guaranteed that the |= operation is atomic. In practice the race windows for most flag writes is incredibly narrow so it is not easy to isolate issues. However, the more flags we have, the more likely they will cause problems. Additionally, if such a problem were to arise, it would be incredibly difficult to track down. Second, there is an additional advantage beyond code correctness. We can now automatically size the BITMAP if more flags were added, so that we do not need to remember that flags is u32 and thus if we added too many flags we would over-run the variable. This is not a likely occurrence for fm10k driver, but this patch can serve as an example for other drivers which have many more flags. This particular change does have a bit of trouble converting some of the idioms previously used with the #defines for flags. Specifically, when converting FM10K_FLAG_RSS_FIELD_IPV[46]_UDP flags. This whole operation was actually quite problematic, because we actually stored flags separately. This could more easily show the problem of the above re-ordering issue. This is really difficult to test whether atomics make a difference in practical scenarios, but you can ensure that basic functionality remains the same. This patch has a lot of code coverage, but most of it is relatively simple. While we are modifying these files, update their copyright year. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-01-13 07:59:38 +08:00
if (test_bit(FM10K_FLAG_RSS_FIELD_IPV6_UDP, interface->flags))
mrqc |= FM10K_MRQC_UDP_IPV6;
fm10k_write_reg(hw, FM10K_MRQC(0), mrqc);
/* configure default DGLORT mapping for RSS/DCB */
dglort.inner_rss = 1;
dglort.rss_l = fls(interface->ring_feature[RING_F_RSS].mask);
dglort.pc_l = fls(interface->ring_feature[RING_F_QOS].mask);
hw->mac.ops.configure_dglort_map(hw, &dglort);
/* assign GLORT per queue for queue mapped testing */
if (interface->glort_count > 64) {
memset(&dglort, 0, sizeof(dglort));
dglort.inner_rss = 1;
dglort.glort = interface->glort + 64;
dglort.idx = fm10k_dglort_pf_queue;
dglort.queue_l = fls(interface->num_rx_queues - 1);
hw->mac.ops.configure_dglort_map(hw, &dglort);
}
/* assign glort value for RSS/DCB specific to this interface */
memset(&dglort, 0, sizeof(dglort));
dglort.inner_rss = 1;
dglort.glort = interface->glort;
dglort.rss_l = fls(interface->ring_feature[RING_F_RSS].mask);
dglort.pc_l = fls(interface->ring_feature[RING_F_QOS].mask);
/* configure DGLORT mapping for RSS/DCB */
dglort.idx = fm10k_dglort_pf_rss;
if (interface->l2_accel)
dglort.shared_l = fls(interface->l2_accel->size);
hw->mac.ops.configure_dglort_map(hw, &dglort);
}
/**
* fm10k_configure_rx - Configure Receive Unit after Reset
* @interface: board private structure
*
* Configure the Rx unit of the MAC after a reset.
**/
static void fm10k_configure_rx(struct fm10k_intfc *interface)
{
int i;
/* Configure SWPRI to PC map */
fm10k_configure_swpri_map(interface);
/* Configure RSS and DGLORT map */
fm10k_configure_dglort(interface);
/* Setup the HW Rx Head and Tail descriptor pointers */
for (i = 0; i < interface->num_rx_queues; i++)
fm10k_configure_rx_ring(interface, interface->rx_ring[i]);
/* possible poll here to verify that Rx rings are now enabled */
}
static void fm10k_napi_enable_all(struct fm10k_intfc *interface)
{
struct fm10k_q_vector *q_vector;
int q_idx;
for (q_idx = 0; q_idx < interface->num_q_vectors; q_idx++) {
q_vector = interface->q_vector[q_idx];
napi_enable(&q_vector->napi);
}
}
static irqreturn_t fm10k_msix_clean_rings(int __always_unused irq, void *data)
{
struct fm10k_q_vector *q_vector = data;
if (q_vector->rx.count || q_vector->tx.count)
napi_schedule_irqoff(&q_vector->napi);
return IRQ_HANDLED;
}
static irqreturn_t fm10k_msix_mbx_vf(int __always_unused irq, void *data)
{
struct fm10k_intfc *interface = data;
struct fm10k_hw *hw = &interface->hw;
struct fm10k_mbx_info *mbx = &hw->mbx;
/* re-enable mailbox interrupt and indicate 20us delay */
fm10k_write_reg(hw, FM10K_VFITR(FM10K_MBX_VECTOR),
(FM10K_MBX_INT_DELAY >> hw->mac.itr_scale) |
FM10K_ITR_ENABLE);
/* service upstream mailbox */
if (fm10k_mbx_trylock(interface)) {
mbx->ops.process(hw, mbx);
fm10k_mbx_unlock(interface);
}
hw->mac.get_host_state = true;
fm10k_service_event_schedule(interface);
return IRQ_HANDLED;
}
#ifdef CONFIG_NET_POLL_CONTROLLER
/**
* fm10k_netpoll - A Polling 'interrupt' handler
* @netdev: network interface device structure
*
* This is used by netconsole to send skbs without having to re-enable
* interrupts. It's not called while the normal interrupt routine is executing.
**/
void fm10k_netpoll(struct net_device *netdev)
{
struct fm10k_intfc *interface = netdev_priv(netdev);
int i;
/* if interface is down do nothing */
if (test_bit(__FM10K_DOWN, interface->state))
return;
for (i = 0; i < interface->num_q_vectors; i++)
fm10k_msix_clean_rings(0, interface->q_vector[i]);
}
#endif
#define FM10K_ERR_MSG(type) case (type): error = #type; break
static void fm10k_handle_fault(struct fm10k_intfc *interface, int type,
struct fm10k_fault *fault)
{
struct pci_dev *pdev = interface->pdev;
struct fm10k_hw *hw = &interface->hw;
struct fm10k_iov_data *iov_data = interface->iov_data;
char *error;
switch (type) {
case FM10K_PCA_FAULT:
switch (fault->type) {
default:
error = "Unknown PCA error";
break;
FM10K_ERR_MSG(PCA_NO_FAULT);
FM10K_ERR_MSG(PCA_UNMAPPED_ADDR);
FM10K_ERR_MSG(PCA_BAD_QACCESS_PF);
FM10K_ERR_MSG(PCA_BAD_QACCESS_VF);
FM10K_ERR_MSG(PCA_MALICIOUS_REQ);
FM10K_ERR_MSG(PCA_POISONED_TLP);
FM10K_ERR_MSG(PCA_TLP_ABORT);
}
break;
case FM10K_THI_FAULT:
switch (fault->type) {
default:
error = "Unknown THI error";
break;
FM10K_ERR_MSG(THI_NO_FAULT);
FM10K_ERR_MSG(THI_MAL_DIS_Q_FAULT);
}
break;
case FM10K_FUM_FAULT:
switch (fault->type) {
default:
error = "Unknown FUM error";
break;
FM10K_ERR_MSG(FUM_NO_FAULT);
FM10K_ERR_MSG(FUM_UNMAPPED_ADDR);
FM10K_ERR_MSG(FUM_BAD_VF_QACCESS);
FM10K_ERR_MSG(FUM_ADD_DECODE_ERR);
FM10K_ERR_MSG(FUM_RO_ERROR);
FM10K_ERR_MSG(FUM_QPRC_CRC_ERROR);
FM10K_ERR_MSG(FUM_CSR_TIMEOUT);
FM10K_ERR_MSG(FUM_INVALID_TYPE);
FM10K_ERR_MSG(FUM_INVALID_LENGTH);
FM10K_ERR_MSG(FUM_INVALID_BE);
FM10K_ERR_MSG(FUM_INVALID_ALIGN);
}
break;
default:
error = "Undocumented fault";
break;
}
dev_warn(&pdev->dev,
"%s Address: 0x%llx SpecInfo: 0x%x Func: %02x.%0x\n",
error, fault->address, fault->specinfo,
PCI_SLOT(fault->func), PCI_FUNC(fault->func));
/* For VF faults, clear out the respective LPORT, reset the queue
* resources, and then reconnect to the mailbox. This allows the
* VF in question to resume behavior. For transient faults that are
* the result of non-malicious behavior this will log the fault and
* allow the VF to resume functionality. Obviously for malicious VFs
* they will be able to attempt malicious behavior again. In this
* case, the system administrator will need to step in and manually
* remove or disable the VF in question.
*/
if (fault->func && iov_data) {
int vf = fault->func - 1;
struct fm10k_vf_info *vf_info = &iov_data->vf_info[vf];
hw->iov.ops.reset_lport(hw, vf_info);
hw->iov.ops.reset_resources(hw, vf_info);
/* reset_lport disables the VF, so re-enable it */
hw->iov.ops.set_lport(hw, vf_info, vf,
FM10K_VF_FLAG_MULTI_CAPABLE);
/* reset_resources will disconnect from the mbx */
vf_info->mbx.ops.connect(hw, &vf_info->mbx);
}
}
static void fm10k_report_fault(struct fm10k_intfc *interface, u32 eicr)
{
struct fm10k_hw *hw = &interface->hw;
struct fm10k_fault fault = { 0 };
int type, err;
for (eicr &= FM10K_EICR_FAULT_MASK, type = FM10K_PCA_FAULT;
eicr;
eicr >>= 1, type += FM10K_FAULT_SIZE) {
/* only check if there is an error reported */
if (!(eicr & 0x1))
continue;
/* retrieve fault info */
err = hw->mac.ops.get_fault(hw, type, &fault);
if (err) {
dev_err(&interface->pdev->dev,
"error reading fault\n");
continue;
}
fm10k_handle_fault(interface, type, &fault);
}
}
static void fm10k_reset_drop_on_empty(struct fm10k_intfc *interface, u32 eicr)
{
struct fm10k_hw *hw = &interface->hw;
const u32 rxdctl = FM10K_RXDCTL_WRITE_BACK_MIN_DELAY;
u32 maxholdq;
int q;
if (!(eicr & FM10K_EICR_MAXHOLDTIME))
return;
maxholdq = fm10k_read_reg(hw, FM10K_MAXHOLDQ(7));
if (maxholdq)
fm10k_write_reg(hw, FM10K_MAXHOLDQ(7), maxholdq);
for (q = 255;;) {
if (maxholdq & BIT(31)) {
if (q < FM10K_MAX_QUEUES_PF) {
interface->rx_overrun_pf++;
fm10k_write_reg(hw, FM10K_RXDCTL(q), rxdctl);
} else {
interface->rx_overrun_vf++;
}
}
maxholdq *= 2;
if (!maxholdq)
q &= ~(32 - 1);
if (!q)
break;
if (q-- % 32)
continue;
maxholdq = fm10k_read_reg(hw, FM10K_MAXHOLDQ(q / 32));
if (maxholdq)
fm10k_write_reg(hw, FM10K_MAXHOLDQ(q / 32), maxholdq);
}
}
static irqreturn_t fm10k_msix_mbx_pf(int __always_unused irq, void *data)
{
struct fm10k_intfc *interface = data;
struct fm10k_hw *hw = &interface->hw;
struct fm10k_mbx_info *mbx = &hw->mbx;
u32 eicr;
s32 err = 0;
/* unmask any set bits related to this interrupt */
eicr = fm10k_read_reg(hw, FM10K_EICR);
fm10k_write_reg(hw, FM10K_EICR, eicr & (FM10K_EICR_MAILBOX |
FM10K_EICR_SWITCHREADY |
FM10K_EICR_SWITCHNOTREADY));
/* report any faults found to the message log */
fm10k_report_fault(interface, eicr);
/* reset any queues disabled due to receiver overrun */
fm10k_reset_drop_on_empty(interface, eicr);
/* service mailboxes */
if (fm10k_mbx_trylock(interface)) {
err = mbx->ops.process(hw, mbx);
2015-04-11 08:20:17 +08:00
/* handle VFLRE events */
fm10k_iov_event(interface);
fm10k_mbx_unlock(interface);
}
if (err == FM10K_ERR_RESET_REQUESTED)
fm10k: use a BITMAP for flags to avoid race conditions Replace bitwise operators and #defines with a BITMAP and enumeration values. This is similar to how we handle the "state" values as well. This has two distinct advantages over the old method. First, we ensure correctness of operations which are currently problematic due to race conditions. Suppose that two kernel threads are running, such as the watchdog and an ethtool ioctl, and both modify flags. We'll say that the watchdog is CPU A, and the ethtool ioctl is CPU B. CPU A sets FLAG_1, which can be seen as CPU A read FLAGS CPU A write FLAGS | FLAG_1 CPU B sets FLAG_2, which can be seen as CPU B read FLAGS CPU A write FLAGS | FLAG_2 However, "|=" and "&=" operators are not actually atomic. So this could be ordered like the following: CPU A read FLAGS -> variable CPU B read FLAGS -> variable CPU A write FLAGS (variable | FLAG_1) CPU B write FLAGS (variable | FLAG_2) Notice how the 2nd write from CPU B could actually undo the write from CPU A because it isn't guaranteed that the |= operation is atomic. In practice the race windows for most flag writes is incredibly narrow so it is not easy to isolate issues. However, the more flags we have, the more likely they will cause problems. Additionally, if such a problem were to arise, it would be incredibly difficult to track down. Second, there is an additional advantage beyond code correctness. We can now automatically size the BITMAP if more flags were added, so that we do not need to remember that flags is u32 and thus if we added too many flags we would over-run the variable. This is not a likely occurrence for fm10k driver, but this patch can serve as an example for other drivers which have many more flags. This particular change does have a bit of trouble converting some of the idioms previously used with the #defines for flags. Specifically, when converting FM10K_FLAG_RSS_FIELD_IPV[46]_UDP flags. This whole operation was actually quite problematic, because we actually stored flags separately. This could more easily show the problem of the above re-ordering issue. This is really difficult to test whether atomics make a difference in practical scenarios, but you can ensure that basic functionality remains the same. This patch has a lot of code coverage, but most of it is relatively simple. While we are modifying these files, update their copyright year. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-01-13 07:59:38 +08:00
set_bit(FM10K_FLAG_RESET_REQUESTED, interface->flags);
/* if switch toggled state we should reset GLORTs */
if (eicr & FM10K_EICR_SWITCHNOTREADY) {
/* force link down for at least 4 seconds */
interface->link_down_event = jiffies + (4 * HZ);
set_bit(__FM10K_LINK_DOWN, interface->state);
/* reset dglort_map back to no config */
hw->mac.dglort_map = FM10K_DGLORTMAP_NONE;
}
/* we should validate host state after interrupt event */
hw->mac.get_host_state = true;
2015-04-11 08:20:17 +08:00
/* validate host state, and handle VF mailboxes in the service task */
fm10k_service_event_schedule(interface);
/* re-enable mailbox interrupt and indicate 20us delay */
fm10k_write_reg(hw, FM10K_ITR(FM10K_MBX_VECTOR),
(FM10K_MBX_INT_DELAY >> hw->mac.itr_scale) |
FM10K_ITR_ENABLE);
return IRQ_HANDLED;
}
void fm10k_mbx_free_irq(struct fm10k_intfc *interface)
{
struct fm10k_hw *hw = &interface->hw;
struct msix_entry *entry;
int itr_reg;
/* no mailbox IRQ to free if MSI-X is not enabled */
if (!interface->msix_entries)
return;
entry = &interface->msix_entries[FM10K_MBX_VECTOR];
/* disconnect the mailbox */
hw->mbx.ops.disconnect(hw, &hw->mbx);
/* disable Mailbox cause */
if (hw->mac.type == fm10k_mac_pf) {
fm10k_write_reg(hw, FM10K_EIMR,
FM10K_EIMR_DISABLE(PCA_FAULT) |
FM10K_EIMR_DISABLE(FUM_FAULT) |
FM10K_EIMR_DISABLE(MAILBOX) |
FM10K_EIMR_DISABLE(SWITCHREADY) |
FM10K_EIMR_DISABLE(SWITCHNOTREADY) |
FM10K_EIMR_DISABLE(SRAMERROR) |
FM10K_EIMR_DISABLE(VFLR) |
FM10K_EIMR_DISABLE(MAXHOLDTIME));
itr_reg = FM10K_ITR(FM10K_MBX_VECTOR);
} else {
itr_reg = FM10K_VFITR(FM10K_MBX_VECTOR);
}
fm10k_write_reg(hw, itr_reg, FM10K_ITR_MASK_SET);
free_irq(entry->vector, interface);
}
static s32 fm10k_mbx_mac_addr(struct fm10k_hw *hw, u32 **results,
struct fm10k_mbx_info *mbx)
{
bool vlan_override = hw->mac.vlan_override;
u16 default_vid = hw->mac.default_vid;
struct fm10k_intfc *interface;
s32 err;
err = fm10k_msg_mac_vlan_vf(hw, results, mbx);
if (err)
return err;
interface = container_of(hw, struct fm10k_intfc, hw);
/* MAC was changed so we need reset */
if (is_valid_ether_addr(hw->mac.perm_addr) &&
!ether_addr_equal(hw->mac.perm_addr, hw->mac.addr))
fm10k: use a BITMAP for flags to avoid race conditions Replace bitwise operators and #defines with a BITMAP and enumeration values. This is similar to how we handle the "state" values as well. This has two distinct advantages over the old method. First, we ensure correctness of operations which are currently problematic due to race conditions. Suppose that two kernel threads are running, such as the watchdog and an ethtool ioctl, and both modify flags. We'll say that the watchdog is CPU A, and the ethtool ioctl is CPU B. CPU A sets FLAG_1, which can be seen as CPU A read FLAGS CPU A write FLAGS | FLAG_1 CPU B sets FLAG_2, which can be seen as CPU B read FLAGS CPU A write FLAGS | FLAG_2 However, "|=" and "&=" operators are not actually atomic. So this could be ordered like the following: CPU A read FLAGS -> variable CPU B read FLAGS -> variable CPU A write FLAGS (variable | FLAG_1) CPU B write FLAGS (variable | FLAG_2) Notice how the 2nd write from CPU B could actually undo the write from CPU A because it isn't guaranteed that the |= operation is atomic. In practice the race windows for most flag writes is incredibly narrow so it is not easy to isolate issues. However, the more flags we have, the more likely they will cause problems. Additionally, if such a problem were to arise, it would be incredibly difficult to track down. Second, there is an additional advantage beyond code correctness. We can now automatically size the BITMAP if more flags were added, so that we do not need to remember that flags is u32 and thus if we added too many flags we would over-run the variable. This is not a likely occurrence for fm10k driver, but this patch can serve as an example for other drivers which have many more flags. This particular change does have a bit of trouble converting some of the idioms previously used with the #defines for flags. Specifically, when converting FM10K_FLAG_RSS_FIELD_IPV[46]_UDP flags. This whole operation was actually quite problematic, because we actually stored flags separately. This could more easily show the problem of the above re-ordering issue. This is really difficult to test whether atomics make a difference in practical scenarios, but you can ensure that basic functionality remains the same. This patch has a lot of code coverage, but most of it is relatively simple. While we are modifying these files, update their copyright year. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-01-13 07:59:38 +08:00
set_bit(FM10K_FLAG_RESET_REQUESTED, interface->flags);
/* VLAN override was changed, or default VLAN changed */
if ((vlan_override != hw->mac.vlan_override) ||
(default_vid != hw->mac.default_vid))
fm10k: use a BITMAP for flags to avoid race conditions Replace bitwise operators and #defines with a BITMAP and enumeration values. This is similar to how we handle the "state" values as well. This has two distinct advantages over the old method. First, we ensure correctness of operations which are currently problematic due to race conditions. Suppose that two kernel threads are running, such as the watchdog and an ethtool ioctl, and both modify flags. We'll say that the watchdog is CPU A, and the ethtool ioctl is CPU B. CPU A sets FLAG_1, which can be seen as CPU A read FLAGS CPU A write FLAGS | FLAG_1 CPU B sets FLAG_2, which can be seen as CPU B read FLAGS CPU A write FLAGS | FLAG_2 However, "|=" and "&=" operators are not actually atomic. So this could be ordered like the following: CPU A read FLAGS -> variable CPU B read FLAGS -> variable CPU A write FLAGS (variable | FLAG_1) CPU B write FLAGS (variable | FLAG_2) Notice how the 2nd write from CPU B could actually undo the write from CPU A because it isn't guaranteed that the |= operation is atomic. In practice the race windows for most flag writes is incredibly narrow so it is not easy to isolate issues. However, the more flags we have, the more likely they will cause problems. Additionally, if such a problem were to arise, it would be incredibly difficult to track down. Second, there is an additional advantage beyond code correctness. We can now automatically size the BITMAP if more flags were added, so that we do not need to remember that flags is u32 and thus if we added too many flags we would over-run the variable. This is not a likely occurrence for fm10k driver, but this patch can serve as an example for other drivers which have many more flags. This particular change does have a bit of trouble converting some of the idioms previously used with the #defines for flags. Specifically, when converting FM10K_FLAG_RSS_FIELD_IPV[46]_UDP flags. This whole operation was actually quite problematic, because we actually stored flags separately. This could more easily show the problem of the above re-ordering issue. This is really difficult to test whether atomics make a difference in practical scenarios, but you can ensure that basic functionality remains the same. This patch has a lot of code coverage, but most of it is relatively simple. While we are modifying these files, update their copyright year. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-01-13 07:59:38 +08:00
set_bit(FM10K_FLAG_RESET_REQUESTED, interface->flags);
return 0;
}
/* generic error handler for mailbox issues */
static s32 fm10k_mbx_error(struct fm10k_hw *hw, u32 **results,
struct fm10k_mbx_info __always_unused *mbx)
{
struct fm10k_intfc *interface;
struct pci_dev *pdev;
interface = container_of(hw, struct fm10k_intfc, hw);
pdev = interface->pdev;
dev_err(&pdev->dev, "Unknown message ID %u\n",
**results & FM10K_TLV_ID_MASK);
return 0;
}
static const struct fm10k_msg_data vf_mbx_data[] = {
FM10K_TLV_MSG_TEST_HANDLER(fm10k_tlv_msg_test),
FM10K_VF_MSG_MAC_VLAN_HANDLER(fm10k_mbx_mac_addr),
FM10K_VF_MSG_LPORT_STATE_HANDLER(fm10k_msg_lport_state_vf),
FM10K_TLV_MSG_ERROR_HANDLER(fm10k_mbx_error),
};
static int fm10k_mbx_request_irq_vf(struct fm10k_intfc *interface)
{
struct msix_entry *entry = &interface->msix_entries[FM10K_MBX_VECTOR];
struct net_device *dev = interface->netdev;
struct fm10k_hw *hw = &interface->hw;
int err;
/* Use timer0 for interrupt moderation on the mailbox */
u32 itr = entry->entry | FM10K_INT_MAP_TIMER0;
/* register mailbox handlers */
err = hw->mbx.ops.register_handlers(&hw->mbx, vf_mbx_data);
if (err)
return err;
/* request the IRQ */
err = request_irq(entry->vector, fm10k_msix_mbx_vf, 0,
dev->name, interface);
if (err) {
netif_err(interface, probe, dev,
"request_irq for msix_mbx failed: %d\n", err);
return err;
}
/* map all of the interrupt sources */
fm10k_write_reg(hw, FM10K_VFINT_MAP, itr);
/* enable interrupt */
fm10k_write_reg(hw, FM10K_VFITR(entry->entry), FM10K_ITR_ENABLE);
return 0;
}
static s32 fm10k_lport_map(struct fm10k_hw *hw, u32 **results,
struct fm10k_mbx_info *mbx)
{
struct fm10k_intfc *interface;
u32 dglort_map = hw->mac.dglort_map;
s32 err;
interface = container_of(hw, struct fm10k_intfc, hw);
err = fm10k_msg_err_pf(hw, results, mbx);
if (!err && hw->swapi.status) {
/* force link down for a reasonable delay */
interface->link_down_event = jiffies + (2 * HZ);
set_bit(__FM10K_LINK_DOWN, interface->state);
/* reset dglort_map back to no config */
hw->mac.dglort_map = FM10K_DGLORTMAP_NONE;
fm10k_service_event_schedule(interface);
/* prevent overloading kernel message buffer */
if (interface->lport_map_failed)
return 0;
interface->lport_map_failed = true;
if (hw->swapi.status == FM10K_MSG_ERR_PEP_NOT_SCHEDULED)
dev_warn(&interface->pdev->dev,
"cannot obtain link because the host interface is configured for a PCIe host interface bandwidth of zero\n");
dev_warn(&interface->pdev->dev,
"request logical port map failed: %d\n",
hw->swapi.status);
return 0;
}
err = fm10k_msg_lport_map_pf(hw, results, mbx);
if (err)
return err;
interface->lport_map_failed = false;
/* we need to reset if port count was just updated */
if (dglort_map != hw->mac.dglort_map)
fm10k: use a BITMAP for flags to avoid race conditions Replace bitwise operators and #defines with a BITMAP and enumeration values. This is similar to how we handle the "state" values as well. This has two distinct advantages over the old method. First, we ensure correctness of operations which are currently problematic due to race conditions. Suppose that two kernel threads are running, such as the watchdog and an ethtool ioctl, and both modify flags. We'll say that the watchdog is CPU A, and the ethtool ioctl is CPU B. CPU A sets FLAG_1, which can be seen as CPU A read FLAGS CPU A write FLAGS | FLAG_1 CPU B sets FLAG_2, which can be seen as CPU B read FLAGS CPU A write FLAGS | FLAG_2 However, "|=" and "&=" operators are not actually atomic. So this could be ordered like the following: CPU A read FLAGS -> variable CPU B read FLAGS -> variable CPU A write FLAGS (variable | FLAG_1) CPU B write FLAGS (variable | FLAG_2) Notice how the 2nd write from CPU B could actually undo the write from CPU A because it isn't guaranteed that the |= operation is atomic. In practice the race windows for most flag writes is incredibly narrow so it is not easy to isolate issues. However, the more flags we have, the more likely they will cause problems. Additionally, if such a problem were to arise, it would be incredibly difficult to track down. Second, there is an additional advantage beyond code correctness. We can now automatically size the BITMAP if more flags were added, so that we do not need to remember that flags is u32 and thus if we added too many flags we would over-run the variable. This is not a likely occurrence for fm10k driver, but this patch can serve as an example for other drivers which have many more flags. This particular change does have a bit of trouble converting some of the idioms previously used with the #defines for flags. Specifically, when converting FM10K_FLAG_RSS_FIELD_IPV[46]_UDP flags. This whole operation was actually quite problematic, because we actually stored flags separately. This could more easily show the problem of the above re-ordering issue. This is really difficult to test whether atomics make a difference in practical scenarios, but you can ensure that basic functionality remains the same. This patch has a lot of code coverage, but most of it is relatively simple. While we are modifying these files, update their copyright year. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-01-13 07:59:38 +08:00
set_bit(FM10K_FLAG_RESET_REQUESTED, interface->flags);
return 0;
}
static s32 fm10k_update_pvid(struct fm10k_hw *hw, u32 **results,
struct fm10k_mbx_info __always_unused *mbx)
{
struct fm10k_intfc *interface;
u16 glort, pvid;
u32 pvid_update;
s32 err;
err = fm10k_tlv_attr_get_u32(results[FM10K_PF_ATTR_ID_UPDATE_PVID],
&pvid_update);
if (err)
return err;
/* extract values from the pvid update */
glort = FM10K_MSG_HDR_FIELD_GET(pvid_update, UPDATE_PVID_GLORT);
pvid = FM10K_MSG_HDR_FIELD_GET(pvid_update, UPDATE_PVID_PVID);
/* if glort is not valid return error */
if (!fm10k_glort_valid_pf(hw, glort))
return FM10K_ERR_PARAM;
/* verify VLAN ID is valid */
if (pvid >= FM10K_VLAN_TABLE_VID_MAX)
return FM10K_ERR_PARAM;
interface = container_of(hw, struct fm10k_intfc, hw);
/* check to see if this belongs to one of the VFs */
err = fm10k_iov_update_pvid(interface, glort, pvid);
if (!err)
return 0;
/* we need to reset if default VLAN was just updated */
if (pvid != hw->mac.default_vid)
fm10k: use a BITMAP for flags to avoid race conditions Replace bitwise operators and #defines with a BITMAP and enumeration values. This is similar to how we handle the "state" values as well. This has two distinct advantages over the old method. First, we ensure correctness of operations which are currently problematic due to race conditions. Suppose that two kernel threads are running, such as the watchdog and an ethtool ioctl, and both modify flags. We'll say that the watchdog is CPU A, and the ethtool ioctl is CPU B. CPU A sets FLAG_1, which can be seen as CPU A read FLAGS CPU A write FLAGS | FLAG_1 CPU B sets FLAG_2, which can be seen as CPU B read FLAGS CPU A write FLAGS | FLAG_2 However, "|=" and "&=" operators are not actually atomic. So this could be ordered like the following: CPU A read FLAGS -> variable CPU B read FLAGS -> variable CPU A write FLAGS (variable | FLAG_1) CPU B write FLAGS (variable | FLAG_2) Notice how the 2nd write from CPU B could actually undo the write from CPU A because it isn't guaranteed that the |= operation is atomic. In practice the race windows for most flag writes is incredibly narrow so it is not easy to isolate issues. However, the more flags we have, the more likely they will cause problems. Additionally, if such a problem were to arise, it would be incredibly difficult to track down. Second, there is an additional advantage beyond code correctness. We can now automatically size the BITMAP if more flags were added, so that we do not need to remember that flags is u32 and thus if we added too many flags we would over-run the variable. This is not a likely occurrence for fm10k driver, but this patch can serve as an example for other drivers which have many more flags. This particular change does have a bit of trouble converting some of the idioms previously used with the #defines for flags. Specifically, when converting FM10K_FLAG_RSS_FIELD_IPV[46]_UDP flags. This whole operation was actually quite problematic, because we actually stored flags separately. This could more easily show the problem of the above re-ordering issue. This is really difficult to test whether atomics make a difference in practical scenarios, but you can ensure that basic functionality remains the same. This patch has a lot of code coverage, but most of it is relatively simple. While we are modifying these files, update their copyright year. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-01-13 07:59:38 +08:00
set_bit(FM10K_FLAG_RESET_REQUESTED, interface->flags);
hw->mac.default_vid = pvid;
return 0;
}
static const struct fm10k_msg_data pf_mbx_data[] = {
FM10K_PF_MSG_ERR_HANDLER(XCAST_MODES, fm10k_msg_err_pf),
FM10K_PF_MSG_ERR_HANDLER(UPDATE_MAC_FWD_RULE, fm10k_msg_err_pf),
FM10K_PF_MSG_LPORT_MAP_HANDLER(fm10k_lport_map),
FM10K_PF_MSG_ERR_HANDLER(LPORT_CREATE, fm10k_msg_err_pf),
FM10K_PF_MSG_ERR_HANDLER(LPORT_DELETE, fm10k_msg_err_pf),
FM10K_PF_MSG_UPDATE_PVID_HANDLER(fm10k_update_pvid),
FM10K_TLV_MSG_ERROR_HANDLER(fm10k_mbx_error),
};
static int fm10k_mbx_request_irq_pf(struct fm10k_intfc *interface)
{
struct msix_entry *entry = &interface->msix_entries[FM10K_MBX_VECTOR];
struct net_device *dev = interface->netdev;
struct fm10k_hw *hw = &interface->hw;
int err;
/* Use timer0 for interrupt moderation on the mailbox */
u32 mbx_itr = entry->entry | FM10K_INT_MAP_TIMER0;
u32 other_itr = entry->entry | FM10K_INT_MAP_IMMEDIATE;
/* register mailbox handlers */
err = hw->mbx.ops.register_handlers(&hw->mbx, pf_mbx_data);
if (err)
return err;
/* request the IRQ */
err = request_irq(entry->vector, fm10k_msix_mbx_pf, 0,
dev->name, interface);
if (err) {
netif_err(interface, probe, dev,
"request_irq for msix_mbx failed: %d\n", err);
return err;
}
/* Enable interrupts w/ no moderation for "other" interrupts */
fm10k_write_reg(hw, FM10K_INT_MAP(fm10k_int_pcie_fault), other_itr);
fm10k_write_reg(hw, FM10K_INT_MAP(fm10k_int_switch_up_down), other_itr);
fm10k_write_reg(hw, FM10K_INT_MAP(fm10k_int_sram), other_itr);
fm10k_write_reg(hw, FM10K_INT_MAP(fm10k_int_max_hold_time), other_itr);
fm10k_write_reg(hw, FM10K_INT_MAP(fm10k_int_vflr), other_itr);
/* Enable interrupts w/ moderation for mailbox */
fm10k_write_reg(hw, FM10K_INT_MAP(fm10k_int_mailbox), mbx_itr);
/* Enable individual interrupt causes */
fm10k_write_reg(hw, FM10K_EIMR, FM10K_EIMR_ENABLE(PCA_FAULT) |
FM10K_EIMR_ENABLE(FUM_FAULT) |
FM10K_EIMR_ENABLE(MAILBOX) |
FM10K_EIMR_ENABLE(SWITCHREADY) |
FM10K_EIMR_ENABLE(SWITCHNOTREADY) |
FM10K_EIMR_ENABLE(SRAMERROR) |
FM10K_EIMR_ENABLE(VFLR) |
FM10K_EIMR_ENABLE(MAXHOLDTIME));
/* enable interrupt */
fm10k_write_reg(hw, FM10K_ITR(entry->entry), FM10K_ITR_ENABLE);
return 0;
}
int fm10k_mbx_request_irq(struct fm10k_intfc *interface)
{
struct fm10k_hw *hw = &interface->hw;
int err;
/* enable Mailbox cause */
if (hw->mac.type == fm10k_mac_pf)
err = fm10k_mbx_request_irq_pf(interface);
else
err = fm10k_mbx_request_irq_vf(interface);
if (err)
return err;
/* connect mailbox */
err = hw->mbx.ops.connect(hw, &hw->mbx);
/* if the mailbox failed to connect, then free IRQ */
if (err)
fm10k_mbx_free_irq(interface);
return err;
}
/**
* fm10k_qv_free_irq - release interrupts associated with queue vectors
* @interface: board private structure
*
* Release all interrupts associated with this interface
**/
void fm10k_qv_free_irq(struct fm10k_intfc *interface)
{
int vector = interface->num_q_vectors;
struct fm10k_hw *hw = &interface->hw;
struct msix_entry *entry;
entry = &interface->msix_entries[NON_Q_VECTORS(hw) + vector];
while (vector) {
struct fm10k_q_vector *q_vector;
vector--;
entry--;
q_vector = interface->q_vector[vector];
if (!q_vector->tx.count && !q_vector->rx.count)
continue;
/* clear the affinity_mask in the IRQ descriptor */
irq_set_affinity_hint(entry->vector, NULL);
/* disable interrupts */
writel(FM10K_ITR_MASK_SET, q_vector->itr);
free_irq(entry->vector, q_vector);
}
}
/**
* fm10k_qv_request_irq - initialize interrupts for queue vectors
* @interface: board private structure
*
* Attempts to configure interrupts using the best available
* capabilities of the hardware and kernel.
**/
int fm10k_qv_request_irq(struct fm10k_intfc *interface)
{
struct net_device *dev = interface->netdev;
struct fm10k_hw *hw = &interface->hw;
struct msix_entry *entry;
unsigned int ri = 0, ti = 0;
int vector, err;
entry = &interface->msix_entries[NON_Q_VECTORS(hw)];
for (vector = 0; vector < interface->num_q_vectors; vector++) {
struct fm10k_q_vector *q_vector = interface->q_vector[vector];
/* name the vector */
if (q_vector->tx.count && q_vector->rx.count) {
snprintf(q_vector->name, sizeof(q_vector->name),
"%s-TxRx-%u", dev->name, ri++);
ti++;
} else if (q_vector->rx.count) {
snprintf(q_vector->name, sizeof(q_vector->name),
"%s-rx-%u", dev->name, ri++);
} else if (q_vector->tx.count) {
snprintf(q_vector->name, sizeof(q_vector->name),
"%s-tx-%u", dev->name, ti++);
} else {
/* skip this unused q_vector */
continue;
}
/* Assign ITR register to q_vector */
q_vector->itr = (hw->mac.type == fm10k_mac_pf) ?
&interface->uc_addr[FM10K_ITR(entry->entry)] :
&interface->uc_addr[FM10K_VFITR(entry->entry)];
/* request the IRQ */
err = request_irq(entry->vector, &fm10k_msix_clean_rings, 0,
q_vector->name, q_vector);
if (err) {
netif_err(interface, probe, dev,
"request_irq failed for MSIX interrupt Error: %d\n",
err);
goto err_out;
}
/* assign the mask for this irq */
irq_set_affinity_hint(entry->vector, &q_vector->affinity_mask);
/* Enable q_vector */
writel(FM10K_ITR_ENABLE, q_vector->itr);
entry++;
}
return 0;
err_out:
/* wind through the ring freeing all entries and vectors */
while (vector) {
struct fm10k_q_vector *q_vector;
entry--;
vector--;
q_vector = interface->q_vector[vector];
if (!q_vector->tx.count && !q_vector->rx.count)
continue;
/* clear the affinity_mask in the IRQ descriptor */
irq_set_affinity_hint(entry->vector, NULL);
/* disable interrupts */
writel(FM10K_ITR_MASK_SET, q_vector->itr);
free_irq(entry->vector, q_vector);
}
return err;
}
void fm10k_up(struct fm10k_intfc *interface)
{
struct fm10k_hw *hw = &interface->hw;
/* Enable Tx/Rx DMA */
hw->mac.ops.start_hw(hw);
/* configure Tx descriptor rings */
fm10k_configure_tx(interface);
/* configure Rx descriptor rings */
fm10k_configure_rx(interface);
/* configure interrupts */
hw->mac.ops.update_int_moderator(hw);
/* enable statistics capture again */
clear_bit(__FM10K_UPDATING_STATS, interface->state);
/* clear down bit to indicate we are ready to go */
clear_bit(__FM10K_DOWN, interface->state);
/* enable polling cleanups */
fm10k_napi_enable_all(interface);
/* re-establish Rx filters */
fm10k_restore_rx_state(interface);
/* enable transmits */
netif_tx_start_all_queues(interface->netdev);
/* kick off the service timer now */
hw->mac.get_host_state = true;
mod_timer(&interface->service_timer, jiffies);
}
static void fm10k_napi_disable_all(struct fm10k_intfc *interface)
{
struct fm10k_q_vector *q_vector;
int q_idx;
for (q_idx = 0; q_idx < interface->num_q_vectors; q_idx++) {
q_vector = interface->q_vector[q_idx];
napi_disable(&q_vector->napi);
}
}
void fm10k_down(struct fm10k_intfc *interface)
{
struct net_device *netdev = interface->netdev;
struct fm10k_hw *hw = &interface->hw;
int err, i = 0, count = 0;
/* signal that we are down to the interrupt handler and service task */
if (test_and_set_bit(__FM10K_DOWN, interface->state))
return;
/* call carrier off first to avoid false dev_watchdog timeouts */
netif_carrier_off(netdev);
/* disable transmits */
netif_tx_stop_all_queues(netdev);
netif_tx_disable(netdev);
/* reset Rx filters */
fm10k_reset_rx_state(interface);
/* disable polling routines */
fm10k_napi_disable_all(interface);
/* capture stats one last time before stopping interface */
fm10k_update_stats(interface);
/* prevent updating statistics while we're down */
while (test_and_set_bit(__FM10K_UPDATING_STATS, interface->state))
usleep_range(1000, 2000);
/* skip waiting for TX DMA if we lost PCIe link */
if (FM10K_REMOVED(hw->hw_addr))
goto skip_tx_dma_drain;
/* In some rare circumstances it can take a while for Tx queues to
* quiesce and be fully disabled. Attempt to .stop_hw() first, and
* then if we get ERR_REQUESTS_PENDING, go ahead and wait in a loop
* until the Tx queues have emptied, or until a number of retries. If
* we fail to clear within the retry loop, we will issue a warning
* indicating that Tx DMA is probably hung. Note this means we call
* .stop_hw() twice but this shouldn't cause any problems.
*/
err = hw->mac.ops.stop_hw(hw);
if (err != FM10K_ERR_REQUESTS_PENDING)
goto skip_tx_dma_drain;
#define TX_DMA_DRAIN_RETRIES 25
for (count = 0; count < TX_DMA_DRAIN_RETRIES; count++) {
usleep_range(10000, 20000);
/* start checking at the last ring to have pending Tx */
for (; i < interface->num_tx_queues; i++)
if (fm10k_get_tx_pending(interface->tx_ring[i], false))
break;
/* if all the queues are drained, we can break now */
if (i == interface->num_tx_queues)
break;
}
if (count >= TX_DMA_DRAIN_RETRIES)
dev_err(&interface->pdev->dev,
"Tx queues failed to drain after %d tries. Tx DMA is probably hung.\n",
count);
skip_tx_dma_drain:
/* Disable DMA engine for Tx/Rx */
err = hw->mac.ops.stop_hw(hw);
if (err == FM10K_ERR_REQUESTS_PENDING)
dev_err(&interface->pdev->dev,
"due to pending requests hw was not shut down gracefully\n");
else if (err)
dev_err(&interface->pdev->dev, "stop_hw failed: %d\n", err);
/* free any buffers still on the rings */
fm10k_clean_all_tx_rings(interface);
fm10k_clean_all_rx_rings(interface);
}
/**
* fm10k_sw_init - Initialize general software structures
* @interface: host interface private structure to initialize
*
* fm10k_sw_init initializes the interface private data structure.
* Fields are initialized based on PCI device information and
* OS network device settings (MTU size).
**/
static int fm10k_sw_init(struct fm10k_intfc *interface,
const struct pci_device_id *ent)
{
const struct fm10k_info *fi = fm10k_info_tbl[ent->driver_data];
struct fm10k_hw *hw = &interface->hw;
struct pci_dev *pdev = interface->pdev;
struct net_device *netdev = interface->netdev;
u32 rss_key[FM10K_RSSRK_SIZE];
unsigned int rss;
int err;
/* initialize back pointer */
hw->back = interface;
hw->hw_addr = interface->uc_addr;
/* PCI config space info */
hw->vendor_id = pdev->vendor;
hw->device_id = pdev->device;
hw->revision_id = pdev->revision;
hw->subsystem_vendor_id = pdev->subsystem_vendor;
hw->subsystem_device_id = pdev->subsystem_device;
/* Setup hw api */
memcpy(&hw->mac.ops, fi->mac_ops, sizeof(hw->mac.ops));
hw->mac.type = fi->mac;
/* Setup IOV handlers */
if (fi->iov_ops)
memcpy(&hw->iov.ops, fi->iov_ops, sizeof(hw->iov.ops));
/* Set common capability flags and settings */
rss = min_t(int, FM10K_MAX_RSS_INDICES, num_online_cpus());
interface->ring_feature[RING_F_RSS].limit = rss;
fi->get_invariants(hw);
/* pick up the PCIe bus settings for reporting later */
if (hw->mac.ops.get_bus_info)
hw->mac.ops.get_bus_info(hw);
/* limit the usable DMA range */
if (hw->mac.ops.set_dma_mask)
hw->mac.ops.set_dma_mask(hw, dma_get_mask(&pdev->dev));
/* update netdev with DMA restrictions */
if (dma_get_mask(&pdev->dev) > DMA_BIT_MASK(32)) {
netdev->features |= NETIF_F_HIGHDMA;
netdev->vlan_features |= NETIF_F_HIGHDMA;
}
/* reset and initialize the hardware so it is in a known state */
err = hw->mac.ops.reset_hw(hw);
if (err) {
dev_err(&pdev->dev, "reset_hw failed: %d\n", err);
return err;
}
err = hw->mac.ops.init_hw(hw);
if (err) {
dev_err(&pdev->dev, "init_hw failed: %d\n", err);
return err;
}
/* initialize hardware statistics */
hw->mac.ops.update_hw_stats(hw, &interface->stats);
/* Set upper limit on IOV VFs that can be allocated */
pci_sriov_set_totalvfs(pdev, hw->iov.total_vfs);
/* Start with random Ethernet address */
eth_random_addr(hw->mac.addr);
/* Initialize MAC address from hardware */
err = hw->mac.ops.read_mac_addr(hw);
if (err) {
dev_warn(&pdev->dev,
"Failed to obtain MAC address defaulting to random\n");
/* tag address assignment as random */
netdev->addr_assign_type |= NET_ADDR_RANDOM;
}
ether_addr_copy(netdev->dev_addr, hw->mac.addr);
ether_addr_copy(netdev->perm_addr, hw->mac.addr);
if (!is_valid_ether_addr(netdev->perm_addr)) {
dev_err(&pdev->dev, "Invalid MAC Address\n");
return -EIO;
}
/* initialize DCBNL interface */
fm10k_dcbnl_set_ops(netdev);
/* set default ring sizes */
interface->tx_ring_count = FM10K_DEFAULT_TXD;
interface->rx_ring_count = FM10K_DEFAULT_RXD;
/* set default interrupt moderation */
interface->tx_itr = FM10K_TX_ITR_DEFAULT;
interface->rx_itr = FM10K_ITR_ADAPTIVE | FM10K_RX_ITR_DEFAULT;
/* initialize udp port lists */
INIT_LIST_HEAD(&interface->vxlan_port);
INIT_LIST_HEAD(&interface->geneve_port);
netdev_rss_key_fill(rss_key, sizeof(rss_key));
memcpy(interface->rssrk, rss_key, sizeof(rss_key));
/* Start off interface as being down */
set_bit(__FM10K_DOWN, interface->state);
set_bit(__FM10K_UPDATING_STATS, interface->state);
return 0;
}
static void fm10k_slot_warn(struct fm10k_intfc *interface)
{
enum pcie_link_width width = PCIE_LNK_WIDTH_UNKNOWN;
enum pci_bus_speed speed = PCI_SPEED_UNKNOWN;
struct fm10k_hw *hw = &interface->hw;
int max_gts = 0, expected_gts = 0;
if (pcie_get_minimum_link(interface->pdev, &speed, &width) ||
speed == PCI_SPEED_UNKNOWN || width == PCIE_LNK_WIDTH_UNKNOWN) {
dev_warn(&interface->pdev->dev,
"Unable to determine PCI Express bandwidth.\n");
return;
}
switch (speed) {
case PCIE_SPEED_2_5GT:
/* 8b/10b encoding reduces max throughput by 20% */
max_gts = 2 * width;
break;
case PCIE_SPEED_5_0GT:
/* 8b/10b encoding reduces max throughput by 20% */
max_gts = 4 * width;
break;
case PCIE_SPEED_8_0GT:
/* 128b/130b encoding has less than 2% impact on throughput */
max_gts = 8 * width;
break;
default:
dev_warn(&interface->pdev->dev,
"Unable to determine PCI Express bandwidth.\n");
return;
}
dev_info(&interface->pdev->dev,
"PCI Express bandwidth of %dGT/s available\n",
max_gts);
dev_info(&interface->pdev->dev,
"(Speed:%s, Width: x%d, Encoding Loss:%s, Payload:%s)\n",
(speed == PCIE_SPEED_8_0GT ? "8.0GT/s" :
speed == PCIE_SPEED_5_0GT ? "5.0GT/s" :
speed == PCIE_SPEED_2_5GT ? "2.5GT/s" :
"Unknown"),
hw->bus.width,
(speed == PCIE_SPEED_2_5GT ? "20%" :
speed == PCIE_SPEED_5_0GT ? "20%" :
speed == PCIE_SPEED_8_0GT ? "<2%" :
"Unknown"),
(hw->bus.payload == fm10k_bus_payload_128 ? "128B" :
hw->bus.payload == fm10k_bus_payload_256 ? "256B" :
hw->bus.payload == fm10k_bus_payload_512 ? "512B" :
"Unknown"));
switch (hw->bus_caps.speed) {
case fm10k_bus_speed_2500:
/* 8b/10b encoding reduces max throughput by 20% */
expected_gts = 2 * hw->bus_caps.width;
break;
case fm10k_bus_speed_5000:
/* 8b/10b encoding reduces max throughput by 20% */
expected_gts = 4 * hw->bus_caps.width;
break;
case fm10k_bus_speed_8000:
/* 128b/130b encoding has less than 2% impact on throughput */
expected_gts = 8 * hw->bus_caps.width;
break;
default:
dev_warn(&interface->pdev->dev,
"Unable to determine expected PCI Express bandwidth.\n");
return;
}
if (max_gts >= expected_gts)
return;
dev_warn(&interface->pdev->dev,
"This device requires %dGT/s of bandwidth for optimal performance.\n",
expected_gts);
dev_warn(&interface->pdev->dev,
"A %sslot with x%d lanes is suggested.\n",
(hw->bus_caps.speed == fm10k_bus_speed_2500 ? "2.5GT/s " :
hw->bus_caps.speed == fm10k_bus_speed_5000 ? "5.0GT/s " :
hw->bus_caps.speed == fm10k_bus_speed_8000 ? "8.0GT/s " : ""),
hw->bus_caps.width);
}
/**
* fm10k_probe - Device Initialization Routine
* @pdev: PCI device information struct
* @ent: entry in fm10k_pci_tbl
*
* Returns 0 on success, negative on failure
*
* fm10k_probe initializes an interface identified by a pci_dev structure.
* The OS initialization, configuring of the interface private structure,
* and a hardware reset occur.
**/
static int fm10k_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
struct net_device *netdev;
struct fm10k_intfc *interface;
int err;
if (pdev->error_state != pci_channel_io_normal) {
dev_err(&pdev->dev,
"PCI device still in an error state. Unable to load...\n");
return -EIO;
}
err = pci_enable_device_mem(pdev);
if (err) {
dev_err(&pdev->dev,
"PCI enable device failed: %d\n", err);
return err;
}
err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(48));
if (err)
err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
if (err) {
dev_err(&pdev->dev,
"DMA configuration failed: %d\n", err);
goto err_dma;
}
err = pci_request_mem_regions(pdev, fm10k_driver_name);
if (err) {
dev_err(&pdev->dev,
"pci_request_selected_regions failed: %d\n", err);
goto err_pci_reg;
}
pci_enable_pcie_error_reporting(pdev);
pci_set_master(pdev);
pci_save_state(pdev);
netdev = fm10k_alloc_netdev(fm10k_info_tbl[ent->driver_data]);
if (!netdev) {
err = -ENOMEM;
goto err_alloc_netdev;
}
SET_NETDEV_DEV(netdev, &pdev->dev);
interface = netdev_priv(netdev);
pci_set_drvdata(pdev, interface);
interface->netdev = netdev;
interface->pdev = pdev;
interface->uc_addr = ioremap(pci_resource_start(pdev, 0),
FM10K_UC_ADDR_SIZE);
if (!interface->uc_addr) {
err = -EIO;
goto err_ioremap;
}
err = fm10k_sw_init(interface, ent);
if (err)
goto err_sw_init;
fm10k: Add support for debugfs This patch adds limited debugfs support for the driver. Most of the functionality needed for dumping registers is already provided via ethtool. The only thing we saw that we really neeed was the ability to dump the descriptor rings so as such this patch will add a fm10k directory containing a listing of directories each one with a unique PCI Bus, Device, and Function number. Each of those BDF directories will have a list of q_vectors, and the q_vectors will contain a file for each of the Rx/Tx rings that are a part of the vector. For example: # ls -RD /sys/kernel/debug/fm10k/ /sys/kernel/debug/fm10k/: 0000:01:00.0 /sys/kernel/debug/fm10k/0000:01:00.0: q_vector.000 q_vector.001 q_vector.002 q_vector.003 /sys/kernel/debug/fm10k/0000:01:00.0/q_vector.000: rx_ring.000 tx_ring.000 /sys/kernel/debug/fm10k/0000:01:00.0/q_vector.001: rx_ring.001 tx_ring.001 /sys/kernel/debug/fm10k/0000:01:00.0/q_vector.002: rx_ring.002 tx_ring.002 /sys/kernel/debug/fm10k/0000:01:00.0/q_vector.003: rx_ring.003 tx_ring.003 # cat /sys/kernel/debug/fm10k/0000:01:00.0/q_vector.000/rx_ring.000 DES DATA RSS STATERR LENGTH VLAN DGLORT SGLORT TIMESTAMP --------------------------------------------------------------------------- 000 0x00000000 0x00000000 0x00000003 0x002a 0x0000 0x0000 0x0000 0x13951807dc4fedf0 001 0x00000000 0x00000000 0x00000003 0x002a 0x0000 0x0000 0x0000 0x1395180906c9f2c8 002 0x3731c000 0x00000000 0x00000000 0x0000 0x0000 0x0000 0x0000 0x0000000000000000 003 0x3731d000 0x00000000 0x00000000 0x0000 0x0000 0x0000 0x0000 0x0000000000000000 004 0xaab3a000 0x00000000 0x00000000 0x0000 0x0000 0x0000 0x0000 0x0000000000000000 ... # cat /sys/kernel/debug/fm10k/0000:01:00.0/q_vector.000/tx_ring.000 DES BUFFER_ADDRESS LENGTH VLAN MSS HDRLEN FLAGS --------------------------------------------------------- 000 0x00000000aa8a1002 0x005a 0x0000 0x0000 0x0000 0xc0 001 0x00000000aa8a2002 0x005a 0x0000 0x0000 0x0000 0xc0 002 0x000000006bc13202 0x004e 0x0000 0x0000 0x0000 0xc0 003 0x000000006bc13c02 0x002a 0x0000 0x0000 0x0000 0xe1 004 0x000000006bc13602 0x0062 0x0000 0x0000 0x0000 0xc0 Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2014-09-21 07:53:23 +08:00
/* enable debugfs support */
fm10k_dbg_intfc_init(interface);
err = fm10k_init_queueing_scheme(interface);
if (err)
goto err_sw_init;
fm10k: don't initialize service task until later in probe Delay initialization of the service timer and service task until late probe. If we don't wait, failures in probe do not properly cleanup the service timer or service task items, which results in the kernel panic below, potentially freezing the whole system. In addition, ensure that the SERVICE_DISABLE bit is set before we request the MBX IRQ since the MBX interrupt attempts to schedule the service task otherwise. This prevents a similar trace from occurring after this change. We didn't notice this issue before because probe almost always completes successfully. I discovered it due to a mis-ordered mailbox handler array, which resulted in the following failure when requesting mailbox interrupt. [ 555.325619] ------------[ cut here ]------------ [ 555.325628] WARNING: CPU: 0 PID: 4941 at lib/list_debug.c:33 __list_add+0xa0/0xd0() [ 555.325631] list_add corruption. prev->next should be next (ffffffff81f46648), but was (null). (prev=ffff8807fad5d0e8). <snip> [ 555.325722] CPU: 0 PID: 4941 Comm: insmod Tainted: G OE 4.0.4-303.fc22.x86_64 #1 [ 555.325725] Hardware name: Intel Corporation S2600CO/S2600CO, BIOS SE5C600.86B.02.03.8x23.060520140825 06/05/2014 [ 555.325727] 0000000000000000 00000000b4f161b3 ffff88081a21f8e8 ffffffff81783124 [ 555.325734] 0000000000000000 ffff88081a21f940 ffff88081a21f928 ffffffff8109c66a [ 555.325740] 0000000064000000 ffff8807fad5d0e8 ffff8807fad5d0e8 ffffffff81f46648 [ 555.325746] Call Trace: [ 555.325752] [<ffffffff81783124>] dump_stack+0x45/0x57 [ 555.325757] [<ffffffff8109c66a>] warn_slowpath_common+0x8a/0xc0 [ 555.325759] [<ffffffff8109c6f5>] warn_slowpath_fmt+0x55/0x70 [ 555.325763] [<ffffffff813ba270>] __list_add+0xa0/0xd0 [ 555.325768] [<ffffffff81102d1d>] __internal_add_timer+0x9d/0x110 [ 555.325771] [<ffffffff81102dbf>] internal_add_timer+0x2f/0xc0 [ 555.325774] [<ffffffff81104e5a>] mod_timer+0x12a/0x230 [ 555.325782] [<ffffffffa03d54ca>] fm10k_probe+0x69a/0xc80 [fm10k] [ 555.325787] [<ffffffff813e8355>] local_pci_probe+0x45/0xa0 [ 555.325791] [<ffffffff8129cf42>] ? sysfs_do_create_link_sd.isra.2+0x72/0xc0 [ 555.325794] [<ffffffff813e96b9>] pci_device_probe+0xf9/0x150 [ 555.325799] [<ffffffff814d7e73>] driver_probe_device+0xa3/0x400 [ 555.325802] [<ffffffff814d82ab>] __driver_attach+0x9b/0xa0 [ 555.325805] [<ffffffff814d8210>] ? __device_attach+0x40/0x40 [ 555.325808] [<ffffffff814d5bd3>] bus_for_each_dev+0x73/0xc0 [ 555.325811] [<ffffffff814d78ce>] driver_attach+0x1e/0x20 [ 555.325815] [<ffffffff814d7480>] bus_add_driver+0x180/0x250 [ 555.325819] [<ffffffffa03b2000>] ? 0xffffffffa03b2000 [ 555.325823] [<ffffffff814d8aa4>] driver_register+0x64/0xf0 [ 555.325826] [<ffffffff813e7bec>] __pci_register_driver+0x4c/0x50 [ 555.325832] [<ffffffffa03d6ca3>] fm10k_register_pci_driver+0x23/0x30 [fm10k] [ 555.325838] [<ffffffffa03b2080>] fm10k_init_module+0x80/0x1000 [fm10k] [ 555.325843] [<ffffffff81002128>] do_one_initcall+0xb8/0x200 [ 555.325848] [<ffffffff811e10d2>] ? __vunmap+0xa2/0x100 [ 555.325852] [<ffffffff811fe239>] ? kmem_cache_alloc_trace+0x1b9/0x240 [ 555.325855] [<ffffffff8178230e>] ? do_init_module+0x28/0x1cb [ 555.325858] [<ffffffff81782346>] do_init_module+0x60/0x1cb [ 555.325862] [<ffffffff8112168e>] load_module+0x205e/0x26b0 [ 555.325866] [<ffffffff8111d110>] ? store_uevent+0x70/0x70 [ 555.325870] [<ffffffff812234b0>] ? kernel_read+0x50/0x80 [ 555.325873] [<ffffffff81121f3e>] SyS_finit_module+0xbe/0xf0 [ 555.325878] [<ffffffff81789749>] system_call_fastpath+0x12/0x17 [ 555.325880] ---[ end trace 9e0f58d071eafd2a ]--- Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <Krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2016-02-05 02:47:55 +08:00
/* the mbx interrupt might attempt to schedule the service task, so we
* must ensure it is disabled since we haven't yet requested the timer
* or work item.
*/
set_bit(__FM10K_SERVICE_DISABLE, interface->state);
fm10k: don't initialize service task until later in probe Delay initialization of the service timer and service task until late probe. If we don't wait, failures in probe do not properly cleanup the service timer or service task items, which results in the kernel panic below, potentially freezing the whole system. In addition, ensure that the SERVICE_DISABLE bit is set before we request the MBX IRQ since the MBX interrupt attempts to schedule the service task otherwise. This prevents a similar trace from occurring after this change. We didn't notice this issue before because probe almost always completes successfully. I discovered it due to a mis-ordered mailbox handler array, which resulted in the following failure when requesting mailbox interrupt. [ 555.325619] ------------[ cut here ]------------ [ 555.325628] WARNING: CPU: 0 PID: 4941 at lib/list_debug.c:33 __list_add+0xa0/0xd0() [ 555.325631] list_add corruption. prev->next should be next (ffffffff81f46648), but was (null). (prev=ffff8807fad5d0e8). <snip> [ 555.325722] CPU: 0 PID: 4941 Comm: insmod Tainted: G OE 4.0.4-303.fc22.x86_64 #1 [ 555.325725] Hardware name: Intel Corporation S2600CO/S2600CO, BIOS SE5C600.86B.02.03.8x23.060520140825 06/05/2014 [ 555.325727] 0000000000000000 00000000b4f161b3 ffff88081a21f8e8 ffffffff81783124 [ 555.325734] 0000000000000000 ffff88081a21f940 ffff88081a21f928 ffffffff8109c66a [ 555.325740] 0000000064000000 ffff8807fad5d0e8 ffff8807fad5d0e8 ffffffff81f46648 [ 555.325746] Call Trace: [ 555.325752] [<ffffffff81783124>] dump_stack+0x45/0x57 [ 555.325757] [<ffffffff8109c66a>] warn_slowpath_common+0x8a/0xc0 [ 555.325759] [<ffffffff8109c6f5>] warn_slowpath_fmt+0x55/0x70 [ 555.325763] [<ffffffff813ba270>] __list_add+0xa0/0xd0 [ 555.325768] [<ffffffff81102d1d>] __internal_add_timer+0x9d/0x110 [ 555.325771] [<ffffffff81102dbf>] internal_add_timer+0x2f/0xc0 [ 555.325774] [<ffffffff81104e5a>] mod_timer+0x12a/0x230 [ 555.325782] [<ffffffffa03d54ca>] fm10k_probe+0x69a/0xc80 [fm10k] [ 555.325787] [<ffffffff813e8355>] local_pci_probe+0x45/0xa0 [ 555.325791] [<ffffffff8129cf42>] ? sysfs_do_create_link_sd.isra.2+0x72/0xc0 [ 555.325794] [<ffffffff813e96b9>] pci_device_probe+0xf9/0x150 [ 555.325799] [<ffffffff814d7e73>] driver_probe_device+0xa3/0x400 [ 555.325802] [<ffffffff814d82ab>] __driver_attach+0x9b/0xa0 [ 555.325805] [<ffffffff814d8210>] ? __device_attach+0x40/0x40 [ 555.325808] [<ffffffff814d5bd3>] bus_for_each_dev+0x73/0xc0 [ 555.325811] [<ffffffff814d78ce>] driver_attach+0x1e/0x20 [ 555.325815] [<ffffffff814d7480>] bus_add_driver+0x180/0x250 [ 555.325819] [<ffffffffa03b2000>] ? 0xffffffffa03b2000 [ 555.325823] [<ffffffff814d8aa4>] driver_register+0x64/0xf0 [ 555.325826] [<ffffffff813e7bec>] __pci_register_driver+0x4c/0x50 [ 555.325832] [<ffffffffa03d6ca3>] fm10k_register_pci_driver+0x23/0x30 [fm10k] [ 555.325838] [<ffffffffa03b2080>] fm10k_init_module+0x80/0x1000 [fm10k] [ 555.325843] [<ffffffff81002128>] do_one_initcall+0xb8/0x200 [ 555.325848] [<ffffffff811e10d2>] ? __vunmap+0xa2/0x100 [ 555.325852] [<ffffffff811fe239>] ? kmem_cache_alloc_trace+0x1b9/0x240 [ 555.325855] [<ffffffff8178230e>] ? do_init_module+0x28/0x1cb [ 555.325858] [<ffffffff81782346>] do_init_module+0x60/0x1cb [ 555.325862] [<ffffffff8112168e>] load_module+0x205e/0x26b0 [ 555.325866] [<ffffffff8111d110>] ? store_uevent+0x70/0x70 [ 555.325870] [<ffffffff812234b0>] ? kernel_read+0x50/0x80 [ 555.325873] [<ffffffff81121f3e>] SyS_finit_module+0xbe/0xf0 [ 555.325878] [<ffffffff81789749>] system_call_fastpath+0x12/0x17 [ 555.325880] ---[ end trace 9e0f58d071eafd2a ]--- Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <Krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2016-02-05 02:47:55 +08:00
err = fm10k_mbx_request_irq(interface);
if (err)
goto err_mbx_interrupt;
/* final check of hardware state before registering the interface */
err = fm10k_hw_ready(interface);
if (err)
goto err_register;
err = register_netdev(netdev);
if (err)
goto err_register;
/* carrier off reporting is important to ethtool even BEFORE open */
netif_carrier_off(netdev);
/* stop all the transmit queues from transmitting until link is up */
netif_tx_stop_all_queues(netdev);
fm10k: don't initialize service task until later in probe Delay initialization of the service timer and service task until late probe. If we don't wait, failures in probe do not properly cleanup the service timer or service task items, which results in the kernel panic below, potentially freezing the whole system. In addition, ensure that the SERVICE_DISABLE bit is set before we request the MBX IRQ since the MBX interrupt attempts to schedule the service task otherwise. This prevents a similar trace from occurring after this change. We didn't notice this issue before because probe almost always completes successfully. I discovered it due to a mis-ordered mailbox handler array, which resulted in the following failure when requesting mailbox interrupt. [ 555.325619] ------------[ cut here ]------------ [ 555.325628] WARNING: CPU: 0 PID: 4941 at lib/list_debug.c:33 __list_add+0xa0/0xd0() [ 555.325631] list_add corruption. prev->next should be next (ffffffff81f46648), but was (null). (prev=ffff8807fad5d0e8). <snip> [ 555.325722] CPU: 0 PID: 4941 Comm: insmod Tainted: G OE 4.0.4-303.fc22.x86_64 #1 [ 555.325725] Hardware name: Intel Corporation S2600CO/S2600CO, BIOS SE5C600.86B.02.03.8x23.060520140825 06/05/2014 [ 555.325727] 0000000000000000 00000000b4f161b3 ffff88081a21f8e8 ffffffff81783124 [ 555.325734] 0000000000000000 ffff88081a21f940 ffff88081a21f928 ffffffff8109c66a [ 555.325740] 0000000064000000 ffff8807fad5d0e8 ffff8807fad5d0e8 ffffffff81f46648 [ 555.325746] Call Trace: [ 555.325752] [<ffffffff81783124>] dump_stack+0x45/0x57 [ 555.325757] [<ffffffff8109c66a>] warn_slowpath_common+0x8a/0xc0 [ 555.325759] [<ffffffff8109c6f5>] warn_slowpath_fmt+0x55/0x70 [ 555.325763] [<ffffffff813ba270>] __list_add+0xa0/0xd0 [ 555.325768] [<ffffffff81102d1d>] __internal_add_timer+0x9d/0x110 [ 555.325771] [<ffffffff81102dbf>] internal_add_timer+0x2f/0xc0 [ 555.325774] [<ffffffff81104e5a>] mod_timer+0x12a/0x230 [ 555.325782] [<ffffffffa03d54ca>] fm10k_probe+0x69a/0xc80 [fm10k] [ 555.325787] [<ffffffff813e8355>] local_pci_probe+0x45/0xa0 [ 555.325791] [<ffffffff8129cf42>] ? sysfs_do_create_link_sd.isra.2+0x72/0xc0 [ 555.325794] [<ffffffff813e96b9>] pci_device_probe+0xf9/0x150 [ 555.325799] [<ffffffff814d7e73>] driver_probe_device+0xa3/0x400 [ 555.325802] [<ffffffff814d82ab>] __driver_attach+0x9b/0xa0 [ 555.325805] [<ffffffff814d8210>] ? __device_attach+0x40/0x40 [ 555.325808] [<ffffffff814d5bd3>] bus_for_each_dev+0x73/0xc0 [ 555.325811] [<ffffffff814d78ce>] driver_attach+0x1e/0x20 [ 555.325815] [<ffffffff814d7480>] bus_add_driver+0x180/0x250 [ 555.325819] [<ffffffffa03b2000>] ? 0xffffffffa03b2000 [ 555.325823] [<ffffffff814d8aa4>] driver_register+0x64/0xf0 [ 555.325826] [<ffffffff813e7bec>] __pci_register_driver+0x4c/0x50 [ 555.325832] [<ffffffffa03d6ca3>] fm10k_register_pci_driver+0x23/0x30 [fm10k] [ 555.325838] [<ffffffffa03b2080>] fm10k_init_module+0x80/0x1000 [fm10k] [ 555.325843] [<ffffffff81002128>] do_one_initcall+0xb8/0x200 [ 555.325848] [<ffffffff811e10d2>] ? __vunmap+0xa2/0x100 [ 555.325852] [<ffffffff811fe239>] ? kmem_cache_alloc_trace+0x1b9/0x240 [ 555.325855] [<ffffffff8178230e>] ? do_init_module+0x28/0x1cb [ 555.325858] [<ffffffff81782346>] do_init_module+0x60/0x1cb [ 555.325862] [<ffffffff8112168e>] load_module+0x205e/0x26b0 [ 555.325866] [<ffffffff8111d110>] ? store_uevent+0x70/0x70 [ 555.325870] [<ffffffff812234b0>] ? kernel_read+0x50/0x80 [ 555.325873] [<ffffffff81121f3e>] SyS_finit_module+0xbe/0xf0 [ 555.325878] [<ffffffff81789749>] system_call_fastpath+0x12/0x17 [ 555.325880] ---[ end trace 9e0f58d071eafd2a ]--- Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <Krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2016-02-05 02:47:55 +08:00
/* Initialize service timer and service task late in order to avoid
* cleanup issues.
*/
setup_timer(&interface->service_timer, &fm10k_service_timer,
(unsigned long)interface);
INIT_WORK(&interface->service_task, fm10k_service_task);
/* kick off service timer now, even when interface is down */
mod_timer(&interface->service_timer, (HZ * 2) + jiffies);
/* print warning for non-optimal configurations */
fm10k_slot_warn(interface);
/* report MAC address for logging */
dev_info(&pdev->dev, "%pM\n", netdev->dev_addr);
/* enable SR-IOV after registering netdev to enforce PF/VF ordering */
fm10k_iov_configure(pdev, 0);
/* clear the service task disable bit and kick off service task */
clear_bit(__FM10K_SERVICE_DISABLE, interface->state);
fm10k_service_event_schedule(interface);
return 0;
err_register:
fm10k_mbx_free_irq(interface);
err_mbx_interrupt:
fm10k_clear_queueing_scheme(interface);
err_sw_init:
if (interface->sw_addr)
iounmap(interface->sw_addr);
iounmap(interface->uc_addr);
err_ioremap:
free_netdev(netdev);
err_alloc_netdev:
pci_release_mem_regions(pdev);
err_pci_reg:
err_dma:
pci_disable_device(pdev);
return err;
}
/**
* fm10k_remove - Device Removal Routine
* @pdev: PCI device information struct
*
* fm10k_remove is called by the PCI subsystem to alert the driver
* that it should release a PCI device. The could be caused by a
* Hot-Plug event, or because the driver is going to be removed from
* memory.
**/
static void fm10k_remove(struct pci_dev *pdev)
{
struct fm10k_intfc *interface = pci_get_drvdata(pdev);
struct net_device *netdev = interface->netdev;
del_timer_sync(&interface->service_timer);
fm10k_stop_service_event(interface);
/* free netdev, this may bounce the interrupts due to setup_tc */
if (netdev->reg_state == NETREG_REGISTERED)
unregister_netdev(netdev);
/* release VFs */
fm10k_iov_disable(pdev);
/* disable mailbox interrupt */
fm10k_mbx_free_irq(interface);
/* free interrupts */
fm10k_clear_queueing_scheme(interface);
fm10k: Add support for debugfs This patch adds limited debugfs support for the driver. Most of the functionality needed for dumping registers is already provided via ethtool. The only thing we saw that we really neeed was the ability to dump the descriptor rings so as such this patch will add a fm10k directory containing a listing of directories each one with a unique PCI Bus, Device, and Function number. Each of those BDF directories will have a list of q_vectors, and the q_vectors will contain a file for each of the Rx/Tx rings that are a part of the vector. For example: # ls -RD /sys/kernel/debug/fm10k/ /sys/kernel/debug/fm10k/: 0000:01:00.0 /sys/kernel/debug/fm10k/0000:01:00.0: q_vector.000 q_vector.001 q_vector.002 q_vector.003 /sys/kernel/debug/fm10k/0000:01:00.0/q_vector.000: rx_ring.000 tx_ring.000 /sys/kernel/debug/fm10k/0000:01:00.0/q_vector.001: rx_ring.001 tx_ring.001 /sys/kernel/debug/fm10k/0000:01:00.0/q_vector.002: rx_ring.002 tx_ring.002 /sys/kernel/debug/fm10k/0000:01:00.0/q_vector.003: rx_ring.003 tx_ring.003 # cat /sys/kernel/debug/fm10k/0000:01:00.0/q_vector.000/rx_ring.000 DES DATA RSS STATERR LENGTH VLAN DGLORT SGLORT TIMESTAMP --------------------------------------------------------------------------- 000 0x00000000 0x00000000 0x00000003 0x002a 0x0000 0x0000 0x0000 0x13951807dc4fedf0 001 0x00000000 0x00000000 0x00000003 0x002a 0x0000 0x0000 0x0000 0x1395180906c9f2c8 002 0x3731c000 0x00000000 0x00000000 0x0000 0x0000 0x0000 0x0000 0x0000000000000000 003 0x3731d000 0x00000000 0x00000000 0x0000 0x0000 0x0000 0x0000 0x0000000000000000 004 0xaab3a000 0x00000000 0x00000000 0x0000 0x0000 0x0000 0x0000 0x0000000000000000 ... # cat /sys/kernel/debug/fm10k/0000:01:00.0/q_vector.000/tx_ring.000 DES BUFFER_ADDRESS LENGTH VLAN MSS HDRLEN FLAGS --------------------------------------------------------- 000 0x00000000aa8a1002 0x005a 0x0000 0x0000 0x0000 0xc0 001 0x00000000aa8a2002 0x005a 0x0000 0x0000 0x0000 0xc0 002 0x000000006bc13202 0x004e 0x0000 0x0000 0x0000 0xc0 003 0x000000006bc13c02 0x002a 0x0000 0x0000 0x0000 0xe1 004 0x000000006bc13602 0x0062 0x0000 0x0000 0x0000 0xc0 Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2014-09-21 07:53:23 +08:00
/* remove any debugfs interfaces */
fm10k_dbg_intfc_exit(interface);
if (interface->sw_addr)
iounmap(interface->sw_addr);
iounmap(interface->uc_addr);
free_netdev(netdev);
pci_release_mem_regions(pdev);
pci_disable_pcie_error_reporting(pdev);
pci_disable_device(pdev);
}
static void fm10k_prepare_suspend(struct fm10k_intfc *interface)
{
/* the watchdog task reads from registers, which might appear like
* a surprise remove if the PCIe device is disabled while we're
* stopped. We stop the watchdog task until after we resume software
* activity.
*/
fm10k_stop_service_event(interface);
fm10k: prepare_for_reset() when we lose PCIe Link If we lose PCIe link, such as when an unannounced PFLR event occurs, or when a device is surprise removed, we currently detach the device and close the netdev. This unfortunately leaves a lot of things still active, such as the msix_mbx_pf IRQ, and Tx/Rx resources. This can cause problems because the register reads will return potentially invalid values which may result in unknown driver behavior. Begin the process of resetting using fm10k_prepare_for_reset(), much in the same way as the suspend and resume cycle does. This will attempt to shutdown as much as possible, in order to prevent possible issues. A naive implementation for this has issues, because there are now multiple flows calling the reset logic and setting a reset bit. This would cause problems, because the "re-attach" routine might call fm10k_handle_reset() prior to the reset actually finishing. Instead, we'll add state bits to indicate which flow actually initiated the reset. For the general reset flow, we'll assume that if someone else is resetting that we do not need to handle it at all, so it does not need its own state bit. For the suspend case, we will simply issue a warning indicating that we are attempting to recover from this case when resuming. For the detached subtask, we'll simply refuse to re-attach until we've actually initiated a reset as part of that flow. Finally, we'll stop attempting to manage the mailbox subtask when we're detached, since there's nothing we can do if we don't have a PCIe address. Overall this produces a much cleaner shutdown and recovery cycle for a PCIe surprise remove event. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-07-11 04:23:14 +08:00
if (fm10k_prepare_for_reset(interface))
set_bit(__FM10K_RESET_SUSPENDED, interface->state);
}
static int fm10k_handle_resume(struct fm10k_intfc *interface)
{
struct fm10k_hw *hw = &interface->hw;
int err;
fm10k: prepare_for_reset() when we lose PCIe Link If we lose PCIe link, such as when an unannounced PFLR event occurs, or when a device is surprise removed, we currently detach the device and close the netdev. This unfortunately leaves a lot of things still active, such as the msix_mbx_pf IRQ, and Tx/Rx resources. This can cause problems because the register reads will return potentially invalid values which may result in unknown driver behavior. Begin the process of resetting using fm10k_prepare_for_reset(), much in the same way as the suspend and resume cycle does. This will attempt to shutdown as much as possible, in order to prevent possible issues. A naive implementation for this has issues, because there are now multiple flows calling the reset logic and setting a reset bit. This would cause problems, because the "re-attach" routine might call fm10k_handle_reset() prior to the reset actually finishing. Instead, we'll add state bits to indicate which flow actually initiated the reset. For the general reset flow, we'll assume that if someone else is resetting that we do not need to handle it at all, so it does not need its own state bit. For the suspend case, we will simply issue a warning indicating that we are attempting to recover from this case when resuming. For the detached subtask, we'll simply refuse to re-attach until we've actually initiated a reset as part of that flow. Finally, we'll stop attempting to manage the mailbox subtask when we're detached, since there's nothing we can do if we don't have a PCIe address. Overall this produces a much cleaner shutdown and recovery cycle for a PCIe surprise remove event. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-07-11 04:23:14 +08:00
/* Even if we didn't properly prepare for reset in
* fm10k_prepare_suspend, we'll attempt to resume anyways.
*/
if (!test_and_clear_bit(__FM10K_RESET_SUSPENDED, interface->state))
dev_warn(&interface->pdev->dev,
"Device was shut down as part of suspend... Attempting to recover\n");
/* reset statistics starting values */
hw->mac.ops.rebind_hw_stats(hw, &interface->stats);
err = fm10k_handle_reset(interface);
if (err)
return err;
/* assume host is not ready, to prevent race with watchdog in case we
* actually don't have connection to the switch
*/
interface->host_ready = false;
fm10k_watchdog_host_not_ready(interface);
/* force link to stay down for a second to prevent link flutter */
interface->link_down_event = jiffies + (HZ);
set_bit(__FM10K_LINK_DOWN, interface->state);
/* restart the service task */
fm10k_start_service_event(interface);
return err;
}
#ifdef CONFIG_PM
/**
* fm10k_resume - Restore device to pre-sleep state
* @pdev: PCI device information struct
*
* fm10k_resume is called after the system has powered back up from a sleep
* state and is ready to resume operation. This function is meant to restore
* the device back to its pre-sleep state.
**/
static int fm10k_resume(struct pci_dev *pdev)
{
struct fm10k_intfc *interface = pci_get_drvdata(pdev);
struct net_device *netdev = interface->netdev;
struct fm10k_hw *hw = &interface->hw;
u32 err;
pci_set_power_state(pdev, PCI_D0);
pci_restore_state(pdev);
/* pci_restore_state clears dev->state_saved so call
* pci_save_state to restore it.
*/
pci_save_state(pdev);
err = pci_enable_device_mem(pdev);
if (err) {
dev_err(&pdev->dev, "Cannot enable PCI device from suspend\n");
return err;
}
pci_set_master(pdev);
pci_wake_from_d3(pdev, false);
/* refresh hw_addr in case it was dropped */
hw->hw_addr = interface->uc_addr;
err = fm10k_handle_resume(interface);
if (err)
return err;
netif_device_attach(netdev);
return 0;
}
/**
* fm10k_suspend - Prepare the device for a system sleep state
* @pdev: PCI device information struct
*
* fm10k_suspend is meant to shutdown the device prior to the system entering
* a sleep state. The fm10k hardware does not support wake on lan so the
* driver simply needs to shut down the device so it is in a low power state.
**/
static int fm10k_suspend(struct pci_dev *pdev,
pm_message_t __always_unused state)
{
struct fm10k_intfc *interface = pci_get_drvdata(pdev);
struct net_device *netdev = interface->netdev;
int err = 0;
netif_device_detach(netdev);
fm10k_prepare_suspend(interface);
err = pci_save_state(pdev);
if (err)
return err;
pci_disable_device(pdev);
pci_wake_from_d3(pdev, false);
pci_set_power_state(pdev, PCI_D3hot);
return 0;
}
#endif /* CONFIG_PM */
/**
* fm10k_io_error_detected - called when PCI error is detected
* @pdev: Pointer to PCI device
* @state: The current pci connection state
*
* This function is called after a PCI bus error affecting
* this device has been detected.
*/
static pci_ers_result_t fm10k_io_error_detected(struct pci_dev *pdev,
pci_channel_state_t state)
{
struct fm10k_intfc *interface = pci_get_drvdata(pdev);
struct net_device *netdev = interface->netdev;
netif_device_detach(netdev);
if (state == pci_channel_io_perm_failure)
return PCI_ERS_RESULT_DISCONNECT;
fm10k_prepare_suspend(interface);
/* Request a slot reset. */
return PCI_ERS_RESULT_NEED_RESET;
}
/**
* fm10k_io_slot_reset - called after the pci bus has been reset.
* @pdev: Pointer to PCI device
*
* Restart the card from scratch, as if from a cold-boot.
*/
static pci_ers_result_t fm10k_io_slot_reset(struct pci_dev *pdev)
{
pci_ers_result_t result;
if (pci_reenable_device(pdev)) {
dev_err(&pdev->dev,
"Cannot re-enable PCI device after reset.\n");
result = PCI_ERS_RESULT_DISCONNECT;
} else {
pci_set_master(pdev);
pci_restore_state(pdev);
/* After second error pci->state_saved is false, this
* resets it so EEH doesn't break.
*/
pci_save_state(pdev);
pci_wake_from_d3(pdev, false);
result = PCI_ERS_RESULT_RECOVERED;
}
pci_cleanup_aer_uncorrect_error_status(pdev);
return result;
}
/**
* fm10k_io_resume - called when traffic can start flowing again.
* @pdev: Pointer to PCI device
*
* This callback is called when the error recovery driver tells us that
* its OK to resume normal operation.
*/
static void fm10k_io_resume(struct pci_dev *pdev)
{
struct fm10k_intfc *interface = pci_get_drvdata(pdev);
struct net_device *netdev = interface->netdev;
int err;
err = fm10k_handle_resume(interface);
if (err)
dev_warn(&pdev->dev,
"fm10k_io_resume failed: %d\n", err);
else
netif_device_attach(netdev);
}
static void fm10k_io_reset_prepare(struct pci_dev *pdev)
{
/* warn incase we have any active VF devices */
if (pci_num_vf(pdev))
dev_warn(&pdev->dev,
"PCIe FLR may cause issues for any active VF devices\n");
fm10k_prepare_suspend(pci_get_drvdata(pdev));
}
static void fm10k_io_reset_done(struct pci_dev *pdev)
{
struct fm10k_intfc *interface = pci_get_drvdata(pdev);
int err = fm10k_handle_resume(interface);
if (err) {
dev_warn(&pdev->dev,
"fm10k_io_reset_notify failed: %d\n", err);
netif_device_detach(interface->netdev);
}
}
static const struct pci_error_handlers fm10k_err_handler = {
.error_detected = fm10k_io_error_detected,
.slot_reset = fm10k_io_slot_reset,
.resume = fm10k_io_resume,
.reset_prepare = fm10k_io_reset_prepare,
.reset_done = fm10k_io_reset_done,
};
static struct pci_driver fm10k_driver = {
.name = fm10k_driver_name,
.id_table = fm10k_pci_tbl,
.probe = fm10k_probe,
.remove = fm10k_remove,
#ifdef CONFIG_PM
.suspend = fm10k_suspend,
.resume = fm10k_resume,
#endif
.sriov_configure = fm10k_iov_configure,
.err_handler = &fm10k_err_handler
};
/**
* fm10k_register_pci_driver - register driver interface
*
* This function is called on module load in order to register the driver.
**/
int fm10k_register_pci_driver(void)
{
return pci_register_driver(&fm10k_driver);
}
/**
* fm10k_unregister_pci_driver - unregister driver interface
*
* This function is called on module unload in order to remove the driver.
**/
void fm10k_unregister_pci_driver(void)
{
pci_unregister_driver(&fm10k_driver);
}