2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* probe.c - PCI detection and setup code
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/delay.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/pci.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/cpumask.h>
|
PCI: add PCI Express ASPM support
PCI Express ASPM defines a protocol for PCI Express components in the D0
state to reduce Link power by placing their Links into a low power state
and instructing the other end of the Link to do likewise. This
capability allows hardware-autonomous, dynamic Link power reduction
beyond what is achievable by software-only controlled power management.
However, the device must be configured appropriately by software.
Enabling ASPM will save power, but will introduce device latency.
This patch adds ASPM support in Linux. It introduces a global policy for
ASPM, a sysfs file /sys/module/pcie_aspm/parameters/policy can control
it. The interface can be used as a boot option too. Currently we have
below setting:
-default, BIOS default setting
-powersave, highest power saving mode, enable all available ASPM
state and clock power management
-performance, highest performance, disable ASPM and clock power
management
By default, the 'default' policy is used currently.
In my test, power difference between powersave mode and performance mode
is about 1.3w in a system with 3 PCIE links.
Note: some devices might not work well with ASPM, because of either a
chipset issue or a device issue. The patch provides an API
(pci_disable_link_state) that a driver can use to disable ASPM for a
specific device.
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2008-02-25 09:46:41 +08:00
|
|
|
#include <linux/pci-aspm.h>
|
2005-04-08 13:53:31 +08:00
|
|
|
#include "pci.h"
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#define CARDBUS_LATENCY_TIMER	176	/* secondary latency timer */
#define CARDBUS_RESERVE_BUSNR	3
|
|
|
|
|
|
|
|
/*
 * List head named for the PCI root buses.  It is still exported to
 * modules, which we need to stop doing.
 */
LIST_HEAD(pci_root_buses);
EXPORT_SYMBOL(pci_root_buses);
|
|
|
|
|
2008-02-14 14:30:39 +08:00
|
|
|
|
|
|
|
/*
 * Match callback handed to bus_find_device() by no_pci_devices(): it
 * ignores both arguments and reports a match for every device offered.
 */
static int find_anything(struct device *dev, void *data)
{
	return 1;
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-07-16 14:39:39 +08:00
|
|
|
/*
|
|
|
|
* Some device drivers need know if pci is initiated.
|
|
|
|
* Basically, we think pci is not initiated when there
|
2008-02-14 14:30:39 +08:00
|
|
|
* is no device to be found on the pci_bus_type.
|
2007-07-16 14:39:39 +08:00
|
|
|
*/
|
|
|
|
int no_pci_devices(void)
|
|
|
|
{
|
2008-02-14 14:30:39 +08:00
|
|
|
struct device *dev;
|
|
|
|
int no_devices;
|
2007-07-16 14:39:39 +08:00
|
|
|
|
2008-02-14 14:30:39 +08:00
|
|
|
dev = bus_find_device(&pci_bus_type, NULL, NULL, find_anything);
|
|
|
|
no_devices = (dev == NULL);
|
|
|
|
put_device(dev);
|
|
|
|
return no_devices;
|
|
|
|
}
|
2007-07-16 14:39:39 +08:00
|
|
|
EXPORT_SYMBOL(no_pci_devices);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* PCI Bus Class
|
|
|
|
*/
|
2007-05-23 10:47:54 +08:00
|
|
|
static void release_pcibus_dev(struct device *dev)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2007-05-23 10:47:54 +08:00
|
|
|
struct pci_bus *pci_bus = to_pci_bus(dev);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (pci_bus->bridge)
|
|
|
|
put_device(pci_bus->bridge);
|
2010-02-24 01:24:36 +08:00
|
|
|
pci_bus_remove_resources(pci_bus);
|
2011-04-11 09:37:07 +08:00
|
|
|
pci_release_bus_of_node(pci_bus);
|
2005-04-17 06:20:36 +08:00
|
|
|
kfree(pci_bus);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct class pcibus_class = {
|
|
|
|
.name = "pci_bus",
|
2007-05-23 10:47:54 +08:00
|
|
|
.dev_release = &release_pcibus_dev,
|
2011-05-13 08:11:39 +08:00
|
|
|
.dev_attrs = pcibus_dev_attrs,
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Register pcibus_class.  Hooked up as a postcore initcall; returns the
 * result of class_register().
 */
static int __init pcibus_class_init(void)
{
	int ret;

	ret = class_register(&pcibus_class);
	return ret;
}
postcore_initcall(pcibus_class_init);
|
|
|
|
|
2008-07-29 01:38:59 +08:00
|
|
|
/*
 * Work out the extent (size - 1) decoded by a BAR.
 *
 * @base:    value the BAR held before it was probed
 * @maxbase: value read back after writing all 1s to the BAR
 * @mask:    bits of the register that carry address information
 *
 * Returns the extent, or 0 when the BAR has no significant bits or the
 * read-back value cannot be trusted.
 */
static u64 pci_size(u64 base, u64 maxbase, u64 mask)
{
	u64 significant = maxbase & mask;	/* Find the significant bits */
	u64 extent;

	if (significant == 0)
		return 0;

	/*
	 * The lowest significant bit gives the decode size; one less
	 * than that is the extent.
	 */
	extent = (significant & -significant) - 1;

	if (base != maxbase)
		return extent;

	/*
	 * base == maxbase can be valid only if the BAR has already been
	 * programmed with all 1s.
	 */
	return (((base | extent) & mask) == mask) ? extent : 0;
}
|
|
|
|
|
2011-06-15 03:04:35 +08:00
|
|
|
/*
 * Turn the low flag bits of a raw BAR value into IORESOURCE_* flags.
 *
 * An I/O BAR yields its non-address bits plus IORESOURCE_IO.  For a
 * memory BAR the low four bits are "PTT0", where P=1 means prefetchable
 * and TT is:
 *	00	32-bit BAR, anywhere in lower 4GB
 *	01	anywhere below 1MB (reserved as of PCI 2.2)
 *	10	64-bit BAR
 *	11	reserved
 * Only type "10" sets IORESOURCE_MEM_64.  A 1M BAR is logged and a
 * reserved type is warned about; both are treated as 32-bit BARs.
 *
 * @dev is used only for the log messages.
 */
static inline unsigned long decode_bar(struct pci_dev *dev, u32 bar)
{
	unsigned long res_flags;
	u32 type_bits;

	if ((bar & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) {
		res_flags = bar & ~PCI_BASE_ADDRESS_IO_MASK;
		return res_flags | IORESOURCE_IO;
	}

	res_flags = bar & ~PCI_BASE_ADDRESS_MEM_MASK;
	res_flags |= IORESOURCE_MEM;
	if (res_flags & PCI_BASE_ADDRESS_MEM_PREFETCH)
		res_flags |= IORESOURCE_PREFETCH;

	type_bits = bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK;
	if (type_bits == PCI_BASE_ADDRESS_MEM_TYPE_64)
		res_flags |= IORESOURCE_MEM_64;
	else if (type_bits == PCI_BASE_ADDRESS_MEM_TYPE_1M)
		dev_info(&dev->dev, "1M mem BAR treated as 32-bit BAR\n");
	else if (type_bits != PCI_BASE_ADDRESS_MEM_TYPE_32)
		dev_warn(&dev->dev,
			 "mem unknown type %x treated as 32-bit BAR\n",
			 type_bits);

	return res_flags;
}
|
|
|
|
|
2008-11-22 02:40:40 +08:00
|
|
|
/**
|
|
|
|
* pci_read_base - read a PCI BAR
|
|
|
|
* @dev: the PCI device
|
|
|
|
* @type: type of the BAR
|
|
|
|
* @res: resource buffer to be filled in
|
|
|
|
* @pos: BAR position in the config space
|
|
|
|
*
|
|
|
|
* Returns 1 if the BAR is 64-bit, or 0 if 32-bit.
|
2008-07-29 01:38:59 +08:00
|
|
|
*/
|
2008-11-22 02:40:40 +08:00
|
|
|
int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
|
2008-07-29 01:38:59 +08:00
|
|
|
struct resource *res, unsigned int pos)
|
2006-11-30 05:53:10 +08:00
|
|
|
{
|
2008-07-29 01:38:59 +08:00
|
|
|
u32 l, sz, mask;
|
2010-07-17 01:19:22 +08:00
|
|
|
u16 orig_cmd;
|
2008-07-29 01:38:59 +08:00
|
|
|
|
2009-10-29 23:24:59 +08:00
|
|
|
mask = type ? PCI_ROM_ADDRESS_MASK : ~0;
|
2008-07-29 01:38:59 +08:00
|
|
|
|
2010-07-17 01:19:22 +08:00
|
|
|
if (!dev->mmio_always_on) {
|
|
|
|
pci_read_config_word(dev, PCI_COMMAND, &orig_cmd);
|
|
|
|
pci_write_config_word(dev, PCI_COMMAND,
|
|
|
|
orig_cmd & ~(PCI_COMMAND_MEMORY | PCI_COMMAND_IO));
|
|
|
|
}
|
|
|
|
|
2008-07-29 01:38:59 +08:00
|
|
|
res->name = pci_name(dev);
|
|
|
|
|
|
|
|
pci_read_config_dword(dev, pos, &l);
|
2009-10-29 23:24:59 +08:00
|
|
|
pci_write_config_dword(dev, pos, l | mask);
|
2008-07-29 01:38:59 +08:00
|
|
|
pci_read_config_dword(dev, pos, &sz);
|
|
|
|
pci_write_config_dword(dev, pos, l);
|
|
|
|
|
2010-07-17 01:19:22 +08:00
|
|
|
if (!dev->mmio_always_on)
|
|
|
|
pci_write_config_word(dev, PCI_COMMAND, orig_cmd);
|
|
|
|
|
2008-07-29 01:38:59 +08:00
|
|
|
/*
|
|
|
|
* All bits set in sz means the device isn't working properly.
|
2010-04-22 23:02:43 +08:00
|
|
|
* If the BAR isn't implemented, all bits must be 0. If it's a
|
|
|
|
* memory BAR or a ROM, bit 0 must be clear; if it's an io BAR, bit
|
|
|
|
* 1 must be clear.
|
2008-07-29 01:38:59 +08:00
|
|
|
*/
|
2010-04-22 23:02:43 +08:00
|
|
|
if (!sz || sz == 0xffffffff)
|
2008-07-29 01:38:59 +08:00
|
|
|
goto fail;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* I don't know how l can have all bits set. Copied from old code.
|
|
|
|
* Maybe it fixes a bug on some ancient platform.
|
|
|
|
*/
|
|
|
|
if (l == 0xffffffff)
|
|
|
|
l = 0;
|
|
|
|
|
|
|
|
if (type == pci_bar_unknown) {
|
2011-06-15 03:04:35 +08:00
|
|
|
res->flags = decode_bar(dev, l);
|
|
|
|
res->flags |= IORESOURCE_SIZEALIGN;
|
|
|
|
if (res->flags & IORESOURCE_IO) {
|
2008-07-29 01:38:59 +08:00
|
|
|
l &= PCI_BASE_ADDRESS_IO_MASK;
|
2011-05-24 08:12:22 +08:00
|
|
|
mask = PCI_BASE_ADDRESS_IO_MASK & (u32) IO_SPACE_LIMIT;
|
2008-07-29 01:38:59 +08:00
|
|
|
} else {
|
|
|
|
l &= PCI_BASE_ADDRESS_MEM_MASK;
|
|
|
|
mask = (u32)PCI_BASE_ADDRESS_MEM_MASK;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
res->flags |= (l & IORESOURCE_ROM_ENABLE);
|
|
|
|
l &= PCI_ROM_ADDRESS_MASK;
|
|
|
|
mask = (u32)PCI_ROM_ADDRESS_MASK;
|
|
|
|
}
|
|
|
|
|
2011-06-15 03:04:35 +08:00
|
|
|
if (res->flags & IORESOURCE_MEM_64) {
|
2008-07-29 01:38:59 +08:00
|
|
|
u64 l64 = l;
|
|
|
|
u64 sz64 = sz;
|
|
|
|
u64 mask64 = mask | (u64)~0 << 32;
|
|
|
|
|
|
|
|
pci_read_config_dword(dev, pos + 4, &l);
|
|
|
|
pci_write_config_dword(dev, pos + 4, ~0);
|
|
|
|
pci_read_config_dword(dev, pos + 4, &sz);
|
|
|
|
pci_write_config_dword(dev, pos + 4, l);
|
|
|
|
|
|
|
|
l64 |= ((u64)l << 32);
|
|
|
|
sz64 |= ((u64)sz << 32);
|
|
|
|
|
|
|
|
sz64 = pci_size(l64, sz64, mask64);
|
|
|
|
|
|
|
|
if (!sz64)
|
|
|
|
goto fail;
|
|
|
|
|
2008-07-29 01:39:00 +08:00
|
|
|
if ((sizeof(resource_size_t) < 8) && (sz64 > 0x100000000ULL)) {
|
2009-11-05 01:32:57 +08:00
|
|
|
dev_err(&dev->dev, "reg %x: can't handle 64-bit BAR\n",
|
|
|
|
pos);
|
2008-07-29 01:38:59 +08:00
|
|
|
goto fail;
|
2009-10-28 03:26:47 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if ((sizeof(resource_size_t) < 8) && l) {
|
2008-07-29 01:38:59 +08:00
|
|
|
/* Address above 32-bit boundary; disable the BAR */
|
|
|
|
pci_write_config_dword(dev, pos, 0);
|
|
|
|
pci_write_config_dword(dev, pos + 4, 0);
|
|
|
|
res->start = 0;
|
|
|
|
res->end = sz64;
|
|
|
|
} else {
|
|
|
|
res->start = l64;
|
|
|
|
res->end = l64 + sz64;
|
2009-10-28 03:26:47 +08:00
|
|
|
dev_printk(KERN_DEBUG, &dev->dev, "reg %x: %pR\n",
|
2009-10-07 05:33:44 +08:00
|
|
|
pos, res);
|
2008-07-29 01:38:59 +08:00
|
|
|
}
|
|
|
|
} else {
|
2010-04-22 23:02:43 +08:00
|
|
|
sz = pci_size(l, sz, mask);
|
2008-07-29 01:38:59 +08:00
|
|
|
|
2010-04-22 23:02:43 +08:00
|
|
|
if (!sz)
|
2008-07-29 01:38:59 +08:00
|
|
|
goto fail;
|
|
|
|
|
|
|
|
res->start = l;
|
2010-04-22 23:02:43 +08:00
|
|
|
res->end = l + sz;
|
2008-10-12 18:26:12 +08:00
|
|
|
|
2009-10-28 03:26:47 +08:00
|
|
|
dev_printk(KERN_DEBUG, &dev->dev, "reg %x: %pR\n", pos, res);
|
2008-07-29 01:38:59 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
2011-06-15 03:04:35 +08:00
|
|
|
return (res->flags & IORESOURCE_MEM_64) ? 1 : 0;
|
2008-07-29 01:38:59 +08:00
|
|
|
fail:
|
|
|
|
res->flags = 0;
|
|
|
|
goto out;
|
2006-11-30 05:53:10 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
|
|
|
|
{
|
2008-07-29 01:38:59 +08:00
|
|
|
unsigned int pos, reg;
|
2006-11-30 05:53:10 +08:00
|
|
|
|
2008-07-29 01:38:59 +08:00
|
|
|
for (pos = 0; pos < howmany; pos++) {
|
|
|
|
struct resource *res = &dev->resource[pos];
|
2005-04-17 06:20:36 +08:00
|
|
|
reg = PCI_BASE_ADDRESS_0 + (pos << 2);
|
2008-07-29 01:38:59 +08:00
|
|
|
pos += __pci_read_base(dev, pci_bar_unknown, res, reg);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2008-07-29 01:38:59 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
if (rom) {
|
2008-07-29 01:38:59 +08:00
|
|
|
struct resource *res = &dev->resource[PCI_ROM_RESOURCE];
|
2005-04-17 06:20:36 +08:00
|
|
|
dev->rom_base_reg = rom;
|
2008-07-29 01:38:59 +08:00
|
|
|
res->flags = IORESOURCE_MEM | IORESOURCE_PREFETCH |
|
|
|
|
IORESOURCE_READONLY | IORESOURCE_CACHEABLE |
|
|
|
|
IORESOURCE_SIZEALIGN;
|
|
|
|
__pci_read_base(dev, pci_bar_mem32, res, rom);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-02-24 01:24:21 +08:00
|
|
|
/*
 * Read the I/O window of the bridge leading to @child from the bridge's
 * config space and record it in child->resource[0].
 *
 * The low byte registers supply address bits 15:12 (shifted into place
 * here); when the bridge reports 32-bit I/O addressing the UPPER16
 * registers supply bits 31:16.  The resource is only filled in when the
 * window is non-zero and base <= limit; an already non-zero start or end
 * in the resource is left untouched.
 */
static void __devinit pci_read_bridge_io(struct pci_bus *child)
{
	struct pci_dev *dev = child->self;
	u8 io_base_lo, io_limit_lo;
	unsigned long base, limit;
	struct resource *res;

	res = child->resource[0];
	pci_read_config_byte(dev, PCI_IO_BASE, &io_base_lo);
	pci_read_config_byte(dev, PCI_IO_LIMIT, &io_limit_lo);
	base = (io_base_lo & PCI_IO_RANGE_MASK) << 8;
	limit = (io_limit_lo & PCI_IO_RANGE_MASK) << 8;

	if ((io_base_lo & PCI_IO_RANGE_TYPE_MASK) == PCI_IO_RANGE_TYPE_32) {
		u16 io_base_hi, io_limit_hi;

		pci_read_config_word(dev, PCI_IO_BASE_UPPER16, &io_base_hi);
		pci_read_config_word(dev, PCI_IO_LIMIT_UPPER16, &io_limit_hi);

		/*
		 * Widen before shifting: a u16 is promoted to (signed) int,
		 * so shifting a value >= 0x8000 left by 16 would land in the
		 * sign bit and then sign-extend into the upper half of a
		 * 64-bit unsigned long.
		 */
		base |= ((unsigned long) io_base_hi << 16);
		limit |= ((unsigned long) io_limit_hi << 16);
	}

	if (base && base <= limit) {
		res->flags = (io_base_lo & PCI_IO_RANGE_TYPE_MASK) | IORESOURCE_IO;
		if (!res->start)
			res->start = base;
		if (!res->end)
			res->end = limit + 0xfff;	/* window granularity is 4K */
		dev_printk(KERN_DEBUG, &dev->dev, " bridge window %pR\n", res);
	}
}
|
|
|
|
|
|
|
|
/*
 * Read the non-prefetchable memory window of the bridge leading to
 * @child and store it in child->resource[1].  Nothing is stored when the
 * base is zero or lies above the limit.
 */
static void __devinit pci_read_bridge_mmio(struct pci_bus *child)
{
	struct pci_dev *bridge = child->self;
	struct resource *window = child->resource[1];
	u16 mem_base_lo, mem_limit_lo;
	unsigned long base, limit;

	pci_read_config_word(bridge, PCI_MEMORY_BASE, &mem_base_lo);
	pci_read_config_word(bridge, PCI_MEMORY_LIMIT, &mem_limit_lo);

	base = (mem_base_lo & PCI_MEMORY_RANGE_MASK) << 16;
	limit = (mem_limit_lo & PCI_MEMORY_RANGE_MASK) << 16;

	if (!base || base > limit)
		return;

	window->flags = (mem_base_lo & PCI_MEMORY_RANGE_TYPE_MASK) | IORESOURCE_MEM;
	window->start = base;
	window->end = limit + 0xfffff;
	dev_printk(KERN_DEBUG, &bridge->dev, " bridge window %pR\n", window);
}
|
|
|
|
|
|
|
|
/*
 * Read the prefetchable memory window of the bridge leading to @child
 * and store it in child->resource[2].
 *
 * When the bridge reports a 64-bit prefetchable range the UPPER32
 * registers are folded in as well.  With a 32-bit unsigned long such a
 * window cannot be represented: an error is logged and the resource is
 * left untouched.  Nothing is stored either when the base is zero or
 * lies above the limit.
 */
static void __devinit pci_read_bridge_mmio_pref(struct pci_bus *child)
{
	struct pci_dev *dev = child->self;
	u16 mem_base_lo, mem_limit_lo;
	unsigned long base, limit;
	struct resource *res;

	res = child->resource[2];
	pci_read_config_word(dev, PCI_PREF_MEMORY_BASE, &mem_base_lo);
	pci_read_config_word(dev, PCI_PREF_MEMORY_LIMIT, &mem_limit_lo);
	base = (mem_base_lo & PCI_PREF_RANGE_MASK) << 16;
	limit = (mem_limit_lo & PCI_PREF_RANGE_MASK) << 16;

	if ((mem_base_lo & PCI_PREF_RANGE_TYPE_MASK) == PCI_PREF_RANGE_TYPE_64) {
		u32 mem_base_hi, mem_limit_hi;

		pci_read_config_dword(dev, PCI_PREF_BASE_UPPER32, &mem_base_hi);
		pci_read_config_dword(dev, PCI_PREF_LIMIT_UPPER32, &mem_limit_hi);

		/*
		 * Some bridges set the base > limit by default, and some
		 * (broken) BIOSes do not initialize them.  If we find
		 * this, just assume they are not being used.
		 */
		if (mem_base_hi <= mem_limit_hi) {
#if BITS_PER_LONG == 64
			/*
			 * Shift as unsigned: with bit 31 of the upper dword
			 * set, shifting a signed long left by 32 would
			 * overflow into the sign bit.
			 */
			base |= ((unsigned long) mem_base_hi) << 32;
			limit |= ((unsigned long) mem_limit_hi) << 32;
#else
			if (mem_base_hi || mem_limit_hi) {
				dev_err(&dev->dev, "can't handle 64-bit "
					"address space for bridge\n");
				return;
			}
#endif
		}
	}

	if (base && base <= limit) {
		res->flags = (mem_base_lo & PCI_PREF_RANGE_TYPE_MASK) |
					 IORESOURCE_MEM | IORESOURCE_PREFETCH;
		if (res->flags & PCI_PREF_RANGE_TYPE_64)
			res->flags |= IORESOURCE_MEM_64;
		res->start = base;
		res->end = limit + 0xfffff;
		dev_printk(KERN_DEBUG, &dev->dev, " bridge window %pR\n", res);
	}
}
|
|
|
|
|
2010-02-24 01:24:21 +08:00
|
|
|
/*
 * Set up the resource windows of @child from its upstream bridge.
 *
 * Does nothing for a root bus.  Otherwise the bus's resource list is
 * cleared, child->resource[] is pointed at the bridge device's bridge
 * resources, and the I/O, memory and prefetchable memory windows are
 * read from the bridge.  For a transparent bridge every non-NULL
 * resource of the parent bus is added as well, flagged
 * PCI_SUBTRACTIVE_DECODE.
 */
void __devinit pci_read_bridge_bases(struct pci_bus *child)
{
	struct pci_dev *bridge = child->self;
	struct resource *parent_res;
	const char *suffix;
	int n;

	if (pci_is_root_bus(child))	/* It's a host bus, nothing to read */
		return;

	suffix = bridge->transparent ? " (subtractive decode)" : "";
	dev_info(&bridge->dev, "PCI bridge to [bus %02x-%02x]%s\n",
		 child->secondary, child->subordinate, suffix);

	pci_bus_remove_resources(child);
	for (n = 0; n < PCI_BRIDGE_RESOURCE_NUM; n++)
		child->resource[n] = &bridge->resource[PCI_BRIDGE_RESOURCES + n];

	pci_read_bridge_io(child);
	pci_read_bridge_mmio(child);
	pci_read_bridge_mmio_pref(child);

	if (!bridge->transparent)
		return;

	pci_bus_for_each_resource(child->parent, parent_res, n) {
		if (parent_res) {
			pci_bus_add_resource(child, parent_res,
					     PCI_SUBTRACTIVE_DECODE);
			dev_printk(KERN_DEBUG, &bridge->dev,
				   " bridge window %pR (subtractive decode)\n",
				   parent_res);
		}
	}
}
|
|
|
|
|
2007-03-27 13:53:30 +08:00
|
|
|
static struct pci_bus * pci_alloc_bus(void)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct pci_bus *b;
|
|
|
|
|
2006-02-28 22:34:49 +08:00
|
|
|
b = kzalloc(sizeof(*b), GFP_KERNEL);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (b) {
|
|
|
|
INIT_LIST_HEAD(&b->node);
|
|
|
|
INIT_LIST_HEAD(&b->children);
|
|
|
|
INIT_LIST_HEAD(&b->devices);
|
2008-06-11 05:28:50 +08:00
|
|
|
INIT_LIST_HEAD(&b->slots);
|
2010-02-24 01:24:36 +08:00
|
|
|
INIT_LIST_HEAD(&b->resources);
|
2009-12-13 21:11:32 +08:00
|
|
|
b->max_bus_speed = PCI_SPEED_UNKNOWN;
|
|
|
|
b->cur_bus_speed = PCI_SPEED_UNKNOWN;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
return b;
|
|
|
|
}
|
|
|
|
|
2009-12-13 21:11:33 +08:00
|
|
|
static unsigned char pcix_bus_speed[] = {
|
|
|
|
PCI_SPEED_UNKNOWN, /* 0 */
|
|
|
|
PCI_SPEED_66MHz_PCIX, /* 1 */
|
|
|
|
PCI_SPEED_100MHz_PCIX, /* 2 */
|
|
|
|
PCI_SPEED_133MHz_PCIX, /* 3 */
|
|
|
|
PCI_SPEED_UNKNOWN, /* 4 */
|
|
|
|
PCI_SPEED_66MHz_PCIX_ECC, /* 5 */
|
|
|
|
PCI_SPEED_100MHz_PCIX_ECC, /* 6 */
|
|
|
|
PCI_SPEED_133MHz_PCIX_ECC, /* 7 */
|
|
|
|
PCI_SPEED_UNKNOWN, /* 8 */
|
|
|
|
PCI_SPEED_66MHz_PCIX_266, /* 9 */
|
|
|
|
PCI_SPEED_100MHz_PCIX_266, /* A */
|
|
|
|
PCI_SPEED_133MHz_PCIX_266, /* B */
|
|
|
|
PCI_SPEED_UNKNOWN, /* C */
|
|
|
|
PCI_SPEED_66MHz_PCIX_533, /* D */
|
|
|
|
PCI_SPEED_100MHz_PCIX_533, /* E */
|
|
|
|
PCI_SPEED_133MHz_PCIX_533 /* F */
|
|
|
|
};
|
|
|
|
|
2009-12-13 21:11:32 +08:00
|
|
|
static unsigned char pcie_link_speed[] = {
|
|
|
|
PCI_SPEED_UNKNOWN, /* 0 */
|
|
|
|
PCIE_SPEED_2_5GT, /* 1 */
|
|
|
|
PCIE_SPEED_5_0GT, /* 2 */
|
2009-12-13 21:11:35 +08:00
|
|
|
PCIE_SPEED_8_0GT, /* 3 */
|
2009-12-13 21:11:32 +08:00
|
|
|
PCI_SPEED_UNKNOWN, /* 4 */
|
|
|
|
PCI_SPEED_UNKNOWN, /* 5 */
|
|
|
|
PCI_SPEED_UNKNOWN, /* 6 */
|
|
|
|
PCI_SPEED_UNKNOWN, /* 7 */
|
|
|
|
PCI_SPEED_UNKNOWN, /* 8 */
|
|
|
|
PCI_SPEED_UNKNOWN, /* 9 */
|
|
|
|
PCI_SPEED_UNKNOWN, /* A */
|
|
|
|
PCI_SPEED_UNKNOWN, /* B */
|
|
|
|
PCI_SPEED_UNKNOWN, /* C */
|
|
|
|
PCI_SPEED_UNKNOWN, /* D */
|
|
|
|
PCI_SPEED_UNKNOWN, /* E */
|
|
|
|
PCI_SPEED_UNKNOWN /* F */
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Set @bus->cur_bus_speed from the low 4 bits of the link status value
 * @linksta, translated through pcie_link_speed[].
 */
void pcie_update_link_speed(struct pci_bus *bus, u16 linksta)
{
	unsigned int code = linksta & 0xf;

	bus->cur_bus_speed = pcie_link_speed[code];
}
EXPORT_SYMBOL_GPL(pcie_update_link_speed);
|
|
|
|
|
2009-12-13 21:11:34 +08:00
|
|
|
static unsigned char agp_speeds[] = {
|
|
|
|
AGP_UNKNOWN,
|
|
|
|
AGP_1X,
|
|
|
|
AGP_2X,
|
|
|
|
AGP_4X,
|
|
|
|
AGP_8X
|
|
|
|
};
|
|
|
|
|
|
|
|
static enum pci_bus_speed agp_speed(int agp3, int agpstat)
|
|
|
|
{
|
|
|
|
int index = 0;
|
|
|
|
|
|
|
|
if (agpstat & 4)
|
|
|
|
index = 3;
|
|
|
|
else if (agpstat & 2)
|
|
|
|
index = 2;
|
|
|
|
else if (agpstat & 1)
|
|
|
|
index = 1;
|
|
|
|
else
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (agp3) {
|
|
|
|
index += 2;
|
|
|
|
if (index == 5)
|
|
|
|
index = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
return agp_speeds[index];
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2009-12-13 21:11:33 +08:00
|
|
|
static void pci_set_bus_speed(struct pci_bus *bus)
|
|
|
|
{
|
|
|
|
struct pci_dev *bridge = bus->self;
|
|
|
|
int pos;
|
|
|
|
|
2009-12-13 21:11:34 +08:00
|
|
|
pos = pci_find_capability(bridge, PCI_CAP_ID_AGP);
|
|
|
|
if (!pos)
|
|
|
|
pos = pci_find_capability(bridge, PCI_CAP_ID_AGP3);
|
|
|
|
if (pos) {
|
|
|
|
u32 agpstat, agpcmd;
|
|
|
|
|
|
|
|
pci_read_config_dword(bridge, pos + PCI_AGP_STATUS, &agpstat);
|
|
|
|
bus->max_bus_speed = agp_speed(agpstat & 8, agpstat & 7);
|
|
|
|
|
|
|
|
pci_read_config_dword(bridge, pos + PCI_AGP_COMMAND, &agpcmd);
|
|
|
|
bus->cur_bus_speed = agp_speed(agpstat & 8, agpcmd & 7);
|
|
|
|
}
|
|
|
|
|
2009-12-13 21:11:33 +08:00
|
|
|
pos = pci_find_capability(bridge, PCI_CAP_ID_PCIX);
|
|
|
|
if (pos) {
|
|
|
|
u16 status;
|
|
|
|
enum pci_bus_speed max;
|
|
|
|
pci_read_config_word(bridge, pos + 2, &status);
|
|
|
|
|
|
|
|
if (status & 0x8000) {
|
|
|
|
max = PCI_SPEED_133MHz_PCIX_533;
|
|
|
|
} else if (status & 0x4000) {
|
|
|
|
max = PCI_SPEED_133MHz_PCIX_266;
|
|
|
|
} else if (status & 0x0002) {
|
|
|
|
if (((status >> 12) & 0x3) == 2) {
|
|
|
|
max = PCI_SPEED_133MHz_PCIX_ECC;
|
|
|
|
} else {
|
|
|
|
max = PCI_SPEED_133MHz_PCIX;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
max = PCI_SPEED_66MHz_PCIX;
|
|
|
|
}
|
|
|
|
|
|
|
|
bus->max_bus_speed = max;
|
|
|
|
bus->cur_bus_speed = pcix_bus_speed[(status >> 6) & 0xf];
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
pos = pci_find_capability(bridge, PCI_CAP_ID_EXP);
|
|
|
|
if (pos) {
|
|
|
|
u32 linkcap;
|
|
|
|
u16 linksta;
|
|
|
|
|
|
|
|
pci_read_config_dword(bridge, pos + PCI_EXP_LNKCAP, &linkcap);
|
|
|
|
bus->max_bus_speed = pcie_link_speed[linkcap & 0xf];
|
|
|
|
|
|
|
|
pci_read_config_word(bridge, pos + PCI_EXP_LNKSTA, &linksta);
|
|
|
|
pcie_update_link_speed(bus, linksta);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2008-04-19 04:53:55 +08:00
|
|
|
static struct pci_bus *pci_alloc_child_bus(struct pci_bus *parent,
|
|
|
|
struct pci_dev *bridge, int busnr)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct pci_bus *child;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Allocate a new bus, and inherit stuff from the parent..
|
|
|
|
*/
|
|
|
|
child = pci_alloc_bus();
|
|
|
|
if (!child)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
child->parent = parent;
|
|
|
|
child->ops = parent->ops;
|
|
|
|
child->sysdata = parent->sysdata;
|
2006-02-15 00:52:22 +08:00
|
|
|
child->bus_flags = parent->bus_flags;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-05-23 10:47:54 +08:00
|
|
|
/* initialize some portions of the bus device, but don't register it
|
|
|
|
* now as the parent is not properly set up yet. This device will get
|
|
|
|
* registered later in pci_bus_add_devices()
|
|
|
|
*/
|
|
|
|
child->dev.class = &pcibus_class;
|
2008-10-30 09:17:49 +08:00
|
|
|
dev_set_name(&child->dev, "%04x:%02x", pci_domain_nr(child), busnr);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Set up the primary, secondary and subordinate
|
|
|
|
* bus numbers.
|
|
|
|
*/
|
|
|
|
child->number = child->secondary = busnr;
|
|
|
|
child->primary = parent->secondary;
|
|
|
|
child->subordinate = 0xff;
|
|
|
|
|
2008-11-22 02:41:07 +08:00
|
|
|
if (!bridge)
|
|
|
|
return child;
|
|
|
|
|
|
|
|
child->self = bridge;
|
|
|
|
child->bridge = get_device(&bridge->dev);
|
2011-04-11 09:37:07 +08:00
|
|
|
pci_set_bus_of_node(child);
|
2009-12-13 21:11:33 +08:00
|
|
|
pci_set_bus_speed(child);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Set up default resource pointers and names.. */
|
2008-11-22 02:39:32 +08:00
|
|
|
for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) {
|
2005-04-17 06:20:36 +08:00
|
|
|
child->resource[i] = &bridge->resource[PCI_BRIDGE_RESOURCES+i];
|
|
|
|
child->resource[i]->name = child->name;
|
|
|
|
}
|
|
|
|
bridge->subordinate = child;
|
|
|
|
|
|
|
|
return child;
|
|
|
|
}
|
|
|
|
|
2008-02-03 05:33:43 +08:00
|
|
|
/*
 * Allocate a child bus of @parent behind bridge @dev with bus number
 * @busnr and, holding pci_bus_sem for writing, append it to the parent's
 * list of children.  Returns the new bus, or NULL when the allocation
 * failed.
 */
struct pci_bus *__ref pci_add_new_bus(struct pci_bus *parent, struct pci_dev *dev, int busnr)
{
	struct pci_bus *bus = pci_alloc_child_bus(parent, dev, busnr);

	if (!bus)
		return NULL;

	down_write(&pci_bus_sem);
	list_add_tail(&bus->node, &parent->children);
	up_write(&pci_bus_sem);

	return bus;
}
|
|
|
|
|
2007-03-27 13:53:30 +08:00
|
|
|
static void pci_fixup_parent_subordinate_busnr(struct pci_bus *child, int max)
|
2005-06-03 06:41:48 +08:00
|
|
|
{
|
|
|
|
struct pci_bus *parent = child->parent;
|
2005-09-23 12:06:31 +08:00
|
|
|
|
|
|
|
/* Attempts to fix that up are really dangerous unless
|
|
|
|
we're going to re-assign all bus numbers. */
|
|
|
|
if (!pcibios_assign_all_busses())
|
|
|
|
return;
|
|
|
|
|
2005-06-03 06:41:48 +08:00
|
|
|
while (parent->parent && parent->subordinate < max) {
|
|
|
|
parent->subordinate = max;
|
|
|
|
pci_write_config_byte(parent->self, PCI_SUBORDINATE_BUS, max);
|
|
|
|
parent = parent->parent;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* If it's a bridge, configure it and scan the bus behind it.
|
|
|
|
* For CardBus bridges, we don't scan behind as the devices will
|
|
|
|
* be handled by the bridge driver itself.
|
|
|
|
*
|
|
|
|
* We need to process bridges in two passes -- first we scan those
|
|
|
|
* already configured by the BIOS and after we are done with all of
|
|
|
|
* them, we proceed to assigning numbers to the remaining buses in
|
|
|
|
* order to avoid overlaps between old and new bus numbers.
|
|
|
|
*/
|
2008-02-17 17:45:28 +08:00
|
|
|
int __devinit pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, int pass)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct pci_bus *child;
|
|
|
|
int is_cardbus = (dev->hdr_type == PCI_HEADER_TYPE_CARDBUS);
|
2005-12-08 23:53:12 +08:00
|
|
|
u32 buses, i, j = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
u16 bctl;
|
2010-03-17 05:52:58 +08:00
|
|
|
u8 primary, secondary, subordinate;
|
2008-10-21 07:06:29 +08:00
|
|
|
int broken = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
pci_read_config_dword(dev, PCI_PRIMARY_BUS, &buses);
|
2010-03-17 05:52:58 +08:00
|
|
|
primary = buses & 0xFF;
|
|
|
|
secondary = (buses >> 8) & 0xFF;
|
|
|
|
subordinate = (buses >> 16) & 0xFF;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-03-17 05:52:58 +08:00
|
|
|
dev_dbg(&dev->dev, "scanning [bus %02x-%02x] behind bridge, pass %d\n",
|
|
|
|
secondary, subordinate, pass);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-10-21 07:06:29 +08:00
|
|
|
/* Check if setup is sensible at all */
|
|
|
|
if (!pass &&
|
2010-03-17 05:52:58 +08:00
|
|
|
(primary != bus->number || secondary <= bus->number)) {
|
2008-10-21 07:06:29 +08:00
|
|
|
dev_dbg(&dev->dev, "bus configuration invalid, reconfiguring\n");
|
|
|
|
broken = 1;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Disable MasterAbortMode during probing to avoid reporting
|
|
|
|
of bus errors (in some architectures) */
|
|
|
|
pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &bctl);
|
|
|
|
pci_write_config_word(dev, PCI_BRIDGE_CONTROL,
|
|
|
|
bctl & ~PCI_BRIDGE_CTL_MASTER_ABORT);
|
|
|
|
|
2010-03-17 05:52:58 +08:00
|
|
|
if ((secondary || subordinate) && !pcibios_assign_all_busses() &&
|
|
|
|
!is_cardbus && !broken) {
|
|
|
|
unsigned int cmax;
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Bus already configured by firmware, process it in the first
|
|
|
|
* pass and just note the configuration.
|
|
|
|
*/
|
|
|
|
if (pass)
|
[PATCH] PCI: Avoid leaving MASTER_ABORT disabled permanently when returning from pci_scan_bridge.
> On Mon, Feb 13, 2006 at 05:13:21PM -0800, David S. Miller wrote:
> >
> > In drivers/pci/probe.c:pci_scan_bridge(), if this is not the first
> > pass (pass != 0) we don't restore the PCI_BRIDGE_CONTROL_REGISTER and
> > thus leave PCI_BRIDGE_CTL_MASTER_ABORT off:
> >
> > int __devinit pci_scan_bridge(struct pci_bus *bus, struct pci_dev * dev, int max, int pass)
> > {
> > ...
> > /* Disable MasterAbortMode during probing to avoid reporting
> > of bus errors (in some architectures) */
> > pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &bctl);
> > pci_write_config_word(dev, PCI_BRIDGE_CONTROL,
> > bctl & ~PCI_BRIDGE_CTL_MASTER_ABORT);
> > ...
> > if ((buses & 0xffff00) && !pcibios_assign_all_busses() && !is_cardbus) {
> > unsigned int cmax, busnr;
> > /*
> > * Bus already configured by firmware, process it in the first
> > * pass and just note the configuration.
> > */
> > if (pass)
> > return max;
> > ...
> > }
> >
> > pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bctl);
> > ...
> >
> > This doesn't seem intentional.
Agreed, looks like an accident. The patch [1] originally came from Kip
Walker (Broadcom back then) between 2.6.0-test3 and 2.6.0-test4. As I
recall it was supposed to fix an issue with with PCI aborts being
signalled by the PCI bridge of the Broadcom BCM1250 family of SOCs when
probing behind pci_scan_bridge. It is undeseriable to disable
PCI_BRIDGE_CTL_MASTER_ABORT in pci_{read,write)_config_* and the
behaviour wasn't considered a bug in need of a workaround, so this was
put in probe.c.
I don't have an affected system at hand, so can't really test but I
propose something like the below patch.
[1] http://www.linux-mips.org/git?p=linux.git;a=commit;h=599457e0cb702a31a3247ea6a5d9c6c99c4cf195
[PCI] Avoid leaving MASTER_ABORT disabled permanently when returning from pci_scan_bridge.
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2006-02-15 00:23:57 +08:00
|
|
|
goto out;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we already got to this bus through a different bridge,
|
2009-03-21 04:56:10 +08:00
|
|
|
* don't re-add it. This can happen with the i450NX chipset.
|
|
|
|
*
|
|
|
|
* However, we continue to descend down the hierarchy and
|
|
|
|
* scan remaining child buses.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2010-03-17 05:52:58 +08:00
|
|
|
child = pci_find_bus(pci_domain_nr(bus), secondary);
|
2009-03-21 04:56:10 +08:00
|
|
|
if (!child) {
|
2010-03-17 05:52:58 +08:00
|
|
|
child = pci_add_new_bus(bus, dev, secondary);
|
2009-03-21 04:56:10 +08:00
|
|
|
if (!child)
|
|
|
|
goto out;
|
2010-03-17 05:52:58 +08:00
|
|
|
child->primary = primary;
|
|
|
|
child->subordinate = subordinate;
|
2009-03-21 04:56:10 +08:00
|
|
|
child->bridge_ctl = bctl;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
cmax = pci_scan_child_bus(child);
|
|
|
|
if (cmax > max)
|
|
|
|
max = cmax;
|
|
|
|
if (child->subordinate > max)
|
|
|
|
max = child->subordinate;
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* We need to assign a number to this bus which we always
|
|
|
|
* do in the second pass.
|
|
|
|
*/
|
2005-09-23 12:06:31 +08:00
|
|
|
if (!pass) {
|
2008-10-21 07:06:29 +08:00
|
|
|
if (pcibios_assign_all_busses() || broken)
|
2005-09-23 12:06:31 +08:00
|
|
|
/* Temporarily disable forwarding of the
|
|
|
|
configuration cycles on all bridges in
|
|
|
|
this bus segment to avoid possible
|
|
|
|
conflicts in the second pass between two
|
|
|
|
bridges programmed with overlapping
|
|
|
|
bus ranges. */
|
|
|
|
pci_write_config_dword(dev, PCI_PRIMARY_BUS,
|
|
|
|
buses & ~0xffffff);
|
[PATCH] PCI: Avoid leaving MASTER_ABORT disabled permanently when returning from pci_scan_bridge.
> On Mon, Feb 13, 2006 at 05:13:21PM -0800, David S. Miller wrote:
> >
> > In drivers/pci/probe.c:pci_scan_bridge(), if this is not the first
> > pass (pass != 0) we don't restore the PCI_BRIDGE_CONTROL_REGISTER and
> > thus leave PCI_BRIDGE_CTL_MASTER_ABORT off:
> >
> > int __devinit pci_scan_bridge(struct pci_bus *bus, struct pci_dev * dev, int max, int pass)
> > {
> > ...
> > /* Disable MasterAbortMode during probing to avoid reporting
> > of bus errors (in some architectures) */
> > pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &bctl);
> > pci_write_config_word(dev, PCI_BRIDGE_CONTROL,
> > bctl & ~PCI_BRIDGE_CTL_MASTER_ABORT);
> > ...
> > if ((buses & 0xffff00) && !pcibios_assign_all_busses() && !is_cardbus) {
> > unsigned int cmax, busnr;
> > /*
> > * Bus already configured by firmware, process it in the first
> > * pass and just note the configuration.
> > */
> > if (pass)
> > return max;
> > ...
> > }
> >
> > pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bctl);
> > ...
> >
> > This doesn't seem intentional.
Agreed, looks like an accident. The patch [1] originally came from Kip
Walker (Broadcom back then) between 2.6.0-test3 and 2.6.0-test4. As I
recall it was supposed to fix an issue with with PCI aborts being
signalled by the PCI bridge of the Broadcom BCM1250 family of SOCs when
probing behind pci_scan_bridge. It is undeseriable to disable
PCI_BRIDGE_CTL_MASTER_ABORT in pci_{read,write)_config_* and the
behaviour wasn't considered a bug in need of a workaround, so this was
put in probe.c.
I don't have an affected system at hand, so can't really test but I
propose something like the below patch.
[1] http://www.linux-mips.org/git?p=linux.git;a=commit;h=599457e0cb702a31a3247ea6a5d9c6c99c4cf195
[PCI] Avoid leaving MASTER_ABORT disabled permanently when returning from pci_scan_bridge.
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2006-02-15 00:23:57 +08:00
|
|
|
goto out;
|
2005-09-23 12:06:31 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Clear errors */
|
|
|
|
pci_write_config_word(dev, PCI_STATUS, 0xffff);
|
|
|
|
|
2005-04-28 15:25:47 +08:00
|
|
|
/* Prevent assigning a bus number that already exists.
|
2011-06-02 11:02:50 +08:00
|
|
|
* This can happen when a bridge is hot-plugged, so in
|
|
|
|
* this case we only re-scan this bus. */
|
|
|
|
child = pci_find_bus(pci_domain_nr(bus), max+1);
|
|
|
|
if (!child) {
|
|
|
|
child = pci_add_new_bus(bus, dev, ++max);
|
|
|
|
if (!child)
|
|
|
|
goto out;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
buses = (buses & 0xff000000)
|
|
|
|
| ((unsigned int)(child->primary) << 0)
|
|
|
|
| ((unsigned int)(child->secondary) << 8)
|
|
|
|
| ((unsigned int)(child->subordinate) << 16);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* yenta.c forces a secondary latency timer of 176.
|
|
|
|
* Copy that behaviour here.
|
|
|
|
*/
|
|
|
|
if (is_cardbus) {
|
|
|
|
buses &= ~0xff000000;
|
|
|
|
buses |= CARDBUS_LATENCY_TIMER << 24;
|
|
|
|
}
|
2011-01-25 04:14:33 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* We need to blast all three values with a single write.
|
|
|
|
*/
|
|
|
|
pci_write_config_dword(dev, PCI_PRIMARY_BUS, buses);
|
|
|
|
|
|
|
|
if (!is_cardbus) {
|
2007-10-09 07:24:16 +08:00
|
|
|
child->bridge_ctl = bctl;
|
2005-06-03 06:41:48 +08:00
|
|
|
/*
|
|
|
|
* Adjust subordinate busnr in parent buses.
|
|
|
|
* We do this before scanning for children because
|
|
|
|
* some devices may not be detected if the bios
|
|
|
|
* was lazy.
|
|
|
|
*/
|
|
|
|
pci_fixup_parent_subordinate_busnr(child, max);
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Now we can scan all subordinate buses... */
|
|
|
|
max = pci_scan_child_bus(child);
|
2006-01-18 08:57:01 +08:00
|
|
|
/*
|
|
|
|
* now fix it up again since we have found
|
|
|
|
* the real value of max.
|
|
|
|
*/
|
|
|
|
pci_fixup_parent_subordinate_busnr(child, max);
|
2005-04-17 06:20:36 +08:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* For CardBus bridges, we leave 4 bus numbers
|
|
|
|
* as cards with a PCI-to-PCI bridge can be
|
|
|
|
* inserted later.
|
|
|
|
*/
|
2005-12-08 23:53:12 +08:00
|
|
|
for (i=0; i<CARDBUS_RESERVE_BUSNR; i++) {
|
|
|
|
struct pci_bus *parent = bus;
|
2005-04-28 15:25:47 +08:00
|
|
|
if (pci_find_bus(pci_domain_nr(bus),
|
|
|
|
max+i+1))
|
|
|
|
break;
|
2005-12-08 23:53:12 +08:00
|
|
|
while (parent->parent) {
|
|
|
|
if ((!pcibios_assign_all_busses()) &&
|
|
|
|
(parent->subordinate > max) &&
|
|
|
|
(parent->subordinate <= max+i)) {
|
|
|
|
j = 1;
|
|
|
|
}
|
|
|
|
parent = parent->parent;
|
|
|
|
}
|
|
|
|
if (j) {
|
|
|
|
/*
|
|
|
|
* Often, there are two cardbus bridges
|
|
|
|
* -- try to leave one valid bus number
|
|
|
|
* for each one.
|
|
|
|
*/
|
|
|
|
i /= 2;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2005-04-28 15:25:47 +08:00
|
|
|
max += i;
|
2005-06-03 06:41:48 +08:00
|
|
|
pci_fixup_parent_subordinate_busnr(child, max);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Set the subordinate bus number to its real value.
|
|
|
|
*/
|
|
|
|
child->subordinate = max;
|
|
|
|
pci_write_config_byte(dev, PCI_SUBORDINATE_BUS, max);
|
|
|
|
}
|
|
|
|
|
2008-02-09 06:00:52 +08:00
|
|
|
sprintf(child->name,
|
|
|
|
(is_cardbus ? "PCI CardBus %04x:%02x" : "PCI Bus %04x:%02x"),
|
|
|
|
pci_domain_nr(bus), child->number);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
PCI: lets kill the 'PCI hidden behind bridge' message
Adrian Bunk wrote:
> Alois Nešpor wrote
>> PCI: Bus #0b (-#0e) is hidden behind transparent bridge #0a (-#0b) (try 'pci=assign-busses')
>> Please report the result to linux-kernel to fix this permanently"
>>
>> dmesg:
>> "Yenta: Raising subordinate bus# of parent bus (#0a) from #0b to #0e"
>> without pci=assign-busses and nothing with pci=assign-busses.
>
> Bernhard?
Ok, lets kill the message. As Alois Nešpor also saw, that's fixed up by Yenta,
so PCI does not have to warn about it. PCI could still warn about it if
is_cardbus is 0 in that instance of pci_scan_bridge(), but so far I have
not seen a report where this would have been the case so I think we can
spare the kernel of that check (removes ~300 lines of asm) unless debugging
is done.
History: The whole check was added in the days before we had the fixup
for this in Yenta and pci=assign-busses was the only way to get CardBus
cards detected on many (not all) of the machines which give this warning.
In theory, there could be cases when this warning would be triggered and
it's not cardbus, then the warning should still apply, but I think this
should only be the case when working on a completely broken PCI setup,
but one may have already enabled the debug code in drivers/pci and the
patched check would then trigger.
I do not sign this off yet because it's completely untested so far, but
everyone is free to test it (with the #ifdef DEBUG replaced by #if 1 and
pr_debug( changed to printk(.
We may also dump the whole check (remove everything within the #ifdef from
the source) if that's perferred.
On Alois Nešpor's machine this would then (only when debugging) this message:
"PCI: Bus #0b (-#0e) is partially hidden behind transparent bridge #0a (-#0b)"
"partially" should be in the message on his machine because #0b of #0b-#0e
is reachable behind #0a-#0b, but not #0c-#0e.
But that differentiation is now moot anyway because the fixup in Yenta takes
care of it as far as I could see so far, which means that unless somebody
is debugging a totally broken PCI setup, this message is not needed anymore,
not even for debugging PCI.
Ok, here the patch with the following changes:
* Refined to say that the bus is only partially hidden when the parent
bus numbers are not totally way off (outside of) the child bus range
* remove the reference to pci=assign-busses and the plea to report it
We could add a pure source code-only comment to keep a reference to
pci=assign-busses the in case when this is triggered by someone who
is debugging the cause of this message and looking the way to solve it.
From: Bernhard Kaindl <bk@suse.de>
Cc: stable <stable@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2007-07-31 02:35:13 +08:00
|
|
|
/* Has only triggered on CardBus, fixup is in yenta_socket */
|
2005-12-08 23:53:12 +08:00
|
|
|
while (bus->parent) {
|
|
|
|
if ((child->subordinate > bus->subordinate) ||
|
|
|
|
(child->number > bus->subordinate) ||
|
|
|
|
(child->number < bus->number) ||
|
|
|
|
(child->subordinate < bus->number)) {
|
2009-11-05 01:32:57 +08:00
|
|
|
dev_info(&child->dev, "[bus %02x-%02x] %s "
|
|
|
|
"hidden behind%s bridge %s [bus %02x-%02x]\n",
|
PCI: lets kill the 'PCI hidden behind bridge' message
Adrian Bunk wrote:
> Alois Nešpor wrote
>> PCI: Bus #0b (-#0e) is hidden behind transparent bridge #0a (-#0b) (try 'pci=assign-busses')
>> Please report the result to linux-kernel to fix this permanently"
>>
>> dmesg:
>> "Yenta: Raising subordinate bus# of parent bus (#0a) from #0b to #0e"
>> without pci=assign-busses and nothing with pci=assign-busses.
>
> Bernhard?
Ok, lets kill the message. As Alois Nešpor also saw, that's fixed up by Yenta,
so PCI does not have to warn about it. PCI could still warn about it if
is_cardbus is 0 in that instance of pci_scan_bridge(), but so far I have
not seen a report where this would have been the case so I think we can
spare the kernel of that check (removes ~300 lines of asm) unless debugging
is done.
History: The whole check was added in the days before we had the fixup
for this in Yenta and pci=assign-busses was the only way to get CardBus
cards detected on many (not all) of the machines which give this warning.
In theory, there could be cases when this warning would be triggered and
it's not cardbus, then the warning should still apply, but I think this
should only be the case when working on a completely broken PCI setup,
but one may have already enabled the debug code in drivers/pci and the
patched check would then trigger.
I do not sign this off yet because it's completely untested so far, but
everyone is free to test it (with the #ifdef DEBUG replaced by #if 1 and
pr_debug( changed to printk(.
We may also dump the whole check (remove everything within the #ifdef from
the source) if that's perferred.
On Alois Nešpor's machine this would then (only when debugging) this message:
"PCI: Bus #0b (-#0e) is partially hidden behind transparent bridge #0a (-#0b)"
"partially" should be in the message on his machine because #0b of #0b-#0e
is reachable behind #0a-#0b, but not #0c-#0e.
But that differentiation is now moot anyway because the fixup in Yenta takes
care of it as far as I could see so far, which means that unless somebody
is debugging a totally broken PCI setup, this message is not needed anymore,
not even for debugging PCI.
Ok, here the patch with the following changes:
* Refined to say that the bus is only partially hidden when the parent
bus numbers are not totally way off (outside of) the child bus range
* remove the reference to pci=assign-busses and the plea to report it
We could add a pure source code-only comment to keep a reference to
pci=assign-busses the in case when this is triggered by someone who
is debugging the cause of this message and looking the way to solve it.
From: Bernhard Kaindl <bk@suse.de>
Cc: stable <stable@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2007-07-31 02:35:13 +08:00
|
|
|
child->number, child->subordinate,
|
|
|
|
(bus->number > child->subordinate &&
|
|
|
|
bus->subordinate < child->number) ?
|
2007-11-20 09:48:29 +08:00
|
|
|
"wholly" : "partially",
|
|
|
|
bus->self->transparent ? " transparent" : "",
|
2009-11-05 01:32:57 +08:00
|
|
|
dev_name(&bus->dev),
|
PCI: lets kill the 'PCI hidden behind bridge' message
Adrian Bunk wrote:
> Alois Nešpor wrote
>> PCI: Bus #0b (-#0e) is hidden behind transparent bridge #0a (-#0b) (try 'pci=assign-busses')
>> Please report the result to linux-kernel to fix this permanently"
>>
>> dmesg:
>> "Yenta: Raising subordinate bus# of parent bus (#0a) from #0b to #0e"
>> without pci=assign-busses and nothing with pci=assign-busses.
>
> Bernhard?
Ok, lets kill the message. As Alois Nešpor also saw, that's fixed up by Yenta,
so PCI does not have to warn about it. PCI could still warn about it if
is_cardbus is 0 in that instance of pci_scan_bridge(), but so far I have
not seen a report where this would have been the case so I think we can
spare the kernel of that check (removes ~300 lines of asm) unless debugging
is done.
History: The whole check was added in the days before we had the fixup
for this in Yenta and pci=assign-busses was the only way to get CardBus
cards detected on many (not all) of the machines which give this warning.
In theory, there could be cases when this warning would be triggered and
it's not cardbus, then the warning should still apply, but I think this
should only be the case when working on a completely broken PCI setup,
but one may have already enabled the debug code in drivers/pci and the
patched check would then trigger.
I do not sign this off yet because it's completely untested so far, but
everyone is free to test it (with the #ifdef DEBUG replaced by #if 1 and
pr_debug( changed to printk(.
We may also dump the whole check (remove everything within the #ifdef from
the source) if that's perferred.
On Alois Nešpor's machine this would then (only when debugging) this message:
"PCI: Bus #0b (-#0e) is partially hidden behind transparent bridge #0a (-#0b)"
"partially" should be in the message on his machine because #0b of #0b-#0e
is reachable behind #0a-#0b, but not #0c-#0e.
But that differentiation is now moot anyway because the fixup in Yenta takes
care of it as far as I could see so far, which means that unless somebody
is debugging a totally broken PCI setup, this message is not needed anymore,
not even for debugging PCI.
Ok, here the patch with the following changes:
* Refined to say that the bus is only partially hidden when the parent
bus numbers are not totally way off (outside of) the child bus range
* remove the reference to pci=assign-busses and the plea to report it
We could add a pure source code-only comment to keep a reference to
pci=assign-busses the in case when this is triggered by someone who
is debugging the cause of this message and looking the way to solve it.
From: Bernhard Kaindl <bk@suse.de>
Cc: stable <stable@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2007-07-31 02:35:13 +08:00
|
|
|
bus->number, bus->subordinate);
|
2005-12-08 23:53:12 +08:00
|
|
|
}
|
|
|
|
bus = bus->parent;
|
|
|
|
}
|
|
|
|
|
[PATCH] PCI: Avoid leaving MASTER_ABORT disabled permanently when returning from pci_scan_bridge.
> On Mon, Feb 13, 2006 at 05:13:21PM -0800, David S. Miller wrote:
> >
> > In drivers/pci/probe.c:pci_scan_bridge(), if this is not the first
> > pass (pass != 0) we don't restore the PCI_BRIDGE_CONTROL_REGISTER and
> > thus leave PCI_BRIDGE_CTL_MASTER_ABORT off:
> >
> > int __devinit pci_scan_bridge(struct pci_bus *bus, struct pci_dev * dev, int max, int pass)
> > {
> > ...
> > /* Disable MasterAbortMode during probing to avoid reporting
> > of bus errors (in some architectures) */
> > pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &bctl);
> > pci_write_config_word(dev, PCI_BRIDGE_CONTROL,
> > bctl & ~PCI_BRIDGE_CTL_MASTER_ABORT);
> > ...
> > if ((buses & 0xffff00) && !pcibios_assign_all_busses() && !is_cardbus) {
> > unsigned int cmax, busnr;
> > /*
> > * Bus already configured by firmware, process it in the first
> > * pass and just note the configuration.
> > */
> > if (pass)
> > return max;
> > ...
> > }
> >
> > pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bctl);
> > ...
> >
> > This doesn't seem intentional.
Agreed, looks like an accident. The patch [1] originally came from Kip
Walker (Broadcom back then) between 2.6.0-test3 and 2.6.0-test4. As I
recall it was supposed to fix an issue with with PCI aborts being
signalled by the PCI bridge of the Broadcom BCM1250 family of SOCs when
probing behind pci_scan_bridge. It is undeseriable to disable
PCI_BRIDGE_CTL_MASTER_ABORT in pci_{read,write)_config_* and the
behaviour wasn't considered a bug in need of a workaround, so this was
put in probe.c.
I don't have an affected system at hand, so can't really test but I
propose something like the below patch.
[1] http://www.linux-mips.org/git?p=linux.git;a=commit;h=599457e0cb702a31a3247ea6a5d9c6c99c4cf195
[PCI] Avoid leaving MASTER_ABORT disabled permanently when returning from pci_scan_bridge.
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2006-02-15 00:23:57 +08:00
|
|
|
out:
|
|
|
|
pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bctl);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
return max;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Read interrupt line and base address registers.
|
|
|
|
* The architecture-dependent code can tweak these, of course.
|
|
|
|
*/
|
|
|
|
static void pci_read_irq(struct pci_dev *dev)
|
|
|
|
{
|
|
|
|
unsigned char irq;
|
|
|
|
|
|
|
|
pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &irq);
|
2005-11-03 08:24:32 +08:00
|
|
|
dev->pin = irq;
|
2005-04-17 06:20:36 +08:00
|
|
|
if (irq)
|
|
|
|
pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq);
|
|
|
|
dev->irq = irq;
|
|
|
|
}
|
|
|
|
|
2010-01-27 01:10:03 +08:00
|
|
|
void set_pcie_port_type(struct pci_dev *pdev)
|
2009-03-20 11:25:14 +08:00
|
|
|
{
|
|
|
|
int pos;
|
|
|
|
u16 reg16;
|
|
|
|
|
|
|
|
pos = pci_find_capability(pdev, PCI_CAP_ID_EXP);
|
|
|
|
if (!pos)
|
|
|
|
return;
|
|
|
|
pdev->is_pcie = 1;
|
2009-11-05 11:05:11 +08:00
|
|
|
pdev->pcie_cap = pos;
|
2009-03-20 11:25:14 +08:00
|
|
|
pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, ®16);
|
|
|
|
pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4;
|
PCI: Set PCI-E Max Payload Size on fabric
On a given PCI-E fabric, each device, bridge, and root port can have a
different PCI-E maximum payload size. There is a sizable performance
boost for having the largest possible maximum payload size on each PCI-E
device. However, if improperly configured, fatal bus errors can occur.
Thus, it is important to ensure that PCI-E payloads sends by a device
are never larger than the MPS setting of all devices on the way to the
destination.
This can be achieved two ways:
- A conservative approach is to use the smallest common denominator of
the entire tree below a root complex for every device on that fabric.
This means for example that having a 128 bytes MPS USB controller on one
leg of a switch will dramatically reduce performances of a video card or
10GE adapter on another leg of that same switch.
It also means that any hierarchy supporting hotplug slots (including
expresscard or thunderbolt I suppose, dbl check that) will have to be
entirely clamped to 128 bytes since we cannot predict what will be
plugged into those slots, and we cannot change the MPS on a "live"
system.
- A more optimal way is possible, if it falls within a couple of
constraints:
* The top-level host bridge will never generate packets larger than the
smallest TLP (or if it can be controlled independently from its MPS at
least)
* The device will never generate packets larger than MPS (which can be
configured via MRRS)
* No support of direct PCI-E <-> PCI-E transfers between devices without
some additional code to specifically deal with that case
Then we can use an approach that basically ignores downstream requests
and focuses exclusively on upstream requests. In that case, all we need
to care about is that a device MPS is no larger than its parent MPS,
which allows us to keep all switches/bridges to the max MPS supported by
their parent and eventually the PHB.
In this case, your USB controller would no longer "starve" your 10GE
Ethernet and your hotplug slots won't affect your global MPS.
Additionally, the hotplugged devices themselves can be configured to a
larger MPS up to the value configured in the hotplug bridge.
To choose between the two available options, two PCI kernel boot args
have been added to the PCI calls. "pcie_bus_safe" will provide the
former behavior, while "pcie_bus_perf" will perform the latter behavior.
By default, the latter behavior is used.
NOTE: due to the location of the enablement, each arch will need to add
calls to this function. This patch only enables x86.
This patch includes a number of changes recommended by Benjamin
Herrenschmidt.
Tested-by: Jordan_Hargrave@dell.com
Signed-off-by: Jon Mason <mason@myri.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
|
|
|
pci_read_config_word(pdev, pos + PCI_EXP_DEVCAP, ®16);
|
|
|
|
pdev->pcie_mpss = reg16 & PCI_EXP_DEVCAP_PAYLOAD;
|
2009-03-20 11:25:14 +08:00
|
|
|
}
|
|
|
|
|
2010-01-27 01:10:03 +08:00
|
|
|
void set_pcie_hotplug_bridge(struct pci_dev *pdev)
|
2009-09-10 05:09:24 +08:00
|
|
|
{
|
|
|
|
int pos;
|
|
|
|
u16 reg16;
|
|
|
|
u32 reg32;
|
|
|
|
|
2009-11-11 13:30:56 +08:00
|
|
|
pos = pci_pcie_cap(pdev);
|
2009-09-10 05:09:24 +08:00
|
|
|
if (!pos)
|
|
|
|
return;
|
|
|
|
pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, ®16);
|
|
|
|
if (!(reg16 & PCI_EXP_FLAGS_SLOT))
|
|
|
|
return;
|
|
|
|
pci_read_config_dword(pdev, pos + PCI_EXP_SLTCAP, ®32);
|
|
|
|
if (reg32 & PCI_EXP_SLTCAP_HPC)
|
|
|
|
pdev->is_hotplug_bridge = 1;
|
|
|
|
}
|
|
|
|
|
2007-04-24 05:19:36 +08:00
|
|
|
/* Resource flags given to the fixed legacy-mode IDE port ranges below */
#define LEGACY_IO_RESOURCE	(IORESOURCE_IO | IORESOURCE_PCI_FIXED)
|
2006-12-30 08:47:29 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/**
|
|
|
|
* pci_setup_device - fill in class and map information of a device
|
|
|
|
* @dev: the device structure to fill
|
|
|
|
*
|
|
|
|
* Initialize the device structure with information about the device's
|
|
|
|
* vendor,class,memory and IO-space addresses,IRQ lines etc.
|
|
|
|
* Called at initialisation of the PCI subsystem and by CardBus services.
|
2009-03-20 11:25:14 +08:00
|
|
|
* Returns 0 on success and negative if unknown type of device (not normal,
|
|
|
|
* bridge or CardBus).
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2009-03-20 11:25:14 +08:00
|
|
|
int pci_setup_device(struct pci_dev *dev)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
u32 class;
|
2009-03-20 11:25:14 +08:00
|
|
|
u8 hdr_type;
|
|
|
|
struct pci_slot *slot;
|
2009-10-06 23:45:19 +08:00
|
|
|
int pos = 0;
|
2009-03-20 11:25:14 +08:00
|
|
|
|
|
|
|
if (pci_read_config_byte(dev, PCI_HEADER_TYPE, &hdr_type))
|
|
|
|
return -EIO;
|
|
|
|
|
|
|
|
dev->sysdata = dev->bus->sysdata;
|
|
|
|
dev->dev.parent = dev->bus->bridge;
|
|
|
|
dev->dev.bus = &pci_bus_type;
|
|
|
|
dev->hdr_type = hdr_type & 0x7f;
|
|
|
|
dev->multifunction = !!(hdr_type & 0x80);
|
|
|
|
dev->error_state = pci_channel_io_normal;
|
|
|
|
set_pcie_port_type(dev);
|
|
|
|
|
|
|
|
list_for_each_entry(slot, &dev->bus->slots, list)
|
|
|
|
if (PCI_SLOT(dev->devfn) == slot->number)
|
|
|
|
dev->slot = slot;
|
|
|
|
|
|
|
|
/* Assume 32-bit PCI; let 64-bit PCI cards (which are far rarer)
|
|
|
|
set this higher, assuming the system even supports it. */
|
|
|
|
dev->dma_mask = 0xffffffff;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-07-03 04:24:49 +08:00
|
|
|
dev_set_name(&dev->dev, "%04x:%02x:%02x.%d", pci_domain_nr(dev->bus),
|
|
|
|
dev->bus->number, PCI_SLOT(dev->devfn),
|
|
|
|
PCI_FUNC(dev->devfn));
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
pci_read_config_dword(dev, PCI_CLASS_REVISION, &class);
|
2007-06-09 06:46:30 +08:00
|
|
|
dev->revision = class & 0xff;
|
2005-04-17 06:20:36 +08:00
|
|
|
class >>= 8; /* upper 3 bytes */
|
|
|
|
dev->class = class;
|
|
|
|
class >>= 8;
|
|
|
|
|
2010-09-30 02:23:21 +08:00
|
|
|
dev_printk(KERN_DEBUG, &dev->dev, "[%04x:%04x] type %d class %#08x\n",
|
|
|
|
dev->vendor, dev->device, dev->hdr_type, class);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-03-21 22:05:11 +08:00
|
|
|
/* need to have dev->class ready */
|
|
|
|
dev->cfg_size = pci_cfg_space_size(dev);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* "Unknown power state" */
|
2005-08-18 06:32:19 +08:00
|
|
|
dev->current_state = PCI_UNKNOWN;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Early fixups, before probing the BARs */
|
|
|
|
pci_fixup_device(pci_fixup_early, dev);
|
2009-05-28 00:25:05 +08:00
|
|
|
/* device class may be changed after fixup */
|
|
|
|
class = dev->class >> 8;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
switch (dev->hdr_type) { /* header type */
|
|
|
|
case PCI_HEADER_TYPE_NORMAL: /* standard header */
|
|
|
|
if (class == PCI_CLASS_BRIDGE_PCI)
|
|
|
|
goto bad;
|
|
|
|
pci_read_irq(dev);
|
|
|
|
pci_read_bases(dev, 6, PCI_ROM_ADDRESS);
|
|
|
|
pci_read_config_word(dev, PCI_SUBSYSTEM_VENDOR_ID, &dev->subsystem_vendor);
|
|
|
|
pci_read_config_word(dev, PCI_SUBSYSTEM_ID, &dev->subsystem_device);
|
2006-10-04 07:41:26 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Do the ugly legacy mode stuff here rather than broken chip
|
|
|
|
* quirk code. Legacy mode ATA controllers have fixed
|
|
|
|
* addresses. These are not always echoed in BAR0-3, and
|
|
|
|
* BAR0-3 in a few cases contain junk!
|
|
|
|
*/
|
|
|
|
if (class == PCI_CLASS_STORAGE_IDE) {
|
|
|
|
u8 progif;
|
|
|
|
pci_read_config_byte(dev, PCI_CLASS_PROG, &progif);
|
|
|
|
if ((progif & 1) == 0) {
|
2007-12-10 23:40:54 +08:00
|
|
|
dev->resource[0].start = 0x1F0;
|
|
|
|
dev->resource[0].end = 0x1F7;
|
|
|
|
dev->resource[0].flags = LEGACY_IO_RESOURCE;
|
|
|
|
dev->resource[1].start = 0x3F6;
|
|
|
|
dev->resource[1].end = 0x3F6;
|
|
|
|
dev->resource[1].flags = LEGACY_IO_RESOURCE;
|
2006-10-04 07:41:26 +08:00
|
|
|
}
|
|
|
|
if ((progif & 4) == 0) {
|
2007-12-10 23:40:54 +08:00
|
|
|
dev->resource[2].start = 0x170;
|
|
|
|
dev->resource[2].end = 0x177;
|
|
|
|
dev->resource[2].flags = LEGACY_IO_RESOURCE;
|
|
|
|
dev->resource[3].start = 0x376;
|
|
|
|
dev->resource[3].end = 0x376;
|
|
|
|
dev->resource[3].flags = LEGACY_IO_RESOURCE;
|
2006-10-04 07:41:26 +08:00
|
|
|
}
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
|
|
|
|
|
|
|
case PCI_HEADER_TYPE_BRIDGE: /* bridge header */
|
|
|
|
if (class != PCI_CLASS_BRIDGE_PCI)
|
|
|
|
goto bad;
|
|
|
|
/* The PCI-to-PCI bridge spec requires that subtractive
|
|
|
|
decoding (i.e. transparent) bridge must have programming
|
|
|
|
interface code of 0x01. */
|
2005-11-03 08:55:49 +08:00
|
|
|
pci_read_irq(dev);
|
2005-04-17 06:20:36 +08:00
|
|
|
dev->transparent = ((dev->class & 0xff) == 1);
|
|
|
|
pci_read_bases(dev, 2, PCI_ROM_ADDRESS1);
|
2009-09-10 05:09:24 +08:00
|
|
|
set_pcie_hotplug_bridge(dev);
|
2009-10-06 23:45:19 +08:00
|
|
|
pos = pci_find_capability(dev, PCI_CAP_ID_SSVID);
|
|
|
|
if (pos) {
|
|
|
|
pci_read_config_word(dev, pos + PCI_SSVID_VENDOR_ID, &dev->subsystem_vendor);
|
|
|
|
pci_read_config_word(dev, pos + PCI_SSVID_DEVICE_ID, &dev->subsystem_device);
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
|
|
|
|
|
|
|
case PCI_HEADER_TYPE_CARDBUS: /* CardBus bridge header */
|
|
|
|
if (class != PCI_CLASS_BRIDGE_CARDBUS)
|
|
|
|
goto bad;
|
|
|
|
pci_read_irq(dev);
|
|
|
|
pci_read_bases(dev, 1, 0);
|
|
|
|
pci_read_config_word(dev, PCI_CB_SUBSYSTEM_VENDOR_ID, &dev->subsystem_vendor);
|
|
|
|
pci_read_config_word(dev, PCI_CB_SUBSYSTEM_ID, &dev->subsystem_device);
|
|
|
|
break;
|
|
|
|
|
|
|
|
default: /* unknown header */
|
2008-06-14 00:52:11 +08:00
|
|
|
dev_err(&dev->dev, "unknown header type %02x, "
|
|
|
|
"ignoring device\n", dev->hdr_type);
|
2009-03-20 11:25:14 +08:00
|
|
|
return -EIO;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
bad:
|
2008-06-14 00:52:11 +08:00
|
|
|
dev_err(&dev->dev, "ignoring class %02x (doesn't match header "
|
|
|
|
"type %02x)\n", class, dev->hdr_type);
|
2005-04-17 06:20:36 +08:00
|
|
|
dev->class = PCI_CLASS_NOT_DEFINED;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* We found a fine healthy device, go go go... */
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-10-13 19:49:55 +08:00
|
|
|
/*
 * Release the per-device capability state: the Vital Product Data
 * support first, then the SR-IOV support.
 */
static void pci_release_capabilities(struct pci_dev *dev)
{
	pci_vpd_release(dev);
	pci_iov_release(dev);
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/**
 * pci_release_dev - final teardown of a pci device structure
 * @dev: the embedded struct device of the pci device being released
 *
 * Invoked only by the device core, once the last user of this pci device
 * has gone away.  Releases the capability state and the OF node attached
 * to the device, then frees the pci_dev itself.
 */
static void pci_release_dev(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);

	pci_release_capabilities(pdev);
	pci_release_of_node(pdev);
	kfree(pdev);
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* pci_cfg_space_size - get the configuration space size of the PCI device.
|
2005-10-24 02:57:38 +08:00
|
|
|
* @dev: PCI device
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* Regular PCI devices have 256 bytes, but PCI-X 2 and PCI Express devices
|
|
|
|
* have 4096 bytes. Even if the device is capable, that doesn't mean we can
|
|
|
|
* access it. Maybe we don't have a way to generate extended config space
|
|
|
|
* accesses, or the device is behind a reverse Express bridge. So we try
|
|
|
|
* reading the dword at 0x100 which must either be 0 or a valid extended
|
|
|
|
* capability header.
|
|
|
|
*/
|
2008-04-29 07:27:23 +08:00
|
|
|
int pci_cfg_space_size_ext(struct pci_dev *dev)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
u32 status;
|
2008-10-13 19:18:07 +08:00
|
|
|
int pos = PCI_CFG_SPACE_SIZE;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-10-13 19:18:07 +08:00
|
|
|
if (pci_read_config_dword(dev, pos, &status) != PCIBIOS_SUCCESSFUL)
|
2008-04-29 07:27:23 +08:00
|
|
|
goto fail;
|
|
|
|
if (status == 0xffffffff)
|
|
|
|
goto fail;
|
|
|
|
|
|
|
|
return PCI_CFG_SPACE_EXP_SIZE;
|
|
|
|
|
|
|
|
fail:
|
|
|
|
return PCI_CFG_SPACE_SIZE;
|
|
|
|
}
|
|
|
|
|
|
|
|
int pci_cfg_space_size(struct pci_dev *dev)
|
|
|
|
{
|
|
|
|
int pos;
|
|
|
|
u32 status;
|
2009-03-09 12:35:37 +08:00
|
|
|
u16 class;
|
|
|
|
|
|
|
|
class = dev->class >> 8;
|
|
|
|
if (class == PCI_CLASS_BRIDGE_HOST)
|
|
|
|
return pci_cfg_space_size_ext(dev);
|
2008-02-15 17:32:50 +08:00
|
|
|
|
2009-11-11 13:30:56 +08:00
|
|
|
pos = pci_pcie_cap(dev);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!pos) {
|
|
|
|
pos = pci_find_capability(dev, PCI_CAP_ID_PCIX);
|
|
|
|
if (!pos)
|
|
|
|
goto fail;
|
|
|
|
|
|
|
|
pci_read_config_dword(dev, pos + PCI_X_STATUS, &status);
|
|
|
|
if (!(status & (PCI_X_STATUS_266MHZ | PCI_X_STATUS_533MHZ)))
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
|
2008-04-29 07:27:23 +08:00
|
|
|
return pci_cfg_space_size_ext(dev);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
fail:
|
|
|
|
return PCI_CFG_SPACE_SIZE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Free the struct device that was allocated for a bus bridge. */
static void pci_release_bus_bridge_dev(struct device *dev)
{
	kfree(dev);
}
|
|
|
|
|
2007-04-05 15:19:08 +08:00
|
|
|
struct pci_dev *alloc_pci_dev(void)
|
|
|
|
{
|
|
|
|
struct pci_dev *dev;
|
|
|
|
|
|
|
|
dev = kzalloc(sizeof(struct pci_dev), GFP_KERNEL);
|
|
|
|
if (!dev)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
INIT_LIST_HEAD(&dev->bus_list);
|
|
|
|
|
|
|
|
return dev;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(alloc_pci_dev);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Read the config data for a PCI device, sanity-check it
|
|
|
|
* and fill in the dev structure...
|
|
|
|
*/
|
2008-04-19 04:53:55 +08:00
|
|
|
static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct pci_dev *dev;
|
|
|
|
u32 l;
|
|
|
|
int delay = 1;
|
|
|
|
|
|
|
|
if (pci_bus_read_config_dword(bus, devfn, PCI_VENDOR_ID, &l))
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
/* some broken boards return 0 or ~0 if a slot is empty: */
|
|
|
|
if (l == 0xffffffff || l == 0x00000000 ||
|
|
|
|
l == 0x0000ffff || l == 0xffff0000)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
/* Configuration request Retry Status */
|
|
|
|
while (l == 0xffff0001) {
|
|
|
|
msleep(delay);
|
|
|
|
delay *= 2;
|
|
|
|
if (pci_bus_read_config_dword(bus, devfn, PCI_VENDOR_ID, &l))
|
|
|
|
return NULL;
|
|
|
|
/* Card hasn't responded in 60 seconds? Must be stuck. */
|
|
|
|
if (delay > 60 * 1000) {
|
2008-06-14 00:52:11 +08:00
|
|
|
printk(KERN_WARNING "pci %04x:%02x:%02x.%d: not "
|
2005-04-17 06:20:36 +08:00
|
|
|
"responding\n", pci_domain_nr(bus),
|
|
|
|
bus->number, PCI_SLOT(devfn),
|
|
|
|
PCI_FUNC(devfn));
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2007-04-05 15:19:09 +08:00
|
|
|
dev = alloc_pci_dev();
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!dev)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
dev->bus = bus;
|
|
|
|
dev->devfn = devfn;
|
|
|
|
dev->vendor = l & 0xffff;
|
|
|
|
dev->device = (l >> 16) & 0xffff;
|
2008-09-02 23:40:51 +08:00
|
|
|
|
2011-04-11 09:37:07 +08:00
|
|
|
pci_set_of_node(dev);
|
|
|
|
|
2009-03-20 11:25:14 +08:00
|
|
|
if (pci_setup_device(dev)) {
|
2005-04-17 06:20:36 +08:00
|
|
|
kfree(dev);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return dev;
|
|
|
|
}
|
|
|
|
|
2008-10-13 19:49:55 +08:00
|
|
|
/*
 * Run the per-capability initialisers for a freshly discovered device.
 * The calls are made in a fixed order; keep it when adding new ones.
 */
static void pci_init_capabilities(struct pci_dev *dev)
{
	/* MSI/MSI-X bookkeeping list */
	pci_msi_init_pci_dev(dev);

	/* Save buffers for the PCIe and PCI-X capabilities */
	pci_allocate_cap_save_buffers(dev);

	/* Power management, then platform wakeup setup */
	pci_pm_init(dev);
	platform_pci_wakeup_init(dev);

	/* Vital Product Data */
	pci_vpd_pci22_init(dev);

	/* Alternative Routing-ID Forwarding */
	pci_enable_ari(dev);

	/* Single Root I/O Virtualization */
	pci_iov_init(dev);

	/* ACS P2P upstream forwarding */
	pci_enable_acs(dev);
}
|
|
|
|
|
2007-03-27 13:53:30 +08:00
|
|
|
void pci_device_add(struct pci_dev *dev, struct pci_bus *bus)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2005-09-06 07:31:03 +08:00
|
|
|
device_initialize(&dev->dev);
|
|
|
|
dev->dev.release = pci_release_dev;
|
|
|
|
pci_dev_get(dev);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-09-06 07:31:03 +08:00
|
|
|
dev->dev.dma_mask = &dev->dma_mask;
|
2008-02-05 14:27:55 +08:00
|
|
|
dev->dev.dma_parms = &dev->dma_parms;
|
2005-09-06 07:31:03 +08:00
|
|
|
dev->dev.coherent_dma_mask = 0xffffffffull;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-02-05 14:27:55 +08:00
|
|
|
pci_set_dma_max_seg_size(dev, 65536);
|
2008-02-05 14:28:14 +08:00
|
|
|
pci_set_dma_seg_boundary(dev, 0xffffffff);
|
2008-02-05 14:27:55 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Fix up broken headers */
|
|
|
|
pci_fixup_device(pci_fixup_header, dev);
|
|
|
|
|
2009-09-10 05:49:59 +08:00
|
|
|
/* Clear the state_saved flag. */
|
|
|
|
dev->state_saved = false;
|
|
|
|
|
2008-10-13 19:49:55 +08:00
|
|
|
/* Initialize various capabilities */
|
|
|
|
pci_init_capabilities(dev);
|
2008-07-07 09:34:48 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Add the device to our list of discovered devices
|
|
|
|
* and the bus list for fixup functions, etc.
|
|
|
|
*/
|
2006-06-02 12:35:43 +08:00
|
|
|
down_write(&pci_bus_sem);
|
2005-04-17 06:20:36 +08:00
|
|
|
list_add_tail(&dev->bus_list, &bus->devices);
|
2006-06-02 12:35:43 +08:00
|
|
|
up_write(&pci_bus_sem);
|
2005-09-06 07:31:03 +08:00
|
|
|
}
|
|
|
|
|
2008-02-03 05:33:43 +08:00
|
|
|
/*
 * Return the device at @devfn on @bus, scanning for it if it is not
 * already known.  A device that pci_get_slot() finds is returned as is;
 * otherwise the function is probed with pci_scan_device() and, if
 * something is there, added to the bus with pci_device_add().
 * Returns NULL when no device is found.
 */
struct pci_dev *__ref pci_scan_single_device(struct pci_bus *bus, int devfn)
{
	struct pci_dev *found = pci_get_slot(bus, devfn);

	if (found) {
		/* balance the pci_get_slot() above before handing it back */
		pci_dev_put(found);
		return found;
	}

	found = pci_scan_device(bus, devfn);
	if (found)
		pci_device_add(found, bus);

	return found;
}
|
2007-11-22 07:07:11 +08:00
|
|
|
EXPORT_SYMBOL(pci_scan_single_device);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-12-13 21:10:02 +08:00
|
|
|
static unsigned next_ari_fn(struct pci_dev *dev, unsigned fn)
|
|
|
|
{
|
|
|
|
u16 cap;
|
2010-01-18 05:01:41 +08:00
|
|
|
unsigned pos, next_fn;
|
|
|
|
|
|
|
|
if (!dev)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ARI);
|
2009-12-13 21:10:02 +08:00
|
|
|
if (!pos)
|
|
|
|
return 0;
|
|
|
|
pci_read_config_word(dev, pos + 4, &cap);
|
2010-01-18 05:01:41 +08:00
|
|
|
next_fn = cap >> 8;
|
|
|
|
if (next_fn <= fn)
|
|
|
|
return 0;
|
|
|
|
return next_fn;
|
2009-12-13 21:10:02 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Traditional "next function" iterator: step to fn + 1, wrapping to 0
 * (which ends the scan loop) after function 7.  @dev is unused.
 */
static unsigned next_trad_fn(struct pci_dev *dev, unsigned fn)
{
	unsigned following = fn + 1;

	return following % 8;
}
|
|
|
|
|
|
|
|
/*
 * "Next function" iterator for the single-function case: always
 * returns 0, so the scan loop stops immediately.  Both arguments are
 * ignored.
 */
static unsigned no_next_fn(struct pci_dev *dev, unsigned fn)
{
	return 0u;
}
|
|
|
|
|
|
|
|
static int only_one_child(struct pci_bus *bus)
|
|
|
|
{
|
|
|
|
struct pci_dev *parent = bus->self;
|
|
|
|
if (!parent || !pci_is_pcie(parent))
|
|
|
|
return 0;
|
|
|
|
if (parent->pcie_type == PCI_EXP_TYPE_ROOT_PORT ||
|
|
|
|
parent->pcie_type == PCI_EXP_TYPE_DOWNSTREAM)
|
|
|
|
return 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/**
|
|
|
|
* pci_scan_slot - scan a PCI slot on a bus for devices.
|
|
|
|
* @bus: PCI bus to scan
|
|
|
|
* @devfn: slot number to scan (must have zero function.)
|
|
|
|
*
|
|
|
|
* Scan a PCI slot on the specified PCI bus for devices, adding
|
|
|
|
* discovered devices to the @bus->devices list. New devices
|
2008-02-15 06:56:56 +08:00
|
|
|
* will not have is_added set.
|
2009-03-21 04:56:05 +08:00
|
|
|
*
|
|
|
|
* Returns the number of new devices found.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2007-03-27 13:53:30 +08:00
|
|
|
int pci_scan_slot(struct pci_bus *bus, int devfn)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2009-12-13 21:10:02 +08:00
|
|
|
unsigned fn, nr = 0;
|
2009-03-21 04:56:05 +08:00
|
|
|
struct pci_dev *dev;
|
2009-12-13 21:10:02 +08:00
|
|
|
unsigned (*next_fn)(struct pci_dev *, unsigned) = no_next_fn;
|
|
|
|
|
|
|
|
if (only_one_child(bus) && (devfn > 0))
|
|
|
|
return 0; /* Already scanned the entire slot */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-03-21 04:56:05 +08:00
|
|
|
dev = pci_scan_single_device(bus, devfn);
|
2010-01-18 05:01:41 +08:00
|
|
|
if (!dev)
|
|
|
|
return 0;
|
|
|
|
if (!dev->is_added)
|
2009-03-21 04:56:05 +08:00
|
|
|
nr++;
|
|
|
|
|
2009-12-13 21:10:02 +08:00
|
|
|
if (pci_ari_enabled(bus))
|
|
|
|
next_fn = next_ari_fn;
|
2010-01-18 05:01:41 +08:00
|
|
|
else if (dev->multifunction)
|
2009-12-13 21:10:02 +08:00
|
|
|
next_fn = next_trad_fn;
|
|
|
|
|
|
|
|
for (fn = next_fn(dev, 0); fn > 0; fn = next_fn(dev, fn)) {
|
|
|
|
dev = pci_scan_single_device(bus, devfn + fn);
|
|
|
|
if (dev) {
|
|
|
|
if (!dev->is_added)
|
|
|
|
nr++;
|
|
|
|
dev->multifunction = 1;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
}
|
PCI: add PCI Express ASPM support
PCI Express ASPM defines a protocol for PCI Express components in the D0
state to reduce Link power by placing their Links into a low power state
and instructing the other end of the Link to do likewise. This
capability allows hardware-autonomous, dynamic Link power reduction
beyond what is achievable by software-only controlled power management.
However, The device should be configured by software appropriately.
Enabling ASPM will save power, but will introduce device latency.
This patch adds ASPM support in Linux. It introduces a global policy for
ASPM, a sysfs file /sys/module/pcie_aspm/parameters/policy can control
it. The interface can be used as a boot option too. Currently we have
below setting:
-default, BIOS default setting
-powersave, highest power saving mode, enable all available ASPM
state and clock power management
-performance, highest performance, disable ASPM and clock power
management
By default, the 'default' policy is used currently.
In my test, power difference between powersave mode and performance mode
is about 1.3w in a system with 3 PCIE links.
Note: some devices might not work well with aspm, either because chipset
issue or device issue. The patch provide API (pci_disable_link_state),
driver can disable ASPM for specific device.
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2008-02-25 09:46:41 +08:00
|
|
|
|
2008-07-23 10:32:31 +08:00
|
|
|
/* only one slot has pcie device */
|
|
|
|
if (bus->self && nr)
|
PCI: add PCI Express ASPM support
PCI Express ASPM defines a protocol for PCI Express components in the D0
state to reduce Link power by placing their Links into a low power state
and instructing the other end of the Link to do likewise. This
capability allows hardware-autonomous, dynamic Link power reduction
beyond what is achievable by software-only controlled power management.
However, The device should be configured by software appropriately.
Enabling ASPM will save power, but will introduce device latency.
This patch adds ASPM support in Linux. It introduces a global policy for
ASPM, a sysfs file /sys/module/pcie_aspm/parameters/policy can control
it. The interface can be used as a boot option too. Currently we have
below setting:
-default, BIOS default setting
-powersave, highest power saving mode, enable all available ASPM
state and clock power management
-performance, highest performance, disable ASPM and clock power
management
By default, the 'default' policy is used currently.
In my test, power difference between powersave mode and performance mode
is about 1.3w in a system with 3 PCIE links.
Note: some devices might not work well with aspm, either because chipset
issue or device issue. The patch provide API (pci_disable_link_state),
driver can disable ASPM for specific device.
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2008-02-25 09:46:41 +08:00
|
|
|
pcie_aspm_init_link_state(bus->self);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
return nr;
|
|
|
|
}
|
|
|
|
|
PCI: Set PCI-E Max Payload Size on fabric
On a given PCI-E fabric, each device, bridge, and root port can have a
different PCI-E maximum payload size. There is a sizable performance
boost for having the largest possible maximum payload size on each PCI-E
device. However, if improperly configured, fatal bus errors can occur.
Thus, it is important to ensure that PCI-E payloads sends by a device
are never larger than the MPS setting of all devices on the way to the
destination.
This can be achieved two ways:
- A conservative approach is to use the smallest common denominator of
the entire tree below a root complex for every device on that fabric.
This means for example that having a 128 bytes MPS USB controller on one
leg of a switch will dramatically reduce performances of a video card or
10GE adapter on another leg of that same switch.
It also means that any hierarchy supporting hotplug slots (including
expresscard or thunderbolt I suppose, dbl check that) will have to be
entirely clamped to 128 bytes since we cannot predict what will be
plugged into those slots, and we cannot change the MPS on a "live"
system.
- A more optimal way is possible, if it falls within a couple of
constraints:
* The top-level host bridge will never generate packets larger than the
smallest TLP (or if it can be controlled independently from its MPS at
least)
* The device will never generate packets larger than MPS (which can be
configured via MRRS)
* No support of direct PCI-E <-> PCI-E transfers between devices without
some additional code to specifically deal with that case
Then we can use an approach that basically ignores downstream requests
and focuses exclusively on upstream requests. In that case, all we need
to care about is that a device MPS is no larger than its parent MPS,
which allows us to keep all switches/bridges to the max MPS supported by
their parent and eventually the PHB.
In this case, your USB controller would no longer "starve" your 10GE
Ethernet and your hotplug slots won't affect your global MPS.
Additionally, the hotplugged devices themselves can be configured to a
larger MPS up to the value configured in the hotplug bridge.
To choose between the two available options, two PCI kernel boot args
have been added to the PCI calls. "pcie_bus_safe" will provide the
former behavior, while "pcie_bus_perf" will perform the latter behavior.
By default, the latter behavior is used.
NOTE: due to the location of the enablement, each arch will need to add
calls to this function. This patch only enables x86.
This patch includes a number of changes recommended by Benjamin
Herrenschmidt.
Tested-by: Jordan_Hargrave@dell.com
Signed-off-by: Jon Mason <mason@myri.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
|
|
|
static int pcie_find_smpss(struct pci_dev *dev, void *data)
|
|
|
|
{
|
|
|
|
u8 *smpss = data;
|
|
|
|
|
|
|
|
if (!pci_is_pcie(dev))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* For PCIE hotplug enabled slots not connected directly to a
|
|
|
|
* PCI-E root port, there can be problems when hotplugging
|
|
|
|
* devices. This is due to the possibility of hotplugging a
|
|
|
|
* device into the fabric with a smaller MPS that the devices
|
|
|
|
* currently running have configured. Modifying the MPS on the
|
|
|
|
* running devices could cause a fatal bus error due to an
|
|
|
|
* incoming frame being larger than the newly configured MPS.
|
|
|
|
* To work around this, the MPS for the entire fabric must be
|
|
|
|
* set to the minimum size. Any devices hotplugged into this
|
|
|
|
* fabric will have the minimum MPS set. If the PCI hotplug
|
|
|
|
* slot is directly connected to the root port and there are not
|
|
|
|
* other devices on the fabric (which seems to be the most
|
|
|
|
* common case), then this is not an issue and MPS discovery
|
|
|
|
* will occur as normal.
|
|
|
|
*/
|
|
|
|
if (dev->is_hotplug_bridge && (!list_is_singular(&dev->bus->devices) ||
|
2011-09-14 02:16:33 +08:00
|
|
|
(dev->bus->self &&
|
|
|
|
dev->bus->self->pcie_type != PCI_EXP_TYPE_ROOT_PORT)))
|
PCI: Set PCI-E Max Payload Size on fabric
On a given PCI-E fabric, each device, bridge, and root port can have a
different PCI-E maximum payload size. There is a sizable performance
boost for having the largest possible maximum payload size on each PCI-E
device. However, if improperly configured, fatal bus errors can occur.
Thus, it is important to ensure that PCI-E payloads sends by a device
are never larger than the MPS setting of all devices on the way to the
destination.
This can be achieved two ways:
- A conservative approach is to use the smallest common denominator of
the entire tree below a root complex for every device on that fabric.
This means for example that having a 128 bytes MPS USB controller on one
leg of a switch will dramatically reduce performances of a video card or
10GE adapter on another leg of that same switch.
It also means that any hierarchy supporting hotplug slots (including
expresscard or thunderbolt I suppose, dbl check that) will have to be
entirely clamped to 128 bytes since we cannot predict what will be
plugged into those slots, and we cannot change the MPS on a "live"
system.
- A more optimal way is possible, if it falls within a couple of
constraints:
* The top-level host bridge will never generate packets larger than the
smallest TLP (or if it can be controlled independently from its MPS at
least)
* The device will never generate packets larger than MPS (which can be
configured via MRRS)
* No support of direct PCI-E <-> PCI-E transfers between devices without
some additional code to specifically deal with that case
Then we can use an approach that basically ignores downstream requests
and focuses exclusively on upstream requests. In that case, all we need
to care about is that a device MPS is no larger than its parent MPS,
which allows us to keep all switches/bridges to the max MPS supported by
their parent and eventually the PHB.
In this case, your USB controller would no longer "starve" your 10GE
Ethernet and your hotplug slots won't affect your global MPS.
Additionally, the hotplugged devices themselves can be configured to a
larger MPS up to the value configured in the hotplug bridge.
To choose between the two available options, two PCI kernel boot args
have been added to the PCI calls. "pcie_bus_safe" will provide the
former behavior, while "pcie_bus_perf" will perform the latter behavior.
By default, the latter behavior is used.
NOTE: due to the location of the enablement, each arch will need to add
calls to this function. This patch only enables x86.
This patch includes a number of changes recommended by Benjamin
Herrenschmidt.
Tested-by: Jordan_Hargrave@dell.com
Signed-off-by: Jon Mason <mason@myri.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
|
|
|
*smpss = 0;
|
|
|
|
|
|
|
|
if (*smpss > dev->pcie_mpss)
|
|
|
|
*smpss = dev->pcie_mpss;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void pcie_write_mps(struct pci_dev *dev, int mps)
|
|
|
|
{
|
|
|
|
int rc, dev_mpss;
|
|
|
|
|
|
|
|
dev_mpss = 128 << dev->pcie_mpss;
|
|
|
|
|
|
|
|
if (pcie_bus_config == PCIE_BUS_PERFORMANCE) {
|
|
|
|
if (dev->bus->self) {
|
|
|
|
dev_dbg(&dev->bus->dev, "Bus MPSS %d\n",
|
|
|
|
128 << dev->bus->self->pcie_mpss);
|
|
|
|
|
|
|
|
/* For "MPS Force Max", the assumption is made that
|
|
|
|
* downstream communication will never be larger than
|
|
|
|
* the MRRS. So, the MPS only needs to be configured
|
|
|
|
* for the upstream communication. This being the case,
|
|
|
|
* walk from the top down and set the MPS of the child
|
|
|
|
* to that of the parent bus.
|
|
|
|
*/
|
|
|
|
mps = 128 << dev->bus->self->pcie_mpss;
|
|
|
|
if (mps > dev_mpss)
|
|
|
|
dev_warn(&dev->dev, "MPS configured higher than"
|
|
|
|
" maximum supported by the device. If"
|
|
|
|
" a bus issue occurs, try running with"
|
|
|
|
" pci=pcie_bus_safe.\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
dev->pcie_mpss = ffs(mps) - 8;
|
|
|
|
}
|
|
|
|
|
|
|
|
rc = pcie_set_mps(dev, mps);
|
|
|
|
if (rc)
|
|
|
|
dev_err(&dev->dev, "Failed attempting to set the MPS\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
static void pcie_write_mrrs(struct pci_dev *dev, int mps)
|
|
|
|
{
|
2011-09-09 05:41:18 +08:00
|
|
|
int rc, mrrs, dev_mpss;
|
PCI: Set PCI-E Max Payload Size on fabric
On a given PCI-E fabric, each device, bridge, and root port can have a
different PCI-E maximum payload size. There is a sizable performance
boost for having the largest possible maximum payload size on each PCI-E
device. However, if improperly configured, fatal bus errors can occur.
Thus, it is important to ensure that PCI-E payloads sends by a device
are never larger than the MPS setting of all devices on the way to the
destination.
This can be achieved two ways:
- A conservative approach is to use the smallest common denominator of
the entire tree below a root complex for every device on that fabric.
This means for example that having a 128 bytes MPS USB controller on one
leg of a switch will dramatically reduce performances of a video card or
10GE adapter on another leg of that same switch.
It also means that any hierarchy supporting hotplug slots (including
expresscard or thunderbolt I suppose, dbl check that) will have to be
entirely clamped to 128 bytes since we cannot predict what will be
plugged into those slots, and we cannot change the MPS on a "live"
system.
- A more optimal way is possible, if it falls within a couple of
constraints:
* The top-level host bridge will never generate packets larger than the
smallest TLP (or if it can be controlled independently from its MPS at
least)
* The device will never generate packets larger than MPS (which can be
configured via MRRS)
* No support of direct PCI-E <-> PCI-E transfers between devices without
some additional code to specifically deal with that case
Then we can use an approach that basically ignores downstream requests
and focuses exclusively on upstream requests. In that case, all we need
to care about is that a device MPS is no larger than its parent MPS,
which allows us to keep all switches/bridges to the max MPS supported by
their parent and eventually the PHB.
In this case, your USB controller would no longer "starve" your 10GE
Ethernet and your hotplug slots won't affect your global MPS.
Additionally, the hotplugged devices themselves can be configured to a
larger MPS up to the value configured in the hotplug bridge.
To choose between the two available options, two PCI kernel boot args
have been added to the PCI calls. "pcie_bus_safe" will provide the
former behavior, while "pcie_bus_perf" will perform the latter behavior.
By default, the latter behavior is used.
NOTE: due to the location of the enablement, each arch will need to add
calls to this function. This patch only enables x86.
This patch includes a number of changes recommended by Benjamin
Herrenschmidt.
Tested-by: Jordan_Hargrave@dell.com
Signed-off-by: Jon Mason <mason@myri.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
|
|
|
|
2011-09-09 05:41:18 +08:00
|
|
|
/* In the "safe" case, do not configure the MRRS. There appear to be
|
|
|
|
* issues with setting MRRS to 0 on a number of devices.
|
|
|
|
*/
|
PCI: Set PCI-E Max Payload Size on fabric
On a given PCI-E fabric, each device, bridge, and root port can have a
different PCI-E maximum payload size. There is a sizable performance
boost for having the largest possible maximum payload size on each PCI-E
device. However, if improperly configured, fatal bus errors can occur.
Thus, it is important to ensure that PCI-E payloads sends by a device
are never larger than the MPS setting of all devices on the way to the
destination.
This can be achieved two ways:
- A conservative approach is to use the smallest common denominator of
the entire tree below a root complex for every device on that fabric.
This means for example that having a 128 bytes MPS USB controller on one
leg of a switch will dramatically reduce performances of a video card or
10GE adapter on another leg of that same switch.
It also means that any hierarchy supporting hotplug slots (including
expresscard or thunderbolt I suppose, dbl check that) will have to be
entirely clamped to 128 bytes since we cannot predict what will be
plugged into those slots, and we cannot change the MPS on a "live"
system.
- A more optimal way is possible, if it falls within a couple of
constraints:
* The top-level host bridge will never generate packets larger than the
smallest TLP (or if it can be controlled independently from its MPS at
least)
* The device will never generate packets larger than MPS (which can be
configured via MRRS)
* No support of direct PCI-E <-> PCI-E transfers between devices without
some additional code to specifically deal with that case
Then we can use an approach that basically ignores downstream requests
and focuses exclusively on upstream requests. In that case, all we need
to care about is that a device MPS is no larger than its parent MPS,
which allows us to keep all switches/bridges to the max MPS supported by
their parent and eventually the PHB.
In this case, your USB controller would no longer "starve" your 10GE
Ethernet and your hotplug slots won't affect your global MPS.
Additionally, the hotplugged devices themselves can be configured to a
larger MPS up to the value configured in the hotplug bridge.
To choose between the two available options, two PCI kernel boot args
have been added to the PCI calls. "pcie_bus_safe" will provide the
former behavior, while "pcie_bus_perf" will perform the latter behavior.
By default, the latter behavior is used.
NOTE: due to the location of the enablement, each arch will need to add
calls to this function. This patch only enables x86.
This patch includes a number of changes recommended by Benjamin
Herrenschmidt.
Tested-by: Jordan_Hargrave@dell.com
Signed-off-by: Jon Mason <mason@myri.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
|
|
|
|
2011-09-09 05:41:18 +08:00
|
|
|
if (pcie_bus_config != PCIE_BUS_PERFORMANCE)
|
|
|
|
return;
|
|
|
|
|
|
|
|
dev_mpss = 128 << dev->pcie_mpss;
|
PCI: Set PCI-E Max Payload Size on fabric
On a given PCI-E fabric, each device, bridge, and root port can have a
different PCI-E maximum payload size. There is a sizable performance
boost for having the largest possible maximum payload size on each PCI-E
device. However, if improperly configured, fatal bus errors can occur.
Thus, it is important to ensure that PCI-E payloads sends by a device
are never larger than the MPS setting of all devices on the way to the
destination.
This can be achieved two ways:
- A conservative approach is to use the smallest common denominator of
the entire tree below a root complex for every device on that fabric.
This means for example that having a 128 bytes MPS USB controller on one
leg of a switch will dramatically reduce performances of a video card or
10GE adapter on another leg of that same switch.
It also means that any hierarchy supporting hotplug slots (including
expresscard or thunderbolt I suppose, dbl check that) will have to be
entirely clamped to 128 bytes since we cannot predict what will be
plugged into those slots, and we cannot change the MPS on a "live"
system.
- A more optimal way is possible, if it falls within a couple of
constraints:
* The top-level host bridge will never generate packets larger than the
smallest TLP (or if it can be controlled independently from its MPS at
least)
* The device will never generate packets larger than MPS (which can be
configured via MRRS)
* No support of direct PCI-E <-> PCI-E transfers between devices without
some additional code to specifically deal with that case
Then we can use an approach that basically ignores downstream requests
and focuses exclusively on upstream requests. In that case, all we need
to care about is that a device MPS is no larger than its parent MPS,
which allows us to keep all switches/bridges to the max MPS supported by
their parent and eventually the PHB.
In this case, your USB controller would no longer "starve" your 10GE
Ethernet and your hotplug slots won't affect your global MPS.
Additionally, the hotplugged devices themselves can be configured to a
larger MPS up to the value configured in the hotplug bridge.
To choose between the two available options, two PCI kernel boot args
have been added to the PCI calls. "pcie_bus_safe" will provide the
former behavior, while "pcie_bus_perf" will perform the latter behavior.
By default, the latter behavior is used.
NOTE: due to the location of the enablement, each arch will need to add
calls to this function. This patch only enables x86.
This patch includes a number of changes recommended by Benjamin
Herrenschmidt.
Tested-by: Jordan_Hargrave@dell.com
Signed-off-by: Jon Mason <mason@myri.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
|
|
|
|
2011-09-09 05:41:18 +08:00
|
|
|
/* For Max performance, the MRRS must be set to the largest supported
|
|
|
|
* value. However, it cannot be configured larger than the MPS the
|
|
|
|
* device or the bus can support. This assumes that the largest MRRS
|
|
|
|
* available on the device cannot be smaller than the device MPSS.
|
|
|
|
*/
|
|
|
|
mrrs = min(mps, dev_mpss);
|
PCI: Set PCI-E Max Payload Size on fabric
On a given PCI-E fabric, each device, bridge, and root port can have a
different PCI-E maximum payload size. There is a sizable performance
boost for having the largest possible maximum payload size on each PCI-E
device. However, if improperly configured, fatal bus errors can occur.
Thus, it is important to ensure that PCI-E payloads sends by a device
are never larger than the MPS setting of all devices on the way to the
destination.
This can be achieved two ways:
- A conservative approach is to use the smallest common denominator of
the entire tree below a root complex for every device on that fabric.
This means for example that having a 128 bytes MPS USB controller on one
leg of a switch will dramatically reduce performances of a video card or
10GE adapter on another leg of that same switch.
It also means that any hierarchy supporting hotplug slots (including
expresscard or thunderbolt I suppose, dbl check that) will have to be
entirely clamped to 128 bytes since we cannot predict what will be
plugged into those slots, and we cannot change the MPS on a "live"
system.
- A more optimal way is possible, if it falls within a couple of
constraints:
* The top-level host bridge will never generate packets larger than the
smallest TLP (or if it can be controlled independently from its MPS at
least)
* The device will never generate packets larger than MPS (which can be
configured via MRRS)
* No support of direct PCI-E <-> PCI-E transfers between devices without
some additional code to specifically deal with that case
Then we can use an approach that basically ignores downstream requests
and focuses exclusively on upstream requests. In that case, all we need
to care about is that a device MPS is no larger than its parent MPS,
which allows us to keep all switches/bridges to the max MPS supported by
their parent and eventually the PHB.
In this case, your USB controller would no longer "starve" your 10GE
Ethernet and your hotplug slots won't affect your global MPS.
Additionally, the hotplugged devices themselves can be configured to a
larger MPS up to the value configured in the hotplug bridge.
To choose between the two available options, two PCI kernel boot args
have been added to the PCI calls. "pcie_bus_safe" will provide the
former behavior, while "pcie_bus_perf" will perform the latter behavior.
By default, the latter behavior is used.
NOTE: due to the location of the enablement, each arch will need to add
calls to this function. This patch only enables x86.
This patch includes a number of changes recommended by Benjamin
Herrenschmidt.
Tested-by: Jordan_Hargrave@dell.com
Signed-off-by: Jon Mason <mason@myri.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
|
|
|
|
|
|
|
/* MRRS is a R/W register. Invalid values can be written, but a
|
2011-09-09 05:41:18 +08:00
|
|
|
* subsequent read will verify if the value is acceptable or not.
|
PCI: Set PCI-E Max Payload Size on fabric
On a given PCI-E fabric, each device, bridge, and root port can have a
different PCI-E maximum payload size. There is a sizable performance
boost for having the largest possible maximum payload size on each PCI-E
device. However, if improperly configured, fatal bus errors can occur.
Thus, it is important to ensure that PCI-E payloads sends by a device
are never larger than the MPS setting of all devices on the way to the
destination.
This can be achieved two ways:
- A conservative approach is to use the smallest common denominator of
the entire tree below a root complex for every device on that fabric.
This means for example that having a 128 bytes MPS USB controller on one
leg of a switch will dramatically reduce performances of a video card or
10GE adapter on another leg of that same switch.
It also means that any hierarchy supporting hotplug slots (including
expresscard or thunderbolt I suppose, dbl check that) will have to be
entirely clamped to 128 bytes since we cannot predict what will be
plugged into those slots, and we cannot change the MPS on a "live"
system.
- A more optimal way is possible, if it falls within a couple of
constraints:
* The top-level host bridge will never generate packets larger than the
smallest TLP (or if it can be controlled independently from its MPS at
least)
* The device will never generate packets larger than MPS (which can be
configured via MRRS)
* No support of direct PCI-E <-> PCI-E transfers between devices without
some additional code to specifically deal with that case
Then we can use an approach that basically ignores downstream requests
and focuses exclusively on upstream requests. In that case, all we need
to care about is that a device MPS is no larger than its parent MPS,
which allows us to keep all switches/bridges to the max MPS supported by
their parent and eventually the PHB.
In this case, your USB controller would no longer "starve" your 10GE
Ethernet and your hotplug slots won't affect your global MPS.
Additionally, the hotplugged devices themselves can be configured to a
larger MPS up to the value configured in the hotplug bridge.
To choose between the two available options, two PCI kernel boot args
have been added to the PCI calls. "pcie_bus_safe" will provide the
former behavior, while "pcie_bus_perf" will perform the latter behavior.
By default, the latter behavior is used.
NOTE: due to the location of the enablement, each arch will need to add
calls to this function. This patch only enables x86.
This patch includes a number of changes recommended by Benjamin
Herrenschmidt.
Tested-by: Jordan_Hargrave@dell.com
Signed-off-by: Jon Mason <mason@myri.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
|
|
|
* If the MRRS value provided is not acceptable (e.g., too large),
|
|
|
|
* shrink the value until it is acceptable to the HW.
|
|
|
|
*/
|
|
|
|
while (mrrs != pcie_get_readrq(dev) && mrrs >= 128) {
|
2011-09-09 05:41:18 +08:00
|
|
|
dev_warn(&dev->dev, "Attempting to modify the PCI-E MRRS value"
|
|
|
|
" to %d. If any issues are encountered, please try "
|
|
|
|
"running with pci=pcie_bus_safe\n", mrrs);
|
PCI: Set PCI-E Max Payload Size on fabric
On a given PCI-E fabric, each device, bridge, and root port can have a
different PCI-E maximum payload size. There is a sizable performance
boost for having the largest possible maximum payload size on each PCI-E
device. However, if improperly configured, fatal bus errors can occur.
Thus, it is important to ensure that PCI-E payloads sends by a device
are never larger than the MPS setting of all devices on the way to the
destination.
This can be achieved two ways:
- A conservative approach is to use the smallest common denominator of
the entire tree below a root complex for every device on that fabric.
This means for example that having a 128 bytes MPS USB controller on one
leg of a switch will dramatically reduce performances of a video card or
10GE adapter on another leg of that same switch.
It also means that any hierarchy supporting hotplug slots (including
expresscard or thunderbolt I suppose, dbl check that) will have to be
entirely clamped to 128 bytes since we cannot predict what will be
plugged into those slots, and we cannot change the MPS on a "live"
system.
- A more optimal way is possible, if it falls within a couple of
constraints:
* The top-level host bridge will never generate packets larger than the
smallest TLP (or if it can be controlled independently from its MPS at
least)
* The device will never generate packets larger than MPS (which can be
configured via MRRS)
* No support of direct PCI-E <-> PCI-E transfers between devices without
some additional code to specifically deal with that case
Then we can use an approach that basically ignores downstream requests
and focuses exclusively on upstream requests. In that case, all we need
to care about is that a device MPS is no larger than its parent MPS,
which allows us to keep all switches/bridges to the max MPS supported by
their parent and eventually the PHB.
In this case, your USB controller would no longer "starve" your 10GE
Ethernet and your hotplug slots won't affect your global MPS.
Additionally, the hotplugged devices themselves can be configured to a
larger MPS up to the value configured in the hotplug bridge.
To choose between the two available options, two PCI kernel boot args
have been added to the PCI calls. "pcie_bus_safe" will provide the
former behavior, while "pcie_bus_perf" will perform the latter behavior.
By default, the latter behavior is used.
NOTE: due to the location of the enablement, each arch will need to add
calls to this function. This patch only enables x86.
This patch includes a number of changes recommended by Benjamin
Herrenschmidt.
Tested-by: Jordan_Hargrave@dell.com
Signed-off-by: Jon Mason <mason@myri.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
|
|
|
rc = pcie_set_readrq(dev, mrrs);
|
|
|
|
if (rc)
|
2011-09-09 05:41:18 +08:00
|
|
|
dev_err(&dev->dev,
|
|
|
|
"Failed attempting to set the MRRS\n");
|
PCI: Set PCI-E Max Payload Size on fabric
On a given PCI-E fabric, each device, bridge, and root port can have a
different PCI-E maximum payload size. There is a sizable performance
boost for having the largest possible maximum payload size on each PCI-E
device. However, if improperly configured, fatal bus errors can occur.
Thus, it is important to ensure that PCI-E payloads sends by a device
are never larger than the MPS setting of all devices on the way to the
destination.
This can be achieved two ways:
- A conservative approach is to use the smallest common denominator of
the entire tree below a root complex for every device on that fabric.
This means for example that having a 128 bytes MPS USB controller on one
leg of a switch will dramatically reduce performances of a video card or
10GE adapter on another leg of that same switch.
It also means that any hierarchy supporting hotplug slots (including
expresscard or thunderbolt I suppose, dbl check that) will have to be
entirely clamped to 128 bytes since we cannot predict what will be
plugged into those slots, and we cannot change the MPS on a "live"
system.
- A more optimal way is possible, if it falls within a couple of
constraints:
* The top-level host bridge will never generate packets larger than the
smallest TLP (or if it can be controlled independently from its MPS at
least)
* The device will never generate packets larger than MPS (which can be
configured via MRRS)
* No support of direct PCI-E <-> PCI-E transfers between devices without
some additional code to specifically deal with that case
Then we can use an approach that basically ignores downstream requests
and focuses exclusively on upstream requests. In that case, all we need
to care about is that a device MPS is no larger than its parent MPS,
which allows us to keep all switches/bridges to the max MPS supported by
their parent and eventually the PHB.
In this case, your USB controller would no longer "starve" your 10GE
Ethernet and your hotplug slots won't affect your global MPS.
Additionally, the hotplugged devices themselves can be configured to a
larger MPS up to the value configured in the hotplug bridge.
To choose between the two available options, two PCI kernel boot args
have been added to the PCI calls. "pcie_bus_safe" will provide the
former behavior, while "pcie_bus_perf" will perform the latter behavior.
By default, the latter behavior is used.
NOTE: due to the location of the enablement, each arch will need to add
calls to this function. This patch only enables x86.
This patch includes a number of changes recommended by Benjamin
Herrenschmidt.
Tested-by: Jordan_Hargrave@dell.com
Signed-off-by: Jon Mason <mason@myri.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
|
|
|
|
|
|
|
mrrs /= 2;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int pcie_bus_configure_set(struct pci_dev *dev, void *data)
|
|
|
|
{
|
|
|
|
int mps = 128 << *(u8 *)data;
|
|
|
|
|
|
|
|
if (!pci_is_pcie(dev))
|
|
|
|
return 0;
|
|
|
|
|
2011-09-09 05:41:18 +08:00
|
|
|
dev_dbg(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n",
|
PCI: Set PCI-E Max Payload Size on fabric
On a given PCI-E fabric, each device, bridge, and root port can have a
different PCI-E maximum payload size. There is a sizable performance
boost for having the largest possible maximum payload size on each PCI-E
device. However, if improperly configured, fatal bus errors can occur.
Thus, it is important to ensure that PCI-E payloads sends by a device
are never larger than the MPS setting of all devices on the way to the
destination.
This can be achieved two ways:
- A conservative approach is to use the smallest common denominator of
the entire tree below a root complex for every device on that fabric.
This means for example that having a 128 bytes MPS USB controller on one
leg of a switch will dramatically reduce performances of a video card or
10GE adapter on another leg of that same switch.
It also means that any hierarchy supporting hotplug slots (including
expresscard or thunderbolt I suppose, dbl check that) will have to be
entirely clamped to 128 bytes since we cannot predict what will be
plugged into those slots, and we cannot change the MPS on a "live"
system.
- A more optimal way is possible, if it falls within a couple of
constraints:
* The top-level host bridge will never generate packets larger than the
smallest TLP (or if it can be controlled independently from its MPS at
least)
* The device will never generate packets larger than MPS (which can be
configured via MRRS)
* No support of direct PCI-E <-> PCI-E transfers between devices without
some additional code to specifically deal with that case
Then we can use an approach that basically ignores downstream requests
and focuses exclusively on upstream requests. In that case, all we need
to care about is that a device MPS is no larger than its parent MPS,
which allows us to keep all switches/bridges to the max MPS supported by
their parent and eventually the PHB.
In this case, your USB controller would no longer "starve" your 10GE
Ethernet and your hotplug slots won't affect your global MPS.
Additionally, the hotplugged devices themselves can be configured to a
larger MPS up to the value configured in the hotplug bridge.
To choose between the two available options, two PCI kernel boot args
have been added to the PCI calls. "pcie_bus_safe" will provide the
former behavior, while "pcie_bus_perf" will perform the latter behavior.
By default, the latter behavior is used.
NOTE: due to the location of the enablement, each arch will need to add
calls to this function. This patch only enables x86.
This patch includes a number of changes recommended by Benjamin
Herrenschmidt.
Tested-by: Jordan_Hargrave@dell.com
Signed-off-by: Jon Mason <mason@myri.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
|
|
|
pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev));
|
|
|
|
|
|
|
|
pcie_write_mps(dev, mps);
|
|
|
|
pcie_write_mrrs(dev, mps);
|
|
|
|
|
2011-09-09 05:41:18 +08:00
|
|
|
dev_dbg(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n",
|
PCI: Set PCI-E Max Payload Size on fabric
On a given PCI-E fabric, each device, bridge, and root port can have a
different PCI-E maximum payload size. There is a sizable performance
boost for having the largest possible maximum payload size on each PCI-E
device. However, if improperly configured, fatal bus errors can occur.
Thus, it is important to ensure that PCI-E payloads sends by a device
are never larger than the MPS setting of all devices on the way to the
destination.
This can be achieved two ways:
- A conservative approach is to use the smallest common denominator of
the entire tree below a root complex for every device on that fabric.
This means for example that having a 128 bytes MPS USB controller on one
leg of a switch will dramatically reduce performances of a video card or
10GE adapter on another leg of that same switch.
It also means that any hierarchy supporting hotplug slots (including
expresscard or thunderbolt I suppose, dbl check that) will have to be
entirely clamped to 128 bytes since we cannot predict what will be
plugged into those slots, and we cannot change the MPS on a "live"
system.
- A more optimal way is possible, if it falls within a couple of
constraints:
* The top-level host bridge will never generate packets larger than the
smallest TLP (or if it can be controlled independently from its MPS at
least)
* The device will never generate packets larger than MPS (which can be
configured via MRRS)
* No support of direct PCI-E <-> PCI-E transfers between devices without
some additional code to specifically deal with that case
Then we can use an approach that basically ignores downstream requests
and focuses exclusively on upstream requests. In that case, all we need
to care about is that a device MPS is no larger than its parent MPS,
which allows us to keep all switches/bridges to the max MPS supported by
their parent and eventually the PHB.
In this case, your USB controller would no longer "starve" your 10GE
Ethernet and your hotplug slots won't affect your global MPS.
Additionally, the hotplugged devices themselves can be configured to a
larger MPS up to the value configured in the hotplug bridge.
To choose between the two available options, two PCI kernel boot args
have been added to the PCI calls. "pcie_bus_safe" will provide the
former behavior, while "pcie_bus_perf" will perform the latter behavior.
By default, the latter behavior is used.
NOTE: due to the location of the enablement, each arch will need to add
calls to this function. This patch only enables x86.
This patch includes a number of changes recommended by Benjamin
Herrenschmidt.
Tested-by: Jordan_Hargrave@dell.com
Signed-off-by: Jon Mason <mason@myri.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
|
|
|
pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev));
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* pcie_bus_configure_settings requires that pci_walk_bus work in a top-down,
|
|
|
|
* parents then children fashion. If this changes, then this code will not
|
|
|
|
* work as designed.
|
|
|
|
*/
|
|
|
|
void pcie_bus_configure_settings(struct pci_bus *bus, u8 mpss)
|
|
|
|
{
|
2011-10-03 22:50:20 +08:00
|
|
|
u8 smpss;
|
PCI: Set PCI-E Max Payload Size on fabric
On a given PCI-E fabric, each device, bridge, and root port can have a
different PCI-E maximum payload size. There is a sizable performance
boost for having the largest possible maximum payload size on each PCI-E
device. However, if improperly configured, fatal bus errors can occur.
Thus, it is important to ensure that PCI-E payloads sends by a device
are never larger than the MPS setting of all devices on the way to the
destination.
This can be achieved two ways:
- A conservative approach is to use the smallest common denominator of
the entire tree below a root complex for every device on that fabric.
This means for example that having a 128 bytes MPS USB controller on one
leg of a switch will dramatically reduce performances of a video card or
10GE adapter on another leg of that same switch.
It also means that any hierarchy supporting hotplug slots (including
expresscard or thunderbolt I suppose, dbl check that) will have to be
entirely clamped to 128 bytes since we cannot predict what will be
plugged into those slots, and we cannot change the MPS on a "live"
system.
- A more optimal way is possible, if it falls within a couple of
constraints:
* The top-level host bridge will never generate packets larger than the
smallest TLP (or if it can be controlled independently from its MPS at
least)
* The device will never generate packets larger than MPS (which can be
configured via MRRS)
* No support of direct PCI-E <-> PCI-E transfers between devices without
some additional code to specifically deal with that case
Then we can use an approach that basically ignores downstream requests
and focuses exclusively on upstream requests. In that case, all we need
to care about is that a device MPS is no larger than its parent MPS,
which allows us to keep all switches/bridges to the max MPS supported by
their parent and eventually the PHB.
In this case, your USB controller would no longer "starve" your 10GE
Ethernet and your hotplug slots won't affect your global MPS.
Additionally, the hotplugged devices themselves can be configured to a
larger MPS up to the value configured in the hotplug bridge.
To choose between the two available options, two PCI kernel boot args
have been added to the PCI calls. "pcie_bus_safe" will provide the
former behavior, while "pcie_bus_perf" will perform the latter behavior.
By default, the latter behavior is used.
NOTE: due to the location of the enablement, each arch will need to add
calls to this function. This patch only enables x86.
This patch includes a number of changes recommended by Benjamin
Herrenschmidt.
Tested-by: Jordan_Hargrave@dell.com
Signed-off-by: Jon Mason <mason@myri.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
|
|
|
|
|
|
|
if (!pci_is_pcie(bus->self))
|
|
|
|
return;
|
|
|
|
|
2011-10-03 22:50:20 +08:00
|
|
|
if (pcie_bus_config == PCIE_BUS_TUNE_OFF)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* FIXME - Peer to peer DMA is possible, though the endpoint would need
|
|
|
|
* to be aware to the MPS of the destination. To work around this,
|
|
|
|
* simply force the MPS of the entire system to the smallest possible.
|
|
|
|
*/
|
|
|
|
if (pcie_bus_config == PCIE_BUS_PEER2PEER)
|
|
|
|
smpss = 0;
|
|
|
|
|
PCI: Set PCI-E Max Payload Size on fabric
On a given PCI-E fabric, each device, bridge, and root port can have a
different PCI-E maximum payload size. There is a sizable performance
boost for having the largest possible maximum payload size on each PCI-E
device. However, if improperly configured, fatal bus errors can occur.
Thus, it is important to ensure that PCI-E payloads sends by a device
are never larger than the MPS setting of all devices on the way to the
destination.
This can be achieved two ways:
- A conservative approach is to use the smallest common denominator of
the entire tree below a root complex for every device on that fabric.
This means for example that having a 128 bytes MPS USB controller on one
leg of a switch will dramatically reduce performances of a video card or
10GE adapter on another leg of that same switch.
It also means that any hierarchy supporting hotplug slots (including
expresscard or thunderbolt I suppose, dbl check that) will have to be
entirely clamped to 128 bytes since we cannot predict what will be
plugged into those slots, and we cannot change the MPS on a "live"
system.
- A more optimal way is possible, if it falls within a couple of
constraints:
* The top-level host bridge will never generate packets larger than the
smallest TLP (or if it can be controlled independently from its MPS at
least)
* The device will never generate packets larger than MPS (which can be
configured via MRRS)
* No support of direct PCI-E <-> PCI-E transfers between devices without
some additional code to specifically deal with that case
Then we can use an approach that basically ignores downstream requests
and focuses exclusively on upstream requests. In that case, all we need
to care about is that a device MPS is no larger than its parent MPS,
which allows us to keep all switches/bridges to the max MPS supported by
their parent and eventually the PHB.
In this case, your USB controller would no longer "starve" your 10GE
Ethernet and your hotplug slots won't affect your global MPS.
Additionally, the hotplugged devices themselves can be configured to a
larger MPS up to the value configured in the hotplug bridge.
To choose between the two available options, two PCI kernel boot args
have been added to the PCI calls. "pcie_bus_safe" will provide the
former behavior, while "pcie_bus_perf" will perform the latter behavior.
By default, the latter behavior is used.
NOTE: due to the location of the enablement, each arch will need to add
calls to this function. This patch only enables x86.
This patch includes a number of changes recommended by Benjamin
Herrenschmidt.
Tested-by: Jordan_Hargrave@dell.com
Signed-off-by: Jon Mason <mason@myri.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
|
|
|
if (pcie_bus_config == PCIE_BUS_SAFE) {
|
2011-10-03 22:50:20 +08:00
|
|
|
smpss = mpss;
|
|
|
|
|
PCI: Set PCI-E Max Payload Size on fabric
On a given PCI-E fabric, each device, bridge, and root port can have a
different PCI-E maximum payload size. There is a sizable performance
boost for having the largest possible maximum payload size on each PCI-E
device. However, if improperly configured, fatal bus errors can occur.
Thus, it is important to ensure that PCI-E payloads sends by a device
are never larger than the MPS setting of all devices on the way to the
destination.
This can be achieved two ways:
- A conservative approach is to use the smallest common denominator of
the entire tree below a root complex for every device on that fabric.
This means for example that having a 128 bytes MPS USB controller on one
leg of a switch will dramatically reduce performances of a video card or
10GE adapter on another leg of that same switch.
It also means that any hierarchy supporting hotplug slots (including
expresscard or thunderbolt I suppose, dbl check that) will have to be
entirely clamped to 128 bytes since we cannot predict what will be
plugged into those slots, and we cannot change the MPS on a "live"
system.
- A more optimal way is possible, if it falls within a couple of
constraints:
* The top-level host bridge will never generate packets larger than the
smallest TLP (or if it can be controlled independently from its MPS at
least)
* The device will never generate packets larger than MPS (which can be
configured via MRRS)
* No support of direct PCI-E <-> PCI-E transfers between devices without
some additional code to specifically deal with that case
Then we can use an approach that basically ignores downstream requests
and focuses exclusively on upstream requests. In that case, all we need
to care about is that a device MPS is no larger than its parent MPS,
which allows us to keep all switches/bridges to the max MPS supported by
their parent and eventually the PHB.
In this case, your USB controller would no longer "starve" your 10GE
Ethernet and your hotplug slots won't affect your global MPS.
Additionally, the hotplugged devices themselves can be configured to a
larger MPS up to the value configured in the hotplug bridge.
To choose between the two available options, two PCI kernel boot args
have been added to the PCI calls. "pcie_bus_safe" will provide the
former behavior, while "pcie_bus_perf" will perform the latter behavior.
By default, the latter behavior is used.
NOTE: due to the location of the enablement, each arch will need to add
calls to this function. This patch only enables x86.
This patch includes a number of changes recommended by Benjamin
Herrenschmidt.
Tested-by: Jordan_Hargrave@dell.com
Signed-off-by: Jon Mason <mason@myri.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
|
|
|
pcie_find_smpss(bus->self, &smpss);
|
|
|
|
pci_walk_bus(bus, pcie_find_smpss, &smpss);
|
|
|
|
}
|
|
|
|
|
|
|
|
pcie_bus_configure_set(bus->self, &smpss);
|
|
|
|
pci_walk_bus(bus, pcie_bus_configure_set, &smpss);
|
|
|
|
}
|
2011-08-02 13:01:18 +08:00
|
|
|
EXPORT_SYMBOL_GPL(pcie_bus_configure_settings);
|
PCI: Set PCI-E Max Payload Size on fabric
On a given PCI-E fabric, each device, bridge, and root port can have a
different PCI-E maximum payload size. There is a sizable performance
boost for having the largest possible maximum payload size on each PCI-E
device. However, if improperly configured, fatal bus errors can occur.
Thus, it is important to ensure that PCI-E payloads sent by a device
are never larger than the MPS setting of all devices on the way to the
destination.
This can be achieved two ways:
- A conservative approach is to use the smallest common denominator of
the entire tree below a root complex for every device on that fabric.
This means, for example, that having a 128-byte MPS USB controller on one
leg of a switch will dramatically reduce the performance of a video card or
10GE adapter on another leg of that same switch.
It also means that any hierarchy supporting hotplug slots (including
expresscard or thunderbolt I suppose, dbl check that) will have to be
entirely clamped to 128 bytes since we cannot predict what will be
plugged into those slots, and we cannot change the MPS on a "live"
system.
- A more optimal way is possible, if it falls within a couple of
constraints:
* The top-level host bridge will never generate packets larger than the
smallest TLP (or if it can be controlled independently from its MPS at
least)
* The device will never generate packets larger than MPS (which can be
configured via MRRS)
* No support of direct PCI-E <-> PCI-E transfers between devices without
some additional code to specifically deal with that case
Then we can use an approach that basically ignores downstream requests
and focuses exclusively on upstream requests. In that case, all we need
to care about is that a device MPS is no larger than its parent MPS,
which allows us to keep all switches/bridges to the max MPS supported by
their parent and eventually the PHB.
In this case, your USB controller would no longer "starve" your 10GE
Ethernet and your hotplug slots won't affect your global MPS.
Additionally, the hotplugged devices themselves can be configured to a
larger MPS up to the value configured in the hotplug bridge.
To choose between the two available options, two PCI kernel boot args
have been added to the PCI calls. "pcie_bus_safe" will provide the
former behavior, while "pcie_bus_perf" will perform the latter behavior.
By default, the latter behavior is used.
NOTE: due to the location of the enablement, each arch will need to add
calls to this function. This patch only enables x86.
This patch includes a number of changes recommended by Benjamin
Herrenschmidt.
Tested-by: Jordan_Hargrave@dell.com
Signed-off-by: Jon Mason <mason@myri.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
|
|
|
|
2008-02-17 17:45:28 +08:00
|
|
|
/*
 * Scan @bus for devices, run the arch fixups once, then look behind every
 * PCI-to-PCI and CardBus bridge found on it.
 *
 * Returns the highest bus number reached while scanning, starting from
 * bus->secondary and including any range reserved for SR-IOV.
 */
unsigned int __devinit pci_scan_child_bus(struct pci_bus *bus)
{
	unsigned int slot_devfn, round;
	unsigned int max = bus->secondary;
	struct pci_dev *pdev;

	dev_dbg(&bus->dev, "scanning bus\n");

	/* Go find them, Rover!  One probe per slot: devfn advances by 8. */
	slot_devfn = 0;
	while (slot_devfn < 0x100) {
		pci_scan_slot(bus, slot_devfn);
		slot_devfn += 8;
	}

	/* Reserve buses for SR-IOV capability. */
	max += pci_iov_bus_range(bus);

	/*
	 * Arch-dependent fixups run only while the bus is not yet marked as
	 * added; only a root bus gets marked here.
	 */
	if (!bus->is_added) {
		dev_dbg(&bus->dev, "fixups for bus\n");
		pcibios_fixup_bus(bus);
		if (pci_is_root_bus(bus))
			bus->is_added = 1;
	}

	/* Two rounds over the device list; the round number is passed on. */
	for (round = 0; round < 2; round++) {
		list_for_each_entry(pdev, &bus->devices, bus_list) {
			switch (pdev->hdr_type) {
			case PCI_HEADER_TYPE_BRIDGE:
			case PCI_HEADER_TYPE_CARDBUS:
				max = pci_scan_bridge(bus, pdev, max, round);
				break;
			default:
				break;
			}
		}
	}

	/*
	 * The bus and everything behind its bridges has now been scanned.
	 * Report how far we got finding sub-buses.
	 */
	dev_dbg(&bus->dev, "bus scan returning with max=%02x\n", max);
	return max;
}
|
|
|
|
|
2007-03-27 13:53:30 +08:00
|
|
|
struct pci_bus * pci_create_bus(struct device *parent,
|
2005-09-06 07:31:03 +08:00
|
|
|
int bus, struct pci_ops *ops, void *sysdata)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
int error;
|
2009-11-05 01:32:52 +08:00
|
|
|
struct pci_bus *b, *b2;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct device *dev;
|
|
|
|
|
|
|
|
b = pci_alloc_bus();
|
|
|
|
if (!b)
|
|
|
|
return NULL;
|
|
|
|
|
2009-03-16 03:14:37 +08:00
|
|
|
dev = kzalloc(sizeof(*dev), GFP_KERNEL);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!dev){
|
|
|
|
kfree(b);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
b->sysdata = sysdata;
|
|
|
|
b->ops = ops;
|
|
|
|
|
2009-11-05 01:32:52 +08:00
|
|
|
b2 = pci_find_bus(pci_domain_nr(b), bus);
|
|
|
|
if (b2) {
|
2005-04-17 06:20:36 +08:00
|
|
|
/* If we already got to this bus through a different bridge, ignore it */
|
2009-11-05 01:32:52 +08:00
|
|
|
dev_dbg(&b2->dev, "bus already known\n");
|
2005-04-17 06:20:36 +08:00
|
|
|
goto err_out;
|
|
|
|
}
|
2006-06-02 12:35:43 +08:00
|
|
|
|
|
|
|
down_write(&pci_bus_sem);
|
2005-04-17 06:20:36 +08:00
|
|
|
list_add_tail(&b->node, &pci_root_buses);
|
2006-06-02 12:35:43 +08:00
|
|
|
up_write(&pci_bus_sem);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
dev->parent = parent;
|
|
|
|
dev->release = pci_release_bus_bridge_dev;
|
2008-10-30 09:17:49 +08:00
|
|
|
dev_set_name(dev, "pci%04x:%02x", pci_domain_nr(b), bus);
|
2005-04-17 06:20:36 +08:00
|
|
|
error = device_register(dev);
|
|
|
|
if (error)
|
|
|
|
goto dev_reg_err;
|
|
|
|
b->bridge = get_device(dev);
|
2010-02-09 02:16:33 +08:00
|
|
|
device_enable_async_suspend(b->bridge);
|
2011-04-11 09:37:07 +08:00
|
|
|
pci_set_bus_of_node(b);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-02-19 19:20:41 +08:00
|
|
|
if (!parent)
|
|
|
|
set_dev_node(b->bridge, pcibus_to_node(b));
|
|
|
|
|
2007-05-23 10:47:54 +08:00
|
|
|
b->dev.class = &pcibus_class;
|
|
|
|
b->dev.parent = b->bridge;
|
2008-10-30 09:17:49 +08:00
|
|
|
dev_set_name(&b->dev, "%04x:%02x", pci_domain_nr(b), bus);
|
2007-05-23 10:47:54 +08:00
|
|
|
error = device_register(&b->dev);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (error)
|
|
|
|
goto class_dev_reg_err;
|
|
|
|
|
|
|
|
/* Create legacy_io and legacy_mem files for this bus */
|
|
|
|
pci_create_legacy_files(b);
|
|
|
|
|
|
|
|
b->number = b->secondary = bus;
|
|
|
|
b->resource[0] = &ioport_resource;
|
|
|
|
b->resource[1] = &iomem_resource;
|
|
|
|
|
|
|
|
return b;
|
|
|
|
|
|
|
|
class_dev_reg_err:
|
|
|
|
device_unregister(dev);
|
|
|
|
dev_reg_err:
|
2006-06-02 12:35:43 +08:00
|
|
|
down_write(&pci_bus_sem);
|
2005-04-17 06:20:36 +08:00
|
|
|
list_del(&b->node);
|
2006-06-02 12:35:43 +08:00
|
|
|
up_write(&pci_bus_sem);
|
2005-04-17 06:20:36 +08:00
|
|
|
err_out:
|
|
|
|
kfree(dev);
|
|
|
|
kfree(b);
|
|
|
|
return NULL;
|
|
|
|
}
|
2005-09-06 07:31:03 +08:00
|
|
|
|
2008-02-17 17:45:28 +08:00
|
|
|
/*
 * Create the bus numbered @bus under @parent and scan everything on it.
 *
 * On success the bus's subordinate field is set to the value returned by
 * pci_scan_child_bus().  Returns the new bus, or NULL if it could not be
 * created.
 */
struct pci_bus * __devinit pci_scan_bus_parented(struct device *parent,
		int bus, struct pci_ops *ops, void *sysdata)
{
	struct pci_bus *created = pci_create_bus(parent, bus, ops, sysdata);

	if (!created)
		return NULL;

	created->subordinate = pci_scan_child_bus(created);
	return created;
}
|
2005-04-17 06:20:36 +08:00
|
|
|
EXPORT_SYMBOL(pci_scan_bus_parented);
|
|
|
|
|
|
|
|
#ifdef CONFIG_HOTPLUG
|
2009-03-21 04:56:25 +08:00
|
|
|
/**
|
|
|
|
* pci_rescan_bus - scan a PCI bus for devices.
|
|
|
|
* @bus: PCI bus to scan
|
|
|
|
*
|
|
|
|
* Scan a PCI bus and child buses for new devices, adds them,
|
|
|
|
* and enables them.
|
|
|
|
*
|
|
|
|
* Returns the max number of subordinate bus discovered.
|
|
|
|
*/
|
2009-04-02 08:24:12 +08:00
|
|
|
unsigned int __ref pci_rescan_bus(struct pci_bus *bus)
|
2009-03-21 04:56:25 +08:00
|
|
|
{
|
|
|
|
unsigned int max;
|
|
|
|
struct pci_dev *dev;
|
|
|
|
|
|
|
|
max = pci_scan_child_bus(bus);
|
|
|
|
|
2009-03-21 04:56:31 +08:00
|
|
|
down_read(&pci_bus_sem);
|
2009-03-21 04:56:25 +08:00
|
|
|
list_for_each_entry(dev, &bus->devices, bus_list)
|
|
|
|
if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
|
|
|
|
dev->hdr_type == PCI_HEADER_TYPE_CARDBUS)
|
|
|
|
if (dev->subordinate)
|
|
|
|
pci_bus_size_bridges(dev->subordinate);
|
2009-03-21 04:56:31 +08:00
|
|
|
up_read(&pci_bus_sem);
|
2009-03-21 04:56:25 +08:00
|
|
|
|
|
|
|
pci_bus_assign_resources(bus);
|
|
|
|
pci_enable_bridges(bus);
|
|
|
|
pci_bus_add_devices(bus);
|
|
|
|
|
|
|
|
return max;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(pci_rescan_bus);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
EXPORT_SYMBOL(pci_add_new_bus);
|
|
|
|
EXPORT_SYMBOL(pci_scan_slot);
|
|
|
|
EXPORT_SYMBOL(pci_scan_bridge);
|
|
|
|
EXPORT_SYMBOL_GPL(pci_scan_child_bus);
|
|
|
|
#endif
|
PCI: optionally sort device lists breadth-first
Problem:
New Dell PowerEdge servers have 2 embedded ethernet ports, which are
labeled NIC1 and NIC2 on the chassis, in the BIOS setup screens, and
in the printed documentation. Assuming no other add-in ethernet ports
in the system, Linux 2.4 kernels name these eth0 and eth1
respectively. Many people have come to expect this naming. Linux 2.6
kernels name these eth1 and eth0 respectively (backwards from
expectations). I also have reports that various Sun and HP servers
have similar behavior.
Root cause:
Linux 2.4 kernels walk the pci_devices list, which happens to be
sorted in breadth-first order (or pcbios_find_device order on i386,
which most often is breadth-first also). 2.6 kernels have both the
pci_devices list and the pci_bus_type.klist_devices list, the latter
is what is walked at driver load time to match the pci_id tables; this
klist happens to be in depth-first order.
On systems where, for physical routing reasons, NIC1 appears on a
lower bus number than NIC2, but NIC2's bridge is discovered first in
the depth-first ordering, NIC2 will be discovered before NIC1. If the
list were sorted breadth-first, NIC1 would be discovered before NIC2.
A PowerEdge 1955 system has the following topology which easily
exhibits the difference between depth-first and breadth-first device
lists.
-[0000:00]-+-00.0 Intel Corporation 5000P Chipset Memory Controller Hub
+-02.0-[0000:03-08]--+-00.0-[0000:04-07]--+-00.0-[0000:05-06]----00.0-[0000:06]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC2, 2.4 kernel name eth1, 2.6 kernel name eth0)
+-1c.0-[0000:01-02]----00.0-[0000:02]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC1, 2.4 kernel name eth0, 2.6 kernel name eth1)
Other factors, such as device driver load order and the presence of
PCI slots at various points in the bus hierarchy further complicate
this problem; I'm not trying to solve those here, just restore the
device order, and thus basic behavior, that 2.4 kernels had.
Solution:
The solution can come in multiple steps.
Suggested fix #1: kernel
Patch below optionally sorts the two device lists into breadth-first
ordering to maintain compatibility with 2.4 kernels. It adds two new
command line options:
pci=bfsort
pci=nobfsort
to force the sort order, or not, as you wish. It also adds DMI checks
for the specific Dell systems which exhibit "backwards" ordering, to
make them "right".
Suggested fix #2: udev rules from userland
Many people also have the expectation that embedded NICs are always
discovered before add-in NICs (which this patch does not try to do).
Using the PCI IRQ Routing Table provided by system BIOS, it's easy to
determine which PCI devices are embedded, or if add-in, which PCI slot
they're in. I'm working on a tool that would allow udev to name
ethernet devices in ascending embedded, slot 1 .. slot N order,
subsort by PCI bus/dev/fn breadth-first. It'll be possible to use it
independent of udev as well for those distributions that don't use
udev in their installers.
Suggested fix #3: system board routing rules
One can constrain the system board layout to put NIC1 ahead of NIC2
regardless of breadth-first or depth-first discovery order. This adds
a significant level of complexity to board routing, and may not be
possible in all instances (witness the above systems from several
major manufacturers). I don't want to encourage this particular train
of thought too far, at the expense of not doing #1 or #2 above.
Feedback appreciated. Patch tested on a Dell PowerEdge 1955 blade
with 2.6.18.
You'll also note I took some liberty and temporarily break the klist
abstraction to simplify and speed up the sort algorithm. I think
that's both safe and appropriate in this instance.
Signed-off-by: Matt Domsch <Matt_Domsch@dell.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2006-09-30 04:23:23 +08:00
|
|
|
|
2008-08-27 00:00:57 +08:00
|
|
|
/*
 * Three-way comparison of two PCI devices for the breadth-first sort.
 *
 * Devices are ordered by domain number, then by bus number, then by devfn.
 * Returns -1 if @d_a sorts first, 1 if @d_b sorts first, 0 if all three
 * keys are equal.
 */
static int __init pci_sort_bf_cmp(const struct device *d_a, const struct device *d_b)
{
	const struct pci_dev *lhs = to_pci_dev(d_a);
	const struct pci_dev *rhs = to_pci_dev(d_b);
	int dom_l = pci_domain_nr(lhs->bus);
	int dom_r = pci_domain_nr(rhs->bus);

	if (dom_l != dom_r)
		return dom_l < dom_r ? -1 : 1;

	if (lhs->bus->number != rhs->bus->number)
		return lhs->bus->number < rhs->bus->number ? -1 : 1;

	if (lhs->devfn != rhs->devfn)
		return lhs->devfn < rhs->devfn ? -1 : 1;

	return 0;
}
|
|
|
|
|
2008-02-15 06:56:56 +08:00
|
|
|
/**
 * pci_sort_breadthfirst - reorder the devices known to the PCI bus type
 *
 * Passes pci_bus_type to bus_sort_breadthfirst() together with
 * pci_sort_bf_cmp as the comparison callback, so the resulting order is
 * whatever that comparator defines.  Takes no arguments and returns
 * nothing.
 *
 * NOTE(review): the __init annotation presumably limits this to boot-time
 * use; confirm against the callers before invoking it any later.
 */
void __init pci_sort_breadthfirst(void)
|
PCI: optionally sort device lists breadth-first
Problem:
New Dell PowerEdge servers have 2 embedded ethernet ports, which are
labeled NIC1 and NIC2 on the chassis, in the BIOS setup screens, and
in the printed documentation. Assuming no other add-in ethernet ports
in the system, Linux 2.4 kernels name these eth0 and eth1
respectively. Many people have come to expect this naming. Linux 2.6
kernels name these eth1 and eth0 respectively (backwards from
expectations). I also have reports that various Sun and HP servers
have similar behavior.
Root cause:
Linux 2.4 kernels walk the pci_devices list, which happens to be
sorted in breadth-first order (or pcbios_find_device order on i386,
which most often is breadth-first also). 2.6 kernels have both the
pci_devices list and the pci_bus_type.klist_devices list, the latter
is what is walked at driver load time to match the pci_id tables; this
klist happens to be in depth-first order.
On systems where, for physical routing reasons, NIC1 appears on a
lower bus number than NIC2, but NIC2's bridge is discovered first in
the depth-first ordering, NIC2 will be discovered before NIC1. If the
list were sorted breadth-first, NIC1 would be discovered before NIC2.
A PowerEdge 1955 system has the following topology which easily
exhibits the difference between depth-first and breadth-first device
lists.
-[0000:00]-+-00.0 Intel Corporation 5000P Chipset Memory Controller Hub
+-02.0-[0000:03-08]--+-00.0-[0000:04-07]--+-00.0-[0000:05-06]----00.0-[0000:06]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC2, 2.4 kernel name eth1, 2.6 kernel name eth0)
+-1c.0-[0000:01-02]----00.0-[0000:02]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC1, 2.4 kernel name eth0, 2.6 kernel name eth1)
Other factors, such as device driver load order and the presence of
PCI slots at various points in the bus hierarchy further complicate
this problem; I'm not trying to solve those here, just restore the
device order, and thus basic behavior, that 2.4 kernels had.
Solution:
The solution can come in multiple steps.
Suggested fix #1: kernel
Patch below optionally sorts the two device lists into breadth-first
ordering to maintain compatibility with 2.4 kernels. It adds two new
command line options:
pci=bfsort
pci=nobfsort
to force the sort order, or not, as you wish. It also adds DMI checks
for the specific Dell systems which exhibit "backwards" ordering, to
make them "right".
Suggested fix #2: udev rules from userland
Many people also have the expectation that embedded NICs are always
discovered before add-in NICs (which this patch does not try to do).
Using the PCI IRQ Routing Table provided by system BIOS, it's easy to
determine which PCI devices are embedded, or if add-in, which PCI slot
they're in. I'm working on a tool that would allow udev to name
ethernet devices in ascending embedded, slot 1 .. slot N order,
subsort by PCI bus/dev/fn breadth-first. It'll be possible to use it
independent of udev as well for those distributions that don't use
udev in their installers.
Suggested fix #3: system board routing rules
One can constrain the system board layout to put NIC1 ahead of NIC2
regardless of breadth-first or depth-first discovery order. This adds
a significant level of complexity to board routing, and may not be
possible in all instances (witness the above systems from several
major manufacturers). I don't want to encourage this particular train
of thought too far, at the expense of not doing #1 or #2 above.
Feedback appreciated. Patch tested on a Dell PowerEdge 1955 blade
with 2.6.18.
You'll also note I took some liberty and temporarily break the klist
abstraction to simplify and speed up the sort algorithm. I think
that's both safe and appropriate in this instance.
Signed-off-by: Matt Domsch <Matt_Domsch@dell.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2006-09-30 04:23:23 +08:00
|
|
|
{
|
2008-08-27 00:00:57 +08:00
|
|
|
bus_sort_breadthfirst(&pci_bus_type, &pci_sort_bf_cmp);
|
PCI: optionally sort device lists breadth-first
Problem:
New Dell PowerEdge servers have 2 embedded ethernet ports, which are
labeled NIC1 and NIC2 on the chassis, in the BIOS setup screens, and
in the printed documentation. Assuming no other add-in ethernet ports
in the system, Linux 2.4 kernels name these eth0 and eth1
respectively. Many people have come to expect this naming. Linux 2.6
kernels name these eth1 and eth0 respectively (backwards from
expectations). I also have reports that various Sun and HP servers
have similar behavior.
Root cause:
Linux 2.4 kernels walk the pci_devices list, which happens to be
sorted in breadth-first order (or pcbios_find_device order on i386,
which most often is breadth-first also). 2.6 kernels have both the
pci_devices list and the pci_bus_type.klist_devices list, the latter
is what is walked at driver load time to match the pci_id tables; this
klist happens to be in depth-first order.
On systems where, for physical routing reasons, NIC1 appears on a
lower bus number than NIC2, but NIC2's bridge is discovered first in
the depth-first ordering, NIC2 will be discovered before NIC1. If the
list were sorted breadth-first, NIC1 would be discovered before NIC2.
A PowerEdge 1955 system has the following topology which easily
exhibits the difference between depth-first and breadth-first device
lists.
-[0000:00]-+-00.0 Intel Corporation 5000P Chipset Memory Controller Hub
+-02.0-[0000:03-08]--+-00.0-[0000:04-07]--+-00.0-[0000:05-06]----00.0-[0000:06]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC2, 2.4 kernel name eth1, 2.6 kernel name eth0)
+-1c.0-[0000:01-02]----00.0-[0000:02]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC1, 2.4 kernel name eth0, 2.6 kernel name eth1)
Other factors, such as device driver load order and the presence of
PCI slots at various points in the bus hierarchy further complicate
this problem; I'm not trying to solve those here, just restore the
device order, and thus basic behavior, that 2.4 kernels had.
Solution:
The solution can come in multiple steps.
Suggested fix #1: kernel
Patch below optionally sorts the two device lists into breadth-first
ordering to maintain compatibility with 2.4 kernels. It adds two new
command line options:
pci=bfsort
pci=nobfsort
to force the sort order, or not, as you wish. It also adds DMI checks
for the specific Dell systems which exhibit "backwards" ordering, to
make them "right".
Suggested fix #2: udev rules from userland
Many people also have the expectation that embedded NICs are always
discovered before add-in NICs (which this patch does not try to do).
Using the PCI IRQ Routing Table provided by system BIOS, it's easy to
determine which PCI devices are embedded, or if add-in, which PCI slot
they're in. I'm working on a tool that would allow udev to name
ethernet devices in ascending embedded, slot 1 .. slot N order,
subsort by PCI bus/dev/fn breadth-first. It'll be possible to use it
independent of udev as well for those distributions that don't use
udev in their installers.
Suggested fix #3: system board routing rules
One can constrain the system board layout to put NIC1 ahead of NIC2
regardless of breadth-first or depth-first discovery order. This adds
a significant level of complexity to board routing, and may not be
possible in all instances (witness the above systems from several
major manufacturers). I don't want to encourage this particular train
of thought too far, at the expense of not doing #1 or #2 above.
Feedback appreciated. Patch tested on a Dell PowerEdge 1955 blade
with 2.6.18.
You'll also note I took some liberty and temporarily break the klist
abstraction to simplify and speed up the sort algorithm. I think
that's both safe and appropriate in this instance.
Signed-off-by: Matt Domsch <Matt_Domsch@dell.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2006-09-30 04:23:23 +08:00
|
|
|
}
|