Merge branch 'x86/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'x86/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (160 commits)
  x86: remove extra calling to get ext cpuid level
  x86: use setup_clear_cpu_cap() when disabling the lapic
  KVM: fix exception entry / build bug, on 64-bit
  x86: add unknown_nmi_panic kernel parameter
  x86, VisWS: turn into generic arch, eliminate leftover files
  x86: add ->pre_time_init to x86_quirks
  x86: extend and use x86_quirks to clean up NUMAQ code
  x86: introduce x86_quirks
  x86: improve debug printout: add target bootmem range in early_res_to_bootmem()
  Subject: devmem, x86: fix rename of CONFIG_NONPROMISC_DEVMEM
  x86: remove arch_get_ram_range
  x86: Add a debugfs interface to dump PAT memtype
  x86: Add a arch directory for x86 under debugfs
  x86: i386: reduce boot fixmap space
  i386/xen: add proper unwind annotations to xen_sysenter_target
  x86: reduce force_mwait visibility
  x86: reduce forbid_dac's visibility
  x86: fix two modpost warnings
  x86: check function status in EDD boot code
  x86_64: ia32_signal.c: remove signal number conversion
  ...
This commit is contained in:
Linus Torvalds 2008-07-21 10:34:25 -07:00
commit 72a73693aa
152 changed files with 4312 additions and 1773 deletions

View File

@ -1206,7 +1206,7 @@ and is between 256 and 4096 characters. It is defined in the file
or
memmap=0x10000$0x18690000
memtest= [KNL,X86_64] Enable memtest
memtest= [KNL,X86] Enable memtest
Format: <integer>
range: 0,4 : pattern number
default : 0 <disable>
@ -2158,6 +2158,10 @@ and is between 256 and 4096 characters. It is defined in the file
Note that genuine overcurrent events won't be
reported either.
unknown_nmi_panic
[X86-32,X86-64]
Set unknown_nmi_panic=1 early on boot.
usbcore.autosuspend=
[USB] The autosuspend time delay (in seconds) used
for newly-detected USB devices (default 2). This

View File

@ -447,7 +447,6 @@ config PARAVIRT_DEBUG
config MEMTEST
bool "Memtest"
depends on X86_64
help
This option adds a kernel parameter 'memtest', which allows memtest
to be set.

View File

@ -362,10 +362,6 @@ config X86_ALIGNMENT_16
def_bool y
depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
config X86_GOOD_APIC
def_bool y
depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 || MVIAC7 || X86_64
config X86_INTEL_USERCOPY
def_bool y
depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2

View File

@ -5,13 +5,15 @@ config TRACE_IRQFLAGS_SUPPORT
source "lib/Kconfig.debug"
config NONPROMISC_DEVMEM
config STRICT_DEVMEM
bool "Filter access to /dev/mem"
help
If this option is left off, you allow userspace access to all
If this option is left on, you allow userspace (root) access to all
of memory, including kernel and userspace memory. Accidental
access to this is obviously disastrous, but specific access can
be used by people debugging the kernel.
be used by people debugging the kernel. Note that with PAT support
enabled, even in this case there are restrictions on /dev/mem
use due to the cache aliasing requirements.
If this option is switched on, the /dev/mem file only allows
userspace access to PCI space and the BIOS code and data regions.
@ -287,7 +289,6 @@ config CPA_DEBUG
config OPTIMIZE_INLINING
bool "Allow gcc to uninline functions marked 'inline'"
depends on BROKEN
help
This option determines if the kernel forces gcc to inline the functions
developers have marked 'inline'. Doing so takes away freedom from gcc to
@ -298,5 +299,7 @@ config OPTIMIZE_INLINING
become the default in the future, until then this option is there to
test gcc for this.
If unsure, say N.
endmenu

View File

@ -167,9 +167,8 @@ void query_edd(void)
* Scan the BIOS-supported hard disks and query EDD
* information...
*/
get_edd_info(devno, &ei);
if (boot_params.eddbuf_entries < EDDMAXNR) {
if (!get_edd_info(devno, &ei)
&& boot_params.eddbuf_entries < EDDMAXNR) {
memcpy(edp, &ei, sizeof ei);
edp++;
boot_params.eddbuf_entries++;

View File

@ -98,12 +98,6 @@ static void reset_coprocessor(void)
/*
* Set up the GDT
*/
#define GDT_ENTRY(flags, base, limit) \
(((u64)(base & 0xff000000) << 32) | \
((u64)flags << 40) | \
((u64)(limit & 0x00ff0000) << 32) | \
((u64)(base & 0x00ffffff) << 16) | \
((u64)(limit & 0x0000ffff)))
struct gdt_ptr {
u16 len;

View File

@ -2047,7 +2047,7 @@ CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
# CONFIG_SAMPLES is not set
# CONFIG_KGDB is not set
CONFIG_HAVE_ARCH_KGDB=y
# CONFIG_NONPROMISC_DEVMEM is not set
# CONFIG_STRICT_DEVMEM is not set
CONFIG_EARLY_PRINTK=y
CONFIG_DEBUG_STACKOVERFLOW=y
CONFIG_DEBUG_STACK_USAGE=y

View File

@ -2012,7 +2012,7 @@ CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
# CONFIG_SAMPLES is not set
# CONFIG_KGDB is not set
CONFIG_HAVE_ARCH_KGDB=y
# CONFIG_NONPROMISC_DEVMEM is not set
# CONFIG_STRICT_DEVMEM is not set
CONFIG_EARLY_PRINTK=y
CONFIG_DEBUG_STACKOVERFLOW=y
CONFIG_DEBUG_STACK_USAGE=y

View File

@ -36,6 +36,11 @@
#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
X86_EFLAGS_CF)
asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset);
void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
@ -248,7 +253,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
regs->ss |= 3;
err |= __get_user(tmpflags, &sc->flags);
regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5);
regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
/* disable syscall checks */
regs->orig_ax = -1;
@ -515,7 +520,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
compat_sigset_t *set, struct pt_regs *regs)
{
struct rt_sigframe __user *frame;
struct exec_domain *ed = current_thread_info()->exec_domain;
void __user *restorer;
int err = 0;
@ -538,8 +542,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
goto give_sigsegv;
err |= __put_user((ed && ed->signal_invmap && sig < 32
? ed->signal_invmap[sig] : sig), &frame->sig);
err |= __put_user(sig, &frame->sig);
err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
err |= copy_siginfo_to_user32(&frame->info, info);

View File

@ -37,6 +37,11 @@
movq %rax,R8(%rsp)
.endm
/*
* Reload arg registers from stack in case ptrace changed them.
* We don't reload %eax because syscall_trace_enter() returned
* the value it wants us to use in the table lookup.
*/
.macro LOAD_ARGS32 offset
movl \offset(%rsp),%r11d
movl \offset+8(%rsp),%r10d
@ -46,7 +51,6 @@
movl \offset+48(%rsp),%edx
movl \offset+56(%rsp),%esi
movl \offset+64(%rsp),%edi
movl \offset+72(%rsp),%eax
.endm
.macro CFI_STARTPROC32 simple
@ -137,13 +141,12 @@ ENTRY(ia32_sysenter_target)
.previous
GET_THREAD_INFO(%r10)
orl $TS_COMPAT,TI_status(%r10)
testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
TI_flags(%r10)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
CFI_REMEMBER_STATE
jnz sysenter_tracesys
sysenter_do_call:
cmpl $(IA32_NR_syscalls-1),%eax
ja ia32_badsys
sysenter_do_call:
IA32_ARG_FIXUP 1
call *ia32_sys_call_table(,%rax,8)
movq %rax,RAX-ARGOFFSET(%rsp)
@ -242,8 +245,7 @@ ENTRY(ia32_cstar_target)
.previous
GET_THREAD_INFO(%r10)
orl $TS_COMPAT,TI_status(%r10)
testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
TI_flags(%r10)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
CFI_REMEMBER_STATE
jnz cstar_tracesys
cstar_do_call:
@ -321,6 +323,7 @@ ENTRY(ia32_syscall)
/*CFI_REL_OFFSET rflags,EFLAGS-RIP*/
/*CFI_REL_OFFSET cs,CS-RIP*/
CFI_REL_OFFSET rip,RIP-RIP
PARAVIRT_ADJUST_EXCEPTION_FRAME
SWAPGS
/*
* No need to follow this irqs on/off section: the syscall
@ -336,8 +339,7 @@ ENTRY(ia32_syscall)
SAVE_ARGS 0,0,1
GET_THREAD_INFO(%r10)
orl $TS_COMPAT,TI_status(%r10)
testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
TI_flags(%r10)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
jnz ia32_tracesys
ia32_do_syscall:
cmpl $(IA32_NR_syscalls-1),%eax

View File

@ -7,9 +7,10 @@ extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu
CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
ifdef CONFIG_FTRACE
# Do not profile debug utilities
# Do not profile debug and lowlevel utilities
CFLAGS_REMOVE_tsc.o = -pg
CFLAGS_REMOVE_rtc.o = -pg
CFLAGS_REMOVE_paravirt.o = -pg
endif
#
@ -102,6 +103,7 @@ obj-$(CONFIG_OLPC) += olpc.o
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
obj-y += bios_uv.o
obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
obj-$(CONFIG_AUDIT) += audit_64.o

View File

@ -9,6 +9,7 @@
#include <linux/bootmem.h>
#include <linux/dmi.h>
#include <linux/cpumask.h>
#include <asm/segment.h>
#include "realmode/wakeup.h"
#include "sleep.h"
@ -23,15 +24,6 @@ static unsigned long acpi_realmode;
static char temp_stack[10240];
#endif
/* XXX: this macro should move to asm-x86/segment.h and be shared with the
boot code... */
#define GDT_ENTRY(flags, base, limit) \
(((u64)(base & 0xff000000) << 32) | \
((u64)flags << 40) | \
((u64)(limit & 0x00ff0000) << 32) | \
((u64)(base & 0x00ffffff) << 16) | \
((u64)(limit & 0x0000ffff)))
/**
* acpi_save_state_mem - save kernel state
*

View File

@ -23,7 +23,7 @@
#include <linux/scatterlist.h>
#include <linux/iommu-helper.h>
#include <asm/proto.h>
#include <asm/gart.h>
#include <asm/iommu.h>
#include <asm/amd_iommu_types.h>
#include <asm/amd_iommu.h>
@ -32,21 +32,37 @@
#define to_pages(addr, size) \
(round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
#define EXIT_LOOP_COUNT 10000000
static DEFINE_RWLOCK(amd_iommu_devtable_lock);
struct command {
/*
* general struct to manage commands send to an IOMMU
*/
struct iommu_cmd {
u32 data[4];
};
static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
struct unity_map_entry *e);
/* returns !0 if the IOMMU is caching non-present entries in its TLB */
static int iommu_has_npcache(struct amd_iommu *iommu)
{
return iommu->cap & IOMMU_CAP_NPCACHE;
}
static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
/****************************************************************************
*
* IOMMU command queuing functions
*
****************************************************************************/
/*
* Writes the command to the IOMMUs command buffer and informs the
* hardware about the new command. Must be called with iommu->lock held.
*/
static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
{
u32 tail, head;
u8 *target;
@ -63,7 +79,11 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
return 0;
}
static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
/*
* General queuing function for commands. Takes iommu->lock and calls
* __iommu_queue_command().
*/
static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
{
unsigned long flags;
int ret;
@ -75,16 +95,24 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
return ret;
}
/*
* This function is called whenever we need to ensure that the IOMMU has
* completed execution of all commands we sent. It sends a
* COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
* us about that by writing a value to a physical address we pass with
* the command.
*/
static int iommu_completion_wait(struct amd_iommu *iommu)
{
int ret;
struct command cmd;
struct iommu_cmd cmd;
volatile u64 ready = 0;
unsigned long ready_phys = virt_to_phys(&ready);
unsigned long i = 0;
memset(&cmd, 0, sizeof(cmd));
cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK;
cmd.data[1] = HIGH_U32(ready_phys);
cmd.data[1] = upper_32_bits(ready_phys);
cmd.data[2] = 1; /* value written to 'ready' */
CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
@ -95,15 +123,23 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
if (ret)
return ret;
while (!ready)
while (!ready && (i < EXIT_LOOP_COUNT)) {
++i;
cpu_relax();
}
if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
return 0;
}
/*
* Command send function for invalidating a device table entry
*/
static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
{
struct command cmd;
struct iommu_cmd cmd;
BUG_ON(iommu == NULL);
@ -116,20 +152,23 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
return iommu_queue_command(iommu, &cmd);
}
/*
* Generic command send function for invalidaing TLB entries
*/
static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
u64 address, u16 domid, int pde, int s)
{
struct command cmd;
struct iommu_cmd cmd;
memset(&cmd, 0, sizeof(cmd));
address &= PAGE_MASK;
CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
cmd.data[1] |= domid;
cmd.data[2] = LOW_U32(address);
cmd.data[3] = HIGH_U32(address);
if (s)
cmd.data[3] = upper_32_bits(address);
if (s) /* size bit - we flush more than one 4kb page */
cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
if (pde)
if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
iommu->need_sync = 1;
@ -137,6 +176,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
return iommu_queue_command(iommu, &cmd);
}
/*
* TLB invalidation function which is called from the mapping functions.
* It invalidates a single PTE if the range to flush is within a single
* page. Otherwise it flushes the whole TLB of the IOMMU.
*/
static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
u64 address, size_t size)
{
@ -159,6 +203,20 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
return 0;
}
/****************************************************************************
*
* The functions below are used the create the page table mappings for
* unity mapped regions.
*
****************************************************************************/
/*
* Generic mapping functions. It maps a physical address into a DMA
* address space. It allocates the page table pages if necessary.
* In the future it can be extended to a generic mapping function
* supporting all features of AMD IOMMU page tables like level skipping
* and full 64 bit address spaces.
*/
static int iommu_map(struct protection_domain *dom,
unsigned long bus_addr,
unsigned long phys_addr,
@ -209,6 +267,10 @@ static int iommu_map(struct protection_domain *dom,
return 0;
}
/*
* This function checks if a specific unity mapping entry is needed for
* this specific IOMMU.
*/
static int iommu_for_unity_map(struct amd_iommu *iommu,
struct unity_map_entry *entry)
{
@ -223,6 +285,12 @@ static int iommu_for_unity_map(struct amd_iommu *iommu,
return 0;
}
/*
* Init the unity mappings for a specific IOMMU in the system
*
* Basically iterates over all unity mapping entries and applies them to
* the default domain DMA of that IOMMU if necessary.
*/
static int iommu_init_unity_mappings(struct amd_iommu *iommu)
{
struct unity_map_entry *entry;
@ -239,6 +307,10 @@ static int iommu_init_unity_mappings(struct amd_iommu *iommu)
return 0;
}
/*
* This function actually applies the mapping to the page table of the
* dma_ops domain.
*/
static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
struct unity_map_entry *e)
{
@ -261,6 +333,9 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
return 0;
}
/*
* Inits the unity mappings required for a specific device
*/
static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
u16 devid)
{
@ -278,12 +353,26 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
return 0;
}
/****************************************************************************
*
* The next functions belong to the address allocator for the dma_ops
* interface functions. They work like the allocators in the other IOMMU
* drivers. Its basically a bitmap which marks the allocated pages in
* the aperture. Maybe it could be enhanced in the future to a more
* efficient allocator.
*
****************************************************************************/
static unsigned long dma_mask_to_pages(unsigned long mask)
{
return (mask >> PAGE_SHIFT) +
(PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
}
/*
* The address allocator core function.
*
* called with domain->lock held
*/
static unsigned long dma_ops_alloc_addresses(struct device *dev,
struct dma_ops_domain *dom,
unsigned int pages)
@ -317,6 +406,11 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
return address;
}
/*
* The address free function.
*
* called with domain->lock held
*/
static void dma_ops_free_addresses(struct dma_ops_domain *dom,
unsigned long address,
unsigned int pages)
@ -325,6 +419,16 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
iommu_area_free(dom->bitmap, address, pages);
}
/****************************************************************************
*
* The next functions belong to the domain allocation. A domain is
* allocated for every IOMMU as the default domain. If device isolation
* is enabled, every device get its own domain. The most important thing
* about domains is the page table mapping the DMA address space they
* contain.
*
****************************************************************************/
static u16 domain_id_alloc(void)
{
unsigned long flags;
@ -342,6 +446,10 @@ static u16 domain_id_alloc(void)
return id;
}
/*
* Used to reserve address ranges in the aperture (e.g. for exclusion
* ranges.
*/
static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
unsigned long start_page,
unsigned int pages)
@ -382,6 +490,10 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
free_page((unsigned long)p1);
}
/*
* Free a domain, only used if something went wrong in the
* allocation path and we need to free an already allocated page table
*/
static void dma_ops_domain_free(struct dma_ops_domain *dom)
{
if (!dom)
@ -396,6 +508,11 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
kfree(dom);
}
/*
* Allocates a new protection domain usable for the dma_ops functions.
* It also intializes the page table and the address allocator data
* structures required for the dma_ops interface
*/
static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
unsigned order)
{
@ -436,6 +553,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
dma_dom->bitmap[0] = 1;
dma_dom->next_bit = 0;
/* Intialize the exclusion range if necessary */
if (iommu->exclusion_start &&
iommu->exclusion_start < dma_dom->aperture_size) {
unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
@ -444,6 +562,11 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
dma_ops_reserve_addresses(dma_dom, startpage, pages);
}
/*
* At the last step, build the page tables so we don't need to
* allocate page table pages in the dma_ops mapping/unmapping
* path.
*/
num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
GFP_KERNEL);
@ -472,6 +595,10 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
return NULL;
}
/*
* Find out the protection domain structure for a given PCI device. This
* will give us the pointer to the page table root for example.
*/
static struct protection_domain *domain_for_device(u16 devid)
{
struct protection_domain *dom;
@ -484,6 +611,10 @@ static struct protection_domain *domain_for_device(u16 devid)
return dom;
}
/*
* If a device is not yet associated with a domain, this function does
* assigns it visible for the hardware
*/
static void set_device_domain(struct amd_iommu *iommu,
struct protection_domain *domain,
u16 devid)
@ -508,6 +639,19 @@ static void set_device_domain(struct amd_iommu *iommu,
iommu->need_sync = 1;
}
/*****************************************************************************
*
* The next functions belong to the dma_ops mapping/unmapping code.
*
*****************************************************************************/
/*
* In the dma_ops path we only have the struct device. This function
* finds the corresponding IOMMU, the protection domain and the
* requestor id for a given device.
* If the device is not yet associated with a domain this is also done
* in this function.
*/
static int get_device_resources(struct device *dev,
struct amd_iommu **iommu,
struct protection_domain **domain,
@ -520,8 +664,9 @@ static int get_device_resources(struct device *dev,
BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask);
pcidev = to_pci_dev(dev);
_bdf = (pcidev->bus->number << 8) | pcidev->devfn;
_bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
/* device not translated by any IOMMU in the system? */
if (_bdf >= amd_iommu_last_bdf) {
*iommu = NULL;
*domain = NULL;
@ -547,6 +692,10 @@ static int get_device_resources(struct device *dev,
return 1;
}
/*
* This is the generic map function. It maps one 4kb page at paddr to
* the given address in the DMA address space for the domain.
*/
static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
struct dma_ops_domain *dom,
unsigned long address,
@ -578,6 +727,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
return (dma_addr_t)address;
}
/*
* The generic unmapping function for on page in the DMA address space.
*/
static void dma_ops_domain_unmap(struct amd_iommu *iommu,
struct dma_ops_domain *dom,
unsigned long address)
@ -597,6 +749,12 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
*pte = 0ULL;
}
/*
* This function contains common code for mapping of a physically
* contiguous memory region into DMA address space. It is uses by all
* mapping functions provided by this IOMMU driver.
* Must be called with the domain lock held.
*/
static dma_addr_t __map_single(struct device *dev,
struct amd_iommu *iommu,
struct dma_ops_domain *dma_dom,
@ -628,6 +786,10 @@ static dma_addr_t __map_single(struct device *dev,
return address;
}
/*
* Does the reverse of the __map_single function. Must be called with
* the domain lock held too
*/
static void __unmap_single(struct amd_iommu *iommu,
struct dma_ops_domain *dma_dom,
dma_addr_t dma_addr,
@ -652,6 +814,9 @@ static void __unmap_single(struct amd_iommu *iommu,
dma_ops_free_addresses(dma_dom, dma_addr, pages);
}
/*
* The exported map_single function for dma_ops.
*/
static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
size_t size, int dir)
{
@ -664,6 +829,7 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
get_device_resources(dev, &iommu, &domain, &devid);
if (iommu == NULL || domain == NULL)
/* device not handled by any AMD IOMMU */
return (dma_addr_t)paddr;
spin_lock_irqsave(&domain->lock, flags);
@ -683,6 +849,9 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
return addr;
}
/*
* The exported unmap_single function for dma_ops.
*/
static void unmap_single(struct device *dev, dma_addr_t dma_addr,
size_t size, int dir)
{
@ -692,6 +861,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
u16 devid;
if (!get_device_resources(dev, &iommu, &domain, &devid))
/* device not handled by any AMD IOMMU */
return;
spin_lock_irqsave(&domain->lock, flags);
@ -706,6 +876,10 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
spin_unlock_irqrestore(&domain->lock, flags);
}
/*
* This is a special map_sg function which is used if we should map a
* device which is not handled by an AMD IOMMU in the system.
*/
static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
int nelems, int dir)
{
@ -720,6 +894,10 @@ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
return nelems;
}
/*
* The exported map_sg function for dma_ops (handles scatter-gather
* lists).
*/
static int map_sg(struct device *dev, struct scatterlist *sglist,
int nelems, int dir)
{
@ -775,6 +953,10 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
goto out;
}
/*
* The exported map_sg function for dma_ops (handles scatter-gather
* lists).
*/
static void unmap_sg(struct device *dev, struct scatterlist *sglist,
int nelems, int dir)
{
@ -804,6 +986,9 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
spin_unlock_irqrestore(&domain->lock, flags);
}
/*
* The exported alloc_coherent function for dma_ops.
*/
static void *alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_addr, gfp_t flag)
{
@ -851,6 +1036,11 @@ static void *alloc_coherent(struct device *dev, size_t size,
return virt_addr;
}
/*
* The exported free_coherent function for dma_ops.
* FIXME: fix the generic x86 DMA layer so that it actually calls that
* function.
*/
static void free_coherent(struct device *dev, size_t size,
void *virt_addr, dma_addr_t dma_addr)
{
@ -879,6 +1069,8 @@ static void free_coherent(struct device *dev, size_t size,
}
/*
* The function for pre-allocating protection domains.
*
* If the driver core informs the DMA layer if a driver grabs a device
* we don't need to preallocate the protection domains anymore.
* For now we have to.
@ -921,12 +1113,20 @@ static struct dma_mapping_ops amd_iommu_dma_ops = {
.unmap_sg = unmap_sg,
};
/*
* The function which clues the AMD IOMMU driver into dma_ops.
*/
int __init amd_iommu_init_dma_ops(void)
{
struct amd_iommu *iommu;
int order = amd_iommu_aperture_order;
int ret;
/*
* first allocate a default protection domain for every IOMMU we
* found in the system. Devices not assigned to any other
* protection domain will be assigned to the default one.
*/
list_for_each_entry(iommu, &amd_iommu_list, list) {
iommu->default_dom = dma_ops_domain_alloc(iommu, order);
if (iommu->default_dom == NULL)
@ -936,6 +1136,10 @@ int __init amd_iommu_init_dma_ops(void)
goto free_domains;
}
/*
* If device isolation is enabled, pre-allocate the protection
* domains for each device.
*/
if (amd_iommu_isolate)
prealloc_protection_domains();
@ -947,6 +1151,7 @@ int __init amd_iommu_init_dma_ops(void)
gart_iommu_aperture = 0;
#endif
/* Make the driver finally visible to the drivers */
dma_ops = &amd_iommu_dma_ops;
return 0;

View File

@ -25,20 +25,13 @@
#include <asm/pci-direct.h>
#include <asm/amd_iommu_types.h>
#include <asm/amd_iommu.h>
#include <asm/gart.h>
#include <asm/iommu.h>
/*
* definitions for the ACPI scanning code
*/
#define UPDATE_LAST_BDF(x) do {\
if ((x) > amd_iommu_last_bdf) \
amd_iommu_last_bdf = (x); \
} while (0);
#define DEVID(bus, devfn) (((bus) << 8) | (devfn))
#define PCI_BUS(x) (((x) >> 8) & 0xff)
#define IVRS_HEADER_LENGTH 48
#define TBL_SIZE(x) (1 << (PAGE_SHIFT + get_order(amd_iommu_last_bdf * (x))))
#define ACPI_IVHD_TYPE 0x10
#define ACPI_IVMD_TYPE_ALL 0x20
@ -71,6 +64,17 @@
#define ACPI_DEVFLAG_LINT1 0x80
#define ACPI_DEVFLAG_ATSDIS 0x10000000
/*
* ACPI table definitions
*
* These data structures are laid over the table to parse the important values
* out of it.
*/
/*
* structure describing one IOMMU in the ACPI table. Typically followed by one
* or more ivhd_entrys.
*/
struct ivhd_header {
u8 type;
u8 flags;
@ -83,6 +87,10 @@ struct ivhd_header {
u32 reserved;
} __attribute__((packed));
/*
* A device entry describing which devices a specific IOMMU translates and
* which requestor ids they use.
*/
struct ivhd_entry {
u8 type;
u16 devid;
@ -90,6 +98,10 @@ struct ivhd_entry {
u32 ext;
} __attribute__((packed));
/*
* An AMD IOMMU memory definition structure. It defines things like exclusion
* ranges for devices and regions that should be unity mapped.
*/
struct ivmd_header {
u8 type;
u8 flags;
@ -103,22 +115,80 @@ struct ivmd_header {
static int __initdata amd_iommu_detected;
u16 amd_iommu_last_bdf;
struct list_head amd_iommu_unity_map;
unsigned amd_iommu_aperture_order = 26;
int amd_iommu_isolate;
u16 amd_iommu_last_bdf; /* largest PCI device id we have
to handle */
LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
we find in ACPI */
unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
int amd_iommu_isolate; /* if 1, device isolation is enabled */
struct list_head amd_iommu_list;
LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
system */
/*
* Pointer to the device table which is shared by all AMD IOMMUs
* it is indexed by the PCI device id or the HT unit id and contains
* information about the domain the device belongs to as well as the
* page table root pointer.
*/
struct dev_table_entry *amd_iommu_dev_table;
/*
* The alias table is a driver specific data structure which contains the
* mappings of the PCI device ids to the actual requestor ids on the IOMMU.
* More than one device can share the same requestor id.
*/
u16 *amd_iommu_alias_table;
/*
* The rlookup table is used to find the IOMMU which is responsible
* for a specific device. It is also indexed by the PCI device id.
*/
struct amd_iommu **amd_iommu_rlookup_table;
/*
* The pd table (protection domain table) is used to find the protection domain
* data structure a device belongs to. Indexed with the PCI device id too.
*/
struct protection_domain **amd_iommu_pd_table;
/*
* AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap
* to know which ones are already in use.
*/
unsigned long *amd_iommu_pd_alloc_bitmap;
static u32 dev_table_size;
static u32 alias_table_size;
static u32 rlookup_table_size;
static u32 dev_table_size; /* size of the device table */
static u32 alias_table_size; /* size of the alias table */
static u32 rlookup_table_size; /* size if the rlookup table */
static inline void update_last_devid(u16 devid)
{
if (devid > amd_iommu_last_bdf)
amd_iommu_last_bdf = devid;
}
static inline unsigned long tbl_size(int entry_size)
{
unsigned shift = PAGE_SHIFT +
get_order(amd_iommu_last_bdf * entry_size);
return 1UL << shift;
}
/****************************************************************************
*
* AMD IOMMU MMIO register space handling functions
*
* These functions are used to program the IOMMU device registers in
* MMIO space required for that driver.
*
****************************************************************************/
/*
* This function set the exclusion range in the IOMMU. DMA accesses to the
* exclusion range are passed through untranslated
*/
static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
{
u64 start = iommu->exclusion_start & PAGE_MASK;
@ -137,6 +207,7 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
&entry, sizeof(entry));
}
/* Programs the physical address of the device table into the IOMMU hardware */
static void __init iommu_set_device_table(struct amd_iommu *iommu)
{
u32 entry;
@ -149,6 +220,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
&entry, sizeof(entry));
}
/* Generic functions to enable/disable certain features of the IOMMU. */
static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
{
u32 ctrl;
@ -167,6 +239,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
}
/* Function to enable the hardware */
void __init iommu_enable(struct amd_iommu *iommu)
{
printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at ");
@ -176,6 +249,10 @@ void __init iommu_enable(struct amd_iommu *iommu)
iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
}
/*
* mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
* the system has one.
*/
static u8 * __init iommu_map_mmio_space(u64 address)
{
u8 *ret;
@ -199,16 +276,33 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
}
/****************************************************************************
*
* The functions below belong to the first pass of AMD IOMMU ACPI table
* parsing. In this pass we try to find out the highest device id this
* code has to handle. Upon this information the size of the shared data
* structures is determined later.
*
****************************************************************************/
/*
* This function reads the last device id the IOMMU has to handle from the PCI
* capability header for this IOMMU
*/
static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
{
u32 cap;
cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
return 0;
}
/*
* After reading the highest device id from the IOMMU PCI capability header
* this function looks if there is a higher device id defined in the ACPI table
*/
static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
{
u8 *p = (void *)h, *end = (void *)h;
@ -229,7 +323,8 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
case IVHD_DEV_RANGE_END:
case IVHD_DEV_ALIAS:
case IVHD_DEV_EXT_SELECT:
UPDATE_LAST_BDF(dev->devid);
/* all the above subfield types refer to device ids */
update_last_devid(dev->devid);
break;
default:
break;
@ -242,6 +337,11 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
return 0;
}
/*
* Iterate over all IVHD entries in the ACPI table and find the highest device
* id which we need to handle. This is the first of three functions which parse
* the ACPI table. So we check the checksum here.
*/
static int __init find_last_devid_acpi(struct acpi_table_header *table)
{
int i;
@ -277,19 +377,31 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table)
return 0;
}
/****************************************************************************
*
* The following functions belong the the code path which parses the ACPI table
* the second time. In this ACPI parsing iteration we allocate IOMMU specific
* data structures, initialize the device/alias/rlookup table and also
* basically initialize the hardware.
*
****************************************************************************/
/*
* Allocates the command buffer. This buffer is per AMD IOMMU. We can
* write commands to that buffer later and the IOMMU will execute them
* asynchronously
*/
static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
{
u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL,
u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
get_order(CMD_BUFFER_SIZE));
u64 entry = 0;
u64 entry;
if (cmd_buf == NULL)
return NULL;
iommu->cmd_buf_size = CMD_BUFFER_SIZE;
memset(cmd_buf, 0, CMD_BUFFER_SIZE);
entry = (u64)virt_to_phys(cmd_buf);
entry |= MMIO_CMD_SIZE_512;
memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
@ -302,11 +414,10 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
static void __init free_command_buffer(struct amd_iommu *iommu)
{
if (iommu->cmd_buf)
free_pages((unsigned long)iommu->cmd_buf,
get_order(CMD_BUFFER_SIZE));
free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE));
}
/* sets a specific bit in the device table entry. */
static void set_dev_entry_bit(u16 devid, u8 bit)
{
int i = (bit >> 5) & 0x07;
@ -315,7 +426,18 @@ static void set_dev_entry_bit(u16 devid, u8 bit)
amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
}
static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags)
/* Writes the specific IOMMU for a device into the rlookup table */
static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
{
amd_iommu_rlookup_table[devid] = iommu;
}
/*
* This function takes the device specific flags read from the ACPI
* table and sets up the device table entry with that information
*/
static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
u16 devid, u32 flags, u32 ext_flags)
{
if (flags & ACPI_DEVFLAG_INITPASS)
set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
@ -331,13 +453,14 @@ static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags)
set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
if (flags & ACPI_DEVFLAG_LINT1)
set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
set_iommu_for_device(iommu, devid);
}
static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
{
amd_iommu_rlookup_table[devid] = iommu;
}
/*
* Reads the device exclusion range from ACPI and initialize IOMMU with
* it
*/
static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
{
struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
@ -346,12 +469,22 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
return;
if (iommu) {
/*
* We only can configure exclusion ranges per IOMMU, not
* per device. But we can enable the exclusion range per
* device. This is done here
*/
set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
iommu->exclusion_start = m->range_start;
iommu->exclusion_length = m->range_length;
}
}
/*
* This function reads some important data from the IOMMU PCI space and
* initializes the driver data structure with it. It reads the hardware
* capabilities and the first/last device entries
*/
static void __init init_iommu_from_pci(struct amd_iommu *iommu)
{
int bus = PCI_BUS(iommu->devid);
@ -363,10 +496,16 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET);
range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range));
iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range));
iommu->first_device = calc_devid(MMIO_GET_BUS(range),
MMIO_GET_FD(range));
iommu->last_device = calc_devid(MMIO_GET_BUS(range),
MMIO_GET_LD(range));
}
/*
* Takes a pointer to an AMD IOMMU entry in the ACPI table and
* initializes the hardware and our data structures with it.
*/
static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
struct ivhd_header *h)
{
@ -374,7 +513,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
u8 *end = p, flags = 0;
u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
u32 ext_flags = 0;
bool alias = 0;
bool alias = false;
struct ivhd_entry *e;
/*
@ -414,22 +553,23 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
case IVHD_DEV_ALL:
for (dev_i = iommu->first_device;
dev_i <= iommu->last_device; ++dev_i)
set_dev_entry_from_acpi(dev_i, e->flags, 0);
set_dev_entry_from_acpi(iommu, dev_i,
e->flags, 0);
break;
case IVHD_DEV_SELECT:
devid = e->devid;
set_dev_entry_from_acpi(devid, e->flags, 0);
set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
break;
case IVHD_DEV_SELECT_RANGE_START:
devid_start = e->devid;
flags = e->flags;
ext_flags = 0;
alias = 0;
alias = false;
break;
case IVHD_DEV_ALIAS:
devid = e->devid;
devid_to = e->ext >> 8;
set_dev_entry_from_acpi(devid, e->flags, 0);
set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
amd_iommu_alias_table[devid] = devid_to;
break;
case IVHD_DEV_ALIAS_RANGE:
@ -437,24 +577,25 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
flags = e->flags;
devid_to = e->ext >> 8;
ext_flags = 0;
alias = 1;
alias = true;
break;
case IVHD_DEV_EXT_SELECT:
devid = e->devid;
set_dev_entry_from_acpi(devid, e->flags, e->ext);
set_dev_entry_from_acpi(iommu, devid, e->flags,
e->ext);
break;
case IVHD_DEV_EXT_SELECT_RANGE:
devid_start = e->devid;
flags = e->flags;
ext_flags = e->ext;
alias = 0;
alias = false;
break;
case IVHD_DEV_RANGE_END:
devid = e->devid;
for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
if (alias)
amd_iommu_alias_table[dev_i] = devid_to;
set_dev_entry_from_acpi(
set_dev_entry_from_acpi(iommu,
amd_iommu_alias_table[dev_i],
flags, ext_flags);
}
@ -467,6 +608,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
}
}
/* Initializes the device->iommu mapping for the driver */
static int __init init_iommu_devices(struct amd_iommu *iommu)
{
u16 i;
@ -494,6 +636,11 @@ static void __init free_iommu_all(void)
}
}
/*
* This function clues the initialization function for one IOMMU
* together and also allocates the command buffer and programs the
* hardware. It does NOT enable the IOMMU. This is done afterwards.
*/
static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
{
spin_lock_init(&iommu->lock);
@ -521,6 +668,10 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
return 0;
}
/*
* Iterates over all IOMMU entries in the ACPI table, allocates the
* IOMMU structure and initializes it with init_iommu_one()
*/
static int __init init_iommu_all(struct acpi_table_header *table)
{
u8 *p = (u8 *)table, *end = (u8 *)table;
@ -528,8 +679,6 @@ static int __init init_iommu_all(struct acpi_table_header *table)
struct amd_iommu *iommu;
int ret;
INIT_LIST_HEAD(&amd_iommu_list);
end += table->length;
p += IVRS_HEADER_LENGTH;
@ -555,6 +704,14 @@ static int __init init_iommu_all(struct acpi_table_header *table)
return 0;
}
/****************************************************************************
*
* The next functions belong to the third pass of parsing the ACPI
* table. In this last pass the memory mapping requirements are
* gathered (like exclusion and unity mapping reanges).
*
****************************************************************************/
static void __init free_unity_maps(void)
{
struct unity_map_entry *entry, *next;
@ -565,6 +722,7 @@ static void __init free_unity_maps(void)
}
}
/* called when we find an exclusion range definition in ACPI */
static int __init init_exclusion_range(struct ivmd_header *m)
{
int i;
@ -588,6 +746,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
return 0;
}
/* called for unity map ACPI definition */
static int __init init_unity_map_range(struct ivmd_header *m)
{
struct unity_map_entry *e = 0;
@ -619,13 +778,12 @@ static int __init init_unity_map_range(struct ivmd_header *m)
return 0;
}
/* iterates over all memory definitions we find in the ACPI table */
static int __init init_memory_definitions(struct acpi_table_header *table)
{
u8 *p = (u8 *)table, *end = (u8 *)table;
struct ivmd_header *m;
INIT_LIST_HEAD(&amd_iommu_unity_map);
end += table->length;
p += IVRS_HEADER_LENGTH;
@ -642,6 +800,10 @@ static int __init init_memory_definitions(struct acpi_table_header *table)
return 0;
}
/*
* This function finally enables all IOMMUs found in the system after
* they have been initialized
*/
static void __init enable_iommus(void)
{
struct amd_iommu *iommu;
@ -678,6 +840,34 @@ static struct sys_device device_amd_iommu = {
.cls = &amd_iommu_sysdev_class,
};
/*
* This is the core init function for AMD IOMMU hardware in the system.
* This function is called from the generic x86 DMA layer initialization
* code.
*
* This function basically parses the ACPI table for AMD IOMMU (IVRS)
* three times:
*
* 1 pass) Find the highest PCI device id the driver has to handle.
* Upon this information the size of the data structures is
* determined that needs to be allocated.
*
* 2 pass) Initialize the data structures just allocated with the
* information in the ACPI table about available AMD IOMMUs
* in the system. It also maps the PCI devices in the
* system to specific IOMMUs
*
* 3 pass) After the basic data structures are allocated and
* initialized we update them with information about memory
* remapping requirements parsed out of the ACPI table in
* this last pass.
*
* After that the hardware is initialized and ready to go. In the last
* step we do some Linux specific things like registering the driver in
* the dma_ops interface and initializing the suspend/resume support
* functions. Finally it prints some information about AMD IOMMUs and
* the driver state and enables the hardware.
*/
int __init amd_iommu_init(void)
{
int i, ret = 0;
@ -699,14 +889,14 @@ int __init amd_iommu_init(void)
if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
return -ENODEV;
dev_table_size = TBL_SIZE(DEV_TABLE_ENTRY_SIZE);
alias_table_size = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE);
rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE);
dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
ret = -ENOMEM;
/* Device table - directly used by all IOMMUs */
amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL,
amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
get_order(dev_table_size));
if (amd_iommu_dev_table == NULL)
goto out;
@ -730,27 +920,23 @@ int __init amd_iommu_init(void)
* Protection Domain table - maps devices to protection domains
* This table has the same size as the rlookup_table
*/
amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL,
amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
get_order(rlookup_table_size));
if (amd_iommu_pd_table == NULL)
goto free;
amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL,
amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
GFP_KERNEL | __GFP_ZERO,
get_order(MAX_DOMAIN_ID/8));
if (amd_iommu_pd_alloc_bitmap == NULL)
goto free;
/*
* memory is allocated now; initialize the device table with all zeroes
* and let all alias entries point to itself
* let all alias entries point to itself
*/
memset(amd_iommu_dev_table, 0, dev_table_size);
for (i = 0; i < amd_iommu_last_bdf; ++i)
amd_iommu_alias_table[i] = i;
memset(amd_iommu_pd_table, 0, rlookup_table_size);
memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8);
/*
* never allocate domain 0 because its used as the non-allocated and
* error value placeholder
@ -795,24 +981,19 @@ int __init amd_iommu_init(void)
return ret;
free:
if (amd_iommu_pd_alloc_bitmap)
free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
if (amd_iommu_pd_table)
free_pages((unsigned long)amd_iommu_pd_table,
get_order(rlookup_table_size));
free_pages((unsigned long)amd_iommu_pd_table,
get_order(rlookup_table_size));
if (amd_iommu_rlookup_table)
free_pages((unsigned long)amd_iommu_rlookup_table,
get_order(rlookup_table_size));
free_pages((unsigned long)amd_iommu_rlookup_table,
get_order(rlookup_table_size));
if (amd_iommu_alias_table)
free_pages((unsigned long)amd_iommu_alias_table,
get_order(alias_table_size));
free_pages((unsigned long)amd_iommu_alias_table,
get_order(alias_table_size));
if (amd_iommu_dev_table)
free_pages((unsigned long)amd_iommu_dev_table,
get_order(dev_table_size));
free_pages((unsigned long)amd_iommu_dev_table,
get_order(dev_table_size));
free_iommu_all();
@ -821,6 +1002,13 @@ int __init amd_iommu_init(void)
goto out;
}
/****************************************************************************
*
* Early detect code. This code runs at IOMMU detection time in the DMA
* layer. It just looks if there is an IVRS ACPI table to detect AMD
* IOMMUs
*
****************************************************************************/
static int __init early_amd_iommu_detect(struct acpi_table_header *table)
{
return 0;
@ -828,7 +1016,7 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
void __init amd_iommu_detect(void)
{
if (swiotlb || no_iommu || iommu_detected)
if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture))
return;
if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
@ -841,6 +1029,13 @@ void __init amd_iommu_detect(void)
}
}
/****************************************************************************
*
* Parsing functions for the AMD IOMMU specific kernel command line
* options.
*
****************************************************************************/
static int __init parse_amd_iommu_options(char *str)
{
for (; *str; ++str) {
@ -853,20 +1048,10 @@ static int __init parse_amd_iommu_options(char *str)
static int __init parse_amd_iommu_size_options(char *str)
{
for (; *str; ++str) {
if (strcmp(str, "32M") == 0)
amd_iommu_aperture_order = 25;
if (strcmp(str, "64M") == 0)
amd_iommu_aperture_order = 26;
if (strcmp(str, "128M") == 0)
amd_iommu_aperture_order = 27;
if (strcmp(str, "256M") == 0)
amd_iommu_aperture_order = 28;
if (strcmp(str, "512M") == 0)
amd_iommu_aperture_order = 29;
if (strcmp(str, "1G") == 0)
amd_iommu_aperture_order = 30;
}
unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
if ((order > 24) && (order < 31))
amd_iommu_aperture_order = order;
return 1;
}

View File

@ -21,6 +21,7 @@
#include <linux/suspend.h>
#include <asm/e820.h>
#include <asm/io.h>
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>

View File

@ -75,7 +75,7 @@ char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
/*
* Debug level, exported for io_apic.c
*/
int apic_verbosity;
unsigned int apic_verbosity;
int pic_mode;
@ -177,7 +177,7 @@ void __cpuinit enable_NMI_through_LVT0(void)
/* Level triggered for 82489DX */
if (!lapic_is_integrated())
v |= APIC_LVT_LEVEL_TRIGGER;
apic_write_around(APIC_LVT0, v);
apic_write(APIC_LVT0, v);
}
/**
@ -212,9 +212,6 @@ int lapic_get_maxlvt(void)
* this function twice on the boot CPU, once with a bogus timeout
* value, second time for real. The other (noncalibrating) CPUs
* call this function only once, with the real, calibrated value.
*
* We do reads before writes even if unnecessary, to get around the
* P5 APIC double write bug.
*/
static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
{
@ -229,18 +226,18 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
if (!irqen)
lvtt_value |= APIC_LVT_MASKED;
apic_write_around(APIC_LVTT, lvtt_value);
apic_write(APIC_LVTT, lvtt_value);
/*
* Divide PICLK by 16
*/
tmp_value = apic_read(APIC_TDCR);
apic_write_around(APIC_TDCR, (tmp_value
& ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
| APIC_TDR_DIV_16);
apic_write(APIC_TDCR,
(tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
APIC_TDR_DIV_16);
if (!oneshot)
apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
}
/*
@ -249,7 +246,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
static int lapic_next_event(unsigned long delta,
struct clock_event_device *evt)
{
apic_write_around(APIC_TMICT, delta);
apic_write(APIC_TMICT, delta);
return 0;
}
@ -278,7 +275,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
case CLOCK_EVT_MODE_SHUTDOWN:
v = apic_read(APIC_LVTT);
v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
apic_write_around(APIC_LVTT, v);
apic_write(APIC_LVTT, v);
break;
case CLOCK_EVT_MODE_RESUME:
/* Nothing to do here */
@ -372,12 +369,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
}
}
/*
* Setup the boot APIC
*
* Calibrate and verify the result.
*/
void __init setup_boot_APIC_clock(void)
static int __init calibrate_APIC_clock(void)
{
struct clock_event_device *levt = &__get_cpu_var(lapic_events);
const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
@ -387,24 +379,6 @@ void __init setup_boot_APIC_clock(void)
long delta, deltapm;
int pm_referenced = 0;
/*
* The local apic timer can be disabled via the kernel
* commandline or from the CPU detection code. Register the lapic
* timer as a dummy clock event source on SMP systems, so the
* broadcast mechanism is used. On UP systems simply ignore it.
*/
if (local_apic_timer_disabled) {
/* No broadcast on UP ! */
if (num_possible_cpus() > 1) {
lapic_clockevent.mult = 1;
setup_APIC_timer();
}
return;
}
apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
"calibrating APIC timer ...\n");
local_irq_disable();
/* Replace the global interrupt handler */
@ -489,8 +463,6 @@ void __init setup_boot_APIC_clock(void)
calibration_result / (1000000 / HZ),
calibration_result % (1000000 / HZ));
local_apic_timer_verify_ok = 1;
/*
* Do a sanity check on the APIC calibration result
*/
@ -498,12 +470,11 @@ void __init setup_boot_APIC_clock(void)
local_irq_enable();
printk(KERN_WARNING
"APIC frequency too slow, disabling apic timer\n");
/* No broadcast on UP ! */
if (num_possible_cpus() > 1)
setup_APIC_timer();
return;
return -1;
}
local_apic_timer_verify_ok = 1;
/* We trust the pm timer based calibration */
if (!pm_referenced) {
apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
@ -543,22 +514,55 @@ void __init setup_boot_APIC_clock(void)
if (!local_apic_timer_verify_ok) {
printk(KERN_WARNING
"APIC timer disabled due to verification failure.\n");
/* No broadcast on UP ! */
if (num_possible_cpus() == 1)
return;
} else {
/*
* If nmi_watchdog is set to IO_APIC, we need the
* PIT/HPET going. Otherwise register lapic as a dummy
* device.
*/
if (nmi_watchdog != NMI_IO_APIC)
lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
else
printk(KERN_WARNING "APIC timer registered as dummy,"
" due to nmi_watchdog=%d!\n", nmi_watchdog);
return -1;
}
return 0;
}
/*
* Setup the boot APIC
*
* Calibrate and verify the result.
*/
void __init setup_boot_APIC_clock(void)
{
/*
* The local apic timer can be disabled via the kernel
* commandline or from the CPU detection code. Register the lapic
* timer as a dummy clock event source on SMP systems, so the
* broadcast mechanism is used. On UP systems simply ignore it.
*/
if (local_apic_timer_disabled) {
/* No broadcast on UP ! */
if (num_possible_cpus() > 1) {
lapic_clockevent.mult = 1;
setup_APIC_timer();
}
return;
}
apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
"calibrating APIC timer ...\n");
if (calibrate_APIC_clock()) {
/* No broadcast on UP ! */
if (num_possible_cpus() > 1)
setup_APIC_timer();
return;
}
/*
* If nmi_watchdog is set to IO_APIC, we need the
* PIT/HPET going. Otherwise register lapic as a dummy
* device.
*/
if (nmi_watchdog != NMI_IO_APIC)
lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
else
printk(KERN_WARNING "APIC timer registered as dummy,"
" due to nmi_watchdog=%d!\n", nmi_watchdog);
/* Setup the lapic or request the broadcast */
setup_APIC_timer();
}
@ -693,44 +697,44 @@ void clear_local_APIC(void)
*/
if (maxlvt >= 3) {
v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED);
apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
}
/*
* Careful: we have to set masks only first to deassert
* any level-triggered sources.
*/
v = apic_read(APIC_LVTT);
apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
v = apic_read(APIC_LVT0);
apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
v = apic_read(APIC_LVT1);
apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED);
apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
if (maxlvt >= 4) {
v = apic_read(APIC_LVTPC);
apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
}
/* lets not touch this if we didn't frob it */
#ifdef CONFIG_X86_MCE_P4THERMAL
if (maxlvt >= 5) {
v = apic_read(APIC_LVTTHMR);
apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED);
apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
}
#endif
/*
* Clean APIC state for other OSs:
*/
apic_write_around(APIC_LVTT, APIC_LVT_MASKED);
apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
apic_write_around(APIC_LVT1, APIC_LVT_MASKED);
apic_write(APIC_LVTT, APIC_LVT_MASKED);
apic_write(APIC_LVT0, APIC_LVT_MASKED);
apic_write(APIC_LVT1, APIC_LVT_MASKED);
if (maxlvt >= 3)
apic_write_around(APIC_LVTERR, APIC_LVT_MASKED);
apic_write(APIC_LVTERR, APIC_LVT_MASKED);
if (maxlvt >= 4)
apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
apic_write(APIC_LVTPC, APIC_LVT_MASKED);
#ifdef CONFIG_X86_MCE_P4THERMAL
if (maxlvt >= 5)
apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
apic_write(APIC_LVTTHMR, APIC_LVT_MASKED);
#endif
/* Integrated APIC (!82489DX) ? */
if (lapic_is_integrated()) {
@ -756,7 +760,7 @@ void disable_local_APIC(void)
*/
value = apic_read(APIC_SPIV);
value &= ~APIC_SPIV_APIC_ENABLED;
apic_write_around(APIC_SPIV, value);
apic_write(APIC_SPIV, value);
/*
* When LAPIC was disabled by the BIOS and enabled by the kernel,
@ -865,8 +869,8 @@ void __init sync_Arb_IDs(void)
apic_wait_icr_idle();
apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
| APIC_DM_INIT);
apic_write(APIC_ICR,
APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT);
}
/*
@ -902,16 +906,16 @@ void __init init_bsp_APIC(void)
else
value |= APIC_SPIV_FOCUS_DISABLED;
value |= SPURIOUS_APIC_VECTOR;
apic_write_around(APIC_SPIV, value);
apic_write(APIC_SPIV, value);
/*
* Set up the virtual wire mode.
*/
apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
apic_write(APIC_LVT0, APIC_DM_EXTINT);
value = APIC_DM_NMI;
if (!lapic_is_integrated()) /* 82489DX */
value |= APIC_LVT_LEVEL_TRIGGER;
apic_write_around(APIC_LVT1, value);
apic_write(APIC_LVT1, value);
}
static void __cpuinit lapic_setup_esr(void)
@ -926,7 +930,7 @@ static void __cpuinit lapic_setup_esr(void)
/* enables sending errors */
value = ERROR_APIC_VECTOR;
apic_write_around(APIC_LVTERR, value);
apic_write(APIC_LVTERR, value);
/*
* spec says clear errors after enabling vector.
*/
@ -989,7 +993,7 @@ void __cpuinit setup_local_APIC(void)
*/
value = apic_read(APIC_TASKPRI);
value &= ~APIC_TPRI_MASK;
apic_write_around(APIC_TASKPRI, value);
apic_write(APIC_TASKPRI, value);
/*
* After a crash, we no longer service the interrupts and a pending
@ -1047,7 +1051,7 @@ void __cpuinit setup_local_APIC(void)
* Set spurious IRQ vector
*/
value |= SPURIOUS_APIC_VECTOR;
apic_write_around(APIC_SPIV, value);
apic_write(APIC_SPIV, value);
/*
* Set up LVT0, LVT1:
@ -1069,7 +1073,7 @@ void __cpuinit setup_local_APIC(void)
apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
smp_processor_id());
}
apic_write_around(APIC_LVT0, value);
apic_write(APIC_LVT0, value);
/*
* only the BP should see the LINT1 NMI signal, obviously.
@ -1080,7 +1084,7 @@ void __cpuinit setup_local_APIC(void)
value = APIC_DM_NMI | APIC_LVT_MASKED;
if (!integrated) /* 82489DX */
value |= APIC_LVT_LEVEL_TRIGGER;
apic_write_around(APIC_LVT1, value);
apic_write(APIC_LVT1, value);
}
void __cpuinit end_local_APIC_setup(void)
@ -1091,7 +1095,7 @@ void __cpuinit end_local_APIC_setup(void)
/* Disable the local apic timer */
value = apic_read(APIC_LVTT);
value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
apic_write_around(APIC_LVTT, value);
apic_write(APIC_LVTT, value);
setup_apic_nmi_watchdog(NULL);
apic_pm_activate();
@ -1214,9 +1218,6 @@ int apic_version[MAX_APICS];
int __init APIC_init_uniprocessor(void)
{
if (disable_apic)
clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
if (!smp_found_config && !cpu_has_apic)
return -1;
@ -1419,7 +1420,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
value &= ~APIC_VECTOR_MASK;
value |= APIC_SPIV_APIC_ENABLED;
value |= 0xf;
apic_write_around(APIC_SPIV, value);
apic_write(APIC_SPIV, value);
if (!virt_wire_setup) {
/*
@ -1432,10 +1433,10 @@ void disconnect_bsp_APIC(int virt_wire_setup)
APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
apic_write_around(APIC_LVT0, value);
apic_write(APIC_LVT0, value);
} else {
/* Disable LVT0 */
apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
apic_write(APIC_LVT0, APIC_LVT_MASKED);
}
/*
@ -1449,7 +1450,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
apic_write_around(APIC_LVT1, value);
apic_write(APIC_LVT1, value);
}
}
@ -1700,7 +1701,7 @@ early_param("lapic", parse_lapic);
static int __init parse_nolapic(char *arg)
{
disable_apic = 1;
clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
setup_clear_cpu_cap(X86_FEATURE_APIC);
return 0;
}
early_param("nolapic", parse_nolapic);

View File

@ -54,7 +54,7 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
/*
* Debug level, exported for io_apic.c
*/
int apic_verbosity;
unsigned int apic_verbosity;
/* Have we found an MP table */
int smp_found_config;
@ -314,7 +314,7 @@ static void setup_APIC_timer(void)
#define TICK_COUNT 100000000
static void __init calibrate_APIC_clock(void)
static int __init calibrate_APIC_clock(void)
{
unsigned apic, apic_start;
unsigned long tsc, tsc_start;
@ -368,6 +368,17 @@ static void __init calibrate_APIC_clock(void)
clockevent_delta2ns(0xF, &lapic_clockevent);
calibration_result = result / HZ;
/*
* Do a sanity check on the APIC calibration result
*/
if (calibration_result < (1000000 / HZ)) {
printk(KERN_WARNING
"APIC frequency too slow, disabling apic timer\n");
return -1;
}
return 0;
}
/*
@ -394,14 +405,7 @@ void __init setup_boot_APIC_clock(void)
}
printk(KERN_INFO "Using local APIC timer interrupts.\n");
calibrate_APIC_clock();
/*
* Do a sanity check on the APIC calibration result
*/
if (calibration_result < (1000000 / HZ)) {
printk(KERN_WARNING
"APIC frequency too slow, disabling apic timer\n");
if (calibrate_APIC_clock()) {
/* No broadcast on UP ! */
if (num_possible_cpus() > 1)
setup_APIC_timer();
@ -1337,7 +1341,7 @@ early_param("apic", apic_set_verbosity);
static __init int setup_disableapic(char *str)
{
disable_apic = 1;
clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
setup_clear_cpu_cap(X86_FEATURE_APIC);
return 0;
}
early_param("disableapic", setup_disableapic);

View File

@ -18,6 +18,8 @@
#include <asm/ia32.h>
#include <asm/bootparam.h>
#include <xen/interface/xen.h>
#define __NO_STUBS 1
#undef __SYSCALL
#undef _ASM_X86_64_UNISTD_H_
@ -131,5 +133,14 @@ int main(void)
OFFSET(BP_loadflags, boot_params, hdr.loadflags);
OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
OFFSET(BP_version, boot_params, hdr.version);
BLANK();
DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
#ifdef CONFIG_XEN
BLANK();
OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
#undef ENTRY
#endif
return 0;
}

48
arch/x86/kernel/bios_uv.c Normal file
View File

@ -0,0 +1,48 @@
/*
* BIOS run time interface routines.
*
* Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <asm/uv/bios.h>
const char *
x86_bios_strerror(long status)
{
const char *str;
switch (status) {
case 0: str = "Call completed without error"; break;
case -1: str = "Not implemented"; break;
case -2: str = "Invalid argument"; break;
case -3: str = "Call completed with error"; break;
default: str = "Unknown BIOS status code"; break;
}
return str;
}
long
x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second,
unsigned long *drift_info)
{
struct uv_bios_retval isrv;
BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0);
*ticks_per_second = isrv.v0;
*drift_info = isrv.v1;
return isrv.status;
}
EXPORT_SYMBOL_GPL(x86_bios_freq_base);

View File

@ -24,8 +24,6 @@
extern void vide(void);
__asm__(".align 4\nvide: ret");
int force_mwait __cpuinitdata;
static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
{
if (cpuid_eax(0x80000000) >= 0x80000007) {

View File

@ -115,6 +115,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
/* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
if (c->x86_power & (1<<8))
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_SYSCALL32);
}
static void __cpuinit init_amd(struct cpuinfo_x86 *c)

View File

@ -131,13 +131,7 @@ static void __init check_popad(void)
* (for due to lack of "invlpg" and working WP on a i386)
* - In order to run on anything without a TSC, we need to be
* compiled for a i486.
* - In order to support the local APIC on a buggy Pentium machine,
* we need to be compiled with CONFIG_X86_GOOD_APIC disabled,
* which happens implicitly if compiled for a Pentium or lower
* (unless an advanced selection of CPU features is used) as an
* otherwise config implies a properly working local APIC without
* the need to do extra reads from the APIC.
*/
*/
static void __init check_config(void)
{
@ -151,21 +145,6 @@ static void __init check_config(void)
if (boot_cpu_data.x86 == 3)
panic("Kernel requires i486+ for 'invlpg' and other features");
#endif
/*
* If we were told we had a good local APIC, check for buggy Pentia,
* i.e. all B steppings and the C2 stepping of P54C when using their
* integrated APIC (see 11AP erratum in "Pentium Processor
* Specification Update").
*/
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC)
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
&& cpu_has_apic
&& boot_cpu_data.x86 == 5
&& boot_cpu_data.x86_model == 2
&& (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11))
panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!");
#endif
}

View File

@ -7,15 +7,13 @@
#include <linux/module.h>
#include <linux/kgdb.h>
#include <linux/topology.h>
#include <linux/string.h>
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/msr.h>
#include <asm/io.h>
#include <asm/linkage.h>
#include <asm/mmu_context.h>
#include <asm/mtrr.h>
#include <asm/mce.h>
@ -305,7 +303,6 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
c->x86_capability[2] = cpuid_edx(0x80860001);
}
c->extended_cpuid_level = cpuid_eax(0x80000000);
if (c->extended_cpuid_level >= 0x80000007)
c->x86_power = cpuid_edx(0x80000007);
@ -316,18 +313,11 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
c->x86_phys_bits = eax & 0xff;
}
/* Assume all 64-bit CPUs support 32-bit syscall */
set_cpu_cap(c, X86_FEATURE_SYSCALL32);
if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
cpu_devs[c->x86_vendor]->c_early_init)
cpu_devs[c->x86_vendor]->c_early_init(c);
validate_pat_support(c);
/* early_param could clear that, but recall get it set again */
if (disable_apic)
clear_cpu_cap(c, X86_FEATURE_APIC);
}
/*
@ -517,8 +507,7 @@ void pda_init(int cpu)
}
char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
DEBUG_STKSZ]
__attribute__((section(".bss.page_aligned")));
DEBUG_STKSZ] __page_aligned_bss;
extern asmlinkage void ignore_sysret(void);

View File

@ -227,6 +227,16 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
if (cpu_has_bts)
ds_init_intel(c);
/*
* See if we have a good local APIC by checking for buggy Pentia,
* i.e. all B steppings and the C2 stepping of P54C when using their
* integrated APIC (see 11AP erratum in "Pentium Processor
* Specification Update").
*/
if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
(c->x86_mask < 0x6 || c->x86_mask == 0xb))
set_cpu_cap(c, X86_FEATURE_11AP);
#ifdef CONFIG_X86_NUMAQ
numaq_tsc_disable();
#endif

View File

@ -780,15 +780,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
}
kobject_put(per_cpu(cache_kobject, cpu));
cpuid4_cache_sysfs_exit(cpu);
break;
return retval;
}
kobject_uevent(&(this_object->kobj), KOBJ_ADD);
}
if (!retval)
cpu_set(cpu, cache_dev_map);
cpu_set(cpu, cache_dev_map);
kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
return retval;
return 0;
}
static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)

View File

@ -102,7 +102,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
/* The temperature transition interrupt handler setup */
h = THERMAL_APIC_VECTOR; /* our delivery vector */
h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
apic_write_around(APIC_LVTTHMR, h);
apic_write(APIC_LVTTHMR, h);
rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
@ -114,7 +114,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h);
l = apic_read(APIC_LVTTHMR);
apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
/* enable thermal throttle processing */

View File

@ -877,7 +877,8 @@ void __init early_res_to_bootmem(u64 start, u64 end)
for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
count++;
printk(KERN_INFO "(%d early reservations) ==> bootmem\n", count);
printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
count, start, end);
for (i = 0; i < count; i++) {
struct early_res *r = &early_res[i];
printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
@ -1298,11 +1299,6 @@ void __init e820_reserve_resources(void)
}
}
/*
* Non-standard memory setup can be specified via this quirk:
*/
char * (*arch_memory_setup_quirk)(void);
char *__init default_machine_specific_memory_setup(void)
{
char *who = "BIOS-e820";
@ -1343,8 +1339,8 @@ char *__init default_machine_specific_memory_setup(void)
char *__init __attribute__((weak)) machine_specific_memory_setup(void)
{
if (arch_memory_setup_quirk) {
char *who = arch_memory_setup_quirk();
if (x86_quirks->arch_memory_setup) {
char *who = x86_quirks->arch_memory_setup();
if (who)
return who;
@ -1367,24 +1363,3 @@ void __init setup_memory_map(void)
printk(KERN_INFO "BIOS-provided physical RAM map:\n");
e820_print_map(who);
}
#ifdef CONFIG_X86_64
int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
{
int i;
if (slot < 0 || slot >= e820.nr_map)
return -1;
for (i = slot; i < e820.nr_map; i++) {
if (e820.map[i].type != E820_RAM)
continue;
break;
}
if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
return -1;
*addr = e820.map[i].addr;
*size = min_t(u64, e820.map[i].size + e820.map[i].addr,
max_pfn << PAGE_SHIFT) - *addr;
return i + 1;
}
#endif

View File

@ -16,10 +16,7 @@
#include <asm/dma.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
#ifdef CONFIG_GART_IOMMU
#include <asm/gart.h>
#endif
#include <asm/iommu.h>
static void __init fix_hypertransport_config(int num, int slot, int func)
{

View File

@ -332,7 +332,7 @@ sysenter_past_esp:
GET_THREAD_INFO(%ebp)
/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz syscall_trace_entry
cmpl $(nr_syscalls), %eax
jae syscall_badsys
@ -370,7 +370,7 @@ ENTRY(system_call)
GET_THREAD_INFO(%ebp)
# system call tracing in operation / emulation
/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz syscall_trace_entry
cmpl $(nr_syscalls), %eax
jae syscall_badsys
@ -383,10 +383,6 @@ syscall_exit:
# setting need_resched or sigpending
# between sampling and the iret
TRACE_IRQS_OFF
testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
jz no_singlestep
orl $_TIF_SINGLESTEP,TI_flags(%ebp)
no_singlestep:
movl TI_flags(%ebp), %ecx
testw $_TIF_ALLWORK_MASK, %cx # current->work
jne syscall_exit_work
@ -514,12 +510,8 @@ END(work_pending)
syscall_trace_entry:
movl $-ENOSYS,PT_EAX(%esp)
movl %esp, %eax
xorl %edx,%edx
call do_syscall_trace
cmpl $0, %eax
jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
# so must skip actual syscall
movl PT_ORIG_EAX(%esp), %eax
call syscall_trace_enter
/* What it returned is what we'll actually use. */
cmpl $(nr_syscalls), %eax
jnae syscall_call
jmp syscall_exit
@ -528,14 +520,13 @@ END(syscall_trace_entry)
# perform syscall exit tracing
ALIGN
syscall_exit_work:
testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
testb $_TIF_WORK_SYSCALL_EXIT, %cl
jz work_pending
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
# schedule() instead
movl %esp, %eax
movl $1, %edx
call do_syscall_trace
call syscall_trace_leave
jmp resume_userspace
END(syscall_exit_work)
CFI_ENDPROC
@ -1024,6 +1015,7 @@ ENDPROC(kernel_thread_helper)
ENTRY(xen_sysenter_target)
RING0_INT_FRAME
addl $5*4, %esp /* remove xen-provided frame */
CFI_ADJUST_CFA_OFFSET -5*4
jmp sysenter_past_esp
CFI_ENDPROC

View File

@ -349,8 +349,7 @@ ENTRY(system_call_after_swapgs)
movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
GET_THREAD_INFO(%rcx)
testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
TI_flags(%rcx)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
jnz tracesys
cmpq $__NR_syscall_max,%rax
ja badsys
@ -430,7 +429,12 @@ tracesys:
FIXUP_TOP_OF_STACK %rdi
movq %rsp,%rdi
call syscall_trace_enter
LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
/*
* Reload arg registers from stack in case ptrace changed them.
* We don't reload %rax because syscall_trace_enter() returned
* the value it wants us to use in the table lookup.
*/
LOAD_ARGS ARGOFFSET, 1
RESTORE_REST
cmpq $__NR_syscall_max,%rax
ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
@ -483,7 +487,7 @@ int_very_careful:
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_REST
/* Check for syscall exit trace */
testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
testl $_TIF_WORK_SYSCALL_EXIT,%edx
jz int_signal
pushq %rdi
CFI_ADJUST_CFA_OFFSET 8
@ -491,7 +495,7 @@ int_very_careful:
call syscall_trace_leave
popq %rdi
CFI_ADJUST_CFA_OFFSET -8
andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
jmp int_restore_rest
int_signal:
@ -1189,6 +1193,7 @@ END(device_not_available)
/* runs on exception stack */
KPROBE_ENTRY(debug)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq $0
CFI_ADJUST_CFA_OFFSET 8
paranoidentry do_debug, DEBUG_STACK
@ -1198,6 +1203,7 @@ KPROBE_END(debug)
/* runs on exception stack */
KPROBE_ENTRY(nmi)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq $-1
CFI_ADJUST_CFA_OFFSET 8
paranoidentry do_nmi, 0, 0
@ -1211,6 +1217,7 @@ KPROBE_END(nmi)
KPROBE_ENTRY(int3)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq $0
CFI_ADJUST_CFA_OFFSET 8
paranoidentry do_int3, DEBUG_STACK
@ -1237,6 +1244,7 @@ END(coprocessor_segment_overrun)
/* runs on exception stack */
ENTRY(double_fault)
XCPT_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
paranoidentry do_double_fault
jmp paranoid_exit1
CFI_ENDPROC
@ -1253,6 +1261,7 @@ END(segment_not_present)
/* runs on exception stack */
ENTRY(stack_segment)
XCPT_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
paranoidentry do_stack_segment
jmp paranoid_exit1
CFI_ENDPROC
@ -1278,6 +1287,7 @@ END(spurious_interrupt_bug)
/* runs on exception stack */
ENTRY(machine_check)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq $0
CFI_ADJUST_CFA_OFFSET 8
paranoidentry do_machine_check
@ -1312,3 +1322,103 @@ KPROBE_ENTRY(ignore_sysret)
sysret
CFI_ENDPROC
ENDPROC(ignore_sysret)
#ifdef CONFIG_XEN
ENTRY(xen_hypervisor_callback)
zeroentry xen_do_hypervisor_callback
END(xen_hypervisor_callback)
/*
# A note on the "critical region" in our callback handler.
# We want to avoid stacking callback handlers due to events occurring
# during handling of the last event. To do this, we keep events disabled
# until we've done all processing. HOWEVER, we must enable events before
# popping the stack frame (can't be done atomically) and so it would still
# be possible to get enough handler activations to overflow the stack.
# Although unlikely, bugs of that kind are hard to track down, so we'd
# like to avoid the possibility.
# So, on entry to the handler we detect whether we interrupted an
# existing activation in its critical region -- if so, we pop the current
# activation and restart the handler using the previous one.
*/
ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
CFI_STARTPROC
/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
see the correct pointer to the pt_regs */
movq %rdi, %rsp # we don't return, adjust the stack frame
CFI_ENDPROC
CFI_DEFAULT_STACK
11: incl %gs:pda_irqcount
movq %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
cmovzq %gs:pda_irqstackptr,%rsp
pushq %rbp # backlink for old unwinder
call xen_evtchn_do_upcall
popq %rsp
CFI_DEF_CFA_REGISTER rsp
decl %gs:pda_irqcount
jmp error_exit
CFI_ENDPROC
END(do_hypervisor_callback)
/*
# Hypervisor uses this for application faults while it executes.
# We get here for two reasons:
# 1. Fault while reloading DS, ES, FS or GS
# 2. Fault while executing IRET
# Category 1 we do not need to fix up as Xen has already reloaded all segment
# registers that could be reloaded and zeroed the others.
# Category 2 we fix up by killing the current process. We cannot use the
# normal Linux return path in this case because if we use the IRET hypercall
# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
# We distinguish between categories by comparing each saved segment register
# with its current contents: any discrepancy means we in category 1.
*/
ENTRY(xen_failsafe_callback)
framesz = (RIP-0x30) /* workaround buggy gas */
_frame framesz
CFI_REL_OFFSET rcx, 0
CFI_REL_OFFSET r11, 8
movw %ds,%cx
cmpw %cx,0x10(%rsp)
CFI_REMEMBER_STATE
jne 1f
movw %es,%cx
cmpw %cx,0x18(%rsp)
jne 1f
movw %fs,%cx
cmpw %cx,0x20(%rsp)
jne 1f
movw %gs,%cx
cmpw %cx,0x28(%rsp)
jne 1f
/* All segments match their saved values => Category 2 (Bad IRET). */
movq (%rsp),%rcx
CFI_RESTORE rcx
movq 8(%rsp),%r11
CFI_RESTORE r11
addq $0x30,%rsp
CFI_ADJUST_CFA_OFFSET -0x30
pushq $0
CFI_ADJUST_CFA_OFFSET 8
pushq %r11
CFI_ADJUST_CFA_OFFSET 8
pushq %rcx
CFI_ADJUST_CFA_OFFSET 8
jmp general_protection
CFI_RESTORE_STATE
1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
movq (%rsp),%rcx
CFI_RESTORE rcx
movq 8(%rsp),%r11
CFI_RESTORE r11
addq $0x30,%rsp
CFI_ADJUST_CFA_OFFSET -0x30
pushq $0
CFI_ADJUST_CFA_OFFSET 8
SAVE_ALL
jmp error_exit
CFI_ENDPROC
END(xen_failsafe_callback)
#endif /* CONFIG_XEN */

View File

@ -24,6 +24,7 @@
#include <asm/pgtable.h>
#include <asm/uv/uv_mmrs.h>
#include <asm/uv/uv_hub.h>
#include <asm/uv/bios.h>
DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
@ -40,6 +41,9 @@ EXPORT_SYMBOL_GPL(uv_cpu_to_blade);
short uv_possible_blades;
EXPORT_SYMBOL_GPL(uv_possible_blades);
unsigned long sn_rtc_cycles_per_second;
EXPORT_SYMBOL(sn_rtc_cycles_per_second);
/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
static cpumask_t uv_target_cpus(void)
@ -272,6 +276,23 @@ static __init void map_mmioh_high(int max_pnode)
map_high("MMIOH", mmioh.s.base, shift, map_uc);
}
static __init void uv_rtc_init(void)
{
long status, ticks_per_sec, drift;
status =
x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec,
&drift);
if (status != 0 || ticks_per_sec < 100000) {
printk(KERN_WARNING
"unable to determine platform RTC clock frequency, "
"guessing.\n");
/* BIOS gives wrong value for clock freq. so guess */
sn_rtc_cycles_per_second = 1000000000000UL / 30000UL;
} else
sn_rtc_cycles_per_second = ticks_per_sec;
}
static __init void uv_system_init(void)
{
union uvh_si_addr_map_config_u m_n_config;
@ -326,6 +347,8 @@ static __init void uv_system_init(void)
gnode_upper = (((unsigned long)node_id.s.node_id) &
~((1 << n_val) - 1)) << m_val;
uv_rtc_init();
for_each_present_cpu(cpu) {
nid = cpu_to_node(cpu);
pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu));

View File

@ -39,6 +39,13 @@ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
#endif
void __init x86_64_init_pda(void)
{
_cpu_pda = __cpu_pda;
cpu_pda(0) = &_boot_cpu_pda;
pda_init(0);
}
static void __init zap_identity_mappings(void)
{
pgd_t *pgd = pgd_offset_k(0UL);
@ -102,9 +109,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
early_printk("Kernel alive\n");
_cpu_pda = __cpu_pda;
cpu_pda(0) = &_boot_cpu_pda;
pda_init(0);
x86_64_init_pda();
early_printk("Kernel really alive\n");

View File

@ -407,6 +407,7 @@ ENTRY(phys_base)
/* This must match the first entry in level2_kernel_pgt */
.quad 0x0000000000000000
#include "../../x86/xen/xen-head.S"
.section .bss, "aw", @nobits
.align L1_CACHE_BYTES

View File

@ -756,7 +756,7 @@ void send_IPI_self(int vector)
/*
* Send the IPI. The write to APIC_ICR fires this off.
*/
apic_write_around(APIC_ICR, cfg);
apic_write(APIC_ICR, cfg);
}
#endif /* !CONFIG_SMP */
@ -2030,7 +2030,7 @@ static void mask_lapic_irq(unsigned int irq)
unsigned long v;
v = apic_read(APIC_LVT0);
apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
}
static void unmask_lapic_irq(unsigned int irq)
@ -2038,7 +2038,7 @@ static void unmask_lapic_irq(unsigned int irq)
unsigned long v;
v = apic_read(APIC_LVT0);
apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
}
static struct irq_chip lapic_chip __read_mostly = {
@ -2168,7 +2168,7 @@ static inline void __init check_timer(void)
* The AEOI mode will finish them in the 8259A
* automatically.
*/
apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
init_8259A(1);
timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
@ -2177,8 +2177,9 @@ static inline void __init check_timer(void)
pin2 = ioapic_i8259.pin;
apic2 = ioapic_i8259.apic;
printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
vector, apic1, pin1, apic2, pin2);
apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
"apic1=%d pin1=%d apic2=%d pin2=%d\n",
vector, apic1, pin1, apic2, pin2);
/*
* Some BIOS writers are clueless and report the ExtINTA
@ -2216,12 +2217,13 @@ static inline void __init check_timer(void)
}
clear_IO_APIC_pin(apic1, pin1);
if (!no_pin1)
printk(KERN_ERR "..MP-BIOS bug: "
"8254 timer not connected to IO-APIC\n");
apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
"8254 timer not connected to IO-APIC\n");
printk(KERN_INFO "...trying to set up timer (IRQ0) "
"through the 8259A ... ");
printk("\n..... (found pin %d) ...", pin2);
apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
"(IRQ0) through the 8259A ...\n");
apic_printk(APIC_QUIET, KERN_INFO
"..... (found apic %d pin %d) ...\n", apic2, pin2);
/*
* legacy devices should be connected to IO APIC #0
*/
@ -2230,7 +2232,7 @@ static inline void __init check_timer(void)
unmask_IO_APIC_irq(0);
enable_8259A_irq(0);
if (timer_irq_works()) {
printk("works.\n");
apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
timer_through_8259 = 1;
if (nmi_watchdog == NMI_IO_APIC) {
disable_8259A_irq(0);
@ -2244,44 +2246,47 @@ static inline void __init check_timer(void)
*/
disable_8259A_irq(0);
clear_IO_APIC_pin(apic2, pin2);
printk(" failed.\n");
apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
}
if (nmi_watchdog == NMI_IO_APIC) {
printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
"through the IO-APIC - disabling NMI Watchdog!\n");
nmi_watchdog = NMI_NONE;
}
timer_ack = 0;
printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
apic_printk(APIC_QUIET, KERN_INFO
"...trying to set up timer as Virtual Wire IRQ...\n");
lapic_register_intr(0, vector);
apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
enable_8259A_irq(0);
if (timer_irq_works()) {
printk(" works.\n");
apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
disable_8259A_irq(0);
apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
printk(" failed.\n");
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
apic_printk(APIC_QUIET, KERN_INFO
"...trying to set up timer as ExtINT IRQ...\n");
init_8259A(0);
make_8259A_irq(0);
apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
apic_write(APIC_LVT0, APIC_DM_EXTINT);
unlock_ExtINT_logic();
if (timer_irq_works()) {
printk(" works.\n");
apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
printk(" failed :(.\n");
apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
"report. Then try booting with the 'noapic' option");
"report. Then try booting with the 'noapic' option.\n");
out:
local_irq_restore(flags);
}

View File

@ -45,6 +45,7 @@
#include <asm/proto.h>
#include <asm/acpi.h>
#include <asm/dma.h>
#include <asm/i8259.h>
#include <asm/nmi.h>
#include <asm/msidef.h>
#include <asm/hypertransport.h>
@ -1696,8 +1697,9 @@ static inline void __init check_timer(void)
pin2 = ioapic_i8259.pin;
apic2 = ioapic_i8259.apic;
apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
cfg->vector, apic1, pin1, apic2, pin2);
apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
"apic1=%d pin1=%d apic2=%d pin2=%d\n",
cfg->vector, apic1, pin1, apic2, pin2);
/*
* Some BIOS writers are clueless and report the ExtINTA
@ -1735,14 +1737,13 @@ static inline void __init check_timer(void)
}
clear_IO_APIC_pin(apic1, pin1);
if (!no_pin1)
apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: "
apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
"8254 timer not connected to IO-APIC\n");
apic_printk(APIC_VERBOSE,KERN_INFO
"...trying to set up timer (IRQ0) "
"through the 8259A ... ");
apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
apic2, pin2);
apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
"(IRQ0) through the 8259A ...\n");
apic_printk(APIC_QUIET, KERN_INFO
"..... (found apic %d pin %d) ...\n", apic2, pin2);
/*
* legacy devices should be connected to IO APIC #0
*/
@ -1751,7 +1752,7 @@ static inline void __init check_timer(void)
unmask_IO_APIC_irq(0);
enable_8259A_irq(0);
if (timer_irq_works()) {
apic_printk(APIC_VERBOSE," works.\n");
apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
timer_through_8259 = 1;
if (nmi_watchdog == NMI_IO_APIC) {
disable_8259A_irq(0);
@ -1765,29 +1766,32 @@ static inline void __init check_timer(void)
*/
disable_8259A_irq(0);
clear_IO_APIC_pin(apic2, pin2);
apic_printk(APIC_VERBOSE," failed.\n");
apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
}
if (nmi_watchdog == NMI_IO_APIC) {
printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
"through the IO-APIC - disabling NMI Watchdog!\n");
nmi_watchdog = NMI_NONE;
}
apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
apic_printk(APIC_QUIET, KERN_INFO
"...trying to set up timer as Virtual Wire IRQ...\n");
lapic_register_intr(0);
apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
enable_8259A_irq(0);
if (timer_irq_works()) {
apic_printk(APIC_VERBOSE," works.\n");
apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
disable_8259A_irq(0);
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
apic_printk(APIC_VERBOSE," failed.\n");
apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
apic_printk(APIC_QUIET, KERN_INFO
"...trying to set up timer as ExtINT IRQ...\n");
init_8259A(0);
make_8259A_irq(0);
@ -1796,11 +1800,12 @@ static inline void __init check_timer(void)
unlock_ExtINT_logic();
if (timer_irq_works()) {
apic_printk(APIC_VERBOSE," works.\n");
apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
apic_printk(APIC_VERBOSE," failed :(.\n");
panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
"report. Then try booting with the 'noapic' option.\n");
out:
local_irq_restore(flags);
}

View File

@ -103,6 +103,9 @@ void __init io_delay_init(void)
static int __init io_delay_param(char *s)
{
if (!s)
return -EINVAL;
if (!strcmp(s, "0x80"))
io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;
else if (!strcmp(s, "0xed"))

View File

@ -70,7 +70,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector)
/*
* Send the IPI. The write to APIC_ICR fires this off.
*/
apic_write_around(APIC_ICR, cfg);
apic_write(APIC_ICR, cfg);
}
void send_IPI_self(int vector)
@ -98,7 +98,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
* prepare target chip field
*/
cfg = __prepare_ICR2(mask);
apic_write_around(APIC_ICR2, cfg);
apic_write(APIC_ICR2, cfg);
/*
* program the ICR
@ -108,7 +108,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
/*
* Send the IPI. The write to APIC_ICR fires this off.
*/
apic_write_around(APIC_ICR, cfg);
apic_write(APIC_ICR, cfg);
}
/*

View File

@ -83,11 +83,8 @@ union irq_ctx {
static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
static char softirq_stack[NR_CPUS * THREAD_SIZE]
__attribute__((__section__(".bss.page_aligned")));
static char hardirq_stack[NR_CPUS * THREAD_SIZE]
__attribute__((__section__(".bss.page_aligned")));
static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
static void call_on_stack(void *func, void *stack)
{

View File

@ -12,9 +12,13 @@
#include <linux/init.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <asm/setup.h>
struct dentry *arch_debugfs_dir;
EXPORT_SYMBOL(arch_debugfs_dir);
#ifdef CONFIG_DEBUG_BOOT_PARAMS
struct setup_data_node {
u64 paddr;
@ -209,6 +213,10 @@ static int __init arch_kdebugfs_init(void)
{
int error = 0;
arch_debugfs_dir = debugfs_create_dir("x86", NULL);
if (!arch_debugfs_dir)
return -ENOMEM;
#ifdef CONFIG_DEBUG_BOOT_PARAMS
error = boot_params_kdebugfs_init();
#endif

View File

@ -860,7 +860,6 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs)
resume_execution(cur, regs, kcb);
regs->flags |= kcb->kprobe_saved_flags;
trace_hardirqs_fixup_flags(regs->flags);
if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
kcb->kprobe_status = KPROBE_HIT_SSDONE;

View File

@ -150,7 +150,8 @@ int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
{
const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL;
const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
*para = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@ -160,6 +161,8 @@ int module_finalize(const Elf_Ehdr *hdr,
alt = s;
if (!strcmp(".smp_locks", secstrings + s->sh_name))
locks= s;
if (!strcmp(".parainstructions", secstrings + s->sh_name))
para = s;
}
if (alt) {
@ -175,6 +178,11 @@ int module_finalize(const Elf_Ehdr *hdr,
tseg, tseg + text->sh_size);
}
if (para) {
void *pseg = (void *)para->sh_addr;
apply_paravirt(pseg, pseg + para->sh_size);
}
return module_bug_finalize(hdr, sechdrs, me);
}

View File

@ -27,6 +27,7 @@
#include <asm/bios_ebda.h>
#include <asm/e820.h>
#include <asm/trampoline.h>
#include <asm/setup.h>
#include <mach_apic.h>
#ifdef CONFIG_X86_32
@ -48,76 +49,6 @@ static int __init mpf_checksum(unsigned char *mp, int len)
return sum & 0xFF;
}
#ifdef CONFIG_X86_NUMAQ
int found_numaq;
/*
* Have to match translation table entries to main table entries by counter
* hence the mpc_record variable .... can't see a less disgusting way of
* doing this ....
*/
struct mpc_config_translation {
unsigned char mpc_type;
unsigned char trans_len;
unsigned char trans_type;
unsigned char trans_quad;
unsigned char trans_global;
unsigned char trans_local;
unsigned short trans_reserved;
};
static int mpc_record;
static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
__cpuinitdata;
static inline int generate_logical_apicid(int quad, int phys_apicid)
{
return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
}
static inline int mpc_apic_id(struct mpc_config_processor *m,
struct mpc_config_translation *translation_record)
{
int quad = translation_record->trans_quad;
int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
m->mpc_apicid,
(m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
(m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
m->mpc_apicver, quad, logical_apicid);
return logical_apicid;
}
int mp_bus_id_to_node[MAX_MP_BUSSES];
int mp_bus_id_to_local[MAX_MP_BUSSES];
static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
struct mpc_config_translation *translation)
{
int quad = translation->trans_quad;
int local = translation->trans_local;
mp_bus_id_to_node[m->mpc_busid] = quad;
mp_bus_id_to_local[m->mpc_busid] = local;
printk(KERN_INFO "Bus #%d is %s (node %d)\n",
m->mpc_busid, name, quad);
}
int quad_local_to_mp_bus_id [NR_CPUS/4][4];
static void mpc_oem_pci_bus(struct mpc_config_bus *m,
struct mpc_config_translation *translation)
{
int quad = translation->trans_quad;
int local = translation->trans_local;
quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
}
#endif
static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
{
int apicid;
@ -127,14 +58,12 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
disabled_cpus++;
return;
}
#ifdef CONFIG_X86_NUMAQ
if (found_numaq)
apicid = mpc_apic_id(m, translation_table[mpc_record]);
if (x86_quirks->mpc_apic_id)
apicid = x86_quirks->mpc_apic_id(m);
else
apicid = m->mpc_apicid;
#else
apicid = m->mpc_apicid;
#endif
if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
bootup_cpu = " (Bootup-CPU)";
boot_cpu_physical_apicid = m->mpc_apicid;
@ -151,12 +80,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
memcpy(str, m->mpc_bustype, 6);
str[6] = 0;
#ifdef CONFIG_X86_NUMAQ
if (found_numaq)
mpc_oem_bus_info(m, str, translation_table[mpc_record]);
#else
printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
#endif
if (x86_quirks->mpc_oem_bus_info)
x86_quirks->mpc_oem_bus_info(m, str);
else
printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
#if MAX_MP_BUSSES < 256
if (m->mpc_busid >= MAX_MP_BUSSES) {
@ -173,10 +100,9 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
#endif
} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
#ifdef CONFIG_X86_NUMAQ
if (found_numaq)
mpc_oem_pci_bus(m, translation_table[mpc_record]);
#endif
if (x86_quirks->mpc_oem_pci_bus)
x86_quirks->mpc_oem_pci_bus(m);
clear_bit(m->mpc_busid, mp_bus_not_pci);
#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
@ -316,83 +242,6 @@ static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
}
#ifdef CONFIG_X86_NUMAQ
static void __init MP_translation_info(struct mpc_config_translation *m)
{
printk(KERN_INFO
"Translation: record %d, type %d, quad %d, global %d, local %d\n",
mpc_record, m->trans_type, m->trans_quad, m->trans_global,
m->trans_local);
if (mpc_record >= MAX_MPC_ENTRY)
printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
else
translation_table[mpc_record] = m; /* stash this for later */
if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
node_set_online(m->trans_quad);
}
/*
* Read/parse the MPC oem tables
*/
static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
unsigned short oemsize)
{
int count = sizeof(*oemtable); /* the header size */
unsigned char *oemptr = ((unsigned char *)oemtable) + count;
mpc_record = 0;
printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
oemtable);
if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
printk(KERN_WARNING
"SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
oemtable->oem_signature[0], oemtable->oem_signature[1],
oemtable->oem_signature[2], oemtable->oem_signature[3]);
return;
}
if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
return;
}
while (count < oemtable->oem_length) {
switch (*oemptr) {
case MP_TRANSLATION:
{
struct mpc_config_translation *m =
(struct mpc_config_translation *)oemptr;
MP_translation_info(m);
oemptr += sizeof(*m);
count += sizeof(*m);
++mpc_record;
break;
}
default:
{
printk(KERN_WARNING
"Unrecognised OEM table entry type! - %d\n",
(int)*oemptr);
return;
}
}
}
}
void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
char *productid)
{
if (strncmp(oem, "IBM NUMA", 8))
printk("Warning! Not a NUMA-Q system!\n");
else
found_numaq = 1;
if (mpc->mpc_oemptr)
smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
mpc->mpc_oemsize);
}
#endif /* CONFIG_X86_NUMAQ */
/*
* Read/parse the MPC
*/
@ -457,7 +306,6 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
} else
mps_oem_check(mpc, oem, str);
#endif
/* save the local APIC address, it might be non-default */
if (!acpi_lapic)
mp_lapic_addr = mpc->mpc_lapic;
@ -465,12 +313,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
if (early)
return 1;
if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
}
/*
* Now process the configuration blocks.
*/
#ifdef CONFIG_X86_NUMAQ
mpc_record = 0;
#endif
if (x86_quirks->mpc_record)
*x86_quirks->mpc_record = 0;
while (count < mpc->mpc_length) {
switch (*mpt) {
case MP_PROCESSOR:
@ -536,9 +389,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
count = mpc->mpc_length;
break;
}
#ifdef CONFIG_X86_NUMAQ
++mpc_record;
#endif
if (x86_quirks->mpc_record)
(*x86_quirks->mpc_record)++;
}
#ifdef CONFIG_X86_GENERICARCH
@ -725,12 +577,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
static struct intel_mp_floating *mpf_found;
/*
* Machine specific quirk for finding the SMP config before other setup
* activities destroy the table:
*/
int (*mach_get_smp_config_quirk)(unsigned int early);
/*
* Scan the memory blocks for an SMP configuration block.
*/
@ -738,8 +584,8 @@ static void __init __get_smp_config(unsigned int early)
{
struct intel_mp_floating *mpf = mpf_found;
if (mach_get_smp_config_quirk) {
if (mach_get_smp_config_quirk(early))
if (x86_quirks->mach_get_smp_config) {
if (x86_quirks->mach_get_smp_config(early))
return;
}
if (acpi_lapic && early)
@ -899,14 +745,12 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
return 0;
}
int (*mach_find_smp_config_quirk)(unsigned int reserve);
static void __init __find_smp_config(unsigned int reserve)
{
unsigned int address;
if (mach_find_smp_config_quirk) {
if (mach_find_smp_config_quirk(reserve))
if (x86_quirks->mach_find_smp_config) {
if (x86_quirks->mach_find_smp_config(reserve))
return;
}
/*

View File

@ -263,7 +263,7 @@ late_initcall(init_lapic_nmi_sysfs);
static void __acpi_nmi_enable(void *__unused)
{
apic_write_around(APIC_LVT0, APIC_DM_NMI);
apic_write(APIC_LVT0, APIC_DM_NMI);
}
/*
@ -277,7 +277,7 @@ void acpi_nmi_enable(void)
static void __acpi_nmi_disable(void *__unused)
{
apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
}
/*
@ -448,6 +448,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
#ifdef CONFIG_SYSCTL
static int __init setup_unknown_nmi_panic(char *str)
{
unknown_nmi_panic = 1;
return 1;
}
__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
{
unsigned char reason = get_nmi_reason();

View File

@ -33,6 +33,7 @@
#include <asm/processor.h>
#include <asm/mpspec.h>
#include <asm/e820.h>
#include <asm/setup.h>
#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
@ -71,27 +72,6 @@ static void __init smp_dump_qct(void)
}
}
static __init void early_check_numaq(void)
{
/*
* Find possible boot-time SMP configuration:
*/
early_find_smp_config();
/*
* get boot-time SMP configuration:
*/
if (smp_found_config)
early_get_smp_config();
}
int __init get_memcfg_numaq(void)
{
early_check_numaq();
if (!found_numaq)
return 0;
smp_dump_qct();
return 1;
}
void __init numaq_tsc_disable(void)
{
@ -103,3 +83,198 @@ void __init numaq_tsc_disable(void)
setup_clear_cpu_cap(X86_FEATURE_TSC);
}
}
static int __init numaq_pre_time_init(void)
{
numaq_tsc_disable();
return 0;
}
int found_numaq;
/*
* Have to match translation table entries to main table entries by counter
* hence the mpc_record variable .... can't see a less disgusting way of
* doing this ....
*/
struct mpc_config_translation {
unsigned char mpc_type;
unsigned char trans_len;
unsigned char trans_type;
unsigned char trans_quad;
unsigned char trans_global;
unsigned char trans_local;
unsigned short trans_reserved;
};
/* x86_quirks member */
static int mpc_record;
static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
__cpuinitdata;
static inline int generate_logical_apicid(int quad, int phys_apicid)
{
return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
}
/* x86_quirks member */
static int mpc_apic_id(struct mpc_config_processor *m)
{
int quad = translation_table[mpc_record]->trans_quad;
int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
m->mpc_apicid,
(m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
(m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
m->mpc_apicver, quad, logical_apicid);
return logical_apicid;
}
int mp_bus_id_to_node[MAX_MP_BUSSES];
int mp_bus_id_to_local[MAX_MP_BUSSES];
/* x86_quirks member */
static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name)
{
int quad = translation_table[mpc_record]->trans_quad;
int local = translation_table[mpc_record]->trans_local;
mp_bus_id_to_node[m->mpc_busid] = quad;
mp_bus_id_to_local[m->mpc_busid] = local;
printk(KERN_INFO "Bus #%d is %s (node %d)\n",
m->mpc_busid, name, quad);
}
int quad_local_to_mp_bus_id [NR_CPUS/4][4];
/* x86_quirks member */
static void mpc_oem_pci_bus(struct mpc_config_bus *m)
{
int quad = translation_table[mpc_record]->trans_quad;
int local = translation_table[mpc_record]->trans_local;
quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
}
static void __init MP_translation_info(struct mpc_config_translation *m)
{
printk(KERN_INFO
"Translation: record %d, type %d, quad %d, global %d, local %d\n",
mpc_record, m->trans_type, m->trans_quad, m->trans_global,
m->trans_local);
if (mpc_record >= MAX_MPC_ENTRY)
printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
else
translation_table[mpc_record] = m; /* stash this for later */
if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
node_set_online(m->trans_quad);
}
static int __init mpf_checksum(unsigned char *mp, int len)
{
int sum = 0;
while (len--)
sum += *mp++;
return sum & 0xFF;
}
/*
* Read/parse the MPC oem tables
*/
static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
unsigned short oemsize)
{
int count = sizeof(*oemtable); /* the header size */
unsigned char *oemptr = ((unsigned char *)oemtable) + count;
mpc_record = 0;
printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
oemtable);
if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
printk(KERN_WARNING
"SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
oemtable->oem_signature[0], oemtable->oem_signature[1],
oemtable->oem_signature[2], oemtable->oem_signature[3]);
return;
}
if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
return;
}
while (count < oemtable->oem_length) {
switch (*oemptr) {
case MP_TRANSLATION:
{
struct mpc_config_translation *m =
(struct mpc_config_translation *)oemptr;
MP_translation_info(m);
oemptr += sizeof(*m);
count += sizeof(*m);
++mpc_record;
break;
}
default:
{
printk(KERN_WARNING
"Unrecognised OEM table entry type! - %d\n",
(int)*oemptr);
return;
}
}
}
}
static struct x86_quirks numaq_x86_quirks __initdata = {
.arch_pre_time_init = numaq_pre_time_init,
.arch_time_init = NULL,
.arch_pre_intr_init = NULL,
.arch_memory_setup = NULL,
.arch_intr_init = NULL,
.arch_trap_init = NULL,
.mach_get_smp_config = NULL,
.mach_find_smp_config = NULL,
.mpc_record = &mpc_record,
.mpc_apic_id = mpc_apic_id,
.mpc_oem_bus_info = mpc_oem_bus_info,
.mpc_oem_pci_bus = mpc_oem_pci_bus,
.smp_read_mpc_oem = smp_read_mpc_oem,
};
void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
char *productid)
{
if (strncmp(oem, "IBM NUMA", 8))
printk("Warning! Not a NUMA-Q system!\n");
else
found_numaq = 1;
}
static __init void early_check_numaq(void)
{
/*
* Find possible boot-time SMP configuration:
*/
early_find_smp_config();
/*
* get boot-time SMP configuration:
*/
if (smp_found_config)
early_get_smp_config();
if (found_numaq)
x86_quirks = &numaq_x86_quirks;
}
int __init get_memcfg_numaq(void)
{
early_check_numaq();
if (!found_numaq)
return 0;
smp_dump_qct();
return 1;
}

View File

@ -29,6 +29,7 @@
#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/arch_hooks.h>
#include <asm/pgtable.h>
#include <asm/time.h>
#include <asm/pgalloc.h>
#include <asm/irq.h>
@ -123,6 +124,7 @@ static void *get_call_destination(u8 type)
.pv_irq_ops = pv_irq_ops,
.pv_apic_ops = pv_apic_ops,
.pv_mmu_ops = pv_mmu_ops,
.pv_lock_ops = pv_lock_ops,
};
return *((void **)&tmpl + type);
}
@ -266,6 +268,17 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
return __get_cpu_var(paravirt_lazy_mode);
}
void __init paravirt_use_bytelocks(void)
{
#ifdef CONFIG_SMP
pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
pv_lock_ops.spin_lock = __byte_spin_lock;
pv_lock_ops.spin_trylock = __byte_spin_trylock;
pv_lock_ops.spin_unlock = __byte_spin_unlock;
#endif
}
struct pv_info pv_info = {
.name = "bare hardware",
.paravirt_enabled = 0,
@ -361,7 +374,6 @@ struct pv_cpu_ops pv_cpu_ops = {
struct pv_apic_ops pv_apic_ops = {
#ifdef CONFIG_X86_LOCAL_APIC
.apic_write = native_apic_write,
.apic_write_atomic = native_apic_write_atomic,
.apic_read = native_apic_read,
.setup_boot_clock = setup_boot_APIC_clock,
.setup_secondary_clock = setup_secondary_APIC_clock,
@ -373,6 +385,9 @@ struct pv_mmu_ops pv_mmu_ops = {
#ifndef CONFIG_X86_64
.pagetable_setup_start = native_pagetable_setup_start,
.pagetable_setup_done = native_pagetable_setup_done,
#else
.pagetable_setup_start = paravirt_nop,
.pagetable_setup_done = paravirt_nop,
#endif
.read_cr2 = native_read_cr2,
@ -446,6 +461,18 @@ struct pv_mmu_ops pv_mmu_ops = {
.set_fixmap = native_set_fixmap,
};
struct pv_lock_ops pv_lock_ops = {
#ifdef CONFIG_SMP
.spin_is_locked = __ticket_spin_is_locked,
.spin_is_contended = __ticket_spin_is_contended,
.spin_lock = __ticket_spin_lock,
.spin_trylock = __ticket_spin_trylock,
.spin_unlock = __ticket_spin_unlock,
#endif
};
EXPORT_SYMBOL_GPL(pv_lock_ops);
EXPORT_SYMBOL_GPL(pv_time_ops);
EXPORT_SYMBOL (pv_cpu_ops);
EXPORT_SYMBOL (pv_mmu_ops);

View File

@ -36,7 +36,7 @@
#include <linux/delay.h>
#include <linux/scatterlist.h>
#include <linux/iommu-helper.h>
#include <asm/gart.h>
#include <asm/iommu.h>
#include <asm/calgary.h>
#include <asm/tce.h>
#include <asm/pci-direct.h>

View File

@ -5,12 +5,11 @@
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/gart.h>
#include <asm/iommu.h>
#include <asm/calgary.h>
#include <asm/amd_iommu.h>
int forbid_dac __read_mostly;
EXPORT_SYMBOL(forbid_dac);
static int forbid_dac __read_mostly;
const struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);
@ -114,21 +113,15 @@ void __init pci_iommu_alloc(void)
* The order of these functions is important for
* fall-back/fail-over reasons
*/
#ifdef CONFIG_GART_IOMMU
gart_iommu_hole_init();
#endif
#ifdef CONFIG_CALGARY_IOMMU
detect_calgary();
#endif
detect_intel_iommu();
amd_iommu_detect();
#ifdef CONFIG_SWIOTLB
pci_swiotlb_init();
#endif
}
#endif
@ -184,9 +177,7 @@ static __init int iommu_setup(char *p)
swiotlb = 1;
#endif
#ifdef CONFIG_GART_IOMMU
gart_parse_options(p);
#endif
#ifdef CONFIG_CALGARY_IOMMU
if (!strncmp(p, "calgary", 7))
@ -500,17 +491,13 @@ EXPORT_SYMBOL(dma_free_coherent);
static int __init pci_iommu_init(void)
{
#ifdef CONFIG_CALGARY_IOMMU
calgary_iommu_init();
#endif
intel_iommu_init();
amd_iommu_init();
#ifdef CONFIG_GART_IOMMU
gart_iommu_init();
#endif
no_iommu_init();
return 0;

View File

@ -32,6 +32,7 @@
#include <asm/mtrr.h>
#include <asm/pgtable.h>
#include <asm/proto.h>
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/cacheflush.h>
#include <asm/swiotlb.h>

View File

@ -7,7 +7,7 @@
#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>
#include <asm/gart.h>
#include <asm/iommu.h>
#include <asm/processor.h>
#include <asm/dma.h>

View File

@ -5,7 +5,7 @@
#include <linux/module.h>
#include <linux/dma-mapping.h>
#include <asm/gart.h>
#include <asm/iommu.h>
#include <asm/swiotlb.h>
#include <asm/dma.h>

View File

@ -15,6 +15,7 @@ unsigned long idle_nomwait;
EXPORT_SYMBOL(idle_nomwait);
struct kmem_cache *task_xstate_cachep;
static int force_mwait __cpuinitdata;
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
@ -199,6 +200,7 @@ static void poll_idle(void)
*
* idle=mwait overrides this decision and forces the usage of mwait.
*/
static int __cpuinitdata force_mwait;
#define MWAIT_INFO 0x05
#define MWAIT_ECX_EXTENDED_INFO 0x01
@ -326,6 +328,9 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
static int __init idle_setup(char *str)
{
if (!str)
return -EINVAL;
if (!strcmp(str, "poll")) {
printk("using polling idle threads.\n");
pm_idle = poll_idle;

View File

@ -537,8 +537,8 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
struct thread_struct *prev = &prev_p->thread;
struct thread_struct *next = &next_p->thread;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(init_tss, cpu);
unsigned fsindex, gsindex;
@ -586,35 +586,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Switch FS and GS.
*
* Segment register != 0 always requires a reload. Also
* reload when it has changed. When prev process used 64bit
* base always reload to avoid an information leak.
*/
{
/* segment register != 0 always requires a reload.
also reload when it has changed.
when prev process used 64bit base always reload
to avoid an information leak. */
if (unlikely(fsindex | next->fsindex | prev->fs)) {
loadsegment(fs, next->fsindex);
/* check if the user used a selector != 0
* if yes clear 64bit base, since overloaded base
* is always mapped to the Null selector
*/
if (fsindex)
if (unlikely(fsindex | next->fsindex | prev->fs)) {
loadsegment(fs, next->fsindex);
/*
* Check if the user used a selector != 0; if yes
* clear 64bit base, since overloaded base is always
* mapped to the Null selector
*/
if (fsindex)
prev->fs = 0;
}
/* when next process has a 64bit base use it */
if (next->fs)
wrmsrl(MSR_FS_BASE, next->fs);
prev->fsindex = fsindex;
if (unlikely(gsindex | next->gsindex | prev->gs)) {
load_gs_index(next->gsindex);
if (gsindex)
prev->gs = 0;
}
if (next->gs)
wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
prev->gsindex = gsindex;
}
/* when next process has a 64bit base use it */
if (next->fs)
wrmsrl(MSR_FS_BASE, next->fs);
prev->fsindex = fsindex;
if (unlikely(gsindex | next->gsindex | prev->gs)) {
load_gs_index(next->gsindex);
if (gsindex)
prev->gs = 0;
}
if (next->gs)
wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
prev->gsindex = gsindex;
/* Must be after DS reload */
unlazy_fpu(prev_p);
@ -627,7 +626,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
write_pda(pcurrent, next_p);
write_pda(kernelstack,
(unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
write_pda(stack_canary, next_p->stack_canary);
/*

View File

@ -1357,8 +1357,6 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
#endif
}
#ifdef CONFIG_X86_32
void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
{
struct siginfo info;
@ -1377,89 +1375,10 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
force_sig_info(SIGTRAP, &info, tsk);
}
/* notification of system call entry/exit
* - triggered by current->work.syscall_trace
*/
int do_syscall_trace(struct pt_regs *regs, int entryexit)
{
int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
/*
* With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
* interception
*/
int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
int ret = 0;
/* do the secure computing check first */
if (!entryexit)
secure_computing(regs->orig_ax);
if (unlikely(current->audit_context)) {
if (entryexit)
audit_syscall_exit(AUDITSC_RESULT(regs->ax),
regs->ax);
/* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
* on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
* not used, entry.S will call us only on syscall exit, not
* entry; so when TIF_SYSCALL_AUDIT is used we must avoid
* calling send_sigtrap() on syscall entry.
*
* Note that when PTRACE_SYSEMU_SINGLESTEP is used,
* is_singlestep is false, despite his name, so we will still do
* the correct thing.
*/
else if (is_singlestep)
goto out;
}
if (!(current->ptrace & PT_PTRACED))
goto out;
/* If a process stops on the 1st tracepoint with SYSCALL_TRACE
* and then is resumed with SYSEMU_SINGLESTEP, it will come in
* here. We have to check this and return */
if (is_sysemu && entryexit)
return 0;
/* Fake a debug trap */
if (is_singlestep)
send_sigtrap(current, regs, 0);
if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
goto out;
/* the 0x80 provides a way for the tracing parent to distinguish
between a syscall stop and SIGTRAP delivery */
/* Note that the debugger could change the result of test_thread_flag!*/
ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
/*
* this isn't the same as continuing with a signal, but it will do
* for normal use. strace only continues with a signal if the
* stopping signal is not SIGTRAP. -brl
*/
if (current->exit_code) {
send_sig(current->exit_code, current, 1);
current->exit_code = 0;
}
ret = is_sysemu;
out:
if (unlikely(current->audit_context) && !entryexit)
audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax,
regs->bx, regs->cx, regs->dx, regs->si);
if (ret == 0)
return 0;
regs->orig_ax = -1; /* force skip of syscall restarting */
if (unlikely(current->audit_context))
audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
return 1;
}
#else /* CONFIG_X86_64 */
static void syscall_trace(struct pt_regs *regs)
{
if (!(current->ptrace & PT_PTRACED))
return;
#if 0
printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
@ -1481,39 +1400,81 @@ static void syscall_trace(struct pt_regs *regs)
}
}
asmlinkage void syscall_trace_enter(struct pt_regs *regs)
#ifdef CONFIG_X86_32
# define IS_IA32 1
#elif defined CONFIG_IA32_EMULATION
# define IS_IA32 test_thread_flag(TIF_IA32)
#else
# define IS_IA32 0
#endif
/*
* We must return the syscall number to actually look up in the table.
* This can be -1L to skip running any syscall at all.
*/
asmregparm long syscall_trace_enter(struct pt_regs *regs)
{
long ret = 0;
/*
* If we stepped into a sysenter/syscall insn, it trapped in
* kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
* If user-mode had set TF itself, then it's still clear from
* do_debug() and we need to set it again to restore the user
* state. If we entered on the slow path, TF was already set.
*/
if (test_thread_flag(TIF_SINGLESTEP))
regs->flags |= X86_EFLAGS_TF;
/* do the secure computing check first */
secure_computing(regs->orig_ax);
if (test_thread_flag(TIF_SYSCALL_TRACE)
&& (current->ptrace & PT_PTRACED))
if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
ret = -1L;
if (ret || test_thread_flag(TIF_SYSCALL_TRACE))
syscall_trace(regs);
if (unlikely(current->audit_context)) {
if (test_thread_flag(TIF_IA32)) {
if (IS_IA32)
audit_syscall_entry(AUDIT_ARCH_I386,
regs->orig_ax,
regs->bx, regs->cx,
regs->dx, regs->si);
} else {
#ifdef CONFIG_X86_64
else
audit_syscall_entry(AUDIT_ARCH_X86_64,
regs->orig_ax,
regs->di, regs->si,
regs->dx, regs->r10);
}
#endif
}
return ret ?: regs->orig_ax;
}
asmlinkage void syscall_trace_leave(struct pt_regs *regs)
asmregparm void syscall_trace_leave(struct pt_regs *regs)
{
if (unlikely(current->audit_context))
audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
if ((test_thread_flag(TIF_SYSCALL_TRACE)
|| test_thread_flag(TIF_SINGLESTEP))
&& (current->ptrace & PT_PTRACED))
if (test_thread_flag(TIF_SYSCALL_TRACE))
syscall_trace(regs);
}
#endif /* CONFIG_X86_32 */
/*
* If TIF_SYSCALL_EMU is set, we only get here because of
* TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
* We already reported this syscall instruction in
* syscall_trace_enter(), so don't do any more now.
*/
if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
return;
/*
* If we are single-stepping, synthesize a trap to follow the
* system call instruction.
*/
if (test_thread_flag(TIF_SINGLESTEP) &&
(current->ptrace & PT_PTRACED))
send_sigtrap(current, regs, 0);
}

View File

@ -177,6 +177,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
},
},
{ /* Handle problems with rebooting on Dell T5400's */
.callback = set_bios_reboot,
.ident = "Dell Precision T5400",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
},
},
{ /* Handle problems with rebooting on HP laptops */
.callback = set_bios_reboot,
.ident = "HP Compaq Laptop",

View File

@ -57,12 +57,8 @@
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/delay.h>
#include <linux/highmem.h>
#include <linux/kallsyms.h>
#include <linux/edd.h>
#include <linux/iscsi_ibft.h>
#include <linux/kexec.h>
#include <linux/cpufreq.h>
#include <linux/dma-mapping.h>
#include <linux/ctype.h>
@ -96,7 +92,7 @@
#include <asm/smp.h>
#include <asm/desc.h>
#include <asm/dma.h>
#include <asm/gart.h>
#include <asm/iommu.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
@ -104,7 +100,6 @@
#include <asm/paravirt.h>
#include <asm/percpu.h>
#include <asm/sections.h>
#include <asm/topology.h>
#include <asm/apicdef.h>
#ifdef CONFIG_X86_64
@ -579,6 +574,10 @@ static int __init setup_elfcorehdr(char *arg)
early_param("elfcorehdr", setup_elfcorehdr);
#endif
static struct x86_quirks default_x86_quirks __initdata;
struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@ -824,7 +823,10 @@ void __init setup_arch(char **cmdline_p)
vmi_init();
#endif
paravirt_pagetable_setup_start(swapper_pg_dir);
paging_init();
paravirt_pagetable_setup_done(swapper_pg_dir);
paravirt_post_allocator_init();
#ifdef CONFIG_X86_64
map_vsyscall();
@ -854,14 +856,6 @@ void __init setup_arch(char **cmdline_p)
init_cpu_to_node();
#endif
#ifdef CONFIG_X86_NUMAQ
/*
* need to check online nodes num, call it
* here before time_init/tsc_init
*/
numaq_tsc_disable();
#endif
init_apic_mappings();
ioapic_init_mappings();

View File

@ -212,7 +212,7 @@ asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
badframe:
if (show_unhandled_signals && printk_ratelimit()) {
printk(KERN_INFO "%s%s[%d] bad frame in sigreturn frame:"
printk("%s%s[%d] bad frame in sigreturn frame:"
"%p ip:%lx sp:%lx oeax:%lx",
task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
current->comm, task_pid_nr(current), frame, regs->ip,
@ -657,12 +657,6 @@ static void do_signal(struct pt_regs *regs)
void
do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
/* Pending single-step? */
if (thread_info_flags & _TIF_SINGLESTEP) {
regs->flags |= X86_EFLAGS_TF;
clear_thread_flag(TIF_SINGLESTEP);
}
/* deal with pending signal delivery */
if (thread_info_flags & _TIF_SIGPENDING)
do_signal(regs);

View File

@ -487,12 +487,6 @@ static void do_signal(struct pt_regs *regs)
void do_notify_resume(struct pt_regs *regs, void *unused,
__u32 thread_info_flags)
{
/* Pending single-step? */
if (thread_info_flags & _TIF_SINGLESTEP) {
regs->flags |= X86_EFLAGS_TF;
clear_thread_flag(TIF_SINGLESTEP);
}
#ifdef CONFIG_X86_MCE
/* notify userspace of pending MCEs */
if (thread_info_flags & _TIF_MCE_NOTIFY)

View File

@ -546,8 +546,8 @@ static inline void __inquire_remote_apic(int apicid)
printk(KERN_CONT
"a previous APIC delivery may have failed\n");
apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
timeout = 0;
do {
@ -579,11 +579,11 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
int maxlvt;
/* Target chip */
apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
/* Boot on the stack */
/* Kick the second */
apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
Dprintk("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
@ -592,14 +592,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
* Give the other CPU some time to accept the IPI.
*/
udelay(200);
/*
* Due to the Pentium erratum 3AP.
*/
maxlvt = lapic_get_maxlvt();
if (maxlvt > 3) {
apic_read_around(APIC_SPIV);
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
}
accept_status = (apic_read(APIC_ESR) & 0xEF);
Dprintk("NMI sent.\n");
@ -625,12 +620,14 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
return send_status;
}
maxlvt = lapic_get_maxlvt();
/*
* Be paranoid about clearing APIC errors.
*/
if (APIC_INTEGRATED(apic_version[phys_apicid])) {
apic_read_around(APIC_SPIV);
apic_write(APIC_ESR, 0);
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
}
@ -639,13 +636,13 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
/*
* Turn INIT on target chip
*/
apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
/*
* Send IPI
*/
apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
| APIC_DM_INIT);
apic_write(APIC_ICR,
APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT);
Dprintk("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
@ -655,10 +652,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
Dprintk("Deasserting INIT.\n");
/* Target chip */
apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
/* Send IPI */
apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
Dprintk("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
@ -689,12 +686,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
*/
Dprintk("#startup loops: %d.\n", num_starts);
maxlvt = lapic_get_maxlvt();
for (j = 1; j <= num_starts; j++) {
Dprintk("Sending STARTUP #%d.\n", j);
apic_read_around(APIC_SPIV);
apic_write(APIC_ESR, 0);
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
Dprintk("After apic_write.\n");
@ -703,12 +698,11 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
*/
/* Target chip */
apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
/* Boot on the stack */
/* Kick the second */
apic_write_around(APIC_ICR, APIC_DM_STARTUP
| (start_eip >> 12));
apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12));
/*
* Give the other CPU some time to accept the IPI.
@ -724,13 +718,8 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
* Give the other CPU some time to accept the IPI.
*/
udelay(200);
/*
* Due to the Pentium erratum 3AP.
*/
if (maxlvt > 3) {
apic_read_around(APIC_SPIV);
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
}
accept_status = (apic_read(APIC_ESR) & 0xEF);
if (send_status || accept_status)
break;
@ -768,7 +757,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
*
* Must be called after the _cpu_pda pointer table is initialized.
*/
static int __cpuinit get_local_pda(int cpu)
int __cpuinit get_local_pda(int cpu)
{
struct x8664_pda *oldpda, *newpda;
unsigned long size = sizeof(struct x8664_pda);
@ -1311,7 +1300,7 @@ static void __ref remove_cpu_from_maps(int cpu)
cpu_clear(cpu, cpu_callout_map);
cpu_clear(cpu, cpu_callin_map);
/* was set by cpu_init() */
clear_bit(cpu, (unsigned long *)&cpu_initialized);
cpu_clear(cpu, cpu_initialized);
numa_remove_cpu(cpu);
}
@ -1390,7 +1379,8 @@ static int __init parse_maxcpus(char *arg)
{
extern unsigned int maxcpus;
maxcpus = simple_strtoul(arg, NULL, 0);
if (arg)
maxcpus = simple_strtoul(arg, NULL, 0);
return 0;
}
early_param("maxcpus", parse_maxcpus);

View File

@ -1 +0,0 @@

View File

@ -105,6 +105,20 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
static int enable_single_step(struct task_struct *child)
{
struct pt_regs *regs = task_pt_regs(child);
unsigned long oflags;
/*
* If we stepped into a sysenter/syscall insn, it trapped in
* kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
* If user-mode had set TF itself, then it's still clear from
* do_debug() and we need to set it again to restore the user
* state so we don't wrongly set TIF_FORCED_TF below.
* If enable_single_step() was used last and that is what
* set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are
* already set and our bookkeeping is fine.
*/
if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP)))
regs->flags |= X86_EFLAGS_TF;
/*
* Always set TIF_SINGLESTEP - this guarantees that
@ -113,11 +127,7 @@ static int enable_single_step(struct task_struct *child)
*/
set_tsk_thread_flag(child, TIF_SINGLESTEP);
/*
* If TF was already set, don't do anything else
*/
if (regs->flags & X86_EFLAGS_TF)
return 0;
oflags = regs->flags;
/* Set TF on the kernel stack.. */
regs->flags |= X86_EFLAGS_TF;
@ -126,9 +136,22 @@ static int enable_single_step(struct task_struct *child)
* ..but if TF is changed by the instruction we will trace,
* don't mark it as being "us" that set it, so that we
* won't clear it by hand later.
*
* Note that if we don't actually execute the popf because
* of a signal arriving right now or suchlike, we will lose
* track of the fact that it really was "us" that set it.
*/
if (is_setting_trap_flag(child, regs))
if (is_setting_trap_flag(child, regs)) {
clear_tsk_thread_flag(child, TIF_FORCED_TF);
return 0;
}
/*
* If TF was already set, check whether it was us who set it.
* If not, we should never attempt a block step.
*/
if (oflags & X86_EFLAGS_TF)
return test_tsk_thread_flag(child, TIF_FORCED_TF);
set_tsk_thread_flag(child, TIF_FORCED_TF);

View File

@ -129,6 +129,7 @@ void __init hpet_time_init(void)
*/
void __init time_init(void)
{
pre_time_init_hook();
tsc_init();
late_time_init = choose_time_init();
}

View File

@ -58,6 +58,7 @@
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/io.h>
#include <asm/traps.h>
#include "mach_traps.h"
@ -77,26 +78,6 @@ char ignore_fpu_irq;
gate_desc idt_table[256]
__attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
asmlinkage void divide_error(void);
asmlinkage void debug(void);
asmlinkage void nmi(void);
asmlinkage void int3(void);
asmlinkage void overflow(void);
asmlinkage void bounds(void);
asmlinkage void invalid_op(void);
asmlinkage void device_not_available(void);
asmlinkage void coprocessor_segment_overrun(void);
asmlinkage void invalid_TSS(void);
asmlinkage void segment_not_present(void);
asmlinkage void stack_segment(void);
asmlinkage void general_protection(void);
asmlinkage void page_fault(void);
asmlinkage void coprocessor_error(void);
asmlinkage void simd_coprocessor_error(void);
asmlinkage void alignment_check(void);
asmlinkage void spurious_interrupt_bug(void);
asmlinkage void machine_check(void);
int panic_on_unrecovered_nmi;
int kstack_depth_to_print = 24;
static unsigned int code_bytes = 64;
@ -256,7 +237,7 @@ static const struct stacktrace_ops print_trace_ops = {
static void
show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp, char *log_lvl)
unsigned long *stack, unsigned long bp, char *log_lvl)
{
dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
printk("%s =======================\n", log_lvl);
@ -383,6 +364,54 @@ int is_valid_bugaddr(unsigned long ip)
return ud2 == 0x0b0f;
}
static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
static int die_owner = -1;
static unsigned int die_nest_count;
unsigned __kprobes long oops_begin(void)
{
unsigned long flags;
oops_enter();
if (die_owner != raw_smp_processor_id()) {
console_verbose();
raw_local_irq_save(flags);
__raw_spin_lock(&die_lock);
die_owner = smp_processor_id();
die_nest_count = 0;
bust_spinlocks(1);
} else {
raw_local_irq_save(flags);
}
die_nest_count++;
return flags;
}
void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
{
bust_spinlocks(0);
die_owner = -1;
add_taint(TAINT_DIE);
__raw_spin_unlock(&die_lock);
raw_local_irq_restore(flags);
if (!regs)
return;
if (kexec_should_crash(current))
crash_kexec(regs);
if (in_interrupt())
panic("Fatal exception in interrupt");
if (panic_on_oops)
panic("Fatal exception");
oops_exit();
do_exit(signr);
}
int __kprobes __die(const char *str, struct pt_regs *regs, long err)
{
unsigned short ss;
@ -423,31 +452,9 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
*/
void die(const char *str, struct pt_regs *regs, long err)
{
static struct {
raw_spinlock_t lock;
u32 lock_owner;
int lock_owner_depth;
} die = {
.lock = __RAW_SPIN_LOCK_UNLOCKED,
.lock_owner = -1,
.lock_owner_depth = 0
};
unsigned long flags;
unsigned long flags = oops_begin();
oops_enter();
if (die.lock_owner != raw_smp_processor_id()) {
console_verbose();
raw_local_irq_save(flags);
__raw_spin_lock(&die.lock);
die.lock_owner = smp_processor_id();
die.lock_owner_depth = 0;
bust_spinlocks(1);
} else {
raw_local_irq_save(flags);
}
if (++die.lock_owner_depth < 3) {
if (die_nest_count < 3) {
report_bug(regs->ip, regs);
if (__die(str, regs, err))
@ -456,26 +463,7 @@ void die(const char *str, struct pt_regs *regs, long err)
printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
}
bust_spinlocks(0);
die.lock_owner = -1;
add_taint(TAINT_DIE);
__raw_spin_unlock(&die.lock);
raw_local_irq_restore(flags);
if (!regs)
return;
if (kexec_should_crash(current))
crash_kexec(regs);
if (in_interrupt())
panic("Fatal exception in interrupt");
if (panic_on_oops)
panic("Fatal exception");
oops_exit();
do_exit(SIGSEGV);
oops_end(flags, regs, SIGSEGV);
}
static inline void

View File

@ -51,30 +51,10 @@
#include <asm/pgalloc.h>
#include <asm/proto.h>
#include <asm/pda.h>
#include <asm/traps.h>
#include <mach_traps.h>
asmlinkage void divide_error(void);
asmlinkage void debug(void);
asmlinkage void nmi(void);
asmlinkage void int3(void);
asmlinkage void overflow(void);
asmlinkage void bounds(void);
asmlinkage void invalid_op(void);
asmlinkage void device_not_available(void);
asmlinkage void double_fault(void);
asmlinkage void coprocessor_segment_overrun(void);
asmlinkage void invalid_TSS(void);
asmlinkage void segment_not_present(void);
asmlinkage void stack_segment(void);
asmlinkage void general_protection(void);
asmlinkage void page_fault(void);
asmlinkage void coprocessor_error(void);
asmlinkage void simd_coprocessor_error(void);
asmlinkage void alignment_check(void);
asmlinkage void spurious_interrupt_bug(void);
asmlinkage void machine_check(void);
int panic_on_unrecovered_nmi;
int kstack_depth_to_print = 12;
static unsigned int code_bytes = 64;
@ -355,17 +335,24 @@ static const struct stacktrace_ops print_trace_ops = {
.address = print_trace_address,
};
void show_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp)
static void
show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp, char *log_lvl)
{
printk("\nCall Trace:\n");
dump_trace(task, regs, stack, bp, &print_trace_ops, NULL);
dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
printk("\n");
}
void show_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp)
{
show_trace_log_lvl(task, regs, stack, bp, "");
}
static void
_show_stack(struct task_struct *task, struct pt_regs *regs,
unsigned long *sp, unsigned long bp)
show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *sp, unsigned long bp, char *log_lvl)
{
unsigned long *stack;
int i;
@ -399,12 +386,12 @@ _show_stack(struct task_struct *task, struct pt_regs *regs,
printk(" %016lx", *stack++);
touch_nmi_watchdog();
}
show_trace(task, regs, sp, bp);
show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
void show_stack(struct task_struct *task, unsigned long *sp)
{
_show_stack(task, NULL, sp, 0);
show_stack_log_lvl(task, NULL, sp, 0, "");
}
/*
@ -454,7 +441,8 @@ void show_registers(struct pt_regs *regs)
u8 *ip;
printk("Stack: ");
_show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
regs->bp, "");
printk("\n");
printk(KERN_EMERG "Code: ");
@ -518,7 +506,7 @@ unsigned __kprobes long oops_begin(void)
}
void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
{
{
die_owner = -1;
bust_spinlocks(0);
die_nest_count--;

View File

@ -73,7 +73,7 @@ int is_visws_box(void)
return visws_board_type >= 0;
}
static int __init visws_time_init_quirk(void)
static int __init visws_time_init(void)
{
printk(KERN_INFO "Starting Cobalt Timer system clock\n");
@ -93,7 +93,7 @@ static int __init visws_time_init_quirk(void)
return 0;
}
static int __init visws_pre_intr_init_quirk(void)
static int __init visws_pre_intr_init(void)
{
init_VISWS_APIC_irqs();
@ -114,7 +114,7 @@ EXPORT_SYMBOL(sgivwfb_mem_size);
long long mem_size __initdata = 0;
static char * __init visws_memory_setup_quirk(void)
static char * __init visws_memory_setup(void)
{
long long gfx_mem_size = 8 * MB;
@ -176,7 +176,7 @@ static void visws_machine_power_off(void)
outl(PIIX_SPECIAL_STOP, 0xCFC);
}
static int __init visws_get_smp_config_quirk(unsigned int early)
static int __init visws_get_smp_config(unsigned int early)
{
/*
* Prevent MP-table parsing by the generic code:
@ -192,7 +192,7 @@ extern unsigned int __cpuinitdata maxcpus;
* No problem for Linux.
*/
static void __init MP_processor_info (struct mpc_config_processor *m)
static void __init MP_processor_info(struct mpc_config_processor *m)
{
int ver, logical_apicid;
physid_mask_t apic_cpus;
@ -232,7 +232,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
apic_version[m->mpc_apicid] = ver;
}
int __init visws_find_smp_config_quirk(unsigned int reserve)
static int __init visws_find_smp_config(unsigned int reserve)
{
struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
@ -258,7 +258,17 @@ int __init visws_find_smp_config_quirk(unsigned int reserve)
return 1;
}
extern int visws_trap_init_quirk(void);
static int visws_trap_init(void);
static struct x86_quirks visws_x86_quirks __initdata = {
.arch_time_init = visws_time_init,
.arch_pre_intr_init = visws_pre_intr_init,
.arch_memory_setup = visws_memory_setup,
.arch_intr_init = NULL,
.arch_trap_init = visws_trap_init,
.mach_get_smp_config = visws_get_smp_config,
.mach_find_smp_config = visws_find_smp_config,
};
void __init visws_early_detect(void)
{
@ -272,16 +282,10 @@ void __init visws_early_detect(void)
/*
* Install special quirks for timer, interrupt and memory setup:
*/
arch_time_init_quirk = visws_time_init_quirk;
arch_pre_intr_init_quirk = visws_pre_intr_init_quirk;
arch_memory_setup_quirk = visws_memory_setup_quirk;
/*
* Fall back to generic behavior for traps:
* Override generic MP-table parsing:
*/
arch_intr_init_quirk = NULL;
arch_trap_init_quirk = visws_trap_init_quirk;
x86_quirks = &visws_x86_quirks;
/*
* Install reboot quirks:
@ -294,12 +298,6 @@ void __init visws_early_detect(void)
*/
no_broadcast = 0;
/*
* Override generic MP-table parsing:
*/
mach_get_smp_config_quirk = visws_get_smp_config_quirk;
mach_find_smp_config_quirk = visws_find_smp_config_quirk;
#ifdef CONFIG_X86_IO_APIC
/*
* Turn off IO-APIC detection and initialization:
@ -426,7 +424,7 @@ static __init void cobalt_init(void)
co_apic_read(CO_APIC_ID));
}
int __init visws_trap_init_quirk(void)
static int __init visws_trap_init(void)
{
lithium_init();
cobalt_init();

View File

@ -906,7 +906,6 @@ static inline int __init activate_vmi(void)
#ifdef CONFIG_X86_LOCAL_APIC
para_fill(pv_apic_ops.apic_read, APICRead);
para_fill(pv_apic_ops.apic_write, APICWrite);
para_fill(pv_apic_ops.apic_write_atomic, APICWrite);
#endif
/*

View File

@ -991,7 +991,6 @@ __init void lguest_init(void)
#ifdef CONFIG_X86_LOCAL_APIC
/* apic read/write intercepts */
pv_apic_ops.apic_write = lguest_apic_write;
pv_apic_ops.apic_write_atomic = lguest_apic_write;
pv_apic_ops.apic_read = lguest_apic_read;
#endif

View File

@ -10,14 +10,6 @@
#include <asm/e820.h>
#include <asm/setup.h>
/*
* Any quirks to be performed to initialize timers/irqs/etc?
*/
int (*arch_time_init_quirk)(void);
int (*arch_pre_intr_init_quirk)(void);
int (*arch_intr_init_quirk)(void);
int (*arch_trap_init_quirk)(void);
#ifdef CONFIG_HOTPLUG_CPU
#define DEFAULT_SEND_IPI (1)
#else
@ -37,8 +29,8 @@ int no_broadcast=DEFAULT_SEND_IPI;
**/
void __init pre_intr_init_hook(void)
{
if (arch_pre_intr_init_quirk) {
if (arch_pre_intr_init_quirk())
if (x86_quirks->arch_pre_intr_init) {
if (x86_quirks->arch_pre_intr_init())
return;
}
init_ISA_irqs();
@ -64,8 +56,8 @@ static struct irqaction irq2 = {
**/
void __init intr_init_hook(void)
{
if (arch_intr_init_quirk) {
if (arch_intr_init_quirk())
if (x86_quirks->arch_intr_init) {
if (x86_quirks->arch_intr_init())
return;
}
#ifdef CONFIG_X86_LOCAL_APIC
@ -97,8 +89,8 @@ void __init pre_setup_arch_hook(void)
**/
void __init trap_init_hook(void)
{
if (arch_trap_init_quirk) {
if (arch_trap_init_quirk())
if (x86_quirks->arch_trap_init) {
if (x86_quirks->arch_trap_init())
return;
}
}
@ -110,6 +102,16 @@ static struct irqaction irq0 = {
.name = "timer"
};
/**
* pre_time_init_hook - do any specific initialisations before.
*
**/
void __init pre_time_init_hook(void)
{
if (x86_quirks->arch_pre_time_init)
x86_quirks->arch_pre_time_init();
}
/**
* time_init_hook - do any specific initialisations for the system timer.
*
@ -119,13 +121,13 @@ static struct irqaction irq0 = {
**/
void __init time_init_hook(void)
{
if (arch_time_init_quirk) {
if (x86_quirks->arch_time_init) {
/*
* A nonzero return code does not mean failure, it means
* that the architecture quirk does not want any
* generic (timer) setup to be performed after this:
*/
if (arch_time_init_quirk())
if (x86_quirks->arch_time_init())
return;
}

View File

@ -21,3 +21,4 @@ obj-$(CONFIG_K8_NUMA) += k8topology_64.o
endif
obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
obj-$(CONFIG_MEMTEST) += memtest.o

View File

@ -844,6 +844,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
reserve_early(table_start << PAGE_SHIFT,
table_end << PAGE_SHIFT, "PGTABLE");
if (!after_init_bootmem)
early_memtest(start, end);
return end >> PAGE_SHIFT;
}
@ -868,8 +871,6 @@ void __init paging_init(void)
*/
sparse_init();
zone_sizes_init();
paravirt_post_allocator_init();
}
/*

View File

@ -517,118 +517,6 @@ static void __init init_gbpages(void)
direct_gbpages = 0;
}
#ifdef CONFIG_MEMTEST
static void __init memtest(unsigned long start_phys, unsigned long size,
unsigned pattern)
{
unsigned long i;
unsigned long *start;
unsigned long start_bad;
unsigned long last_bad;
unsigned long val;
unsigned long start_phys_aligned;
unsigned long count;
unsigned long incr;
switch (pattern) {
case 0:
val = 0UL;
break;
case 1:
val = -1UL;
break;
case 2:
val = 0x5555555555555555UL;
break;
case 3:
val = 0xaaaaaaaaaaaaaaaaUL;
break;
default:
return;
}
incr = sizeof(unsigned long);
start_phys_aligned = ALIGN(start_phys, incr);
count = (size - (start_phys_aligned - start_phys))/incr;
start = __va(start_phys_aligned);
start_bad = 0;
last_bad = 0;
for (i = 0; i < count; i++)
start[i] = val;
for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
if (*start != val) {
if (start_phys_aligned == last_bad + incr) {
last_bad += incr;
} else {
if (start_bad) {
printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
val, start_bad, last_bad + incr);
reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
}
start_bad = last_bad = start_phys_aligned;
}
}
}
if (start_bad) {
printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
val, start_bad, last_bad + incr);
reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
}
}
/* default is disabled */
static int memtest_pattern __initdata;
static int __init parse_memtest(char *arg)
{
if (arg)
memtest_pattern = simple_strtoul(arg, NULL, 0);
return 0;
}
early_param("memtest", parse_memtest);
static void __init early_memtest(unsigned long start, unsigned long end)
{
u64 t_start, t_size;
unsigned pattern;
if (!memtest_pattern)
return;
printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
for (pattern = 0; pattern < memtest_pattern; pattern++) {
t_start = start;
t_size = 0;
while (t_start < end) {
t_start = find_e820_area_size(t_start, &t_size, 1);
/* done ? */
if (t_start >= end)
break;
if (t_start + t_size > end)
t_size = end - t_start;
printk(KERN_CONT "\n %016llx - %016llx pattern %d",
(unsigned long long)t_start,
(unsigned long long)t_start + t_size, pattern);
memtest(t_start, t_size, pattern);
t_start += t_size;
}
}
printk(KERN_CONT "\n");
}
#else
static void __init early_memtest(unsigned long start, unsigned long end)
{
}
#endif
static unsigned long __init kernel_physical_mapping_init(unsigned long start,
unsigned long end,
unsigned long page_size_mask)

123
arch/x86/mm/memtest.c Normal file
View File

@ -0,0 +1,123 @@
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pfn.h>
#include <asm/e820.h>
static void __init memtest(unsigned long start_phys, unsigned long size,
unsigned pattern)
{
unsigned long i;
unsigned long *start;
unsigned long start_bad;
unsigned long last_bad;
unsigned long val;
unsigned long start_phys_aligned;
unsigned long count;
unsigned long incr;
switch (pattern) {
case 0:
val = 0UL;
break;
case 1:
val = -1UL;
break;
case 2:
#ifdef CONFIG_X86_64
val = 0x5555555555555555UL;
#else
val = 0x55555555UL;
#endif
break;
case 3:
#ifdef CONFIG_X86_64
val = 0xaaaaaaaaaaaaaaaaUL;
#else
val = 0xaaaaaaaaUL;
#endif
break;
default:
return;
}
incr = sizeof(unsigned long);
start_phys_aligned = ALIGN(start_phys, incr);
count = (size - (start_phys_aligned - start_phys))/incr;
start = __va(start_phys_aligned);
start_bad = 0;
last_bad = 0;
for (i = 0; i < count; i++)
start[i] = val;
for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
if (*start != val) {
if (start_phys_aligned == last_bad + incr) {
last_bad += incr;
} else {
if (start_bad) {
printk(KERN_CONT "\n %010lx bad mem addr %010lx - %010lx reserved",
val, start_bad, last_bad + incr);
reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
}
start_bad = last_bad = start_phys_aligned;
}
}
}
if (start_bad) {
printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved",
val, start_bad, last_bad + incr);
reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
}
}
/* default is disabled */
static int memtest_pattern __initdata;
static int __init parse_memtest(char *arg)
{
if (arg)
memtest_pattern = simple_strtoul(arg, NULL, 0);
return 0;
}
early_param("memtest", parse_memtest);
void __init early_memtest(unsigned long start, unsigned long end)
{
u64 t_start, t_size;
unsigned pattern;
if (!memtest_pattern)
return;
printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
for (pattern = 0; pattern < memtest_pattern; pattern++) {
t_start = start;
t_size = 0;
while (t_start < end) {
t_start = find_e820_area_size(t_start, &t_size, 1);
/* done ? */
if (t_start >= end)
break;
if (t_start + t_size > end)
t_size = end - t_start;
printk(KERN_CONT "\n %010llx - %010llx pattern %d",
(unsigned long long)t_start,
(unsigned long long)t_start + t_size, pattern);
memtest(t_start, t_size, pattern);
t_start += t_size;
}
}
printk(KERN_CONT "\n");
}

View File

@ -12,6 +12,8 @@
#include <linux/gfp.h>
#include <linux/fs.h>
#include <linux/bootmem.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <asm/msr.h>
#include <asm/tlbflush.h>
@ -373,8 +375,8 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
return vma_prot;
}
#ifdef CONFIG_NONPROMISC_DEVMEM
/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/
#ifdef CONFIG_STRICT_DEVMEM
/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
return 1;
@ -398,7 +400,7 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
}
return 1;
}
#endif /* CONFIG_NONPROMISC_DEVMEM */
#endif /* CONFIG_STRICT_DEVMEM */
int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t *vma_prot)
@ -489,3 +491,89 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
free_memtype(addr, addr + size);
}
#if defined(CONFIG_DEBUG_FS)
/* get Nth element of the linked list */
static struct memtype *memtype_get_idx(loff_t pos)
{
struct memtype *list_node, *print_entry;
int i = 1;
print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
if (!print_entry)
return NULL;
spin_lock(&memtype_lock);
list_for_each_entry(list_node, &memtype_list, nd) {
if (pos == i) {
*print_entry = *list_node;
spin_unlock(&memtype_lock);
return print_entry;
}
++i;
}
spin_unlock(&memtype_lock);
kfree(print_entry);
return NULL;
}
static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
{
if (*pos == 0) {
++*pos;
seq_printf(seq, "PAT memtype list:\n");
}
return memtype_get_idx(*pos);
}
static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
++*pos;
return memtype_get_idx(*pos);
}
static void memtype_seq_stop(struct seq_file *seq, void *v)
{
}
static int memtype_seq_show(struct seq_file *seq, void *v)
{
struct memtype *print_entry = (struct memtype *)v;
seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
print_entry->start, print_entry->end);
kfree(print_entry);
return 0;
}
static struct seq_operations memtype_seq_ops = {
.start = memtype_seq_start,
.next = memtype_seq_next,
.stop = memtype_seq_stop,
.show = memtype_seq_show,
};
static int memtype_seq_open(struct inode *inode, struct file *file)
{
return seq_open(file, &memtype_seq_ops);
}
static const struct file_operations memtype_fops = {
.open = memtype_seq_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
static int __init pat_memtype_list_init(void)
{
debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
NULL, &memtype_fops);
return 0;
}
late_initcall(pat_memtype_list_init);
#endif /* CONFIG_DEBUG_FS */

View File

@ -5,13 +5,13 @@ obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o
obj-$(CONFIG_PCI_DIRECT) += direct.o
obj-$(CONFIG_PCI_OLPC) += olpc.o
pci-y := fixup.o
pci-$(CONFIG_ACPI) += acpi.o
pci-y += legacy.o irq.o
obj-y += fixup.o
obj-$(CONFIG_ACPI) += acpi.o
obj-y += legacy.o irq.o
pci-$(CONFIG_X86_VISWS) += visws.o
obj-$(CONFIG_X86_VISWS) += visws.o
pci-$(CONFIG_X86_NUMAQ) += numa.o
obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
obj-y += $(pci-y) common.o early.o
obj-y += common.o early.o
obj-y += amd_bus.o

View File

@ -57,14 +57,17 @@ static int __init pci_legacy_init(void)
int __init pci_subsys_init(void)
{
#ifdef CONFIG_X86_NUMAQ
pci_numaq_init();
#endif
#ifdef CONFIG_ACPI
pci_acpi_init();
#endif
#ifdef CONFIG_X86_VISWS
pci_visws_init();
#endif
pci_legacy_init();
pcibios_irq_init();
#ifdef CONFIG_X86_NUMAQ
pci_numa_init();
#endif
pcibios_init();
return 0;

View File

@ -1,5 +1,5 @@
/*
* numa.c - Low-level PCI access for NUMA-Q machines
* numaq_32.c - Low-level PCI access for NUMA-Q machines
*/
#include <linux/pci.h>
@ -151,7 +151,7 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx);
int __init pci_numa_init(void)
int __init pci_numaq_init(void)
{
int quad;

View File

@ -108,7 +108,8 @@ extern void __init dmi_check_skip_isa_align(void);
/* some common used subsys_initcalls */
extern int __init pci_acpi_init(void);
extern int __init pcibios_irq_init(void);
extern int __init pci_numa_init(void);
extern int __init pci_visws_init(void);
extern int __init pci_numaq_init(void);
extern int __init pcibios_init(void);
/* pci-mmconfig.c */

View File

@ -86,8 +86,14 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
}
static int __init pci_visws_init(void)
int __init pci_visws_init(void)
{
if (!is_visws_box())
return -1;
pcibios_enable_irq = &pci_visws_enable_irq;
pcibios_disable_irq = &pci_visws_disable_irq;
/* The VISWS supports configuration access type 1 only */
pci_probe = (pci_probe | PCI_PROBE_CONF1) &
~(PCI_PROBE_BIOS | PCI_PROBE_CONF2);
@ -105,18 +111,3 @@ static int __init pci_visws_init(void)
pcibios_resource_survey();
return 0;
}
static __init int pci_subsys_init(void)
{
if (!is_visws_box())
return -1;
pcibios_enable_irq = &pci_visws_enable_irq;
pcibios_disable_irq = &pci_visws_disable_irq;
pci_visws_init();
pcibios_init();
return 0;
}
subsys_initcall(pci_subsys_init);

View File

@ -62,7 +62,7 @@ $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
# Build multiple 32-bit vDSO images to choose from at boot time.
#
obj-$(VDSO32-y) += vdso32-syms.lds
vdso32.so-$(CONFIG_X86_32) += int80
vdso32.so-$(VDSO32-y) += int80
vdso32.so-$(CONFIG_COMPAT) += syscall
vdso32.so-$(VDSO32-y) += sysenter

View File

@ -193,17 +193,12 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
}
}
/*
* These symbols are defined by vdso32.S to mark the bounds
* of the ELF DSO images included therein.
*/
extern const char vdso32_default_start, vdso32_default_end;
extern const char vdso32_sysenter_start, vdso32_sysenter_end;
static struct page *vdso32_pages[1];
#ifdef CONFIG_X86_64
#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
/* May not be __init: called during resume */
void syscall32_cpu_init(void)
@ -226,6 +221,7 @@ static inline void map_compat_vdso(int map)
#else /* CONFIG_X86_32 */
#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
#define vdso32_syscall() (0)
void enable_sep_cpu(void)
{
@ -296,12 +292,15 @@ int __init sysenter_setup(void)
gate_vma_init();
#endif
if (!vdso32_sysenter()) {
vsyscall = &vdso32_default_start;
vsyscall_len = &vdso32_default_end - &vdso32_default_start;
} else {
if (vdso32_syscall()) {
vsyscall = &vdso32_syscall_start;
vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
} else if (vdso32_sysenter()){
vsyscall = &vdso32_sysenter_start;
vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
} else {
vsyscall = &vdso32_int80_start;
vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
}
memcpy(syscall_page, vsyscall, vsyscall_len);

View File

@ -2,14 +2,17 @@
__INITDATA
.globl vdso32_default_start, vdso32_default_end
vdso32_default_start:
#ifdef CONFIG_X86_32
.globl vdso32_int80_start, vdso32_int80_end
vdso32_int80_start:
.incbin "arch/x86/vdso/vdso32-int80.so"
#else
vdso32_int80_end:
.globl vdso32_syscall_start, vdso32_syscall_end
vdso32_syscall_start:
#ifdef CONFIG_COMPAT
.incbin "arch/x86/vdso/vdso32-syscall.so"
#endif
vdso32_default_end:
vdso32_syscall_end:
.globl vdso32_sysenter_start, vdso32_sysenter_end
vdso32_sysenter_start:

View File

@ -21,7 +21,8 @@ unsigned int __read_mostly vdso_enabled = 1;
extern char vdso_start[], vdso_end[];
extern unsigned short vdso_sync_cpuid;
struct page **vdso_pages;
static struct page **vdso_pages;
static unsigned vdso_size;
static inline void *var_ref(void *p, char *name)
{
@ -38,6 +39,7 @@ static int __init init_vdso_vars(void)
int i;
char *vbase;
vdso_size = npages << PAGE_SHIFT;
vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
if (!vdso_pages)
goto oom;
@ -101,20 +103,19 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
struct mm_struct *mm = current->mm;
unsigned long addr;
int ret;
unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE);
if (!vdso_enabled)
return 0;
down_write(&mm->mmap_sem);
addr = vdso_addr(mm->start_stack, len);
addr = get_unmapped_area(NULL, addr, len, 0, 0);
addr = vdso_addr(mm->start_stack, vdso_size);
addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0);
if (IS_ERR_VALUE(addr)) {
ret = addr;
goto up_fail;
}
ret = install_special_mapping(mm, addr, len,
ret = install_special_mapping(mm, addr, vdso_size,
VM_READ|VM_EXEC|
VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
VM_ALWAYSDUMP,

View File

@ -6,8 +6,8 @@ config XEN
bool "Xen guest support"
select PARAVIRT
select PARAVIRT_CLOCK
depends on X86_32
depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER)
depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
depends on X86_CMPXCHG && X86_TSC
help
This is the Linux Xen port. Enabling this will allow the
kernel to boot in a paravirtualized environment under the
@ -15,10 +15,16 @@ config XEN
config XEN_MAX_DOMAIN_MEMORY
int "Maximum allowed size of a domain in gigabytes"
default 8
default 8 if X86_32
default 32 if X86_64
depends on XEN
help
The pseudo-physical to machine address array is sized
according to the maximum possible memory size of a Xen
domain. This array uses 1 page per gigabyte, so there's no
need to be too stingy here.
need to be too stingy here.
config XEN_SAVE_RESTORE
bool
depends on PM
default y

View File

@ -1,4 +1,4 @@
obj-y := enlighten.o setup.o multicalls.o mmu.o \
time.o xen-asm.o grant-table.o suspend.o
time.o xen-asm_$(BITS).o grant-table.o suspend.o
obj-$(CONFIG_SMP) += smp.o

File diff suppressed because it is too large Load Diff

View File

@ -44,8 +44,10 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/fixmap.h>
#include <asm/mmu_context.h>
#include <asm/paravirt.h>
#include <asm/linkage.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
@ -56,26 +58,29 @@
#include "multicalls.h"
#include "mmu.h"
/*
* Just beyond the highest usermode address. STACK_TOP_MAX has a
* redzone above it, so round it up to a PGD boundary.
*/
#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
/* Placeholder for holes in the address space */
static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE]
__attribute__((section(".data.page_aligned"))) =
static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
{ [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
/* Array of pointers to pages containing p2m entries */
static unsigned long *p2m_top[TOP_ENTRIES]
__attribute__((section(".data.page_aligned"))) =
static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
{ [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
/* Arrays of p2m arrays expressed in mfns used for save/restore */
static unsigned long p2m_top_mfn[TOP_ENTRIES]
__attribute__((section(".bss.page_aligned")));
static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
static unsigned long p2m_top_mfn_list[
PAGE_ALIGN(TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)]
__attribute__((section(".bss.page_aligned")));
static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
__page_aligned_bss;
static inline unsigned p2m_top_index(unsigned long pfn)
{
@ -181,15 +186,16 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
p2m_top[topidx][idx] = mfn;
}
xmaddr_t arbitrary_virt_to_machine(unsigned long address)
xmaddr_t arbitrary_virt_to_machine(void *vaddr)
{
unsigned long address = (unsigned long)vaddr;
unsigned int level;
pte_t *pte = lookup_address(address, &level);
unsigned offset = address & ~PAGE_MASK;
BUG_ON(pte == NULL);
return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
}
void make_lowmem_page_readonly(void *vaddr)
@ -256,7 +262,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
xen_mc_batch();
u.ptr = virt_to_machine(ptr).maddr;
/* ptr may be ioremapped for 64-bit pagetable setup */
u.ptr = arbitrary_virt_to_machine(ptr).maddr;
u.val = pmd_val_ma(val);
extend_mmu_update(&u);
@ -283,35 +290,7 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
*/
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
pgd = swapper_pg_dir + pgd_index(vaddr);
if (pgd_none(*pgd)) {
BUG();
return;
}
pud = pud_offset(pgd, vaddr);
if (pud_none(*pud)) {
BUG();
return;
}
pmd = pmd_offset(pud, vaddr);
if (pmd_none(*pmd)) {
BUG();
return;
}
pte = pte_offset_kernel(pmd, vaddr);
/* <mfn,flags> stored as-is, to permit clearing entries */
xen_set_pte(pte, mfn_pte(mfn, flags));
/*
* It's enough to flush this one mapping.
* (PGE mappings get flushed as well)
*/
__flush_tlb_one(vaddr);
set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
}
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
@ -418,7 +397,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
xen_mc_batch();
u.ptr = virt_to_machine(ptr).maddr;
/* ptr may be ioremapped for 64-bit pagetable setup */
u.ptr = arbitrary_virt_to_machine(ptr).maddr;
u.val = pud_val_ma(val);
extend_mmu_update(&u);
@ -441,14 +421,19 @@ void xen_set_pud(pud_t *ptr, pud_t val)
void xen_set_pte(pte_t *ptep, pte_t pte)
{
#ifdef CONFIG_X86_PAE
ptep->pte_high = pte.pte_high;
smp_wmb();
ptep->pte_low = pte.pte_low;
#else
*ptep = pte;
#endif
}
#ifdef CONFIG_X86_PAE
void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
set_64bit((u64 *)ptep, pte_val_ma(pte));
set_64bit((u64 *)ptep, native_pte_val(pte));
}
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@ -462,6 +447,7 @@ void xen_pmd_clear(pmd_t *pmdp)
{
set_pmd(pmdp, __pmd(0));
}
#endif /* CONFIG_X86_PAE */
pmd_t xen_make_pmd(pmdval_t pmd)
{
@ -469,78 +455,189 @@ pmd_t xen_make_pmd(pmdval_t pmd)
return native_make_pmd(pmd);
}
#if PAGETABLE_LEVELS == 4
pudval_t xen_pud_val(pud_t pud)
{
return pte_mfn_to_pfn(pud.pud);
}
pud_t xen_make_pud(pudval_t pud)
{
pud = pte_pfn_to_mfn(pud);
return native_make_pud(pud);
}
pgd_t *xen_get_user_pgd(pgd_t *pgd)
{
pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
unsigned offset = pgd - pgd_page;
pgd_t *user_ptr = NULL;
if (offset < pgd_index(USER_LIMIT)) {
struct page *page = virt_to_page(pgd_page);
user_ptr = (pgd_t *)page->private;
if (user_ptr)
user_ptr += offset;
}
return user_ptr;
}
static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
struct mmu_update u;
u.ptr = virt_to_machine(ptr).maddr;
u.val = pgd_val_ma(val);
extend_mmu_update(&u);
}
/*
(Yet another) pagetable walker. This one is intended for pinning a
pagetable. This means that it walks a pagetable and calls the
callback function on each page it finds making up the page table,
at every level. It walks the entire pagetable, but it only bothers
pinning pte pages which are below pte_limit. In the normal case
this will be TASK_SIZE, but at boot we need to pin up to
FIXADDR_TOP. But the important bit is that we don't pin beyond
there, because then we start getting into Xen's ptes.
*/
static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
* Raw hypercall-based set_pgd, intended for in early boot before
* there's a page structure. This implies:
* 1. The only existing pagetable is the kernel's
* 2. It is always pinned
* 3. It has no user pagetable attached to it
*/
void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
preempt_disable();
xen_mc_batch();
__xen_set_pgd_hyper(ptr, val);
xen_mc_issue(PARAVIRT_LAZY_MMU);
preempt_enable();
}
void xen_set_pgd(pgd_t *ptr, pgd_t val)
{
pgd_t *user_ptr = xen_get_user_pgd(ptr);
/* If page is not pinned, we can just update the entry
directly */
if (!page_pinned(ptr)) {
*ptr = val;
if (user_ptr) {
WARN_ON(page_pinned(user_ptr));
*user_ptr = val;
}
return;
}
/* If it's pinned, then we can at least batch the kernel and
user updates together. */
xen_mc_batch();
__xen_set_pgd_hyper(ptr, val);
if (user_ptr)
__xen_set_pgd_hyper(user_ptr, val);
xen_mc_issue(PARAVIRT_LAZY_MMU);
}
#endif /* PAGETABLE_LEVELS == 4 */
/*
* (Yet another) pagetable walker. This one is intended for pinning a
* pagetable. This means that it walks a pagetable and calls the
* callback function on each page it finds making up the page table,
* at every level. It walks the entire pagetable, but it only bothers
* pinning pte pages which are below limit. In the normal case this
* will be STACK_TOP_MAX, but at boot we need to pin up to
* FIXADDR_TOP.
*
* For 32-bit the important bit is that we don't pin beyond there,
* because then we start getting into Xen's ptes.
*
* For 64-bit, we must skip the Xen hole in the middle of the address
* space, just after the big x86-64 virtual hole.
*/
static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
unsigned long limit)
{
pgd_t *pgd = pgd_base;
int flush = 0;
unsigned long addr = 0;
unsigned long pgd_next;
unsigned hole_low, hole_high;
unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
unsigned pgdidx, pudidx, pmdidx;
BUG_ON(limit > FIXADDR_TOP);
/* The limit is the last byte to be touched */
limit--;
BUG_ON(limit >= FIXADDR_TOP);
if (xen_feature(XENFEAT_auto_translated_physmap))
return 0;
for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
/*
* 64-bit has a great big hole in the middle of the address
* space, which contains the Xen mappings. On 32-bit these
* will end up making a zero-sized hole and so is a no-op.
*/
hole_low = pgd_index(USER_LIMIT);
hole_high = pgd_index(PAGE_OFFSET);
pgdidx_limit = pgd_index(limit);
#if PTRS_PER_PUD > 1
pudidx_limit = pud_index(limit);
#else
pudidx_limit = 0;
#endif
#if PTRS_PER_PMD > 1
pmdidx_limit = pmd_index(limit);
#else
pmdidx_limit = 0;
#endif
flush |= (*func)(virt_to_page(pgd), PT_PGD);
for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
pud_t *pud;
unsigned long pud_limit, pud_next;
pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
if (!pgd_val(*pgd))
if (pgdidx >= hole_low && pgdidx < hole_high)
continue;
pud = pud_offset(pgd, 0);
if (!pgd_val(pgd[pgdidx]))
continue;
pud = pud_offset(&pgd[pgdidx], 0);
if (PTRS_PER_PUD > 1) /* not folded */
flush |= (*func)(virt_to_page(pud), PT_PUD);
for (; addr != pud_limit; pud++, addr = pud_next) {
for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
pmd_t *pmd;
unsigned long pmd_limit;
pud_next = pud_addr_end(addr, pud_limit);
if (pgdidx == pgdidx_limit &&
pudidx > pudidx_limit)
goto out;
if (pud_next < limit)
pmd_limit = pud_next;
else
pmd_limit = limit;
if (pud_none(*pud))
if (pud_none(pud[pudidx]))
continue;
pmd = pmd_offset(pud, 0);
pmd = pmd_offset(&pud[pudidx], 0);
if (PTRS_PER_PMD > 1) /* not folded */
flush |= (*func)(virt_to_page(pmd), PT_PMD);
for (; addr != pmd_limit; pmd++) {
addr += (PAGE_SIZE * PTRS_PER_PTE);
if ((pmd_limit-1) < (addr-1)) {
addr = pmd_limit;
break;
}
for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
struct page *pte;
if (pmd_none(*pmd))
if (pgdidx == pgdidx_limit &&
pudidx == pudidx_limit &&
pmdidx > pmdidx_limit)
goto out;
if (pmd_none(pmd[pmdidx]))
continue;
flush |= (*func)(pmd_page(*pmd), PT_PTE);
pte = pmd_page(pmd[pmdidx]);
flush |= (*func)(pte, PT_PTE);
}
}
}
flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
out:
return flush;
}
@ -622,14 +719,31 @@ void xen_pgd_pin(pgd_t *pgd)
{
xen_mc_batch();
if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
/* re-enable interrupts for kmap_flush_unused */
xen_mc_issue(0);
kmap_flush_unused();
xen_mc_batch();
}
#ifdef CONFIG_X86_64
{
pgd_t *user_pgd = xen_get_user_pgd(pgd);
xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
if (user_pgd) {
pin_page(virt_to_page(user_pgd), PT_PGD);
xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
}
}
#else /* CONFIG_X86_32 */
#ifdef CONFIG_X86_PAE
/* Need to make sure unshared kernel PMD is pinnable */
pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
#endif
xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
#endif /* CONFIG_X86_64 */
xen_mc_issue(0);
}
@ -656,9 +770,11 @@ void xen_mm_pin_all(void)
spin_unlock_irqrestore(&pgd_lock, flags);
}
/* The init_mm pagetable is really pinned as soon as its created, but
that's before we have page structures to store the bits. So do all
the book-keeping now. */
/*
* The init_mm pagetable is really pinned as soon as its created, but
* that's before we have page structures to store the bits. So do all
* the book-keeping now.
*/
static __init int mark_pinned(struct page *page, enum pt_level level)
{
SetPagePinned(page);
@ -708,7 +824,23 @@ static void xen_pgd_unpin(pgd_t *pgd)
xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
pgd_walk(pgd, unpin_page, TASK_SIZE);
#ifdef CONFIG_X86_64
{
pgd_t *user_pgd = xen_get_user_pgd(pgd);
if (user_pgd) {
xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
unpin_page(virt_to_page(user_pgd), PT_PGD);
}
}
#endif
#ifdef CONFIG_X86_PAE
/* Need to make sure unshared kernel PMD is unpinned */
pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
#endif
pgd_walk(pgd, unpin_page, USER_LIMIT);
xen_mc_issue(0);
}
@ -727,7 +859,6 @@ void xen_mm_unpin_all(void)
list_for_each_entry(page, &pgd_list, lru) {
if (PageSavePinned(page)) {
BUG_ON(!PagePinned(page));
printk("unpinning pinned %p\n", page_address(page));
xen_pgd_unpin((pgd_t *)page_address(page));
ClearPageSavePinned(page);
}
@ -757,8 +888,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
static void drop_other_mm_ref(void *info)
{
struct mm_struct *mm = info;
struct mm_struct *active_mm;
if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
#ifdef CONFIG_X86_64
active_mm = read_pda(active_mm);
#else
active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
#endif
if (active_mm == mm)
leave_mm(smp_processor_id());
/* If this cpu still has a stale cr3 reference, then make sure

View File

@ -10,18 +10,6 @@ enum pt_level {
PT_PTE
};
/*
* Page-directory addresses above 4GB do not fit into architectural %cr3.
* When accessing %cr3, or equivalent field in vcpu_guest_context, guests
* must use the following accessor macros to pack/unpack valid MFNs.
*
* Note that Xen is using the fact that the pagetable base is always
* page-aligned, and putting the 12 MSB of the address into the 12 LSB
* of cr3.
*/
#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
@ -44,13 +32,26 @@ pgd_t xen_make_pgd(pgdval_t);
void xen_set_pte(pte_t *ptep, pte_t pteval);
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval);
#ifdef CONFIG_X86_PAE
void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void xen_pmd_clear(pmd_t *pmdp);
#endif /* CONFIG_X86_PAE */
void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
void xen_set_pud(pud_t *ptr, pud_t val);
void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
void xen_set_pud_hyper(pud_t *ptr, pud_t val);
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void xen_pmd_clear(pmd_t *pmdp);
#if PAGETABLE_LEVELS == 4
pudval_t xen_pud_val(pud_t pud);
pud_t xen_make_pud(pudval_t pudval);
void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
#endif
pgd_t *xen_get_user_pgd(pgd_t *pgd);
pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,

View File

@ -76,6 +76,7 @@ void xen_mc_flush(void)
if (ret) {
printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
ret, smp_processor_id());
dump_stack();
for (i = 0; i < b->mcidx; i++) {
printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
i+1, b->mcidx,

View File

@ -83,30 +83,72 @@ static void xen_idle(void)
/*
* Set the bit indicating "nosegneg" library variants should be used.
* We only need to bother in pure 32-bit mode; compat 32-bit processes
* can have un-truncated segments, so wrapping around is allowed.
*/
static void __init fiddle_vdso(void)
{
extern const char vdso32_default_start;
u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK);
#ifdef CONFIG_X86_32
u32 *mask;
mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}
void xen_enable_sysenter(void)
static __cpuinit int register_callback(unsigned type, const void *func)
{
int cpu = smp_processor_id();
extern void xen_sysenter_target(void);
/* Mask events on entry, even though they get enabled immediately */
static struct callback_register sysenter = {
.type = CALLBACKTYPE_sysenter,
.address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
struct callback_register callback = {
.type = type,
.address = XEN_CALLBACK(__KERNEL_CS, func),
.flags = CALLBACKF_mask_events,
};
if (!boot_cpu_has(X86_FEATURE_SEP) ||
HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) {
clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP);
clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
}
void __cpuinit xen_enable_sysenter(void)
{
extern void xen_sysenter_target(void);
int ret;
unsigned sysenter_feature;
#ifdef CONFIG_X86_32
sysenter_feature = X86_FEATURE_SEP;
#else
sysenter_feature = X86_FEATURE_SYSENTER32;
#endif
if (!boot_cpu_has(sysenter_feature))
return;
ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
if(ret != 0)
setup_clear_cpu_cap(sysenter_feature);
}
void __cpuinit xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
int ret;
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);
ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
if (ret != 0) {
printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
/* Pretty fatal; 64-bit userspace has no other
mechanism for syscalls. */
}
if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
ret = register_callback(CALLBACKTYPE_syscall32,
xen_syscall32_target);
if (ret != 0)
setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
}
#endif /* CONFIG_X86_64 */
}
void __init xen_arch_setup(void)
@ -120,10 +162,12 @@ void __init xen_arch_setup(void)
if (!xen_feature(XENFEAT_auto_translated_physmap))
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
__KERNEL_CS, (unsigned long)xen_failsafe_callback);
if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
BUG();
xen_enable_sysenter();
xen_enable_syscall();
set_iopl.iopl = 1;
rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
@ -143,11 +187,6 @@ void __init xen_arch_setup(void)
pm_idle = xen_idle;
#ifdef CONFIG_SMP
/* fill cpus_possible with all available cpus */
xen_fill_possible_map();
#endif
paravirt_disable_iospace();
fiddle_vdso();

View File

@ -15,6 +15,7 @@
* This does not handle HOTPLUG_CPU yet.
*/
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/err.h>
#include <linux/smp.h>
@ -35,6 +36,8 @@
#include "xen-ops.h"
#include "mmu.h"
static void __cpuinit xen_init_lock_cpu(int cpu);
cpumask_t xen_cpu_initialized_map;
static DEFINE_PER_CPU(int, resched_irq);
@ -66,13 +69,22 @@ static __cpuinit void cpu_bringup_and_idle(void)
int cpu = smp_processor_id();
cpu_init();
xen_enable_sysenter();
preempt_disable();
per_cpu(cpu_state, cpu) = CPU_ONLINE;
xen_enable_sysenter();
xen_enable_syscall();
cpu = smp_processor_id();
smp_store_cpu_info(cpu);
cpu_data(cpu).x86_max_cores = 1;
set_cpu_sibling_map(cpu);
xen_setup_cpu_clockevents();
cpu_set(cpu, cpu_online_map);
x86_write_percpu(cpu_state, CPU_ONLINE);
wmb();
/* We can take interrupts now: we're officially "up". */
local_irq_enable();
@ -141,56 +153,39 @@ static int xen_smp_intr_init(unsigned int cpu)
return rc;
}
void __init xen_fill_possible_map(void)
static void __init xen_fill_possible_map(void)
{
int i, rc;
for (i = 0; i < NR_CPUS; i++) {
rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
if (rc >= 0)
if (rc >= 0) {
num_processors++;
cpu_set(i, cpu_possible_map);
}
}
}
void __init xen_smp_prepare_boot_cpu(void)
static void __init xen_smp_prepare_boot_cpu(void)
{
int cpu;
BUG_ON(smp_processor_id() != 0);
native_smp_prepare_boot_cpu();
/* We've switched to the "real" per-cpu gdt, so make sure the
old memory can be recycled */
make_lowmem_page_readwrite(&per_cpu__gdt_page);
for_each_possible_cpu(cpu) {
cpus_clear(per_cpu(cpu_sibling_map, cpu));
/*
* cpu_core_map lives in a per cpu area that is cleared
* when the per cpu array is allocated.
*
* cpus_clear(per_cpu(cpu_core_map, cpu));
*/
}
make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
xen_setup_vcpu_info_placement();
}
void __init xen_smp_prepare_cpus(unsigned int max_cpus)
static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
{
unsigned cpu;
for_each_possible_cpu(cpu) {
cpus_clear(per_cpu(cpu_sibling_map, cpu));
/*
* cpu_core_ map will be zeroed when the per
* cpu area is allocated.
*
* cpus_clear(per_cpu(cpu_core_map, cpu));
*/
}
xen_init_lock_cpu(0);
smp_store_cpu_info(0);
cpu_data(0).x86_max_cores = 1;
set_cpu_sibling_map(0);
if (xen_smp_intr_init(0))
@ -225,7 +220,7 @@ static __cpuinit int
cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
{
struct vcpu_guest_context *ctxt;
struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
struct desc_struct *gdt;
if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
return 0;
@ -234,12 +229,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
if (ctxt == NULL)
return -ENOMEM;
gdt = get_cpu_gdt_table(cpu);
ctxt->flags = VGCF_IN_KERNEL;
ctxt->user_regs.ds = __USER_DS;
ctxt->user_regs.es = __USER_DS;
ctxt->user_regs.fs = __KERNEL_PERCPU;
ctxt->user_regs.gs = 0;
ctxt->user_regs.ss = __KERNEL_DS;
#ifdef CONFIG_X86_32
ctxt->user_regs.fs = __KERNEL_PERCPU;
#endif
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
@ -249,11 +247,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
ctxt->ldt_ents = 0;
BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
make_lowmem_page_readonly(gdt->gdt);
BUG_ON((unsigned long)gdt & ~PAGE_MASK);
make_lowmem_page_readonly(gdt);
ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt);
ctxt->gdt_frames[0] = virt_to_mfn(gdt);
ctxt->gdt_ents = GDT_ENTRIES;
ctxt->user_regs.cs = __KERNEL_CS;
ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
@ -261,9 +259,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
ctxt->kernel_ss = __KERNEL_DS;
ctxt->kernel_sp = idle->thread.sp0;
#ifdef CONFIG_X86_32
ctxt->event_callback_cs = __KERNEL_CS;
ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
ctxt->failsafe_callback_cs = __KERNEL_CS;
#endif
ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
@ -276,7 +276,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
return 0;
}
int __cpuinit xen_cpu_up(unsigned int cpu)
static int __cpuinit xen_cpu_up(unsigned int cpu)
{
struct task_struct *idle = idle_task(cpu);
int rc;
@ -287,10 +287,28 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
return rc;
#endif
#ifdef CONFIG_X86_64
/* Allocate node local memory for AP pdas */
WARN_ON(cpu == 0);
if (cpu > 0) {
rc = get_local_pda(cpu);
if (rc)
return rc;
}
#endif
#ifdef CONFIG_X86_32
init_gdt(cpu);
per_cpu(current_task, cpu) = idle;
irq_ctx_init(cpu);
#else
cpu_pda(cpu)->pcurrent = idle;
clear_tsk_thread_flag(idle, TIF_FORK);
#endif
xen_setup_timer(cpu);
xen_init_lock_cpu(cpu);
per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
/* make sure interrupts start blocked */
per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@ -306,20 +324,18 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
if (rc)
return rc;
smp_store_cpu_info(cpu);
set_cpu_sibling_map(cpu);
/* This must be done before setting cpu_online_map */
wmb();
cpu_set(cpu, cpu_online_map);
rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
BUG_ON(rc);
while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
HYPERVISOR_sched_op(SCHEDOP_yield, 0);
barrier();
}
return 0;
}
void xen_smp_cpus_done(unsigned int max_cpus)
static void xen_smp_cpus_done(unsigned int max_cpus)
{
}
@ -335,12 +351,12 @@ static void stop_self(void *v)
BUG();
}
void xen_smp_send_stop(void)
static void xen_smp_send_stop(void)
{
smp_call_function(stop_self, NULL, 0);
}
void xen_smp_send_reschedule(int cpu)
static void xen_smp_send_reschedule(int cpu)
{
xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
}
@ -355,7 +371,7 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
xen_send_IPI_one(cpu, vector);
}
void xen_smp_send_call_function_ipi(cpumask_t mask)
static void xen_smp_send_call_function_ipi(cpumask_t mask)
{
int cpu;
@ -370,7 +386,7 @@ void xen_smp_send_call_function_ipi(cpumask_t mask)
}
}
void xen_smp_send_call_function_single_ipi(int cpu)
static void xen_smp_send_call_function_single_ipi(int cpu)
{
xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
}
@ -379,7 +395,11 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_interrupt();
#ifdef CONFIG_X86_32
__get_cpu_var(irq_stat).irq_call_count++;
#else
add_pda(irq_call_count, 1);
#endif
irq_exit();
return IRQ_HANDLED;
@ -389,8 +409,196 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_single_interrupt();
#ifdef CONFIG_X86_32
__get_cpu_var(irq_stat).irq_call_count++;
#else
add_pda(irq_call_count, 1);
#endif
irq_exit();
return IRQ_HANDLED;
}
struct xen_spinlock {
unsigned char lock; /* 0 -> free; 1 -> locked */
unsigned short spinners; /* count of waiting cpus */
};
static int xen_spin_is_locked(struct raw_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
return xl->lock != 0;
}
static int xen_spin_is_contended(struct raw_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
/* Not strictly true; this is only the count of contended
lock-takers entering the slow path. */
return xl->spinners != 0;
}
static int xen_spin_trylock(struct raw_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
u8 old = 1;
asm("xchgb %b0,%1"
: "+q" (old), "+m" (xl->lock) : : "memory");
return old == 0;
}
static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
static inline void spinning_lock(struct xen_spinlock *xl)
{
__get_cpu_var(lock_spinners) = xl;
wmb(); /* set lock of interest before count */
asm(LOCK_PREFIX " incw %0"
: "+m" (xl->spinners) : : "memory");
}
static inline void unspinning_lock(struct xen_spinlock *xl)
{
asm(LOCK_PREFIX " decw %0"
: "+m" (xl->spinners) : : "memory");
wmb(); /* decrement count before clearing lock */
__get_cpu_var(lock_spinners) = NULL;
}
static noinline int xen_spin_lock_slow(struct raw_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
int irq = __get_cpu_var(lock_kicker_irq);
int ret;
/* If kicker interrupts not initialized yet, just spin */
if (irq == -1)
return 0;
/* announce we're spinning */
spinning_lock(xl);
/* clear pending */
xen_clear_irq_pending(irq);
/* check again make sure it didn't become free while
we weren't looking */
ret = xen_spin_trylock(lock);
if (ret)
goto out;
/* block until irq becomes pending */
xen_poll_irq(irq);
kstat_this_cpu.irqs[irq]++;
out:
unspinning_lock(xl);
return ret;
}
static void xen_spin_lock(struct raw_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
int timeout;
u8 oldval;
do {
timeout = 1 << 10;
asm("1: xchgb %1,%0\n"
" testb %1,%1\n"
" jz 3f\n"
"2: rep;nop\n"
" cmpb $0,%0\n"
" je 1b\n"
" dec %2\n"
" jnz 2b\n"
"3:\n"
: "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
: "1" (1)
: "memory");
} while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock)));
}
static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
{
int cpu;
for_each_online_cpu(cpu) {
/* XXX should mix up next cpu selection */
if (per_cpu(lock_spinners, cpu) == xl) {
xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
break;
}
}
}
static void xen_spin_unlock(struct raw_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
smp_wmb(); /* make sure no writes get moved after unlock */
xl->lock = 0; /* release lock */
/* make sure unlock happens before kick */
barrier();
if (unlikely(xl->spinners))
xen_spin_unlock_slow(xl);
}
static __cpuinit void xen_init_lock_cpu(int cpu)
{
int irq;
const char *name;
name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
cpu,
xen_reschedule_interrupt,
IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
name,
NULL);
if (irq >= 0) {
disable_irq(irq); /* make sure it's never delivered */
per_cpu(lock_kicker_irq, cpu) = irq;
}
printk("cpu %d spinlock event irq %d\n", cpu, irq);
}
static void __init xen_init_spinlocks(void)
{
pv_lock_ops.spin_is_locked = xen_spin_is_locked;
pv_lock_ops.spin_is_contended = xen_spin_is_contended;
pv_lock_ops.spin_lock = xen_spin_lock;
pv_lock_ops.spin_trylock = xen_spin_trylock;
pv_lock_ops.spin_unlock = xen_spin_unlock;
}
static const struct smp_ops xen_smp_ops __initdata = {
.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
.smp_prepare_cpus = xen_smp_prepare_cpus,
.cpu_up = xen_cpu_up,
.smp_cpus_done = xen_smp_cpus_done,
.smp_send_stop = xen_smp_send_stop,
.smp_send_reschedule = xen_smp_send_reschedule,
.send_call_func_ipi = xen_smp_send_call_function_ipi,
.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
};
void __init xen_smp_init(void)
{
smp_ops = xen_smp_ops;
xen_fill_possible_map();
xen_init_spinlocks();
}

View File

@ -38,8 +38,11 @@ void xen_post_suspend(int suspend_cancelled)
xen_cpu_initialized_map = cpu_online_map;
#endif
xen_vcpu_restore();
xen_timer_resume();
}
}
void xen_arch_resume(void)
{
/* nothing */
}

271
arch/x86/xen/xen-asm_64.S Normal file
View File

@ -0,0 +1,271 @@
/*
Asm versions of Xen pv-ops, suitable for either direct use or inlining.
The inline versions are the same as the direct-use versions, with the
pre- and post-amble chopped off.
This code is encoded for size rather than absolute efficiency,
with a view to being able to inline as much as possible.
We only bother with direct forms (ie, vcpu in pda) of the operations
here; the indirect forms are better handled in C, since they're
generally too large to inline anyway.
*/
#include <linux/linkage.h>
#include <asm/asm-offsets.h>
#include <asm/processor-flags.h>
#include <asm/errno.h>
#include <asm/segment.h>
#include <xen/interface/xen.h>
#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
#define ENDPATCH(x) .globl x##_end; x##_end=.
/* Pseudo-flag used for virtual NMI, which we don't implement yet */
#define XEN_EFLAGS_NMI 0x80000000
#if 0
#include <asm/percpu.h>
/*
Enable events. This clears the event mask and tests the pending
event status with one and operation. If there are pending
events, then enter the hypervisor to get them handled.
*/
ENTRY(xen_irq_enable_direct)
/* Unmask events */
movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
/* Preempt here doesn't matter because that will deal with
any pending interrupts. The pending check may end up being
run on the wrong CPU, but that doesn't hurt. */
/* Test for pending */
testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
jz 1f
2: call check_events
1:
ENDPATCH(xen_irq_enable_direct)
ret
ENDPROC(xen_irq_enable_direct)
RELOC(xen_irq_enable_direct, 2b+1)
/*
Disabling events is simply a matter of making the event mask
non-zero.
*/
ENTRY(xen_irq_disable_direct)
movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
ENDPATCH(xen_irq_disable_direct)
ret
ENDPROC(xen_irq_disable_direct)
RELOC(xen_irq_disable_direct, 0)
/*
(xen_)save_fl is used to get the current interrupt enable status.
Callers expect the status to be in X86_EFLAGS_IF, and other bits
may be set in the return value. We take advantage of this by
making sure that X86_EFLAGS_IF has the right value (and other bits
in that byte are 0), but other bits in the return value are
undefined. We need to toggle the state of the bit, because
Xen and x86 use opposite senses (mask vs enable).
*/
ENTRY(xen_save_fl_direct)
testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
setz %ah
addb %ah,%ah
ENDPATCH(xen_save_fl_direct)
ret
ENDPROC(xen_save_fl_direct)
RELOC(xen_save_fl_direct, 0)
/*
In principle the caller should be passing us a value return
from xen_save_fl_direct, but for robustness sake we test only
the X86_EFLAGS_IF flag rather than the whole byte. After
setting the interrupt mask state, it checks for unmasked
pending events and enters the hypervisor to get them delivered
if so.
*/
ENTRY(xen_restore_fl_direct)
testb $X86_EFLAGS_IF>>8, %ah
setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
/* Preempt here doesn't matter because that will deal with
any pending interrupts. The pending check may end up being
run on the wrong CPU, but that doesn't hurt. */
/* check for unmasked and pending */
cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
jz 1f
2: call check_events
1:
ENDPATCH(xen_restore_fl_direct)
ret
ENDPROC(xen_restore_fl_direct)
RELOC(xen_restore_fl_direct, 2b+1)
/*
Force an event check by making a hypercall,
but preserve regs before making the call.
*/
check_events:
push %rax
push %rcx
push %rdx
push %rsi
push %rdi
push %r8
push %r9
push %r10
push %r11
call force_evtchn_callback
pop %r11
pop %r10
pop %r9
pop %r8
pop %rdi
pop %rsi
pop %rdx
pop %rcx
pop %rax
ret
#endif
ENTRY(xen_adjust_exception_frame)
mov 8+0(%rsp),%rcx
mov 8+8(%rsp),%r11
ret $16
hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
/*
Xen64 iret frame:
ss
rsp
rflags
cs
rip <-- standard iret frame
flags
rcx }
r11 }<-- pushed by hypercall page
rsp -> rax }
*/
ENTRY(xen_iret)
pushq $0
1: jmp hypercall_iret
ENDPATCH(xen_iret)
RELOC(xen_iret, 1b+1)
/*
sysexit is not used for 64-bit processes, so it's
only ever used to return to 32-bit compat userspace.
*/
ENTRY(xen_sysexit)
pushq $__USER32_DS
pushq %rcx
pushq $X86_EFLAGS_IF
pushq $__USER32_CS
pushq %rdx
pushq $VGCF_in_syscall
1: jmp hypercall_iret
ENDPATCH(xen_sysexit)
RELOC(xen_sysexit, 1b+1)
ENTRY(xen_sysret64)
/* We're already on the usermode stack at this point, but still
with the kernel gs, so we can easily switch back */
movq %rsp, %gs:pda_oldrsp
movq %gs:pda_kernelstack,%rsp
pushq $__USER_DS
pushq %gs:pda_oldrsp
pushq %r11
pushq $__USER_CS
pushq %rcx
pushq $VGCF_in_syscall
1: jmp hypercall_iret
ENDPATCH(xen_sysret64)
RELOC(xen_sysret64, 1b+1)
ENTRY(xen_sysret32)
/* We're already on the usermode stack at this point, but still
with the kernel gs, so we can easily switch back */
movq %rsp, %gs:pda_oldrsp
movq %gs:pda_kernelstack, %rsp
pushq $__USER32_DS
pushq %gs:pda_oldrsp
pushq %r11
pushq $__USER32_CS
pushq %rcx
pushq $VGCF_in_syscall
1: jmp hypercall_iret
ENDPATCH(xen_sysret32)
RELOC(xen_sysret32, 1b+1)
/*
Xen handles syscall callbacks much like ordinary exceptions,
which means we have:
- kernel gs
- kernel rsp
- an iret-like stack frame on the stack (including rcx and r11):
ss
rsp
rflags
cs
rip
r11
rsp-> rcx
In all the entrypoints, we undo all that to make it look
like a CPU-generated syscall/sysenter and jump to the normal
entrypoint.
*/
.macro undo_xen_syscall
mov 0*8(%rsp),%rcx
mov 1*8(%rsp),%r11
mov 5*8(%rsp),%rsp
.endm
/* Normal 64-bit system call target */
ENTRY(xen_syscall_target)
undo_xen_syscall
jmp system_call_after_swapgs
ENDPROC(xen_syscall_target)
#ifdef CONFIG_IA32_EMULATION
/* 32-bit compat syscall target */
ENTRY(xen_syscall32_target)
undo_xen_syscall
jmp ia32_cstar_target
ENDPROC(xen_syscall32_target)
/* 32-bit compat sysenter target */
ENTRY(xen_sysenter_target)
undo_xen_syscall
jmp ia32_sysenter_target
ENDPROC(xen_sysenter_target)
#else /* !CONFIG_IA32_EMULATION */
ENTRY(xen_syscall32_target)
ENTRY(xen_sysenter_target)
lea 16(%rsp), %rsp /* strip %rcx,%r11 */
mov $-ENOSYS, %rax
pushq $VGCF_in_syscall
jmp hypercall_iret
ENDPROC(xen_syscall32_target)
ENDPROC(xen_sysenter_target)
#endif /* CONFIG_IA32_EMULATION */

View File

@ -5,15 +5,24 @@
#include <linux/elfnote.h>
#include <linux/init.h>
#include <asm/boot.h>
#include <asm/asm.h>
#include <asm/page.h>
#include <xen/interface/elfnote.h>
#include <asm/xen/interface.h>
__INIT
ENTRY(startup_xen)
movl %esi,xen_start_info
cld
movl $(init_thread_union+THREAD_SIZE),%esp
#ifdef CONFIG_X86_32
mov %esi,xen_start_info
mov $init_thread_union+THREAD_SIZE,%esp
#else
mov %rsi,xen_start_info
mov $init_thread_union+THREAD_SIZE,%rsp
#endif
jmp xen_start_kernel
__FINIT
@ -21,21 +30,26 @@ ENTRY(startup_xen)
.pushsection .text
.align PAGE_SIZE_asm
ENTRY(hypercall_page)
.skip 0x1000
.skip PAGE_SIZE_asm
.popsection
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET)
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
#ifdef CONFIG_X86_32
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
#else
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
#endif
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
.quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long __HYPERVISOR_VIRT_START)
ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
#endif /*CONFIG_XEN */

View File

@ -26,6 +26,7 @@ char * __init xen_memory_setup(void);
void __init xen_arch_setup(void);
void __init xen_init_IRQ(void);
void xen_enable_sysenter(void);
void xen_enable_syscall(void);
void xen_vcpu_restore(void);
void __init xen_build_dynamic_phys_to_machine(void);
@ -37,7 +38,6 @@ void __init xen_time_init(void);
unsigned long xen_get_wallclock(void);
int xen_set_wallclock(unsigned long time);
unsigned long long xen_sched_clock(void);
void xen_timer_resume(void);
irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
@ -45,20 +45,15 @@ bool xen_vcpu_stolen(int vcpu);
void xen_mark_init_mm_pinned(void);
void __init xen_fill_possible_map(void);
void __init xen_setup_vcpu_info_placement(void);
void xen_smp_prepare_boot_cpu(void);
void xen_smp_prepare_cpus(unsigned int max_cpus);
int xen_cpu_up(unsigned int cpu);
void xen_smp_cpus_done(unsigned int max_cpus);
void xen_smp_send_stop(void);
void xen_smp_send_reschedule(int cpu);
void xen_smp_send_call_function_ipi(cpumask_t mask);
void xen_smp_send_call_function_single_ipi(int cpu);
#ifdef CONFIG_SMP
void xen_smp_init(void);
extern cpumask_t xen_cpu_initialized_map;
#else
static inline void xen_smp_init(void) {}
#endif
/* Declare an asm function, along with symbols needed to make it
@ -73,7 +68,11 @@ DECL_ASM(void, xen_irq_disable_direct, void);
DECL_ASM(unsigned long, xen_save_fl_direct, void);
DECL_ASM(void, xen_restore_fl_direct, unsigned long);
/* These are not functions, and cannot be called normally */
void xen_iret(void);
void xen_sysexit(void);
void xen_sysret32(void);
void xen_sysret64(void);
void xen_adjust_exception_frame(void);
#endif /* XEN_OPS_H */

View File

@ -80,7 +80,7 @@ static inline int valid_mmap_phys_addr_range(unsigned long pfn, size_t size)
}
#endif
#ifdef CONFIG_NONPROMISC_DEVMEM
#ifdef CONFIG_STRICT_DEVMEM
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
u64 from = ((u64)pfn) << PAGE_SHIFT;

View File

@ -92,7 +92,7 @@ struct netfront_info {
*/
union skb_entry {
struct sk_buff *skb;
unsigned link;
unsigned long link;
} tx_skbs[NET_TX_RING_SIZE];
grant_ref_t gref_tx_head;
grant_ref_t grant_tx_ref[NET_TX_RING_SIZE];
@ -125,6 +125,17 @@ struct netfront_rx_info {
struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
};
static void skb_entry_set_link(union skb_entry *list, unsigned short id)
{
list->link = id;
}
static int skb_entry_is_link(const union skb_entry *list)
{
BUILD_BUG_ON(sizeof(list->skb) != sizeof(list->link));
return ((unsigned long)list->skb < PAGE_OFFSET);
}
/*
* Access macros for acquiring freeing slots in tx_skbs[].
*/
@ -132,7 +143,7 @@ struct netfront_rx_info {
static void add_id_to_freelist(unsigned *head, union skb_entry *list,
unsigned short id)
{
list[id].link = *head;
skb_entry_set_link(&list[id], *head);
*head = id;
}
@ -993,7 +1004,7 @@ static void xennet_release_tx_bufs(struct netfront_info *np)
for (i = 0; i < NET_TX_RING_SIZE; i++) {
/* Skip over entries which are actually freelist references */
if ((unsigned long)np->tx_skbs[i].skb < PAGE_OFFSET)
if (skb_entry_is_link(&np->tx_skbs[i]))
continue;
skb = np->tx_skbs[i].skb;
@ -1123,7 +1134,7 @@ static struct net_device * __devinit xennet_create_dev(struct xenbus_device *dev
/* Initialise tx_skbs as a free chain containing every entry. */
np->tx_skb_freelist = 0;
for (i = 0; i < NET_TX_RING_SIZE; i++) {
np->tx_skbs[i].link = i+1;
skb_entry_set_link(&np->tx_skbs[i], i+1);
np->grant_tx_ref[i] = GRANT_INVALID_REF;
}

View File

@ -37,7 +37,7 @@
#include "intel-iommu.h"
#include <asm/proto.h> /* force_iommu in this header in x86-64*/
#include <asm/cacheflush.h>
#include <asm/gart.h>
#include <asm/iommu.h>
#include "pci.h"
#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)

View File

@ -734,6 +734,33 @@ static void restore_cpu_ipis(unsigned int cpu)
}
}
/* Clear an irq's pending state, in preparation for polling on it */
void xen_clear_irq_pending(int irq)
{
int evtchn = evtchn_from_irq(irq);
if (VALID_EVTCHN(evtchn))
clear_evtchn(evtchn);
}
/* Poll waiting for an irq to become pending. In the usual case, the
irq will be disabled so it won't deliver an interrupt. */
void xen_poll_irq(int irq)
{
evtchn_port_t evtchn = evtchn_from_irq(irq);
if (VALID_EVTCHN(evtchn)) {
struct sched_poll poll;
poll.nr_ports = 1;
poll.timeout = 0;
poll.ports = &evtchn;
if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0)
BUG();
}
}
void xen_irq_resume(void)
{
unsigned int cpu, irq, evtchn;

View File

@ -63,11 +63,12 @@ static int xen_suspend(void *data)
gnttab_resume();
xen_mm_unpin_all();
device_power_up();
device_power_up(PMSG_RESUME);
if (!*cancelled) {
xen_irq_resume();
xen_console_resume();
xen_timer_resume();
}
return 0;
@ -107,12 +108,13 @@ static void do_suspend(void)
goto out;
}
if (!cancelled)
if (!cancelled) {
xen_arch_resume();
xenbus_resume();
else
} else
xenbus_suspend_cancel();
device_resume();
device_resume(PMSG_RESUME);
/* Make sure timer events get retriggered on all CPUs */
clock_was_set();

View File

@ -27,13 +27,12 @@
/*
* some size calculation constants
*/
#define DEV_TABLE_ENTRY_SIZE 256
#define DEV_TABLE_ENTRY_SIZE 32
#define ALIAS_TABLE_ENTRY_SIZE 2
#define RLOOKUP_TABLE_ENTRY_SIZE (sizeof(void *))
/* helper macros */
#define LOW_U32(x) ((x) & ((1ULL << 32)-1))
#define HIGH_U32(x) (LOW_U32((x) >> 32))
/* Length of the MMIO region for the AMD IOMMU */
#define MMIO_REGION_LENGTH 0x4000
@ -158,78 +157,170 @@
#define MAX_DOMAIN_ID 65536
/*
* This structure contains generic data for IOMMU protection domains
* independent of their use.
*/
struct protection_domain {
spinlock_t lock;
u16 id;
int mode;
u64 *pt_root;
void *priv;
spinlock_t lock; /* mostly used to lock the page table*/
u16 id; /* the domain id written to the device table */
int mode; /* paging mode (0-6 levels) */
u64 *pt_root; /* page table root pointer */
void *priv; /* private data */
};
/*
* Data container for a dma_ops specific protection domain
*/
struct dma_ops_domain {
struct list_head list;
/* generic protection domain information */
struct protection_domain domain;
/* size of the aperture for the mappings */
unsigned long aperture_size;
/* address we start to search for free addresses */
unsigned long next_bit;
/* address allocation bitmap */
unsigned long *bitmap;
/*
* Array of PTE pages for the aperture. In this array we save all the
* leaf pages of the domain page table used for the aperture. This way
* we don't need to walk the page table to find a specific PTE. We can
* just calculate its address in constant time.
*/
u64 **pte_pages;
};
/*
* Structure where we save information about one hardware AMD IOMMU in the
* system.
*/
struct amd_iommu {
struct list_head list;
/* locks the accesses to the hardware */
spinlock_t lock;
/* device id of this IOMMU */
u16 devid;
/*
* Capability pointer. There could be more than one IOMMU per PCI
* device function if there are more than one AMD IOMMU capability
* pointers.
*/
u16 cap_ptr;
/* physical address of MMIO space */
u64 mmio_phys;
/* virtual address of MMIO space */
u8 *mmio_base;
/* capabilities of that IOMMU read from ACPI */
u32 cap;
/* first device this IOMMU handles. read from PCI */
u16 first_device;
/* last device this IOMMU handles. read from PCI */
u16 last_device;
/* start of exclusion range of that IOMMU */
u64 exclusion_start;
/* length of exclusion range of that IOMMU */
u64 exclusion_length;
/* command buffer virtual address */
u8 *cmd_buf;
/* size of command buffer */
u32 cmd_buf_size;
/* if one, we need to send a completion wait command */
int need_sync;
/* default dma_ops domain for that IOMMU */
struct dma_ops_domain *default_dom;
};
/*
* List with all IOMMUs in the system. This list is not locked because it is
* only written and read at driver initialization or suspend time
*/
extern struct list_head amd_iommu_list;
/*
* Structure defining one entry in the device table
*/
struct dev_table_entry {
u32 data[8];
};
/*
* One entry for unity mappings parsed out of the ACPI table.
*/
struct unity_map_entry {
struct list_head list;
/* starting device id this entry is used for (including) */
u16 devid_start;
/* end device id this entry is used for (including) */
u16 devid_end;
/* start address to unity map (including) */
u64 address_start;
/* end address to unity map (including) */
u64 address_end;
/* required protection */
int prot;
};
/*
* List of all unity mappings. It is not locked because as runtime it is only
* read. It is created at ACPI table parsing time.
*/
extern struct list_head amd_iommu_unity_map;
/* data structures for device handling */
/*
* Data structures for device handling
*/
/*
* Device table used by hardware. Read and write accesses by software are
* locked with the amd_iommu_pd_table lock.
*/
extern struct dev_table_entry *amd_iommu_dev_table;
/*
* Alias table to find requestor ids to device ids. Not locked because only
* read on runtime.
*/
extern u16 *amd_iommu_alias_table;
/*
* Reverse lookup table to find the IOMMU which translates a specific device.
*/
extern struct amd_iommu **amd_iommu_rlookup_table;
/* size of the dma_ops aperture as power of 2 */
extern unsigned amd_iommu_aperture_order;
/* largest PCI device id we expect translation requests for */
extern u16 amd_iommu_last_bdf;
/* data structures for protection domain handling */
extern struct protection_domain **amd_iommu_pd_table;
/* allocation bitmap for domain ids */
extern unsigned long *amd_iommu_pd_alloc_bitmap;
/* will be 1 if device isolation is enabled */
extern int amd_iommu_isolate;
/* takes a PCI device id and prints it out in a readable form */
static inline void print_devid(u16 devid, int nl)
{
int bus = devid >> 8;
@ -241,4 +332,11 @@ static inline void print_devid(u16 devid, int nl)
printk("\n");
}
/* takes bus and device/function and returns the device id
* FIXME: should that be in generic PCI code? */
static inline u16 calc_devid(u8 bus, u8 devfn)
{
return (((u16)bus) << 8) | devfn;
}
#endif

Some files were not shown because too many files have changed in this diff Show More