From f248b6a34fdd726dd12b549fceba907b6df2771c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 21 Oct 2006 18:37:00 +0200 Subject: [PATCH 01/19] [PATCH] x86-64: Update defconfig Signed-off-by: Andi Kleen --- arch/x86_64/defconfig | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index 47bfba6e9dc4..0f5d44e86be5 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.19-rc1 -# Thu Oct 5 13:04:43 2006 +# Linux kernel version: 2.6.19-rc2-git4 +# Sat Oct 21 03:38:52 2006 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -335,8 +335,8 @@ CONFIG_IPV6=y # CONFIG_INET6_XFRM_MODE_TUNNEL is not set # CONFIG_INET6_XFRM_MODE_BEET is not set # CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set +CONFIG_IPV6_SIT=y # CONFIG_IPV6_TUNNEL is not set -# CONFIG_IPV6_SUBTREES is not set # CONFIG_IPV6_MULTIPLE_TABLES is not set # CONFIG_NETWORK_SECMARK is not set # CONFIG_NETFILTER is not set @@ -437,6 +437,13 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set +# +# Misc devices +# +# CONFIG_IBM_ASM is not set +# CONFIG_SGI_IOC4 is not set +# CONFIG_TIFM_CORE is not set + # # ATA/ATAPI/MFM/RLL support # @@ -1008,6 +1015,7 @@ CONFIG_I2C_ISA=m # # Dallas's 1-wire bus # +# CONFIG_W1 is not set # # Hardware Monitoring support @@ -1058,12 +1066,6 @@ CONFIG_SENSORS_SMSC47B397=m # CONFIG_SENSORS_HDAPS is not set # CONFIG_HWMON_DEBUG_CHIP is not set -# -# Misc devices -# -# CONFIG_IBM_ASM is not set -# CONFIG_TIFM_CORE is not set - # # Multimedia devices # @@ -1196,7 +1198,6 @@ CONFIG_USB_HIDINPUT=y # CONFIG_USB_ATI_REMOTE2 is not set # CONFIG_USB_KEYSPAN_REMOTE is not set # CONFIG_USB_APPLETOUCH is not set -# CONFIG_USB_TRANCEVIBRATOR is not set # # USB Imaging devices @@ -1242,6 +1243,7 @@ CONFIG_USB_MON=y # CONFIG_USB_APPLEDISPLAY is not set # CONFIG_USB_SISUSBVGA is not set # CONFIG_USB_LD is not set +# CONFIG_USB_TRANCEVIBRATOR is not set # CONFIG_USB_TEST is not set # @@ -1318,6 +1320,7 @@ CONFIG_EXT3_FS=y CONFIG_EXT3_FS_XATTR=y CONFIG_EXT3_FS_POSIX_ACL=y # CONFIG_EXT3_FS_SECURITY is not set +# CONFIG_EXT4DEV_FS is not set CONFIG_JBD=y # CONFIG_JBD_DEBUG is not set CONFIG_FS_MBCACHE=y @@ -1341,6 +1344,7 @@ CONFIG_DNOTIFY=y # CONFIG_AUTOFS_FS is not set CONFIG_AUTOFS4_FS=y # CONFIG_FUSE_FS is not set +CONFIG_GENERIC_ACL=y # # CD-ROM/DVD Filesystems @@ -1418,7 +1422,6 @@ CONFIG_SUNRPC=y # CONFIG_CODA_FS is not set # CONFIG_AFS_FS is not set # CONFIG_9P_FS is not set -CONFIG_GENERIC_ACL=y # # Partition Types @@ -1470,10 +1473,6 @@ CONFIG_NLS_ISO8859_15=y # CONFIG_NLS_KOI8_U is not set CONFIG_NLS_UTF8=y -# -# Distributed Lock Manager -# - # # Instrumentation Support # @@ -1512,6 +1511,7 @@ CONFIG_DEBUG_FS=y CONFIG_UNWIND_INFO=y CONFIG_STACK_UNWIND=y # CONFIG_FORCED_INLINING is not set +# CONFIG_HEADERS_CHECK is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set # CONFIG_DEBUG_RODATA is not set From 13892de19eb9007ea47430c701bdbf69df71d883 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 21 Oct 2006 18:37:01 +0200 Subject: [PATCH 02/19] [PATCH] i386: Update defconfig Signed-off-by: Andi Kleen --- arch/i386/defconfig | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/i386/defconfig b/arch/i386/defconfig index 60c0c02574f0..97aacd6bd7d8 100644 --- a/arch/i386/defconfig +++ b/arch/i386/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.19-rc1 -# Thu Oct 5 13:04:53 2006 +# Linux kernel version: 2.6.19-rc2-git4 +# Sat Oct 21 03:38:56 2006 # CONFIG_X86_32=y CONFIG_GENERIC_TIME=y @@ -380,8 +380,8 @@ CONFIG_INET6_XFRM_MODE_TRANSPORT=y CONFIG_INET6_XFRM_MODE_TUNNEL=y # CONFIG_INET6_XFRM_MODE_BEET is not set # CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set +CONFIG_IPV6_SIT=y # CONFIG_IPV6_TUNNEL is not set -# CONFIG_IPV6_SUBTREES is not set # CONFIG_IPV6_MULTIPLE_TABLES is not set # CONFIG_NETWORK_SECMARK is not set # CONFIG_NETFILTER is not set @@ -482,6 +482,13 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set +# +# Misc devices +# +# CONFIG_IBM_ASM is not set +# CONFIG_SGI_IOC4 is not set +# CONFIG_TIFM_CORE is not set + # # ATA/ATAPI/MFM/RLL support # @@ -1024,6 +1031,7 @@ CONFIG_HANGCHECK_TIMER=y # # Dallas's 1-wire bus # +# CONFIG_W1 is not set # # Hardware Monitoring support @@ -1031,12 +1039,6 @@ CONFIG_HANGCHECK_TIMER=y # CONFIG_HWMON is not set # CONFIG_HWMON_VID is not set -# -# Misc devices -# -# CONFIG_IBM_ASM is not set -# CONFIG_TIFM_CORE is not set - # # Multimedia devices # @@ -1169,7 +1171,6 @@ CONFIG_USB_HIDINPUT=y # CONFIG_USB_ATI_REMOTE2 is not set # CONFIG_USB_KEYSPAN_REMOTE is not set # CONFIG_USB_APPLETOUCH is not set -# CONFIG_USB_TRANCEVIBRATOR is not set # # USB Imaging devices @@ -1215,6 +1216,7 @@ CONFIG_USB_MON=y # CONFIG_USB_APPLEDISPLAY is not set # CONFIG_USB_SISUSBVGA is not set # CONFIG_USB_LD is not set +# CONFIG_USB_TRANCEVIBRATOR is not set # CONFIG_USB_TEST is not set # @@ -1284,6 +1286,7 @@ CONFIG_EXT3_FS=y CONFIG_EXT3_FS_XATTR=y CONFIG_EXT3_FS_POSIX_ACL=y # CONFIG_EXT3_FS_SECURITY is not set +# CONFIG_EXT4DEV_FS is not set CONFIG_JBD=y # CONFIG_JBD_DEBUG is not set CONFIG_FS_MBCACHE=y @@ -1307,6 +1310,7 @@ CONFIG_DNOTIFY=y # CONFIG_AUTOFS_FS is not set CONFIG_AUTOFS4_FS=y # CONFIG_FUSE_FS is not set +CONFIG_GENERIC_ACL=y # # CD-ROM/DVD Filesystems @@ -1384,7 +1388,6 @@ CONFIG_SUNRPC=y # CONFIG_CODA_FS is not set # CONFIG_AFS_FS is not set # CONFIG_9P_FS is not set -CONFIG_GENERIC_ACL=y # # Partition Types @@ -1436,10 +1439,6 @@ CONFIG_NLS_ISO8859_15=y # CONFIG_NLS_KOI8_U is not set CONFIG_NLS_UTF8=y -# -# Distributed Lock Manager -# - # # Instrumentation Support # @@ -1480,6 +1479,7 @@ CONFIG_DEBUG_BUGVERBOSE=y CONFIG_UNWIND_INFO=y CONFIG_STACK_UNWIND=y # CONFIG_FORCED_INLINING is not set +# CONFIG_HEADERS_CHECK is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set CONFIG_EARLY_PRINTK=y From 926fafebc48a3218fac675f12974f9a46473bd40 Mon Sep 17 00:00:00 2001 From: keith mannthey Date: Sat, 21 Oct 2006 18:37:01 +0200 Subject: [PATCH 03/19] [PATCH] x86-64: x86_64 hot-add memory srat.c fix This patch corrects the logic used in srat.c to figure out what parsing what action to take when registering hot-add areas. Hot-add areas should only be added to the node information for the MEMORY_HOTPLUG_RESERVE case. When booting MEMORY_HOTPLUG_SPARSE hot-add areas on everything but the last node are getting include in the node data and during kernel boot the pages are setup then the kernel dies when the pages are used. This patch fixes this issue. Signed-off-by: Keith Mannthey Signed-off-by: Andi Kleen --- arch/x86_64/mm/srat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c index 3cc0544e25f5..1087e150a218 100644 --- a/arch/x86_64/mm/srat.c +++ b/arch/x86_64/mm/srat.c @@ -207,7 +207,7 @@ static inline int save_add_info(void) return hotadd_percent > 0; } #else -int update_end_of_memory(unsigned long end) {return 0;} +int update_end_of_memory(unsigned long end) {return -1;} static int hotadd_enough_memory(struct bootnode *nd) {return 1;} #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE static inline int save_add_info(void) {return 1;} @@ -337,7 +337,7 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) push_node_boundaries(node, nd->start >> PAGE_SHIFT, nd->end >> PAGE_SHIFT); - if (ma->flags.hot_pluggable && !reserve_hotadd(node, start, end) < 0) { + if (ma->flags.hot_pluggable && (reserve_hotadd(node, start, end) < 0)) { /* Ignore hotadd region. Undo damage */ printk(KERN_NOTICE "SRAT: Hotplug region ignored\n"); *nd = oldnode; From 45edfd1db02f818b3dc7e4743ee8585af6b78f78 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 21 Oct 2006 18:37:01 +0200 Subject: [PATCH 04/19] [PATCH] x86-64: typo in __assign_irq_vector when updating pos for vector and offset typo with cpu instead of new_cpu Signed-off-by: Yinghai Lu Signed-off-by: Andi Kleen --- arch/x86_64/kernel/io_apic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 49e94f7994c5..b848f4808510 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -651,12 +651,12 @@ static int __assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result) if (vector == IA32_SYSCALL_VECTOR) goto next; for_each_cpu_mask(new_cpu, domain) - if (per_cpu(vector_irq, cpu)[vector] != -1) + if (per_cpu(vector_irq, new_cpu)[vector] != -1) goto next; /* Found one! */ for_each_cpu_mask(new_cpu, domain) { - pos[cpu].vector = vector; - pos[cpu].offset = offset; + pos[new_cpu].vector = vector; + pos[new_cpu].offset = offset; } if (old_vector >= 0) { int old_cpu; From da8604cc2d6cd4cbde873c7ba308365e402ac7bf Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 21 Oct 2006 18:37:01 +0200 Subject: [PATCH 05/19] [PATCH] i386: fix .cfi_signal_frame copy-n-paste error This was copied, pasted but not edited. Cc: Andi Kleen Cc: Jan Beulich Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/i386/Makefile b/arch/i386/Makefile index 7cc0b189b82b..2d9d756fdd74 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -51,8 +51,8 @@ cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) # is .cfi_signal_frame supported too? -cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) -AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) +cflags-y += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,) +AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,) CFLAGS += $(cflags-y) From 469b1d8741a5970ad49f2b5a837811579ba0b6f2 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Sat, 21 Oct 2006 18:37:01 +0200 Subject: [PATCH 06/19] [PATCH] x86-64: Fix for arch/x86_64/pci/Makefile CFLAGS The arch/x86_64/pci directory was giving problems in a wierd cross-compile environment. The exact cause is unknown, but the Makefile used CFLAGS instead of EXTRA_CFLAGS. From what I can tell from Documentation/kbuild/makefiles.txt, CFLAGS should not be used for this, it should be EXTRA_CFLAGS. And it solves the cross-compile problem. Signed-off-by: Corey Minyard Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Vojtech Pavlik Cc: Sam Ravnborg Signed-off-by: Andrew Morton --- arch/x86_64/pci/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86_64/pci/Makefile b/arch/x86_64/pci/Makefile index 1eb18f421edf..149aba05a5b8 100644 --- a/arch/x86_64/pci/Makefile +++ b/arch/x86_64/pci/Makefile @@ -3,7 +3,7 @@ # # Reuse the i386 PCI subsystem # -CFLAGS += -Iarch/i386/pci +EXTRA_CFLAGS += -Iarch/i386/pci obj-y := i386.o obj-$(CONFIG_PCI_DIRECT)+= direct.o From 73bb8919b33d42cf75a0ed89bc9ca6a7128665be Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Sat, 21 Oct 2006 18:37:01 +0200 Subject: [PATCH 07/19] [PATCH] x86-64: fix page align in e820 allocator Currently some code pieces assume that address returned by find_e820_area() are page aligned. But looks like find_e820_area() had no such intention and hence one might end up stomping over some of the data. One such case is bootmem allocator initialization code stomped over bss. This patch modified find_e820_area() to return page aligned address. This might be little wasteful of memory but at the same time probably it is easier to handle page aligned memory. Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/kernel/e820.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index b3f0908668ec..a75c829c2b02 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -54,13 +54,13 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size) /* various gunk below that needed for SMP startup */ if (addr < 0x8000) { - *addrp = 0x8000; + *addrp = PAGE_ALIGN(0x8000); return 1; } /* direct mapping tables of the kernel */ if (last >= table_start<= INITRD_START && addr < INITRD_START+INITRD_SIZE) { - *addrp = INITRD_START + INITRD_SIZE; + *addrp = PAGE_ALIGN(INITRD_START + INITRD_SIZE); return 1; } #endif /* kernel code */ - if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) { - *addrp = __pa_symbol(&_end); + if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) { + *addrp = PAGE_ALIGN(__pa_symbol(&_end)); return 1; } if (last >= ebda_addr && addr < ebda_addr + ebda_size) { - *addrp = ebda_addr + ebda_size; + *addrp = PAGE_ALIGN(ebda_addr + ebda_size); return 1; } @@ -152,7 +152,7 @@ unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsi continue; while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) ; - last = addr + size; + last = PAGE_ALIGN(addr) + size; if (last > ei->addr + ei->size) continue; if (last > end) From cdfce1f5714fec7b24715f569b2fee1607350a6d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 21 Oct 2006 18:37:01 +0200 Subject: [PATCH 08/19] [PATCH] x86: Use -maccumulate-outgoing-args This avoids some problems with gcc 4.x and earlier generating invalid unwind information. In 4.1 the option is default when unwind information is enabled. And it seems to generate smaller code too, so it's probably a good thing on its own. With gcc 4.0: i386: 4683198 902112 480868 6066178 5c9002 vmlinux (before) 4449895 902112 480868 5832875 5900ab vmlinux (after) x86-64: 4939761 1449584 648216 7037561 6b6279 vmlinux (before) 4854193 1449584 648216 6951993 6a1439 vmlinux (after) On 4.1 it shouldn't make much difference because it is default when unwind is enabled anyways. Suggested by Michael Matz and Jan Beulich Cc: jbeulich@novell.com Signed-off-by: Andi Kleen --- arch/i386/Makefile | 4 ++++ arch/x86_64/Makefile | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/arch/i386/Makefile b/arch/i386/Makefile index 2d9d756fdd74..0677908dfa06 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -42,6 +42,10 @@ cflags-$(CONFIG_REGPARM) += -mregparm=3 # temporary until string.h is fixed cflags-y += -ffreestanding +# this works around some issues with generating unwind tables in older gccs +# newer gccs do it by default +cflags-y += -maccumulate-outgoing-args + # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use # a lot more stack due to the lack of sharing of stacklots: CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then echo $(call cc-option,-fno-unit-at-a-time); fi ;) diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index 1c0f18d4f887..13972148058d 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -54,6 +54,10 @@ endif cflags-y += $(call cc-option,-funit-at-a-time) # prevent gcc from generating any FP code by mistake cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) +# this works around some issues with generating unwind tables in older gccs +# newer gccs do it by default +cflags-y += -maccumulate-outgoing-args + # do binutils support CFI? cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) From 690a973f48b6ba2954465992c08e65059c8374fe Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Sat, 21 Oct 2006 18:37:01 +0200 Subject: [PATCH 09/19] [PATCH] x86-64: Speed up dwarf2 unwinder This changes the dwarf2 unwinder to do a binary search for CIEs instead of a linear work. The linker is unfortunately not able to build a proper lookup table at link time, instead it creates one at runtime as soon as the bootmem allocator is usable (so you'll continue using the linear lookup for the first [hopefully] few calls). The code should be ready to utilize a build-time created table once a fixed linker becomes available. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- Makefile | 1 + include/asm-generic/vmlinux.lds.h | 16 ++ include/linux/unwind.h | 2 + init/main.c | 1 + kernel/unwind.c | 316 ++++++++++++++++++++++++++---- 5 files changed, 298 insertions(+), 38 deletions(-) diff --git a/Makefile b/Makefile index 62a1343cf327..389ff0cca9a7 100644 --- a/Makefile +++ b/Makefile @@ -499,6 +499,7 @@ endif ifdef CONFIG_UNWIND_INFO CFLAGS += -fasynchronous-unwind-tables +LDFLAGS_vmlinux += --eh-frame-hdr endif ifdef CONFIG_DEBUG_INFO diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 69240b52f8e1..9d0d11c180d9 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -125,6 +125,10 @@ *(__param) \ VMLINUX_SYMBOL(__stop___param) = .; \ } \ + \ + /* Unwind data binary search table */ \ + EH_FRAME_HDR \ + \ __end_rodata = .; \ . = ALIGN(4096); @@ -157,6 +161,18 @@ *(.kprobes.text) \ VMLINUX_SYMBOL(__kprobes_text_end) = .; +#ifdef CONFIG_STACK_UNWIND + /* Unwind data binary search table */ +#define EH_FRAME_HDR \ + .eh_frame_hdr : AT(ADDR(.eh_frame_hdr) - LOAD_OFFSET) { \ + VMLINUX_SYMBOL(__start_unwind_hdr) = .; \ + *(.eh_frame_hdr) \ + VMLINUX_SYMBOL(__end_unwind_hdr) = .; \ + } +#else +#define EH_FRAME_HDR +#endif + /* DWARF debug sections. Symbols in the DWARF debugging sections are relative to the beginning of the section so we begin them at 0. */ diff --git a/include/linux/unwind.h b/include/linux/unwind.h index 73e1751d03dd..749928c161fb 100644 --- a/include/linux/unwind.h +++ b/include/linux/unwind.h @@ -26,6 +26,7 @@ struct module; * Initialize unwind support. */ extern void unwind_init(void); +extern void unwind_setup(void); #ifdef CONFIG_MODULES @@ -73,6 +74,7 @@ extern int unwind_to_user(struct unwind_frame_info *); struct unwind_frame_info {}; static inline void unwind_init(void) {} +static inline void unwind_setup(void) {} #ifdef CONFIG_MODULES diff --git a/init/main.c b/init/main.c index ee123243fb53..36f608a7cfba 100644 --- a/init/main.c +++ b/init/main.c @@ -503,6 +503,7 @@ asmlinkage void __init start_kernel(void) printk(KERN_NOTICE); printk(linux_banner); setup_arch(&command_line); + unwind_setup(); setup_per_cpu_areas(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ diff --git a/kernel/unwind.c b/kernel/unwind.c index 2e2368607aab..f7e50d16dbf6 100644 --- a/kernel/unwind.c +++ b/kernel/unwind.c @@ -11,13 +11,15 @@ #include #include -#include +#include +#include #include #include #include #include extern char __start_unwind[], __end_unwind[]; +extern const u8 __start_unwind_hdr[], __end_unwind_hdr[]; #define MAX_STACK_DEPTH 8 @@ -100,6 +102,8 @@ static struct unwind_table { } core, init; const void *address; unsigned long size; + const unsigned char *header; + unsigned long hdrsz; struct unwind_table *link; const char *name; } root_table; @@ -145,6 +149,10 @@ static struct unwind_table *find_table(unsigned long pc) return table; } +static unsigned long read_pointer(const u8 **pLoc, + const void *end, + signed ptrType); + static void init_unwind_table(struct unwind_table *table, const char *name, const void *core_start, @@ -152,14 +160,30 @@ static void init_unwind_table(struct unwind_table *table, const void *init_start, unsigned long init_size, const void *table_start, - unsigned long table_size) + unsigned long table_size, + const u8 *header_start, + unsigned long header_size) { + const u8 *ptr = header_start + 4; + const u8 *end = header_start + header_size; + table->core.pc = (unsigned long)core_start; table->core.range = core_size; table->init.pc = (unsigned long)init_start; table->init.range = init_size; table->address = table_start; table->size = table_size; + /* See if the linker provided table looks valid. */ + if (header_size <= 4 + || header_start[0] != 1 + || (void *)read_pointer(&ptr, end, header_start[1]) != table_start + || header_start[2] == DW_EH_PE_omit + || read_pointer(&ptr, end, header_start[2]) <= 0 + || header_start[3] == DW_EH_PE_omit) + header_start = NULL; + table->hdrsz = header_size; + smp_wmb(); + table->header = header_start; table->link = NULL; table->name = name; } @@ -169,7 +193,143 @@ void __init unwind_init(void) init_unwind_table(&root_table, "kernel", _text, _end - _text, NULL, 0, - __start_unwind, __end_unwind - __start_unwind); + __start_unwind, __end_unwind - __start_unwind, + __start_unwind_hdr, __end_unwind_hdr - __start_unwind_hdr); +} + +static const u32 bad_cie, not_fde; +static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *); +static signed fde_pointer_type(const u32 *cie); + +struct eh_frame_hdr_table_entry { + unsigned long start, fde; +}; + +static int cmp_eh_frame_hdr_table_entries(const void *p1, const void *p2) +{ + const struct eh_frame_hdr_table_entry *e1 = p1; + const struct eh_frame_hdr_table_entry *e2 = p2; + + return (e1->start > e2->start) - (e1->start < e2->start); +} + +static void swap_eh_frame_hdr_table_entries(void *p1, void *p2, int size) +{ + struct eh_frame_hdr_table_entry *e1 = p1; + struct eh_frame_hdr_table_entry *e2 = p2; + unsigned long v; + + v = e1->start; + e1->start = e2->start; + e2->start = v; + v = e1->fde; + e1->fde = e2->fde; + e2->fde = v; +} + +static void __init setup_unwind_table(struct unwind_table *table, + void *(*alloc)(unsigned long)) +{ + const u8 *ptr; + unsigned long tableSize = table->size, hdrSize; + unsigned n; + const u32 *fde; + struct { + u8 version; + u8 eh_frame_ptr_enc; + u8 fde_count_enc; + u8 table_enc; + unsigned long eh_frame_ptr; + unsigned int fde_count; + struct eh_frame_hdr_table_entry table[]; + } __attribute__((__packed__)) *header; + + if (table->header) + return; + + if (table->hdrsz) + printk(KERN_WARNING ".eh_frame_hdr for '%s' present but unusable\n", + table->name); + + if (tableSize & (sizeof(*fde) - 1)) + return; + + for (fde = table->address, n = 0; + tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde; + tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) { + const u32 *cie = cie_for_fde(fde, table); + signed ptrType; + + if (cie == ¬_fde) + continue; + if (cie == NULL + || cie == &bad_cie + || (ptrType = fde_pointer_type(cie)) < 0) + return; + ptr = (const u8 *)(fde + 2); + if (!read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType)) + return; + ++n; + } + + if (tableSize || !n) + return; + + hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int) + + 2 * n * sizeof(unsigned long); + header = alloc(hdrSize); + if (!header) + return; + header->version = 1; + header->eh_frame_ptr_enc = DW_EH_PE_abs|DW_EH_PE_native; + header->fde_count_enc = DW_EH_PE_abs|DW_EH_PE_data4; + header->table_enc = DW_EH_PE_abs|DW_EH_PE_native; + put_unaligned((unsigned long)table->address, &header->eh_frame_ptr); + BUILD_BUG_ON(offsetof(typeof(*header), fde_count) + % __alignof(typeof(header->fde_count))); + header->fde_count = n; + + BUILD_BUG_ON(offsetof(typeof(*header), table) + % __alignof(typeof(*header->table))); + for (fde = table->address, tableSize = table->size, n = 0; + tableSize; + tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) { + const u32 *cie = fde + 1 - fde[1] / sizeof(*fde); + + if (!fde[1]) + continue; /* this is a CIE */ + ptr = (const u8 *)(fde + 2); + header->table[n].start = read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + fde_pointer_type(cie)); + header->table[n].fde = (unsigned long)fde; + ++n; + } + WARN_ON(n != header->fde_count); + + sort(header->table, + n, + sizeof(*header->table), + cmp_eh_frame_hdr_table_entries, + swap_eh_frame_hdr_table_entries); + + table->hdrsz = hdrSize; + smp_wmb(); + table->header = (const void *)header; +} + +static void *__init balloc(unsigned long sz) +{ + return __alloc_bootmem_nopanic(sz, + sizeof(unsigned int), + __pa(MAX_DMA_ADDRESS)); +} + +void __init unwind_setup(void) +{ + setup_unwind_table(&root_table, balloc); } #ifdef CONFIG_MODULES @@ -193,7 +353,8 @@ void *unwind_add_table(struct module *module, init_unwind_table(table, module->name, module->module_core, module->core_size, module->module_init, module->init_size, - table_start, table_size); + table_start, table_size, + NULL, 0); if (last_table) last_table->link = table; @@ -303,6 +464,26 @@ static sleb128_t get_sleb128(const u8 **pcur, const u8 *end) return value; } +static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *table) +{ + const u32 *cie; + + if (!*fde || (*fde & (sizeof(*fde) - 1))) + return &bad_cie; + if (!fde[1]) + return ¬_fde; /* this is a CIE */ + if ((fde[1] & (sizeof(*fde) - 1)) + || fde[1] > (unsigned long)(fde + 1) - (unsigned long)table->address) + return NULL; /* this is not a valid FDE */ + cie = fde + 1 - fde[1] / sizeof(*fde); + if (*cie <= sizeof(*cie) + 4 + || *cie >= fde[1] - sizeof(*fde) + || (*cie & (sizeof(*cie) - 1)) + || cie[1]) + return NULL; /* this is not a (valid) CIE */ + return cie; +} + static unsigned long read_pointer(const u8 **pLoc, const void *end, signed ptrType) @@ -610,49 +791,108 @@ int unwind(struct unwind_frame_info *frame) unsigned i; signed ptrType = -1; uleb128_t retAddrReg = 0; - struct unwind_table *table; + const struct unwind_table *table; struct unwind_state state; if (UNW_PC(frame) == 0) return -EINVAL; if ((table = find_table(pc)) != NULL && !(table->size & (sizeof(*fde) - 1))) { - unsigned long tableSize = table->size; + const u8 *hdr = table->header; + unsigned long tableSize; - for (fde = table->address; - tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde; - tableSize -= sizeof(*fde) + *fde, - fde += 1 + *fde / sizeof(*fde)) { - if (!*fde || (*fde & (sizeof(*fde) - 1))) - break; - if (!fde[1]) - continue; /* this is a CIE */ - if ((fde[1] & (sizeof(*fde) - 1)) - || fde[1] > (unsigned long)(fde + 1) - - (unsigned long)table->address) - continue; /* this is not a valid FDE */ - cie = fde + 1 - fde[1] / sizeof(*fde); - if (*cie <= sizeof(*cie) + 4 - || *cie >= fde[1] - sizeof(*fde) - || (*cie & (sizeof(*cie) - 1)) - || cie[1] - || (ptrType = fde_pointer_type(cie)) < 0) { - cie = NULL; /* this is not a (valid) CIE */ - continue; + smp_rmb(); + if (hdr && hdr[0] == 1) { + switch(hdr[3] & DW_EH_PE_FORM) { + case DW_EH_PE_native: tableSize = sizeof(unsigned long); break; + case DW_EH_PE_data2: tableSize = 2; break; + case DW_EH_PE_data4: tableSize = 4; break; + case DW_EH_PE_data8: tableSize = 8; break; + default: tableSize = 0; break; } + ptr = hdr + 4; + end = hdr + table->hdrsz; + if (tableSize + && read_pointer(&ptr, end, hdr[1]) + == (unsigned long)table->address + && (i = read_pointer(&ptr, end, hdr[2])) > 0 + && i == (end - ptr) / (2 * tableSize) + && !((end - ptr) % (2 * tableSize))) { + do { + const u8 *cur = ptr + (i / 2) * (2 * tableSize); + + startLoc = read_pointer(&cur, + cur + tableSize, + hdr[3]); + if (pc < startLoc) + i /= 2; + else { + ptr = cur - tableSize; + i = (i + 1) / 2; + } + } while (startLoc && i > 1); + if (i == 1 + && (startLoc = read_pointer(&ptr, + ptr + tableSize, + hdr[3])) != 0 + && pc >= startLoc) + fde = (void *)read_pointer(&ptr, + ptr + tableSize, + hdr[3]); + } + } + + if (fde != NULL) { + cie = cie_for_fde(fde, table); ptr = (const u8 *)(fde + 2); - startLoc = read_pointer(&ptr, - (const u8 *)(fde + 1) + *fde, - ptrType); - endLoc = startLoc - + read_pointer(&ptr, - (const u8 *)(fde + 1) + *fde, - ptrType & DW_EH_PE_indirect - ? ptrType - : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed)); - if (pc >= startLoc && pc < endLoc) - break; - cie = NULL; + if(cie != NULL + && cie != &bad_cie + && cie != ¬_fde + && (ptrType = fde_pointer_type(cie)) >= 0 + && read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType) == startLoc) { + if (!(ptrType & DW_EH_PE_indirect)) + ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed; + endLoc = startLoc + + read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType); + if(pc >= endLoc) + fde = NULL; + } else + fde = NULL; + } + if (fde == NULL) { + for (fde = table->address, tableSize = table->size; + cie = NULL, tableSize > sizeof(*fde) + && tableSize - sizeof(*fde) >= *fde; + tableSize -= sizeof(*fde) + *fde, + fde += 1 + *fde / sizeof(*fde)) { + cie = cie_for_fde(fde, table); + if (cie == &bad_cie) { + cie = NULL; + break; + } + if (cie == NULL + || cie == ¬_fde + || (ptrType = fde_pointer_type(cie)) < 0) + continue; + ptr = (const u8 *)(fde + 2); + startLoc = read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType); + if (!startLoc) + continue; + if (!(ptrType & DW_EH_PE_indirect)) + ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed; + endLoc = startLoc + + read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType); + if (pc >= startLoc && pc < endLoc) + break; + } } } if (cie != NULL) { From 7a71cef780404e8c90d23b1131142e158d94354b Mon Sep 17 00:00:00 2001 From: "bibo,mao" Date: Sat, 21 Oct 2006 18:37:02 +0200 Subject: [PATCH 10/19] [PATCH] x86-64: x86_64 add NX mask for PTE entry If function change_page_attr_addr calls revert_page to revert to original pte value, mk_pte_phys does not mask NX bit. If NX bit is set on no NX hardware supported x86_64 machine, there is will be RSVD type page fault and system will crash. This patch adds NX mask bit for PTE entry. Signed-off-by: bibo,mao Signed-off-by: Andi Kleen --- include/asm-x86_64/pgtable.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index 6899e770b173..0555c1c4d8fa 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -366,6 +366,7 @@ static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot) { pte_t pte; pte_val(pte) = physpage | pgprot_val(pgprot); + pte_val(pte) &= __supported_pte_mask; return pte; } From 26fd5e084e470dbe8edc6f726fc918e89b9f988c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 21 Oct 2006 18:37:02 +0200 Subject: [PATCH 11/19] [PATCH] i386: Fix fake return address The fake return address was being set to __KERNEL_PDA, rather than 0. Push it earlier while %eax still equals 0. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Andrew Morton --- arch/i386/kernel/head.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index be9d883c62ce..ca31f18d277c 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -317,7 +317,7 @@ is386: movl $2,%ecx # set MP movl %eax,%gs lldt %ax cld # gcc2 wants the direction flag cleared at all times - pushl %eax # fake return address + pushl $0 # fake return address for unwinder #ifdef CONFIG_SMP movb ready, %cl movb $1, ready From cc7d479fe56133e79840beffe9cb4fd193af93aa Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Sat, 21 Oct 2006 18:37:02 +0200 Subject: [PATCH 12/19] [PATCH] x86-64: Fix ENOSYS in system call tracing This patch: - out of range system calls failing to return -ENOSYS under system call tracing [AK: split out from another patch by Jan as separate bugfix] Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/x86_64/kernel/entry.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 38a7b2d528e2..038dcf733a1b 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -315,6 +315,8 @@ tracesys: LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST cmpq $__NR_syscall_max,%rax + movq $-ENOSYS,%rcx + cmova %rcx,%rax ja 1f movq %r10,%rcx /* fixup for C */ call *sys_call_table(,%rax,8) From 581910e2eb952e541b8ca9b5f551d6c124903b61 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 21 Oct 2006 18:37:02 +0200 Subject: [PATCH 13/19] [PATCH] x86-64: Revert interrupt backlink changes They break more than they fix Signed-off-by: Andi Kleen --- arch/x86_64/kernel/entry.S | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 038dcf733a1b..e3eddde3d3bb 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -537,8 +537,6 @@ END(stub_rt_sigreturn) 1: incl %gs:pda_irqcount cmoveq %gs:pda_irqstackptr,%rsp push %rbp # backlink for old unwinder - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rbp,0 /* * We entered an interrupt context - irqs are off: */ @@ -1178,7 +1176,6 @@ ENTRY(call_softirq) incl %gs:pda_irqcount cmove %gs:pda_irqstackptr,%rsp push %rbp # backlink for old unwinder - CFI_ADJUST_CFA_OFFSET 8 call __do_softirq leaveq CFI_DEF_CFA_REGISTER rsp From a1bae67243512ca30ceda48e3e24e25b543f8ab7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 21 Oct 2006 18:37:02 +0200 Subject: [PATCH 14/19] [PATCH] i386: Disable nmi watchdog on all ThinkPads Even newer Thinkpads have bugs in SMM code that causes hangs with NMI watchdog. Signed-off-by: Andi Kleen --- arch/i386/kernel/nmi.c | 10 +++++----- drivers/firmware/dmi_scan.c | 20 ++++++++++++++++++++ include/linux/dmi.h | 2 ++ 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 3e8e3adb0489..eaafe233a5da 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -219,11 +219,11 @@ static int __init check_nmi_watchdog(void) int cpu; /* Enable NMI watchdog for newer systems. - Actually it should be safe for most systems before 2004 too except - for some IBM systems that corrupt registers when NMI happens - during SMM. Unfortunately we don't have more exact information - on these and use this coarse check. */ - if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004) + Probably safe on most older systems too, but let's be careful. + IBM ThinkPads use INT10 inside SMM and that allows early NMI inside SMM + which hangs the system. Disable watchdog for all thinkpads */ + if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004 && + !dmi_name_in_vendors("ThinkPad")) nmi_watchdog = NMI_LOCAL_APIC; if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT)) diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index b8b596d5778d..37deee6c0c1c 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -326,6 +326,26 @@ char *dmi_get_system_info(int field) } EXPORT_SYMBOL(dmi_get_system_info); + +/** + * dmi_name_in_vendors - Check if string is anywhere in the DMI vendor information. + * @str: Case sensitive Name + */ +int dmi_name_in_vendors(char *str) +{ + static int fields[] = { DMI_BIOS_VENDOR, DMI_BIOS_VERSION, DMI_SYS_VENDOR, + DMI_PRODUCT_NAME, DMI_PRODUCT_VERSION, DMI_BOARD_VENDOR, + DMI_BOARD_NAME, DMI_BOARD_VERSION, DMI_NONE }; + int i; + for (i = 0; fields[i] != DMI_NONE; i++) { + int f = fields[i]; + if (dmi_ident[f] && strstr(dmi_ident[f], str)) + return 1; + } + return 0; +} +EXPORT_SYMBOL(dmi_name_in_vendors); + /** * dmi_find_device - find onboard device by type/name * @type: device type or %DMI_DEV_TYPE_ANY to match all device types diff --git a/include/linux/dmi.h b/include/linux/dmi.h index 38dc403be70b..904bf3d2d90b 100644 --- a/include/linux/dmi.h +++ b/include/linux/dmi.h @@ -69,6 +69,7 @@ extern struct dmi_device * dmi_find_device(int type, const char *name, struct dmi_device *from); extern void dmi_scan_machine(void); extern int dmi_get_year(int field); +extern int dmi_name_in_vendors(char *str); #else @@ -77,6 +78,7 @@ static inline char * dmi_get_system_info(int field) { return NULL; } static inline struct dmi_device * dmi_find_device(int type, const char *name, struct dmi_device *from) { return NULL; } static inline int dmi_get_year(int year) { return 0; } +static inline int dmi_name_in_vendors(char *s) { return 0; } #endif From 6bf2dafad18c119beb534cbb3d882fe7a6c3f529 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 21 Oct 2006 18:37:02 +0200 Subject: [PATCH 15/19] [PATCH] x86-64: Use irq_domain in ioapic_retrigger_irq Thanks to YH Lu for spotting this. It appears I missed this function when I refactored allocate_irq_vector and introduced irq_domain, with the result that all retriggered irqs would go to cpu 0 even if we were not prepared to receive them there. While reviewing YH's patch I also noticed that this function was missing locking, and since I am now reading two values from two diffrent arrays that looks like a race we might be able to hit in the real world. Cc: Yinghai Lu Signed-off-by: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/x86_64/kernel/io_apic.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index b848f4808510..8a9a357875b7 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -1255,12 +1255,15 @@ static int ioapic_retrigger_irq(unsigned int irq) { cpumask_t mask; unsigned vector; + unsigned long flags; + spin_lock_irqsave(&vector_lock, flags); vector = irq_vector[irq]; cpus_clear(mask); - cpu_set(vector >> 8, mask); + cpu_set(first_cpu(irq_domain[irq]), mask); - send_IPI_mask(mask, vector & 0xff); + send_IPI_mask(mask, vector); + spin_unlock_irqrestore(&vector_lock, flags); return 1; } From 8cf2c51927bbeefafc25193d01b91f9ed3806e96 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 21 Oct 2006 18:37:02 +0200 Subject: [PATCH 16/19] [PATCH] x86: Revert new unwind kernel stack termination Jan convinced me that it was unnecessary because the assembly stubs do this already on the stack. Cc: jbeulich@novell.com Signed-off-by: Andi Kleen --- arch/i386/kernel/process.c | 6 +----- arch/x86_64/kernel/entry.S | 5 ----- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 57d375900afb..1e1fa3e391a3 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -336,7 +336,6 @@ extern void kernel_thread_helper(void); int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) { struct pt_regs regs; - int err; memset(®s, 0, sizeof(regs)); @@ -351,10 +350,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; /* Ok, create the new process.. */ - err = do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); - if (err == 0) /* terminate kernel stack */ - task_pt_regs(current)->eip = 0; - return err; + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); } EXPORT_SYMBOL(kernel_thread); diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index e3eddde3d3bb..7d401b00d822 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -980,11 +980,6 @@ ENTRY(kernel_thread) call do_fork movq %rax,RAX(%rsp) xorl %edi,%edi - test %rax,%rax - jnz 1f - /* terminate stack in child */ - movq %rdi,RIP(%rsp) -1: /* * It isn't worth to check for reschedule here, From 84f404f695b16bd142c8dd9910d5a398f54fb044 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 21 Oct 2006 18:37:02 +0200 Subject: [PATCH 17/19] [PATCH] x86-64: Put more than one cpu in TARGET_CPUS TARGET_CPUS is the default irq routing poicy. It specifies which cpus the kernel should aim an irq at. In physflat delivery mode we can route an irq to a single cpu. But that doesn't mean our default policy should only be a single cpu is allowed. By allowing the irq routing code to select from multiple cpus this enables systems with more irqs then we can service on a single processor to actually work. I just audited and tested the code and irqbalance doesn't care, and the io_apic.c doesn't care if we have extra cpus in the mask. Everything will use or assume we are using the lowest numbered cpu in the mask if we can't use them all. So this should result in no behavior changes except on systems that need it. Thanks for YH Lu for spotting this problem in his testing. Cc: Yinghai Lu Signed-off-by: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/x86_64/kernel/genapic_flat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index 0dfc223c1839..7c01db8fa9d1 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -153,7 +153,7 @@ struct genapic apic_flat = { static cpumask_t physflat_target_cpus(void) { - return cpumask_of_cpu(0); + return cpu_online_map; } static cpumask_t physflat_vector_allocation_domain(int cpu) From dbaab49f92ff6ae6255762a948375e4036cbdbd2 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Sat, 21 Oct 2006 18:37:03 +0200 Subject: [PATCH 18/19] [PATCH] x86-64: Overlapping program headers in physical addr space fix o A recent change to vmlinux.ld.S file broke kexec as now resulting vmlinux program headers are overlapping in physical address space. o Now all the vsyscall related sections are placed after data and after that mostly init data sections are placed. To avoid physical overlap among phdrs, there are three possible solutions. - Place vsyscall sections also in data phdrs instead of user - move vsyscal sections after init data in bss. - create another phdrs say data.init and move all the sections after vsyscall into this new phdr. o This patch implements the third solution. Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen Cc: Magnus Damm Cc: Andi Kleen Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- arch/x86_64/kernel/vmlinux.lds.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index b9df2ab6529f..1283614c9b24 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -17,6 +17,7 @@ PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(7); /* RWE */ user PT_LOAD FLAGS(7); /* RWE */ + data.init PT_LOAD FLAGS(7); /* RWE */ note PT_NOTE FLAGS(4); /* R__ */ } SECTIONS @@ -131,7 +132,7 @@ SECTIONS . = ALIGN(8192); /* init_task */ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { *(.data.init_task) - } :data + }:data.init . = ALIGN(4096); .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { From e70ea8c09db0e25ab58f84ba7f393e5c9125a8ee Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 21 Oct 2006 18:37:03 +0200 Subject: [PATCH 19/19] [PATCH] x86-64: Revert timer routing behaviour back to 2.6.16 state By default route the 8254 over the 8259 and only disable it on ATI boards where this causes double timer interrupts. This should unbreak some Nvidia boards where the timer doesn't seem to tick of it isn't enabled in the 8259. At least one VIA board also seemed to have a little trouble with the disabled 8259. For 2.6.20 we'll try both dynamically without black listing, but I think for .19 this is the safer approach because it has been already well tested in earlier kernels. This also makes the x86-64 behaviour the same as i386. Command line options can change all this of course. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/early-quirks.c | 9 +++++---- arch/x86_64/kernel/io_apic.c | 2 +- include/asm-x86_64/proto.h | 2 ++ 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c index 208e38a372c1..2b1245d86258 100644 --- a/arch/x86_64/kernel/early-quirks.c +++ b/arch/x86_64/kernel/early-quirks.c @@ -61,10 +61,11 @@ static void nvidia_bugs(void) static void ati_bugs(void) { -#if 1 /* for testing */ - printk("ATI board detected\n"); -#endif - /* No bugs right now */ + if (timer_over_8254 == 1) { + timer_over_8254 = 0; + printk(KERN_INFO + "ATI board detected. Disabling timer routing over 8254.\n"); + } } struct chipset { diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 8a9a357875b7..b000017e4b5d 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -57,7 +57,7 @@ static int no_timer_check; static int disable_timer_pin_1 __initdata; -int timer_over_8254 __initdata = 0; +int timer_over_8254 __initdata = 1; /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index c181fef786e4..e72cfcdf5344 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -122,6 +122,8 @@ extern int fix_aperture; extern int reboot_force; extern int notsc_setup(char *); +extern int timer_over_8254; + extern int gsi_irq_sharing(int gsi); extern void smp_local_timer_interrupt(void);