From 8d69abb08343706f88380edb83927ffc0fc83098 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Sun, 18 Jan 2009 11:55:19 +0200 Subject: [PATCH 001/105] [ARM] pxa: fix NAND and MMC clock initialization for pxa3xx After commit 8c3abc7d903df492a7394b0adae4349d9a381aaf ("[ARM] pxa: convert to clkdev and match clocks by struct device where possible") get_clk in pxa3xx_nand fails with -ENOENT. Apparently, clk_get in pxamci will also fail for MCI2 on PXA310. The 'clk_find' and therefore 'clk_get' require driver to supply both 'dev_id' and 'con_id' if they are not NULL in the 'strcut clk_lookup', but neither pxa3xx_nand nor pxamci supply 'con_id'. This patch sets 'con_id' to NULL in NAND clock and MCI2 clock registration. Signed-off-by: Mike Rapoport Signed-off-by: Eric Miao --- arch/arm/mach-pxa/pxa300.c | 4 ++-- arch/arm/mach-pxa/pxa320.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/mach-pxa/pxa300.c b/arch/arm/mach-pxa/pxa300.c index f735e58e6669..83fb609b6eb7 100644 --- a/arch/arm/mach-pxa/pxa300.c +++ b/arch/arm/mach-pxa/pxa300.c @@ -88,13 +88,13 @@ static struct pxa3xx_mfp_addr_map pxa310_mfp_addr_map[] __initdata = { static DEFINE_PXA3_CKEN(common_nand, NAND, 156000000, 0); static struct clk_lookup common_clkregs[] = { - INIT_CLKREG(&clk_common_nand, "pxa3xx-nand", "NANDCLK"), + INIT_CLKREG(&clk_common_nand, "pxa3xx-nand", NULL), }; static DEFINE_PXA3_CKEN(pxa310_mmc3, MMC3, 19500000, 0); static struct clk_lookup pxa310_clkregs[] = { - INIT_CLKREG(&clk_pxa310_mmc3, "pxa2xx-mci.2", "MMCCLK"), + INIT_CLKREG(&clk_pxa310_mmc3, "pxa2xx-mci.2", NULL), }; static int __init pxa300_init(void) diff --git a/arch/arm/mach-pxa/pxa320.c b/arch/arm/mach-pxa/pxa320.c index effe408c186f..36f066196fa2 100644 --- a/arch/arm/mach-pxa/pxa320.c +++ b/arch/arm/mach-pxa/pxa320.c @@ -83,7 +83,7 @@ static struct pxa3xx_mfp_addr_map pxa320_mfp_addr_map[] __initdata = { static DEFINE_PXA3_CKEN(pxa320_nand, NAND, 104000000, 0); static struct clk_lookup pxa320_clkregs[] = { - INIT_CLKREG(&clk_pxa320_nand, "pxa3xx-nand", "NANDCLK"), + INIT_CLKREG(&clk_pxa320_nand, "pxa3xx-nand", NULL), }; static int __init pxa320_init(void) From ec971c91c55b8e2e2609f8d61eb34da13aedb37d Mon Sep 17 00:00:00 2001 From: Eric Miao Date: Mon, 19 Jan 2009 11:39:36 +0800 Subject: [PATCH 002/105] [ARM] pxa: fix missing of __REG() definition for ac97 registers access This currently happens for 'drivers/input/touch/mainstone-wm97xx.c'. Signed-off-by: Eric Miao --- arch/arm/mach-pxa/include/mach/regs-ac97.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/mach-pxa/include/mach/regs-ac97.h b/arch/arm/mach-pxa/include/mach/regs-ac97.h index e41b9d202b8c..b8d14bd9ae59 100644 --- a/arch/arm/mach-pxa/include/mach/regs-ac97.h +++ b/arch/arm/mach-pxa/include/mach/regs-ac97.h @@ -1,6 +1,8 @@ #ifndef __ASM_ARCH_REGS_AC97_H #define __ASM_ARCH_REGS_AC97_H +#include + /* * AC97 Controller registers */ From b6729deb26a131083add5b7238c7b7478ef6b502 Mon Sep 17 00:00:00 2001 From: Eric Miao Date: Mon, 19 Jan 2009 11:42:32 +0800 Subject: [PATCH 003/105] [ARM] pxa: make more SSCR0 bit definitions visible on multiple processors The only exclusive definitions are SSCR0_SCR and SSCR0_SerClkDiv(), loosen that exclusive #ifdef .. #else .. #endif to allow other definitions to be visible when slected multiple processors. This helps to pass the building of pxa-ssp.c. Signed-off-by: Eric Miao --- arch/arm/mach-pxa/include/mach/regs-ssp.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm/mach-pxa/include/mach/regs-ssp.h b/arch/arm/mach-pxa/include/mach/regs-ssp.h index 3c04cde2cf1f..cf31986f6f05 100644 --- a/arch/arm/mach-pxa/include/mach/regs-ssp.h +++ b/arch/arm/mach-pxa/include/mach/regs-ssp.h @@ -41,6 +41,9 @@ #elif defined(CONFIG_PXA27x) || defined(CONFIG_PXA3xx) #define SSCR0_SCR (0x000fff00) /* Serial Clock Rate (mask) */ #define SSCR0_SerClkDiv(x) (((x) - 1) << 8) /* Divisor [1..4096] */ +#endif + +#if defined(CONFIG_PXA27x) || defined(CONFIG_PXA3xx) #define SSCR0_EDSS (1 << 20) /* Extended data size select */ #define SSCR0_NCS (1 << 21) /* Network clock select */ #define SSCR0_RIM (1 << 22) /* Receive FIFO overrrun interrupt mask */ From 26a552264bc92d2ec4747b1babb7cb1264908018 Mon Sep 17 00:00:00 2001 From: Eric Miao Date: Wed, 21 Jan 2009 11:29:19 +0800 Subject: [PATCH 004/105] [ARM] pxa: stop and disable IRQ for each DMA channels at startup Some broken bootloaders will leave the DMA channel state unclean, which we should really initialize correctly here. Signed-off-by: Eric Miao --- arch/arm/mach-pxa/dma.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/arm/mach-pxa/dma.c b/arch/arm/mach-pxa/dma.c index b1514fb20d3a..7de17fc5d54b 100644 --- a/arch/arm/mach-pxa/dma.c +++ b/arch/arm/mach-pxa/dma.c @@ -121,6 +121,16 @@ int __init pxa_init_dma(int num_ch) if (dma_channels == NULL) return -ENOMEM; + /* dma channel priorities on pxa2xx processors: + * ch 0 - 3, 16 - 19 <--> (0) DMA_PRIO_HIGH + * ch 4 - 7, 20 - 23 <--> (1) DMA_PRIO_MEDIUM + * ch 8 - 15, 24 - 31 <--> (2) DMA_PRIO_LOW + */ + for (i = 0; i < num_ch; i++) { + DCSR(i) = 0; + dma_channels[i].prio = min((i & 0xf) >> 2, DMA_PRIO_LOW); + } + ret = request_irq(IRQ_DMA, dma_irq_handler, IRQF_DISABLED, "DMA", NULL); if (ret) { printk (KERN_CRIT "Wow! Can't register IRQ for DMA\n"); @@ -128,14 +138,6 @@ int __init pxa_init_dma(int num_ch) return ret; } - /* dma channel priorities on pxa2xx processors: - * ch 0 - 3, 16 - 19 <--> (0) DMA_PRIO_HIGH - * ch 4 - 7, 20 - 23 <--> (1) DMA_PRIO_MEDIUM - * ch 8 - 15, 24 - 31 <--> (2) DMA_PRIO_LOW - */ - for (i = 0; i < num_ch; i++) - dma_channels[i].prio = min((i & 0xf) >> 2, DMA_PRIO_LOW); - num_dma_channels = num_ch; return 0; } From ccc9c8b91c2631da2cab46a6fcd9c3106dcb9abb Mon Sep 17 00:00:00 2001 From: Balaji Rao Date: Tue, 27 Jan 2009 19:22:38 +0530 Subject: [PATCH 005/105] pcf50633_charger: Fix typo container_of(psy, struct pcf50633_mbc, usb); should be container_of(psy, struct pcf50633_mbc, adapter); Signed-off-by: Balaji Rao Cc: Andy Green Signed-off-by: Anton Vorontsov --- drivers/power/pcf50633-charger.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/power/pcf50633-charger.c b/drivers/power/pcf50633-charger.c index e988ec130fcd..41aec2acbb91 100644 --- a/drivers/power/pcf50633-charger.c +++ b/drivers/power/pcf50633-charger.c @@ -199,7 +199,8 @@ static int adapter_get_property(struct power_supply *psy, enum power_supply_property psp, union power_supply_propval *val) { - struct pcf50633_mbc *mbc = container_of(psy, struct pcf50633_mbc, usb); + struct pcf50633_mbc *mbc = container_of(psy, + struct pcf50633_mbc, adapter); int ret = 0; switch (psp) { From 48ec4d9537282a55d602136724f069faafcac8c8 Mon Sep 17 00:00:00 2001 From: Kyle McMartin Date: Wed, 4 Feb 2009 15:54:45 -0500 Subject: [PATCH 006/105] x86, 64-bit: print DMI info in the oops trace This patch echoes what we already do on 32-bit since 90f7d25c6b672137344f447a30a9159945ffea72, and prints the DMI product name in show_regs, so that system specific problems can be easily identified. Signed-off-by: Kyle McMartin Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_64.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 416fb9282f4f..85b4cb5c1980 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -151,14 +152,18 @@ void __show_regs(struct pt_regs *regs, int all) unsigned long d0, d1, d2, d3, d6, d7; unsigned int fsindex, gsindex; unsigned int ds, cs, es; + const char *board; printk("\n"); print_modules(); - printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n", + board = dmi_get_system_info(DMI_PRODUCT_NAME); + if (!board) + board = ""; + printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); + init_utsname()->version, board); printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); printk_address(regs->ip, 1); printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, From 36723bfe81f374fd8abc05a46f10a7cac2f01a75 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 4 Feb 2009 21:44:04 +0100 Subject: [PATCH 007/105] x86/Kconfig.cpu: make Kconfig help readable in the console Impact: cleanup Some lines exceed the 80 char width making them unreadable. Signed-off-by: Borislav Petkov Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 8078955845ae..c98d52e82966 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -167,9 +167,9 @@ config MK7 config MK8 bool "Opteron/Athlon64/Hammer/K8" help - Select this for an AMD Opteron or Athlon64 Hammer-family processor. Enables - use of some extended instructions, and passes appropriate optimization - flags to GCC. + Select this for an AMD Opteron or Athlon64 Hammer-family processor. + Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. config MCRUSOE bool "Crusoe" @@ -256,9 +256,11 @@ config MPSC config MCORE2 bool "Core 2/newer Xeon" help - Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and 53xx) - CPUs. You can distinguish newer from older Xeons by the CPU family - in /proc/cpuinfo. Newer ones have 6 and older ones 15 (not a typo) + + Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and + 53xx) CPUs. You can distinguish newer from older Xeons by the CPU + family in /proc/cpuinfo. Newer ones have 6 and older ones 15 + (not a typo) config GENERIC_CPU bool "Generic-x86-64" @@ -320,14 +322,14 @@ config X86_PPRO_FENCE bool "PentiumPro memory ordering errata workaround" depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 || MGEODEGX1 help - Old PentiumPro multiprocessor systems had errata that could cause memory - operations to violate the x86 ordering standard in rare cases. Enabling this - option will attempt to work around some (but not all) occurances of - this problem, at the cost of much heavier spinlock and memory barrier - operations. + Old PentiumPro multiprocessor systems had errata that could cause + memory operations to violate the x86 ordering standard in rare cases. + Enabling this option will attempt to work around some (but not all) + occurances of this problem, at the cost of much heavier spinlock and + memory barrier operations. - If unsure, say n here. Even distro kernels should think twice before enabling - this: there are few systems, and an unlikely bug. + If unsure, say n here. Even distro kernels should think twice before + enabling this: there are few systems, and an unlikely bug. config X86_F00F_BUG def_bool y From 483b4ee60edbefdfbff0dd538fb81f368d9e7c0d Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 4 Feb 2009 11:59:44 -0800 Subject: [PATCH 008/105] sched: fix nohz load balancer on cpu offline Christian Borntraeger reports: > After a logical cpu offline, even on a complete idle system, there > is one cpu with full ticks. It turns out that nohz.cpu_mask has the > the offlined cpu still set. > > In select_nohz_load_balancer() we check if the system is completely > idle to turn of load balancing. We compare cpu_online_map with > nohz.cpu_mask. Since cpu_online_map is updated on cpu unplug, > but nohz.cpu_mask is not, the check fails and the scheduler believes > that we need an "idle load balancer" even on a fully idle system. > Since the ilb cpu does not deactivate the timer tick this breaks NOHZ. Fix the select_nohz_load_balancer() to not set the nohz.cpu_mask while a cpu is going offline. Reported-by: Christian Borntraeger Signed-off-by: Suresh Siddha Tested-by: Christian Borntraeger Signed-off-by: Ingo Molnar --- kernel/sched.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 242d0d47a70d..e1fc67d0674c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3890,19 +3890,24 @@ int select_nohz_load_balancer(int stop_tick) int cpu = smp_processor_id(); if (stop_tick) { - cpumask_set_cpu(cpu, nohz.cpu_mask); cpu_rq(cpu)->in_nohz_recently = 1; - /* - * If we are going offline and still the leader, give up! - */ - if (!cpu_active(cpu) && - atomic_read(&nohz.load_balancer) == cpu) { + if (!cpu_active(cpu)) { + if (atomic_read(&nohz.load_balancer) != cpu) + return 0; + + /* + * If we are going offline and still the leader, + * give up! + */ if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) BUG(); + return 0; } + cpumask_set_cpu(cpu, nohz.cpu_mask); + /* time for ilb owner also to sleep */ if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { if (atomic_read(&nohz.load_balancer) == cpu) From dc4ff585ffbc6cb0c872697b2d5f42293a32e5c8 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 4 Feb 2009 13:48:11 -0800 Subject: [PATCH 009/105] sparc64: Call dump_stack() in die_nmi(). Signed-off-by: David S. Miller --- arch/sparc/kernel/nmi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c index 09f088ed4a64..f3577223c863 100644 --- a/arch/sparc/kernel/nmi.c +++ b/arch/sparc/kernel/nmi.c @@ -70,6 +70,7 @@ static void die_nmi(const char *str, struct pt_regs *regs, int do_panic) printk(" on CPU%d, ip %08lx, registers:\n", smp_processor_id(), regs->tpc); show_regs(regs); + dump_stack(); bust_spinlocks(0); From a6a95406c676ffe4f9dee708eb404a17c69f7fdd Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 4 Feb 2009 13:40:31 +0300 Subject: [PATCH 010/105] x86: fix hpet timer reinit for x86_64 There's a small problem with hpet_rtc_reinit function - it checks for the: hpet_readl(HPET_COUNTER) - hpet_t1_cmp > 0 to continue increasing both the HPET_T1_CMP (register) and the hpet_t1_cmp (variable). But since the HPET_COUNTER is always 32-bit, if the hpet_t1_cmp is 64-bit this condition will always be FALSE once the latter hits the 32-bit boundary, and we can have a situation, when we don't increase the HPET_T1_CMP register high enough. The result - timer stops ticking, since HPET_T1_CMP becomes less, than the COUNTER and never increased again. The solution is (based on Linus's suggestion) to not compare 64-bits (on 64-bit x86), but to do the comparison on 32-bit signed integers. Reported-by: Kirill Korotaev Signed-off-by: Pavel Emelyanov Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 64d5ad0b8add..c761f914430a 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1075,7 +1075,7 @@ static void hpet_rtc_timer_reinit(void) hpet_t1_cmp += delta; hpet_writel(hpet_t1_cmp, HPET_T1_CMP); lost_ints++; - } while ((long)(hpet_readl(HPET_COUNTER) - hpet_t1_cmp) > 0); + } while ((s32)(hpet_readl(HPET_COUNTER) - hpet_t1_cmp) > 0); if (lost_ints) { if (hpet_rtc_flags & RTC_PIE) From 4560839939f4b4a96e21e80584f87308ac93c1da Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Wed, 4 Feb 2009 16:44:01 -0700 Subject: [PATCH 011/105] x86: fix grammar in user-visible BIOS warning Fix user-visible grammo. Signed-off-by: Alex Chiang Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ae0d8042cf69..c461f6d69074 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -607,7 +607,7 @@ struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { printk(KERN_NOTICE - "%s detected: BIOS may corrupt low RAM, working it around.\n", + "%s detected: BIOS may corrupt low RAM, working around it.\n", d->ident); e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED); From b534816b552d35bbd3c60702139ed5c7da2f55c2 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 4 Feb 2009 18:33:38 -0800 Subject: [PATCH 012/105] x86: don't apply __supported_pte_mask to non-present ptes On an x86 system which doesn't support global mappings, __supported_pte_mask has _PAGE_GLOBAL clear, to make sure it never appears in the PTE. pfn_pte() and so on will enforce it with: static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & __supported_pte_mask); } However, we overload _PAGE_GLOBAL with _PAGE_PROTNONE on non-present ptes to distinguish them from swap entries. However, applying __supported_pte_mask indiscriminately will clear the bit and corrupt the pte. I guess the best fix is to only apply __supported_pte_mask to present ptes. This seems like the right solution to me, as it means we can completely ignore the issue of overlaps between the present pte bits and the non-present pte-as-swap entry use of the bits. __supported_pte_mask contains the set of flags we support on the current hardware. We also use bits in the pte for things like logically present ptes with no permissions, and swap entries for swapped out pages. We should only apply __supported_pte_mask to present ptes, because otherwise we may destroy other information being stored in the ptes. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable.h | 26 ++++++++++++++++++++------ arch/x86/include/asm/xen/page.h | 2 +- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 06bbcbd66e9c..4f5af8447d54 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -302,16 +302,30 @@ static inline pte_t pte_mkspecial(pte_t pte) extern pteval_t __supported_pte_mask; +/* + * Mask out unsupported bits in a present pgprot. Non-present pgprots + * can use those bits for other purposes, so leave them be. + */ +static inline pgprotval_t massage_pgprot(pgprot_t pgprot) +{ + pgprotval_t protval = pgprot_val(pgprot); + + if (protval & _PAGE_PRESENT) + protval &= __supported_pte_mask; + + return protval; +} + static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { - return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) | - pgprot_val(pgprot)) & __supported_pte_mask); + return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | + massage_pgprot(pgprot)); } static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) { - return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) | - pgprot_val(pgprot)) & __supported_pte_mask); + return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | + massage_pgprot(pgprot)); } static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) @@ -323,7 +337,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) * the newprot (if present): */ val &= _PAGE_CHG_MASK; - val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask; + val |= massage_pgprot(newprot) & ~_PAGE_CHG_MASK; return __pte(val); } @@ -339,7 +353,7 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK) -#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask) +#define canon_pgprot(p) __pgprot(massage_pgprot(p)) static inline int is_new_memtype_allowed(unsigned long flags, unsigned long new_flags) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 7ef617ef1df3..4bd990ee43df 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -137,7 +137,7 @@ static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot) pte_t pte; pte.pte = ((phys_addr_t)page_nr << PAGE_SHIFT) | - (pgprot_val(pgprot) & __supported_pte_mask); + massage_pgprot(pgprot); return pte; } From 32bd671d6cbeda60dc73be77fa2b9037d9a9bfa0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Feb 2009 12:24:15 +0100 Subject: [PATCH 013/105] signal: re-add dead task accumulation stats. We're going to split the process wide cpu accounting into two parts: - clocks; which can take all the time they want since they run from user context. - timers; which need constant time tracing but can affort the overhead because they're default off -- and rare. The clock readout will go back to a full sum of the thread group, for this we need to re-add the exit stats that were removed in the initial itimer rework (f06febc9: timers: fix itimer/many thread hang). Furthermore, since that full sum can be rather slow for large thread groups and we have the complete dead task stats, revert the do_notify_parent time computation. Signed-off-by: Peter Zijlstra Reviewed-by: Ingo Molnar Signed-off-by: Ingo Molnar --- include/linux/sched.h | 10 +++++++++- kernel/exit.c | 3 +++ kernel/fork.c | 3 ++- kernel/signal.c | 8 ++++---- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 2127e959e0f4..2e0646a30314 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -559,7 +559,7 @@ struct signal_struct { * Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ - cputime_t cutime, cstime; + cputime_t utime, stime, cutime, cstime; cputime_t gtime; cputime_t cgtime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; @@ -567,6 +567,14 @@ struct signal_struct { unsigned long inblock, oublock, cinblock, coublock; struct task_io_accounting ioac; + /* + * Cumulative ns of schedule CPU time fo dead threads in the + * group, not including a zombie group leader, (This only differs + * from jiffies_to_ns(utime + stime) if sched_clock uses something + * other than jiffies.) + */ + unsigned long long sum_sched_runtime; + /* * We don't bother to synchronize most readers of this at all, * because there is no reader checking a limit that actually needs diff --git a/kernel/exit.c b/kernel/exit.c index f80dec3f1875..efd30ccf3858 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -118,6 +118,8 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ + sig->utime = cputime_add(sig->utime, task_utime(tsk)); + sig->stime = cputime_add(sig->stime, task_stime(tsk)); sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; @@ -126,6 +128,7 @@ static void __exit_signal(struct task_struct *tsk) sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); + sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } diff --git a/kernel/fork.c b/kernel/fork.c index 242a706e7721..e8e854a04ad2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -851,13 +851,14 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->tty_old_pgrp = NULL; sig->tty = NULL; - sig->cutime = sig->cstime = cputime_zero; + sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; sig->gtime = cputime_zero; sig->cgtime = cputime_zero; sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; task_io_accounting_init(&sig->ioac); + sig->sum_sched_runtime = 0; taskstats_tgid_init(sig); task_lock(current->group_leader); diff --git a/kernel/signal.c b/kernel/signal.c index b6b36768b758..2a74fe87c0dd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1367,7 +1367,6 @@ int do_notify_parent(struct task_struct *tsk, int sig) struct siginfo info; unsigned long flags; struct sighand_struct *psig; - struct task_cputime cputime; int ret = sig; BUG_ON(sig == -1); @@ -1397,9 +1396,10 @@ int do_notify_parent(struct task_struct *tsk, int sig) info.si_uid = __task_cred(tsk)->uid; rcu_read_unlock(); - thread_group_cputime(tsk, &cputime); - info.si_utime = cputime_to_jiffies(cputime.utime); - info.si_stime = cputime_to_jiffies(cputime.stime); + info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, + tsk->signal->utime)); + info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime, + tsk->signal->stime)); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) From 4cd4c1b40d40447fb5e7ba80746c6d7ba91d7a53 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Feb 2009 12:24:16 +0100 Subject: [PATCH 014/105] timers: split process wide cpu clocks/timers Change the process wide cpu timers/clocks so that we: 1) don't mess up the kernel with too many threads, 2) don't have a per-cpu allocation for each process, 3) have no impact when not used. In order to accomplish this we're going to split it into two parts: - clocks; which can take all the time they want since they run from user context -- ie. sys_clock_gettime(CLOCK_PROCESS_CPUTIME_ID) - timers; which need constant time sampling but since they're explicity used, the user can pay the overhead. The clock readout will go back to a full sum of the thread group, while the timers will run of a global 'clock' that only runs when needed, so only programs that make use of the facility pay the price. Signed-off-by: Peter Zijlstra Reviewed-by: Ingo Molnar Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 11 +++-- include/linux/sched.h | 54 ++++++++++++---------- kernel/itimer.c | 4 +- kernel/posix-cpu-timers.c | 95 +++++++++++++++++++++++++++++++++++++-- kernel/sched_stats.h | 45 +++++++++++-------- 5 files changed, 155 insertions(+), 54 deletions(-) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index ea0ea1a4c36f..e752d973fa21 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -48,12 +48,11 @@ extern struct fs_struct init_fs; .posix_timers = LIST_HEAD_INIT(sig.posix_timers), \ .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ .rlim = INIT_RLIMITS, \ - .cputime = { .totals = { \ - .utime = cputime_zero, \ - .stime = cputime_zero, \ - .sum_exec_runtime = 0, \ - .lock = __SPIN_LOCK_UNLOCKED(sig.cputime.totals.lock), \ - }, }, \ + .cputimer = { \ + .cputime = INIT_CPUTIME, \ + .running = 0, \ + .lock = __SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \ + }, \ } extern struct nsproxy init_nsproxy; diff --git a/include/linux/sched.h b/include/linux/sched.h index 2e0646a30314..082d7619b3a1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -443,7 +443,6 @@ struct pacct_struct { * @utime: time spent in user mode, in &cputime_t units * @stime: time spent in kernel mode, in &cputime_t units * @sum_exec_runtime: total time spent on the CPU, in nanoseconds - * @lock: lock for fields in this struct * * This structure groups together three kinds of CPU time that are * tracked for threads and thread groups. Most things considering @@ -454,23 +453,33 @@ struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; - spinlock_t lock; }; /* Alternate field names when used to cache expirations. */ #define prof_exp stime #define virt_exp utime #define sched_exp sum_exec_runtime +#define INIT_CPUTIME \ + (struct task_cputime) { \ + .utime = cputime_zero, \ + .stime = cputime_zero, \ + .sum_exec_runtime = 0, \ + } + /** - * struct thread_group_cputime - thread group interval timer counts - * @totals: thread group interval timers; substructure for - * uniprocessor kernel, per-cpu for SMP kernel. + * struct thread_group_cputimer - thread group interval timer counts + * @cputime: thread group interval timers. + * @running: non-zero when there are timers running and + * @cputime receives updates. + * @lock: lock for fields in this struct. * * This structure contains the version of task_cputime, above, that is - * used for thread group CPU clock calculations. + * used for thread group CPU timer calculations. */ -struct thread_group_cputime { - struct task_cputime totals; +struct thread_group_cputimer { + struct task_cputime cputime; + int running; + spinlock_t lock; }; /* @@ -519,10 +528,10 @@ struct signal_struct { cputime_t it_prof_incr, it_virt_incr; /* - * Thread group totals for process CPU clocks. - * See thread_group_cputime(), et al, for details. + * Thread group totals for process CPU timers. + * See thread_group_cputimer(), et al, for details. */ - struct thread_group_cputime cputime; + struct thread_group_cputimer cputimer; /* Earliest-expiration cache. */ struct task_cputime cputime_expires; @@ -2191,27 +2200,26 @@ static inline int spin_needbreak(spinlock_t *lock) /* * Thread group CPU time accounting. */ +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); static inline -void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) { - struct task_cputime *totals = &tsk->signal->cputime.totals; + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; unsigned long flags; - spin_lock_irqsave(&totals->lock, flags); - *times = *totals; - spin_unlock_irqrestore(&totals->lock, flags); + WARN_ON(!cputimer->running); + + spin_lock_irqsave(&cputimer->lock, flags); + *times = cputimer->cputime; + spin_unlock_irqrestore(&cputimer->lock, flags); } static inline void thread_group_cputime_init(struct signal_struct *sig) { - sig->cputime.totals = (struct task_cputime){ - .utime = cputime_zero, - .stime = cputime_zero, - .sum_exec_runtime = 0, - }; - - spin_lock_init(&sig->cputime.totals.lock); + sig->cputimer.cputime = INIT_CPUTIME; + spin_lock_init(&sig->cputimer.lock); + sig->cputimer.running = 0; } static inline void thread_group_cputime_free(struct signal_struct *sig) diff --git a/kernel/itimer.c b/kernel/itimer.c index 6a5fe93dd8bd..58762f7077ec 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -62,7 +62,7 @@ int do_getitimer(int which, struct itimerval *value) struct task_cputime cputime; cputime_t utime; - thread_group_cputime(tsk, &cputime); + thread_group_cputimer(tsk, &cputime); utime = cputime.utime; if (cputime_le(cval, utime)) { /* about to fire */ cval = jiffies_to_cputime(1); @@ -82,7 +82,7 @@ int do_getitimer(int which, struct itimerval *value) struct task_cputime times; cputime_t ptime; - thread_group_cputime(tsk, ×); + thread_group_cputimer(tsk, ×); ptime = cputime_add(times.utime, times.stime); if (cputime_le(cval, ptime)) { /* about to fire */ cval = jiffies_to_cputime(1); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index fa07da94d7be..db107c9bbc05 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -230,6 +230,37 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, return 0; } +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) +{ + struct sighand_struct *sighand; + struct signal_struct *sig; + struct task_struct *t; + + *times = INIT_CPUTIME; + + rcu_read_lock(); + sighand = rcu_dereference(tsk->sighand); + if (!sighand) + goto out; + + sig = tsk->signal; + + t = tsk; + do { + times->utime = cputime_add(times->utime, t->utime); + times->stime = cputime_add(times->stime, t->stime); + times->sum_exec_runtime += t->se.sum_exec_runtime; + + t = next_thread(t); + } while (t != tsk); + + times->utime = cputime_add(times->utime, sig->utime); + times->stime = cputime_add(times->stime, sig->stime); + times->sum_exec_runtime += sig->sum_sched_runtime; +out: + rcu_read_unlock(); +} + /* * Sample a process (thread group) clock for the given group_leader task. * Must be called with tasklist_lock held for reading. @@ -475,6 +506,29 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) now); } +/* + * Enable the process wide cpu timer accounting. + * + * serialized using ->sighand->siglock + */ +static void start_process_timers(struct task_struct *tsk) +{ + tsk->signal->cputimer.running = 1; + barrier(); +} + +/* + * Release the process wide timer accounting -- timer stops ticking when + * nobody cares about it. + * + * serialized using ->sighand->siglock + */ +static void stop_process_timers(struct task_struct *tsk) +{ + tsk->signal->cputimer.running = 0; + barrier(); +} + /* * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the tasklist_lock held @@ -495,6 +549,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) BUG_ON(!irqs_disabled()); spin_lock(&p->sighand->siglock); + if (!CPUCLOCK_PERTHREAD(timer->it_clock)) + start_process_timers(p); + listpos = head; if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { list_for_each_entry(next, head, entry) { @@ -987,13 +1044,15 @@ static void check_process_timers(struct task_struct *tsk, sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && list_empty(&timers[CPUCLOCK_VIRT]) && cputime_eq(sig->it_virt_expires, cputime_zero) && - list_empty(&timers[CPUCLOCK_SCHED])) + list_empty(&timers[CPUCLOCK_SCHED])) { + stop_process_timers(tsk); return; + } /* * Collect the current process totals. */ - thread_group_cputime(tsk, &cputime); + thread_group_cputimer(tsk, &cputime); utime = cputime.utime; ptime = cputime_add(utime, cputime.stime); sum_sched_runtime = cputime.sum_exec_runtime; @@ -1259,7 +1318,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk) if (!task_cputime_zero(&sig->cputime_expires)) { struct task_cputime group_sample; - thread_group_cputime(tsk, &group_sample); + thread_group_cputimer(tsk, &group_sample); if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; } @@ -1328,6 +1387,33 @@ void run_posix_cpu_timers(struct task_struct *tsk) } } +/* + * Sample a process (thread group) timer for the given group_leader task. + * Must be called with tasklist_lock held for reading. + */ +static int cpu_timer_sample_group(const clockid_t which_clock, + struct task_struct *p, + union cpu_time_count *cpu) +{ + struct task_cputime cputime; + + thread_group_cputimer(p, &cputime); + switch (CPUCLOCK_WHICH(which_clock)) { + default: + return -EINVAL; + case CPUCLOCK_PROF: + cpu->cpu = cputime_add(cputime.utime, cputime.stime); + break; + case CPUCLOCK_VIRT: + cpu->cpu = cputime.utime; + break; + case CPUCLOCK_SCHED: + cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); + break; + } + return 0; +} + /* * Set one of the process-wide special case CPU timers. * The tsk->sighand->siglock must be held by the caller. @@ -1341,7 +1427,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, struct list_head *head; BUG_ON(clock_idx == CPUCLOCK_SCHED); - cpu_clock_sample_group(clock_idx, tsk, &now); + start_process_timers(tsk); + cpu_timer_sample_group(clock_idx, tsk, &now); if (oldval) { if (!cputime_eq(*oldval, cputime_zero)) { diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 8ab0cef8ecab..a8f93dd374e1 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -296,19 +296,21 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) static inline void account_group_user_time(struct task_struct *tsk, cputime_t cputime) { - struct task_cputime *times; - struct signal_struct *sig; + struct thread_group_cputimer *cputimer; /* tsk == current, ensure it is safe to use ->signal */ if (unlikely(tsk->exit_state)) return; - sig = tsk->signal; - times = &sig->cputime.totals; + cputimer = &tsk->signal->cputimer; - spin_lock(×->lock); - times->utime = cputime_add(times->utime, cputime); - spin_unlock(×->lock); + if (!cputimer->running) + return; + + spin_lock(&cputimer->lock); + cputimer->cputime.utime = + cputime_add(cputimer->cputime.utime, cputime); + spin_unlock(&cputimer->lock); } /** @@ -324,19 +326,21 @@ static inline void account_group_user_time(struct task_struct *tsk, static inline void account_group_system_time(struct task_struct *tsk, cputime_t cputime) { - struct task_cputime *times; - struct signal_struct *sig; + struct thread_group_cputimer *cputimer; /* tsk == current, ensure it is safe to use ->signal */ if (unlikely(tsk->exit_state)) return; - sig = tsk->signal; - times = &sig->cputime.totals; + cputimer = &tsk->signal->cputimer; - spin_lock(×->lock); - times->stime = cputime_add(times->stime, cputime); - spin_unlock(×->lock); + if (!cputimer->running) + return; + + spin_lock(&cputimer->lock); + cputimer->cputime.stime = + cputime_add(cputimer->cputime.stime, cputime); + spin_unlock(&cputimer->lock); } /** @@ -352,7 +356,7 @@ static inline void account_group_system_time(struct task_struct *tsk, static inline void account_group_exec_runtime(struct task_struct *tsk, unsigned long long ns) { - struct task_cputime *times; + struct thread_group_cputimer *cputimer; struct signal_struct *sig; sig = tsk->signal; @@ -361,9 +365,12 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, if (unlikely(!sig)) return; - times = &sig->cputime.totals; + cputimer = &sig->cputimer; - spin_lock(×->lock); - times->sum_exec_runtime += ns; - spin_unlock(×->lock); + if (!cputimer->running) + return; + + spin_lock(&cputimer->lock); + cputimer->cputime.sum_exec_runtime += ns; + spin_unlock(&cputimer->lock); } From 0cd5c3c80a0ebd68c08312fa7d8c13149cc61c4c Mon Sep 17 00:00:00 2001 From: Kyle McMartin Date: Wed, 4 Feb 2009 14:29:19 -0800 Subject: [PATCH 015/105] x86: disable intel_iommu support by default Due to recurring issues with DMAR support on certain platforms. There's a number of filesystem corruption incidents reported: https://bugzilla.redhat.com/show_bug.cgi?id=479996 http://bugzilla.kernel.org/show_bug.cgi?id=12578 Provide a Kconfig option to change whether it is enabled by default. If disabled, it can still be reenabled by passing intel_iommu=on to the kernel. Keep the .config option off by default. Signed-off-by: Kyle McMartin Signed-off-by: Andrew Morton Acked-By: David Woodhouse Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 2 ++ arch/x86/Kconfig | 11 +++++++++++ drivers/pci/intel-iommu.c | 14 +++++++++++--- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index d8362cf9909e..b182626739ea 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -937,6 +937,8 @@ and is between 256 and 4096 characters. It is defined in the file intel_iommu= [DMAR] Intel IOMMU driver (DMAR) option + on + Enable intel iommu driver. off Disable intel iommu driver. igfx_off [Default Off] diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 73f7fe8fd4d1..9c39095b33fc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1802,6 +1802,17 @@ config DMAR and include PCI device scope covered by these DMA remapping devices. +config DMAR_DEFAULT_ON + def_bool n + prompt "Enable DMA Remapping Devices by default" + depends on DMAR + help + Selecting this option will enable a DMAR device at boot time if + one is found. If this option is not selected, DMAR support can + be enabled by passing intel_iommu=on to the kernel. It is + recommended you say N here while the DMAR code remains + experimental. + config DMAR_GFX_WA def_bool y prompt "Support for Graphics workaround" diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 3dfecb20d5e7..f4b7c79023ff 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -268,7 +268,12 @@ static long list_size; static void domain_remove_dev_info(struct dmar_domain *domain); -int dmar_disabled; +#ifdef CONFIG_DMAR_DEFAULT_ON +int dmar_disabled = 0; +#else +int dmar_disabled = 1; +#endif /*CONFIG_DMAR_DEFAULT_ON*/ + static int __initdata dmar_map_gfx = 1; static int dmar_forcedac; static int intel_iommu_strict; @@ -284,9 +289,12 @@ static int __init intel_iommu_setup(char *str) if (!str) return -EINVAL; while (*str) { - if (!strncmp(str, "off", 3)) { + if (!strncmp(str, "on", 2)) { + dmar_disabled = 0; + printk(KERN_INFO "Intel-IOMMU: enabled\n"); + } else if (!strncmp(str, "off", 3)) { dmar_disabled = 1; - printk(KERN_INFO"Intel-IOMMU: disabled\n"); + printk(KERN_INFO "Intel-IOMMU: disabled\n"); } else if (!strncmp(str, "igfx_off", 8)) { dmar_map_gfx = 0; printk(KERN_INFO From 1c2f61d40b691789626489fa947a3e003c9a84be Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 5 Feb 2009 23:59:04 -0800 Subject: [PATCH 016/105] sparc64: Don't hook up pcr_ops on spitfire chips. They can't be used for profiling and NMI watchdog currently since they lack the counter overflow interrupt. Signed-off-by: David S. Miller --- arch/sparc/kernel/pcr.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/sparc/kernel/pcr.c b/arch/sparc/kernel/pcr.c index 92e0dda141a4..1ae8cdd7e703 100644 --- a/arch/sparc/kernel/pcr.c +++ b/arch/sparc/kernel/pcr.c @@ -133,11 +133,16 @@ int __init pcr_arch_init(void) case cheetah: case cheetah_plus: - case spitfire: pcr_ops = &direct_pcr_ops; pcr_enable = PCR_SUN4U_ENABLE; break; + case spitfire: + /* UltraSPARC-I/II and derivatives lack a profile + * counter overflow interrupt so we can't make use of + * their hardware currently. + */ + /* fallthrough */ default: err = -ENODEV; goto out_unregister; From 684de409acff8b1fe8bf188d75ff2f99c624387d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 6 Feb 2009 00:49:55 -0800 Subject: [PATCH 017/105] ipv6: Disallow rediculious flowlabel option sizes. Just like PKTINFO, limit the options area to 64K. Based upon report by Eric Sesterhenn and analysis by Roland Dreier. Signed-off-by: David S. Miller --- net/ipv6/ip6_flowlabel.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index c62dd247774f..7712578bdc66 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -323,17 +323,21 @@ static struct ip6_flowlabel * fl_create(struct net *net, struct in6_flowlabel_req *freq, char __user *optval, int optlen, int *err_p) { - struct ip6_flowlabel *fl; + struct ip6_flowlabel *fl = NULL; int olen; int addr_type; int err; + olen = optlen - CMSG_ALIGN(sizeof(*freq)); + err = -EINVAL; + if (olen > 64 * 1024) + goto done; + err = -ENOMEM; fl = kzalloc(sizeof(*fl), GFP_KERNEL); if (fl == NULL) goto done; - olen = optlen - CMSG_ALIGN(sizeof(*freq)); if (olen > 0) { struct msghdr msg; struct flowi flowi; From efc683fc2a692735029067b4f939af2a3625e31d Mon Sep 17 00:00:00 2001 From: Gautam Kachroo Date: Fri, 6 Feb 2009 00:52:04 -0800 Subject: [PATCH 018/105] neigh: some entries can be skipped during dumping neightbl_dump_info and neigh_dump_table can skip entries if the *fill*info functions return an error. This results in an incomplete dump ((invoked by netlink requests for RTM_GETNEIGHTBL or RTM_GETNEIGH) nidx and idx should not be incremented if the current entry was not placed in the output buffer Signed-off-by: Gautam Kachroo Signed-off-by: David S. Miller --- net/core/neighbour.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f66c58df8953..278a142d1047 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1994,8 +1994,8 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) if (!net_eq(neigh_parms_net(p), net)) continue; - if (nidx++ < neigh_skip) - continue; + if (nidx < neigh_skip) + goto next; if (neightbl_fill_param_info(skb, tbl, p, NETLINK_CB(cb->skb).pid, @@ -2003,6 +2003,8 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) RTM_NEWNEIGHTBL, NLM_F_MULTI) <= 0) goto out; + next: + nidx++; } neigh_skip = 0; @@ -2082,12 +2084,10 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (h > s_h) s_idx = 0; for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next) { - int lidx; if (dev_net(n->dev) != net) continue; - lidx = idx++; - if (lidx < s_idx) - continue; + if (idx < s_idx) + goto next; if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, @@ -2096,6 +2096,8 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, rc = -1; goto out; } + next: + idx++; } } read_unlock_bh(&tbl->lock); From 2783ef23128ad0a4b34e4121c1f7ff664785712f Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 6 Feb 2009 01:59:12 -0800 Subject: [PATCH 019/105] udp: Fix potential wrong ip_hdr(skb) pointers Like the UDP header fix, pskb_may_pull() can potentially alter the SKB buffer. Thus the saddr and daddr, pointers may point to the old skb->data buffer. I haven't seen corruptions, as its only seen if the old skb->data buffer were reallocated by another user and written into very quickly (or poison'd by SLAB debugging). Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/ipv4/udp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cc3a0a06c004..c47c989cb1fb 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1234,8 +1234,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, struct udphdr *uh; unsigned short ulen; struct rtable *rt = (struct rtable*)skb->dst; - __be32 saddr = ip_hdr(skb)->saddr; - __be32 daddr = ip_hdr(skb)->daddr; + __be32 saddr, daddr; struct net *net = dev_net(skb->dev); /* @@ -1259,6 +1258,9 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (udp4_csum_init(skb, uh, proto)) goto csum_error; + saddr = ip_hdr(skb)->saddr; + daddr = ip_hdr(skb)->daddr; + if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return __udp4_lib_mcast_deliver(net, skb, uh, saddr, daddr, udptable); From 7d8e23df69820e6be42bcc41d441f4860e8c76f7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 6 Feb 2009 14:57:51 +0100 Subject: [PATCH 020/105] timers: split process wide cpu clocks/timers, remove spurious warning Mike Galbraith reported that the new warning in thread_group_cputimer() triggers en masse with Amarok running. Oleg Nesterov observed: Can't fastpath_timer_check()->thread_group_cputimer() have the false warning too? Suppose we had the timer, then posix_cpu_timer_del() removes this timer, but task_cputime_zero(&sig->cputime_expires) still not true. Remove the spurious debug warning. Reported-by: Mike Galbraith Explained-by: Oleg Nesterov Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 082d7619b3a1..79392916d6c9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2208,8 +2208,6 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; unsigned long flags; - WARN_ON(!cputimer->running); - spin_lock_irqsave(&cputimer->lock, flags); *times = cputimer->cputime; spin_unlock_irqrestore(&cputimer->lock, flags); From ff08f76d738d0ec0f334b187f61e160caa321d54 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 4 Feb 2009 13:40:31 +0300 Subject: [PATCH 021/105] x86: clean up hpet timer reinit Implement Linus's suggestion: introduce the hpet_cnt_ahead() helper function to compare hpet time values - like other wrapping counter comparisons are abstracted away elsewhere. (jiffies, ktime_t, etc.) Reported-by: Kirill Korotaev Signed-off-by: Pavel Emelyanov Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index c761f914430a..388254f69a2a 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -897,13 +897,21 @@ static unsigned long hpet_rtc_flags; static int hpet_prev_update_sec; static struct rtc_time hpet_alarm_time; static unsigned long hpet_pie_count; -static unsigned long hpet_t1_cmp; +static u32 hpet_t1_cmp; static unsigned long hpet_default_delta; static unsigned long hpet_pie_delta; static unsigned long hpet_pie_limit; static rtc_irq_handler irq_handler; +/* + * Check that the hpet counter c1 is ahead of the c2 + */ +static inline int hpet_cnt_ahead(u32 c1, u32 c2) +{ + return (s32)(c2 - c1) < 0; +} + /* * Registers a IRQ handler. */ @@ -1075,7 +1083,7 @@ static void hpet_rtc_timer_reinit(void) hpet_t1_cmp += delta; hpet_writel(hpet_t1_cmp, HPET_T1_CMP); lost_ints++; - } while ((s32)(hpet_readl(HPET_COUNTER) - hpet_t1_cmp) > 0); + } while (!hpet_cnt_ahead(hpet_t1_cmp, hpet_readl(HPET_COUNTER))); if (lost_ints) { if (hpet_rtc_flags & RTC_PIE) From e2a02ba6676cdc5ebec503b558838c08c7083c14 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Fri, 6 Feb 2009 11:10:27 +1100 Subject: [PATCH 022/105] powerpc/83xx: Build breakage for CONFIG_PM but no CONFIG_SUSPEND I noticed this doing some randconfig testing (.config below). I have CONFIG_PM but no CONFIG_SUSPEND. Bug is against mainline. arch/powerpc/sysdev/built-in.o: In function `ipic_suspend': ipic.c:(.text+0x6b34): undefined reference to `fsl_deep_sleep' make[1]: *** [.tmp_vmlinux1] Error 1 make: *** [sub-make] Error 2 Looks like #ifdef CONFIG_PM in arch/powerpc/sysdev/ipic.c should be CONFIG_SUSPEND. d49747bdfb2ddebea24d1580da55b79d093d48a9 introduced this. Fix build when we have CONFIG_PM but no CONFIG_SUSPEND. Signed-off-by: Michael Neuling Acked-by: Scott Wood Signed-off-by: Kumar Gala --- arch/powerpc/sysdev/ipic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index 88a983ece5c9..9a89cd3e80a2 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c @@ -890,7 +890,7 @@ unsigned int ipic_get_irq(void) return irq_linear_revmap(primary_ipic->irqhost, irq); } -#ifdef CONFIG_PM +#ifdef CONFIG_SUSPEND static struct { u32 sicfr; u32 siprr[2]; From 1f0d4d16d9d4ffa65438425d092f7061476b95b3 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Thu, 5 Feb 2009 23:10:32 +0300 Subject: [PATCH 023/105] powerpc/83xx: Fix missing #{address,size}-cells in mpc8313erdb.dts commit b31a1d8b41513b96e9c7ec2f68c5734cef0b26a4 ("gianfar: Convert gianfar to an of_platform_driver") introduced a child node for the ethernet@25000 controller, but no address and size cells specifiers were added, and that makes dtc unhappy: DTC: dts->dtb on file "arch/powerpc/boot/dts/mpc8313erdb.dts" Warning (reg_format): "reg" property in /soc8313@e0000000/ethernet@25000/mdio@25520 has invalid length (8 bytes) (#address-cells == 2, #size-cells == 1) Warning (avoid_default_addr_size): Relying on default #address-cells value for /soc8313@e0000000/ethernet@25000/mdio@25520 Warning (avoid_default_addr_size): Relying on default #size-cells value for /soc8313@e0000000/ethernet@25000/mdio@25520 Signed-off-by: Anton Vorontsov Signed-off-by: Kumar Gala --- arch/powerpc/boot/dts/mpc8313erdb.dts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/boot/dts/mpc8313erdb.dts b/arch/powerpc/boot/dts/mpc8313erdb.dts index 909a89cab9ac..3f84cd0384ed 100644 --- a/arch/powerpc/boot/dts/mpc8313erdb.dts +++ b/arch/powerpc/boot/dts/mpc8313erdb.dts @@ -219,6 +219,8 @@ tbi0: tbi-phy@11 { }; enet1: ethernet@25000 { + #address-cells = <1>; + #size-cells = <1>; cell-index = <1>; device_type = "network"; model = "eTSEC"; From e85477f516c2de7ed515fcf94ceab5282eba7fa4 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Thu, 5 Feb 2009 23:10:40 +0300 Subject: [PATCH 024/105] powerpc/83xx: Fix TSEC0 workability on MPC8313E-RDB boards TSEC0 is connected to Vitesse 7385 5-port switch. The switch isn't connected to any mdio bus, the link to the switch is fixed to Full-duplex 1000 Mb/s (no pause). This patch fixes following failure during bootup: mdio@24520:01 not found eth0: Could not attach to PHY IP-Config: Failed to open eth0 Signed-off-by: Anton Vorontsov Signed-off-by: Kumar Gala --- arch/powerpc/boot/dts/mpc8313erdb.dts | 9 ++------- arch/powerpc/configs/83xx/mpc8313_rdb_defconfig | 2 +- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/boot/dts/mpc8313erdb.dts b/arch/powerpc/boot/dts/mpc8313erdb.dts index 3f84cd0384ed..3ebf7ec0484c 100644 --- a/arch/powerpc/boot/dts/mpc8313erdb.dts +++ b/arch/powerpc/boot/dts/mpc8313erdb.dts @@ -191,7 +191,8 @@ enet0: ethernet@24000 { interrupts = <37 0x8 36 0x8 35 0x8>; interrupt-parent = <&ipic>; tbi-handle = < &tbi0 >; - phy-handle = < &phy1 >; + /* Vitesse 7385 isn't on the MDIO bus */ + fixed-link = <1 1 1000 0 0>; fsl,magic-packet; mdio@24520 { @@ -199,12 +200,6 @@ mdio@24520 { #size-cells = <0>; compatible = "fsl,gianfar-mdio"; reg = <0x24520 0x20>; - phy1: ethernet-phy@1 { - interrupt-parent = <&ipic>; - interrupts = <19 0x8>; - reg = <0x1>; - device_type = "ethernet-phy"; - }; phy4: ethernet-phy@4 { interrupt-parent = <&ipic>; interrupts = <20 0x8>; diff --git a/arch/powerpc/configs/83xx/mpc8313_rdb_defconfig b/arch/powerpc/configs/83xx/mpc8313_rdb_defconfig index 9e47ae957e2e..409d017621a8 100644 --- a/arch/powerpc/configs/83xx/mpc8313_rdb_defconfig +++ b/arch/powerpc/configs/83xx/mpc8313_rdb_defconfig @@ -651,7 +651,7 @@ CONFIG_CICADA_PHY=y # CONFIG_NATIONAL_PHY is not set # CONFIG_STE10XP is not set # CONFIG_LSI_ET1011C_PHY is not set -# CONFIG_FIXED_PHY is not set +CONFIG_FIXED_PHY=y # CONFIG_MDIO_BITBANG is not set CONFIG_NET_ETHERNET=y CONFIG_MII=y From 7f3ea17f316577e31db868f720ac575c74d20163 Mon Sep 17 00:00:00 2001 From: paulfax Date: Tue, 27 Jan 2009 02:44:07 -0600 Subject: [PATCH 025/105] powerpc/cpm2: Fix set interrupt type This is a simple change to correct problems when using set_irq_type on platforms using CPM2. This code corrects the problem on most platform but may have issues on 8272 derived platforms for some interrupts. On 8272 PC2 & 3 are missing and PC 23 & 29 are added, which this patch does not address. Signed-off-by: Paul Bilke Reviewed-by: Anton Vorontsov Signed-off-by: Kumar Gala --- arch/powerpc/sysdev/cpm2_pic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/sysdev/cpm2_pic.c b/arch/powerpc/sysdev/cpm2_pic.c index b16ca3ed65d2..78f1f7cca0a0 100644 --- a/arch/powerpc/sysdev/cpm2_pic.c +++ b/arch/powerpc/sysdev/cpm2_pic.c @@ -165,7 +165,7 @@ static int cpm2_set_irq_type(unsigned int virq, unsigned int flow_type) edibit = (14 - (src - CPM2_IRQ_EXT1)); else if (src >= CPM2_IRQ_PORTC15 && src <= CPM2_IRQ_PORTC0) - edibit = (31 - (src - CPM2_IRQ_PORTC15)); + edibit = (31 - (CPM2_IRQ_PORTC0 - src)); else return (flow_type & IRQ_TYPE_LEVEL_LOW) ? 0 : -EINVAL; From 08c6e3a57877ceda5c2f4c99ae7d433b8eef779a Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 4 Feb 2009 22:43:04 +0100 Subject: [PATCH 026/105] arch/powerpc: Eliminate double sizeof Taking sizeof the result of sizeof is quite strange and does not seem to be what is wanted here. This was fixed using the following semantic patch. (http://www.emn.fr/x-info/coccinelle/) // @@ expression E; @@ - sizeof ( sizeof (E) - ) @@ type T; @@ - sizeof ( sizeof (T) - ) // Signed-off-by: Julia Lawall Acked-by: Scott Wood Signed-off-by: Kumar Gala --- arch/powerpc/platforms/82xx/pq2ads-pci-pic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c b/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c index 9876d7e072f4..ddf0bdc0fc8b 100644 --- a/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c +++ b/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c @@ -186,7 +186,7 @@ int __init pq2ads_pci_init_irq(void) iounmap(priv->regs); out_free_bootmem: free_bootmem((unsigned long)priv, - sizeof(sizeof(struct pq2ads_pci_pic))); + sizeof(struct pq2ads_pci_pic)); of_node_put(np); out_unmap_irq: irq_dispose_mapping(irq); From 355423d0849f4506bc71ab2738d38cb74429aaef Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Fri, 6 Feb 2009 21:49:57 -0800 Subject: [PATCH 027/105] r8169: Don't update statistics counters when interface is down Some Realtek chips (RTL8169sb/8110sb in my case) are unable to retrieve ethtool statistics when the interface is down. The process stays in endless loop in rtl8169_get_ethtool_stats. This is because these chips need to have receiver enabled (CmdRxEnb bit in ChipCmd register) that is cleared when the interface is going down. It's better to update statistics only when the interface is up and otherwise return copy of statistics grabbed when the interface was up (in rtl8169_close). It is interesting that PCI-E NICs (like 8168b/8111b...) are not affected. Signed-off-by: Ivan Vecera Acked-by: Francois Romieu Signed-off-by: David S. Miller --- drivers/net/r8169.c | 93 ++++++++++++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 35 deletions(-) diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c index 2c73ca606b35..0771eb6fc6eb 100644 --- a/drivers/net/r8169.c +++ b/drivers/net/r8169.c @@ -437,6 +437,22 @@ enum features { RTL_FEATURE_GMII = (1 << 2), }; +struct rtl8169_counters { + __le64 tx_packets; + __le64 rx_packets; + __le64 tx_errors; + __le32 rx_errors; + __le16 rx_missed; + __le16 align_errors; + __le32 tx_one_collision; + __le32 tx_multi_collision; + __le64 rx_unicast; + __le64 rx_broadcast; + __le32 rx_multicast; + __le16 tx_aborted; + __le16 tx_underun; +}; + struct rtl8169_private { void __iomem *mmio_addr; /* memory map physical address */ struct pci_dev *pci_dev; /* Index of PCI device */ @@ -480,6 +496,7 @@ struct rtl8169_private { unsigned features; struct mii_if_info mii; + struct rtl8169_counters counters; }; MODULE_AUTHOR("Realtek and the Linux r8169 crew "); @@ -1100,22 +1117,6 @@ static const char rtl8169_gstrings[][ETH_GSTRING_LEN] = { "tx_underrun", }; -struct rtl8169_counters { - __le64 tx_packets; - __le64 rx_packets; - __le64 tx_errors; - __le32 rx_errors; - __le16 rx_missed; - __le16 align_errors; - __le32 tx_one_collision; - __le32 tx_multi_collision; - __le64 rx_unicast; - __le64 rx_broadcast; - __le32 rx_multicast; - __le16 tx_aborted; - __le16 tx_underun; -}; - static int rtl8169_get_sset_count(struct net_device *dev, int sset) { switch (sset) { @@ -1126,16 +1127,21 @@ static int rtl8169_get_sset_count(struct net_device *dev, int sset) } } -static void rtl8169_get_ethtool_stats(struct net_device *dev, - struct ethtool_stats *stats, u64 *data) +static void rtl8169_update_counters(struct net_device *dev) { struct rtl8169_private *tp = netdev_priv(dev); void __iomem *ioaddr = tp->mmio_addr; struct rtl8169_counters *counters; dma_addr_t paddr; u32 cmd; + int wait = 1000; - ASSERT_RTNL(); + /* + * Some chips are unable to dump tally counters when the receiver + * is disabled. + */ + if ((RTL_R8(ChipCmd) & CmdRxEnb) == 0) + return; counters = pci_alloc_consistent(tp->pci_dev, sizeof(*counters), &paddr); if (!counters) @@ -1146,31 +1152,45 @@ static void rtl8169_get_ethtool_stats(struct net_device *dev, RTL_W32(CounterAddrLow, cmd); RTL_W32(CounterAddrLow, cmd | CounterDump); - while (RTL_R32(CounterAddrLow) & CounterDump) { - if (msleep_interruptible(1)) + while (wait--) { + if ((RTL_R32(CounterAddrLow) & CounterDump) == 0) { + /* copy updated counters */ + memcpy(&tp->counters, counters, sizeof(*counters)); break; + } + udelay(10); } RTL_W32(CounterAddrLow, 0); RTL_W32(CounterAddrHigh, 0); - data[0] = le64_to_cpu(counters->tx_packets); - data[1] = le64_to_cpu(counters->rx_packets); - data[2] = le64_to_cpu(counters->tx_errors); - data[3] = le32_to_cpu(counters->rx_errors); - data[4] = le16_to_cpu(counters->rx_missed); - data[5] = le16_to_cpu(counters->align_errors); - data[6] = le32_to_cpu(counters->tx_one_collision); - data[7] = le32_to_cpu(counters->tx_multi_collision); - data[8] = le64_to_cpu(counters->rx_unicast); - data[9] = le64_to_cpu(counters->rx_broadcast); - data[10] = le32_to_cpu(counters->rx_multicast); - data[11] = le16_to_cpu(counters->tx_aborted); - data[12] = le16_to_cpu(counters->tx_underun); - pci_free_consistent(tp->pci_dev, sizeof(*counters), counters, paddr); } +static void rtl8169_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, u64 *data) +{ + struct rtl8169_private *tp = netdev_priv(dev); + + ASSERT_RTNL(); + + rtl8169_update_counters(dev); + + data[0] = le64_to_cpu(tp->counters.tx_packets); + data[1] = le64_to_cpu(tp->counters.rx_packets); + data[2] = le64_to_cpu(tp->counters.tx_errors); + data[3] = le32_to_cpu(tp->counters.rx_errors); + data[4] = le16_to_cpu(tp->counters.rx_missed); + data[5] = le16_to_cpu(tp->counters.align_errors); + data[6] = le32_to_cpu(tp->counters.tx_one_collision); + data[7] = le32_to_cpu(tp->counters.tx_multi_collision); + data[8] = le64_to_cpu(tp->counters.rx_unicast); + data[9] = le64_to_cpu(tp->counters.rx_broadcast); + data[10] = le32_to_cpu(tp->counters.rx_multicast); + data[11] = le16_to_cpu(tp->counters.tx_aborted); + data[12] = le16_to_cpu(tp->counters.tx_underun); +} + static void rtl8169_get_strings(struct net_device *dev, u32 stringset, u8 *data) { switch(stringset) { @@ -3682,6 +3702,9 @@ static int rtl8169_close(struct net_device *dev) struct rtl8169_private *tp = netdev_priv(dev); struct pci_dev *pdev = tp->pci_dev; + /* update counters before going down */ + rtl8169_update_counters(dev); + rtl8169_down(dev); free_irq(dev->irq, dev); From 15bde72738f373aa060ececeda8e064e4f924360 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 6 Feb 2009 21:50:52 -0800 Subject: [PATCH 028/105] RxRPC: Fix a potential NULL dereference Fix a potential NULL dereference bug during error handling in rxrpc_kernel_begin_call(), whereby rxrpc_put_transport() may be handed a NULL pointer. This was found with a code checker (http://repo.or.cz/w/smatch.git/). Reported-by: Dan Carpenter Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/af_rxrpc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index d7d2bed7a699..eac5e7bb7365 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -284,13 +284,13 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, if (IS_ERR(trans)) { call = ERR_CAST(trans); trans = NULL; - goto out; + goto out_notrans; } } else { trans = rx->trans; if (!trans) { call = ERR_PTR(-ENOTCONN); - goto out; + goto out_notrans; } atomic_inc(&trans->usage); } @@ -315,6 +315,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, rxrpc_put_bundle(trans, bundle); out: rxrpc_put_transport(trans); +out_notrans: release_sock(&rx->sk); _leave(" = %p", call); return call; From 71822faa3bc0af5dbf5e333a2d085f1ed7cd809f Mon Sep 17 00:00:00 2001 From: Ilkka Virta Date: Fri, 6 Feb 2009 22:00:36 -0800 Subject: [PATCH 029/105] sungem: Soft lockup in sungem on Netra AC200 when switching interface up From: Ilkka Virta In the lockup situation the driver seems to go off in an eternal storm of interrupts right after calling request_irq(). It doesn't actually do anything interesting in the interrupt handler. Since connecting the link afterwards works, something later in initialization must fix this. Looking at gem_do_start() and gem_open(), it seems that the only thing done while opening the device after the request_irq(), is a call to napi_enable(). I don't know what the ordering requirements are for the initialization, but I boldly tried to move the napi_enable() call inside gem_do_start() before the link state is checked and interrupts subsequently enabled, and it seems to work for me. Doesn't even break anything too obvious... Signed-off-by: David S. Miller --- drivers/net/sungem.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/sungem.c b/drivers/net/sungem.c index b17efa9cc530..491876341068 100644 --- a/drivers/net/sungem.c +++ b/drivers/net/sungem.c @@ -2221,6 +2221,8 @@ static int gem_do_start(struct net_device *dev) gp->running = 1; + napi_enable(&gp->napi); + if (gp->lstate == link_up) { netif_carrier_on(gp->dev); gem_set_link_modes(gp); @@ -2238,6 +2240,8 @@ static int gem_do_start(struct net_device *dev) spin_lock_irqsave(&gp->lock, flags); spin_lock(&gp->tx_lock); + napi_disable(&gp->napi); + gp->running = 0; gem_reset(gp); gem_clean_rings(gp); @@ -2338,8 +2342,6 @@ static int gem_open(struct net_device *dev) if (!gp->asleep) rc = gem_do_start(dev); gp->opened = (rc == 0); - if (gp->opened) - napi_enable(&gp->napi); mutex_unlock(&gp->pm_mutex); @@ -2476,8 +2478,6 @@ static int gem_resume(struct pci_dev *pdev) /* Re-attach net device */ netif_device_attach(dev); - - napi_enable(&gp->napi); } spin_lock_irqsave(&gp->lock, flags); From 152abd139cca049c9b559a7cca762fa7fd9fd264 Mon Sep 17 00:00:00 2001 From: Ondrej Zary Date: Fri, 6 Feb 2009 22:04:08 -0800 Subject: [PATCH 030/105] 3c509: Fix resume from hibernation for PnP mode. From: Ondrej Zary last year, I posted a patch which fixed hibernation on 3c509 cards. That was back in 2.6.24. It worked fine in 2.6.25. But then I stopped using hibernation (as it did not work with my new IT8212 RAID controller). Now I fixed it and noticed that 3c509 does not wake up properly anymore (in 2.6.28) - neither in PnP nor in ISA modes. ifconfig down/up makes the card work again in PnP mode. However, in ISA mode, ifconfig up ends with "No such device" error. Comparing the 3c509 driver between 2.6.25 and 2.6.28, there's only some statistics-related change. So the cause of the problem must be somewhere else. This patch makes the resume work in PnP mode, but it's still not enough for ISA mode. Signed-off-by: David S. Miller --- drivers/net/3c509.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/3c509.c b/drivers/net/3c509.c index 535c234286ea..8c694213035b 100644 --- a/drivers/net/3c509.c +++ b/drivers/net/3c509.c @@ -1475,6 +1475,7 @@ el3_resume(struct device *pdev) spin_lock_irqsave(&lp->lock, flags); outw(PowerUp, ioaddr + EL3_CMD); + EL3WINDOW(0); el3_up(dev); if (netif_running(dev)) From b4bd07c20ba0c1fa7ad09ba257e0a5cfc2bf6bb3 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 6 Feb 2009 22:06:43 -0800 Subject: [PATCH 031/105] net_dma: call dmaengine_get only if NET_DMA enabled Based upon a patch from Atsushi Nemoto -------------------- The commit 649274d993212e7c23c0cb734572c2311c200872 ("net_dma: acquire/release dma channels on ifup/ifdown") added unconditional call of dmaengine_get() to net_dma. The API should be called only if NET_DMA was enabled. -------------------- Signed-off-by: David S. Miller Acked-by: Dan Williams --- include/linux/dmaengine.h | 12 ++++++++++++ net/core/dev.c | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 3e0f64c335c8..3e68469c1885 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -282,6 +282,18 @@ static inline void dmaengine_put(void) } #endif +#ifdef CONFIG_NET_DMA +#define net_dmaengine_get() dmaengine_get() +#define net_dmaengine_put() dmaengine_put() +#else +static inline void net_dmaengine_get(void) +{ +} +static inline void net_dmaengine_put(void) +{ +} +#endif + dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest, void *src, size_t len); dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan, diff --git a/net/core/dev.c b/net/core/dev.c index 5379b0c1190a..a17e00662363 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1090,7 +1090,7 @@ int dev_open(struct net_device *dev) /* * Enable NET_DMA */ - dmaengine_get(); + net_dmaengine_get(); /* * Initialize multicasting status @@ -1172,7 +1172,7 @@ int dev_close(struct net_device *dev) /* * Shutdown NET_DMA */ - dmaengine_put(); + net_dmaengine_put(); return 0; } From beeebc92ee04bff6a722ebf85e23131faedd4479 Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Fri, 6 Feb 2009 22:07:41 -0800 Subject: [PATCH 032/105] 9p: fix endian issues [attempt 3] When the changes were done to the protocol last release, some endian bugs crept in. This patch fixes those endian problems and has been verified to run on 32/64 bit and x86/ppc architectures. This version of the patch incorporates the correct annotations for endian variables. Signed-off-by: Eric Van Hensbergen Signed-off-by: David S. Miller --- net/9p/protocol.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/net/9p/protocol.c b/net/9p/protocol.c index dcd7666824ba..fc70147c771e 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include "protocol.h" @@ -160,29 +161,32 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) break; case 'w':{ int16_t *val = va_arg(ap, int16_t *); - if (pdu_read(pdu, val, sizeof(*val))) { + __le16 le_val; + if (pdu_read(pdu, &le_val, sizeof(le_val))) { errcode = -EFAULT; break; } - *val = cpu_to_le16(*val); + *val = le16_to_cpu(le_val); } break; case 'd':{ int32_t *val = va_arg(ap, int32_t *); - if (pdu_read(pdu, val, sizeof(*val))) { + __le32 le_val; + if (pdu_read(pdu, &le_val, sizeof(le_val))) { errcode = -EFAULT; break; } - *val = cpu_to_le32(*val); + *val = le32_to_cpu(le_val); } break; case 'q':{ int64_t *val = va_arg(ap, int64_t *); - if (pdu_read(pdu, val, sizeof(*val))) { + __le64 le_val; + if (pdu_read(pdu, &le_val, sizeof(le_val))) { errcode = -EFAULT; break; } - *val = cpu_to_le64(*val); + *val = le64_to_cpu(le_val); } break; case 's':{ @@ -362,19 +366,19 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) } break; case 'w':{ - int16_t val = va_arg(ap, int); + __le16 val = cpu_to_le16(va_arg(ap, int)); if (pdu_write(pdu, &val, sizeof(val))) errcode = -EFAULT; } break; case 'd':{ - int32_t val = va_arg(ap, int32_t); + __le32 val = cpu_to_le32(va_arg(ap, int32_t)); if (pdu_write(pdu, &val, sizeof(val))) errcode = -EFAULT; } break; case 'q':{ - int64_t val = va_arg(ap, int64_t); + __le64 val = cpu_to_le64(va_arg(ap, int64_t)); if (pdu_write(pdu, &val, sizeof(val))) errcode = -EFAULT; } From 0b492fce3d72d982a7981905f85484a1e1ba7fde Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 7 Feb 2009 02:20:25 -0800 Subject: [PATCH 033/105] sunhme: Don't match PCI devices in SBUS probe. Unfortunately, the OF device tree nodes for SBUS and PCI hme devices have the same device node name on some systems. So if the name of the parent node isn't 'sbus', skip it. Based upon an excellent report and detective work by Meelis Roos and Eric Brower. Signed-off-by: David S. Miller Tested-by: Meelis Roos --- drivers/net/sunhme.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/sunhme.c b/drivers/net/sunhme.c index 7a72a3112f0a..cc4013be5e18 100644 --- a/drivers/net/sunhme.c +++ b/drivers/net/sunhme.c @@ -2629,6 +2629,14 @@ static int __devinit happy_meal_sbus_probe_one(struct of_device *op, int is_qfe) int i, qfe_slot = -1; int err = -ENODEV; + sbus_dp = to_of_device(op->dev.parent)->node; + if (is_qfe) + sbus_dp = to_of_device(op->dev.parent->parent)->node; + + /* We can match PCI devices too, do not accept those here. */ + if (strcmp(sbus_dp->name, "sbus")) + return err; + if (is_qfe) { qp = quattro_sbus_find(op); if (qp == NULL) @@ -2734,10 +2742,6 @@ static int __devinit happy_meal_sbus_probe_one(struct of_device *op, int is_qfe) if (qp != NULL) hp->happy_flags |= HFLAG_QUATTRO; - sbus_dp = to_of_device(op->dev.parent)->node; - if (is_qfe) - sbus_dp = to_of_device(op->dev.parent->parent)->node; - /* Get the supported DVMA burst sizes from our Happy SBUS. */ hp->happy_bursts = of_getintprop_default(sbus_dp, "burst-sizes", 0x00); From bc111d570ba87cff48ec8dfa15a2a598e59c0f4b Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 8 Feb 2009 17:00:02 -0800 Subject: [PATCH 034/105] drivers/atm: introduce missing kfree Error handling code following a kmalloc should free the allocated data. The semantic match that finds the problem is as follows: (http://www.emn.fr/x-info/coccinelle/) // @r exists@ local idexpression x; statement S; expression E; identifier f,l; position p1,p2; expression *ptr != NULL; @@ ( if ((x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...)) == NULL) S | x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...); ... if (x == NULL) S ) <... when != x when != if (...) { <+...x...+> } x->f = E ...> ( return \(0\|<+...x...+>\|ptr\); | return@p2 ...; ) @script:python@ p1 << r.p1; p2 << r.p2; @@ print "* file: %s kmalloc %s return %s" % (p1[0].file,p1[0].line,p2[0].line) // Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- drivers/atm/solos-pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/atm/solos-pci.c b/drivers/atm/solos-pci.c index 72fc0f799a64..89d7a6e94c9c 100644 --- a/drivers/atm/solos-pci.c +++ b/drivers/atm/solos-pci.c @@ -685,6 +685,7 @@ static int fpga_probe(struct pci_dev *dev, const struct pci_device_id *id) out_release_regions: pci_release_regions(dev); out: + kfree(card); return err; } From 23b904f35128f3c596831cc3320bab1f2db81f60 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 8 Feb 2009 17:00:49 -0800 Subject: [PATCH 035/105] drivers/isdn: introduce missing kfree Error handling code following a kmalloc should free the allocated data. The semantic match that finds the problem is as follows: (http://www.emn.fr/x-info/coccinelle/) // @r exists@ local idexpression x; statement S; expression E; identifier f,l; position p1,p2; expression *ptr != NULL; @@ ( if ((x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...)) == NULL) S | x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...); ... if (x == NULL) S ) <... when != x when != if (...) { <+...x...+> } x->f = E ...> ( return \(0\|<+...x...+>\|ptr\); | return@p2 ...; ) @script:python@ p1 << r.p1; p2 << r.p2; @@ print "* file: %s kmalloc %s return %s" % (p1[0].file,p1[0].line,p2[0].line) // Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- drivers/isdn/hardware/mISDN/hfcmulti.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/isdn/hardware/mISDN/hfcmulti.c b/drivers/isdn/hardware/mISDN/hfcmulti.c index 595ba8eb4a07..0b28141e43bf 100644 --- a/drivers/isdn/hardware/mISDN/hfcmulti.c +++ b/drivers/isdn/hardware/mISDN/hfcmulti.c @@ -4599,6 +4599,7 @@ init_e1_port(struct hfc_multi *hc, struct hm_map *m) printk(KERN_ERR "%s: no memory for coeffs\n", __func__); ret = -ENOMEM; + kfree(bch); goto free_chan; } bch->nr = ch; @@ -4767,6 +4768,7 @@ init_multi_port(struct hfc_multi *hc, int pt) printk(KERN_ERR "%s: no memory for coeffs\n", __func__); ret = -ENOMEM; + kfree(bch); goto free_chan; } bch->nr = ch + 1; From cfbf84fcbcda98bb91ada683a8dc8e6901a83ebd Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Sun, 8 Feb 2009 17:49:17 -0800 Subject: [PATCH 036/105] tun: Fix unicast filter overflow Tap devices can make use of a small MAC filter set via the TUNSETTXFILTER ioctl. The filter has a set of exact matches plus a hash for imperfect filtering of additional multicast addresses. The current code is unbalanced, adding unicast addresses to the multicast hash, but only checking the hash against multicast addresses. This results in the filter dropping unicast addresses that overflow the exact filter. The fix is simply to disable the filter by leaving count set to zero if we find non-multicast addresses after the exact match table is filled. Signed-off-by: Alex Williamson Signed-off-by: David S. Miller --- drivers/net/tun.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index d7b81e4fdd56..09fea31d3e36 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -157,10 +157,16 @@ static int update_filter(struct tap_filter *filter, void __user *arg) nexact = n; - /* The rest is hashed */ + /* Remaining multicast addresses are hashed, + * unicast will leave the filter disabled. */ memset(filter->mask, 0, sizeof(filter->mask)); - for (; n < uf.count; n++) + for (; n < uf.count; n++) { + if (!is_multicast_ether_addr(addr[n].u)) { + err = 0; /* no filter */ + goto done; + } addr_hash_set(filter->mask, addr[n].u); + } /* For ALLMULTI just set the mask to all ones. * This overrides the mask populated above. */ From b991d2bc4a6e1821555bdc2a682f9aed24650c98 Mon Sep 17 00:00:00 2001 From: Risto Suominen Date: Sun, 8 Feb 2009 17:50:34 -0800 Subject: [PATCH 037/105] de2104x: force correct order when writing to rx ring DescOwn should not be set, thus allowing the chip to use the descriptor, before everything else is set up correctly. Signed-off-by: Risto Suominen Signed-off-by: David S. Miller --- drivers/net/tulip/de2104x.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/tulip/de2104x.c b/drivers/net/tulip/de2104x.c index 0bf2114738be..d4c5ecc51f77 100644 --- a/drivers/net/tulip/de2104x.c +++ b/drivers/net/tulip/de2104x.c @@ -464,13 +464,14 @@ static void de_rx (struct de_private *de) drop = 1; rx_next: - de->rx_ring[rx_tail].opts1 = cpu_to_le32(DescOwn); if (rx_tail == (DE_RX_RING_SIZE - 1)) de->rx_ring[rx_tail].opts2 = cpu_to_le32(RingEnd | de->rx_buf_sz); else de->rx_ring[rx_tail].opts2 = cpu_to_le32(de->rx_buf_sz); de->rx_ring[rx_tail].addr1 = cpu_to_le32(mapping); + wmb(); + de->rx_ring[rx_tail].opts1 = cpu_to_le32(DescOwn); rx_tail = NEXT_RX(rx_tail); } From b3df68f8f5a29888ae693fdb84ebabbc28ed9400 Mon Sep 17 00:00:00 2001 From: Dhananjay Phadke Date: Sun, 8 Feb 2009 19:20:19 -0800 Subject: [PATCH 038/105] netxen: fix msi-x interrupt handling o Cut down msi-x vectors from 8 to 1 since only one is used for now. o Use separate handler for msi-x, that doesn't unnecessarily scrub msi status register. Signed-off-by: Dhananjay Phadke Signed-off-by: David S. Miller --- drivers/net/netxen/netxen_nic.h | 2 +- drivers/net/netxen/netxen_nic_main.c | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/net/netxen/netxen_nic.h b/drivers/net/netxen/netxen_nic.h index 9c78c963b721..f4dd9acb6877 100644 --- a/drivers/net/netxen/netxen_nic.h +++ b/drivers/net/netxen/netxen_nic.h @@ -1203,7 +1203,7 @@ typedef struct { #define NETXEN_IS_MSI_FAMILY(adapter) \ ((adapter)->flags & (NETXEN_NIC_MSI_ENABLED | NETXEN_NIC_MSIX_ENABLED)) -#define MSIX_ENTRIES_PER_ADAPTER 8 +#define MSIX_ENTRIES_PER_ADAPTER 1 #define NETXEN_MSIX_TBL_SPACE 8192 #define NETXEN_PCI_REG_MSIX_TBL 0x44 diff --git a/drivers/net/netxen/netxen_nic_main.c b/drivers/net/netxen/netxen_nic_main.c index 645d384fe87e..3b17a7936147 100644 --- a/drivers/net/netxen/netxen_nic_main.c +++ b/drivers/net/netxen/netxen_nic_main.c @@ -76,6 +76,7 @@ static void netxen_nic_poll_controller(struct net_device *netdev); #endif static irqreturn_t netxen_intr(int irq, void *data); static irqreturn_t netxen_msi_intr(int irq, void *data); +static irqreturn_t netxen_msix_intr(int irq, void *data); /* PCI Device ID Table */ #define ENTRY(device) \ @@ -1084,7 +1085,9 @@ static int netxen_nic_open(struct net_device *netdev) for (ring = 0; ring < adapter->max_rds_rings; ring++) netxen_post_rx_buffers(adapter, ctx, ring); } - if (NETXEN_IS_MSI_FAMILY(adapter)) + if (adapter->flags & NETXEN_NIC_MSIX_ENABLED) + handler = netxen_msix_intr; + else if (adapter->flags & NETXEN_NIC_MSI_ENABLED) handler = netxen_msi_intr; else { flags |= IRQF_SHARED; @@ -1612,6 +1615,14 @@ static irqreturn_t netxen_msi_intr(int irq, void *data) return IRQ_HANDLED; } +static irqreturn_t netxen_msix_intr(int irq, void *data) +{ + struct netxen_adapter *adapter = data; + + napi_schedule(&adapter->napi); + return IRQ_HANDLED; +} + static int netxen_nic_poll(struct napi_struct *napi, int budget) { struct netxen_adapter *adapter = container_of(napi, struct netxen_adapter, napi); From 40bdac7dbc161639a498697f34fbd1ee800e51f4 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 8 Feb 2009 22:00:55 -0800 Subject: [PATCH 039/105] sparc64: Kill .fixup section bloat. This is an implementation of a suggestion made by Chris Torek: -------------------- Something else I noticed in passing: the EX and EX_LD/EX_ST macros scattered throughout the various .S files make a fair bit of .fixup code, all of which does the same thing. At the cost of one symbol in copy_in_user.S, you could just have one common two-instruction retl-and-mov-1 fixup that they all share. -------------------- The following is with a defconfig build: text data bss dec hex filename 3972767 344024 584449 4901240 4ac978 vmlinux.orig 3968887 344024 584449 4897360 4aba50 vmlinux Signed-off-by: David S. Miller --- arch/sparc/kernel/head_64.S | 31 +++++++++++++++++++++++++++--- arch/sparc/lib/GENbzero.S | 6 +----- arch/sparc/lib/GENcopy_from_user.S | 6 +----- arch/sparc/lib/GENcopy_to_user.S | 6 +----- arch/sparc/lib/NG2copy_from_user.S | 7 +------ arch/sparc/lib/NG2copy_to_user.S | 7 +------ arch/sparc/lib/NGbzero.S | 6 +----- arch/sparc/lib/NGcopy_from_user.S | 7 +------ arch/sparc/lib/NGcopy_to_user.S | 7 +------ arch/sparc/lib/U1copy_from_user.S | 6 +----- arch/sparc/lib/U1copy_to_user.S | 6 +----- arch/sparc/lib/U3copy_from_user.S | 6 +----- arch/sparc/lib/U3copy_to_user.S | 6 +----- arch/sparc/lib/bzero.S | 6 +----- arch/sparc/lib/copy_in_user.S | 6 +----- 15 files changed, 42 insertions(+), 77 deletions(-) diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S index 8ffee714f932..a46c3a21e26d 100644 --- a/arch/sparc/kernel/head_64.S +++ b/arch/sparc/kernel/head_64.S @@ -891,10 +891,35 @@ prom_tba: .xword 0 tlb_type: .word 0 /* Must NOT end up in BSS */ .section ".fixup",#alloc,#execinstr - .globl __ret_efault, __retl_efault -__ret_efault: + .globl __ret_efault, __retl_efault, __ret_one, __retl_one +ENTRY(__ret_efault) ret restore %g0, -EFAULT, %o0 -__retl_efault: +ENDPROC(__ret_efault) + +ENTRY(__retl_efault) retl mov -EFAULT, %o0 +ENDPROC(__retl_efault) + +ENTRY(__retl_one) + retl + mov 1, %o0 +ENDPROC(__retl_one) + +ENTRY(__ret_one_asi) + wr %g0, ASI_AIUS, %asi + ret + restore %g0, 1, %o0 +ENDPROC(__ret_one_asi) + +ENTRY(__retl_one_asi) + wr %g0, ASI_AIUS, %asi + retl + mov 1, %o0 +ENDPROC(__retl_one_asi) + +ENTRY(__retl_o1) + retl + mov %o1, %o0 +ENDPROC(__retl_o1) diff --git a/arch/sparc/lib/GENbzero.S b/arch/sparc/lib/GENbzero.S index 6a4f956a2f7a..8e7a843ddd88 100644 --- a/arch/sparc/lib/GENbzero.S +++ b/arch/sparc/lib/GENbzero.S @@ -6,13 +6,9 @@ #define EX_ST(x,y) \ 98: x,y; \ - .section .fixup; \ - .align 4; \ -99: retl; \ - mov %o1, %o0; \ .section __ex_table,"a";\ .align 4; \ - .word 98b, 99b; \ + .word 98b, __retl_o1; \ .text; \ .align 4; diff --git a/arch/sparc/lib/GENcopy_from_user.S b/arch/sparc/lib/GENcopy_from_user.S index 2b9df99e87f9..9b0e58f9f970 100644 --- a/arch/sparc/lib/GENcopy_from_user.S +++ b/arch/sparc/lib/GENcopy_from_user.S @@ -5,13 +5,9 @@ #define EX_LD(x) \ 98: x; \ - .section .fixup; \ - .align 4; \ -99: retl; \ - mov 1, %o0; \ .section __ex_table,"a";\ .align 4; \ - .word 98b, 99b; \ + .word 98b, __retl_one; \ .text; \ .align 4; diff --git a/arch/sparc/lib/GENcopy_to_user.S b/arch/sparc/lib/GENcopy_to_user.S index bb3f7084daf9..ce79b495c65d 100644 --- a/arch/sparc/lib/GENcopy_to_user.S +++ b/arch/sparc/lib/GENcopy_to_user.S @@ -5,13 +5,9 @@ #define EX_ST(x) \ 98: x; \ - .section .fixup; \ - .align 4; \ -99: retl; \ - mov 1, %o0; \ .section __ex_table,"a";\ .align 4; \ - .word 98b, 99b; \ + .word 98b, __retl_one; \ .text; \ .align 4; diff --git a/arch/sparc/lib/NG2copy_from_user.S b/arch/sparc/lib/NG2copy_from_user.S index c77ef5f22102..01ac546a86f4 100644 --- a/arch/sparc/lib/NG2copy_from_user.S +++ b/arch/sparc/lib/NG2copy_from_user.S @@ -5,14 +5,9 @@ #define EX_LD(x) \ 98: x; \ - .section .fixup; \ - .align 4; \ -99: wr %g0, ASI_AIUS, %asi;\ - retl; \ - mov 1, %o0; \ .section __ex_table,"a";\ .align 4; \ - .word 98b, 99b; \ + .word 98b, __retl_one_asi;\ .text; \ .align 4; diff --git a/arch/sparc/lib/NG2copy_to_user.S b/arch/sparc/lib/NG2copy_to_user.S index 4bd4093acbbd..c477e1594a72 100644 --- a/arch/sparc/lib/NG2copy_to_user.S +++ b/arch/sparc/lib/NG2copy_to_user.S @@ -5,14 +5,9 @@ #define EX_ST(x) \ 98: x; \ - .section .fixup; \ - .align 4; \ -99: wr %g0, ASI_AIUS, %asi;\ - retl; \ - mov 1, %o0; \ .section __ex_table,"a";\ .align 4; \ - .word 98b, 99b; \ + .word 98b, __retl_one_asi;\ .text; \ .align 4; diff --git a/arch/sparc/lib/NGbzero.S b/arch/sparc/lib/NGbzero.S index 814d5f7a45e1..beab29bf419b 100644 --- a/arch/sparc/lib/NGbzero.S +++ b/arch/sparc/lib/NGbzero.S @@ -6,13 +6,9 @@ #define EX_ST(x,y) \ 98: x,y; \ - .section .fixup; \ - .align 4; \ -99: retl; \ - mov %o1, %o0; \ .section __ex_table,"a";\ .align 4; \ - .word 98b, 99b; \ + .word 98b, __retl_o1; \ .text; \ .align 4; diff --git a/arch/sparc/lib/NGcopy_from_user.S b/arch/sparc/lib/NGcopy_from_user.S index e7f433f71b42..39bb8912c6e7 100644 --- a/arch/sparc/lib/NGcopy_from_user.S +++ b/arch/sparc/lib/NGcopy_from_user.S @@ -5,14 +5,9 @@ #define EX_LD(x) \ 98: x; \ - .section .fixup; \ - .align 4; \ -99: wr %g0, ASI_AIUS, %asi;\ - ret; \ - restore %g0, 1, %o0; \ .section __ex_table,"a";\ .align 4; \ - .word 98b, 99b; \ + .word 98b, __ret_one_asi;\ .text; \ .align 4; diff --git a/arch/sparc/lib/NGcopy_to_user.S b/arch/sparc/lib/NGcopy_to_user.S index 6ea01c5532a0..bf62fc1a26c7 100644 --- a/arch/sparc/lib/NGcopy_to_user.S +++ b/arch/sparc/lib/NGcopy_to_user.S @@ -5,14 +5,9 @@ #define EX_ST(x) \ 98: x; \ - .section .fixup; \ - .align 4; \ -99: wr %g0, ASI_AIUS, %asi;\ - ret; \ - restore %g0, 1, %o0; \ .section __ex_table,"a";\ .align 4; \ - .word 98b, 99b; \ + .word 98b, __ret_one_asi;\ .text; \ .align 4; diff --git a/arch/sparc/lib/U1copy_from_user.S b/arch/sparc/lib/U1copy_from_user.S index 3192b0bf4fab..f14efdd6d4a9 100644 --- a/arch/sparc/lib/U1copy_from_user.S +++ b/arch/sparc/lib/U1copy_from_user.S @@ -5,13 +5,9 @@ #define EX_LD(x) \ 98: x; \ - .section .fixup; \ - .align 4; \ -99: retl; \ - mov 1, %o0; \ .section __ex_table,"a";\ .align 4; \ - .word 98b, 99b; \ + .word 98b, __retl_one; \ .text; \ .align 4; diff --git a/arch/sparc/lib/U1copy_to_user.S b/arch/sparc/lib/U1copy_to_user.S index d1210ffb0b82..59d531b4a1a5 100644 --- a/arch/sparc/lib/U1copy_to_user.S +++ b/arch/sparc/lib/U1copy_to_user.S @@ -5,13 +5,9 @@ #define EX_ST(x) \ 98: x; \ - .section .fixup; \ - .align 4; \ -99: retl; \ - mov 1, %o0; \ .section __ex_table,"a";\ .align 4; \ - .word 98b, 99b; \ + .word 98b, __retl_one; \ .text; \ .align 4; diff --git a/arch/sparc/lib/U3copy_from_user.S b/arch/sparc/lib/U3copy_from_user.S index f5bfc8d9d216..b1acd1331c33 100644 --- a/arch/sparc/lib/U3copy_from_user.S +++ b/arch/sparc/lib/U3copy_from_user.S @@ -5,13 +5,9 @@ #define EX_LD(x) \ 98: x; \ - .section .fixup; \ - .align 4; \ -99: retl; \ - mov 1, %o0; \ .section __ex_table,"a";\ .align 4; \ - .word 98b, 99b; \ + .word 98b, __retl_one; \ .text; \ .align 4; diff --git a/arch/sparc/lib/U3copy_to_user.S b/arch/sparc/lib/U3copy_to_user.S index 2334f111bb0c..b4cbfe5a97ba 100644 --- a/arch/sparc/lib/U3copy_to_user.S +++ b/arch/sparc/lib/U3copy_to_user.S @@ -5,13 +5,9 @@ #define EX_ST(x) \ 98: x; \ - .section .fixup; \ - .align 4; \ -99: retl; \ - mov 1, %o0; \ .section __ex_table,"a";\ .align 4; \ - .word 98b, 99b; \ + .word 98b, __retl_one; \ .text; \ .align 4; diff --git a/arch/sparc/lib/bzero.S b/arch/sparc/lib/bzero.S index c7bbae8c590f..b6557297440f 100644 --- a/arch/sparc/lib/bzero.S +++ b/arch/sparc/lib/bzero.S @@ -88,13 +88,9 @@ __bzero_done: #define EX_ST(x,y) \ 98: x,y; \ - .section .fixup; \ - .align 4; \ -99: retl; \ - mov %o1, %o0; \ .section __ex_table,"a";\ .align 4; \ - .word 98b, 99b; \ + .word 98b, __retl_o1; \ .text; \ .align 4; diff --git a/arch/sparc/lib/copy_in_user.S b/arch/sparc/lib/copy_in_user.S index 650af3f21f78..884f46499d4e 100644 --- a/arch/sparc/lib/copy_in_user.S +++ b/arch/sparc/lib/copy_in_user.S @@ -9,13 +9,9 @@ #define EX(x,y) \ 98: x,y; \ - .section .fixup; \ - .align 4; \ -99: retl; \ - mov 1, %o0; \ .section __ex_table,"a";\ .align 4; \ - .word 98b, 99b; \ + .word 98b, __retl_one; \ .text; \ .align 4; From aeb398768345c74a9e4c01aa3ebf839e858312ec Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 8 Feb 2009 22:32:31 -0800 Subject: [PATCH 040/105] sparc64: Fix probe_kernel_{read,write}(). This is based upon a report from Chris Torek and his initial patch. From Chris's report: -------------------- This came up in testing kgdb, using the built-in tests -- turn on CONFIG_KGDB_TESTS, then echo V1 > /sys/module/kgdbts/parameters/kgdbts -- but it would affect using kgdb if you were debugging and looking at bad pointers. -------------------- When we get a copy_{from,to}_user() request and the %asi is set to something other than ASI_AIUS (which is userspace) then we branch off to a routine called memcpy_user_stub(). It just does a straight memcpy since we are copying from kernel to kernel in this case. The logic was that since source and destination are both kernel pointers we don't need to have exception checks. But for what probe_kernel_{read,write}() is trying to do, we have to have the checks, otherwise things like kgdb bad kernel pointer accesses don't do the right thing. Signed-off-by: David S. Miller --- arch/sparc/lib/GENcopy_from_user.S | 2 +- arch/sparc/lib/GENcopy_to_user.S | 2 +- arch/sparc/lib/NG2copy_from_user.S | 2 +- arch/sparc/lib/NG2copy_to_user.S | 2 +- arch/sparc/lib/NGcopy_from_user.S | 2 +- arch/sparc/lib/NGcopy_to_user.S | 2 +- arch/sparc/lib/U1copy_from_user.S | 2 +- arch/sparc/lib/U1copy_to_user.S | 2 +- arch/sparc/lib/U3copy_to_user.S | 2 +- arch/sparc/lib/copy_in_user.S | 55 +++++++++--------------------- 10 files changed, 25 insertions(+), 48 deletions(-) diff --git a/arch/sparc/lib/GENcopy_from_user.S b/arch/sparc/lib/GENcopy_from_user.S index 9b0e58f9f970..b7d0bd6b1406 100644 --- a/arch/sparc/lib/GENcopy_from_user.S +++ b/arch/sparc/lib/GENcopy_from_user.S @@ -23,7 +23,7 @@ #define PREAMBLE \ rd %asi, %g1; \ cmp %g1, ASI_AIUS; \ - bne,pn %icc, memcpy_user_stub; \ + bne,pn %icc, ___copy_in_user; \ nop #endif diff --git a/arch/sparc/lib/GENcopy_to_user.S b/arch/sparc/lib/GENcopy_to_user.S index ce79b495c65d..780550e1afc7 100644 --- a/arch/sparc/lib/GENcopy_to_user.S +++ b/arch/sparc/lib/GENcopy_to_user.S @@ -27,7 +27,7 @@ #define PREAMBLE \ rd %asi, %g1; \ cmp %g1, ASI_AIUS; \ - bne,pn %icc, memcpy_user_stub; \ + bne,pn %icc, ___copy_in_user; \ nop #endif diff --git a/arch/sparc/lib/NG2copy_from_user.S b/arch/sparc/lib/NG2copy_from_user.S index 01ac546a86f4..119ccb9a54f4 100644 --- a/arch/sparc/lib/NG2copy_from_user.S +++ b/arch/sparc/lib/NG2copy_from_user.S @@ -28,7 +28,7 @@ #define PREAMBLE \ rd %asi, %g1; \ cmp %g1, ASI_AIUS; \ - bne,pn %icc, memcpy_user_stub; \ + bne,pn %icc, ___copy_in_user; \ nop #endif diff --git a/arch/sparc/lib/NG2copy_to_user.S b/arch/sparc/lib/NG2copy_to_user.S index c477e1594a72..7fe1ccefd9d0 100644 --- a/arch/sparc/lib/NG2copy_to_user.S +++ b/arch/sparc/lib/NG2copy_to_user.S @@ -37,7 +37,7 @@ #define PREAMBLE \ rd %asi, %g1; \ cmp %g1, ASI_AIUS; \ - bne,pn %icc, memcpy_user_stub; \ + bne,pn %icc, ___copy_in_user; \ nop #endif diff --git a/arch/sparc/lib/NGcopy_from_user.S b/arch/sparc/lib/NGcopy_from_user.S index 39bb8912c6e7..5d1e4d1ac21e 100644 --- a/arch/sparc/lib/NGcopy_from_user.S +++ b/arch/sparc/lib/NGcopy_from_user.S @@ -25,7 +25,7 @@ #define PREAMBLE \ rd %asi, %g1; \ cmp %g1, ASI_AIUS; \ - bne,pn %icc, memcpy_user_stub; \ + bne,pn %icc, ___copy_in_user; \ nop #endif diff --git a/arch/sparc/lib/NGcopy_to_user.S b/arch/sparc/lib/NGcopy_to_user.S index bf62fc1a26c7..ff630dcb273c 100644 --- a/arch/sparc/lib/NGcopy_to_user.S +++ b/arch/sparc/lib/NGcopy_to_user.S @@ -28,7 +28,7 @@ #define PREAMBLE \ rd %asi, %g1; \ cmp %g1, ASI_AIUS; \ - bne,pn %icc, memcpy_user_stub; \ + bne,pn %icc, ___copy_in_user; \ nop #endif diff --git a/arch/sparc/lib/U1copy_from_user.S b/arch/sparc/lib/U1copy_from_user.S index f14efdd6d4a9..a6ae2ea04bf5 100644 --- a/arch/sparc/lib/U1copy_from_user.S +++ b/arch/sparc/lib/U1copy_from_user.S @@ -23,7 +23,7 @@ #define PREAMBLE \ rd %asi, %g1; \ cmp %g1, ASI_AIUS; \ - bne,pn %icc, memcpy_user_stub; \ + bne,pn %icc, ___copy_in_user; \ nop; \ #include "U1memcpy.S" diff --git a/arch/sparc/lib/U1copy_to_user.S b/arch/sparc/lib/U1copy_to_user.S index 59d531b4a1a5..f4b970eeb485 100644 --- a/arch/sparc/lib/U1copy_to_user.S +++ b/arch/sparc/lib/U1copy_to_user.S @@ -23,7 +23,7 @@ #define PREAMBLE \ rd %asi, %g1; \ cmp %g1, ASI_AIUS; \ - bne,pn %icc, memcpy_user_stub; \ + bne,pn %icc, ___copy_in_user; \ nop; \ #include "U1memcpy.S" diff --git a/arch/sparc/lib/U3copy_to_user.S b/arch/sparc/lib/U3copy_to_user.S index b4cbfe5a97ba..ef1e493afdfa 100644 --- a/arch/sparc/lib/U3copy_to_user.S +++ b/arch/sparc/lib/U3copy_to_user.S @@ -23,7 +23,7 @@ #define PREAMBLE \ rd %asi, %g1; \ cmp %g1, ASI_AIUS; \ - bne,pn %icc, memcpy_user_stub; \ + bne,pn %icc, ___copy_in_user; \ nop; \ #include "U3memcpy.S" diff --git a/arch/sparc/lib/copy_in_user.S b/arch/sparc/lib/copy_in_user.S index 884f46499d4e..302c0e60dc2c 100644 --- a/arch/sparc/lib/copy_in_user.S +++ b/arch/sparc/lib/copy_in_user.S @@ -3,6 +3,7 @@ * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com) */ +#include #include #define XCC xcc @@ -27,18 +28,7 @@ * to copy register windows around during thread cloning. */ - .globl ___copy_in_user - .type ___copy_in_user,#function -___copy_in_user: /* %o0=dst, %o1=src, %o2=len */ - /* Writing to %asi is _expensive_ so we hardcode it. - * Reading %asi to check for KERNEL_DS is comparatively - * cheap. - */ - rd %asi, %g1 - cmp %g1, ASI_AIUS - bne,pn %icc, memcpy_user_stub - nop - +ENTRY(___copy_in_user) /* %o0=dst, %o1=src, %o2=len */ cmp %o2, 0 be,pn %XCC, 85f or %o0, %o1, %o3 @@ -49,22 +39,24 @@ ___copy_in_user: /* %o0=dst, %o1=src, %o2=len */ /* 16 < len <= 64 */ andcc %o3, 0x7, %g0 bne,pn %XCC, 90f - sub %o0, %o1, %o3 + nop andn %o2, 0x7, %o4 and %o2, 0x7, %o2 1: subcc %o4, 0x8, %o4 EX(ldxa [%o1] %asi, %o5) - EX(stxa %o5, [%o1 + %o3] ASI_AIUS) + EX(stxa %o5, [%o0] %asi) + add %o1, 0x8, %o1 bgu,pt %XCC, 1b - add %o1, 0x8, %o1 + add %o0, 0x8, %o0 andcc %o2, 0x4, %g0 be,pt %XCC, 1f nop sub %o2, 0x4, %o2 EX(lduwa [%o1] %asi, %o5) - EX(stwa %o5, [%o1 + %o3] ASI_AIUS) + EX(stwa %o5, [%o0] %asi) add %o1, 0x4, %o1 + add %o0, 0x4, %o0 1: cmp %o2, 0 be,pt %XCC, 85f nop @@ -74,14 +66,15 @@ ___copy_in_user: /* %o0=dst, %o1=src, %o2=len */ 80: /* 0 < len <= 16 */ andcc %o3, 0x3, %g0 bne,pn %XCC, 90f - sub %o0, %o1, %o3 + nop 82: subcc %o2, 4, %o2 EX(lduwa [%o1] %asi, %g1) - EX(stwa %g1, [%o1 + %o3] ASI_AIUS) + EX(stwa %g1, [%o0] %asi) + add %o1, 4, %o1 bgu,pt %XCC, 82b - add %o1, 4, %o1 + add %o0, 4, %o0 85: retl clr %o0 @@ -90,26 +83,10 @@ ___copy_in_user: /* %o0=dst, %o1=src, %o2=len */ 90: subcc %o2, 1, %o2 EX(lduba [%o1] %asi, %g1) - EX(stba %g1, [%o1 + %o3] ASI_AIUS) + EX(stba %g1, [%o0] %asi) + add %o1, 1, %o1 bgu,pt %XCC, 90b - add %o1, 1, %o1 + add %o0, 1, %o0 retl clr %o0 - - .size ___copy_in_user, .-___copy_in_user - - /* Act like copy_{to,in}_user(), ie. return zero instead - * of original destination pointer. This is invoked when - * copy_{to,in}_user() finds that %asi is kernel space. - */ - .globl memcpy_user_stub - .type memcpy_user_stub,#function -memcpy_user_stub: - save %sp, -192, %sp - mov %i0, %o0 - mov %i1, %o1 - call memcpy - mov %i2, %o2 - ret - restore %g0, %g0, %o0 - .size memcpy_user_stub, .-memcpy_user_stub +ENDPROC(___copy_in_user) From e736ad548db152776de61d7a26805cfae77ce5ce Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Fri, 6 Feb 2009 16:52:05 -0800 Subject: [PATCH 041/105] x86: add clflush before monitor for Intel 7400 series For Intel 7400 series CPUs, the recommendation is to use a clflush on the monitored address just before monitor and mwait pair [1]. This clflush makes sure that there are no false wakeups from mwait when the monitored address was recently written to. [1] "MONITOR/MWAIT Recommendations for Intel Xeon Processor 7400 series" section in specification update document of 7400 series http://download.intel.com/design/xeon/specupdt/32033601.pdf Signed-off-by: Venkatesh Pallipadi Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/kernel/cpu/intel.c | 3 +++ arch/x86/kernel/process.c | 6 ++++++ 3 files changed, 10 insertions(+) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index ea408dcba513..7301e60dc4a8 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -93,6 +93,7 @@ #define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */ #define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ #define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ +#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 430e5c38a544..24ff26a38ade 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -291,6 +291,9 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) ds_init_intel(c); } + if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush) + set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR); + #ifdef CONFIG_X86_64 if (c->x86 == 15) c->x86_cache_alignment = c->x86_clflush_size * 2; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e68bb9e30864..6d12f7e37f8c 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -180,6 +180,9 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) trace_power_start(&it, POWER_CSTATE, (ax>>4)+1); if (!need_resched()) { + if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) + clflush((void *)¤t_thread_info()->flags); + __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) @@ -194,6 +197,9 @@ static void mwait_idle(void) struct power_trace it; if (!need_resched()) { trace_power_start(&it, POWER_CSTATE, 1); + if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) + clflush((void *)¤t_thread_info()->flags); + __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) From 3f4a739c6accd651a11fcf3c7a20ec8147c42660 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 8 Feb 2009 16:18:03 -0800 Subject: [PATCH 042/105] x86: find nr_irqs_gsi with mp_ioapic_routing Impact: find right nr_irqs_gsi on some systems. One test-system has gap between gsi's: [ 0.000000] ACPI: IOAPIC (id[0x04] address[0xfec00000] gsi_base[0]) [ 0.000000] IOAPIC[0]: apic_id 4, version 0, address 0xfec00000, GSI 0-23 [ 0.000000] ACPI: IOAPIC (id[0x05] address[0xfeafd000] gsi_base[48]) [ 0.000000] IOAPIC[1]: apic_id 5, version 0, address 0xfeafd000, GSI 48-54 [ 0.000000] ACPI: IOAPIC (id[0x06] address[0xfeafc000] gsi_base[56]) [ 0.000000] IOAPIC[2]: apic_id 6, version 0, address 0xfeafc000, GSI 56-62 ... [ 0.000000] nr_irqs_gsi: 38 So nr_irqs_gsi is not right. some irq for MSI will overwrite with io_apic. need to get that with acpi_probe_gsi when acpi io_apic is used Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mpspec.h | 6 ++++++ arch/x86/kernel/acpi/boot.c | 23 +++++++++++++++++++++++ arch/x86/kernel/io_apic.c | 20 +++++++++++++++----- 3 files changed, 44 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 62d14ce3cd00..bd22f2a3713f 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -60,6 +60,7 @@ extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi); extern void mp_config_acpi_legacy_irqs(void); extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low); +extern int acpi_probe_gsi(void); #ifdef CONFIG_X86_IO_APIC extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, u32 gsi, int triggering, int polarity); @@ -71,6 +72,11 @@ mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, return 0; } #endif +#else /* !CONFIG_ACPI: */ +static inline int acpi_probe_gsi(void) +{ + return 0; +} #endif /* CONFIG_ACPI */ #define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index d37593c2f438..7678f10c4568 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -973,6 +973,29 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) nr_ioapics++; } +int __init acpi_probe_gsi(void) +{ + int idx; + int gsi; + int max_gsi = 0; + + if (acpi_disabled) + return 0; + + if (!acpi_ioapic) + return 0; + + max_gsi = 0; + for (idx = 0; idx < nr_ioapics; idx++) { + gsi = mp_ioapic_routing[idx].gsi_end; + + if (gsi > max_gsi) + max_gsi = gsi; + } + + return max_gsi + 1; +} + static void assign_to_mp_irq(struct mp_config_intsrc *m, struct mp_config_intsrc *mp_irq) { diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 9b0c480c383b..bc7ac4da90d7 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -3841,14 +3841,24 @@ int __init io_apic_get_redir_entries (int ioapic) void __init probe_nr_irqs_gsi(void) { - int idx; int nr = 0; - for (idx = 0; idx < nr_ioapics; idx++) - nr += io_apic_get_redir_entries(idx) + 1; - - if (nr > nr_irqs_gsi) + nr = acpi_probe_gsi(); + if (nr > nr_irqs_gsi) { nr_irqs_gsi = nr; + } else { + /* for acpi=off or acpi is not compiled in */ + int idx; + + nr = 0; + for (idx = 0; idx < nr_ioapics; idx++) + nr += io_apic_get_redir_entries(idx) + 1; + + if (nr > nr_irqs_gsi) + nr_irqs_gsi = nr; + } + + printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); } /* -------------------------------------------------------------------------- From 55a8ba4b7f76bebd7e8ce3f74c04b140627a1bad Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Fri, 6 Feb 2009 10:29:35 -0800 Subject: [PATCH 043/105] x86, vmi: put a missing paravirt_release_pmd in pgd_dtor Commit 6194ba6ff6ccf8d5c54c857600843c67aa82c407 ("x86: don't special-case pmd allocations as much") made changes to the way we handle pmd allocations, and while doing that it dropped a call to paravirt_release_pd on the pgd page from the pgd_dtor code path. As a result of this missing release, the hypervisor is now unaware of the pgd page being freed, and as a result it ends up tracking this page as a page table page. After this the guest may start using the same page for other purposes, and depending on what use the page is put to, it may result in various performance and/or functional issues ( hangs, reboots). Since this release is only required for VMI, I now release the pgd page from the (vmi)_pgd_free hook. Signed-off-by: Alok N Kataria Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Cc: --- arch/x86/kernel/vmi_32.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 1d3302cc2ddf..bef58b4982db 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -320,6 +320,16 @@ static void vmi_release_pmd(unsigned long pfn) vmi_ops.release_page(pfn, VMI_PAGE_L2); } +/* + * We use the pgd_free hook for releasing the pgd page: + */ +static void vmi_pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + unsigned long pfn = __pa(pgd) >> PAGE_SHIFT; + + vmi_ops.release_page(pfn, VMI_PAGE_L2); +} + /* * Helper macros for MMU update flags. We can defer updates until a flush * or page invalidation only if the update is to the current address space @@ -762,6 +772,7 @@ static inline int __init activate_vmi(void) if (vmi_ops.release_page) { pv_mmu_ops.release_pte = vmi_release_pte; pv_mmu_ops.release_pmd = vmi_release_pmd; + pv_mmu_ops.pgd_free = vmi_pgd_free; } /* Set linear is needed in all cases */ From 914c3d630b29b07d04908eab1b246812dadd5bd6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 9 Feb 2009 22:17:39 +0900 Subject: [PATCH 044/105] x86: include correct %gs in a.out core dump Impact: dump the correct %gs into a.out core dump aout_dump_thread() read %gs but didn't include it in core dump. Fix it. Signed-off-by: Tejun Heo Signed-off-by: Ingo Molnar --- arch/x86/include/asm/a.out-core.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h index 37822206083e..3c601f8224be 100644 --- a/arch/x86/include/asm/a.out-core.h +++ b/arch/x86/include/asm/a.out-core.h @@ -23,8 +23,6 @@ */ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump) { - u16 gs; - /* changed the size calculations - should hopefully work better. lbt */ dump->magic = CMAGIC; dump->start_code = 0; @@ -57,7 +55,7 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump) dump->regs.ds = (u16)regs->ds; dump->regs.es = (u16)regs->es; dump->regs.fs = (u16)regs->fs; - savesegment(gs, gs); + savesegment(gs, dump->regs.gs); dump->regs.orig_ax = regs->orig_ax; dump->regs.ip = regs->ip; dump->regs.cs = (u16)regs->cs; From ae6af41f5a4841f06eb92bc86ad020ad44ae2a30 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 9 Feb 2009 22:17:39 +0900 Subject: [PATCH 045/105] x86: math_emu info cleanup Impact: cleanup * Come on, struct info? s/struct info/struct math_emu_info/ * Use struct pt_regs and kernel_vm86_regs instead of defining its own register frame structure. Signed-off-by: Tejun Heo Signed-off-by: Ingo Molnar --- arch/x86/include/asm/math_emu.h | 29 ++++----------- arch/x86/include/asm/processor.h | 2 +- arch/x86/math-emu/fpu_entry.c | 2 +- arch/x86/math-emu/fpu_proto.h | 2 +- arch/x86/math-emu/fpu_system.h | 14 +++---- arch/x86/math-emu/get_address.c | 63 +++++++++++++++----------------- 6 files changed, 48 insertions(+), 64 deletions(-) diff --git a/arch/x86/include/asm/math_emu.h b/arch/x86/include/asm/math_emu.h index 5a65b107ad58..302492c77956 100644 --- a/arch/x86/include/asm/math_emu.h +++ b/arch/x86/include/asm/math_emu.h @@ -1,31 +1,18 @@ #ifndef _ASM_X86_MATH_EMU_H #define _ASM_X86_MATH_EMU_H +#include +#include + /* This structure matches the layout of the data saved to the stack following a device-not-present interrupt, part of it saved automatically by the 80386/80486. */ -struct info { +struct math_emu_info { long ___orig_eip; - long ___ebx; - long ___ecx; - long ___edx; - long ___esi; - long ___edi; - long ___ebp; - long ___eax; - long ___ds; - long ___es; - long ___fs; - long ___orig_eax; - long ___eip; - long ___cs; - long ___eflags; - long ___esp; - long ___ss; - long ___vm86_es; /* This and the following only in vm86 mode */ - long ___vm86_ds; - long ___vm86_fs; - long ___vm86_gs; + union { + struct pt_regs regs; + struct kernel_vm86_regs vm86; + }; }; #endif /* _ASM_X86_MATH_EMU_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 091cd8855f2e..3bfd5235a9eb 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -353,7 +353,7 @@ struct i387_soft_struct { u8 no_update; u8 rm; u8 alimit; - struct info *info; + struct math_emu_info *info; u32 entry_eip; }; diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index c7b06feb139b..c268abe72253 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -659,7 +659,7 @@ static int valid_prefix(u_char *Byte, u_char __user **fpu_eip, } } -void math_abort(struct info *info, unsigned int signal) +void math_abort(struct math_emu_info *info, unsigned int signal) { FPU_EIP = FPU_ORIG_EIP; current->thread.trap_no = 16; diff --git a/arch/x86/math-emu/fpu_proto.h b/arch/x86/math-emu/fpu_proto.h index aa49b6a0d850..51bfbb61c5b1 100644 --- a/arch/x86/math-emu/fpu_proto.h +++ b/arch/x86/math-emu/fpu_proto.h @@ -52,7 +52,7 @@ extern void fst_i_(void); extern void fstp_i(void); /* fpu_entry.c */ asmlinkage extern void math_emulate(long arg); -extern void math_abort(struct info *info, unsigned int signal); +extern void math_abort(struct math_emu_info *info, unsigned int signal); /* fpu_etc.c */ extern void FPU_etc(void); /* fpu_tags.c */ diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index 13488fa153e0..6729c6a31348 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h @@ -18,7 +18,7 @@ /* This sets the pointer FPU_info to point to the argument part of the stack frame of math_emulate() */ -#define SETUP_DATA_AREA(arg) FPU_info = (struct info *) &arg +#define SETUP_DATA_AREA(arg) FPU_info = (struct math_emu_info *) &arg /* s is always from a cpu register, and the cpu does bounds checking * during register load --> no further bounds checks needed */ @@ -38,12 +38,12 @@ #define I387 (current->thread.xstate) #define FPU_info (I387->soft.info) -#define FPU_CS (*(unsigned short *) &(FPU_info->___cs)) -#define FPU_SS (*(unsigned short *) &(FPU_info->___ss)) -#define FPU_DS (*(unsigned short *) &(FPU_info->___ds)) -#define FPU_EAX (FPU_info->___eax) -#define FPU_EFLAGS (FPU_info->___eflags) -#define FPU_EIP (FPU_info->___eip) +#define FPU_CS (*(unsigned short *) &(FPU_info->regs.cs)) +#define FPU_SS (*(unsigned short *) &(FPU_info->regs.ss)) +#define FPU_DS (*(unsigned short *) &(FPU_info->regs.ds)) +#define FPU_EAX (FPU_info->regs.ax) +#define FPU_EFLAGS (FPU_info->regs.flags) +#define FPU_EIP (FPU_info->regs.ip) #define FPU_ORIG_EIP (FPU_info->___orig_eip) #define FPU_lookahead (I387->soft.lookahead) diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c index d701e2b39e44..62daa7fcc44c 100644 --- a/arch/x86/math-emu/get_address.c +++ b/arch/x86/math-emu/get_address.c @@ -29,42 +29,39 @@ #define FPU_WRITE_BIT 0x10 static int reg_offset[] = { - offsetof(struct info, ___eax), - offsetof(struct info, ___ecx), - offsetof(struct info, ___edx), - offsetof(struct info, ___ebx), - offsetof(struct info, ___esp), - offsetof(struct info, ___ebp), - offsetof(struct info, ___esi), - offsetof(struct info, ___edi) + offsetof(struct math_emu_info, regs.ax), + offsetof(struct math_emu_info, regs.cx), + offsetof(struct math_emu_info, regs.dx), + offsetof(struct math_emu_info, regs.bx), + offsetof(struct math_emu_info, regs.sp), + offsetof(struct math_emu_info, regs.bp), + offsetof(struct math_emu_info, regs.si), + offsetof(struct math_emu_info, regs.di) }; #define REG_(x) (*(long *)(reg_offset[(x)]+(u_char *) FPU_info)) static int reg_offset_vm86[] = { - offsetof(struct info, ___cs), - offsetof(struct info, ___vm86_ds), - offsetof(struct info, ___vm86_es), - offsetof(struct info, ___vm86_fs), - offsetof(struct info, ___vm86_gs), - offsetof(struct info, ___ss), - offsetof(struct info, ___vm86_ds) + offsetof(struct math_emu_info, regs.cs), + offsetof(struct math_emu_info, vm86.ds), + offsetof(struct math_emu_info, vm86.es), + offsetof(struct math_emu_info, vm86.fs), + offsetof(struct math_emu_info, vm86.gs), + offsetof(struct math_emu_info, regs.ss), + offsetof(struct math_emu_info, vm86.ds) }; #define VM86_REG_(x) (*(unsigned short *) \ (reg_offset_vm86[((unsigned)x)]+(u_char *) FPU_info)) -/* This dummy, gs is not saved on the stack. */ -#define ___GS ___ds - static int reg_offset_pm[] = { - offsetof(struct info, ___cs), - offsetof(struct info, ___ds), - offsetof(struct info, ___es), - offsetof(struct info, ___fs), - offsetof(struct info, ___GS), - offsetof(struct info, ___ss), - offsetof(struct info, ___ds) + offsetof(struct math_emu_info, regs.cs), + offsetof(struct math_emu_info, regs.ds), + offsetof(struct math_emu_info, regs.es), + offsetof(struct math_emu_info, regs.fs), + offsetof(struct math_emu_info, regs.ds), /* dummy, not saved on stack */ + offsetof(struct math_emu_info, regs.ss), + offsetof(struct math_emu_info, regs.ds) }; #define PM_REG_(x) (*(unsigned short *) \ @@ -349,34 +346,34 @@ void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip, } switch (rm) { case 0: - address += FPU_info->___ebx + FPU_info->___esi; + address += FPU_info->regs.bx + FPU_info->regs.si; break; case 1: - address += FPU_info->___ebx + FPU_info->___edi; + address += FPU_info->regs.bx + FPU_info->regs.di; break; case 2: - address += FPU_info->___ebp + FPU_info->___esi; + address += FPU_info->regs.bp + FPU_info->regs.si; if (addr_modes.override.segment == PREFIX_DEFAULT) addr_modes.override.segment = PREFIX_SS_; break; case 3: - address += FPU_info->___ebp + FPU_info->___edi; + address += FPU_info->regs.bp + FPU_info->regs.di; if (addr_modes.override.segment == PREFIX_DEFAULT) addr_modes.override.segment = PREFIX_SS_; break; case 4: - address += FPU_info->___esi; + address += FPU_info->regs.si; break; case 5: - address += FPU_info->___edi; + address += FPU_info->regs.di; break; case 6: - address += FPU_info->___ebp; + address += FPU_info->regs.bp; if (addr_modes.override.segment == PREFIX_DEFAULT) addr_modes.override.segment = PREFIX_SS_; break; case 7: - address += FPU_info->___ebx; + address += FPU_info->regs.bx; break; } From a51f42f3c940e5582c40454ece066d033bc7e24f Mon Sep 17 00:00:00 2001 From: Eric Leblond Date: Mon, 9 Feb 2009 14:33:03 -0800 Subject: [PATCH 046/105] netfilter: fix tuple inversion for Node information request The patch fixes a typo in the inverse mapping of Node Information request. Following draft-ietf-ipngwg-icmp-name-lookups-09, "Querier" sends a type 139 (ICMPV6_NI_QUERY) packet to "Responder" which answer with a type 140 (ICMPV6_NI_REPLY) packet. Signed-off-by: Eric Leblond Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index c455cf4ee756..114a92e4258d 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -49,8 +49,8 @@ static bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, static const u_int8_t invmap[] = { [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1, [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1, - [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_QUERY + 1, - [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_REPLY +1 + [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_REPLY + 1, + [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_QUERY +1 }; static bool icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, From 3f9007135c1dc896db9a9e35920aafc65b157230 Mon Sep 17 00:00:00 2001 From: Eric Leblond Date: Mon, 9 Feb 2009 14:33:20 -0800 Subject: [PATCH 047/105] netfilter: nf_conntrack_ipv6: don't track ICMPv6 negotiation message This patch removes connection tracking handling for ICMPv6 messages related to Stateless Address Autoconfiguration, MLD, and MLDv2. They can not be tracked because they are massively using multicast (on pre-defined address). But they are not invalid and should not be detected as such. Signed-off-by: Eric Leblond Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- .../netfilter/nf_conntrack_proto_icmpv6.c | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 114a92e4258d..c323643ffcf9 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -53,6 +53,17 @@ static const u_int8_t invmap[] = { [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_QUERY +1 }; +static const u_int8_t noct_valid_new[] = { + [ICMPV6_MGM_QUERY - 130] = 1, + [ICMPV6_MGM_REPORT -130] = 1, + [ICMPV6_MGM_REDUCTION - 130] = 1, + [NDISC_ROUTER_SOLICITATION - 130] = 1, + [NDISC_ROUTER_ADVERTISEMENT - 130] = 1, + [NDISC_NEIGHBOUR_SOLICITATION - 130] = 1, + [NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1, + [ICMPV6_MLD2_REPORT - 130] = 1 +}; + static bool icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { @@ -178,6 +189,7 @@ icmpv6_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, { const struct icmp6hdr *icmp6h; struct icmp6hdr _ih; + int type; icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); if (icmp6h == NULL) { @@ -194,6 +206,15 @@ icmpv6_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, return -NF_ACCEPT; } + type = icmp6h->icmp6_type - 130; + if (type >= 0 && type < sizeof(noct_valid_new) && + noct_valid_new[type]) { + skb->nfct = &nf_conntrack_untracked.ct_general; + skb->nfctinfo = IP_CT_NEW; + nf_conntrack_get(skb->nfct); + return NF_ACCEPT; + } + /* is not error message ? */ if (icmp6h->icmp6_type >= 128) return NF_ACCEPT; From c969aa7d2cd5621ad4129dae6b6551af422944c6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 9 Feb 2009 14:33:57 -0800 Subject: [PATCH 048/105] netfilter: ctnetlink: allow changing NAT sequence adjustment in creation This patch fixes an inconsistency in the current ctnetlink code since NAT sequence adjustment bit can only be updated but not set in the conntrack entry creation. This patch is used by conntrackd to successfully recover newly created entries that represent connections with helpers and NAT payload mangling. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_netlink.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index c32a7e8e3a1b..9051bb4f81da 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1215,6 +1215,16 @@ ctnetlink_create_conntrack(struct nlattr *cda[], } } +#ifdef CONFIG_NF_NAT_NEEDED + if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) { + err = ctnetlink_change_nat_seq_adj(ct, cda); + if (err < 0) { + rcu_read_unlock(); + goto err; + } + } +#endif + if (cda[CTA_PROTOINFO]) { err = ctnetlink_change_protoinfo(ct, cda); if (err < 0) { From 1f9da256163e3ff91a12d0b861091f0e525139df Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 9 Feb 2009 14:34:26 -0800 Subject: [PATCH 049/105] netfilter: ctnetlink: fix echo if not subscribed to any multicast group This patch fixes echoing if the socket that has sent the request to create/update/delete an entry is not subscribed to any multicast group. With the current code, ctnetlink would not send the echo message via unicast as nfnetlink_send() would be skip. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_netlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 9051bb4f81da..cb78aa00399e 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -434,7 +434,7 @@ static int ctnetlink_conntrack_event(struct notifier_block *this, } else return NOTIFY_DONE; - if (!nfnetlink_has_listeners(group)) + if (!item->report && !nfnetlink_has_listeners(group)) return NOTIFY_DONE; skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); @@ -1502,7 +1502,8 @@ static int ctnetlink_expect_event(struct notifier_block *this, } else return NOTIFY_DONE; - if (!nfnetlink_has_listeners(NFNLGRP_CONNTRACK_EXP_NEW)) + if (!item->report && + !nfnetlink_has_listeners(NFNLGRP_CONNTRACK_EXP_NEW)) return NOTIFY_DONE; skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); From d4e2675a61890a84849a24affedf80d5cae8b199 Mon Sep 17 00:00:00 2001 From: Qu Haoran Date: Mon, 9 Feb 2009 14:34:56 -0800 Subject: [PATCH 050/105] netfilter: xt_sctp: sctp chunk mapping doesn't work When user tries to map all chunks given in argument, kernel works on a copy of the chunkmap, but at the end it doesn't check the copy, but the orginal one. Signed-off-by: Qu Haoran Signed-off-by: Nicolas Dichtel Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/xt_sctp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index e223cb43ae8e..a189ada9128f 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -105,7 +105,7 @@ match_packet(const struct sk_buff *skb, switch (chunk_match_type) { case SCTP_CHUNK_MATCH_ALL: - return SCTP_CHUNKMAP_IS_CLEAR(info->chunkmap); + return SCTP_CHUNKMAP_IS_CLEAR(chunkmapcopy); case SCTP_CHUNK_MATCH_ANY: return false; case SCTP_CHUNK_MATCH_ONLY: From 8707bdd48ab705a459ac1b12014075a139d1d4f9 Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Mon, 9 Feb 2009 14:59:30 -0800 Subject: [PATCH 051/105] gianfar: Fix boot hangs while bringing up gianfar ethernet Ira Snyder found that commit 8c7396aebb68994c0519e438eecdf4d5fa9c7844 "gianfar: Merge Tx and Rx interrupt for scheduling clean up ring" can cause hangs. It's because there was removed clearing of interrupts in gfar_schedule_cleanup() (which is called by an interrupt handler) in case when netif scheduling has been disabled. This patch brings back this action and a comment. Reported-by: Ira Snyder Reported-by: Peter Korsgaard Bisected-by: Ira Snyder Tested-by: Peter Korsgaard Tested-by: Ira Snyder Signed-off-by: Jarek Poplawski Signed-off-by: David S. Miller --- drivers/net/gianfar.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c index acae2d8cd688..9b12a13a640f 100644 --- a/drivers/net/gianfar.c +++ b/drivers/net/gianfar.c @@ -1629,6 +1629,12 @@ static void gfar_schedule_cleanup(struct net_device *dev) if (netif_rx_schedule_prep(&priv->napi)) { gfar_write(&priv->regs->imask, IMASK_RTX_DISABLED); __netif_rx_schedule(&priv->napi); + } else { + /* + * Clear IEVENT, so interrupts aren't called again + * because of the packets that have already arrived. + */ + gfar_write(&priv->regs->ievent, IEVENT_RTX_MASK); } spin_unlock(&priv->rxlock); From 20461c1740cac5e02733221c9f653098a703f55a Mon Sep 17 00:00:00 2001 From: Noriaki TAKAMIYA Date: Mon, 9 Feb 2009 15:01:19 -0800 Subject: [PATCH 052/105] IPv6: fix to set device name when new IPv6 over IPv6 tunnel device is created. When the user creates IPv6 over IPv6 tunnel, the device name created by the kernel isn't set to t->parm.name, which is referred as the result of ioctl(). Signed-off-by: Noriaki TAKAMIYA Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 58e2b0d93758..d994c55a5b16 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -249,8 +249,8 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct ip6_tnl_parm *p) } t = netdev_priv(dev); - ip6_tnl_dev_init(dev); t->parms = *p; + ip6_tnl_dev_init(dev); if ((err = register_netdevice(dev)) < 0) goto failed_free; From 4906f9985e310fc01f956256b0d58ac28b0dcb19 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 9 Feb 2009 15:07:18 -0800 Subject: [PATCH 053/105] bridge: Fix LRO crash with tun > Kernel BUG at drivers/net/tun.c:444 > invalid opcode: 0000 [1] SMP > last sysfs file: /class/net/lo/ifindex > CPU 0 > Modules linked in: tun ipt_MASQUERADE iptable_nat ip_nat xt_state ip_conntrack > nfnetlink ipt_REJECT xt_tcpudp iptable_filter d > Pid: 6912, comm: qemu-kvm Tainted: G 2.6.18-128.el5 #1 > RIP: 0010:[] [] > :tun:tun_chr_readv+0x2b1/0x3a6 > RSP: 0018:ffff8102202c5e48 EFLAGS: 00010246 > RAX: 0000000000000000 RBX: ffff8102202c5e98 RCX: 0000000004010000 > RDX: ffff810227063680 RSI: ffff8102202c5e9e RDI: ffff8102202c5e92 > RBP: 0000000000010ff6 R08: 0000000000000000 R09: 0000000000000001 > R10: ffff8102202c5e94 R11: 0000000000000202 R12: ffff8102275357c0 > R13: ffff81022755e500 R14: 0000000000000000 R15: ffff8102202c5ef8 > FS: 00002ae4398db980(0000) GS:ffffffff803ac000(0000) knlGS:0000000000000000 > CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b > CR2: 00002ae4ab514000 CR3: 0000000221344000 CR4: 00000000000026e0 > Process qemu-kvm (pid: 6912, threadinfo ffff8102202c4000, task > ffff81022e58d820) > Stack: 00000000498735cb ffff810229d1a3c0 0000000000000000 ffff81022e58d820 > ffffffff8008a461 ffff81022755e528 ffff81022755e528 ffffffff8009f925 > 000005ea05ea0000 ffff8102209d0000 00001051143e1600 ffffffff8003c00e > Call Trace: > [] default_wake_function+0x0/0xe > [] enqueue_hrtimer+0x55/0x70 > [] hrtimer_start+0xbc/0xce > [] :tun:tun_chr_read+0x1a/0x1f > [] vfs_read+0xcb/0x171 > [] sys_read+0x45/0x6e > [] system_call+0x7e/0x83 > > > Code: 0f 0b 68 40 62 6f 88 c2 bc 01 f6 42 0a 08 74 0c 80 4c 24 41 > RIP [] :tun:tun_chr_readv+0x2b1/0x3a6 > RSP > <0>Kernel panic - not syncing: Fatal exception This crashed when an LRO packet generated by bnx2x reached a tun device through the bridge. We're supposed to drop it at the bridge. However, because the check was placed in br_forward instead of __br_forward, it's only effective if we are sending the packet through a single port. This patch fixes it by moving the check into __br_forward. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/bridge/br_forward.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index bdd9ccea17ce..d2c27c808d3b 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -67,6 +67,11 @@ static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb) { struct net_device *indev; + if (skb_warn_if_lro(skb)) { + kfree_skb(skb); + return; + } + indev = skb->dev; skb->dev = to->dev; skb_forward_csum(skb); @@ -89,7 +94,7 @@ void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) /* called with rcu_read_lock */ void br_forward(const struct net_bridge_port *to, struct sk_buff *skb) { - if (!skb_warn_if_lro(skb) && should_deliver(to, skb)) { + if (should_deliver(to, skb)) { __br_forward(to, skb); return; } From d315760ffa261c15ff92699ac6f514112543d7ca Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 9 Feb 2009 22:17:39 +0900 Subject: [PATCH 054/105] x86: fix math_emu register frame access do_device_not_available() is the handler for #NM and it declares that it takes a unsigned long and calls math_emu(), which takes a long argument and surprisingly expects the stack frame starting at the zero argument would match struct math_emu_info, which isn't true regardless of configuration in the current code. This patch makes do_device_not_available() take struct pt_regs like other exception handlers and initialize struct math_emu_info with pointer to it and pass pointer to the math_emu_info to math_emulate() like normal C functions do. This way, unless gcc makes a copy of struct pt_regs in do_device_not_available(), the register frame is correctly accessed regardless of kernel configuration or compiler used. This doesn't fix all math_emu problems but it at least gets it somewhat working. Signed-off-by: Tejun Heo Signed-off-by: Ingo Molnar --- arch/x86/include/asm/math_emu.h | 4 +- arch/x86/include/asm/traps.h | 4 +- arch/x86/kernel/traps.c | 15 +++++--- arch/x86/math-emu/fpu_entry.c | 4 +- arch/x86/math-emu/fpu_proto.h | 2 +- arch/x86/math-emu/fpu_system.h | 16 +++----- arch/x86/math-emu/get_address.c | 66 ++++++++++++++++----------------- 7 files changed, 55 insertions(+), 56 deletions(-) diff --git a/arch/x86/include/asm/math_emu.h b/arch/x86/include/asm/math_emu.h index 302492c77956..031f6266f425 100644 --- a/arch/x86/include/asm/math_emu.h +++ b/arch/x86/include/asm/math_emu.h @@ -11,8 +11,8 @@ struct math_emu_info { long ___orig_eip; union { - struct pt_regs regs; - struct kernel_vm86_regs vm86; + struct pt_regs *regs; + struct kernel_vm86_regs *vm86; }; }; #endif /* _ASM_X86_MATH_EMU_H */ diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 2ee0a3bceedf..cf3bb053da0b 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -41,7 +41,7 @@ dotraplinkage void do_int3(struct pt_regs *, long); dotraplinkage void do_overflow(struct pt_regs *, long); dotraplinkage void do_bounds(struct pt_regs *, long); dotraplinkage void do_invalid_op(struct pt_regs *, long); -dotraplinkage void do_device_not_available(struct pt_regs *, long); +dotraplinkage void do_device_not_available(struct pt_regs); dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *, long); dotraplinkage void do_invalid_TSS(struct pt_regs *, long); dotraplinkage void do_segment_not_present(struct pt_regs *, long); @@ -77,7 +77,7 @@ extern int panic_on_unrecovered_nmi; extern int kstack_depth_to_print; void math_error(void __user *); -asmlinkage void math_emulate(long); +void math_emulate(struct math_emu_info *); #ifdef CONFIG_X86_32 unsigned long patch_espfix_desc(unsigned long, unsigned long); #else diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 98c2d055284b..7932338d7cb3 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -896,7 +896,7 @@ asmlinkage void math_state_restore(void) EXPORT_SYMBOL_GPL(math_state_restore); #ifndef CONFIG_MATH_EMULATION -asmlinkage void math_emulate(long arg) +void math_emulate(struct math_emu_info *info) { printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n"); @@ -906,16 +906,19 @@ asmlinkage void math_emulate(long arg) } #endif /* CONFIG_MATH_EMULATION */ -dotraplinkage void __kprobes -do_device_not_available(struct pt_regs *regs, long error) +dotraplinkage void __kprobes do_device_not_available(struct pt_regs regs) { #ifdef CONFIG_X86_32 if (read_cr0() & X86_CR0_EM) { - conditional_sti(regs); - math_emulate(0); + struct math_emu_info info = { }; + + conditional_sti(®s); + + info.regs = ®s; + math_emulate(&info); } else { math_state_restore(); /* interrupts still off */ - conditional_sti(regs); + conditional_sti(®s); } #else math_state_restore(); diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index c268abe72253..5d87f586f8d7 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -131,7 +131,7 @@ u_char emulating = 0; static int valid_prefix(u_char *Byte, u_char __user ** fpu_eip, overrides * override); -asmlinkage void math_emulate(long arg) +void math_emulate(struct math_emu_info *info) { u_char FPU_modrm, byte1; unsigned short code; @@ -161,7 +161,7 @@ asmlinkage void math_emulate(long arg) RE_ENTRANT_CHECK_ON; #endif /* RE_ENTRANT_CHECKING */ - SETUP_DATA_AREA(arg); + FPU_info = info; FPU_ORIG_EIP = FPU_EIP; diff --git a/arch/x86/math-emu/fpu_proto.h b/arch/x86/math-emu/fpu_proto.h index 51bfbb61c5b1..9779df436b7d 100644 --- a/arch/x86/math-emu/fpu_proto.h +++ b/arch/x86/math-emu/fpu_proto.h @@ -51,7 +51,7 @@ extern void ffreep(void); extern void fst_i_(void); extern void fstp_i(void); /* fpu_entry.c */ -asmlinkage extern void math_emulate(long arg); +extern void math_emulate(struct math_emu_info *info); extern void math_abort(struct math_emu_info *info, unsigned int signal); /* fpu_etc.c */ extern void FPU_etc(void); diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index 6729c6a31348..50fa0ec2c8a5 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h @@ -16,10 +16,6 @@ #include #include -/* This sets the pointer FPU_info to point to the argument part - of the stack frame of math_emulate() */ -#define SETUP_DATA_AREA(arg) FPU_info = (struct math_emu_info *) &arg - /* s is always from a cpu register, and the cpu does bounds checking * during register load --> no further bounds checks needed */ #define LDT_DESCRIPTOR(s) (((struct desc_struct *)current->mm->context.ldt)[(s) >> 3]) @@ -38,12 +34,12 @@ #define I387 (current->thread.xstate) #define FPU_info (I387->soft.info) -#define FPU_CS (*(unsigned short *) &(FPU_info->regs.cs)) -#define FPU_SS (*(unsigned short *) &(FPU_info->regs.ss)) -#define FPU_DS (*(unsigned short *) &(FPU_info->regs.ds)) -#define FPU_EAX (FPU_info->regs.ax) -#define FPU_EFLAGS (FPU_info->regs.flags) -#define FPU_EIP (FPU_info->regs.ip) +#define FPU_CS (*(unsigned short *) &(FPU_info->regs->cs)) +#define FPU_SS (*(unsigned short *) &(FPU_info->regs->ss)) +#define FPU_DS (*(unsigned short *) &(FPU_info->regs->ds)) +#define FPU_EAX (FPU_info->regs->ax) +#define FPU_EFLAGS (FPU_info->regs->flags) +#define FPU_EIP (FPU_info->regs->ip) #define FPU_ORIG_EIP (FPU_info->___orig_eip) #define FPU_lookahead (I387->soft.lookahead) diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c index 62daa7fcc44c..420b3b6e3915 100644 --- a/arch/x86/math-emu/get_address.c +++ b/arch/x86/math-emu/get_address.c @@ -29,43 +29,43 @@ #define FPU_WRITE_BIT 0x10 static int reg_offset[] = { - offsetof(struct math_emu_info, regs.ax), - offsetof(struct math_emu_info, regs.cx), - offsetof(struct math_emu_info, regs.dx), - offsetof(struct math_emu_info, regs.bx), - offsetof(struct math_emu_info, regs.sp), - offsetof(struct math_emu_info, regs.bp), - offsetof(struct math_emu_info, regs.si), - offsetof(struct math_emu_info, regs.di) + offsetof(struct pt_regs, ax), + offsetof(struct pt_regs, cx), + offsetof(struct pt_regs, dx), + offsetof(struct pt_regs, bx), + offsetof(struct pt_regs, sp), + offsetof(struct pt_regs, bp), + offsetof(struct pt_regs, si), + offsetof(struct pt_regs, di) }; -#define REG_(x) (*(long *)(reg_offset[(x)]+(u_char *) FPU_info)) +#define REG_(x) (*(long *)(reg_offset[(x)] + (u_char *)FPU_info->regs)) static int reg_offset_vm86[] = { - offsetof(struct math_emu_info, regs.cs), - offsetof(struct math_emu_info, vm86.ds), - offsetof(struct math_emu_info, vm86.es), - offsetof(struct math_emu_info, vm86.fs), - offsetof(struct math_emu_info, vm86.gs), - offsetof(struct math_emu_info, regs.ss), - offsetof(struct math_emu_info, vm86.ds) + offsetof(struct pt_regs, cs), + offsetof(struct kernel_vm86_regs, ds), + offsetof(struct kernel_vm86_regs, es), + offsetof(struct kernel_vm86_regs, fs), + offsetof(struct kernel_vm86_regs, gs), + offsetof(struct pt_regs, ss), + offsetof(struct kernel_vm86_regs, ds) }; #define VM86_REG_(x) (*(unsigned short *) \ - (reg_offset_vm86[((unsigned)x)]+(u_char *) FPU_info)) + (reg_offset_vm86[((unsigned)x)] + (u_char *)FPU_info->regs)) static int reg_offset_pm[] = { - offsetof(struct math_emu_info, regs.cs), - offsetof(struct math_emu_info, regs.ds), - offsetof(struct math_emu_info, regs.es), - offsetof(struct math_emu_info, regs.fs), - offsetof(struct math_emu_info, regs.ds), /* dummy, not saved on stack */ - offsetof(struct math_emu_info, regs.ss), - offsetof(struct math_emu_info, regs.ds) + offsetof(struct pt_regs, cs), + offsetof(struct pt_regs, ds), + offsetof(struct pt_regs, es), + offsetof(struct pt_regs, fs), + offsetof(struct pt_regs, ds), /* dummy, not saved on stack */ + offsetof(struct pt_regs, ss), + offsetof(struct pt_regs, ds) }; #define PM_REG_(x) (*(unsigned short *) \ - (reg_offset_pm[((unsigned)x)]+(u_char *) FPU_info)) + (reg_offset_pm[((unsigned)x)] + (u_char *)FPU_info->regs)) /* Decode the SIB byte. This function assumes mod != 0 */ static int sib(int mod, unsigned long *fpu_eip) @@ -346,34 +346,34 @@ void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip, } switch (rm) { case 0: - address += FPU_info->regs.bx + FPU_info->regs.si; + address += FPU_info->regs->bx + FPU_info->regs->si; break; case 1: - address += FPU_info->regs.bx + FPU_info->regs.di; + address += FPU_info->regs->bx + FPU_info->regs->di; break; case 2: - address += FPU_info->regs.bp + FPU_info->regs.si; + address += FPU_info->regs->bp + FPU_info->regs->si; if (addr_modes.override.segment == PREFIX_DEFAULT) addr_modes.override.segment = PREFIX_SS_; break; case 3: - address += FPU_info->regs.bp + FPU_info->regs.di; + address += FPU_info->regs->bp + FPU_info->regs->di; if (addr_modes.override.segment == PREFIX_DEFAULT) addr_modes.override.segment = PREFIX_SS_; break; case 4: - address += FPU_info->regs.si; + address += FPU_info->regs->si; break; case 5: - address += FPU_info->regs.di; + address += FPU_info->regs->di; break; case 6: - address += FPU_info->regs.bp; + address += FPU_info->regs->bp; if (addr_modes.override.segment == PREFIX_DEFAULT) addr_modes.override.segment = PREFIX_SS_; break; case 7: - address += FPU_info->regs.bx; + address += FPU_info->regs->bx; break; } From 6c24b17453c8dc444a746e45b8a404498fc9fcf7 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Mon, 9 Feb 2009 21:08:07 -0600 Subject: [PATCH 055/105] powerpc/fsl-booke: Fix mapping functions to use phys_addr_t Fixed v_mapped_by_tlbcam() and p_mapped_by_tlbcam() to use phys_addr_t instead of unsigned long. In 36-bit physical mode we really need these functions to deal with phys_addr_t when trying to match a physical address or when returning one. Signed-off-by: Kumar Gala --- arch/powerpc/mm/fsl_booke_mmu.c | 4 ++-- arch/powerpc/mm/pgtable_32.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c index 1971e4ee3d6e..ea6e41e39d9f 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -73,7 +73,7 @@ extern unsigned int tlbcam_index; /* * Return PA for this VA if it is mapped by a CAM, or 0 */ -unsigned long v_mapped_by_tlbcam(unsigned long va) +phys_addr_t v_mapped_by_tlbcam(unsigned long va) { int b; for (b = 0; b < tlbcam_index; ++b) @@ -85,7 +85,7 @@ unsigned long v_mapped_by_tlbcam(unsigned long va) /* * Return VA for a given PA or 0 if not mapped */ -unsigned long p_mapped_by_tlbcam(unsigned long pa) +unsigned long p_mapped_by_tlbcam(phys_addr_t pa) { int b; for (b = 0; b < tlbcam_index; ++b) diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 22972cd83cc9..58bcaeba728d 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -61,8 +61,8 @@ void setbat(int index, unsigned long virt, phys_addr_t phys, #ifdef HAVE_TLBCAM extern unsigned int tlbcam_index; -extern unsigned long v_mapped_by_tlbcam(unsigned long va); -extern unsigned long p_mapped_by_tlbcam(unsigned long pa); +extern phys_addr_t v_mapped_by_tlbcam(unsigned long va); +extern unsigned long p_mapped_by_tlbcam(phys_addr_t pa); #else /* !HAVE_TLBCAM */ #define v_mapped_by_tlbcam(x) (0UL) #define p_mapped_by_tlbcam(x) (0UL) From eef336189b2b5ae68bfbef0df24176a4a152d981 Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Fri, 6 Feb 2009 02:02:00 +0000 Subject: [PATCH 056/105] powerpc: Don't emulate mr. instructions Currently emulate_step() emulates mr. instructions without updating cr0 and this can be disastrous. Don't emulate mr. This bug has been around for a while, but I am not sure if its a worthy -stable candidate. I'll leave it to Ben do decide. Signed-off-by: Ananth N Mavinakayanahalli Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/lib/sstep.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 4aae0c387645..13b7d54f185b 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -172,6 +172,8 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) } break; case 0x378: /* orx */ + if (instr & 1) + break; rs = (instr >> 21) & 0x1f; rb = (instr >> 11) & 0x1f; if (rs == rb) { /* mr */ From f25f9074c24f1451a74942c4bc089bb53e47f462 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 7 Feb 2009 20:22:40 +0000 Subject: [PATCH 057/105] powerpc/ftrace: Fix math to calculate offset in TOC Impact: fix dynamic ftrace with large modules in PPC64 The math to calculate the offset into the TOC that is taken from reading the trampoline is incorrect. The bottom half of the offset is a signed extended short. The current code was using an OR to create the offset when it should have been using an addition. Signed-off-by: Steven Rostedt Acked-by: Geoff Levand Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/ftrace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index 5355244c99ff..60c60ccf5e3c 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -195,8 +195,9 @@ __ftrace_make_nop(struct module *mod, return -EINVAL; } - offset = (unsigned)((unsigned short)jmp[0]) << 16 | - (unsigned)((unsigned short)jmp[1]); + /* The bottom half is signed extended */ + offset = ((unsigned)((unsigned short)jmp[0]) << 16) + + (int)((short)jmp[1]); DEBUGP(" %x ", offset); From d87bf76679bd37593ae4a3133f5da9395a4963ac Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Sun, 8 Feb 2009 13:04:14 +0000 Subject: [PATCH 058/105] powerpc/cell: Add missing #include for oprofile arch/powerpc/oprofile/cell/spu_profiler.c is missing a asm/time.h include which is required for ppc_proc_freq. This can cause compile failures for some config combinations. Signed-off-by: Michael Neuling Acked-by: Arnd Bergmann Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/oprofile/cell/spu_profiler.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/oprofile/cell/spu_profiler.c b/arch/powerpc/oprofile/cell/spu_profiler.c index 9305ddaac512..b129d007e7fe 100644 --- a/arch/powerpc/oprofile/cell/spu_profiler.c +++ b/arch/powerpc/oprofile/cell/spu_profiler.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "pr_util.h" #define SCALE_SHIFT 14 From 5b11abfdb572bf9284e596dd198ac2aaf95b6616 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Sun, 8 Feb 2009 14:27:21 +0000 Subject: [PATCH 059/105] powerpc/pci: mmap anonymous memory when legacy_mem doesn't exist The new legacy_mem file in sysfs is causing problems with X on machines that don't support legacy memory access. The way I initially implemented it, we would fail with -ENXIO when trying to mmap it, thus exposing to X that we do support the API but there is no legacy memory. Unfortunately, X poor error handling is causing it to fail to start when it gets this error. This implements a workaround hack that instead maps anonymous memory instead (using shmem if VM_SHARED is set, just like /dev/zero does). Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/pci-common.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 19b12d2cbb4b..0f4181272311 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -561,8 +561,21 @@ int pci_mmap_legacy_page_range(struct pci_bus *bus, (unsigned long long)(offset + size - 1)); if (mmap_state == pci_mmap_mem) { - if ((offset + size) > hose->isa_mem_size) - return -ENXIO; + /* Hack alert ! + * + * Because X is lame and can fail starting if it gets an error trying + * to mmap legacy_mem (instead of just moving on without legacy memory + * access) we fake it here by giving it anonymous memory, effectively + * behaving just like /dev/zero + */ + if ((offset + size) > hose->isa_mem_size) { + printk(KERN_DEBUG + "Process %s (pid:%d) mapped non-existing PCI legacy memory for 0%04x:%02x\n", + current->comm, current->pid, pci_domain_nr(bus), bus->number); + if (vma->vm_flags & VM_SHARED) + return shmem_zero_setup(vma); + return 0; + } offset += hose->isa_mem_phys; } else { unsigned long io_offset = (unsigned long)hose->io_base_virt - _IO_BASE; From 0b2f82872ff855b92e9e8356b90ef429d96d6977 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Sun, 8 Feb 2009 14:49:39 +0000 Subject: [PATCH 060/105] powerpc: Add missing sparsemem.h include arch/powerpc/platforms/pseries/hotplug-memory.c uses remove_section_mapping() but doesn't include sparsemem.h which defines it. This can cause compilation fails for some configs. Signed-off-by: Michael Neuling Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/pseries/hotplug-memory.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index a623ad256e9e..9b21ee68ea50 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -14,6 +14,7 @@ #include #include #include +#include static int pseries_remove_lmb(unsigned long base, unsigned int lmb_size) { From e0fc4f97ab6b11594d0b0658ef6cf02bd68f3b4e Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 9 Feb 2009 21:35:39 +0100 Subject: [PATCH 061/105] [ARM] Storage class should be before const qualifier The C99 specification states in section 6.11.5: The placement of a storage-class specifier other than at the beginning of the declaration specifiers in a declaration is an obsolescent feature. Signed-off-by: Tobias Klauser Signed-off-by: Russell King --- arch/arm/kernel/machine_kexec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c index 440dc62cdc3a..598ca61e7bca 100644 --- a/arch/arm/kernel/machine_kexec.c +++ b/arch/arm/kernel/machine_kexec.c @@ -13,8 +13,8 @@ #include #include -const extern unsigned char relocate_new_kernel[]; -const extern unsigned int relocate_new_kernel_size; +extern const unsigned char relocate_new_kernel[]; +extern const unsigned int relocate_new_kernel_size; extern void setup_mm_for_reboot(char mode); From f6f35bbe7c6494e66590cf519e21da2dd8d59e01 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Sun, 8 Feb 2009 15:22:25 +0100 Subject: [PATCH 062/105] [ARM] AACI: timeout will reach -1 With a postfix decrement the timeout will reach -1 rather than 0, so the warning will not be issued. Signed-off-by: Roel Kluin Signed-off-by: Russell King --- sound/arm/aaci.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/arm/aaci.c b/sound/arm/aaci.c index 89096e811a4b..772901e41ecb 100644 --- a/sound/arm/aaci.c +++ b/sound/arm/aaci.c @@ -90,7 +90,7 @@ static void aaci_ac97_write(struct snd_ac97 *ac97, unsigned short reg, */ do { v = readl(aaci->base + AACI_SLFR); - } while ((v & (SLFR_1TXB|SLFR_2TXB)) && timeout--); + } while ((v & (SLFR_1TXB|SLFR_2TXB)) && --timeout); if (!timeout) dev_err(&aaci->dev->dev, @@ -126,7 +126,7 @@ static unsigned short aaci_ac97_read(struct snd_ac97 *ac97, unsigned short reg) */ do { v = readl(aaci->base + AACI_SLFR); - } while ((v & SLFR_1TXB) && timeout--); + } while ((v & SLFR_1TXB) && --timeout); if (!timeout) { dev_err(&aaci->dev->dev, "timeout on slot 1 TX busy\n"); @@ -147,7 +147,7 @@ static unsigned short aaci_ac97_read(struct snd_ac97 *ac97, unsigned short reg) do { cond_resched(); v = readl(aaci->base + AACI_SLFR) & (SLFR_1RXV|SLFR_2RXV); - } while ((v != (SLFR_1RXV|SLFR_2RXV)) && timeout--); + } while ((v != (SLFR_1RXV|SLFR_2RXV)) && --timeout); if (!timeout) { dev_err(&aaci->dev->dev, "timeout on RX valid\n"); From b52af40923fc91a12e3c7152d833e0c0c6a508f6 Mon Sep 17 00:00:00 2001 From: Clemens Ladisch Date: Tue, 10 Feb 2009 09:21:07 +0100 Subject: [PATCH 063/105] i8327: fix outb() parameter order In i8237A_resume(), when resetting the DMA controller, the parameters to dma_outb() were mixed up. Signed-off-by: Clemens Ladisch [ cleaned up the file a tiny bit. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/i8237.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c index dbd6c1d1b638..b42ca694dc68 100644 --- a/arch/x86/kernel/i8237.c +++ b/arch/x86/kernel/i8237.c @@ -28,10 +28,10 @@ static int i8237A_resume(struct sys_device *dev) flags = claim_dma_lock(); - dma_outb(DMA1_RESET_REG, 0); - dma_outb(DMA2_RESET_REG, 0); + dma_outb(0, DMA1_RESET_REG); + dma_outb(0, DMA2_RESET_REG); - for (i = 0;i < 8;i++) { + for (i = 0; i < 8; i++) { set_dma_addr(i, 0x000000); /* DMA count is a bit weird so this is not 0 */ set_dma_count(i, 1); @@ -51,14 +51,14 @@ static int i8237A_suspend(struct sys_device *dev, pm_message_t state) } static struct sysdev_class i8237_sysdev_class = { - .name = "i8237", - .suspend = i8237A_suspend, - .resume = i8237A_resume, + .name = "i8237", + .suspend = i8237A_suspend, + .resume = i8237A_resume, }; static struct sys_device device_i8237A = { - .id = 0, - .cls = &i8237_sysdev_class, + .id = 0, + .cls = &i8237_sysdev_class, }; static int __init i8237A_init_sysfs(void) @@ -68,5 +68,4 @@ static int __init i8237A_init_sysfs(void) error = sysdev_register(&device_i8237A); return error; } - device_initcall(i8237A_init_sysfs); From 5a6fe125950676015f5108fb71b2a67441755003 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Feb 2009 14:02:27 +0000 Subject: [PATCH 064/105] Do not account for the address space used by hugetlbfs using VM_ACCOUNT When overcommit is disabled, the core VM accounts for pages used by anonymous shared, private mappings and special mappings. It keeps track of VMAs that should be accounted for with VM_ACCOUNT and VMAs that never had a reserve with VM_NORESERVE. Overcommit for hugetlbfs is much riskier than overcommit for base pages due to contiguity requirements. It avoids overcommiting on both shared and private mappings using reservation counters that are checked and updated during mmap(). This ensures (within limits) that hugepages exist in the future when faults occurs or it is too easy to applications to be SIGKILLed. As hugetlbfs makes its own reservations of a different unit to the base page size, VM_ACCOUNT should never be set. Even if the units were correct, we would double account for the usage in the core VM and hugetlbfs. VM_NORESERVE may be set because an application can request no reserves be made for hugetlbfs at the risk of getting killed later. With commit fc8744adc870a8d4366908221508bb113d8b72ee, VM_NORESERVE and VM_ACCOUNT are getting unconditionally set for hugetlbfs-backed mappings. This breaks the accounting for both the core VM and hugetlbfs, can trigger an OOM storm when hugepage pools are too small lockups and corrupted counters otherwise are used. This patch brings hugetlbfs more in line with how the core VM treats VM_NORESERVE but prevents VM_ACCOUNT being set. Signed-off-by: Mel Gorman Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 8 +++++--- include/linux/hugetlb.h | 5 +++-- include/linux/mm.h | 3 +-- ipc/shm.c | 8 +++++--- mm/fremap.c | 2 +- mm/hugetlb.c | 39 +++++++++++++++++++++++++-------------- mm/mmap.c | 38 ++++++++++++++++++++++---------------- mm/mprotect.c | 5 +++-- 8 files changed, 65 insertions(+), 43 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6903d37af037..9b800d97a687 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -108,7 +108,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (hugetlb_reserve_pages(inode, vma->vm_pgoff >> huge_page_order(h), - len >> huge_page_shift(h), vma)) + len >> huge_page_shift(h), vma, + vma->vm_flags)) goto out; ret = 0; @@ -947,7 +948,7 @@ static int can_do_hugetlb_shm(void) can_do_mlock()); } -struct file *hugetlb_file_setup(const char *name, size_t size) +struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) { int error = -ENOMEM; struct file *file; @@ -981,7 +982,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size) error = -ENOMEM; if (hugetlb_reserve_pages(inode, 0, - size >> huge_page_shift(hstate_inode(inode)), NULL)) + size >> huge_page_shift(hstate_inode(inode)), NULL, + acctflag)) goto out_inode; d_instantiate(dentry, inode); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f1d2fba19ea0..af09660001c7 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -33,7 +33,8 @@ unsigned long hugetlb_total_pages(void); int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access); int hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_struct *vma); + struct vm_area_struct *vma, + int acctflags); void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); extern unsigned long hugepages_treat_as_movable; @@ -138,7 +139,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) extern const struct file_operations hugetlbfs_file_operations; extern struct vm_operations_struct hugetlb_vm_ops; -struct file *hugetlb_file_setup(const char *name, size_t); +struct file *hugetlb_file_setup(const char *name, size_t, int); int hugetlb_get_quota(struct address_space *mapping, long delta); void hugetlb_put_quota(struct address_space *mapping, long delta); diff --git a/include/linux/mm.h b/include/linux/mm.h index e8ddc98b8405..323561582c10 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1129,8 +1129,7 @@ extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long flag, unsigned long pgoff); extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, - unsigned int vm_flags, unsigned long pgoff, - int accountable); + unsigned int vm_flags, unsigned long pgoff); static inline unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, diff --git a/ipc/shm.c b/ipc/shm.c index f8f69fad3a27..05d51d2a792c 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -340,6 +340,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) struct file * file; char name[13]; int id; + int acctflag = 0; if (size < SHMMIN || size > ns->shm_ctlmax) return -EINVAL; @@ -364,11 +365,12 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) sprintf (name, "SYSV%08x", key); if (shmflg & SHM_HUGETLB) { - /* hugetlb_file_setup takes care of mlock user accounting */ - file = hugetlb_file_setup(name, size); + /* hugetlb_file_setup applies strict accounting */ + if (shmflg & SHM_NORESERVE) + acctflag = VM_NORESERVE; + file = hugetlb_file_setup(name, size, acctflag); shp->mlock_user = current_user(); } else { - int acctflag = 0; /* * Do not allow no accounting for OVERCOMMIT_NEVER, even * if it's asked for. diff --git a/mm/fremap.c b/mm/fremap.c index 736ba7f3306a..b6ec85abbb39 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -198,7 +198,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, flags &= MAP_NONBLOCK; get_file(file); addr = mmap_region(file, start, size, - flags, vma->vm_flags, pgoff, 1); + flags, vma->vm_flags, pgoff); fput(file); if (IS_ERR_VALUE(addr)) { err = addr; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 618e98304080..207464209546 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2269,14 +2269,12 @@ void hugetlb_change_protection(struct vm_area_struct *vma, int hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_struct *vma) + struct vm_area_struct *vma, + int acctflag) { - long ret, chg; + long ret = 0, chg; struct hstate *h = hstate_inode(inode); - if (vma && vma->vm_flags & VM_NORESERVE) - return 0; - /* * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need @@ -2285,22 +2283,25 @@ int hugetlb_reserve_pages(struct inode *inode, */ if (!vma || vma->vm_flags & VM_SHARED) chg = region_chg(&inode->i_mapping->private_list, from, to); - else { - struct resv_map *resv_map = resv_map_alloc(); - if (!resv_map) - return -ENOMEM; - + else chg = to - from; - set_vma_resv_map(vma, resv_map); - set_vma_resv_flags(vma, HPAGE_RESV_OWNER); - } - if (chg < 0) return chg; if (hugetlb_get_quota(inode->i_mapping, chg)) return -ENOSPC; + + /* + * Only apply hugepage reservation if asked. We still have to + * take the filesystem quota because it is an upper limit + * defined for the mount and not necessarily memory as a whole + */ + if (acctflag & VM_NORESERVE) { + reset_vma_resv_huge_pages(vma); + return 0; + } + ret = hugetlb_acct_memory(h, chg); if (ret < 0) { hugetlb_put_quota(inode->i_mapping, chg); @@ -2308,6 +2309,16 @@ int hugetlb_reserve_pages(struct inode *inode, } if (!vma || vma->vm_flags & VM_SHARED) region_add(&inode->i_mapping->private_list, from, to); + else { + struct resv_map *resv_map = resv_map_alloc(); + + if (!resv_map) + return -ENOMEM; + + set_vma_resv_map(vma, resv_map); + set_vma_resv_flags(vma, HPAGE_RESV_OWNER); + } + return 0; } diff --git a/mm/mmap.c b/mm/mmap.c index 214b6a258eeb..eb1270bebe67 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -918,7 +918,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, struct inode *inode; unsigned int vm_flags; int error; - int accountable = 1; unsigned long reqprot = prot; /* @@ -1019,8 +1018,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, return -EPERM; vm_flags &= ~VM_MAYEXEC; } - if (is_file_hugepages(file)) - accountable = 0; if (!file->f_op || !file->f_op->mmap) return -ENODEV; @@ -1053,8 +1050,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (error) return error; - return mmap_region(file, addr, len, flags, vm_flags, pgoff, - accountable); + return mmap_region(file, addr, len, flags, vm_flags, pgoff); } EXPORT_SYMBOL(do_mmap_pgoff); @@ -1092,17 +1088,23 @@ int vma_wants_writenotify(struct vm_area_struct *vma) /* * We account for memory if it's a private writeable mapping, - * and VM_NORESERVE wasn't set. + * not hugepages and VM_NORESERVE wasn't set. */ -static inline int accountable_mapping(unsigned int vm_flags) +static inline int accountable_mapping(struct file *file, unsigned int vm_flags) { + /* + * hugetlb has its own accounting separate from the core VM + * VM_HUGETLB may not be set yet so we cannot check for that flag. + */ + if (file && is_file_hugepages(file)) + return 0; + return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; } unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, - unsigned int vm_flags, unsigned long pgoff, - int accountable) + unsigned int vm_flags, unsigned long pgoff) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -1128,18 +1130,22 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* * Set 'VM_NORESERVE' if we should not account for the - * memory use of this mapping. We only honor MAP_NORESERVE - * if we're allowed to overcommit memory. + * memory use of this mapping. */ - if ((flags & MAP_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER) - vm_flags |= VM_NORESERVE; - if (!accountable) - vm_flags |= VM_NORESERVE; + if ((flags & MAP_NORESERVE)) { + /* We honor MAP_NORESERVE if allowed to overcommit */ + if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) + vm_flags |= VM_NORESERVE; + + /* hugetlb applies strict overcommit unless MAP_NORESERVE */ + if (file && is_file_hugepages(file)) + vm_flags |= VM_NORESERVE; + } /* * Private writable mapping: check memory availability */ - if (accountable_mapping(vm_flags)) { + if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; if (security_vm_enough_memory(charged)) return -ENOMEM; diff --git a/mm/mprotect.c b/mm/mprotect.c index abe2694e13f4..258197b76fb4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -151,10 +151,11 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, /* * If we make a private mapping writable we increase our commit; * but (without finer accounting) cannot reduce our commit if we - * make it unwritable again. + * make it unwritable again. hugetlb mapping were accounted for + * even if read-only so there is no need to account for them here */ if (newflags & VM_WRITE) { - if (!(oldflags & (VM_ACCOUNT|VM_WRITE| + if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| VM_SHARED|VM_NORESERVE))) { charged = nrpages; if (security_vm_enough_memory(charged)) From 1db8508cf483dc1ecf66141f90a7c03659d69512 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Tue, 10 Feb 2009 23:27:32 +0100 Subject: [PATCH 065/105] hugetlbfs: fix build failure with !CONFIG_HUGETLBFS Fix regression due to 5a6fe125950676015f5108fb71b2a67441755003, "Do not account for the address space used by hugetlbfs using VM_ACCOUNT" which added an argument to the function hugetlb_file_setup() but not to the macro hugetlb_file_setup(). Reported-by: Chris Clayton Signed-off-by: Stefan Richter Acked-by: Mel Gorman Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index af09660001c7..03be7f29ca01 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -159,9 +159,9 @@ static inline void set_file_hugepages(struct file *file) } #else /* !CONFIG_HUGETLBFS */ -#define is_file_hugepages(file) 0 -#define set_file_hugepages(file) BUG() -#define hugetlb_file_setup(name,size) ERR_PTR(-ENOSYS) +#define is_file_hugepages(file) 0 +#define set_file_hugepages(file) BUG() +#define hugetlb_file_setup(name,size,acctflag) ERR_PTR(-ENOSYS) #endif /* !CONFIG_HUGETLBFS */ From f99fb8a2cbf0fd9ce9b2d5d298943d0d4dc479f7 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Tue, 10 Feb 2009 10:57:46 +0000 Subject: [PATCH 066/105] powerpc/mm: Fix _PAGE_COHERENT support on classic ppc32 HW The following commit: commit 64b3d0e8122b422e879b23d42f9e0e8efbbf9744 Author: Benjamin Herrenschmidt Date: Thu Dec 18 19:13:51 2008 +0000 powerpc/mm: Rework usage of _PAGE_COHERENT/NO_CACHE/GUARDED broke setting of the _PAGE_COHERENT bit in the PPC HW PTE. Since we now actually set _PAGE_COHERENT in the Linux PTE we shouldn't be clearing it out before we propogate it to the PPC HW PTE. Reported-by: Martyn Welch Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/hash_low_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index 67850ec9feb3..14af8cedab70 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -320,7 +320,7 @@ _GLOBAL(create_hpte) and r8,r8,r0 /* writable if _RW & _DIRTY */ rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */ rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */ - ori r8,r8,0xe14 /* clear out reserved bits and M */ + ori r8,r8,0xe04 /* clear out reserved bits */ andc r8,r5,r8 /* PP = user? (rw&dirty? 2: 3): 0 */ BEGIN_FTR_SECTION rlwinm r8,r8,0,~_PAGE_COHERENT /* clear M (coherence not required) */ From f47a454db9129d2e61b224a40f4365cdd4f83042 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Feb 2009 11:53:23 -0500 Subject: [PATCH 067/105] tracing, x86: fix constraint for parent variable The constraint used for retrieving and restoring the parent function pointer is incorrect. The parent variable is a pointer, and the address of the pointer is modified by the asm statement and not the pointer itself. It is incorrect to pass it in as an output constraint since the asm will never update the pointer. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 9d549e4fe880..231bdd3c5b1c 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -488,8 +488,8 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) * ignore such a protection. */ asm volatile( - "1: " _ASM_MOV " (%[parent_old]), %[old]\n" - "2: " _ASM_MOV " %[return_hooker], (%[parent_replaced])\n" + "1: " _ASM_MOV " (%[parent]), %[old]\n" + "2: " _ASM_MOV " %[return_hooker], (%[parent])\n" " movl $0, %[faulted]\n" "3:\n" @@ -501,9 +501,8 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) _ASM_EXTABLE(1b, 4b) _ASM_EXTABLE(2b, 4b) - : [parent_replaced] "=r" (parent), [old] "=r" (old), - [faulted] "=r" (faulted) - : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker) + : [old] "=r" (old), [faulted] "=r" (faulted) + : [parent] "r" (parent), [return_hooker] "r" (return_hooker) : "memory" ); From 06eb23b1ba39c61ee5d5faeb42a097635693e370 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 9 Feb 2009 02:02:33 +0100 Subject: [PATCH 068/105] ptrace, x86: fix the usage of ptrace_fork() I noticed by pure accident we have ptrace_fork() and friends. This was added by "x86, bts: add fork and exit handling", commit bf53de907dfdaac178c92d774aae7370d7b97d20. I can't test this, ds_request_bts() returns -EOPNOTSUPP, but I strongly believe this needs the fix. I think something like this program int main(void) { int pid = fork(); if (!pid) { ptrace(PTRACE_TRACEME, 0, NULL, NULL); kill(getpid(), SIGSTOP); fork(); } else { struct ptrace_bts_config bts = { .flags = PTRACE_BTS_O_ALLOC, .size = 4 * 4096, }; wait(NULL); ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_TRACEFORK); ptrace(PTRACE_BTS_CONFIG, pid, &bts, sizeof(bts)); ptrace(PTRACE_CONT, pid, NULL, NULL); sleep(1); } return 0; } should crash the kernel. If the task is traced by its natural parent ptrace_reparented() returns 0 but we should clear ->btsxxx anyway. Signed-off-by: Oleg Nesterov Acked-by: Markus Metzger Signed-off-by: Ingo Molnar --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/fork.c b/kernel/fork.c index 242a706e7721..43c039d55e95 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1093,7 +1093,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif - if (unlikely(ptrace_reparented(current))) + if (unlikely(current->ptrace)) ptrace_fork(p, clone_flags); /* Perform scheduler related setup. Assign this task to a CPU. */ From d5e842c4b79cc8e454c4fbbc1ce6a43d43184367 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 11 Feb 2009 10:37:28 +0100 Subject: [PATCH 069/105] [S390] vdso: fix per cpu vdso pointer in lowcore The vdso_per_cpu_data entry in the lowcore structure uses __u32 instead of __u64. If the data page is above 4GB the pointer is truncated and the kernel crashes. Reported-by: Mijo Safradin Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/lowcore.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index ffdef5fe8587..f3720defdd16 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -384,8 +384,8 @@ struct _lowcore __u32 panic_magic; /* 0xe00 */ /* Per cpu primary space access list */ - __u8 pad_0xe04[0xe3c-0xe04]; /* 0xe04 */ - __u32 vdso_per_cpu_data; /* 0xe3c */ + __u8 pad_0xe04[0xe38-0xe04]; /* 0xe04 */ + __u64 vdso_per_cpu_data; /* 0xe38 */ __u32 paste[16]; /* 0xe40 */ __u8 pad13[0x11b8-0xe80]; /* 0xe80 */ From 0addff81513a71b279a5eca5bf7cba2052c8b737 Mon Sep 17 00:00:00 2001 From: Sachin Sant Date: Wed, 11 Feb 2009 10:37:29 +0100 Subject: [PATCH 070/105] [S390] Fix init irq proc build break. Embed init_irq_proc(s390) within CONFIG_PROC_FS to fix a build break. Signed-off-by : Sachin Sant --- arch/s390/kernel/irq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index e7c5bfb7c755..026a37a94fc9 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -95,6 +95,7 @@ asmlinkage void do_softirq(void) local_irq_restore(flags); } +#ifdef CONFIG_PROC_FS void init_irq_proc(void) { struct proc_dir_entry *root_irq_dir; @@ -102,3 +103,4 @@ void init_irq_proc(void) root_irq_dir = proc_mkdir("irq", NULL); create_prof_cpu_mask(root_irq_dir); } +#endif From ca0b4b7d2cb57a2e24d7e48ce9b411b9baa3bf63 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Wed, 11 Feb 2009 10:37:30 +0100 Subject: [PATCH 071/105] [S390] dasd: bus_id -> dev_name() conversion. bus_id usage crept in again; fix it. Signed-off-by: Cornelia Huck Signed-off-by: Heiko Carstens --- drivers/s390/block/dasd_devmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c index 300e28a531f8..34339902efb9 100644 --- a/drivers/s390/block/dasd_devmap.c +++ b/drivers/s390/block/dasd_devmap.c @@ -677,7 +677,7 @@ static ssize_t dasd_ff_show(struct device *dev, struct device_attribute *attr, struct dasd_devmap *devmap; int ff_flag; - devmap = dasd_find_busid(dev->bus_id); + devmap = dasd_find_busid(dev_name(dev)); if (!IS_ERR(devmap)) ff_flag = (devmap->features & DASD_FEATURE_FAILFAST) != 0; else From 48cae885d5a896030588978f503c73c5ed5e62b1 Mon Sep 17 00:00:00 2001 From: Stefan Weinhuber Date: Wed, 11 Feb 2009 10:37:31 +0100 Subject: [PATCH 072/105] [S390] dasd: fix race in dasd timer handling In dasd_device_set_timer and dasd_block_set_timer we interpret the return value of mod_timer in a wrong way. If the timer expires in the small window between our check of timer_pending and the call to mod_timer, then the timer will be set, mod_timer returns zero and we will call add_timer for a timer that is already pending. As del_timer and mod_timer do all the necessary checking themselves, we can simplify our code and remove the race a the same time. Signed-off-by: Stefan Weinhuber Signed-off-by: Martin Schwidefsky --- drivers/s390/block/dasd.c | 46 ++++++++++++++------------------------- 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index bd5914994142..08c23a921012 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -57,6 +57,8 @@ static void dasd_device_tasklet(struct dasd_device *); static void dasd_block_tasklet(struct dasd_block *); static void do_kick_device(struct work_struct *); static void dasd_return_cqr_cb(struct dasd_ccw_req *, void *); +static void dasd_device_timeout(unsigned long); +static void dasd_block_timeout(unsigned long); /* * SECTION: Operations on the device structure. @@ -99,6 +101,8 @@ struct dasd_device *dasd_alloc_device(void) (unsigned long) device); INIT_LIST_HEAD(&device->ccw_queue); init_timer(&device->timer); + device->timer.function = dasd_device_timeout; + device->timer.data = (unsigned long) device; INIT_WORK(&device->kick_work, do_kick_device); device->state = DASD_STATE_NEW; device->target = DASD_STATE_NEW; @@ -138,6 +142,8 @@ struct dasd_block *dasd_alloc_block(void) INIT_LIST_HEAD(&block->ccw_queue); spin_lock_init(&block->queue_lock); init_timer(&block->timer); + block->timer.function = dasd_block_timeout; + block->timer.data = (unsigned long) block; return block; } @@ -915,19 +921,10 @@ static void dasd_device_timeout(unsigned long ptr) */ void dasd_device_set_timer(struct dasd_device *device, int expires) { - if (expires == 0) { - if (timer_pending(&device->timer)) - del_timer(&device->timer); - return; - } - if (timer_pending(&device->timer)) { - if (mod_timer(&device->timer, jiffies + expires)) - return; - } - device->timer.function = dasd_device_timeout; - device->timer.data = (unsigned long) device; - device->timer.expires = jiffies + expires; - add_timer(&device->timer); + if (expires == 0) + del_timer(&device->timer); + else + mod_timer(&device->timer, jiffies + expires); } /* @@ -935,8 +932,7 @@ void dasd_device_set_timer(struct dasd_device *device, int expires) */ void dasd_device_clear_timer(struct dasd_device *device) { - if (timer_pending(&device->timer)) - del_timer(&device->timer); + del_timer(&device->timer); } static void dasd_handle_killed_request(struct ccw_device *cdev, @@ -1586,19 +1582,10 @@ static void dasd_block_timeout(unsigned long ptr) */ void dasd_block_set_timer(struct dasd_block *block, int expires) { - if (expires == 0) { - if (timer_pending(&block->timer)) - del_timer(&block->timer); - return; - } - if (timer_pending(&block->timer)) { - if (mod_timer(&block->timer, jiffies + expires)) - return; - } - block->timer.function = dasd_block_timeout; - block->timer.data = (unsigned long) block; - block->timer.expires = jiffies + expires; - add_timer(&block->timer); + if (expires == 0) + del_timer(&block->timer); + else + mod_timer(&block->timer, jiffies + expires); } /* @@ -1606,8 +1593,7 @@ void dasd_block_set_timer(struct dasd_block *block, int expires) */ void dasd_block_clear_timer(struct dasd_block *block) { - if (timer_pending(&block->timer)) - del_timer(&block->timer); + del_timer(&block->timer); } /* From 95ec807e0a42188ec1ce29cf939816ad1e22f2d3 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 11 Feb 2009 10:37:32 +0100 Subject: [PATCH 073/105] [S390] Update default configuration. Signed-off-by: Martin Schwidefsky --- arch/s390/defconfig | 87 +++++++++++++++++++++++++++++++-------------- 1 file changed, 60 insertions(+), 27 deletions(-) diff --git a/arch/s390/defconfig b/arch/s390/defconfig index a0e748da9909..31e809c77790 100644 --- a/arch/s390/defconfig +++ b/arch/s390/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.28-rc6 -# Thu Nov 27 11:00:49 2008 +# Linux kernel version: 2.6.29-rc4 +# Wed Feb 11 10:07:16 2009 # CONFIG_SCHED_MC=y CONFIG_MMU=y @@ -14,12 +14,14 @@ CONFIG_RWSEM_XCHGADD_ALGORITHM=y # CONFIG_ARCH_HAS_ILOG2_U64 is not set CONFIG_GENERIC_HWEIGHT=y CONFIG_GENERIC_TIME=y +CONFIG_GENERIC_TIME_VSYSCALL=y CONFIG_GENERIC_CLOCKEVENTS=y CONFIG_GENERIC_BUG=y CONFIG_NO_IOMEM=y CONFIG_NO_DMA=y CONFIG_GENERIC_LOCKBREAK=y CONFIG_PGSTE=y +CONFIG_VIRT_CPU_ACCOUNTING=y CONFIG_S390=y CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" @@ -39,20 +41,29 @@ CONFIG_POSIX_MQUEUE=y # CONFIG_TASKSTATS is not set CONFIG_AUDIT=y # CONFIG_AUDITSYSCALL is not set + +# +# RCU Subsystem +# +CONFIG_CLASSIC_RCU=y +# CONFIG_TREE_RCU is not set +# CONFIG_PREEMPT_RCU is not set +# CONFIG_TREE_RCU_TRACE is not set +# CONFIG_PREEMPT_RCU_TRACE is not set CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=17 +CONFIG_GROUP_SCHED=y +CONFIG_FAIR_GROUP_SCHED=y +# CONFIG_RT_GROUP_SCHED is not set +CONFIG_USER_SCHED=y +# CONFIG_CGROUP_SCHED is not set CONFIG_CGROUPS=y # CONFIG_CGROUP_DEBUG is not set CONFIG_CGROUP_NS=y # CONFIG_CGROUP_FREEZER is not set # CONFIG_CGROUP_DEVICE is not set # CONFIG_CPUSETS is not set -CONFIG_GROUP_SCHED=y -CONFIG_FAIR_GROUP_SCHED=y -# CONFIG_RT_GROUP_SCHED is not set -CONFIG_USER_SCHED=y -# CONFIG_CGROUP_SCHED is not set # CONFIG_CGROUP_CPUACCT is not set # CONFIG_RESOURCE_COUNTERS is not set CONFIG_SYSFS_DEPRECATED=y @@ -63,6 +74,7 @@ CONFIG_UTS_NS=y CONFIG_IPC_NS=y # CONFIG_USER_NS is not set # CONFIG_PID_NS is not set +# CONFIG_NET_NS is not set CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set @@ -91,17 +103,17 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set # CONFIG_PROFILING is not set -# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y CONFIG_KPROBES=y +CONFIG_HAVE_SYSCALL_WRAPPERS=y CONFIG_KRETPROBES=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y +CONFIG_USE_GENERIC_SMP_HELPERS=y # CONFIG_HAVE_GENERIC_DMA_COHERENT is not set CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y -# CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 CONFIG_MODULES=y # CONFIG_MODULE_FORCE_LOAD is not set @@ -109,7 +121,7 @@ CONFIG_MODULE_UNLOAD=y # CONFIG_MODULE_FORCE_UNLOAD is not set CONFIG_MODVERSIONS=y # CONFIG_MODULE_SRCVERSION_ALL is not set -CONFIG_KMOD=y +CONFIG_INIT_ALL_POSSIBLE=y CONFIG_STOP_MACHINE=y CONFIG_BLOCK=y # CONFIG_BLK_DEV_IO_TRACE is not set @@ -130,7 +142,6 @@ CONFIG_DEFAULT_DEADLINE=y # CONFIG_DEFAULT_NOOP is not set CONFIG_DEFAULT_IOSCHED="deadline" CONFIG_PREEMPT_NOTIFIERS=y -CONFIG_CLASSIC_RCU=y # CONFIG_FREEZER is not set # @@ -161,6 +172,7 @@ CONFIG_S390_EXEC_PROTECT=y CONFIG_MARCH_Z900=y # CONFIG_MARCH_Z990 is not set # CONFIG_MARCH_Z9_109 is not set +# CONFIG_MARCH_Z10 is not set CONFIG_PACK_STACK=y # CONFIG_SMALL_STACK is not set CONFIG_CHECK_STACK=y @@ -174,7 +186,6 @@ CONFIG_ARCH_POPULATES_NODE_MAP=y # CONFIG_PREEMPT_NONE is not set # CONFIG_PREEMPT_VOLUNTARY is not set CONFIG_PREEMPT=y -# CONFIG_PREEMPT_RCU is not set CONFIG_ARCH_SPARSEMEM_ENABLE=y CONFIG_ARCH_SPARSEMEM_DEFAULT=y CONFIG_ARCH_SELECT_MEMORY_MODEL=y @@ -195,7 +206,6 @@ CONFIG_MEMORY_HOTREMOVE=y CONFIG_PAGEFLAGS_EXTENDED=y CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_MIGRATION=y -CONFIG_RESOURCES_64BIT=y CONFIG_PHYS_ADDR_T_64BIT=y CONFIG_ZONE_DMA_FLAG=1 CONFIG_BOUNCE=y @@ -207,7 +217,6 @@ CONFIG_UNEVICTABLE_LRU=y # CONFIG_MACHCHK_WARNING=y CONFIG_QDIO=y -# CONFIG_QDIO_DEBUG is not set CONFIG_CHSC_SCH=m # @@ -227,15 +236,13 @@ CONFIG_PFAULT=y # CONFIG_SHARED_KERNEL is not set # CONFIG_CMM is not set # CONFIG_PAGE_STATES is not set -CONFIG_VIRT_TIMER=y -CONFIG_VIRT_CPU_ACCOUNTING=y # CONFIG_APPLDATA_BASE is not set CONFIG_HZ_100=y # CONFIG_HZ_250 is not set # CONFIG_HZ_300 is not set # CONFIG_HZ_1000 is not set CONFIG_HZ=100 -# CONFIG_SCHED_HRTICK is not set +CONFIG_SCHED_HRTICK=y CONFIG_S390_HYPFS_FS=y CONFIG_KEXEC=y # CONFIG_ZFCPDUMP is not set @@ -245,6 +252,7 @@ CONFIG_NET=y # # Networking options # +CONFIG_COMPAT_NET_DEV_OPS=y CONFIG_PACKET=y # CONFIG_PACKET_MMAP is not set CONFIG_UNIX=y @@ -383,6 +391,7 @@ CONFIG_NET_SCH_TBF=m CONFIG_NET_SCH_GRED=m CONFIG_NET_SCH_DSMARK=m # CONFIG_NET_SCH_NETEM is not set +# CONFIG_NET_SCH_DRR is not set # CONFIG_NET_SCH_INGRESS is not set # @@ -400,6 +409,7 @@ CONFIG_CLS_U32_MARK=y CONFIG_NET_CLS_RSVP=m CONFIG_NET_CLS_RSVP6=m CONFIG_NET_CLS_FLOW=m +# CONFIG_NET_CLS_CGROUP is not set # CONFIG_NET_EMATCH is not set CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=y @@ -411,6 +421,7 @@ CONFIG_NET_ACT_NAT=m # CONFIG_NET_ACT_SKBEDIT is not set # CONFIG_NET_CLS_IND is not set CONFIG_NET_SCH_FIFO=y +# CONFIG_DCB is not set # # Network testing @@ -428,6 +439,7 @@ CONFIG_CAN_VCAN=m # CONFIG_CAN_DEBUG_DEVICES is not set # CONFIG_AF_RXRPC is not set # CONFIG_PHONET is not set +# CONFIG_WIMAX is not set # CONFIG_RFKILL is not set # CONFIG_NET_9P is not set # CONFIG_PCMCIA is not set @@ -475,10 +487,14 @@ CONFIG_DASD_DIAG=y CONFIG_DASD_EER=y CONFIG_VIRTIO_BLK=m CONFIG_MISC_DEVICES=y -# CONFIG_EEPROM_93CX6 is not set # CONFIG_ENCLOSURE_SERVICES is not set # CONFIG_C2PORT is not set +# +# EEPROM support +# +# CONFIG_EEPROM_93CX6 is not set + # # SCSI device support # @@ -520,6 +536,7 @@ CONFIG_SCSI_FC_ATTRS=y # CONFIG_SCSI_SRP_ATTRS is not set CONFIG_SCSI_LOWLEVEL=y # CONFIG_ISCSI_TCP is not set +# CONFIG_LIBFC is not set # CONFIG_SCSI_DEBUG is not set CONFIG_ZFCP=y CONFIG_SCSI_DH=m @@ -566,6 +583,10 @@ CONFIG_NET_ETHERNET=y CONFIG_NETDEV_1000=y CONFIG_NETDEV_10000=y # CONFIG_TR is not set + +# +# Enable WiMAX (Networking options) to see the WiMAX drivers +# # CONFIG_WAN is not set # @@ -593,9 +614,11 @@ CONFIG_VIRTIO_NET=m # CONFIG_DEVKMEM=y CONFIG_UNIX98_PTYS=y +# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set CONFIG_LEGACY_PTYS=y CONFIG_LEGACY_PTY_COUNT=256 CONFIG_HVC_DRIVER=y +CONFIG_HVC_IUCV=y CONFIG_VIRTIO_CONSOLE=y CONFIG_HW_RANDOM=m CONFIG_HW_RANDOM_VIRTIO=m @@ -645,7 +668,6 @@ CONFIG_S390_VMUR=m # CONFIG_NEW_LEDS is not set CONFIG_ACCESSIBILITY=y # CONFIG_STAGING is not set -CONFIG_STAGING_EXCLUDE_BUILD=y # # File systems @@ -668,6 +690,7 @@ CONFIG_FILE_LOCKING=y # CONFIG_XFS_FS is not set # CONFIG_GFS2_FS is not set # CONFIG_OCFS2_FS is not set +# CONFIG_BTRFS_FS is not set CONFIG_DNOTIFY=y CONFIG_INOTIFY=y CONFIG_INOTIFY_USER=y @@ -703,10 +726,7 @@ CONFIG_TMPFS_POSIX_ACL=y # CONFIG_HUGETLBFS is not set # CONFIG_HUGETLB_PAGE is not set CONFIG_CONFIGFS_FS=m - -# -# Miscellaneous filesystems -# +CONFIG_MISC_FILESYSTEMS=y # CONFIG_ADFS_FS is not set # CONFIG_AFFS_FS is not set # CONFIG_HFS_FS is not set @@ -715,6 +735,7 @@ CONFIG_CONFIGFS_FS=m # CONFIG_BFS_FS is not set # CONFIG_EFS_FS is not set # CONFIG_CRAMFS is not set +# CONFIG_SQUASHFS is not set # CONFIG_VXFS_FS is not set # CONFIG_MINIX_FS is not set # CONFIG_OMFS_FS is not set @@ -808,6 +829,7 @@ CONFIG_DEBUG_BUGVERBOSE=y CONFIG_DEBUG_MEMORY_INIT=y # CONFIG_DEBUG_LIST is not set # CONFIG_DEBUG_SG is not set +# CONFIG_DEBUG_NOTIFIERS is not set # CONFIG_FRAME_POINTER is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_RCU_CPU_STALL_DETECTOR is not set @@ -818,15 +840,19 @@ CONFIG_DEBUG_MEMORY_INIT=y # CONFIG_FAULT_INJECTION is not set # CONFIG_LATENCYTOP is not set CONFIG_SYSCTL_SYSCALL_CHECK=y +CONFIG_HAVE_FUNCTION_TRACER=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set # CONFIG_IRQSOFF_TRACER is not set # CONFIG_PREEMPT_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set # CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set # CONFIG_DYNAMIC_PRINTK_DEBUG is not set CONFIG_SAMPLES=y # CONFIG_SAMPLE_KOBJECT is not set @@ -847,11 +873,17 @@ CONFIG_CRYPTO=y # CONFIG_CRYPTO_FIPS=y CONFIG_CRYPTO_ALGAPI=y -CONFIG_CRYPTO_AEAD=y +CONFIG_CRYPTO_ALGAPI2=y +CONFIG_CRYPTO_AEAD=m +CONFIG_CRYPTO_AEAD2=y CONFIG_CRYPTO_BLKCIPHER=y -CONFIG_CRYPTO_HASH=y -CONFIG_CRYPTO_RNG=y +CONFIG_CRYPTO_BLKCIPHER2=y +CONFIG_CRYPTO_HASH=m +CONFIG_CRYPTO_HASH2=y +CONFIG_CRYPTO_RNG=m +CONFIG_CRYPTO_RNG2=y CONFIG_CRYPTO_MANAGER=y +CONFIG_CRYPTO_MANAGER2=y CONFIG_CRYPTO_GF128MUL=m # CONFIG_CRYPTO_NULL is not set # CONFIG_CRYPTO_CRYPTD is not set @@ -885,7 +917,7 @@ CONFIG_CRYPTO_HMAC=m # # Digest # -# CONFIG_CRYPTO_CRC32C is not set +CONFIG_CRYPTO_CRC32C=m # CONFIG_CRYPTO_MD4 is not set CONFIG_CRYPTO_MD5=m # CONFIG_CRYPTO_MICHAEL_MIC is not set @@ -942,6 +974,7 @@ CONFIG_S390_PRNG=m # Library routines # CONFIG_BITREVERSE=m +CONFIG_GENERIC_FIND_LAST_BIT=y # CONFIG_CRC_CCITT is not set # CONFIG_CRC16 is not set CONFIG_CRC_T10DIF=y From b22f4858126a6aa852ad745b94f6b25dbdea708e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 10 Feb 2009 15:49:11 +0100 Subject: [PATCH 074/105] tracing/sysprof: add missing tracing_{start,stop}_record_cmdline() Add the missing pair tracing_{start,stop}_record_cmdline() to record well the cmdline associated with pid. Changes in v2: - fix a build error, the sched_switch tracer is needed to record the cmdline. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 1 + kernel/trace/trace_sysprof.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3a331289457a..6ff928acd453 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -134,6 +134,7 @@ config SYSPROF_TRACER bool "Sysprof Tracer" depends on X86 select TRACING + select CONTEXT_SWITCH_TRACER help This tracer provides the trace needed by the 'Sysprof' userspace tool. diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 84ca9d81e74d..9902c15997ad 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -238,6 +238,8 @@ static int stack_trace_init(struct trace_array *tr) { sysprof_trace = tr; + tracing_start_cmdline_record(); + mutex_lock(&sample_timer_lock); start_stack_timers(); tracer_enabled = 1; @@ -247,6 +249,7 @@ static int stack_trace_init(struct trace_array *tr) static void stack_trace_reset(struct trace_array *tr) { + tracing_stop_cmdline_record(); stop_stack_trace(tr); } From 00f62f614bb713027b9296068d1879fbca511eb7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 9 Feb 2009 17:04:06 -0200 Subject: [PATCH 075/105] ring_buffer: pahole struct ring_buffer While fixing some bugs in pahole (built-in.o files were not being processed due to relocation problems) I found out about these packable structures: $ pahole --packable kernel/trace/ring_buffer.o | grep ring ring_buffer 72 64 8 ring_buffer_per_cpu 112 104 8 If we take a look at the current layout of struct ring_buffer we can see that we have two 4 bytes holes. $ pahole -C ring_buffer kernel/trace/ring_buffer.o struct ring_buffer { unsigned int pages; /* 0 4 */ unsigned int flags; /* 4 4 */ int cpus; /* 8 4 */ /* XXX 4 bytes hole, try to pack */ cpumask_var_t cpumask; /* 16 8 */ atomic_t record_disabled; /* 24 4 */ /* XXX 4 bytes hole, try to pack */ struct mutex mutex; /* 32 32 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct ring_buffer_per_cpu * * buffers; /* 64 8 */ /* size: 72, cachelines: 2, members: 7 */ /* sum members: 64, holes: 2, sum holes: 8 */ /* last cacheline: 8 bytes */ }; So, if I ask pahole to reorganize it: $ pahole -C ring_buffer --reorganize kernel/trace/ring_buffer.o struct ring_buffer { unsigned int pages; /* 0 4 */ unsigned int flags; /* 4 4 */ int cpus; /* 8 4 */ atomic_t record_disabled; /* 12 4 */ cpumask_var_t cpumask; /* 16 8 */ struct mutex mutex; /* 24 32 */ struct ring_buffer_per_cpu * * buffers; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ /* size: 64, cachelines: 1, members: 7 */ }; /* saved 8 bytes and 1 cacheline! */ We get it using just one 64 bytes cacheline. To see what it did: $ pahole -C ring_buffer --reorganize --show_reorg_steps \ kernel/trace/ring_buffer.o | grep \/ /* Moving 'record_disabled' from after 'cpumask' to after 'cpus' */ Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 53ba3a6d16d0..27ef3bf13ed2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -273,8 +273,8 @@ struct ring_buffer { unsigned pages; unsigned flags; int cpus; - cpumask_var_t cpumask; atomic_t record_disabled; + cpumask_var_t cpumask; struct mutex mutex; From 3fccfd67df79c6351a156eb25a7a514e5f39c4d9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Feb 2009 16:37:31 +0100 Subject: [PATCH 076/105] timers: split process wide cpu clocks/timers, fix To decrease the chance of a missed enable, always enable the timer when we sample it, we'll always disable it when we find that there are no active timers in the jiffy tick. This fixes a flood of warnings reported by Mike Galbraith. Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/posix-cpu-timers.c | 42 +++++++++++++-------------------------- 2 files changed, 15 insertions(+), 28 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 79392916d6c9..5d10fa0b6002 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2209,6 +2209,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) unsigned long flags; spin_lock_irqsave(&cputimer->lock, flags); + cputimer->running = 1; *times = cputimer->cputime; spin_unlock_irqrestore(&cputimer->lock, flags); } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index db107c9bbc05..e5d7bfdfa7d4 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -488,7 +488,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) { struct task_cputime cputime; - thread_group_cputime(tsk, &cputime); + thread_group_cputimer(tsk, &cputime); cleanup_timers(tsk->signal->cpu_timers, cputime.utime, cputime.stime, cputime.sum_exec_runtime); } @@ -506,29 +506,6 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) now); } -/* - * Enable the process wide cpu timer accounting. - * - * serialized using ->sighand->siglock - */ -static void start_process_timers(struct task_struct *tsk) -{ - tsk->signal->cputimer.running = 1; - barrier(); -} - -/* - * Release the process wide timer accounting -- timer stops ticking when - * nobody cares about it. - * - * serialized using ->sighand->siglock - */ -static void stop_process_timers(struct task_struct *tsk) -{ - tsk->signal->cputimer.running = 0; - barrier(); -} - /* * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the tasklist_lock held @@ -549,9 +526,6 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) BUG_ON(!irqs_disabled()); spin_lock(&p->sighand->siglock); - if (!CPUCLOCK_PERTHREAD(timer->it_clock)) - start_process_timers(p); - listpos = head; if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { list_for_each_entry(next, head, entry) { @@ -1021,6 +995,19 @@ static void check_thread_timers(struct task_struct *tsk, } } +static void stop_process_timers(struct task_struct *tsk) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + unsigned long flags; + + if (!cputimer->running) + return; + + spin_lock_irqsave(&cputimer->lock, flags); + cputimer->running = 0; + spin_unlock_irqrestore(&cputimer->lock, flags); +} + /* * Check for any per-thread CPU timers that have fired and move them * off the tsk->*_timers list onto the firing list. Per-thread timers @@ -1427,7 +1414,6 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, struct list_head *head; BUG_ON(clock_idx == CPUCLOCK_SCHED); - start_process_timers(tsk); cpu_timer_sample_group(clock_idx, tsk, &now); if (oldval) { From 4da94d49b2ecb0a26e716a8811c3ecc542c2a65d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Feb 2009 11:30:27 +0100 Subject: [PATCH 077/105] timers: fix TIMER_ABSTIME for process wide cpu timers The POSIX timer interface allows for absolute time expiry values through the TIMER_ABSTIME flag, therefore we have to synchronize the timer to the clock every time we start it. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 13 +------------ kernel/posix-cpu-timers.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5d10fa0b6002..8981e52c714f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2201,18 +2201,7 @@ static inline int spin_needbreak(spinlock_t *lock) * Thread group CPU time accounting. */ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); - -static inline -void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) -{ - struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - unsigned long flags; - - spin_lock_irqsave(&cputimer->lock, flags); - cputimer->running = 1; - *times = cputimer->cputime; - spin_unlock_irqrestore(&cputimer->lock, flags); -} +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); static inline void thread_group_cputime_init(struct signal_struct *sig) { diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index e5d7bfdfa7d4..2313a4cc14ea 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -261,6 +261,40 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) rcu_read_unlock(); } +static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) +{ + if (cputime_gt(b->utime, a->utime)) + a->utime = b->utime; + + if (cputime_gt(b->stime, a->stime)) + a->stime = b->stime; + + if (b->sum_exec_runtime > a->sum_exec_runtime) + a->sum_exec_runtime = b->sum_exec_runtime; +} + +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct task_cputime sum; + unsigned long flags; + + spin_lock_irqsave(&cputimer->lock, flags); + if (!cputimer->running) { + cputimer->running = 1; + /* + * The POSIX timer interface allows for absolute time expiry + * values through the TIMER_ABSTIME flag, therefore we have + * to synchronize the timer to the clock every time we start + * it. + */ + thread_group_cputime(tsk, &sum); + update_gt_cputime(&cputimer->cputime, &sum); + } + *times = cputimer->cputime; + spin_unlock_irqrestore(&cputimer->lock, flags); +} + /* * Sample a process (thread group) clock for the given group_leader task. * Must be called with tasklist_lock held for reading. From fc631c82e1734d718ff0832558f64c8f5d185f26 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Feb 2009 14:27:17 +0100 Subject: [PATCH 078/105] sched: revert recent sync wakeup changes Intel reported a 10% regression (mysql+sysbench) on a 16-way machine with these patches: 1596e29: sched: symmetric sync vs avg_overlap d942fb6: sched: fix sync wakeups Revert them. Reported-by: "Zhang, Yanmin" Bisected-by: Lin Ming Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ---------- kernel/sched_fair.c | 11 +++++++++-- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index e1fc67d0674c..f11c02b86c73 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2266,16 +2266,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (!sched_feat(SYNC_WAKEUPS)) sync = 0; - if (!sync) { - if (current->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost) - sync = 1; - } else { - if (current->se.avg_overlap >= sysctl_sched_migration_cost || - p->se.avg_overlap >= sysctl_sched_migration_cost) - sync = 0; - } - #ifdef CONFIG_SMP if (sched_feat(LB_WAKEUP_UPDATE)) { struct sched_domain *sd; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a7e50ba185ac..0566f2a03c42 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1191,15 +1191,20 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, int idx, unsigned long load, unsigned long this_load, unsigned int imbalance) { + struct task_struct *curr = this_rq->curr; + struct task_group *tg; unsigned long tl = this_load; unsigned long tl_per_task; - struct task_group *tg; unsigned long weight; int balanced; if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) return 0; + if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || + p->se.avg_overlap > sysctl_sched_migration_cost)) + sync = 0; + /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load @@ -1426,7 +1431,9 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) if (!sched_feat(WAKEUP_PREEMPT)) return; - if (sched_feat(WAKEUP_OVERLAP) && sync) { + if (sched_feat(WAKEUP_OVERLAP) && (sync || + (se->avg_overlap < sysctl_sched_migration_cost && + pse->avg_overlap < sysctl_sched_migration_cost))) { resched_task(curr); return; } From 17c9d12e126cb0de8d535dc1908c4819d712bc68 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 11 Feb 2009 16:34:16 +0000 Subject: [PATCH 079/105] Do not account for hugetlbfs quota at mmap() time if mapping [SHM|MAP]_NORESERVE Commit 5a6fe125950676015f5108fb71b2a67441755003 brought hugetlbfs more in line with the core VM by obeying VM_NORESERVE and not reserving hugepages for both shared and private mappings when [SHM|MAP]_NORESERVE are specified. However, it is still taking filesystem quota unconditionally. At fault time, if there are no reserves and attempt is made to allocate the page and account for filesystem quota. If either fail, the fault fails. The impact is that quota is getting accounted for twice. This patch partially reverts 5a6fe125950676015f5108fb71b2a67441755003. To help prevent this mistake happening again, it improves the documentation of hugetlb_reserve_pages() Reported-by: Andy Whitcroft Signed-off-by: Mel Gorman Acked-by: Andy Whitcroft Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 69 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 28 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 207464209546..107da3d809a8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2272,9 +2272,17 @@ int hugetlb_reserve_pages(struct inode *inode, struct vm_area_struct *vma, int acctflag) { - long ret = 0, chg; + long ret, chg; struct hstate *h = hstate_inode(inode); + /* + * Only apply hugepage reservation if asked. At fault time, an + * attempt will be made for VM_NORESERVE to allocate a page + * and filesystem quota without using reserves + */ + if (acctflag & VM_NORESERVE) + return 0; + /* * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need @@ -2283,42 +2291,47 @@ int hugetlb_reserve_pages(struct inode *inode, */ if (!vma || vma->vm_flags & VM_SHARED) chg = region_chg(&inode->i_mapping->private_list, from, to); - else - chg = to - from; - - if (chg < 0) - return chg; - - if (hugetlb_get_quota(inode->i_mapping, chg)) - return -ENOSPC; - - /* - * Only apply hugepage reservation if asked. We still have to - * take the filesystem quota because it is an upper limit - * defined for the mount and not necessarily memory as a whole - */ - if (acctflag & VM_NORESERVE) { - reset_vma_resv_huge_pages(vma); - return 0; - } - - ret = hugetlb_acct_memory(h, chg); - if (ret < 0) { - hugetlb_put_quota(inode->i_mapping, chg); - return ret; - } - if (!vma || vma->vm_flags & VM_SHARED) - region_add(&inode->i_mapping->private_list, from, to); else { struct resv_map *resv_map = resv_map_alloc(); - if (!resv_map) return -ENOMEM; + chg = to - from; + set_vma_resv_map(vma, resv_map); set_vma_resv_flags(vma, HPAGE_RESV_OWNER); } + if (chg < 0) + return chg; + + /* There must be enough filesystem quota for the mapping */ + if (hugetlb_get_quota(inode->i_mapping, chg)) + return -ENOSPC; + + /* + * Check enough hugepages are available for the reservation. + * Hand back the quota if there are not + */ + ret = hugetlb_acct_memory(h, chg); + if (ret < 0) { + hugetlb_put_quota(inode->i_mapping, chg); + return ret; + } + + /* + * Account for the reservations made. Shared mappings record regions + * that have reservations as they are shared by multiple VMAs. + * When the last VMA disappears, the region map says how much + * the reservation was and the page cache tells how much of + * the reservation was consumed. Private mappings are per-VMA and + * only the consumed reservations are tracked. When the VMA + * disappears, the original reservation is the VMA size and the + * consumed reservations are stored in the map. Hence, nothing + * else has to be done for private mappings here + */ + if (!vma || vma->vm_flags & VM_SHARED) + region_add(&inode->i_mapping->private_list, from, to); return 0; } From 1001c9fb8721ab395e21f571ed2aaa523cdd1e29 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Wed, 11 Feb 2009 13:04:18 -0800 Subject: [PATCH 080/105] migration: migrate_vmas should check "vma" migrate_vmas() should check "vma" not "vma->vm_next" for for-loop condition. Signed-off-by: Daisuke Nishimura Cc: Christoph Lameter Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index 2bb4e1d63520..a9eff3f092f6 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1129,7 +1129,7 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, struct vm_area_struct *vma; int err = 0; - for(vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) { + for (vma = mm->mmap; vma && !err; vma = vma->vm_next) { if (vma->vm_ops && vma->vm_ops->migrate) { err = vma->vm_ops->migrate(vma, to, from, flags); if (err) From 57f63bc8fe79e6598e7253f10f53f58c9fdc57be Mon Sep 17 00:00:00 2001 From: Robert Jarzmik Date: Wed, 11 Feb 2009 13:04:19 -0800 Subject: [PATCH 081/105] rtc: update maintainership of pxa rtc driver Signed-off-by: Robert Jarzmik Signed-off-by: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 6 ++++++ drivers/rtc/rtc-pxa.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 0ea3a6d98714..6d5f728f989a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3537,6 +3537,12 @@ S: Maintained PXA MMCI DRIVER S: Orphan +PXA RTC DRIVER +P: Robert Jarzmik +M: robert.jarzmik@free.fr +L: rtc-linux@googlegroups.com +S: Maintained + QLOGIC QLA2XXX FC-SCSI DRIVER P: Andrew Vasquez M: linux-driver@qlogic.com diff --git a/drivers/rtc/rtc-pxa.c b/drivers/rtc/rtc-pxa.c index bd56a033bfd0..bb8cc05605ac 100644 --- a/drivers/rtc/rtc-pxa.c +++ b/drivers/rtc/rtc-pxa.c @@ -485,7 +485,7 @@ static void __exit pxa_rtc_exit(void) module_init(pxa_rtc_init); module_exit(pxa_rtc_exit); -MODULE_AUTHOR("Robert Jarzmik"); +MODULE_AUTHOR("Robert Jarzmik "); MODULE_DESCRIPTION("PXA27x/PXA3xx Realtime Clock Driver (RTC)"); MODULE_LICENSE("GPL"); MODULE_ALIAS("platform:pxa-rtc"); From 067f1293cc5916f8d88b602beeb8787d58515608 Mon Sep 17 00:00:00 2001 From: Marco La Porta Date: Wed, 11 Feb 2009 13:04:20 -0800 Subject: [PATCH 082/105] lxfb: properly alloc cmap in all cases and don't leak the memory We weren't properly allocating the cmap for depths greater than 8bpp, which caused pain for things like DirectFB. Also, we never freed the cmap memory upon module unload.. [dilinger@debian.org: dropped unnecessary code and clean up patch] [dilinger@debian.org: add error checking and handling] Signed-off-by: Andres Salomon Cc: Jordan Crouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/geode/lxfb_core.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/video/geode/lxfb_core.c b/drivers/video/geode/lxfb_core.c index b965ecdbc604..889cbe39e580 100644 --- a/drivers/video/geode/lxfb_core.c +++ b/drivers/video/geode/lxfb_core.c @@ -278,13 +278,10 @@ static int lxfb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) static int lxfb_set_par(struct fb_info *info) { - if (info->var.bits_per_pixel > 8) { + if (info->var.bits_per_pixel > 8) info->fix.visual = FB_VISUAL_TRUECOLOR; - fb_dealloc_cmap(&info->cmap); - } else { + else info->fix.visual = FB_VISUAL_PSEUDOCOLOR; - fb_alloc_cmap(&info->cmap, 1<var.bits_per_pixel, 0); - } info->fix.line_length = lx_get_pitch(info->var.xres, info->var.bits_per_pixel); @@ -451,6 +448,11 @@ static struct fb_info * __init lxfb_init_fbinfo(struct device *dev) info->pseudo_palette = (void *)par + sizeof(struct lxfb_par); + if (fb_alloc_cmap(&info->cmap, 256, 0) < 0) { + framebuffer_release(info); + return NULL; + } + info->var.grayscale = 0; return info; @@ -579,8 +581,10 @@ static int __init lxfb_probe(struct pci_dev *pdev, pci_release_region(pdev, 3); } - if (info) + if (info) { + fb_dealloc_cmap(&info->cmap); framebuffer_release(info); + } return ret; } @@ -604,6 +608,7 @@ static void lxfb_remove(struct pci_dev *pdev) iounmap(par->vp_regs); pci_release_region(pdev, 3); + fb_dealloc_cmap(&info->cmap); pci_set_drvdata(pdev, NULL); framebuffer_release(info); } From b14caecdbe7730bf82c8510f1ba52e00273e15c4 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Wed, 11 Feb 2009 13:04:22 -0800 Subject: [PATCH 083/105] gxfb: properly alloc cmap and plug cmap leak We weren't properly allocating the cmap for depths greater than 8bpp, which caused pain for things like DirectFB. Also, we never freed the cmap memory upon module unload.. Signed-off-by: Andres Salomon Cc: Marco La Porta Cc: Jordan Crouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/geode/gxfb_core.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/video/geode/gxfb_core.c b/drivers/video/geode/gxfb_core.c index 484118926318..2552cac39e1c 100644 --- a/drivers/video/geode/gxfb_core.c +++ b/drivers/video/geode/gxfb_core.c @@ -171,13 +171,10 @@ static int gxfb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) static int gxfb_set_par(struct fb_info *info) { - if (info->var.bits_per_pixel > 8) { + if (info->var.bits_per_pixel > 8) info->fix.visual = FB_VISUAL_TRUECOLOR; - fb_dealloc_cmap(&info->cmap); - } else { + else info->fix.visual = FB_VISUAL_PSEUDOCOLOR; - fb_alloc_cmap(&info->cmap, 1<var.bits_per_pixel, 0); - } info->fix.line_length = gx_line_delta(info->var.xres, info->var.bits_per_pixel); @@ -331,6 +328,11 @@ static struct fb_info * __init gxfb_init_fbinfo(struct device *dev) info->var.grayscale = 0; + if (fb_alloc_cmap(&info->cmap, 256, 0) < 0) { + framebuffer_release(info); + return NULL; + } + return info; } @@ -443,8 +445,10 @@ static int __init gxfb_probe(struct pci_dev *pdev, const struct pci_device_id *i pci_release_region(pdev, 1); } - if (info) + if (info) { + fb_dealloc_cmap(&info->cmap); framebuffer_release(info); + } return ret; } @@ -467,6 +471,7 @@ static void gxfb_remove(struct pci_dev *pdev) iounmap(par->gp_regs); pci_release_region(pdev, 1); + fb_dealloc_cmap(&info->cmap); pci_set_drvdata(pdev, NULL); framebuffer_release(info); From 35887b1cf74dc751dd0574b26515142d3cea9376 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Wed, 11 Feb 2009 13:04:23 -0800 Subject: [PATCH 084/105] gx1fb: properly alloc cmap and plug cmap leak We weren't properly allocating the cmap for depths greater than 8bpp, which caused pain for things like DirectFB. Also, we never freed the cmap memory upon module unload.. Signed-off-by: Andres Salomon Cc: Marco La Porta Cc: Jordan Crouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/geode/gx1fb_core.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/video/geode/gx1fb_core.c b/drivers/video/geode/gx1fb_core.c index 751e491ca8c8..f20eff8c4a81 100644 --- a/drivers/video/geode/gx1fb_core.c +++ b/drivers/video/geode/gx1fb_core.c @@ -136,13 +136,10 @@ static int gx1fb_set_par(struct fb_info *info) { struct geodefb_par *par = info->par; - if (info->var.bits_per_pixel == 16) { + if (info->var.bits_per_pixel == 16) info->fix.visual = FB_VISUAL_TRUECOLOR; - fb_dealloc_cmap(&info->cmap); - } else { + else info->fix.visual = FB_VISUAL_PSEUDOCOLOR; - fb_alloc_cmap(&info->cmap, 1<var.bits_per_pixel, 0); - } info->fix.line_length = gx1_line_delta(info->var.xres, info->var.bits_per_pixel); @@ -315,6 +312,10 @@ static struct fb_info * __init gx1fb_init_fbinfo(struct device *dev) if (!par->panel_x) par->enable_crt = 1; /* fall back to CRT if no panel is specified */ + if (fb_alloc_cmap(&info->cmap, 256, 0) < 0) { + framebuffer_release(info); + return NULL; + } return info; } @@ -374,8 +375,11 @@ static int __init gx1fb_probe(struct pci_dev *pdev, const struct pci_device_id * release_mem_region(gx1_gx_base() + 0x8300, 0x100); } - if (info) + if (info) { + fb_dealloc_cmap(&info->cmap); framebuffer_release(info); + } + return ret; } @@ -395,6 +399,7 @@ static void gx1fb_remove(struct pci_dev *pdev) iounmap(par->dc_regs); release_mem_region(gx1_gx_base() + 0x8300, 0x100); + fb_dealloc_cmap(&info->cmap); pci_set_drvdata(pdev, NULL); framebuffer_release(info); From fc3501d411d34823fb9be248a95a0c44f945866f Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Wed, 11 Feb 2009 13:04:23 -0800 Subject: [PATCH 085/105] mm: fix dirty_bytes/dirty_background_bytes sysctls on 64bit arches We need to pass an unsigned long as the minimum, because it gets casted to an unsigned long in the sysctl handler. If we pass an int, we'll access four more bytes on 64bit arches, resulting in a random minimum value. [rientjes@google.com: fix type of `old_bytes'] Signed-off-by: Sven Wegener Cc: Peter Zijlstra Cc: Dave Chinner Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 5 +++-- mm/page-writeback.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 790f9d785663..c5ef44ff850f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -101,6 +101,7 @@ static int two = 2; static int zero; static int one = 1; +static unsigned long one_ul = 1; static int one_hundred = 100; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ @@ -974,7 +975,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &dirty_background_bytes_handler, .strategy = &sysctl_intvec, - .extra1 = &one, + .extra1 = &one_ul, }, { .ctl_name = VM_DIRTY_RATIO, @@ -995,7 +996,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &dirty_bytes_handler, .strategy = &sysctl_intvec, - .extra1 = &one, + .extra1 = &one_ul, }, { .procname = "dirty_writeback_centisecs", diff --git a/mm/page-writeback.c b/mm/page-writeback.c index dc32dae01e5f..c17005e73974 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -209,7 +209,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - int old_bytes = vm_dirty_bytes; + unsigned long old_bytes = vm_dirty_bytes; int ret; ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); From 8fe4cd0dc5ea43760c59eb256404188272cc95dd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 11 Feb 2009 13:04:25 -0800 Subject: [PATCH 086/105] jbd: fix return value of journal_start_commit() journal_start_commit() returns 1 if either a transaction is committing or the function has queued a transaction commit. But it returns 0 if we raced with somebody queueing the transaction commit as well. This resulted in ext3_sync_fs() not functioning correctly (description from Arthur Jones): In the case of a data=ordered umount with pending long symlinks which are delayed due to a long list of other I/O on the backing block device, this causes the buffer associated with the long symlinks to not be moved to the inode dirty list in the second phase of fsync_super. Then, before they can be dirtied again, kjournald exits, seeing the UMOUNT flag and the dirty pages are never written to the backing block device, causing long symlink corruption and exposing new or previously freed block data to userspace. This can be reproduced with a script created by Eric Sandeen : #!/bin/bash umount /mnt/test2 mount /dev/sdb4 /mnt/test2 rm -f /mnt/test2/* dd if=/dev/zero of=/mnt/test2/bigfile bs=1M count=512 touch /mnt/test2/thisisveryveryveryveryveryveryveryveryveryveryveryveryveryveryveryverylongfilename ln -s /mnt/test2/thisisveryveryveryveryveryveryveryveryveryveryveryveryveryveryveryverylongfilename /mnt/test2/link umount /mnt/test2 mount /dev/sdb4 /mnt/test2 ls /mnt/test2/ This patch fixes journal_start_commit() to always return 1 when there's a transaction committing or queued for commit. Cc: Eric Sandeen Cc: Mike Snitzer Cc: Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/journal.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 9e4fa52d7dc8..e79c07812afa 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -427,7 +427,7 @@ int __log_space_left(journal_t *journal) } /* - * Called under j_state_lock. Returns true if a transaction was started. + * Called under j_state_lock. Returns true if a transaction commit was started. */ int __log_start_commit(journal_t *journal, tid_t target) { @@ -495,7 +495,8 @@ int journal_force_commit_nested(journal_t *journal) /* * Start a commit of the current running transaction (if any). Returns true - * if a transaction was started, and fills its tid in at *ptid + * if a transaction is going to be committed (or is currently already + * committing), and fills its tid in at *ptid */ int journal_start_commit(journal_t *journal, tid_t *ptid) { @@ -505,15 +506,19 @@ int journal_start_commit(journal_t *journal, tid_t *ptid) if (journal->j_running_transaction) { tid_t tid = journal->j_running_transaction->t_tid; - ret = __log_start_commit(journal, tid); - if (ret && ptid) + __log_start_commit(journal, tid); + /* There's a running transaction and we've just made sure + * it's commit has been scheduled. */ + if (ptid) *ptid = tid; - } else if (journal->j_committing_transaction && ptid) { + ret = 1; + } else if (journal->j_committing_transaction) { /* * If ext3_write_super() recently started a commit, then we * have to wait for completion of that transaction */ - *ptid = journal->j_committing_transaction->t_tid; + if (ptid) + *ptid = journal->j_committing_transaction->t_tid; ret = 1; } spin_unlock(&journal->j_state_lock); From 02ac597c9b86af49b2016aa98aee20ab59dbf0d2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 11 Feb 2009 13:04:26 -0800 Subject: [PATCH 087/105] ext3: revert "ext3: wait on all pending commits in ext3_sync_fs" This reverts commit c87591b719737b4e91eb1a9fa8fd55a4ff1886d6. Since journal_start_commit() is now fixed to return 1 when we started a transaction commit, there's some transaction waiting to be committed or there's a transaction already committing, we don't need to call ext3_force_commit() in ext3_sync_fs(). Furthermore ext3_force_commit() can unnecessarily create sync transaction which is expensive so it's worthwhile to remove it when we can. Cc: Eric Sandeen Cc: Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/super.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/ext3/super.c b/fs/ext3/super.c index b70d90e08a3c..4a970411a458 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2428,12 +2428,13 @@ static void ext3_write_super (struct super_block * sb) static int ext3_sync_fs(struct super_block *sb, int wait) { - sb->s_dirt = 0; - if (wait) - ext3_force_commit(sb); - else - journal_start_commit(EXT3_SB(sb)->s_journal, NULL); + tid_t target; + sb->s_dirt = 0; + if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { + if (wait) + log_wait_commit(EXT3_SB(sb)->s_journal, target); + } return 0; } From 508b9f8efdad123b202b228f71f59feba51e4fb5 Mon Sep 17 00:00:00 2001 From: MinChan Kim Date: Wed, 11 Feb 2009 13:04:27 -0800 Subject: [PATCH 088/105] mm: fix mlocked page counter mismatch When I tested following program, I found that the mlocked counter is strange. It cannot free some mlocked pages. It is because try_to_unmap_file() doesn't check real page mappings in vmas. That is because the goal of an address_space for a file is to find all processes into which the file's specific interval is mapped. It is related to the file's interval, not to pages. Even if the page isn't really mapped by the vma, it returns SWAP_MLOCK since the vma has VM_LOCKED, then calls try_to_mlock_page. After this the mlocked counter is increased again. COWed anon page in a file-backed vma could be a such case. This patch resolves it. -- my test program -- int main() { mlockall(MCL_CURRENT); return 0; } -- before -- root@barrios-target-linux:~# cat /proc/meminfo | egrep 'Mlo|Unev' Unevictable: 0 kB Mlocked: 0 kB -- after -- root@barrios-target-linux:~# cat /proc/meminfo | egrep 'Mlo|Unev' Unevictable: 8 kB Mlocked: 8 kB Signed-off-by: MinChan Kim Acked-by: Lee Schermerhorn Acked-by: KOSAKI Motohiro Tested-by: Lee Schermerhorn Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/rmap.c b/mm/rmap.c index ac4af8cffbf9..16521664010d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1072,7 +1072,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { if (MLOCK_PAGES && unlikely(unlock)) { - if (!(vma->vm_flags & VM_LOCKED)) + if (!((vma->vm_flags & VM_LOCKED) && + page_mapped_in_vma(page, vma))) continue; /* must visit all vmas */ ret = SWAP_MLOCK; } else { From 7dcce1334fa5879dc12bee001962e8f74bce60f1 Mon Sep 17 00:00:00 2001 From: Marcel Selhorst Date: Wed, 11 Feb 2009 13:04:27 -0800 Subject: [PATCH 089/105] tpm: correct email address for tpm_infineon-driver Update my email address. Signed-off-by: Marcel Selhorst Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 4 ++-- drivers/char/tpm/tpm_infineon.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 6d5f728f989a..8ee281518c77 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4291,8 +4291,8 @@ P: Rajiv Andrade M: srajiv@linux.vnet.ibm.com W: http://tpmdd.sourceforge.net P: Marcel Selhorst -M: tpm@selhorst.net -W: http://www.prosec.rub.de/tpm/ +M: m.selhorst@sirrix.com +W: http://www.sirrix.com L: tpmdd-devel@lists.sourceforge.net (moderated for non-subscribers) S: Maintained diff --git a/drivers/char/tpm/tpm_infineon.c b/drivers/char/tpm/tpm_infineon.c index 726ee8a0277f..ecba4942fc8e 100644 --- a/drivers/char/tpm/tpm_infineon.c +++ b/drivers/char/tpm/tpm_infineon.c @@ -4,7 +4,7 @@ * SLD 9630 TT 1.1 and SLB 9635 TT 1.2 Trusted Platform Module * Specifications at www.trustedcomputinggroup.org * - * Copyright (C) 2005, Marcel Selhorst + * Copyright (C) 2005, Marcel Selhorst * Sirrix AG - security technologies, http://www.sirrix.com and * Applied Data Security Group, Ruhr-University Bochum, Germany * Project-Homepage: http://www.prosec.rub.de/tpm @@ -636,7 +636,7 @@ static void __exit cleanup_inf(void) module_init(init_inf); module_exit(cleanup_inf); -MODULE_AUTHOR("Marcel Selhorst "); +MODULE_AUTHOR("Marcel Selhorst "); MODULE_DESCRIPTION("Driver for Infineon TPM SLD 9630 TT 1.1 / SLB 9635 TT 1.2"); MODULE_VERSION("1.9"); MODULE_LICENSE("GPL"); From d4097456cd1d9285e876fc5d08a789462804cc28 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-Koenig Date: Wed, 11 Feb 2009 13:04:28 -0800 Subject: [PATCH 090/105] video/framebuffer: move the probe func into .devinit.text in Blackfin LCD driver Signed-off-by: Uwe Kleine-Koenig Signed-off-by: Mike Frysinger Signed-off-by: Bryan Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/bfin-t350mcqb-fb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/bfin-t350mcqb-fb.c b/drivers/video/bfin-t350mcqb-fb.c index 2a423d3a2a8e..90cfddabf1f7 100644 --- a/drivers/video/bfin-t350mcqb-fb.c +++ b/drivers/video/bfin-t350mcqb-fb.c @@ -447,7 +447,7 @@ static irqreturn_t bfin_t350mcqb_irq_error(int irq, void *dev_id) return IRQ_HANDLED; } -static int __init bfin_t350mcqb_probe(struct platform_device *pdev) +static int __devinit bfin_t350mcqb_probe(struct platform_device *pdev) { struct bfin_t350mcqbfb_info *info; struct fb_info *fbinfo; From 2e9c23724328ae4e56c42a35a717a956d7d3001d Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 11 Feb 2009 13:04:29 -0800 Subject: [PATCH 091/105] memcg: use __GFP_NOWARN in page cgroup allocation page_cgroup's page allocation at init/memory hotplug uses kmalloc() and vmalloc(). If kmalloc() failes, vmalloc() is used. This is because vmalloc() is very limited resource on 32bit systems. We want to use kmalloc() first. But in this kind of call, __GFP_NOWARN should be specified. Reported-by: Heiko Carstens Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Acked-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 7006a11350c8..ceecfbb143fa 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -114,7 +114,8 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) nid = page_to_nid(pfn_to_page(pfn)); table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; if (slab_is_available()) { - base = kmalloc_node(table_size, GFP_KERNEL, nid); + base = kmalloc_node(table_size, + GFP_KERNEL | __GFP_NOWARN, nid); if (!base) base = vmalloc_node(table_size, nid); } else { From f40b45a2e45b0f02aeedfcfbb28d8e2d4b8b86b1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 11 Feb 2009 13:04:31 -0800 Subject: [PATCH 092/105] kernel-doc: preferred ending marker and examples Fix kernel-doc-nano-HOWTO.txt to use */ as the ending marker in kernel-doc examples and state that */ is the preferred ending marker. Signed-off-by: Randy Dunlap Reported-by: Robert Love Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-doc-nano-HOWTO.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Documentation/kernel-doc-nano-HOWTO.txt b/Documentation/kernel-doc-nano-HOWTO.txt index d73fbd2b2b45..026ec7d57384 100644 --- a/Documentation/kernel-doc-nano-HOWTO.txt +++ b/Documentation/kernel-doc-nano-HOWTO.txt @@ -43,7 +43,8 @@ Only comments so marked will be considered by the kernel-doc scripts, and any comment so marked must be in kernel-doc format. Do not use "/**" to be begin a comment block unless the comment block contains kernel-doc formatted comments. The closing comment marker for -kernel-doc comments can be either "*/" or "**/". +kernel-doc comments can be either "*/" or "**/", but "*/" is +preferred in the Linux kernel tree. Kernel-doc comments should be placed just before the function or data structure being described. @@ -63,7 +64,7 @@ Example kernel-doc function comment: * comment lines. * * The longer description can have multiple paragraphs. - **/ + */ The first line, with the short description, must be on a single line. @@ -85,7 +86,7 @@ Example kernel-doc data structure comment. * perhaps with more lines and words. * * Longer description of this structure. - **/ + */ The kernel-doc function comments describe each parameter to the function, in order, with the @name lines. From b4870bc5ee8c7a37541a3eb1208b5c76c13a078a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 11 Feb 2009 13:04:33 -0800 Subject: [PATCH 093/105] kernel-doc: fix syscall wrapper processing Fix kernel-doc processing of SYSCALL wrappers. The SYSCALL wrapper patches played havoc with kernel-doc for syscalls. Syscalls that were scanned for DocBook processing reported warnings like this one, for sys_tgkill: Warning(kernel/signal.c:2285): No description found for parameter 'tgkill' Warning(kernel/signal.c:2285): No description found for parameter 'pid_t' Warning(kernel/signal.c:2285): No description found for parameter 'int' because the macro parameters all "look like" function parameters, although they are not: /** * sys_tgkill - send signal to one specific thread * @tgid: the thread group ID of the thread * @pid: the PID of the thread * @sig: signal to be sent * * This syscall also checks the @tgid and returns -ESRCH even if the PID * exists but it's not belonging to the target process anymore. This * method solves the problem of threads exiting and PIDs getting reused. */ SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig) { ... This patch special-cases the handling SYSCALL_DEFINE* function prototypes by expanding them to long sys_foobar(type1 arg1, type1 arg2, ...) Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/kernel-doc | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index 8bb83a100edb..0f11870116dc 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -1827,6 +1827,40 @@ sub reset_state { $state = 0; } +sub syscall_munge() { + my $void = 0; + + $prototype =~ s@[\r\n\t]+@ @gos; # strip newlines/CR's/tabs +## if ($prototype =~ m/SYSCALL_DEFINE0\s*\(\s*(a-zA-Z0-9_)*\s*\)/) { + if ($prototype =~ m/SYSCALL_DEFINE0/) { + $void = 1; +## $prototype = "long sys_$1(void)"; + } + + $prototype =~ s/SYSCALL_DEFINE.*\(/long sys_/; # fix return type & func name + if ($prototype =~ m/long (sys_.*?),/) { + $prototype =~ s/,/\(/; + } elsif ($void) { + $prototype =~ s/\)/\(void\)/; + } + + # now delete all of the odd-number commas in $prototype + # so that arg types & arg names don't have a comma between them + my $count = 0; + my $len = length($prototype); + if ($void) { + $len = 0; # skip the for-loop + } + for (my $ix = 0; $ix < $len; $ix++) { + if (substr($prototype, $ix, 1) eq ',') { + $count++; + if ($count % 2 == 1) { + substr($prototype, $ix, 1) = ' '; + } + } + } +} + sub process_state3_function($$) { my $x = shift; my $file = shift; @@ -1839,11 +1873,15 @@ sub process_state3_function($$) { elsif ($x =~ /([^\{]*)/) { $prototype .= $1; } + if (($x =~ /\{/) || ($x =~ /\#\s*define/) || ($x =~ /;/)) { $prototype =~ s@/\*.*?\*/@@gos; # strip comments. $prototype =~ s@[\r\n]+@ @gos; # strip newlines/cr's. $prototype =~ s@^\s+@@gos; # strip leading spaces - dump_function($prototype,$file); + if ($prototype =~ /SYSCALL_DEFINE/) { + syscall_munge(); + } + dump_function($prototype, $file); reset_state(); } } From c318c7ac49f9139f55da619bbace6137e1509390 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Wed, 11 Feb 2009 13:04:34 -0800 Subject: [PATCH 094/105] rtc: t reaches -1, tested 0 With a postfix decrement t will reach -1 rather than 0, so neither the warning nor the `goto error_out' will occur. Signed-off-by: Roel Kluin Acked-by: Manuel Lauss Acked-by: Alessandro Zummo Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-au1xxx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-au1xxx.c b/drivers/rtc/rtc-au1xxx.c index 8906a688e6a6..979ed0406ce9 100644 --- a/drivers/rtc/rtc-au1xxx.c +++ b/drivers/rtc/rtc-au1xxx.c @@ -81,7 +81,7 @@ static int __devinit au1xtoy_rtc_probe(struct platform_device *pdev) if (au_readl(SYS_TOYTRIM) != 32767) { /* wait until hardware gives access to TRIM register */ t = 0x00100000; - while ((au_readl(SYS_COUNTER_CNTRL) & SYS_CNTRL_T0S) && t--) + while ((au_readl(SYS_COUNTER_CNTRL) & SYS_CNTRL_T0S) && --t) msleep(1); if (!t) { From 01c4a4283137d24c9cc3785f1f312e895a18f273 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 11 Feb 2009 13:04:35 -0800 Subject: [PATCH 095/105] cgroups: add Li Zefan as a maintainer Add Li Zefan as co-maintainer. Acked-by: Paul Menage Acked-by: Li Zefan Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8ee281518c77..db65b4e6d132 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1202,6 +1202,8 @@ S: Supported CONTROL GROUPS (CGROUPS) P: Paul Menage M: menage@google.com +P: Li Zefan +M: lizf@cn.fujitsu.com L: containers@lists.linux-foundation.org S: Maintained From cfebe563bd0a3ff97e1bc167123120d59c7a84db Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 11 Feb 2009 13:04:36 -0800 Subject: [PATCH 096/105] cgroups: fix lockdep subclasses overflow I enabled all cgroup subsystems when compiling kernel, and then: # mount -t cgroup -o net_cls xxx /mnt # mkdir /mnt/0 This showed up immediately: BUG: MAX_LOCKDEP_SUBCLASSES too low! turning off the locking correctness validator. It's caused by the cgroup hierarchy lock: for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->root == root) mutex_lock_nested(&ss->hierarchy_mutex, i); } Now we have 9 cgroup subsystems, and the above 'i' for net_cls is 8, but MAX_LOCKDEP_SUBCLASSES is 8. This patch uses different lockdep keys for different subsystems. Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 1 + kernel/cgroup.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e4e8e117d27d..499900d0cee7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -378,6 +378,7 @@ struct cgroup_subsys { * - initiating hotplug events */ struct mutex hierarchy_mutex; + struct lock_class_key subsys_key; /* * Link to parent, and list entry in parent's children. diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5a54ff42874e..e14db9c089b9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2351,7 +2351,7 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root) for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->root == root) - mutex_lock_nested(&ss->hierarchy_mutex, i); + mutex_lock(&ss->hierarchy_mutex); } } @@ -2637,6 +2637,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(!list_empty(&init_task.tasks)); mutex_init(&ss->hierarchy_mutex); + lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); ss->active = 1; } From 0e4a9b59282914fe057ab17027f55123964bc2e2 Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 11 Feb 2009 13:04:37 -0800 Subject: [PATCH 097/105] ext2/xip: refuse to change xip flag during remount with busy inodes For a reason that I was unable to understand in three months of debugging, mount ext2 -o remount stopped working properly when remounting from regular operation to xip, or the other way around. According to a git bisect search, the problem was introduced with the VM_MIXEDMAP/PTE_SPECIAL rework in the vm: commit 70688e4dd1647f0ceb502bbd5964fa344c5eb411 Author: Nick Piggin Date: Mon Apr 28 02:13:02 2008 -0700 xip: support non-struct page backed memory In the failing scenario, the filesystem is mounted read only via root= kernel parameter on s390x. During remount (in rc.sysinit), the inodes of the bash binary and its libraries are busy and cannot be invalidated (the bash which is running rc.sysinit resides on subject filesystem). Afterwards, another bash process (running ifup-eth) recurses into a subshell, runs dup_mm (via fork). Some of the mappings in this bash process were created from inodes that could not be invalidated during remount. Both parent and child process crash some time later due to inconsistencies in their address spaces. The issue seems to be timing sensitive, various attempts to recreate it have failed. This patch refuses to change the xip flag during remount in case some inodes cannot be invalidated. This patch keeps users from running into that issue. [akpm@linux-foundation.org: cleanup] Signed-off-by: Carsten Otte Cc: Nick Piggin Cc: Jared Hulbert Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/super.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/ext2/super.c b/fs/ext2/super.c index da8bdeaa2e6d..7c6e3606f0ec 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1185,9 +1185,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) es = sbi->s_es; if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != (old_mount_opt & EXT2_MOUNT_XIP)) && - invalidate_inodes(sb)) - ext2_warning(sb, __func__, "busy inodes while remounting "\ - "xip remain in cache (no functional problem)"); + invalidate_inodes(sb)) { + ext2_warning(sb, __func__, "refusing change of xip flag " + "with busy inodes while remounting"); + sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; + sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; + } if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; if (*flags & MS_RDONLY) { From 6c5979631b4b03c9288776562c18036765e398c1 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 11 Feb 2009 13:04:38 -0800 Subject: [PATCH 098/105] syscall define: fix uml compile bug With the new system call defines we get this on uml: arch/um/sys-i386/built-in.o: In function `sys_call_table': (.rodata+0x308): undefined reference to `sys_sigprocmask' Reason for this is that uml passes the preprocessor option -Dsigprocmask=kernel_sigprocmask to gcc when compiling the kernel. This causes SYSCALL_DEFINE3(sigprocmask, ...) to be expanded to SYSCALL_DEFINEx(3, kernel_sigprocmask, ...) and finally to a system call named sys_kernel_sigprocmask. However sys_sigprocmask is missing because of this. To avoid macro expansion for the system call name just concatenate the name at first define instead of carrying it through severel levels. This was pointed out by Al Viro. Signed-off-by: Heiko Carstens Cc: Geert Uytterhoeven Cc: Al Viro Reviewed-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 0eda02ff2414..f9f900cfd066 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -95,13 +95,13 @@ struct old_linux_dirent; #define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__) #define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__) -#define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void) -#define SYSCALL_DEFINE1(...) SYSCALL_DEFINEx(1, __VA_ARGS__) -#define SYSCALL_DEFINE2(...) SYSCALL_DEFINEx(2, __VA_ARGS__) -#define SYSCALL_DEFINE3(...) SYSCALL_DEFINEx(3, __VA_ARGS__) -#define SYSCALL_DEFINE4(...) SYSCALL_DEFINEx(4, __VA_ARGS__) -#define SYSCALL_DEFINE5(...) SYSCALL_DEFINEx(5, __VA_ARGS__) -#define SYSCALL_DEFINE6(...) SYSCALL_DEFINEx(6, __VA_ARGS__) +#define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void) +#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) +#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__) +#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) +#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__) +#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__) +#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__) #ifdef CONFIG_PPC64 #define SYSCALL_ALIAS(alias, name) \ @@ -121,21 +121,21 @@ struct old_linux_dirent; #define SYSCALL_DEFINE(name) static inline long SYSC_##name #define SYSCALL_DEFINEx(x, name, ...) \ - asmlinkage long sys_##name(__SC_DECL##x(__VA_ARGS__)); \ - static inline long SYSC_##name(__SC_DECL##x(__VA_ARGS__)); \ - asmlinkage long SyS_##name(__SC_LONG##x(__VA_ARGS__)) \ + asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)); \ + static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)); \ + asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__)) \ { \ __SC_TEST##x(__VA_ARGS__); \ - return (long) SYSC_##name(__SC_CAST##x(__VA_ARGS__)); \ + return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__)); \ } \ - SYSCALL_ALIAS(sys_##name, SyS_##name); \ - static inline long SYSC_##name(__SC_DECL##x(__VA_ARGS__)) + SYSCALL_ALIAS(sys##name, SyS##name); \ + static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)) #else /* CONFIG_HAVE_SYSCALL_WRAPPERS */ #define SYSCALL_DEFINE(name) asmlinkage long sys_##name #define SYSCALL_DEFINEx(x, name, ...) \ - asmlinkage long sys_##name(__SC_DECL##x(__VA_ARGS__)) + asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)) #endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */ From 89e1219004b3657cc014521663eeef0744f1c99d Mon Sep 17 00:00:00 2001 From: Federico Cuello Date: Wed, 11 Feb 2009 13:04:39 -0800 Subject: [PATCH 099/105] writeback: fix break condition Commit dcf6a79dda5cc2a2bec183e50d829030c0972aaa ("write-back: fix nr_to_write counter") fixed nr_to_write counter, but didn't set the break condition properly. If nr_to_write == 0 after being decremented it will loop one more time before setting done = 1 and breaking the loop. [akpm@linux-foundation.org: coding-style fixes] Cc: Artem Bityutskiy Acked-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c17005e73974..6106a5c7ed44 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1051,20 +1051,23 @@ int write_cache_pages(struct address_space *mapping, } } - if (nr_to_write > 0) + if (nr_to_write > 0) { nr_to_write--; - else if (wbc->sync_mode == WB_SYNC_NONE) { - /* - * We stop writing back only if we are not - * doing integrity sync. In case of integrity - * sync we have to keep going because someone - * may be concurrently dirtying pages, and we - * might have synced a lot of newly appeared - * dirty pages, but have not synced all of the - * old dirty pages. - */ - done = 1; - break; + if (nr_to_write == 0 && + wbc->sync_mode == WB_SYNC_NONE) { + /* + * We stop writing back only if we are + * not doing integrity sync. In case of + * integrity sync we have to keep going + * because someone may be concurrently + * dirtying pages, and we might have + * synced a lot of newly appeared dirty + * pages, but have not synced all of the + * old dirty pages. + */ + done = 1; + break; + } } if (wbc->nonblocking && bdi_write_congested(bdi)) { From 3abdbf90a3ffb006108c831c56b092e35483b6ec Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 11 Feb 2009 13:04:40 -0800 Subject: [PATCH 100/105] parport: parport_serial, don't bind netmos ibm 0299 Since netmos 9835 with subids 0x1014(IBM):0x0299 is now bound with serial/8250_pci, because it has no parallel ports and subdevice id isn't in the expected form, return -ENODEV from probe function. This is performed in netmos preinit_hook. Signed-off-by: Jiri Slaby Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/parport/parport_serial.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/parport/parport_serial.c b/drivers/parport/parport_serial.c index 101ed49a2d15..032db815b0f9 100644 --- a/drivers/parport/parport_serial.c +++ b/drivers/parport/parport_serial.c @@ -64,6 +64,11 @@ struct parport_pc_pci { static int __devinit netmos_parallel_init(struct pci_dev *dev, struct parport_pc_pci *card, int autoirq, int autodma) { + /* the rule described below doesn't hold for this device */ + if (dev->device == PCI_DEVICE_ID_NETMOS_9835 && + dev->subsystem_vendor == PCI_VENDOR_ID_IBM && + dev->subsystem_device == 0x0299) + return -ENODEV; /* * Netmos uses the subdevice ID to indicate the number of parallel * and serial ports. The form is 0x00PS, where

is the number of From 9480c53e9b2aa13a06283ffb96bb8f1873ac4e9a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 11 Feb 2009 13:04:41 -0800 Subject: [PATCH 101/105] mm: rearrange exit_mmap() to unlock before arch_exit_mmap Christophe Saout reported [in precursor to: http://marc.info/?l=linux-kernel&m=123209902707347&w=4]: > Note that I also some a different issue with CONFIG_UNEVICTABLE_LRU. > Seems like Xen tears down current->mm early on process termination, so > that __get_user_pages in exit_mmap causes nasty messages when the > process had any mlocked pages. (in fact, it somehow manages to get into > the swapping code and produces a null pointer dereference trying to get > a swap token) Jeremy explained: Yes. In the normal case under Xen, an in-use pagetable is "pinned", meaning that it is RO to the kernel, and all updates must go via hypercall (or writes are trapped and emulated, which is much the same thing). An unpinned pagetable is not currently in use by any process, and can be directly accessed as normal RW pages. As an optimisation at process exit time, we unpin the pagetable as early as possible (switching the process to init_mm), so that all the normal pagetable teardown can happen with direct memory accesses. This happens in exit_mmap() -> arch_exit_mmap(). The munlocking happens a few lines below. The obvious thing to do would be to move arch_exit_mmap() to below the munlock code, but I think we'd want to call it even if mm->mmap is NULL, just to be on the safe side. Thus, this patch: exit_mmap() needs to unlock any locked vmas before calling arch_exit_mmap, as the latter may switch the current mm to init_mm, which would cause the former to fail. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Lee Schermerhorn Cc: Christophe Saout Cc: Keir Fraser Cc: Christophe Saout Cc: Alex Williamson Cc: [2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index eb1270bebe67..00ced3ee49a8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2084,12 +2084,8 @@ void exit_mmap(struct mm_struct *mm) unsigned long end; /* mm's last user has gone, and its about to be pulled down */ - arch_exit_mmap(mm); mmu_notifier_release(mm); - if (!mm->mmap) /* Can happen if dup_mmap() received an OOM */ - return; - if (mm->locked_vm) { vma = mm->mmap; while (vma) { @@ -2098,7 +2094,13 @@ void exit_mmap(struct mm_struct *mm) vma = vma->vm_next; } } + + arch_exit_mmap(mm); + vma = mm->mmap; + if (!vma) /* Can happen if dup_mmap() received an OOM */ + return; + lru_add_drain(); flush_cache_mm(mm); tlb = tlb_gather_mmu(mm, 1); From 4d48a542b42747c36a5937447d9c3de7c897ea50 Mon Sep 17 00:00:00 2001 From: Paul Clements Date: Wed, 11 Feb 2009 13:04:45 -0800 Subject: [PATCH 102/105] nbd: fix I/O hang on disconnected nbds Fix a problem that causes I/O to a disconnected (or partially initialized) nbd device to hang indefinitely. To reproduce: # ioctl NBD_SET_SIZE_BLOCKS /dev/nbd23 514048 # dd if=/dev/nbd23 of=/dev/null bs=4096 count=1 ...hangs... This can also occur when an nbd device loses its nbd-client/server connection. Although we clear the queue of any outstanding I/Os after the client/server connection fails, any additional I/Os that get queued later will hang. This bug may also be the problem reported in this bug report: http://bugzilla.kernel.org/show_bug.cgi?id=12277 Testing would need to be performed to determine if the two issues are the same. This problem was introduced by the new request handling thread code ("NBD: allow nbd to be used locally", 3/2008), which entered into mainline around 2.6.25. The fix, which is fairly simple, is to restore the check for lo->sock being NULL in do_nbd_request. This causes I/O to an uninitialized nbd to immediately fail with an I/O error, as it did prior to the introduction of this bug. Signed-off-by: Paul Clements Reported-by: Jon Nelson Acked-by: Pavel Machek Cc: [2.6.26.x, 2.6.27.x, 2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/nbd.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 34f80fa6fed1..8299e2d3b611 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -549,6 +549,15 @@ static void do_nbd_request(struct request_queue * q) BUG_ON(lo->magic != LO_MAGIC); + if (unlikely(!lo->sock)) { + printk(KERN_ERR "%s: Attempted send on closed socket\n", + lo->disk->disk_name); + req->errors++; + nbd_end_request(req); + spin_lock_irq(q->queue_lock); + continue; + } + spin_lock_irq(&lo->queue_lock); list_add_tail(&req->queuelist, &lo->waiting_queue); spin_unlock_irq(&lo->queue_lock); From 507e2fbaaacb6f164b4125b87c5002f95143174b Mon Sep 17 00:00:00 2001 From: Ian Dall Date: Wed, 11 Feb 2009 13:04:46 -0800 Subject: [PATCH 103/105] w1: w1 temp calculation overflow fix Addresses http://bugzilla.kernel.org/show_bug.cgi?id=12646 When the temperature exceeds 32767 milli-degrees the temperature overflows to -32768 millidegrees. These are bothe well within the -55 - +125 degree range for the sensor. Fix overflow in left-shift of a u8. Signed-off-by: Ian Dall Signed-off-by: Evgeniy Polyakov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/w1/slaves/w1_therm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/w1/slaves/w1_therm.c b/drivers/w1/slaves/w1_therm.c index 2c8dff9f77da..1ed3d554e372 100644 --- a/drivers/w1/slaves/w1_therm.c +++ b/drivers/w1/slaves/w1_therm.c @@ -115,7 +115,7 @@ static struct w1_therm_family_converter w1_therm_families[] = { static inline int w1_DS18B20_convert_temp(u8 rom[9]) { - s16 t = (rom[1] << 8) | rom[0]; + int t = ((s16)rom[1] << 8) | rom[0]; t = t*1000/16; return t; } From b1aabecd55931ee754f6a913969516b26a0e682e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2009 15:21:44 +0200 Subject: [PATCH 104/105] mm: Export symbol ksize() Commit 7b2cd92adc5430b0c1adeb120971852b4ea1ab08 ("crypto: api - Fix zeroing on free") added modular user of ksize(). Export that to fix crypto.ko compilation. Cc: Herbert Xu Signed-off-by: Kirill A. Shutemov Signed-off-by: Pekka Enberg --- mm/slab.c | 1 + mm/slob.c | 1 + mm/slub.c | 1 + 3 files changed, 3 insertions(+) diff --git a/mm/slab.c b/mm/slab.c index ddc41f337d58..4d00855629c4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4457,3 +4457,4 @@ size_t ksize(const void *objp) return obj_size(virt_to_cache(objp)); } +EXPORT_SYMBOL(ksize); diff --git a/mm/slob.c b/mm/slob.c index bf7e8fc3aed8..52bc8a2bd9ef 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -521,6 +521,7 @@ size_t ksize(const void *block) } else return sp->page.private; } +EXPORT_SYMBOL(ksize); struct kmem_cache { unsigned int size, align; diff --git a/mm/slub.c b/mm/slub.c index bdc9abb08a23..0280eee6cf37 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2736,6 +2736,7 @@ size_t ksize(const void *object) */ return s->size; } +EXPORT_SYMBOL(ksize); void kfree(const void *x) { From 3a4c6800f31ea8395628af5e7e490270ee5d0585 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 12 Feb 2009 04:34:23 +0100 Subject: [PATCH 105/105] Fix page writeback thinko, causing Berkeley DB slowdown A bug was introduced into write_cache_pages cyclic writeout by commit 31a12666d8f0c22235297e1c1575f82061480029 ("mm: write_cache_pages cyclic fix"). The intention (and comments) is that we should cycle back and look for more dirty pages at the beginning of the file if there is no more work to be done. But the !done condition was dropped from the test. This means that any time the page writeout loop breaks (eg. due to nr_to_write == 0), we will set index to 0, then goto again. This will set done_index to index, then find done is set, so will proceed to the end of the function. When updating mapping->writeback_index for cyclic writeout, we now use done_index == 0, so we're always cycling back to 0. This seemed to be causing random mmap writes (slapadd and iozone) to start writing more pages from the LRU and writeout would slowdown, and caused bugzilla entry http://bugzilla.kernel.org/show_bug.cgi?id=12604 about Berkeley DB slowing down dramatically. With this patch, iozone random write performance is increased nearly 5x on my system (iozone -B -r 4k -s 64k -s 512m -s 1200m on ext2). Signed-off-by: Nick Piggin Reported-and-tested-by: Jan Kara Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6106a5c7ed44..3c84128596ba 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1079,7 +1079,7 @@ int write_cache_pages(struct address_space *mapping, pagevec_release(&pvec); cond_resched(); } - if (!cycled) { + if (!cycled && !done) { /* * range_cyclic: * We hit the last page and there is more work to be done: wrap