From 6536e3123e5d3371a6f52e32a3d0694bcc987702 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 20 Jan 2012 14:33:53 -0800 Subject: [PATCH 01/16] mm: fix warnings regarding enum migrate_mode sparc64 allmodconfig: In file included from include/linux/compat.h:15, from /usr/src/25/arch/sparc/include/asm/siginfo.h:19, from include/linux/signal.h:5, from include/linux/sched.h:73, from arch/sparc/kernel/asm-offsets.c:13: include/linux/fs.h:618: warning: parameter has incomplete type It seems that my sparc64 compiler (gcc-3.4.5) doesn't like the forward declaration of enums. Fix this by moving the "enum migrate_mode" definition into its own header file. Acked-by: Mel Gorman Cc: Rik van Riel Cc: Andrea Arcangeli Cc: Minchan Kim Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 +- include/linux/migrate.h | 14 +------------- include/linux/migrate_mode.h | 16 ++++++++++++++++ 3 files changed, 18 insertions(+), 14 deletions(-) create mode 100644 include/linux/migrate_mode.h diff --git a/include/linux/fs.h b/include/linux/fs.h index 0244082d42c5..4b3a41fe22bf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -10,6 +10,7 @@ #include #include #include +#include /* * It's silly to have NR_OPEN bigger than NR_FILE, but you can change @@ -526,7 +527,6 @@ enum positive_aop_returns { struct page; struct address_space; struct writeback_control; -enum migrate_mode; struct iov_iter { const struct iovec *iov; diff --git a/include/linux/migrate.h b/include/linux/migrate.h index eaf867412f7a..05ed2828a553 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -3,22 +3,10 @@ #include #include +#include typedef struct page *new_page_t(struct page *, unsigned long private, int **); -/* - * MIGRATE_ASYNC means never block - * MIGRATE_SYNC_LIGHT in the current implementation means to allow blocking - * on most operations but not ->writepage as the potential stall time - * is too significant - * MIGRATE_SYNC will block when migrating pages - */ -enum migrate_mode { - MIGRATE_ASYNC, - MIGRATE_SYNC_LIGHT, - MIGRATE_SYNC, -}; - #ifdef CONFIG_MIGRATION #define PAGE_MIGRATION 1 diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h new file mode 100644 index 000000000000..ebf3d89a3919 --- /dev/null +++ b/include/linux/migrate_mode.h @@ -0,0 +1,16 @@ +#ifndef MIGRATE_MODE_H_INCLUDED +#define MIGRATE_MODE_H_INCLUDED +/* + * MIGRATE_ASYNC means never block + * MIGRATE_SYNC_LIGHT in the current implementation means to allow blocking + * on most operations but not ->writepage as the potential stall time + * is too significant + * MIGRATE_SYNC will block when migrating pages + */ +enum migrate_mode { + MIGRATE_ASYNC, + MIGRATE_SYNC_LIGHT, + MIGRATE_SYNC, +}; + +#endif /* MIGRATE_MODE_H_INCLUDED */ From 687875fb7de4a95223af20ee024282fa9099f860 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 Jan 2012 14:33:55 -0800 Subject: [PATCH 02/16] mm: fix NULL ptr dereference in __count_immobile_pages Fix the following NULL ptr dereference caused by cat /sys/devices/system/memory/memory0/removable Pid: 13979, comm: sed Not tainted 3.0.13-0.5-default #1 IBM BladeCenter LS21 -[7971PAM]-/Server Blade RIP: __count_immobile_pages+0x4/0x100 Process sed (pid: 13979, threadinfo ffff880221c36000, task ffff88022e788480) Call Trace: is_pageblock_removable_nolock+0x34/0x40 is_mem_section_removable+0x74/0xf0 show_mem_removable+0x41/0x70 sysfs_read_file+0xfe/0x1c0 vfs_read+0xc7/0x130 sys_read+0x53/0xa0 system_call_fastpath+0x16/0x1b We are crashing because we are trying to dereference NULL zone which came from pfn=0 (struct page ffffea0000000000). According to the boot log this page is marked reserved: e820 update range: 0000000000000000 - 0000000000010000 (usable) ==> (reserved) and early_node_map confirms that: early_node_map[3] active PFN ranges 1: 0x00000010 -> 0x0000009c 1: 0x00000100 -> 0x000bffa3 1: 0x00100000 -> 0x00240000 The problem is that memory_present works in PAGE_SECTION_MASK aligned blocks so the reserved range sneaks into the the section as well. This also means that free_area_init_node will not take care of those reserved pages and they stay uninitialized. When we try to read the removable status we walk through all available sections and hope that the zone is valid for all pages in the section. But this is not true in this case as the zone and nid are not initialized. We have only one node in this particular case and it is marked as node=1 (rather than 0) and that made the problem visible because page_to_nid will return 0 and there are no zones on the node. Let's check that the zone is valid and that the given pfn falls into its boundaries and mark the section not removable. This might cause some false positives, probably, but we do not have any sane way to find out whether the page is reserved by the platform or it is just not used for whatever other reasons. Signed-off-by: Michal Hocko Acked-by: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Andrea Arcangeli Cc: David Rientjes Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0027d8f4a1bb..21c272d517bc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5414,6 +5414,17 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) bool is_pageblock_removable_nolock(struct page *page) { struct zone *zone = page_zone(page); + unsigned long pfn = page_to_pfn(page); + + /* + * We have to be careful here because we are iterating over memory + * sections which are not zone aware so we might end up outside of + * the zone but still within the section. + */ + if (!zone || zone->zone_start_pfn > pfn || + zone->zone_start_pfn + zone->spanned_pages <= pfn) + return false; + return __count_immobile_pages(zone, page, 0); } From 656a070629adfe23c12768e35ddf91feab469ff7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 Jan 2012 14:33:58 -0800 Subject: [PATCH 03/16] mm: __count_immobile_pages(): make sure the node is online page_zone() requires an online node otherwise we are accessing NULL NODE_DATA. This is not an issue at the moment because node_zones are located at the structure beginning but this might change in the future so better be careful about that. Signed-off-by: Michal Hocko Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 21c272d517bc..d2186ecb36f7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5413,15 +5413,22 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) bool is_pageblock_removable_nolock(struct page *page) { - struct zone *zone = page_zone(page); - unsigned long pfn = page_to_pfn(page); + struct zone *zone; + unsigned long pfn; /* * We have to be careful here because we are iterating over memory * sections which are not zone aware so we might end up outside of * the zone but still within the section. + * We have to take care about the node as well. If the node is offline + * its NODE_DATA will be NULL - see page_zone. */ - if (!zone || zone->zone_start_pfn > pfn || + if (!node_online(page_to_nid(page))) + return false; + + zone = page_zone(page); + pfn = page_to_pfn(page); + if (zone->zone_start_pfn > pfn || zone->zone_start_pfn + zone->spanned_pages <= pfn) return false; From e9a4593cc5e36c6d47c87b439cb41c2568e7395f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 20 Jan 2012 14:33:59 -0800 Subject: [PATCH 04/16] leds: add led driver for Bachmann's ot200 Add support for leds on Bachmann's ot200 visualisation device. The device has three leds on the back panel (led_err, led_init and led_run) and can handle up to seven leds on the front panel. The driver was written by Linutronix on behalf of Bachmann electronic GmbH. It incorporates feedback from Lars-Peter Clausen [akpm@linux-foundation.org: add dependency on HAS_IOMEM] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Christian Gmeiner Cc: Lars-Peter Clausen Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/leds/Kconfig | 7 ++ drivers/leds/Makefile | 1 + drivers/leds/leds-ot200.c | 171 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 179 insertions(+) create mode 100644 drivers/leds/leds-ot200.c diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index c957c344233f..9ca28fced2b9 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -403,6 +403,13 @@ config LEDS_MAX8997 This option enables support for on-chip LED drivers on MAXIM MAX8997 PMIC. +config LEDS_OT200 + tristate "LED support for the Bachmann OT200" + depends on LEDS_CLASS && HAS_IOMEM + help + This option enables support for the LEDs on the Bachmann OT200. + Say Y to enable LEDs on the Bachmann OT200. + config LEDS_TRIGGERS bool "LED Trigger support" depends on LEDS_CLASS diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile index b8a9723477f0..1fc6875a8b20 100644 --- a/drivers/leds/Makefile +++ b/drivers/leds/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_LEDS_LP5523) += leds-lp5523.o obj-$(CONFIG_LEDS_TCA6507) += leds-tca6507.o obj-$(CONFIG_LEDS_CLEVO_MAIL) += leds-clevo-mail.o obj-$(CONFIG_LEDS_HP6XX) += leds-hp6xx.o +obj-$(CONFIG_LEDS_OT200) += leds-ot200.o obj-$(CONFIG_LEDS_FSG) += leds-fsg.o obj-$(CONFIG_LEDS_PCA955X) += leds-pca955x.o obj-$(CONFIG_LEDS_DA903X) += leds-da903x.o diff --git a/drivers/leds/leds-ot200.c b/drivers/leds/leds-ot200.c new file mode 100644 index 000000000000..c4646825a620 --- /dev/null +++ b/drivers/leds/leds-ot200.c @@ -0,0 +1,171 @@ +/* + * Bachmann ot200 leds driver. + * + * Author: Sebastian Andrzej Siewior + * Christian Gmeiner + * + * License: GPL as published by the FSF. + */ + +#include +#include +#include +#include +#include +#include +#include + + +struct ot200_led { + struct led_classdev cdev; + const char *name; + unsigned long port; + u8 mask; +}; + +/* + * The device has three leds on the back panel (led_err, led_init and led_run) + * and can handle up to seven leds on the front panel. + */ + +static struct ot200_led leds[] = { + { + .name = "led_run", + .port = 0x5a, + .mask = BIT(0), + }, + { + .name = "led_init", + .port = 0x5a, + .mask = BIT(1), + }, + { + .name = "led_err", + .port = 0x5a, + .mask = BIT(2), + }, + { + .name = "led_1", + .port = 0x49, + .mask = BIT(7), + }, + { + .name = "led_2", + .port = 0x49, + .mask = BIT(6), + }, + { + .name = "led_3", + .port = 0x49, + .mask = BIT(5), + }, + { + .name = "led_4", + .port = 0x49, + .mask = BIT(4), + }, + { + .name = "led_5", + .port = 0x49, + .mask = BIT(3), + }, + { + .name = "led_6", + .port = 0x49, + .mask = BIT(2), + }, + { + .name = "led_7", + .port = 0x49, + .mask = BIT(1), + } +}; + +static DEFINE_SPINLOCK(value_lock); + +/* + * we need to store the current led states, as it is not + * possible to read the current led state via inb(). + */ +static u8 leds_back; +static u8 leds_front; + +static void ot200_led_brightness_set(struct led_classdev *led_cdev, + enum led_brightness value) +{ + struct ot200_led *led = container_of(led_cdev, struct ot200_led, cdev); + u8 *val; + unsigned long flags; + + spin_lock_irqsave(&value_lock, flags); + + if (led->port == 0x49) + val = &leds_front; + else if (led->port == 0x5a) + val = &leds_back; + else + BUG(); + + if (value == LED_OFF) + *val &= ~led->mask; + else + *val |= led->mask; + + outb(*val, led->port); + spin_unlock_irqrestore(&value_lock, flags); +} + +static int __devinit ot200_led_probe(struct platform_device *pdev) +{ + int i; + int ret; + + for (i = 0; i < ARRAY_SIZE(leds); i++) { + + leds[i].cdev.name = leds[i].name; + leds[i].cdev.brightness_set = ot200_led_brightness_set; + + ret = led_classdev_register(&pdev->dev, &leds[i].cdev); + if (ret < 0) + goto err; + } + + leds_front = 0; /* turn off all front leds */ + leds_back = BIT(1); /* turn on init led */ + outb(leds_front, 0x49); + outb(leds_back, 0x5a); + + return 0; + +err: + for (i = i - 1; i >= 0; i--) + led_classdev_unregister(&leds[i].cdev); + + return ret; +} + +static int __devexit ot200_led_remove(struct platform_device *pdev) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(leds); i++) + led_classdev_unregister(&leds[i].cdev); + + return 0; +} + +static struct platform_driver ot200_led_driver = { + .probe = ot200_led_probe, + .remove = __devexit_p(ot200_led_remove), + .driver = { + .name = "leds-ot200", + .owner = THIS_MODULE, + }, +}; + +module_platform_driver(ot200_led_driver); + +MODULE_AUTHOR("Sebastian A. Siewior "); +MODULE_DESCRIPTION("ot200 LED driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:leds-ot200"); From 2a4e64b8f6bcbf23ddd375b78342051ae8862284 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 20 Jan 2012 14:34:01 -0800 Subject: [PATCH 05/16] ipc/mqueue: simplify reading msgqueue limit Because the current task is being used to get the limit, we can simply use rlimit() instead of task_rlimit(). Signed-off-by: Davidlohr Bueso Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/mqueue.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 9b7c8ab7d75c..86ee272de210 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -128,7 +128,6 @@ static struct inode *mqueue_get_inode(struct super_block *sb, if (S_ISREG(mode)) { struct mqueue_inode_info *info; - struct task_struct *p = current; unsigned long mq_bytes, mq_msg_tblsz; inode->i_fop = &mqueue_file_operations; @@ -159,7 +158,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, spin_lock(&mq_lock); if (u->mq_bytes + mq_bytes < u->mq_bytes || - u->mq_bytes + mq_bytes > task_rlimit(p, RLIMIT_MSGQUEUE)) { + u->mq_bytes + mq_bytes > rlimit(RLIMIT_MSGQUEUE)) { spin_unlock(&mq_lock); /* mqueue_evict_inode() releases info->messages */ ret = -EMFILE; From d496aab567e7e52b3e974c9192a5de6e77dce32c Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Fri, 20 Jan 2012 14:34:04 -0800 Subject: [PATCH 06/16] kprobes: initialize before using a hlist Commit ef53d9c5e ("kprobes: improve kretprobe scalability with hashed locking") introduced a bug where we can potentially leak kretprobe_instances since we initialize a hlist head after having used it. Initialize the hlist head before using it. Reported by: Jim Keniston Acked-by: Jim Keniston Signed-off-by: Ananth N Mavinakayanahalli Acked-by: Masami Hiramatsu Cc: Srinivasa D S Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 95dd7212e610..29f5b65bee29 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1077,6 +1077,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) /* Early boot. kretprobe_table_locks not yet initialized. */ return; + INIT_HLIST_HEAD(&empty_rp); hash = hash_ptr(tk, KPROBE_HASH_BITS); head = &kretprobe_inst_table[hash]; kretprobe_table_lock(hash, &flags); @@ -1085,7 +1086,6 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) recycle_rp_inst(ri, &empty_rp); } kretprobe_table_unlock(hash, &flags); - INIT_HLIST_HEAD(&empty_rp); hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); kfree(ri); From 36c3e75907c8cb515fad260190ca1beb7e53df96 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Fri, 20 Jan 2012 14:34:05 -0800 Subject: [PATCH 07/16] drivers/video/backlight/adp88x0_bl.c: fix bit testing logic We need to write new value if the bit mask fields of new value is not equal to old value. It does not make sense to write new value only when all the bit_mask bits are zero. Signed-off-by: Axel Lin Cc: Michael Hennerich Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/adp8860_bl.c | 2 +- drivers/video/backlight/adp8870_bl.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/video/backlight/adp8860_bl.c b/drivers/video/backlight/adp8860_bl.c index 66bc74d9ce2a..378276c9d3cf 100644 --- a/drivers/video/backlight/adp8860_bl.c +++ b/drivers/video/backlight/adp8860_bl.c @@ -146,7 +146,7 @@ static int adp8860_set_bits(struct i2c_client *client, int reg, uint8_t bit_mask ret = adp8860_read(client, reg, ®_val); - if (!ret && ((reg_val & bit_mask) == 0)) { + if (!ret && ((reg_val & bit_mask) != bit_mask)) { reg_val |= bit_mask; ret = adp8860_write(client, reg, reg_val); } diff --git a/drivers/video/backlight/adp8870_bl.c b/drivers/video/backlight/adp8870_bl.c index 6c68a6899e87..6735059376d6 100644 --- a/drivers/video/backlight/adp8870_bl.c +++ b/drivers/video/backlight/adp8870_bl.c @@ -160,7 +160,7 @@ static int adp8870_set_bits(struct i2c_client *client, int reg, uint8_t bit_mask ret = adp8870_read(client, reg, ®_val); - if (!ret && ((reg_val & bit_mask) == 0)) { + if (!ret && ((reg_val & bit_mask) != bit_mask)) { reg_val |= bit_mask; ret = adp8870_write(client, reg, reg_val); } From d59d9ebaacba32b63f24d53b1463519b445b4683 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Fri, 20 Jan 2012 14:34:06 -0800 Subject: [PATCH 08/16] drivers/video/backlight/l4f00242t03.c: return proper error in l4f00242t03_probe if regulator_get() fails Signed-off-by: Axel Lin Acked-by: Alberto Panizzo Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/l4f00242t03.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/video/backlight/l4f00242t03.c b/drivers/video/backlight/l4f00242t03.c index 4f5d1c4cb6ab..27d1d7a29c77 100644 --- a/drivers/video/backlight/l4f00242t03.c +++ b/drivers/video/backlight/l4f00242t03.c @@ -190,6 +190,7 @@ static int __devinit l4f00242t03_probe(struct spi_device *spi) priv->io_reg = regulator_get(&spi->dev, "vdd"); if (IS_ERR(priv->io_reg)) { + ret = PTR_ERR(priv->io_reg); dev_err(&spi->dev, "%s: Unable to get the IO regulator\n", __func__); goto err3; @@ -197,6 +198,7 @@ static int __devinit l4f00242t03_probe(struct spi_device *spi) priv->core_reg = regulator_get(&spi->dev, "vcore"); if (IS_ERR(priv->core_reg)) { + ret = PTR_ERR(priv->core_reg); dev_err(&spi->dev, "%s: Unable to get the core regulator\n", __func__); goto err4; From 85e72aa5384b1a614563ad63257ded0e91d1a620 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 20 Jan 2012 14:34:09 -0800 Subject: [PATCH 09/16] proc: clear_refs: do not clear reserved pages /proc/pid/clear_refs is used to clear the Referenced and YOUNG bits for pages and corresponding page table entries of the task with PID pid, which includes any special mappings inserted into the page tables in order to provide things like vDSOs and user helper functions. On ARM this causes a problem because the vectors page is mapped as a global mapping and since ec706dab ("ARM: add a vma entry for the user accessible vector page"), a VMA is also inserted into each task for this page to aid unwinding through signals and syscall restarts. Since the vectors page is required for handling faults, clearing the YOUNG bit (and subsequently writing a faulting pte) means that we lose the vectors page *globally* and cannot fault it back in. This results in a system deadlock on the next exception. To see this problem in action, just run: $ echo 1 > /proc/self/clear_refs on an ARM platform (as any user) and watch your system hang. I think this has been the case since 2.6.37 This patch avoids clearing the aforementioned bits for reserved pages, therefore leaving the vectors page intact on ARM. Since reserved pages are not candidates for swap, this change should not have any impact on the usefulness of clear_refs. Signed-off-by: Will Deacon Reported-by: Moussa Ba Acked-by: Hugh Dickins Cc: David Rientjes Cc: Russell King Acked-by: Nicolas Pitre Cc: Matt Mackall Cc: [2.6.37+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e418c5abdb0e..7dcd2a250495 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -518,6 +518,9 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, if (!page) continue; + if (PageReserved(page)) + continue; + /* Clear accessed and referenced bits. */ ptep_test_and_clear_young(vma, addr, pte); ClearPageReferenced(page); From 6568d4a9c9ff16d6c4f0b14dfea567806ce579e4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 20 Jan 2012 14:34:12 -0800 Subject: [PATCH 10/16] mm: memcg: update the correct soft limit tree during migration end_migration() passes the old page instead of the new page to commit the charge. This page descriptor is not used for committing itself, though, since we also pass the (correct) page_cgroup descriptor. But it's used to find the soft limit tree through the page's zone, so the soft limit tree of the old page's zone is updated instead of that of the new page's, which might get slightly out of date until the next charge reaches the ratelimit point. This glitch has been present since 5564e88 ("memcg: condense page_cgroup-to-page lookup points"). This fixes a bug that I introduced in 2.6.38. It's benign enough (to my knowledge) that we probably don't want this for stable. Reported-by: Hugh Dickins Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3dbff4dcde35..4baddbae94cc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3247,7 +3247,7 @@ int mem_cgroup_prepare_migration(struct page *page, ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; else ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); + __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype); return ret; } From 409eb8c2611b4310947a150af988111f7f52ab15 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Fri, 20 Jan 2012 14:34:13 -0800 Subject: [PATCH 11/16] mm/hugetlb.c: undo change to page mapcount in fault handler Page mapcount should be updated only if we are sure that the page ends up in the page table otherwise we would leak if we couldn't COW due to reservations or if idx is out of bounds. Signed-off-by: Hillf Danton Reviewed-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ea8c3a4cd2ae..5f34bd8dda34 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2508,6 +2508,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct hstate *h = hstate_vma(vma); int ret = VM_FAULT_SIGBUS; + int anon_rmap = 0; pgoff_t idx; unsigned long size; struct page *page; @@ -2562,14 +2563,13 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); - page_dup_rmap(page); } else { lock_page(page); if (unlikely(anon_vma_prepare(vma))) { ret = VM_FAULT_OOM; goto backout_unlocked; } - hugepage_add_new_anon_rmap(page, vma, address); + anon_rmap = 1; } } else { /* @@ -2582,7 +2582,6 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, VM_FAULT_SET_HINDEX(h - hstates); goto backout_unlocked; } - page_dup_rmap(page); } /* @@ -2606,6 +2605,10 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!huge_pte_none(huge_ptep_get(ptep))) goto backout; + if (anon_rmap) + hugepage_add_new_anon_rmap(page, vma, address); + else + page_dup_rmap(page); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); set_huge_pte_at(mm, address, ptep, new_pte); From cb78edfdcef5259ac9e9088bd63810d21299928d Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Fri, 20 Jan 2012 14:34:16 -0800 Subject: [PATCH 12/16] kdump: define KEXEC_NOTE_BYTES arch specific for s390x kdump only allocates memory for the prstatus ELF note. For s390x, besides of prstatus multiple ELF notes for various different register types are stored. Therefore the currently allocated memory is not sufficient. With this patch the KEXEC_NOTE_BYTES macro can be defined by architecture code and for s390x it is set to the correct size now. Signed-off-by: Michael Holzheu Cc: "Eric W. Biederman" Cc: Vivek Goyal Cc: Martin Schwidefsky Cc: Heiko Carstens Reviewed-by: Simon Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/kexec.h | 18 ++++++++++++++++++ include/linux/kexec.h | 2 ++ 2 files changed, 20 insertions(+) diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h index cf4e47b0948c..3f30dac804ea 100644 --- a/arch/s390/include/asm/kexec.h +++ b/arch/s390/include/asm/kexec.h @@ -42,6 +42,24 @@ /* The native architecture */ #define KEXEC_ARCH KEXEC_ARCH_S390 +/* + * Size for s390x ELF notes per CPU + * + * Seven notes plus zero note at the end: prstatus, fpregset, timer, + * tod_cmp, tod_reg, control regs, and prefix + */ +#define KEXEC_NOTE_BYTES \ + (ALIGN(sizeof(struct elf_note), 4) * 8 + \ + ALIGN(sizeof("CORE"), 4) * 7 + \ + ALIGN(sizeof(struct elf_prstatus), 4) + \ + ALIGN(sizeof(elf_fpregset_t), 4) + \ + ALIGN(sizeof(u64), 4) + \ + ALIGN(sizeof(u64), 4) + \ + ALIGN(sizeof(u32), 4) + \ + ALIGN(sizeof(u64) * 16, 4) + \ + ALIGN(sizeof(u32), 4) \ + ) + /* Provide a dummy definition to avoid build failures. */ static inline void crash_setup_regs(struct pt_regs *newregs, struct pt_regs *oldregs) { } diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 2fa0901219d4..0d7d6a1b172f 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -50,9 +50,11 @@ * note header. For kdump, the code in vmcore.c runs in the context * of the second kernel to combine them into one note. */ +#ifndef KEXEC_NOTE_BYTES #define KEXEC_NOTE_BYTES ( (KEXEC_NOTE_HEAD_BYTES * 2) + \ KEXEC_CORE_NOTE_NAME_BYTES + \ KEXEC_CORE_NOTE_DESC_BYTES ) +#endif /* * This structure is used to hold the arguments that are used when loading From 85046579bde15e532983438f86b36856e358f417 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 20 Jan 2012 14:34:19 -0800 Subject: [PATCH 13/16] SHM_UNLOCK: fix long unpreemptible section scan_mapping_unevictable_pages() is used to make SysV SHM_LOCKed pages evictable again once the shared memory is unlocked. It does this with pagevec_lookup()s across the whole object (which might occupy most of memory), and takes 300ms to unlock 7GB here. A cond_resched() every PAGEVEC_SIZE pages would be good. However, KOSAKI-san points out that this is called under shmem.c's info->lock, and it's also under shm.c's shm_lock(), both spinlocks. There is no strong reason for that: we need to take these pages off the unevictable list soonish, but those locks are not required for it. So move the call to scan_mapping_unevictable_pages() from shmem.c's unlock handling up to shm.c's unlock handling. Remove the recently added barrier, not needed now we have spin_unlock() before the scan. Use get_file(), with subsequent fput(), to make sure we have a reference to mapping throughout scan_mapping_unevictable_pages(): that's something that was previously guaranteed by the shm_lock(). Remove shmctl's lru_add_drain_all(): we don't fault in pages at SHM_LOCK time, and we lazily discover them to be Unevictable later, so it serves no purpose for SHM_LOCK; and serves no purpose for SHM_UNLOCK, since pages still on pagevec are not marked Unevictable. The original code avoided redundant rescans by checking VM_LOCKED flag at its level: now avoid them by checking shp's SHM_LOCKED. The original code called scan_mapping_unevictable_pages() on a locked area at shm_destroy() time: perhaps we once had accounting cross-checks which required that, but not now, so skip the overhead and just let inode eviction deal with them. Put check_move_unevictable_page() and scan_mapping_unevictable_pages() under CONFIG_SHMEM (with stub for the TINY case when ramfs is used), more as comment than to save space; comment them used for SHM_UNLOCK. Signed-off-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Eric Dumazet Cc: Johannes Weiner Cc: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 37 ++++++++++++++++++++++--------------- mm/shmem.c | 7 ------- mm/vmscan.c | 12 +++++++++++- 3 files changed, 33 insertions(+), 23 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index 02ecf2c078fc..854ab58e5f6e 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -870,9 +870,7 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) case SHM_LOCK: case SHM_UNLOCK: { - struct file *uninitialized_var(shm_file); - - lru_add_drain_all(); /* drain pagevecs to lru lists */ + struct file *shm_file; shp = shm_lock_check(ns, shmid); if (IS_ERR(shp)) { @@ -895,22 +893,31 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) err = security_shm_shmctl(shp, cmd); if (err) goto out_unlock; - - if(cmd==SHM_LOCK) { + + shm_file = shp->shm_file; + if (is_file_hugepages(shm_file)) + goto out_unlock; + + if (cmd == SHM_LOCK) { struct user_struct *user = current_user(); - if (!is_file_hugepages(shp->shm_file)) { - err = shmem_lock(shp->shm_file, 1, user); - if (!err && !(shp->shm_perm.mode & SHM_LOCKED)){ - shp->shm_perm.mode |= SHM_LOCKED; - shp->mlock_user = user; - } + err = shmem_lock(shm_file, 1, user); + if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) { + shp->shm_perm.mode |= SHM_LOCKED; + shp->mlock_user = user; } - } else if (!is_file_hugepages(shp->shm_file)) { - shmem_lock(shp->shm_file, 0, shp->mlock_user); - shp->shm_perm.mode &= ~SHM_LOCKED; - shp->mlock_user = NULL; + goto out_unlock; } + + /* SHM_UNLOCK */ + if (!(shp->shm_perm.mode & SHM_LOCKED)) + goto out_unlock; + shmem_lock(shm_file, 0, shp->mlock_user); + shp->shm_perm.mode &= ~SHM_LOCKED; + shp->mlock_user = NULL; + get_file(shm_file); shm_unlock(shp); + scan_mapping_unevictable_pages(shm_file->f_mapping); + fput(shm_file); goto out; } case IPC_RMID: diff --git a/mm/shmem.c b/mm/shmem.c index feead1943d92..4aaa53abe302 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1068,13 +1068,6 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) user_shm_unlock(inode->i_size, user); info->flags &= ~VM_LOCKED; mapping_clear_unevictable(file->f_mapping); - /* - * Ensure that a racing putback_lru_page() can see - * the pages of this mapping are evictable when we - * skip them due to !PageLRU during the scan. - */ - smp_mb__after_clear_bit(); - scan_mapping_unevictable_pages(file->f_mapping); } retval = 0; diff --git a/mm/vmscan.c b/mm/vmscan.c index 2880396f7953..e097c1026b58 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3499,6 +3499,7 @@ int page_evictable(struct page *page, struct vm_area_struct *vma) return 1; } +#ifdef CONFIG_SHMEM /** * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list * @page: page to check evictability and move to appropriate lru list @@ -3509,6 +3510,8 @@ int page_evictable(struct page *page, struct vm_area_struct *vma) * * Restrictions: zone->lru_lock must be held, page must be on LRU and must * have PageUnevictable set. + * + * This function is only used for SysV IPC SHM_UNLOCK. */ static void check_move_unevictable_page(struct page *page, struct zone *zone) { @@ -3545,6 +3548,8 @@ static void check_move_unevictable_page(struct page *page, struct zone *zone) * * Scan all pages in mapping. Check unevictable pages for * evictability and move them to the appropriate zone lru list. + * + * This function is only used for SysV IPC SHM_UNLOCK. */ void scan_mapping_unevictable_pages(struct address_space *mapping) { @@ -3590,9 +3595,14 @@ void scan_mapping_unevictable_pages(struct address_space *mapping) pagevec_release(&pvec); count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned); + cond_resched(); } - } +#else +void scan_mapping_unevictable_pages(struct address_space *mapping) +{ +} +#endif /* CONFIG_SHMEM */ static void warn_scan_unevictable_pages(void) { From 245132643e1cfcd145bbc86a716c1818371fcb93 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 20 Jan 2012 14:34:21 -0800 Subject: [PATCH 14/16] SHM_UNLOCK: fix Unevictable pages stranded after swap Commit cc39c6a9bbde ("mm: account skipped entries to avoid looping in find_get_pages") correctly fixed an infinite loop; but left a problem that find_get_pages() on shmem would return 0 (appearing to callers to mean end of tree) when it meets a run of nr_pages swap entries. The only uses of find_get_pages() on shmem are via pagevec_lookup(), called from invalidate_mapping_pages(), and from shmctl SHM_UNLOCK's scan_mapping_unevictable_pages(). The first is already commented, and not worth worrying about; but the second can leave pages on the Unevictable list after an unusual sequence of swapping and locking. Fix that by using shmem_find_get_pages_and_swap() (then ignoring the swap) instead of pagevec_lookup(). But I don't want to contaminate vmscan.c with shmem internals, nor shmem.c with LRU locking. So move scan_mapping_unevictable_pages() into shmem.c, renaming it shmem_unlock_mapping(); and rename check_move_unevictable_page() to check_move_unevictable_pages(), looping down an array of pages, oftentimes under the same lock. Leave out the "rotate unevictable list" block: that's a leftover from when this was used for /proc/sys/vm/scan_unevictable_pages, whose flawed handling involved looking at pages at tail of LRU. Was there significance to the sequence first ClearPageUnevictable, then test page_evictable, then SetPageUnevictable here? I think not, we're under LRU lock, and have no barriers between those. Signed-off-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Eric Dumazet Cc: Johannes Weiner Cc: Michel Lespinasse Cc: [back to 3.1 but will need respins] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 1 + include/linux/swap.h | 2 +- ipc/shm.c | 2 +- mm/shmem.c | 46 ++++++++++++-- mm/vmscan.c | 132 ++++++++++++--------------------------- 5 files changed, 85 insertions(+), 98 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index e4c711c6f321..79ab2555b3b0 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -48,6 +48,7 @@ extern struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); extern int shmem_zero_setup(struct vm_area_struct *); extern int shmem_lock(struct file *file, int lock, struct user_struct *user); +extern void shmem_unlock_mapping(struct address_space *mapping); extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); diff --git a/include/linux/swap.h b/include/linux/swap.h index 06061a7f8e69..3e60228e7299 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -273,7 +273,7 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) #endif extern int page_evictable(struct page *page, struct vm_area_struct *vma); -extern void scan_mapping_unevictable_pages(struct address_space *); +extern void check_move_unevictable_pages(struct page **, int nr_pages); extern unsigned long scan_unevictable_pages; extern int scan_unevictable_handler(struct ctl_table *, int, diff --git a/ipc/shm.c b/ipc/shm.c index 854ab58e5f6e..b76be5bda6c2 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -916,7 +916,7 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) shp->mlock_user = NULL; get_file(shm_file); shm_unlock(shp); - scan_mapping_unevictable_pages(shm_file->f_mapping); + shmem_unlock_mapping(shm_file->f_mapping); fput(shm_file); goto out; } diff --git a/mm/shmem.c b/mm/shmem.c index 4aaa53abe302..269d049294ab 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -379,7 +379,7 @@ static int shmem_free_swap(struct address_space *mapping, /* * Pagevec may contain swap entries, so shuffle up pages before releasing. */ -static void shmem_pagevec_release(struct pagevec *pvec) +static void shmem_deswap_pagevec(struct pagevec *pvec) { int i, j; @@ -389,7 +389,36 @@ static void shmem_pagevec_release(struct pagevec *pvec) pvec->pages[j++] = page; } pvec->nr = j; - pagevec_release(pvec); +} + +/* + * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. + */ +void shmem_unlock_mapping(struct address_space *mapping) +{ + struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t index = 0; + + pagevec_init(&pvec, 0); + /* + * Minor point, but we might as well stop if someone else SHM_LOCKs it. + */ + while (!mapping_unevictable(mapping)) { + /* + * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it + * has finished, if it hits a row of PAGEVEC_SIZE swap entries. + */ + pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + PAGEVEC_SIZE, pvec.pages, indices); + if (!pvec.nr) + break; + index = indices[pvec.nr - 1] + 1; + shmem_deswap_pagevec(&pvec); + check_move_unevictable_pages(pvec.pages, pvec.nr); + pagevec_release(&pvec); + cond_resched(); + } } /* @@ -440,7 +469,8 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) } unlock_page(page); } - shmem_pagevec_release(&pvec); + shmem_deswap_pagevec(&pvec); + pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); index++; @@ -470,7 +500,8 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) continue; } if (index == start && indices[0] > end) { - shmem_pagevec_release(&pvec); + shmem_deswap_pagevec(&pvec); + pagevec_release(&pvec); break; } mem_cgroup_uncharge_start(); @@ -494,7 +525,8 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) } unlock_page(page); } - shmem_pagevec_release(&pvec); + shmem_deswap_pagevec(&pvec); + pagevec_release(&pvec); mem_cgroup_uncharge_end(); index++; } @@ -2438,6 +2470,10 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) return 0; } +void shmem_unlock_mapping(struct address_space *mapping) +{ +} + void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { truncate_inode_pages_range(inode->i_mapping, lstart, lend); diff --git a/mm/vmscan.c b/mm/vmscan.c index e097c1026b58..c52b23552659 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -26,7 +26,6 @@ #include /* for try_to_release_page(), buffer_heads_over_limit */ #include -#include #include #include #include @@ -661,7 +660,7 @@ void putback_lru_page(struct page *page) * When racing with an mlock or AS_UNEVICTABLE clearing * (page is unlocked) make sure that if the other thread * does not observe our setting of PG_lru and fails - * isolation/check_move_unevictable_page, + * isolation/check_move_unevictable_pages, * we see PG_mlocked/AS_UNEVICTABLE cleared below and move * the page back to the evictable list. * @@ -3501,106 +3500,57 @@ int page_evictable(struct page *page, struct vm_area_struct *vma) #ifdef CONFIG_SHMEM /** - * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list - * @page: page to check evictability and move to appropriate lru list - * @zone: zone page is in + * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list + * @pages: array of pages to check + * @nr_pages: number of pages to check * - * Checks a page for evictability and moves the page to the appropriate - * zone lru list. - * - * Restrictions: zone->lru_lock must be held, page must be on LRU and must - * have PageUnevictable set. + * Checks pages for evictability and moves them to the appropriate lru list. * * This function is only used for SysV IPC SHM_UNLOCK. */ -static void check_move_unevictable_page(struct page *page, struct zone *zone) +void check_move_unevictable_pages(struct page **pages, int nr_pages) { struct lruvec *lruvec; + struct zone *zone = NULL; + int pgscanned = 0; + int pgrescued = 0; + int i; - VM_BUG_ON(PageActive(page)); -retry: - ClearPageUnevictable(page); - if (page_evictable(page, NULL)) { - enum lru_list l = page_lru_base_type(page); + for (i = 0; i < nr_pages; i++) { + struct page *page = pages[i]; + struct zone *pagezone; - __dec_zone_state(zone, NR_UNEVICTABLE); - lruvec = mem_cgroup_lru_move_lists(zone, page, - LRU_UNEVICTABLE, l); - list_move(&page->lru, &lruvec->lists[l]); - __inc_zone_state(zone, NR_INACTIVE_ANON + l); - __count_vm_event(UNEVICTABLE_PGRESCUED); - } else { - /* - * rotate unevictable list - */ - SetPageUnevictable(page); - lruvec = mem_cgroup_lru_move_lists(zone, page, LRU_UNEVICTABLE, - LRU_UNEVICTABLE); - list_move(&page->lru, &lruvec->lists[LRU_UNEVICTABLE]); - if (page_evictable(page, NULL)) - goto retry; - } -} - -/** - * scan_mapping_unevictable_pages - scan an address space for evictable pages - * @mapping: struct address_space to scan for evictable pages - * - * Scan all pages in mapping. Check unevictable pages for - * evictability and move them to the appropriate zone lru list. - * - * This function is only used for SysV IPC SHM_UNLOCK. - */ -void scan_mapping_unevictable_pages(struct address_space *mapping) -{ - pgoff_t next = 0; - pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - struct zone *zone; - struct pagevec pvec; - - if (mapping->nrpages == 0) - return; - - pagevec_init(&pvec, 0); - while (next < end && - pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { - int i; - int pg_scanned = 0; - - zone = NULL; - - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - pgoff_t page_index = page->index; - struct zone *pagezone = page_zone(page); - - pg_scanned++; - if (page_index > next) - next = page_index; - next++; - - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); - } - - if (PageLRU(page) && PageUnevictable(page)) - check_move_unevictable_page(page, zone); + pgscanned++; + pagezone = page_zone(page); + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); } - if (zone) - spin_unlock_irq(&zone->lru_lock); - pagevec_release(&pvec); - count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned); - cond_resched(); + if (!PageLRU(page) || !PageUnevictable(page)) + continue; + + if (page_evictable(page, NULL)) { + enum lru_list lru = page_lru_base_type(page); + + VM_BUG_ON(PageActive(page)); + ClearPageUnevictable(page); + __dec_zone_state(zone, NR_UNEVICTABLE); + lruvec = mem_cgroup_lru_move_lists(zone, page, + LRU_UNEVICTABLE, lru); + list_move(&page->lru, &lruvec->lists[lru]); + __inc_zone_state(zone, NR_INACTIVE_ANON + lru); + pgrescued++; + } + } + + if (zone) { + __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); + __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); + spin_unlock_irq(&zone->lru_lock); } -} -#else -void scan_mapping_unevictable_pages(struct address_space *mapping) -{ } #endif /* CONFIG_SHMEM */ From 9f9f1acd713d69fae2af286fbeedc6c8963411c6 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Fri, 20 Jan 2012 14:34:24 -0800 Subject: [PATCH 15/16] mm: fix rss count leakage during migration Memory migration fills a pte with a migration entry and it doesn't update the rss counters. Then it replaces the migration entry with the new page (or the old one if migration failed). But between these two passes this pte can be unmaped, or a task can fork a child and it will get a copy of this migration entry. Nobody accounts for this in the rss counters. This patch properly adjust rss counters for migration entries in zap_pte_range() and copy_one_pte(). Thus we avoid extra atomic operations on the migration fast-path. Signed-off-by: Konstantin Khlebnikov Cc: Hugh Dickins Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 5e30583c2605..fa2f04e0337c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -878,15 +878,24 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, } if (likely(!non_swap_entry(entry))) rss[MM_SWAPENTS]++; - else if (is_write_migration_entry(entry) && - is_cow_mapping(vm_flags)) { - /* - * COW mappings require pages in both parent - * and child to be set to read. - */ - make_migration_entry_read(&entry); - pte = swp_entry_to_pte(entry); - set_pte_at(src_mm, addr, src_pte, pte); + else if (is_migration_entry(entry)) { + page = migration_entry_to_page(entry); + + if (PageAnon(page)) + rss[MM_ANONPAGES]++; + else + rss[MM_FILEPAGES]++; + + if (is_write_migration_entry(entry) && + is_cow_mapping(vm_flags)) { + /* + * COW mappings require pages in both + * parent and child to be set to read. + */ + make_migration_entry_read(&entry); + pte = swp_entry_to_pte(entry); + set_pte_at(src_mm, addr, src_pte, pte); + } } } goto out_set_pte; @@ -1191,6 +1200,16 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (!non_swap_entry(entry)) rss[MM_SWAPENTS]--; + else if (is_migration_entry(entry)) { + struct page *page; + + page = migration_entry_to_page(entry); + + if (PageAnon(page)) + rss[MM_ANONPAGES]--; + else + rss[MM_FILEPAGES]--; + } if (unlikely(!free_swap_and_cache(entry))) print_bad_pte(vma, addr, ptent, NULL); } From c25a785d6647984505fa165b5cd84cfc9a95970b Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Fri, 20 Jan 2012 14:34:27 -0800 Subject: [PATCH 16/16] score: fix off-by-one index into syscall table If the provided system call number is equal to __NR_syscalls, the current check will pass and a function pointer just after the system call table may be called, since sys_call_table is an array with total size __NR_syscalls. Whether or not this is a security bug depends on what the compiler puts immediately after the system call table. It's likely that this won't do anything bad because there is an additional NULL check on the syscall entry, but if there happens to be a non-NULL value immediately after the system call table, this may result in local privilege escalation. Signed-off-by: Dan Rosenberg Cc: Cc: Chen Liqin Cc: Lennox Wu Cc: Eugene Teo Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/score/kernel/entry.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/score/kernel/entry.S b/arch/score/kernel/entry.S index 577abba3fac6..83bb96079c43 100644 --- a/arch/score/kernel/entry.S +++ b/arch/score/kernel/entry.S @@ -408,7 +408,7 @@ ENTRY(handle_sys) sw r9, [r0, PT_EPC] cmpi.c r27, __NR_syscalls # check syscall number - bgtu illegal_syscall + bgeu illegal_syscall slli r8, r27, 2 # get syscall routine la r11, sys_call_table