From 97534127012f0e396eddea4691f4c9b170aed74b Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Tue, 14 Apr 2015 15:42:30 -0700 Subject: [PATCH 001/122] mm/hugetlb: use pmd_page() in follow_huge_pmd() Commit 61f77eda9bbf ("mm/hugetlb: reduce arch dependent code around follow_huge_*") broke follow_huge_pmd() on s390, where pmd and pte layout differ and using pte_page() on a huge pmd will return wrong results. Using pmd_page() instead fixes this. All architectures that were touched by that commit have pmd_page() defined, so this should not break anything on other architectures. Fixes: 61f77eda "mm/hugetlb: reduce arch dependent code around follow_huge_*" Signed-off-by: Gerald Schaefer Acked-by: Naoya Horiguchi Cc: Hugh Dickins Cc: Michal Hocko , Andrea Arcangeli Cc: Martin Schwidefsky Acked-by: David Rientjes Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c41b2a0ee273..caad3c5a926f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3735,8 +3735,7 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, if (!pmd_huge(*pmd)) goto out; if (pmd_present(*pmd)) { - page = pte_page(*(pte_t *)pmd) + - ((address & ~PMD_MASK) >> PAGE_SHIFT); + page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); if (flags & FOLL_GET) get_page(page); } else { From 5017335d6d8c1fc3bbc04f80785440885f305879 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 14 Apr 2015 15:42:34 -0700 Subject: [PATCH 002/122] scripts/coccinelle/misc/bugon.cocci: update bug_on conversion warning if()/BUG conversion to BUG_ON must be avoided when there's side effect in condition. The reason being BUG_ON won't execute the condition when CONFIG_BUG is not defined. With inspiration from Bruce Fields. Signed-off-by: Fabian Frederick Suggested-by: Julia Lawall Acked-by: Julia Lawall Cc: J. Bruce Fields Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/coccinelle/misc/bugon.cocci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/coccinelle/misc/bugon.cocci b/scripts/coccinelle/misc/bugon.cocci index 3b7eec24fb5a..27c97f1f2767 100644 --- a/scripts/coccinelle/misc/bugon.cocci +++ b/scripts/coccinelle/misc/bugon.cocci @@ -57,6 +57,6 @@ coccilib.org.print_todo(p[0], "WARNING use BUG_ON") p << r.p; @@ -msg="WARNING: Use BUG_ON" +msg="WARNING: Use BUG_ON instead of if condition followed by BUG.\nPlease make sure the condition has no side effects (see conditional BUG_ON definition in include/asm-generic/bug.h)" coccilib.report.print_report(p[0], msg) From 7b4b425897cd582897ccc38b637ce7ab5ffc5593 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2015 15:42:37 -0700 Subject: [PATCH 003/122] arch/sh/kernel/dwarf.c: destroy mempools on cleanup dwarf_reg_pool and dwarf_frame_pool are not properly destroyed when cleaning up the dwarf unwinder. Destroy them with mempool_destroy(). Also mark dwarf_unwinder_cleanup() as __init. 
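For illustration only (the actual hunk follows below): the pools have to be destroyed before the slab caches that back them, since the pools still hold pre-allocated elements from those caches:

        if (dwarf_reg_pool)
                mempool_destroy(dwarf_reg_pool);
        if (dwarf_frame_pool)
                mempool_destroy(dwarf_frame_pool);
        kmem_cache_destroy(dwarf_reg_cachep);
        kmem_cache_destroy(dwarf_frame_cachep);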
Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/kernel/dwarf.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/sh/kernel/dwarf.c b/arch/sh/kernel/dwarf.c index 67a049e75ec1..60437e9c345e 100644 --- a/arch/sh/kernel/dwarf.c +++ b/arch/sh/kernel/dwarf.c @@ -993,7 +993,7 @@ static struct unwinder dwarf_unwinder = { .rating = 150, }; -static void dwarf_unwinder_cleanup(void) +static void __init dwarf_unwinder_cleanup(void) { struct dwarf_fde *fde, *next_fde; struct dwarf_cie *cie, *next_cie; @@ -1009,6 +1009,10 @@ static void dwarf_unwinder_cleanup(void) rbtree_postorder_for_each_entry_safe(cie, next_cie, &cie_root, node) kfree(cie); + if (dwarf_reg_pool) + mempool_destroy(dwarf_reg_pool); + if (dwarf_frame_pool) + mempool_destroy(dwarf_frame_pool); kmem_cache_destroy(dwarf_reg_cachep); kmem_cache_destroy(dwarf_frame_cachep); } From 1cf370c61179e01313457363b21f0859be0d8cb7 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2015 15:42:40 -0700 Subject: [PATCH 004/122] arch/sh/kernel/dwarf.c: use mempool_create_slab_pool() Mempools created for slab caches should use mempool_create_slab_pool(). Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/kernel/dwarf.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/sh/kernel/dwarf.c b/arch/sh/kernel/dwarf.c index 60437e9c345e..9d209a07235e 100644 --- a/arch/sh/kernel/dwarf.c +++ b/arch/sh/kernel/dwarf.c @@ -1180,17 +1180,13 @@ static int __init dwarf_unwinder_init(void) sizeof(struct dwarf_reg), 0, SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL); - dwarf_frame_pool = mempool_create(DWARF_FRAME_MIN_REQ, - mempool_alloc_slab, - mempool_free_slab, - dwarf_frame_cachep); + dwarf_frame_pool = mempool_create_slab_pool(DWARF_FRAME_MIN_REQ, + dwarf_frame_cachep); if (!dwarf_frame_pool) goto out; - dwarf_reg_pool = mempool_create(DWARF_REG_MIN_REQ, - mempool_alloc_slab, - mempool_free_slab, - dwarf_reg_cachep); + dwarf_reg_pool = mempool_create_slab_pool(DWARF_REG_MIN_REQ, + dwarf_reg_cachep); if (!dwarf_reg_pool) goto out; From fd90d4dfb94a8c0d626c0c85ca7dcfb905f81a65 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 14 Apr 2015 15:42:42 -0700 Subject: [PATCH 005/122] ocfs2: delete unnecessary checks before three function calls kfree(), ocfs2_free_path() and __ocfs2_free_slot_info() test whether their argument is NULL and then return immediately. Thus the test around their calls is not needed. This issue was detected by using the Coccinelle software. 
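The transformation is mechanical; for example (illustrative, not a hunk from this patch):

        /* before: the check only duplicates what kfree() already does */
        if (pages)
                kfree(pages);

        /* after: kfree(NULL) is a no-op, so call it unconditionally */
        kfree(pages);

ocfs2_free_path() and __ocfs2_free_slot_info() return early on a NULL argument in the same way, so the same simplification applies to their call sites.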
Signed-off-by: Markus Elfring Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 17 +++++------------ fs/ocfs2/slot_map.c | 2 +- fs/ocfs2/stack_user.c | 2 +- 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 044158bd22be..fdab27c9be99 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -3453,8 +3453,7 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, subtree_index); } out: - if (right_path) - ocfs2_free_path(right_path); + ocfs2_free_path(right_path); return ret; } @@ -3647,8 +3646,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, right_path, subtree_index); } out: - if (left_path) - ocfs2_free_path(left_path); + ocfs2_free_path(left_path); return ret; } @@ -4431,11 +4429,8 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, } out: - if (left_path) - ocfs2_free_path(left_path); - if (right_path) - ocfs2_free_path(right_path); - + ocfs2_free_path(left_path); + ocfs2_free_path(right_path); return ret; } @@ -6996,9 +6991,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, out: if (data_ac) ocfs2_free_alloc_context(data_ac); - if (pages) - kfree(pages); - + kfree(pages); return ret; } diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index d5493e361a38..c5e530a9d1b1 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -452,7 +452,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) osb->slot_info = (struct ocfs2_slot_info *)si; bail: - if (status < 0 && si) + if (status < 0) __ocfs2_free_slot_info(si); return status; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 720aa389e0ea..c3b7807c65d6 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -1063,7 +1063,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) } out: - if (rc && lc) + if (rc) kfree(lc); return rc; } From 3cc79b795b53a9a04aa1bcc1a943379f06324bb6 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 14 Apr 2015 15:42:45 -0700 Subject: [PATCH 006/122] ocfs2: less function calls in ocfs2_convert_inline_data_to_extents() after error detection kfree() was called in a few cases by ocfs2_convert_inline_data_to_extents() during error handling even if the passed variable "pages" contained a null pointer. * Return from this implementation directly after failure detection for the function call "kcalloc". * Corresponding details could be improved by the introduction of another jump label. 
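The result follows the usual ordered-label unwinding style (sketch only; the label names match the hunk below): a failure before anything is allocated returns directly, and a later failure jumps to a label that frees only what already exists:

        pages = kcalloc(...);
        if (pages == NULL)
                return -ENOMEM;         /* nothing to unwind yet */

        ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
        if (ret)
                goto free_pages;        /* unwind only 'pages' */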
Signed-off-by: Markus Elfring Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index fdab27c9be99..bf806e58b1cb 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6853,13 +6853,13 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, if (pages == NULL) { ret = -ENOMEM; mlog_errno(ret); - goto out; + return ret; } ret = ocfs2_reserve_clusters(osb, 1, &data_ac); if (ret) { mlog_errno(ret); - goto out; + goto free_pages; } } @@ -6991,6 +6991,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, out: if (data_ac) ocfs2_free_alloc_context(data_ac); +free_pages: kfree(pages); return ret; } From 06a269ccdf0550671667f5cfa00bdabb6bf05259 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 14 Apr 2015 15:42:48 -0700 Subject: [PATCH 007/122] ocfs2: less function calls in ocfs2_figure_merge_contig_type() after error detection ocfs2_free_path() was called in some cases by ocfs2_figure_merge_contig_type() during error handling even if the passed variables "left_path" and "right_path" contained still a null pointer. Corresponding implementation details could be improved by adjustments for jump labels according to the current Linux coding style convention. Signed-off-by: Markus Elfring Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index bf806e58b1cb..370b4ea4c23a 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -4332,17 +4332,17 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, } else if (path->p_tree_depth > 0) { status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos); if (status) - goto out; + goto exit; if (left_cpos != 0) { left_path = ocfs2_new_path_from_path(path); if (!left_path) - goto out; + goto exit; status = ocfs2_find_path(et->et_ci, left_path, left_cpos); if (status) - goto out; + goto free_left_path; new_el = path_leaf_el(left_path); @@ -4359,7 +4359,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, le16_to_cpu(new_el->l_next_free_rec), le16_to_cpu(new_el->l_count)); status = -EINVAL; - goto out; + goto free_left_path; } rec = &new_el->l_recs[ le16_to_cpu(new_el->l_next_free_rec) - 1]; @@ -4386,18 +4386,18 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, path->p_tree_depth > 0) { status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos); if (status) - goto out; + goto free_left_path; if (right_cpos == 0) - goto out; + goto free_left_path; right_path = ocfs2_new_path_from_path(path); if (!right_path) - goto out; + goto free_left_path; status = ocfs2_find_path(et->et_ci, right_path, right_cpos); if (status) - goto out; + goto free_right_path; new_el = path_leaf_el(right_path); rec = &new_el->l_recs[0]; @@ -4411,7 +4411,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, (unsigned long long)le64_to_cpu(eb->h_blkno), le16_to_cpu(new_el->l_next_free_rec)); status = -EINVAL; - goto out; + goto free_right_path; } rec = &new_el->l_recs[1]; } @@ -4428,9 +4428,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, ret = contig_type; } -out: - ocfs2_free_path(left_path); +free_right_path: ocfs2_free_path(right_path); +free_left_path: + ocfs2_free_path(left_path); +exit: return ret; } From 
992ef6e794c4d6b84e7930ee79310f066c7f6727 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 14 Apr 2015 15:42:51 -0700 Subject: [PATCH 008/122] ocfs2: one function call less in ocfs2_merge_rec_left() after error detection ocfs2_free_path() was called by ocfs2_merge_rec_left() even if a call of the ocfs2_get_left_path() function failed. Return from this implementation directly after corresponding exception handling. Signed-off-by: Markus Elfring Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 370b4ea4c23a..4bdc19fb7b85 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -3535,7 +3535,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, ret = ocfs2_get_left_path(et, right_path, &left_path); if (ret) { mlog_errno(ret); - goto out; + return ret; } left_el = path_leaf_el(left_path); From 629a3b5f0b1c09025546e110ea2b2a67335ed8c5 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 14 Apr 2015 15:42:53 -0700 Subject: [PATCH 009/122] ocfs2: one function call less in ocfs2_merge_rec_right() after error detection ocfs2_free_path() was called by ocfs2_merge_rec_right() even if a call of the ocfs2_get_right_path() function failed. Return from this implementation directly after corresponding exception handling. Signed-off-by: Markus Elfring Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 4bdc19fb7b85..2d7f76e52c37 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -3370,7 +3370,7 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, ret = ocfs2_get_right_path(et, left_path, &right_path); if (ret) { mlog_errno(ret); - goto out; + return ret; } right_el = path_leaf_el(right_path); From bb34ed21bce54a900c034089a6b1fde8c09f6a6d Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 14 Apr 2015 15:42:56 -0700 Subject: [PATCH 010/122] ocfs2: one function call less in ocfs2_init_slot_info() after error detection __ocfs2_free_slot_info() was called by ocfs2_init_slot_info() even if a call of the kzalloc() function failed. Return from this implementation directly after corresponding exception handling. Signed-off-by: Markus Elfring Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/slot_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index c5e530a9d1b1..e78a203d44c8 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -427,7 +427,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) if (!si) { status = -ENOMEM; mlog_errno(status); - goto bail; + return status; } si->si_extended = ocfs2_uses_extended_slot_map(osb); From 43ee9cad8a81954eea893b52e08d0c00ca9baccc Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 14 Apr 2015 15:42:59 -0700 Subject: [PATCH 011/122] ocfs2: one function call less in user_cluster_connect() after error detection kfree() was called by user_cluster_connect() even if a previous call of the kzalloc() function failed. Return from this implementation directly after failure detection. 
Signed-off-by: Markus Elfring Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stack_user.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index c3b7807c65d6..2768eb1da2b8 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -1004,10 +1004,8 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) BUG_ON(conn == NULL); lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); - if (!lc) { - rc = -ENOMEM; - goto out; - } + if (!lc) + return -ENOMEM; init_waitqueue_head(&lc->oc_wait); init_completion(&lc->oc_sync_wait); From 7a8346429d6da4039a4687a8a07f3f8cdaf96d92 Mon Sep 17 00:00:00 2001 From: Daeseok Youn Date: Tue, 14 Apr 2015 15:43:02 -0700 Subject: [PATCH 012/122] ocfs2: avoid a pointless delay in o2cb_cluster_check() Fix an off-by-one when attempting to avoid an msleep() on the final loop iteration. Signed-off-by: Daeseok Youn Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stack_o2cb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 1724d43d3da1..220cae7bbdbc 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -295,7 +295,7 @@ static int o2cb_cluster_check(void) set_bit(node_num, netmap); if (!memcmp(hbmap, netmap, sizeof(hbmap))) return 0; - if (i < O2CB_MAP_STABILIZE_COUNT) + if (i < O2CB_MAP_STABILIZE_COUNT - 1) msleep(1000); } From bdd86215b393b743be34b854299c6cda7cbb361c Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 14 Apr 2015 15:43:05 -0700 Subject: [PATCH 013/122] ocfs2: fix a typing error in ocfs2_direct_IO_write Only when direct IO succeeds we need consider zeroing out in case of cluster not aligned. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 44db1808cdb5..0c2848a599c9 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -771,7 +771,7 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, if (ret < 0) mlog_errno(ret); } - } else if (written < 0 && append_write && !is_overwrite && + } else if (written > 0 && append_write && !is_overwrite && !cluster_align) { u32 p_cpos = 0; u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); From 7e9b19551c8249baf380cbd274633ee4af95bc99 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 14 Apr 2015 15:43:08 -0700 Subject: [PATCH 014/122] ocfs2: no need get dinode bh when zeroing extend Since di_bh won't be used when zeroing extend, set it to NULL. 
Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 0c2848a599c9..2a618dd2577d 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -706,7 +706,7 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, } if (append_write) { - ret = ocfs2_inode_lock(inode, &di_bh, 1); + ret = ocfs2_inode_lock(inode, NULL, 1); if (ret < 0) { mlog_errno(ret); goto clean_orphan; @@ -720,7 +720,6 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, if (ret < 0) { mlog_errno(ret); ocfs2_inode_unlock(inode, 1); - brelse(di_bh); goto clean_orphan; } @@ -728,13 +727,10 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, if (is_overwrite < 0) { mlog_errno(is_overwrite); ocfs2_inode_unlock(inode, 1); - brelse(di_bh); goto clean_orphan; } ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - di_bh = NULL; } written = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev, From 37a8d89aee2a5b58812e19520d96632efe987f54 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 14 Apr 2015 15:43:10 -0700 Subject: [PATCH 015/122] ocfs2: take inode lock when get clusters We need take inode lock when calling ocfs2_get_clusters. And use GFP_NOFS instead of GFP_KERNEL. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 2a618dd2577d..973a636285d1 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -772,10 +772,17 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, u32 p_cpos = 0; u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); + ret = ocfs2_inode_lock(inode, NULL, 0); + if (ret < 0) { + mlog_errno(ret); + goto clean_orphan; + } + ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters, &ext_flags); if (ret < 0) { mlog_errno(ret); + ocfs2_inode_unlock(inode, 0); goto clean_orphan; } @@ -783,9 +790,11 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, ret = blkdev_issue_zeroout(osb->sb->s_bdev, p_cpos << (osb->s_clustersize_bits - 9), - zero_len >> 9, GFP_KERNEL, false); + zero_len >> 9, GFP_NOFS, false); if (ret < 0) mlog_errno(ret); + + ocfs2_inode_unlock(inode, 0); } clean_orphan: From 14a5275d8c31ba24832f45eeb2469e835ded660d Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 14 Apr 2015 15:43:13 -0700 Subject: [PATCH 016/122] ocfs2: do not use ocfs2_zero_extend during direct IO In ocfs2_direct_IO_write, we use ocfs2_zero_extend to zero allocated clusters in case of cluster not aligned. But ocfs2_zero_extend uses page cache, this may happen that it clears the data which blockdev_direct_IO has already written. We should use blkdev_issue_zeroout instead of ocfs2_zero_extend during direct IO. So fix this issue by introducing ocfs2_direct_IO_zero_extend and ocfs2_direct_IO_extend_no_holes. 
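Both helpers zero the affected clusters directly on the block device, bypassing the page cache entirely. The core of each is a byte-to-sector conversion followed by blkdev_issue_zeroout() (sketch; the real code below also skips unwritten extents and walks multi-cluster holes):

        /* physical cluster -> 512-byte sectors, then zero 'zero_len' bytes */
        sector_t sector = p_cpos << (osb->s_clustersize_bits - 9);

        ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector,
                                   zero_len >> 9, GFP_NOFS, false);

so nothing is staged in the page cache that could later overwrite data already written by __blockdev_direct_IO().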
Reported-by: Yiwen Jiang Signed-off-by: Joseph Qi Tested-by: Yiwen Jiang Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 138 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 130 insertions(+), 8 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 973a636285d1..1b0463a92b17 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -663,6 +663,117 @@ static int ocfs2_is_overwrite(struct ocfs2_super *osb, return 0; } +static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb, + struct inode *inode, loff_t offset, + u64 zero_len, int cluster_align) +{ + u32 p_cpos = 0; + u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode)); + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + int ret = 0; + + if (offset <= i_size_read(inode) || cluster_align) + return 0; + + ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters, + &ext_flags); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { + u64 s = i_size_read(inode); + sector_t sector = (p_cpos << (osb->s_clustersize_bits - 9)) + + (do_div(s, osb->s_clustersize) >> 9); + + ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector, + zero_len >> 9, GFP_NOFS, false); + if (ret < 0) + mlog_errno(ret); + } + + return ret; +} + +static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb, + struct inode *inode, loff_t offset) +{ + u64 zero_start, zero_len, total_zero_len; + u32 p_cpos = 0, clusters_to_add; + u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode)); + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + u32 size_div, offset_div; + int ret = 0; + + { + u64 o = offset; + u64 s = i_size_read(inode); + + offset_div = do_div(o, osb->s_clustersize); + size_div = do_div(s, osb->s_clustersize); + } + + if (offset <= i_size_read(inode)) + return 0; + + clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) - + ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode)); + total_zero_len = offset - i_size_read(inode); + if (clusters_to_add) + total_zero_len -= offset_div; + + /* Allocate clusters to fill out holes, and this is only needed + * when we add more than one clusters. Otherwise the cluster will + * be allocated during direct IO */ + if (clusters_to_add > 1) { + ret = ocfs2_extend_allocation(inode, + OCFS2_I(inode)->ip_clusters, + clusters_to_add - 1, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + while (total_zero_len) { + ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters, + &ext_flags); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) + + size_div; + zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) - + size_div; + zero_len = min(total_zero_len, zero_len); + + if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { + ret = blkdev_issue_zeroout(osb->sb->s_bdev, + zero_start >> 9, zero_len >> 9, + GFP_NOFS, false); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + total_zero_len -= zero_len; + v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div); + + /* Only at first iteration can be cluster not aligned. 
+ * So set size_div to 0 for the rest */ + size_div = 0; + } + +out: + return ret; +} + static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) @@ -677,8 +788,8 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, struct buffer_head *di_bh = NULL; size_t count = iter->count; journal_t *journal = osb->journal->j_journal; - u32 zero_len; - int cluster_align; + u64 zero_len_head, zero_len_tail; + int cluster_align_head, cluster_align_tail; loff_t final_size = offset + count; int append_write = offset >= i_size_read(inode) ? 1 : 0; unsigned int num_clusters = 0; @@ -686,9 +797,16 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, { u64 o = offset; + u64 s = i_size_read(inode); - zero_len = do_div(o, 1 << osb->s_clustersize_bits); - cluster_align = !zero_len; + zero_len_head = do_div(o, 1 << osb->s_clustersize_bits); + cluster_align_head = !zero_len_head; + + zero_len_tail = osb->s_clustersize - + do_div(s, osb->s_clustersize); + if ((offset - i_size_read(inode)) < zero_len_tail) + zero_len_tail = offset - i_size_read(inode); + cluster_align_tail = !zero_len_tail; } /* @@ -712,10 +830,13 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, goto clean_orphan; } + /* zeroing out the previously allocated cluster tail + * that but not zeroed */ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) - ret = ocfs2_zero_extend(inode, di_bh, offset); + ret = ocfs2_direct_IO_zero_extend(osb, inode, offset, + zero_len_tail, cluster_align_tail); else - ret = ocfs2_extend_no_holes(inode, di_bh, offset, + ret = ocfs2_direct_IO_extend_no_holes(osb, inode, offset); if (ret < 0) { mlog_errno(ret); @@ -768,7 +889,8 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, mlog_errno(ret); } } else if (written > 0 && append_write && !is_overwrite && - !cluster_align) { + !cluster_align_head) { + /* zeroing out the allocated cluster head */ u32 p_cpos = 0; u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); @@ -790,7 +912,7 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, ret = blkdev_issue_zeroout(osb->sb->s_bdev, p_cpos << (osb->s_clustersize_bits - 9), - zero_len >> 9, GFP_NOFS, false); + zero_len_head >> 9, GFP_NOFS, false); if (ret < 0) mlog_errno(ret); From d0ba25b905ba1246d04578cd59df83014e9b9152 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 14 Apr 2015 15:43:16 -0700 Subject: [PATCH 017/122] ocfs2: fix typo in ocfs2_reserve_local_alloc_bits In ocfs2_reserve_local_alloc_bits, it calls ocfs2_error if local alloc inode bitmap used bits mismatch, but the log mistakes it as free bits. 
Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/localalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 044013455621..096cff6f9ba8 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -666,7 +666,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, if (le32_to_cpu(alloc->id1.bitmap1.i_used) != ocfs2_local_alloc_count_bits(alloc)) { ocfs2_error(osb->sb, "local alloc inode %llu says it has " - "%u free bits, but a count shows %u", + "%u used bits, but a count shows %u", (unsigned long long)le64_to_cpu(alloc->i_blkno), le32_to_cpu(alloc->id1.bitmap1.i_used), ocfs2_local_alloc_count_bits(alloc)); From e073fc58dfe6a4c9b614320c1d56bb71cb213ec4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 14 Apr 2015 15:43:19 -0700 Subject: [PATCH 018/122] ocfs2: dereferencing freed pointers in ocfs2_reflink() The code at the "out" label assumes that "default_acl" and "acl" are NULL, but actually the pointers can be NULL, unitialized, or freed. Signed-off-by: Dan Carpenter Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/refcounttree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index ee541f92dab4..df3a500789c7 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4276,7 +4276,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, error = posix_acl_create(dir, &mode, &default_acl, &acl); if (error) { mlog_errno(error); - goto out; + return error; } error = ocfs2_create_inode_in_orphan(dir, mode, From e38a573907383b3c57514fe2331322b1fc110eef Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 14 Apr 2015 15:43:22 -0700 Subject: [PATCH 019/122] ocfs2: use actual name length when find entry in ocfs2_orphan_del() If the namelen is 20 and name only has actual length 16, it will fail in ocfs2_find_entry because of mismatch. So use actual name length when find entry. Signed-off-by: Joseph Qi Signed-off-by: Yiwen Jiang Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index b5c3a5ea3ee6..49837404541e 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2322,10 +2322,10 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, trace_ocfs2_orphan_del( (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, - name, namelen); + name, strlen(name)); /* find it's spot in the orphan directory */ - status = ocfs2_find_entry(name, namelen, orphan_dir_inode, + status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode, &lookup); if (status) { mlog_errno(status); From 62f8b1f0d609838361d65b5b2114859d464e6baa Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 14 Apr 2015 15:43:24 -0700 Subject: [PATCH 020/122] ocfs2: use ENOENT instead of EEXIST when get system file fails When ocfs2_get_system_file_inode fails, it is obscure to set the return value to -EEXIST. So change it to -ENOENT. 
Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/inode.c | 4 ++-- fs/ocfs2/namei.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 3025c0da6b8a..be71ca0937f7 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -624,7 +624,7 @@ static int ocfs2_remove_inode(struct inode *inode, ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, le16_to_cpu(di->i_suballoc_slot)); if (!inode_alloc_inode) { - status = -EEXIST; + status = -ENOENT; mlog_errno(status); goto bail; } @@ -742,7 +742,7 @@ static int ocfs2_wipe_inode(struct inode *inode, ORPHAN_DIR_SYSTEM_INODE, orphaned_slot); if (!orphan_dir_inode) { - status = -EEXIST; + status = -ENOENT; mlog_errno(status); goto bail; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 49837404541e..09f90cbf0e24 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2808,7 +2808,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, ORPHAN_DIR_SYSTEM_INODE, osb->slot_num); if (!orphan_dir_inode) { - status = -EEXIST; + status = -ENOENT; mlog_errno(status); goto leave; } From a47726bcf299db6b5743d574df36c423263b4e65 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 14 Apr 2015 15:43:27 -0700 Subject: [PATCH 021/122] ocfs2: rollback the cleared bits if error occurs after ocfs2_block_group_clear_bits ocfs2_block_group_clear_bits will clear bits in block group bitmap. Once it succeeds but fails in the following step, it will cause block group bitmap mismatch the corresponding count recorded in dinode. So rollback the cleared bits if error occurs. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/suballoc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 0cb889a17ae1..4479029630bb 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -2499,6 +2499,8 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); + ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh, + start_bit, count); goto bail; } From 7c01ad8fe7c159d7e1ddbd8e586d4d0dfed7ab3d Mon Sep 17 00:00:00 2001 From: Daeseok Youn Date: Tue, 14 Apr 2015 15:43:30 -0700 Subject: [PATCH 022/122] ocfs2: remove goto statement in ocfs2_check_dir_for_entry() Signed-off-by: Daeseok Youn Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dir.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index b08050bd3f2e..2241a19b9335 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2047,22 +2047,19 @@ int ocfs2_check_dir_for_entry(struct inode *dir, const char *name, int namelen) { - int ret; + int ret = 0; struct ocfs2_dir_lookup_result lookup = { NULL, }; trace_ocfs2_check_dir_for_entry( (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); - ret = -EEXIST; - if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) - goto bail; + if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) { + ret = -EEXIST; + mlog_errno(ret); + } - ret = 0; -bail: ocfs2_free_dir_lookup_result(&lookup); - if (ret) - mlog_errno(ret); return ret; } From 023d4ea358494ccfeb37abfe5b0fd01b45a6051c Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 14 Apr 2015 15:43:33 -0700 Subject: [PATCH 023/122] ocfs2: fix possible 
uninitialized variable access In ocfs2_local_alloc_find_clear_bits and ocfs2_get_dentry, variable numfound and set may be uninitialized and then used in tracepoint. In ocfs2_xattr_block_get and ocfs2_delete_xattr_in_bucket, variable block_off and xv may be uninitialized and then used in the following logic due to unchecked return value. This patch fixes these possible issues. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/export.c | 2 +- fs/ocfs2/localalloc.c | 2 +- fs/ocfs2/xattr.c | 8 ++++++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 29651167190d..540dc4bdd042 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -82,7 +82,6 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, } status = ocfs2_test_inode_bit(osb, blkno, &set); - trace_ocfs2_get_dentry_test_bit(status, set); if (status < 0) { if (status == -EINVAL) { /* @@ -96,6 +95,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, goto unlock_nfs_sync; } + trace_ocfs2_get_dentry_test_bit(status, set); /* If the inode allocator bit is clear, this inode must be stale */ if (!set) { status = -ESTALE; diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 096cff6f9ba8..857bbbcd39f3 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -839,7 +839,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, u32 *numbits, struct ocfs2_alloc_reservation *resv) { - int numfound, bitoff, left, startoff, lastzero; + int numfound = 0, bitoff, left, startoff, lastzero; int local_resv = 0; struct ocfs2_alloc_reservation r; void *bitmap = NULL; diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 85b190dc132f..4ca7533be479 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1238,6 +1238,10 @@ static int ocfs2_xattr_block_get(struct inode *inode, i, &block_off, &name_offset); + if (ret) { + mlog_errno(ret); + goto cleanup; + } xs->base = bucket_block(xs->bucket, block_off); } if (ocfs2_xattr_is_local(xs->here)) { @@ -5665,6 +5669,10 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode, ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, i, &xv, NULL); + if (ret) { + mlog_errno(ret); + break; + } ret = ocfs2_lock_xattr_remove_allocators(inode, xv, args->ref_ci, From 762515a8e9c7ead55539cf96a63dec2363b1df50 Mon Sep 17 00:00:00 2001 From: Jakub Wilk Date: Tue, 14 Apr 2015 15:43:41 -0700 Subject: [PATCH 024/122] ocfs2: fix a typo in the copyright statement Signed-off-by: Jakub Wilk Reviewed-by: Eric Ren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 2241a19b9335..ccd4dcfc3645 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -18,7 +18,7 @@ * * linux/fs/minix/dir.c * - * Copyright (C) 1991, 1992 Linux Torvalds + * Copyright (C) 1991, 1992 Linus Torvalds * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public From e2ac55b6a8e337fac7cc59c6f452caac92ab5ee6 Mon Sep 17 00:00:00 2001 From: Chengyu Song Date: Tue, 14 Apr 2015 15:43:44 -0700 Subject: [PATCH 025/122] ocfs2: incorrect check for debugfs returns debugfs_create_dir and debugfs_create_file may return -ENODEV when debugfs is not configured, so the return value should be checked against ERROR_VALUE as well, otherwise the later dereference of the dentry pointer would 
crash the kernel. This patch tries to solve this problem by fixing certain checks. However, I have that found other call sites are protected by #ifdef CONFIG_DEBUG_FS. In current implementation, if CONFIG_DEBUG_FS is defined, then the above two functions will never return any ERROR_VALUE. So another possibility to fix this is to surround all the buggy checks/functions with the same #ifdef CONFIG_DEBUG_FS. But I'm not sure if this would break any functionality, as only OCFS2_FS_STATS declares dependency on DEBUG_FS. Signed-off-by: Chengyu Song Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/heartbeat.c | 42 ++++++++++++++++++++++++++---------- fs/ocfs2/dlmglue.c | 2 +- fs/ocfs2/super.c | 9 ++++---- 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 16eff45727ee..8e19b9d7aba8 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1312,7 +1312,9 @@ static int o2hb_debug_init(void) int ret = -ENOMEM; o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); - if (!o2hb_debug_dir) { + if (IS_ERR_OR_NULL(o2hb_debug_dir)) { + ret = o2hb_debug_dir ? + PTR_ERR(o2hb_debug_dir) : -ENOMEM; mlog_errno(ret); goto bail; } @@ -1325,7 +1327,9 @@ static int o2hb_debug_init(void) sizeof(o2hb_live_node_bitmap), O2NM_MAX_NODES, o2hb_live_node_bitmap); - if (!o2hb_debug_livenodes) { + if (IS_ERR_OR_NULL(o2hb_debug_livenodes)) { + ret = o2hb_debug_livenodes ? + PTR_ERR(o2hb_debug_livenodes) : -ENOMEM; mlog_errno(ret); goto bail; } @@ -1338,7 +1342,9 @@ static int o2hb_debug_init(void) sizeof(o2hb_live_region_bitmap), O2NM_MAX_REGIONS, o2hb_live_region_bitmap); - if (!o2hb_debug_liveregions) { + if (IS_ERR_OR_NULL(o2hb_debug_liveregions)) { + ret = o2hb_debug_liveregions ? + PTR_ERR(o2hb_debug_liveregions) : -ENOMEM; mlog_errno(ret); goto bail; } @@ -1352,7 +1358,9 @@ static int o2hb_debug_init(void) sizeof(o2hb_quorum_region_bitmap), O2NM_MAX_REGIONS, o2hb_quorum_region_bitmap); - if (!o2hb_debug_quorumregions) { + if (IS_ERR_OR_NULL(o2hb_debug_quorumregions)) { + ret = o2hb_debug_quorumregions ? + PTR_ERR(o2hb_debug_quorumregions) : -ENOMEM; mlog_errno(ret); goto bail; } @@ -1366,7 +1374,9 @@ static int o2hb_debug_init(void) sizeof(o2hb_failed_region_bitmap), O2NM_MAX_REGIONS, o2hb_failed_region_bitmap); - if (!o2hb_debug_failedregions) { + if (IS_ERR_OR_NULL(o2hb_debug_failedregions)) { + ret = o2hb_debug_failedregions ? + PTR_ERR(o2hb_debug_failedregions) : -ENOMEM; mlog_errno(ret); goto bail; } @@ -2000,7 +2010,8 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) reg->hr_debug_dir = debugfs_create_dir(config_item_name(®->hr_item), dir); - if (!reg->hr_debug_dir) { + if (IS_ERR_OR_NULL(reg->hr_debug_dir)) { + ret = reg->hr_debug_dir ? PTR_ERR(reg->hr_debug_dir) : -ENOMEM; mlog_errno(ret); goto bail; } @@ -2013,7 +2024,9 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) O2HB_DB_TYPE_REGION_LIVENODES, sizeof(reg->hr_live_node_bitmap), O2NM_MAX_NODES, reg); - if (!reg->hr_debug_livenodes) { + if (IS_ERR_OR_NULL(reg->hr_debug_livenodes)) { + ret = reg->hr_debug_livenodes ? 
+ PTR_ERR(reg->hr_debug_livenodes) : -ENOMEM; mlog_errno(ret); goto bail; } @@ -2025,7 +2038,9 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) sizeof(*(reg->hr_db_regnum)), O2HB_DB_TYPE_REGION_NUMBER, 0, O2NM_MAX_NODES, reg); - if (!reg->hr_debug_regnum) { + if (IS_ERR_OR_NULL(reg->hr_debug_regnum)) { + ret = reg->hr_debug_regnum ? + PTR_ERR(reg->hr_debug_regnum) : -ENOMEM; mlog_errno(ret); goto bail; } @@ -2037,7 +2052,9 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) sizeof(*(reg->hr_db_elapsed_time)), O2HB_DB_TYPE_REGION_ELAPSED_TIME, 0, 0, reg); - if (!reg->hr_debug_elapsed_time) { + if (IS_ERR_OR_NULL(reg->hr_debug_elapsed_time)) { + ret = reg->hr_debug_elapsed_time ? + PTR_ERR(reg->hr_debug_elapsed_time) : -ENOMEM; mlog_errno(ret); goto bail; } @@ -2049,13 +2066,16 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) sizeof(*(reg->hr_db_pinned)), O2HB_DB_TYPE_REGION_PINNED, 0, 0, reg); - if (!reg->hr_debug_pinned) { + if (IS_ERR_OR_NULL(reg->hr_debug_pinned)) { + ret = reg->hr_debug_pinned ? + PTR_ERR(reg->hr_debug_pinned) : -ENOMEM; mlog_errno(ret); goto bail; } - ret = 0; + return 0; bail: + debugfs_remove_recursive(reg->hr_debug_dir); return ret; } diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 11849a44dc5a..23adcbf374d3 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2954,7 +2954,7 @@ static int ocfs2_dlm_init_debug(struct ocfs2_super *osb) osb->osb_debug_root, osb, &ocfs2_dlm_debug_fops); - if (!dlm_debug->d_locking_state) { + if (IS_ERR_OR_NULL(dlm_debug->d_locking_state)) { ret = -EINVAL; mlog(ML_ERROR, "Unable to create locking state debugfs file.\n"); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 26675185b886..fb43de586791 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1112,7 +1112,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, ocfs2_debugfs_root); - if (!osb->osb_debug_root) { + if (IS_ERR_OR_NULL(osb->osb_debug_root)) { status = -EINVAL; mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); goto read_super_error; @@ -1122,7 +1122,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->osb_debug_root, osb, &ocfs2_osb_debug_fops); - if (!osb->osb_ctxt) { + if (IS_ERR_OR_NULL(osb->osb_ctxt)) { status = -EINVAL; mlog_errno(status); goto read_super_error; @@ -1606,8 +1606,9 @@ static int __init ocfs2_init(void) } ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); - if (!ocfs2_debugfs_root) { - status = -ENOMEM; + if (IS_ERR_OR_NULL(ocfs2_debugfs_root)) { + status = ocfs2_debugfs_root ? + PTR_ERR(ocfs2_debugfs_root) : -ENOMEM; mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); goto out4; } From 1543306e75ee40662b2c6d37c43c8659f3d6f880 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 14 Apr 2015 15:43:46 -0700 Subject: [PATCH 026/122] ocfs2: logging: remove static buffer, use vsprintf extension %pV Use the vsprintf %pV extension to avoid using a static buffer and remove the now unnecessary buffer. 
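With %pV the caller's format string and va_list are wrapped in a struct va_format and expanded by vsprintf() in place, so there is no intermediate buffer, no truncation limit, and no static buffer shared between concurrent callers:

        struct va_format vaf;
        va_list args;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;
        printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n",
               sb->s_id, function, &vaf);
        va_end(args);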
Signed-off-by: Joe Perches Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/super.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index fb43de586791..c558bff99165 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2564,22 +2564,22 @@ static void ocfs2_handle_error(struct super_block *sb) ocfs2_set_ro_flag(osb, 0); } -static char error_buf[1024]; - -void __ocfs2_error(struct super_block *sb, - const char *function, - const char *fmt, ...) +void __ocfs2_error(struct super_block *sb, const char *function, + const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - vsnprintf(error_buf, sizeof(error_buf), fmt, args); - va_end(args); + vaf.fmt = fmt; + vaf.va = &args; /* Not using mlog here because we want to show the actual * function the error came from. */ - printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n", - sb->s_id, function, error_buf); + printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); ocfs2_handle_error(sb); } @@ -2587,18 +2587,21 @@ void __ocfs2_error(struct super_block *sb, /* Handle critical errors. This is intentionally more drastic than * ocfs2_handle_error, so we only use for things like journal errors, * etc. */ -void __ocfs2_abort(struct super_block* sb, - const char *function, +void __ocfs2_abort(struct super_block *sb, const char *function, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - vsnprintf(error_buf, sizeof(error_buf), fmt, args); - va_end(args); - printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n", - sb->s_id, function, error_buf); + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); /* We don't have the cluster support yet to go straight to * hard readonly in here. Until then, we want to keep From 2f2eca20a09dac0e9d62bf57ce6a0c6ef6cf91e6 Mon Sep 17 00:00:00 2001 From: alex chen Date: Tue, 14 Apr 2015 15:43:49 -0700 Subject: [PATCH 027/122] ocfs2: check if the ocfs2 lock resource has been initialized before calling ocfs2_dlm_lock If ocfs2 lockres has not been initialized before calling ocfs2_dlm_lock, the lock won't be dropped and then will lead umount hung. The case is described below: ocfs2_mknod ocfs2_mknod_locked __ocfs2_mknod_locked ocfs2_journal_access_di Failed because of -ENOMEM or other reasons, the inode lockres has not been initialized yet. iput(inode) ocfs2_evict_inode ocfs2_delete_inode ocfs2_inode_lock ocfs2_inode_lock_full_nested __ocfs2_cluster_lock Succeeds and allocates a new dlm lockres. ocfs2_clear_inode ocfs2_open_unlock ocfs2_drop_inode_locks ocfs2_drop_lock Since lockres has not been initialized, the lock can't be dropped and the lockres can't be migrated, thus umount will hang forever. 
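The fix below is a simple flag check at the top of __ocfs2_cluster_lock() (sketch of the guard):

        if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) {
                mlog_errno(-EINVAL);
                return -EINVAL;
        }

so an uninitialized lockres can no longer reach ocfs2_dlm_lock() and leave behind a DLM lock that ocfs2_drop_lock() will never drop.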
Signed-off-by: Alex Chen Reviewed-by: Joseph Qi Reviewed-by: joyce.xue Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 23adcbf374d3..956edf67be20 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1391,6 +1391,11 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb, int noqueue_attempted = 0; int dlm_locked = 0; + if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) { + mlog_errno(-EINVAL); + return -EINVAL; + } + ocfs2_init_mask_waiter(&mw); if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) From 1d5b897706d11ac59fecd85ba7b1cbf91c44fe50 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 14 Apr 2015 15:43:52 -0700 Subject: [PATCH 028/122] ocfs2: make mlog_errno return the errno ocfs2 does mlog_errno(v); return v; in many places. Change mlog_errno() so we can do return mlog_errno(v); For some weird reason this patch reduces the size of ocfs2 by 6k: akpm3:/usr/src/25> size fs/ocfs2/ocfs2.ko text data bss dec hex filename 1146613 82767 832192 2061572 1f7504 fs/ocfs2/ocfs2.ko-before 1140857 82767 832192 2055816 1f5e88 fs/ocfs2/ocfs2.ko-after [dan.carpenter@oracle.com: double evaluation concerns in mlog_errno()] Cc: Mark Fasheh Cc: Joel Becker Cc: alex chen Signed-off-by: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/masklog.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 2260fb9e6508..7fdc25a4d8c0 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -196,13 +196,14 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits; } \ } while (0) -#define mlog_errno(st) do { \ +#define mlog_errno(st) ({ \ int _st = (st); \ if (_st != -ERESTARTSYS && _st != -EINTR && \ _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC && \ _st != -EDQUOT) \ mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ -} while (0) + _st; \ +}) #define mlog_bug_on_msg(cond, fmt, args...) do { \ if (cond) { \ From 84d56e66b9b4a646f04ec30696ca1aeea5e654d5 Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Tue, 14 Apr 2015 15:43:55 -0700 Subject: [PATCH 029/122] watchdog: new definitions and variables, initialization The hardlockup and softockup had always been tied together. Due to the request of KVM folks, they had a need to have one enabled but not the other. Internally rework the code to split things apart more cleanly. There is a bunch of churn here, but the end result should be code that should be easier to maintain and fix without knowing the internals of what is going on. This patch (of 9): Introduce new definitions and variables to separate the user interface in /proc/sys/kernel from the internal run state of the lockup detectors. The internal run state is represented by two bits in a new variable that is named 'watchdog_enabled'. This helps simplify the code, for example: - In order to check if any of the two lockup detectors is enabled, it is sufficient to check if 'watchdog_enabled' is not zero. - In order to enable/disable one or both lockup detectors, it is sufficient to set/clear one or both bits in 'watchdog_enabled'. - Concurrent updates of 'watchdog_enabled' need not be synchronized via a spinlock or a mutex. Updates can either be atomic or concurrency can be detected by using 'cmpxchg'. 
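For example (illustrative; the code that tests and updates these bits is added by later patches in this series):

        /* is any lockup detector enabled? */
        if (watchdog_enabled)
                ...

        /* is the hard lockup detector enabled? */
        if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
                ...

        /* enable only the soft lockup detector */
        watchdog_enabled = SOFT_WATCHDOG_ENABLED;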
Signed-off-by: Ulrich Obergfell Signed-off-by: Don Zickus Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nmi.h | 2 ++ kernel/watchdog.c | 27 ++++++++++++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 9b2022ab4d85..3885a7d12bd2 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -68,6 +68,8 @@ static inline bool trigger_allbutself_cpu_backtrace(void) #ifdef CONFIG_LOCKUP_DETECTOR int hw_nmi_is_cpu_stuck(struct pt_regs *); u64 hw_nmi_get_sample_period(int watchdog_thresh); +extern int nmi_watchdog_enabled; +extern int soft_watchdog_enabled; extern int watchdog_user_enabled; extern int watchdog_thresh; extern int sysctl_softlockup_all_cpu_backtrace; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 3174bf8e3538..9a1c78769a33 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -24,8 +24,33 @@ #include #include -int watchdog_user_enabled = 1; +/* + * The run state of the lockup detectors is controlled by the content of the + * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - + * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. + * + * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' + * are variables that are only used as an 'interface' between the parameters + * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The + * 'watchdog_thresh' variable is handled differently because its value is not + * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' + * is equal zero. + */ +#define NMI_WATCHDOG_ENABLED_BIT 0 +#define SOFT_WATCHDOG_ENABLED_BIT 1 +#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) +#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; +#else +static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; +#endif +int __read_mostly nmi_watchdog_enabled; +int __read_mostly soft_watchdog_enabled; +int __read_mostly watchdog_user_enabled; int __read_mostly watchdog_thresh = 10; + #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; #else From a0c9cbb93da9b9a0e00907a4a5d2e5f1fed86350 Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Tue, 14 Apr 2015 15:43:58 -0700 Subject: [PATCH 030/122] watchdog: introduce the proc_watchdog_update() function This series introduces a separate handler for each watchdog parameter in /proc/sys/kernel. The separate handlers need a common function that they can call to update the run state of the lockup detectors, or to have the lockup detectors use a new sample period. Signed-off-by: Ulrich Obergfell Signed-off-by: Don Zickus Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 9a1c78769a33..dcc4990097a2 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -678,6 +678,29 @@ static void watchdog_disable_all_cpus(void) } } +/* + * Update the run state of the lockup detectors. + */ +static int proc_watchdog_update(void) +{ + int err = 0; + + /* + * Watchdog threads won't be started if they are already active. + * The 'watchdog_running' variable in watchdog_*_all_cpus() takes + * care of this. 
If those threads are already active, the sample + * period will be updated and the lockup detectors will be enabled + * or disabled 'on the fly'. + */ + if (watchdog_enabled && watchdog_thresh) + err = watchdog_enable_all_cpus(true); + else + watchdog_disable_all_cpus(); + + return err; + +} + /* * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh */ From f54c2274f5515da6bae779c7340cd0dc69d0dd8d Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Tue, 14 Apr 2015 15:44:01 -0700 Subject: [PATCH 031/122] watchdog: move definition of 'watchdog_proc_mutex' outside of proc_dowatchdog() This series removes proc_dowatchdog(). Since multiple new functions need the 'watchdog_proc_mutex' to serialize access to the watchdog parameters in /proc/sys/kernel, move the mutex outside of any function. Signed-off-by: Ulrich Obergfell Signed-off-by: Don Zickus Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index dcc4990097a2..28c833b42124 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -701,6 +701,8 @@ static int proc_watchdog_update(void) } +static DEFINE_MUTEX(watchdog_proc_mutex); + /* * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh */ @@ -710,7 +712,6 @@ int proc_dowatchdog(struct ctl_table *table, int write, { int err, old_thresh, old_enabled; bool old_hardlockup; - static DEFINE_MUTEX(watchdog_proc_mutex); mutex_lock(&watchdog_proc_mutex); old_thresh = ACCESS_ONCE(watchdog_thresh); From ef246a216b02c604ff465b9a62bb0d2e1ea183a7 Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Tue, 14 Apr 2015 15:44:05 -0700 Subject: [PATCH 032/122] watchdog: introduce proc_watchdog_common() Three of four handlers for the watchdog parameters in /proc/sys/kernel essentially have to do the same thing. if the parameter is being read { return the state of the corresponding bit(s) in 'watchdog_enabled' } else { set/clear the state of the corresponding bit(s) in 'watchdog_enabled' update the run state of the lockup detector(s) } Hence, introduce a common function that can be called by those handlers. The callers pass a 'bit mask' to this function to indicate which bit(s) should be set/cleared in 'watchdog_enabled'. This function handles an uncommon race with watchdog_nmi_enable() where a concurrent update of 'watchdog_enabled' is possible. We use 'cmpxchg' to detect the concurrency. [This avoids introducing a new spinlock or a mutex to synchronize updates of 'watchdog_enabled'. Using the same lock or mutex in watchdog thread context and in system call context needs to be considered carefully because it can make the code prone to deadlock situations in connection with parking/unparking the watchdog threads.] 
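The concurrency handling reduces to a standard cmpxchg retry loop on the flag word (sketch):

        do {
                old = watchdog_enabled;
                new = *watchdog_param ? (old | which) : (old & ~which);
        } while (cmpxchg(&watchdog_enabled, old, new) != old);

If watchdog_nmi_enable() clears a bit between the load of 'old' and the cmpxchg, the compare fails and the loop re-reads the current value and retries, so neither update is lost.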
Signed-off-by: Ulrich Obergfell Signed-off-by: Don Zickus Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 28c833b42124..3600a01c97a9 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -703,6 +703,71 @@ static int proc_watchdog_update(void) static DEFINE_MUTEX(watchdog_proc_mutex); +/* + * common function for watchdog, nmi_watchdog and soft_watchdog parameter + * + * caller | table->data points to | 'which' contains the flag(s) + * -------------------|-----------------------|----------------------------- + * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed + * | | with SOFT_WATCHDOG_ENABLED + * -------------------|-----------------------|----------------------------- + * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED + * -------------------|-----------------------|----------------------------- + * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED + */ +static int proc_watchdog_common(int which, struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err, old, new; + int *watchdog_param = (int *)table->data; + + mutex_lock(&watchdog_proc_mutex); + + /* + * If the parameter is being read return the state of the corresponding + * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the + * run state of the lockup detectors. + */ + if (!write) { + *watchdog_param = (watchdog_enabled & which) != 0; + err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + } else { + err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (err) + goto out; + + /* + * There is a race window between fetching the current value + * from 'watchdog_enabled' and storing the new value. During + * this race window, watchdog_nmi_enable() can sneak in and + * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'. + * The 'cmpxchg' detects this race and the loop retries. + */ + do { + old = watchdog_enabled; + /* + * If the parameter value is not zero set the + * corresponding bit(s), else clear it(them). + */ + if (*watchdog_param) + new = old | which; + else + new = old & ~which; + } while (cmpxchg(&watchdog_enabled, old, new) != old); + + /* + * Update the run state of the lockup detectors. + * Restore 'watchdog_enabled' on failure. + */ + err = proc_watchdog_update(); + if (err) + watchdog_enabled = old; + } +out: + mutex_unlock(&watchdog_proc_mutex); + return err; +} + /* * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh */ From 83a80a39075a9ded23df1e26a4b617c289077630 Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Tue, 14 Apr 2015 15:44:08 -0700 Subject: [PATCH 033/122] watchdog: introduce separate handlers for parameters in /proc/sys/kernel Separate handlers for each watchdog parameter in /proc/sys/kernel replace the proc_dowatchdog() function. Three of those handlers merely call proc_watchdog_common() with one different argument. 
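The sysctl wiring itself is not part of this patch; an entry in the kernel/sysctl.c table would look roughly like this (illustrative sketch, range limits via .extra1/.extra2 omitted):

        {
                .procname       = "nmi_watchdog",
                .data           = &nmi_watchdog_enabled,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_nmi_watchdog,
        },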
Signed-off-by: Ulrich Obergfell Signed-off-by: Don Zickus Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nmi.h | 8 ++++++ kernel/watchdog.c | 59 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 3885a7d12bd2..5b5450585b8a 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -74,6 +74,14 @@ extern int watchdog_user_enabled; extern int watchdog_thresh; extern int sysctl_softlockup_all_cpu_backtrace; struct ctl_table; +extern int proc_watchdog(struct ctl_table *, int , + void __user *, size_t *, loff_t *); +extern int proc_nmi_watchdog(struct ctl_table *, int , + void __user *, size_t *, loff_t *); +extern int proc_soft_watchdog(struct ctl_table *, int , + void __user *, size_t *, loff_t *); +extern int proc_watchdog_thresh(struct ctl_table *, int , + void __user *, size_t *, loff_t *); extern int proc_dowatchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); #endif diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 3600a01c97a9..26002ed4c16e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -768,6 +768,65 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, return err; } +/* + * /proc/sys/kernel/watchdog + */ +int proc_watchdog(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED, + table, write, buffer, lenp, ppos); +} + +/* + * /proc/sys/kernel/nmi_watchdog + */ +int proc_nmi_watchdog(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_watchdog_common(NMI_WATCHDOG_ENABLED, + table, write, buffer, lenp, ppos); +} + +/* + * /proc/sys/kernel/soft_watchdog + */ +int proc_soft_watchdog(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_watchdog_common(SOFT_WATCHDOG_ENABLED, + table, write, buffer, lenp, ppos); +} + +/* + * /proc/sys/kernel/watchdog_thresh + */ +int proc_watchdog_thresh(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err, old; + + mutex_lock(&watchdog_proc_mutex); + + old = ACCESS_ONCE(watchdog_thresh); + err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (err || !write) + goto out; + + /* + * Update the sample period. + * Restore 'watchdog_thresh' on failure. + */ + set_sample_period(); + err = proc_watchdog_update(); + if (err) + watchdog_thresh = old; +out: + mutex_unlock(&watchdog_proc_mutex); + return err; +} + /* * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh */ From bcfba4f4bf3c9c7c72b459d52a9e826dfd72855e Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Tue, 14 Apr 2015 15:44:10 -0700 Subject: [PATCH 034/122] watchdog: implement error handling for failure to set up hardware perf events If watchdog_nmi_enable() fails to set up the hardware perf event of one CPU, the entire hard lockup detector is deemed unreliable. Hence, disable the hard lockup detector and shut down the hardware perf events on all CPUs. 
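The failure handling boils down to a simple pattern: the CPU that hits the error clears a shared enable bit, and every other instance notices the cleared bit on its next periodic pass and tears down its own event. A minimal userspace sketch of that pattern follows (illustration only; the names are invented, and the kernel code below uses clear_bit() plus a check in watchdog() that calls watchdog_nmi_disable()).

  #include <stdatomic.h>
  #include <stdbool.h>
  #include <stdio.h>

  #define HARDLOCKUP_ENABLED (1 << 0)

  static _Atomic int detector_flags = HARDLOCKUP_ENABLED;

  /* Called by the CPU whose perf event could not be created. */
  static void report_setup_failure(int cpu)
  {
          atomic_fetch_and(&detector_flags, ~HARDLOCKUP_ENABLED);
          printf("cpu%d: disabling hard lockup detector on all cpus\n", cpu);
  }

  /* Periodic per-CPU work: notice the cleared bit, release local state. */
  static void periodic_check(int cpu, bool *event_active)
  {
          if (*event_active &&
              !(atomic_load(&detector_flags) & HARDLOCKUP_ENABLED)) {
                  *event_active = false;  /* ~ watchdog_nmi_disable(cpu) */
                  printf("cpu%d: shut down local perf event\n", cpu);
          }
  }

  int main(void)
  {
          bool cpu1_event = true;

          report_setup_failure(0);        /* cpu0 failed to set up its event */
          periodic_check(1, &cpu1_event); /* cpu1 notices and cleans up */
          return 0;
  }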
[dzickus@redhat.com: update comments to explain some code] Signed-off-by: Ulrich Obergfell Signed-off-by: Don Zickus Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 26002ed4c16e..fd2b6dc14486 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -502,6 +502,21 @@ static void watchdog(unsigned int cpu) __this_cpu_write(soft_lockup_hrtimer_cnt, __this_cpu_read(hrtimer_interrupts)); __touch_watchdog(); + + /* + * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the + * failure path. Check for failures that can occur asynchronously - + * for example, when CPUs are on-lined - and shut down the hardware + * perf event on each CPU accordingly. + * + * The only non-obvious place this bit can be cleared is through + * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a + * pr_info here would be too noisy as it would result in a message + * every few seconds if the hardlockup was disabled but the softlockup + * enabled. + */ + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + watchdog_nmi_disable(cpu); } #ifdef CONFIG_HARDLOCKUP_DETECTOR @@ -552,6 +567,18 @@ static int watchdog_nmi_enable(unsigned int cpu) goto out_save; } + /* + * Disable the hard lockup detector if _any_ CPU fails to set up + * set up the hardware perf event. The watchdog() function checks + * the NMI_WATCHDOG_ENABLED bit periodically. + * + * The barriers are for syncing up watchdog_enabled across all the + * cpus, as clear_bit() does not use barriers. + */ + smp_mb__before_atomic(); + clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); + smp_mb__after_atomic(); + /* skip displaying the same error again */ if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) return PTR_ERR(event); @@ -565,6 +592,9 @@ static int watchdog_nmi_enable(unsigned int cpu) else pr_err("disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); + + pr_info("Shutting down hard lockup detector on all cpus\n"); + return PTR_ERR(event); /* success path */ From 195daf665a6299de98a4da3843fed2dd9de19d3a Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Tue, 14 Apr 2015 15:44:13 -0700 Subject: [PATCH 035/122] watchdog: enable the new user interface of the watchdog mechanism With the current user interface of the watchdog mechanism it is only possible to disable or enable both lockup detectors at the same time. This series introduces new kernel parameters and changes the semantics of some existing kernel parameters, so that the hard lockup detector and the soft lockup detector can be disabled or enabled individually. With this series applied, the user interface is as follows. - parameters in /proc/sys/kernel . soft_watchdog This is a new parameter to control and examine the run state of the soft lockup detector. . nmi_watchdog The semantics of this parameter have changed. It can now be used to control and examine the run state of the hard lockup detector. . watchdog This parameter is still available to control the run state of both lockup detectors at the same time. If this parameter is examined, it shows the logical OR of soft_watchdog and nmi_watchdog. . watchdog_thresh The semantics of this parameter are not affected by the patch. - kernel command line parameters . nosoftlockup The semantics of this parameter have changed. It can now be used to disable the soft lockup detector at boot time. . 
nmi_watchdog=0 or nmi_watchdog=1 Disable or enable the hard lockup detector at boot time. The patch introduces '=1' as a new option. . nowatchdog The semantics of this parameter are not affected by the patch. It is still available to disable both lockup detectors at boot time. Also, remove the proc_dowatchdog() function which is no longer needed. [dzickus@redhat.com: wrote changelog] [dzickus@redhat.com: update documentation for kernel params and sysctl] Signed-off-by: Ulrich Obergfell Signed-off-by: Don Zickus Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 6 ++- Documentation/sysctl/kernel.txt | 62 ++++++++++++++++++---- include/linux/nmi.h | 2 - kernel/sysctl.c | 35 +++++++++---- kernel/watchdog.c | 81 ++++++----------------------- 5 files changed, 97 insertions(+), 89 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 01aa47d3b6ab..71eecb263250 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2236,8 +2236,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nmi_watchdog= [KNL,BUGS=X86] Debugging features for SMP kernels Format: [panic,][nopanic,][num] - Valid num: 0 + Valid num: 0 or 1 0 - turn nmi_watchdog off + 1 - turn nmi_watchdog on When panic is specified, panic when an NMI watchdog timeout occurs (or 'nopanic' to override the opposite default). @@ -2464,7 +2465,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nousb [USB] Disable the USB subsystem - nowatchdog [KNL] Disable the lockup detector (NMI watchdog). + nowatchdog [KNL] Disable both lockup detectors, i.e. + soft-lockup and NMI watchdog (hard-lockup). nowb [ARM] diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 83ab25660fc9..99d7eb3a1416 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -77,12 +77,14 @@ show up in /proc/sys/kernel: - shmmax [ sysv ipc ] - shmmni - softlockup_all_cpu_backtrace +- soft_watchdog - stop-a [ SPARC only ] - sysrq ==> Documentation/sysrq.txt - sysctl_writes_strict - tainted - threads-max - unknown_nmi_panic +- watchdog - watchdog_thresh - version @@ -417,16 +419,23 @@ successful IPC object allocation. nmi_watchdog: -Enables/Disables the NMI watchdog on x86 systems. When the value is -non-zero the NMI watchdog is enabled and will continuously test all -online cpus to determine whether or not they are still functioning -properly. Currently, passing "nmi_watchdog=" parameter at boot time is -required for this function to work. +This parameter can be used to control the NMI watchdog +(i.e. the hard lockup detector) on x86 systems. -If LAPIC NMI watchdog method is in use (nmi_watchdog=2 kernel -parameter), the NMI watchdog shares registers with oprofile. By -disabling the NMI watchdog, oprofile may have more registers to -utilize. + 0 - disable the hard lockup detector + 1 - enable the hard lockup detector + +The hard lockup detector monitors each CPU for its ability to respond to +timer interrupts. The mechanism utilizes CPU performance counter registers +that are programmed to generate Non-Maskable Interrupts (NMIs) periodically +while a CPU is busy. Hence, the alternative name 'NMI watchdog'. + +The NMI watchdog is disabled by default if the kernel is running as a guest +in a KVM virtual machine. 
This default can be overridden by adding + + nmi_watchdog=1 + +to the guest kernel command line (see Documentation/kernel-parameters.txt). ============================================================== @@ -816,6 +825,22 @@ NMI. ============================================================== +soft_watchdog + +This parameter can be used to control the soft lockup detector. + + 0 - disable the soft lockup detector + 1 - enable the soft lockup detector + +The soft lockup detector monitors CPUs for threads that are hogging the CPUs +without rescheduling voluntarily, and thus prevent the 'watchdog/N' threads +from running. The mechanism depends on the CPUs ability to respond to timer +interrupts which are needed for the 'watchdog/N' threads to be woken up by +the watchdog timer function, otherwise the NMI watchdog - if enabled - can +detect a hard lockup condition. + +============================================================== + tainted: Non-zero if the kernel has been tainted. Numeric values, which @@ -858,6 +883,25 @@ example. If a system hangs up, try pressing the NMI switch. ============================================================== +watchdog: + +This parameter can be used to disable or enable the soft lockup detector +_and_ the NMI watchdog (i.e. the hard lockup detector) at the same time. + + 0 - disable both lockup detectors + 1 - enable both lockup detectors + +The soft lockup detector and the NMI watchdog can also be disabled or +enabled individually, using the soft_watchdog and nmi_watchdog parameters. +If the watchdog parameter is read, for example by executing + + cat /proc/sys/kernel/watchdog + +the output of this command (0 or 1) shows the logical OR of soft_watchdog +and nmi_watchdog. + +============================================================== + watchdog_thresh: This value can be used to control the frequency of hrtimer and NMI diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 5b5450585b8a..0426357297d5 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -82,8 +82,6 @@ extern int proc_soft_watchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); extern int proc_watchdog_thresh(struct ctl_table *, int , void __user *, size_t *, loff_t *); -extern int proc_dowatchdog(struct ctl_table *, int , - void __user *, size_t *, loff_t *); #endif #ifdef CONFIG_HAVE_ACPI_APEI_NMI diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ce410bb9f2e1..245e7dcc3741 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -846,7 +846,7 @@ static struct ctl_table kern_table[] = { .data = &watchdog_user_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = proc_dowatchdog, + .proc_handler = proc_watchdog, .extra1 = &zero, .extra2 = &one, }, @@ -855,10 +855,32 @@ static struct ctl_table kern_table[] = { .data = &watchdog_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dowatchdog, + .proc_handler = proc_watchdog_thresh, .extra1 = &zero, .extra2 = &sixty, }, + { + .procname = "nmi_watchdog", + .data = &nmi_watchdog_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_nmi_watchdog, + .extra1 = &zero, +#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) + .extra2 = &one, +#else + .extra2 = &zero, +#endif + }, + { + .procname = "soft_watchdog", + .data = &soft_watchdog_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_soft_watchdog, + .extra1 = &zero, + .extra2 = &one, + }, { .procname = "softlockup_panic", .data = &softlockup_panic, @@ -879,15 +901,6 @@ static 
struct ctl_table kern_table[] = { .extra2 = &one, }, #endif /* CONFIG_SMP */ - { - .procname = "nmi_watchdog", - .data = &watchdog_user_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dowatchdog, - .extra1 = &zero, - .extra2 = &one, - }, #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { diff --git a/kernel/watchdog.c b/kernel/watchdog.c index fd2b6dc14486..63d702885686 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -110,15 +110,9 @@ static int __init hardlockup_panic_setup(char *str) else if (!strncmp(str, "nopanic", 7)) hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) - watchdog_user_enabled = 0; - else if (!strncmp(str, "1", 1) || !strncmp(str, "2", 1)) { - /* - * Setting 'nmi_watchdog=1' or 'nmi_watchdog=2' (legacy option) - * has the same effect. - */ - watchdog_user_enabled = 1; - watchdog_enable_hardlockup_detector(true); - } + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + else if (!strncmp(str, "1", 1)) + watchdog_enabled |= NMI_WATCHDOG_ENABLED; return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); @@ -137,19 +131,18 @@ __setup("softlockup_panic=", softlockup_panic_setup); static int __init nowatchdog_setup(char *str) { - watchdog_user_enabled = 0; + watchdog_enabled = 0; return 1; } __setup("nowatchdog", nowatchdog_setup); -/* deprecated */ static int __init nosoftlockup_setup(char *str) { - watchdog_user_enabled = 0; + watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED; return 1; } __setup("nosoftlockup", nosoftlockup_setup); -/* */ + #ifdef CONFIG_SMP static int __init softlockup_all_cpu_backtrace_setup(char *str) { @@ -264,10 +257,11 @@ static int is_softlockup(unsigned long touch_ts) { unsigned long now = get_timestamp(); - /* Warn about unreasonable delays: */ - if (time_after(now, touch_ts + get_softlockup_thresh())) - return now - touch_ts; - + if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) { + /* Warn about unreasonable delays. */ + if (time_after(now, touch_ts + get_softlockup_thresh())) + return now - touch_ts; + } return 0; } @@ -532,6 +526,10 @@ static int watchdog_nmi_enable(unsigned int cpu) struct perf_event_attr *wd_attr; struct perf_event *event = per_cpu(watchdog_ev, cpu); + /* nothing to do if the hard lockup detector is disabled */ + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + goto out; + /* * Some kernels need to default hard lockup detection to * 'disabled', for example a guest on a hypervisor. @@ -856,59 +854,12 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, mutex_unlock(&watchdog_proc_mutex); return err; } - -/* - * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh - */ - -int proc_dowatchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int err, old_thresh, old_enabled; - bool old_hardlockup; - - mutex_lock(&watchdog_proc_mutex); - old_thresh = ACCESS_ONCE(watchdog_thresh); - old_enabled = ACCESS_ONCE(watchdog_user_enabled); - old_hardlockup = watchdog_hardlockup_detector_is_enabled(); - - err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (err || !write) - goto out; - - set_sample_period(); - /* - * Watchdog threads shouldn't be enabled if they are - * disabled. The 'watchdog_running' variable check in - * watchdog_*_all_cpus() function takes care of this. - */ - if (watchdog_user_enabled && watchdog_thresh) { - /* - * Prevent a change in watchdog_thresh accidentally overriding - * the enablement of the hardlockup detector. 
- */ - if (watchdog_user_enabled != old_enabled) - watchdog_enable_hardlockup_detector(true); - err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); - } else - watchdog_disable_all_cpus(); - - /* Restore old values on failure */ - if (err) { - watchdog_thresh = old_thresh; - watchdog_user_enabled = old_enabled; - watchdog_enable_hardlockup_detector(old_hardlockup); - } -out: - mutex_unlock(&watchdog_proc_mutex); - return err; -} #endif /* CONFIG_SYSCTL */ void __init lockup_detector_init(void) { set_sample_period(); - if (watchdog_user_enabled) + if (watchdog_enabled) watchdog_enable_all_cpus(false); } From b2f57c3a0df9d168220be8848a303a32ef4d2e7d Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Tue, 14 Apr 2015 15:44:16 -0700 Subject: [PATCH 036/122] watchdog: clean up some function names and arguments Rename the update_timers*() functions to update_watchdog*(). Remove the boolean argument from watchdog_enable_all_cpus() because update_watchdog_all_cpus() is now a generic function to change the run state of the lockup detectors and to have the lockup detectors use a new sample period. Signed-off-by: Ulrich Obergfell Signed-off-by: Don Zickus Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 63d702885686..49d02250aaac 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -653,7 +653,7 @@ static void restart_watchdog_hrtimer(void *info) HRTIMER_MODE_REL_PINNED); } -static void update_timers(int cpu) +static void update_watchdog(int cpu) { /* * Make sure that perf event counter will adopt to a new @@ -668,17 +668,17 @@ static void update_timers(int cpu) watchdog_nmi_enable(cpu); } -static void update_timers_all_cpus(void) +static void update_watchdog_all_cpus(void) { int cpu; get_online_cpus(); for_each_online_cpu(cpu) - update_timers(cpu); + update_watchdog(cpu); put_online_cpus(); } -static int watchdog_enable_all_cpus(bool sample_period_changed) +static int watchdog_enable_all_cpus(void) { int err = 0; @@ -688,8 +688,12 @@ static int watchdog_enable_all_cpus(bool sample_period_changed) pr_err("Failed to create watchdog threads, disabled\n"); else watchdog_running = 1; - } else if (sample_period_changed) { - update_timers_all_cpus(); + } else { + /* + * Enable/disable the lockup detectors or + * change the sample period 'on the fly'. + */ + update_watchdog_all_cpus(); } return err; @@ -721,7 +725,7 @@ static int proc_watchdog_update(void) * or disabled 'on the fly'. */ if (watchdog_enabled && watchdog_thresh) - err = watchdog_enable_all_cpus(true); + err = watchdog_enable_all_cpus(); else watchdog_disable_all_cpus(); @@ -861,5 +865,5 @@ void __init lockup_detector_init(void) set_sample_period(); if (watchdog_enabled) - watchdog_enable_all_cpus(false); + watchdog_enable_all_cpus(); } From 692297d8f96887f836d9049a653ed05a71cf48fb Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Tue, 14 Apr 2015 15:44:19 -0700 Subject: [PATCH 037/122] watchdog: introduce the hardlockup_detector_disable() function Have kvm_guest_init() use hardlockup_detector_disable() instead of watchdog_enable_hardlockup_detector(false). Remove the watchdog_hardlockup_detector_is_enabled() and the watchdog_enable_hardlockup_detector() function which are no longer needed. 
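The nmi.h hunk below follows the usual kernel convention for config-gated APIs: a real declaration when the feature is built in, and a static inline no-op otherwise, so callers such as kvm_guest_init() need no #ifdef. A generic sketch of that convention, with purely illustrative names:

  /* feature.h -- illustrative header, not part of the patch */
  #ifndef FEATURE_H
  #define FEATURE_H

  #ifdef CONFIG_MY_FEATURE
  void my_feature_disable(void);                   /* real version elsewhere */
  #else
  static inline void my_feature_disable(void) { }  /* compiles away */
  #endif

  #endif /* FEATURE_H */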
Signed-off-by: Ulrich Obergfell Signed-off-by: Don Zickus Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/kvm.c | 2 +- include/linux/nmi.h | 9 ++------- kernel/watchdog.c | 21 ++------------------- 3 files changed, 5 insertions(+), 27 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index e354cc6446ab..9435620062df 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -513,7 +513,7 @@ void __init kvm_guest_init(void) * can get false positives too easily, for example if the host is * overcommitted. */ - watchdog_enable_hardlockup_detector(false); + hardlockup_detector_disable(); } static noinline uint32_t __kvm_cpuid_base(void) diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 0426357297d5..3d46fb4708e0 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -25,16 +25,11 @@ static inline void touch_nmi_watchdog(void) #endif #if defined(CONFIG_HARDLOCKUP_DETECTOR) -extern void watchdog_enable_hardlockup_detector(bool val); -extern bool watchdog_hardlockup_detector_is_enabled(void); +extern void hardlockup_detector_disable(void); #else -static inline void watchdog_enable_hardlockup_detector(bool val) +static inline void hardlockup_detector_disable(void) { } -static inline bool watchdog_hardlockup_detector_is_enabled(void) -{ - return true; -} #endif /* diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 49d02250aaac..f2be11ab7e08 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -83,8 +83,6 @@ static unsigned long soft_lockup_nmi_warn; #ifdef CONFIG_HARDLOCKUP_DETECTOR static int hardlockup_panic = CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; - -static bool hardlockup_detector_enabled = true; /* * We may not want to enable hard lockup detection by default in all cases, * for example when running the kernel as a guest on a hypervisor. In these @@ -93,14 +91,9 @@ static bool hardlockup_detector_enabled = true; * kernel command line parameters are parsed, because otherwise it is not * possible to override this in hardlockup_panic_setup(). */ -void watchdog_enable_hardlockup_detector(bool val) +void hardlockup_detector_disable(void) { - hardlockup_detector_enabled = val; -} - -bool watchdog_hardlockup_detector_is_enabled(void) -{ - return hardlockup_detector_enabled; + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; } static int __init hardlockup_panic_setup(char *str) @@ -530,15 +523,6 @@ static int watchdog_nmi_enable(unsigned int cpu) if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) goto out; - /* - * Some kernels need to default hard lockup detection to - * 'disabled', for example a guest on a hypervisor. - */ - if (!watchdog_hardlockup_detector_is_enabled()) { - event = ERR_PTR(-ENOENT); - goto handle_err; - } - /* is it already setup and enabled? 
*/ if (event && event->state > PERF_EVENT_STATE_OFF) goto out; @@ -553,7 +537,6 @@ static int watchdog_nmi_enable(unsigned int cpu) /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); -handle_err: /* save cpu0 error for future comparision */ if (cpu == 0 && IS_ERR(event)) cpu0_err = PTR_ERR(event); From ef2a5153b4d2c48c05b9280491cb5592a46df385 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 14 Apr 2015 15:44:22 -0700 Subject: [PATCH 038/122] mm/migrate: mark unmap_and_move() "noinline" to avoid ICE in gcc 4.7.3 With gcc version 4.7.3 (Ubuntu/Linaro 4.7.3-12ubuntu1) : mm/migrate.c: In function `migrate_pages': mm/migrate.c:1148:1: internal compiler error: in push_minipool_fix, at config/arm/arm.c:13500 Please submit a full bug report, with preprocessed source if appropriate. See for instructions. Preprocessed source stored into /tmp/ccPoM1tr.out file, please attach this to your bugreport. make[1]: *** [mm/migrate.o] Error 1 make: *** [mm/migrate.o] Error 2 Mark unmap_and_move() (which is used in a single place only) "noinline" to work around this compiler bug. [akpm@linux-foundation.org: make it conditional on gcc-4.7.3 and arm] [khilman@kernel.org: fine-tune compiler versions] [akpm@linux-foundation.org: fix comment] Signed-off-by: Geert Uytterhoeven Reported-by: Kevin Hilman Cc: Marc Zyngier Tested-by: Kevin Hilman Tested-by: Lina Iyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 85e042686031..ec1802d85f05 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -900,13 +900,24 @@ static int __unmap_and_move(struct page *page, struct page *newpage, return rc; } +/* + * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work + * around it. + */ +#if (GCC_VERSION >= 40700 && GCC_VERSION < 40900) && defined(CONFIG_ARM) +#define ICE_noinline noinline +#else +#define ICE_noinline +#endif + /* * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. */ -static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, - unsigned long private, struct page *page, int force, - enum migrate_mode mode) +static ICE_noinline int unmap_and_move(new_page_t get_new_page, + free_page_t put_new_page, + unsigned long private, struct page *page, + int force, enum migrate_mode mode) { int rc = 0; int *result = NULL; From 08303a73c62a1cc83c613fa1965f5127ba030c46 Mon Sep 17 00:00:00 2001 From: Chris J Arges Date: Tue, 14 Apr 2015 15:44:25 -0700 Subject: [PATCH 039/122] mm/slub.c: parse slub_debug O option in switch statement By moving the O option detection into the switch statement, we allow this parameter to be combined with other options correctly. Previously options like slub_debug=OFZ would only detect the 'o' and use DEBUG_DEFAULT_FLAGS to fill in the rest of the flags. 
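The reason the move matters is that the option string is parsed one character at a time, so a modifier such as 'O' has to be handled in the same switch as the flag characters in order to combine with them. A standalone sketch of that style of parser (illustration only, with simplified stand-in flag values):

  #include <ctype.h>
  #include <stdio.h>

  #define FLAG_F 0x1   /* stand-in for SLAB_DEBUG_FREE */
  #define FLAG_Z 0x2   /* stand-in for SLAB_RED_ZONE   */

  static unsigned int debug_flags;
  static int disable_higher_order_debug;

  static void parse_debug_opts(const char *str)
  {
          for (; *str && *str != ','; str++) {
                  switch (tolower((unsigned char)*str)) {
                  case 'f':
                          debug_flags |= FLAG_F;
                          break;
                  case 'z':
                          debug_flags |= FLAG_Z;
                          break;
                  case 'o':
                          /* Modifier combines with flags before or after it. */
                          disable_higher_order_debug = 1;
                          break;
                  default:
                          fprintf(stderr, "option '%c' unknown. skipped\n", *str);
                  }
          }
  }

  int main(void)
  {
          parse_debug_opts("OFZ");
          printf("flags=%#x disable_higher_order_debug=%d\n",
                 debug_flags, disable_higher_order_debug);
          return 0;
  }

With the old code, a leading 'O' in "OFZ" short-circuited parsing and the remaining flags fell back to the defaults; with the switch-based handling above, "OFZ" yields both flags plus the modifier.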
Signed-off-by: Chris J Arges Cc: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 82c473780c91..d01f9126fcf2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1137,15 +1137,6 @@ static int __init setup_slub_debug(char *str) */ goto check_slabs; - if (tolower(*str) == 'o') { - /* - * Avoid enabling debugging on caches if its minimum order - * would increase as a result. - */ - disable_higher_order_debug = 1; - goto out; - } - slub_debug = 0; if (*str == '-') /* @@ -1176,6 +1167,13 @@ static int __init setup_slub_debug(char *str) case 'a': slub_debug |= SLAB_FAILSLAB; break; + case 'o': + /* + * Avoid enabling debugging on caches if its minimum + * order would increase as a result. + */ + disable_higher_order_debug = 1; + break; default: pr_err("slub_debug option '%c' unknown. skipped\n", *str); From 124dee09f0669b92cc0073b00984d53541ca0884 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2015 15:44:28 -0700 Subject: [PATCH 040/122] mm, slab: correct config option in comment CONFIG_SLAB_DEBUG doesn't exist, CONFIG_DEBUG_SLAB does. Signed-off-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 76f1feeabd38..ffd24c830151 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -18,7 +18,7 @@ /* * Flags to pass to kmem_cache_create(). - * The ones marked DEBUG are only valid if CONFIG_SLAB_DEBUG is set. + * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set. 
*/ #define SLAB_DEBUG_FREE 0x00000100UL /* DEBUG: Perform (expensive) checks on free */ #define SLAB_RED_ZONE 0x00000400UL /* DEBUG: Red zone objs in a cache */ From 6f6528a1632cb9661a2ff46e217b07d84a80eff6 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 14 Apr 2015 15:44:31 -0700 Subject: [PATCH 041/122] slub: use bool function return values of true/false not 1/0 Use the normal return values for bool functions Signed-off-by: Joe Perches Cc: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index d01f9126fcf2..0fdd6c1e1f82 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -374,7 +374,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, freelist_new, counters_new)) - return 1; + return true; } else #endif { @@ -384,7 +384,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page page->freelist = freelist_new; set_page_slub_counters(page, counters_new); slab_unlock(page); - return 1; + return true; } slab_unlock(page); } @@ -396,7 +396,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page pr_info("%s %s: cmpxchg double redo ", n, s->name); #endif - return 0; + return false; } static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, @@ -410,7 +410,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, freelist_new, counters_new)) - return 1; + return true; } else #endif { @@ -424,7 +424,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, set_page_slub_counters(page, counters_new); slab_unlock(page); local_irq_restore(flags); - return 1; + return true; } slab_unlock(page); local_irq_restore(flags); @@ -437,7 +437,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, pr_info("%s %s: cmpxchg double redo ", n, s->name); #endif - return 0; + return false; } #ifdef CONFIG_SLUB_DEBUG From c21a6daf466a7bfa7bc2ac594837a1ce793a7960 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 14 Apr 2015 15:44:34 -0700 Subject: [PATCH 042/122] slob: make slob_alloc_node() static and remove EXPORT_SYMBOL() slob_alloc_node() is only used in slob.c. Remove the EXPORT_SYMBOL and make slob_alloc_node() static. Signed-off-by: Fabian Frederick Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slob.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/slob.c b/mm/slob.c index 94a7fede6d48..4765f65019c7 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -532,7 +532,7 @@ int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) return 0; } -void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) +static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) { void *b; @@ -558,7 +558,6 @@ void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); return b; } -EXPORT_SYMBOL(slob_alloc_node); void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { From 84d33df279e0380995b0e03fb8aad04cef2bc29f Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Tue, 14 Apr 2015 15:44:37 -0700 Subject: [PATCH 043/122] mm: rename FOLL_MLOCK to FOLL_POPULATE After commit a1fde08c74e9 ("VM: skip the stack guard page lookup in get_user_pages only for mlock") FOLL_MLOCK has lost its original meaning: we don't necessarily mlock the page if the flags is set -- we also take VM_LOCKED into consideration. Since we use the same codepath for __mm_populate(), let's rename FOLL_MLOCK to FOLL_POPULATE. Signed-off-by: Kirill A. Shutemov Acked-by: Linus Torvalds Acked-by: David Rientjes Cc: Michel Lespinasse Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/gup.c | 6 +++--- mm/huge_memory.c | 2 +- mm/mlock.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 47a93928b90f..cccbbba12b9d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2109,7 +2109,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma, #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ #define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO * and return without waiting upon it */ -#define FOLL_MLOCK 0x40 /* mark page as mlocked */ +#define FOLL_POPULATE 0x40 /* fault in page */ #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ diff --git a/mm/gup.c b/mm/gup.c index a6e24e246f86..1b114ba9aebf 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -92,7 +92,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, */ mark_page_accessed(page); } - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { /* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE @@ -265,8 +265,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, unsigned int fault_flags = 0; int ret; - /* For mlock, just skip the stack guard page. */ - if ((*flags & FOLL_MLOCK) && + /* For mm_populate(), just skip the stack guard page. */ + if ((*flags & FOLL_POPULATE) && (stack_guard_page_start(vma, address) || stack_guard_page_end(vma, address + PAGE_SIZE))) return -ENOENT; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6817b0350c71..10a4b6cea0d1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1231,7 +1231,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, pmd, _pmd, 1)) update_mmu_cache_pmd(vma, addr, pmd); } - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { if (page->mapping && trylock_page(page)) { lru_add_drain(); if (page->mapping) diff --git a/mm/mlock.c b/mm/mlock.c index 8a54cd214925..f756e28b33fc 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -237,7 +237,7 @@ long __mlock_vma_pages_range(struct vm_area_struct *vma, VM_BUG_ON_VMA(end > vma->vm_end, vma); VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); - gup_flags = FOLL_TOUCH | FOLL_MLOCK; + gup_flags = FOLL_TOUCH | FOLL_POPULATE; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW From fc05f566210fa57f8e68ead8762b8dbb3f1c61e3 Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Tue, 14 Apr 2015 15:44:39 -0700 Subject: [PATCH 044/122] mm: rename __mlock_vma_pages_range() to populate_vma_page_range() __mlock_vma_pages_range() doesn't necessarily mlock pages. It depends on vma flags. The same codepath is used for MAP_POPULATE. Let's rename __mlock_vma_pages_range() to populate_vma_page_range(). This patch also drops mlock_vma_pages_range() references from documentation. It has gone in cea10a19b797 ("mm: directly use __mlock_vma_pages_range() in find_extend_vma()"). Signed-off-by: Kirill A. Shutemov Acked-by: Linus Torvalds Acked-by: David Rientjes Cc: Michel Lespinasse Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/unevictable-lru.txt | 24 +++++++----------------- mm/internal.h | 2 +- mm/mlock.c | 12 ++++++------ mm/mmap.c | 4 ++-- 4 files changed, 16 insertions(+), 26 deletions(-) diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt index 744f82f86c58..86cb4624fc5a 100644 --- a/Documentation/vm/unevictable-lru.txt +++ b/Documentation/vm/unevictable-lru.txt @@ -317,7 +317,7 @@ If the VMA passes some filtering as described in "Filtering Special Vmas" below, mlock_fixup() will attempt to merge the VMA with its neighbors or split off a subset of the VMA if the range does not cover the entire VMA. Once the VMA has been merged or split or neither, mlock_fixup() will call -__mlock_vma_pages_range() to fault in the pages via get_user_pages() and to +populate_vma_page_range() to fault in the pages via get_user_pages() and to mark the pages as mlocked via mlock_vma_page(). Note that the VMA being mlocked might be mapped with PROT_NONE. In this case, @@ -327,7 +327,7 @@ fault path or in vmscan. Also note that a page returned by get_user_pages() could be truncated or migrated out from under us, while we're trying to mlock it. To detect this, -__mlock_vma_pages_range() checks page_mapping() after acquiring the page lock. +populate_vma_page_range() checks page_mapping() after acquiring the page lock. If the page is still associated with its mapping, we'll go ahead and call mlock_vma_page(). If the mapping is gone, we just unlock the page and move on. In the worst case, this will result in a page mapped in a VM_LOCKED VMA @@ -392,7 +392,7 @@ ignored for munlock. If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the specified range. The range is then munlocked via the function -__mlock_vma_pages_range() - the same function used to mlock a VMA range - +populate_vma_page_range() - the same function used to mlock a VMA range - passing a flag to indicate that munlock() is being performed. Because the VMA access protections could have been changed to PROT_NONE after @@ -402,7 +402,7 @@ get_user_pages() was enhanced to accept a flag to ignore the permissions when fetching the pages - all of which should be resident as a result of previous mlocking. -For munlock(), __mlock_vma_pages_range() unlocks individual pages by calling +For munlock(), populate_vma_page_range() unlocks individual pages by calling munlock_vma_page(). munlock_vma_page() unconditionally clears the PG_mlocked flag using TestClearPageMlocked(). As with mlock_vma_page(), munlock_vma_page() use the Test*PageMlocked() function to handle the case where @@ -463,21 +463,11 @@ populate the page table. 
To mlock a range of memory under the unevictable/mlock infrastructure, the mmap() handler and task address space expansion functions call -mlock_vma_pages_range() specifying the vma and the address range to mlock. -mlock_vma_pages_range() filters VMAs like mlock_fixup(), as described above in -"Filtering Special VMAs". It will clear the VM_LOCKED flag, which will have -already been set by the caller, in filtered VMAs. Thus these VMA's need not be -visited for munlock when the region is unmapped. +populate_vma_page_range() specifying the vma and the address range to mlock. -For "normal" VMAs, mlock_vma_pages_range() calls __mlock_vma_pages_range() to -fault/allocate the pages and mlock them. Again, like mlock_fixup(), -mlock_vma_pages_range() downgrades the mmap semaphore to read mode before -attempting to fault/allocate and mlock the pages and "upgrades" the semaphore -back to write mode before returning. - -The callers of mlock_vma_pages_range() will have already added the memory range +The callers of populate_vma_page_range() will have already added the memory range to be mlocked to the task's "locked_vm". To account for filtered VMAs, -mlock_vma_pages_range() returns the number of pages NOT mlocked. All of the +populate_vma_page_range() returns the number of pages NOT mlocked. All of the callers then subtract a non-negative return value from the task's locked_vm. A negative return value represent an error - for example, from get_user_pages() attempting to fault in a VMA with PROT_NONE access. In this case, we leave the diff --git a/mm/internal.h b/mm/internal.h index a96da5b0029d..7df78a5269f3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -240,7 +240,7 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent); #ifdef CONFIG_MMU -extern long __mlock_vma_pages_range(struct vm_area_struct *vma, +extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *nonblocking); extern void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); diff --git a/mm/mlock.c b/mm/mlock.c index f756e28b33fc..9d0f3cd716c5 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -206,13 +206,13 @@ unsigned int munlock_vma_page(struct page *page) } /** - * __mlock_vma_pages_range() - mlock a range of pages in the vma. + * populate_vma_page_range() - populate a range of pages in the vma. * @vma: target vma * @start: start address * @end: end address * @nonblocking: * - * This takes care of making the pages present too. + * This takes care of mlocking the pages too if VM_LOCKED is set. * * return 0 on success, negative error code on error. * @@ -224,7 +224,7 @@ unsigned int munlock_vma_page(struct page *page) * If @nonblocking is non-NULL, it must held for read only and may be * released. If it's released, *@nonblocking will be set to 0. */ -long __mlock_vma_pages_range(struct vm_area_struct *vma, +long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *nonblocking) { struct mm_struct *mm = vma->vm_mm; @@ -596,7 +596,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, /* * vm_flags is protected by the mmap_sem held in write mode. * It's okay if try_to_unmap_one unmaps a page just after we - * set VM_LOCKED, __mlock_vma_pages_range will bring it back. + * set VM_LOCKED, populate_vma_page_range will bring it back. 
*/ if (lock) @@ -702,11 +702,11 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) if (nstart < vma->vm_start) nstart = vma->vm_start; /* - * Now fault in a range of pages. __mlock_vma_pages_range() + * Now fault in a range of pages. populate_vma_page_range() * double checks the vma flags, so that it won't mlock pages * if the vma was already munlocked. */ - ret = __mlock_vma_pages_range(vma, nstart, nend, &locked); + ret = populate_vma_page_range(vma, nstart, nend, &locked); if (ret < 0) { if (ignore_errors) { ret = 0; diff --git a/mm/mmap.c b/mm/mmap.c index 9ec50a368634..06a6076c92e5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2316,7 +2316,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) if (!prev || expand_stack(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) - __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); + populate_vma_page_range(prev, addr, prev->vm_end, NULL); return prev; } #else @@ -2351,7 +2351,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) if (expand_stack(vma, addr)) return NULL; if (vma->vm_flags & VM_LOCKED) - __mlock_vma_pages_range(vma, addr, start, NULL); + populate_vma_page_range(vma, addr, start, NULL); return vma; } #endif From c561259ca79a88be540a75e84b45d49123014aa4 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 14 Apr 2015 15:44:42 -0700 Subject: [PATCH 045/122] mm: move gup() -> posix mlock() error conversion out of __mm_populate This is praparation to moving mm_populate()-related code out of mm/mlock.c. Signed-off-by: Kirill A. Shutemov Acked-by: Linus Torvalds Acked-by: David Rientjes Cc: Michel Lespinasse Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index 9d0f3cd716c5..0214263fca45 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -712,7 +712,6 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) ret = 0; continue; /* continue at next VMA */ } - ret = __mlock_posix_error_return(ret); break; } nend = nstart + ret * PAGE_SIZE; @@ -750,9 +749,13 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) error = do_mlock(start, len, 1); up_write(¤t->mm->mmap_sem); - if (!error) - error = __mm_populate(start, len, 0); - return error; + if (error) + return error; + + error = __mm_populate(start, len, 0); + if (error) + return __mlock_posix_error_return(error); + return 0; } SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) From acc3c8d15eed6b68c7edf5bfaea884753aaa8e85 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 14 Apr 2015 15:44:45 -0700 Subject: [PATCH 046/122] mm: move mm_populate()-related code to mm/gup.c It's odd that we have populate_vma_page_range() and __mm_populate() in mm/mlock.c. It's implementation of generic memory population and mlocking is one of possible side effect, if VM_LOCKED is set. __get_user_pages() is core of the implementation. Let's move the code into mm/gup.c. Signed-off-by: Kirill A. 
Shutemov Acked-by: Linus Torvalds Acked-by: David Rientjes Cc: Michel Lespinasse Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 118 +++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/mlock.c | 118 ----------------------------------------------------- 2 files changed, 118 insertions(+), 118 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 1b114ba9aebf..ca7b607ab671 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -818,6 +818,124 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } EXPORT_SYMBOL(get_user_pages); +/** + * populate_vma_page_range() - populate a range of pages in the vma. + * @vma: target vma + * @start: start address + * @end: end address + * @nonblocking: + * + * This takes care of mlocking the pages too if VM_LOCKED is set. + * + * return 0 on success, negative error code on error. + * + * vma->vm_mm->mmap_sem must be held. + * + * If @nonblocking is NULL, it may be held for read or write and will + * be unperturbed. + * + * If @nonblocking is non-NULL, it must held for read only and may be + * released. If it's released, *@nonblocking will be set to 0. + */ +long populate_vma_page_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, int *nonblocking) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long nr_pages = (end - start) / PAGE_SIZE; + int gup_flags; + + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(end & ~PAGE_MASK); + VM_BUG_ON_VMA(start < vma->vm_start, vma); + VM_BUG_ON_VMA(end > vma->vm_end, vma); + VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); + + gup_flags = FOLL_TOUCH | FOLL_POPULATE; + /* + * We want to touch writable mappings with a write fault in order + * to break COW, except for shared mappings because these don't COW + * and we would not want to dirty them for nothing. + */ + if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) + gup_flags |= FOLL_WRITE; + + /* + * We want mlock to succeed for regions that have any permissions + * other than PROT_NONE. + */ + if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) + gup_flags |= FOLL_FORCE; + + /* + * We made sure addr is within a VMA, so the following will + * not result in a stack expansion that recurses back here. + */ + return __get_user_pages(current, mm, start, nr_pages, gup_flags, + NULL, NULL, nonblocking); +} + +/* + * __mm_populate - populate and/or mlock pages within a range of address space. + * + * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap + * flags. VMAs must be already marked with the desired vm_flags, and + * mmap_sem must not be held. + */ +int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) +{ + struct mm_struct *mm = current->mm; + unsigned long end, nstart, nend; + struct vm_area_struct *vma = NULL; + int locked = 0; + long ret = 0; + + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(len != PAGE_ALIGN(len)); + end = start + len; + + for (nstart = start; nstart < end; nstart = nend) { + /* + * We want to fault in pages for [nstart; end) address range. + * Find first corresponding VMA. + */ + if (!locked) { + locked = 1; + down_read(&mm->mmap_sem); + vma = find_vma(mm, nstart); + } else if (nstart >= vma->vm_end) + vma = vma->vm_next; + if (!vma || vma->vm_start >= end) + break; + /* + * Set [nstart; nend) to intersection of desired address + * range with the first VMA. Also, skip undesirable VMA types. 
+ */ + nend = min(end, vma->vm_end); + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + continue; + if (nstart < vma->vm_start) + nstart = vma->vm_start; + /* + * Now fault in a range of pages. populate_vma_page_range() + * double checks the vma flags, so that it won't mlock pages + * if the vma was already munlocked. + */ + ret = populate_vma_page_range(vma, nstart, nend, &locked); + if (ret < 0) { + if (ignore_errors) { + ret = 0; + continue; /* continue at next VMA */ + } + break; + } + nend = nstart + ret * PAGE_SIZE; + ret = 0; + } + if (locked) + up_read(&mm->mmap_sem); + return ret; /* 0 or negative error code */ +} + /** * get_dump_page() - pin user page in memory while writing it to core dump * @addr: user address diff --git a/mm/mlock.c b/mm/mlock.c index 0214263fca45..6fd2cf15e868 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -205,62 +205,6 @@ unsigned int munlock_vma_page(struct page *page) return nr_pages - 1; } -/** - * populate_vma_page_range() - populate a range of pages in the vma. - * @vma: target vma - * @start: start address - * @end: end address - * @nonblocking: - * - * This takes care of mlocking the pages too if VM_LOCKED is set. - * - * return 0 on success, negative error code on error. - * - * vma->vm_mm->mmap_sem must be held. - * - * If @nonblocking is NULL, it may be held for read or write and will - * be unperturbed. - * - * If @nonblocking is non-NULL, it must held for read only and may be - * released. If it's released, *@nonblocking will be set to 0. - */ -long populate_vma_page_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, int *nonblocking) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long nr_pages = (end - start) / PAGE_SIZE; - int gup_flags; - - VM_BUG_ON(start & ~PAGE_MASK); - VM_BUG_ON(end & ~PAGE_MASK); - VM_BUG_ON_VMA(start < vma->vm_start, vma); - VM_BUG_ON_VMA(end > vma->vm_end, vma); - VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); - - gup_flags = FOLL_TOUCH | FOLL_POPULATE; - /* - * We want to touch writable mappings with a write fault in order - * to break COW, except for shared mappings because these don't COW - * and we would not want to dirty them for nothing. - */ - if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) - gup_flags |= FOLL_WRITE; - - /* - * We want mlock to succeed for regions that have any permissions - * other than PROT_NONE. - */ - if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) - gup_flags |= FOLL_FORCE; - - /* - * We made sure addr is within a VMA, so the following will - * not result in a stack expansion that recurses back here. - */ - return __get_user_pages(current, mm, start, nr_pages, gup_flags, - NULL, NULL, nonblocking); -} - /* * convert get_user_pages() return value to posix mlock() error */ @@ -660,68 +604,6 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } -/* - * __mm_populate - populate and/or mlock pages within a range of address space. - * - * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap - * flags. VMAs must be already marked with the desired vm_flags, and - * mmap_sem must not be held. 
- */ -int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) -{ - struct mm_struct *mm = current->mm; - unsigned long end, nstart, nend; - struct vm_area_struct *vma = NULL; - int locked = 0; - long ret = 0; - - VM_BUG_ON(start & ~PAGE_MASK); - VM_BUG_ON(len != PAGE_ALIGN(len)); - end = start + len; - - for (nstart = start; nstart < end; nstart = nend) { - /* - * We want to fault in pages for [nstart; end) address range. - * Find first corresponding VMA. - */ - if (!locked) { - locked = 1; - down_read(&mm->mmap_sem); - vma = find_vma(mm, nstart); - } else if (nstart >= vma->vm_end) - vma = vma->vm_next; - if (!vma || vma->vm_start >= end) - break; - /* - * Set [nstart; nend) to intersection of desired address - * range with the first VMA. Also, skip undesirable VMA types. - */ - nend = min(end, vma->vm_end); - if (vma->vm_flags & (VM_IO | VM_PFNMAP)) - continue; - if (nstart < vma->vm_start) - nstart = vma->vm_start; - /* - * Now fault in a range of pages. populate_vma_page_range() - * double checks the vma flags, so that it won't mlock pages - * if the vma was already munlocked. - */ - ret = populate_vma_page_range(vma, nstart, nend, &locked); - if (ret < 0) { - if (ignore_errors) { - ret = 0; - continue; /* continue at next VMA */ - } - break; - } - nend = nstart + ret * PAGE_SIZE; - ret = 0; - } - if (locked) - up_read(&mm->mmap_sem); - return ret; /* 0 or negative error code */ -} - SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) { unsigned long locked; From 7fc825b4566c591670f6b605cc3d10da77d27efb Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 14 Apr 2015 15:44:48 -0700 Subject: [PATCH 047/122] mm/memblock.c: rename local variable of memblock_type to `type' A small cleanup. Seems in e3239ff9 ("memblock: Rename memblock_region to memblock_type and memblock_property to memblock_region") this one was missed. Signed-off-by: Baoquan He Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memblock.c b/mm/memblock.c index 252b77bdf65e..3f37a0bca5d5 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -699,14 +699,14 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base, int nid, unsigned long flags) { - struct memblock_type *_rgn = &memblock.reserved; + struct memblock_type *type = &memblock.reserved; memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, (unsigned long long)base + size - 1, flags, (void *)_RET_IP_); - return memblock_add_range(_rgn, base, size, nid, flags); + return memblock_add_range(type, base, size, nid, flags); } int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) From 1575e68b3c1111aca9779ecbdb106a4aa563f01f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 14 Apr 2015 15:44:51 -0700 Subject: [PATCH 048/122] mm: memcontrol: update copyright notice Add myself to the list of copyright holders. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b34ef4a32a3b..68d4890fc4bd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -14,6 +14,12 @@ * Copyright (C) 2012 Parallels Inc. and Google Inc. 
* Authors: Glauber Costa and Suleiman Souhlal * + * Native page reclaim + * Charge lifetime sanitation + * Lockless page tracking & accounting + * Unified hierarchy configuration model + * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or From 19c07d5e0414261bd7ec3d8419dd26f468ef69d9 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Tue, 14 Apr 2015 15:44:54 -0700 Subject: [PATCH 049/122] memory hotplug: use macro to switch between section and pfn Use macro section_nr_to_pfn() to switch between section and pfn, instead of open-coding it. No semantic changes. Signed-off-by: Sheng Yong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 2 +- mm/memory_hotplug.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index af9c911cd6b5..ab5c9a7dfeca 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -228,7 +228,7 @@ memory_block_action(unsigned long phys_index, unsigned long action, int online_t struct page *first_page; int ret; - start_pfn = phys_index << PFN_SECTION_SHIFT; + start_pfn = section_nr_to_pfn(phys_index); first_page = pfn_to_page(start_pfn); switch (action) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 65842d688b7c..aaec7758eec3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -502,7 +502,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); for (i = start_sec; i <= end_sec; i++) { - err = __add_section(nid, zone, i << PFN_SECTION_SHIFT); + err = __add_section(nid, zone, section_nr_to_pfn(i)); /* * EEXIST is finally dealt with by ioresource collision From 28b24c1fc8c22cabe5b8a16ffe6a61dfce51a1f2 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 14 Apr 2015 15:44:57 -0700 Subject: [PATCH 050/122] mm: cma: debugfs interface I've noticed that there is no interfaces exposed by CMA which would let me fuzz what's going on in there. This small patchset exposes some information out to userspace, plus adds the ability to trigger allocation and freeing from userspace. This patch (of 3): Implement a simple debugfs interface to expose information about CMA areas in the system. Useful for testing/sanity checks for CMA since it was impossible to previously retrieve this information in userspace. Signed-off-by: Sasha Levin Acked-by: Joonsoo Kim Cc: Marek Szyprowski Cc: Laura Abbott Cc: Konrad Rzeszutek Wilk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 6 +++++ mm/Makefile | 1 + mm/cma.c | 17 +++----------- mm/cma.h | 20 +++++++++++++++++ mm/cma_debug.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 90 insertions(+), 14 deletions(-) create mode 100644 mm/cma.h create mode 100644 mm/cma_debug.c diff --git a/mm/Kconfig b/mm/Kconfig index a03131b6ba8e..390214da4546 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -517,6 +517,12 @@ config CMA_DEBUG processing calls such as dma_alloc_from_contiguous(). This option does not affect warning and error messages. +config CMA_DEBUGFS + bool "CMA debugfs interface" + depends on CMA && DEBUG_FS + help + Turns on the DebugFS interface for CMA. 
+ config CMA_AREAS int "Maximum count of the CMA areas" depends on CMA diff --git a/mm/Makefile b/mm/Makefile index 15dbe9903c27..668a9bb82be4 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -76,3 +76,4 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o +obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o diff --git a/mm/cma.c b/mm/cma.c index 68ecb7a42983..2655b8191656 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -35,16 +35,10 @@ #include #include -struct cma { - unsigned long base_pfn; - unsigned long count; - unsigned long *bitmap; - unsigned int order_per_bit; /* Order of pages represented by one bit */ - struct mutex lock; -}; +#include "cma.h" -static struct cma cma_areas[MAX_CMA_AREAS]; -static unsigned cma_area_count; +struct cma cma_areas[MAX_CMA_AREAS]; +unsigned cma_area_count; static DEFINE_MUTEX(cma_mutex); phys_addr_t cma_get_base(struct cma *cma) @@ -77,11 +71,6 @@ static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order) - cma->base_pfn) >> cma->order_per_bit; } -static unsigned long cma_bitmap_maxno(struct cma *cma) -{ - return cma->count >> cma->order_per_bit; -} - static unsigned long cma_bitmap_pages_to_bits(struct cma *cma, unsigned long pages) { diff --git a/mm/cma.h b/mm/cma.h new file mode 100644 index 000000000000..4141887bbfb0 --- /dev/null +++ b/mm/cma.h @@ -0,0 +1,20 @@ +#ifndef __MM_CMA_H__ +#define __MM_CMA_H__ + +struct cma { + unsigned long base_pfn; + unsigned long count; + unsigned long *bitmap; + unsigned int order_per_bit; /* Order of pages represented by one bit */ + struct mutex lock; +}; + +extern struct cma cma_areas[MAX_CMA_AREAS]; +extern unsigned cma_area_count; + +static unsigned long cma_bitmap_maxno(struct cma *cma) +{ + return cma->count >> cma->order_per_bit; +} + +#endif diff --git a/mm/cma_debug.c b/mm/cma_debug.c new file mode 100644 index 000000000000..3af2de6d4e5f --- /dev/null +++ b/mm/cma_debug.c @@ -0,0 +1,60 @@ +/* + * CMA DebugFS Interface + * + * Copyright (c) 2015 Sasha Levin + */ + + +#include +#include + +#include "cma.h" + +static struct dentry *cma_debugfs_root; + +static int cma_debugfs_get(void *data, u64 *val) +{ + unsigned long *p = data; + + *val = *p; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n"); + +static void cma_debugfs_add_one(struct cma *cma, int idx) +{ + struct dentry *tmp; + char name[16]; + int u32s; + + sprintf(name, "cma-%d", idx); + + tmp = debugfs_create_dir(name, cma_debugfs_root); + + debugfs_create_file("base_pfn", S_IRUGO, tmp, + &cma->base_pfn, &cma_debugfs_fops); + debugfs_create_file("count", S_IRUGO, tmp, + &cma->count, &cma_debugfs_fops); + debugfs_create_file("order_per_bit", S_IRUGO, tmp, + &cma->order_per_bit, &cma_debugfs_fops); + + u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32)); + debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s); +} + +static int __init cma_debugfs_init(void) +{ + int i; + + cma_debugfs_root = debugfs_create_dir("cma", NULL); + if (!cma_debugfs_root) + return -ENOMEM; + + for (i = 0; i < cma_area_count; i++) + cma_debugfs_add_one(&cma_areas[i], i); + + return 0; +} +late_initcall(cma_debugfs_init); From 26b02a1f9670862c51b3ff63a6128589866f5c71 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 14 Apr 2015 15:44:59 -0700 Subject: [PATCH 051/122] mm: cma: allocation trigger Provides a userspace interface to trigger a CMA allocation. 
Usage: echo [pages] > alloc This would provide testing/fuzzing access to the CMA allocation paths. Signed-off-by: Sasha Levin Acked-by: Joonsoo Kim Cc: Marek Szyprowski Cc: Laura Abbott Cc: Konrad Rzeszutek Wilk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 6 ++++++ mm/cma.h | 4 ++++ mm/cma_debug.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 63 insertions(+), 1 deletion(-) diff --git a/mm/cma.c b/mm/cma.c index 2655b8191656..7f6045420925 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -123,6 +123,12 @@ static int __init cma_activate_area(struct cma *cma) } while (--i); mutex_init(&cma->lock); + +#ifdef CONFIG_CMA_DEBUGFS + INIT_HLIST_HEAD(&cma->mem_head); + spin_lock_init(&cma->mem_head_lock); +#endif + return 0; err: diff --git a/mm/cma.h b/mm/cma.h index 4141887bbfb0..1132d733556d 100644 --- a/mm/cma.h +++ b/mm/cma.h @@ -7,6 +7,10 @@ struct cma { unsigned long *bitmap; unsigned int order_per_bit; /* Order of pages represented by one bit */ struct mutex lock; +#ifdef CONFIG_CMA_DEBUGFS + struct hlist_head mem_head; + spinlock_t mem_head_lock; +#endif }; extern struct cma cma_areas[MAX_CMA_AREAS]; diff --git a/mm/cma_debug.c b/mm/cma_debug.c index 3af2de6d4e5f..f3baa413ab33 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -7,9 +7,18 @@ #include #include +#include +#include +#include #include "cma.h" +struct cma_mem { + struct hlist_node node; + struct page *p; + unsigned long n; +}; + static struct dentry *cma_debugfs_root; static int cma_debugfs_get(void *data, u64 *val) @@ -23,6 +32,46 @@ static int cma_debugfs_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n"); +static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem) +{ + spin_lock(&cma->mem_head_lock); + hlist_add_head(&mem->node, &cma->mem_head); + spin_unlock(&cma->mem_head_lock); +} + +static int cma_alloc_mem(struct cma *cma, int count) +{ + struct cma_mem *mem; + struct page *p; + + mem = kzalloc(sizeof(*mem), GFP_KERNEL); + if (!mem) + return -ENOMEM; + + p = cma_alloc(cma, count, CONFIG_CMA_ALIGNMENT); + if (!p) { + kfree(mem); + return -ENOMEM; + } + + mem->p = p; + mem->n = count; + + cma_add_to_cma_mem_list(cma, mem); + + return 0; +} + +static int cma_alloc_write(void *data, u64 val) +{ + int pages = val; + struct cma *cma = data; + + return cma_alloc_mem(cma, pages); +} + +DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); + static void cma_debugfs_add_one(struct cma *cma, int idx) { struct dentry *tmp; @@ -33,12 +82,15 @@ static void cma_debugfs_add_one(struct cma *cma, int idx) tmp = debugfs_create_dir(name, cma_debugfs_root); + debugfs_create_file("alloc", S_IWUSR, cma_debugfs_root, cma, + &cma_alloc_fops); + debugfs_create_file("base_pfn", S_IRUGO, tmp, &cma->base_pfn, &cma_debugfs_fops); debugfs_create_file("count", S_IRUGO, tmp, &cma->count, &cma_debugfs_fops); debugfs_create_file("order_per_bit", S_IRUGO, tmp, - &cma->order_per_bit, &cma_debugfs_fops); + &cma->order_per_bit, &cma_debugfs_fops); u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32)); debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s); From 8325330b026509127d4541e0f511c0c10648c2d5 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 14 Apr 2015 15:45:02 -0700 Subject: [PATCH 052/122] mm: cma: release trigger Provides a userspace interface to trigger a CMA release. Usage: echo [pages] > free This would provide testing/fuzzing access to the CMA release paths. 
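To make the two triggers above concrete, here is a minimal userspace sketch that drives them from C instead of the shell. It assumes debugfs is mounted at /sys/kernel/debug and uses the per-area path layout described in the companion documentation patch; the area index and page count are only illustrative.

    #include <stdio.h>
    #include <stdlib.h>

    /* Illustrative paths: the real location depends on where debugfs is
     * mounted and on the CMA area index. */
    #define CMA_ALLOC_PATH "/sys/kernel/debug/cma/cma-0/alloc"
    #define CMA_FREE_PATH  "/sys/kernel/debug/cma/cma-0/free"

    /* Write a decimal page count into one of the write-only trigger files. */
    static int cma_trigger(const char *path, unsigned long pages)
    {
        FILE *f = fopen(path, "w");

        if (!f) {
            perror(path);
            return -1;
        }
        fprintf(f, "%lu\n", pages);
        return fclose(f);
    }

    int main(void)
    {
        /* Ask the kernel to allocate 5 pages from the area, then free them. */
        if (cma_trigger(CMA_ALLOC_PATH, 5))
            return EXIT_FAILURE;
        if (cma_trigger(CMA_FREE_PATH, 5))
            return EXIT_FAILURE;
        return EXIT_SUCCESS;
    }

Functionally this mirrors the echo commands above; pages allocated through the alloc trigger stay on the per-area mem_head list until they are released through free.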
[akpm@linux-foundation.org: coding-style fixes] [mhocko@suse.cz: fix build] Signed-off-by: Sasha Levin Acked-by: Joonsoo Kim Cc: Marek Szyprowski Cc: Laura Abbott Cc: Konrad Rzeszutek Wilk Signed-off-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma_debug.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/mm/cma_debug.c b/mm/cma_debug.c index f3baa413ab33..ae41faeed596 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "cma.h" @@ -39,6 +40,60 @@ static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem) spin_unlock(&cma->mem_head_lock); } +static struct cma_mem *cma_get_entry_from_list(struct cma *cma) +{ + struct cma_mem *mem = NULL; + + spin_lock(&cma->mem_head_lock); + if (!hlist_empty(&cma->mem_head)) { + mem = hlist_entry(cma->mem_head.first, struct cma_mem, node); + hlist_del_init(&mem->node); + } + spin_unlock(&cma->mem_head_lock); + + return mem; +} + +static int cma_free_mem(struct cma *cma, int count) +{ + struct cma_mem *mem = NULL; + + while (count) { + mem = cma_get_entry_from_list(cma); + if (mem == NULL) + return 0; + + if (mem->n <= count) { + cma_release(cma, mem->p, mem->n); + count -= mem->n; + kfree(mem); + } else if (cma->order_per_bit == 0) { + cma_release(cma, mem->p, count); + mem->p += count; + mem->n -= count; + count = 0; + cma_add_to_cma_mem_list(cma, mem); + } else { + pr_debug("cma: cannot release partial block when order_per_bit != 0\n"); + cma_add_to_cma_mem_list(cma, mem); + break; + } + } + + return 0; + +} + +static int cma_free_write(void *data, u64 val) +{ + int pages = val; + struct cma *cma = data; + + return cma_free_mem(cma, pages); +} + +DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n"); + static int cma_alloc_mem(struct cma *cma, int count) { struct cma_mem *mem; @@ -85,6 +140,9 @@ static void cma_debugfs_add_one(struct cma *cma, int idx) debugfs_create_file("alloc", S_IWUSR, cma_debugfs_root, cma, &cma_alloc_fops); + debugfs_create_file("free", S_IWUSR, cma_debugfs_root, cma, + &cma_free_fops); + debugfs_create_file("base_pfn", S_IRUGO, tmp, &cma->base_pfn, &cma_debugfs_fops); debugfs_create_file("count", S_IRUGO, tmp, From 875abdb6d43401c745ee8bc6d240f119a601d21f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 14 Apr 2015 15:45:05 -0700 Subject: [PATCH 053/122] mm-cma-allocation-trigger-fix s/CONFIG_CMA_ALIGNMENT/0/, per Joonsoo Cc: Joonsoo Kim Cc: Konrad Rzeszutek Wilk Cc: Laura Abbott Cc: Marek Szyprowski Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/cma_debug.c b/mm/cma_debug.c index ae41faeed596..0b377536ccde 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -103,7 +103,7 @@ static int cma_alloc_mem(struct cma *cma, int count) if (!mem) return -ENOMEM; - p = cma_alloc(cma, count, CONFIG_CMA_ALIGNMENT); + p = cma_alloc(cma, count, 0); if (!p) { kfree(mem); return -ENOMEM; From 17e0db822b00cff96c1b662ac0dc0449cb70e0ec Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 14 Apr 2015 15:45:08 -0700 Subject: [PATCH 054/122] cma: debug: document new debugfs interface Document the structure and files under the new debugfs interface. 
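As a rough companion to the documentation added below, a userspace sketch that reads the read-only files of one area and derives its size in bytes. The debugfs mount point, the area index and the use of sysconf() for the page size are assumptions for illustration, not part of the patch.

    #include <stdio.h>
    #include <unistd.h>

    /* Assumed location of the first CMA area's debugfs directory. */
    #define CMA_DIR "/sys/kernel/debug/cma/cma-0"

    /* Read a single decimal value from one of the read-only files. */
    static long read_value(const char *name)
    {
        char path[128];
        long val = -1;
        FILE *f;

        snprintf(path, sizeof(path), "%s/%s", CMA_DIR, name);
        f = fopen(path, "r");
        if (!f)
            return -1;
        if (fscanf(f, "%ld", &val) != 1)
            val = -1;
        fclose(f);
        return val;
    }

    int main(void)
    {
        long base_pfn = read_value("base_pfn");
        long count = read_value("count");          /* pages in the area */
        long page_size = sysconf(_SC_PAGESIZE);

        printf("base_pfn=%ld count=%ld (%ld bytes)\n",
               base_pfn, count, count * page_size);
        return 0;
    }

Note that count is expressed in pages, as the documentation below states; the byte size is only derived here for convenience.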
Signed-off-by: Sasha Levin Cc: Joonsoo Kim Cc: Konrad Rzeszutek Wilk Cc: Laura Abbott Cc: Marek Szyprowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cma/debugfs.txt | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 Documentation/cma/debugfs.txt diff --git a/Documentation/cma/debugfs.txt b/Documentation/cma/debugfs.txt new file mode 100644 index 000000000000..6cef20a8cedc --- /dev/null +++ b/Documentation/cma/debugfs.txt @@ -0,0 +1,21 @@ +The CMA debugfs interface is useful to retrieve basic information out of the +different CMA areas and to test allocation/release in each of the areas. + +Each CMA zone represents a directory under /cma/, indexed by the +kernel's CMA index. So the first CMA zone would be: + + /cma/cma-0 + +The structure of the files created under that directory is as follows: + + - [RO] base_pfn: The base PFN (Page Frame Number) of the zone. + - [RO] count: Amount of memory in the CMA area. + - [RO] order_per_bit: Order of pages represented by one bit. + - [RO] bitmap: The bitmap of page states in the zone. + - [WO] alloc: Allocate N pages from that CMA area. For example: + + echo 5 > /cma/cma-2/alloc + +would try to allocate 5 pages from the cma-2 area. + + - [WO] free: Free N pages from that CMA area, similar to the above. From 30467e0b3be83c286d60039f8267dd421128ca74 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2015 15:45:11 -0700 Subject: [PATCH 055/122] mm, hotplug: fix concurrent memory hot-add deadlock There's a deadlock when concurrently hot-adding memory through the probe interface and switching a memory block from offline to online. When hot-adding memory via the probe interface, add_memory() first takes mem_hotplug_begin() and then device_lock() is later taken when registering the newly initialized memory block. This creates a lock dependency of (1) mem_hotplug.lock (2) dev->mutex. When switching a memory block from offline to online, dev->mutex is first grabbed in device_online() when the write(2) transitions an existing memory block from offline to online, and then online_pages() will take mem_hotplug_begin(). This creates a lock inversion between mem_hotplug.lock and dev->mutex. Vitaly reports that this deadlock can happen when kworker handling a probe event races with systemd-udevd switching a memory block's state. This patch requires the state transition to take mem_hotplug_begin() before dev->mutex. Hot-adding memory via the probe interface creates a memory block while holding mem_hotplug_begin(), there is no way to take dev->mutex first in this case. online_pages() and offline_pages() are only called when transitioning memory block state. We now require that mem_hotplug_begin() is taken before calling them -- this requires exporting the mem_hotplug_begin() and mem_hotplug_done() to generic code. In all hot-add and hot-remove cases, mem_hotplug_begin() is done prior to device_online(). This is all that is needed to avoid the deadlock. Signed-off-by: David Rientjes Reported-by: Vitaly Kuznetsov Tested-by: Vitaly Kuznetsov Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: "K. Y. 
Srinivasan" Cc: Yasuaki Ishimatsu Cc: Tang Chen Cc: Vlastimil Babka Cc: Zhang Zhen Cc: Vladimir Davydov Cc: Wang Nan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 19 ++++++++++++------- include/linux/memory_hotplug.h | 6 ++++++ mm/memory_hotplug.c | 33 ++++++++++++--------------------- 3 files changed, 30 insertions(+), 28 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index ab5c9a7dfeca..2804aed3f416 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -219,6 +219,7 @@ static bool pages_correctly_reserved(unsigned long start_pfn) /* * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is * OK to have direct references to sparsemem variables in here. + * Must already be protected by mem_hotplug_begin(). */ static int memory_block_action(unsigned long phys_index, unsigned long action, int online_type) @@ -286,6 +287,7 @@ static int memory_subsys_online(struct device *dev) if (mem->online_type < 0) mem->online_type = MMOP_ONLINE_KEEP; + /* Already under protection of mem_hotplug_begin() */ ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); /* clear online_type */ @@ -328,17 +330,19 @@ store_mem_state(struct device *dev, goto err; } + /* + * Memory hotplug needs to hold mem_hotplug_begin() for probe to find + * the correct memory block to online before doing device_online(dev), + * which will take dev->mutex. Take the lock early to prevent an + * inversion, memory_subsys_online() callbacks will be implemented by + * assuming it's already protected. + */ + mem_hotplug_begin(); + switch (online_type) { case MMOP_ONLINE_KERNEL: case MMOP_ONLINE_MOVABLE: case MMOP_ONLINE_KEEP: - /* - * mem->online_type is not protected so there can be a - * race here. However, when racing online, the first - * will succeed and the second will just return as the - * block will already be online. The online type - * could be either one, but that is expected. - */ mem->online_type = online_type; ret = device_online(&mem->dev); break; @@ -349,6 +353,7 @@ store_mem_state(struct device *dev, ret = -EINVAL; /* should never happen */ } + mem_hotplug_done(); err: unlock_device_hotplug(); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 8f1a41951df9..6ffa0ac7f7d6 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -192,6 +192,9 @@ extern void get_page_bootmem(unsigned long ingo, struct page *page, void get_online_mems(void); void put_online_mems(void); +void mem_hotplug_begin(void); +void mem_hotplug_done(void); + #else /* ! CONFIG_MEMORY_HOTPLUG */ /* * Stub functions for when hotplug is off @@ -231,6 +234,9 @@ static inline int try_online_node(int nid) static inline void get_online_mems(void) {} static inline void put_online_mems(void) {} +static inline void mem_hotplug_begin(void) {} +static inline void mem_hotplug_done(void) {} + #endif /* ! 
CONFIG_MEMORY_HOTPLUG */ #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index aaec7758eec3..e2e8014fb755 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -104,7 +104,7 @@ void put_online_mems(void) } -static void mem_hotplug_begin(void) +void mem_hotplug_begin(void) { mem_hotplug.active_writer = current; @@ -119,7 +119,7 @@ static void mem_hotplug_begin(void) } } -static void mem_hotplug_done(void) +void mem_hotplug_done(void) { mem_hotplug.active_writer = NULL; mutex_unlock(&mem_hotplug.lock); @@ -959,6 +959,7 @@ static void node_states_set_node(int node, struct memory_notify *arg) } +/* Must be protected by mem_hotplug_begin() */ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) { unsigned long flags; @@ -969,7 +970,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ int ret; struct memory_notify arg; - mem_hotplug_begin(); /* * This doesn't need a lock to do pfn_to_page(). * The section can't be removed here because of the @@ -977,21 +977,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ */ zone = page_zone(pfn_to_page(pfn)); - ret = -EINVAL; if ((zone_idx(zone) > ZONE_NORMAL || online_type == MMOP_ONLINE_MOVABLE) && !can_online_high_movable(zone)) - goto out; + return -EINVAL; if (online_type == MMOP_ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) - goto out; + return -EINVAL; } if (online_type == MMOP_ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) - goto out; + return -EINVAL; } /* Previous code may changed the zone of the pfn range */ @@ -1007,7 +1006,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ ret = notifier_to_errno(ret); if (ret) { memory_notify(MEM_CANCEL_ONLINE, &arg); - goto out; + return ret; } /* * If this zone is not populated, then it is not in zonelist. 
@@ -1031,7 +1030,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_ONLINE, &arg); - goto out; + return ret; } zone->present_pages += onlined_pages; @@ -1061,9 +1060,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (onlined_pages) memory_notify(MEM_ONLINE, &arg); -out: - mem_hotplug_done(); - return ret; + return 0; } #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ @@ -1688,21 +1685,18 @@ static int __ref __offline_pages(unsigned long start_pfn, if (!test_pages_in_a_zone(start_pfn, end_pfn)) return -EINVAL; - mem_hotplug_begin(); - zone = page_zone(pfn_to_page(start_pfn)); node = zone_to_nid(zone); nr_pages = end_pfn - start_pfn; - ret = -EINVAL; if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) - goto out; + return -EINVAL; /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, true); if (ret) - goto out; + return ret; arg.start_pfn = start_pfn; arg.nr_pages = nr_pages; @@ -1795,7 +1789,6 @@ static int __ref __offline_pages(unsigned long start_pfn, writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); - mem_hotplug_done(); return 0; failed_removal: @@ -1805,12 +1798,10 @@ static int __ref __offline_pages(unsigned long start_pfn, memory_notify(MEM_CANCEL_OFFLINE, &arg); /* pushback to free area */ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); - -out: - mem_hotplug_done(); return ret; } +/* Must be protected by mem_hotplug_begin() */ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) { return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); From dc67647b78b92d9497f01fab95ac6764ed886b40 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 14 Apr 2015 15:45:15 -0700 Subject: [PATCH 056/122] mm/cma: change fallback behaviour for CMA freepage Freepage with MIGRATE_CMA can be used only for MIGRATE_MOVABLE and they should not be expanded to other migratetype buddy list to protect them from unmovable/reclaimable allocation. Implementing these requirements in __rmqueue_fallback(), that is, finding largest possible block of freepage has bad effect that high order freepage with MIGRATE_CMA are broken continually although there are suitable order CMA freepage. Reason is that they are not be expanded to other migratetype buddy list and next __rmqueue_fallback() invocation try to finds another largest block of freepage and break it again. So, MIGRATE_CMA fallback should be handled separately. This patch introduces __rmqueue_cma_fallback(), that just wrapper of __rmqueue_smallest() and call it before __rmqueue_fallback() if migratetype == MIGRATE_MOVABLE. This results in unintended behaviour change that MIGRATE_CMA freepage is always used first rather than other migratetype as movable allocation's fallback. But, as already mentioned above, MIGRATE_CMA can be used only for MIGRATE_MOVABLE, so it is better to use MIGRATE_CMA freepage first as much as possible. Otherwise, we needlessly take up precious freepages with other migratetype and increase chance of fragmentation. 
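A self-contained model of the ordering described above, in plain C so it can be compiled and run on its own: movable requests try the MIGRATE_CMA free lists through a dedicated wrapper before the generic fallback is attempted. All types and helpers are toy stand-ins for the real buddy allocator, not kernel code.

    #include <stdio.h>
    #include <stddef.h>

    enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE,
                       MIGRATE_MOVABLE, MIGRATE_CMA };

    struct page { int migratetype; };
    struct zone { int unused; };

    /* Stand-in for __rmqueue_smallest(): pretend only the CMA lists
     * currently hold a suitable page. */
    static struct page *rmqueue_smallest(struct zone *zone, unsigned int order, int mt)
    {
        static struct page cma_page = { MIGRATE_CMA };

        (void)zone; (void)order;
        return mt == MIGRATE_CMA ? &cma_page : NULL;
    }

    /* Stand-in for __rmqueue_fallback(): stealing from other migratetypes. */
    static struct page *rmqueue_fallback(struct zone *zone, unsigned int order, int mt)
    {
        (void)zone; (void)order; (void)mt;
        return NULL;
    }

    /* CMA pages may only satisfy movable allocations, so they are tried as a
     * dedicated step instead of being mixed into the generic fallback lists. */
    static struct page *rmqueue_cma_fallback(struct zone *zone, unsigned int order)
    {
        return rmqueue_smallest(zone, order, MIGRATE_CMA);
    }

    static struct page *rmqueue(struct zone *zone, unsigned int order, int mt)
    {
        struct page *page = rmqueue_smallest(zone, order, mt);

        if (!page && mt == MIGRATE_MOVABLE)
            page = rmqueue_cma_fallback(zone, order);
        if (!page)
            page = rmqueue_fallback(zone, order, mt);
        return page;
    }

    int main(void)
    {
        printf("movable request served: %s\n",
               rmqueue(NULL, 0, MIGRATE_MOVABLE) ? "yes (from CMA)" : "no");
        printf("unmovable request served: %s\n",
               rmqueue(NULL, 0, MIGRATE_UNMOVABLE) ? "yes" : "no");
        return 0;
    }

In the actual patch the same ordering lives in __rmqueue(), with __rmqueue_cma_fallback() compiled out to a NULL return when CONFIG_CMA is not set.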
Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: David Rientjes Cc: Rik van Riel Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 40e29429e7b0..9ca626756927 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1032,11 +1032,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, static int fallbacks[MIGRATE_TYPES][4] = { [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, -#ifdef CONFIG_CMA - [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, - [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ -#else [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, +#ifdef CONFIG_CMA + [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ #endif [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ #ifdef CONFIG_MEMORY_ISOLATION @@ -1044,6 +1042,17 @@ static int fallbacks[MIGRATE_TYPES][4] = { #endif }; +#ifdef CONFIG_CMA +static struct page *__rmqueue_cma_fallback(struct zone *zone, + unsigned int order) +{ + return __rmqueue_smallest(zone, order, MIGRATE_CMA); +} +#else +static inline struct page *__rmqueue_cma_fallback(struct zone *zone, + unsigned int order) { return NULL; } +#endif + /* * Move the free pages in a range to the free lists of the requested type. * Note that start_page and end_pages are not aligned on a pageblock @@ -1195,19 +1204,8 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) struct page, lru); area->nr_free--; - if (!is_migrate_cma(migratetype)) { - try_to_steal_freepages(zone, page, - start_migratetype, - migratetype); - } else { - /* - * When borrowing from MIGRATE_CMA, we need to - * release the excess buddy pages to CMA - * itself, and we do not try to steal extra - * free pages. - */ - buddy_type = migratetype; - } + try_to_steal_freepages(zone, page, start_migratetype, + migratetype); /* Remove the page from the freelists */ list_del(&page->lru); @@ -1249,7 +1247,11 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { - page = __rmqueue_fallback(zone, order, migratetype); + if (migratetype == MIGRATE_MOVABLE) + page = __rmqueue_cma_fallback(zone, order); + + if (!page) + page = __rmqueue_fallback(zone, order, migratetype); /* * Use MIGRATE_RESERVE rather than fail an allocation. goto From 4eb7dce62007113f1a2778213980fd6d8034ef5e Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 14 Apr 2015 15:45:18 -0700 Subject: [PATCH 057/122] mm/page_alloc: factor out fallback freepage checking This is preparation step to use page allocator's anti fragmentation logic in compaction. This patch just separates fallback freepage checking part from fallback freepage management part. Therefore, there is no functional change. 
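A compressed, self-contained model of the split described above: one helper only checks for a usable fallback list and reports whether stealing is allowed, while the caller does the actual freepage management. The free-area structure and the stealing rule are simplified stand-ins, not the kernel implementation.

    #include <stdio.h>
    #include <stdbool.h>

    #define NR_MIGRATETYPES 4
    enum { MT_UNMOVABLE, MT_RECLAIMABLE, MT_MOVABLE, MT_RESERVE };

    /* Toy free area: just a count of free pages per migratetype list. */
    struct free_area {
        unsigned long nr_free[NR_MIGRATETYPES];
    };

    /* Fallback order per requesting migratetype, terminated by MT_RESERVE. */
    static const int fallbacks[NR_MIGRATETYPES][3] = {
        [MT_UNMOVABLE]   = { MT_RECLAIMABLE, MT_MOVABLE, MT_RESERVE },
        [MT_RECLAIMABLE] = { MT_UNMOVABLE, MT_MOVABLE, MT_RESERVE },
        [MT_MOVABLE]     = { MT_RECLAIMABLE, MT_UNMOVABLE, MT_RESERVE },
    };

    /* Checking half: find a non-empty fallback list and report whether the
     * caller would be allowed to steal the whole block (rule simplified). */
    static int find_fallback(const struct free_area *area, unsigned int order,
                             int start_mt, bool *can_steal)
    {
        *can_steal = false;
        for (int i = 0; fallbacks[start_mt][i] != MT_RESERVE; i++) {
            int mt = fallbacks[start_mt][i];

            if (!area->nr_free[mt])
                continue;
            /* Simplified stand-in for can_steal_fallback(). */
            *can_steal = (order >= 5 || start_mt != MT_MOVABLE);
            return mt;
        }
        return -1;
    }

    int main(void)
    {
        struct free_area area = { .nr_free = { [MT_MOVABLE] = 8 } };
        bool can_steal;
        int mt = find_fallback(&area, 3, MT_UNMOVABLE, &can_steal);

        /* Management half (the caller) would now take a page from list mt
         * and, if can_steal is set, convert the whole pageblock. */
        printf("fallback mt=%d can_steal=%d\n", mt, (int)can_steal);
        return 0;
    }

The real find_suitable_fallback() in the diff below additionally iterates over free orders in the caller and leaves all list manipulation to __rmqueue_fallback(), which is exactly the separation this patch is after.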
Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: David Rientjes Cc: Rik van Riel Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 143 ++++++++++++++++++++++++++++++------------------ 1 file changed, 91 insertions(+), 52 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9ca626756927..31aa943365d8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1145,14 +1145,40 @@ static void change_pageblock_range(struct page *pageblock_page, * as fragmentation caused by those allocations polluting movable pageblocks * is worse than movable allocations stealing from unmovable and reclaimable * pageblocks. - * - * If we claim more than half of the pageblock, change pageblock's migratetype - * as well. */ -static void try_to_steal_freepages(struct zone *zone, struct page *page, - int start_type, int fallback_type) +static bool can_steal_fallback(unsigned int order, int start_mt) +{ + /* + * Leaving this order check is intended, although there is + * relaxed order check in next check. The reason is that + * we can actually steal whole pageblock if this condition met, + * but, below check doesn't guarantee it and that is just heuristic + * so could be changed anytime. + */ + if (order >= pageblock_order) + return true; + + if (order >= pageblock_order / 2 || + start_mt == MIGRATE_RECLAIMABLE || + start_mt == MIGRATE_UNMOVABLE || + page_group_by_mobility_disabled) + return true; + + return false; +} + +/* + * This function implements actual steal behaviour. If order is large enough, + * we can steal whole pageblock. If not, we first move freepages in this + * pageblock and check whether half of pages are moved or not. If half of + * pages are moved, we can change migratetype of pageblock and permanently + * use it's pages as requested migratetype in the future. + */ +static void steal_suitable_fallback(struct zone *zone, struct page *page, + int start_type) { int current_order = page_order(page); + int pages; /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { @@ -1160,19 +1186,40 @@ static void try_to_steal_freepages(struct zone *zone, struct page *page, return; } - if (current_order >= pageblock_order / 2 || - start_type == MIGRATE_RECLAIMABLE || - start_type == MIGRATE_UNMOVABLE || - page_group_by_mobility_disabled) { - int pages; + pages = move_freepages_block(zone, page, start_type); - pages = move_freepages_block(zone, page, start_type); + /* Claim the whole block if over half of it is free */ + if (pages >= (1 << (pageblock_order-1)) || + page_group_by_mobility_disabled) + set_pageblock_migratetype(page, start_type); +} - /* Claim the whole block if over half of it is free */ - if (pages >= (1 << (pageblock_order-1)) || - page_group_by_mobility_disabled) - set_pageblock_migratetype(page, start_type); +/* Check whether there is a suitable fallback freepage with requested order. 
*/ +static int find_suitable_fallback(struct free_area *area, unsigned int order, + int migratetype, bool *can_steal) +{ + int i; + int fallback_mt; + + if (area->nr_free == 0) + return -1; + + *can_steal = false; + for (i = 0;; i++) { + fallback_mt = fallbacks[migratetype][i]; + if (fallback_mt == MIGRATE_RESERVE) + break; + + if (list_empty(&area->free_list[fallback_mt])) + continue; + + if (can_steal_fallback(order, migratetype)) + *can_steal = true; + + return fallback_mt; } + + return -1; } /* Remove an element from the buddy allocator from the fallback list */ @@ -1182,53 +1229,45 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) struct free_area *area; unsigned int current_order; struct page *page; + int fallback_mt; + bool can_steal; /* Find the largest possible block of pages in the other list */ for (current_order = MAX_ORDER-1; current_order >= order && current_order <= MAX_ORDER-1; --current_order) { - int i; - for (i = 0;; i++) { - int migratetype = fallbacks[start_migratetype][i]; - int buddy_type = start_migratetype; + area = &(zone->free_area[current_order]); + fallback_mt = find_suitable_fallback(area, current_order, + start_migratetype, &can_steal); + if (fallback_mt == -1) + continue; - /* MIGRATE_RESERVE handled later if necessary */ - if (migratetype == MIGRATE_RESERVE) - break; + page = list_entry(area->free_list[fallback_mt].next, + struct page, lru); + if (can_steal) + steal_suitable_fallback(zone, page, start_migratetype); - area = &(zone->free_area[current_order]); - if (list_empty(&area->free_list[migratetype])) - continue; + /* Remove the page from the freelists */ + area->nr_free--; + list_del(&page->lru); + rmv_page_order(page); - page = list_entry(area->free_list[migratetype].next, - struct page, lru); - area->nr_free--; + expand(zone, page, order, current_order, area, + start_migratetype); + /* + * The freepage_migratetype may differ from pageblock's + * migratetype depending on the decisions in + * try_to_steal_freepages(). This is OK as long as it + * does not differ for MIGRATE_CMA pageblocks. For CMA + * we need to make sure unallocated pages flushed from + * pcp lists are returned to the correct freelist. + */ + set_freepage_migratetype(page, start_migratetype); - try_to_steal_freepages(zone, page, start_migratetype, - migratetype); + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, fallback_mt); - /* Remove the page from the freelists */ - list_del(&page->lru); - rmv_page_order(page); - - expand(zone, page, order, current_order, area, - buddy_type); - - /* - * The freepage_migratetype may differ from pageblock's - * migratetype depending on the decisions in - * try_to_steal_freepages(). This is OK as long as it - * does not differ for MIGRATE_CMA pageblocks. For CMA - * we need to make sure unallocated pages flushed from - * pcp lists are returned to the correct freelist. - */ - set_freepage_migratetype(page, buddy_type); - - trace_mm_page_alloc_extfrag(page, order, current_order, - start_migratetype, migratetype); - - return page; - } + return page; } return NULL; From 2149cdaef6c0eb59a9edf3b152027392cd66b41f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 14 Apr 2015 15:45:21 -0700 Subject: [PATCH 058/122] mm/compaction: enhance compaction finish condition Compaction has anti fragmentation algorithm. It is that freepage should be more than pageblock order to finish the compaction if we don't find any freepage in requested migratetype buddy list. 
This is for mitigating fragmentation, but, there is a lack of migratetype consideration and it is too excessive compared to page allocator's anti fragmentation algorithm. Not considering migratetype would cause premature finish of compaction. For example, if allocation request is for unmovable migratetype, freepage with CMA migratetype doesn't help that allocation and compaction should not be stopped. But, current logic regards this situation as compaction is no longer needed, so finish the compaction. Secondly, condition is too excessive compared to page allocator's logic. We can steal freepage from other migratetype and change pageblock migratetype on more relaxed conditions in page allocator. This is designed to prevent fragmentation and we can use it here. Imposing hard constraint only to the compaction doesn't help much in this case since page allocator would cause fragmentation again. To solve these problems, this patch borrows anti fragmentation logic from page allocator. It will reduce premature compaction finish in some cases and reduce excessive compaction work. stress-highalloc test in mmtests with non movable order 7 allocation shows considerable increase of compaction success rate. Compaction success rate (Compaction success * 100 / Compaction stalls, %) 31.82 : 42.20 I tested it on non-reboot 5 runs stress-highalloc benchmark and found that there is no more degradation on allocation success rate than before. That roughly means that this patch doesn't result in more fragmentations. Vlastimil suggests additional idea that we only test for fallbacks when migration scanner has scanned a whole pageblock. It looked good for fragmentation because chance of stealing increase due to making more free pages in certain pageblock. So, I tested it, but, it results in decreased compaction success rate, roughly 38.00. I guess the reason that if system is low memory condition, watermark check could be failed due to not enough order 0 free page and so, sometimes, we can't reach a fallback check although migrate_pfn is aligned to pageblock_nr_pages. I can insert code to cope with this situation but it makes code more complicated so I don't include his idea at this patch. [akpm@linux-foundation.org: fix CONFIG_CMA=n build] Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: David Rientjes Cc: Rik van Riel Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 15 +++++++++++++-- mm/internal.h | 2 ++ mm/page_alloc.c | 19 ++++++++++++++----- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 8c0d9459b54a..a18201a8124e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1174,13 +1174,24 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, /* Direct compactor: Is a suitable page free? */ for (order = cc->order; order < MAX_ORDER; order++) { struct free_area *area = &zone->free_area[order]; + bool can_steal; /* Job done if page is free of the right migratetype */ if (!list_empty(&area->free_list[migratetype])) return COMPACT_PARTIAL; - /* Job done if allocation would set block type */ - if (order >= pageblock_order && area->nr_free) +#ifdef CONFIG_CMA + /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */ + if (migratetype == MIGRATE_MOVABLE && + !list_empty(&area->free_list[MIGRATE_CMA])) + return COMPACT_PARTIAL; +#endif + /* + * Job done if allocation would steal freepages from + * other migratetype buddy lists. 
+ */ + if (find_suitable_fallback(area, order, migratetype, + true, &can_steal) != -1) return COMPACT_PARTIAL; } diff --git a/mm/internal.h b/mm/internal.h index 7df78a5269f3..edaab69a9c35 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -200,6 +200,8 @@ isolate_freepages_range(struct compact_control *cc, unsigned long isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); +int find_suitable_fallback(struct free_area *area, unsigned int order, + int migratetype, bool only_stealable, bool *can_steal); #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 31aa943365d8..6dfa5b24cc79 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1194,9 +1194,14 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, set_pageblock_migratetype(page, start_type); } -/* Check whether there is a suitable fallback freepage with requested order. */ -static int find_suitable_fallback(struct free_area *area, unsigned int order, - int migratetype, bool *can_steal) +/* + * Check whether there is a suitable fallback freepage with requested order. + * If only_stealable is true, this function returns fallback_mt only if + * we can steal other freepages all together. This would help to reduce + * fragmentation due to mixed migratetype pages in one pageblock. + */ +int find_suitable_fallback(struct free_area *area, unsigned int order, + int migratetype, bool only_stealable, bool *can_steal) { int i; int fallback_mt; @@ -1216,7 +1221,11 @@ static int find_suitable_fallback(struct free_area *area, unsigned int order, if (can_steal_fallback(order, migratetype)) *can_steal = true; - return fallback_mt; + if (!only_stealable) + return fallback_mt; + + if (*can_steal) + return fallback_mt; } return -1; @@ -1238,7 +1247,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) --current_order) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, - start_migratetype, &can_steal); + start_migratetype, false, &can_steal); if (fallback_mt == -1) continue; From ca0984caa8235762dc4e22c1c47ae6719dcc4064 Mon Sep 17 00:00:00 2001 From: Ebru Akagunduz Date: Tue, 14 Apr 2015 15:45:24 -0700 Subject: [PATCH 059/122] mm: incorporate zero pages into transparent huge pages This patch improves THP collapse rates, by allowing zero pages. Currently THP can collapse 4kB pages into a THP when there are up to khugepaged_max_ptes_none pte_none ptes in a 2MB range. This patch counts pte none and mapped zero pages with the same variable. The patch was tested with a program that allocates 800MB of memory, and performs interleaved reads and writes, in a pattern that causes some 2MB areas to first see read accesses, resulting in the zero pfn being mapped there. To simulate memory fragmentation at allocation time, I modified do_huge_pmd_anonymous_page to return VM_FAULT_FALLBACK for read faults. Without the patch, only %50 of the program was collapsed into THP and the percentage did not increase over time. With this patch after 10 minutes of waiting khugepaged had collapsed %99 of the program's memory. [aarcange@redhat.com: fix bogus BUG()] Signed-off-by: Ebru Akagunduz Reviewed-by: Rik van Riel Reviewed-by: Andrea Arcangeli Acked-by: Kirill A. 
Shutemov Acked-by: Vlastimil Babka Cc: Hugh Dickins Cc: Johannes Weiner Cc: Sasha Levin Cc: David Rientjes Cc: Mel Gorman Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 10a4b6cea0d1..6352c1dfa898 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2109,7 +2109,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte) { while (--_pte >= pte) { pte_t pteval = *_pte; - if (!pte_none(pteval)) + if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval))) release_pte_page(pte_page(pteval)); } } @@ -2120,13 +2120,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, { struct page *page; pte_t *_pte; - int none = 0; + int none_or_zero = 0; bool referenced = false, writable = false; for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { pte_t pteval = *_pte; - if (pte_none(pteval)) { - if (++none <= khugepaged_max_ptes_none) + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + if (++none_or_zero <= khugepaged_max_ptes_none) continue; else goto out; @@ -2207,9 +2207,21 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, pte_t pteval = *_pte; struct page *src_page; - if (pte_none(pteval)) { + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { clear_user_highpage(page, address); add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); + if (is_zero_pfn(pte_pfn(pteval))) { + /* + * ptl mostly unnecessary. + */ + spin_lock(ptl); + /* + * paravirt calls inside pte_clear here are + * superfluous. + */ + pte_clear(vma->vm_mm, address, _pte); + spin_unlock(ptl); + } } else { src_page = pte_page(pteval); copy_user_highpage(page, src_page, address, vma); @@ -2543,7 +2555,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, { pmd_t *pmd; pte_t *pte, *_pte; - int ret = 0, none = 0; + int ret = 0, none_or_zero = 0; struct page *page; unsigned long _address; spinlock_t *ptl; @@ -2561,8 +2573,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; - if (pte_none(pteval)) { - if (++none <= khugepaged_max_ptes_none) + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + if (++none_or_zero <= khugepaged_max_ptes_none) continue; else goto out_unmap; From b9ea25152e56365ce149b9a39637cd7a16eec556 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 14 Apr 2015 15:45:27 -0700 Subject: [PATCH 060/122] page_writeback: clean up mess around cancel_dirty_page() This patch replaces cancel_dirty_page() with a helper function account_page_cleaned() which only updates counters. It's called from truncate_complete_page() and from try_to_free_buffers() (hack for ext3). Page is locked in both cases, page-lock protects against concurrent dirtiers: see commit 2d6d7f982846 ("mm: protect set_page_dirty() from ongoing truncation"). Delete_from_page_cache() shouldn't be called for dirty pages, they must be handled by caller (either written or truncated). This patch treats final dirty accounting fixup at the end of __delete_from_page_cache() as a debug check and adds WARN_ON_ONCE() around it. If something removes dirty pages without proper handling that might be a bug and unwritten data might be lost. Hugetlbfs has no dirty pages accounting, ClearPageDirty() is enough here. cancel_dirty_page() in nfs_wb_page_cancel() is redundant. 
This is helper for nfs_invalidate_page() and it's called only in case complete invalidation. The mess was started in v2.6.20 after commits 46d2277c796f ("Clean up and make try_to_free_buffers() not race with dirty pages") and 3e67c0987d75 ("truncate: clear page dirtiness before running try_to_free_buffers()") first was reverted right in v2.6.20 in commit ecdfc9787fe5 ("Resurrect 'try_to_free_buffers()' VM hackery"), second in v2.6.25 commit a2b345642f53 ("Fix dirty page accounting leak with ext3 data=journal"). Custom fixes were introduced between these points. NFS in v2.6.23, commit 1b3b4a1a2deb ("NFS: Fix a write request leak in nfs_invalidate_page()"). Kludge in __delete_from_page_cache() in v2.6.24, commit 3a6927906f1b ("Do dirty page accounting when removing a page from the page cache"). Since v2.6.25 all of them are redundant. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Konstantin Khlebnikov Cc: Tejun Heo Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../include/linux/lustre_patchless_compat.h | 4 +- fs/buffer.c | 4 +- fs/hugetlbfs/inode.c | 2 +- fs/nfs/write.c | 5 --- include/linux/mm.h | 2 + include/linux/page-flags.h | 2 - mm/filemap.c | 15 ++++---- mm/page-writeback.c | 19 ++++++++++ mm/truncate.c | 37 ++++--------------- 9 files changed, 41 insertions(+), 49 deletions(-) diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h index a260e99a4447..d72605864b0a 100644 --- a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h +++ b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h @@ -55,7 +55,9 @@ truncate_complete_page(struct address_space *mapping, struct page *page) if (PagePrivate(page)) page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); - cancel_dirty_page(page, PAGE_SIZE); + if (TestClearPageDirty(page)) + account_page_cleaned(page, mapping); + ClearPageMappedToDisk(page); ll_delete_from_page_cache(page); } diff --git a/fs/buffer.c b/fs/buffer.c index 20805db2c987..c7a5602d01ee 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3243,8 +3243,8 @@ int try_to_free_buffers(struct page *page) * to synchronise against __set_page_dirty_buffers and prevent the * dirty bit from being lost. */ - if (ret) - cancel_dirty_page(page, PAGE_CACHE_SIZE); + if (ret && TestClearPageDirty(page)) + account_page_cleaned(page, mapping); spin_unlock(&mapping->private_lock); out: if (buffers_to_free) { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c274aca8e8dc..db76cec3ce21 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -319,7 +319,7 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, static void truncate_huge_page(struct page *page) { - cancel_dirty_page(page, /* No IO accounting for huge pages? 
*/0); + ClearPageDirty(page); ClearPageUptodate(page); delete_from_page_cache(page); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 849ed784d6ac..759931088094 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1876,11 +1876,6 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) * request from the inode / page_private pointer and * release it */ nfs_inode_remove_request(req); - /* - * In case nfs_inode_remove_request has marked the - * page as being dirty - */ - cancel_dirty_page(page, PAGE_CACHE_SIZE); nfs_unlock_and_release_request(req); } diff --git a/include/linux/mm.h b/include/linux/mm.h index cccbbba12b9d..6571dd78e984 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1294,9 +1294,11 @@ int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); void account_page_dirtied(struct page *page, struct address_space *mapping); +void account_page_cleaned(struct page *page, struct address_space *mapping); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); + int get_cmdline(struct task_struct *task, char *buffer, int buflen); /* Is the vma a continuation of the stack vma above it? */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5ed7bdaf22d5..c851ff92d5b3 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -328,8 +328,6 @@ static inline void SetPageUptodate(struct page *page) CLEARPAGEFLAG(Uptodate, uptodate) -extern void cancel_dirty_page(struct page *page, unsigned int account_size); - int test_clear_page_writeback(struct page *page); int __test_set_page_writeback(struct page *page, bool keep_write); diff --git a/mm/filemap.c b/mm/filemap.c index ad7242043bdb..434dba317400 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -203,16 +203,15 @@ void __delete_from_page_cache(struct page *page, void *shadow) BUG_ON(page_mapped(page)); /* - * Some filesystems seem to re-dirty the page even after - * the VM has canceled the dirty bit (eg ext3 journaling). + * At this point page must be either written or cleaned by truncate. + * Dirty page here signals a bug and loss of unwritten data. * - * Fix it up by doing a final dirty accounting check after - * having removed the page entirely. + * This fixes dirty accounting after removing the page entirely but + * leaves PageDirty set: it has no effect for truncated page and + * anyway will be cleared before returning page into buddy allocator. */ - if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { - dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); - } + if (WARN_ON_ONCE(PageDirty(page))) + account_page_cleaned(page, mapping); } /** diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 644bcb665773..0372411f38fc 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2110,6 +2110,25 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) } EXPORT_SYMBOL(account_page_dirtied); +/* + * Helper function for deaccounting dirty page without writeback. + * + * Doing this should *normally* only ever be done when a page + * is truncated, and is not actually mapped anywhere at all. However, + * fs/buffer.c does this when it notices that somebody has cleaned + * out all the buffers on a page without actually doing it through + * the VM. Can you say "ext3 is horribly ugly"? Thought you could. 
+ */ +void account_page_cleaned(struct page *page, struct address_space *mapping) +{ + if (mapping_cap_account_dirty(mapping)) { + dec_zone_page_state(page, NR_FILE_DIRTY); + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); + task_io_account_cancelled_write(PAGE_CACHE_SIZE); + } +} +EXPORT_SYMBOL(account_page_cleaned); + /* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. diff --git a/mm/truncate.c b/mm/truncate.c index ddec5a5966d7..7a9d8a3cb143 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -92,35 +92,6 @@ void do_invalidatepage(struct page *page, unsigned int offset, (*invalidatepage)(page, offset, length); } -/* - * This cancels just the dirty bit on the kernel page itself, it - * does NOT actually remove dirty bits on any mmap's that may be - * around. It also leaves the page tagged dirty, so any sync - * activity will still find it on the dirty lists, and in particular, - * clear_page_dirty_for_io() will still look at the dirty bits in - * the VM. - * - * Doing this should *normally* only ever be done when a page - * is truncated, and is not actually mapped anywhere at all. However, - * fs/buffer.c does this when it notices that somebody has cleaned - * out all the buffers on a page without actually doing it through - * the VM. Can you say "ext3 is horribly ugly"? Tought you could. - */ -void cancel_dirty_page(struct page *page, unsigned int account_size) -{ - if (TestClearPageDirty(page)) { - struct address_space *mapping = page->mapping; - if (mapping && mapping_cap_account_dirty(mapping)) { - dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(inode_to_bdi(mapping->host), - BDI_RECLAIMABLE); - if (account_size) - task_io_account_cancelled_write(account_size); - } - } -} -EXPORT_SYMBOL(cancel_dirty_page); - /* * If truncate cannot remove the fs-private metadata from the page, the page * becomes orphaned. It will be left on the LRU and may even be mapped into @@ -140,7 +111,13 @@ truncate_complete_page(struct address_space *mapping, struct page *page) if (page_has_private(page)) do_invalidatepage(page, 0, PAGE_CACHE_SIZE); - cancel_dirty_page(page, PAGE_CACHE_SIZE); + /* + * Some filesystems seem to re-dirty the page even after + * the VM has canceled the dirty bit (eg ext3 journaling). + * Hence dirty accounting check is placed after invalidation. + */ + if (TestClearPageDirty(page)) + account_page_cleaned(page, mapping); ClearPageMappedToDisk(page); delete_from_page_cache(page); From d1bfcdb8ce0ea6eb6034daa7ff02548e0bc9c21b Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 14 Apr 2015 15:45:30 -0700 Subject: [PATCH 061/122] mm: hide per-cpu lists in output of show_mem() This makes show_mem() much less verbose on huge machines. Instead of huge and almost useless dump of counters for each per-zone per-cpu lists this patch prints the sum of these counters for each zone (free_pcp) and size of per-cpu list for current cpu (local_pcp). The filter flag SHOW_MEM_PERCPU_LISTS reverts to the old verbose mode. 
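A small self-contained sketch of the summing that replaces the verbose per-cpu dump: all pcp counts are folded into a single free_pcp total per zone, and only the current CPU's list size is reported separately. The per-CPU pageset is modelled as a plain array here, purely for illustration.

    #include <stdio.h>

    #define NR_CPUS 4

    /* Toy per-CPU pageset: only the pcp free-page count matters here. */
    struct per_cpu_pageset {
        unsigned int pcp_count;
    };

    struct zone {
        const char *name;
        struct per_cpu_pageset pageset[NR_CPUS];  /* stand-in for per-CPU data */
    };

    int main(void)
    {
        struct zone zone = {
            .name = "Normal",
            .pageset = { { 12 }, { 7 }, { 0 }, { 31 } },
        };
        int this_cpu = 2;                 /* pretend we are running on CPU 2 */
        unsigned long free_pcp = 0;

        /* Fold every CPU's pcp list size into one per-zone total ... */
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            free_pcp += zone.pageset[cpu].pcp_count;

        /* ... and report only the local CPU's list size individually. */
        printf("%s free_pcp:%lu local_pcp:%u\n",
               zone.name, free_pcp, zone.pageset[this_cpu].pcp_count);
        return 0;
    }

The patch does the same summation inside show_free_areas(), printing free_pcp per zone and the local count via this_cpu_read(), as the diff below shows.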
[akpm@linux-foundation.org: update show_free_areas comment] Signed-off-by: Konstantin Khlebnikov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/page_alloc.c | 39 ++++++++++++++++++++++++++++++--------- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 6571dd78e984..9c21b42d07bf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1126,6 +1126,7 @@ extern void pagefault_out_of_memory(void); * various contexts. */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ +#define SHOW_MEM_PERCPU_LISTS (0x0002u) /* per-zone per-cpu */ extern void show_free_areas(unsigned int flags); extern bool skip_free_areas_node(unsigned int flags, int nid); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6dfa5b24cc79..eab8e2018a46 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3251,25 +3251,37 @@ static void show_migration_types(unsigned char type) * Show free area list (used inside shift_scroll-lock stuff) * We also calculate the percentage fragmentation. We do this by counting the * memory on each free list with the exception of the first item on the list. - * Suppresses nodes that are not allowed by current's cpuset if - * SHOW_MEM_FILTER_NODES is passed. + * + * Bits in @filter: + * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's + * cpuset. + * SHOW_MEM_PERCPU_LISTS: display full per-node per-cpu pcp lists */ void show_free_areas(unsigned int filter) { + unsigned long free_pcp = 0; int cpu; struct zone *zone; for_each_populated_zone(zone) { if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; - show_node(zone); - printk("%s per-cpu:\n", zone->name); + + if (filter & SHOW_MEM_PERCPU_LISTS) { + show_node(zone); + printk("%s per-cpu:\n", zone->name); + } for_each_online_cpu(cpu) { struct per_cpu_pageset *pageset; pageset = per_cpu_ptr(zone->pageset, cpu); + free_pcp += pageset->pcp.count; + + if (!(filter & SHOW_MEM_PERCPU_LISTS)) + continue; + printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", cpu, pageset->pcp.high, pageset->pcp.batch, pageset->pcp.count); @@ -3278,11 +3290,10 @@ void show_free_areas(unsigned int filter) printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" " active_file:%lu inactive_file:%lu isolated_file:%lu\n" - " unevictable:%lu" - " dirty:%lu writeback:%lu unstable:%lu\n" - " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" + " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" + " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" - " free_cma:%lu\n", + " free:%lu free_pcp:%lu free_cma:%lu\n", global_page_state(NR_ACTIVE_ANON), global_page_state(NR_INACTIVE_ANON), global_page_state(NR_ISOLATED_ANON), @@ -3293,13 +3304,14 @@ void show_free_areas(unsigned int filter) global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), - global_page_state(NR_FREE_PAGES), global_page_state(NR_SLAB_RECLAIMABLE), global_page_state(NR_SLAB_UNRECLAIMABLE), global_page_state(NR_FILE_MAPPED), global_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE), + global_page_state(NR_FREE_PAGES), + free_pcp, global_page_state(NR_FREE_CMA_PAGES)); for_each_populated_zone(zone) { @@ -3307,6 +3319,11 @@ void show_free_areas(unsigned int filter) if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; + + free_pcp = 0; + for_each_online_cpu(cpu) + free_pcp += per_cpu_ptr(zone->pageset, 
cpu)->pcp.count; + show_node(zone); printk("%s" " free:%lukB" @@ -3333,6 +3350,8 @@ void show_free_areas(unsigned int filter) " pagetables:%lukB" " unstable:%lukB" " bounce:%lukB" + " free_pcp:%lukB" + " local_pcp:%ukB" " free_cma:%lukB" " writeback_tmp:%lukB" " pages_scanned:%lu" @@ -3364,6 +3383,8 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_UNSTABLE_NFS)), K(zone_page_state(zone, NR_BOUNCE)), + K(free_pcp), + K(this_cpu_read(zone->pageset->pcp.count)), K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), K(zone_page_state(zone, NR_PAGES_SCANNED)), From 761b06771adeeb734e9eebc6f70f916cb9e2f643 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 14 Apr 2015 15:45:32 -0700 Subject: [PATCH 062/122] mm: completely remove dumping per-cpu lists from show_mem() It seems nobody needs this. Signed-off-by: Konstantin Khlebnikov Cc: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/page_alloc.c | 22 ++-------------------- 2 files changed, 2 insertions(+), 21 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9c21b42d07bf..6571dd78e984 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1126,7 +1126,6 @@ extern void pagefault_out_of_memory(void); * various contexts. */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ -#define SHOW_MEM_PERCPU_LISTS (0x0002u) /* per-zone per-cpu */ extern void show_free_areas(unsigned int flags); extern bool skip_free_areas_node(unsigned int flags, int nid); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eab8e2018a46..84466a4b1b36 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3255,7 +3255,6 @@ static void show_migration_types(unsigned char type) * Bits in @filter: * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's * cpuset. - * SHOW_MEM_PERCPU_LISTS: display full per-node per-cpu pcp lists */ void show_free_areas(unsigned int filter) { @@ -3267,25 +3266,8 @@ void show_free_areas(unsigned int filter) if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; - if (filter & SHOW_MEM_PERCPU_LISTS) { - show_node(zone); - printk("%s per-cpu:\n", zone->name); - } - - for_each_online_cpu(cpu) { - struct per_cpu_pageset *pageset; - - pageset = per_cpu_ptr(zone->pageset, cpu); - - free_pcp += pageset->pcp.count; - - if (!(filter & SHOW_MEM_PERCPU_LISTS)) - continue; - - printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", - cpu, pageset->pcp.high, - pageset->pcp.batch, pageset->pcp.count); - } + for_each_online_cpu(cpu) + free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; } printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" From f5d4547ad27737137fae8e67025e5664ecb8e790 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 14 Apr 2015 15:45:36 -0700 Subject: [PATCH 063/122] alpha: expose number of page table levels on Kconfig level I've implemented accounting for pmd page tables as we have for pte (see mm->nr_ptes). It requires a new counter in mm_struct: mm->nr_pmds. But the feature doesn't make any sense if an architecture has the PMD level folded, and it would be nice to get rid of the counter in this case. The problem is that we cannot use __PAGETABLE_PMD_FOLDED in <linux/mm_types.h> due to circular dependencies: <asm/pgtable.h> -> <linux/mm_types.h> -> <asm/pgtable.h>. In most cases <asm/pgtable.h> wants to get the definition of struct page and struct vm_area_struct. I've tried to split mm_struct into a separate header file to be able to use <asm/pgtable.h> there.
But it doesn't fly on some architectures, like ARM: it wants mm_struct to implement tlb flushing. I don't see how to fix it without massive de-inlining or converting a lot of inline functions to macros. This is another approach: expose the number of page table levels in use via Kconfig and use it in <linux/mm_types.h> instead of __PAGETABLE_PMD_FOLDED from <asm/pgtable.h>. This patch (of 19): We would want to use number of page table level to define mm_struct. Let's expose it as CONFIG_PGTABLE_LEVELS. Signed-off-by: Kirill A. Shutemov Acked-by: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Tested-by: Guenter Roeck Cc: "David S. Miller" Cc: "H. Peter Anvin" Cc: "James E.J. Bottomley" Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chris Metcalf Cc: David Howells Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Jeff Dike Cc: Kirill A. Shutemov Cc: Koichi Yasutake Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Weinberger Cc: Russell King Cc: Thomas Gleixner Cc: Tony Luck Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index b7ff9a318c31..bf9e9d3b3792 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -76,6 +76,10 @@ config GENERIC_ISA_DMA bool default y +config PGTABLE_LEVELS + int + default 3 + source "init/Kconfig" source "kernel/Kconfig.freezer" From 9f25e6ad58e1fb3b4d441e4c55635c4598a6fa94 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 14 Apr 2015 15:45:39 -0700 Subject: [PATCH 064/122] arm64: expose number of page table levels on Kconfig level We would want to use number of page table level to define mm_struct. Let's expose it as CONFIG_PGTABLE_LEVELS. ARM64_PGTABLE_LEVELS is renamed to PGTABLE_LEVELS and defined before sourcing init/Kconfig: arch/Kconfig will define default value and it's sourced from init/Kconfig. Signed-off-by: Kirill A.
Shutemov Acked-by: Catalin Marinas Cc: Will Deacon Tested-by: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 14 +++++++------- arch/arm64/include/asm/kvm_mmu.h | 4 ++-- arch/arm64/include/asm/page.h | 4 ++-- arch/arm64/include/asm/pgalloc.h | 8 ++++---- arch/arm64/include/asm/pgtable-hwdef.h | 6 +++--- arch/arm64/include/asm/pgtable-types.h | 12 ++++++------ arch/arm64/include/asm/pgtable.h | 8 ++++---- arch/arm64/include/asm/tlb.h | 4 ++-- arch/arm64/mm/mmu.c | 4 ++-- 9 files changed, 32 insertions(+), 32 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1b8e97331ffb..3f2fba996bc2 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -143,6 +143,13 @@ config KERNEL_MODE_NEON config FIX_EARLYCON_MEM def_bool y +config PGTABLE_LEVELS + int + default 2 if ARM64_64K_PAGES && ARM64_VA_BITS_42 + default 3 if ARM64_64K_PAGES && ARM64_VA_BITS_48 + default 3 if ARM64_4K_PAGES && ARM64_VA_BITS_39 + default 4 if ARM64_4K_PAGES && ARM64_VA_BITS_48 + source "init/Kconfig" source "kernel/Kconfig.freezer" @@ -413,13 +420,6 @@ config ARM64_VA_BITS default 42 if ARM64_VA_BITS_42 default 48 if ARM64_VA_BITS_48 -config ARM64_PGTABLE_LEVELS - int - default 2 if ARM64_64K_PAGES && ARM64_VA_BITS_42 - default 3 if ARM64_64K_PAGES && ARM64_VA_BITS_48 - default 3 if ARM64_4K_PAGES && ARM64_VA_BITS_39 - default 4 if ARM64_4K_PAGES && ARM64_VA_BITS_48 - config CPU_BIG_ENDIAN bool "Build big-endian kernel" help diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index bbfb600fa822..36250705dc4c 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -163,12 +163,12 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd) /* * If we are concatenating first level stage-2 page tables, we would have less * than or equal to 16 pointers in the fake PGD, because that's what the - * architecture allows. In this case, (4 - CONFIG_ARM64_PGTABLE_LEVELS) + * architecture allows. In this case, (4 - CONFIG_PGTABLE_LEVELS) * represents the first level for the host, and we add 1 to go to the next * level (which uses contatenation) for the stage-2 tables. */ #if PTRS_PER_S2_PGD <= 16 -#define KVM_PREALLOC_LEVEL (4 - CONFIG_ARM64_PGTABLE_LEVELS + 1) +#define KVM_PREALLOC_LEVEL (4 - CONFIG_PGTABLE_LEVELS + 1) #else #define KVM_PREALLOC_LEVEL (0) #endif diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index 22b16232bd60..8fc8fa280e92 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -36,9 +36,9 @@ * for more information). 
*/ #ifdef CONFIG_ARM64_64K_PAGES -#define SWAPPER_PGTABLE_LEVELS (CONFIG_ARM64_PGTABLE_LEVELS) +#define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS) #else -#define SWAPPER_PGTABLE_LEVELS (CONFIG_ARM64_PGTABLE_LEVELS - 1) +#define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS - 1) #endif #define SWAPPER_DIR_SIZE (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE) diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index e20df38a8ff3..76420568d66a 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -28,7 +28,7 @@ #define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO) -#if CONFIG_ARM64_PGTABLE_LEVELS > 2 +#if CONFIG_PGTABLE_LEVELS > 2 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { @@ -46,9 +46,9 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) set_pud(pud, __pud(__pa(pmd) | PMD_TYPE_TABLE)); } -#endif /* CONFIG_ARM64_PGTABLE_LEVELS > 2 */ +#endif /* CONFIG_PGTABLE_LEVELS > 2 */ -#if CONFIG_ARM64_PGTABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 3 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { @@ -66,7 +66,7 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) set_pgd(pgd, __pgd(__pa(pud) | PUD_TYPE_TABLE)); } -#endif /* CONFIG_ARM64_PGTABLE_LEVELS > 3 */ +#endif /* CONFIG_PGTABLE_LEVELS > 3 */ extern pgd_t *pgd_alloc(struct mm_struct *mm); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 5f930cc9ea83..80f3d241cff8 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -21,7 +21,7 @@ /* * PMD_SHIFT determines the size a level 2 page table entry can map. */ -#if CONFIG_ARM64_PGTABLE_LEVELS > 2 +#if CONFIG_PGTABLE_LEVELS > 2 #define PMD_SHIFT ((PAGE_SHIFT - 3) * 2 + 3) #define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) @@ -31,7 +31,7 @@ /* * PUD_SHIFT determines the size a level 1 page table entry can map. */ -#if CONFIG_ARM64_PGTABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 3 #define PUD_SHIFT ((PAGE_SHIFT - 3) * 3 + 3) #define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_MASK (~(PUD_SIZE-1)) @@ -42,7 +42,7 @@ * PGDIR_SHIFT determines the size a top-level page table entry can map * (depending on the configuration, this level can be 0, 1 or 2). 
*/ -#define PGDIR_SHIFT ((PAGE_SHIFT - 3) * CONFIG_ARM64_PGTABLE_LEVELS + 3) +#define PGDIR_SHIFT ((PAGE_SHIFT - 3) * CONFIG_PGTABLE_LEVELS + 3) #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) #define PTRS_PER_PGD (1 << (VA_BITS - PGDIR_SHIFT)) diff --git a/arch/arm64/include/asm/pgtable-types.h b/arch/arm64/include/asm/pgtable-types.h index ca9df80af896..2b1bd7e52c3b 100644 --- a/arch/arm64/include/asm/pgtable-types.h +++ b/arch/arm64/include/asm/pgtable-types.h @@ -38,13 +38,13 @@ typedef struct { pteval_t pte; } pte_t; #define pte_val(x) ((x).pte) #define __pte(x) ((pte_t) { (x) } ) -#if CONFIG_ARM64_PGTABLE_LEVELS > 2 +#if CONFIG_PGTABLE_LEVELS > 2 typedef struct { pmdval_t pmd; } pmd_t; #define pmd_val(x) ((x).pmd) #define __pmd(x) ((pmd_t) { (x) } ) #endif -#if CONFIG_ARM64_PGTABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 3 typedef struct { pudval_t pud; } pud_t; #define pud_val(x) ((x).pud) #define __pud(x) ((pud_t) { (x) } ) @@ -64,13 +64,13 @@ typedef pteval_t pte_t; #define pte_val(x) (x) #define __pte(x) (x) -#if CONFIG_ARM64_PGTABLE_LEVELS > 2 +#if CONFIG_PGTABLE_LEVELS > 2 typedef pmdval_t pmd_t; #define pmd_val(x) (x) #define __pmd(x) (x) #endif -#if CONFIG_ARM64_PGTABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 3 typedef pudval_t pud_t; #define pud_val(x) (x) #define __pud(x) (x) @@ -86,9 +86,9 @@ typedef pteval_t pgprot_t; #endif /* STRICT_MM_TYPECHECKS */ -#if CONFIG_ARM64_PGTABLE_LEVELS == 2 +#if CONFIG_PGTABLE_LEVELS == 2 #include -#elif CONFIG_ARM64_PGTABLE_LEVELS == 3 +#elif CONFIG_PGTABLE_LEVELS == 3 #include #endif diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 800ec0e87ed9..56283f8a675c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -374,7 +374,7 @@ static inline pte_t *pmd_page_vaddr(pmd_t pmd) */ #define mk_pte(page,prot) pfn_pte(page_to_pfn(page),prot) -#if CONFIG_ARM64_PGTABLE_LEVELS > 2 +#if CONFIG_PGTABLE_LEVELS > 2 #define pmd_ERROR(pmd) __pmd_error(__FILE__, __LINE__, pmd_val(pmd)) @@ -409,9 +409,9 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr) #define pud_page(pud) pfn_to_page(__phys_to_pfn(pud_val(pud) & PHYS_MASK)) -#endif /* CONFIG_ARM64_PGTABLE_LEVELS > 2 */ +#endif /* CONFIG_PGTABLE_LEVELS > 2 */ -#if CONFIG_ARM64_PGTABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 3 #define pud_ERROR(pud) __pud_error(__FILE__, __LINE__, pud_val(pud)) @@ -445,7 +445,7 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long addr) #define pgd_page(pgd) pfn_to_page(__phys_to_pfn(pgd_val(pgd) & PHYS_MASK)) -#endif /* CONFIG_ARM64_PGTABLE_LEVELS > 3 */ +#endif /* CONFIG_PGTABLE_LEVELS > 3 */ #define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index 53d9c354219f..3a0242c7eb8d 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -53,7 +53,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, tlb_remove_entry(tlb, pte); } -#if CONFIG_ARM64_PGTABLE_LEVELS > 2 +#if CONFIG_PGTABLE_LEVELS > 2 static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr) { @@ -62,7 +62,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, } #endif -#if CONFIG_ARM64_PGTABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 3 static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp, unsigned long addr) { diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 
c6daaf6c6f97..79e01163a981 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -550,10 +550,10 @@ void vmemmap_free(unsigned long start, unsigned long end) #endif /* CONFIG_SPARSEMEM_VMEMMAP */ static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss; -#if CONFIG_ARM64_PGTABLE_LEVELS > 2 +#if CONFIG_PGTABLE_LEVELS > 2 static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss; #endif -#if CONFIG_ARM64_PGTABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 3 static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss; #endif From 1bcad26e9d5362d4890ab5718d729ee9cd85a493 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 14 Apr 2015 15:45:42 -0700 Subject: [PATCH 065/122] arm: expose number of page table levels on Kconfig level We would want to use number of page table level to define mm_struct. Let's expose it as CONFIG_PGTABLE_LEVELS. Signed-off-by: Kirill A. Shutemov Cc: Russell King Tested-by: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index cf4c0c99aa25..696cf3c61e0f 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -286,6 +286,11 @@ config GENERIC_BUG def_bool y depends on BUG +config PGTABLE_LEVELS + int + default 3 if ARM_LPAE + default 2 + source "init/Kconfig" source "kernel/Kconfig.freezer" From 4d66bcc7cf762e82703b94d416245d4a216c79ae Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 14 Apr 2015 15:45:45 -0700 Subject: [PATCH 066/122] ia64: expose number of page table levels on Kconfig level We would want to use number of page table level to define mm_struct. Let's expose it as CONFIG_PGTABLE_LEVELS. We need to define PGTABLE_LEVELS before sourcing init/Kconfig: arch/Kconfig will define default value and it's sourced from init/Kconfig. Signed-off-by: Kirill A. 
Shutemov Cc: Tony Luck Cc: Fenghua Yu Tested-by: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/Kconfig | 18 +++++------------- arch/ia64/include/asm/page.h | 4 ++-- arch/ia64/include/asm/pgalloc.h | 4 ++-- arch/ia64/include/asm/pgtable.h | 12 ++++++------ arch/ia64/kernel/ivt.S | 12 ++++++------ arch/ia64/kernel/machine_kexec.c | 4 ++-- 6 files changed, 23 insertions(+), 31 deletions(-) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 074e52bf815c..4f9a6661491b 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -1,3 +1,8 @@ +config PGTABLE_LEVELS + int "Page Table Levels" if !IA64_PAGE_SIZE_64KB + range 3 4 if !IA64_PAGE_SIZE_64KB + default 3 + source "init/Kconfig" source "kernel/Kconfig.freezer" @@ -286,19 +291,6 @@ config IA64_PAGE_SIZE_64KB endchoice -choice - prompt "Page Table Levels" - default PGTABLE_3 - -config PGTABLE_3 - bool "3 Levels" - -config PGTABLE_4 - depends on !IA64_PAGE_SIZE_64KB - bool "4 Levels" - -endchoice - if IA64_HP_SIM config HZ default 32 diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h index 1f1bf144fe62..ec48bb9f95e1 100644 --- a/arch/ia64/include/asm/page.h +++ b/arch/ia64/include/asm/page.h @@ -173,7 +173,7 @@ get_order (unsigned long size) */ typedef struct { unsigned long pte; } pte_t; typedef struct { unsigned long pmd; } pmd_t; -#ifdef CONFIG_PGTABLE_4 +#if CONFIG_PGTABLE_LEVELS == 4 typedef struct { unsigned long pud; } pud_t; #endif typedef struct { unsigned long pgd; } pgd_t; @@ -182,7 +182,7 @@ get_order (unsigned long size) # define pte_val(x) ((x).pte) # define pmd_val(x) ((x).pmd) -#ifdef CONFIG_PGTABLE_4 +#if CONFIG_PGTABLE_LEVELS == 4 # define pud_val(x) ((x).pud) #endif # define pgd_val(x) ((x).pgd) diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h index 5767cdfc08db..f5e70e961948 100644 --- a/arch/ia64/include/asm/pgalloc.h +++ b/arch/ia64/include/asm/pgalloc.h @@ -32,7 +32,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) quicklist_free(0, NULL, pgd); } -#ifdef CONFIG_PGTABLE_4 +#if CONFIG_PGTABLE_LEVELS == 4 static inline void pgd_populate(struct mm_struct *mm, pgd_t * pgd_entry, pud_t * pud) { @@ -49,7 +49,7 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) quicklist_free(0, NULL, pud); } #define __pud_free_tlb(tlb, pud, address) pud_free((tlb)->mm, pud) -#endif /* CONFIG_PGTABLE_4 */ +#endif /* CONFIG_PGTABLE_LEVELS == 4 */ static inline void pud_populate(struct mm_struct *mm, pud_t * pud_entry, pmd_t * pmd) diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 7b6f8801df57..9f3ed9ee8f13 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -99,7 +99,7 @@ #define PMD_MASK (~(PMD_SIZE-1)) #define PTRS_PER_PMD (1UL << (PTRS_PER_PTD_SHIFT)) -#ifdef CONFIG_PGTABLE_4 +#if CONFIG_PGTABLE_LEVELS == 4 /* * Definitions for second level: * @@ -117,7 +117,7 @@ * * PGDIR_SHIFT determines what a first-level page table entry can map. 
*/ -#ifdef CONFIG_PGTABLE_4 +#if CONFIG_PGTABLE_LEVELS == 4 #define PGDIR_SHIFT (PUD_SHIFT + (PTRS_PER_PTD_SHIFT)) #else #define PGDIR_SHIFT (PMD_SHIFT + (PTRS_PER_PTD_SHIFT)) @@ -180,7 +180,7 @@ #define __S111 __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RWX) #define pgd_ERROR(e) printk("%s:%d: bad pgd %016lx.\n", __FILE__, __LINE__, pgd_val(e)) -#ifdef CONFIG_PGTABLE_4 +#if CONFIG_PGTABLE_LEVELS == 4 #define pud_ERROR(e) printk("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e)) #endif #define pmd_ERROR(e) printk("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e)) @@ -281,7 +281,7 @@ extern unsigned long VMALLOC_END; #define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & _PFN_MASK)) #define pud_page(pud) virt_to_page((pud_val(pud) + PAGE_OFFSET)) -#ifdef CONFIG_PGTABLE_4 +#if CONFIG_PGTABLE_LEVELS == 4 #define pgd_none(pgd) (!pgd_val(pgd)) #define pgd_bad(pgd) (!ia64_phys_addr_valid(pgd_val(pgd))) #define pgd_present(pgd) (pgd_val(pgd) != 0UL) @@ -384,7 +384,7 @@ pgd_offset (const struct mm_struct *mm, unsigned long address) here. */ #define pgd_offset_gate(mm, addr) pgd_offset_k(addr) -#ifdef CONFIG_PGTABLE_4 +#if CONFIG_PGTABLE_LEVELS == 4 /* Find an entry in the second-level page table.. */ #define pud_offset(dir,addr) \ ((pud_t *) pgd_page_vaddr(*(dir)) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))) @@ -586,7 +586,7 @@ extern struct page *zero_page_memmap_ptr; #define __HAVE_ARCH_PGD_OFFSET_GATE -#ifndef CONFIG_PGTABLE_4 +#if CONFIG_PGTABLE_LEVELS == 3 #include #endif #include diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S index 18e794a57248..e42bf7a913f3 100644 --- a/arch/ia64/kernel/ivt.S +++ b/arch/ia64/kernel/ivt.S @@ -146,7 +146,7 @@ ENTRY(vhpt_miss) (p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5 (p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4] cmp.eq p7,p6=0,r21 // unused address bits all zeroes? -#ifdef CONFIG_PGTABLE_4 +#if CONFIG_PGTABLE_LEVELS == 4 shr.u r28=r22,PUD_SHIFT // shift pud index into position #else shr.u r18=r22,PMD_SHIFT // shift pmd index into position @@ -155,7 +155,7 @@ ENTRY(vhpt_miss) ld8 r17=[r17] // get *pgd (may be 0) ;; (p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) == NULL? -#ifdef CONFIG_PGTABLE_4 +#if CONFIG_PGTABLE_LEVELS == 4 dep r28=r28,r17,3,(PAGE_SHIFT-3) // r28=pud_offset(pgd,addr) ;; shr.u r18=r22,PMD_SHIFT // shift pmd index into position @@ -222,13 +222,13 @@ ENTRY(vhpt_miss) */ ld8 r25=[r21] // read *pte again ld8 r26=[r17] // read *pmd again -#ifdef CONFIG_PGTABLE_4 +#if CONFIG_PGTABLE_LEVELS == 4 ld8 r19=[r28] // read *pud again #endif cmp.ne p6,p7=r0,r0 ;; cmp.ne.or.andcm p6,p7=r26,r20 // did *pmd change -#ifdef CONFIG_PGTABLE_4 +#if CONFIG_PGTABLE_LEVELS == 4 cmp.ne.or.andcm p6,p7=r19,r29 // did *pud change #endif mov r27=PAGE_SHIFT<<2 @@ -476,7 +476,7 @@ ENTRY(nested_dtlb_miss) (p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5 (p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4] cmp.eq p7,p6=0,r21 // unused address bits all zeroes? -#ifdef CONFIG_PGTABLE_4 +#if CONFIG_PGTABLE_LEVELS == 4 shr.u r18=r22,PUD_SHIFT // shift pud index into position #else shr.u r18=r22,PMD_SHIFT // shift pmd index into position @@ -487,7 +487,7 @@ ENTRY(nested_dtlb_miss) (p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) == NULL? 
dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=p[u|m]d_offset(pgd,addr) ;; -#ifdef CONFIG_PGTABLE_4 +#if CONFIG_PGTABLE_LEVELS == 4 (p7) ld8 r17=[r17] // get *pud (may be 0) shr.u r18=r22,PMD_SHIFT // shift pmd index into position ;; diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c index 5151a649c96b..b72cd7a07222 100644 --- a/arch/ia64/kernel/machine_kexec.c +++ b/arch/ia64/kernel/machine_kexec.c @@ -156,9 +156,9 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_OFFSET(node_memblk_s, start_paddr); VMCOREINFO_OFFSET(node_memblk_s, size); #endif -#ifdef CONFIG_PGTABLE_3 +#if CONFIG_PGTABLE_LEVELS == 3 VMCOREINFO_CONFIG(PGTABLE_3); -#elif defined(CONFIG_PGTABLE_4) +#elif CONFIG_PGTABLE_LEVELS == 4 VMCOREINFO_CONFIG(PGTABLE_4); #endif } From 980d5b7387c6bbc9b411a3469274b3226b161e45 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 14 Apr 2015 15:45:48 -0700 Subject: [PATCH 067/122] m68k: mark PMD folded and expose number of page table levels We would want to use number of page table level to define mm_struct. Let's expose it as CONFIG_PGTABLE_LEVELS. Core mm expects __PAGETABLE_{PUD,PMD}_FOLDED to be defined if these page table levels folded. Signed-off-by: Kirill A. Shutemov Cc: Geert Uytterhoeven Tested-by: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m68k/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 87b7c7581b1d..2dd8f63bfbbb 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -67,6 +67,10 @@ config HZ default 1000 if CLEOPATRA default 100 +config PGTABLE_LEVELS + default 2 if SUN3 || COLDFIRE + default 3 + source "init/Kconfig" source "kernel/Kconfig.freezer" From a728ab52709e2fb9659d2661ac30c901df42ef02 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 14 Apr 2015 15:45:51 -0700 Subject: [PATCH 068/122] mips: expose number of page table levels on Kconfig level We would want to use number of page table level to define mm_struct. Let's expose it as CONFIG_PGTABLE_LEVELS. Signed-off-by: Kirill A. Shutemov Cc: Ralf Baechle Tested-by: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index c7a16904cd03..a9d112d2a135 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2600,6 +2600,11 @@ config STACKTRACE_SUPPORT bool default y +config PGTABLE_LEVELS + int + default 3 if 64BIT && !PAGE_SIZE_64KB + default 2 + source "init/Kconfig" source "kernel/Kconfig.freezer" From f24ffde43237755b290c46306a3dd2deb1428700 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 14 Apr 2015 15:45:54 -0700 Subject: [PATCH 069/122] parisc: expose number of page table levels on Kconfig level We would want to use number of page table level to define mm_struct. Let's expose it as CONFIG_PGTABLE_LEVELS. Signed-off-by: Kirill A. Shutemov Cc: "James E.J. 
Bottomley" Cc: Helge Deller Tested-by: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/parisc/Kconfig | 5 +++++ arch/parisc/include/asm/pgalloc.h | 2 +- arch/parisc/include/asm/pgtable.h | 16 +++++++--------- arch/parisc/kernel/entry.S | 4 ++-- arch/parisc/kernel/head.S | 4 ++-- arch/parisc/mm/init.c | 2 +- 6 files changed, 18 insertions(+), 15 deletions(-) diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 8014727a2743..c36546959e86 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -103,6 +103,11 @@ config ARCH_MAY_HAVE_PC_FDC depends on BROKEN default y +config PGTABLE_LEVELS + int + default 3 if 64BIT && PARISC_PAGE_SIZE_4KB + default 2 + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h index d17437238a2c..1ba29369257c 100644 --- a/arch/parisc/include/asm/pgalloc.h +++ b/arch/parisc/include/asm/pgalloc.h @@ -51,7 +51,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_pages((unsigned long)pgd, PGD_ALLOC_ORDER); } -#if PT_NLEVELS == 3 +#if CONFIG_PGTABLE_LEVELS == 3 /* Three Level Page Table Support for pmd's */ diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 15207b9362bf..0a183756d6ec 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -68,13 +68,11 @@ extern void purge_tlb_entries(struct mm_struct *, unsigned long); #define KERNEL_INITIAL_ORDER 24 /* 0 to 1<<24 = 16MB */ #define KERNEL_INITIAL_SIZE (1 << KERNEL_INITIAL_ORDER) -#if defined(CONFIG_64BIT) && defined(CONFIG_PARISC_PAGE_SIZE_4KB) -#define PT_NLEVELS 3 +#if CONFIG_PGTABLE_LEVELS == 3 #define PGD_ORDER 1 /* Number of pages per pgd */ #define PMD_ORDER 1 /* Number of pages per pmd */ #define PGD_ALLOC_ORDER 2 /* first pgd contains pmd */ #else -#define PT_NLEVELS 2 #define PGD_ORDER 1 /* Number of pages per pgd */ #define PGD_ALLOC_ORDER PGD_ORDER #endif @@ -93,7 +91,7 @@ extern void purge_tlb_entries(struct mm_struct *, unsigned long); #define PMD_SHIFT (PLD_SHIFT + BITS_PER_PTE) #define PMD_SIZE (1UL << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) -#if PT_NLEVELS == 3 +#if CONFIG_PGTABLE_LEVELS == 3 #define BITS_PER_PMD (PAGE_SHIFT + PMD_ORDER - BITS_PER_PMD_ENTRY) #else #define __PAGETABLE_PMD_FOLDED @@ -277,7 +275,7 @@ extern unsigned long *empty_zero_page; #define pgd_flag(x) (pgd_val(x) & PxD_FLAG_MASK) #define pgd_address(x) ((unsigned long)(pgd_val(x) &~ PxD_FLAG_MASK) << PxD_VALUE_SHIFT) -#if PT_NLEVELS == 3 +#if CONFIG_PGTABLE_LEVELS == 3 /* The first entry of the permanent pmd is not there if it contains * the gateway marker */ #define pmd_none(x) (!pmd_val(x) || pmd_flag(x) == PxD_FLAG_ATTACHED) @@ -287,7 +285,7 @@ extern unsigned long *empty_zero_page; #define pmd_bad(x) (!(pmd_flag(x) & PxD_FLAG_VALID)) #define pmd_present(x) (pmd_flag(x) & PxD_FLAG_PRESENT) static inline void pmd_clear(pmd_t *pmd) { -#if PT_NLEVELS == 3 +#if CONFIG_PGTABLE_LEVELS == 3 if (pmd_flag(*pmd) & PxD_FLAG_ATTACHED) /* This is the entry pointing to the permanent pmd * attached to the pgd; cannot clear it */ @@ -299,7 +297,7 @@ static inline void pmd_clear(pmd_t *pmd) { -#if PT_NLEVELS == 3 +#if CONFIG_PGTABLE_LEVELS == 3 #define pgd_page_vaddr(pgd) ((unsigned long) __va(pgd_address(pgd))) #define pgd_page(pgd) virt_to_page((void *)pgd_page_vaddr(pgd)) @@ -309,7 +307,7 @@ static inline void pmd_clear(pmd_t *pmd) { #define pgd_bad(x) (!(pgd_flag(x) & PxD_FLAG_VALID)) #define pgd_present(x) (pgd_flag(x) 
& PxD_FLAG_PRESENT) static inline void pgd_clear(pgd_t *pgd) { -#if PT_NLEVELS == 3 +#if CONFIG_PGTABLE_LEVELS == 3 if(pgd_flag(*pgd) & PxD_FLAG_ATTACHED) /* This is the permanent pmd attached to the pgd; cannot * free it */ @@ -393,7 +391,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) /* Find an entry in the second-level page table.. */ -#if PT_NLEVELS == 3 +#if CONFIG_PGTABLE_LEVELS == 3 #define pmd_offset(dir,address) \ ((pmd_t *) pgd_page_vaddr(*(dir)) + (((address)>>PMD_SHIFT) & (PTRS_PER_PMD-1))) #else diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index 2ab16bb160a8..75819617f93b 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S @@ -398,7 +398,7 @@ * can address up to 1TB */ .macro L2_ptep pmd,pte,index,va,fault -#if PT_NLEVELS == 3 +#if CONFIG_PGTABLE_LEVELS == 3 extru \va,31-ASM_PMD_SHIFT,ASM_BITS_PER_PMD,\index #else # if defined(CONFIG_64BIT) @@ -436,7 +436,7 @@ * all ILP32 processes and all the kernel for machines with * under 4GB of memory) */ .macro L3_ptep pgd,pte,index,va,fault -#if PT_NLEVELS == 3 /* we might have a 2-Level scheme, e.g. with 16kb page size */ +#if CONFIG_PGTABLE_LEVELS == 3 /* we might have a 2-Level scheme, e.g. with 16kb page size */ extrd,u \va,63-ASM_PGDIR_SHIFT,ASM_BITS_PER_PGD,\index copy %r0,\pte extrd,u,*= \va,63-ASM_PGDIR_SHIFT,64-ASM_PGDIR_SHIFT,%r0 diff --git a/arch/parisc/kernel/head.S b/arch/parisc/kernel/head.S index d4dc588c0dc1..e7d64527aff9 100644 --- a/arch/parisc/kernel/head.S +++ b/arch/parisc/kernel/head.S @@ -74,7 +74,7 @@ $bss_loop: mtctl %r4,%cr24 /* Initialize kernel root pointer */ mtctl %r4,%cr25 /* Initialize user root pointer */ -#if PT_NLEVELS == 3 +#if CONFIG_PGTABLE_LEVELS == 3 /* Set pmd in pgd */ load32 PA(pmd0),%r5 shrd %r5,PxD_VALUE_SHIFT,%r3 @@ -97,7 +97,7 @@ $bss_loop: stw %r3,0(%r4) ldo (PAGE_SIZE >> PxD_VALUE_SHIFT)(%r3),%r3 addib,> -1,%r1,1b -#if PT_NLEVELS == 3 +#if CONFIG_PGTABLE_LEVELS == 3 ldo ASM_PMD_ENTRY_SIZE(%r4),%r4 #else ldo ASM_PGD_ENTRY_SIZE(%r4),%r4 diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 15dbe81cf5f3..c229427fa546 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -34,7 +34,7 @@ extern int data_start; extern void parisc_kernel_start(void); /* Kernel entry point in head.S */ -#if PT_NLEVELS == 3 +#if CONFIG_PGTABLE_LEVELS == 3 /* NOTE: This layout exactly conforms to the hybrid L2/L3 page table layout * with the first pmd adjacent to the pgd and below it. gcc doesn't actually * guarantee that global objects will be laid out in memory in the same order From 06ef42a16f2dd8480ada1be3a549e12b02c45915 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 14 Apr 2015 15:45:57 -0700 Subject: [PATCH 070/122] powerpc: expose number of page table levels on Kconfig level We would want to use number of page table level to define mm_struct. Let's expose it as CONFIG_PGTABLE_LEVELS. Signed-off-by: Kirill A. 
Shutemov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Tested-by: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/Kconfig | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 22b0940494bb..91ad76f30d18 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -297,6 +297,12 @@ config ZONE_DMA32 bool default y if PPC64 +config PGTABLE_LEVELS + int + default 2 if !PPC64 + default 3 if PPC_64K_PAGES + default 4 + source "init/Kconfig" source "kernel/Kconfig.freezer" From c81956c9cd587d36b87d4ab37c92016ebb79b517 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 14 Apr 2015 15:46:00 -0700 Subject: [PATCH 071/122] s390: expose number of page table levels We would want to use number of page table level to define mm_struct. Let's expose it as CONFIG_PGTABLE_LEVELS. Core mm expects __PAGETABLE_{PUD,PMD}_FOLDED to be defined if these page table levels folded. Signed-off-by: Kirill A. Shutemov Cc: Martin Schwidefsky Cc: Heiko Carstens Tested-by: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 373cd5badf1c..f6aebcb7a0f8 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -155,6 +155,11 @@ config S390 config SCHED_OMIT_FRAME_POINTER def_bool y +config PGTABLE_LEVELS + int + default 4 if 64BIT + default 2 + source "init/Kconfig" source "kernel/Kconfig.freezer" From 69543d639938fc71b3df69d32ecccc58beb7e578 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 14 Apr 2015 15:46:02 -0700 Subject: [PATCH 072/122] sh: expose number of page table levels We would want to use number of page table level to define mm_struct. Let's expose it as CONFIG_PGTABLE_LEVELS. Signed-off-by: Kirill A. Shutemov Cc: linux-sh@vger.kernel.org Tested-by: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index eb4ef274ae9b..50057fed819d 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -162,6 +162,10 @@ config NEED_DMA_MAP_STATE config NEED_SG_DMA_LENGTH def_bool y +config PGTABLE_LEVELS + default 3 if X2TLB + default 2 + source "init/Kconfig" source "kernel/Kconfig.freezer" From 81a2936c1a98dd007c510dc523c5bbdce263878b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 14 Apr 2015 15:46:05 -0700 Subject: [PATCH 073/122] sparc: expose number of page table levels We would want to use number of page table level to define mm_struct. Let's expose it as CONFIG_PGTABLE_LEVELS. Signed-off-by: Kirill A. Shutemov Cc: "David S. Miller" Tested-by: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index efb00ec75805..e49502acbab4 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -146,6 +146,10 @@ config GENERIC_ISA_DMA config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y if SPARC64 +config PGTABLE_LEVELS + default 4 if 64BIT + default 3 + source "init/Kconfig" source "kernel/Kconfig.freezer" From 909d45e62801ea77422d478519baa7d2287c77f7 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 14 Apr 2015 15:46:08 -0700 Subject: [PATCH 074/122] tile: expose number of page table levels We would want to use number of page table level to define mm_struct. 
Let's expose it as CONFIG_PGTABLE_LEVELS. Signed-off-by: Kirill A. Shutemov Acked-by: Chris Metcalf Tested-by: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/tile/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 7cca41842a9e..0142d578b5a8 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -147,6 +147,11 @@ config ARCH_DEFCONFIG default "arch/tile/configs/tilepro_defconfig" if !TILEGX default "arch/tile/configs/tilegx_defconfig" if TILEGX +config PGTABLE_LEVELS + int + default 3 if 64BIT + default 2 + source "init/Kconfig" source "kernel/Kconfig.freezer" From 6b8ce2a1a464526335672c33cbf3cb9fc638efff Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 14 Apr 2015 15:46:11 -0700 Subject: [PATCH 075/122] um: expose number of page table levels We would want to use number of page table level to define mm_struct. Let's expose it as CONFIG_PGTABLE_LEVELS. Signed-off-by: Kirill A. Shutemov Acked-by: Richard Weinberger Cc: Jeff Dike Tested-by: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/Kconfig.um | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/um/Kconfig.um b/arch/um/Kconfig.um index a7520c90f62d..5dbfe3d9107c 100644 --- a/arch/um/Kconfig.um +++ b/arch/um/Kconfig.um @@ -155,3 +155,8 @@ config MMAPPER config NO_DMA def_bool y + +config PGTABLE_LEVELS + int + default 3 if 3_LEVEL_PGTABLES + default 2 From 982333683385343d8d2db9a1df69c98406f42687 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 14 Apr 2015 15:46:14 -0700 Subject: [PATCH 076/122] x86: expose number of page table levels on Kconfig level We would want to use number of page table level to define mm_struct. Let's expose it as CONFIG_PGTABLE_LEVELS. Signed-off-by: Kirill A. Shutemov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Tested-by: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 6 ++++++ arch/x86/include/asm/paravirt.h | 8 ++++---- arch/x86/include/asm/paravirt_types.h | 8 ++++---- arch/x86/include/asm/pgalloc.h | 8 ++++---- arch/x86/include/asm/pgtable-2level_types.h | 1 - arch/x86/include/asm/pgtable-3level_types.h | 2 -- arch/x86/include/asm/pgtable.h | 8 ++++---- arch/x86/include/asm/pgtable_64_types.h | 1 - arch/x86/include/asm/pgtable_types.h | 4 ++-- arch/x86/kernel/paravirt.c | 6 +++--- arch/x86/mm/pgtable.c | 14 +++++++------- arch/x86/xen/mmu.c | 14 +++++++------- include/trace/events/xen.h | 2 +- 13 files changed, 42 insertions(+), 40 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index faff6934c05a..3e3aaad23414 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -277,6 +277,12 @@ config ARCH_SUPPORTS_UPROBES config FIX_EARLYCON_MEM def_bool y +config PGTABLE_LEVELS + int + default 4 if X86_64 + default 3 if X86_PAE + default 2 + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 5f6051d5d139..8957810ad7d1 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -545,7 +545,7 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val); } -#if PAGETABLE_LEVELS >= 3 +#if CONFIG_PGTABLE_LEVELS >= 3 static inline pmd_t __pmd(pmdval_t val) { pmdval_t ret; @@ -585,7 +585,7 @@ static inline void set_pud(pud_t *pudp, pud_t pud) PVOP_VCALL2(pv_mmu_ops.set_pud, pudp, val); } -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 static inline pud_t __pud(pudval_t val) { pudval_t ret; @@ -636,9 +636,9 @@ static inline void pud_clear(pud_t *pudp) set_pud(pudp, __pud(0)); } -#endif /* PAGETABLE_LEVELS == 4 */ +#endif /* CONFIG_PGTABLE_LEVELS == 4 */ -#endif /* PAGETABLE_LEVELS >= 3 */ +#endif /* CONFIG_PGTABLE_LEVELS >= 3 */ #ifdef CONFIG_X86_PAE /* Special-case pte-setting operations for PAE, which can't update a diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 7549b8b369e4..f7b0b5c112f2 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -294,7 +294,7 @@ struct pv_mmu_ops { struct paravirt_callee_save pgd_val; struct paravirt_callee_save make_pgd; -#if PAGETABLE_LEVELS >= 3 +#if CONFIG_PGTABLE_LEVELS >= 3 #ifdef CONFIG_X86_PAE void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); void (*pte_clear)(struct mm_struct *mm, unsigned long addr, @@ -308,13 +308,13 @@ struct pv_mmu_ops { struct paravirt_callee_save pmd_val; struct paravirt_callee_save make_pmd; -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 struct paravirt_callee_save pud_val; struct paravirt_callee_save make_pud; void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); -#endif /* PAGETABLE_LEVELS == 4 */ -#endif /* PAGETABLE_LEVELS >= 3 */ +#endif /* CONFIG_PGTABLE_LEVELS == 4 */ +#endif /* CONFIG_PGTABLE_LEVELS >= 3 */ struct pv_lazy_ops lazy_mode; diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index c4412e972bbd..bf7f8b55b0f9 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -77,7 +77,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, #define pmd_pgtable(pmd) pmd_page(pmd) -#if PAGETABLE_LEVELS > 2 +#if CONFIG_PGTABLE_LEVELS > 2 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { struct page *page; @@ -116,7 
+116,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) } #endif /* CONFIG_X86_PAE */ -#if PAGETABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 3 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) { paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); @@ -142,7 +142,7 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, ___pud_free_tlb(tlb, pud); } -#endif /* PAGETABLE_LEVELS > 3 */ -#endif /* PAGETABLE_LEVELS > 2 */ +#endif /* CONFIG_PGTABLE_LEVELS > 3 */ +#endif /* CONFIG_PGTABLE_LEVELS > 2 */ #endif /* _ASM_X86_PGALLOC_H */ diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index daacc23e3fb9..392576433e77 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -17,7 +17,6 @@ typedef union { #endif /* !__ASSEMBLY__ */ #define SHARED_KERNEL_PMD 0 -#define PAGETABLE_LEVELS 2 /* * traditional i386 two-level paging structure: diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 1bd5876c8649..bcc89625ebe5 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -24,8 +24,6 @@ typedef union { #define SHARED_KERNEL_PMD 1 #endif -#define PAGETABLE_LEVELS 3 - /* * PGDIR_SHIFT determines what a top-level page table entry can map */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index a0c35bf6cb92..fe57e7a98839 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -551,7 +551,7 @@ static inline unsigned long pages_to_mb(unsigned long npg) return npg >> (20 - PAGE_SHIFT); } -#if PAGETABLE_LEVELS > 2 +#if CONFIG_PGTABLE_LEVELS > 2 static inline int pud_none(pud_t pud) { return native_pud_val(pud) == 0; @@ -594,9 +594,9 @@ static inline int pud_large(pud_t pud) { return 0; } -#endif /* PAGETABLE_LEVELS > 2 */ +#endif /* CONFIG_PGTABLE_LEVELS > 2 */ -#if PAGETABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 3 static inline int pgd_present(pgd_t pgd) { return pgd_flags(pgd) & _PAGE_PRESENT; @@ -633,7 +633,7 @@ static inline int pgd_none(pgd_t pgd) { return !native_pgd_val(pgd); } -#endif /* PAGETABLE_LEVELS > 3 */ +#endif /* CONFIG_PGTABLE_LEVELS > 3 */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 602b6028c5b6..e6844dfb4471 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -20,7 +20,6 @@ typedef struct { pteval_t pte; } pte_t; #endif /* !__ASSEMBLY__ */ #define SHARED_KERNEL_PMD 0 -#define PAGETABLE_LEVELS 4 /* * PGDIR_SHIFT determines what a top-level page table entry can map diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 8c7c10802e9c..78f0c8cbe316 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -234,7 +234,7 @@ static inline pgdval_t pgd_flags(pgd_t pgd) return native_pgd_val(pgd) & PTE_FLAGS_MASK; } -#if PAGETABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 3 typedef struct { pudval_t pud; } pud_t; static inline pud_t native_make_pud(pmdval_t val) @@ -255,7 +255,7 @@ static inline pudval_t native_pud_val(pud_t pud) } #endif -#if PAGETABLE_LEVELS > 2 +#if CONFIG_PGTABLE_LEVELS > 2 typedef struct { pmdval_t pmd; } pmd_t; static inline pmd_t native_make_pmd(pmdval_t val) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 
548d25f00c90..c614dd492f5f 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -443,7 +443,7 @@ struct pv_mmu_ops pv_mmu_ops = { .ptep_modify_prot_start = __ptep_modify_prot_start, .ptep_modify_prot_commit = __ptep_modify_prot_commit, -#if PAGETABLE_LEVELS >= 3 +#if CONFIG_PGTABLE_LEVELS >= 3 #ifdef CONFIG_X86_PAE .set_pte_atomic = native_set_pte_atomic, .pte_clear = native_pte_clear, @@ -454,13 +454,13 @@ struct pv_mmu_ops pv_mmu_ops = { .pmd_val = PTE_IDENT, .make_pmd = PTE_IDENT, -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 .pud_val = PTE_IDENT, .make_pud = PTE_IDENT, .set_pgd = native_set_pgd, #endif -#endif /* PAGETABLE_LEVELS >= 3 */ +#endif /* CONFIG_PGTABLE_LEVELS >= 3 */ .pte_val = PTE_IDENT, .pgd_val = PTE_IDENT, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5a7e5252c878..b28edfecbdfe 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -58,7 +58,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) tlb_remove_page(tlb, pte); } -#if PAGETABLE_LEVELS > 2 +#if CONFIG_PGTABLE_LEVELS > 2 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { struct page *page = virt_to_page(pmd); @@ -74,14 +74,14 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) tlb_remove_page(tlb, page); } -#if PAGETABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); tlb_remove_page(tlb, virt_to_page(pud)); } -#endif /* PAGETABLE_LEVELS > 3 */ -#endif /* PAGETABLE_LEVELS > 2 */ +#endif /* CONFIG_PGTABLE_LEVELS > 3 */ +#endif /* CONFIG_PGTABLE_LEVELS > 2 */ static inline void pgd_list_add(pgd_t *pgd) { @@ -117,9 +117,9 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) /* If the pgd points to a shared pagetable level (either the ptes in non-PAE, or shared PMD in PAE), then just copy the references from swapper_pg_dir. */ - if (PAGETABLE_LEVELS == 2 || - (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || - PAGETABLE_LEVELS == 4) { + if (CONFIG_PGTABLE_LEVELS == 2 || + (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || + CONFIG_PGTABLE_LEVELS == 4) { clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index adca9e2b6553..65083ad63b6f 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -502,7 +502,7 @@ __visible pmd_t xen_make_pmd(pmdval_t pmd) } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 __visible pudval_t xen_pud_val(pud_t pud) { return pte_mfn_to_pfn(pud.pud); @@ -589,7 +589,7 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val) xen_mc_issue(PARAVIRT_LAZY_MMU); } -#endif /* PAGETABLE_LEVELS == 4 */ +#endif /* CONFIG_PGTABLE_LEVELS == 4 */ /* * (Yet another) pagetable walker. 
This one is intended for pinning a @@ -1628,7 +1628,7 @@ static void xen_release_pmd(unsigned long pfn) xen_release_ptpage(pfn, PT_PMD); } -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) { xen_alloc_ptpage(mm, pfn, PT_PUD); @@ -2046,7 +2046,7 @@ static void __init xen_post_allocator_init(void) pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; pv_mmu_ops.set_pud = xen_set_pud; -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 pv_mmu_ops.set_pgd = xen_set_pgd; #endif @@ -2056,7 +2056,7 @@ static void __init xen_post_allocator_init(void) pv_mmu_ops.alloc_pmd = xen_alloc_pmd; pv_mmu_ops.release_pte = xen_release_pte; pv_mmu_ops.release_pmd = xen_release_pmd; -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 pv_mmu_ops.alloc_pud = xen_alloc_pud; pv_mmu_ops.release_pud = xen_release_pud; #endif @@ -2122,14 +2122,14 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 .pud_val = PV_CALLEE_SAVE(xen_pud_val), .make_pud = PV_CALLEE_SAVE(xen_make_pud), .set_pgd = xen_set_pgd_hyper, .alloc_pud = xen_alloc_pmd_init, .release_pud = xen_release_pmd_init, -#endif /* PAGETABLE_LEVELS == 4 */ +#endif /* CONFIG_PGTABLE_LEVELS == 4 */ .activate_mm = xen_activate_mm, .dup_mmap = xen_dup_mmap, diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h index d06b6da5c1e3..bce990f5a35d 100644 --- a/include/trace/events/xen.h +++ b/include/trace/events/xen.h @@ -224,7 +224,7 @@ TRACE_EVENT(xen_mmu_pmd_clear, TP_printk("pmdp %p", __entry->pmdp) ); -#if PAGETABLE_LEVELS >= 4 +#if CONFIG_PGTABLE_LEVELS >= 4 TRACE_EVENT(xen_mmu_set_pud, TP_PROTO(pud_t *pudp, pud_t pudval), From 235a8f0286d3de5754322e9b4abd472ed4d57209 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 14 Apr 2015 15:46:17 -0700 Subject: [PATCH 077/122] mm: define default PGTABLE_LEVELS to two By this time all architectures which support more than two page table levels should be covered. This patch adds a default definition of PGTABLE_LEVELS equal to 2. We also add an assert to detect inconsistency between CONFIG_PGTABLE_LEVELS and __PAGETABLE_PMD_FOLDED/__PAGETABLE_PUD_FOLDED. Signed-off-by: Kirill A. Shutemov Tested-by: Guenter Roeck Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: "David S. Miller" Cc: "H. Peter Anvin" Cc: "James E.J. Bottomley" Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chris Metcalf Cc: David Howells Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Jeff Dike Cc: Kirill A. Shutemov Cc: Koichi Yasutake Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Weinberger Cc: Russell King Cc: Thomas Gleixner Cc: Tony Luck Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 4 ++++ include/asm-generic/pgtable.h | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index 05d7a8a458d5..a9c95d36ba70 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -484,6 +484,10 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK This spares a stack switch and improves cache usage on softirq processing.
+config PGTABLE_LEVELS + int + default 2 + # # ABI hall of shame # diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 4d46085c1b90..1f9f5da6828f 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -7,6 +7,11 @@ #include #include +#if 4 - defined(__PAGETABLE_PUD_FOLDED) - defined(__PAGETABLE_PMD_FOLDED) != \ + CONFIG_PGTABLE_LEVELS +#error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{PUD,PMD}_FOLDED +#endif + /* * On almost all architectures and configurations, 0 can be used as the * upper ceiling to free_pgtables(): on many architectures it has the same From 5a3fbef325e872f831cf13171c7032915bb1916d Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 14 Apr 2015 15:46:21 -0700 Subject: [PATCH 078/122] mm: do not add nr_pmds into mm_struct if PMD is folded CONFIG_PGTABLE_LEVELS is now available on every architecture and we can use it to check if we need to add nr_pmds into mm_struct. Signed-off-by: Kirill A. Shutemov Tested-by: Guenter Roeck Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: "David S. Miller" Cc: "H. Peter Anvin" Cc: "James E.J. Bottomley" Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chris Metcalf Cc: David Howells Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Jeff Dike Cc: Kirill A. Shutemov Cc: Koichi Yasutake Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Weinberger Cc: Russell King Cc: Thomas Gleixner Cc: Tony Luck Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 199a03aab8dc..590630eb59ba 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -364,7 +364,9 @@ struct mm_struct { atomic_t mm_users; /* How many users with user space? */ atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ atomic_long_t nr_ptes; /* PTE page table pages */ +#if CONFIG_PGTABLE_LEVELS > 2 atomic_long_t nr_pmds; /* PMD page table pages */ +#endif int map_count; /* number of VMAs */ spinlock_t page_table_lock; /* Protects page tables and some counters */ From 4e047f897771222215ee572e1c0b25e9417376eb Mon Sep 17 00:00:00 2001 From: Shachar Raindel Date: Tue, 14 Apr 2015 15:46:25 -0700 Subject: [PATCH 079/122] mm: refactor do_wp_page, extract the reuse case Currently do_wp_page contains 265 code lines. It also contains 9 goto statements, of which 5 are targeting labels which are not cleanup related. This makes the function extremely difficult to understand. The following patches are an attempt at breaking the function into its basic components and making it easier to understand. The patches are straightforward function extractions from do_wp_page. As we extract functions, we remove unneeded parameters and simplify the code as much as possible. However, the functionality is supposed to remain completely unchanged. The patches also attempt to document the functionality of each extracted function. In patch 2, we split the unlock logic so that each use case contains only the logic relevant to its specific needs, instead of having a huge number of conditional decisions in a single unlock flow. This patch (of 4): When do_wp_page is ending, in several cases it needs to reuse the existing page. This is achieved by making the page table writable, and possibly updating the page-cache state.
Currently, this logic was "called" by using a goto jump. This makes following the control flow of the function harder. It is also against the coding style guidelines for using goto. As the code can easily be refactored into a specialized function, refactor it out and simplify the code flow in do_wp_page. Acked-by: Linus Torvalds Acked-by: Kirill A. Shutemov Acked-by: Rik van Riel Acked-by: Andi Kleen Acked-by: Haggai Eran Acked-by: Johannes Weiner Cc: Mel Gorman Cc: Matthew Wilcox Cc: Dave Hansen Cc: Naoya Horiguchi Cc: Andrea Arcangeli Cc: Peter Feiner Cc: Michel Lespinasse Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 117 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 68 insertions(+), 49 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 97839f5c8c30..e70685f3e836 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1982,6 +1982,65 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, return ret; } +/* + * Handle write page faults for pages that can be reused in the current vma + * + * This can happen either due to the mapping being with the VM_SHARED flag, + * or due to us being the last reference standing to the page. In either + * case, all we need to do here is to mark the page as writable and update + * any related book-keeping. + */ +static inline int wp_page_reuse(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, + struct page *page, int page_mkwrite, + int dirty_shared) + __releases(ptl) +{ + pte_t entry; + /* + * Clear the pages cpupid information as the existing + * information potentially belongs to a now completely + * unrelated process. + */ + if (page) + page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); + + flush_cache_page(vma, address, pte_pfn(orig_pte)); + entry = pte_mkyoung(orig_pte); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (ptep_set_access_flags(vma, address, page_table, entry, 1)) + update_mmu_cache(vma, address, page_table); + pte_unmap_unlock(page_table, ptl); + + if (dirty_shared) { + struct address_space *mapping; + int dirtied; + + if (!page_mkwrite) + lock_page(page); + + dirtied = set_page_dirty(page); + VM_BUG_ON_PAGE(PageAnon(page), page); + mapping = page->mapping; + unlock_page(page); + page_cache_release(page); + + if ((dirtied || page_mkwrite) && mapping) { + /* + * Some device drivers do not set page.mapping + * but still dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + + if (!page_mkwrite) + file_update_time(vma->vm_file); + } + + return VM_FAULT_WRITE; +} + /* * This routine handles present pages, when users try to write * to a shared page. 
It is done by copying the page to a new address @@ -2008,8 +2067,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *old_page, *new_page = NULL; pte_t entry; int ret = 0; - int page_mkwrite = 0; - bool dirty_shared = false; unsigned long mmun_start = 0; /* For mmu_notifiers */ unsigned long mmun_end = 0; /* For mmu_notifiers */ struct mem_cgroup *memcg; @@ -2026,7 +2083,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, */ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)) - goto reuse; + return wp_page_reuse(mm, vma, address, page_table, ptl, + orig_pte, old_page, 0, 0); goto gotten; } @@ -2055,12 +2113,16 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, */ page_move_anon_rmap(old_page, vma, address); unlock_page(old_page); - goto reuse; + return wp_page_reuse(mm, vma, address, page_table, ptl, + orig_pte, old_page, 0, 0); } unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { + int page_mkwrite = 0; + page_cache_get(old_page); + /* * Only catch write-faults on shared writable pages, * read-only shared pages can get COWed by @@ -2091,51 +2153,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, page_mkwrite = 1; } - dirty_shared = true; - -reuse: - /* - * Clear the pages cpupid information as the existing - * information potentially belongs to a now completely - * unrelated process. - */ - if (old_page) - page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1); - - flush_cache_page(vma, address, pte_pfn(orig_pte)); - entry = pte_mkyoung(orig_pte); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (ptep_set_access_flags(vma, address, page_table, entry,1)) - update_mmu_cache(vma, address, page_table); - pte_unmap_unlock(page_table, ptl); - ret |= VM_FAULT_WRITE; - - if (dirty_shared) { - struct address_space *mapping; - int dirtied; - - if (!page_mkwrite) - lock_page(old_page); - - dirtied = set_page_dirty(old_page); - VM_BUG_ON_PAGE(PageAnon(old_page), old_page); - mapping = old_page->mapping; - unlock_page(old_page); - page_cache_release(old_page); - - if ((dirtied || page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping - * but still dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } - - if (!page_mkwrite) - file_update_time(vma->vm_file); - } - - return ret; + return wp_page_reuse(mm, vma, address, page_table, ptl, + orig_pte, old_page, page_mkwrite, 1); } /* From 28766805275c12c2298883cece3f98505ac764b4 Mon Sep 17 00:00:00 2001 From: Shachar Raindel Date: Tue, 14 Apr 2015 15:46:29 -0700 Subject: [PATCH 080/122] mm: refactor do_wp_page - rewrite the unlock flow When do_wp_page is ending, in several cases it needs to unlock the pages and ptls it was accessing. Currently, this logic was "called" by using a goto jump. This makes following the control flow of the function harder. Readability was further hampered by the unlock case containing large amount of logic needed only in one of the 3 cases. Using goto for cleanup is generally allowed. However, moving the trivial unlocking flows to the relevant call sites allow deeper refactoring in the next patch. Signed-off-by: Shachar Raindel Acked-by: Linus Torvalds Acked-by: Kirill A. 
Shutemov Acked-by: Rik van Riel Acked-by: Andi Kleen Acked-by: Haggai Eran Acked-by: Johannes Weiner Cc: Mel Gorman Cc: Matthew Wilcox Cc: Dave Hansen Cc: Naoya Horiguchi Cc: Andrea Arcangeli Cc: Peter Feiner Cc: Michel Lespinasse Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index e70685f3e836..0e28fddafdaf 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2066,7 +2066,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *old_page, *new_page = NULL; pte_t entry; - int ret = 0; + int page_copied = 0; unsigned long mmun_start = 0; /* For mmu_notifiers */ unsigned long mmun_end = 0; /* For mmu_notifiers */ struct mem_cgroup *memcg; @@ -2101,7 +2101,9 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, &ptl); if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); - goto unlock; + pte_unmap_unlock(page_table, ptl); + page_cache_release(old_page); + return 0; } page_cache_release(old_page); } @@ -2148,7 +2150,9 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, &ptl); if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); - goto unlock; + pte_unmap_unlock(page_table, ptl); + page_cache_release(old_page); + return 0; } page_mkwrite = 1; } @@ -2246,29 +2250,28 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Free the old page.. */ new_page = old_page; - ret |= VM_FAULT_WRITE; + page_copied = 1; } else mem_cgroup_cancel_charge(new_page, memcg); if (new_page) page_cache_release(new_page); -unlock: + pte_unmap_unlock(page_table, ptl); - if (mmun_end > mmun_start) - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); if (old_page) { /* * Don't let another task, with possibly unlocked vma, * keep the mlocked page. */ - if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { + if (page_copied && (vma->vm_flags & VM_LOCKED)) { lock_page(old_page); /* LRU manipulation */ munlock_vma_page(old_page); unlock_page(old_page); } page_cache_release(old_page); } - return ret; + return page_copied ? VM_FAULT_WRITE : 0; oom_free_new: page_cache_release(new_page); oom: From 2f38ab2c3c7fef04dca0313fd89d91f142ca9281 Mon Sep 17 00:00:00 2001 From: Shachar Raindel Date: Tue, 14 Apr 2015 15:46:32 -0700 Subject: [PATCH 081/122] mm: refactor do_wp_page, extract the page copy flow In some cases, do_wp_page had to copy the page suffering a write fault to a new location. If the function logic decided to do this, it was done by jumping with a "goto" operation to the relevant code block. This made the code really hard to understand. It is also against the kernel coding style guidelines. This patch extracts the page copy and page table update logic to a separate function. It also cleans up the naming, from "gotten" to "wp_page_copy", and adds a few comments. Signed-off-by: Shachar Raindel Acked-by: Linus Torvalds Acked-by: Kirill A.
Shutemov Acked-by: Rik van Riel Acked-by: Andi Kleen Acked-by: Haggai Eran Acked-by: Johannes Weiner Cc: Mel Gorman Cc: Matthew Wilcox Cc: Dave Hansen Cc: Naoya Horiguchi Cc: Andrea Arcangeli Cc: Peter Feiner Cc: Michel Lespinasse Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 265 +++++++++++++++++++++++++++++----------------------- 1 file changed, 147 insertions(+), 118 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 0e28fddafdaf..cfd3c78f00fe 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2041,6 +2041,146 @@ static inline int wp_page_reuse(struct mm_struct *mm, return VM_FAULT_WRITE; } +/* + * Handle the case of a page which we actually need to copy to a new page. + * + * Called with mmap_sem locked and the old page referenced, but + * without the ptl held. + * + * High level logic flow: + * + * - Allocate a page, copy the content of the old page to the new one. + * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc. + * - Take the PTL. If the pte changed, bail out and release the allocated page + * - If the pte is still the way we remember it, update the page table and all + * relevant references. This includes dropping the reference the page-table + * held to the old page, as well as updating the rmap. + * - In any case, unlock the PTL and drop the reference we took to the old page. + */ +static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + pte_t orig_pte, struct page *old_page) +{ + struct page *new_page = NULL; + spinlock_t *ptl = NULL; + pte_t entry; + int page_copied = 0; + const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */ + const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */ + struct mem_cgroup *memcg; + + if (unlikely(anon_vma_prepare(vma))) + goto oom; + + if (is_zero_pfn(pte_pfn(orig_pte))) { + new_page = alloc_zeroed_user_highpage_movable(vma, address); + if (!new_page) + goto oom; + } else { + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!new_page) + goto oom; + cow_user_page(new_page, old_page, address, vma); + } + __SetPageUptodate(new_page); + + if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) + goto oom_free_new; + + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + + /* + * Re-check the pte - we dropped the lock + */ + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (likely(pte_same(*page_table, orig_pte))) { + if (old_page) { + if (!PageAnon(old_page)) { + dec_mm_counter_fast(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); + } + } else { + inc_mm_counter_fast(mm, MM_ANONPAGES); + } + flush_cache_page(vma, address, pte_pfn(orig_pte)); + entry = mk_pte(new_page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + /* + * Clear the pte entry and flush it first, before updating the + * pte with the new entry. This will avoid a race condition + * seen in the presence of one thread doing SMC and another + * thread doing COW. + */ + ptep_clear_flush_notify(vma, address, page_table); + page_add_new_anon_rmap(new_page, vma, address); + mem_cgroup_commit_charge(new_page, memcg, false); + lru_cache_add_active_or_unevictable(new_page, vma); + /* + * We call the notify macro here because, when using secondary + * mmu page tables (such as kvm shadow page tables), we want the + * new page to be mapped directly into the secondary page table. 
+ */ + set_pte_at_notify(mm, address, page_table, entry); + update_mmu_cache(vma, address, page_table); + if (old_page) { + /* + * Only after switching the pte to the new page may + * we remove the mapcount here. Otherwise another + * process may come and find the rmap count decremented + * before the pte is switched to the new page, and + * "reuse" the old page writing into it while our pte + * here still points into it and can be read by other + * threads. + * + * The critical issue is to order this + * page_remove_rmap with the ptp_clear_flush above. + * Those stores are ordered by (if nothing else,) + * the barrier present in the atomic_add_negative + * in page_remove_rmap. + * + * Then the TLB flush in ptep_clear_flush ensures that + * no process can access the old page before the + * decremented mapcount is visible. And the old page + * cannot be reused until after the decremented + * mapcount is visible. So transitively, TLBs to + * old page will be flushed before it can be reused. + */ + page_remove_rmap(old_page); + } + + /* Free the old page.. */ + new_page = old_page; + page_copied = 1; + } else { + mem_cgroup_cancel_charge(new_page, memcg); + } + + if (new_page) + page_cache_release(new_page); + + pte_unmap_unlock(page_table, ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + if (old_page) { + /* + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page. + */ + if (page_copied && (vma->vm_flags & VM_LOCKED)) { + lock_page(old_page); /* LRU manipulation */ + munlock_vma_page(old_page); + unlock_page(old_page); + } + page_cache_release(old_page); + } + return page_copied ? VM_FAULT_WRITE : 0; +oom_free_new: + page_cache_release(new_page); +oom: + if (old_page) + page_cache_release(old_page); + return VM_FAULT_OOM; +} + /* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address @@ -2064,12 +2204,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl, pte_t orig_pte) __releases(ptl) { - struct page *old_page, *new_page = NULL; - pte_t entry; - int page_copied = 0; - unsigned long mmun_start = 0; /* For mmu_notifiers */ - unsigned long mmun_end = 0; /* For mmu_notifiers */ - struct mem_cgroup *memcg; + struct page *old_page; old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) { @@ -2085,7 +2220,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, (VM_WRITE|VM_SHARED)) return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, old_page, 0, 0); - goto gotten; + + pte_unmap_unlock(page_table, ptl); + return wp_page_copy(mm, vma, address, page_table, pmd, + orig_pte, old_page); } /* @@ -2165,119 +2303,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * Ok, we need to copy. Oh, well.. 
*/ page_cache_get(old_page); -gotten: - pte_unmap_unlock(page_table, ptl); - - if (unlikely(anon_vma_prepare(vma))) - goto oom; - - if (is_zero_pfn(pte_pfn(orig_pte))) { - new_page = alloc_zeroed_user_highpage_movable(vma, address); - if (!new_page) - goto oom; - } else { - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (!new_page) - goto oom; - cow_user_page(new_page, old_page, address, vma); - } - __SetPageUptodate(new_page); - - if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) - goto oom_free_new; - - mmun_start = address & PAGE_MASK; - mmun_end = mmun_start + PAGE_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - - /* - * Re-check the pte - we dropped the lock - */ - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - if (likely(pte_same(*page_table, orig_pte))) { - if (old_page) { - if (!PageAnon(old_page)) { - dec_mm_counter_fast(mm, MM_FILEPAGES); - inc_mm_counter_fast(mm, MM_ANONPAGES); - } - } else - inc_mm_counter_fast(mm, MM_ANONPAGES); - flush_cache_page(vma, address, pte_pfn(orig_pte)); - entry = mk_pte(new_page, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - /* - * Clear the pte entry and flush it first, before updating the - * pte with the new entry. This will avoid a race condition - * seen in the presence of one thread doing SMC and another - * thread doing COW. - */ - ptep_clear_flush_notify(vma, address, page_table); - page_add_new_anon_rmap(new_page, vma, address); - mem_cgroup_commit_charge(new_page, memcg, false); - lru_cache_add_active_or_unevictable(new_page, vma); - /* - * We call the notify macro here because, when using secondary - * mmu page tables (such as kvm shadow page tables), we want the - * new page to be mapped directly into the secondary page table. - */ - set_pte_at_notify(mm, address, page_table, entry); - update_mmu_cache(vma, address, page_table); - if (old_page) { - /* - * Only after switching the pte to the new page may - * we remove the mapcount here. Otherwise another - * process may come and find the rmap count decremented - * before the pte is switched to the new page, and - * "reuse" the old page writing into it while our pte - * here still points into it and can be read by other - * threads. - * - * The critical issue is to order this - * page_remove_rmap with the ptp_clear_flush above. - * Those stores are ordered by (if nothing else,) - * the barrier present in the atomic_add_negative - * in page_remove_rmap. - * - * Then the TLB flush in ptep_clear_flush ensures that - * no process can access the old page before the - * decremented mapcount is visible. And the old page - * cannot be reused until after the decremented - * mapcount is visible. So transitively, TLBs to - * old page will be flushed before it can be reused. - */ - page_remove_rmap(old_page); - } - - /* Free the old page.. */ - new_page = old_page; - page_copied = 1; - } else - mem_cgroup_cancel_charge(new_page, memcg); - - if (new_page) - page_cache_release(new_page); pte_unmap_unlock(page_table, ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - if (old_page) { - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page. - */ - if (page_copied && (vma->vm_flags & VM_LOCKED)) { - lock_page(old_page); /* LRU manipulation */ - munlock_vma_page(old_page); - unlock_page(old_page); - } - page_cache_release(old_page); - } - return page_copied ? 
VM_FAULT_WRITE : 0; -oom_free_new: - page_cache_release(new_page); -oom: - if (old_page) - page_cache_release(old_page); - return VM_FAULT_OOM; + return wp_page_copy(mm, vma, address, page_table, pmd, + orig_pte, old_page); } static void unmap_mapping_range_vma(struct vm_area_struct *vma, From 93e478d4c36ecaf15b942988b8272102d661d44e Mon Sep 17 00:00:00 2001 From: Shachar Raindel Date: Tue, 14 Apr 2015 15:46:35 -0700 Subject: [PATCH 082/122] mm: refactor do_wp_page handling of shared vma into a function The do_wp_page function is extremely long. Extract the logic for handling a page belonging to a shared vma into a function of its own. This helps the readability of the code, without doing any functional change in it. Signed-off-by: Shachar Raindel Acked-by: Linus Torvalds Acked-by: Kirill A. Shutemov Acked-by: Rik van Riel Acked-by: Andi Kleen Acked-by: Haggai Eran Acked-by: Johannes Weiner Cc: Mel Gorman Cc: Matthew Wilcox Cc: Dave Hansen Cc: Naoya Horiguchi Cc: Andrea Arcangeli Cc: Peter Feiner Cc: Michel Lespinasse Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 86 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 38 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index cfd3c78f00fe..ac20b2a6a0c3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2181,6 +2181,52 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; } +static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, + pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, + struct page *old_page) + __releases(ptl) +{ + int page_mkwrite = 0; + + page_cache_get(old_page); + + /* + * Only catch write-faults on shared writable pages, + * read-only shared pages can get COWed by + * get_user_pages(.write=1, .force=1). + */ + if (vma->vm_ops && vma->vm_ops->page_mkwrite) { + int tmp; + + pte_unmap_unlock(page_table, ptl); + tmp = do_page_mkwrite(vma, old_page, address); + if (unlikely(!tmp || (tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { + page_cache_release(old_page); + return tmp; + } + /* + * Since we dropped the lock we need to revalidate + * the PTE as someone else may have changed it. If + * they did, we just return, as we can count on the + * MMU to tell us if they didn't also make it writable. + */ + page_table = pte_offset_map_lock(mm, pmd, address, + &ptl); + if (!pte_same(*page_table, orig_pte)) { + unlock_page(old_page); + pte_unmap_unlock(page_table, ptl); + page_cache_release(old_page); + return 0; + } + page_mkwrite = 1; + } + + return wp_page_reuse(mm, vma, address, page_table, ptl, + orig_pte, old_page, page_mkwrite, 1); +} + /* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address @@ -2259,44 +2305,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { - int page_mkwrite = 0; - - page_cache_get(old_page); - - /* - * Only catch write-faults on shared writable pages, - * read-only shared pages can get COWed by - * get_user_pages(.write=1, .force=1). 
- */ - if (vma->vm_ops && vma->vm_ops->page_mkwrite) { - int tmp; - - pte_unmap_unlock(page_table, ptl); - tmp = do_page_mkwrite(vma, old_page, address); - if (unlikely(!tmp || (tmp & - (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { - page_cache_release(old_page); - return tmp; - } - /* - * Since we dropped the lock we need to revalidate - * the PTE as someone else may have changed it. If - * they did, we just return, as we can count on the - * MMU to tell us if they didn't also make it writable. - */ - page_table = pte_offset_map_lock(mm, pmd, address, - &ptl); - if (!pte_same(*page_table, orig_pte)) { - unlock_page(old_page); - pte_unmap_unlock(page_table, ptl); - page_cache_release(old_page); - return 0; - } - page_mkwrite = 1; - } - - return wp_page_reuse(mm, vma, address, page_table, ptl, - orig_pte, old_page, page_mkwrite, 1); + return wp_page_shared(mm, vma, address, page_table, pmd, + ptl, orig_pte, old_page); } /* From 58be19dcf7c0c206e60796c2ee18fc4fc1659fea Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 14 Apr 2015 15:46:39 -0700 Subject: [PATCH 083/122] ocfs2: copy fs uuid to superblock Currently, the maximal number of cleancache enabled filesystems equals 32, which is insufficient nowadays, because a Linux host can have hundreds of containers on board, each of which might want its own filesystem. This patch set aims at removing this limitation - see patch 4 for more details. Patches 1-3 prepare the code for this change. This patch (of 4): This will allow us to remove the uuid argument from cleancache_init_shared_fs. Signed-off-by: Vladimir Davydov Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Cc: David Vrabel Cc: Mark Fasheh Cc: Joel Becker Cc: Stefan Hengelein Cc: Florian Schmaus Cc: Andor Daam Cc: Dan Magenheimer Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c558bff99165..a811a95cfd5f 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2070,6 +2070,8 @@ static int ocfs2_initialize_super(struct super_block *sb, cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); + memcpy(sb->s_uuid, di->id2.i_super.s_uuid, + sizeof(di->id2.i_super.s_uuid)); osb->osb_dx_mask = (1 << (cbits - bbits)) - 1; From 9de1626290eaa7d921413ddc83544bc3bae27283 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 14 Apr 2015 15:46:42 -0700 Subject: [PATCH 084/122] cleancache: zap uuid arg of cleancache_init_shared_fs Use super_block->s_uuid instead. Every shared filesystem using cleancache must now initialize super_block->s_uuid before calling cleancache_init_shared_fs. The only one on the tree, ocfs2, already meets this requirement. 
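As an illustration of the calling convention this establishes, a shared filesystem's fill_super path would roughly do the following; the filesystem name and the on-disk uuid buffer are hypothetical, only sb->s_uuid and the new single-argument cleancache_init_shared_fs() come from this series:

#include <linux/fs.h>
#include <linux/string.h>
#include <linux/cleancache.h>

/*
 * Hypothetical fill_super excerpt for a clustered filesystem; ondisk_uuid
 * stands for the 16-byte volume UUID read from the on-disk superblock.
 */
static void examplefs_start_cleancache(struct super_block *sb,
				       const u8 *ondisk_uuid)
{
	/* The shared pool is keyed by sb->s_uuid from now on, so fill it... */
	memcpy(sb->s_uuid, ondisk_uuid, sizeof(sb->s_uuid));
	/* ...before announcing the shared filesystem to cleancache. */
	cleancache_init_shared_fs(sb);
}
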
Signed-off-by: Vladimir Davydov Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Cc: David Vrabel Cc: Mark Fasheh Cc: Joel Becker Cc: Stefan Hengelein Cc: Florian Schmaus Cc: Andor Daam Cc: Dan Magenheimer Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/super.c | 2 +- include/linux/cleancache.h | 6 +++--- mm/cleancache.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index a811a95cfd5f..837ddce4b659 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2336,7 +2336,7 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog_errno(status); goto bail; } - cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb); + cleancache_init_shared_fs(sb); bail: return status; diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h index 4ce9056b31a8..29657d1c83fb 100644 --- a/include/linux/cleancache.h +++ b/include/linux/cleancache.h @@ -36,7 +36,7 @@ struct cleancache_ops { extern struct cleancache_ops * cleancache_register_ops(struct cleancache_ops *ops); extern void __cleancache_init_fs(struct super_block *); -extern void __cleancache_init_shared_fs(char *, struct super_block *); +extern void __cleancache_init_shared_fs(struct super_block *); extern int __cleancache_get_page(struct page *); extern void __cleancache_put_page(struct page *); extern void __cleancache_invalidate_page(struct address_space *, struct page *); @@ -78,10 +78,10 @@ static inline void cleancache_init_fs(struct super_block *sb) __cleancache_init_fs(sb); } -static inline void cleancache_init_shared_fs(char *uuid, struct super_block *sb) +static inline void cleancache_init_shared_fs(struct super_block *sb) { if (cleancache_enabled) - __cleancache_init_shared_fs(uuid, sb); + __cleancache_init_shared_fs(sb); } static inline int cleancache_get_page(struct page *page) diff --git a/mm/cleancache.c b/mm/cleancache.c index 053bcd8f12fb..532495f2e4f4 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -155,7 +155,7 @@ void __cleancache_init_fs(struct super_block *sb) EXPORT_SYMBOL(__cleancache_init_fs); /* Called by a cleancache-enabled clustered filesystem at time of mount */ -void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) +void __cleancache_init_shared_fs(struct super_block *sb) { int i; @@ -163,10 +163,10 @@ void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { if (shared_fs_poolid_map[i] == FS_UNKNOWN) { sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET; - uuids[i] = uuid; + uuids[i] = sb->s_uuid; if (cleancache_ops) shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs - (uuid, PAGE_SIZE); + (sb->s_uuid, PAGE_SIZE); else shared_fs_poolid_map[i] = FS_NO_BACKEND; break; From 53d85c98566535d7bc69470f008d7dcb09a0fec9 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 14 Apr 2015 15:46:45 -0700 Subject: [PATCH 085/122] cleancache: forbid overriding cleancache_ops Currently, cleancache_register_ops returns the previous value of cleancache_ops to allow chaining. However, chaining, as it is implemented now, is extremely dangerous due to possible pool id collisions. Suppose, a new cleancache driver is registered after the previous one assigned an id to a super block. If the new driver assigns the same id to another super block, which is perfectly possible, we will have two different filesystems using the same id. 
No matter if the new driver implements chaining or not, we are likely to get data corruption with such a configuration eventually. This patch therefore disables the ability to override cleancache_ops altogether as potentially dangerous. If there is already cleancache driver registered, all further calls to cleancache_register_ops will return EBUSY. Since no user of cleancache implements chaining, we only need to make minor changes to the code outside the cleancache core. Signed-off-by: Vladimir Davydov Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Cc: David Vrabel Cc: Mark Fasheh Cc: Joel Becker Cc: Stefan Hengelein Cc: Florian Schmaus Cc: Andor Daam Cc: Dan Magenheimer Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/cleancache.txt | 4 +--- drivers/xen/tmem.c | 16 +++++++++------- include/linux/cleancache.h | 3 +-- mm/cleancache.c | 12 +++++++----- 4 files changed, 18 insertions(+), 17 deletions(-) diff --git a/Documentation/vm/cleancache.txt b/Documentation/vm/cleancache.txt index 01d76282444e..e4b49df7a048 100644 --- a/Documentation/vm/cleancache.txt +++ b/Documentation/vm/cleancache.txt @@ -28,9 +28,7 @@ IMPLEMENTATION OVERVIEW A cleancache "backend" that provides transcendent memory registers itself to the kernel's cleancache "frontend" by calling cleancache_register_ops, passing a pointer to a cleancache_ops structure with funcs set appropriately. -Note that cleancache_register_ops returns the previous settings so that -chaining can be performed if desired. The functions provided must conform to -certain semantics as follows: +The functions provided must conform to certain semantics as follows: Most important, cleancache is "ephemeral". Pages which are copied into cleancache have an indefinite lifetime which is completely unknowable diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c index 8a65423bc696..c4211a31612d 100644 --- a/drivers/xen/tmem.c +++ b/drivers/xen/tmem.c @@ -397,13 +397,15 @@ static int __init xen_tmem_init(void) #ifdef CONFIG_CLEANCACHE BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid)); if (tmem_enabled && cleancache) { - char *s = ""; - struct cleancache_ops *old_ops = - cleancache_register_ops(&tmem_cleancache_ops); - if (old_ops) - s = " (WARNING: cleancache_ops overridden)"; - pr_info("cleancache enabled, RAM provided by Xen Transcendent Memory%s\n", - s); + int err; + + err = cleancache_register_ops(&tmem_cleancache_ops); + if (err) + pr_warn("xen-tmem: failed to enable cleancache: %d\n", + err); + else + pr_info("cleancache enabled, RAM provided by " + "Xen Transcendent Memory\n"); } #endif #ifdef CONFIG_XEN_SELFBALLOONING diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h index 29657d1c83fb..b23611f43cfb 100644 --- a/include/linux/cleancache.h +++ b/include/linux/cleancache.h @@ -33,8 +33,7 @@ struct cleancache_ops { void (*invalidate_fs)(int); }; -extern struct cleancache_ops * - cleancache_register_ops(struct cleancache_ops *ops); +extern int cleancache_register_ops(struct cleancache_ops *ops); extern void __cleancache_init_fs(struct super_block *); extern void __cleancache_init_shared_fs(struct super_block *); extern int __cleancache_get_page(struct page *); diff --git a/mm/cleancache.c b/mm/cleancache.c index 532495f2e4f4..aa10f9a3bc88 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -106,15 +106,17 @@ static DEFINE_MUTEX(poolid_mutex); */ /* - * Register operations for cleancache, returning previous thus allowing - * detection of multiple backends and possible nesting. 
+ * Register operations for cleancache. Returns 0 on success. */ -struct cleancache_ops *cleancache_register_ops(struct cleancache_ops *ops) +int cleancache_register_ops(struct cleancache_ops *ops) { - struct cleancache_ops *old = cleancache_ops; int i; mutex_lock(&poolid_mutex); + if (cleancache_ops) { + mutex_unlock(&poolid_mutex); + return -EBUSY; + } for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { if (fs_poolid_map[i] == FS_NO_BACKEND) fs_poolid_map[i] = ops->init_fs(PAGE_SIZE); @@ -130,7 +132,7 @@ struct cleancache_ops *cleancache_register_ops(struct cleancache_ops *ops) barrier(); cleancache_ops = ops; mutex_unlock(&poolid_mutex); - return old; + return 0; } EXPORT_SYMBOL(cleancache_register_ops); From 3cb29d11174f29b76addcba4374884b14f8ea4b1 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 14 Apr 2015 15:46:48 -0700 Subject: [PATCH 086/122] cleancache: remove limit on the number of cleancache enabled filesystems The limit equals 32 and is imposed by the number of entries in the fs_poolid_map and shared_fs_poolid_map. Nowadays it is insufficient, because with containers on board a Linux host can have hundreds of active fs mounts. These maps were introduced by commit 49a9ab815acb8 ("mm: cleancache: lazy initialization to allow tmem backends to build/run as modules") in order to allow compiling cleancache drivers as modules. Real pool ids are stored in these maps while super_block->cleancache_poolid points to an entry in the map, so that on cleancache registration we can walk over all (if there are <= 32 of them, of course) cleancache-enabled super blocks and assign real pool ids. Actually, there is absolutely no need for these maps, because we can iterate over all super blocks immediately using iterate_supers. This is not racy, because cleancache_init_fs is called from mount_fs with super_block->s_umount held for writing, while iterate_supers takes this semaphore for reading. So if we call iterate_supers after setting cleancache_ops, all super blocks that had been created before cleancache_register_ops was called will be assigned pool ids by the action function of iterate_supers, while all newer super blocks will get theirs in cleancache_init_fs. This patch therefore removes the maps and hence the artificial limit on the number of cleancache enabled filesystems. 
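For illustration, a rough sketch of what a backend module's init looks like under this scheme; the ops table and names below are made up, and only cleancache_register_ops() with its 0/-EBUSY return convention and the iterate_supers()-based catch-up come from this series:

#include <linux/module.h>
#include <linux/cleancache.h>

/*
 * Placeholder ops table for an imaginary backend; a real one fills in
 * .init_fs, .init_shared_fs, .get_page, .put_page and the invalidate hooks.
 */
static struct cleancache_ops example_cleancache_ops;

static int __init example_backend_init(void)
{
	int err;

	/*
	 * Returns -EBUSY if another backend is already registered.  On
	 * success the core walks already-mounted super blocks through
	 * iterate_supers() and runs ->init_fs()/->init_shared_fs() for the
	 * ones that were waiting with CLEANCACHE_NO_BACKEND[_SHARED].
	 */
	err = cleancache_register_ops(&example_cleancache_ops);
	if (err)
		return err;

	pr_info("example cleancache backend enabled\n");
	return 0;
}
module_init(example_backend_init);
/* Deliberately no module_exit(): unregistering a backend is not supported. */
MODULE_LICENSE("GPL");
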
Signed-off-by: Vladimir Davydov Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Cc: David Vrabel Cc: Mark Fasheh Cc: Joel Becker Cc: Stefan Hengelein Cc: Florian Schmaus Cc: Andor Daam Cc: Dan Magenheimer Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 2 +- include/linux/cleancache.h | 4 + mm/cleancache.c | 270 ++++++++++++------------------------- 3 files changed, 94 insertions(+), 182 deletions(-) diff --git a/fs/super.c b/fs/super.c index 2b7dc90ccdbb..928c20f47af9 100644 --- a/fs/super.c +++ b/fs/super.c @@ -224,7 +224,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; - s->cleancache_poolid = -1; + s->cleancache_poolid = CLEANCACHE_NO_POOL; s->s_shrink.seeks = DEFAULT_SEEKS; s->s_shrink.scan_objects = super_cache_scan; diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h index b23611f43cfb..bda5ec0b4b4d 100644 --- a/include/linux/cleancache.h +++ b/include/linux/cleancache.h @@ -5,6 +5,10 @@ #include #include +#define CLEANCACHE_NO_POOL -1 +#define CLEANCACHE_NO_BACKEND -2 +#define CLEANCACHE_NO_BACKEND_SHARED -3 + #define CLEANCACHE_KEY_MAX 6 /* diff --git a/mm/cleancache.c b/mm/cleancache.c index aa10f9a3bc88..8fc50811119b 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -19,7 +19,7 @@ #include /* - * cleancache_ops is set by cleancache_ops_register to contain the pointers + * cleancache_ops is set by cleancache_register_ops to contain the pointers * to the cleancache "backend" implementation functions. */ static struct cleancache_ops *cleancache_ops __read_mostly; @@ -34,104 +34,78 @@ static u64 cleancache_failed_gets; static u64 cleancache_puts; static u64 cleancache_invalidates; -/* - * When no backend is registered all calls to init_fs and init_shared_fs - * are registered and fake poolids (FAKE_FS_POOLID_OFFSET or - * FAKE_SHARED_FS_POOLID_OFFSET, plus offset in the respective array - * [shared_|]fs_poolid_map) are given to the respective super block - * (sb->cleancache_poolid) and no tmem_pools are created. When a backend - * registers with cleancache the previous calls to init_fs and init_shared_fs - * are executed to create tmem_pools and set the respective poolids. While no - * backend is registered all "puts", "gets" and "flushes" are ignored or failed. - */ -#define MAX_INITIALIZABLE_FS 32 -#define FAKE_FS_POOLID_OFFSET 1000 -#define FAKE_SHARED_FS_POOLID_OFFSET 2000 - -#define FS_NO_BACKEND (-1) -#define FS_UNKNOWN (-2) -static int fs_poolid_map[MAX_INITIALIZABLE_FS]; -static int shared_fs_poolid_map[MAX_INITIALIZABLE_FS]; -static char *uuids[MAX_INITIALIZABLE_FS]; -/* - * Mutex for the [shared_|]fs_poolid_map to guard against multiple threads - * invoking umount (and ending in __cleancache_invalidate_fs) and also multiple - * threads calling mount (and ending up in __cleancache_init_[shared|]fs). - */ -static DEFINE_MUTEX(poolid_mutex); -/* - * When set to false (default) all calls to the cleancache functions, except - * the __cleancache_invalidate_fs and __cleancache_init_[shared|]fs are guarded - * by the if (!cleancache_ops) return. This means multiple threads (from - * different filesystems) will be checking cleancache_ops. The usage of a - * bool instead of a atomic_t or a bool guarded by a spinlock is OK - we are - * OK if the time between the backend's have been initialized (and - * cleancache_ops has been set to not NULL) and when the filesystems start - * actually calling the backends. 
The inverse (when unloading) is obviously - * not good - but this shim does not do that (yet). - */ - -/* - * The backends and filesystems work all asynchronously. This is b/c the - * backends can be built as modules. - * The usual sequence of events is: - * a) mount / -> __cleancache_init_fs is called. We set the - * [shared_|]fs_poolid_map and uuids for. - * - * b). user does I/Os -> we call the rest of __cleancache_* functions - * which return immediately as cleancache_ops is false. - * - * c). modprobe zcache -> cleancache_register_ops. We init the backend - * and set cleancache_ops to true, and for any fs_poolid_map - * (which is set by __cleancache_init_fs) we initialize the poolid. - * - * d). user does I/Os -> now that cleancache_ops is true all the - * __cleancache_* functions can call the backend. They all check - * that fs_poolid_map is valid and if so invoke the backend. - * - * e). umount / -> __cleancache_invalidate_fs, the fs_poolid_map is - * reset (which is the second check in the __cleancache_* ops - * to call the backend). - * - * The sequence of event could also be c), followed by a), and d). and e). The - * c) would not happen anymore. There is also the chance of c), and one thread - * doing a) + d), and another doing e). For that case we depend on the - * filesystem calling __cleancache_invalidate_fs in the proper sequence (so - * that it handles all I/Os before it invalidates the fs (which is last part - * of unmounting process). - * - * Note: The acute reader will notice that there is no "rmmod zcache" case. - * This is b/c the functionality for that is not yet implemented and when - * done, will require some extra locking not yet devised. - */ +static void cleancache_register_ops_sb(struct super_block *sb, void *unused) +{ + switch (sb->cleancache_poolid) { + case CLEANCACHE_NO_BACKEND: + __cleancache_init_fs(sb); + break; + case CLEANCACHE_NO_BACKEND_SHARED: + __cleancache_init_shared_fs(sb); + break; + } +} /* * Register operations for cleancache. Returns 0 on success. */ int cleancache_register_ops(struct cleancache_ops *ops) { - int i; - - mutex_lock(&poolid_mutex); - if (cleancache_ops) { - mutex_unlock(&poolid_mutex); + if (cmpxchg(&cleancache_ops, NULL, ops)) return -EBUSY; - } - for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { - if (fs_poolid_map[i] == FS_NO_BACKEND) - fs_poolid_map[i] = ops->init_fs(PAGE_SIZE); - if (shared_fs_poolid_map[i] == FS_NO_BACKEND) - shared_fs_poolid_map[i] = ops->init_shared_fs - (uuids[i], PAGE_SIZE); - } + /* - * We MUST set cleancache_ops _after_ we have called the backends - * init_fs or init_shared_fs functions. Otherwise the compiler might - * re-order where cleancache_ops is set in this function. + * A cleancache backend can be built as a module and hence loaded after + * a cleancache enabled filesystem has called cleancache_init_fs. To + * handle such a scenario, here we call ->init_fs or ->init_shared_fs + * for each active super block. To differentiate between local and + * shared filesystems, we temporarily initialize sb->cleancache_poolid + * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED + * respectively in case there is no backend registered at the time + * cleancache_init_fs or cleancache_init_shared_fs is called. 
+ * + * Since filesystems can be mounted concurrently with cleancache + * backend registration, we have to be careful to guarantee that all + * cleancache enabled filesystems that has been mounted by the time + * cleancache_register_ops is called has got and all mounted later will + * get cleancache_poolid. This is assured by the following statements + * tied together: + * + * a) iterate_supers skips only those super blocks that has started + * ->kill_sb + * + * b) if iterate_supers encounters a super block that has not finished + * ->mount yet, it waits until it is finished + * + * c) cleancache_init_fs is called from ->mount and + * cleancache_invalidate_fs is called from ->kill_sb + * + * d) we call iterate_supers after cleancache_ops has been set + * + * From a) it follows that if iterate_supers skips a super block, then + * either the super block is already dead, in which case we do not need + * to bother initializing cleancache for it, or it was mounted after we + * initiated iterate_supers. In the latter case, it must have seen + * cleancache_ops set according to d) and initialized cleancache from + * ->mount by itself according to c). This proves that we call + * ->init_fs at least once for each active super block. + * + * From b) and c) it follows that if iterate_supers encounters a super + * block that has already started ->init_fs, it will wait until ->mount + * and hence ->init_fs has finished, then check cleancache_poolid, see + * that it has already been set and therefore do nothing. This proves + * that we call ->init_fs no more than once for each super block. + * + * Combined together, the last two paragraphs prove the function + * correctness. + * + * Note that various cleancache callbacks may proceed before this + * function is called or even concurrently with it, but since + * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop + * until the corresponding ->init_fs has been actually called and + * cleancache_ops has been set. 
*/ - barrier(); - cleancache_ops = ops; - mutex_unlock(&poolid_mutex); + iterate_supers(cleancache_register_ops_sb, NULL); return 0; } EXPORT_SYMBOL(cleancache_register_ops); @@ -139,42 +113,28 @@ EXPORT_SYMBOL(cleancache_register_ops); /* Called by a cleancache-enabled filesystem at time of mount */ void __cleancache_init_fs(struct super_block *sb) { - int i; + int pool_id = CLEANCACHE_NO_BACKEND; - mutex_lock(&poolid_mutex); - for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { - if (fs_poolid_map[i] == FS_UNKNOWN) { - sb->cleancache_poolid = i + FAKE_FS_POOLID_OFFSET; - if (cleancache_ops) - fs_poolid_map[i] = cleancache_ops->init_fs(PAGE_SIZE); - else - fs_poolid_map[i] = FS_NO_BACKEND; - break; - } + if (cleancache_ops) { + pool_id = cleancache_ops->init_fs(PAGE_SIZE); + if (pool_id < 0) + pool_id = CLEANCACHE_NO_POOL; } - mutex_unlock(&poolid_mutex); + sb->cleancache_poolid = pool_id; } EXPORT_SYMBOL(__cleancache_init_fs); /* Called by a cleancache-enabled clustered filesystem at time of mount */ void __cleancache_init_shared_fs(struct super_block *sb) { - int i; + int pool_id = CLEANCACHE_NO_BACKEND_SHARED; - mutex_lock(&poolid_mutex); - for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { - if (shared_fs_poolid_map[i] == FS_UNKNOWN) { - sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET; - uuids[i] = sb->s_uuid; - if (cleancache_ops) - shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs - (sb->s_uuid, PAGE_SIZE); - else - shared_fs_poolid_map[i] = FS_NO_BACKEND; - break; - } + if (cleancache_ops) { + pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE); + if (pool_id < 0) + pool_id = CLEANCACHE_NO_POOL; } - mutex_unlock(&poolid_mutex); + sb->cleancache_poolid = pool_id; } EXPORT_SYMBOL(__cleancache_init_shared_fs); @@ -203,19 +163,6 @@ static int cleancache_get_key(struct inode *inode, return 0; } -/* - * Returns a pool_id that is associated with a given fake poolid. 
- */ -static int get_poolid_from_fake(int fake_pool_id) -{ - if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) - return shared_fs_poolid_map[fake_pool_id - - FAKE_SHARED_FS_POOLID_OFFSET]; - else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) - return fs_poolid_map[fake_pool_id - FAKE_FS_POOLID_OFFSET]; - return FS_NO_BACKEND; -} - /* * "Get" data from cleancache associated with the poolid/inode/index * that were specified when the data was put to cleanache and, if @@ -231,7 +178,6 @@ int __cleancache_get_page(struct page *page) { int ret = -1; int pool_id; - int fake_pool_id; struct cleancache_filekey key = { .u.key = { 0 } }; if (!cleancache_ops) { @@ -240,17 +186,14 @@ int __cleancache_get_page(struct page *page) } VM_BUG_ON_PAGE(!PageLocked(page), page); - fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (fake_pool_id < 0) + pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (pool_id < 0) goto out; - pool_id = get_poolid_from_fake(fake_pool_id); if (cleancache_get_key(page->mapping->host, &key) < 0) goto out; - if (pool_id >= 0) - ret = cleancache_ops->get_page(pool_id, - key, page->index, page); + ret = cleancache_ops->get_page(pool_id, key, page->index, page); if (ret == 0) cleancache_succ_gets++; else @@ -273,7 +216,6 @@ EXPORT_SYMBOL(__cleancache_get_page); void __cleancache_put_page(struct page *page) { int pool_id; - int fake_pool_id; struct cleancache_filekey key = { .u.key = { 0 } }; if (!cleancache_ops) { @@ -282,12 +224,7 @@ void __cleancache_put_page(struct page *page) } VM_BUG_ON_PAGE(!PageLocked(page), page); - fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (fake_pool_id < 0) - return; - - pool_id = get_poolid_from_fake(fake_pool_id); - + pool_id = page->mapping->host->i_sb->cleancache_poolid; if (pool_id >= 0 && cleancache_get_key(page->mapping->host, &key) >= 0) { cleancache_ops->put_page(pool_id, key, page->index, page); @@ -308,18 +245,13 @@ void __cleancache_invalidate_page(struct address_space *mapping, struct page *page) { /* careful... 
page->mapping is NULL sometimes when this is called */ - int pool_id; - int fake_pool_id = mapping->host->i_sb->cleancache_poolid; + int pool_id = mapping->host->i_sb->cleancache_poolid; struct cleancache_filekey key = { .u.key = { 0 } }; if (!cleancache_ops) return; - if (fake_pool_id >= 0) { - pool_id = get_poolid_from_fake(fake_pool_id); - if (pool_id < 0) - return; - + if (pool_id >= 0) { VM_BUG_ON_PAGE(!PageLocked(page), page); if (cleancache_get_key(mapping->host, &key) >= 0) { cleancache_ops->invalidate_page(pool_id, @@ -341,18 +273,12 @@ EXPORT_SYMBOL(__cleancache_invalidate_page); */ void __cleancache_invalidate_inode(struct address_space *mapping) { - int pool_id; - int fake_pool_id = mapping->host->i_sb->cleancache_poolid; + int pool_id = mapping->host->i_sb->cleancache_poolid; struct cleancache_filekey key = { .u.key = { 0 } }; if (!cleancache_ops) return; - if (fake_pool_id < 0) - return; - - pool_id = get_poolid_from_fake(fake_pool_id); - if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) cleancache_ops->invalidate_inode(pool_id, key); } @@ -365,32 +291,18 @@ EXPORT_SYMBOL(__cleancache_invalidate_inode); */ void __cleancache_invalidate_fs(struct super_block *sb) { - int index; - int fake_pool_id = sb->cleancache_poolid; - int old_poolid = fake_pool_id; + int pool_id; - mutex_lock(&poolid_mutex); - if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) { - index = fake_pool_id - FAKE_SHARED_FS_POOLID_OFFSET; - old_poolid = shared_fs_poolid_map[index]; - shared_fs_poolid_map[index] = FS_UNKNOWN; - uuids[index] = NULL; - } else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) { - index = fake_pool_id - FAKE_FS_POOLID_OFFSET; - old_poolid = fs_poolid_map[index]; - fs_poolid_map[index] = FS_UNKNOWN; - } - sb->cleancache_poolid = -1; - if (cleancache_ops) - cleancache_ops->invalidate_fs(old_poolid); - mutex_unlock(&poolid_mutex); + pool_id = sb->cleancache_poolid; + sb->cleancache_poolid = CLEANCACHE_NO_POOL; + + if (cleancache_ops && pool_id >= 0) + cleancache_ops->invalidate_fs(pool_id); } EXPORT_SYMBOL(__cleancache_invalidate_fs); static int __init init_cleancache(void) { - int i; - #ifdef CONFIG_DEBUG_FS struct dentry *root = debugfs_create_dir("cleancache", NULL); if (root == NULL) @@ -402,10 +314,6 @@ static int __init init_cleancache(void) debugfs_create_u64("invalidates", S_IRUGO, root, &cleancache_invalidates); #endif - for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { - fs_poolid_map[i] = FS_UNKNOWN; - shared_fs_poolid_map[i] = FS_UNKNOWN; - } return 0; } module_init(init_cleancache) From b360edb43f8ed50aa7b8c9aae7d7557a1a6e32c8 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2015 15:46:52 -0700 Subject: [PATCH 087/122] mm, mempolicy: migrate_to_node should only migrate to node migrate_to_node() is intended to migrate a page from one source node to a target node. Today, migrate_to_node() could end up migrating to any node, not only the target node. This is because the page migration allocator, new_node_page() does not pass __GFP_THISNODE to alloc_pages_exact_node(). This causes the target node to be preferred but allows fallback to any other node in order of affinity. Prevent this by allocating with __GFP_THISNODE. If memory is not available, -ENOMEM will be returned as appropriate. 
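Roughly, the distinction the fix relies on looks like this; the helper below is hypothetical and only illustrates the flag's effect, it is not a kernel API:

#include <linux/mm.h>
#include <linux/gfp.h>

/*
 * Without __GFP_THISNODE the allocator merely prefers @node and may fall
 * back to other nodes in order of affinity; with it, the page either comes
 * from @node or the call returns NULL, which the migration path reports
 * as -ENOMEM.
 */
static struct page *alloc_page_strictly_on_node(int node)
{
	return alloc_pages_exact_node(node,
				      GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
				      0);
}
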
Signed-off-by: David Rientjes Reviewed-by: Naoya Horiguchi Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4721046a134a..69d05acfa18c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -945,7 +945,8 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x return alloc_huge_page_node(page_hstate(compound_head(page)), node); else - return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); + return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE | + __GFP_THISNODE, 0); } /* From 4167e9b2cf10f8a4bcda0c713ddc8bb0a18e8187 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2015 15:46:55 -0700 Subject: [PATCH 088/122] mm: remove GFP_THISNODE NOTE: this is not about __GFP_THISNODE, this is only about GFP_THISNODE. GFP_THISNODE is a secret combination of gfp bits that have different behavior than expected. It is a combination of __GFP_THISNODE, __GFP_NORETRY, and __GFP_NOWARN and is special-cased in the page allocator slowpath to fail without trying reclaim even though it may be used in combination with __GFP_WAIT. An example of the problem this creates: commit e97ca8e5b864 ("mm: fix GFP_THISNODE callers and clarify") fixed up many users of GFP_THISNODE that really just wanted __GFP_THISNODE. The problem doesn't end there, however, because even it was a no-op for alloc_misplaced_dst_page(), which also sets __GFP_NORETRY and __GFP_NOWARN, and migrate_misplaced_transhuge_page(), where __GFP_NORETRY and __GFP_NOWAIT is set in GFP_TRANSHUGE. Converting GFP_THISNODE to __GFP_THISNODE is a no-op in these cases since the page allocator special-cases __GFP_THISNODE && __GFP_NORETRY && __GFP_NOWARN. It's time to just remove GFP_THISNODE entirely. We leave __GFP_THISNODE to restrict an allocation to a local node, but remove GFP_THISNODE and its obscurity. Instead, we require that a caller clear __GFP_WAIT if it wants to avoid reclaim. This allows the aforementioned functions to actually reclaim as they should. It also enables any future callers that want to do __GFP_THISNODE but also __GFP_NORETRY && __GFP_NOWARN to reclaim. The rule is simple: if you don't want to reclaim, then don't set __GFP_WAIT. Aside: ovs_flow_stats_update() really wants to avoid reclaim as well, so it is unchanged. Signed-off-by: David Rientjes Acked-by: Vlastimil Babka Cc: Christoph Lameter Acked-by: Pekka Enberg Cc: Joonsoo Kim Acked-by: Johannes Weiner Cc: Mel Gorman Cc: Pravin Shelar Cc: Jarno Rajahalme Cc: Li Zefan Cc: Greg Thelen Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 10 ---------- mm/page_alloc.c | 22 ++++++---------------- mm/slab.c | 22 ++++++++++++++++++---- net/openvswitch/flow.c | 4 +++- 4 files changed, 27 insertions(+), 31 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 51bd1e72a917..4423a0f8eabe 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -117,16 +117,6 @@ struct vm_area_struct; __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ __GFP_NO_KSWAPD) -/* - * GFP_THISNODE does not perform any reclaim, you most likely want to - * use __GFP_THISNODE to allocate from a given node without fallback! 
- */ -#ifdef CONFIG_NUMA -#define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) -#else -#define GFP_THISNODE ((__force gfp_t)0) -#endif - /* This mask makes up all the page movable related flags */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 84466a4b1b36..86af1a96a6dc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2412,13 +2412,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, *did_some_progress = 1; goto out; } - /* - * GFP_THISNODE contains __GFP_NORETRY and we never hit this. - * Sanity check for bare calls of __GFP_THISNODE, not real OOM. - * The caller should handle page allocation failure by itself if - * it specifies __GFP_THISNODE. - * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. - */ + /* The OOM killer may not free memory on a specific node */ if (gfp_mask & __GFP_THISNODE) goto out; } @@ -2673,15 +2667,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, } /* - * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and - * __GFP_NOWARN set) should not cause reclaim since the subsystem - * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim - * using a larger set of nodes after it has established that the - * allowed per node queues are empty and that nodes are - * over allocated. + * If this allocation cannot block and it is for a specific node, then + * fail early. There's no need to wakeup kswapd or retry for a + * speculative node-specific allocation. */ - if (IS_ENABLED(CONFIG_NUMA) && - (gfp_mask & GFP_THISNODE) == GFP_THISNODE) + if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait) goto nopage; retry: @@ -2874,7 +2864,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, /* * Check the zones suitable for the gfp_mask contain at least one * valid zone. It's possible to have an empty zonelist as a result - * of GFP_THISNODE and a memoryless node + * of __GFP_THISNODE and a memoryless node */ if (unlikely(!zonelist->_zonerefs->zone)) return NULL; diff --git a/mm/slab.c b/mm/slab.c index c4b89eaf4c96..7eb38dd1cefa 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -857,6 +857,11 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, return NULL; } +static inline gfp_t gfp_exact_node(gfp_t flags) +{ + return flags; +} + #else /* CONFIG_NUMA */ static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); @@ -1023,6 +1028,15 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) return __cache_free_alien(cachep, objp, node, page_node); } + +/* + * Construct gfp mask to allocate from a specific node but do not invoke reclaim + * or warn about failures. + */ +static inline gfp_t gfp_exact_node(gfp_t flags) +{ + return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT; +} #endif /* @@ -2825,7 +2839,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, if (unlikely(!ac->avail)) { int x; force_grow: - x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); + x = cache_grow(cachep, gfp_exact_node(flags), node, NULL); /* cache_grow can reenable interrupts, then ac could change. 
*/ ac = cpu_cache_get(cachep); @@ -3019,7 +3033,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) get_node(cache, nid) && get_node(cache, nid)->free_objects) { obj = ____cache_alloc_node(cache, - flags | GFP_THISNODE, nid); + gfp_exact_node(flags), nid); if (obj) break; } @@ -3047,7 +3061,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) nid = page_to_nid(page); if (cache_grow(cache, flags, nid, page)) { obj = ____cache_alloc_node(cache, - flags | GFP_THISNODE, nid); + gfp_exact_node(flags), nid); if (!obj) /* * Another processor may allocate the @@ -3118,7 +3132,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, must_grow: spin_unlock(&n->list_lock); - x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); + x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL); if (x) goto retry; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 50ec42f170a0..2dacc7b5af23 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -100,7 +100,9 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, new_stats = kmem_cache_alloc_node(flow_stats_cache, - GFP_THISNODE | + GFP_NOWAIT | + __GFP_THISNODE | + __GFP_NOWARN | __GFP_NOMEMALLOC, node); if (likely(new_stats)) { From 5265047ac30191ea24b16503165000c225f54feb Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2015 15:46:58 -0700 Subject: [PATCH 089/122] mm, thp: really limit transparent hugepage allocation to local node Commit 077fcf116c8c ("mm/thp: allocate transparent hugepages on local node") restructured alloc_hugepage_vma() with the intent of only allocating transparent hugepages locally when there was not an effective interleave mempolicy. alloc_pages_exact_node() does not limit the allocation to the single node, however, but rather prefers it. This is because __GFP_THISNODE is not set which would cause the node-local nodemask to be passed. Without it, only a nodemask that prefers the local node is passed. Fix this by passing __GFP_THISNODE and falling back to small pages when the allocation fails. Commit 9f1b868a13ac ("mm: thp: khugepaged: add policy for finding target node") suffers from a similar problem for khugepaged, which is also fixed. Fixes: 077fcf116c8c ("mm/thp: allocate transparent hugepages on local node") Fixes: 9f1b868a13ac ("mm: thp: khugepaged: add policy for finding target node") Signed-off-by: David Rientjes Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Johannes Weiner Cc: Mel Gorman Cc: Pravin Shelar Cc: Jarno Rajahalme Cc: Li Zefan Cc: Greg Thelen Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 9 +++++++-- mm/mempolicy.c | 3 ++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6352c1dfa898..3afb5cbe1312 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2328,8 +2328,14 @@ static struct page struct vm_area_struct *vma, unsigned long address, int node) { + gfp_t flags; + VM_BUG_ON_PAGE(*hpage, *hpage); + /* Only allocate from the target node */ + flags = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) | + __GFP_THISNODE; + /* * Before allocating the hugepage, release the mmap_sem read lock. 
* The allocation can take potentially a long time if it involves @@ -2338,8 +2344,7 @@ static struct page */ up_read(&mm->mmap_sem); - *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( - khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); + *hpage = alloc_pages_exact_node(node, flags, HPAGE_PMD_ORDER); if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); *hpage = ERR_PTR(-ENOMEM); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 69d05acfa18c..ede26291d4aa 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1986,7 +1986,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, nmask = policy_nodemask(gfp, pol); if (!nmask || node_isset(node, *nmask)) { mpol_cond_put(pol); - page = alloc_pages_exact_node(node, gfp, order); + page = alloc_pages_exact_node(node, + gfp | __GFP_THISNODE, order); goto out; } } From 6e276d2a517fba71a2be2252fdffb237e746e784 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2015 15:47:01 -0700 Subject: [PATCH 090/122] kernel, cpuset: remove exception for __GFP_THISNODE Nothing calls __cpuset_node_allowed() with __GFP_THISNODE set anymore, so remove the obscure comment about it and its special-case exception. Signed-off-by: David Rientjes Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Johannes Weiner Cc: Mel Gorman Cc: Pravin Shelar Cc: Jarno Rajahalme Cc: Li Zefan Cc: Greg Thelen Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c68f0721df10..ee14e3a35a29 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2453,20 +2453,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * @node: is this an allowed node? * @gfp_mask: memory allocation flags * - * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is - * set, yes, we can always allocate. If node is in our task's mems_allowed, - * yes. If it's not a __GFP_HARDWALL request and this node is in the nearest - * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been - * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE - * flag, yes. + * If we're in interrupt, yes, we can always allocate. If @node is set in + * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this + * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, + * yes. If current has access to memory reserves due to TIF_MEMDIE, yes. * Otherwise, no. * - * The __GFP_THISNODE placement logic is really handled elsewhere, - * by forcibly using a zonelist starting at a specified node, and by - * (in get_page_from_freelist()) refusing to consider the zones for - * any node on the zonelist except the first. By the time any such - * calls get to this routine, we should just shut up and say 'yes'. - * * GFP_USER allocations are marked with the __GFP_HARDWALL bit, * and do not allow allocations outside the current tasks cpuset * unless the task has been OOM killed as is marked TIF_MEMDIE. @@ -2502,7 +2494,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask) int allowed; /* is allocation in zone z allowed? 
*/ unsigned long flags; - if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) + if (in_interrupt()) return 1; if (node_isset(node, current->mems_allowed)) return 1; From ac173824959adeb489f9fcf88858774c4535a241 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 14 Apr 2015 15:47:04 -0700 Subject: [PATCH 091/122] mm: cma: constify and use correct signness in mm/cma.c Constify function parameters and use correct signness where needed. Signed-off-by: Sasha Levin Cc: Michal Nazarewicz Cc: Marek Szyprowski Cc: Joonsoo Kim Cc: Laurent Pinchart Acked-by: Gregory Fong Cc: Pintu Kumar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cma.h | 12 ++++++------ mm/cma.c | 24 ++++++++++++++---------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/include/linux/cma.h b/include/linux/cma.h index 9384ba66e975..f7ef093ec49a 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -16,16 +16,16 @@ struct cma; extern unsigned long totalcma_pages; -extern phys_addr_t cma_get_base(struct cma *cma); -extern unsigned long cma_get_size(struct cma *cma); +extern phys_addr_t cma_get_base(const struct cma *cma); +extern unsigned long cma_get_size(const struct cma *cma); extern int __init cma_declare_contiguous(phys_addr_t base, phys_addr_t size, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, bool fixed, struct cma **res_cma); -extern int cma_init_reserved_mem(phys_addr_t base, - phys_addr_t size, int order_per_bit, +extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, + unsigned int order_per_bit, struct cma **res_cma); -extern struct page *cma_alloc(struct cma *cma, int count, unsigned int align); -extern bool cma_release(struct cma *cma, struct page *pages, int count); +extern struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align); +extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count); #endif diff --git a/mm/cma.c b/mm/cma.c index 7f6045420925..47203faaf65e 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -41,17 +41,18 @@ struct cma cma_areas[MAX_CMA_AREAS]; unsigned cma_area_count; static DEFINE_MUTEX(cma_mutex); -phys_addr_t cma_get_base(struct cma *cma) +phys_addr_t cma_get_base(const struct cma *cma) { return PFN_PHYS(cma->base_pfn); } -unsigned long cma_get_size(struct cma *cma) +unsigned long cma_get_size(const struct cma *cma) { return cma->count << PAGE_SHIFT; } -static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) +static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, + int align_order) { if (align_order <= cma->order_per_bit) return 0; @@ -62,7 +63,8 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) * Find a PFN aligned to the specified order and return an offset represented in * order_per_bits. 
*/ -static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order) +static unsigned long cma_bitmap_aligned_offset(const struct cma *cma, + int align_order) { if (align_order <= cma->order_per_bit) return 0; @@ -71,13 +73,14 @@ static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order) - cma->base_pfn) >> cma->order_per_bit; } -static unsigned long cma_bitmap_pages_to_bits(struct cma *cma, - unsigned long pages) +static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma, + unsigned long pages) { return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; } -static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count) +static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, + unsigned int count) { unsigned long bitmap_no, bitmap_count; @@ -162,7 +165,8 @@ core_initcall(cma_init_reserved_areas); * This function creates custom contiguous area from already reserved memory. */ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, - int order_per_bit, struct cma **res_cma) + unsigned int order_per_bit, + struct cma **res_cma) { struct cma *cma; phys_addr_t alignment; @@ -353,7 +357,7 @@ int __init cma_declare_contiguous(phys_addr_t base, * This function allocates part of contiguous memory on specific * contiguous memory area. */ -struct page *cma_alloc(struct cma *cma, int count, unsigned int align) +struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align) { unsigned long mask, offset, pfn, start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; @@ -424,7 +428,7 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align) * It returns false when provided pages do not belong to contiguous area and * true otherwise. */ -bool cma_release(struct cma *cma, struct page *pages, int count) +bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) { unsigned long pfn; From 647757197cd34fae041e21af39ded00f5c346fc4 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 14 Apr 2015 15:47:07 -0700 Subject: [PATCH 092/122] mm: clarify __GFP_NOFAIL deprecation status __GFP_NOFAIL is documented as a deprecated flag since commit 478352e789f5 ("mm: add comment about deprecation of __GFP_NOFAIL"). This has discouraged people from using it but in some cases an opencoded endless loop around allocator has been used instead. So the allocator is not aware of the de facto __GFP_NOFAIL allocation because this information was not communicated properly. Let's make clear that if the allocation context really cannot afford failure because there is no good failure policy then using __GFP_NOFAIL is preferable to opencoding the loop outside of the allocator. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Michal Hocko Acked-by: David Rientjes Cc: Johannes Weiner Cc: Dave Chinner Cc: "Theodore Ts'o" Cc: Mel Gorman Cc: Tetsuo Handa Cc: "David S. Miller" Cc: Vipul Pandya Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 4423a0f8eabe..97a9373e61e8 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -57,8 +57,10 @@ struct vm_area_struct; * _might_ fail. This depends upon the particular VM implementation. * * __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller - * cannot handle allocation failures. This modifier is deprecated and no new - * users should be added. 
+ * cannot handle allocation failures. New users should be evaluated carefully + * (and the flag should be used only when there is no reasonable failure policy) + * but it is definitely preferable to use the flag rather than opencode endless + * loop around allocator. * * __GFP_NORETRY: The VM implementation must not retry indefinitely. * From f91e8d6d76866eafba255864ca617438cec268de Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 14 Apr 2015 15:47:11 -0700 Subject: [PATCH 093/122] sparc: clarify __GFP_NOFAIL allocation Commit 920c3ed74134 ("[SPARC64]: Add basic infrastructure for MD add/remove notification") has added __GFP_NOFAIL for the allocation request but it hasn't mentioned why is this strict requirement really needed. The code was handling an allocation failure and propagated it properly up the callchain so it is not clear why it is needed. Dave has clarified the intention when I tried to remove the flag as not being necessary: : It is a serious failure. : : If we miss an MDESC update due to this allocation failure, the update : is not an event which gets retransmitted so we will lose the updated : machine description forever. : : We really need this allocation to succeed. So add a comment to clarify the nofail flag and get rid of the failure check because __GFP_NOFAIL allocation doesn't fail. Signed-off-by: Michal Hocko Cc: David Rientjes Cc: Johannes Weiner Cc: Dave Chinner Cc: "Theodore Ts'o" Cc: Mel Gorman Cc: Tetsuo Handa Cc: "David S. Miller" Cc: Vipul Pandya Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/kernel/mdesc.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c index 99632a87e697..26c80e18d7b1 100644 --- a/arch/sparc/kernel/mdesc.c +++ b/arch/sparc/kernel/mdesc.c @@ -130,26 +130,26 @@ static struct mdesc_mem_ops memblock_mdesc_ops = { static struct mdesc_handle *mdesc_kmalloc(unsigned int mdesc_size) { unsigned int handle_size; + struct mdesc_handle *hp; + unsigned long addr; void *base; handle_size = (sizeof(struct mdesc_handle) - sizeof(struct mdesc_hdr) + mdesc_size); + /* + * Allocation has to succeed because mdesc update would be missed + * and such events are not retransmitted. + */ base = kmalloc(handle_size + 15, GFP_KERNEL | __GFP_NOFAIL); - if (base) { - struct mdesc_handle *hp; - unsigned long addr; + addr = (unsigned long)base; + addr = (addr + 15UL) & ~15UL; + hp = (struct mdesc_handle *) addr; - addr = (unsigned long)base; - addr = (addr + 15UL) & ~15UL; - hp = (struct mdesc_handle *) addr; + mdesc_handle_init(hp, handle_size, base); - mdesc_handle_init(hp, handle_size, base); - return hp; - } - - return NULL; + return hp; } static void mdesc_kfree(struct mdesc_handle *hp) From 42ff27035c77edccfdf346c739265040dc2155f2 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Tue, 14 Apr 2015 15:47:14 -0700 Subject: [PATCH 094/122] mm/page_alloc.c: clean up comment Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 86af1a96a6dc..1b849500640c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5760,7 +5760,7 @@ static void __setup_per_zone_wmarks(void) * value here. * * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) - * deltas controls asynch page reclaim, and so should + * deltas control asynch page reclaim, and so should * not be capped for highmem. 
*/ unsigned long min_pages; From 0f616be120c632c818faaea9adcb8f05a7a8601f Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 14 Apr 2015 15:47:17 -0700 Subject: [PATCH 095/122] mm: change __get_vm_area_node() to use fls_long() ioremap() and its related interfaces are used to create I/O mappings to memory-mapped I/O devices. The mapping sizes of the traditional I/O devices are relatively small. Non-volatile memory (NVM), however, has many GB and is going to have TB soon. It is not very efficient to create large I/O mappings with 4KB. This patchset extends the ioremap() interfaces to transparently create I/O mappings with huge pages whenever possible. ioremap() continues to use 4KB mappings when a huge page does not fit into a requested range. There is no change necessary to the drivers using ioremap(). A requested physical address must be aligned by a huge page size (1GB or 2MB on x86) for using huge page mapping, though. The kernel huge I/O mapping will improve performance of NVM and other devices with large memory, and reduce the time to create their mappings as well. On x86, MTRRs can override PAT memory types with a 4KB granularity. When using a huge page, MTRRs can override the memory type of the huge page, which may lead to a performance penalty. The processor can also behave in an undefined manner if a huge page is mapped to a memory range that MTRRs have mapped with multiple different memory types. Therefore, the mapping code falls back to use a smaller page size toward 4KB when a mapping range is covered by non-WB type of MTRRs. The WB type of MTRRs has no effect on the PAT memory types. The patchset introduces HAVE_ARCH_HUGE_VMAP, which indicates that the arch supports huge KVA mappings for ioremap(). Users may specify a new kernel option "nohugeiomap" to disable the huge I/O mapping capability of ioremap() when necessary. Patches 1-4 change common files to support huge I/O mappings. There is no change in the functionalities unless HAVE_ARCH_HUGE_VMAP is defined on the architecture of the system. Patches 5-6 implement the HAVE_ARCH_HUGE_VMAP funcs on x86, and set HAVE_ARCH_HUGE_VMAP on x86. This patch (of 6): __get_vm_area_node() takes unsigned long size, which is a 64-bit value on a 64-bit kernel. However, fls(size) simply ignores the upper 32 bits. Change to use fls_long() to handle the size properly. Signed-off-by: Toshi Kani Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Arnd Bergmann Cc: Dave Hansen Cc: Robert Elliott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 49abccf29a29..a48cd061f16f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -1314,7 +1315,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, BUG_ON(in_interrupt()); if (flags & VM_IOREMAP) - align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER); + align = 1ul << clamp_t(int, fls_long(size), + PAGE_SHIFT, IOREMAP_MAX_ORDER); size = PAGE_ALIGN(size); if (unlikely(!size)) From 0ddab1d2ed664c85c95488eef569786a84aedf37 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 14 Apr 2015 15:47:20 -0700 Subject: [PATCH 096/122] lib/ioremap.c: add huge I/O map capability interfaces Add ioremap_pud_enabled() and ioremap_pmd_enabled(), which return 1 when I/O mappings with pud/pmd are enabled on the kernel.
ioremap_huge_init() calls arch_ioremap_pud_supported() and arch_ioremap_pmd_supported() to initialize the capabilities at boot-time. A new kernel option "nohugeiomap" is also added, so that user can disable the huge I/O map capabilities when necessary. Signed-off-by: Toshi Kani Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Arnd Bergmann Cc: Dave Hansen Cc: Robert Elliott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 2 ++ arch/Kconfig | 3 +++ include/linux/io.h | 8 +++++++ init/main.c | 2 ++ lib/ioremap.c | 37 +++++++++++++++++++++++++++++ 5 files changed, 52 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 71eecb263250..b1fa70907ccf 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2323,6 +2323,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. register save and restore. The kernel will only save legacy floating-point registers on task switch. + nohugeiomap [KNL,x86] Disable kernel huge I/O mappings. + noxsave [BUGS=X86] Disables x86 extended register state save and restore using xsave. The kernel will fallback to enabling legacy floating-point and sse state. diff --git a/arch/Kconfig b/arch/Kconfig index a9c95d36ba70..c88c23f0a1da 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -446,6 +446,9 @@ config HAVE_IRQ_TIME_ACCOUNTING config HAVE_ARCH_TRANSPARENT_HUGEPAGE bool +config HAVE_ARCH_HUGE_VMAP + bool + config HAVE_ARCH_SOFT_DIRTY bool diff --git a/include/linux/io.h b/include/linux/io.h index fa02e55e5a2e..4cc299c598e0 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -38,6 +38,14 @@ static inline int ioremap_page_range(unsigned long addr, unsigned long end, } #endif +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +void __init ioremap_huge_init(void); +int arch_ioremap_pud_supported(void); +int arch_ioremap_pmd_supported(void); +#else +static inline void ioremap_huge_init(void) { } +#endif + /* * Managed iomap interface */ diff --git a/init/main.c b/init/main.c index 4a6974e67839..f6dd8fe1f22c 100644 --- a/init/main.c +++ b/init/main.c @@ -80,6 +80,7 @@ #include #include #include +#include #include #include @@ -484,6 +485,7 @@ static void __init mm_init(void) percpu_init_late(); pgtable_init(); vmalloc_init(); + ioremap_huge_init(); } asmlinkage __visible void __init start_kernel(void) diff --git a/lib/ioremap.c b/lib/ioremap.c index 0c9216c48762..2008652c9a1f 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -13,6 +13,43 @@ #include #include +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +int __read_mostly ioremap_pud_capable; +int __read_mostly ioremap_pmd_capable; +int __read_mostly ioremap_huge_disabled; + +static int __init set_nohugeiomap(char *str) +{ + ioremap_huge_disabled = 1; + return 0; +} +early_param("nohugeiomap", set_nohugeiomap); + +void __init ioremap_huge_init(void) +{ + if (!ioremap_huge_disabled) { + if (arch_ioremap_pud_supported()) + ioremap_pud_capable = 1; + if (arch_ioremap_pmd_supported()) + ioremap_pmd_capable = 1; + } +} + +static inline int ioremap_pud_enabled(void) +{ + return ioremap_pud_capable; +} + +static inline int ioremap_pmd_enabled(void) +{ + return ioremap_pmd_capable; +} + +#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ +static inline int ioremap_pud_enabled(void) { return 0; } +static inline int ioremap_pmd_enabled(void) { return 0; } +#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ + static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t 
phys_addr, pgprot_t prot) { From e61ce6ade404ef17bd63d62bec78e29047a47b47 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 14 Apr 2015 15:47:23 -0700 Subject: [PATCH 097/122] mm: change ioremap to set up huge I/O mappings ioremap_pud_range() and ioremap_pmd_range() are changed to create huge I/O mappings when their capability is enabled, and a request meets required conditions -- both virtual & physical addresses are aligned by their huge page size, and a requested range fufills their huge page size. When pud_set_huge() or pmd_set_huge() returns zero, i.e. no-operation is performed, the code simply falls back to the next level. The changes are only enabled when CONFIG_HAVE_ARCH_HUGE_VMAP is defined on the architecture. Signed-off-by: Toshi Kani Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Arnd Bergmann Cc: Dave Hansen Cc: Robert Elliott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 15 +++++++++++++++ lib/ioremap.c | 16 ++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 1f9f5da6828f..9fb7dc7ca7f4 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -6,6 +6,7 @@ #include #include +#include #if 4 - defined(__PAGETABLE_PUD_FOLDED) - defined(__PAGETABLE_PMD_FOLDED) != \ CONFIG_PGTABLE_LEVELS @@ -696,6 +697,20 @@ static inline int pmd_protnone(pmd_t pmd) #endif /* CONFIG_MMU */ +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); +int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); +#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ +static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) +{ + return 0; +} +static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) +{ + return 0; +} +#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ + #endif /* !__ASSEMBLY__ */ #ifndef io_remap_pfn_range diff --git a/lib/ioremap.c b/lib/ioremap.c index 2008652c9a1f..be249062ba6e 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -80,6 +80,14 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, return -ENOMEM; do { next = pmd_addr_end(addr, end); + + if (ioremap_pmd_enabled() && + ((next - addr) == PMD_SIZE) && + IS_ALIGNED(phys_addr + addr, PMD_SIZE)) { + if (pmd_set_huge(pmd, phys_addr + addr, prot)) + continue; + } + if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot)) return -ENOMEM; } while (pmd++, addr = next, addr != end); @@ -98,6 +106,14 @@ static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr, return -ENOMEM; do { next = pud_addr_end(addr, end); + + if (ioremap_pud_enabled() && + ((next - addr) == PUD_SIZE) && + IS_ALIGNED(phys_addr + addr, PUD_SIZE)) { + if (pud_set_huge(pud, phys_addr + addr, prot)) + continue; + } + if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot)) return -ENOMEM; } while (pud++, addr = next, addr != end); From b9820d8f39f816b67112eb7ec5cdc4c1655ff060 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 14 Apr 2015 15:47:26 -0700 Subject: [PATCH 098/122] mm: change vunmap to tear down huge KVA mappings Change vunmap_pmd_range() and vunmap_pud_range() to tear down huge KVA mappings when they are set. pud_clear_huge() and pmd_clear_huge() return zero when no-operation is performed, i.e. huge page mapping was not used. These changes are only enabled when CONFIG_HAVE_ARCH_HUGE_VMAP is defined on the architecture. 
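Taken together, patches 096-098 gate the huge-mapping fast path on a capability check plus a simple size/alignment test, and fall back to the next smaller page-table level when either fails. Below is a minimal standalone illustration of that test; it is not kernel code, and the function name can_map_with_pmd() and EXAMPLE_PMD_SIZE are inventions for this sketch, with 2MB taken from the x86-64 PMD size mentioned in the cover letter.

#include <stdio.h>
#include <stdint.h>

/* Illustration only: 2MB stands in for the x86-64 PMD size; the kernel
 * code uses PMD_SIZE/PUD_SIZE and IS_ALIGNED() instead.
 */
#define EXAMPLE_PMD_SIZE (2UL << 20)

/* Simplified form of the per-step test added to ioremap_pmd_range():
 * the current step of the walk must span exactly one huge page and the
 * physical address for that step must be huge-page aligned; otherwise
 * this step keeps using 4KB PTE mappings.
 */
static int can_map_with_pmd(uint64_t phys_addr, uint64_t step_len)
{
	return step_len == EXAMPLE_PMD_SIZE &&
	       (phys_addr & (EXAMPLE_PMD_SIZE - 1)) == 0;
}

int main(void)
{
	printf("%d\n", can_map_with_pmd(0x100000000ULL, 2UL << 20)); /* 1: aligned, full PMD */
	printf("%d\n", can_map_with_pmd(0x100001000ULL, 2UL << 20)); /* 0: not 2MB-aligned */
	printf("%d\n", can_map_with_pmd(0x100000000ULL, 1UL << 20)); /* 0: smaller than a PMD */
	return 0;
}

The actual per-step checks in ioremap_pmd_range()/ioremap_pud_range() shown above additionally require ioremap_pmd_enabled()/ioremap_pud_enabled() to be set, and still fall back if pmd_set_huge()/pud_set_huge() returns zero.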
[akpm@linux-foundation.org: use consistent code layout] Signed-off-by: Toshi Kani Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Arnd Bergmann Cc: Dave Hansen Cc: Robert Elliott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 10 ++++++++++ mm/vmalloc.c | 4 ++++ 2 files changed, 14 insertions(+) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 9fb7dc7ca7f4..39f1d6a2b04d 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -700,6 +700,8 @@ static inline int pmd_protnone(pmd_t pmd) #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); +int pud_clear_huge(pud_t *pud); +int pmd_clear_huge(pmd_t *pmd); #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { @@ -709,6 +711,14 @@ static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { return 0; } +static inline int pud_clear_huge(pud_t *pud) +{ + return 0; +} +static inline int pmd_clear_huge(pmd_t *pmd) +{ + return 0; +} #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ #endif /* !__ASSEMBLY__ */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a48cd061f16f..a5bbdd3b5d67 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -75,6 +75,8 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_clear_huge(pmd)) + continue; if (pmd_none_or_clear_bad(pmd)) continue; vunmap_pte_range(pmd, addr, next); @@ -89,6 +91,8 @@ static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end) pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); + if (pud_clear_huge(pud)) + continue; if (pud_none_or_clear_bad(pud)) continue; vunmap_pmd_range(pud, addr, next); From 5d72b4fba40ef4b3f7a1a11d6aacc85d9af81561 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 14 Apr 2015 15:47:29 -0700 Subject: [PATCH 099/122] x86, mm: support huge I/O mapping capability I/F Implement huge I/O mapping capability interfaces for ioremap() on x86. IOREMAP_MAX_ORDER is defined to PUD_SHIFT on x86/64 and PMD_SHIFT on x86/32, which overrides the default value defined in . Signed-off-by: Toshi Kani Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Arnd Bergmann Cc: Dave Hansen Cc: Robert Elliott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/page_types.h | 2 ++ arch/x86/mm/ioremap.c | 23 +++++++++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index f97fbe3abb67..c7c712f2648b 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -40,8 +40,10 @@ #ifdef CONFIG_X86_64 #include +#define IOREMAP_MAX_ORDER (PUD_SHIFT) #else #include +#define IOREMAP_MAX_ORDER (PMD_SHIFT) #endif /* CONFIG_X86_64 */ #ifndef __ASSEMBLY__ diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index fdf617c00e2f..5ead4d6cf3a7 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -67,8 +67,13 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages, /* * Remap an arbitrary physical address space into the kernel virtual - * address space. Needed when the kernel wants to access high addresses - * directly. + * address space. 
It transparently creates kernel huge I/O mapping when + the physical address is aligned by a huge page size (1GB or 2MB) and + the requested size is at least the huge page size. + * + * NOTE: MTRRs can override PAT memory types with a 4KB granularity. + * Therefore, the mapping code falls back to use a smaller page toward 4KB + * when a mapping range is covered by non-WB type of MTRRs. * * NOTE! We need to allow non-page-aligned mappings too: we will obviously * have to convert them into an offset in a page-aligned mapping, but the @@ -326,6 +331,20 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); +int arch_ioremap_pud_supported(void) +{ +#ifdef CONFIG_X86_64 + return cpu_has_gbpages; +#else + return 0; +#endif +} + +int arch_ioremap_pmd_supported(void) +{ + return cpu_has_pse; +} + /* * Convert a physical pointer to a virtual kernel pointer for /dev/mem * access From 6b6378355b925050eb6fa966742d8c2d65ff0d83 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 14 Apr 2015 15:47:32 -0700 Subject: [PATCH 100/122] x86, mm: support huge KVA mappings on x86 Implement huge KVA mapping interfaces on x86. On x86, MTRRs can override PAT memory types with a 4KB granularity. When using a huge page, MTRRs can override the memory type of the huge page, which may lead to a performance penalty. The processor can also behave in an undefined manner if a huge page is mapped to a memory range that MTRRs have mapped with multiple different memory types. Therefore, the mapping code falls back to use a smaller page size toward 4KB when a mapping range is covered by non-WB type of MTRRs. The WB type of MTRRs has no effect on the PAT memory types. pud_set_huge() and pmd_set_huge() call mtrr_type_lookup() to see if a given range is covered by MTRRs. MTRR_TYPE_WRBACK indicates that the range is either covered by WB or not covered and the MTRR default value is set to WB. 0xFF indicates that MTRRs are disabled. HAVE_ARCH_HUGE_VMAP is selected when X86_64 or X86_32 with X86_PAE is set. X86_32 without X86_PAE is not supported since such a config is unlikely to benefit from this feature, and there was an issue found in testing. [fengguang.wu@intel.com: ioremap_pud_capable can be static] Signed-off-by: Toshi Kani Cc: "H.
Peter Anvin" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Arnd Bergmann Cc: Dave Hansen Cc: Robert Elliott Signed-off-by: Fengguang Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/mm/pgtable.c | 65 +++++++++++++++++++++++++++++++++++++++++++ lib/ioremap.c | 6 ++-- 3 files changed, 69 insertions(+), 3 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3e3aaad23414..0f948cefaeb1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -99,6 +99,7 @@ config X86 select IRQ_FORCED_THREADING select HAVE_BPF_JIT if X86_64 select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARCH_HUGE_VMAP if X86_64 || (X86_32 && X86_PAE) select ARCH_HAS_SG_CHAIN select CLKEVT_I8253 select ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index b28edfecbdfe..0b97d2c75df3 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -4,6 +4,7 @@ #include #include #include +#include #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO @@ -560,3 +561,67 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, { __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); } + +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) +{ + u8 mtrr; + + /* + * Do not use a huge page when the range is covered by non-WB type + * of MTRRs. + */ + mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE); + if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF)) + return 0; + + prot = pgprot_4k_2_large(prot); + + set_pte((pte_t *)pud, pfn_pte( + (u64)addr >> PAGE_SHIFT, + __pgprot(pgprot_val(prot) | _PAGE_PSE))); + + return 1; +} + +int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) +{ + u8 mtrr; + + /* + * Do not use a huge page when the range is covered by non-WB type + * of MTRRs. + */ + mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE); + if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF)) + return 0; + + prot = pgprot_4k_2_large(prot); + + set_pte((pte_t *)pmd, pfn_pte( + (u64)addr >> PAGE_SHIFT, + __pgprot(pgprot_val(prot) | _PAGE_PSE))); + + return 1; +} + +int pud_clear_huge(pud_t *pud) +{ + if (pud_large(*pud)) { + pud_clear(pud); + return 1; + } + + return 0; +} + +int pmd_clear_huge(pmd_t *pmd) +{ + if (pmd_large(*pmd)) { + pmd_clear(pmd); + return 1; + } + + return 0; +} +#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ diff --git a/lib/ioremap.c b/lib/ioremap.c index be249062ba6e..86c8911b0e3a 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -14,9 +14,9 @@ #include #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -int __read_mostly ioremap_pud_capable; -int __read_mostly ioremap_pmd_capable; -int __read_mostly ioremap_huge_disabled; +static int __read_mostly ioremap_pud_capable; +static int __read_mostly ioremap_pmd_capable; +static int __read_mostly ioremap_huge_disabled; static int __init set_nohugeiomap(char *str) { From b1b0deabbffa922fed808d4a5d99d03372a4c701 Mon Sep 17 00:00:00 2001 From: Chen Gang <762976180@qq.com> Date: Tue, 14 Apr 2015 15:47:35 -0700 Subject: [PATCH 101/122] mm: memcontrol: let mem_cgroup_move_account() have effect only if MMU enabled When !MMU, it will report warning. 
The related warning with allmodconfig under c6x: CC mm/memcontrol.o mm/memcontrol.c:2802:12: warning: 'mem_cgroup_move_account' defined but not used [-Wunused-function] static int mem_cgroup_move_account(struct page *page, ^ Signed-off-by: Chen Gang Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 172 ++++++++++++++++++++++++------------------------ 1 file changed, 86 insertions(+), 86 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 68d4890fc4bd..f227786e73db 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2785,92 +2785,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -/** - * mem_cgroup_move_account - move account of the page - * @page: the page - * @nr_pages: number of regular pages (>1 for huge pages) - * @from: mem_cgroup which the page is moved from. - * @to: mem_cgroup which the page is moved to. @from != @to. - * - * The caller must confirm following. - * - page is not on LRU (isolate_page() is useful.) - * - compound_lock is held when nr_pages > 1 - * - * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" - * from old cgroup. - */ -static int mem_cgroup_move_account(struct page *page, - unsigned int nr_pages, - struct mem_cgroup *from, - struct mem_cgroup *to) -{ - unsigned long flags; - int ret; - - VM_BUG_ON(from == to); - VM_BUG_ON_PAGE(PageLRU(page), page); - /* - * The page is isolated from LRU. So, collapse function - * will not handle this page. But page splitting can happen. - * Do this check under compound_page_lock(). The caller should - * hold it. - */ - ret = -EBUSY; - if (nr_pages > 1 && !PageTransHuge(page)) - goto out; - - /* - * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup - * of its source page while we change it: page migration takes - * both pages off the LRU, but page cache replacement doesn't. - */ - if (!trylock_page(page)) - goto out; - - ret = -EINVAL; - if (page->mem_cgroup != from) - goto out_unlock; - - spin_lock_irqsave(&from->move_lock, flags); - - if (!PageAnon(page) && page_mapped(page)) { - __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], - nr_pages); - __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], - nr_pages); - } - - if (PageWriteback(page)) { - __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], - nr_pages); - __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], - nr_pages); - } - - /* - * It is safe to change page->mem_cgroup here because the page - * is referenced, charged, and isolated - we can't race with - * uncharging, charging, migration, or LRU putback. - */ - - /* caller should have done css_get */ - page->mem_cgroup = to; - spin_unlock_irqrestore(&from->move_lock, flags); - - ret = 0; - - local_irq_disable(); - mem_cgroup_charge_statistics(to, page, nr_pages); - memcg_check_events(to, page); - mem_cgroup_charge_statistics(from, page, -nr_pages); - memcg_check_events(from, page); - local_irq_enable(); -out_unlock: - unlock_page(page); -out: - return ret; -} - #ifdef CONFIG_MEMCG_SWAP static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, bool charge) @@ -4822,6 +4736,92 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, return page; } +/** + * mem_cgroup_move_account - move account of the page + * @page: the page + * @nr_pages: number of regular pages (>1 for huge pages) + * @from: mem_cgroup which the page is moved from. + * @to: mem_cgroup which the page is moved to. 
@from != @to. + * + * The caller must confirm following. + * - page is not on LRU (isolate_page() is useful.) + * - compound_lock is held when nr_pages > 1 + * + * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" + * from old cgroup. + */ +static int mem_cgroup_move_account(struct page *page, + unsigned int nr_pages, + struct mem_cgroup *from, + struct mem_cgroup *to) +{ + unsigned long flags; + int ret; + + VM_BUG_ON(from == to); + VM_BUG_ON_PAGE(PageLRU(page), page); + /* + * The page is isolated from LRU. So, collapse function + * will not handle this page. But page splitting can happen. + * Do this check under compound_page_lock(). The caller should + * hold it. + */ + ret = -EBUSY; + if (nr_pages > 1 && !PageTransHuge(page)) + goto out; + + /* + * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup + * of its source page while we change it: page migration takes + * both pages off the LRU, but page cache replacement doesn't. + */ + if (!trylock_page(page)) + goto out; + + ret = -EINVAL; + if (page->mem_cgroup != from) + goto out_unlock; + + spin_lock_irqsave(&from->move_lock, flags); + + if (!PageAnon(page) && page_mapped(page)) { + __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], + nr_pages); + __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], + nr_pages); + } + + if (PageWriteback(page)) { + __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], + nr_pages); + __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], + nr_pages); + } + + /* + * It is safe to change page->mem_cgroup here because the page + * is referenced, charged, and isolated - we can't race with + * uncharging, charging, migration, or LRU putback. + */ + + /* caller should have done css_get */ + page->mem_cgroup = to; + spin_unlock_irqrestore(&from->move_lock, flags); + + ret = 0; + + local_irq_disable(); + mem_cgroup_charge_statistics(to, page, nr_pages); + memcg_check_events(to, page); + mem_cgroup_charge_statistics(from, page, -nr_pages); + memcg_check_events(from, page); + local_irq_enable(); +out_unlock: + unlock_page(page); +out: + return ret; +} + static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, union mc_target *target) { From a87938b2e246b81b4fb713edb371a9fa3c5c3c86 Mon Sep 17 00:00:00 2001 From: Michael Davidson Date: Tue, 14 Apr 2015 15:47:38 -0700 Subject: [PATCH 102/122] fs/binfmt_elf.c: fix bug in loading of PIE binaries With CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE enabled, and a normal top-down address allocation strategy, load_elf_binary() will attempt to map a PIE binary into an address range immediately below mm->mmap_base. Unfortunately, load_elf_binary() does not take account of the need to allocate sufficient space for the entire binary which means that, while the first PT_LOAD segment is mapped below mm->mmap_base, the subsequent PT_LOAD segment(s) end up being mapped above mm->mmap_base into the area that is supposed to be the "gap" between the stack and the binary. Since the size of the "gap" on x86_64 is only guaranteed to be 128MB, this means that binaries with large data segments > 128MB can end up mapping part of their data segment over their stack, resulting in corruption of the stack (and the data segment once the binary starts to run). Any PIE binary with a data segment > 128MB is vulnerable to this although address randomization means that the actual gap between the stack and the end of the binary is normally greater than 128MB (a worked illustration follows).
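For illustration, with made-up round numbers: suppose the gap is at its guaranteed minimum of 128MB and the first PT_LOAD segment of a PIE binary is mapped immediately below mm->mmap_base. If the binary's last PT_LOAD segment ends roughly 300MB above the start of its first one, the later mappings reach about 300MB above mm->mmap_base, overrunning the 128MB gap by roughly 172MB and landing on top of the stack.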
The larger the data segment of the binary the higher the probability of failure. Fix this by calculating the total size of the binary in the same way as load_elf_interp(). Signed-off-by: Michael Davidson Cc: Alexander Viro Cc: Jiri Kosina Cc: Kees Cook Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 995986b8e36b..d925f55e4857 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -862,6 +862,7 @@ static int load_elf_binary(struct linux_binprm *bprm) i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { int elf_prot = 0, elf_flags; unsigned long k, vaddr; + unsigned long total_size = 0; if (elf_ppnt->p_type != PT_LOAD) continue; @@ -924,10 +925,16 @@ static int load_elf_binary(struct linux_binprm *bprm) #else load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); #endif + total_size = total_mapping_size(elf_phdata, + loc->elf_ex.e_phnum); + if (!total_size) { + error = -EINVAL; + goto out_free_dentry; + } } error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, - elf_prot, elf_flags, 0); + elf_prot, elf_flags, total_size); if (BAD_ADDR(error)) { retval = IS_ERR((void *)error) ? PTR_ERR((void*)error) : -EINVAL; From fbbc400f3924ce095b466c776dc294727ec0a202 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 14 Apr 2015 15:47:41 -0700 Subject: [PATCH 103/122] arm: factor out mmap ASLR into mmap_rnd To address the "offset2lib" ASLR weakness[1], this separates ET_DYN ASLR from mmap ASLR, as already done on s390. The architectures that are already randomizing mmap (arm, arm64, mips, powerpc, s390, and x86), have their various forms of arch_mmap_rnd() made available via the new CONFIG_ARCH_HAS_ELF_RANDOMIZE. For these architectures, arch_randomize_brk() is collapsed as well. This is an alternative to the solutions in: https://lkml.org/lkml/2015/2/23/442 I've been able to test x86 and arm, and the buildbot (so far) seems happy with building the rest. [1] http://cybersecurity.upv.es/attacks/offset2lib/offset2lib.html This patch (of 10): In preparation for splitting out ET_DYN ASLR, this moves the ASLR calculations for mmap on ARM into a separate routine, similar to x86. This also removes the redundant check of personality (PF_RANDOMIZE is already set before calling arch_pick_mmap_layout). Signed-off-by: Kees Cook Cc: Hector Marco-Gisbert Cc: Russell King Reviewed-by: Ingo Molnar Cc: Catalin Marinas Cc: Will Deacon Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Alexander Viro Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: "David A. 
Long" Cc: Andrey Ryabinin Cc: Arun Chandran Cc: Yann Droneaud Cc: Min-Hua Chen Cc: Paul Burton Cc: Alex Smith Cc: Markos Chandras Cc: Vineeth Vijayan Cc: Jeff Bailey Cc: Michael Holzheu Cc: Ben Hutchings Cc: Behan Webster Cc: Ismael Ripoll Cc: Jan-Simon Mller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/mmap.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c index 5e85ed371364..15a8160096b3 100644 --- a/arch/arm/mm/mmap.c +++ b/arch/arm/mm/mmap.c @@ -169,14 +169,22 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; } +static unsigned long mmap_rnd(void) +{ + unsigned long rnd; + + /* 8 bits of randomness in 20 address space bits */ + rnd = (unsigned long)get_random_int() % (1 << 8); + + return rnd << PAGE_SHIFT; +} + void arch_pick_mmap_layout(struct mm_struct *mm) { unsigned long random_factor = 0UL; - /* 8 bits of randomness in 20 address space bits */ - if ((current->flags & PF_RANDOMIZE) && - !(current->personality & ADDR_NO_RANDOMIZE)) - random_factor = (get_random_int() % (1 << 8)) << PAGE_SHIFT; + if (current->flags & PF_RANDOMIZE) + random_factor = mmap_rnd(); if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; From 82168140bc4cec7ec9bad39705518541149ff8b7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 14 Apr 2015 15:47:45 -0700 Subject: [PATCH 104/122] x86: standardize mmap_rnd() usage In preparation for splitting out ET_DYN ASLR, this refactors the use of mmap_rnd() to be used similarly to arm, and extracts the checking of PF_RANDOMIZE. Signed-off-by: Kees Cook Reviewed-by: Ingo Molnar Cc: Oleg Nesterov Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/mmap.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index df4552bd239e..ebfa52030d5c 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -67,22 +67,21 @@ static int mmap_is_legacy(void) static unsigned long mmap_rnd(void) { - unsigned long rnd = 0; + unsigned long rnd; /* - * 8 bits of randomness in 32bit mmaps, 20 address space bits - * 28 bits of randomness in 64bit mmaps, 40 address space bits - */ - if (current->flags & PF_RANDOMIZE) { - if (mmap_is_ia32()) - rnd = get_random_int() % (1<<8); - else - rnd = get_random_int() % (1<<28); - } + * 8 bits of randomness in 32bit mmaps, 20 address space bits + * 28 bits of randomness in 64bit mmaps, 40 address space bits + */ + if (mmap_is_ia32()) + rnd = (unsigned long)get_random_int() % (1<<8); + else + rnd = (unsigned long)get_random_int() % (1<<28); + return rnd << PAGE_SHIFT; } -static unsigned long mmap_base(void) +static unsigned long mmap_base(unsigned long rnd) { unsigned long gap = rlimit(RLIMIT_STACK); @@ -91,19 +90,19 @@ static unsigned long mmap_base(void) else if (gap > MAX_GAP) gap = MAX_GAP; - return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd()); + return PAGE_ALIGN(TASK_SIZE - gap - rnd); } /* * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 * does, but not when emulating X86_32 */ -static unsigned long mmap_legacy_base(void) +static unsigned long mmap_legacy_base(unsigned long rnd) { if (mmap_is_ia32()) return TASK_UNMAPPED_BASE; else - return TASK_UNMAPPED_BASE + mmap_rnd(); + return TASK_UNMAPPED_BASE + rnd; } /* @@ -112,13 +111,18 @@ static unsigned long mmap_legacy_base(void) */ void arch_pick_mmap_layout(struct 
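The per-architecture patches in this part of the series (arm, x86, arm64, mips above; powerpc and s390 below) converge on the same split: arch_pick_mmap_layout() keeps the PF_RANDOMIZE policy check, while the new mmap_rnd() helper only computes a page-aligned random offset from an arch-specific number of random bits. The sketch below is a standalone illustration of that offset calculation only; EXAMPLE_PAGE_SHIFT, EXAMPLE_RND_BITS and the use of rand() are stand-ins, whereas the kernel code uses PAGE_SHIFT, per-arch masks and get_random_int().

#include <stdio.h>
#include <stdlib.h>

/* Illustration only: 4KB pages and 8 random bits, the 32-bit values quoted
 * in the arm and x86 patches; 64-bit x86 uses 28 bits instead.
 */
#define EXAMPLE_PAGE_SHIFT 12
#define EXAMPLE_RND_BITS   8

static unsigned long example_mmap_rnd(void)
{
	unsigned long rnd = (unsigned long)rand() % (1UL << EXAMPLE_RND_BITS);

	/* page-aligned offset, here anywhere in [0, 1MB - 4KB] */
	return rnd << EXAMPLE_PAGE_SHIFT;
}

int main(void)
{
	/* The PF_RANDOMIZE check stays in arch_pick_mmap_layout(); only the
	 * offset computation moves into the helper, so it can later be
	 * reused for ET_DYN randomization as well.
	 */
	printf("random mmap offset: %#lx\n", example_mmap_rnd());
	return 0;
}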
mm_struct *mm) { - mm->mmap_legacy_base = mmap_legacy_base(); - mm->mmap_base = mmap_base(); + unsigned long random_factor = 0UL; + + if (current->flags & PF_RANDOMIZE) + random_factor = mmap_rnd(); + + mm->mmap_legacy_base = mmap_legacy_base(random_factor); if (mmap_is_legacy()) { mm->mmap_base = mm->mmap_legacy_base; mm->get_unmapped_area = arch_get_unmapped_area; } else { + mm->mmap_base = mmap_base(random_factor); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } From dd04cff1dceab18226853b555cf07914648a235f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 14 Apr 2015 15:47:48 -0700 Subject: [PATCH 105/122] arm64: standardize mmap_rnd() usage In preparation for splitting out ET_DYN ASLR, this refactors the use of mmap_rnd() to be used similarly to arm and x86. This additionally enables mmap ASLR on legacy mmap layouts, which appeared to be missing on arm64, and was already supported on arm. Additionally removes a copy/pasted declaration of an unused function. Signed-off-by: Kees Cook Cc: Russell King Cc: Catalin Marinas Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/elf.h | 1 - arch/arm64/mm/mmap.c | 18 +++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 1f65be393139..f724db00b235 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -125,7 +125,6 @@ typedef struct user_fpsimd_state elf_fpregset_t; * the loader. We need to make sure that it is out of the way of the program * that it will "exec", and that there is sufficient room for the brk. */ -extern unsigned long randomize_et_dyn(unsigned long base); #define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) /* diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 54922d1275b8..ba776c01b552 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -49,15 +49,14 @@ static int mmap_is_legacy(void) static unsigned long mmap_rnd(void) { - unsigned long rnd = 0; + unsigned long rnd; - if (current->flags & PF_RANDOMIZE) - rnd = (long)get_random_int() & STACK_RND_MASK; + rnd = (unsigned long)get_random_int() & STACK_RND_MASK; return rnd << PAGE_SHIFT; } -static unsigned long mmap_base(void) +static unsigned long mmap_base(unsigned long rnd) { unsigned long gap = rlimit(RLIMIT_STACK); @@ -66,7 +65,7 @@ static unsigned long mmap_base(void) else if (gap > MAX_GAP) gap = MAX_GAP; - return PAGE_ALIGN(STACK_TOP - gap - mmap_rnd()); + return PAGE_ALIGN(STACK_TOP - gap - rnd); } /* @@ -75,15 +74,20 @@ static unsigned long mmap_base(void) */ void arch_pick_mmap_layout(struct mm_struct *mm) { + unsigned long random_factor = 0UL; + + if (current->flags & PF_RANDOMIZE) + random_factor = mmap_rnd(); + /* * Fall back to the standard layout if the personality bit is set, or * if the expected stack growth is unlimited: */ if (mmap_is_legacy()) { - mm->mmap_base = TASK_UNMAPPED_BASE; + mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(); + mm->mmap_base = mmap_base(random_factor); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } From 1f0569df0b0285e7ec2432d804a4921b06a61618 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 14 Apr 2015 15:47:51 -0700 Subject: [PATCH 106/122] mips: extract logic for mmap_rnd() In preparation for splitting out ET_DYN ASLR, extract the mmap ASLR selection into a separate function. 
Signed-off-by: Kees Cook Reviewed-by: Ingo Molnar Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/mmap.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index f1baadd56e82..9a4f1f5c1f0e 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -142,18 +142,26 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, addr0, len, pgoff, flags, DOWN); } +static unsigned long mmap_rnd(void) +{ + unsigned long rnd; + + rnd = (unsigned long)get_random_int(); + rnd <<= PAGE_SHIFT; + if (TASK_IS_32BIT_ADDR) + rnd &= 0xfffffful; + else + rnd &= 0xffffffful; + + return rnd; +} + void arch_pick_mmap_layout(struct mm_struct *mm) { unsigned long random_factor = 0UL; - if (current->flags & PF_RANDOMIZE) { - random_factor = get_random_int(); - random_factor = random_factor << PAGE_SHIFT; - if (TASK_IS_32BIT_ADDR) - random_factor &= 0xfffffful; - else - random_factor &= 0xffffffful; - } + if (current->flags & PF_RANDOMIZE) + random_factor = mmap_rnd(); if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; From ed6322746afb74c2509e2f3a6464182793b16eb9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 14 Apr 2015 15:47:54 -0700 Subject: [PATCH 107/122] powerpc: standardize mmap_rnd() usage In preparation for splitting out ET_DYN ASLR, this refactors the use of mmap_rnd() to be used similarly to arm and x86. (Can mmap ASLR be safely enabled in the legacy mmap case here? Other archs use "mm->mmap_base = TASK_UNMAPPED_BASE + random_factor".) Signed-off-by: Kees Cook Reviewed-by: Ingo Molnar Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/mmap.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index cb8bdbe4972f..1ad2299d795d 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -55,19 +55,18 @@ static inline int mmap_is_legacy(void) static unsigned long mmap_rnd(void) { - unsigned long rnd = 0; + unsigned long rnd; + + /* 8MB for 32bit, 1GB for 64bit */ + if (is_32bit_task()) + rnd = (unsigned long)get_random_int() % (1<<(23-PAGE_SHIFT)); + else + rnd = (unsigned long)get_random_int() % (1<<(30-PAGE_SHIFT)); - if (current->flags & PF_RANDOMIZE) { - /* 8MB for 32bit, 1GB for 64bit */ - if (is_32bit_task()) - rnd = (long)(get_random_int() % (1<<(23-PAGE_SHIFT))); - else - rnd = (long)(get_random_int() % (1<<(30-PAGE_SHIFT))); - } return rnd << PAGE_SHIFT; } -static inline unsigned long mmap_base(void) +static inline unsigned long mmap_base(unsigned long rnd) { unsigned long gap = rlimit(RLIMIT_STACK); @@ -76,7 +75,7 @@ static inline unsigned long mmap_base(void) else if (gap > MAX_GAP) gap = MAX_GAP; - return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd()); + return PAGE_ALIGN(TASK_SIZE - gap - rnd); } /* @@ -85,6 +84,11 @@ static inline unsigned long mmap_base(void) */ void arch_pick_mmap_layout(struct mm_struct *mm) { + unsigned long random_factor = 0UL; + + if (current->flags & PF_RANDOMIZE) + random_factor = mmap_rnd(); + /* * Fall back to the standard layout if the personality * bit is set, or if the expected stack growth is unlimited: @@ -93,7 +97,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(); + mm->mmap_base = 
mmap_base(random_factor); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } From 8e89a356feb6f196824a72101861d931a97ac2d2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 14 Apr 2015 15:47:57 -0700 Subject: [PATCH 108/122] s390: standardize mmap_rnd() usage In preparation for splitting out ET_DYN ASLR, this refactors the use of mmap_rnd() to be used similarly to arm and x86, and extracts the checking of PF_RANDOMIZE. Signed-off-by: Kees Cook Acked-by: Martin Schwidefsky Cc: Heiko Carstens Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/mmap.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 179a2c20b01f..db57078075c5 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -62,20 +62,18 @@ static inline int mmap_is_legacy(void) static unsigned long mmap_rnd(void) { - if (!(current->flags & PF_RANDOMIZE)) - return 0; if (is_32bit_task()) return (get_random_int() & 0x7ff) << PAGE_SHIFT; else return (get_random_int() & mmap_rnd_mask) << PAGE_SHIFT; } -static unsigned long mmap_base_legacy(void) +static unsigned long mmap_base_legacy(unsigned long rnd) { - return TASK_UNMAPPED_BASE + mmap_rnd(); + return TASK_UNMAPPED_BASE + rnd; } -static inline unsigned long mmap_base(void) +static inline unsigned long mmap_base(unsigned long rnd) { unsigned long gap = rlimit(RLIMIT_STACK); @@ -84,7 +82,7 @@ static inline unsigned long mmap_base(void) else if (gap > MAX_GAP) gap = MAX_GAP; gap &= PAGE_MASK; - return STACK_TOP - stack_maxrandom_size() - mmap_rnd() - gap; + return STACK_TOP - stack_maxrandom_size() - rnd - gap; } unsigned long @@ -187,7 +185,11 @@ unsigned long randomize_et_dyn(void) if (!is_32bit_task()) /* Align to 4GB */ base &= ~((1UL << 32) - 1); - return base + mmap_rnd(); + + if (current->flags & PF_RANDOMIZE) + base += mmap_rnd(); + + return base; } #ifndef CONFIG_64BIT @@ -198,15 +200,20 @@ unsigned long randomize_et_dyn(void) */ void arch_pick_mmap_layout(struct mm_struct *mm) { + unsigned long random_factor = 0UL; + + if (current->flags & PF_RANDOMIZE) + random_factor = mmap_rnd(); + /* * Fall back to the standard layout if the personality * bit is set, or if the expected stack growth is unlimited: */ if (mmap_is_legacy()) { - mm->mmap_base = mmap_base_legacy(); + mm->mmap_base = mmap_base_legacy(random_factor); mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(); + mm->mmap_base = mmap_base(random_factor); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } @@ -273,15 +280,20 @@ s390_get_unmapped_area_topdown(struct file *filp, const unsigned long addr, */ void arch_pick_mmap_layout(struct mm_struct *mm) { + unsigned long random_factor = 0UL; + + if (current->flags & PF_RANDOMIZE) + random_factor = mmap_rnd(); + /* * Fall back to the standard layout if the personality * bit is set, or if the expected stack growth is unlimited: */ if (mmap_is_legacy()) { - mm->mmap_base = mmap_base_legacy(); + mm->mmap_base = mmap_base_legacy(random_factor); mm->get_unmapped_area = s390_get_unmapped_area; } else { - mm->mmap_base = mmap_base(); + mm->mmap_base = mmap_base(random_factor); mm->get_unmapped_area = s390_get_unmapped_area_topdown; } } From 2b68f6caeac271620cd2f9362aeaed360e317df0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 14 Apr 2015 15:48:00 -0700 Subject: [PATCH 109/122] mm: expose arch_mmap_rnd when available When an architecture fully supports randomizing the ELF 
load location, a per-arch mmap_rnd() function is used to find a randomized mmap base. In preparation for randomizing the location of ET_DYN binaries separately from mmap, this renames and exports these functions as arch_mmap_rnd(). Additionally introduces CONFIG_ARCH_HAS_ELF_RANDOMIZE for describing this feature on architectures that support it (which is a superset of ARCH_BINFMT_ELF_RANDOMIZE_PIE, since s390 already supports a separated ET_DYN ASLR from mmap ASLR without the ARCH_BINFMT_ELF_RANDOMIZE_PIE logic). Signed-off-by: Kees Cook Cc: Hector Marco-Gisbert Cc: Russell King Reviewed-by: Ingo Molnar Cc: Catalin Marinas Cc: Will Deacon Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Alexander Viro Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: "David A. Long" Cc: Andrey Ryabinin Cc: Arun Chandran Cc: Yann Droneaud Cc: Min-Hua Chen Cc: Paul Burton Cc: Alex Smith Cc: Markos Chandras Cc: Vineeth Vijayan Cc: Jeff Bailey Cc: Michael Holzheu Cc: Ben Hutchings Cc: Behan Webster Cc: Ismael Ripoll Cc: Jan-Simon Mller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 7 +++++++ arch/arm/Kconfig | 1 + arch/arm/mm/mmap.c | 4 ++-- arch/arm64/Kconfig | 1 + arch/arm64/mm/mmap.c | 4 ++-- arch/mips/Kconfig | 1 + arch/mips/mm/mmap.c | 4 ++-- arch/powerpc/Kconfig | 1 + arch/powerpc/mm/mmap.c | 4 ++-- arch/s390/Kconfig | 1 + arch/s390/mm/mmap.c | 8 ++++---- arch/x86/Kconfig | 1 + arch/x86/mm/mmap.c | 4 ++-- include/linux/elf-randomize.h | 10 ++++++++++ 14 files changed, 37 insertions(+), 14 deletions(-) create mode 100644 include/linux/elf-randomize.h diff --git a/arch/Kconfig b/arch/Kconfig index c88c23f0a1da..474904a8e540 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -491,6 +491,13 @@ config PGTABLE_LEVELS int default 2 +config ARCH_HAS_ELF_RANDOMIZE + bool + help + An architecture supports choosing randomized locations for + stack, mmap, brk, and ET_DYN. 
Defined functions: + - arch_mmap_rnd() + # # ABI hall of shame # diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 696cf3c61e0f..f85200a63a8b 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -3,6 +3,7 @@ config ARM default y select ARCH_BINFMT_ELF_RANDOMIZE_PIE select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE + select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c index 15a8160096b3..407dc786583a 100644 --- a/arch/arm/mm/mmap.c +++ b/arch/arm/mm/mmap.c @@ -169,7 +169,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; } -static unsigned long mmap_rnd(void) +unsigned long arch_mmap_rnd(void) { unsigned long rnd; @@ -184,7 +184,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) - random_factor = mmap_rnd(); + random_factor = arch_mmap_rnd(); if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3f2fba996bc2..7c1dbeb73e8d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2,6 +2,7 @@ config ARM64 def_bool y select ARCH_BINFMT_ELF_RANDOMIZE_PIE select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE + select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SG_CHAIN select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index ba776c01b552..ed177475dd8c 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -47,7 +47,7 @@ static int mmap_is_legacy(void) return sysctl_legacy_va_layout; } -static unsigned long mmap_rnd(void) +unsigned long arch_mmap_rnd(void) { unsigned long rnd; @@ -77,7 +77,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) - random_factor = mmap_rnd(); + random_factor = arch_mmap_rnd(); /* * Fall back to the standard layout if the personality bit is set, or diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index a9d112d2a135..688ce274f59d 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -24,6 +24,7 @@ config MIPS select HAVE_DEBUG_KMEMLEAK select HAVE_SYSCALL_TRACEPOINTS select ARCH_BINFMT_ELF_RANDOMIZE_PIE + select ARCH_HAS_ELF_RANDOMIZE select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES && 64BIT select RTC_LIB if !MACH_LOONGSON select GENERIC_ATOMIC64 if !64BIT diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index 9a4f1f5c1f0e..5c81fdd032c3 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -142,7 +142,7 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, addr0, len, pgoff, flags, DOWN); } -static unsigned long mmap_rnd(void) +unsigned long arch_mmap_rnd(void) { unsigned long rnd; @@ -161,7 +161,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) - random_factor = mmap_rnd(); + random_factor = arch_mmap_rnd(); if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 91ad76f30d18..fc5fffbb331b 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -89,6 +89,7 @@ config PPC select ARCH_MIGHT_HAVE_PC_SERIO select BINFMT_ELF select ARCH_BINFMT_ELF_RANDOMIZE_PIE + select ARCH_HAS_ELF_RANDOMIZE select OF select OF_EARLY_FLATTREE select OF_RESERVED_MEM 
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index 1ad2299d795d..0f0502e12f6c 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -53,7 +53,7 @@ static inline int mmap_is_legacy(void) return sysctl_legacy_va_layout; } -static unsigned long mmap_rnd(void) +unsigned long arch_mmap_rnd(void) { unsigned long rnd; @@ -87,7 +87,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) - random_factor = mmap_rnd(); + random_factor = arch_mmap_rnd(); /* * Fall back to the standard layout if the personality diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index f6aebcb7a0f8..ac2b75d74cd2 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -65,6 +65,7 @@ config S390 def_bool y select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SG_CHAIN select ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index db57078075c5..a94504d99c47 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -60,7 +60,7 @@ static inline int mmap_is_legacy(void) return sysctl_legacy_va_layout; } -static unsigned long mmap_rnd(void) +unsigned long arch_mmap_rnd(void) { if (is_32bit_task()) return (get_random_int() & 0x7ff) << PAGE_SHIFT; @@ -187,7 +187,7 @@ unsigned long randomize_et_dyn(void) base &= ~((1UL << 32) - 1); if (current->flags & PF_RANDOMIZE) - base += mmap_rnd(); + base += arch_mmap_rnd(); return base; } @@ -203,7 +203,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) - random_factor = mmap_rnd(); + random_factor = arch_mmap_rnd(); /* * Fall back to the standard layout if the personality @@ -283,7 +283,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) - random_factor = mmap_rnd(); + random_factor = arch_mmap_rnd(); /* * Fall back to the standard layout if the personality diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0f948cefaeb1..782ddbbc1c9a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -88,6 +88,7 @@ config X86 select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP select HAVE_USER_RETURN_NOTIFIER select ARCH_BINFMT_ELF_RANDOMIZE_PIE + select ARCH_HAS_ELF_RANDOMIZE select HAVE_ARCH_JUMP_LABEL select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select SPARSE_IRQ diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index ebfa52030d5c..9d518d693b4b 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -65,7 +65,7 @@ static int mmap_is_legacy(void) return sysctl_legacy_va_layout; } -static unsigned long mmap_rnd(void) +unsigned long arch_mmap_rnd(void) { unsigned long rnd; @@ -114,7 +114,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) - random_factor = mmap_rnd(); + random_factor = arch_mmap_rnd(); mm->mmap_legacy_base = mmap_legacy_base(random_factor); diff --git a/include/linux/elf-randomize.h b/include/linux/elf-randomize.h new file mode 100644 index 000000000000..7a4eda02d2b1 --- /dev/null +++ b/include/linux/elf-randomize.h @@ -0,0 +1,10 @@ +#ifndef _ELF_RANDOMIZE_H +#define _ELF_RANDOMIZE_H + +#ifndef CONFIG_ARCH_HAS_ELF_RANDOMIZE +static inline unsigned long arch_mmap_rnd(void) { return 0; } +#else +extern unsigned long arch_mmap_rnd(void); +#endif + +#endif From c6f5b001e65cdac592b65a08c5d2dd179cfba568 Mon Sep 17 00:00:00 
2001 From: Kees Cook Date: Tue, 14 Apr 2015 15:48:04 -0700 Subject: [PATCH 110/122] s390: redefine randomize_et_dyn for ELF_ET_DYN_BASE In preparation for moving ET_DYN randomization into the ELF loader (which requires a static ELF_ET_DYN_BASE), this redefines s390's existing ET_DYN randomization in a call to arch_mmap_rnd(). This refactoring results in the same ET_DYN randomization on s390. Signed-off-by: Kees Cook Acked-by: Martin Schwidefsky Cc: Heiko Carstens Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/elf.h | 8 +++++--- arch/s390/mm/mmap.c | 11 ++--------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index c9c875d9ed31..f8db4781a4c2 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -161,10 +161,12 @@ extern unsigned int vdso_enabled; /* This is the location that an ET_DYN program is loaded if exec'ed. Typical use of this is to invoke "./ld.so someprog" to test out a new version of the loader. We need to make sure that it is out of the way of the program - that it will "exec", and that there is sufficient room for the brk. */ - + that it will "exec", and that there is sufficient room for the brk. 64-bit + tasks are aligned to 4GB. */ extern unsigned long randomize_et_dyn(void); -#define ELF_ET_DYN_BASE randomize_et_dyn() +#define ELF_ET_DYN_BASE (randomize_et_dyn() + (is_32bit_task() ? \ + (STACK_TOP / 3 * 2) : \ + (STACK_TOP / 3 * 2) & ~((1UL << 32) - 1))) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. */ diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index a94504d99c47..8c11536f972d 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -179,17 +179,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, unsigned long randomize_et_dyn(void) { - unsigned long base; - - base = STACK_TOP / 3 * 2; - if (!is_32bit_task()) - /* Align to 4GB */ - base &= ~((1UL << 32) - 1); - if (current->flags & PF_RANDOMIZE) - base += arch_mmap_rnd(); + return arch_mmap_rnd(); - return base; + return 0UL; } #ifndef CONFIG_64BIT From d1fd836dcf00d2028c700c7e44d2c23404062c90 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 14 Apr 2015 15:48:07 -0700 Subject: [PATCH 111/122] mm: split ET_DYN ASLR from mmap ASLR This fixes the "offset2lib" weakness in ASLR for arm, arm64, mips, powerpc, and x86. The problem is that if there is a leak of ASLR from the executable (ET_DYN), it means a leak of shared library offset as well (mmap), and vice versa. Further details and a PoC of this attack is available here: http://cybersecurity.upv.es/attacks/offset2lib/offset2lib.html With this patch, a PIE linked executable (ET_DYN) has its own ASLR region: $ ./show_mmaps_pie 54859ccd6000-54859ccd7000 r-xp ... /tmp/show_mmaps_pie 54859ced6000-54859ced7000 r--p ... /tmp/show_mmaps_pie 54859ced7000-54859ced8000 rw-p ... /tmp/show_mmaps_pie 7f75be764000-7f75be91f000 r-xp ... /lib/x86_64-linux-gnu/libc.so.6 7f75be91f000-7f75beb1f000 ---p ... /lib/x86_64-linux-gnu/libc.so.6 7f75beb1f000-7f75beb23000 r--p ... /lib/x86_64-linux-gnu/libc.so.6 7f75beb23000-7f75beb25000 rw-p ... /lib/x86_64-linux-gnu/libc.so.6 7f75beb25000-7f75beb2a000 rw-p ... 7f75beb2a000-7f75beb4d000 r-xp ... /lib64/ld-linux-x86-64.so.2 7f75bed45000-7f75bed46000 rw-p ... 7f75bed46000-7f75bed47000 r-xp ... 7f75bed47000-7f75bed4c000 rw-p ... 7f75bed4c000-7f75bed4d000 r--p ... 
/lib64/ld-linux-x86-64.so.2 7f75bed4d000-7f75bed4e000 rw-p ... /lib64/ld-linux-x86-64.so.2 7f75bed4e000-7f75bed4f000 rw-p ... 7fffb3741000-7fffb3762000 rw-p ... [stack] 7fffb377b000-7fffb377d000 r--p ... [vvar] 7fffb377d000-7fffb377f000 r-xp ... [vdso] The change is to add a call the newly created arch_mmap_rnd() into the ELF loader for handling ET_DYN ASLR in a separate region from mmap ASLR, as was already done on s390. Removes CONFIG_BINFMT_ELF_RANDOMIZE_PIE, which is no longer needed. Signed-off-by: Kees Cook Reported-by: Hector Marco-Gisbert Cc: Russell King Reviewed-by: Ingo Molnar Cc: Catalin Marinas Cc: Will Deacon Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Alexander Viro Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: "David A. Long" Cc: Andrey Ryabinin Cc: Arun Chandran Cc: Yann Droneaud Cc: Min-Hua Chen Cc: Paul Burton Cc: Alex Smith Cc: Markos Chandras Cc: Vineeth Vijayan Cc: Jeff Bailey Cc: Michael Holzheu Cc: Ben Hutchings Cc: Behan Webster Cc: Ismael Ripoll Cc: Jan-Simon Mller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 1 - arch/arm64/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/s390/include/asm/elf.h | 5 ++--- arch/s390/mm/mmap.c | 8 -------- arch/x86/Kconfig | 1 - fs/Kconfig.binfmt | 3 --- fs/binfmt_elf.c | 18 ++++-------------- 9 files changed, 6 insertions(+), 33 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index f85200a63a8b..4b62f4caf0ce 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1,7 +1,6 @@ config ARM bool default y - select ARCH_BINFMT_ELF_RANDOMIZE_PIE select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7c1dbeb73e8d..34f487d5d84e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1,6 +1,5 @@ config ARM64 def_bool y - select ARCH_BINFMT_ELF_RANDOMIZE_PIE select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 688ce274f59d..a326c4cb8cf0 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -23,7 +23,6 @@ config MIPS select HAVE_KRETPROBES select HAVE_DEBUG_KMEMLEAK select HAVE_SYSCALL_TRACEPOINTS - select ARCH_BINFMT_ELF_RANDOMIZE_PIE select ARCH_HAS_ELF_RANDOMIZE select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES && 64BIT select RTC_LIB if !MACH_LOONGSON diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index fc5fffbb331b..e99014adf017 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -88,7 +88,6 @@ config PPC select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select BINFMT_ELF - select ARCH_BINFMT_ELF_RANDOMIZE_PIE select ARCH_HAS_ELF_RANDOMIZE select OF select OF_EARLY_FLATTREE diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index f8db4781a4c2..ff662155b2c4 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -163,10 +163,9 @@ extern unsigned int vdso_enabled; the loader. We need to make sure that it is out of the way of the program that it will "exec", and that there is sufficient room for the brk. 64-bit tasks are aligned to 4GB. */ -extern unsigned long randomize_et_dyn(void); -#define ELF_ET_DYN_BASE (randomize_et_dyn() + (is_32bit_task() ? \ +#define ELF_ET_DYN_BASE (is_32bit_task() ? 
\ (STACK_TOP / 3 * 2) : \ - (STACK_TOP / 3 * 2) & ~((1UL << 32) - 1))) + (STACK_TOP / 3 * 2) & ~((1UL << 32) - 1)) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. */ diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 8c11536f972d..bb3367c5cb0b 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -177,14 +177,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; } -unsigned long randomize_et_dyn(void) -{ - if (current->flags & PF_RANDOMIZE) - return arch_mmap_rnd(); - - return 0UL; -} - #ifndef CONFIG_64BIT /* diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 782ddbbc1c9a..1f7f185934a5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -87,7 +87,6 @@ config X86 select HAVE_ARCH_KMEMCHECK select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP select HAVE_USER_RETURN_NOTIFIER - select ARCH_BINFMT_ELF_RANDOMIZE_PIE select ARCH_HAS_ELF_RANDOMIZE select HAVE_ARCH_JUMP_LABEL select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 270c48148f79..2d0cbbd14cfc 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -27,9 +27,6 @@ config COMPAT_BINFMT_ELF bool depends on COMPAT && BINFMT_ELF -config ARCH_BINFMT_ELF_RANDOMIZE_PIE - bool - config ARCH_BINFMT_ELF_STATE bool diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index d925f55e4857..b20c05477e90 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -910,21 +911,10 @@ static int load_elf_binary(struct linux_binprm *bprm) * default mmap base, as well as whatever program they * might try to exec. This is because the brk will * follow the loader, and is not movable. */ -#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE - /* Memory randomization might have been switched off - * in runtime via sysctl or explicit setting of - * personality flags. - * If that is the case, retain the original non-zero - * load_bias value in order to establish proper - * non-randomized mappings. - */ + load_bias = ELF_ET_DYN_BASE - vaddr; if (current->flags & PF_RANDOMIZE) - load_bias = 0; - else - load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); -#else - load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); -#endif + load_bias += arch_mmap_rnd(); + load_bias = ELF_PAGESTART(load_bias); total_size = total_mapping_size(elf_phdata, loc->elf_ex.e_phnum); if (!total_size) { From 204db6ed17743000691d930368a5abd6ea541c58 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 14 Apr 2015 15:48:12 -0700 Subject: [PATCH 112/122] mm: fold arch_randomize_brk into ARCH_HAS_ELF_RANDOMIZE The arch_randomize_brk() function is used on several architectures, even those that don't support ET_DYN ASLR. To avoid bulky extern/#define tricks, consolidate the support under CONFIG_ARCH_HAS_ELF_RANDOMIZE for the architectures that support it, while still handling CONFIG_COMPAT_BRK. Signed-off-by: Kees Cook Cc: Hector Marco-Gisbert Cc: Russell King Reviewed-by: Ingo Molnar Cc: Catalin Marinas Cc: Will Deacon Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Alexander Viro Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: "David A. 
Long" Cc: Andrey Ryabinin Cc: Arun Chandran Cc: Yann Droneaud Cc: Min-Hua Chen Cc: Paul Burton Cc: Alex Smith Cc: Markos Chandras Cc: Vineeth Vijayan Cc: Jeff Bailey Cc: Michael Holzheu Cc: Ben Hutchings Cc: Behan Webster Cc: Ismael Ripoll Cc: Jan-Simon Mller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 1 + arch/arm/include/asm/elf.h | 4 ---- arch/arm64/include/asm/elf.h | 4 ---- arch/mips/include/asm/elf.h | 4 ---- arch/powerpc/include/asm/elf.h | 4 ---- arch/s390/include/asm/elf.h | 3 --- arch/x86/include/asm/elf.h | 3 --- fs/binfmt_elf.c | 4 +--- include/linux/elf-randomize.h | 12 ++++++++++++ 9 files changed, 14 insertions(+), 25 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 474904a8e540..e1068987bad1 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -497,6 +497,7 @@ config ARCH_HAS_ELF_RANDOMIZE An architecture supports choosing randomized locations for stack, mmap, brk, and ET_DYN. Defined functions: - arch_mmap_rnd() + - arch_randomize_brk() # # ABI hall of shame diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h index afb9cafd3786..c1ff8ab12914 100644 --- a/arch/arm/include/asm/elf.h +++ b/arch/arm/include/asm/elf.h @@ -125,10 +125,6 @@ int dump_task_regs(struct task_struct *t, elf_gregset_t *elfregs); extern void elf_set_personality(const struct elf32_hdr *); #define SET_PERSONALITY(ex) elf_set_personality(&(ex)) -struct mm_struct; -extern unsigned long arch_randomize_brk(struct mm_struct *mm); -#define arch_randomize_brk arch_randomize_brk - #ifdef CONFIG_MMU #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 struct linux_binprm; diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index f724db00b235..faad6df49e5b 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -156,10 +156,6 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, #define STACK_RND_MASK (0x3ffff >> (PAGE_SHIFT - 12)) #endif -struct mm_struct; -extern unsigned long arch_randomize_brk(struct mm_struct *mm); -#define arch_randomize_brk arch_randomize_brk - #ifdef CONFIG_COMPAT #ifdef __AARCH64EB__ diff --git a/arch/mips/include/asm/elf.h b/arch/mips/include/asm/elf.h index 535f196ffe02..31d747d46a23 100644 --- a/arch/mips/include/asm/elf.h +++ b/arch/mips/include/asm/elf.h @@ -410,10 +410,6 @@ struct linux_binprm; extern int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); -struct mm_struct; -extern unsigned long arch_randomize_brk(struct mm_struct *mm); -#define arch_randomize_brk arch_randomize_brk - struct arch_elf_state { int fp_abi; int interp_fp_abi; diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h index 57d289acb803..ee46ffef608e 100644 --- a/arch/powerpc/include/asm/elf.h +++ b/arch/powerpc/include/asm/elf.h @@ -128,10 +128,6 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, (0x7ff >> (PAGE_SHIFT - 12)) : \ (0x3ffff >> (PAGE_SHIFT - 12))) -extern unsigned long arch_randomize_brk(struct mm_struct *mm); -#define arch_randomize_brk arch_randomize_brk - - #ifdef CONFIG_SPU_BASE /* Notes used in ET_CORE. Note name is "SPU//". 
*/ #define NT_SPU 1 diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index ff662155b2c4..a5c4978462c1 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -226,9 +226,6 @@ struct linux_binprm; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 int arch_setup_additional_pages(struct linux_binprm *, int); -extern unsigned long arch_randomize_brk(struct mm_struct *mm); -#define arch_randomize_brk arch_randomize_brk - void *fill_cpu_elf_notes(void *ptr, struct save_area *sa, __vector128 *vxrs); #endif diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 935588d95c82..f161c189c27b 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -339,9 +339,6 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); #define compat_arch_setup_additional_pages compat_arch_setup_additional_pages -extern unsigned long arch_randomize_brk(struct mm_struct *mm); -#define arch_randomize_brk arch_randomize_brk - /* * True on X86_32 or when emulating IA32 on X86_64 */ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index b20c05477e90..241ef68d2893 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1050,15 +1050,13 @@ static int load_elf_binary(struct linux_binprm *bprm) current->mm->end_data = end_data; current->mm->start_stack = bprm->p; -#ifdef arch_randomize_brk if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) { current->mm->brk = current->mm->start_brk = arch_randomize_brk(current->mm); -#ifdef CONFIG_COMPAT_BRK +#ifdef compat_brk_randomized current->brk_randomized = 1; #endif } -#endif if (current->personality & MMAP_PAGE_ZERO) { /* Why this, you ask??? Well SVr4 maps page 0 as read-only, diff --git a/include/linux/elf-randomize.h b/include/linux/elf-randomize.h index 7a4eda02d2b1..b5f0bda9472e 100644 --- a/include/linux/elf-randomize.h +++ b/include/linux/elf-randomize.h @@ -1,10 +1,22 @@ #ifndef _ELF_RANDOMIZE_H #define _ELF_RANDOMIZE_H +struct mm_struct; + #ifndef CONFIG_ARCH_HAS_ELF_RANDOMIZE static inline unsigned long arch_mmap_rnd(void) { return 0; } +# if defined(arch_randomize_brk) && defined(CONFIG_COMPAT_BRK) +# define compat_brk_randomized +# endif +# ifndef arch_randomize_brk +# define arch_randomize_brk(mm) (mm->brk) +# endif #else extern unsigned long arch_mmap_rnd(void); +extern unsigned long arch_randomize_brk(struct mm_struct *mm); +# ifdef CONFIG_COMPAT_BRK +# define compat_brk_randomized +# endif #endif #endif From 2a8e70026435ad97570a1e0a0c4c941e0f700a3e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 14 Apr 2015 15:48:15 -0700 Subject: [PATCH 113/122] mm: numa: remove migrate_ratelimited This code is dead since commit 9e645ab6d089 ("sched/numa: Continue PTE scanning even if migrate rate limited") so remove it. 
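Taken together, patches 110-112 above reduce the per-architecture ELF ASLR work to selecting ARCH_HAS_ELF_RANDOMIZE and implementing the two helpers declared in <linux/elf-randomize.h>. A minimal sketch of that contract is shown here; it is not any architecture's actual implementation, and the entropy mask and brk window are placeholders loosely modelled on the s390/x86 hunks earlier in the series.

/*
 * Illustrative sketch only: what an architecture provides once its
 * Kconfig entry does "select ARCH_HAS_ELF_RANDOMIZE".  The 0x7ff mask
 * and the 32MB brk window are placeholder values for the example.
 */
#include <linux/elf-randomize.h>
#include <linux/mm.h>
#include <linux/random.h>

unsigned long arch_mmap_rnd(void)
{
	/* page-aligned randomness used for both mmap and ET_DYN bases */
	return (get_random_int() & 0x7ffUL) << PAGE_SHIFT;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	/* place the heap start somewhere in a window above the current brk */
	unsigned long range_end = mm->brk + 0x02000000;

	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}

With that in place, fs/binfmt_elf.c picks up arch_mmap_rnd() for the ET_DYN load bias and arch_randomize_brk() for the heap, and the per-arch extern/#define pairs removed above are no longer needed.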
Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 5 ----- mm/migrate.c | 20 -------------------- 2 files changed, 25 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 78baed5f2952..cac1c0904d5f 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -69,7 +69,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, extern bool pmd_trans_migrating(pmd_t pmd); extern int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int node); -extern bool migrate_ratelimited(int node); #else static inline bool pmd_trans_migrating(pmd_t pmd) { @@ -80,10 +79,6 @@ static inline int migrate_misplaced_page(struct page *page, { return -EAGAIN; /* can't migrate now */ } -static inline bool migrate_ratelimited(int node) -{ - return false; -} #endif /* CONFIG_NUMA_BALANCING */ #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) diff --git a/mm/migrate.c b/mm/migrate.c index ec1802d85f05..a65ff72ab739 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1565,30 +1565,10 @@ static struct page *alloc_misplaced_dst_page(struct page *page, * page migration rate limiting control. * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs * window of time. Default here says do not migrate more than 1280M per second. - * If a node is rate-limited then PTE NUMA updates are also rate-limited. However - * as it is faults that reset the window, pte updates will happen unconditionally - * if there has not been a fault since @pteupdate_interval_millisecs after the - * throttle window closed. */ static unsigned int migrate_interval_millisecs __read_mostly = 100; -static unsigned int pteupdate_interval_millisecs __read_mostly = 1000; static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); -/* Returns true if NUMA migration is currently rate limited */ -bool migrate_ratelimited(int node) -{ - pg_data_t *pgdat = NODE_DATA(node); - - if (time_after(jiffies, pgdat->numabalancing_migrate_next_window + - msecs_to_jiffies(pteupdate_interval_millisecs))) - return false; - - if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages) - return false; - - return true; -} - /* Returns true if the node is migrate rate-limited after the update */ static bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) From 2415b9f5cb048a803b30b790af994ba71ff0bd4c Mon Sep 17 00:00:00 2001 From: Balasubramani Vivekanandan Date: Tue, 14 Apr 2015 15:48:18 -0700 Subject: [PATCH 114/122] memcg: print cgroup information when system panics due to panic_on_oom If kernel panics due to oom, caused by a cgroup reaching its limit, when 'compulsory panic_on_oom' is enabled, then we will only see that the OOM happened because of "compulsory panic_on_oom is enabled" but this doesn't tell the difference between mempolicy and memcg. And dumping system wide information is plain wrong and more confusing. 
This patch provides the information of the cgroup whose limit triggerred panic Signed-off-by: Balasubramani Vivekanandan Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 3 ++- mm/memcontrol.c | 16 +++++++++------- mm/oom_kill.c | 7 ++++--- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/include/linux/oom.h b/include/linux/oom.h index d5771bed59c9..44b2f6f7bbd8 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -66,7 +66,8 @@ extern bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_flags); extern void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_flags); extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, - int order, const nodemask_t *nodemask); + int order, const nodemask_t *nodemask, + struct mem_cgroup *memcg); extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, unsigned long totalpages, const nodemask_t *nodemask, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f227786e73db..c3f09b2dda5f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1442,15 +1442,17 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) struct mem_cgroup *iter; unsigned int i; - if (!p) - return; - mutex_lock(&oom_info_lock); rcu_read_lock(); - pr_info("Task in "); - pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); - pr_cont(" killed as a result of limit of "); + if (p) { + pr_info("Task in "); + pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); + pr_cont(" killed as a result of limit of "); + } else { + pr_info("Memory limit reached of cgroup "); + } + pr_cont_cgroup_path(memcg->css.cgroup); pr_cont("\n"); @@ -1537,7 +1539,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, return; } - check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); + check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg); totalpages = mem_cgroup_get_limit(memcg) ? : 1; for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 642f38cb175a..52628c819bf7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -612,7 +612,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * Determines whether the kernel must panic because of the panic_on_oom sysctl. */ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, - int order, const nodemask_t *nodemask) + int order, const nodemask_t *nodemask, + struct mem_cgroup *memcg) { if (likely(!sysctl_panic_on_oom)) return; @@ -625,7 +626,7 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, if (constraint != CONSTRAINT_NONE) return; } - dump_header(NULL, gfp_mask, order, NULL, nodemask); + dump_header(NULL, gfp_mask, order, memcg, nodemask); panic("Out of memory: %s panic_on_oom is enabled\n", sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); } @@ -740,7 +741,7 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, constraint = constrained_alloc(zonelist, gfp_mask, nodemask, &totalpages); mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? 
nodemask : NULL; - check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); + check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL); if (sysctl_oom_kill_allocating_task && current->mm && !oom_unkillable_task(current, NULL, nodemask) && From 11d83360452ea2a95e699da01f8e1bcc4676a5de Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2015 15:48:21 -0700 Subject: [PATCH 115/122] mm, mempool: do not allow atomic resizing Allocating a large number of elements in atomic context could quickly deplete memory reserves, so just disallow atomic resizing entirely. Nothing currently uses mempool_resize() with anything other than GFP_KERNEL, so convert existing callers to drop the gfp_mask. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: David Rientjes Acked-by: Steffen Maier [zfcp] Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Steve French Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/s390/scsi/zfcp_erp.c | 4 ++-- fs/cifs/connect.c | 6 ++---- include/linux/mempool.h | 2 +- mm/mempool.c | 10 ++++++---- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c index 2c5d4567d1da..acde3f5d6e9e 100644 --- a/drivers/s390/scsi/zfcp_erp.c +++ b/drivers/s390/scsi/zfcp_erp.c @@ -738,11 +738,11 @@ static int zfcp_erp_adapter_strategy_open_fsf(struct zfcp_erp_action *act) return ZFCP_ERP_FAILED; if (mempool_resize(act->adapter->pool.sr_data, - act->adapter->stat_read_buf_num, GFP_KERNEL)) + act->adapter->stat_read_buf_num)) return ZFCP_ERP_FAILED; if (mempool_resize(act->adapter->pool.status_read_req, - act->adapter->stat_read_buf_num, GFP_KERNEL)) + act->adapter->stat_read_buf_num)) return ZFCP_ERP_FAILED; atomic_set(&act->adapter->stat_miss, act->adapter->stat_read_buf_num); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 480cf9c81d50..f3bfe08e177b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -773,8 +773,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) length = atomic_dec_return(&tcpSesAllocCount); if (length > 0) - mempool_resize(cifs_req_poolp, length + cifs_min_rcv, - GFP_KERNEL); + mempool_resize(cifs_req_poolp, length + cifs_min_rcv); } static int @@ -848,8 +847,7 @@ cifs_demultiplex_thread(void *p) length = atomic_inc_return(&tcpSesAllocCount); if (length > 1) - mempool_resize(cifs_req_poolp, length + cifs_min_rcv, - GFP_KERNEL); + mempool_resize(cifs_req_poolp, length + cifs_min_rcv); set_freezable(); while (server->tcpStatus != CifsExiting) { diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 39ed62ab5b8a..b19b3023c880 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -29,7 +29,7 @@ extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data, gfp_t gfp_mask, int nid); -extern int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask); +extern int mempool_resize(mempool_t *pool, int new_min_nr); extern void mempool_destroy(mempool_t *pool); extern void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask); extern void mempool_free(void *element, mempool_t *pool); diff --git a/mm/mempool.c b/mm/mempool.c index e209c98c7203..949970db2874 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -113,23 +113,24 @@ EXPORT_SYMBOL(mempool_create_node); * mempool_create(). * @new_min_nr: the new minimum number of elements guaranteed to be * allocated for this pool. - * @gfp_mask: the usual allocation bitmask. 
* * This function shrinks/grows the pool. In the case of growing, * it cannot be guaranteed that the pool will be grown to the new * size immediately, but new mempool_free() calls will refill it. + * This function may sleep. * * Note, the caller must guarantee that no mempool_destroy is called * while this function is running. mempool_alloc() & mempool_free() * might be called (eg. from IRQ contexts) while this function executes. */ -int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) +int mempool_resize(mempool_t *pool, int new_min_nr) { void *element; void **new_elements; unsigned long flags; BUG_ON(new_min_nr <= 0); + might_sleep(); spin_lock_irqsave(&pool->lock, flags); if (new_min_nr <= pool->min_nr) { @@ -145,7 +146,8 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) spin_unlock_irqrestore(&pool->lock, flags); /* Grow the pool */ - new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask); + new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements), + GFP_KERNEL); if (!new_elements) return -ENOMEM; @@ -164,7 +166,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) while (pool->curr_nr < pool->min_nr) { spin_unlock_irqrestore(&pool->lock, flags); - element = pool->alloc(gfp_mask, pool->pool_data); + element = pool->alloc(GFP_KERNEL, pool->pool_data); if (!element) goto out; spin_lock_irqsave(&pool->lock, flags); From 02057967b5d3b77ed47f06ee8fe98e2687fdf18b Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2015 15:48:24 -0700 Subject: [PATCH 116/122] mm, hugetlb: abort __get_user_pages if current has been oom killed If __get_user_pages() is faulting a significant number of hugetlb pages, usually as the result of mmap(MAP_LOCKED), it can potentially allocate a very large amount of memory. If the process has been oom killed, this will cause a lot of memory to potentially deplete memory reserves. In the same way that commit 4779280d1ea4 ("mm: make get_user_pages() interruptible") aborted for pending SIGKILLs when faulting non-hugetlb memory, based on the premise of commit 462e00cc7151 ("oom: stop allocating user memory if TIF_MEMDIE is set"), hugetlb page faults now terminate when the process has been oom killed. Signed-off-by: David Rientjes Acked-by: Rik van Riel Acked-by: Greg Thelen Cc: Naoya Horiguchi Acked-by: Davidlohr Bueso Acked-by: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index caad3c5a926f..8874c8ad55aa 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3277,6 +3277,15 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, int absent; struct page *page; + /* + * If we have a pending SIGKILL, don't keep faulting pages and + * potentially allocating memory. + */ + if (unlikely(fatal_signal_pending(current))) { + remainder = 0; + break; + } + /* * Some archs (sparc64, sh*) have multiple pte_ts to * each hugepage. We have to make sure we get the From 4a20799d11f64e6b8725cacc7619b1ae1dbf9acd Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Tue, 14 Apr 2015 15:48:27 -0700 Subject: [PATCH 117/122] mm: move memtest under mm Memtest is a simple feature which fills the memory with a given set of patterns and validates memory contents, if bad memory regions is detected it reserves them via memblock API. Since memblock API is widely used by other architectures this feature can be enabled outside of x86 world. 
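For orientation, the mover itself is small. A simplified sketch of one pass over a single free range follows; the real mm/memtest.c (relocated unchanged by this patch and switched to phys_addr_t in the next one) additionally walks every free memblock range for each pattern and coalesces adjacent bad words before reserving them.

/*
 * Simplified sketch of one memtest pass over one free physical range.
 * Unlike the real code, bad words are reserved individually instead of
 * being merged into spans first.
 */
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/memblock.h>
#include <linux/mm.h>

static void __init memtest_one_range(u64 pattern, phys_addr_t start,
				     phys_addr_t size)
{
	u64 *p, *begin, *end;
	phys_addr_t aligned = ALIGN(start, sizeof(pattern));

	begin = __va(aligned);
	end = begin + size / sizeof(pattern);

	for (p = begin; p < end; p++)		/* fill the range ... */
		*p = pattern;

	for (p = begin; p < end; p++)		/* ... then read it back */
		if (*p != pattern)		/* corrupt word: keep it away from the allocators */
			memblock_reserve(__pa(p), sizeof(pattern));
}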
This patch set promotes memtest to live under generic mm umbrella and enables memtest feature for arm/arm64. It was reported that this patch set was useful for tracking down an issue with some errant DMA on an arm64 platform. This patch (of 6): There is nothing platform dependent in the core memtest code, so other platforms might benefit from this feature too. [linux@roeck-us.net: MEMTEST depends on MEMBLOCK] Signed-off-by: Vladimir Murzin Acked-by: Will Deacon Tested-by: Mark Rutland Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Catalin Marinas Cc: Russell King Cc: Paul Bolle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 11 ----------- arch/x86/include/asm/e820.h | 8 -------- arch/x86/mm/Makefile | 2 -- include/linux/memblock.h | 8 ++++++++ lib/Kconfig.debug | 12 ++++++++++++ mm/Makefile | 1 + {arch/x86/mm => mm}/memtest.c | 0 7 files changed, 21 insertions(+), 21 deletions(-) rename {arch/x86/mm => mm}/memtest.c (100%) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1f7f185934a5..d43e7e1c784b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -721,17 +721,6 @@ endif #HYPERVISOR_GUEST config NO_BOOTMEM def_bool y -config MEMTEST - bool "Memtest" - ---help--- - This option adds a kernel parameter 'memtest', which allows memtest - to be set. - memtest=0, mean disabled; -- default - memtest=1, mean do 1 test pattern; - ... - memtest=4, mean do 4 test patterns. - If you are unsure how to answer this question, answer N. - source "arch/x86/Kconfig.cpu" config HPET_TIMER diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 779c2efe2e97..3ab0537872fb 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -40,14 +40,6 @@ static inline void e820_mark_nosave_regions(unsigned long limit_pfn) } #endif -#ifdef CONFIG_MEMTEST -extern void early_memtest(unsigned long start, unsigned long end); -#else -static inline void early_memtest(unsigned long start, unsigned long end) -{ -} -#endif - extern unsigned long e820_end_of_ram_pfn(void); extern unsigned long e820_end_of_low_ram_pfn(void); extern u64 early_reserve_e820(u64 sizet, u64 align); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index c4cc74006c61..a482d105172b 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -32,6 +32,4 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o -obj-$(CONFIG_MEMTEST) += memtest.o - obj-$(CONFIG_X86_INTEL_MPX) += mpx.o diff --git a/include/linux/memblock.h b/include/linux/memblock.h index e8cc45307f8f..6724cb020f5e 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -365,6 +365,14 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo #define __initdata_memblock #endif +#ifdef CONFIG_MEMTEST +extern void early_memtest(unsigned long start, unsigned long end); +#else +static inline void early_memtest(unsigned long start, unsigned long end) +{ +} +#endif + #else static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align) { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 36b6fa88ce5b..5c7a3183423b 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1745,6 +1745,18 @@ config TEST_UDELAY If unsure, say N. +config MEMTEST + bool "Memtest" + depends on HAVE_MEMBLOCK + ---help--- + This option adds a kernel parameter 'memtest', which allows memtest + to be set. + memtest=0, mean disabled; -- default + memtest=1, mean do 1 test pattern; + ... 
+ memtest=4, mean do 4 test patterns. + If you are unsure how to answer this question, answer N. + source "samples/Kconfig" source "lib/Kconfig.kgdb" diff --git a/mm/Makefile b/mm/Makefile index 668a9bb82be4..98c4eaeabdcb 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -55,6 +55,7 @@ obj-$(CONFIG_KMEMCHECK) += kmemcheck.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o +obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o diff --git a/arch/x86/mm/memtest.c b/mm/memtest.c similarity index 100% rename from arch/x86/mm/memtest.c rename to mm/memtest.c From 7f70baeeb9e2d2b2a37a4bd3727d709547c4ae41 Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Tue, 14 Apr 2015 15:48:30 -0700 Subject: [PATCH 118/122] memtest: use phys_addr_t for physical addresses Since memtest might be used by other architectures pass input parameters as phys_addr_t instead of long to prevent overflow. Signed-off-by: Vladimir Murzin Acked-by: Will Deacon Tested-by: Mark Rutland Cc: "H. Peter Anvin" Cc: Catalin Marinas Cc: Ingo Molnar Cc: Russell King Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 4 ++-- mm/memtest.c | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 6724cb020f5e..9497ec7c77ea 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -366,9 +366,9 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo #endif #ifdef CONFIG_MEMTEST -extern void early_memtest(unsigned long start, unsigned long end); +extern void early_memtest(phys_addr_t start, phys_addr_t end); #else -static inline void early_memtest(unsigned long start, unsigned long end) +static inline void early_memtest(phys_addr_t start, phys_addr_t end) { } #endif diff --git a/mm/memtest.c b/mm/memtest.c index 1e9da795767a..1997d934b13b 100644 --- a/mm/memtest.c +++ b/mm/memtest.c @@ -29,7 +29,7 @@ static u64 patterns[] __initdata = { 0x7a6c7258554e494cULL, /* yeah ;-) */ }; -static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) +static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad) { printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n", (unsigned long long) pattern, @@ -38,11 +38,11 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) memblock_reserve(start_bad, end_bad - start_bad); } -static void __init memtest(u64 pattern, u64 start_phys, u64 size) +static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size) { u64 *p, *start, *end; - u64 start_bad, last_bad; - u64 start_phys_aligned; + phys_addr_t start_bad, last_bad; + phys_addr_t start_phys_aligned; const size_t incr = sizeof(pattern); start_phys_aligned = ALIGN(start_phys, incr); @@ -69,14 +69,14 @@ static void __init memtest(u64 pattern, u64 start_phys, u64 size) reserve_bad_mem(pattern, start_bad, last_bad + incr); } -static void __init do_one_pass(u64 pattern, u64 start, u64 end) +static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end) { u64 i; phys_addr_t this_start, this_end; for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) { - this_start = clamp_t(phys_addr_t, this_start, start, end); - this_end = clamp_t(phys_addr_t, this_end, start, end); + this_start 
= clamp(this_start, start, end); + this_end = clamp(this_end, start, end); if (this_start < this_end) { printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", (unsigned long long)this_start, @@ -102,7 +102,7 @@ static int __init parse_memtest(char *arg) early_param("memtest", parse_memtest); -void __init early_memtest(unsigned long start, unsigned long end) +void __init early_memtest(phys_addr_t start, phys_addr_t end) { unsigned int i; unsigned int idx = 0; From 36dd9086cb31613ace45e94c18a17241d3d0aac4 Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Tue, 14 Apr 2015 15:48:33 -0700 Subject: [PATCH 119/122] arm64: add support for memtest Add support for memtest command line option. Signed-off-by: Vladimir Murzin Acked-by: Will Deacon Tested-by: Mark Rutland Cc: Catalin Marinas Cc: Russell King Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/init.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index ae85da6307bb..597831bdddf3 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -190,6 +190,8 @@ void __init bootmem_init(void) min = PFN_UP(memblock_start_of_DRAM()); max = PFN_DOWN(memblock_end_of_DRAM()); + early_memtest(min << PAGE_SHIFT, max << PAGE_SHIFT); + /* * Sparsemem tries to allocate bootmem in memory_present(), so must be * done after the fixed reservations. From d30eae473360aea25e0584d4fbf6a70417d89784 Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Tue, 14 Apr 2015 15:48:37 -0700 Subject: [PATCH 120/122] arm: add support for memtest Add support for memtest command line option. Signed-off-by: Vladimir Murzin Acked-by: Will Deacon Cc: "H. Peter Anvin" Cc: Catalin Marinas Cc: Ingo Molnar Cc: Mark Rutland Cc: Russell King Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/init.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 1609b022a72f..3d0e9aed4b40 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -335,6 +335,9 @@ void __init bootmem_init(void) find_limits(&min, &max_low, &max_high); + early_memtest((phys_addr_t)min << PAGE_SHIFT, + (phys_addr_t)max_low << PAGE_SHIFT); + /* * Sparsemem tries to allocate bootmem in memory_present(), * so must be done after the fixed reservations From 8d8cfb47d67ef38f49f97fc0cd3cfe2b5dc8642e Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Tue, 14 Apr 2015 15:48:40 -0700 Subject: [PATCH 121/122] Kconfig: memtest: update number of test patterns up to 17 Additional test patterns for memtest were introduced since commit 63823126c221 ("x86: memtest: add additional (regular) test patterns"), but looks like Kconfig was not updated that time. Update Kconfig entry with the actual number of maximum test patterns. Signed-off-by: Vladimir Murzin Cc: "H. Peter Anvin" Cc: Catalin Marinas Cc: Ingo Molnar Cc: Mark Rutland Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 5c7a3183423b..15c9118f0389 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1754,7 +1754,7 @@ config MEMTEST memtest=0, mean disabled; -- default memtest=1, mean do 1 test pattern; ... - memtest=4, mean do 4 test patterns. + memtest=17, mean do 17 test patterns. If you are unsure how to answer this question, answer N. 
source "samples/Kconfig" From e4b0db72be2487bae0e3251c22f82c104f7c1cfd Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Tue, 14 Apr 2015 15:48:43 -0700 Subject: [PATCH 122/122] Documentation: update arch list in the 'memtest' entry Since arm64/arm support memtest command line option update the "memtest" entry. Signed-off-by: Vladimir Murzin Cc: "H. Peter Anvin" Cc: Catalin Marinas Cc: Ingo Molnar Cc: Mark Rutland Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index b1fa70907ccf..8cc635f4c4f0 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1989,7 +1989,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. seconds. Use this parameter to check at some other rate. 0 disables periodic checking. - memtest= [KNL,X86] Enable memtest + memtest= [KNL,X86,ARM] Enable memtest Format: default : 0 Specifies the number of memtest passes to be