From ac5be6b47e8bd25b62bed2c82cda7398999f59e9 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 22 Sep 2015 14:58:49 -0700 Subject: [PATCH 01/15] userfaultfd: revert "userfaultfd: waitqueue: add nr wake parameter to __wake_up_locked_key" This reverts commit 51360155eccb907ff8635bd10fc7de876408c2e0 and adapts fs/userfaultfd.c to use the old version of that function. It didn't look robust to call __wake_up_common with "nr == 1" when we absolutely require wakeall semantics, but we've full control of what we insert in the two waitqueue heads of the blocked userfaults. No exclusive waitqueue risks to be inserted into those two waitqueue heads so we can as well stick to "nr == 1" of the old code and we can rely purely on the fact no waitqueue inserted in one of the two waitqueue heads we must enforce as wakeall, has wait->flags WQ_FLAG_EXCLUSIVE set. Signed-off-by: Andrea Arcangeli Cc: Dr. David Alan Gilbert Cc: Michael Ellerman Cc: Shuah Khan Cc: Thierry Reding Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 8 ++++---- include/linux/wait.h | 5 ++--- kernel/sched/wait.c | 7 +++---- net/sunrpc/sched.c | 2 +- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f9aeb40a7197..50311703135b 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -467,8 +467,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file) * the fault_*wqh. 
*/ spin_lock(&ctx->fault_pending_wqh.lock); - __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, &range); - __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range); + __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range); + __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range); spin_unlock(&ctx->fault_pending_wqh.lock); wake_up_poll(&ctx->fd_wqh, POLLHUP); @@ -650,10 +650,10 @@ static void __wake_userfault(struct userfaultfd_ctx *ctx, spin_lock(&ctx->fault_pending_wqh.lock); /* wake all in the range and autoremove */ if (waitqueue_active(&ctx->fault_pending_wqh)) - __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, + __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, range); if (waitqueue_active(&ctx->fault_wqh)) - __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range); + __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, range); spin_unlock(&ctx->fault_pending_wqh.lock); } diff --git a/include/linux/wait.h b/include/linux/wait.h index d3d077228d4c..1e1bf9f963a9 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -147,8 +147,7 @@ __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) typedef int wait_bit_action_f(struct wait_bit_key *); void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr, - void *key); +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr); void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); @@ -180,7 +179,7 @@ wait_queue_head_t *bit_waitqueue(void *, int); #define wake_up_poll(x, m) \ __wake_up(x, TASK_NORMAL, 1, (void *) (m)) #define wake_up_locked_poll(x, m) \ - __wake_up_locked_key((x), TASK_NORMAL, 1, (void *) (m)) + __wake_up_locked_key((x), TASK_NORMAL, (void *) (m)) 
#define wake_up_interruptible_poll(x, m) \ __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m)) #define wake_up_interruptible_sync_poll(x, m) \ diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 272d9322bc5d..052e02672d12 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -106,10 +106,9 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) } EXPORT_SYMBOL_GPL(__wake_up_locked); -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr, - void *key) +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) { - __wake_up_common(q, mode, nr, 0, key); + __wake_up_common(q, mode, 1, 0, key); } EXPORT_SYMBOL_GPL(__wake_up_locked_key); @@ -284,7 +283,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, if (!list_empty(&wait->task_list)) list_del_init(&wait->task_list); else if (waitqueue_active(q)) - __wake_up_locked_key(q, mode, 1, key); + __wake_up_locked_key(q, mode, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(abort_exclusive_wait); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index b140c092d226..337ca851a350 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -297,7 +297,7 @@ static int rpc_complete_task(struct rpc_task *task) clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate); ret = atomic_dec_and_test(&task->tk_count); if (waitqueue_active(wq)) - __wake_up_locked_key(wq, TASK_NORMAL, 1, &k); + __wake_up_locked_key(wq, TASK_NORMAL, &k); spin_unlock_irqrestore(&wq->lock, flags); return ret; } From d0a871141d07929b559f5eae9c3fc4b63d16866b Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 22 Sep 2015 14:58:52 -0700 Subject: [PATCH 02/15] userfaultfd: selftests: vm: pick up sanitized kernel headers Add the usr/include subdirectory of the top-level tree to the include path, and make sure to include headers without relative paths to make sure the sanitized headers get picked up. 
Otherwise the compiler will not be able to find the linux/compiler.h header included by the non-sanitized include/uapi/linux/userfaultfd.h. While at it, make sure to only hardcode the syscall numbers on x86 and PowerPC if they haven't been properly picked up from the headers. Signed-off-by: Thierry Reding Acked-by: Michael Ellerman Cc: Shuah Khan Signed-off-by: Andrea Arcangeli Cc: Dr. David Alan Gilbert Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/Makefile | 2 +- tools/testing/selftests/vm/userfaultfd.c | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index d36fab7d8ebd..949e275f11a6 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -1,6 +1,6 @@ # Makefile for vm selftests -CFLAGS = -Wall +CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS) BINARIES = compaction_test BINARIES += hugepage-mmap BINARIES += hugepage-shm diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 7c1d958857d2..59d145f20918 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -64,8 +64,9 @@ #include #include #include -#include "../../../../include/uapi/linux/userfaultfd.h" +#include +#ifndef __NR_userfaultfd #ifdef __x86_64__ #define __NR_userfaultfd 323 #elif defined(__i386__) @@ -77,6 +78,7 @@ #else #error "missing __NR_userfaultfd definition" #endif +#endif static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; From 67f6a029b2ccf3399783a0ff2f812666f290d94f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 22 Sep 2015 14:58:55 -0700 Subject: [PATCH 03/15] userfaultfd: selftest: headers fixup Depend on "make headers_install" to create proper headers to include and provide syscall numbers. Signed-off-by: Andrea Arcangeli Cc: Dr. 
David Alan Gilbert Cc: Michael Ellerman Cc: Shuah Khan Cc: Thierry Reding Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/Makefile | 7 +++++-- tools/testing/selftests/vm/userfaultfd.c | 10 ---------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 949e275f11a6..3c53cac15de1 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -12,8 +12,11 @@ BINARIES += userfaultfd all: $(BINARIES) %: %.c $(CC) $(CFLAGS) -o $@ $^ -lrt -userfaultfd: userfaultfd.c - $(CC) $(CFLAGS) -O2 -o $@ $^ -lpthread +userfaultfd: userfaultfd.c ../../../../usr/include/linux/kernel.h + $(CC) $(CFLAGS) -O2 -o $@ $< -lpthread + +../../../../usr/include/linux/kernel.h: + make -C ../../../.. headers_install TEST_PROGS := run_vmtests TEST_FILES := $(BINARIES) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 59d145f20918..a9e0b9143f06 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -67,18 +67,8 @@ #include #ifndef __NR_userfaultfd -#ifdef __x86_64__ -#define __NR_userfaultfd 323 -#elif defined(__i386__) -#define __NR_userfaultfd 374 -#elif defined(__powewrpc__) -#define __NR_userfaultfd 364 -#elif defined(__s390__) -#define __NR_userfaultfd 355 -#else #error "missing __NR_userfaultfd definition" #endif -#endif static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; From 56ed8f169e225dce1f9e40f6eee2e2dabe7d06fc Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 22 Sep 2015 14:58:58 -0700 Subject: [PATCH 04/15] userfaultfd: selftest: only warn if __NR_userfaultfd is undefined If __NR_userfaultfd is not yet defined by the arch, warn but still build and run the userfaultfd selftest successfully. Signed-off-by: Michael Ellerman Signed-off-by: Andrea Arcangeli Cc: Dr. 
David Alan Gilbert Cc: Shuah Khan Cc: Thierry Reding Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index a9e0b9143f06..0671ae1d1052 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -66,9 +66,7 @@ #include #include -#ifndef __NR_userfaultfd -#error "missing __NR_userfaultfd definition" -#endif +#ifdef __NR_userfaultfd static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; @@ -631,3 +629,15 @@ int main(int argc, char **argv) nr_pages, nr_pages_per_cpu); return userfaultfd_stress(); } + +#else /* __NR_userfaultfd */ + +#warning "missing __NR_userfaultfd definition" + +int main(void) +{ + printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n"); + return 0; +} + +#endif /* __NR_userfaultfd */ From 1f5fee2cf232f9fac05b65f21107d2cf3c32092c Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 22 Sep 2015 14:59:00 -0700 Subject: [PATCH 05/15] userfaultfd: selftest: avoid my_bcmp false positives with powerpc Keep a non-zero placeholder after the count, for the my_bcmp comparison of the page against the zeropage. The lockless increment between 255 to 256 against a lockless my_bcmp could otherwise return false positives on ppc32le. Signed-off-by: Andrea Arcangeli Tested-by: Michael Ellerman Cc: Dr. 
David Alan Gilbert Cc: Shuah Khan Cc: Thierry Reding Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 0671ae1d1052..10897092823d 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -465,6 +465,14 @@ static int userfaultfd_stress(void) *area_mutex(area_src, nr) = (pthread_mutex_t) PTHREAD_MUTEX_INITIALIZER; count_verify[nr] = *area_count(area_src, nr) = 1; + /* + * In the transition between 255 to 256, powerpc will + * read out of order in my_bcmp and see both bytes as + * zero, so leave a placeholder below always non-zero + * after the count, to avoid my_bcmp to trigger false + * positives. + */ + *(area_count(area_src, nr) + 1) = 1; } pipefd = malloc(sizeof(int) * nr_cpus * 2); @@ -610,8 +618,8 @@ int main(int argc, char **argv) fprintf(stderr, "Usage: \n"), exit(1); nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); page_size = sysconf(_SC_PAGE_SIZE); - if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) > - page_size) + if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 + > page_size) fprintf(stderr, "Impossible to run this test\n"), exit(2); nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size / nr_cpus; From a5932bf5737f0b5caf6deaa92b062e4fe66cf5b2 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 22 Sep 2015 14:59:03 -0700 Subject: [PATCH 06/15] userfaultfd: selftest: return an error if BOUNCE_VERIFY fails This will report the error in the exit code, in addition to the fprintf. Signed-off-by: Andrea Arcangeli Cc: Dr. 
David Alan Gilbert Cc: Michael Ellerman Cc: Shuah Khan Cc: Thierry Reding Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 10897092823d..174f2fc8d257 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -422,7 +422,7 @@ static int userfaultfd_stress(void) struct uffdio_register uffdio_register; struct uffdio_api uffdio_api; unsigned long cpu; - int uffd_flags; + int uffd_flags, err; unsigned long userfaults[nr_cpus]; if (posix_memalign(&area, page_size, nr_pages * page_size)) { @@ -499,6 +499,7 @@ static int userfaultfd_stress(void) pthread_attr_init(&attr); pthread_attr_setstacksize(&attr, 16*1024*1024); + err = 0; while (bounces--) { unsigned long expected_ioctls; @@ -583,8 +584,9 @@ static int userfaultfd_stress(void) area_dst + nr * page_size, sizeof(pthread_mutex_t))) { fprintf(stderr, - "error mutex 2 %lu\n", + "error mutex %lu\n", nr); + err = 1; bounces = 0; } if (*area_count(area_dst, nr) != count_verify[nr]) { @@ -593,6 +595,7 @@ static int userfaultfd_stress(void) *area_count(area_src, nr), count_verify[nr], nr); + err = 1; bounces = 0; } } @@ -609,7 +612,7 @@ static int userfaultfd_stress(void) printf("\n"); } - return 0; + return err; } int main(int argc, char **argv) From 5dd01be14565df814408327971775f36e55bf5e3 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 22 Sep 2015 14:59:06 -0700 Subject: [PATCH 07/15] userfaultfd: selftest: don't error out if pthread_mutex_t isn't identical On ppc big endian this check fails, the mutex doesn't necessarily need to be identical for all pages after pthread_mutex_lock/unlock cycles. The count verification (outside of the pthread_mutex_t structure) suffices and that is retained. Signed-off-by: Andrea Arcangeli Cc: Dr. 
David Alan Gilbert Cc: Michael Ellerman Cc: Shuah Khan Cc: Thierry Reding Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 174f2fc8d257..d77ed41b2094 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -580,15 +580,6 @@ static int userfaultfd_stress(void) /* verification */ if (bounces & BOUNCE_VERIFY) { for (nr = 0; nr < nr_pages; nr++) { - if (my_bcmp(area_dst, - area_dst + nr * page_size, - sizeof(pthread_mutex_t))) { - fprintf(stderr, - "error mutex %lu\n", - nr); - err = 1; - bounces = 0; - } if (*area_count(area_dst, nr) != count_verify[nr]) { fprintf(stderr, "error area_count %Lu %Lu %lu\n", From 09f7298100ea9767324298ab0c7979f6d7463183 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Tue, 22 Sep 2015 14:59:09 -0700 Subject: [PATCH 08/15] userfaultfd: register uapi generic syscall (aarch64) Add the userfaultfd syscalls to uapi asm-generic, it was tested with postcopy live migration on aarch64 with both 4k and 64k pagesize kernels. Signed-off-by: Dr. 
David Alan Gilbert Signed-off-by: Andrea Arcangeli Cc: Michael Ellerman Cc: Shuah Khan Cc: Thierry Reding Cc: Mathieu Desnoyers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/asm-generic/unistd.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 8da542a2874d..ee124009e12a 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -709,17 +709,19 @@ __SYSCALL(__NR_memfd_create, sys_memfd_create) __SYSCALL(__NR_bpf, sys_bpf) #define __NR_execveat 281 __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat) -#define __NR_membarrier 282 +#define __NR_userfaultfd 282 +__SYSCALL(__NR_userfaultfd, sys_userfaultfd) +#define __NR_membarrier 283 __SYSCALL(__NR_membarrier, sys_membarrier) #undef __NR_syscalls -#define __NR_syscalls 283 +#define __NR_syscalls 284 /* * All syscalls below here should go away really, * these are provided for both review and as a porting * help for the C library version. -* + * * Last chance: are any of these important enough to * enable by default? */ From 8a04446ab0cf4f35d9f583cd6adcbf7c534e4995 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 22 Sep 2015 14:59:12 -0700 Subject: [PATCH 09/15] mm, dax: VMA with vm_ops->pfn_mkwrite wants to be write-notified For VM_PFNMAP and VM_MIXEDMAP we use vm_ops->pfn_mkwrite instead of vm_ops->page_mkwrite to notify about write access. This means we want vma->vm_page_prot to be write-protected if the VMA provides this vm_ops. A theoretical scenario that will cause these missed events is: On writable mapping with vm_ops->pfn_mkwrite, but without vm_ops->page_mkwrite: read fault followed by write access to the pfn. Writable pte will be set up on read fault and write fault will not be generated. 
I found it examining Dave's complaint on generic/080: http://lkml.kernel.org/g/20150831233803.GO3902@dastard Although I don't think it's the reason. It shouldn't be a problem for ext2/ext4 as they provide both pfn_mkwrite and page_mkwrite. [akpm@linux-foundation.org: add local vm_ops to avoid 80-cols mess] Signed-off-by: Kirill A. Shutemov Cc: Yigal Korman Acked-by: Boaz Harrosh Cc: Matthew Wilcox Cc: Jan Kara Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index c739d6db7193..79bcc9f92e48 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1490,13 +1490,14 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) int vma_wants_writenotify(struct vm_area_struct *vma) { vm_flags_t vm_flags = vma->vm_flags; + const struct vm_operations_struct *vm_ops = vma->vm_ops; /* If it was private or non-writable, the write bit is already clear */ if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) return 0; /* The backer wishes to know when pages are first written to? */ - if (vma->vm_ops && vma->vm_ops->page_mkwrite) + if (vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite)) return 1; /* The open routine did something to the protections that pgprot_modify From 3aaa76e125c1dd58c9b599baa8c6021896874c12 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Tue, 22 Sep 2015 14:59:14 -0700 Subject: [PATCH 10/15] mm: migrate: hugetlb: putback destination hugepage to active list Since commit bcc54222309c ("mm: hugetlb: introduce page_huge_active") each hugetlb page maintains its active flag to avoid a race condition between multiple calls of isolate_huge_page(), but current kernel doesn't set the flag on a hugepage allocated by migration because the proper putback routine isn't called. This means that users could still encounter the race referred to by bcc54222309c in this special case, so this patch fixes it. 
Fixes: bcc54222309c ("mm: hugetlb: introduce page_huge_active") Signed-off-by: Naoya Horiguchi Cc: Michal Hocko Cc: Andi Kleen Cc: Hugh Dickins Cc: [4.1.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index c3cb566af3e2..7452a00bbb50 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1075,7 +1075,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (rc != MIGRATEPAGE_SUCCESS && put_new_page) put_new_page(new_hpage, private); else - put_page(new_hpage); + putback_active_hugepage(new_hpage); if (result) { if (rc) From 769a8089c1fd2fe94c13e66fe6e03d7820953ee3 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 22 Sep 2015 14:59:17 -0700 Subject: [PATCH 11/15] x86, efi, kasan: #undef memset/memcpy/memmove per arch In not-instrumented code KASAN replaces instrumented memset/memcpy/memmove with not-instrumented analogues __memset/__memcpy/__memmove. However, on x86 the EFI stub is not linked with the kernel. It uses not-instrumented mem*() functions from arch/x86/boot/compressed/string.c So we don't replace them with __mem*() variants in EFI stub. On ARM64 the EFI stub is linked with the kernel, so we should replace mem*() functions with __mem*(), because the EFI stub runs before KASAN sets up early shadow. So let's move these #undef mem* into arch's asm/efi.h which is also included by the EFI stub. Also, this will fix the warning in 32-bit build reported by kbuild test robot: efi-stub-helper.c:599:2: warning: implicit declaration of function 'memcpy' [akpm@linux-foundation.org: use 80 cols in comment] Signed-off-by: Andrey Ryabinin Reported-by: Fengguang Wu Cc: Will Deacon Cc: Catalin Marinas Cc: Matt Fleming Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/efi.h | 10 ++++++++++ drivers/firmware/efi/libstub/efistub.h | 4 ---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 155162ea0e00..ab5f1d447ef9 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -86,6 +86,16 @@ extern u64 asmlinkage efi_call(void *fp, ...); extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size, u32 type, u64 attribute); +/* + * CONFIG_KASAN may redefine memset to __memset. __memset function is present + * only in kernel binary. Since the EFI stub linked into a separate binary it + * doesn't have __memset(). So we should use standard memset from + * arch/x86/boot/compressed/string.c. The same applies to memcpy and memmove. + */ +#undef memcpy +#undef memset +#undef memmove + #endif /* CONFIG_X86_32 */ extern struct efi_scratch efi_scratch; diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index e334a01cf92f..6b6548fda089 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -5,10 +5,6 @@ /* error code which can't be mistaken for valid address */ #define EFI_ERROR (~0UL) -#undef memcpy -#undef memset -#undef memmove - void efi_char16_printk(efi_system_table_t *, efi_char16_t *); efi_status_t efi_open_volume(efi_system_table_t *sys_table_arg, void *__image, From d046b770c9fc36ccb19c27afdb8322220108cbc7 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Tue, 22 Sep 2015 14:59:20 -0700 Subject: [PATCH 12/15] lib/iommu-common.c: do not try to deref a null iommu->lazy_flush() pointer when n < pool->hint The check for invoking iommu->lazy_flush() from iommu_tbl_range_alloc() has to be refactored so that we only call ->lazy_flush() if it is non-null. 
I had a sparc kernel that was crashing when I was trying to process some very large perf.data files- the crash happens when the scsi driver calls into dma_4v_map_sg and thus the iommu_tbl_range_alloc(). Signed-off-by: Sowmini Varadhan Cc: Benjamin Herrenschmidt Cc: Guenter Roeck Cc: David S. Miller Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/iommu-common.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/iommu-common.c b/lib/iommu-common.c index ff19f66d3f7f..b1c93e94ca7a 100644 --- a/lib/iommu-common.c +++ b/lib/iommu-common.c @@ -21,8 +21,7 @@ static DEFINE_PER_CPU(unsigned int, iommu_hash_common); static inline bool need_flush(struct iommu_map_table *iommu) { - return (iommu->lazy_flush != NULL && - (iommu->flags & IOMMU_NEED_FLUSH) != 0); + return ((iommu->flags & IOMMU_NEED_FLUSH) != 0); } static inline void set_flush(struct iommu_map_table *iommu) @@ -211,7 +210,8 @@ unsigned long iommu_tbl_range_alloc(struct device *dev, goto bail; } } - if (n < pool->hint || need_flush(iommu)) { + if (iommu->lazy_flush && + (n < pool->hint || need_flush(iommu))) { clear_flush(iommu); iommu->lazy_flush(iommu); } From d5028f9f7d8de5c375c52b98976b6f310e73398f Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 22 Sep 2015 14:59:20 -0700 Subject: [PATCH 13/15] vmscan: fix sane_reclaim helper for legacy memcg The sane_reclaim() helper is supposed to return false for memcg reclaim if the legacy hierarchy is used, because the latter lacks dirty throttling mechanism, and so it did before it was accidentally broken by commit 33398cf2f360c ("memcg: export struct mem_cgroup"). Fix it. 
Fixes: 33398cf2f360c ("memcg: export struct mem_cgroup") Signed-off-by: Vladimir Davydov Acked-by: Tejun Heo Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 2d978b28a410..7f63a9381f71 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -175,7 +175,7 @@ static bool sane_reclaim(struct scan_control *sc) if (!memcg) return true; #ifdef CONFIG_CGROUP_WRITEBACK - if (memcg->css.cgroup) + if (cgroup_on_dfl(memcg->css.cgroup)) return true; #endif return false; From 7a07b503bf249986a1eeef0351d66cac0d8bf721 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 22 Sep 2015 14:59:20 -0700 Subject: [PATCH 14/15] membarrier: clean up selftest We don't need to specify an explicit rule in the Makefile, the implicit one will do the same. The "__EXPORTED_HEADERS__" define is not needed, because we build the test against the installed kernel headers, not the in-tree kernel headers. Re-use "$(TEST_PROGS)" in the clean target rather than spelling the executable name twice. Include rather than the rather specific . Include rather than . In both cases, the former header is located in a standard location and includes the latter. 
Signed-off-by: Mathieu Desnoyers Acked-by: Michael Ellerman Cc: Pranith Kumar Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/membarrier/Makefile | 7 +++---- tools/testing/selftests/membarrier/membarrier_test.c | 5 +---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/membarrier/Makefile b/tools/testing/selftests/membarrier/Makefile index 877a50355d7f..a1a97085847d 100644 --- a/tools/testing/selftests/membarrier/Makefile +++ b/tools/testing/selftests/membarrier/Makefile @@ -1,11 +1,10 @@ CFLAGS += -g -I../../../../usr/include/ -all: - $(CC) $(CFLAGS) membarrier_test.c -o membarrier_test - TEST_PROGS := membarrier_test +all: $(TEST_PROGS) + include ../lib.mk clean: - $(RM) membarrier_test + $(RM) $(TEST_PROGS) diff --git a/tools/testing/selftests/membarrier/membarrier_test.c b/tools/testing/selftests/membarrier/membarrier_test.c index dde312508007..535f0fef4d0b 100644 --- a/tools/testing/selftests/membarrier/membarrier_test.c +++ b/tools/testing/selftests/membarrier/membarrier_test.c @@ -1,9 +1,6 @@ #define _GNU_SOURCE -#define __EXPORTED_HEADERS__ - #include -#include -#include +#include #include #include #include From 012572d4fc2e4ddd5c8ec8614d51414ec6cae02a Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 22 Sep 2015 14:59:20 -0700 Subject: [PATCH 15/15] ocfs2/dlm: fix deadlock when dispatch assert master The order of the following three spinlocks should be: dlm_domain_lock < dlm_ctxt->spinlock < dlm_lock_resource->spinlock But dlm_dispatch_assert_master() is called while holding dlm_ctxt->spinlock and dlm_lock_resource->spinlock, and then it calls dlm_grab() which will take dlm_domain_lock. Once another thread (for example, dlm_query_join_handler) has already taken dlm_domain_lock, and tries to take dlm_ctxt->spinlock, a deadlock happens. 
Signed-off-by: Joseph Qi Cc: Joel Becker Cc: Mark Fasheh Cc: "Junxiao Bi" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmmaster.c | 9 ++++++--- fs/ocfs2/dlm/dlmrecovery.c | 8 ++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 46b8b2bbc95a..ee5aa4daaea0 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1439,6 +1439,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, int found, ret; int set_maybe; int dispatch_assert = 0; + int dispatched = 0; if (!dlm_grab(dlm)) return DLM_MASTER_RESP_NO; @@ -1658,15 +1659,18 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, mlog(ML_ERROR, "failed to dispatch assert master work\n"); response = DLM_MASTER_RESP_ERROR; dlm_lockres_put(res); - } else + } else { + dispatched = 1; __dlm_lockres_grab_inflight_worker(dlm, res); + } spin_unlock(&res->spinlock); } else { if (res) dlm_lockres_put(res); } - dlm_put(dlm); + if (!dispatched) + dlm_put(dlm); return response; } @@ -2090,7 +2094,6 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, /* queue up work for dlm_assert_master_worker */ - dlm_grab(dlm); /* get an extra ref for the work item */ dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL); item->u.am.lockres = res; /* already have a ref */ /* can optionally ignore node numbers higher than this node */ diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index ce12e0b1a31f..3d90ad7ff91f 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1694,6 +1694,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, unsigned int hash; int master = DLM_LOCK_RES_OWNER_UNKNOWN; u32 flags = DLM_ASSERT_MASTER_REQUERY; + int dispatched = 0; if (!dlm_grab(dlm)) { /* since the domain has gone away on this @@ -1719,8 +1720,10 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 
len, void *data, dlm_put(dlm); /* sender will take care of this and retry */ return ret; - } else + } else { + dispatched = 1; __dlm_lockres_grab_inflight_worker(dlm, res); + } spin_unlock(&res->spinlock); } else { /* put.. incase we are not the master */ @@ -1730,7 +1733,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, } spin_unlock(&dlm->spinlock); - dlm_put(dlm); + if (!dispatched) + dlm_put(dlm); return master; }