From ac5be6b47e8bd25b62bed2c82cda7398999f59e9 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Tue, 22 Sep 2015 14:58:49 -0700
Subject: [PATCH 01/15] userfaultfd: revert "userfaultfd: waitqueue: add nr
 wake parameter to __wake_up_locked_key"

This reverts commit 51360155eccb907ff8635bd10fc7de876408c2e0 and adapts
fs/userfaultfd.c to use the old version of that function.

It didn't look robust to call __wake_up_common with "nr == 1" when we
absolutely require wakeall semantics, but we have full control of what
we insert in the two waitqueue heads of the blocked userfaults.  There
is no risk of an exclusive waitqueue being inserted into those two
waitqueue heads, so we can just as well stick to the "nr == 1" of the
old code and rely purely on the fact that no waitqueue inserted in
either of the two waitqueue heads, for which we must enforce wakeall
semantics, has WQ_FLAG_EXCLUSIVE set in wait->flags.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Thierry Reding <treding@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c     | 8 ++++----
 include/linux/wait.h | 5 ++---
 kernel/sched/wait.c  | 7 +++----
 net/sunrpc/sched.c   | 2 +-
 4 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index f9aeb40a7197..50311703135b 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -467,8 +467,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
 	 * the fault_*wqh.
 	 */
 	spin_lock(&ctx->fault_pending_wqh.lock);
-	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, &range);
-	__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range);
+	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
+	__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range);
 	spin_unlock(&ctx->fault_pending_wqh.lock);
 
 	wake_up_poll(&ctx->fd_wqh, POLLHUP);
@@ -650,10 +650,10 @@ static void __wake_userfault(struct userfaultfd_ctx *ctx,
 	spin_lock(&ctx->fault_pending_wqh.lock);
 	/* wake all in the range and autoremove */
 	if (waitqueue_active(&ctx->fault_pending_wqh))
-		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0,
+		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
 				     range);
 	if (waitqueue_active(&ctx->fault_wqh))
-		__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range);
+		__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, range);
 	spin_unlock(&ctx->fault_pending_wqh.lock);
 }
 
diff --git a/include/linux/wait.h b/include/linux/wait.h
index d3d077228d4c..1e1bf9f963a9 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -147,8 +147,7 @@ __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old)
 
 typedef int wait_bit_action_f(struct wait_bit_key *);
 void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
-			  void *key);
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr);
 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
@@ -180,7 +179,7 @@ wait_queue_head_t *bit_waitqueue(void *, int);
 #define wake_up_poll(x, m)						\
 	__wake_up(x, TASK_NORMAL, 1, (void *) (m))
 #define wake_up_locked_poll(x, m)					\
-	__wake_up_locked_key((x), TASK_NORMAL, 1, (void *) (m))
+	__wake_up_locked_key((x), TASK_NORMAL, (void *) (m))
 #define wake_up_interruptible_poll(x, m)				\
 	__wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m))
 #define wake_up_interruptible_sync_poll(x, m)				\
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 272d9322bc5d..052e02672d12 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -106,10 +106,9 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked);
 
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
-			  void *key)
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
 {
-	__wake_up_common(q, mode, nr, 0, key);
+	__wake_up_common(q, mode, 1, 0, key);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
@@ -284,7 +283,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
 	if (!list_empty(&wait->task_list))
 		list_del_init(&wait->task_list);
 	else if (waitqueue_active(q))
-		__wake_up_locked_key(q, mode, 1, key);
+		__wake_up_locked_key(q, mode, key);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL(abort_exclusive_wait);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index b140c092d226..337ca851a350 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -297,7 +297,7 @@ static int rpc_complete_task(struct rpc_task *task)
 	clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate);
 	ret = atomic_dec_and_test(&task->tk_count);
 	if (waitqueue_active(wq))
-		__wake_up_locked_key(wq, TASK_NORMAL, 1, &k);
+		__wake_up_locked_key(wq, TASK_NORMAL, &k);
 	spin_unlock_irqrestore(&wq->lock, flags);
 	return ret;
 }

From d0a871141d07929b559f5eae9c3fc4b63d16866b Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Tue, 22 Sep 2015 14:58:52 -0700
Subject: [PATCH 02/15] userfaultfd: selftests: vm: pick up sanitized kernel
 headers

Add the usr/include subdirectory of the top-level tree to the include
path, and make sure to include headers without relative paths to make
sure the sanitized headers get picked up.  Otherwise the compiler will
not be able to find the linux/compiler.h header included by the non-
sanitized include/uapi/linux/userfaultfd.h.

While at it, make sure to only hardcode the syscall numbers on x86 and
PowerPC if they haven't been properly picked up from the headers.

Signed-off-by: Thierry Reding <treding@nvidia.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/vm/Makefile      | 2 +-
 tools/testing/selftests/vm/userfaultfd.c | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index d36fab7d8ebd..949e275f11a6 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -1,6 +1,6 @@
 # Makefile for vm selftests
 
-CFLAGS = -Wall
+CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS)
 BINARIES = compaction_test
 BINARIES += hugepage-mmap
 BINARIES += hugepage-shm
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 7c1d958857d2..59d145f20918 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -64,8 +64,9 @@
 #include <sys/syscall.h>
 #include <sys/ioctl.h>
 #include <pthread.h>
-#include "../../../../include/uapi/linux/userfaultfd.h"
+#include <linux/userfaultfd.h>
 
+#ifndef __NR_userfaultfd
 #ifdef __x86_64__
 #define __NR_userfaultfd 323
 #elif defined(__i386__)
@@ -77,6 +78,7 @@
 #else
 #error "missing __NR_userfaultfd definition"
 #endif
+#endif
 
 static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
 

From 67f6a029b2ccf3399783a0ff2f812666f290d94f Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Tue, 22 Sep 2015 14:58:55 -0700
Subject: [PATCH 03/15] userfaultfd: selftest: headers fixup

Depend on "make headers_install" to create proper headers to include and
provide syscall numbers.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Thierry Reding <treding@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/vm/Makefile      |  7 +++++--
 tools/testing/selftests/vm/userfaultfd.c | 10 ----------
 2 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 949e275f11a6..3c53cac15de1 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -12,8 +12,11 @@ BINARIES += userfaultfd
 all: $(BINARIES)
 %: %.c
 	$(CC) $(CFLAGS) -o $@ $^ -lrt
-userfaultfd: userfaultfd.c
-	$(CC) $(CFLAGS) -O2 -o $@ $^ -lpthread
+userfaultfd: userfaultfd.c ../../../../usr/include/linux/kernel.h
+	$(CC) $(CFLAGS) -O2 -o $@ $< -lpthread
+
+../../../../usr/include/linux/kernel.h:
+	make -C ../../../.. headers_install
 
 TEST_PROGS := run_vmtests
 TEST_FILES := $(BINARIES)
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 59d145f20918..a9e0b9143f06 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -67,18 +67,8 @@
 #include <linux/userfaultfd.h>
 
 #ifndef __NR_userfaultfd
-#ifdef __x86_64__
-#define __NR_userfaultfd 323
-#elif defined(__i386__)
-#define __NR_userfaultfd 374
-#elif defined(__powewrpc__)
-#define __NR_userfaultfd 364
-#elif defined(__s390__)
-#define __NR_userfaultfd 355
-#else
 #error "missing __NR_userfaultfd definition"
 #endif
-#endif
 
 static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
 

From 56ed8f169e225dce1f9e40f6eee2e2dabe7d06fc Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Tue, 22 Sep 2015 14:58:58 -0700
Subject: [PATCH 04/15] userfaultfd: selftest: only warn if __NR_userfaultfd is
 undefined

If __NR_userfaultfd is not yet defined by the arch, warn but still build
and run the userfaultfd selftest successfully.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Thierry Reding <treding@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/vm/userfaultfd.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index a9e0b9143f06..0671ae1d1052 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -66,9 +66,7 @@
 #include <pthread.h>
 #include <linux/userfaultfd.h>
 
-#ifndef __NR_userfaultfd
-#error "missing __NR_userfaultfd definition"
-#endif
+#ifdef __NR_userfaultfd
 
 static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
 
@@ -631,3 +629,15 @@ int main(int argc, char **argv)
 	       nr_pages, nr_pages_per_cpu);
 	return userfaultfd_stress();
 }
+
+#else /* __NR_userfaultfd */
+
+#warning "missing __NR_userfaultfd definition"
+
+int main(void)
+{
+	printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
+	return 0;
+}
+
+#endif /* __NR_userfaultfd */

From 1f5fee2cf232f9fac05b65f21107d2cf3c32092c Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Tue, 22 Sep 2015 14:59:00 -0700
Subject: [PATCH 05/15] userfaultfd: selftest: avoid my_bcmp false positives
 with powerpc

Keep a non-zero placeholder after the count, for the my_bcmp comparison
of the page against the zeropage.  The lockless increment from 255 to
256 racing against a lockless my_bcmp could otherwise return false
positives on ppc32le.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Tested-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Thierry Reding <treding@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/vm/userfaultfd.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 0671ae1d1052..10897092823d 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -465,6 +465,14 @@ static int userfaultfd_stress(void)
 		*area_mutex(area_src, nr) = (pthread_mutex_t)
 			PTHREAD_MUTEX_INITIALIZER;
 		count_verify[nr] = *area_count(area_src, nr) = 1;
+		/*
+		 * In the transition from 255 to 256, powerpc will
+		 * read out of order in my_bcmp and see both bytes as
+		 * zero, so always leave a non-zero placeholder right
+		 * after the count, to keep my_bcmp from triggering
+		 * false positives.
+		 */
+		*(area_count(area_src, nr) + 1) = 1;
 	}
 
 	pipefd = malloc(sizeof(int) * nr_cpus * 2);
@@ -610,8 +618,8 @@ int main(int argc, char **argv)
 		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
 	page_size = sysconf(_SC_PAGE_SIZE);
-	if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) >
-	    page_size)
+	if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
+	    > page_size)
 		fprintf(stderr, "Impossible to run this test\n"), exit(2);
 	nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size /
 		nr_cpus;

From a5932bf5737f0b5caf6deaa92b062e4fe66cf5b2 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Tue, 22 Sep 2015 14:59:03 -0700
Subject: [PATCH 06/15] userfaultfd: selftest: return an error if BOUNCE_VERIFY
 fails

This will report the error in the exit code, in addition to the fprintf.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Thierry Reding <treding@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/vm/userfaultfd.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 10897092823d..174f2fc8d257 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -422,7 +422,7 @@ static int userfaultfd_stress(void)
 	struct uffdio_register uffdio_register;
 	struct uffdio_api uffdio_api;
 	unsigned long cpu;
-	int uffd_flags;
+	int uffd_flags, err;
 	unsigned long userfaults[nr_cpus];
 
 	if (posix_memalign(&area, page_size, nr_pages * page_size)) {
@@ -499,6 +499,7 @@ static int userfaultfd_stress(void)
 	pthread_attr_init(&attr);
 	pthread_attr_setstacksize(&attr, 16*1024*1024);
 
+	err = 0;
 	while (bounces--) {
 		unsigned long expected_ioctls;
 
@@ -583,8 +584,9 @@ static int userfaultfd_stress(void)
 					    area_dst + nr * page_size,
 					    sizeof(pthread_mutex_t))) {
 					fprintf(stderr,
-						"error mutex 2 %lu\n",
+						"error mutex %lu\n",
 						nr);
+					err = 1;
 					bounces = 0;
 				}
 				if (*area_count(area_dst, nr) != count_verify[nr]) {
@@ -593,6 +595,7 @@ static int userfaultfd_stress(void)
 						*area_count(area_src, nr),
 						count_verify[nr],
 						nr);
+					err = 1;
 					bounces = 0;
 				}
 			}
@@ -609,7 +612,7 @@ static int userfaultfd_stress(void)
 		printf("\n");
 	}
 
-	return 0;
+	return err;
 }
 
 int main(int argc, char **argv)

From 5dd01be14565df814408327971775f36e55bf5e3 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Tue, 22 Sep 2015 14:59:06 -0700
Subject: [PATCH 07/15] userfaultfd: selftest: don't error out if
 pthread_mutex_t isn't identical

On ppc big endian this check fails, the mutex doesn't necessarily need
to be identical for all pages after pthread_mutex_lock/unlock cycles.
The count verification (outside of the pthread_mutex_t structure)
suffices and that is retained.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Thierry Reding <treding@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/vm/userfaultfd.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 174f2fc8d257..d77ed41b2094 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -580,15 +580,6 @@ static int userfaultfd_stress(void)
 		/* verification */
 		if (bounces & BOUNCE_VERIFY) {
 			for (nr = 0; nr < nr_pages; nr++) {
-				if (my_bcmp(area_dst,
-					    area_dst + nr * page_size,
-					    sizeof(pthread_mutex_t))) {
-					fprintf(stderr,
-						"error mutex %lu\n",
-						nr);
-					err = 1;
-					bounces = 0;
-				}
 				if (*area_count(area_dst, nr) != count_verify[nr]) {
 					fprintf(stderr,
 						"error area_count %Lu %Lu %lu\n",

From 09f7298100ea9767324298ab0c7979f6d7463183 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Tue, 22 Sep 2015 14:59:09 -0700
Subject: [PATCH 08/15] userfaultfd: register uapi generic syscall (aarch64)

Add the userfaultfd syscall to uapi asm-generic.  It was tested with
postcopy live migration on aarch64 with both 4k and 64k pagesize
kernels.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Thierry Reding <treding@nvidia.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/asm-generic/unistd.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 8da542a2874d..ee124009e12a 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -709,17 +709,19 @@ __SYSCALL(__NR_memfd_create, sys_memfd_create)
 __SYSCALL(__NR_bpf, sys_bpf)
 #define __NR_execveat 281
 __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
-#define __NR_membarrier 282
+#define __NR_userfaultfd 282
+__SYSCALL(__NR_userfaultfd, sys_userfaultfd)
+#define __NR_membarrier 283
 __SYSCALL(__NR_membarrier, sys_membarrier)
 
 #undef __NR_syscalls
-#define __NR_syscalls 283
+#define __NR_syscalls 284
 
 /*
  * All syscalls below here should go away really,
  * these are provided for both review and as a porting
  * help for the C library version.
-*
+ *
  * Last chance: are any of these important enough to
  * enable by default?
  */

From 8a04446ab0cf4f35d9f583cd6adcbf7c534e4995 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Tue, 22 Sep 2015 14:59:12 -0700
Subject: [PATCH 09/15] mm, dax: VMA with vm_ops->pfn_mkwrite wants to be
 write-notified

For VM_PFNMAP and VM_MIXEDMAP we use vm_ops->pfn_mkwrite instead of
vm_ops->page_mkwrite to notify about write access.  This means we want
vma->vm_page_prot to be write-protected if the VMA provides this vm_ops.

A theoretical scenario that will cause these missed events is:

  On writable mapping with vm_ops->pfn_mkwrite, but without
  vm_ops->page_mkwrite: read fault followed by write access to the pfn.
  Writable pte will be set up on read fault and write fault will not be
  generated.

I found it examining Dave's complaint on generic/080:

	http://lkml.kernel.org/g/20150831233803.GO3902@dastard

Although I don't think it's the reason.

It shouldn't be a problem for ext2/ext4 as they provide both pfn_mkwrite
and page_mkwrite.

[akpm@linux-foundation.org: add local vm_ops to avoid 80-cols mess]
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Yigal Korman <yigal@plexistor.com>
Acked-by: Boaz Harrosh <boaz@plexistor.com>
Cc: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index c739d6db7193..79bcc9f92e48 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1490,13 +1490,14 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
 int vma_wants_writenotify(struct vm_area_struct *vma)
 {
 	vm_flags_t vm_flags = vma->vm_flags;
+	const struct vm_operations_struct *vm_ops = vma->vm_ops;
 
 	/* If it was private or non-writable, the write bit is already clear */
 	if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
 		return 0;
 
 	/* The backer wishes to know when pages are first written to? */
-	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
+	if (vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite))
 		return 1;
 
 	/* The open routine did something to the protections that pgprot_modify

From 3aaa76e125c1dd58c9b599baa8c6021896874c12 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Tue, 22 Sep 2015 14:59:14 -0700
Subject: [PATCH 10/15] mm: migrate: hugetlb: putback destination hugepage to
 active list

Since commit bcc54222309c ("mm: hugetlb: introduce page_huge_active")
each hugetlb page maintains its active flag to avoid a race condition
between multiple calls of isolate_huge_page(), but current kernel
doesn't set the flag on a hugepage allocated by migration because the
proper putback routine isn't called.  This means that users could
still encounter the race referred to by bcc54222309c in this special
case, so this patch fixes it.

Fixes: bcc54222309c ("mm: hugetlb: introduce page_huge_active")
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: <stable@vger.kernel.org>  [4.1.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/migrate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/migrate.c b/mm/migrate.c
index c3cb566af3e2..7452a00bbb50 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1075,7 +1075,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
 		put_new_page(new_hpage, private);
 	else
-		put_page(new_hpage);
+		putback_active_hugepage(new_hpage);
 
 	if (result) {
 		if (rc)

From 769a8089c1fd2fe94c13e66fe6e03d7820953ee3 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Date: Tue, 22 Sep 2015 14:59:17 -0700
Subject: [PATCH 11/15] x86, efi, kasan: #undef memset/memcpy/memmove per arch

In not-instrumented code KASAN replaces instrumented memset/memcpy/memmove
with not-instrumented analogues __memset/__memcpy/__memmove.

However, on x86 the EFI stub is not linked with the kernel.  It uses
not-instrumented mem*() functions from arch/x86/boot/compressed/string.c

So we don't replace them with __mem*() variants in EFI stub.

On ARM64 the EFI stub is linked with the kernel, so we should replace
mem*() functions with __mem*(), because the EFI stub runs before KASAN
sets up early shadow.

So let's move these #undef mem* into arch's asm/efi.h which is also
included by the EFI stub.

Also, this will fix the warning in 32-bit build reported by kbuild test
robot:

	efi-stub-helper.c:599:2: warning: implicit declaration of function 'memcpy'

[akpm@linux-foundation.org: use 80 cols in comment]
Signed-off-by: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Reported-by: Fengguang Wu <fengguang.wu@gmail.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Matt Fleming <matt.fleming@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/efi.h             | 10 ++++++++++
 drivers/firmware/efi/libstub/efistub.h |  4 ----
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 155162ea0e00..ab5f1d447ef9 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -86,6 +86,16 @@ extern u64 asmlinkage efi_call(void *fp, ...);
 extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size,
 					u32 type, u64 attribute);
 
+/*
+ * CONFIG_KASAN may redefine memset to __memset.  __memset function is present
+ * only in kernel binary.  Since the EFI stub is linked into a separate binary,
+ * it doesn't have __memset().  So we should use standard memset from
+ * arch/x86/boot/compressed/string.c.  The same applies to memcpy and memmove.
+ */
+#undef memcpy
+#undef memset
+#undef memmove
+
 #endif /* CONFIG_X86_32 */
 
 extern struct efi_scratch efi_scratch;
diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h
index e334a01cf92f..6b6548fda089 100644
--- a/drivers/firmware/efi/libstub/efistub.h
+++ b/drivers/firmware/efi/libstub/efistub.h
@@ -5,10 +5,6 @@
 /* error code which can't be mistaken for valid address */
 #define EFI_ERROR	(~0UL)
 
-#undef memcpy
-#undef memset
-#undef memmove
-
 void efi_char16_printk(efi_system_table_t *, efi_char16_t *);
 
 efi_status_t efi_open_volume(efi_system_table_t *sys_table_arg, void *__image,

From d046b770c9fc36ccb19c27afdb8322220108cbc7 Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Tue, 22 Sep 2015 14:59:20 -0700
Subject: [PATCH 12/15] lib/iommu-common.c: do not try to deref a null
 iommu->lazy_flush() pointer when n < pool->hint

The check for invoking iommu->lazy_flush() from iommu_tbl_range_alloc()
has to be refactored so that we only call ->lazy_flush() if it is
non-null.

I had a sparc kernel that was crashing when I was trying to process some
very large perf.data files.  The crash happens when the scsi driver calls
into dma_4v_map_sg and thus into iommu_tbl_range_alloc().

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/iommu-common.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/iommu-common.c b/lib/iommu-common.c
index ff19f66d3f7f..b1c93e94ca7a 100644
--- a/lib/iommu-common.c
+++ b/lib/iommu-common.c
@@ -21,8 +21,7 @@ static	DEFINE_PER_CPU(unsigned int, iommu_hash_common);
 
 static inline bool need_flush(struct iommu_map_table *iommu)
 {
-	return (iommu->lazy_flush != NULL &&
-		(iommu->flags & IOMMU_NEED_FLUSH) != 0);
+	return ((iommu->flags & IOMMU_NEED_FLUSH) != 0);
 }
 
 static inline void set_flush(struct iommu_map_table *iommu)
@@ -211,7 +210,8 @@ unsigned long iommu_tbl_range_alloc(struct device *dev,
 			goto bail;
 		}
 	}
-	if (n < pool->hint || need_flush(iommu)) {
+	if (iommu->lazy_flush &&
+	    (n < pool->hint || need_flush(iommu))) {
 		clear_flush(iommu);
 		iommu->lazy_flush(iommu);
 	}

From d5028f9f7d8de5c375c52b98976b6f310e73398f Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Tue, 22 Sep 2015 14:59:20 -0700
Subject: [PATCH 13/15] vmscan: fix sane_reclaim helper for legacy memcg

The sane_reclaim() helper is supposed to return false for memcg reclaim
if the legacy hierarchy is used, because the latter lacks dirty
throttling mechanism, and so it did before it was accidentally broken by
commit 33398cf2f360c ("memcg: export struct mem_cgroup").  Fix it.

Fixes: 33398cf2f360c ("memcg: export struct mem_cgroup")
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2d978b28a410..7f63a9381f71 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -175,7 +175,7 @@ static bool sane_reclaim(struct scan_control *sc)
 	if (!memcg)
 		return true;
 #ifdef CONFIG_CGROUP_WRITEBACK
-	if (memcg->css.cgroup)
+	if (cgroup_on_dfl(memcg->css.cgroup))
 		return true;
 #endif
 	return false;

From 7a07b503bf249986a1eeef0351d66cac0d8bf721 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Tue, 22 Sep 2015 14:59:20 -0700
Subject: [PATCH 14/15] membarrier: clean up selftest

We don't need to specify an explicit rule in the Makefile, the implicit
one will do the same.  The "__EXPORTED_HEADERS__" define is not needed,
because we build the test against the installed kernel headers, not the
in-tree kernel headers.  Re-use "$(TEST_PROGS)" in the clean target
rather than spelling the executable name twice.  Include <unistd.h>
rather than the rather specific <asm-generic/unistd.h>.  Include
<syscall.h> rather than <sys/syscall.h>.  In both cases, the former
header is located in a standard location and includes the latter.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: Pranith Kumar <bobby.prani@gmail.com>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/membarrier/Makefile          | 7 +++----
 tools/testing/selftests/membarrier/membarrier_test.c | 5 +----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/membarrier/Makefile b/tools/testing/selftests/membarrier/Makefile
index 877a50355d7f..a1a97085847d 100644
--- a/tools/testing/selftests/membarrier/Makefile
+++ b/tools/testing/selftests/membarrier/Makefile
@@ -1,11 +1,10 @@
 CFLAGS += -g -I../../../../usr/include/
 
-all:
-	$(CC) $(CFLAGS) membarrier_test.c -o membarrier_test
-
 TEST_PROGS := membarrier_test
 
+all: $(TEST_PROGS)
+
 include ../lib.mk
 
 clean:
-	$(RM) membarrier_test
+	$(RM) $(TEST_PROGS)
diff --git a/tools/testing/selftests/membarrier/membarrier_test.c b/tools/testing/selftests/membarrier/membarrier_test.c
index dde312508007..535f0fef4d0b 100644
--- a/tools/testing/selftests/membarrier/membarrier_test.c
+++ b/tools/testing/selftests/membarrier/membarrier_test.c
@@ -1,9 +1,6 @@
 #define _GNU_SOURCE
-#define __EXPORTED_HEADERS__
-
 #include <linux/membarrier.h>
-#include <asm-generic/unistd.h>
-#include <sys/syscall.h>
+#include <syscall.h>
 #include <stdio.h>
 #include <errno.h>
 #include <string.h>

From 012572d4fc2e4ddd5c8ec8614d51414ec6cae02a Mon Sep 17 00:00:00 2001
From: Joseph Qi <joseph.qi@huawei.com>
Date: Tue, 22 Sep 2015 14:59:20 -0700
Subject: [PATCH 15/15] ocfs2/dlm: fix deadlock when dispatch assert master

The order of the following three spinlocks should be:
dlm_domain_lock < dlm_ctxt->spinlock < dlm_lock_resource->spinlock

But dlm_dispatch_assert_master() is called while holding
dlm_ctxt->spinlock and dlm_lock_resource->spinlock, and then it calls
dlm_grab() which will take dlm_domain_lock.

Once another thread (for example, dlm_query_join_handler) has already
taken dlm_domain_lock and tries to take dlm_ctxt->spinlock, a deadlock
happens.

Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: "Junxiao Bi" <junxiao.bi@oracle.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/dlm/dlmmaster.c   | 9 ++++++---
 fs/ocfs2/dlm/dlmrecovery.c | 8 ++++++--
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 46b8b2bbc95a..ee5aa4daaea0 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1439,6 +1439,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
 	int found, ret;
 	int set_maybe;
 	int dispatch_assert = 0;
+	int dispatched = 0;
 
 	if (!dlm_grab(dlm))
 		return DLM_MASTER_RESP_NO;
@@ -1658,15 +1659,18 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
 			mlog(ML_ERROR, "failed to dispatch assert master work\n");
 			response = DLM_MASTER_RESP_ERROR;
 			dlm_lockres_put(res);
-		} else
+		} else {
+			dispatched = 1;
 			__dlm_lockres_grab_inflight_worker(dlm, res);
+		}
 		spin_unlock(&res->spinlock);
 	} else {
 		if (res)
 			dlm_lockres_put(res);
 	}
 
-	dlm_put(dlm);
+	if (!dispatched)
+		dlm_put(dlm);
 	return response;
 }
 
@@ -2090,7 +2094,6 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
 
 
 	/* queue up work for dlm_assert_master_worker */
-	dlm_grab(dlm);  /* get an extra ref for the work item */
 	dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL);
 	item->u.am.lockres = res; /* already have a ref */
 	/* can optionally ignore node numbers higher than this node */
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index ce12e0b1a31f..3d90ad7ff91f 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1694,6 +1694,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
 	unsigned int hash;
 	int master = DLM_LOCK_RES_OWNER_UNKNOWN;
 	u32 flags = DLM_ASSERT_MASTER_REQUERY;
+	int dispatched = 0;
 
 	if (!dlm_grab(dlm)) {
 		/* since the domain has gone away on this
@@ -1719,8 +1720,10 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
 				dlm_put(dlm);
 				/* sender will take care of this and retry */
 				return ret;
-			} else
+			} else {
+				dispatched = 1;
 				__dlm_lockres_grab_inflight_worker(dlm, res);
+			}
 			spin_unlock(&res->spinlock);
 		} else {
 			/* put.. incase we are not the master */
@@ -1730,7 +1733,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
 	}
 	spin_unlock(&dlm->spinlock);
 
-	dlm_put(dlm);
+	if (!dispatched)
+		dlm_put(dlm);
 	return master;
 }