From 1334be3657dd02af0591d6d8adf0e6a60a7710a6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Nov 2017 15:26:12 -0800 Subject: [PATCH 01/94] mm: fix nodemask printing The cleanup caused build warnings for constant mask pointers: mm/mempolicy.c: In function `mpol_to_str': ./include/linux/nodemask.h:108:11: warning: the comparison will always evaluate as `true' for the address of `nodes' will never be NULL [-Waddress] An earlier workaround I suggested was incorporated in the version that got merged, but that only solved the problem for gcc-7 and higher, while gcc-4.6 through gcc-6.x still warn. This changes the printing again to use inline functions that make it clear to the compiler that the line that does the NULL check has no idea whether the argument is a constant NULL. Link: http://lkml.kernel.org/r/20171117101545.119689-1-arnd@arndb.de Fixes: 0205f75571e3 ("mm: simplify nodemask printing") Signed-off-by: Arnd Bergmann Cc: Michal Hocko Cc: Stephen Rothwell Cc: Zhangshaokun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nodemask.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 15cab3967d6d..1fbde8a880d9 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -104,9 +104,16 @@ extern nodemask_t _unused_nodemask_arg_; * * Can be used to provide arguments for '%*pb[l]' when printing a nodemask. */ -#define nodemask_pr_args(maskp) \ - ((maskp) != NULL) ? MAX_NUMNODES : 0, \ - ((maskp) != NULL) ? (maskp)->bits : NULL +#define nodemask_pr_args(maskp) __nodemask_pr_numnodes(maskp), \ + __nodemask_pr_bits(maskp) +static inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) +{ + return m ? MAX_NUMNODES : 0; +} +static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) +{ + return m ? m->bits : NULL; +} /* * The inline keyword gives the compiler room to decide to inline, or From 5d03a6613957785e94af7a4a6212ad4af66aa5c2 Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Fri, 17 Nov 2017 15:26:16 -0800 Subject: [PATCH 02/94] mm/z3fold.c: use kref to prevent page free/compact race There is a race in the current z3fold implementation between do_compact() called in a work queue context and the page release procedure when page's kref goes to 0. do_compact() may be waiting for page lock, which is released by release_z3fold_page_locked right before putting the page onto the "stale" list, and then the page may be freed as do_compact() modifies its contents. The mechanism currently implemented to handle that (checking the PAGE_STALE flag) is not reliable enough. Instead, we'll use page's kref counter to guarantee that the page is not released if its compaction is scheduled. It then becomes compaction function's responsibility to decrease the counter and quit immediately if the page was actually freed. 
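The protection scheme is the classic kref-pinned deferred work pattern. A minimal sketch of that pattern, using hypothetical obj_* names rather than the actual z3fold symbols:

    #include <linux/kref.h>
    #include <linux/slab.h>
    #include <linux/workqueue.h>

    struct obj {
            struct kref refcount;
            struct work_struct work;
    };

    static void obj_release(struct kref *ref)
    {
            kfree(container_of(ref, struct obj, refcount));
    }

    static void obj_work_fn(struct work_struct *w)
    {
            struct obj *o = container_of(w, struct obj, work);

            /* Drop the reference taken at queueing time; if it was
             * the last one, the object is already gone and must not
             * be touched. */
            if (kref_put(&o->refcount, obj_release))
                    return;
            /* o is still pinned by its owner; safe to compact here */
    }

    static void obj_schedule(struct obj *o)
    {
            kref_get(&o->refcount);         /* pin for the worker */
            queue_work(system_wq, &o->work);
    }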
Link: http://lkml.kernel.org/r/20171117092032.00ea56f42affbed19f4fcc6c@gmail.com Signed-off-by: Vitaly Wool Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index b2ba2ba585f3..39e19125d6a0 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -404,8 +404,7 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) WARN_ON(z3fold_page_trylock(zhdr)); else z3fold_page_lock(zhdr); - if (test_bit(PAGE_STALE, &page->private) || - !test_and_clear_bit(NEEDS_COMPACTING, &page->private)) { + if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) { z3fold_page_unlock(zhdr); return; } @@ -413,6 +412,11 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) list_del_init(&zhdr->buddy); spin_unlock(&pool->lock); + if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { + atomic64_dec(&pool->pages_nr); + return; + } + z3fold_compact_page(zhdr); unbuddied = get_cpu_ptr(pool->unbuddied); fchunks = num_free_chunks(zhdr); @@ -753,9 +757,11 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) list_del_init(&zhdr->buddy); spin_unlock(&pool->lock); zhdr->cpu = -1; + kref_get(&zhdr->refcount); do_compact_page(zhdr, true); return; } + kref_get(&zhdr->refcount); queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work); z3fold_page_unlock(zhdr); } From 3aaabbf1c39effa2ac0c11103ed07ef03b0a0d89 Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Fri, 17 Nov 2017 15:26:19 -0800 Subject: [PATCH 03/94] lib/dma-debug.c: fix incorrect pfn calculation dma-debug reports the following warning: WARNING: CPU: 3 PID: 298 at kernel-4.4/lib/dma-debug.c:604 debug _dma_assert_idle+0x1a8/0x230() DMA-API: cpu touching an active dma mapped cacheline [cln=0x00000882300] CPU: 3 PID: 298 Comm: vold Tainted: G W O 4.4.22+ #1 Hardware name: MT6739 (DT) Call trace: debug_dma_assert_idle+0x1a8/0x230 wp_page_copy.isra.96+0x118/0x520 do_wp_page+0x4fc/0x534 handle_mm_fault+0xd4c/0x1310 do_page_fault+0x1c8/0x394 do_mem_abort+0x50/0xec I found that debug_dma_alloc_coherent() and debug_dma_free_coherent() assume that dma_alloc_coherent() always returns a linear address. However it's possible that dma_alloc_coherent() returns a non-linear address. In this case, page_to_pfn(virt_to_page(virt)) will return an incorrect pfn. If the pfn is valid and mapped as a COW page, we will hit the warning when doing wp_page_copy(). Fix this by calculating pfn for linear and non-linear addresses. 
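The heart of the fix is a pfn lookup that works for both address types. As a minimal sketch (the helper name is invented for illustration):

    #include <linux/mm.h>
    #include <linux/vmalloc.h>

    static unsigned long any_virt_to_pfn(const void *virt)
    {
            /* virt_to_page() is only valid for linear-map addresses;
             * vmalloc addresses must be resolved through the page
             * tables instead. */
            if (is_vmalloc_addr(virt))
                    return vmalloc_to_pfn(virt);
            return page_to_pfn(virt_to_page(virt));
    }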
[miles.chen@mediatek.com: v4] Link: http://lkml.kernel.org/r/1510872972-23919-1-git-send-email-miles.chen@mediatek.com Link: http://lkml.kernel.org/r/1506484087-1177-1-git-send-email-miles.chen@mediatek.com Signed-off-by: Miles Chen Reviewed-by: Robin Murphy Cc: Christoph Hellwig Cc: Marek Szyprowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/dma-debug.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index ea4cc3dde4f1..1b34d210452c 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -1495,14 +1495,22 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size, if (!entry) return; + /* handle vmalloc and linear addresses */ + if (!is_vmalloc_addr(virt) && !virt_to_page(virt)) + return; + entry->type = dma_debug_coherent; entry->dev = dev; - entry->pfn = page_to_pfn(virt_to_page(virt)); entry->offset = offset_in_page(virt); entry->size = size; entry->dev_addr = dma_addr; entry->direction = DMA_BIDIRECTIONAL; + if (is_vmalloc_addr(virt)) + entry->pfn = vmalloc_to_pfn(virt); + else + entry->pfn = page_to_pfn(virt_to_page(virt)); + add_dma_entry(entry); } EXPORT_SYMBOL(debug_dma_alloc_coherent); @@ -1513,13 +1521,21 @@ void debug_dma_free_coherent(struct device *dev, size_t size, struct dma_debug_entry ref = { .type = dma_debug_coherent, .dev = dev, - .pfn = page_to_pfn(virt_to_page(virt)), .offset = offset_in_page(virt), .dev_addr = addr, .size = size, .direction = DMA_BIDIRECTIONAL, }; + /* handle vmalloc and linear addresses */ + if (!is_vmalloc_addr(virt) && !virt_to_page(virt)) + return; + + if (is_vmalloc_addr(virt)) + ref.pfn = vmalloc_to_pfn(virt); + else + ref.pfn = page_to_pfn(virt_to_page(virt)); + if (unlikely(dma_debug_disabled())) return; From 09af5ccea26d347041ac7ca37a61f859ef9bd1f5 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Fri, 17 Nov 2017 15:26:23 -0800 Subject: [PATCH 04/94] mm: shmem: remove unused info variable Fix the following warning by removing the unused variable: mm/shmem.c:3205:27: warning: variable 'info' set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1510774029-30652-1-git-send-email-clabbe@baylibre.com Signed-off-by: Corentin Labbe Acked-by: Michal Hocko Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 1f97d77551c3..4aa9307feab0 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3202,7 +3202,6 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s int len; struct inode *inode; struct page *page; - struct shmem_inode_info *info; len = strlen(symname) + 1; if (len > PAGE_SIZE) @@ -3222,7 +3221,6 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s error = 0; } - info = SHMEM_I(inode); inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { inode->i_link = kmemdup(symname, len, GFP_KERNEL); From a0647dc9208fae9124ca38d43a5c3c950d955291 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 17 Nov 2017 15:26:27 -0800 Subject: [PATCH 05/94] mm, compaction: kcompactd should not ignore pageblock skip Kcompactd is needlessly ignoring pageblock skip information. It is doing MIGRATE_SYNC_LIGHT compaction, which is no more powerful than MIGRATE_SYNC compaction. If compaction recently failed to isolate memory from a set of pageblocks, there is nothing to indicate that kcompactd will be able to do so, or that it is beneficial to attempt to isolate memory. 
Use the pageblock skip hint to avoid rescanning pageblocks needlessly until that information is reset. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1708151638550.106658@chino.kir.corp.google.com Signed-off-by: David Rientjes Cc: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 85395dc6eb13..ad40d67421f3 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1928,9 +1928,8 @@ static void kcompactd_do_work(pg_data_t *pgdat) .total_free_scanned = 0, .classzone_idx = pgdat->kcompactd_classzone_idx, .mode = MIGRATE_SYNC_LIGHT, - .ignore_skip_hint = true, + .ignore_skip_hint = false, .gfp_mask = GFP_KERNEL, - }; trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, cc.classzone_idx); From 21dc7e023611fbcf8e38f255731bcf3cc38e7638 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 17 Nov 2017 15:26:30 -0800 Subject: [PATCH 06/94] mm, compaction: persistently skip hugetlbfs pageblocks It is pointless to migrate hugetlb memory as part of memory compaction if the hugetlb size is equal to the pageblock order. No defragmentation is occurring in this condition. It is also pointless for the freeing scanner to scan a pageblock where a hugetlb page is pinned. Unconditionally skip these pageblocks, and do so persistently so that they are not rescanned until it is observed that these hugepages are no longer pinned. It would also be possible to do this by involving the hugetlb subsystem in marking pageblocks to no longer be skipped when their hugetlb pages are freed. This is a simple solution that doesn't involve any additional subsystems in pageblock skip manipulation. [rientjes@google.com: fix build] Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1708201734390.117182@chino.kir.corp.google.com Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1708151639130.106658@chino.kir.corp.google.com Signed-off-by: David Rientjes Tested-by: Michal Hocko Cc: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pageblock-flags.h | 11 +++++++ mm/compaction.c | 54 ++++++++++++++++++++++++++------- 2 files changed, 54 insertions(+), 11 deletions(-) diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index e942558b3585..9132c5cb41f1 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -96,6 +96,17 @@ void set_pfnblock_flags_mask(struct page *page, #define set_pageblock_skip(page) \ set_pageblock_flags_group(page, 1, PB_migrate_skip, \ PB_migrate_skip) +#else +static inline bool get_pageblock_skip(struct page *page) +{ + return false; +} +static inline void clear_pageblock_skip(struct page *page) +{ +} +static inline void set_pageblock_skip(struct page *page) +{ +} #endif /* CONFIG_COMPACTION */ #endif /* PAGEBLOCK_FLAGS_H */ diff --git a/mm/compaction.c b/mm/compaction.c index ad40d67421f3..94b5c0865dd1 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -218,6 +218,20 @@ static void reset_cached_positions(struct zone *zone) pageblock_start_pfn(zone_end_pfn(zone) - 1); } +/* + * Hugetlbfs pages should consistently be skipped until updated by the hugetlb + * subsystem. It is always pointless to compact pages of pageblock_order and + * the free scanner can reconsider when no longer huge. 
+ */ +static bool pageblock_skip_persistent(struct page *page, unsigned int order) +{ + if (!PageHuge(page)) + return false; + if (order != pageblock_order) + return false; + return true; +} + /* * This function is called to clear all cached information on pageblocks that * should be skipped for page isolation when the migrate and free page scanner @@ -242,6 +256,8 @@ static void __reset_isolation_suitable(struct zone *zone) continue; if (zone != page_zone(page)) continue; + if (pageblock_skip_persistent(page, compound_order(page))) + continue; clear_pageblock_skip(page); } @@ -307,7 +323,13 @@ static inline bool isolation_suitable(struct compact_control *cc, return true; } -static void update_pageblock_skip(struct compact_control *cc, +static inline bool pageblock_skip_persistent(struct page *page, + unsigned int order) +{ + return false; +} + +static inline void update_pageblock_skip(struct compact_control *cc, struct page *page, unsigned long nr_isolated, bool migrate_scanner) { @@ -449,13 +471,15 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, * and the only danger is skipping too much. */ if (PageCompound(page)) { - unsigned int comp_order = compound_order(page); + const unsigned int order = compound_order(page); - if (likely(comp_order < MAX_ORDER)) { - blockpfn += (1UL << comp_order) - 1; - cursor += (1UL << comp_order) - 1; + if (pageblock_skip_persistent(page, order)) { + set_pageblock_skip(page); + blockpfn = end_pfn; + } else if (likely(order < MAX_ORDER)) { + blockpfn += (1UL << order) - 1; + cursor += (1UL << order) - 1; } - goto isolate_fail; } @@ -772,11 +796,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * danger is skipping too much. */ if (PageCompound(page)) { - unsigned int comp_order = compound_order(page); - - if (likely(comp_order < MAX_ORDER)) - low_pfn += (1UL << comp_order) - 1; + const unsigned int order = compound_order(page); + if (pageblock_skip_persistent(page, order)) { + set_pageblock_skip(page); + low_pfn = end_pfn; + } else if (likely(order < MAX_ORDER)) + low_pfn += (1UL << order) - 1; goto isolate_fail; } @@ -838,7 +864,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * is safe to read and it's 0 for tail pages. */ if (unlikely(PageCompound(page))) { - low_pfn += (1UL << compound_order(page)) - 1; + const unsigned int order = compound_order(page); + + if (pageblock_skip_persistent(page, order)) { + set_pageblock_skip(page); + low_pfn = end_pfn; + } else + low_pfn += (1UL << order) - 1; goto isolate_fail; } } From b527cfe5bc23208cf9a346879501333cec638aba Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 17 Nov 2017 15:26:34 -0800 Subject: [PATCH 07/94] mm, compaction: extend pageblock_skip_persistent() to all compound pages pageblock_skip_persistent() checks for HugeTLB pages of pageblock order. When clearing pageblock skip bits for compaction, the bits are not cleared for such pageblocks, because they cannot contain base pages suitable for migration, nor free pages to use as migration targets. This optimization can be simply extended to all compound pages of order equal or larger than pageblock order, because migrating such pages (if they support it) cannot help sub-pageblock fragmentation. This includes THP's and also gigantic HugeTLB pages, which the current implementation doesn't persistently skip due to a strict pageblock_order equality check and not recognizing tail pages. 
While THP pages are generally less "persistent" than HugeTLB, we can still expect that if a THP exists at the point of __reset_isolation_suitable(), it will exist also during the subsequent compaction run. The time difference here could actually be smaller than between a compaction run that sets a (non-persistent) skip bit on a THP, and the next compaction run that observes it. Link: http://lkml.kernel.org/r/20171102121706.21504-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 94b5c0865dd1..e8f5b4e2cb05 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -219,17 +219,21 @@ static void reset_cached_positions(struct zone *zone) } /* - * Hugetlbfs pages should consistently be skipped until updated by the hugetlb - * subsystem. It is always pointless to compact pages of pageblock_order and - * the free scanner can reconsider when no longer huge. + * Compound pages of >= pageblock_order should consistently be skipped until + * released. It is always pointless to compact pages of such order (if they are + * migratable), and the pageblocks they occupy cannot contain any free pages. */ -static bool pageblock_skip_persistent(struct page *page, unsigned int order) +static bool pageblock_skip_persistent(struct page *page) { - if (!PageHuge(page)) + if (!PageCompound(page)) return false; - if (order != pageblock_order) - return false; - return true; + + page = compound_head(page); + + if (compound_order(page) >= pageblock_order) + return true; + + return false; } /* @@ -256,7 +260,7 @@ static void __reset_isolation_suitable(struct zone *zone) continue; if (zone != page_zone(page)) continue; - if (pageblock_skip_persistent(page, compound_order(page))) + if (pageblock_skip_persistent(page)) continue; clear_pageblock_skip(page); @@ -323,8 +327,7 @@ static inline bool isolation_suitable(struct compact_control *cc, return true; } -static inline bool pageblock_skip_persistent(struct page *page, - unsigned int order) +static inline bool pageblock_skip_persistent(struct page *page) { return false; } From 2583d6713267a4c80126e4e50dd45f5cf685ebe8 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 17 Nov 2017 15:26:38 -0800 Subject: [PATCH 08/94] mm, compaction: split off flag for not updating skip hints Pageblock skip hints were added as a heuristic for compaction, which shares core code with CMA. Since CMA reliability would suffer from the heuristics, compact_control flag ignore_skip_hint was added for the CMA use case. Since 6815bf3f233e ("mm/compaction: respect ignore_skip_hint in update_pageblock_skip") the flag also means that CMA won't *update* the skip hints in addition to ignoring them. Today, direct compaction can also ignore the skip hints in the last resort attempt, but there's no reason not to set them when isolation fails in such a case. Thus, this patch splits off a new no_set_skip_hint flag to avoid the updating, which only CMA sets. This should improve the heuristics a bit, and allow us to simplify the persistent skip bit handling as the next step. 
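The resulting semantics of the two flags, as a simplified sketch (illustrative helpers, not the actual mm/compaction.c control flow):

    /* Read side: hints are consulted unless the caller opted out. */
    static bool should_skip_pageblock(struct compact_control *cc,
                                      struct page *page)
    {
            if (cc->ignore_skip_hint) /* last-resort compaction, CMA */
                    return false;
            return get_pageblock_skip(page);
    }

    /* Write side: after this split, only CMA opts out. */
    static void mark_pageblock_skip(struct compact_control *cc,
                                    struct page *page)
    {
            if (cc->no_set_skip_hint)
                    return;
            set_pageblock_skip(page);
    }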
Link: http://lkml.kernel.org/r/20171102121706.21504-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 2 +- mm/internal.h | 1 + mm/page_alloc.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index e8f5b4e2cb05..bb1188a9d58e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -295,7 +295,7 @@ static void update_pageblock_skip(struct compact_control *cc, struct zone *zone = cc->zone; unsigned long pfn; - if (cc->ignore_skip_hint) + if (cc->no_set_skip_hint) return; if (!page) diff --git a/mm/internal.h b/mm/internal.h index 1df011f62480..e6bd35182dae 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -198,6 +198,7 @@ struct compact_control { const int classzone_idx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ + bool no_set_skip_hint; /* Don't mark blocks for skipping */ bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... */ bool whole_zone; /* Whole zone should/has been scanned */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 55ded92f9809..d4096f4a5c1f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7619,6 +7619,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .zone = page_zone(pfn_to_page(start)), .mode = MIGRATE_SYNC, .ignore_skip_hint = true, + .no_set_skip_hint = true, .gfp_mask = current_gfp_context(gfp_mask), }; INIT_LIST_HEAD(&cc.migratepages); From d3c85bad89b9153df741af14ad859ee49677f00d Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 17 Nov 2017 15:26:41 -0800 Subject: [PATCH 09/94] mm, compaction: remove unneeded pageblock_skip_persistent() checks Commit f3c931633a59 ("mm, compaction: persistently skip hugetlbfs pageblocks") has introduced pageblock_skip_persistent() checks into migration and free scanners, to make sure pageblocks that should be persistently skipped are marked as such, regardless of the ignore_skip_hint flag. Since the previous patch introduced a new no_set_skip_hint flag, the ignore flag no longer prevents marking pageblocks as skipped. Therefore we can remove the special cases. The relevant pageblocks will be marked as skipped by the common logic which marks each pageblock where no page could be isolated. This makes the code simpler. 
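For reference, the "common logic" mentioned above amounts to marking a pageblock once a scanner finishes it without isolating anything; roughly, as a simplified rendering of update_pageblock_skip() with an invented name:

    static void finish_pageblock(struct compact_control *cc,
                                 struct page *page,
                                 unsigned long nr_isolated)
    {
            /* Nothing could be isolated: remember to skip this
             * pageblock on subsequent scans. */
            if (cc->no_set_skip_hint || !page || nr_isolated)
                    return;
            set_pageblock_skip(page);
    }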
Link: http://lkml.kernel.org/r/20171102121706.21504-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Mel Gorman Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index bb1188a9d58e..10cd757f1006 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -476,10 +476,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (PageCompound(page)) { const unsigned int order = compound_order(page); - if (pageblock_skip_persistent(page, order)) { - set_pageblock_skip(page); - blockpfn = end_pfn; - } else if (likely(order < MAX_ORDER)) { + if (likely(order < MAX_ORDER)) { blockpfn += (1UL << order) - 1; cursor += (1UL << order) - 1; } @@ -801,10 +798,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (PageCompound(page)) { const unsigned int order = compound_order(page); - if (pageblock_skip_persistent(page, order)) { - set_pageblock_skip(page); - low_pfn = end_pfn; - } else if (likely(order < MAX_ORDER)) + if (likely(order < MAX_ORDER)) low_pfn += (1UL << order) - 1; goto isolate_fail; } @@ -867,13 +861,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * is safe to read and it's 0 for tail pages. */ if (unlikely(PageCompound(page))) { - const unsigned int order = compound_order(page); - - if (pageblock_skip_persistent(page, order)) { - set_pageblock_skip(page); - low_pfn = end_pfn; - } else - low_pfn += (1UL << order) - 1; + low_pfn += (1UL << compound_order(page)) - 1; goto isolate_fail; } } From c643401218be0f4ab3522e0c0a63016596d6e9ca Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 17 Nov 2017 15:26:45 -0800 Subject: [PATCH 10/94] proc, coredump: add CoreDumping flag to /proc/pid/status Right now there is no convenient way to check if a process is being coredumped at the moment. It might be necessary to recognize such state to prevent killing the process and getting a broken coredump. Writing a large core might take significant time, and the process is unresponsive during it, so it might be killed by timeout, if another process is monitoring and killing/restarting hanging tasks. We're getting a significant number of corrupted coredump files on machines in our fleet, just because processes are being killed by timeout in the middle of the core writing process. We do have a process health check, and some agent is responsible for restarting processes which are not responding for health check requests. Writing a large coredump to the disk can easily exceed the reasonable timeout (especially on an overloaded machine). This flag will allow the agent to distinguish processes which are being coredumped, extend the timeout for them, and let them produce a full coredump file. To provide an ability to detect if a process is in the state of being coredumped, we can expose a boolean CoreDumping flag in /proc/pid/status. Example: $ cat core.sh #!/bin/sh echo "|/usr/bin/sleep 10" > /proc/sys/kernel/core_pattern sleep 1000 & PID=$! 
cat /proc/$PID/status | grep CoreDumping kill -ABRT $PID sleep 1 cat /proc/$PID/status | grep CoreDumping $ ./core.sh CoreDumping: 0 CoreDumping: 1 [guro@fb.com: document CoreDumping flag in /proc//status] Link: http://lkml.kernel.org/r/20170928135357.GA8470@castle.DHCP.thefacebook.com Link: http://lkml.kernel.org/r/20170920230634.31572-1-guro@fb.com Signed-off-by: Roman Gushchin Cc: Alexander Viro Cc: Ingo Molnar Cc: Konstantin Khlebnikov Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 3 +++ fs/proc/array.c | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index ec571b9bb18a..2a84bb334894 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -181,6 +181,7 @@ read the file /proc/PID/status: VmPTE: 20 kb VmSwap: 0 kB HugetlbPages: 0 kB + CoreDumping: 0 Threads: 1 SigQ: 0/28578 SigPnd: 0000000000000000 @@ -253,6 +254,8 @@ Table 1-2: Contents of the status files (as of 4.8) VmSwap amount of swap used by anonymous private data (shmem swap usage is not included) HugetlbPages size of hugetlb memory portions + CoreDumping process's memory is currently being dumped + (killing the process may lead to a corrupted core) Threads number of threads SigQ number of signals queued/max. number for queue SigPnd bitmap of pending signals for the thread diff --git a/fs/proc/array.c b/fs/proc/array.c index 6f6fc1672ad1..79375fc115d2 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -366,6 +366,11 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) cpumask_pr_args(&task->cpus_allowed)); } +static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) +{ + seq_printf(m, "CoreDumping:\t%d\n", !!mm->core_state); +} + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -376,6 +381,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, if (mm) { task_mem(m, mm); + task_core_dumping(m, mm); mmput(mm); } task_sig(m, task); From 3ee2a19908f27b8fea8ff14ffa8b755585eb7b4a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 17 Nov 2017 15:26:49 -0800 Subject: [PATCH 11/94] proc: : uninline name_to_int() Save ~360 bytes. 
add/remove: 1/0 grow/shrink: 0/4 up/down: 104/-463 (-359) function old new delta name_to_int - 104 +104 proc_pid_lookup 217 126 -91 proc_lookupfd_common 212 121 -91 proc_task_lookup 289 194 -95 __proc_create 588 402 -186 Link: http://lkml.kernel.org/r/20170912194850.GA17730@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/Makefile | 1 + fs/proc/internal.h | 23 +---------------------- fs/proc/util.c | 23 +++++++++++++++++++++++ 3 files changed, 25 insertions(+), 22 deletions(-) create mode 100644 fs/proc/util.c diff --git a/fs/proc/Makefile b/fs/proc/Makefile index f7456c4e7d0f..ead487e80510 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -21,6 +21,7 @@ proc-y += loadavg.o proc-y += meminfo.o proc-y += stat.o proc-y += uptime.o +proc-y += util.o proc-y += version.o proc-y += softirqs.o proc-y += namespaces.o diff --git a/fs/proc/internal.h b/fs/proc/internal.h index a34195e92b20..9aad373cf11d 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -103,28 +103,7 @@ static inline struct task_struct *get_proc_task(struct inode *inode) void task_dump_owner(struct task_struct *task, mode_t mode, kuid_t *ruid, kgid_t *rgid); -static inline unsigned name_to_int(const struct qstr *qstr) -{ - const char *name = qstr->name; - int len = qstr->len; - unsigned n = 0; - - if (len > 1 && *name == '0') - goto out; - while (len-- > 0) { - unsigned c = *name++ - '0'; - if (c > 9) - goto out; - if (n >= (~0U-9)/10) - goto out; - n *= 10; - n += c; - } - return n; -out: - return ~0U; -} - +unsigned name_to_int(const struct qstr *qstr); /* * Offset of the first process in the /proc root directory.. */ diff --git a/fs/proc/util.c b/fs/proc/util.c new file mode 100644 index 000000000000..c29aa497394b --- /dev/null +++ b/fs/proc/util.c @@ -0,0 +1,23 @@ +#include + +unsigned name_to_int(const struct qstr *qstr) +{ + const char *name = qstr->name; + int len = qstr->len; + unsigned n = 0; + + if (len > 1 && *name == '0') + goto out; + while (len-- > 0) { + unsigned c = *name++ - '0'; + if (c > 9) + goto out; + if (n >= (~0U-9)/10) + goto out; + n *= 10; + n += c; + } + return n; +out: + return ~0U; +} From 0746a0bc6e6e76444098cf944848554d21d28cae Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 17 Nov 2017 15:26:52 -0800 Subject: [PATCH 12/94] proc: use do-while in name_to_int() Gcc doesn't know that "len" is guaranteed to be >=1 by dcache and generates standard while-loop prologue duplicating loop condition. add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-27 (-27) function old new delta name_to_int 104 77 -27 Link: http://lkml.kernel.org/r/20170912195213.GB17730@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/util.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/proc/util.c b/fs/proc/util.c index c29aa497394b..b161cfa0f9fa 100644 --- a/fs/proc/util.c +++ b/fs/proc/util.c @@ -8,7 +8,7 @@ unsigned name_to_int(const struct qstr *qstr) if (len > 1 && *name == '0') goto out; - while (len-- > 0) { + do { unsigned c = *name++ - '0'; if (c > 9) goto out; @@ -16,7 +16,7 @@ unsigned name_to_int(const struct qstr *qstr) goto out; n *= 10; n += c; - } + } while (--len > 0); return n; out: return ~0U; From 868038bed5fb03f95ebd4517b4370b024fd13771 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 17 Nov 2017 15:26:55 -0800 Subject: [PATCH 13/94] spelling.txt: add "unnecessary" typo variants Add unnecessary typos by copying the necessary typos. 
Link: http://lkml.kernel.org/r/1505074722.22023.6.camel@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index aa0cc49ad1ad..9a058cff49d4 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -1187,6 +1187,10 @@ unknonw||unknown unknow||unknown unkown||unknown unneded||unneeded +unneccecary||unnecessary +unneccesary||unnecessary +unneccessary||unnecessary +unnecesary||unnecessary unneedingly||unnecessarily unnsupported||unsupported unmached||unmatched From fb6cc4ac15c354fa1eb449f50a0dfe5f4bf0d42a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 17 Nov 2017 15:26:59 -0800 Subject: [PATCH 14/94] sh/boot: add static stack-protector to pre-kernel The sh decompressor code triggers stack-protector code generation when using CONFIG_CC_STACKPROTECTOR_STRONG. As done for arm and mips, add a simple static stack-protector canary. As this wasn't protected before, the risk of using a weak canary is minimized. Once the kernel is actually up, a better canary is chosen. Link: http://lkml.kernel.org/r/1506972007-80614-2-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Cc: Yoshinori Sato Cc: Rich Felker Cc: Al Viro Cc: Ingo Molnar Cc: Laura Abbott Cc: Masahiro Yamada Cc: Michal Marek Cc: Nicholas Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/boot/compressed/misc.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/sh/boot/compressed/misc.c b/arch/sh/boot/compressed/misc.c index f2d9d3079d4e..627ce8e75e01 100644 --- a/arch/sh/boot/compressed/misc.c +++ b/arch/sh/boot/compressed/misc.c @@ -104,6 +104,18 @@ static void error(char *x) while(1); /* Halt */ } +unsigned long __stack_chk_guard; + +void __stack_chk_guard_setup(void) +{ + __stack_chk_guard = 0x000a0dff; +} + +void __stack_chk_fail(void) +{ + error("stack-protector: Kernel stack is corrupted\n"); +} + #ifdef CONFIG_SUPERH64 #define stackalign 8 #else @@ -118,6 +130,8 @@ void decompress_kernel(void) { unsigned long output_addr; + __stack_chk_guard_setup(); + #ifdef CONFIG_SUPERH64 output_addr = (CONFIG_MEMORY_START + 0x2000); #else From b1fca27d384e8418aac84b39f6f5179aecc1b64f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 17 Nov 2017 15:27:03 -0800 Subject: [PATCH 15/94] kernel debug: support resetting WARN*_ONCE I like _ONCE warnings because it's guaranteed that they don't flood the log. During testing I find it useful to reset the state of the once warnings, so that I can rerun tests and see if they trigger again, or can guarantee that a test run always hits the same warnings. This patch adds a debugfs interface to reset all the _ONCE warnings so that they appear again: echo 1 > /sys/kernel/debug/clear_warn_once This is implemented by putting all the warning booleans into a special section, and clearing it. 
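The same section trick can be demonstrated outside the kernel. A small self-contained userspace sketch, assuming a GNU toolchain (GNU ld auto-generates __start_/__stop_ symbols for sections whose names are valid C identifiers):

    #include <stdio.h>
    #include <string.h>

    /* Every DO_ONCE() expansion drops its flag into "once_flags". */
    #define DO_ONCE(stmt) do {                                         \
            static _Bool done __attribute__((section("once_flags")));  \
            if (!done) { done = 1; stmt; }                             \
    } while (0)

    extern char __start_once_flags[], __stop_once_flags[];

    static void clear_once(void)
    {
            /* Analogous to the debugfs write handler: zero all flags. */
            memset(__start_once_flags, 0,
                   __stop_once_flags - __start_once_flags);
    }

    static void report(void)
    {
            DO_ONCE(puts("printed once"));
    }

    int main(void)
    {
            for (int i = 0; i < 3; i++)
                    report();       /* prints a single time */
            clear_once();
            report();               /* prints again after the reset */
            return 0;
    }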
[akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20171017221455.6740-1-andi@firstfloor.org Signed-off-by: Andi Kleen Tested-by: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/clearing-warn-once.txt | 7 +++++++ include/asm-generic/bug.h | 6 +++--- include/asm-generic/sections.h | 1 + include/asm-generic/vmlinux.lds.h | 3 +++ kernel/panic.c | 28 ++++++++++++++++++++++++++++ 5 files changed, 42 insertions(+), 3 deletions(-) create mode 100644 Documentation/clearing-warn-once.txt diff --git a/Documentation/clearing-warn-once.txt b/Documentation/clearing-warn-once.txt new file mode 100644 index 000000000000..5b1f5d547be1 --- /dev/null +++ b/Documentation/clearing-warn-once.txt @@ -0,0 +1,7 @@ + +WARN_ONCE / WARN_ON_ONCE only print a warning once. + +echo 1 > /sys/kernel/debug/clear_warn_once + +clears the state and allows the warnings to print once again. +This can be useful after test suite runs to reproduce problems. diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index af2cc94a61bf..7844b0df88cd 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -130,7 +130,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, #ifndef WARN_ON_ONCE #define WARN_ON_ONCE(condition) ({ \ - static bool __section(.data.unlikely) __warned; \ + static bool __section(.data.once) __warned; \ int __ret_warn_once = !!(condition); \ \ if (unlikely(__ret_warn_once && !__warned)) { \ @@ -142,7 +142,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, #endif #define WARN_ONCE(condition, format...) ({ \ - static bool __section(.data.unlikely) __warned; \ + static bool __section(.data.once) __warned; \ int __ret_warn_once = !!(condition); \ \ if (unlikely(__ret_warn_once && !__warned)) { \ @@ -153,7 +153,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, }) #define WARN_TAINT_ONCE(condition, taint, format...) ({ \ - static bool __section(.data.unlikely) __warned; \ + static bool __section(.data.once) __warned; \ int __ret_warn_once = !!(condition); \ \ if (unlikely(__ret_warn_once && !__warned)) { \ diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index 6d9576931084..03cc5f9bba71 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -44,6 +44,7 @@ extern char __entry_text_start[], __entry_text_end[]; extern char __start_rodata[], __end_rodata[]; extern char __irqentry_text_start[], __irqentry_text_end[]; extern char __softirqentry_text_start[], __softirqentry_text_end[]; +extern char __start_once[], __end_once[]; /* Start and end of .ctors section - used for constructor calls. 
*/ extern char __ctors_start[], __ctors_end[]; diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index bdcd1caae092..ee8b707d9fa9 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -223,6 +223,9 @@ MEM_KEEP(init.data) \ MEM_KEEP(exit.data) \ *(.data.unlikely) \ + VMLINUX_SYMBOL(__start_once) = .; \ + *(.data.once) \ + VMLINUX_SYMBOL(__end_once) = .; \ STRUCT_ALIGN(); \ *(__tracepoints) \ /* implement dynamic printk debug */ \ diff --git a/kernel/panic.c b/kernel/panic.c index bdd18afa19a4..672a91dc20fe 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -27,6 +27,8 @@ #include #include #include +#include +#include #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -587,6 +589,32 @@ void warn_slowpath_null(const char *file, int line) EXPORT_SYMBOL(warn_slowpath_null); #endif +#ifdef CONFIG_BUG + +/* Support resetting WARN*_ONCE state */ + +static int clear_warn_once_set(void *data, u64 val) +{ + memset(__start_once, 0, __end_once - __start_once); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops, + NULL, + clear_warn_once_set, + "%lld\n"); + +static __init int register_warn_debugfs(void) +{ + /* Don't care about failure */ + debugfs_create_file("clear_warn_once", 0644, NULL, + NULL, &clear_warn_once_fops); + return 0; +} + +device_initcall(register_warn_debugfs); +#endif + #ifdef CONFIG_CC_STACKPROTECTOR /* From aaf5dcfb223617ac2d16113e4b500199c65689de Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 17 Nov 2017 15:27:06 -0800 Subject: [PATCH 16/94] kernel debug: support resetting WARN_ONCE for all architectures Some architectures store the WARN_ONCE state in the flags field of the bug_entry. Clear that one too when resetting once state through /sys/kernel/debug/clear_warn_once. Pointed out by Michael Ellerman. Improves the earlier patch that added clear_warn_once. 
[ak@linux.intel.com: add a missing ifdef CONFIG_MODULES] Link: http://lkml.kernel.org/r/20171020170633.9593-1-andi@firstfloor.org [akpm@linux-foundation.org: fix unused var warning] [akpm@linux-foundation.org: Use 0200 for clear_warn_once file, per mpe] [akpm@linux-foundation.org: clear BUGFLAG_DONE in clear_once_table(), per mpe] Link: http://lkml.kernel.org/r/20171019204642.7404-1-andi@firstfloor.org Signed-off-by: Andi Kleen Tested-by: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bug.h | 5 +++++ kernel/panic.c | 3 ++- lib/bug.c | 23 +++++++++++++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/include/linux/bug.h b/include/linux/bug.h index da4231c905c8..fe5916550da8 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -43,6 +43,8 @@ enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs); /* These are defined by the architecture */ int is_valid_bugaddr(unsigned long addr); +void generic_bug_clear_once(void); + #else /* !CONFIG_GENERIC_BUG */ static inline enum bug_trap_type report_bug(unsigned long bug_addr, @@ -51,6 +53,9 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr, return BUG_TRAP_TYPE_BUG; } + +static inline void generic_bug_clear_once(void) {} + #endif /* CONFIG_GENERIC_BUG */ /* diff --git a/kernel/panic.c b/kernel/panic.c index 672a91dc20fe..67cebf2a3b67 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -595,6 +595,7 @@ EXPORT_SYMBOL(warn_slowpath_null); static int clear_warn_once_set(void *data, u64 val) { + generic_bug_clear_once(); memset(__start_once, 0, __end_once - __start_once); return 0; } @@ -607,7 +608,7 @@ DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops, static __init int register_warn_debugfs(void) { /* Don't care about failure */ - debugfs_create_file("clear_warn_once", 0644, NULL, + debugfs_create_file("clear_warn_once", 0200, NULL, NULL, &clear_warn_once_fops); return 0; } diff --git a/lib/bug.c b/lib/bug.c index 1e094408c893..f66be6bf6206 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -196,3 +196,26 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) return BUG_TRAP_TYPE_BUG; } + +static void clear_once_table(struct bug_entry *start, struct bug_entry *end) +{ + struct bug_entry *bug; + + for (bug = start; bug < end; bug++) + bug->flags &= ~BUGFLAG_DONE; +} + +void generic_bug_clear_once(void) +{ +#ifdef CONFIG_MODULES + struct module *mod; + + rcu_read_lock_sched(); + list_for_each_entry_rcu(mod, &module_bug_list, bug_list) + clear_once_table(mod->bug_table, + mod->bug_table + mod->num_bugs); + rcu_read_unlock_sched(); +#endif + + clear_once_table(__start___bug_table, __stop___bug_table); +} From 1e6270d07cde9bfbc27f8d0f16b323cf3a3b63dd Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 17 Nov 2017 15:27:10 -0800 Subject: [PATCH 17/94] parse-maintainers: add ability to specify filenames parse-maintainers.pl is convenient, but currently hard-codes the filenames that are used. Allow user-specified filenames to simplify the use of the script. 
Link: http://lkml.kernel.org/r/48703c068b3235223ffa3b2eb268fa0a125b25e0.1502251549.git.joe@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/parse-maintainers.pl | 52 ++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/scripts/parse-maintainers.pl b/scripts/parse-maintainers.pl index 5dbd2faa2449..255cef1b098d 100644 --- a/scripts/parse-maintainers.pl +++ b/scripts/parse-maintainers.pl @@ -2,9 +2,44 @@ # SPDX-License-Identifier: GPL-2.0 use strict; +use Getopt::Long qw(:config no_auto_abbrev); + +my $input_file = "MAINTAINERS"; +my $output_file = "MAINTAINERS.new"; +my $output_section = "SECTION.new"; +my $help = 0; my $P = $0; +if (!GetOptions( + 'input=s' => \$input_file, + 'output=s' => \$output_file, + 'section=s' => \$output_section, + 'h|help|usage' => \$help, + )) { + die "$P: invalid argument - use --help if necessary\n"; +} + +if ($help != 0) { + usage(); + exit 0; +} + +sub usage { + print < + + --input => MAINTAINERS file to read (default: MAINTAINERS) + --output => sorted MAINTAINERS file to write (default: MAINTAINERS.new) + --section => new sorted MAINTAINERS file to write to (default: SECTION.new) + +If exist, then the sections that match the +regexes are not written to the output file but are written to the +section file. + +EOT +} + # sort comparison functions sub by_category($$) { my ($a, $b) = @_; @@ -56,13 +91,20 @@ sub trim { sub alpha_output { my ($hashref, $filename) = (@_); + return if ! scalar(keys %$hashref); + open(my $file, '>', "$filename") or die "$P: $filename: open failed - $!\n"; + my $separator; foreach my $key (sort by_category keys %$hashref) { if ($key eq " ") { - chomp $$hashref{$key}; print $file $$hashref{$key}; } else { - print $file "\n" . $key . "\n"; + if (! defined $separator) { + $separator = "\n"; + } else { + print $file $separator; + } + print $file $key . "\n"; foreach my $pattern (sort by_pattern split('\n', %$hashref{$key})) { print $file ($pattern . "\n"); } @@ -112,7 +154,7 @@ sub file_input { my %hash; my %new_hash; -file_input(\%hash, "MAINTAINERS"); +file_input(\%hash, $input_file); foreach my $type (@ARGV) { foreach my $key (keys %hash) { @@ -123,7 +165,7 @@ foreach my $type (@ARGV) { } } -alpha_output(\%hash, "MAINTAINERS.new"); -alpha_output(\%new_hash, "SECTION.new"); +alpha_output(\%hash, $output_file); +alpha_output(\%new_hash, $output_section); exit(0); From d15809f3649e2ee04713a9dba9aa7bd2c208ad82 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Nov 2017 15:27:13 -0800 Subject: [PATCH 18/94] iopoll: avoid -Wint-in-bool-context warning When we pass the result of a multiplication as the timeout or the delay, we can get a warning from gcc-7: drivers/mmc/host/bcm2835.c:596:149: error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] drivers/mfd/arizona-core.c:247:195: error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] drivers/gpu/drm/sun4i/sun4i_hdmi_i2c.c:49:27: error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] The warning is a bit questionable inside of a macro, but this is intentional on the side of the gcc developers. It is also an indication of another problem: we evaluate the timeout and sleep arguments multiple times, which can have undesired side-effects when those are complex expressions. This changes the two iopoll variants to use local variables for storing copies of the timeouts. 
This adds some more type safety, and avoids both the double-evaluation and the gcc warning. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81484 Link: http://lkml.kernel.org/r/20170726133756.2161367-1-arnd@arndb.de Link: http://lkml.kernel.org/r/20171102114048.1526955-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Reviewed-by: Mark Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/iopoll.h | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/include/linux/iopoll.h b/include/linux/iopoll.h index d29e1e21bf3f..b1d861caca16 100644 --- a/include/linux/iopoll.h +++ b/include/linux/iopoll.h @@ -42,18 +42,21 @@ */ #define readx_poll_timeout(op, addr, val, cond, sleep_us, timeout_us) \ ({ \ - ktime_t timeout = ktime_add_us(ktime_get(), timeout_us); \ - might_sleep_if(sleep_us); \ + u64 __timeout_us = (timeout_us); \ + unsigned long __sleep_us = (sleep_us); \ + ktime_t __timeout = ktime_add_us(ktime_get(), __timeout_us); \ + might_sleep_if((__sleep_us) != 0); \ for (;;) { \ (val) = op(addr); \ if (cond) \ break; \ - if (timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \ + if (__timeout_us && \ + ktime_compare(ktime_get(), __timeout) > 0) { \ (val) = op(addr); \ break; \ } \ - if (sleep_us) \ - usleep_range((sleep_us >> 2) + 1, sleep_us); \ + if (__sleep_us) \ + usleep_range((__sleep_us >> 2) + 1, __sleep_us); \ } \ (cond) ? 0 : -ETIMEDOUT; \ }) @@ -77,17 +80,20 @@ */ #define readx_poll_timeout_atomic(op, addr, val, cond, delay_us, timeout_us) \ ({ \ - ktime_t timeout = ktime_add_us(ktime_get(), timeout_us); \ + u64 __timeout_us = (timeout_us); \ + unsigned long __delay_us = (delay_us); \ + ktime_t __timeout = ktime_add_us(ktime_get(), __timeout_us); \ for (;;) { \ (val) = op(addr); \ if (cond) \ break; \ - if (timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \ + if (__timeout_us && \ + ktime_compare(ktime_get(), __timeout) > 0) { \ (val) = op(addr); \ break; \ } \ - if (delay_us) \ - udelay(delay_us); \ + if (__delay_us) \ + udelay(__delay_us); \ } \ (cond) ? 0 : -ETIMEDOUT; \ }) From d32f11ba281b7e203932c0a65ec1fb302493cbbe Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 17 Nov 2017 15:27:17 -0800 Subject: [PATCH 19/94] lkdtm: include WARN format string In order to test the ordering of WARN format strings, actually include one in LKDTM. Link: http://lkml.kernel.org/r/1510100869-73751-2-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Cc: Arnd Bergmann Cc: Fengguang Wu Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/lkdtm_bugs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/misc/lkdtm_bugs.c b/drivers/misc/lkdtm_bugs.c index b0f7af872bb5..7eebbdfbcacd 100644 --- a/drivers/misc/lkdtm_bugs.c +++ b/drivers/misc/lkdtm_bugs.c @@ -63,9 +63,11 @@ void lkdtm_BUG(void) BUG(); } +static int warn_counter; + void lkdtm_WARNING(void) { - WARN_ON(1); + WARN(1, "Warning message trigger count: %d\n", warn_counter++); } void lkdtm_EXCEPTION(void) From 2a8358d8a339540f00ec596526690e8eeca931a3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 17 Nov 2017 15:27:21 -0800 Subject: [PATCH 20/94] bug: define the "cut here" string in a single place The "cut here" string is used in a few paths. Define it in a single place. 
Link: http://lkml.kernel.org/r/1510100869-73751-3-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Cc: Arnd Bergmann Cc: Fengguang Wu Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mn10300/mm/fault.c | 2 +- include/asm-generic/bug.h | 2 ++ kernel/panic.c | 2 +- lib/bug.c | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index f23781d6bbb3..f0bfa1448744 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c @@ -60,7 +60,7 @@ void bust_spinlocks(int yes) void do_BUG(const char *file, int line) { bust_spinlocks(1); - printk(KERN_EMERG "------------[ cut here ]------------\n"); + printk(KERN_EMERG CUT_HERE); printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line); } diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 7844b0df88cd..1283473f234e 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -4,6 +4,8 @@ #include +#define CUT_HERE "------------[ cut here ]------------\n" + #ifdef CONFIG_GENERIC_BUG #define BUGFLAG_WARNING (1 << 0) #define BUGFLAG_ONCE (1 << 1) diff --git a/kernel/panic.c b/kernel/panic.c index 67cebf2a3b67..89df5fa2f798 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -520,7 +520,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, { disable_trace_on_warning(); - pr_warn("------------[ cut here ]------------\n"); + pr_warn(CUT_HERE); if (file) pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", diff --git a/lib/bug.c b/lib/bug.c index f66be6bf6206..c1b0fad31b10 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -186,7 +186,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) return BUG_TRAP_TYPE_WARN; } - printk(KERN_DEFAULT "------------[ cut here ]------------\n"); + printk(KERN_DEFAULT CUT_HERE); if (file) pr_crit("kernel BUG at %s:%u!\n", file, line); From a7bed27af194aa3f67915688039d93188ed95e2a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 17 Nov 2017 15:27:24 -0800 Subject: [PATCH 21/94] bug: fix "cut here" location for __WARN_TAINT architectures Prior to v4.11, x86 used warn_slowpath_fmt() for handling WARN()s. After WARN() was moved to using UD0 on x86, the warning text started appearing _before_ the "cut here" line. This appears to have been a long-standing bug on architectures that used __WARN_TAINT, but it didn't get fixed. v4.11 and earlier on x86: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 2956 at drivers/misc/lkdtm_bugs.c:65 lkdtm_WARNING+0x21/0x30 This is a warning message Modules linked in: v4.12 and later on x86: This is a warning message ------------[ cut here ]------------ WARNING: CPU: 1 PID: 2982 at drivers/misc/lkdtm_bugs.c:68 lkdtm_WARNING+0x15/0x20 Modules linked in: With this fix: ------------[ cut here ]------------ This is a warning message WARNING: CPU: 3 PID: 3009 at drivers/misc/lkdtm_bugs.c:67 lkdtm_WARNING+0x15/0x20 Since the __FILE__ reporting happens as part of the UD0 handler, it isn't trivial to move the message to after the WARNING line, but at least we can fix the position of the "cut here" line so all the various logging tools will start including the actual runtime warning message again, when they follow the instruction and "cut here". 
Link: http://lkml.kernel.org/r/1510100869-73751-4-git-send-email-keescook@chromium.org Fixes: 9a93848fe787 ("x86/debug: Implement __WARN() using UD0") Signed-off-by: Kees Cook Cc: Peter Zijlstra (Intel) Cc: Josh Poimboeuf Cc: Fengguang Wu Cc: Arnd Bergmann Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 5 +++-- kernel/panic.c | 16 +++++++++++++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 1283473f234e..963b755d19b0 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -92,10 +92,11 @@ extern void warn_slowpath_null(const char *file, const int line); #define __WARN_printf_taint(taint, arg...) \ warn_slowpath_fmt_taint(__FILE__, __LINE__, taint, arg) #else +extern __printf(1, 2) void __warn_printk(const char *fmt, ...); #define __WARN() __WARN_TAINT(TAINT_WARN) -#define __WARN_printf(arg...) do { printk(arg); __WARN(); } while (0) +#define __WARN_printf(arg...) do { __warn_printk(arg); __WARN(); } while (0) #define __WARN_printf_taint(taint, arg...) \ - do { printk(arg); __WARN_TAINT(taint); } while (0) + do { __warn_printk(arg); __WARN_TAINT(taint); } while (0) #endif /* used internally by panic.c */ diff --git a/kernel/panic.c b/kernel/panic.c index 89df5fa2f798..3242b64b1956 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -520,7 +520,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, { disable_trace_on_warning(); - pr_warn(CUT_HERE); + if (args) + pr_warn(CUT_HERE); if (file) pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", @@ -584,9 +585,22 @@ EXPORT_SYMBOL(warn_slowpath_fmt_taint); void warn_slowpath_null(const char *file, int line) { + pr_warn(CUT_HERE); __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL); } EXPORT_SYMBOL(warn_slowpath_null); +#else +void __warn_printk(const char *fmt, ...) +{ + va_list args; + + pr_warn(CUT_HERE); + + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); +} +EXPORT_SYMBOL(__warn_printk); #endif #ifdef CONFIG_BUG From 4ca59b14e588f873795a11cdc77a25c686a29d23 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 17 Nov 2017 15:27:28 -0800 Subject: [PATCH 22/94] include/linux/compiler-clang.h: handle randomizable anonymous structs The GCC randomize layout plugin can randomize the member offsets of sensitive kernel data structures. To use this feature, certain annotations and members are added to the structures which affect the member offsets even if this plugin is not used. All of these structures are completely randomized, except for task_struct which leaves out some of its members. All the other members are wrapped within an anonymous struct with the __randomize_layout attribute. This is done using the randomized_struct_fields_start and randomized_struct_fields_end defines. When the plugin is disabled, the behaviour of this attribute can vary based on the GCC version. For GCC 5.1+, this attribute maps to __designated_init otherwise it is just an empty define but the anonymous structure is still present. For other compilers, both randomized_struct_fields_start and randomized_struct_fields_end default to empty defines meaning the anonymous structure is not introduced at all. So, if a module compiled with Clang, such as a BPF program, needs to access task_struct fields such as pid and comm, the offsets of these members as recognized by Clang are different from those recognized by modules compiled with GCC. 
If GCC 4.6+ is used to build the kernel, this can be solved by introducing appropriate defines for Clang so that the anonymous structure is seen when determining the offsets for the members. Link: http://lkml.kernel.org/r/20171109064645.25581-1-sandipan@linux.vnet.ibm.com Signed-off-by: Sandipan Das Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Kate Stewart Cc: Thomas Gleixner Cc: Naveen N. Rao Cc: Alexei Starovoitov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-clang.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index a06583e41f80..3b609edffa8f 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -16,3 +16,6 @@ * with any version that can compile the kernel */ #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) + +#define randomized_struct_fields_start struct { +#define randomized_struct_fields_end }; From 8c703d660450c4df72ff24f79a335dc7973a9dc8 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 17 Nov 2017 15:27:32 -0800 Subject: [PATCH 23/94] kernel/umh.c: optimize 'proc_cap_handler()' If 'write' is 0, we can avoid a call to spin_lock/spin_unlock. Link: http://lkml.kernel.org/r/20171020193331.7233-1-christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Acked-by: Luis R. Rodriguez Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/umh.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/umh.c b/kernel/umh.c index 6ff9905250ff..18e5fa4b0e71 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -537,14 +537,14 @@ static int proc_cap_handler(struct ctl_table *table, int write, /* * Drop everything not in the new_cap (but don't add things) */ - spin_lock(&umh_sysctl_lock); if (write) { + spin_lock(&umh_sysctl_lock); if (table->data == CAP_BSET) usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); if (table->data == CAP_PI) usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); + spin_unlock(&umh_sysctl_lock); } - spin_unlock(&umh_sysctl_lock); return 0; } From 1f3c790bd5989fcfec9e53ad8fa09f5b740c958f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Nov 2017 15:27:35 -0800 Subject: [PATCH 24/94] dynamic-debug-howto: fix optional/omitted ending line number to be LARGE instead of 0 line-range is supposed to treat "1-" as "1-endoffile", so handle the special case by setting last_lineno to UINT_MAX. 
Fixes this error: dynamic_debug:ddebug_parse_query: last-line:0 < 1st-line:1 dynamic_debug:ddebug_exec_query: query parse failed Link: http://lkml.kernel.org/r/10a6a101-e2be-209f-1f41-54637824788e@infradead.org Signed-off-by: Randy Dunlap Acked-by: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/dynamic_debug.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index da796e2dc4f5..c7c96bc7654a 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -360,6 +360,10 @@ static int ddebug_parse_query(char *words[], int nwords, if (parse_lineno(last, &query->last_lineno) < 0) return -EINVAL; + /* special case for last lineno not specified */ + if (query->last_lineno == 0) + query->last_lineno = UINT_MAX; + if (query->last_lineno < query->first_lineno) { pr_err("last-line:%d < 1st-line:%d\n", query->last_lineno, From 8c188759fb5788579b5975d5a8c9f529e25c2171 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Nov 2017 15:27:39 -0800 Subject: [PATCH 25/94] dynamic_debug documentation: minor fixes Fix minor typo. Fix missing words in explaining parsing of last line number. Link: http://lkml.kernel.org/r/ebb7ff42-4945-103f-d5b4-f07a6f3343a7@infradead.org Signed-off-by: Randy Dunlap Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/dynamic-debug-howto.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/dynamic-debug-howto.rst b/Documentation/admin-guide/dynamic-debug-howto.rst index 12278a926370..fdf72429f801 100644 --- a/Documentation/admin-guide/dynamic-debug-howto.rst +++ b/Documentation/admin-guide/dynamic-debug-howto.rst @@ -18,7 +18,7 @@ shortcut for ``print_hex_dump(KERN_DEBUG)``. For ``print_hex_dump_debug()``/``print_hex_dump_bytes()``, format string is its ``prefix_str`` argument, if it is constant string; or ``hexdump`` -in case ``prefix_str`` is build dynamically. +in case ``prefix_str`` is built dynamically. Dynamic debug has even more useful features: @@ -197,8 +197,8 @@ line line number matches the callsite line number exactly. A range of line numbers matches any callsite between the first and last line number inclusive. An empty first number means - the first line in the file, an empty line number means the - last number in the file. Examples:: + the first line in the file, an empty last line number means the + last line number in the file. Examples:: line 1603 // exactly line 1603 line 1600-1605 // the six lines from line 1600 to line 1605 From e1f7590488541845d16afd6003d31773b9155270 Mon Sep 17 00:00:00 2001 From: Tom Saeger Date: Fri, 17 Nov 2017 15:27:42 -0800 Subject: [PATCH 26/94] get_maintainer: add --self-test for internal consistency tests Add "--self-test" option to get_maintainer.pl to show potential issues in MAINTAINERS file(s) content. Pattern check warnings are shown for "F" and "X" patterns found in MAINTAINERS file(s) which do not match any files known by git. 
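As a usage sketch, running ./scripts/get_maintainer.pl --self-test from the top of a kernel tree walks all MAINTAINERS content and prints warnings of the form "MAINTAINERS:1234: warning: no matches F: drivers/foo/" (file, line number and pattern invented here for illustration; the exact wording comes from the print() calls in the patch below).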
Link: http://lkml.kernel.org/r/64994f911b3510d0f4c8ac2e113501dfcec1f3c9.1509559540.git.tom.saeger@oracle.com Signed-off-by: Tom Saeger Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 94 ++++++++++++++++++++++++++++++++------- 1 file changed, 77 insertions(+), 17 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index bc443201d3ef..c68a5d1ba709 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -57,6 +57,7 @@ my $sections = 0; my $file_emails = 0; my $from_filename = 0; my $pattern_depth = 0; +my $self_test = 0; my $version = 0; my $help = 0; my $find_maintainer_files = 0; @@ -138,6 +139,7 @@ my %VCS_cmds_git = ( "subject_pattern" => "^GitSubject: (.*)", "stat_pattern" => "^(\\d+)\\t(\\d+)\\t\$file\$", "file_exists_cmd" => "git ls-files \$file", + "list_files_cmd" => "git ls-files \$file", ); my %VCS_cmds_hg = ( @@ -167,6 +169,7 @@ my %VCS_cmds_hg = ( "subject_pattern" => "^HgSubject: (.*)", "stat_pattern" => "^(\\d+)\t(\\d+)\t\$file\$", "file_exists_cmd" => "hg files \$file", + "list_files_cmd" => "hg manifest -R \$file", ); my $conf = which_conf(".get_maintainer.conf"); @@ -216,6 +219,14 @@ if (-f $ignore_file) { close($ignore); } +if ($#ARGV > 0) { + foreach (@ARGV) { + if ($_ eq "-self-test" || $_ eq "--self-test") { + die "$P: using --self-test does not allow any other option or argument\n"; + } + } +} + if (!GetOptions( 'email!' => \$email, 'git!' => \$email_git, @@ -252,6 +263,7 @@ if (!GetOptions( 'fe|file-emails!' => \$file_emails, 'f|file' => \$from_filename, 'find-maintainer-files' => \$find_maintainer_files, + 'self-test' => \$self_test, 'v|version' => \$version, 'h|help|usage' => \$help, )) { @@ -268,6 +280,12 @@ if ($version != 0) { exit 0; } +if ($self_test) { + read_all_maintainer_files(); + check_maintainers_patterns(); + exit 0; +} + if (-t STDIN && !@ARGV) { # We're talking to a terminal, but have no command line arguments. 
die "$P: missing patchfile or -f file - use --help if necessary\n"; @@ -311,12 +329,14 @@ if (!top_of_kernel_tree($lk_path)) { my @typevalue = (); my %keyword_hash; my @mfiles = (); +my @self_test_pattern_info = (); sub read_maintainer_file { my ($file) = @_; open (my $maint, '<', "$file") or die "$P: Can't open MAINTAINERS file '$file': $!\n"; + my $i = 1; while (<$maint>) { my $line = $_; @@ -333,6 +353,9 @@ sub read_maintainer_file { if ((-d $value)) { $value =~ s@([^/])$@$1/@; } + if ($self_test) { + push(@self_test_pattern_info, {file=>$file, line=>$line, linenr=>$i, pat=>$value}); + } } elsif ($type eq "K") { $keyword_hash{@typevalue} = $value; } @@ -341,6 +364,7 @@ sub read_maintainer_file { $line =~ s/\n$//g; push(@typevalue, $line); } + $i++; } close($maint); } @@ -357,26 +381,30 @@ sub find_ignore_git { return grep { $_ !~ /^\.git$/; } @_; } -if (-d "${lk_path}MAINTAINERS") { - opendir(DIR, "${lk_path}MAINTAINERS") or die $!; - my @files = readdir(DIR); - closedir(DIR); - foreach my $file (@files) { - push(@mfiles, "${lk_path}MAINTAINERS/$file") if ($file !~ /^\./); +read_all_maintainer_files(); + +sub read_all_maintainer_files { + if (-d "${lk_path}MAINTAINERS") { + opendir(DIR, "${lk_path}MAINTAINERS") or die $!; + my @files = readdir(DIR); + closedir(DIR); + foreach my $file (@files) { + push(@mfiles, "${lk_path}MAINTAINERS/$file") if ($file !~ /^\./); + } } -} -if ($find_maintainer_files) { - find( { wanted => \&find_is_maintainer_file, - preprocess => \&find_ignore_git, - no_chdir => 1, - }, "${lk_path}"); -} else { - push(@mfiles, "${lk_path}MAINTAINERS") if -f "${lk_path}MAINTAINERS"; -} + if ($find_maintainer_files) { + find( { wanted => \&find_is_maintainer_file, + preprocess => \&find_ignore_git, + no_chdir => 1, + }, "${lk_path}"); + } else { + push(@mfiles, "${lk_path}MAINTAINERS") if -f "${lk_path}MAINTAINERS"; + } -foreach my $file (@mfiles) { - read_maintainer_file("$file"); + foreach my $file (@mfiles) { + read_maintainer_file("$file"); + } } # @@ -586,6 +614,20 @@ if ($web) { exit($exit); +sub check_maintainers_patterns { + my @lsfiles = (); + + @lsfiles = vcs_list_files($lk_path); + + for my $x (@self_test_pattern_info) { + if (!grep(m@^$x->{pat}@, @lsfiles)) { + my $line = $x->{line}; + chomp($line); + print("$x->{file}:$x->{linenr}: warning: no matches $line\n"); + } + } +} + sub ignore_email_address { my ($address) = @_; @@ -863,6 +905,7 @@ Other options: --sections => print all of the subsystem sections with pattern matches --letters => print all matching 'letter' types from all matching sections --mailmap => use .mailmap file (default: $email_use_mailmap) + --self-test => show potential issues with MAINTAINERS file content --version => show version --help => show this help information @@ -2192,6 +2235,23 @@ sub vcs_file_exists { return $exists; } +sub vcs_list_files { + my ($file) = @_; + + my @lsfiles = (); + + my $vcs_used = vcs_exists(); + return 0 if (!$vcs_used); + + my $cmd = $VCS_cmds{"list_files_cmd"}; + $cmd =~ s/(\$\w+)/$1/eeg; # interpolate $cmd + @lsfiles = &{$VCS_cmds{"execute_cmd"}}($cmd); + + return () if ($? != 0); + + return @lsfiles; +} + sub uniq { my (@parms) = @_; From 083bf9c56d067559bd2727e1ec6ffb67d0ff53be Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 17 Nov 2017 15:27:46 -0800 Subject: [PATCH 27/94] get_maintainer: add more --self-test options Add tests for duplicate section headers, missing section content, link and scm reachability. 
Miscellanea: o Add --self-test= options (a comma separated list of any of sections, patterns, links or scm) where the default without options is all tests o Rename check_maintainers_patterns to self_test o Rename self_test_pattern_info to self_test_info [tom.saeger@oracle.com: improvements] Link: http://lkml.kernel.org/r/13e3986c374902fcf08ae947e36c5c608bbe3b79.1510075301.git.joe@perches.com Signed-off-by: Joe Perches Reviewed-by: Tom Saeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 149 +++++++++++++++++++++++++++++++++----- 1 file changed, 132 insertions(+), 17 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index c68a5d1ba709..99c96e86eccb 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -57,7 +57,7 @@ my $sections = 0; my $file_emails = 0; my $from_filename = 0; my $pattern_depth = 0; -my $self_test = 0; +my $self_test = undef; my $version = 0; my $help = 0; my $find_maintainer_files = 0; @@ -221,7 +221,7 @@ if (-f $ignore_file) { if ($#ARGV > 0) { foreach (@ARGV) { - if ($_ eq "-self-test" || $_ eq "--self-test") { + if ($_ =~ /^-{1,2}self-test(?:=|$)/) { die "$P: using --self-test does not allow any other option or argument\n"; } } @@ -263,7 +263,7 @@ if (!GetOptions( 'fe|file-emails!' => \$file_emails, 'f|file' => \$from_filename, 'find-maintainer-files' => \$find_maintainer_files, - 'self-test' => \$self_test, + 'self-test:s' => \$self_test, 'v|version' => \$version, 'h|help|usage' => \$help, )) { @@ -280,9 +280,9 @@ if ($version != 0) { exit 0; } -if ($self_test) { +if (defined $self_test) { read_all_maintainer_files(); - check_maintainers_patterns(); + self_test(); exit 0; } @@ -329,7 +329,7 @@ if (!top_of_kernel_tree($lk_path)) { my @typevalue = (); my %keyword_hash; my @mfiles = (); -my @self_test_pattern_info = (); +my @self_test_info = (); sub read_maintainer_file { my ($file) = @_; @@ -339,6 +339,7 @@ sub read_maintainer_file { my $i = 1; while (<$maint>) { my $line = $_; + chomp $line; if ($line =~ m/^([A-Z]):\s*(.*)/) { my $type = $1; @@ -353,17 +354,16 @@ sub read_maintainer_file { if ((-d $value)) { $value =~ s@([^/])$@$1/@; } - if ($self_test) { - push(@self_test_pattern_info, {file=>$file, line=>$line, linenr=>$i, pat=>$value}); - } } elsif ($type eq "K") { $keyword_hash{@typevalue} = $value; } push(@typevalue, "$type:$value"); } elsif (!(/^\s*$/ || /^\s*\#/)) { - $line =~ s/\n$//g; push(@typevalue, $line); } + if (defined $self_test) { + push(@self_test_info, {file=>$file, linenr=>$i, line=>$line}); + } $i++; } close($maint); @@ -614,17 +614,132 @@ if ($web) { exit($exit); -sub check_maintainers_patterns { +sub self_test { my @lsfiles = (); + my @good_links = (); + my @bad_links = (); + my @section_headers = (); + my $index = 0; @lsfiles = vcs_list_files($lk_path); - for my $x (@self_test_pattern_info) { - if (!grep(m@^$x->{pat}@, @lsfiles)) { - my $line = $x->{line}; - chomp($line); - print("$x->{file}:$x->{linenr}: warning: no matches $line\n"); - } + for my $x (@self_test_info) { + $index++; + + ## Section header duplication and missing section content + if (($self_test eq "" || $self_test =~ /\bsections\b/) && + $x->{line} =~ /^\S[^:]/ && + defined $self_test_info[$index] && + $self_test_info[$index]->{line} =~ /^([A-Z]):\s*\S/) { + my $has_S = 0; + my $has_F = 0; + my $has_ML = 0; + my $status = ""; + if (grep(m@^\Q$x->{line}\E@, @section_headers)) { + print("$x->{file}:$x->{linenr}: warning: duplicate section header\t$x->{line}\n"); + } else { + 
push(@section_headers, $x->{line}); + } + my $nextline = $index; + while (defined $self_test_info[$nextline] && + $self_test_info[$nextline]->{line} =~ /^([A-Z]):\s*(\S.*)/) { + my $type = $1; + my $value = $2; + if ($type eq "S") { + $has_S = 1; + $status = $value; + } elsif ($type eq "F" || $type eq "N") { + $has_F = 1; + } elsif ($type eq "M" || $type eq "R" || $type eq "L") { + $has_ML = 1; + } + $nextline++; + } + if (!$has_ML && $status !~ /orphan|obsolete/i) { + print("$x->{file}:$x->{linenr}: warning: section without email address\t$x->{line}\n"); + } + if (!$has_S) { + print("$x->{file}:$x->{linenr}: warning: section without status \t$x->{line}\n"); + } + if (!$has_F) { + print("$x->{file}:$x->{linenr}: warning: section without file pattern\t$x->{line}\n"); + } + } + + next if ($x->{line} !~ /^([A-Z]):\s*(.*)/); + + my $type = $1; + my $value = $2; + + ## Filename pattern matching + if (($type eq "F" || $type eq "X") && + ($self_test eq "" || $self_test =~ /\bpatterns\b/)) { + $value =~ s@\.@\\\.@g; ##Convert . to \. + $value =~ s/\*/\.\*/g; ##Convert * to .* + $value =~ s/\?/\./g; ##Convert ? to . + ##if pattern is a directory and it lacks a trailing slash, add one + if ((-d $value)) { + $value =~ s@([^/])$@$1/@; + } + if (!grep(m@^$value@, @lsfiles)) { + print("$x->{file}:$x->{linenr}: warning: no file matches\t$x->{line}\n"); + } + + ## Link reachability + } elsif (($type eq "W" || $type eq "Q" || $type eq "B") && + $value =~ /^https?:/ && + ($self_test eq "" || $self_test =~ /\blinks\b/)) { + next if (grep(m@^\Q$value\E$@, @good_links)); + my $isbad = 0; + if (grep(m@^\Q$value\E$@, @bad_links)) { + $isbad = 1; + } else { + my $output = `wget --spider -q --no-check-certificate --timeout 10 --tries 1 $value`; + if ($? == 0) { + push(@good_links, $value); + } else { + push(@bad_links, $value); + $isbad = 1; + } + } + if ($isbad) { + print("$x->{file}:$x->{linenr}: warning: possible bad link\t$x->{line}\n"); + } + + ## SCM reachability + } elsif ($type eq "T" && + ($self_test eq "" || $self_test =~ /\bscm\b/)) { + next if (grep(m@^\Q$value\E$@, @good_links)); + my $isbad = 0; + if (grep(m@^\Q$value\E$@, @bad_links)) { + $isbad = 1; + } elsif ($value !~ /^(?:git|quilt|hg)\s+\S/) { + print("$x->{file}:$x->{linenr}: warning: malformed entry\t$x->{line}\n"); + } elsif ($value =~ /^git\s+(\S+)(\s+([^\(]+\S+))?/) { + my $url = $1; + my $branch = ""; + $branch = $3 if $3; + my $output = `git ls-remote --exit-code -h "$url" $branch > /dev/null 2>&1`; + if ($? == 0) { + push(@good_links, $value); + } else { + push(@bad_links, $value); + $isbad = 1; + } + } elsif ($value =~ /^(?:quilt|hg)\s+(https?:\S+)/) { + my $url = $1; + my $output = `wget --spider -q --no-check-certificate --timeout 10 --tries 1 $url`; + if ($? == 0) { + push(@good_links, $value); + } else { + push(@bad_links, $value); + $isbad = 1; + } + } + if ($isbad) { + print("$x->{file}:$x->{linenr}: warning: possible bad link\t$x->{line}\n"); + } + } } } From 8001541cc333b1a3c2c38e3b34475fa446b053da Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 17 Nov 2017 15:27:49 -0800 Subject: [PATCH 28/94] include/linux/bitfield.h: include <linux/build_bug.h> instead of <linux/bug.h> Since commit bc6245e5efd7 ("bug: split BUILD_BUG stuff out into <linux/build_bug.h>"), #include <linux/build_bug.h> is better to pull in the minimal headers needed for the BUILD_BUG() family.
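For context, the compile-time machinery bitfield.h actually needs is only the BUILD_BUG() family used by its field checks, which is why the one-line include swap below suffices; a brief sketch of those macros in use (the register layout is invented for illustration):

	#include <linux/bitfield.h>

	#define STATUS_MODE	GENMASK(7, 4)

	u32 reg = FIELD_PREP(STATUS_MODE, 5);	/* mask sanity checked at compile time via BUILD_BUG_ON_MSG() */
	u32 mode = FIELD_GET(STATUS_MODE, reg);	/* mode == 5 */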
Link: http://lkml.kernel.org/r/1505700775-19826-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Jakub Kicinski Cc: Dinan Gunawardena Cc: Kalle Valo Cc: Ian Abbott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitfield.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h index f2deb71958b2..1030651f8309 100644 --- a/include/linux/bitfield.h +++ b/include/linux/bitfield.h @@ -15,7 +15,7 @@ #ifndef _LINUX_BITFIELD_H #define _LINUX_BITFIELD_H -#include <linux/bug.h> +#include <linux/build_bug.h> /* * Bitfield access macros From f5bba9d11a256ad2a1c2f8e7fc6aabe6416b7890 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 17 Nov 2017 15:27:53 -0800 Subject: [PATCH 29/94] include/linux/radix-tree.h: remove unneeded #include <linux/bug.h> This include was added by commit 187f1882b5b0 ("BUG: headers with BUG/BUG_ON etc. need linux/bug.h") because BUG_ON() was used in this header at that time. Some time later, commit 6d75f366b924 ("lib: radix-tree: check accounting of existing slot replacement users") removed the use of BUG_ON() from this header. Since then, there is no reason to include <linux/bug.h>. Link: http://lkml.kernel.org/r/1505660151-4383-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Cc: Matthew Wilcox Cc: Masahiro Yamada Cc: Jan Kara Cc: Johannes Weiner Cc: Chris Mi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 0ca448c1cb42..23a9c89c7ad9 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -22,7 +22,6 @@ #define _LINUX_RADIX_TREE_H #include <linux/bitops.h> -#include <linux/bug.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/preempt.h> From d6b28e0996962aeadd3777ae565ae03dd5c59f18 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 17 Nov 2017 15:27:56 -0800 Subject: [PATCH 30/94] lib: add module support to string tests Extract the string test code into its own source file, to allow compiling it either as a loadable module or built into the kernel.
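With the symbol now tristate, CONFIG_STRING_SELFTEST=m builds the code as lib/test_string.ko, so the memset16/32/64 selftests can be run on demand (for example via modprobe test_string) instead of only at boot; =y preserves the old built-in behaviour.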
Fixes: 03270c13c5ffaa6a ("lib/string.c: add testcases for memset16/32/64") Link: http://lkml.kernel.org/r/1505397744-3387-1-git-send-email-geert@linux-m68k.org Signed-off-by: Geert Uytterhoeven Cc: Matthew Wilcox Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig | 2 +- lib/Makefile | 1 + lib/string.c | 141 ---------------------------------------------- lib/test_string.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 143 insertions(+), 142 deletions(-) create mode 100644 lib/test_string.c diff --git a/lib/Kconfig b/lib/Kconfig index a2b6745324ab..368972f0db78 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -584,7 +584,7 @@ config PRIME_NUMBERS tristate config STRING_SELFTEST - bool "Test string functions" + tristate "Test string functions" endmenu diff --git a/lib/Makefile b/lib/Makefile index 136a0b254564..e3cb2a241338 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -40,6 +40,7 @@ obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \ bsearch.o find_bit.o llist.o memweight.o kfifo.o \ percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \ once.o refcount.o usercopy.o errseq.o +obj-$(CONFIG_STRING_SELFTEST) += test_string.o obj-y += string_helpers.o obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o obj-y += hexdump.o diff --git a/lib/string.c b/lib/string.c index 5e8d410a93df..64a9e33f1daa 100644 --- a/lib/string.c +++ b/lib/string.c @@ -1052,144 +1052,3 @@ void fortify_panic(const char *name) BUG(); } EXPORT_SYMBOL(fortify_panic); - -#ifdef CONFIG_STRING_SELFTEST -#include -#include - -static __init int memset16_selftest(void) -{ - unsigned i, j, k; - u16 v, *p; - - p = kmalloc(256 * 2 * 2, GFP_KERNEL); - if (!p) - return -1; - - for (i = 0; i < 256; i++) { - for (j = 0; j < 256; j++) { - memset(p, 0xa1, 256 * 2 * sizeof(v)); - memset16(p + i, 0xb1b2, j); - for (k = 0; k < 512; k++) { - v = p[k]; - if (k < i) { - if (v != 0xa1a1) - goto fail; - } else if (k < i + j) { - if (v != 0xb1b2) - goto fail; - } else { - if (v != 0xa1a1) - goto fail; - } - } - } - } - -fail: - kfree(p); - if (i < 256) - return (i << 24) | (j << 16) | k; - return 0; -} - -static __init int memset32_selftest(void) -{ - unsigned i, j, k; - u32 v, *p; - - p = kmalloc(256 * 2 * 4, GFP_KERNEL); - if (!p) - return -1; - - for (i = 0; i < 256; i++) { - for (j = 0; j < 256; j++) { - memset(p, 0xa1, 256 * 2 * sizeof(v)); - memset32(p + i, 0xb1b2b3b4, j); - for (k = 0; k < 512; k++) { - v = p[k]; - if (k < i) { - if (v != 0xa1a1a1a1) - goto fail; - } else if (k < i + j) { - if (v != 0xb1b2b3b4) - goto fail; - } else { - if (v != 0xa1a1a1a1) - goto fail; - } - } - } - } - -fail: - kfree(p); - if (i < 256) - return (i << 24) | (j << 16) | k; - return 0; -} - -static __init int memset64_selftest(void) -{ - unsigned i, j, k; - u64 v, *p; - - p = kmalloc(256 * 2 * 8, GFP_KERNEL); - if (!p) - return -1; - - for (i = 0; i < 256; i++) { - for (j = 0; j < 256; j++) { - memset(p, 0xa1, 256 * 2 * sizeof(v)); - memset64(p + i, 0xb1b2b3b4b5b6b7b8ULL, j); - for (k = 0; k < 512; k++) { - v = p[k]; - if (k < i) { - if (v != 0xa1a1a1a1a1a1a1a1ULL) - goto fail; - } else if (k < i + j) { - if (v != 0xb1b2b3b4b5b6b7b8ULL) - goto fail; - } else { - if (v != 0xa1a1a1a1a1a1a1a1ULL) - goto fail; - } - } - } - } - -fail: - kfree(p); - if (i < 256) - return (i << 24) | (j << 16) | k; - return 0; -} - -static __init int string_selftest_init(void) -{ - int test, subtest; - - test = 1; - subtest = memset16_selftest(); - if (subtest) - goto fail; - - test = 2; - 
subtest = memset32_selftest(); - if (subtest) - goto fail; - - test = 3; - subtest = memset64_selftest(); - if (subtest) - goto fail; - - pr_info("String selftests succeeded\n"); - return 0; -fail: - pr_crit("String selftest failure %d.%08x\n", test, subtest); - return 0; -} - -module_init(string_selftest_init); -#endif /* CONFIG_STRING_SELFTEST */ diff --git a/lib/test_string.c b/lib/test_string.c new file mode 100644 index 000000000000..0fcdb82dca86 --- /dev/null +++ b/lib/test_string.c @@ -0,0 +1,141 @@ +#include +#include +#include +#include + +static __init int memset16_selftest(void) +{ + unsigned i, j, k; + u16 v, *p; + + p = kmalloc(256 * 2 * 2, GFP_KERNEL); + if (!p) + return -1; + + for (i = 0; i < 256; i++) { + for (j = 0; j < 256; j++) { + memset(p, 0xa1, 256 * 2 * sizeof(v)); + memset16(p + i, 0xb1b2, j); + for (k = 0; k < 512; k++) { + v = p[k]; + if (k < i) { + if (v != 0xa1a1) + goto fail; + } else if (k < i + j) { + if (v != 0xb1b2) + goto fail; + } else { + if (v != 0xa1a1) + goto fail; + } + } + } + } + +fail: + kfree(p); + if (i < 256) + return (i << 24) | (j << 16) | k; + return 0; +} + +static __init int memset32_selftest(void) +{ + unsigned i, j, k; + u32 v, *p; + + p = kmalloc(256 * 2 * 4, GFP_KERNEL); + if (!p) + return -1; + + for (i = 0; i < 256; i++) { + for (j = 0; j < 256; j++) { + memset(p, 0xa1, 256 * 2 * sizeof(v)); + memset32(p + i, 0xb1b2b3b4, j); + for (k = 0; k < 512; k++) { + v = p[k]; + if (k < i) { + if (v != 0xa1a1a1a1) + goto fail; + } else if (k < i + j) { + if (v != 0xb1b2b3b4) + goto fail; + } else { + if (v != 0xa1a1a1a1) + goto fail; + } + } + } + } + +fail: + kfree(p); + if (i < 256) + return (i << 24) | (j << 16) | k; + return 0; +} + +static __init int memset64_selftest(void) +{ + unsigned i, j, k; + u64 v, *p; + + p = kmalloc(256 * 2 * 8, GFP_KERNEL); + if (!p) + return -1; + + for (i = 0; i < 256; i++) { + for (j = 0; j < 256; j++) { + memset(p, 0xa1, 256 * 2 * sizeof(v)); + memset64(p + i, 0xb1b2b3b4b5b6b7b8ULL, j); + for (k = 0; k < 512; k++) { + v = p[k]; + if (k < i) { + if (v != 0xa1a1a1a1a1a1a1a1ULL) + goto fail; + } else if (k < i + j) { + if (v != 0xb1b2b3b4b5b6b7b8ULL) + goto fail; + } else { + if (v != 0xa1a1a1a1a1a1a1a1ULL) + goto fail; + } + } + } + } + +fail: + kfree(p); + if (i < 256) + return (i << 24) | (j << 16) | k; + return 0; +} + +static __init int string_selftest_init(void) +{ + int test, subtest; + + test = 1; + subtest = memset16_selftest(); + if (subtest) + goto fail; + + test = 2; + subtest = memset32_selftest(); + if (subtest) + goto fail; + + test = 3; + subtest = memset64_selftest(); + if (subtest) + goto fail; + + pr_info("String selftests succeeded\n"); + return 0; +fail: + pr_crit("String selftest failure %d.%08x\n", test, subtest); + return 0; +} + +module_init(string_selftest_init); +MODULE_LICENSE("GPL v2"); From dc2bf000a2848cf1dee373db14ce2cd1fe3ee394 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Fri, 17 Nov 2017 15:28:00 -0800 Subject: [PATCH 31/94] lib/test: delete five error messages for failed memory allocations Omit extra messages for a memory allocation failure in these functions. This issue was detected by using the Coccinelle software. 
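The removed messages carry no extra information: unless a caller passes __GFP_NOWARN, the allocators already emit a warning and a backtrace on failure, so a bare NULL check is the preferred idiom. A minimal sketch (structure name invented):

	p = kmalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;	/* the allocator has already logged the failure */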
Link: http://lkml.kernel.org/r/410a4c5a-4ee0-6fcc-969c-103d8e496b78@users.sourceforge.net Signed-off-by: Markus Elfring Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 5 ++--- lib/test_kmod.c | 8 ++------ lib/test_list_sort.c | 9 +++------ 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index a25c9763fce1..ef1a3ac1397e 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -353,10 +353,9 @@ static noinline void __init memcg_accounted_kmem_cache(void) */ for (i = 0; i < 5; i++) { p = kmem_cache_alloc(cache, GFP_KERNEL); - if (!p) { - pr_err("Allocation failed\n"); + if (!p) goto free_cache; - } + kmem_cache_free(cache, p); msleep(100); } diff --git a/lib/test_kmod.c b/lib/test_kmod.c index fba78d25e825..337f408b4de6 100644 --- a/lib/test_kmod.c +++ b/lib/test_kmod.c @@ -783,10 +783,8 @@ static int kmod_config_sync_info(struct kmod_test_device *test_dev) free_test_dev_info(test_dev); test_dev->info = vzalloc(config->num_threads * sizeof(struct kmod_test_device_info)); - if (!test_dev->info) { - dev_err(test_dev->dev, "Cannot alloc test_dev info\n"); + if (!test_dev->info) return -ENOMEM; - } return 0; } @@ -1089,10 +1087,8 @@ static struct kmod_test_device *alloc_test_dev_kmod(int idx) struct miscdevice *misc_dev; test_dev = vzalloc(sizeof(struct kmod_test_device)); - if (!test_dev) { - pr_err("Cannot alloc test_dev\n"); + if (!test_dev) goto err_out; - } mutex_init(&test_dev->config_mutex); mutex_init(&test_dev->trigger_mutex); diff --git a/lib/test_list_sort.c b/lib/test_list_sort.c index 28e817387b04..5474f3f3e41d 100644 --- a/lib/test_list_sort.c +++ b/lib/test_list_sort.c @@ -76,17 +76,14 @@ static int __init list_sort_test(void) pr_debug("start testing list_sort()\n"); elts = kcalloc(TEST_LIST_LEN, sizeof(*elts), GFP_KERNEL); - if (!elts) { - pr_err("error: cannot allocate memory\n"); + if (!elts) return err; - } for (i = 0; i < TEST_LIST_LEN; i++) { el = kmalloc(sizeof(*el), GFP_KERNEL); - if (!el) { - pr_err("error: cannot allocate memory\n"); + if (!el) goto exit; - } + /* force some equivalencies */ el->value = prandom_u32() % (TEST_LIST_LEN / 3); el->serial = i; From 3f3295709edea6268ff1609855f498035286af73 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Nov 2017 15:28:04 -0800 Subject: [PATCH 32/94] lib/int_sqrt: optimize small argument The current int_sqrt() computation is sub-optimal for the case of small @x. Which is the interesting case when we're going to do cumulative distribution functions on idle times, which we assume to be a random variable, where the target residency of the deepest idle state gives an upper bound on the variable (5e6ns on recent Intel chips). In the case of small @x, the compute loop: while (m != 0) { b = y + m; y >>= 1; if (x >= b) { x -= b; y += m; } m >>= 2; } can be reduced to: while (m > x) m >>= 2; Because y==0, b==m and until x>=m y will remain 0. And while this is computationally equivalent, it runs much faster because there's less code, in particular less branches. 
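As a sanity check of the equivalence claim, the routine can be compiled and run in userspace; the harness below is purely illustrative and assumes a 64-bit unsigned long:

	#include <stdio.h>

	static unsigned long int_sqrt(unsigned long x)
	{
		unsigned long b, m = 1UL << 62, y = 0;

		if (x <= 1)
			return x;
		while (m > x)	/* the proposed small-@x fast path */
			m >>= 2;
		while (m != 0) {
			b = y + m;
			y >>= 1;
			if (x >= b) {
				x -= b;
				y += m;
			}
			m >>= 2;
		}
		return y;
	}

	int main(void)
	{
		printf("%lu\n", int_sqrt(5000000));	/* prints 2236 == floor(sqrt(5e6)) */
		return 0;
	}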
cycles: branches: branch-misses:
OLD:
hot: 45.109444 +- 0.044117 44.333392 +- 0.002254 0.018723 +- 0.000593
cold: 187.737379 +- 0.156678 44.333407 +- 0.002254 6.272844 +- 0.004305
PRE:
hot: 67.937492 +- 0.064124 66.999535 +- 0.000488 0.066720 +- 0.001113
cold: 232.004379 +- 0.332811 66.999527 +- 0.000488 6.914634 +- 0.006568
POST:
hot: 43.633557 +- 0.034373 45.333132 +- 0.002277 0.023529 +- 0.000681
cold: 207.438411 +- 0.125840 45.333132 +- 0.002277 6.976486 +- 0.004219
Averages computed over all values <128k using a LFSR to generate order. Cold numbers have a LFSR based branch trace buffer 'confuser' run between each int_sqrt() invocation. Link: http://lkml.kernel.org/r/20171020164644.876503355@infradead.org Fixes: 30493cc9dddb ("lib/int_sqrt.c: optimize square root algorithm") Signed-off-by: Peter Zijlstra (Intel) Suggested-by: Anshul Garg Acked-by: Linus Torvalds Cc: Davidlohr Bueso Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Will Deacon Cc: Joe Perches Cc: David Miller Cc: Matthew Wilcox Cc: Kees Cook Cc: Michael Davidson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/int_sqrt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/int_sqrt.c b/lib/int_sqrt.c index db0b5aa071fc..036c96781ea8 100644 --- a/lib/int_sqrt.c +++ b/lib/int_sqrt.c @@ -23,6 +23,9 @@ unsigned long int_sqrt(unsigned long x) return x; m = 1UL << (BITS_PER_LONG - 2); + while (m > x) + m >>= 2; + while (m != 0) { b = y + m; y >>= 1; From f8ae107eef209bff29a5816bc1aad40d5cd69a80 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Nov 2017 15:28:08 -0800 Subject: [PATCH 33/94] lib/int_sqrt: optimize initial value compute The initial value (@m) compute is:

	m = 1UL << (BITS_PER_LONG - 2);
	while (m > x)
		m >>= 2;

which is a linear search for the highest even bit smaller than or equal to @x. We can implement this as a binary search using __fls() (or better, when it is hardware implemented):

	m = 1UL << (__fls(x) & ~1UL);

Especially for small values of @x, which are the more common arguments when doing a CDF on idle times, the linear search is near to worst case, while the binary search of __fls() is a constant 6 (or 5 on 32bit) branches.
cycles: branches: branch-misses:
PRE:
hot: 43.633557 +- 0.034373 45.333132 +- 0.002277 0.023529 +- 0.000681
cold: 207.438411 +- 0.125840 45.333132 +- 0.002277 6.976486 +- 0.004219
SOFTWARE FLS:
hot: 29.576176 +- 0.028850 26.666730 +- 0.004511 0.019463 +- 0.000663
cold: 165.947136 +- 0.188406 26.666746 +- 0.004511 6.133897 +- 0.004386
HARDWARE FLS:
hot: 24.720922 +- 0.025161 20.666784 +- 0.004509 0.020836 +- 0.000677
cold: 132.777197 +- 0.127471 20.666776 +- 0.004509 5.080285 +- 0.003874
Averages computed over all values <128k using a LFSR to generate order. Cold numbers have a LFSR based branch trace buffer 'confuser' run between each int_sqrt() invocation.
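To see why the __fls() form picks the same @m as the removed linear search, note that masking the top-bit index with ~1UL rounds it down to an even bit position; a userspace sketch with a stand-in for __fls() (illustrative only):

	#include <stdio.h>

	/* stand-in for the kernel's __fls(): index of the most significant set bit */
	static unsigned long my_fls(unsigned long x)
	{
		return 8 * sizeof(x) - 1 - __builtin_clzl(x);
	}

	int main(void)
	{
		unsigned long x = 100;	/* top set bit at index 6 */
		unsigned long m = 1UL << (my_fls(x) & ~1UL);

		printf("m=%lu\n", m);	/* prints 64, the highest even bit <= 100 */
		return 0;
	}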
Link: http://lkml.kernel.org/r/20171020164644.936577234@infradead.org Signed-off-by: Peter Zijlstra (Intel) Suggested-by: Joe Perches Acked-by: Will Deacon Acked-by: Linus Torvalds Cc: Anshul Garg Cc: Davidlohr Bueso Cc: David Miller Cc: Ingo Molnar Cc: Kees Cook Cc: Matthew Wilcox Cc: Michael Davidson Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/int_sqrt.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lib/int_sqrt.c b/lib/int_sqrt.c index 036c96781ea8..67bb300b5b46 100644 --- a/lib/int_sqrt.c +++ b/lib/int_sqrt.c @@ -8,6 +8,7 @@ #include <linux/kernel.h> #include <linux/export.h> +#include <linux/bitops.h> /** * int_sqrt - rough approximation to sqrt @@ -22,10 +23,7 @@ unsigned long int_sqrt(unsigned long x) if (x <= 1) return x; - m = 1UL << (BITS_PER_LONG - 2); - while (m > x) - m >>= 2; - + m = 1UL << (__fls(x) & ~1UL); while (m != 0) { b = y + m; y >>= 1; From e813a614007e3a8a7b53d976e86d9a20f21f81ad Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Nov 2017 15:28:12 -0800 Subject: [PATCH 34/94] lib/int_sqrt: adjust comments Our current int_sqrt() is not rough nor any approximation; it calculates the exact value of: floor(sqrt()). Document this. Link: http://lkml.kernel.org/r/20171020164645.001652117@infradead.org Signed-off-by: Peter Zijlstra (Intel) Acked-by: Linus Torvalds Cc: Anshul Garg Cc: Davidlohr Bueso Cc: David Miller Cc: Ingo Molnar Cc: Joe Perches Cc: Kees Cook Cc: Matthew Wilcox Cc: Michael Davidson Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/int_sqrt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/int_sqrt.c b/lib/int_sqrt.c index 67bb300b5b46..e2d329099bf7 100644 --- a/lib/int_sqrt.c +++ b/lib/int_sqrt.c @@ -11,10 +11,10 @@ #include <linux/bitops.h> /** - * int_sqrt - rough approximation to sqrt + * int_sqrt - computes the integer square root * @x: integer of which to calculate the sqrt * - * A very rough approximation to the sqrt() function. + * Computes: floor(sqrt(x)) */ unsigned long int_sqrt(unsigned long x) { From 36a3d1dd4e16bcd0d2ddfb4a2ec7092f0ae0d931 Mon Sep 17 00:00:00 2001 From: Stephen Bates Date: Fri, 17 Nov 2017 15:28:16 -0800 Subject: [PATCH 35/94] lib/genalloc.c: make the avail variable an atomic_long_t If the amount of resources allocated to a gen_pool exceeds 2^32 then the avail atomic overflows and this causes problems when clients try and borrow resources from the pool. This is only expected to be an issue on 64 bit systems. Add the <linux/atomic.h> header to pull in the atomic_long* operations, so that 32 bit systems continue to use atomic_t while 64 bit systems can use atomic64_t.
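The wrap is easy to demonstrate: atomic_t is 32 bits on all architectures, so a pool of 4 GiB or more truncates its avail counter. A userspace illustration (sizes invented):

	#include <stdio.h>

	int main(void)
	{
		unsigned long five_gib = 5UL << 30;
		unsigned int avail32 = five_gib;	/* models atomic_t: wraps to 1 GiB */
		unsigned long avail64 = five_gib;	/* models atomic_long_t on 64-bit */

		printf("32-bit: %u, long: %lu\n", avail32, avail64);
		return 0;
	}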
Link: http://lkml.kernel.org/r/1509033843-25667-1-git-send-email-sbates@raithlin.com Signed-off-by: Stephen Bates Reviewed-by: Logan Gunthorpe Reviewed-by: Mathieu Desnoyers Reviewed-by: Daniel Mentz Cc: Jonathan Corbet Cc: Andrew Morton Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genalloc.h | 3 ++- lib/genalloc.c | 10 +++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 6dfec4d638df..872f930f1b06 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -32,6 +32,7 @@ #include <linux/types.h> #include <linux/spinlock_types.h> +#include <linux/atomic.h> struct device; struct device_node; @@ -71,7 +72,7 @@ struct gen_pool { */ struct gen_pool_chunk { struct list_head next_chunk; /* next chunk in pool */ - atomic_t avail; + atomic_long_t avail; phys_addr_t phys_addr; /* physical starting address of memory chunk */ unsigned long start_addr; /* start address of memory chunk */ unsigned long end_addr; /* end address of memory chunk (inclusive) */ diff --git a/lib/genalloc.c b/lib/genalloc.c index 144fe6b1a03e..ca06adc4f445 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -194,7 +194,7 @@ int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phy chunk->phys_addr = phys; chunk->start_addr = virt; chunk->end_addr = virt + size - 1; - atomic_set(&chunk->avail, size); + atomic_long_set(&chunk->avail, size); spin_lock(&pool->lock); list_add_rcu(&chunk->next_chunk, &pool->chunks); @@ -304,7 +304,7 @@ unsigned long gen_pool_alloc_algo(struct gen_pool *pool, size_t size, nbits = (size + (1UL << order) - 1) >> order; rcu_read_lock(); list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { - if (size > atomic_read(&chunk->avail)) + if (size > atomic_long_read(&chunk->avail)) continue; start_bit = 0; @@ -324,7 +324,7 @@ unsigned long gen_pool_alloc_algo(struct gen_pool *pool, size_t size, addr = chunk->start_addr + ((unsigned long)start_bit << order); size = nbits << order; - atomic_sub(size, &chunk->avail); + atomic_long_sub(size, &chunk->avail); break; } rcu_read_unlock(); @@ -390,7 +390,7 @@ void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size) remain = bitmap_clear_ll(chunk->bits, start_bit, nbits); BUG_ON(remain); size = nbits << order; - atomic_add(size, &chunk->avail); + atomic_long_add(size, &chunk->avail); rcu_read_unlock(); return; } @@ -464,7 +464,7 @@ size_t gen_pool_avail(struct gen_pool *pool) rcu_read_lock(); list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) - avail += atomic_read(&chunk->avail); + avail += atomic_long_read(&chunk->avail); rcu_read_unlock(); return avail; } From 2f9b7e08cb27d6d8d4579bb5301fb0940ff63d19 Mon Sep 17 00:00:00 2001 From: "Liu, Changcheng" Date: Fri, 17 Nov 2017 15:28:20 -0800 Subject: [PATCH 36/94] lib/nmi_backtrace.c: fix kernel text address leak Don't leak idle function address in NMI backtrace.
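The fix is in the printk format: %#lx prints the raw instruction pointer, a kernel text address, while %pS prints it as symbol+offset, so a line like "idling at pc 0xffff000008089e14" becomes something like "idling at arch_cpu_idle+0x14/0x20" (address and symbol invented for illustration).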
Link: http://lkml.kernel.org/r/20171106165648.GA95243@sofia Signed-off-by: Liu Changcheng Reviewed-by: Petr Mladek Reviewed-by: Josh Poimboeuf Reviewed-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/nmi_backtrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index 46e4c749e4eb..61a6b5aab07e 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -93,8 +93,8 @@ bool nmi_cpu_backtrace(struct pt_regs *regs) if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { arch_spin_lock(&lock); if (regs && cpu_in_idle(instruction_pointer(regs))) { - pr_warn("NMI backtrace for cpu %d skipped: idling at pc %#lx\n", - cpu, instruction_pointer(regs)); + pr_warn("NMI backtrace for cpu %d skipped: idling at %pS\n", + cpu, (void *)instruction_pointer(regs)); } else { pr_warn("NMI backtrace for cpu %d\n", cpu); if (regs) From e4795e3bb7d7b3b3d066cea57fb459f869500284 Mon Sep 17 00:00:00 2001 From: Cheng Jian Date: Fri, 17 Nov 2017 15:28:23 -0800 Subject: [PATCH 37/94] tools/lib/traceevent/parse-filter.c: clean up clang build warning The uniform structure filter_arg sets its union based on the difference of enum filter_arg_type, However, some functions use implicit type conversion obviously. warning: implicit conversion from enumeration type 'enum filter_exp_type' to different enumeration type 'enum filter_op_type' warning: implicit conversion from enumeration type 'enum filter_cmp_type' to different enumeration type 'enum filter_exp_type' Link: http://lkml.kernel.org/r/1509938415-113825-1-git-send-email-cj.chengjian@huawei.com Signed-off-by: Cheng Jian Cc: Kate Stewart Cc: Xie XiuQi Cc: Li Bin Cc: Steven Rostedt (Red Hat) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/lib/traceevent/parse-filter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/lib/traceevent/parse-filter.c b/tools/lib/traceevent/parse-filter.c index 7c214ceb9386..315df0a70265 100644 --- a/tools/lib/traceevent/parse-filter.c +++ b/tools/lib/traceevent/parse-filter.c @@ -436,13 +436,13 @@ create_arg_exp(enum filter_exp_type etype) return NULL; arg->type = FILTER_ARG_EXP; - arg->op.type = etype; + arg->exp.type = etype; return arg; } static struct filter_arg * -create_arg_cmp(enum filter_exp_type etype) +create_arg_cmp(enum filter_cmp_type ctype) { struct filter_arg *arg; @@ -452,7 +452,7 @@ create_arg_cmp(enum filter_exp_type etype) /* Use NUM and change if necessary */ arg->type = FILTER_ARG_NUM; - arg->op.type = etype; + arg->num.type = ctype; return arg; } From 0b548e33e6cb2bff240fdaf1783783be15c29080 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 17 Nov 2017 15:28:27 -0800 Subject: [PATCH 38/94] lib/rbtree-test: lower default params Fengguang reported soft lockups while running the rbtree and interval tree test modules. The logic for these tests all occur in init phase, and we currently are pounding with the default values for number of nodes and number of iterations of each test. Reduce the latter by two orders of magnitude. This does not influence the value of the tests in that one thousand times by default is enough to get the picture. 
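Since nnodes, perf_loops and friends are module parameters, the old heavier runs remain available on request, e.g. by loading the test with perf_loops=100000 (assuming RBTREE_TEST/INTERVAL_TREE_TEST are built as modules) rather than relying on the defaults.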
Link: http://lkml.kernel.org/r/20171109161715.xai2dtwqw2frhkcm@linux-n805 Signed-off-by: Davidlohr Bueso Reported-by: Fengguang Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/interval_tree_test.c | 4 ++-- lib/rbtree_test.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/interval_tree_test.c b/lib/interval_tree_test.c index 0e343fd29570..835242e74aaa 100644 --- a/lib/interval_tree_test.c +++ b/lib/interval_tree_test.c @@ -11,10 +11,10 @@ MODULE_PARM_DESC(name, msg); __param(int, nnodes, 100, "Number of nodes in the interval tree"); -__param(int, perf_loops, 100000, "Number of iterations modifying the tree"); +__param(int, perf_loops, 1000, "Number of iterations modifying the tree"); __param(int, nsearches, 100, "Number of searches to the interval tree"); -__param(int, search_loops, 10000, "Number of iterations searching the tree"); +__param(int, search_loops, 1000, "Number of iterations searching the tree"); __param(bool, search_all, false, "Searches will iterate all nodes in the tree"); __param(uint, max_endpoint, ~0, "Largest value for the interval's endpoint"); diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index 191a238e5a9d..7d36c1e27ff6 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -11,7 +11,7 @@ MODULE_PARM_DESC(name, msg); __param(int, nnodes, 100, "Number of nodes in the rb-tree"); -__param(int, perf_loops, 100000, "Number of iterations modifying the rb-tree"); +__param(int, perf_loops, 1000, "Number of iterations modifying the rb-tree"); __param(int, check_loops, 100, "Number of iterations modifying and verifying the rb-tree"); struct test_node { From 4441fca0a27f5f0e2c652584ae9d7abec6255c1f Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Fri, 17 Nov 2017 15:28:31 -0800 Subject: [PATCH 39/94] lib: test module for find_*_bit() functions find_bit functions are widely used in the kernel, including hot paths. This module tests performance of those functions in 2 typical scenarios: randomly filled bitmap with relatively equal distribution of set and cleared bits, and sparse bitmap which has 1 set bit for 500 cleared bits. On ThunderX machine: Start testing find_bit() with random-filled bitmap find_next_bit: 240043 cycles, 164062 iterations find_next_zero_bit: 312848 cycles, 163619 iterations find_last_bit: 193748 cycles, 164062 iterations find_first_bit: 177720874 cycles, 164062 iterations Start testing find_bit() with sparse bitmap find_next_bit: 3633 cycles, 656 iterations find_next_zero_bit: 620399 cycles, 327025 iterations find_last_bit: 3038 cycles, 656 iterations find_first_bit: 691407 cycles, 656 iterations [arnd@arndb.de: use correct format string for find-bit tests] Link: http://lkml.kernel.org/r/20171113135605.3166307-1-arnd@arndb.de Link: http://lkml.kernel.org/r/20171109140714.13168-1-ynorov@caviumnetworks.com Signed-off-by: Yury Norov Signed-off-by: Arnd Bergmann Reviewed-by: Clement Courbet Cc: Alexey Dobriyan Cc: Matthew Wilcox Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 9 +++ lib/Makefile | 1 + lib/test_find_bit.c | 144 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 154 insertions(+) create mode 100644 lib/test_find_bit.c diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 8ffd891857ab..ce813731269f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1850,6 +1850,15 @@ config TEST_BPF If unsure, say N. 
+config TEST_FIND_BIT + tristate "Test find_bit functions" + default n + help + This builds the "test_find_bit" module that measure find_*_bit() + functions performance. + + If unsure, say N. + config TEST_FIRMWARE tristate "Test firmware loading via userspace interface" default n diff --git a/lib/Makefile b/lib/Makefile index e3cb2a241338..d11c48ec8ffd 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -47,6 +47,7 @@ obj-y += hexdump.o obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o obj-y += kstrtox.o obj-$(CONFIG_TEST_BPF) += test_bpf.o +obj-$(CONFIG_TEST_FIND_BIT) += test_find_bit.o obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o diff --git a/lib/test_find_bit.c b/lib/test_find_bit.c new file mode 100644 index 000000000000..f4394a36f9aa --- /dev/null +++ b/lib/test_find_bit.c @@ -0,0 +1,144 @@ +/* + * Test for find_*_bit functions. + * + * Copyright (c) 2017 Cavium. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +/* + * find_bit functions are widely used in kernel, so the successful boot + * is good enough test for correctness. + * + * This test is focused on performance of traversing bitmaps. Two typical + * scenarios are reproduced: + * - randomly filled bitmap with approximately equal number of set and + * cleared bits; + * - sparse bitmap with few set bits at random positions. + */ + +#include +#include +#include +#include +#include +#include + +#define BITMAP_LEN (4096UL * 8 * 10) +#define SPARSE 500 + +static DECLARE_BITMAP(bitmap, BITMAP_LEN) __initdata; + +/* + * This is Schlemiel the Painter's algorithm. It should be called after + * all other tests for the same bitmap because it sets all bits of bitmap to 1. 
+ */ +static int __init test_find_first_bit(void *bitmap, unsigned long len) +{ + unsigned long i, cnt; + cycles_t cycles; + + cycles = get_cycles(); + for (cnt = i = 0; i < len; cnt++) { + i = find_first_bit(bitmap, len); + __clear_bit(i, bitmap); + } + cycles = get_cycles() - cycles; + pr_err("find_first_bit:\t\t%llu cycles,\t%ld iterations\n", + (u64)cycles, cnt); + + return 0; +} + +static int __init test_find_next_bit(const void *bitmap, unsigned long len) +{ + unsigned long i, cnt; + cycles_t cycles; + + cycles = get_cycles(); + for (cnt = i = 0; i < BITMAP_LEN; cnt++) + i = find_next_bit(bitmap, BITMAP_LEN, i) + 1; + cycles = get_cycles() - cycles; + pr_err("find_next_bit:\t\t%llu cycles,\t%ld iterations\n", + (u64)cycles, cnt); + + return 0; +} + +static int __init test_find_next_zero_bit(const void *bitmap, unsigned long len) +{ + unsigned long i, cnt; + cycles_t cycles; + + cycles = get_cycles(); + for (cnt = i = 0; i < BITMAP_LEN; cnt++) + i = find_next_zero_bit(bitmap, len, i) + 1; + cycles = get_cycles() - cycles; + pr_err("find_next_zero_bit:\t%llu cycles,\t%ld iterations\n", + (u64)cycles, cnt); + + return 0; +} + +static int __init test_find_last_bit(const void *bitmap, unsigned long len) +{ + unsigned long l, cnt = 0; + cycles_t cycles; + + cycles = get_cycles(); + do { + cnt++; + l = find_last_bit(bitmap, len); + if (l >= len) + break; + len = l; + } while (len); + cycles = get_cycles() - cycles; + pr_err("find_last_bit:\t\t%llu cycles,\t%ld iterations\n", + (u64)cycles, cnt); + + return 0; +} + +static int __init find_bit_test(void) +{ + unsigned long nbits = BITMAP_LEN / SPARSE; + + pr_err("\nStart testing find_bit() with random-filled bitmap\n"); + + get_random_bytes(bitmap, sizeof(bitmap)); + + test_find_next_bit(bitmap, BITMAP_LEN); + test_find_next_zero_bit(bitmap, BITMAP_LEN); + test_find_last_bit(bitmap, BITMAP_LEN); + test_find_first_bit(bitmap, BITMAP_LEN); + + pr_err("\nStart testing find_bit() with sparse bitmap\n"); + + bitmap_zero(bitmap, BITMAP_LEN); + + while (nbits--) + __set_bit(prandom_u32() % BITMAP_LEN, bitmap); + + test_find_next_bit(bitmap, BITMAP_LEN); + test_find_next_zero_bit(bitmap, BITMAP_LEN); + test_find_last_bit(bitmap, BITMAP_LEN); + test_find_first_bit(bitmap, BITMAP_LEN); + + return 0; +} +module_init(find_bit_test); + +static void __exit test_find_bit_cleanup(void) +{ +} +module_exit(test_find_bit_cleanup); + +MODULE_LICENSE("GPL"); From 25bdda2bd68ae3008759f26058f6f9b9bb2b1cc5 Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Fri, 17 Nov 2017 15:28:34 -0800 Subject: [PATCH 40/94] checkpatch: support function pointers for unnamed function definition arguments Current unnamed function definition argument does not include function pointer cases and it reports something like: WARNING: function definition argument 'void' should also have an identifier name +unsigned int (*dummy)(void); Support function pointers for unnamed function arguments Link: http://lkml.kernel.org/r/1505389925-31087-1-git-send-email-miles.chen@mediatek.com Signed-off-by: Miles Chen Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 8b80bac055e4..aacbe918027b 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5957,7 +5957,7 @@ sub process { # check for function declarations that have arguments without identifier names if (defined $stat && - $stat =~ 
/^.\s*(?:extern\s+)?$Type\s*$Ident\s*\(\s*([^{]+)\s*\)\s*;/s && + $stat =~ /^.\s*(?:extern\s+)?$Type\s*(?:$Ident|\(\s*\*\s*$Ident\s*\))\s*\(\s*([^{]+)\s*\)\s*;/s && $1 ne "void") { my $args = trim($1); while ($args =~ m/\s*($Type\s*(?:$Ident|\(\s*\*\s*$Ident?\s*\)\s*$balanced_parens)?)/g) { From 258f79d5a1e49271f5aff38e6c1baeeaad0d82aa Mon Sep 17 00:00:00 2001 From: Heinrich Schuchardt Date: Fri, 17 Nov 2017 15:28:38 -0800 Subject: [PATCH 41/94] scripts/checkpatch.pl: avoid false warning missing break void foo(int a) switch (a) { case 'h': fun1(); exit(1); default: } creates a warning "Possible switch case/default not preceded by break or fallthrough comment". exit( should be treated like return. Link: http://lkml.kernel.org/r/20170910154618.25819-1-xypron.glpk@gmx.de Signed-off-by: Heinrich Schuchardt Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index aacbe918027b..8dce8a8d9ed0 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6109,7 +6109,7 @@ sub process { next if ($fline =~ /^.[\s$;]*$/); $has_statement = 1; $count++; - $has_break = 1 if ($fline =~ /\bswitch\b|\b(?:break\s*;[\s$;]*$|return\b|goto\b|continue\b)/); + $has_break = 1 if ($fline =~ /\bswitch\b|\b(?:break\s*;[\s$;]*$|exit\s*\(\b|return\b|goto\b|continue\b)/); } if (!$has_break && $has_statement) { WARN("MISSING_BREAK", From eeef5733e30e736926c762fa3336c4dd5702bcdf Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 17 Nov 2017 15:28:41 -0800 Subject: [PATCH 42/94] checkpatch: printks always need a KERN_ There was code in checkpatch that allowed continuation printks to be used without KERN_CONT. Remove the continuation check and always require a KERN_. Link: http://lkml.kernel.org/r/61980ef41d5b9b6543da1c49055042e0ab74d308.1507047008.git.joe@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 8dce8a8d9ed0..2a8c6c3c1bdb 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3829,28 +3829,10 @@ sub process { "Prefer printk_ratelimited or pr__ratelimited to printk_ratelimit\n" . $herecurr); } -# printk should use KERN_* levels. Note that follow on printk's on the -# same line do not need a level, so we use the current block context -# to try and find and validate the current printk. In summary the current -# printk includes all preceding printk's which have no newline on the end. -# we assume the first bad printk is the one to report. - if ($line =~ /\bprintk\((?!KERN_)\s*"/) { - my $ok = 0; - for (my $ln = $linenr - 1; $ln >= $first_line; $ln--) { - #print "CHECK<$lines[$ln - 1]\n"; - # we have a preceding printk if it ends - # with "\n" ignore it, else it is to blame - if ($lines[$ln - 1] =~ m{\bprintk\(}) { - if ($rawlines[$ln - 1] !~ m{\\n"}) { - $ok = 1; - } - last; - } - } - if ($ok == 0) { - WARN("PRINTK_WITHOUT_KERN_LEVEL", - "printk() should include KERN_ facility level\n" . $herecurr); - } +# printk should use KERN_* levels + if ($line =~ /\bprintk\s*\(\s*(?!KERN_[A-Z]+\b)/) { + WARN("PRINTK_WITHOUT_KERN_LEVEL", + "printk() should include KERN_ facility level\n" . 
$herecurr); } if ($line =~ /\bprintk\s*\(\s*KERN_([A-Z]+)/) { From cc147506bef96f31af799afe94d0c0e81161ee6c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 17 Nov 2017 15:28:44 -0800 Subject: [PATCH 43/94] checkpatch: allow DEFINE_PER_CPU definitions to exceed line length Some of the definitions are very long and can't be split into multiple lines because ctags is limited. Exempt these lines from the line length checks. See commit 25528213fe9f ("tags: Fix DEFINE_PER_CPU expansions") for more details. Link: http://lkml.kernel.org/r/1508170320.6530.15.camel@perches.com Signed-off-by: Joe Perches Acked-by: Mark Rutland Cc: Daniel Lezcano Cc: Marc Zyngier Cc: Thomas Gleixner Cc: Ard Biesheuvel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 2a8c6c3c1bdb..5fa0f5467d99 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2900,8 +2900,9 @@ sub process { $line =~ /^\+\s*#\s*define\s+\w+\s+$String$/) { $msg_type = ""; - # EFI_GUID is another special case - } elsif ($line =~ /^\+.*\bEFI_GUID\s*\(/) { + # More special cases + } elsif ($line =~ /^\+.*\bEFI_GUID\s*\(/ || + $line =~ /^\+\s*(?:\w+)?\s*DEFINE_PER_CPU/) { $msg_type = ""; # Otherwise set the alternate message types From 87bd499af5cd663c150032cca4bac822a729263b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 17 Nov 2017 15:28:48 -0800 Subject: [PATCH 44/94] checkpatch: add TP_printk to list of logging functions So the line length check can be bypassed by its callers. Link: http://lkml.kernel.org/r/7de542c08a6e79f2ebe7c1416c9f403c23fdcc09.1508282823.git.joe@perches.com Signed-off-by: Joe Perches Reported-by: Song Liu Tested-by: Song Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 5fa0f5467d99..6bdd43d5dec5 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -454,6 +454,7 @@ our $zero_initializer = qr{(?:(?:0[xX])?0+$Int_type?|NULL|false)\b}; our $logFunctions = qr{(?x: printk(?:_ratelimited|_once|_deferred_once|_deferred|)| (?:[a-z0-9]+_){1,2}(?:printk|emerg|alert|crit|err|warning|warn|notice|info|debug|dbg|vdbg|devel|cont|WARN)(?:_ratelimited|_once|)| + TP_printk| WARN(?:_RATELIMIT|_ONCE|)| panic| MODULE_[A-Z_]+| From 5751a24edfd43a91e072d63cde2b99b5a421645f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 17 Nov 2017 15:28:52 -0800 Subject: [PATCH 45/94] checkpatch: add --strict test for lines ending in [ or ( Lines that end in an open bracket or open parenthesis are generally hard to follow. Lines following those ending with open parenthesis are also rarely aligned to that open parenthesis. 
Suggest not ending lines with '[' or '(' Link: http://lkml.kernel.org/r/8fd0b2b4a7482064254e37931eb9302a81d5aa2f.1508340786.git.joe@perches.com Signed-off-by: Joe Perches Suggested-by: Vivien Didelot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 6bdd43d5dec5..3453df9f90ab 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3184,6 +3184,12 @@ sub process { # check we are in a valid C source file if not then ignore this hunk next if ($realfile !~ /\.(h|c)$/); +# check for unusual line ending [ or ( + if ($line =~ /^\+.*([\[\(])\s*$/) { + CHK("OPEN_ENDED_LINE", + "Lines should not end with a '$1'\n" . $herecurr); + } + # check if this appears to be the start function declaration, save the name if ($sline =~ /^\+\{\s*$/ && $prevline =~ /^\+(?:(?:(?:$Storage|$Inline)\s*)*\s*$Type\s*)?($Ident)\(/) { From 0bc989ffc802752c5256192b4a9c8f16a00feca7 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 17 Nov 2017 15:28:55 -0800 Subject: [PATCH 46/94] checkpatch: do not check missing blank line before builtin_*_driver checkpatch.pl does not check for a missing blank line before module_*_driver. I want it to behave likewise for builtin_*_driver. Link: http://lkml.kernel.org/r/1505700081-12854-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 3453df9f90ab..95cda3ecc66b 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3105,6 +3105,7 @@ sub process { $line =~ /^\+[a-z_]*init/ || $line =~ /^\+\s*(?:static\s+)?[A-Z_]*ATTR/ || $line =~ /^\+\s*DECLARE/ || + $line =~ /^\+\s*builtin_[\w_]*driver/ || $line =~ /^\+\s*__setup/)) { if (CHK("LINE_SPACING", "Please use a blank line after function/struct/union/enum declarations\n" . $hereprev) && From 2ae928a9441a3b5f13952e1e8a97d03cb23ea603 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 17 Nov 2017 15:28:59 -0800 Subject: [PATCH 47/94] epoll: account epitem and eppoll_entry to kmemcg A userspace application can directly trigger the allocations from eventpoll_epi and eventpoll_pwq slabs. A buggy or malicious application can consume a significant amount of system memory by triggering such allocations. Indeed, we have seen in production a buggy application leaking epoll references and causing a burst of eventpoll_epi and eventpoll_pwq slab allocations. This patch opts the eventpoll_epi and eventpoll_pwq slabs in to kmem charging. There is a per-user limit (~4% of total memory if no highmem) on these caches. I think it is too generous particularly in the scenario where jobs of multiple users are running on the system and the administrator is reducing cost by overcommitting the memory. This is unaccounted kernel memory and will not be considered by the oom-killer. I think by accounting it to kmemcg, for systems with kmem accounting enabled, we can provide better isolation between jobs of different users.
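For context, the allocations being accounted are driven one-to-one from userspace; a minimal sketch of the trigger, where epfd and fd are assumed to be an epoll fd and some pollable fd:

	#include <sys/epoll.h>

	struct epoll_event ev = { .events = EPOLLIN };

	/* each successful add allocates one epitem, plus an eppoll_entry per
	 * wait queue the target's poll method registers; with SLAB_ACCOUNT
	 * both are now charged to the caller's memory cgroup */
	epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);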
Link: http://lkml.kernel.org/r/20171003021519.23907-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Cc: Alexander Viro Cc: Vladimir Davydov Cc: Johannes Weiner Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2fabd19cdeea..a45360444895 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2329,11 +2329,11 @@ static int __init eventpoll_init(void) /* Allocates slab cache used to allocate "struct epitem" items */ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); /* Allocates slab cache used to allocate "struct eppoll_entry" */ pwq_cache = kmem_cache_create("eventpoll_pwq", - sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL); + sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); return 0; } From 57a173bdf5baab48e8e78825c7366c634acd087c Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 17 Nov 2017 15:29:02 -0800 Subject: [PATCH 48/94] epoll: avoid calling ep_call_nested() from ep_poll_safewake() ep_poll_safewake() is used to wake up potentially nested epoll file descriptors. The function uses ep_call_nested() to prevent entering the same wake up queue more than once, and to prevent excessively deep wakeup paths (deeper than EP_MAX_NESTS). However, this is not necessary since we are already preventing these conditions during EPOLL_CTL_ADD. This saves extra function calls, and avoids taking a global lock during the ep_call_nested() calls. I have, however, left ep_call_nested() for the CONFIG_DEBUG_LOCK_ALLOC case, since ep_call_nested() keeps track of the nesting level, and this is required by the call to spin_lock_irqsave_nested(). It would be nice to remove the ep_call_nested() calls for the CONFIG_DEBUG_LOCK_ALLOC case as well, however it's not clear how to simply pass the nesting level through multiple wake_up() levels without more surgery. In any case, I don't think CONFIG_DEBUG_LOCK_ALLOC is generally used for production. This patch also apparently fixes a workload at Google that Salman Qazi reported, by completely removing the poll_safewake_ncalls->lock from wakeup paths. Link: http://lkml.kernel.org/r/1507920533-8812-1-git-send-email-jbaron@akamai.com Signed-off-by: Jason Baron Acked-by: Davidlohr Bueso Cc: Alexander Viro Cc: Salman Qazi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 49 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 30 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index a45360444895..dc15bb02ee2a 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -276,9 +276,6 @@ static DEFINE_MUTEX(epmutex); /* Used to check for epoll file descriptor inclusion loops */ static struct nested_calls poll_loop_ncalls; -/* Used for safe wake up implementation */ -static struct nested_calls poll_safewake_ncalls; - /* Used to call file's f_op->poll() under the nested calls boundaries */ static struct nested_calls poll_readywalk_ncalls; @@ -551,40 +548,21 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests, * this special case of epoll.
*/ #ifdef CONFIG_DEBUG_LOCK_ALLOC -static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, - unsigned long events, int subclass) -{ - unsigned long flags; - spin_lock_irqsave_nested(&wqueue->lock, flags, subclass); - wake_up_locked_poll(wqueue, events); - spin_unlock_irqrestore(&wqueue->lock, flags); -} -#else -static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, - unsigned long events, int subclass) -{ - wake_up_poll(wqueue, events); -} -#endif +static struct nested_calls poll_safewake_ncalls; static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) { - ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN, - 1 + call_nests); + unsigned long flags; + wait_queue_head_t *wqueue = (wait_queue_head_t *)cookie; + + spin_lock_irqsave_nested(&wqueue->lock, flags, call_nests + 1); + wake_up_locked_poll(wqueue, POLLIN); + spin_unlock_irqrestore(&wqueue->lock, flags); + return 0; } -/* - * Perform a safe wake up of the poll wait list. The problem is that - * with the new callback'd wake up system, it is possible that the - * poll callback is reentered from inside the call to wake_up() done - * on the poll wait queue head. The rule is that we cannot reenter the - * wake up code from the same task more than EP_MAX_NESTS times, - * and we cannot reenter the same wait queue head at all. This will - * enable to have a hierarchy of epoll file descriptor of no more than - * EP_MAX_NESTS deep. - */ static void ep_poll_safewake(wait_queue_head_t *wq) { int this_cpu = get_cpu(); @@ -595,6 +573,15 @@ static void ep_poll_safewake(wait_queue_head_t *wq) put_cpu(); } +#else + +static void ep_poll_safewake(wait_queue_head_t *wq) +{ + wake_up_poll(wq, POLLIN); +} + +#endif + static void ep_remove_wait_queue(struct eppoll_entry *pwq) { wait_queue_head_t *whead; @@ -2315,8 +2302,10 @@ static int __init eventpoll_init(void) */ ep_nested_calls_init(&poll_loop_ncalls); +#ifdef CONFIG_DEBUG_LOCK_ALLOC /* Initialize the structure used to perform safe poll wait head wake ups */ ep_nested_calls_init(&poll_safewake_ncalls); +#endif /* Initialize the structure used to perform file's f_op->poll() calls */ ep_nested_calls_init(&poll_readywalk_ncalls); From 37b5e5212a448bac0fe29d2a51f088014fbaaa41 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 17 Nov 2017 15:29:06 -0800 Subject: [PATCH 49/94] epoll: remove ep_call_nested() from ep_eventpoll_poll() The use of ep_call_nested() in ep_eventpoll_poll(), which is the .poll routine for an epoll fd, is used to prevent excessively deep epoll nesting, and to prevent circular paths. However, we are already preventing these conditions during EPOLL_CTL_ADD. In terms of too deep epoll chains, we do in fact allow deep nesting of the epoll fds themselves (deeper than EP_MAX_NESTS), however we don't allow more than EP_MAX_NESTS when an epoll file descriptor is actually connected to a wakeup source. Thus, we do not require the use of ep_call_nested(), since ep_eventpoll_poll(), which is called via ep_scan_ready_list() only continues nesting if there are events available. Since ep_call_nested() is implemented using a global lock, applications that make use of nested epoll can see large performance improvements with this change. 
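"Nested" epoll here means one epoll file descriptor watching another; a minimal sketch (hypothetical code, not part of the patch):

#include <sys/epoll.h>

int main(void)
{
        int inner = epoll_create1(0);
        int outer = epoll_create1(0);
        struct epoll_event ev = { .events = EPOLLIN, .data.fd = inner };

        /* loop and depth checks are enforced here, at EPOLL_CTL_ADD time */
        return epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev) ? 1 : 0;
}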
Davidlohr said: : Improvements are quite obscene actually, such as for the following : epoll_wait() benchmark with 2 level nesting on a 80 core IvyBridge: : : ncpus vanilla dirty delta : 1 2447092 3028315 +23.75% : 4 231265 2986954 +1191.57% : 8 121631 2898796 +2283.27% : 16 59749 2902056 +4757.07% : 32 26837 2326314 +8568.30% : 64 12926 1341281 +10276.61% : : (http://linux-scalability.org/epoll/epoll-test.c) Link: http://lkml.kernel.org/r/1509430214-5599-1-git-send-email-jbaron@akamai.com Signed-off-by: Jason Baron Cc: Davidlohr Bueso Cc: Alexander Viro Cc: Salman Qazi Cc: Hou Tao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 84 ++++++++++++++++++++++---------------------------- 1 file changed, 37 insertions(+), 47 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index dc15bb02ee2a..1e048144f17c 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -276,9 +276,6 @@ static DEFINE_MUTEX(epmutex); /* Used to check for epoll file descriptor inclusion loops */ static struct nested_calls poll_loop_ncalls; -/* Used to call file's f_op->poll() under the nested calls boundaries */ -static struct nested_calls poll_readywalk_ncalls; - /* Slab cache used to allocate "struct epitem" */ static struct kmem_cache *epi_cache __read_mostly; @@ -867,11 +864,33 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) return 0; } -static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt) -{ - pt->_key = epi->event.events; +static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, + void *priv); +static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, + poll_table *pt); - return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events; +/* + * Differs from ep_eventpoll_poll() in that internal callers already have + * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() + * is correctly annotated. 
+ */ +static unsigned int ep_item_poll(struct epitem *epi, poll_table *pt, int depth) +{ + struct eventpoll *ep; + bool locked; + + pt->_key = epi->event.events; + if (!is_file_epoll(epi->ffd.file)) + return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & + epi->event.events; + + ep = epi->ffd.file->private_data; + poll_wait(epi->ffd.file, &ep->poll_wait, pt); + locked = pt && (pt->_qproc == ep_ptable_queue_proc); + + return ep_scan_ready_list(epi->ffd.file->private_data, + ep_read_events_proc, &depth, depth, + locked) & epi->event.events; } static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, @@ -879,13 +898,15 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, { struct epitem *epi, *tmp; poll_table pt; + int depth = *(int *)priv; init_poll_funcptr(&pt, NULL); + depth++; list_for_each_entry_safe(epi, tmp, head, rdllink) { - if (ep_item_poll(epi, &pt)) + if (ep_item_poll(epi, &pt, depth)) { return POLLIN | POLLRDNORM; - else { + } else { /* * Item has been dropped into the ready list by the poll * callback, but it's not actually ready, as far as @@ -899,48 +920,20 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, return 0; } -static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, - poll_table *pt); - -struct readyevents_arg { - struct eventpoll *ep; - bool locked; -}; - -static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests) -{ - struct readyevents_arg *arg = priv; - - return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL, - call_nests + 1, arg->locked); -} - static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) { - int pollflags; struct eventpoll *ep = file->private_data; - struct readyevents_arg arg; - - /* - * During ep_insert() we already hold the ep->mtx for the tfile. - * Prevent re-aquisition. - */ - arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc); - arg.ep = ep; + int depth = 0; /* Insert inside our poll wait queue */ poll_wait(file, &ep->poll_wait, wait); /* * Proceed to find out if wanted events are really available inside - * the ready list. This need to be done under ep_call_nested() - * supervision, since the call to f_op->poll() done on listed files - * could re-enter here. + * the ready list. */ - pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS, - ep_poll_readyevents_proc, &arg, ep, current); - - return pollflags != -1 ? pollflags : 0; + return ep_scan_ready_list(ep, ep_read_events_proc, + &depth, depth, false); } #ifdef CONFIG_PROC_FS @@ -1459,7 +1452,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, * this operation completes, the poll callback can start hitting * the new item. */ - revents = ep_item_poll(epi, &epq.pt); + revents = ep_item_poll(epi, &epq.pt, 1); /* * We have to check if something went wrong during the poll wait queue @@ -1593,7 +1586,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. 
*/ - revents = ep_item_poll(epi, &pt); + revents = ep_item_poll(epi, &pt, 1); /* * If the item is "hot" and it is not registered inside the ready @@ -1661,7 +1654,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, list_del_init(&epi->rdllink); - revents = ep_item_poll(epi, &pt); + revents = ep_item_poll(epi, &pt, 1); /* * If the event mask intersect the caller-requested one, @@ -2307,9 +2300,6 @@ static int __init eventpoll_init(void) ep_nested_calls_init(&poll_safewake_ncalls); #endif - /* Initialize the structure used to perform file's f_op->poll() calls */ - ep_nested_calls_init(&poll_readywalk_ncalls); - /* * We can have many thousands of epitems, so prevent this from * using an extra cache line on 64-bit (and smaller) CPUs From e4f02fdabd1092065ddd8366fcb3c8483e627fc0 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 17 Nov 2017 15:29:10 -0800 Subject: [PATCH 50/94] init/version.c: include <linux/export.h> instead of <linux/module.h> init/version.c has nothing to do with modules, so remove the <linux/module.h> include. Instead, include <linux/export.h> for EXPORT_SYMBOL_GPL. This cuts off a lot of unnecessary header parsing. Link: http://lkml.kernel.org/r/1505920984-8523-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Cc: Paul Gortmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/version.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/version.c b/init/version.c index 5606341e9efd..bfb4e3f4955e 100644 --- a/init/version.c +++ b/init/version.c @@ -7,7 +7,7 @@ */ #include <generated/compile.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/uts.h> #include <linux/utsname.h> #include <generated/utsrelease.h> From ecc0c469f27765ed1e2b967be0aa17cee1a60b76 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 17 Nov 2017 15:29:13 -0800 Subject: [PATCH 51/94] autofs: don't fail mount for transient error Currently if the autofs kernel module gets an error when writing to the pipe which links to the daemon, then it marks the whole mountpoint as catatonic, and it will stop working. It is possible that the error is transient. This can happen if the daemon is slow and more than 16 requests queue up. If a subsequent process tries to queue a request, and is then signalled, the write to the pipe will return -ERESTARTSYS and autofs will take that as total failure. So change the code to treat -ERESTARTSYS and -ENOMEM as transient failures which only abort the current request, not the whole mountpoint. It isn't a crash or a data corruption, but having autofs mountpoints suddenly stop working is rather inconvenient. Ian said: : And given the problems with a half dozen (or so) user space applications : consuming large amounts of CPU under heavy mount and umount activity this : could happen more easily than we expect. Link: http://lkml.kernel.org/r/87y3norvgp.fsf@notabene.neil.brown.name Signed-off-by: NeilBrown Acked-by: Ian Kent Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/waitq.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 4ac49d038bf3..8fc41705c7cd 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -81,7 +81,8 @@ static int autofs4_write(struct autofs_sb_info *sbi, spin_unlock_irqrestore(&current->sighand->siglock, flags); } - return (bytes > 0); + /* if 'wr' returned 0 (impossible) we assume -EIO (safe) */ + return bytes == 0 ? 0 : wr < 0 ?
wr : -EIO; } static void autofs4_notify_daemon(struct autofs_sb_info *sbi, @@ -95,6 +96,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, } pkt; struct file *pipe = NULL; size_t pktsz; + int ret; pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n", (unsigned long) wq->wait_queue_token, @@ -169,7 +171,17 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, mutex_unlock(&sbi->wq_mutex); - if (autofs4_write(sbi, pipe, &pkt, pktsz)) + switch (ret = autofs4_write(sbi, pipe, &pkt, pktsz)) { + case 0: + break; + case -ENOMEM: + case -ERESTARTSYS: + /* Just fail this one */ + autofs4_wait_release(sbi, wq->wait_queue_token, ret); + break; + default: autofs4_catatonic_mode(sbi); + break; + } fput(pipe); } From 98159d977f71c3b3dee898d1c34e56f520b094e7 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 17 Nov 2017 15:29:17 -0800 Subject: [PATCH 52/94] pipe: match pipe_max_size data type with procfs Patch series "A few round_pipe_size() and pipe-max-size fixups", v3. While backporting Michael's "pipe: fix limit handling" patchset to a distro kernel, Mikulas noticed that current upstream pipe limit handling contains a few problems: 1 - procfs signed wrap: echo'ing a large number into /proc/sys/fs/pipe-max-size and then cat'ing it back out shows a negative value. 2 - round_pipe_size() nr_pages overflow on 32-bit: this would subsequently try roundup_pow_of_two(0), which is undefined. 3 - visible non-rounded pipe-max-size value: there is no mutual exclusion or protection between the time pipe_max_size is assigned a raw value from proc_dointvec_minmax() and when it is rounded. 4 - unsigned long -> unsigned int conversion makes for potential odd return errors from do_proc_douintvec_minmax_conv() and do_proc_dopipe_max_size_conv(). This version underwent the same testing as v1: https://marc.info/?l=linux-kernel&m=150643571406022&w=2 This patch (of 4): pipe_max_size is defined as an unsigned int: unsigned int pipe_max_size = 1048576; but its procfs/sysctl representation is an integer: static struct ctl_table fs_table[] = { ... { .procname = "pipe-max-size", .data = &pipe_max_size, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, ... that is signed: int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, size_t *lenp, loff_t *ppos) { ... ret = proc_dointvec_minmax(table, write, buf, lenp, ppos) This leads to signed results via procfs for large values of pipe_max_size: % echo 2147483647 >/proc/sys/fs/pipe-max-size % cat /proc/sys/fs/pipe-max-size -2147483648 Use unsigned operations on this variable to avoid such negative values.
Link: http://lkml.kernel.org/r/1507658689-11669-2-git-send-email-joe.lawrence@redhat.com Signed-off-by: Joe Lawrence Reported-by: Mikulas Patocka Reviewed-by: Mikulas Patocka Cc: Michael Kerrisk Cc: Randy Dunlap Cc: Al Viro Cc: Jens Axboe Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 2 +- kernel/sysctl.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 349c9d56d4b3..3909c55ed389 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1125,7 +1125,7 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, { int ret; - ret = proc_dointvec_minmax(table, write, buf, lenp, ppos); + ret = proc_douintvec_minmax(table, write, buf, lenp, ppos); if (ret < 0 || !write) return ret; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4a13a389e99b..2d42183b4c98 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1816,7 +1816,7 @@ static struct ctl_table fs_table[] = { { .procname = "pipe-max-size", .data = &pipe_max_size, - .maxlen = sizeof(int), + .maxlen = sizeof(pipe_max_size), .mode = 0644, .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, From d3f14c485867cfb2e0c48aa88c41d0ef4bf5209c Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 17 Nov 2017 15:29:21 -0800 Subject: [PATCH 53/94] pipe: avoid round_pipe_size() nr_pages overflow on 32-bit round_pipe_size() contains a right-bit-shift expression which may overflow, which would cause undefined results in a subsequent roundup_pow_of_two() call. static inline unsigned int round_pipe_size(unsigned int size) { unsigned long nr_pages; nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; } PAGE_SIZE is defined as (1UL << PAGE_SHIFT), so the shift expression is evaluated as an unsigned long, which is: - 4 bytes wide on 32-bit (0 to 0xffffffff) - 8 bytes wide on 64-bit (0 to 0xffffffffffffffff) That means that in 32-bit round_pipe_size(), nr_pages may overflow to 0: size=0x00000000 nr_pages=0x0 size=0x00000001 nr_pages=0x1 size=0xfffff000 nr_pages=0xfffff size=0xfffff001 nr_pages=0x0 << ! size=0xffffffff nr_pages=0x0 << ! This is bad because roundup_pow_of_two(n) is undefined when n == 0! 64-bit is not a problem as the unsigned int size is 4 bytes wide (similar to 32-bit) and the larger, 8-byte-wide unsigned long is sufficient to handle the largest value of the bit shift expression: size=0xffffffff nr_pages=0x100000 Modify round_pipe_size() to return 0 if nr_pages == 0 and update its callers to handle this accordingly. Link: http://lkml.kernel.org/r/1507658689-11669-3-git-send-email-joe.lawrence@redhat.com Signed-off-by: Joe Lawrence Reported-by: Mikulas Patocka Reviewed-by: Mikulas Patocka Cc: Al Viro Cc: Jens Axboe Cc: Michael Kerrisk Cc: Randy Dunlap Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 3909c55ed389..f0f4ab36c444 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1018,13 +1018,19 @@ const struct file_operations pipefifo_fops = { /* * Currently we rely on the pipe array holding a power-of-2 number - * of pages. + * of pages. Returns 0 on error.
*/ static inline unsigned int round_pipe_size(unsigned int size) { unsigned long nr_pages; + if (size < pipe_min_size) + size = pipe_min_size; + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (nr_pages == 0) + return 0; + return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; } @@ -1040,6 +1046,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) long ret = 0; size = round_pipe_size(arg); + if (size == 0) + return -EINVAL; nr_pages = size >> PAGE_SHIFT; if (!nr_pages) @@ -1123,13 +1131,18 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, size_t *lenp, loff_t *ppos) { + unsigned int rounded_pipe_max_size; int ret; ret = proc_douintvec_minmax(table, write, buf, lenp, ppos); if (ret < 0 || !write) return ret; - pipe_max_size = round_pipe_size(pipe_max_size); + rounded_pipe_max_size = round_pipe_size(pipe_max_size); + if (rounded_pipe_max_size == 0) + return -EINVAL; + + pipe_max_size = rounded_pipe_max_size; return ret; } From 7a8d181949fb2c16be00f8cdb354794a30e46b39 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 17 Nov 2017 15:29:24 -0800 Subject: [PATCH 54/94] pipe: add proc_dopipe_max_size() to safely assign pipe_max_size pipe_max_size is assigned directly via procfs sysctl: static struct ctl_table fs_table[] = { ... { .procname = "pipe-max-size", .data = &pipe_max_size, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, ... int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, size_t *lenp, loff_t *ppos) { ... ret = proc_dointvec_minmax(table, write, buf, lenp, ppos) ... and then later rounded in-place a few statements later: ... pipe_max_size = round_pipe_size(pipe_max_size); ... This leaves a window of time between initial assignment and rounding that may be visible to other threads. (For example, one thread sets a non-rounded value to pipe_max_size while another reads its value.) Similar reads of pipe_max_size are potentially racy: pipe.c :: alloc_pipe_info() pipe.c :: pipe_set_size() Add a new proc_dopipe_max_size() that consolidates reading the new value from the user buffer, verifying bounds, and calling round_pipe_size() with a single assignment to pipe_max_size. Link: http://lkml.kernel.org/r/1507658689-11669-4-git-send-email-joe.lawrence@redhat.com Signed-off-by: Joe Lawrence Reported-by: Mikulas Patocka Reviewed-by: Mikulas Patocka Cc: Al Viro Cc: Jens Axboe Cc: Michael Kerrisk Cc: Randy Dunlap Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 18 +++----------- include/linux/pipe_fs_i.h | 1 + include/linux/sysctl.h | 3 +++ kernel/sysctl.c | 49 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 15 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index f0f4ab36c444..6d98566201ef 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1020,7 +1020,7 @@ const struct file_operations pipefifo_fops = { * Currently we rely on the pipe array holding a power-of-2 number * of pages. Returns 0 on error. */ -static inline unsigned int round_pipe_size(unsigned int size) +unsigned int round_pipe_size(unsigned int size) { unsigned long nr_pages; @@ -1125,25 +1125,13 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) } /* - * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax + * This should work even if CONFIG_PROC_FS isn't set, as proc_dopipe_max_size * will return an error. 
*/ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, size_t *lenp, loff_t *ppos) { - unsigned int rounded_pipe_max_size; - int ret; - - ret = proc_douintvec_minmax(table, write, buf, lenp, ppos); - if (ret < 0 || !write) - return ret; - - rounded_pipe_max_size = round_pipe_size(pipe_max_size); - if (rounded_pipe_max_size == 0) - return -EINVAL; - - pipe_max_size = rounded_pipe_max_size; - return ret; + return proc_dopipe_max_size(table, write, buf, lenp, ppos); } /* diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 6a80cfc63e0c..2dc5e9870fcd 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -191,5 +191,6 @@ long pipe_fcntl(struct file *, unsigned int, unsigned long arg); struct pipe_inode_info *get_pipe_info(struct file *file); int create_pipe_files(struct file **, int); +unsigned int round_pipe_size(unsigned int size); #endif diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index b769ecfcc3bd..992bc9948232 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -51,6 +51,9 @@ extern int proc_dointvec_minmax(struct ctl_table *, int, extern int proc_douintvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int proc_dopipe_max_size(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); extern int proc_dointvec_jiffies(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2d42183b4c98..138b6484f277 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -2620,6 +2621,47 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, do_proc_douintvec_minmax_conv, ¶m); } +struct do_proc_dopipe_max_size_conv_param { + unsigned int *min; +}; + +static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) +{ + struct do_proc_dopipe_max_size_conv_param *param = data; + + if (write) { + unsigned int val = round_pipe_size(*lvalp); + + if (val == 0) + return -EINVAL; + + if (param->min && *param->min > val) + return -ERANGE; + + if (*lvalp > UINT_MAX) + return -EINVAL; + + *valp = val; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long) val; + } + + return 0; +} + +int proc_dopipe_max_size(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_dopipe_max_size_conv_param param = { + .min = (unsigned int *) table->extra1, + }; + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_dopipe_max_size_conv, ¶m); +} + static void validate_coredump_safety(void) { #ifdef CONFIG_COREDUMP @@ -3125,6 +3167,12 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, return -ENOSYS; } +int proc_dopipe_max_size(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -3168,6 +3216,7 @@ EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL_GPL(proc_douintvec_minmax); +EXPORT_SYMBOL_GPL(proc_dopipe_max_size); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); From fb910c42ccebf853c29296185c45c11164a56098 Mon Sep 17 
00:00:00 2001 From: Joe Lawrence Date: Fri, 17 Nov 2017 15:29:28 -0800 Subject: [PATCH 55/94] sysctl: check for UINT_MAX before unsigned int min/max Mikulas noticed in the existing do_proc_douintvec_minmax_conv() and do_proc_dopipe_max_size_conv() introduced in this patchset, that they inconsistently handle overflow and min/max range inputs: For example: 0 ... param->min - 1 ---> ERANGE param->min ... param->max ---> the value is accepted param->max + 1 ... 0x100000000L + param->min - 1 ---> ERANGE 0x100000000L + param->min ... 0x100000000L + param->max ---> EINVAL 0x100000000L + param->max + 1, 0x200000000L + param->min - 1 ---> ERANGE 0x200000000L + param->min ... 0x200000000L + param->max ---> EINVAL 0x200000000L + param->max + 1, 0x300000000L + param->min - 1 ---> ERANGE In do_proc_do*() routines which store values into unsigned int variables (4 bytes wide for 64-bit builds), first validate that the input unsigned long value (8 bytes wide for 64-bit builds) will fit inside the smaller unsigned int variable. Then check that the unsigned int value falls inside the specified parameter min, max range. Otherwise the unsigned long -> unsigned int conversion drops leading bits from the input value, leading to the inconsistent pattern Mikulas documented above. Link: http://lkml.kernel.org/r/1507658689-11669-5-git-send-email-joe.lawrence@redhat.com Signed-off-by: Joe Lawrence Reported-by: Mikulas Patocka Reviewed-by: Mikulas Patocka Cc: Al Viro Cc: Jens Axboe Cc: Michael Kerrisk Cc: Randy Dunlap Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 138b6484f277..dd25d90896fc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2576,12 +2576,13 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, if (write) { unsigned int val = *lvalp; + if (*lvalp > UINT_MAX) + return -EINVAL; + if ((param->min && *param->min > val) || (param->max && *param->max < val)) return -ERANGE; - if (*lvalp > UINT_MAX) - return -EINVAL; *valp = val; } else { unsigned int val = *valp; @@ -2632,17 +2633,18 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, struct do_proc_dopipe_max_size_conv_param *param = data; if (write) { - unsigned int val = round_pipe_size(*lvalp); + unsigned int val; + if (*lvalp > UINT_MAX) + return -EINVAL; + + val = round_pipe_size(*lvalp); if (val == 0) return -EINVAL; if (param->min && *param->min > val) return -ERANGE; - if (*lvalp > UINT_MAX) - return -EINVAL; - *valp = val; } else { unsigned int val = *valp; From 7554e9c4cfa208acf3164a86c05aaa967b043425 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 17 Nov 2017 15:29:32 -0800 Subject: [PATCH 56/94] fs/nilfs2: convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. This requires adding a pointer to hold the timer's target task, as the lifetime of sc_task doesn't appear to match the timer's task. 
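The conversion follows the generic timer_setup()/from_timer() pattern; a minimal sketch with a made-up structure (not the nilfs2 code itself):

#include <linux/timer.h>

struct my_ctx {
        struct timer_list timer;
        int expired;
};

static void my_timeout(struct timer_list *t)
{
        /* recover the containing object from the timer_list pointer */
        struct my_ctx *ctx = from_timer(ctx, t, timer);

        ctx->expired = 1;
}

static void my_ctx_init(struct my_ctx *ctx)
{
        /* replaces init_timer() plus manual .function/.data assignment */
        timer_setup(&ctx->timer, my_timeout, 0);
}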
Link: http://lkml.kernel.org/r/20171016235900.GA102729@beast Signed-off-by: Kees Cook Acked-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/segment.c | 11 +++++------ fs/nilfs2/segment.h | 1 + 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index f65392fecb5c..472f0b53a724 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2400,11 +2400,11 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) return err; } -static void nilfs_construction_timeout(unsigned long data) +static void nilfs_construction_timeout(struct timer_list *t) { - struct task_struct *p = (struct task_struct *)data; + struct nilfs_sc_info *sci = from_timer(sci, t, sc_timer); - wake_up_process(p); + wake_up_process(sci->sc_timer_task); } static void @@ -2542,8 +2542,7 @@ static int nilfs_segctor_thread(void *arg) struct the_nilfs *nilfs = sci->sc_super->s_fs_info; int timeout = 0; - sci->sc_timer.data = (unsigned long)current; - sci->sc_timer.function = nilfs_construction_timeout; + sci->sc_timer_task = current; /* start sync. */ sci->sc_task = current; @@ -2674,7 +2673,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb, INIT_LIST_HEAD(&sci->sc_gc_inodes); INIT_LIST_HEAD(&sci->sc_iput_queue); INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func); - init_timer(&sci->sc_timer); + timer_setup(&sci->sc_timer, nilfs_construction_timeout, 0); sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 1060949d7dd2..84084a4d9b3e 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -180,6 +180,7 @@ struct nilfs_sc_info { unsigned long sc_watermark; struct timer_list sc_timer; + struct task_struct *sc_timer_task; struct task_struct *sc_task; }; From 31ccb1f7ba3cfe29631587d451cf5bb8ab593550 Mon Sep 17 00:00:00 2001 From: Andreas Rohner Date: Fri, 17 Nov 2017 15:29:35 -0800 Subject: [PATCH 57/94] nilfs2: fix race condition that causes file system corruption There is a race condition between nilfs_dirty_inode() and nilfs_set_file_dirty(). When a file is opened, nilfs_dirty_inode() is called to update the access timestamp in the inode. It calls __nilfs_mark_inode_dirty() in a separate transaction. __nilfs_mark_inode_dirty() caches the ifile buffer_head in the i_bh field of the inode info structure and marks it as dirty. After some data was written to the file in another transaction, the function nilfs_set_file_dirty() is called, which adds the inode to the ns_dirty_files list. Then the segment construction calls nilfs_segctor_collect_dirty_files(), which goes through the ns_dirty_files list and checks the i_bh field. If there is a cached buffer_head in i_bh it is not marked as dirty again. Since nilfs_dirty_inode() and nilfs_set_file_dirty() use separate transactions, it is possible that a segment construction that writes out the ifile occurs in-between the two. If this happens the inode is not on the ns_dirty_files list, but its ifile block is still marked as dirty and written out. In the next segment construction, the data for the file is written out and nilfs_bmap_propagate() updates the b-tree. Eventually the bmap root is written into the i_bh block, which is not dirty, because it was written out in another segment construction. As a result the bmap update can be lost, which leads to file system corruption. 
Either the virtual block address points to an unallocated DAT block, or the DAT entry will be reused for something different. The error can remain undetected for a long time. A typical error message would be one of the "bad btree" errors or a warning that a DAT entry could not be found. This bug can be reproduced reliably by a simple benchmark that creates and overwrites millions of 4k files. Link: http://lkml.kernel.org/r/1509367935-3086-2-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Andreas Rohner Signed-off-by: Ryusuke Konishi Tested-by: Andreas Rohner Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/segment.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 472f0b53a724..f572538dcc4f 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1954,8 +1954,6 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, err, ii->vfs_inode.i_ino); return err; } - mark_buffer_dirty(ibh); - nilfs_mdt_mark_dirty(ifile); spin_lock(&nilfs->ns_inode_lock); if (likely(!ii->i_bh)) ii->i_bh = ibh; @@ -1964,6 +1962,10 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, goto retry; } + // Always redirty the buffer to avoid race condition + mark_buffer_dirty(ii->i_bh); + nilfs_mdt_mark_dirty(ifile); + clear_bit(NILFS_I_QUEUED, &ii->i_state); set_bit(NILFS_I_BUSY, &ii->i_state); list_move_tail(&ii->i_dirty, &sci->sc_dirty_files); From d4f0284a5969fd7809ec8df710eb10598b701638 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 17 Nov 2017 15:29:39 -0800 Subject: [PATCH 58/94] fs, nilfs: convert nilfs_root.count from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable nilfs_root.count is used as pure reference counter. Convert it to refcount_t and fix up the operations. 
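The conversion is mechanical; a minimal sketch with a made-up object (not the nilfs2 code itself):

#include <linux/refcount.h>
#include <linux/slab.h>

struct obj {
        refcount_t count;
};

static struct obj *obj_alloc(void)
{
        struct obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

        if (o)
                refcount_set(&o->count, 1);     /* was atomic_set() */
        return o;
}

static void obj_get(struct obj *o)
{
        refcount_inc(&o->count);        /* was atomic_inc(); saturates instead of overflowing */
}

static void obj_put(struct obj *o)
{
        if (refcount_dec_and_test(&o->count))   /* was atomic_dec_and_test() */
                kfree(o);
}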
Link: http://lkml.kernel.org/r/1509367935-3086-3-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Elena Reshetova Signed-off-by: Ryusuke Konishi Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/the_nilfs.c | 8 ++++---- fs/nilfs2/the_nilfs.h | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 2dd75bf619ad..afebb5067cec 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -737,7 +737,7 @@ struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno) } else if (cno > root->cno) { n = n->rb_right; } else { - atomic_inc(&root->count); + refcount_inc(&root->count); spin_unlock(&nilfs->ns_cptree_lock); return root; } @@ -776,7 +776,7 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) } else if (cno > root->cno) { p = &(*p)->rb_right; } else { - atomic_inc(&root->count); + refcount_inc(&root->count); spin_unlock(&nilfs->ns_cptree_lock); kfree(new); return root; @@ -786,7 +786,7 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) new->cno = cno; new->ifile = NULL; new->nilfs = nilfs; - atomic_set(&new->count, 1); + refcount_set(&new->count, 1); atomic64_set(&new->inodes_count, 0); atomic64_set(&new->blocks_count, 0); @@ -806,7 +806,7 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) void nilfs_put_root(struct nilfs_root *root) { - if (atomic_dec_and_test(&root->count)) { + if (refcount_dec_and_test(&root->count)) { struct the_nilfs *nilfs = root->nilfs; nilfs_sysfs_delete_snapshot_group(root); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index b305c6f033e7..883d732b0259 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -27,6 +27,7 @@ #include #include #include +#include struct nilfs_sc_info; struct nilfs_sysfs_dev_subgroups; @@ -246,7 +247,7 @@ struct nilfs_root { __u64 cno; struct rb_node rb_node; - atomic_t count; + refcount_t count; struct the_nilfs *nilfs; struct inode *ifile; @@ -299,7 +300,7 @@ void nilfs_swap_super_block(struct the_nilfs *); static inline void nilfs_get_root(struct nilfs_root *root) { - atomic_inc(&root->count); + refcount_inc(&root->count); } static inline int nilfs_valid_fs(struct the_nilfs *nilfs) From 4d685f930a53632ff6b86efe43b95637006371fe Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 17 Nov 2017 15:29:43 -0800 Subject: [PATCH 59/94] nilfs2: align block comments of nilfs_sufile_truncate_range() at * Fix the following checkpatch warning: WARNING: Block comments should align the * on each line #633: FILE: sufile.c:633: +/** + * nilfs_sufile_truncate_range - truncate range of segment array Link: http://lkml.kernel.org/r/1509367935-3086-4-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/sufile.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 1541a1e9221a..1341a41e7b43 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -630,22 +630,22 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, } /** - * nilfs_sufile_truncate_range - truncate range of segment array - * @sufile: inode of segment usage file - * @start: start segment number (inclusive) - * @end: end segment number (inclusive) - * - * Return Value: On success, 0 is returned. 
On error, one of the - * following negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EINVAL - Invalid number of segments specified - * - * %-EBUSY - Dirty or active segments are present in the range - */ + * nilfs_sufile_truncate_range - truncate range of segment array + * @sufile: inode of segment usage file + * @start: start segment number (inclusive) + * @end: end segment number (inclusive) + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid number of segments specified + * + * %-EBUSY - Dirty or active segments are present in the range + */ static int nilfs_sufile_truncate_range(struct inode *sufile, __u64 start, __u64 end) { From 3147db8938c7968b7be07f9b87510e334fe42ce1 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 17 Nov 2017 15:29:46 -0800 Subject: [PATCH 60/94] nilfs2: use octal for unreadable permission macro Replace S_IRWXUGO with 0777 because symbolic permissions are considered harmful: https://lwn.net/Articles/696229/ Link: http://lkml.kernel.org/r/1509367935-3086-5-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 515d13c196da..1a2894aa0194 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -150,7 +150,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry, if (err) return err; - inode = nilfs_new_inode(dir, S_IFLNK | S_IRWXUGO); + inode = nilfs_new_inode(dir, S_IFLNK | 0777); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out; From 577753cc57b19949b7ce0fc848c669d37e448c20 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 17 Nov 2017 15:29:50 -0800 Subject: [PATCH 61/94] nilfs2: remove inode->i_version initialization It's never used in nilfs2. Link: http://lkml.kernel.org/r/1510064486-1728-2-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Jeff Layton Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/super.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 4fc018dfcfae..3ce20cd44a20 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -160,7 +160,6 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) ii->i_bh = NULL; ii->i_state = 0; ii->i_cno = 0; - ii->vfs_inode.i_version = 1; nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode); return &ii->vfs_inode; } From 15ec37185ec66b8e199188bf8df3e7baf50ef77d Mon Sep 17 00:00:00 2001 From: Christos Gkekas Date: Fri, 17 Nov 2017 15:29:54 -0800 Subject: [PATCH 62/94] hfs/hfsplus: clean up unused variables in bnode.c Delete variables 'tree' and 'sb', which are set but never used. 
Link: http://lkml.kernel.org/r/1507977146-15875-1-git-send-email-chris.gekas@gmail.com Signed-off-by: Christos Gkekas Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfs/bnode.c | 4 ---- fs/hfsplus/bnode.c | 4 ---- 2 files changed, 8 deletions(-) diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index 8aec5e732abf..b63a4df7327b 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -98,13 +98,11 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct hfs_bnode *src_node, int src, int len) { - struct hfs_btree *tree; struct page *src_page, *dst_page; hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); if (!len) return; - tree = src_node->tree; src += src_node->page_offset; dst += dst_node->page_offset; src_page = src_node->page[0]; @@ -237,7 +235,6 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) { - struct super_block *sb; struct hfs_bnode *node, *node2; struct address_space *mapping; struct page *page; @@ -249,7 +246,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) return NULL; } - sb = tree->inode->i_sb; size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * sizeof(struct page *); node = kzalloc(size, GFP_KERNEL); diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index d77015c3f22c..177fae4e6581 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -127,14 +127,12 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct hfs_bnode *src_node, int src, int len) { - struct hfs_btree *tree; struct page **src_page, **dst_page; int l; hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); if (!len) return; - tree = src_node->tree; src += src_node->page_offset; dst += dst_node->page_offset; src_page = src_node->page + (src >> PAGE_SHIFT); @@ -401,7 +399,6 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) { - struct super_block *sb; struct hfs_bnode *node, *node2; struct address_space *mapping; struct page *page; @@ -414,7 +411,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) return NULL; } - sb = tree->inode->i_sb; size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * sizeof(struct page *); node = kzalloc(size, GFP_KERNEL); From eecd7f4f5b9c2021dbde0a361b365f5970db52aa Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 17 Nov 2017 15:29:57 -0800 Subject: [PATCH 63/94] fat: remove redundant assignment of 0 to slots The variable slots is being assigned a value of zero that is never read; slots is updated again a few lines later. Remove this redundant assignment.
Cleans clang warning: Value stored to 'slots' is never read Link: http://lkml.kernel.org/r/20171017140258.22536-1-colin.king@canonical.com Signed-off-by: Colin Ian King Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/dir.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 81cecbe6d7cf..b833ffeee1e1 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -291,7 +291,6 @@ static int fat_parse_long(struct inode *dir, loff_t *pos, } } parse_long: - slots = 0; ds = (struct msdos_dir_slot *)*de; id = ds->id; if (!(id & 0x40)) From 628c1bcba204052d19b686b5bac149a644cdb72e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 17 Nov 2017 15:30:01 -0800 Subject: [PATCH 64/94] kernel/signal.c: protect the traced SIGNAL_UNKILLABLE tasks from SIGKILL The comment in sig_ignored() says "Tracers may want to know about even ignored signals" but SIGKILL cannot be reported to a debugger and it is just wrong to return 0 in this case: SIGKILL should only kill the SIGNAL_UNKILLABLE task if it comes from the parent ns. Change sig_ignored() to ignore ->ptrace if sig == SIGKILL and rely on sig_task_ignored(). SIGSTOP coming from within the namespace is not really right either, but at least the debugger can intercept it, and we can't drop it here because this would break "gdb -p 1": ptrace_attach() won't work. Perhaps we will add another ->ptrace check later, we will see. Link: http://lkml.kernel.org/r/20171103184206.GB21036@redhat.com Signed-off-by: Oleg Nesterov Tested-by: Kyle Huey Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index aa1fb9f905db..be5913134742 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -94,13 +94,15 @@ static int sig_ignored(struct task_struct *t, int sig, bool force) if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return 0; - if (!sig_task_ignored(t, sig, force)) + /* + * Tracers may want to know about even ignored signal unless it + * is SIGKILL which can't be reported anyway but can be ignored + * by SIGNAL_UNKILLABLE task. + */ + if (t->ptrace && sig != SIGKILL) return 0; - /* - * Tracers may want to know about even ignored signals. - */ - return !t->ptrace; + return sig_task_ignored(t, sig, force); } /* From ac25385089f673560867eb5179228a44ade0cfc1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 17 Nov 2017 15:30:04 -0800 Subject: [PATCH 65/94] kernel/signal.c: protect the SIGNAL_UNKILLABLE tasks from !sig_kernel_only() signals Change sig_task_ignored() to drop the SIG_DFL && !sig_kernel_only() signals even if force == T. This simplifies the next change and this matches the same check in get_signal() which will drop these signals anyway.
Link: http://lkml.kernel.org/r/20171103184227.GC21036@redhat.com Signed-off-by: Oleg Nesterov Tested-by: Kyle Huey Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/signal.c b/kernel/signal.c index be5913134742..01ba166a5e3a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -78,7 +78,7 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force) handler = sig_handler(t, sig); if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && - handler == SIG_DFL && !force) + handler == SIG_DFL && !(force && sig_kernel_only(sig))) return 1; return sig_handler_ignored(handler, sig); From 426915796ccaf9c2bd9bb06dc5702225957bc2e5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 17 Nov 2017 15:30:08 -0800 Subject: [PATCH 66/94] kernel/signal.c: remove the no longer needed SIGNAL_UNKILLABLE check in complete_signal() complete_signal() checks SIGNAL_UNKILLABLE before it starts to destroy the thread group; today this is wrong in many ways. If nothing else, fatal_signal_pending() should always imply that the whole thread group (except ->group_exit_task if it is not NULL) is killed; this check breaks the rule. After the previous changes we can rely on sig_task_ignored(); sig_fatal(sig) && SIGNAL_UNKILLABLE can only be true if we actually want to kill this task and sig == SIGKILL OR it is traced and the debugger can intercept the signal. This should hopefully fix the problem reported by Dmitry. This test-case: static int init(void *arg) { for (;;) pause(); } int main(void) { char stack[16 * 1024]; for (;;) { int pid = clone(init, stack + sizeof(stack)/2, CLONE_NEWPID | SIGCHLD, NULL); assert(pid > 0); assert(ptrace(PTRACE_ATTACH, pid, 0, 0) == 0); assert(waitpid(-1, NULL, WSTOPPED) == pid); assert(ptrace(PTRACE_DETACH, pid, 0, SIGSTOP) == 0); assert(syscall(__NR_tkill, pid, SIGKILL) == 0); assert(pid == wait(NULL)); } } triggers the WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING)) in task_participate_group_stop(). do_signal_stop()->signal_group_exit() checks SIGNAL_GROUP_EXIT and returns false, but task_set_jobctl_pending() checks fatal_signal_pending() and does not set JOBCTL_STOP_PENDING. And this should fix the minor security problem reported by Kyle: SECCOMP_RET_TRACE can miss fatal_signal_pending() the same way if the task is the root of a pid namespace. Link: http://lkml.kernel.org/r/20171103184246.GD21036@redhat.com Signed-off-by: Oleg Nesterov Reported-by: Dmitry Vyukov Reported-by: Kyle Huey Reviewed-by: Kees Cook Tested-by: Kyle Huey Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index 01ba166a5e3a..6895f6bb98a7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -931,9 +931,9 @@ static void complete_signal(int sig, struct task_struct *p, int group) * then start taking the whole group down immediately. */ if (sig_fatal(p, sig) && - !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && + !(signal->flags & SIGNAL_GROUP_EXIT) && !sigismember(&t->real_blocked, sig) && - (sig == SIGKILL || !t->ptrace)) { + (sig == SIGKILL || !p->ptrace)) { /* * This signal will be fatal to the whole group.
*/ From de40ccefd1f19180c0a43e4d9b9d2f4dc8856c8b Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 17 Nov 2017 15:30:12 -0800 Subject: [PATCH 67/94] kdump: print a message in case parse_crashkernel_mem resulted in zero bytes parse_crashkernel_mem() silently returns if we get zero bytes in the parsing function. It is useful for debugging to add a message, especially if the kernel cannot boot correctly. Add a pr_info instead of a pr_warn because size = 0 is expected behavior; e.g., with crashkernel=2G-4G:128M, size will be 0 if system memory is less than 2G. Link: http://lkml.kernel.org/r/20171114080129.GA6115@dhcp-128-65.nay.redhat.com Signed-off-by: Dave Young Cc: Baoquan He Cc: Vivek Goyal Cc: Bhupesh Sharma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/crash_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 6db80fc0810b..b3663896278e 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -108,7 +108,8 @@ static int __init parse_crashkernel_mem(char *cmdline, return -EINVAL; } } - } + } else + pr_info("crashkernel size resulted in zero bytes\n"); return 0; } From c1b1418a66928bbae3d3ff057ae0110d5b122c11 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Fri, 17 Nov 2017 15:30:15 -0800 Subject: [PATCH 68/94] rapidio: constify rio_device_id rio_device_id tables are not supposed to change at runtime. The rio driver works with a const 'id_table'. So mark the non-const rio_device_id structs as const. Link: http://lkml.kernel.org/r/1503734627-6058-2-git-send-email-arvind.yadav.cs@gmail.com Link: http://lkml.kernel.org/r/1503734627-6058-3-git-send-email-arvind.yadav.cs@gmail.com Link: http://lkml.kernel.org/r/1503734627-6058-4-git-send-email-arvind.yadav.cs@gmail.com Link: http://lkml.kernel.org/r/1503734627-6058-5-git-send-email-arvind.yadav.cs@gmail.com Link: http://lkml.kernel.org/r/1503734627-6058-6-git-send-email-arvind.yadav.cs@gmail.com Signed-off-by: Arvind Yadav Acked-by: Alexandre Bounine Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/switches/idt_gen2.c | 2 +- drivers/rapidio/switches/idt_gen3.c | 2 +- drivers/rapidio/switches/idtcps.c | 2 +- drivers/rapidio/switches/tsi568.c | 2 +- drivers/rapidio/switches/tsi57x.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/rapidio/switches/idt_gen2.c b/drivers/rapidio/switches/idt_gen2.c index e67b923b1ca6..4931ed790428 100644 --- a/drivers/rapidio/switches/idt_gen2.c +++ b/drivers/rapidio/switches/idt_gen2.c @@ -458,7 +458,7 @@ static void idtg2_remove(struct rio_dev *rdev) idtg2_sysfs(rdev, false); } -static struct rio_device_id idtg2_id_table[] = { +static const struct rio_device_id idtg2_id_table[] = { {RIO_DEVICE(RIO_DID_IDTCPS1848, RIO_VID_IDT)}, {RIO_DEVICE(RIO_DID_IDTCPS1616, RIO_VID_IDT)}, {RIO_DEVICE(RIO_DID_IDTVPS1616, RIO_VID_IDT)}, diff --git a/drivers/rapidio/switches/idt_gen3.c b/drivers/rapidio/switches/idt_gen3.c index c5923a547bed..85a3908294d9 100644 --- a/drivers/rapidio/switches/idt_gen3.c +++ b/drivers/rapidio/switches/idt_gen3.c @@ -348,7 +348,7 @@ static void idtg3_shutdown(struct rio_dev *rdev) } } -static struct rio_device_id idtg3_id_table[] = { +static const struct rio_device_id idtg3_id_table[] = { {RIO_DEVICE(RIO_DID_IDTRXS1632, RIO_VID_IDT)}, {RIO_DEVICE(RIO_DID_IDTRXS2448, RIO_VID_IDT)}, { 0, } /* terminate list */ diff --git a/drivers/rapidio/switches/idtcps.c b/drivers/rapidio/switches/idtcps.c index 7fbb60d31796..4058ce2c76fa 100644 ---
a/drivers/rapidio/switches/idtcps.c +++ b/drivers/rapidio/switches/idtcps.c @@ -168,7 +168,7 @@ static void idtcps_remove(struct rio_dev *rdev) spin_unlock(&rdev->rswitch->lock); } -static struct rio_device_id idtcps_id_table[] = { +static const struct rio_device_id idtcps_id_table[] = { {RIO_DEVICE(RIO_DID_IDTCPS6Q, RIO_VID_IDT)}, {RIO_DEVICE(RIO_DID_IDTCPS8, RIO_VID_IDT)}, {RIO_DEVICE(RIO_DID_IDTCPS10Q, RIO_VID_IDT)}, diff --git a/drivers/rapidio/switches/tsi568.c b/drivers/rapidio/switches/tsi568.c index 8a43561b9d17..1214628b7ded 100644 --- a/drivers/rapidio/switches/tsi568.c +++ b/drivers/rapidio/switches/tsi568.c @@ -169,7 +169,7 @@ static void tsi568_remove(struct rio_dev *rdev) spin_unlock(&rdev->rswitch->lock); } -static struct rio_device_id tsi568_id_table[] = { +static const struct rio_device_id tsi568_id_table[] = { {RIO_DEVICE(RIO_DID_TSI568, RIO_VID_TUNDRA)}, { 0, } /* terminate list */ }; diff --git a/drivers/rapidio/switches/tsi57x.c b/drivers/rapidio/switches/tsi57x.c index 2700d15f7584..9f063e214836 100644 --- a/drivers/rapidio/switches/tsi57x.c +++ b/drivers/rapidio/switches/tsi57x.c @@ -336,7 +336,7 @@ static void tsi57x_remove(struct rio_dev *rdev) spin_unlock(&rdev->rswitch->lock); } -static struct rio_device_id tsi57x_id_table[] = { +static const struct rio_device_id tsi57x_id_table[] = { {RIO_DEVICE(RIO_DID_TSI572, RIO_VID_TUNDRA)}, {RIO_DEVICE(RIO_DID_TSI574, RIO_VID_TUNDRA)}, {RIO_DEVICE(RIO_DID_TSI577, RIO_VID_TUNDRA)}, From b1402dcb5643b7a27d46a05edd7491d49ba0e248 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 17 Nov 2017 15:37:57 -0800 Subject: [PATCH 69/94] drivers/rapidio/devices/rio_mport_cdev.c: fix resource leak in error handling path in 'rio_dma_transfer()' If 'dma_map_sg()' fails, we should branch to the existing error handling path to free some resources before returning. Link: http://lkml.kernel.org/r/61292a4f369229eee03394247385e955027283f8.1505687047.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Reviewed-by: Logan Gunthorpe Cc: Matt Porter Cc: Alexandre Bounine Cc: Lorenzo Stoakes Cc: Jesper Nilsson Cc: Christian König Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/rio_mport_cdev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 5c1b6388122a..86805747a422 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -963,7 +963,8 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, req->sgt.sgl, req->sgt.nents, dir); if (nents == -EFAULT) { rmcd_error("Failed to map SG list"); - return -EFAULT; + ret = -EFAULT; + goto err_pg; } ret = do_dma_request(req, xfer, sync, nents); From c46d90cd7c3c5d3a5eb6265f2b0f06f44576d5a1 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 17 Nov 2017 15:38:03 -0800 Subject: [PATCH 70/94] drivers/rapidio/devices/rio_mport_cdev.c: fix error handling in 'rio_dma_transfer()' In case of error, 'dma_map_sg()' returns 0, not a negative value. There is a BUG_ON() in 'dma_map_sg_attrs()' which makes sure of that.
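In other words, the correct check treats 0 as failure; a minimal sketch (hypothetical driver code, not the rio_mport_cdev code itself):

#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>

static int map_for_dma(struct device *dev, struct sg_table *sgt)
{
        int nents = dma_map_sg(dev, sgt->sgl, sgt->nents, DMA_TO_DEVICE);

        if (nents == 0)         /* 0 means failure; never a negative errno */
                return -EFAULT;
        return nents;
}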
Link: http://lkml.kernel.org/r/d4235bd2b9274e99f6c86ea71b1fa1c7bd8d0c08.1505687047.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Reviewed-by: Logan Gunthorpe Cc: Matt Porter Cc: Alexandre Bounine Cc: Lorenzo Stoakes Cc: Jesper Nilsson Cc: Christian König Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/rio_mport_cdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 86805747a422..dc5a33f93689 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -961,7 +961,7 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, nents = dma_map_sg(chan->device->dev, req->sgt.sgl, req->sgt.nents, dir); - if (nents == -EFAULT) { + if (nents == 0) { rmcd_error("Failed to map SG list"); ret = -EFAULT; goto err_pg; From 2743232c0c4c82311718bb4ec1fb659ed64ecf7a Mon Sep 17 00:00:00 2001 From: Kangmin Park Date: Fri, 17 Nov 2017 15:30:23 -0800 Subject: [PATCH 71/94] Documentation/sysctl/vm.txt: fix typo Link: http://lkml.kernel.org/r/CAKW4uUyCi=PnKf3epgFVz8z=1tMtHSOHNm+fdNxrNw3-THvRCA@mail.gmail.com Signed-off-by: Kangmin Park Cc: Jiri Kosina Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 055c8b3e1018..b920423f88cb 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -818,7 +818,7 @@ tooling to work, you can do: swappiness This control is used to define how aggressive the kernel will swap -memory pages. Higher values will increase agressiveness, lower values +memory pages. Higher values will increase aggressiveness, lower values decrease the amount of swap. A value of 0 instructs the kernel not to initiate swap until the amount of free and file-backed pages is less than the high water mark in a zone. From f9eb2fdd04d4e68fbea18970bbf65ace716d25b6 Mon Sep 17 00:00:00 2001 From: "Ola N. Kaldestad" Date: Fri, 17 Nov 2017 15:30:26 -0800 Subject: [PATCH 72/94] kernel/sysctl.c: code cleanups Remove the unnecessary else block, and the redundant return and kfree() call in the if block. Link: http://lkml.kernel.org/r/1510238435-1655-1-git-send-email-mail@okal.no Signed-off-by: Ola N. Kaldestad Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index dd25d90896fc..557d46728577 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3127,14 +3127,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, else bitmap_copy(bitmap, tmp_bitmap, bitmap_len); } - kfree(tmp_bitmap); *lenp -= left; *ppos += *lenp; - return 0; - } else { - kfree(tmp_bitmap); - return err; } + + kfree(tmp_bitmap); + return err; } #else /* CONFIG_PROC_SYSCTL */ From 95846ecf9dac5089aed4b144d912225f8ef86ae4 Mon Sep 17 00:00:00 2001 From: Gargi Sharma Date: Fri, 17 Nov 2017 15:30:30 -0800 Subject: [PATCH 73/94] pid: replace pid bitmap implementation with IDR API Patch series "Replacing PID bitmap implementation with IDR API", v4. This series replaces the kernel's bitmap implementation of PID allocation with the IDR API. These patches are written to simplify the kernel by replacing custom code with calls to generic code.
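For orientation, the shape of the generic interface being adopted is roughly the following (a minimal sketch of the idr API in isolation, not the exact kernel/pid.c usage; 'ptr', 'start' and 'end' are placeholders):

	struct idr ids;
	int id;

	idr_init(&ids);

	/* Cyclic allocation hands out increasing ids and wraps around at
	 * 'end', which is what gives PID-like reuse behaviour for free. */
	id = idr_alloc_cyclic(&ids, ptr, start, end, GFP_KERNEL);
	if (id < 0)
		return id;		/* -ENOMEM or -ENOSPC */

	ptr = idr_find(&ids, id);	/* direct lookup, no separate hash needed */
	idr_remove(&ids, id);
	idr_destroy(&ids);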
The following are the stats for pid and pid_namespace object files before and after the replacement. There is a noteworthy change between the IDR and bitmap implementation.

Before:
   text    data     bss     dec     hex filename
   8447    3894      64   12405    3075 kernel/pid.o
After:
   text    data     bss     dec     hex filename
   3397     304       0    3701     e75 kernel/pid.o

Before:
   text    data     bss     dec     hex filename
   5692    1842     192    7726    1e2e kernel/pid_namespace.o
After:
   text    data     bss     dec     hex filename
   2854     216      16    3086     c0e kernel/pid_namespace.o

The following are the stats for ps, pstree and calling readdir on /proc for 10,000 processes.

ps:
            With IDR API    With bitmap
    real    0m1.479s        0m2.319s
    user    0m0.070s        0m0.060s
    sys     0m0.289s        0m0.516s

pstree:
            With IDR API    With bitmap
    real    0m1.024s        0m1.794s
    user    0m0.348s        0m0.612s
    sys     0m0.184s        0m0.264s

proc:
            With IDR API    With bitmap
    real    0m0.059s        0m0.074s
    user    0m0.000s        0m0.004s
    sys     0m0.016s        0m0.016s

This patch (of 2): Replace the current bitmap implementation for Process ID allocation. Functions that are no longer required, for example, free_pidmap(), alloc_pidmap(), etc. are removed. The rest of the functions are modified to use the IDR API. The change was made to make the PID allocation less complex by replacing custom code with calls to generic API. [gs051095@gmail.com: v6] Link: http://lkml.kernel.org/r/1507760379-21662-2-git-send-email-gs051095@gmail.com [avagin@openvz.org: restore the old behaviour of the ns_last_pid sysctl] Link: http://lkml.kernel.org/r/20171106183144.16368-1-avagin@openvz.org Link: http://lkml.kernel.org/r/1507583624-22146-2-git-send-email-gs051095@gmail.com Signed-off-by: Gargi Sharma Reviewed-by: Rik van Riel Acked-by: Oleg Nesterov Cc: Julia Lawall Cc: Ingo Molnar Cc: Pavel Tatashin Cc: Kirill Tkhai Cc: Eric W. Biederman Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/cell/spufs/sched.c | 2 +- fs/proc/loadavg.c | 2 +- include/linux/pid_namespace.h | 14 +- init/main.c | 2 +- kernel/pid.c | 201 ++++------------------ kernel/pid_namespace.c | 53 +++--- 6 files changed, 65 insertions(+), 209 deletions(-) diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index 1fbb5da17dd2..e47761cdcb98 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -1093,7 +1093,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private) LOAD_INT(c), LOAD_FRAC(c), count_active_contexts(), atomic_read(&nr_spu_contexts), - task_active_pid_ns(current)->last_pid); + idr_get_cursor(&task_active_pid_ns(current)->idr)); return 0; } diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 9bc5c58c00ee..a000d7547479 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -24,7 +24,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v) LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), nr_running(), nr_threads, - task_active_pid_ns(current)->last_pid); + idr_get_cursor(&task_active_pid_ns(current)->idr)); return 0; } diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index c78af6061644..92c6aa509d2e 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -10,15 +10,8 @@ #include #include #include +#include -struct pidmap { - atomic_t nr_free; - void *page; -}; - -#define BITS_PER_PAGE (PAGE_SIZE * 8) -#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) -#define PIDMAP_ENTRIES ((PID_MAX_LIMIT+BITS_PER_PAGE-1)/BITS_PER_PAGE) struct fs_pin; @@ -30,9 +23,8 @@ enum { /* definitions for
pid_namespace's hide_pid field */ struct pid_namespace { struct kref kref; - struct pidmap pidmap[PIDMAP_ENTRIES]; + struct idr idr; struct rcu_head rcu; - int last_pid; unsigned int nr_hashed; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; @@ -106,6 +98,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); void pidhash_init(void); -void pidmap_init(void); +void pid_idr_init(void); #endif /* _LINUX_PID_NS_H */ diff --git a/init/main.c b/init/main.c index 859a786f7c0a..d0cbcfc06124 100644 --- a/init/main.c +++ b/init/main.c @@ -669,7 +669,7 @@ asmlinkage __visible void __init start_kernel(void) if (late_time_init) late_time_init(); calibrate_delay(); - pidmap_init(); + pid_idr_init(); anon_vma_init(); #ifdef CONFIG_X86 if (efi_enabled(EFI_RUNTIME_SERVICES)) diff --git a/kernel/pid.c b/kernel/pid.c index 020dedbdf066..0ce59369632f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -39,6 +39,7 @@ #include #include #include +#include #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -53,14 +54,6 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; -static inline int mk_pid(struct pid_namespace *pid_ns, - struct pidmap *map, int off) -{ - return (map - pid_ns->pidmap)*BITS_PER_PAGE + off; -} - -#define find_next_offset(map, off) \ - find_next_zero_bit((map)->page, BITS_PER_PAGE, off) /* * PID-map pages start out as NULL, they get allocated upon @@ -70,10 +63,7 @@ static inline int mk_pid(struct pid_namespace *pid_ns, */ struct pid_namespace init_pid_ns = { .kref = KREF_INIT(2), - .pidmap = { - [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } - }, - .last_pid = 0, + .idr = IDR_INIT, .nr_hashed = PIDNS_HASH_ADDING, .level = 0, .child_reaper = &init_task, @@ -101,138 +91,6 @@ EXPORT_SYMBOL_GPL(init_pid_ns); static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); -static void free_pidmap(struct upid *upid) -{ - int nr = upid->nr; - struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE; - int offset = nr & BITS_PER_PAGE_MASK; - - clear_bit(offset, map->page); - atomic_inc(&map->nr_free); -} - -/* - * If we started walking pids at 'base', is 'a' seen before 'b'? - */ -static int pid_before(int base, int a, int b) -{ - /* - * This is the same as saying - * - * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT - * and that mapping orders 'a' and 'b' with respect to 'base'. - */ - return (unsigned)(a - base) < (unsigned)(b - base); -} - -/* - * We might be racing with someone else trying to set pid_ns->last_pid - * at the pid allocation time (there's also a sysctl for this, but racing - * with this one is OK, see comment in kernel/pid_namespace.c about it). - * We want the winner to have the "later" value, because if the - * "earlier" value prevails, then a pid may get reused immediately. - * - * Since pids rollover, it is not sufficient to just pick the bigger - * value. We have to consider where we started counting from. - * - * 'base' is the value of pid_ns->last_pid that we observed when - * we started looking for a pid. - * - * 'pid' is the pid that we eventually found. 
- */ -static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid) -{ - int prev; - int last_write = base; - do { - prev = last_write; - last_write = cmpxchg(&pid_ns->last_pid, prev, pid); - } while ((prev != last_write) && (pid_before(base, last_write, pid))); -} - -static int alloc_pidmap(struct pid_namespace *pid_ns) -{ - int i, offset, max_scan, pid, last = pid_ns->last_pid; - struct pidmap *map; - - pid = last + 1; - if (pid >= pid_max) - pid = RESERVED_PIDS; - offset = pid & BITS_PER_PAGE_MASK; - map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; - /* - * If last_pid points into the middle of the map->page we - * want to scan this bitmap block twice, the second time - * we start with offset == 0 (or RESERVED_PIDS). - */ - max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset; - for (i = 0; i <= max_scan; ++i) { - if (unlikely(!map->page)) { - void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); - /* - * Free the page if someone raced with us - * installing it: - */ - spin_lock_irq(&pidmap_lock); - if (!map->page) { - map->page = page; - page = NULL; - } - spin_unlock_irq(&pidmap_lock); - kfree(page); - if (unlikely(!map->page)) - return -ENOMEM; - } - if (likely(atomic_read(&map->nr_free))) { - for ( ; ; ) { - if (!test_and_set_bit(offset, map->page)) { - atomic_dec(&map->nr_free); - set_last_pid(pid_ns, last, pid); - return pid; - } - offset = find_next_offset(map, offset); - if (offset >= BITS_PER_PAGE) - break; - pid = mk_pid(pid_ns, map, offset); - if (pid >= pid_max) - break; - } - } - if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { - ++map; - offset = 0; - } else { - map = &pid_ns->pidmap[0]; - offset = RESERVED_PIDS; - if (unlikely(last == offset)) - break; - } - pid = mk_pid(pid_ns, map, offset); - } - return -EAGAIN; -} - -int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) -{ - int offset; - struct pidmap *map, *end; - - if (last >= PID_MAX_LIMIT) - return -1; - - offset = (last + 1) & BITS_PER_PAGE_MASK; - map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; - end = &pid_ns->pidmap[PIDMAP_ENTRIES]; - for (; map < end; map++, offset = 0) { - if (unlikely(!map->page)) - continue; - offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); - if (offset < BITS_PER_PAGE) - return mk_pid(pid_ns, map, offset); - } - return -1; -} - void put_pid(struct pid *pid) { struct pid_namespace *ns; @@ -266,7 +124,7 @@ void free_pid(struct pid *pid) struct upid *upid = pid->numbers + i; struct pid_namespace *ns = upid->ns; hlist_del_rcu(&upid->pid_chain); - switch(--ns->nr_hashed) { + switch (--ns->nr_hashed) { case 2: case 1: /* When all that is left in the pid namespace @@ -284,12 +142,11 @@ void free_pid(struct pid *pid) schedule_work(&ns->proc_work); break; } + + idr_remove(&ns->idr, upid->nr); } spin_unlock_irqrestore(&pidmap_lock, flags); - for (i = 0; i <= pid->level; i++) - free_pidmap(pid->numbers + i); - call_rcu(&pid->rcu, delayed_put_pid); } @@ -308,8 +165,29 @@ struct pid *alloc_pid(struct pid_namespace *ns) tmp = ns; pid->level = ns->level; + for (i = ns->level; i >= 0; i--) { - nr = alloc_pidmap(tmp); + int pid_min = 1; + + idr_preload(GFP_KERNEL); + spin_lock_irq(&pidmap_lock); + + /* + * init really needs pid 1, but after reaching the maximum + * wrap back to RESERVED_PIDS + */ + if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) + pid_min = RESERVED_PIDS; + + /* + * Store a null pointer so find_pid_ns does not find + * a partially initialized PID (see below). 
+ */ + nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, + pid_max, GFP_ATOMIC); + spin_unlock_irq(&pidmap_lock); + idr_preload_end(); + if (nr < 0) { retval = nr; goto out_free; @@ -339,6 +217,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) for ( ; upid >= pid->numbers; --upid) { hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + /* Make the PID visible to find_pid_ns. */ + idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->nr_hashed++; } spin_unlock_irq(&pidmap_lock); @@ -350,8 +230,11 @@ struct pid *alloc_pid(struct pid_namespace *ns) put_pid_ns(ns); out_free: + spin_lock_irq(&pidmap_lock); while (++i <= ns->level) - free_pidmap(pid->numbers + i); + idr_remove(&ns->idr, (pid->numbers + i)->nr); + + spin_unlock_irq(&pidmap_lock); kmem_cache_free(ns->pid_cachep, pid); return ERR_PTR(retval); @@ -553,16 +436,7 @@ EXPORT_SYMBOL_GPL(task_active_pid_ns); */ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) { - struct pid *pid; - - do { - pid = find_pid_ns(nr, ns); - if (pid) - break; - nr = next_pidmap(ns, nr); - } while (nr > 0); - - return pid; + return idr_get_next(&ns->idr, &nr); } /* @@ -578,7 +452,7 @@ void __init pidhash_init(void) 0, 4096); } -void __init pidmap_init(void) +void __init pid_idr_init(void) { /* Verify no one has done anything silly: */ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); @@ -590,10 +464,7 @@ void __init pidmap_init(void) PIDS_PER_CPU_MIN * num_possible_cpus()); pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); - init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); - /* Reserve PID 0. We never call free_pidmap(0) */ - set_bit(0, init_pid_ns.pidmap[0].page); - atomic_dec(&init_pid_ns.pidmap[0].nr_free); + idr_init(&init_pid_ns.idr); init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 4918314893bc..ca7c8a8823b1 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -21,6 +21,7 @@ #include #include #include +#include struct pid_cache { int nr_ids; @@ -98,7 +99,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; struct ucounts *ucounts; - int i; int err; err = -EINVAL; @@ -117,17 +117,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns == NULL) goto out_dec; - ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (!ns->pidmap[0].page) - goto out_free; + idr_init(&ns->idr); ns->pid_cachep = create_pid_cachep(level + 1); if (ns->pid_cachep == NULL) - goto out_free_map; + goto out_free_idr; err = ns_alloc_inum(&ns->ns); if (err) - goto out_free_map; + goto out_free_idr; ns->ns.ops = &pidns_operations; kref_init(&ns->kref); @@ -138,17 +136,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->nr_hashed = PIDNS_HASH_ADDING; INIT_WORK(&ns->proc_work, proc_cleanup_work); - set_bit(0, ns->pidmap[0].page); - atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); - - for (i = 1; i < PIDMAP_ENTRIES; i++) - atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); - return ns; -out_free_map: - kfree(ns->pidmap[0].page); -out_free: +out_free_idr: + idr_destroy(&ns->idr); kmem_cache_free(pid_ns_cachep, ns); out_dec: dec_pid_namespaces(ucounts); @@ -168,11 +159,9 @@ static void delayed_free_pidns(struct rcu_head *p) static void destroy_pid_namespace(struct pid_namespace *ns) { - int i; - 
ns_free_inum(&ns->ns); - for (i = 0; i < PIDMAP_ENTRIES; i++) - kfree(ns->pidmap[i].page); + + idr_destroy(&ns->idr); call_rcu(&ns->rcu, delayed_free_pidns); } @@ -213,6 +202,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) int rc; struct task_struct *task, *me = current; int init_pids = thread_group_leader(me) ? 1 : 2; + struct pid *pid; /* Don't allow any more processes into the pid namespace */ disable_pid_allocation(pid_ns); @@ -239,20 +229,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * maintain a tasklist for each pid namespace. * */ + rcu_read_lock(); read_lock(&tasklist_lock); - nr = next_pidmap(pid_ns, 1); - while (nr > 0) { - rcu_read_lock(); - - task = pid_task(find_vpid(nr), PIDTYPE_PID); + nr = 2; + idr_for_each_entry_continue(&pid_ns->idr, pid, nr) { + task = pid_task(pid, PIDTYPE_PID); if (task && !__fatal_signal_pending(task)) send_sig_info(SIGKILL, SEND_SIG_FORCED, task); - - rcu_read_unlock(); - - nr = next_pidmap(pid_ns, nr); } read_unlock(&tasklist_lock); + rcu_read_unlock(); /* * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. @@ -301,6 +287,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, { struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; + int ret, next; if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; @@ -311,8 +298,14 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, * it should synchronize its usage with external means. */ - tmp.data = &pid_ns->last_pid; - return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + next = idr_get_cursor(&pid_ns->idr) - 1; + + tmp.data = &next; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (!ret && write) + idr_set_cursor(&pid_ns->idr, next + 1); + + return ret; } extern int pid_max; From e8cfbc245e24887e3c30235f71e9e9405e0cfc39 Mon Sep 17 00:00:00 2001 From: Gargi Sharma Date: Fri, 17 Nov 2017 15:30:34 -0800 Subject: [PATCH 74/94] pid: remove pidhash pidhash is no longer required as all the information can be looked up from idr tree. nr_hashed represented the number of pids that had been hashed. Since, nr_hashed and PIDNS_HASH_ADDING are no longer relevant, it has been renamed to pid_allocated and PIDNS_ADDING respectively. [gs051095@gmail.com: v6] Link: http://lkml.kernel.org/r/1507760379-21662-3-git-send-email-gs051095@gmail.com Link: http://lkml.kernel.org/r/1507583624-22146-3-git-send-email-gs051095@gmail.com Signed-off-by: Gargi Sharma Reviewed-by: Rik van Riel Tested-by: Tony Luck [ia64] Cc: Julia Lawall Cc: Ingo Molnar Cc: Pavel Tatashin Cc: Kirill Tkhai Cc: Oleg Nesterov Cc: Eric W. 
Biederman Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/asm-offsets.c | 4 +-- include/linux/init_task.h | 1 - include/linux/pid.h | 2 -- include/linux/pid_namespace.h | 4 +-- init/main.c | 1 - kernel/fork.c | 2 +- kernel/pid.c | 48 +++++++--------------------------- kernel/pid_namespace.c | 6 ++--- 8 files changed, 18 insertions(+), 50 deletions(-) diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c index f7693f49c573..f4db2168d1b8 100644 --- a/arch/ia64/kernel/asm-offsets.c +++ b/arch/ia64/kernel/asm-offsets.c @@ -31,8 +31,8 @@ void foo(void) DEFINE(SIGFRAME_SIZE, sizeof (struct sigframe)); DEFINE(UNW_FRAME_INFO_SIZE, sizeof (struct unw_frame_info)); - BUILD_BUG_ON(sizeof(struct upid) != 32); - DEFINE(IA64_UPID_SHIFT, 5); + BUILD_BUG_ON(sizeof(struct upid) != 16); + DEFINE(IA64_UPID_SHIFT, 4); BLANK(); diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 8062e6cc607c..6a532629c983 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -105,7 +105,6 @@ extern struct group_info init_groups; .numbers = { { \ .nr = 0, \ .ns = &init_pid_ns, \ - .pid_chain = { .next = NULL, .pprev = NULL }, \ }, } \ } diff --git a/include/linux/pid.h b/include/linux/pid.h index dfd684ce0787..7633d55d9a24 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -51,10 +51,8 @@ enum pid_type */ struct upid { - /* Try to keep pid_chain in the same cacheline as nr for find_vpid */ int nr; struct pid_namespace *ns; - struct hlist_node pid_chain; }; struct pid diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 92c6aa509d2e..49538b172483 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -25,7 +25,7 @@ struct pid_namespace { struct kref kref; struct idr idr; struct rcu_head rcu; - unsigned int nr_hashed; + unsigned int pid_allocated; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; unsigned int level; @@ -49,7 +49,7 @@ struct pid_namespace { extern struct pid_namespace init_pid_ns; -#define PIDNS_HASH_ADDING (1U << 31) +#define PIDNS_ADDING (1U << 31) #ifdef CONFIG_PID_NS static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) diff --git a/init/main.c b/init/main.c index d0cbcfc06124..dfec3809e740 100644 --- a/init/main.c +++ b/init/main.c @@ -562,7 +562,6 @@ asmlinkage __visible void __init start_kernel(void) * kmem_cache_init() */ setup_log_buf(0); - pidhash_init(); vfs_caches_init_early(); sort_main_extable(); trap_init(); diff --git a/kernel/fork.c b/kernel/fork.c index 4e55eedba8d6..432eadf6b58c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1871,7 +1871,7 @@ static __latent_entropy struct task_struct *copy_process( retval = -ERESTARTNOINTR; goto bad_fork_cancel_cgroup; } - if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) { + if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { retval = -ENOMEM; goto bad_fork_cancel_cgroup; } diff --git a/kernel/pid.c b/kernel/pid.c index 0ce59369632f..b13b624e2c49 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -41,10 +41,6 @@ #include #include -#define pid_hashfn(nr, ns) \ - hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) -static struct hlist_head *pid_hash; -static unsigned int pidhash_shift = 4; struct pid init_struct_pid = INIT_STRUCT_PID; int pid_max = PID_MAX_DEFAULT; @@ -54,7 +50,6 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; - /* * PID-map pages start 
out as NULL, they get allocated upon * first use and are never deallocated. This way a low pid_max @@ -64,7 +59,7 @@ int pid_max_max = PID_MAX_LIMIT; struct pid_namespace init_pid_ns = { .kref = KREF_INIT(2), .idr = IDR_INIT, - .nr_hashed = PIDNS_HASH_ADDING, + .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, @@ -123,8 +118,7 @@ void free_pid(struct pid *pid) for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; struct pid_namespace *ns = upid->ns; - hlist_del_rcu(&upid->pid_chain); - switch (--ns->nr_hashed) { + switch (--ns->pid_allocated) { case 2: case 1: /* When all that is left in the pid namespace @@ -133,10 +127,10 @@ void free_pid(struct pid *pid) */ wake_up_process(ns->child_reaper); break; - case PIDNS_HASH_ADDING: + case PIDNS_ADDING: /* Handle a fork failure of the first process */ WARN_ON(ns->child_reaper); - ns->nr_hashed = 0; + ns->pid_allocated = 0; /* fall through */ case 0: schedule_work(&ns->proc_work); @@ -212,14 +206,12 @@ struct pid *alloc_pid(struct pid_namespace *ns) upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); - if (!(ns->nr_hashed & PIDNS_HASH_ADDING)) + if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; for ( ; upid >= pid->numbers; --upid) { - hlist_add_head_rcu(&upid->pid_chain, - &pid_hash[pid_hashfn(upid->nr, upid->ns)]); /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); - upid->ns->nr_hashed++; + upid->ns->pid_allocated++; } spin_unlock_irq(&pidmap_lock); @@ -243,21 +235,13 @@ struct pid *alloc_pid(struct pid_namespace *ns) void disable_pid_allocation(struct pid_namespace *ns) { spin_lock_irq(&pidmap_lock); - ns->nr_hashed &= ~PIDNS_HASH_ADDING; + ns->pid_allocated &= ~PIDNS_ADDING; spin_unlock_irq(&pidmap_lock); } struct pid *find_pid_ns(int nr, struct pid_namespace *ns) { - struct upid *pnr; - - hlist_for_each_entry_rcu(pnr, - &pid_hash[pid_hashfn(nr, ns)], pid_chain) - if (pnr->nr == nr && pnr->ns == ns) - return container_of(pnr, struct pid, - numbers[ns->level]); - - return NULL; + return idr_find(&ns->idr, nr); } EXPORT_SYMBOL_GPL(find_pid_ns); @@ -413,6 +397,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (type != PIDTYPE_PID) { if (type == __PIDTYPE_TGID) type = PIDTYPE_PID; + task = task->group_leader; } nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); @@ -439,23 +424,10 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) return idr_get_next(&ns->idr, &nr); } -/* - * The pid hash table is scaled according to the amount of memory in the - * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or - * more. 
- */ -void __init pidhash_init(void) -{ - pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, - HASH_EARLY | HASH_SMALL | HASH_ZERO, - &pidhash_shift, NULL, - 0, 4096); -} - void __init pid_idr_init(void) { /* Verify no one has done anything silly: */ - BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); + BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING); /* bump default and minimum pid_max based on number of cpus */ pid_max = min(pid_max_max, max_t(int, pid_max, diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ca7c8a8823b1..0b53eef7d34b 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -133,7 +133,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; - ns->nr_hashed = PIDNS_HASH_ADDING; + ns->pid_allocated = PIDNS_ADDING; INIT_WORK(&ns->proc_work, proc_cleanup_work); return ns; @@ -254,7 +254,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * sys_wait4() above can't reap the EXIT_DEAD children but we do not * really care, we could reparent them to the global init. We could * exit and reap ->child_reaper even if it is not the last thread in - * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(), + * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(), * pid_ns can not go away until proc_kill_sb() drops the reference. * * But this ns can also have other tasks injected by setns()+fork(). @@ -268,7 +268,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) */ for (;;) { set_current_state(TASK_INTERRUPTIBLE); - if (pid_ns->nr_hashed == init_pids) + if (pid_ns->pid_allocated == init_pids) break; schedule(); } From 4efb442cc12eb66535b7c7ed06005fd7889c1d77 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 17 Nov 2017 15:30:38 -0800 Subject: [PATCH 75/94] kernel/panic.c: add TAINT_AUX This is the gist of a patch which we've been forward-porting in our kernels for a long time now and it probably would make a good sense to have such TAINT_AUX flag upstream which can be used by each distro etc, how they see fit. This way, we won't need to forward-port a distro-only version indefinitely. Add an auxiliary taint flag to be used by distros and others. This obviates the need to forward-port whatever internal solutions people have in favor of a single flag which they can map arbitrarily to a definition of their pleasing. The "X" mnemonic could also mean eXternal, which would be taint from a distro or something else but not the upstream kernel. We will use it to mark modules for which we don't provide support. I.e., a really eXternal module. 
Link: http://lkml.kernel.org/r/20170911134533.dp5mtyku5bongx4c@pd.tnic Signed-off-by: Borislav Petkov Cc: Kees Cook Cc: Jessica Yu Cc: Peter Zijlstra Cc: Jiri Slaby Cc: Jiri Olsa Cc: Michal Marek Cc: Jiri Kosina Cc: Takashi Iwai Cc: Petr Mladek Cc: Jeff Mahoney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 3 ++- kernel/panic.c | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 4b484ab9e163..ce51455e2adf 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -549,7 +549,8 @@ extern enum system_states { #define TAINT_UNSIGNED_MODULE 13 #define TAINT_SOFTLOCKUP 14 #define TAINT_LIVEPATCH 15 -#define TAINT_FLAGS_COUNT 16 +#define TAINT_AUX 16 +#define TAINT_FLAGS_COUNT 17 struct taint_flag { char c_true; /* character printed when tainted */ diff --git a/kernel/panic.c b/kernel/panic.c index 3242b64b1956..2cfef408fec9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -324,6 +324,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { { 'E', ' ', true }, /* TAINT_UNSIGNED_MODULE */ { 'L', ' ', false }, /* TAINT_SOFTLOCKUP */ { 'K', ' ', true }, /* TAINT_LIVEPATCH */ + { 'X', ' ', true }, /* TAINT_AUX */ }; /** @@ -345,6 +346,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { * 'E' - Unsigned module has been loaded. * 'L' - A soft lockup has previously occurred. * 'K' - Kernel has been live patched. + * 'X' - Auxiliary taint, for distros' use. * * The string is overwritten by the next call to print_tainted(). */ From fcf4edac049a8bca41658970292e2dfdbc9d5f62 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 17 Nov 2017 15:30:42 -0800 Subject: [PATCH 76/94] kcov: remove pointless current != NULL check __sanitizer_cov_trace_pc() is hot code, so it's worth removing the pointless '!current' check: current is never NULL. Link: http://lkml.kernel.org/r/20170929162221.32500-1-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Acked-by: Dmitry Vyukov Acked-by: Mark Rutland Cc: Andrey Konovalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kcov.c b/kernel/kcov.c index fc6af9e1308b..d9f9fa9cacc6 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -62,7 +62,7 @@ void notrace __sanitizer_cov_trace_pc(void) * We are interested in code coverage as a function of a syscall inputs, * so we ignore code executed in interrupts. */ - if (!t || !in_task()) + if (!in_task()) return; mode = READ_ONCE(t->kcov_mode); if (mode == KCOV_MODE_TRACE) { From ded97d2c2b2c5f1dcced0bc57133f7753b037dfc Mon Sep 17 00:00:00 2001 From: Victor Chibotaru Date: Fri, 17 Nov 2017 15:30:46 -0800 Subject: [PATCH 77/94] kcov: support comparison operands collection Enables kcov to collect comparison operands from instrumented code. This is done by using Clang's -fsanitize-coverage=trace-cmp instrumentation (currently not available for GCC). The comparison operands help a lot in fuzz testing. E.g. they are used in Syzkaller to cover the interiors of conditional statements with far fewer attempts and thus make previously unreachable code reachable. To allow separate collection of coverage and comparison operands, two different work modes are implemented. Mode selection is now done via a KCOV_ENABLE ioctl call with the corresponding argument value.
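Concretely, the mode is selected by the argument passed when enabling collection, using the KCOV_TRACE_* values introduced below (complete userspace examples appear in the documentation patch further on):

	ioctl(fd, KCOV_ENABLE, KCOV_TRACE_PC);	/* classic PC coverage */
	ioctl(fd, KCOV_ENABLE, KCOV_TRACE_CMP);	/* comparison operands */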
Link: http://lkml.kernel.org/r/20171011095459.70721-1-glider@google.com Signed-off-by: Victor Chibotaru Signed-off-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Konovalov Cc: Mark Rutland Cc: Alexander Popov Cc: Andrey Ryabinin Cc: Kees Cook Cc: Vegard Nossum Cc: Quentin Casasnovas Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kcov.h | 12 ++- include/uapi/linux/kcov.h | 24 +++++ kernel/kcov.c | 218 +++++++++++++++++++++++++++++++------- 3 files changed, 213 insertions(+), 41 deletions(-) diff --git a/include/linux/kcov.h b/include/linux/kcov.h index f5d8ce4f4f86..3ecf6f5e3a5f 100644 --- a/include/linux/kcov.h +++ b/include/linux/kcov.h @@ -8,19 +8,23 @@ struct task_struct; #ifdef CONFIG_KCOV -void kcov_task_init(struct task_struct *t); -void kcov_task_exit(struct task_struct *t); - enum kcov_mode { /* Coverage collection is not enabled yet. */ KCOV_MODE_DISABLED = 0, + /* KCOV was initialized, but tracing mode hasn't been chosen yet. */ + KCOV_MODE_INIT = 1, /* * Tracing coverage collection mode. * Covered PCs are collected in a per-task buffer. */ - KCOV_MODE_TRACE = 1, + KCOV_MODE_TRACE_PC = 2, + /* Collecting comparison operands mode. */ + KCOV_MODE_TRACE_CMP = 3, }; +void kcov_task_init(struct task_struct *t); +void kcov_task_exit(struct task_struct *t); + #else static inline void kcov_task_init(struct task_struct *t) {} diff --git a/include/uapi/linux/kcov.h b/include/uapi/linux/kcov.h index 33eabbb8ada1..9529867717a8 100644 --- a/include/uapi/linux/kcov.h +++ b/include/uapi/linux/kcov.h @@ -8,4 +8,28 @@ #define KCOV_ENABLE _IO('c', 100) #define KCOV_DISABLE _IO('c', 101) +enum { + /* + * Tracing coverage collection mode. + * Covered PCs are collected in a per-task buffer. + * In new KCOV version the mode is chosen by calling + * ioctl(fd, KCOV_ENABLE, mode). In older versions the mode argument + * was supposed to be 0 in such a call. So, for reasons of backward + * compatibility, we have chosen the value KCOV_TRACE_PC to be 0. + */ + KCOV_TRACE_PC = 0, + /* Collecting comparison operands mode. */ + KCOV_TRACE_CMP = 1, +}; + +/* + * The format for the types of collected comparisons. + * + * Bit 0 shows whether one of the arguments is a compile-time constant. + * Bits 1 & 2 contain log2 of the argument size, up to 8 bytes. + */ +#define KCOV_CMP_CONST (1 << 0) +#define KCOV_CMP_SIZE(n) ((n) << 1) +#define KCOV_CMP_MASK KCOV_CMP_SIZE(3) + #endif /* _LINUX_KCOV_IOCTLS_H */ diff --git a/kernel/kcov.c b/kernel/kcov.c index d9f9fa9cacc6..15f33faf4013 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -22,13 +22,21 @@ #include #include +/* Number of 64-bit words written per one comparison: */ +#define KCOV_WORDS_PER_CMP 4 + /* * kcov descriptor (one per opened debugfs file). * State transitions of the descriptor: * - initial state after open() * - then there must be a single ioctl(KCOV_INIT_TRACE) call * - then, mmap() call (several calls are allowed but not useful) - * - then, repeated enable/disable for a task (only one task a time allowed) + * - then, ioctl(KCOV_ENABLE, arg), where arg is + * KCOV_TRACE_PC - to trace only the PCs + * or + * KCOV_TRACE_CMP - to trace only the comparison operands + * - then, ioctl(KCOV_DISABLE) to disable the task. + * Enabling/disabling ioctls can be repeated (only one task a time allowed). 
*/ struct kcov { /* @@ -48,6 +56,36 @@ struct kcov { struct task_struct *t; }; +static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) +{ + enum kcov_mode mode; + + /* + * We are interested in code coverage as a function of a syscall inputs, + * so we ignore code executed in interrupts. + */ + if (!in_task()) + return false; + mode = READ_ONCE(t->kcov_mode); + /* + * There is some code that runs in interrupts but for which + * in_interrupt() returns false (e.g. preempt_schedule_irq()). + * READ_ONCE()/barrier() effectively provides load-acquire wrt + * interrupts, there are paired barrier()/WRITE_ONCE() in + * kcov_ioctl_locked(). + */ + barrier(); + return mode == needed_mode; +} + +static unsigned long canonicalize_ip(unsigned long ip) +{ +#ifdef CONFIG_RANDOMIZE_BASE + ip -= kaslr_offset(); +#endif + return ip; +} + /* * Entry point from instrumented code. * This is called once per basic-block/edge. @@ -55,44 +93,139 @@ struct kcov { void notrace __sanitizer_cov_trace_pc(void) { struct task_struct *t; - enum kcov_mode mode; + unsigned long *area; + unsigned long ip = canonicalize_ip(_RET_IP_); + unsigned long pos; t = current; - /* - * We are interested in code coverage as a function of a syscall inputs, - * so we ignore code executed in interrupts. - */ - if (!in_task()) + if (!check_kcov_mode(KCOV_MODE_TRACE_PC, t)) return; - mode = READ_ONCE(t->kcov_mode); - if (mode == KCOV_MODE_TRACE) { - unsigned long *area; - unsigned long pos; - unsigned long ip = _RET_IP_; -#ifdef CONFIG_RANDOMIZE_BASE - ip -= kaslr_offset(); -#endif - - /* - * There is some code that runs in interrupts but for which - * in_interrupt() returns false (e.g. preempt_schedule_irq()). - * READ_ONCE()/barrier() effectively provides load-acquire wrt - * interrupts, there are paired barrier()/WRITE_ONCE() in - * kcov_ioctl_locked(). - */ - barrier(); - area = t->kcov_area; - /* The first word is number of subsequent PCs. */ - pos = READ_ONCE(area[0]) + 1; - if (likely(pos < t->kcov_size)) { - area[pos] = ip; - WRITE_ONCE(area[0], pos); - } + area = t->kcov_area; + /* The first 64-bit word is the number of subsequent PCs. */ + pos = READ_ONCE(area[0]) + 1; + if (likely(pos < t->kcov_size)) { + area[pos] = ip; + WRITE_ONCE(area[0], pos); } } EXPORT_SYMBOL(__sanitizer_cov_trace_pc); +#ifdef CONFIG_KCOV_ENABLE_COMPARISONS +static void write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip) +{ + struct task_struct *t; + u64 *area; + u64 count, start_index, end_pos, max_pos; + + t = current; + if (!check_kcov_mode(KCOV_MODE_TRACE_CMP, t)) + return; + + ip = canonicalize_ip(ip); + + /* + * We write all comparison arguments and types as u64. + * The buffer was allocated for t->kcov_size unsigned longs. + */ + area = (u64 *)t->kcov_area; + max_pos = t->kcov_size * sizeof(unsigned long); + + count = READ_ONCE(area[0]); + + /* Every record is KCOV_WORDS_PER_CMP 64-bit words. 
*/ + start_index = 1 + count * KCOV_WORDS_PER_CMP; + end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64); + if (likely(end_pos <= max_pos)) { + area[start_index] = type; + area[start_index + 1] = arg1; + area[start_index + 2] = arg2; + area[start_index + 3] = ip; + WRITE_ONCE(area[0], count + 1); + } +} + +void notrace __sanitizer_cov_trace_cmp1(u8 arg1, u8 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(0), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp1); + +void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(1), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2); + +void notrace __sanitizer_cov_trace_cmp4(u16 arg1, u16 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp4); + +void notrace __sanitizer_cov_trace_cmp8(u64 arg1, u64 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(3), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp8); + +void notrace __sanitizer_cov_trace_const_cmp1(u8 arg1, u8 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(0) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp1); + +void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(1) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2); + +void notrace __sanitizer_cov_trace_const_cmp4(u16 arg1, u16 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp4); + +void notrace __sanitizer_cov_trace_const_cmp8(u64 arg1, u64 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(3) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp8); + +void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases) +{ + u64 i; + u64 count = cases[0]; + u64 size = cases[1]; + u64 type = KCOV_CMP_CONST; + + switch (size) { + case 8: + type |= KCOV_CMP_SIZE(0); + break; + case 16: + type |= KCOV_CMP_SIZE(1); + break; + case 32: + type |= KCOV_CMP_SIZE(2); + break; + case 64: + type |= KCOV_CMP_SIZE(3); + break; + default: + return; + } + for (i = 0; i < count; i++) + write_comp_data(type, cases[i + 2], val, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_switch); +#endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */ + static void kcov_get(struct kcov *kcov) { atomic_inc(&kcov->refcount); @@ -129,6 +262,7 @@ void kcov_task_exit(struct task_struct *t) /* Just to not leave dangling references behind. 
*/ kcov_task_init(t); kcov->t = NULL; + kcov->mode = KCOV_MODE_INIT; spin_unlock(&kcov->lock); kcov_put(kcov); } @@ -147,7 +281,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) spin_lock(&kcov->lock); size = kcov->size * sizeof(unsigned long); - if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 || + if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != size) { res = -EINVAL; goto exit; @@ -176,6 +310,7 @@ static int kcov_open(struct inode *inode, struct file *filep) kcov = kzalloc(sizeof(*kcov), GFP_KERNEL); if (!kcov) return -ENOMEM; + kcov->mode = KCOV_MODE_DISABLED; atomic_set(&kcov->refcount, 1); spin_lock_init(&kcov->lock); filep->private_data = kcov; @@ -211,7 +346,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, if (size < 2 || size > INT_MAX / sizeof(unsigned long)) return -EINVAL; kcov->size = size; - kcov->mode = KCOV_MODE_TRACE; + kcov->mode = KCOV_MODE_INIT; return 0; case KCOV_ENABLE: /* @@ -221,17 +356,25 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, * at task exit or voluntary by KCOV_DISABLE. After that it can * be enabled for another task. */ - unused = arg; - if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED || - kcov->area == NULL) + if (kcov->mode != KCOV_MODE_INIT || !kcov->area) return -EINVAL; if (kcov->t != NULL) return -EBUSY; + if (arg == KCOV_TRACE_PC) + kcov->mode = KCOV_MODE_TRACE_PC; + else if (arg == KCOV_TRACE_CMP) +#ifdef CONFIG_KCOV_ENABLE_COMPARISONS + kcov->mode = KCOV_MODE_TRACE_CMP; +#else + return -ENOTSUPP; +#endif + else + return -EINVAL; t = current; /* Cache in task struct for performance. */ t->kcov_size = kcov->size; t->kcov_area = kcov->area; - /* See comment in __sanitizer_cov_trace_pc(). */ + /* See comment in check_kcov_mode(). */ barrier(); WRITE_ONCE(t->kcov_mode, kcov->mode); t->kcov = kcov; @@ -249,6 +392,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, return -EINVAL; kcov_task_init(t); kcov->t = NULL; + kcov->mode = KCOV_MODE_INIT; kcov_put(kcov); return 0; default: From d677a4d6019385488e794cc47bd3d6f9c2aab874 Mon Sep 17 00:00:00 2001 From: Victor Chibotaru Date: Fri, 17 Nov 2017 15:30:50 -0800 Subject: [PATCH 78/94] Makefile: support flag -fsanitizer-coverage=trace-cmp The flag enables Clang instrumentation of comparison operations (currently not supported by GCC). This instrumentation is needed by the new KCOV device to collect comparison operands. Link: http://lkml.kernel.org/r/20171011095459.70721-2-glider@google.com Signed-off-by: Victor Chibotaru Signed-off-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Konovalov Cc: Mark Rutland Cc: Alexander Popov Cc: Andrey Ryabinin Cc: Kees Cook Cc: Vegard Nossum Cc: Quentin Casasnovas Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Makefile | 3 +-- lib/Kconfig.debug | 10 ++++++++++ scripts/Makefile.kcov | 7 +++++++ 3 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 scripts/Makefile.kcov diff --git a/Makefile b/Makefile index 763ab35df12a..ccb7d5b2fbf5 100644 --- a/Makefile +++ b/Makefile @@ -375,8 +375,6 @@ CFLAGS_KERNEL = AFLAGS_KERNEL = LDFLAGS_vmlinux = CFLAGS_GCOV := -fprofile-arcs -ftest-coverage -fno-tree-loop-im $(call cc-disable-warning,maybe-uninitialized,) -CFLAGS_KCOV := $(call cc-option,-fsanitize-coverage=trace-pc,) - # Use USERINCLUDE when you must reference the UAPI directories only. 
USERINCLUDE := \ @@ -659,6 +657,7 @@ ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC) $(KBUILD_CFLA KBUILD_AFLAGS += -DCC_HAVE_ASM_GOTO endif +include scripts/Makefile.kcov include scripts/Makefile.gcc-plugins ifdef CONFIG_READABLE_ASM diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ce813731269f..947d3e2ed5c2 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -756,6 +756,16 @@ config KCOV For more details, see Documentation/dev-tools/kcov.rst. +config KCOV_ENABLE_COMPARISONS + bool "Enable comparison operands collection by KCOV" + depends on KCOV + default n + help + KCOV also exposes operands of every comparison in the instrumented + code along with operand sizes and PCs of the comparison instructions. + These operands can be used by fuzzing engines to improve the quality + of fuzzing coverage. + config KCOV_INSTRUMENT_ALL bool "Instrument all code by default" depends on KCOV diff --git a/scripts/Makefile.kcov b/scripts/Makefile.kcov new file mode 100644 index 000000000000..5cc72037e423 --- /dev/null +++ b/scripts/Makefile.kcov @@ -0,0 +1,7 @@ +ifdef CONFIG_KCOV +CFLAGS_KCOV := $(call cc-option,-fsanitize-coverage=trace-pc,) +ifeq ($(CONFIG_KCOV_ENABLE_COMPARISONS),y) +CFLAGS_KCOV += $(call cc-option,-fsanitize-coverage=trace-cmp,) +endif + +endif From c512ac01d8a841033da8ec538a83f80fb0b4d1fe Mon Sep 17 00:00:00 2001 From: Victor Chibotaru Date: Fri, 17 Nov 2017 15:30:53 -0800 Subject: [PATCH 79/94] kcov: update documentation The updated documentation describes new KCOV mode for collecting comparison operands. Link: http://lkml.kernel.org/r/20171011095459.70721-3-glider@google.com Signed-off-by: Victor Chibotaru Signed-off-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Konovalov Cc: Mark Rutland Cc: Alexander Popov Cc: Andrey Ryabinin Cc: Kees Cook Cc: Vegard Nossum Cc: Quentin Casasnovas Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kcov.rst | 99 ++++++++++++++++++++++++++++++-- 1 file changed, 95 insertions(+), 4 deletions(-) diff --git a/Documentation/dev-tools/kcov.rst b/Documentation/dev-tools/kcov.rst index 44886c91e112..c2f6452e38ed 100644 --- a/Documentation/dev-tools/kcov.rst +++ b/Documentation/dev-tools/kcov.rst @@ -12,19 +12,30 @@ To achieve this goal it does not collect coverage in soft/hard interrupts and instrumentation of some inherently non-deterministic parts of kernel is disabled (e.g. scheduler, locking). -Usage ------ +kcov is also able to collect comparison operands from the instrumented code +(this feature currently requires that the kernel is compiled with clang). + +Prerequisites +------------- Configure the kernel with:: CONFIG_KCOV=y CONFIG_KCOV requires gcc built on revision 231296 or later. + +If the comparison operands need to be collected, set:: + + CONFIG_KCOV_ENABLE_COMPARISONS=y + Profiling data will only become accessible once debugfs has been mounted:: mount -t debugfs none /sys/kernel/debug -The following program demonstrates kcov usage from within a test program: +Coverage collection +------------------- +The following program demonstrates coverage collection from within a test +program using kcov: .. 
code-block:: c @@ -44,6 +55,9 @@ The following program demonstrates kcov usage from within a test program: #define KCOV_DISABLE _IO('c', 101) #define COVER_SIZE (64<<10) + #define KCOV_TRACE_PC 0 + #define KCOV_TRACE_CMP 1 + int main(int argc, char **argv) { int fd; @@ -64,7 +78,7 @@ The following program demonstrates kcov usage from within a test program: if ((void*)cover == MAP_FAILED) perror("mmap"), exit(1); /* Enable coverage collection on the current thread. */ - if (ioctl(fd, KCOV_ENABLE, 0)) + if (ioctl(fd, KCOV_ENABLE, KCOV_TRACE_PC)) perror("ioctl"), exit(1); /* Reset coverage from the tail of the ioctl() call. */ __atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED); @@ -111,3 +125,80 @@ The interface is fine-grained to allow efficient forking of test processes. That is, a parent process opens /sys/kernel/debug/kcov, enables trace mode, mmaps coverage buffer and then forks child processes in a loop. Child processes only need to enable coverage (disable happens automatically on thread end). + +Comparison operands collection +------------------------------ +Comparison operands collection is similar to coverage collection: + +.. code-block:: c + + /* Same includes and defines as above. */ + + /* Number of 64-bit words per record. */ + #define KCOV_WORDS_PER_CMP 4 + + /* + * The format for the types of collected comparisons. + * + * Bit 0 shows whether one of the arguments is a compile-time constant. + * Bits 1 & 2 contain log2 of the argument size, up to 8 bytes. + */ + + #define KCOV_CMP_CONST (1 << 0) + #define KCOV_CMP_SIZE(n) ((n) << 1) + #define KCOV_CMP_MASK KCOV_CMP_SIZE(3) + + int main(int argc, char **argv) + { + int fd; + uint64_t *cover, type, arg1, arg2, is_const, size; + unsigned long n, i; + + fd = open("/sys/kernel/debug/kcov", O_RDWR); + if (fd == -1) + perror("open"), exit(1); + if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE)) + perror("ioctl"), exit(1); + /* + * Note that the buffer pointer is of type uint64_t*, because all + * the comparison operands are promoted to uint64_t. + */ + cover = (uint64_t *)mmap(NULL, COVER_SIZE * sizeof(unsigned long), + PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if ((void*)cover == MAP_FAILED) + perror("mmap"), exit(1); + /* Note KCOV_TRACE_CMP instead of KCOV_TRACE_PC. */ + if (ioctl(fd, KCOV_ENABLE, KCOV_TRACE_CMP)) + perror("ioctl"), exit(1); + __atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED); + read(-1, NULL, 0); + /* Read number of comparisons collected. */ + n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED); + for (i = 0; i < n; i++) { + type = cover[i * KCOV_WORDS_PER_CMP + 1]; + /* arg1 and arg2 - operands of the comparison. */ + arg1 = cover[i * KCOV_WORDS_PER_CMP + 2]; + arg2 = cover[i * KCOV_WORDS_PER_CMP + 3]; + /* ip - caller address. */ + ip = cover[i * KCOV_WORDS_PER_CMP + 4]; + /* size of the operands. */ + size = 1 << ((type & KCOV_CMP_MASK) >> 1); + /* is_const - true if either operand is a compile-time constant.*/ + is_const = type & KCOV_CMP_CONST; + printf("ip: 0x%lx type: 0x%lx, arg1: 0x%lx, arg2: 0x%lx, " + "size: %lu, %s\n", + ip, type, arg1, arg2, size, + is_const ? "const" : "non-const"); + } + if (ioctl(fd, KCOV_DISABLE, 0)) + perror("ioctl"), exit(1); + /* Free resources. */ + if (munmap(cover, COVER_SIZE * sizeof(unsigned long))) + perror("munmap"), exit(1); + if (close(fd)) + perror("close"), exit(1); + return 0; + } + +Note that the kcov modes (coverage collection or comparison operands) are +mutually exclusive. 
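Because the modes are mutually exclusive, switching an already-enabled task from PC collection to comparison collection has to go through a disable/enable cycle, roughly as follows (a sketch inferred from the kcov_ioctl_locked() logic above, not an example taken from the documentation itself):

	ioctl(fd, KCOV_DISABLE, 0);		/* mode falls back to INIT */
	__atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED);
	ioctl(fd, KCOV_ENABLE, KCOV_TRACE_CMP);	/* re-enable in the new mode */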
From 2d8364bae4db144df75ba85e92d2b8619ba8eedc Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Fri, 17 Nov 2017 15:30:57 -0800 Subject: [PATCH 80/94] kernel/reboot.c: add devm_register_reboot_notifier() Add devm_* wrapper around register_reboot_notifier to simplify device specific reboot notifier registration/unregistration. [akpm@linux-foundation.org: move `struct device' forward decl to top-of-file] Link: http://lkml.kernel.org/r/20170320171753.1705-1-andrew.smirnov@gmail.com Signed-off-by: Andrey Smirnov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/reboot.h | 4 ++++ kernel/reboot.c | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/include/linux/reboot.h b/include/linux/reboot.h index d03da0eb95ca..e63799a6e895 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -6,6 +6,8 @@ #include #include +struct device; + #define SYS_DOWN 0x0001 /* Notify of system down */ #define SYS_RESTART SYS_DOWN #define SYS_HALT 0x0002 /* Notify of system halt */ @@ -39,6 +41,8 @@ extern int reboot_force; extern int register_reboot_notifier(struct notifier_block *); extern int unregister_reboot_notifier(struct notifier_block *); +extern int devm_register_reboot_notifier(struct device *, struct notifier_block *); + extern int register_restart_handler(struct notifier_block *); extern int unregister_restart_handler(struct notifier_block *); extern void do_kernel_restart(char *cmd); diff --git a/kernel/reboot.c b/kernel/reboot.c index bd30a973fe94..e4ced883d8de 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -104,6 +104,33 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); +static void devm_unregister_reboot_notifier(struct device *dev, void *res) +{ + WARN_ON(unregister_reboot_notifier(*(struct notifier_block **)res)); +} + +int devm_register_reboot_notifier(struct device *dev, struct notifier_block *nb) +{ + struct notifier_block **rcnb; + int ret; + + rcnb = devres_alloc(devm_unregister_reboot_notifier, + sizeof(*rcnb), GFP_KERNEL); + if (!rcnb) + return -ENOMEM; + + ret = register_reboot_notifier(nb); + if (!ret) { + *rcnb = nb; + devres_add(dev, rcnb); + } else { + devres_free(rcnb); + } + + return ret; +} +EXPORT_SYMBOL(devm_register_reboot_notifier); + /* * Notifier list for kernel code which wants to be called * to restart the system. From 44ea39420fc95e7432ddc91de4eb58c7470ab897 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Fri, 17 Nov 2017 15:31:01 -0800 Subject: [PATCH 81/94] drivers/watchdog: make use of devm_register_reboot_notifier() Save a bit of cleanup code by leveraging newly added devm_register_reboot_notifier(). 
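For a typical consumer, the devres-managed variant reduces the boilerplate to roughly this (an illustrative sketch, not the watchdog code below; my_reboot_handler and my_probe are hypothetical):

	static struct notifier_block my_nb = {
		.notifier_call = my_reboot_handler,	/* hypothetical handler */
	};

	static int my_probe(struct device *dev)
	{
		/* devres tears the notifier down automatically when
		 * 'dev' is unbound, so remove() needs no unregister. */
		return devm_register_reboot_notifier(dev, &my_nb);
	}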
[akpm@linux-foundation.org: small cleanup: avoid 80-col tricks] Link: http://lkml.kernel.org/r/20170411160615.9784-1-andrew.smirnov@gmail.com Signed-off-by: Andrey Smirnov Acked-by: Guenter Roeck Cc: Chris Healy Cc: Wim Van Sebroeck Cc: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/watchdog/watchdog_core.c | 35 -------------------------------- drivers/watchdog/watchdog_dev.c | 32 +++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 35 deletions(-) diff --git a/drivers/watchdog/watchdog_core.c b/drivers/watchdog/watchdog_core.c index 74265b2f806c..8a8d952f8df9 100644 --- a/drivers/watchdog/watchdog_core.c +++ b/drivers/watchdog/watchdog_core.c @@ -137,25 +137,6 @@ int watchdog_init_timeout(struct watchdog_device *wdd, } EXPORT_SYMBOL_GPL(watchdog_init_timeout); -static int watchdog_reboot_notifier(struct notifier_block *nb, - unsigned long code, void *data) -{ - struct watchdog_device *wdd = container_of(nb, struct watchdog_device, - reboot_nb); - - if (code == SYS_DOWN || code == SYS_HALT) { - if (watchdog_active(wdd)) { - int ret; - - ret = wdd->ops->stop(wdd); - if (ret) - return NOTIFY_BAD; - } - } - - return NOTIFY_DONE; -} - static int watchdog_restart_notifier(struct notifier_block *nb, unsigned long action, void *data) { @@ -244,19 +225,6 @@ static int __watchdog_register_device(struct watchdog_device *wdd) } } - if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status)) { - wdd->reboot_nb.notifier_call = watchdog_reboot_notifier; - - ret = register_reboot_notifier(&wdd->reboot_nb); - if (ret) { - pr_err("watchdog%d: Cannot register reboot notifier (%d)\n", - wdd->id, ret); - watchdog_dev_unregister(wdd); - ida_simple_remove(&watchdog_ida, wdd->id); - return ret; - } - } - if (wdd->ops->restart) { wdd->restart_nb.notifier_call = watchdog_restart_notifier; @@ -302,9 +270,6 @@ static void __watchdog_unregister_device(struct watchdog_device *wdd) if (wdd->ops->restart) unregister_restart_handler(&wdd->restart_nb); - if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status)) - unregister_reboot_notifier(&wdd->reboot_nb); - watchdog_dev_unregister(wdd); ida_simple_remove(&watchdog_ida, wdd->id); } diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c index 0826e663bd5a..1e971a50d7fb 100644 --- a/drivers/watchdog/watchdog_dev.c +++ b/drivers/watchdog/watchdog_dev.c @@ -42,6 +42,7 @@ #include /* For handling misc devices */ #include /* For module stuff/... 
*/ #include /* For mutexes */ +#include /* For reboot notifier */ #include /* For memory functions */ #include /* For standard types (like size_t) */ #include /* For watchdog specific items */ @@ -1016,6 +1017,25 @@ static struct class watchdog_class = { .dev_groups = wdt_groups, }; +static int watchdog_reboot_notifier(struct notifier_block *nb, + unsigned long code, void *data) +{ + struct watchdog_device *wdd; + + wdd = container_of(nb, struct watchdog_device, reboot_nb); + if (code == SYS_DOWN || code == SYS_HALT) { + if (watchdog_active(wdd)) { + int ret; + + ret = wdd->ops->stop(wdd); + if (ret) + return NOTIFY_BAD; + } + } + + return NOTIFY_DONE; +} + /* * watchdog_dev_register: register a watchdog device * @wdd: watchdog device @@ -1049,6 +1069,18 @@ int watchdog_dev_register(struct watchdog_device *wdd) if (ret) { device_destroy(&watchdog_class, devno); watchdog_cdev_unregister(wdd); + return ret; + } + + if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status)) { + wdd->reboot_nb.notifier_call = watchdog_reboot_notifier; + + ret = devm_register_reboot_notifier(dev, &wdd->reboot_nb); + if (ret) { + pr_err("watchdog%d: Cannot register reboot notifier (%d)\n", + wdd->id, ret); + watchdog_dev_unregister(wdd); + } } return ret; From e35c4c64fe492b212f9c7d9e046626e48e89f863 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Nov 2017 15:31:04 -0800 Subject: [PATCH 82/94] initramfs: use time64_t timestamps The cpio format uses a 32-bit number to encode file timestamps, which breaks initramfs support in 2038. This reinterprets the timestamp as unsigned, to give us another 68 years and avoids breaking until 2106. Link: http://lkml.kernel.org/r/20171019095536.801199-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Cc: Al Viro Cc: Deepa Dinamani Cc: Arnd Bergmann Cc: Daniel Thompson Cc: Lokesh Vutla Cc: Stafford Horne Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/initramfs.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/init/initramfs.c b/init/initramfs.c index 7046feffef6b..7e99a0038942 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -109,7 +109,7 @@ static void __init free_hash(void) } } -static long __init do_utime(char *filename, time_t mtime) +static long __init do_utime(char *filename, time64_t mtime) { struct timespec64 t[2]; @@ -125,10 +125,10 @@ static __initdata LIST_HEAD(dir_list); struct dir_entry { struct list_head list; char *name; - time_t mtime; + time64_t mtime; }; -static void __init dir_add(const char *name, time_t mtime) +static void __init dir_add(const char *name, time64_t mtime) { struct dir_entry *de = kmalloc(sizeof(struct dir_entry), GFP_KERNEL); if (!de) @@ -150,7 +150,7 @@ static void __init dir_utime(void) } } -static __initdata time_t mtime; +static __initdata time64_t mtime; /* cpio header parsing */ @@ -177,7 +177,7 @@ static void __init parse_header(char *s) uid = parsed[2]; gid = parsed[3]; nlink = parsed[4]; - mtime = parsed[5]; + mtime = parsed[5]; /* breaks in y2106 */ body_len = parsed[6]; major = parsed[7]; minor = parsed[8]; From b8fd99838435f9b420c3e848192bd43abc648b7f Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 17 Nov 2017 15:31:08 -0800 Subject: [PATCH 83/94] sysvipc: unteach ids->next_id for !CHECKPOINT_RESTORE Patch series "sysvipc: ipc-key management improvements". Here are a few improvements I spotted while eyeballing Guillaume's rhashtable implementation for ipc keys. The first and fourth patches are the interesting ones, the middle two are trivial. 
This patch (of 4): The next_id object-allocation functionality was introduced in commit 03f595668017 ("ipc: add sysctl to specify desired next object id"). Given that these new entries are _only_ exported under the CONFIG_CHECKPOINT_RESTORE option, there is no point for the common case to even know about ->next_id. As such, rewrite ipc_buildid() such that it can do away with the field as well as unnecessary branches when adding a new identifier. The end result also better differentiates both cases, so the code ends up being cleaner, albeit with some small duplication in the default case. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20170831172049.14576-2-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 2 ++ ipc/util.c | 60 ++++++++++++++++++++++++++--------- ipc/util.h | 5 --- 3 files changed, 47 insertions(+), 20 deletions(-) diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 474812abe773..d7cf3a850853 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -19,7 +19,9 @@ struct ipc_ids { bool tables_initialized; struct rw_semaphore rwsem; struct idr ipcs_idr; +#ifdef CONFIG_CHECKPOINT_RESTORE int next_id; +#endif struct rhashtable key_ht; }; diff --git a/ipc/util.c b/ipc/util.c index 79b30eee32cd..429c06bdb8ef 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -116,13 +116,15 @@ int ipc_init_ids(struct ipc_ids *ids) int err; ids->in_use = 0; ids->seq = 0; - ids->next_id = -1; init_rwsem(&ids->rwsem); err = rhashtable_init(&ids->key_ht, &ipc_kht_params); if (err) return err; idr_init(&ids->ipcs_idr); ids->tables_initialized = true; +#ifdef CONFIG_CHECKPOINT_RESTORE + ids->next_id = -1; +#endif return 0; } @@ -216,6 +218,46 @@ int ipc_get_maxid(struct ipc_ids *ids) return max_id; } +#ifdef CONFIG_CHECKPOINT_RESTORE +/* + * Specify desired id for next allocated IPC object. + */ +#define ipc_idr_alloc(ids, new) \ + idr_alloc(&(ids)->ipcs_idr, (new), \ + (ids)->next_id < 0 ? 0 : ipcid_to_idx((ids)->next_id),\ + 0, GFP_NOWAIT) + +static inline int ipc_buildid(int id, struct ipc_ids *ids, + struct kern_ipc_perm *new) +{ + if (ids->next_id < 0) { /* default, behave as !CHECKPOINT_RESTORE */ + new->seq = ids->seq++; + if (ids->seq > IPCID_SEQ_MAX) + ids->seq = 0; + } else { + new->seq = ipcid_to_seqx(ids->next_id); + ids->next_id = -1; + } + + return SEQ_MULTIPLIER * new->seq + id; +} + +#else +#define ipc_idr_alloc(ids, new) \ + idr_alloc(&(ids)->ipcs_idr, (new), 0, 0, GFP_NOWAIT) + +static inline int ipc_buildid(int id, struct ipc_ids *ids, + struct kern_ipc_perm *new) +{ + new->seq = ids->seq++; + if (ids->seq > IPCID_SEQ_MAX) + ids->seq = 0; + + return SEQ_MULTIPLIER * new->seq + id; +} + +#endif /* CONFIG_CHECKPOINT_RESTORE */ + /** * ipc_addid - add an ipc identifier * @ids: ipc identifier set * @new: new ipc permission set * @size: limit for the number of used ids * * Add an entry 'new' to the ipc ids idr @@ -234,7 +276,6 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size) kuid_t euid; kgid_t egid; int id, err; - int next_id = ids->next_id; if (size > IPCMNI) size = IPCMNI; @@ -254,9 +295,7 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size) new->cuid = new->uid = euid; new->gid = new->cgid = egid; - id = idr_alloc(&ids->ipcs_idr, new, - (next_id < 0) ?
0 : ipcid_to_idx(next_id), 0, - GFP_NOWAIT); + id = ipc_idr_alloc(ids, new); idr_preload_end(); if (id >= 0 && new->key != IPC_PRIVATE) { @@ -274,17 +313,8 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size) } ids->in_use++; + new->id = ipc_buildid(id, ids, new); - if (next_id < 0) { - new->seq = ids->seq++; - if (ids->seq > IPCID_SEQ_MAX) - ids->seq = 0; - } else { - new->seq = ipcid_to_seqx(next_id); - ids->next_id = -1; - } - - new->id = ipc_buildid(id, new->seq); return id; } diff --git a/ipc/util.h b/ipc/util.h index 579112d90016..0cd6201fe63a 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -146,11 +146,6 @@ extern struct msg_msg *load_msg(const void __user *src, size_t len); extern struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst); extern int store_msg(void __user *dest, struct msg_msg *msg, size_t len); -static inline int ipc_buildid(int id, int seq) -{ - return SEQ_MULTIPLIER * seq + id; -} - static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int uid) { return uid / SEQ_MULTIPLIER != ipcp->seq; From 39c96a1b96a5991b1c9e79b85a8d74ef93b36026 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 17 Nov 2017 15:31:11 -0800 Subject: [PATCH 84/94] sysvipc: duplicate lock comments wrt ipc_addid() The comment in msgqueues when using ipc_addid() is quite useful imo. Duplicate it for shm and semaphores. Link: http://lkml.kernel.org/r/20170831172049.14576-3-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 1 + ipc/shm.c | 1 + 2 files changed, 2 insertions(+) diff --git a/ipc/sem.c b/ipc/sem.c index b2698ebdcb31..28a5c9f0be87 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -515,6 +515,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) sma->sem_nsems = nsems; sma->sem_ctime = ktime_get_real_seconds(); + /* ipc_addid() locks sma upon success. */ retval = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni); if (retval < 0) { call_rcu(&sma->sem_perm.rcu, sem_rcu_free); diff --git a/ipc/shm.c b/ipc/shm.c index bd652755d32c..378c929194ce 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -601,6 +601,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) shp->shm_file = file; shp->shm_creator = current; + /* ipc_addid() locks shp upon success. */ error = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); if (error < 0) goto no_id; From ebf66799acfb5f52ada4ff96ecc9579867941ea9 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 17 Nov 2017 15:31:15 -0800 Subject: [PATCH 85/94] sysvipc: properly name ipc_addid() limit parameter This is better understood as a limit, instead of size; exactly like the function comment indicates. Rename it. Link: http://lkml.kernel.org/r/20170831172049.14576-4-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/util.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ipc/util.c b/ipc/util.c index 429c06bdb8ef..e09bf76610ef 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -262,7 +262,7 @@ static inline int ipc_buildid(int id, struct ipc_ids *ids, * ipc_addid - add an ipc identifier * @ids: ipc identifier set * @new: new ipc permission set - * @size: limit for the number of used ids + * @limit: limit for the number of used ids * * Add an entry 'new' to the ipc ids idr. 
The permissions object is * initialised and the first free entry is set up and the id assigned @@ -271,16 +271,16 @@ static inline int ipc_buildid(int id, struct ipc_ids *ids, * * Called with writer ipc_ids.rwsem held. */ -int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size) +int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) { kuid_t euid; kgid_t egid; int id, err; - if (size > IPCMNI) - size = IPCMNI; + if (limit > IPCMNI) + limit = IPCMNI; - if (!ids->tables_initialized || ids->in_use >= size) + if (!ids->tables_initialized || ids->in_use >= limit) return -ENOSPC; idr_preload(GFP_KERNEL); From 15df03c87983660a4d1eedb4541778592bd97684 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 17 Nov 2017 15:31:18 -0800 Subject: [PATCH 86/94] sysvipc: make get_maxid O(1) again For a custom microbenchmark on a 3.30GHz Xeon SandyBridge, which calls IPC_STAT over and over, it was calculated that, on average, the cost of ipc_get_maxid() for increasing numbers of keys was: 10 keys: ~900 cycles 100 keys: ~15000 cycles 1000 keys: ~150000 cycles 10000 keys: ~2100000 cycles This is unsurprising as maxid is currently O(n). By having the max_id available in O(1) we save all those cycles for each semctl(_STAT) command; the idr_find can be expensive -- which some real (customer) workloads actually poll on. Note that this used to be the case, until commit 7ca7e564e04 ("ipc: store ipcs into IDRs"). The cost is the extra idr_find when doing RMIDs, but we simply go backwards, and it should not take too many iterations to find the new value. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20170831172049.14576-5-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 1 + ipc/util.c | 43 +++++++++++------------------------ ipc/util.h | 21 ++++++++++++++--- 3 files changed, 32 insertions(+), 33 deletions(-) diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index d7cf3a850853..b5630c8eb2f3 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -19,6 +19,7 @@ struct ipc_ids { bool tables_initialized; struct rw_semaphore rwsem; struct idr ipcs_idr; + int max_id; #ifdef CONFIG_CHECKPOINT_RESTORE int next_id; #endif diff --git a/ipc/util.c b/ipc/util.c index e09bf76610ef..ff045fec8d83 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -122,6 +122,7 @@ int ipc_init_ids(struct ipc_ids *ids) return err; idr_init(&ids->ipcs_idr); ids->tables_initialized = true; + ids->max_id = -1; #ifdef CONFIG_CHECKPOINT_RESTORE ids->next_id = -1; #endif @@ -188,36 +189,6 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) return NULL; } -/** - * ipc_get_maxid - get the last assigned id - * @ids: ipc identifier set - * - * Called with ipc_ids.rwsem held. - */ -int ipc_get_maxid(struct ipc_ids *ids) - { - struct kern_ipc_perm *ipc; - int max_id = -1; - int total, id; - - if (ids->in_use == 0) - return -1; - - if (ids->in_use == IPCMNI) - return IPCMNI - 1; - - /* Look for the last assigned id */ - total = 0; - for (id = 0; id < IPCMNI && total < ids->in_use; id++) { - ipc = idr_find(&ids->ipcs_idr, id); - if (ipc != NULL) { - max_id = id; - total++; - } - } - return max_id; -} - #ifdef CONFIG_CHECKPOINT_RESTORE /* * Specify desired id for next allocated IPC object.
@@ -313,6 +284,9 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) } ids->in_use++; + if (id > ids->max_id) + ids->max_id = id; + new->id = ipc_buildid(id, ids, new); return id; @@ -459,6 +433,15 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) ipc_kht_remove(ids, ipcp); ids->in_use--; ipcp->deleted = true; + + if (unlikely(lid == ids->max_id)) { + do { + lid--; + if (lid == -1) + break; + } while (!idr_find(&ids->ipcs_idr, lid)); + ids->max_id = lid; + } } /** diff --git a/ipc/util.h b/ipc/util.h index 0cd6201fe63a..89b8ec176fc4 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -13,6 +13,7 @@ #include <linux/unistd.h> #include <linux/err.h> +#include <linux/ipc_namespace.h> #define SEQ_MULTIPLIER (IPCMNI) @@ -99,9 +100,6 @@ void __init ipc_init_proc_interface(const char *path, const char *header, /* must be called with ids->rwsem acquired for writing */ int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); -/* must be called with ids->rwsem acquired for reading */ -int ipc_get_maxid(struct ipc_ids *); - /* must be called with both locks acquired. */ void ipc_rmid(struct ipc_ids *, struct kern_ipc_perm *); @@ -111,6 +109,23 @@ void ipc_set_key_private(struct ipc_ids *, struct kern_ipc_perm *); /* must be called with ipcp locked */ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg); +/** + * ipc_get_maxid - get the last assigned id + * @ids: ipc identifier set + * + * Called with ipc_ids.rwsem held for reading. + */ +static inline int ipc_get_maxid(struct ipc_ids *ids) +{ + if (ids->in_use == 0) + return -1; + + if (ids->in_use == IPCMNI) + return IPCMNI - 1; + + return ids->max_id; +} + /* * For allocation that need to be freed by RCU. * Objects are reference counted, they start with reference count 1. From 64c349f4ae78723248c474531d0cbc524fc5ba77 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 17 Nov 2017 15:31:22 -0800 Subject: [PATCH 87/94] mm: add infrastructure for get_user_pages_fast() benchmarking Performance of get_user_pages_fast() is critical for some workloads, but it's tricky to test it directly. This patch provides /sys/kernel/debug/gup_benchmark that helps with testing performance of it. See tools/testing/selftests/vm/gup_benchmark.c for userspace counterpart. Link: http://lkml.kernel.org/r/20170908215603.9189-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Cc: Shuah Khan Cc: Ingo Molnar Cc: Thorsten Leemhuis Cc: Jonathan Corbet Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 9 ++ mm/Makefile | 1 + mm/gup_benchmark.c | 100 +++++++++++++++++++++ tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/gup_benchmark.c | 92 +++++++++++++++++++ 5 files changed, 203 insertions(+) create mode 100644 mm/gup_benchmark.c create mode 100644 tools/testing/selftests/vm/gup_benchmark.c diff --git a/mm/Kconfig b/mm/Kconfig index 9c4bdddd80c2..03ff7703d322 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -756,3 +756,12 @@ config PERCPU_STATS This feature collects and exposes statistics via debugfs. The information includes global and per chunk statistics, which can be used to help understand percpu memory usage. + +config GUP_BENCHMARK + bool "Enable infrastructure for get_user_pages_fast() benchmarking" + default n + help + Provides /sys/kernel/debug/gup_benchmark that helps with testing + performance of get_user_pages_fast().
+ + See tools/testing/selftests/vm/gup_benchmark.c diff --git a/mm/Makefile b/mm/Makefile index e7ebd176fb93..e669f02c5a54 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -80,6 +80,7 @@ obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o +obj-$(CONFIG_GUP_BENCHMARK) += gup_benchmark.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c new file mode 100644 index 000000000000..5c8e2abeaa15 --- /dev/null +++ b/mm/gup_benchmark.c @@ -0,0 +1,100 @@ +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/ktime.h> +#include <linux/debugfs.h> + +#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) + +struct gup_benchmark { + __u64 delta_usec; + __u64 addr; + __u64 size; + __u32 nr_pages_per_call; + __u32 flags; +}; + +static int __gup_benchmark_ioctl(unsigned int cmd, + struct gup_benchmark *gup) +{ + ktime_t start_time, end_time; + unsigned long i, nr, nr_pages, addr, next; + struct page **pages; + + nr_pages = gup->size / PAGE_SIZE; + pages = kvzalloc(sizeof(void *) * nr_pages, GFP_KERNEL); + if (!pages) + return -ENOMEM; + + i = 0; + nr = gup->nr_pages_per_call; + start_time = ktime_get(); + for (addr = gup->addr; addr < gup->addr + gup->size; addr = next) { + if (nr != gup->nr_pages_per_call) + break; + + next = addr + nr * PAGE_SIZE; + if (next > gup->addr + gup->size) { + next = gup->addr + gup->size; + nr = (next - addr) / PAGE_SIZE; + } + + nr = get_user_pages_fast(addr, nr, gup->flags & 1, pages + i); + i += nr; + } + end_time = ktime_get(); + + gup->delta_usec = ktime_us_delta(end_time, start_time); + gup->size = addr - gup->addr; + + for (i = 0; i < nr_pages; i++) { + if (!pages[i]) + break; + put_page(pages[i]); + } + + kvfree(pages); + return 0; +} + +static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd, + unsigned long arg) +{ + struct gup_benchmark gup; + int ret; + + if (cmd != GUP_FAST_BENCHMARK) + return -EINVAL; + + if (copy_from_user(&gup, (void __user *)arg, sizeof(gup))) + return -EFAULT; + + ret = __gup_benchmark_ioctl(cmd, &gup); + if (ret) + return ret; + + if (copy_to_user((void __user *)arg, &gup, sizeof(gup))) + return -EFAULT; + + return 0; +} + +static const struct file_operations gup_benchmark_fops = { + .open = nonseekable_open, + .unlocked_ioctl = gup_benchmark_ioctl, +}; + +static int gup_benchmark_init(void) +{ + void *ret; + + ret = debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL, + &gup_benchmark_fops); + if (!ret) + pr_warn("Failed to create gup_benchmark in debugfs"); + + return 0; +} + +late_initcall(gup_benchmark_init); diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index e49eca1915f8..7f45806bd863 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -18,6 +18,7 @@ TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd TEST_GEN_FILES += mlock-random-test TEST_GEN_FILES += virtual_address_range +TEST_GEN_FILES += gup_benchmark TEST_PROGS := run_vmtests diff --git a/tools/testing/selftests/vm/gup_benchmark.c b/tools/testing/selftests/vm/gup_benchmark.c new file mode 100644 index 000000000000..36df55132036 --- /dev/null +++ b/tools/testing/selftests/vm/gup_benchmark.c @@ -0,0 +1,92 @@ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/prctl.h> +#include <sys/stat.h> +#include <sys/types.h> + +#include <linux/types.h> +
+#define MB (1UL << 20) +#define PAGE_SIZE sysconf(_SC_PAGESIZE) + +#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) + +struct gup_benchmark { + __u64 delta_usec; + __u64 addr; + __u64 size; + __u32 nr_pages_per_call; + __u32 flags; +}; + +int main(int argc, char **argv) +{ + struct gup_benchmark gup; + unsigned long size = 128 * MB; + int i, fd, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0; + char *p; + + while ((opt = getopt(argc, argv, "m:r:n:wtT")) != -1) { + switch (opt) { + case 'm': + size = atoi(optarg) * MB; + break; + case 'r': + repeats = atoi(optarg); + break; + case 'n': + nr_pages = atoi(optarg); + break; + case 't': + thp = 1; + break; + case 'T': + thp = 0; + break; + case 'w': + write = 1; + break; + default: + return -1; + } + } + + gup.nr_pages_per_call = nr_pages; + gup.flags = write; + + fd = open("/sys/kernel/debug/gup_benchmark", O_RDWR); + if (fd == -1) + perror("open"), exit(1); + + p = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (p == MAP_FAILED) + perror("mmap"), exit(1); + gup.addr = (unsigned long)p; + + if (thp == 1) + madvise(p, size, MADV_HUGEPAGE); + else if (thp == 0) + madvise(p, size, MADV_NOHUGEPAGE); + + for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE) + p[0] = 0; + + for (i = 0; i < repeats; i++) { + gup.size = size; + if (ioctl(fd, GUP_FAST_BENCHMARK, &gup)) + perror("ioctl"), exit(1); + + printf("Time: %lld us", gup.delta_usec); + if (gup.size != size) + printf(", truncated (size: %lld)", gup.size); + printf("\n"); + } + + return 0; +} From d4258247d9057c848cc1c1ad9581400b5124dedd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Nov 2017 15:31:26 -0800 Subject: [PATCH 88/94] drivers/pcmcia/sa1111_badge4.c: avoid unused function warning pcmv_setup() is only used when the badge4 driver is built-in, but not when it is a loadable module: drivers/pcmcia/sa1111_badge4.c:153:122: error: 'pcmv_setup' defined but not used [-Werror=unused-function] This adds an #ifdef to avoid the definition of the unused function in the modular case. Link: http://lkml.kernel.org/r/20170911201133.3421636-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/pcmcia/sa1111_badge4.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pcmcia/sa1111_badge4.c b/drivers/pcmcia/sa1111_badge4.c index 2f490930430d..93a5c7423d80 100644 --- a/drivers/pcmcia/sa1111_badge4.c +++ b/drivers/pcmcia/sa1111_badge4.c @@ -144,6 +144,7 @@ int pcmcia_badge4_init(struct sa1111_dev *dev) sa11xx_drv_pcmcia_add_one); } +#ifndef MODULE static int __init pcmv_setup(char *s) { int v[4]; @@ -158,3 +159,4 @@ static int __init pcmv_setup(char *s) } __setup("pcmv=", pcmv_setup); +#endif From 5eb9e8ac9a8f8ae98ae4386357683a0b5684bb48 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Fri, 17 Nov 2017 15:31:29 -0800 Subject: [PATCH 89/94] arch/ia64/include/asm/topology.h: remove unused parent_node() macro Commit a7be6e5a7f8d ("mm: drop useless local parameters of __register_one_node()") removed the last user of parent_node(). The parent_node() macro on the IA64 (Itanium) platform is unnecessary. Remove it for cleanup.
Link: http://lkml.kernel.org/r/1504234599-29533-2-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Dou Liyang Reported-by: Michael Ellerman Cc: Tony Luck Cc: Fenghua Yu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/topology.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index 3ad8f6988363..82f9bf702804 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -33,13 +33,6 @@ cpu_all_mask : \ &node_to_cpu_mask[node]) -/* - * Returns the number of the node containing Node 'nid'. - * Not implemented here. Multi-level hierarchies detected with - * the help of node_distance(). - */ -#define parent_node(nid) (nid) - /* * Determines the node for a given pci bus */ From ece15787e72442550f33aaa43dacae033e8628b1 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Fri, 17 Nov 2017 15:31:33 -0800 Subject: [PATCH 90/94] arch/sh/include/asm/topology.h: remove unused parent_node() macro Commit a7be6e5a7f8d ("mm: drop useless local parameters of __register_one_node()") removed the last user of parent_node(). The parent_node() macro on the SUPERH platform is unnecessary. Remove it for cleanup. Link: http://lkml.kernel.org/r/1504234599-29533-5-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Dou Liyang Reported-by: Michael Ellerman Cc: Yoshinori Sato Cc: Rich Felker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/include/asm/topology.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h index 9a32eb4098df..1db470e02456 100644 --- a/arch/sh/include/asm/topology.h +++ b/arch/sh/include/asm/topology.h @@ -5,7 +5,6 @@ #ifdef CONFIG_NUMA #define cpu_to_node(cpu) ((void)(cpu),0) -#define parent_node(node) ((void)(node),0) #define cpumask_of_node(node) ((void)node, cpu_online_mask) From 5f4cdac6bce420ba86c3ab5ec30037822d726ce5 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Fri, 17 Nov 2017 15:31:36 -0800 Subject: [PATCH 91/94] arch/sparc/include/asm/topology_64.h: remove unused parent_node() macro Commit a7be6e5a7f8d ("mm: drop useless local parameters of __register_one_node()") removed the last user of parent_node(). The parent_node() macro on the SPARC64 platform is unnecessary. Remove it for cleanup. Link: http://lkml.kernel.org/r/1504234599-29533-6-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Dou Liyang Reported-by: Michael Ellerman Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/topology_64.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h index 3831b1911a19..34c628a22ea5 100644 --- a/arch/sparc/include/asm/topology_64.h +++ b/arch/sparc/include/asm/topology_64.h @@ -11,8 +11,6 @@ static inline int cpu_to_node(int cpu) return numa_cpu_lookup_table[cpu]; } -#define parent_node(node) (node) - #define cpumask_of_node(node) ((node) == -1 ? \ cpu_all_mask : \ &numa_cpumask_lookup_table[node]) From 52563d05f2500ec9c8d715f3c753b0ef415f9cbb Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Fri, 17 Nov 2017 15:31:40 -0800 Subject: [PATCH 92/94] arch/tile/include/asm/topology.h: remove unused parent_node() macro Commit a7be6e5a7f8d ("mm: drop useless local parameters of __register_one_node()") removed the last user of parent_node(). The parent_node() macro on the tile platform is unnecessary. Remove it for cleanup.
Link: http://lkml.kernel.org/r/1504234599-29533-7-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Dou Liyang Reported-by: Michael Ellerman Acked-by: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/tile/include/asm/topology.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h index b11d5fcd2c41..635a0a4596f0 100644 --- a/arch/tile/include/asm/topology.h +++ b/arch/tile/include/asm/topology.h @@ -29,12 +29,6 @@ static inline int cpu_to_node(int cpu) return cpu_2_node[cpu]; } -/* - * Returns the number of the node containing Node 'node'. - * This architecture is flat, so it is a pretty simple function! - */ -#define parent_node(node) (node) - /* Returns a bitmask of CPUs on Node 'node'. */ static inline const struct cpumask *cpumask_of_node(int node) { From 7016383b44e855209aff47e56f11c59b8aa7b642 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Fri, 17 Nov 2017 15:31:43 -0800 Subject: [PATCH 93/94] include/asm-generic/topology.h: remove unused parent_node() macro Commit a7be6e5a7f8d ("mm: drop useless local parameters of __register_one_node()") removed the last user of parent_node(). The parent_node() macro in the generic case is unnecessary. Remove it for cleanup. Link: http://lkml.kernel.org/r/1504234599-29533-8-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Dou Liyang Reported-by: Michael Ellerman Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/topology.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h index 5d2add1a6c96..238873739550 100644 --- a/include/asm-generic/topology.h +++ b/include/asm-generic/topology.h @@ -44,9 +44,6 @@ #define cpu_to_mem(cpu) ((void)(cpu),0) #endif -#ifndef parent_node -#define parent_node(node) ((void)(node),0) -#endif #ifndef cpumask_of_node #ifdef CONFIG_NEED_MULTIPLE_NODES #define cpumask_of_node(node) ((node) == 0 ? cpu_online_mask : cpu_none_mask) From d1b069f5febc850556cf49e9bb9092d3179c5be5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Nov 2017 15:31:47 -0800 Subject: [PATCH 94/94] EXPERT Kconfig menu: fix broken EXPERT menu Clean up the EXPERT menu (yet again). Move FHANDLE and CHECKPOINT_RESTORE into the primary EXPERT menu since they already depend on EXPERT. Move BPF_SYSCALL and USERFAULTFD out of the EXPERT Kconfig symbols menu list since they do not depend on EXPERT and were breaking the continuity of that menu list. Move all of the KALLSYMS Kconfig symbols to the end of the EXPERT menu. This separates the kernel services from the build options. This patch depends on [PATCH] pci: move PCI_QUIRKS to the PCI bus menu (https://lkml.org/lkml/2017/11/2/907). Link: http://lkml.kernel.org/r/72e4465a-a5ff-cb3c-1a90-11aa4861b161@infradead.org Signed-off-by: Randy Dunlap Acked-by: Daniel Borkmann [BPF] Cc: Andrea Arcangeli Cc: Alexei Starovoitov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/Kconfig | 184 ++++++++++++++++++++++++++------------------------- 1 file changed, 93 insertions(+), 91 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 7d5a6fbac56a..2934249fba46 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -283,19 +283,6 @@ config CROSS_MEMORY_ATTACH to directly read from or write to another process' address space. See the man page for more details.
-config FHANDLE - bool "open by fhandle syscalls" if EXPERT - select EXPORTFS - default y - help - If you say Y here, a user level program will be able to map - file names to handle and then later use the handle for - different file system operations. This is useful in implementing - userspace file servers, which now track files using handles instead - of names. The handle would remain the same even if file names - get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2) - syscalls. - config USELIB bool "uselib syscall" def_bool ALPHA || M68K || SPARC || X86_32 || IA32_EMULATION @@ -883,18 +870,6 @@ config SOCK_CGROUP_DATA endif # CGROUPS -config CHECKPOINT_RESTORE - bool "Checkpoint/restore support" if EXPERT - select PROC_CHILDREN - default n - help - Enables additional kernel features in a sake of checkpoint/restore. - In particular it adds auxiliary prctl codes to setup process text, - data and heap segment sizes, and a few additional /proc filesystem - entries. - - If unsure, say N here. - menuconfig NAMESPACES bool "Namespaces support" if EXPERT depends on MULTIUSER @@ -1163,6 +1138,19 @@ config SYSCTL_SYSCALL If unsure say N here. +config FHANDLE + bool "open by fhandle syscalls" if EXPERT + select EXPORTFS + default y + help + If you say Y here, a user level program will be able to map + file names to handle and then later use the handle for + different file system operations. This is useful in implementing + userspace file servers, which now track files using handles instead + of names. The handle would remain the same even if file names + get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2) + syscalls. + config POSIX_TIMERS bool "Posix Clocks & timers" if EXPERT default y @@ -1180,54 +1168,6 @@ config POSIX_TIMERS If unsure say y. -config KALLSYMS - bool "Load all symbols for debugging/ksymoops" if EXPERT - default y - help - Say Y here to let the kernel print out symbolic crash information and - symbolic stack backtraces. This increases the size of the kernel - somewhat, as all symbols have to be loaded into the kernel image. - -config KALLSYMS_ALL - bool "Include all symbols in kallsyms" - depends on DEBUG_KERNEL && KALLSYMS - help - Normally kallsyms only contains the symbols of functions for nicer - OOPS messages and backtraces (i.e., symbols from the text and inittext - sections). This is sufficient for most cases. And only in very rare - cases (e.g., when a debugger is used) all symbols are required (e.g., - names of variables from the data sections, etc). - - This option makes sure that all symbols are loaded into the kernel - image (i.e., symbols from all sections) in cost of increased kernel - size (depending on the kernel configuration, it may be 300KiB or - something like this). - - Say N unless you really need all symbols. - -config KALLSYMS_ABSOLUTE_PERCPU - bool - depends on KALLSYMS - default X86_64 && SMP - -config KALLSYMS_BASE_RELATIVE - bool - depends on KALLSYMS - default !IA64 && !(TILE && 64BIT) - help - Instead of emitting them as absolute values in the native word size, - emit the symbol references in the kallsyms table as 32-bit entries, - each containing a relative value in the range [base, base + U32_MAX] - or, when KALLSYMS_ABSOLUTE_PERCPU is in effect, each containing either - an absolute value in the range [0, S32_MAX] or a relative value in the - range [base, base + S32_MAX], where base is the lowest relative symbol - address encountered in the image. 
- - On 64-bit builds, this reduces the size of the address table by 50%, - but more importantly, it results in entries whose values are build - time constants, and no relocation pass is required at runtime to fix - up the entries based on the runtime load address of the kernel. - config PRINTK default y bool "Enable support for printk" if EXPERT @@ -1339,16 +1279,6 @@ config EVENTFD If unsure, say Y. -# syscall, maps, verifier -config BPF_SYSCALL - bool "Enable bpf() system call" - select ANON_INODES - select BPF - default n - help - Enable the bpf() system call that allows to manipulate eBPF - programs and maps via file descriptors. - config SHMEM bool "Use full shmem filesystem" if EXPERT default y @@ -1378,14 +1308,6 @@ config ADVISE_SYSCALLS applications use these syscalls, you can disable this option to save space. -config USERFAULTFD - bool "Enable userfaultfd() system call" - select ANON_INODES - depends on MMU - help - Enable the userfaultfd() system call that allows to intercept and - handle page faults in userland. - config MEMBARRIER bool "Enable membarrier() system call" if EXPERT default y @@ -1398,6 +1320,86 @@ config MEMBARRIER If unsure, say Y. +config CHECKPOINT_RESTORE + bool "Checkpoint/restore support" if EXPERT + select PROC_CHILDREN + default n + help + Enables additional kernel features in a sake of checkpoint/restore. + In particular it adds auxiliary prctl codes to setup process text, + data and heap segment sizes, and a few additional /proc filesystem + entries. + + If unsure, say N here. + +config KALLSYMS + bool "Load all symbols for debugging/ksymoops" if EXPERT + default y + help + Say Y here to let the kernel print out symbolic crash information and + symbolic stack backtraces. This increases the size of the kernel + somewhat, as all symbols have to be loaded into the kernel image. + +config KALLSYMS_ALL + bool "Include all symbols in kallsyms" + depends on DEBUG_KERNEL && KALLSYMS + help + Normally kallsyms only contains the symbols of functions for nicer + OOPS messages and backtraces (i.e., symbols from the text and inittext + sections). This is sufficient for most cases. And only in very rare + cases (e.g., when a debugger is used) all symbols are required (e.g., + names of variables from the data sections, etc). + + This option makes sure that all symbols are loaded into the kernel + image (i.e., symbols from all sections) in cost of increased kernel + size (depending on the kernel configuration, it may be 300KiB or + something like this). + + Say N unless you really need all symbols. + +config KALLSYMS_ABSOLUTE_PERCPU + bool + depends on KALLSYMS + default X86_64 && SMP + +config KALLSYMS_BASE_RELATIVE + bool + depends on KALLSYMS + default !IA64 && !(TILE && 64BIT) + help + Instead of emitting them as absolute values in the native word size, + emit the symbol references in the kallsyms table as 32-bit entries, + each containing a relative value in the range [base, base + U32_MAX] + or, when KALLSYMS_ABSOLUTE_PERCPU is in effect, each containing either + an absolute value in the range [0, S32_MAX] or a relative value in the + range [base, base + S32_MAX], where base is the lowest relative symbol + address encountered in the image. + + On 64-bit builds, this reduces the size of the address table by 50%, + but more importantly, it results in entries whose values are build + time constants, and no relocation pass is required at runtime to fix + up the entries based on the runtime load address of the kernel. 
+ +# end of the "standard kernel features (expert users)" menu + +# syscall, maps, verifier +config BPF_SYSCALL + bool "Enable bpf() system call" + select ANON_INODES + select BPF + default n + help + Enable the bpf() system call that allows to manipulate eBPF + programs and maps via file descriptors. + +config USERFAULTFD + bool "Enable userfaultfd() system call" + select ANON_INODES + depends on MMU + help + Enable the userfaultfd() system call that allows to intercept and + handle page faults in userland. + config EMBEDDED bool "Embedded system" option allnoconfig_y
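One appended usage note for the get_user_pages_fast() benchmark introduced in [PATCH 87/94] above: the debugfs interface can be driven by a small standalone client along these lines. This is only a sketch mirroring the selftest's ioctl handshake; the struct layout and GUP_FAST_BENCHMARK number come straight from the patch, while the 16MB region and 64-pages-per-call batch are arbitrary illustrative choices, and debugfs is assumed to be mounted at /sys/kernel/debug on a kernel built with CONFIG_GUP_BENCHMARK=y.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/types.h>

#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark)

struct gup_benchmark {			/* must match mm/gup_benchmark.c */
	__u64 delta_usec;
	__u64 addr;
	__u64 size;
	__u32 nr_pages_per_call;
	__u32 flags;
};

int main(void)
{
	struct gup_benchmark gup;
	size_t size = 16UL << 20;	/* 16MB region; arbitrary */
	char *p;
	int fd;

	fd = open("/sys/kernel/debug/gup_benchmark", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	p = mmap(NULL, size, PROT_READ | PROT_WRITE,
		 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0, size);		/* fault the pages in first */

	memset(&gup, 0, sizeof(gup));
	gup.addr = (unsigned long)p;
	gup.size = size;
	gup.nr_pages_per_call = 64;	/* arbitrary batch size */
	gup.flags = 0;			/* bit 0: 0 = read, 1 = write */

	if (ioctl(fd, GUP_FAST_BENCHMARK, &gup)) {
		perror("ioctl");
		return 1;
	}

	/* the kernel writes back the elapsed time and the range it pinned */
	printf("gup_fast over %llu bytes: %llu us\n",
	       (unsigned long long)gup.size,
	       (unsigned long long)gup.delta_usec);
	return 0;
}

These are the same numbers the selftest prints per repeat; dividing the pinned bytes by the page size and by delta_usec gives an approximate pages-per-microsecond figure (assuming 4KB pages).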