2011-01-15 18:17:56 +08:00
|
|
|
/*
|
|
|
|
* linux/arch/unicore32/include/asm/mmu_context.h
|
|
|
|
*
|
|
|
|
* Code specific to PKUnity SoC and UniCore ISA
|
|
|
|
*
|
|
|
|
* Copyright (C) 2001-2010 GUAN Xue-tao
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License version 2 as
|
|
|
|
* published by the Free Software Foundation.
|
|
|
|
*/
|
|
|
|
#ifndef __UNICORE_MMU_CONTEXT_H__
|
|
|
|
#define __UNICORE_MMU_CONTEXT_H__
|
|
|
|
|
|
|
|
#include <linux/compiler.h>
|
|
|
|
#include <linux/sched.h>
|
mm: per-thread vma caching
This patch is a continuation of efforts trying to optimize find_vma(),
avoiding potentially expensive rbtree walks to locate a vma upon faults.
The original approach (https://lkml.org/lkml/2013/11/1/410), where the
largest vma was also cached, ended up being too specific and random,
thus further comparison with other approaches were needed. There are
two things to consider when dealing with this, the cache hit rate and
the latency of find_vma(). Improving the hit-rate does not necessarily
translate in finding the vma any faster, as the overhead of any fancy
caching schemes can be too high to consider.
We currently cache the last used vma for the whole address space, which
provides a nice optimization, reducing the total cycles in find_vma() by
up to 250%, for workloads with good locality. On the other hand, this
simple scheme is pretty much useless for workloads with poor locality.
Analyzing ebizzy runs shows that, no matter how many threads are
running, the mmap_cache hit rate is less than 2%, and in many situations
below 1%.
The proposed approach is to replace this scheme with a small per-thread
cache, maximizing hit rates at a very low maintenance cost.
Invalidations are performed by simply bumping up a 32-bit sequence
number. The only expensive operation is in the rare case of a seq
number overflow, where all caches that share the same address space are
flushed. Upon a miss, the proposed replacement policy is based on the
page number that contains the virtual address in question. Concretely,
the following results are seen on an 80 core, 8 socket x86-64 box:
1) System bootup: Most programs are single threaded, so the per-thread
scheme does improve ~50% hit rate by just adding a few more slots to
the cache.
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 50.61% | 19.90 |
| patched | 73.45% | 13.58 |
+----------------+----------+------------------+
2) Kernel build: This one is already pretty good with the current
approach as we're dealing with good locality.
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 75.28% | 11.03 |
| patched | 88.09% | 9.31 |
+----------------+----------+------------------+
3) Oracle 11g Data Mining (4k pages): Similar to the kernel build workload.
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 70.66% | 17.14 |
| patched | 91.15% | 12.57 |
+----------------+----------+------------------+
4) Ebizzy: There's a fair amount of variation from run to run, but this
approach always shows nearly perfect hit rates, while baseline is just
about non-existent. The amounts of cycles can fluctuate between
anywhere from ~60 to ~116 for the baseline scheme, but this approach
reduces it considerably. For instance, with 80 threads:
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 1.06% | 91.54 |
| patched | 99.97% | 14.18 |
+----------------+----------+------------------+
[akpm@linux-foundation.org: fix nommu build, per Davidlohr]
[akpm@linux-foundation.org: document vmacache_valid() logic]
[akpm@linux-foundation.org: attempt to untangle header files]
[akpm@linux-foundation.org: add vmacache_find() BUG_ON]
[hughd@google.com: add vmacache_valid_mm() (from Oleg)]
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: adjust and enhance comments]
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Michel Lespinasse <walken@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Tested-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-08 06:37:25 +08:00
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/vmacache.h>
|
2011-01-15 18:17:56 +08:00
|
|
|
#include <linux/io.h>
|
|
|
|
|
|
|
|
#include <asm/cacheflush.h>
|
|
|
|
#include <asm/cpu-single.h>
|
|
|
|
|
|
|
|
#define init_new_context(tsk, mm) 0
|
|
|
|
|
|
|
|
#define destroy_context(mm) do { } while (0)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is called when "tsk" is about to enter lazy TLB mode.
|
|
|
|
*
|
|
|
|
* mm: describes the currently active mm context
|
|
|
|
* tsk: task which is entering lazy tlb
|
|
|
|
* cpu: cpu number which is entering lazy tlb
|
|
|
|
*
|
|
|
|
* tsk->mm will be NULL
|
|
|
|
*/
|
|
|
|
static inline void
|
|
|
|
enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is the actual mm switch as far as the scheduler
|
|
|
|
* is concerned. No registers are touched. We avoid
|
|
|
|
* calling the CPU specific function when the mm hasn't
|
|
|
|
* actually changed.
|
|
|
|
*/
|
|
|
|
static inline void
|
|
|
|
switch_mm(struct mm_struct *prev, struct mm_struct *next,
|
|
|
|
struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
unsigned int cpu = smp_processor_id();
|
|
|
|
|
|
|
|
if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next)) || prev != next)
|
|
|
|
cpu_switch_mm(next->pgd, next);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define deactivate_mm(tsk, mm) do { } while (0)
|
|
|
|
#define activate_mm(prev, next) switch_mm(prev, next, NULL)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We are inserting a "fake" vma for the user-accessible vector page so
|
|
|
|
* gdb and friends can get to it through ptrace and /proc/<pid>/mem.
|
|
|
|
* But we also want to remove it before the generic code gets to see it
|
|
|
|
* during process exit or the unmapping of it would cause total havoc.
|
|
|
|
* (the macro is used as remove_vma() is static to mm/mmap.c)
|
|
|
|
*/
|
|
|
|
#define arch_exit_mmap(mm) \
|
|
|
|
do { \
|
|
|
|
struct vm_area_struct *high_vma = find_vma(mm, 0xffff0000); \
|
|
|
|
if (high_vma) { \
|
|
|
|
BUG_ON(high_vma->vm_next); /* it should be last */ \
|
|
|
|
if (high_vma->vm_prev) \
|
|
|
|
high_vma->vm_prev->vm_next = NULL; \
|
|
|
|
else \
|
|
|
|
mm->mmap = NULL; \
|
|
|
|
rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
|
mm: per-thread vma caching
This patch is a continuation of efforts trying to optimize find_vma(),
avoiding potentially expensive rbtree walks to locate a vma upon faults.
The original approach (https://lkml.org/lkml/2013/11/1/410), where the
largest vma was also cached, ended up being too specific and random,
thus further comparison with other approaches were needed. There are
two things to consider when dealing with this, the cache hit rate and
the latency of find_vma(). Improving the hit-rate does not necessarily
translate in finding the vma any faster, as the overhead of any fancy
caching schemes can be too high to consider.
We currently cache the last used vma for the whole address space, which
provides a nice optimization, reducing the total cycles in find_vma() by
up to 250%, for workloads with good locality. On the other hand, this
simple scheme is pretty much useless for workloads with poor locality.
Analyzing ebizzy runs shows that, no matter how many threads are
running, the mmap_cache hit rate is less than 2%, and in many situations
below 1%.
The proposed approach is to replace this scheme with a small per-thread
cache, maximizing hit rates at a very low maintenance cost.
Invalidations are performed by simply bumping up a 32-bit sequence
number. The only expensive operation is in the rare case of a seq
number overflow, where all caches that share the same address space are
flushed. Upon a miss, the proposed replacement policy is based on the
page number that contains the virtual address in question. Concretely,
the following results are seen on an 80 core, 8 socket x86-64 box:
1) System bootup: Most programs are single threaded, so the per-thread
scheme does improve ~50% hit rate by just adding a few more slots to
the cache.
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 50.61% | 19.90 |
| patched | 73.45% | 13.58 |
+----------------+----------+------------------+
2) Kernel build: This one is already pretty good with the current
approach as we're dealing with good locality.
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 75.28% | 11.03 |
| patched | 88.09% | 9.31 |
+----------------+----------+------------------+
3) Oracle 11g Data Mining (4k pages): Similar to the kernel build workload.
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 70.66% | 17.14 |
| patched | 91.15% | 12.57 |
+----------------+----------+------------------+
4) Ebizzy: There's a fair amount of variation from run to run, but this
approach always shows nearly perfect hit rates, while baseline is just
about non-existent. The amounts of cycles can fluctuate between
anywhere from ~60 to ~116 for the baseline scheme, but this approach
reduces it considerably. For instance, with 80 threads:
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 1.06% | 91.54 |
| patched | 99.97% | 14.18 |
+----------------+----------+------------------+
[akpm@linux-foundation.org: fix nommu build, per Davidlohr]
[akpm@linux-foundation.org: document vmacache_valid() logic]
[akpm@linux-foundation.org: attempt to untangle header files]
[akpm@linux-foundation.org: add vmacache_find() BUG_ON]
[hughd@google.com: add vmacache_valid_mm() (from Oleg)]
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: adjust and enhance comments]
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Michel Lespinasse <walken@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Tested-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-08 06:37:25 +08:00
|
|
|
vmacache_invalidate(mm); \
|
2011-01-15 18:17:56 +08:00
|
|
|
mm->map_count--; \
|
|
|
|
remove_vma(high_vma); \
|
|
|
|
} \
|
|
|
|
} while (0)
|
|
|
|
|
2017-12-14 19:27:29 +08:00
|
|
|
static inline int arch_dup_mmap(struct mm_struct *oldmm,
|
|
|
|
struct mm_struct *mm)
|
2011-01-15 18:17:56 +08:00
|
|
|
{
|
2017-12-14 19:27:29 +08:00
|
|
|
return 0;
|
2011-01-15 18:17:56 +08:00
|
|
|
}
|
|
|
|
|
2014-11-19 02:23:50 +08:00
|
|
|
static inline void arch_unmap(struct mm_struct *mm,
|
|
|
|
struct vm_area_struct *vma,
|
|
|
|
unsigned long start, unsigned long end)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void arch_bprm_mm_init(struct mm_struct *mm,
|
|
|
|
struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2016-02-13 05:02:21 +08:00
|
|
|
static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
|
2016-03-21 19:20:53 +08:00
|
|
|
bool write, bool execute, bool foreign)
|
mm/gup, x86/mm/pkeys: Check VMAs and PTEs for protection keys
Today, for normal faults and page table walks, we check the VMA
and/or PTE to ensure that it is compatible with the action. For
instance, if we get a write fault on a non-writeable VMA, we
SIGSEGV.
We try to do the same thing for protection keys. Basically, we
try to make sure that if a user does this:
mprotect(ptr, size, PROT_NONE);
*ptr = foo;
they see the same effects with protection keys when they do this:
mprotect(ptr, size, PROT_READ|PROT_WRITE);
set_pkey(ptr, size, 4);
wrpkru(0xffffff3f); // access disable pkey 4
*ptr = foo;
The state to do that checking is in the VMA, but we also
sometimes have to do it on the page tables only, like when doing
a get_user_pages_fast() where we have no VMA.
We add two functions and expose them to generic code:
arch_pte_access_permitted(pte_flags, write)
arch_vma_access_permitted(vma, write)
These are, of course, backed up in x86 arch code with checks
against the PTE or VMA's protection key.
But, there are also cases where we do not want to respect
protection keys. When we ptrace(), for instance, we do not want
to apply the tracer's PKRU permissions to the PTEs from the
process being traced.
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boaz Harrosh <boaz@plexistor.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave@sr71.net>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: David Hildenbrand <dahi@linux.vnet.ibm.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Dominik Dingel <dingel@linux.vnet.ibm.com>
Cc: Dominik Vogt <vogt@linux.vnet.ibm.com>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jason Low <jason.low2@hp.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Matthew Wilcox <willy@linux.intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Shachar Raindel <raindel@mellanox.com>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: linux-arch@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: linux-s390@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lkml.kernel.org/r/20160212210219.14D5D715@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-02-13 05:02:19 +08:00
|
|
|
{
|
|
|
|
/* by default, allow everything */
|
|
|
|
return true;
|
|
|
|
}
|
2011-01-15 18:17:56 +08:00
|
|
|
#endif
|