linux/arch/x86/include/asm/mmu_context.h

106 lines
2.6 KiB
C
Raw Normal View History

#ifndef _ASM_X86_MMU_CONTEXT_H
#define _ASM_X86_MMU_CONTEXT_H
#include <asm/desc.h>
#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <trace/events/tlb.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#ifndef CONFIG_PARAVIRT
#include <asm-generic/mm_hooks.h>
static inline void paravirt_activate_mm(struct mm_struct *prev,
struct mm_struct *next)
{
}
#endif /* !CONFIG_PARAVIRT */
/*
* Used for LDT copy/destruction.
*/
int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
void destroy_context(struct mm_struct *mm);
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
#ifdef CONFIG_SMP
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
#endif
}
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
unsigned cpu = smp_processor_id();
if (likely(prev != next)) {
#ifdef CONFIG_SMP
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
this_cpu_write(cpu_tlbstate.active_mm, next);
#endif
cpumask_set_cpu(cpu, mm_cpumask(next));
/* Re-load page tables */
load_cr3(next->pgd);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
sched/x86: Optimize switch_mm() for multi-threaded workloads Dick Fowles, Don Zickus and Joe Mario have been working on improvements to perf, and noticed heavy cache line contention on the mm_cpumask, running linpack on a 60 core / 120 thread system. The cause turned out to be unnecessary atomic accesses to the mm_cpumask. When in lazy TLB mode, the CPU is only removed from the mm_cpumask if there is a TLB flush event. Most of the time, no such TLB flush happens, and the kernel skips the TLB reload. It can also skip the atomic memory set & test. Here is a summary of Joe's test results: * The __schedule function dropped from 24% of all program cycles down to 5.5%. * The cacheline contention/hotness for accesses to that bitmask went from being the 1st/2nd hottest - down to the 84th hottest (0.3% of all shared misses which is now quite cold) * The average load latency for the bit-test-n-set instruction in __schedule dropped from 10k-15k cycles down to an average of 600 cycles. * The linpack program results improved from 133 GFlops to 144 GFlops. Peak GFlops rose from 133 to 153. Reported-by: Don Zickus <dzickus@redhat.com> Reported-by: Joe Mario <jmario@redhat.com> Tested-by: Joe Mario <jmario@redhat.com> Signed-off-by: Rik van Riel <riel@redhat.com> Reviewed-by: Paul Turner <pjt@google.com> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Link: http://lkml.kernel.org/r/20130731221421.616d3d20@annuminas.surriel.com [ Made the comments consistent around the modified code. ] Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-08-01 10:14:21 +08:00
/* Stop flush ipis for the previous mm */
x86, mm: avoid possible bogus tlb entries by clearing prev mm_cpumask after switching mm Clearing the cpu in prev's mm_cpumask early will avoid the flush tlb IPI's while the cr3 is still pointing to the prev mm. And this window can lead to the possibility of bogus TLB fills resulting in strange failures. One such problematic scenario is mentioned below. T1. CPU-1 is context switching from mm1 to mm2 context and got a NMI etc between the point of clearing the cpu from the mm_cpumask(mm1) and before reloading the cr3 with the new mm2. T2. CPU-2 is tearing down a specific vma for mm1 and will proceed with flushing the TLB for mm1. It doesn't send the flush TLB to CPU-1 as it doesn't see that cpu listed in the mm_cpumask(mm1). T3. After the TLB flush is complete, CPU-2 goes ahead and frees the page-table pages associated with the removed vma mapping. T4. CPU-2 now allocates those freed page-table pages for something else. T5. As the CR3 and TLB caches for mm1 is still active on CPU-1, CPU-1 can potentially speculate and walk through the page-table caches and can insert new TLB entries. As the page-table pages are already freed and being used on CPU-2, this page walk can potentially insert a bogus global TLB entry depending on the (random) contents of the page that is being used on CPU-2. T6. This bogus TLB entry being global will be active across future CR3 changes and can result in weird memory corruption etc. To avoid this issue, for the prev mm that is handing over the cpu to another mm, clear the cpu from the mm_cpumask(prev) after the cr3 is changed. Marking it for -stable, though we haven't seen any reported failure that can be attributed to this. Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> Acked-by: Ingo Molnar <mingo@elte.hu> Cc: stable@kernel.org [v2.6.32+] Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-02-04 04:20:04 +08:00
cpumask_clear_cpu(cpu, mm_cpumask(prev));
sched/x86: Optimize switch_mm() for multi-threaded workloads Dick Fowles, Don Zickus and Joe Mario have been working on improvements to perf, and noticed heavy cache line contention on the mm_cpumask, running linpack on a 60 core / 120 thread system. The cause turned out to be unnecessary atomic accesses to the mm_cpumask. When in lazy TLB mode, the CPU is only removed from the mm_cpumask if there is a TLB flush event. Most of the time, no such TLB flush happens, and the kernel skips the TLB reload. It can also skip the atomic memory set & test. Here is a summary of Joe's test results: * The __schedule function dropped from 24% of all program cycles down to 5.5%. * The cacheline contention/hotness for accesses to that bitmask went from being the 1st/2nd hottest - down to the 84th hottest (0.3% of all shared misses which is now quite cold) * The average load latency for the bit-test-n-set instruction in __schedule dropped from 10k-15k cycles down to an average of 600 cycles. * The linpack program results improved from 133 GFlops to 144 GFlops. Peak GFlops rose from 133 to 153. Reported-by: Don Zickus <dzickus@redhat.com> Reported-by: Joe Mario <jmario@redhat.com> Tested-by: Joe Mario <jmario@redhat.com> Signed-off-by: Rik van Riel <riel@redhat.com> Reviewed-by: Paul Turner <pjt@google.com> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Link: http://lkml.kernel.org/r/20130731221421.616d3d20@annuminas.surriel.com [ Made the comments consistent around the modified code. ] Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-08-01 10:14:21 +08:00
/* Load the LDT, if the LDT is different: */
if (unlikely(prev->context.ldt != next->context.ldt))
load_LDT_nolock(&next->context);
}
#ifdef CONFIG_SMP
sched/x86: Optimize switch_mm() for multi-threaded workloads Dick Fowles, Don Zickus and Joe Mario have been working on improvements to perf, and noticed heavy cache line contention on the mm_cpumask, running linpack on a 60 core / 120 thread system. The cause turned out to be unnecessary atomic accesses to the mm_cpumask. When in lazy TLB mode, the CPU is only removed from the mm_cpumask if there is a TLB flush event. Most of the time, no such TLB flush happens, and the kernel skips the TLB reload. It can also skip the atomic memory set & test. Here is a summary of Joe's test results: * The __schedule function dropped from 24% of all program cycles down to 5.5%. * The cacheline contention/hotness for accesses to that bitmask went from being the 1st/2nd hottest - down to the 84th hottest (0.3% of all shared misses which is now quite cold) * The average load latency for the bit-test-n-set instruction in __schedule dropped from 10k-15k cycles down to an average of 600 cycles. * The linpack program results improved from 133 GFlops to 144 GFlops. Peak GFlops rose from 133 to 153. Reported-by: Don Zickus <dzickus@redhat.com> Reported-by: Joe Mario <jmario@redhat.com> Tested-by: Joe Mario <jmario@redhat.com> Signed-off-by: Rik van Riel <riel@redhat.com> Reviewed-by: Paul Turner <pjt@google.com> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Link: http://lkml.kernel.org/r/20130731221421.616d3d20@annuminas.surriel.com [ Made the comments consistent around the modified code. ] Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-08-01 10:14:21 +08:00
else {
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
sched/x86: Optimize switch_mm() for multi-threaded workloads Dick Fowles, Don Zickus and Joe Mario have been working on improvements to perf, and noticed heavy cache line contention on the mm_cpumask, running linpack on a 60 core / 120 thread system. The cause turned out to be unnecessary atomic accesses to the mm_cpumask. When in lazy TLB mode, the CPU is only removed from the mm_cpumask if there is a TLB flush event. Most of the time, no such TLB flush happens, and the kernel skips the TLB reload. It can also skip the atomic memory set & test. Here is a summary of Joe's test results: * The __schedule function dropped from 24% of all program cycles down to 5.5%. * The cacheline contention/hotness for accesses to that bitmask went from being the 1st/2nd hottest - down to the 84th hottest (0.3% of all shared misses which is now quite cold) * The average load latency for the bit-test-n-set instruction in __schedule dropped from 10k-15k cycles down to an average of 600 cycles. * The linpack program results improved from 133 GFlops to 144 GFlops. Peak GFlops rose from 133 to 153. Reported-by: Don Zickus <dzickus@redhat.com> Reported-by: Joe Mario <jmario@redhat.com> Tested-by: Joe Mario <jmario@redhat.com> Signed-off-by: Rik van Riel <riel@redhat.com> Reviewed-by: Paul Turner <pjt@google.com> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Link: http://lkml.kernel.org/r/20130731221421.616d3d20@annuminas.surriel.com [ Made the comments consistent around the modified code. ] Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-08-01 10:14:21 +08:00
if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
/*
* On established mms, the mm_cpumask is only changed
* from irq context, from ptep_clear_flush() while in
* lazy tlb mode, and here. Irqs are blocked during
* schedule, protecting us from simultaneous changes.
*/
cpumask_set_cpu(cpu, mm_cpumask(next));
/*
* We were in lazy tlb mode and leave_mm disabled
* tlb flush IPI delivery. We must reload CR3
* to make sure to use no freed page tables.
*/
load_cr3(next->pgd);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
load_LDT_nolock(&next->context);
}
}
#endif
}
#define activate_mm(prev, next) \
do { \
paravirt_activate_mm((prev), (next)); \
switch_mm((prev), (next), NULL); \
} while (0);
#ifdef CONFIG_X86_32
#define deactivate_mm(tsk, mm) \
do { \
lazy_load_gs(0); \
} while (0)
#else
#define deactivate_mm(tsk, mm) \
do { \
load_gs_index(0); \
loadsegment(fs, 0); \
} while (0)
#endif
#endif /* _ASM_X86_MMU_CONTEXT_H */