linux/arch/powerpc/include/asm/cputable.h

564 lines
21 KiB
C
Raw Normal View History

#ifndef __ASM_POWERPC_CPUTABLE_H
#define __ASM_POWERPC_CPUTABLE_H
#define PPC_FEATURE_32 0x80000000
#define PPC_FEATURE_64 0x40000000
#define PPC_FEATURE_601_INSTR 0x20000000
#define PPC_FEATURE_HAS_ALTIVEC 0x10000000
#define PPC_FEATURE_HAS_FPU 0x08000000
#define PPC_FEATURE_HAS_MMU 0x04000000
#define PPC_FEATURE_HAS_4xxMAC 0x02000000
#define PPC_FEATURE_UNIFIED_CACHE 0x01000000
#define PPC_FEATURE_HAS_SPE 0x00800000
#define PPC_FEATURE_HAS_EFP_SINGLE 0x00400000
#define PPC_FEATURE_HAS_EFP_DOUBLE 0x00200000
2005-10-22 14:51:34 +08:00
#define PPC_FEATURE_NO_TB 0x00100000
#define PPC_FEATURE_POWER4 0x00080000
#define PPC_FEATURE_POWER5 0x00040000
#define PPC_FEATURE_POWER5_PLUS 0x00020000
#define PPC_FEATURE_CELL 0x00010000
#define PPC_FEATURE_BOOKE 0x00008000
#define PPC_FEATURE_SMT 0x00004000
#define PPC_FEATURE_ICACHE_SNOOP 0x00002000
#define PPC_FEATURE_ARCH_2_05 0x00001000
#define PPC_FEATURE_PA6T 0x00000800
#define PPC_FEATURE_HAS_DFP 0x00000400
#define PPC_FEATURE_POWER6_EXT 0x00000200
#define PPC_FEATURE_ARCH_2_06 0x00000100
#define PPC_FEATURE_HAS_VSX 0x00000080
#define PPC_FEATURE_PSERIES_PERFMON_COMPAT \
0x00000040
#define PPC_FEATURE_TRUE_LE 0x00000002
#define PPC_FEATURE_PPC_LE 0x00000001
#ifdef __KERNEL__
#include <asm/asm-compat.h>
#include <asm/feature-fixups.h>
#ifndef __ASSEMBLY__
/* This structure can grow, it's real size is used by head.S code
* via the mkdefs mechanism.
*/
struct cpu_spec;
typedef void (*cpu_setup_t)(unsigned long offset, struct cpu_spec* spec);
typedef void (*cpu_restore_t)(void);
enum powerpc_oprofile_type {
PPC_OPROFILE_INVALID = 0,
PPC_OPROFILE_RS64 = 1,
PPC_OPROFILE_POWER4 = 2,
PPC_OPROFILE_G4 = 3,
PPC_OPROFILE_FSL_EMB = 4,
[POWERPC] cell: Add oprofile support Add PPU event-based and cycle-based profiling support to Oprofile for Cell. Oprofile is expected to collect data on all CPUs simultaneously. However, there is one set of performance counters per node. There are two hardware threads or virtual CPUs on each node. Hence, OProfile must multiplex in time the performance counter collection on the two virtual CPUs. The multiplexing of the performance counters is done by a virtual counter routine. Initially, the counters are configured to collect data on the even CPUs in the system, one CPU per node. In order to capture the PC for the virtual CPU when the performance counter interrupt occurs (the specified number of events between samples has occurred), the even processors are configured to handle the performance counter interrupts for their node. The virtual counter routine is called via a kernel timer after the virtual sample time. The routine stops the counters, saves the current counts, loads the last counts for the other virtual CPU on the node, sets interrupts to be handled by the other virtual CPU and restarts the counters, the virtual timer routine is scheduled to run again. The virtual sample time is kept relatively small to make sure sampling occurs on both CPUs on the node with a relatively small granularity. Whenever the counters overflow, the performance counter interrupt is called to collect the PC for the CPU where data is being collected. The oprofile driver relies on a firmware RTAS call to setup the debug bus to route the desired signals to the performance counter hardware to be counted. The RTAS call must set the routing registers appropriately in each of the islands to pass the signals down the debug bus as well as routing the signals from a particular island onto the bus. There is a second firmware RTAS call to reset the debug bus to the non pass thru state when the counters are not in use. Signed-off-by: Carl Love <carll@us.ibm.com> Signed-off-by: Maynard Johnson <mpjohn@us.ibm.com> Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com> Signed-off-by: Paul Mackerras <paulus@samba.org>
2006-11-21 01:45:16 +08:00
PPC_OPROFILE_CELL = 5,
PPC_OPROFILE_PA6T = 6,
};
enum powerpc_pmc_type {
PPC_PMC_DEFAULT = 0,
PPC_PMC_IBM = 1,
PPC_PMC_PA6T = 2,
PPC_PMC_G4 = 3,
};
struct pt_regs;
extern int machine_check_generic(struct pt_regs *regs);
extern int machine_check_4xx(struct pt_regs *regs);
extern int machine_check_440A(struct pt_regs *regs);
extern int machine_check_e500mc(struct pt_regs *regs);
extern int machine_check_e500(struct pt_regs *regs);
extern int machine_check_e200(struct pt_regs *regs);
extern int machine_check_47x(struct pt_regs *regs);
[POWERPC] Fix performance monitor on machines with logical PVR Some IBM machines supply a "logical" PVR (processor version register) value in the device tree in the cpu nodes rather than the real PVR. This is used for instance to indicate that the processors in a POWER6 partition have been configured by the hypervisor to run in POWER5+ mode rather than POWER6 mode. To cope with this, we call identify_cpu a second time with the logical PVR value (the first call is with the real PVR value in the very early setup code). However, POWER5+ machines can also supply a logical PVR value, and use the same value (the value that indicates a v2.04 architecture compliant processor). This causes problems for code that uses the performance monitor (such as oprofile), because the PMU registers are different in POWER6 (even in POWER5+ mode) from the real POWER5+. This change works around this problem by taking out the PMU information from the cputable entries for the logical PVR values, and changing identify_cpu so that the second call to it won't overwrite the PMU information that was established by the first call (the one with the real PVR), but does update the other fields. Specifically, if the cputable entry for the logical PVR value has num_pmcs == 0, none of the PMU-related fields get used. So that we can create a mixed cputable entry, we now make cur_cpu_spec point to a single static struct cpu_spec, and copy stuff from cpu_specs[i] into it. This has the side-effect that we can now make cpu_specs[] be initdata. Ultimately it would be good to move the PMU-related fields out to a separate structure, pointed to by the cputable entries, and change identify_cpu so that it saves the PMU info pointer, copies the whole structure, and restores the PMU info pointer, rather than identify_cpu having to list all the fields that are *not* PMU-related. Signed-off-by: Paul Mackerras <paulus@samba.org> Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2007-10-04 12:18:01 +08:00
/* NOTE WELL: Update identify_cpu() if fields are added or removed! */
struct cpu_spec {
/* CPU is matched via (PVR & pvr_mask) == pvr_value */
unsigned int pvr_mask;
unsigned int pvr_value;
char *cpu_name;
unsigned long cpu_features; /* Kernel features */
unsigned int cpu_user_features; /* Userland features */
unsigned int mmu_features; /* MMU features */
/* cache line sizes */
unsigned int icache_bsize;
unsigned int dcache_bsize;
/* number of performance monitor counters */
unsigned int num_pmcs;
enum powerpc_pmc_type pmc_type;
/* this is called to initialize various CPU bits like L1 cache,
* BHT, SPD, etc... from head.S before branching to identify_machine
*/
cpu_setup_t cpu_setup;
/* Used to restore cpu setup on secondary processors and at resume */
cpu_restore_t cpu_restore;
/* Used by oprofile userspace to select the right counters */
char *oprofile_cpu_type;
/* Processor specific oprofile operations */
enum powerpc_oprofile_type oprofile_type;
/* Bit locations inside the mmcra change */
unsigned long oprofile_mmcra_sihv;
unsigned long oprofile_mmcra_sipr;
/* Bits to clear during an oprofile exception */
unsigned long oprofile_mmcra_clear;
/* Name of processor class, for the ELF AT_PLATFORM entry */
char *platform;
/* Processor specific machine check handling. Return negative
* if the error is fatal, 1 if it was fully recovered and 0 to
* pass up (not CPU originated) */
int (*machine_check)(struct pt_regs *regs);
};
extern struct cpu_spec *cur_cpu_spec;
extern unsigned int __start___ftr_fixup, __stop___ftr_fixup;
extern struct cpu_spec *identify_cpu(unsigned long offset, unsigned int pvr);
[POWERPC] Support feature fixups in vdso's This patch reworks the feature fixup mecanism so vdso's can be fixed up. The main issue was that the construct: .long label (or .llong on 64 bits) will not work in the case of a shared library like the vdso. It will generate an empty placeholder in the fixup table along with a reloc, which is not something we can deal with in the vdso. The idea here (thanks Alan Modra !) is to instead use something like: 1: .long label - 1b That is, the feature fixup tables no longer contain addresses of bits of code to patch, but offsets of such code from the fixup table entry itself. That is properly resolved by ld when building the .so's. I've modified the fixup mecanism generically to use that method for the rest of the kernel as well. Another trick is that the 32 bits vDSO included in the 64 bits kernel need to have a table in the 64 bits format. However, gas does not support 32 bits code with a statement of the form: .llong label - 1b (Or even just .llong label) That is, it cannot emit the right fixup/relocation for the linker to use to assign a 32 bits address to an .llong field. Thus, in the specific case of the 32 bits vdso built as part of the 64 bits kernel, we are using a modified macro that generates: .long 0xffffffff .llong label - 1b Note that is assumes that the value is negative which is enforced by the .lds (those offsets are always negative as the .text is always before the fixup table and gas doesn't support emiting the reloc the other way around). Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Signed-off-by: Paul Mackerras <paulus@samba.org>
2006-10-20 09:47:18 +08:00
extern void do_feature_fixups(unsigned long value, void *fixup_start,
void *fixup_end);
extern const char *powerpc_base_platform;
#endif /* __ASSEMBLY__ */
/* CPU kernel features */
/* Retain the 32b definitions all use bottom half of word */
#define CPU_FTR_COHERENT_ICACHE ASM_CONST(0x0000000000000001)
#define CPU_FTR_L2CR ASM_CONST(0x0000000000000002)
#define CPU_FTR_SPEC7450 ASM_CONST(0x0000000000000004)
#define CPU_FTR_ALTIVEC ASM_CONST(0x0000000000000008)
#define CPU_FTR_TAU ASM_CONST(0x0000000000000010)
#define CPU_FTR_CAN_DOZE ASM_CONST(0x0000000000000020)
#define CPU_FTR_USE_TB ASM_CONST(0x0000000000000040)
#define CPU_FTR_L2CSR ASM_CONST(0x0000000000000080)
#define CPU_FTR_601 ASM_CONST(0x0000000000000100)
#define CPU_FTR_DBELL ASM_CONST(0x0000000000000200)
#define CPU_FTR_CAN_NAP ASM_CONST(0x0000000000000400)
#define CPU_FTR_L3CR ASM_CONST(0x0000000000000800)
#define CPU_FTR_L3_DISABLE_NAP ASM_CONST(0x0000000000001000)
#define CPU_FTR_NAP_DISABLE_L2_PR ASM_CONST(0x0000000000002000)
#define CPU_FTR_DUAL_PLL_750FX ASM_CONST(0x0000000000004000)
#define CPU_FTR_NO_DPM ASM_CONST(0x0000000000008000)
#define CPU_FTR_476_DD2 ASM_CONST(0x0000000000010000)
#define CPU_FTR_NEED_COHERENT ASM_CONST(0x0000000000020000)
#define CPU_FTR_NO_BTIC ASM_CONST(0x0000000000040000)
#define CPU_FTR_DEBUG_LVL_EXC ASM_CONST(0x0000000000080000)
#define CPU_FTR_NODSISRALIGN ASM_CONST(0x0000000000100000)
#define CPU_FTR_PPC_LE ASM_CONST(0x0000000000200000)
#define CPU_FTR_REAL_LE ASM_CONST(0x0000000000400000)
#define CPU_FTR_FPU_UNAVAILABLE ASM_CONST(0x0000000000800000)
#define CPU_FTR_UNIFIED_ID_CACHE ASM_CONST(0x0000000001000000)
#define CPU_FTR_SPE ASM_CONST(0x0000000002000000)
#define CPU_FTR_NEED_PAIRED_STWCX ASM_CONST(0x0000000004000000)
#define CPU_FTR_LWSYNC ASM_CONST(0x0000000008000000)
#define CPU_FTR_NOEXECUTE ASM_CONST(0x0000000010000000)
#define CPU_FTR_INDEXED_DCR ASM_CONST(0x0000000020000000)
#define CPU_FTR_EMB_HV ASM_CONST(0x0000000040000000)
/*
* Add the 64-bit processor unique features in the top half of the word;
* on 32-bit, make the names available but defined to be 0.
*/
#ifdef __powerpc64__
#define LONG_ASM_CONST(x) ASM_CONST(x)
#else
#define LONG_ASM_CONST(x) 0
#endif
#define CPU_FTR_HVMODE LONG_ASM_CONST(0x0000000200000000)
#define CPU_FTR_ARCH_201 LONG_ASM_CONST(0x0000000400000000)
#define CPU_FTR_ARCH_206 LONG_ASM_CONST(0x0000000800000000)
#define CPU_FTR_CFAR LONG_ASM_CONST(0x0000001000000000)
#define CPU_FTR_IABR LONG_ASM_CONST(0x0000002000000000)
#define CPU_FTR_MMCRA LONG_ASM_CONST(0x0000004000000000)
#define CPU_FTR_CTRL LONG_ASM_CONST(0x0000008000000000)
#define CPU_FTR_SMT LONG_ASM_CONST(0x0000010000000000)
#define CPU_FTR_PAUSE_ZERO LONG_ASM_CONST(0x0000200000000000)
#define CPU_FTR_PURR LONG_ASM_CONST(0x0000400000000000)
#define CPU_FTR_CELL_TB_BUG LONG_ASM_CONST(0x0000800000000000)
#define CPU_FTR_SPURR LONG_ASM_CONST(0x0001000000000000)
#define CPU_FTR_DSCR LONG_ASM_CONST(0x0002000000000000)
#define CPU_FTR_VSX LONG_ASM_CONST(0x0010000000000000)
#define CPU_FTR_SAO LONG_ASM_CONST(0x0020000000000000)
#define CPU_FTR_CP_USE_DCBTZ LONG_ASM_CONST(0x0040000000000000)
#define CPU_FTR_UNALIGNED_LD_STD LONG_ASM_CONST(0x0080000000000000)
#define CPU_FTR_ASYM_SMT LONG_ASM_CONST(0x0100000000000000)
powerpc: Feature nop out reservation clear when stcx checks address The POWER architecture does not require stcx to check that it is operating on the same address as the larx. This means it is possible for an an exception handler to execute a larx, get a reservation, decide not to do the stcx and then return back with an active reservation. If the interrupted code was in the middle of a larx/stcx sequence the stcx could incorrectly succeed. All recent POWER CPUs check the address before letting the stcx succeed so we can create a CPU feature and nop it out. As Ben suggested, we can only do this in our syscall path because there is a remote possibility some kernel code gets interrupted by an exception that ends up operating on the same cacheline. Thanks to Paul Mackerras and Derek Williams for the idea. To test this I used a very simple null syscall (actually getppid) testcase at http://ozlabs.org/~anton/junkcode/null_syscall.c I tested against 2.6.35-git10 with the following changes against the pseries_defconfig: CONFIG_VIRT_CPU_ACCOUNTING=n CONFIG_AUDIT=n CONFIG_PPC_4K_PAGES=n CONFIG_PPC_64K_PAGES=y CONFIG_FORCE_MAX_ZONEORDER=9 CONFIG_PPC_SUBPAGE_PROT=n CONFIG_FUNCTION_TRACER=n CONFIG_FUNCTION_GRAPH_TRACER=n CONFIG_IRQSOFF_TRACER=n CONFIG_STACK_TRACER=n to remove the overhead of virtual CPU accounting, syscall auditing and the ftrace mcount tracers. 64kB pages were enabled to minimise TLB misses. POWER6: +8.2% POWER7: +7.0% Another suggestion was to use a larx to something in the L1 instead of a stcx. This was almost as fast as removing the larx on POWER6, but only 3.5% faster on POWER7. We can use this to speed up the reservation clear in our exception exit code. Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2010-08-11 09:40:27 +08:00
#define CPU_FTR_STCX_CHECKS_ADDRESS LONG_ASM_CONST(0x0200000000000000)
#define CPU_FTR_POPCNTB LONG_ASM_CONST(0x0400000000000000)
#define CPU_FTR_POPCNTD LONG_ASM_CONST(0x0800000000000000)
#define CPU_FTR_ICSWX LONG_ASM_CONST(0x1000000000000000)
powerpc: POWER7 optimised copy_to_user/copy_from_user using VMX Implement a POWER7 optimised copy_to_user/copy_from_user using VMX. For large aligned copies this new loop is over 10% faster, and for large unaligned copies it is over 200% faster. If we take a fault we fall back to the old version, this keeps things relatively simple and easy to verify. On POWER7 unaligned stores rarely slow down - they only flush when a store crosses a 4KB page boundary. Furthermore this flush is handled completely in hardware and should be 20-30 cycles. Unaligned loads on the other hand flush much more often - whenever crossing a 128 byte cache line, or a 32 byte sector if either sector is an L1 miss. Considering this information we really want to get the loads aligned and not worry about the alignment of the stores. Microbenchmarks confirm that this approach is much faster than the current unaligned copy loop that uses shifts and rotates to ensure both loads and stores are aligned. We also want to try and do the stores in cacheline aligned, cacheline sized chunks. If the store queue is unable to merge an entire cacheline of stores then the L2 cache will have to do a read/modify/write. Even worse, we will serialise this with the stores in the next iteration of the copy loop since both iterations hit the same cacheline. Based on this, the new loop does the following things: 1 - 127 bytes Get the source 8 byte aligned and use 8 byte loads and stores. Pretty boring and similar to how the current loop works. 128 - 4095 bytes Get the source 8 byte aligned and use 8 byte loads and stores, 1 cacheline at a time. We aren't doing the stores in cacheline aligned chunks so we will potentially serialise once per cacheline. Even so it is much better than the loop we have today. 4096 - bytes If both source and destination have the same alignment get them both 16 byte aligned, then get the destination cacheline aligned. Do cacheline sized loads and stores using VMX. If source and destination do not have the same alignment, we get the destination cacheline aligned, and use permute to do aligned loads. In both cases the VMX loop should be optimal - we always do aligned loads and stores and are always doing stores in cacheline aligned, cacheline sized chunks. To be able to use VMX we must be careful about interrupts and sleeping. We don't use the VMX loop when in an interrupt (which should be rare anyway) and we wrap the VMX loop in disable/enable_pagefault and fall back to the existing copy_tofrom_user loop if we do need to sleep. The VMX breakpoint of 4096 bytes was chosen using this microbenchmark: http://ozlabs.org/~anton/junkcode/copy_to_user.c Since we are using VMX and there is a cost to saving and restoring the user VMX state there are two broad cases we need to benchmark: - Best case - userspace never uses VMX - Worst case - userspace always uses VMX In reality a userspace process will sit somewhere between these two extremes. Since we need to test both aligned and unaligned copies we end up with 4 combinations. The point at which the VMX loop begins to win is: 0% VMX aligned 2048 bytes unaligned 2048 bytes 100% VMX aligned 16384 bytes unaligned 8192 bytes Considering this is a microbenchmark, the data is hot in cache and the VMX loop has better store queue merging properties we set the breakpoint to 4096 bytes, a little below the unaligned breakpoints. Some future optimisations we can look at: - Looking at the perf data, a significant part of the cost when a task is always using VMX is the extra exception we take to restore the VMX state. As such we should do something similar to the x86 optimisation that restores FPU state for heavy users. ie: /* * If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the * chances of needing FPU soon are obviously high now */ preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; and /* * fpu_counter contains the number of consecutive context switches * that the FPU is used. If this is over a threshold, the lazy fpu * saving becomes unlazy to save the trap. This is an unsigned char * so that after 256 times the counter wraps and the behavior turns * lazy again; this to deal with bursty apps that only use FPU for * a short time */ - We could create a paca bit to mirror the VMX enabled MSR bit and check that first, avoiding multiple calls to calling enable_kernel_altivec. That should help with iovec based system calls like readv. - We could have two VMX breakpoints, one for when we know the user VMX state is loaded into the registers and one when it isn't. This could be a second bit in the paca so we can calculate the break points quickly. - One suggestion from Ben was to save and restore the VSX registers we use inline instead of using enable_kernel_altivec. [BenH: Fixed a problem with preempt and fixed build without CONFIG_ALTIVEC] Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2011-12-08 04:11:45 +08:00
#define CPU_FTR_VMX_COPY LONG_ASM_CONST(0x2000000000000000)
#ifndef __ASSEMBLY__
#define CPU_FTR_PPCAS_ARCH_V2 (CPU_FTR_NOEXECUTE | CPU_FTR_NODSISRALIGN)
#define MMU_FTR_PPCAS_ARCH_V2 (MMU_FTR_SLB | MMU_FTR_TLBIEL | \
MMU_FTR_16M_PAGE)
/* We only set the altivec features if the kernel was compiled with altivec
* support
*/
#ifdef CONFIG_ALTIVEC
#define CPU_FTR_ALTIVEC_COMP CPU_FTR_ALTIVEC
#define PPC_FEATURE_HAS_ALTIVEC_COMP PPC_FEATURE_HAS_ALTIVEC
#else
#define CPU_FTR_ALTIVEC_COMP 0
#define PPC_FEATURE_HAS_ALTIVEC_COMP 0
#endif
/* We only set the VSX features if the kernel was compiled with VSX
* support
*/
#ifdef CONFIG_VSX
#define CPU_FTR_VSX_COMP CPU_FTR_VSX
#define PPC_FEATURE_HAS_VSX_COMP PPC_FEATURE_HAS_VSX
#else
#define CPU_FTR_VSX_COMP 0
#define PPC_FEATURE_HAS_VSX_COMP 0
#endif
/* We only set the spe features if the kernel was compiled with spe
* support
*/
#ifdef CONFIG_SPE
#define CPU_FTR_SPE_COMP CPU_FTR_SPE
#define PPC_FEATURE_HAS_SPE_COMP PPC_FEATURE_HAS_SPE
#define PPC_FEATURE_HAS_EFP_SINGLE_COMP PPC_FEATURE_HAS_EFP_SINGLE
#define PPC_FEATURE_HAS_EFP_DOUBLE_COMP PPC_FEATURE_HAS_EFP_DOUBLE
#else
#define CPU_FTR_SPE_COMP 0
#define PPC_FEATURE_HAS_SPE_COMP 0
#define PPC_FEATURE_HAS_EFP_SINGLE_COMP 0
#define PPC_FEATURE_HAS_EFP_DOUBLE_COMP 0
#endif
/* We need to mark all pages as being coherent if we're SMP or we have a
* 74[45]x and an MPC107 host bridge. Also 83xx and PowerQUICC II
* require it for PCI "streaming/prefetch" to work properly.
* This is also required by 52xx family.
*/
#if defined(CONFIG_SMP) || defined(CONFIG_MPC10X_BRIDGE) \
|| defined(CONFIG_PPC_83xx) || defined(CONFIG_8260) \
|| defined(CONFIG_PPC_MPC52xx)
#define CPU_FTR_COMMON CPU_FTR_NEED_COHERENT
#else
#define CPU_FTR_COMMON 0
#endif
/* The powersave features NAP & DOZE seems to confuse BDI when
debugging. So if a BDI is used, disable theses
*/
#ifndef CONFIG_BDI_SWITCH
#define CPU_FTR_MAYBE_CAN_DOZE CPU_FTR_CAN_DOZE
#define CPU_FTR_MAYBE_CAN_NAP CPU_FTR_CAN_NAP
#else
#define CPU_FTR_MAYBE_CAN_DOZE 0
#define CPU_FTR_MAYBE_CAN_NAP 0
#endif
#define CLASSIC_PPC (!defined(CONFIG_8xx) && !defined(CONFIG_4xx) && \
!defined(CONFIG_POWER3) && !defined(CONFIG_POWER4) && \
!defined(CONFIG_BOOKE))
#define CPU_FTRS_PPC601 (CPU_FTR_COMMON | CPU_FTR_601 | \
CPU_FTR_COHERENT_ICACHE | CPU_FTR_UNIFIED_ID_CACHE)
#define CPU_FTRS_603 (CPU_FTR_COMMON | \
CPU_FTR_MAYBE_CAN_DOZE | CPU_FTR_USE_TB | \
CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_PPC_LE)
#define CPU_FTRS_604 (CPU_FTR_COMMON | \
CPU_FTR_USE_TB | CPU_FTR_PPC_LE)
#define CPU_FTRS_740_NOTAU (CPU_FTR_COMMON | \
CPU_FTR_MAYBE_CAN_DOZE | CPU_FTR_USE_TB | CPU_FTR_L2CR | \
CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_PPC_LE)
#define CPU_FTRS_740 (CPU_FTR_COMMON | \
CPU_FTR_MAYBE_CAN_DOZE | CPU_FTR_USE_TB | CPU_FTR_L2CR | \
CPU_FTR_TAU | CPU_FTR_MAYBE_CAN_NAP | \
CPU_FTR_PPC_LE)
#define CPU_FTRS_750 (CPU_FTR_COMMON | \
CPU_FTR_MAYBE_CAN_DOZE | CPU_FTR_USE_TB | CPU_FTR_L2CR | \
CPU_FTR_TAU | CPU_FTR_MAYBE_CAN_NAP | \
CPU_FTR_PPC_LE)
#define CPU_FTRS_750CL (CPU_FTRS_750)
#define CPU_FTRS_750FX1 (CPU_FTRS_750 | CPU_FTR_DUAL_PLL_750FX | CPU_FTR_NO_DPM)
#define CPU_FTRS_750FX2 (CPU_FTRS_750 | CPU_FTR_NO_DPM)
#define CPU_FTRS_750FX (CPU_FTRS_750 | CPU_FTR_DUAL_PLL_750FX)
#define CPU_FTRS_750GX (CPU_FTRS_750FX)
#define CPU_FTRS_7400_NOTAU (CPU_FTR_COMMON | \
CPU_FTR_MAYBE_CAN_DOZE | CPU_FTR_USE_TB | CPU_FTR_L2CR | \
CPU_FTR_ALTIVEC_COMP | \
CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_PPC_LE)
#define CPU_FTRS_7400 (CPU_FTR_COMMON | \
CPU_FTR_MAYBE_CAN_DOZE | CPU_FTR_USE_TB | CPU_FTR_L2CR | \
CPU_FTR_TAU | CPU_FTR_ALTIVEC_COMP | \
CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_PPC_LE)
#define CPU_FTRS_7450_20 (CPU_FTR_COMMON | \
CPU_FTR_USE_TB | CPU_FTR_L2CR | CPU_FTR_ALTIVEC_COMP | \
CPU_FTR_L3CR | CPU_FTR_SPEC7450 | \
CPU_FTR_NEED_COHERENT | CPU_FTR_PPC_LE | CPU_FTR_NEED_PAIRED_STWCX)
#define CPU_FTRS_7450_21 (CPU_FTR_COMMON | \
CPU_FTR_USE_TB | \
CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_L2CR | CPU_FTR_ALTIVEC_COMP | \
CPU_FTR_L3CR | CPU_FTR_SPEC7450 | \
CPU_FTR_NAP_DISABLE_L2_PR | CPU_FTR_L3_DISABLE_NAP | \
CPU_FTR_NEED_COHERENT | CPU_FTR_PPC_LE | CPU_FTR_NEED_PAIRED_STWCX)
#define CPU_FTRS_7450_23 (CPU_FTR_COMMON | \
CPU_FTR_USE_TB | CPU_FTR_NEED_PAIRED_STWCX | \
CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_L2CR | CPU_FTR_ALTIVEC_COMP | \
CPU_FTR_L3CR | CPU_FTR_SPEC7450 | \
CPU_FTR_NAP_DISABLE_L2_PR | CPU_FTR_NEED_COHERENT | CPU_FTR_PPC_LE)
#define CPU_FTRS_7455_1 (CPU_FTR_COMMON | \
CPU_FTR_USE_TB | CPU_FTR_NEED_PAIRED_STWCX | \
CPU_FTR_L2CR | CPU_FTR_ALTIVEC_COMP | CPU_FTR_L3CR | \
CPU_FTR_SPEC7450 | CPU_FTR_NEED_COHERENT | CPU_FTR_PPC_LE)
#define CPU_FTRS_7455_20 (CPU_FTR_COMMON | \
CPU_FTR_USE_TB | CPU_FTR_NEED_PAIRED_STWCX | \
CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_L2CR | CPU_FTR_ALTIVEC_COMP | \
CPU_FTR_L3CR | CPU_FTR_SPEC7450 | \
CPU_FTR_NAP_DISABLE_L2_PR | CPU_FTR_L3_DISABLE_NAP | \
CPU_FTR_NEED_COHERENT | CPU_FTR_PPC_LE)
#define CPU_FTRS_7455 (CPU_FTR_COMMON | \
CPU_FTR_USE_TB | \
CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_L2CR | CPU_FTR_ALTIVEC_COMP | \
CPU_FTR_L3CR | CPU_FTR_SPEC7450 | CPU_FTR_NAP_DISABLE_L2_PR | \
CPU_FTR_NEED_COHERENT | CPU_FTR_PPC_LE | CPU_FTR_NEED_PAIRED_STWCX)
#define CPU_FTRS_7447_10 (CPU_FTR_COMMON | \
CPU_FTR_USE_TB | \
CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_L2CR | CPU_FTR_ALTIVEC_COMP | \
CPU_FTR_L3CR | CPU_FTR_SPEC7450 | CPU_FTR_NAP_DISABLE_L2_PR | \
CPU_FTR_NEED_COHERENT | CPU_FTR_NO_BTIC | CPU_FTR_PPC_LE | \
CPU_FTR_NEED_PAIRED_STWCX)
#define CPU_FTRS_7447 (CPU_FTR_COMMON | \
CPU_FTR_USE_TB | \
CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_L2CR | CPU_FTR_ALTIVEC_COMP | \
CPU_FTR_L3CR | CPU_FTR_SPEC7450 | CPU_FTR_NAP_DISABLE_L2_PR | \
CPU_FTR_NEED_COHERENT | CPU_FTR_PPC_LE | CPU_FTR_NEED_PAIRED_STWCX)
#define CPU_FTRS_7447A (CPU_FTR_COMMON | \
CPU_FTR_USE_TB | \
CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_L2CR | CPU_FTR_ALTIVEC_COMP | \
CPU_FTR_SPEC7450 | CPU_FTR_NAP_DISABLE_L2_PR | \
CPU_FTR_NEED_COHERENT | CPU_FTR_PPC_LE | CPU_FTR_NEED_PAIRED_STWCX)
#define CPU_FTRS_7448 (CPU_FTR_COMMON | \
CPU_FTR_USE_TB | \
CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_L2CR | CPU_FTR_ALTIVEC_COMP | \
CPU_FTR_SPEC7450 | CPU_FTR_NAP_DISABLE_L2_PR | \
CPU_FTR_PPC_LE | CPU_FTR_NEED_PAIRED_STWCX)
#define CPU_FTRS_82XX (CPU_FTR_COMMON | \
CPU_FTR_MAYBE_CAN_DOZE | CPU_FTR_USE_TB)
#define CPU_FTRS_G2_LE (CPU_FTR_COMMON | CPU_FTR_MAYBE_CAN_DOZE | \
CPU_FTR_USE_TB | CPU_FTR_MAYBE_CAN_NAP)
#define CPU_FTRS_E300 (CPU_FTR_MAYBE_CAN_DOZE | \
CPU_FTR_USE_TB | CPU_FTR_MAYBE_CAN_NAP | \
CPU_FTR_COMMON)
#define CPU_FTRS_E300C2 (CPU_FTR_MAYBE_CAN_DOZE | \
CPU_FTR_USE_TB | CPU_FTR_MAYBE_CAN_NAP | \
CPU_FTR_COMMON | CPU_FTR_FPU_UNAVAILABLE)
#define CPU_FTRS_CLASSIC32 (CPU_FTR_COMMON | CPU_FTR_USE_TB)
#define CPU_FTRS_8XX (CPU_FTR_USE_TB)
#define CPU_FTRS_40X (CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE)
#define CPU_FTRS_44X (CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE)
#define CPU_FTRS_440x6 (CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE | \
CPU_FTR_INDEXED_DCR)
#define CPU_FTRS_47X (CPU_FTRS_440x6)
#define CPU_FTRS_E200 (CPU_FTR_USE_TB | CPU_FTR_SPE_COMP | \
CPU_FTR_NODSISRALIGN | CPU_FTR_COHERENT_ICACHE | \
CPU_FTR_UNIFIED_ID_CACHE | CPU_FTR_NOEXECUTE | \
CPU_FTR_DEBUG_LVL_EXC)
#define CPU_FTRS_E500 (CPU_FTR_MAYBE_CAN_DOZE | CPU_FTR_USE_TB | \
CPU_FTR_SPE_COMP | CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_NODSISRALIGN | \
CPU_FTR_NOEXECUTE)
#define CPU_FTRS_E500_2 (CPU_FTR_MAYBE_CAN_DOZE | CPU_FTR_USE_TB | \
CPU_FTR_SPE_COMP | CPU_FTR_MAYBE_CAN_NAP | \
CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE)
#define CPU_FTRS_E500MC (CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | \
CPU_FTR_L2CSR | CPU_FTR_LWSYNC | CPU_FTR_NOEXECUTE | \
CPU_FTR_DBELL | CPU_FTR_DEBUG_LVL_EXC | CPU_FTR_EMB_HV)
#define CPU_FTRS_E5500 (CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | \
CPU_FTR_L2CSR | CPU_FTR_LWSYNC | CPU_FTR_NOEXECUTE | \
CPU_FTR_DBELL | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
CPU_FTR_DEBUG_LVL_EXC | CPU_FTR_EMB_HV)
#define CPU_FTRS_E6500 (CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | \
CPU_FTR_L2CSR | CPU_FTR_LWSYNC | CPU_FTR_NOEXECUTE | \
CPU_FTR_DBELL | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
CPU_FTR_DEBUG_LVL_EXC | CPU_FTR_EMB_HV)
#define CPU_FTRS_GENERIC_32 (CPU_FTR_COMMON | CPU_FTR_NODSISRALIGN)
/* 64-bit CPUs */
#define CPU_FTRS_POWER3 (CPU_FTR_USE_TB | \
CPU_FTR_IABR | CPU_FTR_PPC_LE)
#define CPU_FTRS_RS64 (CPU_FTR_USE_TB | \
CPU_FTR_IABR | \
CPU_FTR_MMCRA | CPU_FTR_CTRL)
#define CPU_FTRS_POWER4 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
powerpc: Feature nop out reservation clear when stcx checks address The POWER architecture does not require stcx to check that it is operating on the same address as the larx. This means it is possible for an an exception handler to execute a larx, get a reservation, decide not to do the stcx and then return back with an active reservation. If the interrupted code was in the middle of a larx/stcx sequence the stcx could incorrectly succeed. All recent POWER CPUs check the address before letting the stcx succeed so we can create a CPU feature and nop it out. As Ben suggested, we can only do this in our syscall path because there is a remote possibility some kernel code gets interrupted by an exception that ends up operating on the same cacheline. Thanks to Paul Mackerras and Derek Williams for the idea. To test this I used a very simple null syscall (actually getppid) testcase at http://ozlabs.org/~anton/junkcode/null_syscall.c I tested against 2.6.35-git10 with the following changes against the pseries_defconfig: CONFIG_VIRT_CPU_ACCOUNTING=n CONFIG_AUDIT=n CONFIG_PPC_4K_PAGES=n CONFIG_PPC_64K_PAGES=y CONFIG_FORCE_MAX_ZONEORDER=9 CONFIG_PPC_SUBPAGE_PROT=n CONFIG_FUNCTION_TRACER=n CONFIG_FUNCTION_GRAPH_TRACER=n CONFIG_IRQSOFF_TRACER=n CONFIG_STACK_TRACER=n to remove the overhead of virtual CPU accounting, syscall auditing and the ftrace mcount tracers. 64kB pages were enabled to minimise TLB misses. POWER6: +8.2% POWER7: +7.0% Another suggestion was to use a larx to something in the L1 instead of a stcx. This was almost as fast as removing the larx on POWER6, but only 3.5% faster on POWER7. We can use this to speed up the reservation clear in our exception exit code. Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2010-08-11 09:40:27 +08:00
CPU_FTR_MMCRA | CPU_FTR_CP_USE_DCBTZ | \
CPU_FTR_STCX_CHECKS_ADDRESS)
#define CPU_FTRS_PPC970 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_201 | \
CPU_FTR_ALTIVEC_COMP | CPU_FTR_CAN_NAP | CPU_FTR_MMCRA | \
CPU_FTR_CP_USE_DCBTZ | CPU_FTR_STCX_CHECKS_ADDRESS | \
CPU_FTR_HVMODE)
#define CPU_FTRS_POWER5 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
CPU_FTR_MMCRA | CPU_FTR_SMT | \
CPU_FTR_COHERENT_ICACHE | CPU_FTR_PURR | \
CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB)
#define CPU_FTRS_POWER6 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
CPU_FTR_MMCRA | CPU_FTR_SMT | \
CPU_FTR_COHERENT_ICACHE | \
CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
powerpc: Feature nop out reservation clear when stcx checks address The POWER architecture does not require stcx to check that it is operating on the same address as the larx. This means it is possible for an an exception handler to execute a larx, get a reservation, decide not to do the stcx and then return back with an active reservation. If the interrupted code was in the middle of a larx/stcx sequence the stcx could incorrectly succeed. All recent POWER CPUs check the address before letting the stcx succeed so we can create a CPU feature and nop it out. As Ben suggested, we can only do this in our syscall path because there is a remote possibility some kernel code gets interrupted by an exception that ends up operating on the same cacheline. Thanks to Paul Mackerras and Derek Williams for the idea. To test this I used a very simple null syscall (actually getppid) testcase at http://ozlabs.org/~anton/junkcode/null_syscall.c I tested against 2.6.35-git10 with the following changes against the pseries_defconfig: CONFIG_VIRT_CPU_ACCOUNTING=n CONFIG_AUDIT=n CONFIG_PPC_4K_PAGES=n CONFIG_PPC_64K_PAGES=y CONFIG_FORCE_MAX_ZONEORDER=9 CONFIG_PPC_SUBPAGE_PROT=n CONFIG_FUNCTION_TRACER=n CONFIG_FUNCTION_GRAPH_TRACER=n CONFIG_IRQSOFF_TRACER=n CONFIG_STACK_TRACER=n to remove the overhead of virtual CPU accounting, syscall auditing and the ftrace mcount tracers. 64kB pages were enabled to minimise TLB misses. POWER6: +8.2% POWER7: +7.0% Another suggestion was to use a larx to something in the L1 instead of a stcx. This was almost as fast as removing the larx on POWER6, but only 3.5% faster on POWER7. We can use this to speed up the reservation clear in our exception exit code. Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2010-08-11 09:40:27 +08:00
CPU_FTR_DSCR | CPU_FTR_UNALIGNED_LD_STD | \
CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_CFAR)
#define CPU_FTRS_POWER7 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_206 |\
CPU_FTR_MMCRA | CPU_FTR_SMT | \
CPU_FTR_COHERENT_ICACHE | \
CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
powerpc: Feature nop out reservation clear when stcx checks address The POWER architecture does not require stcx to check that it is operating on the same address as the larx. This means it is possible for an an exception handler to execute a larx, get a reservation, decide not to do the stcx and then return back with an active reservation. If the interrupted code was in the middle of a larx/stcx sequence the stcx could incorrectly succeed. All recent POWER CPUs check the address before letting the stcx succeed so we can create a CPU feature and nop it out. As Ben suggested, we can only do this in our syscall path because there is a remote possibility some kernel code gets interrupted by an exception that ends up operating on the same cacheline. Thanks to Paul Mackerras and Derek Williams for the idea. To test this I used a very simple null syscall (actually getppid) testcase at http://ozlabs.org/~anton/junkcode/null_syscall.c I tested against 2.6.35-git10 with the following changes against the pseries_defconfig: CONFIG_VIRT_CPU_ACCOUNTING=n CONFIG_AUDIT=n CONFIG_PPC_4K_PAGES=n CONFIG_PPC_64K_PAGES=y CONFIG_FORCE_MAX_ZONEORDER=9 CONFIG_PPC_SUBPAGE_PROT=n CONFIG_FUNCTION_TRACER=n CONFIG_FUNCTION_GRAPH_TRACER=n CONFIG_IRQSOFF_TRACER=n CONFIG_STACK_TRACER=n to remove the overhead of virtual CPU accounting, syscall auditing and the ftrace mcount tracers. 64kB pages were enabled to minimise TLB misses. POWER6: +8.2% POWER7: +7.0% Another suggestion was to use a larx to something in the L1 instead of a stcx. This was almost as fast as removing the larx on POWER6, but only 3.5% faster on POWER7. We can use this to speed up the reservation clear in our exception exit code. Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2010-08-11 09:40:27 +08:00
CPU_FTR_DSCR | CPU_FTR_SAO | CPU_FTR_ASYM_SMT | \
CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
powerpc: POWER7 optimised copy_to_user/copy_from_user using VMX Implement a POWER7 optimised copy_to_user/copy_from_user using VMX. For large aligned copies this new loop is over 10% faster, and for large unaligned copies it is over 200% faster. If we take a fault we fall back to the old version, this keeps things relatively simple and easy to verify. On POWER7 unaligned stores rarely slow down - they only flush when a store crosses a 4KB page boundary. Furthermore this flush is handled completely in hardware and should be 20-30 cycles. Unaligned loads on the other hand flush much more often - whenever crossing a 128 byte cache line, or a 32 byte sector if either sector is an L1 miss. Considering this information we really want to get the loads aligned and not worry about the alignment of the stores. Microbenchmarks confirm that this approach is much faster than the current unaligned copy loop that uses shifts and rotates to ensure both loads and stores are aligned. We also want to try and do the stores in cacheline aligned, cacheline sized chunks. If the store queue is unable to merge an entire cacheline of stores then the L2 cache will have to do a read/modify/write. Even worse, we will serialise this with the stores in the next iteration of the copy loop since both iterations hit the same cacheline. Based on this, the new loop does the following things: 1 - 127 bytes Get the source 8 byte aligned and use 8 byte loads and stores. Pretty boring and similar to how the current loop works. 128 - 4095 bytes Get the source 8 byte aligned and use 8 byte loads and stores, 1 cacheline at a time. We aren't doing the stores in cacheline aligned chunks so we will potentially serialise once per cacheline. Even so it is much better than the loop we have today. 4096 - bytes If both source and destination have the same alignment get them both 16 byte aligned, then get the destination cacheline aligned. Do cacheline sized loads and stores using VMX. If source and destination do not have the same alignment, we get the destination cacheline aligned, and use permute to do aligned loads. In both cases the VMX loop should be optimal - we always do aligned loads and stores and are always doing stores in cacheline aligned, cacheline sized chunks. To be able to use VMX we must be careful about interrupts and sleeping. We don't use the VMX loop when in an interrupt (which should be rare anyway) and we wrap the VMX loop in disable/enable_pagefault and fall back to the existing copy_tofrom_user loop if we do need to sleep. The VMX breakpoint of 4096 bytes was chosen using this microbenchmark: http://ozlabs.org/~anton/junkcode/copy_to_user.c Since we are using VMX and there is a cost to saving and restoring the user VMX state there are two broad cases we need to benchmark: - Best case - userspace never uses VMX - Worst case - userspace always uses VMX In reality a userspace process will sit somewhere between these two extremes. Since we need to test both aligned and unaligned copies we end up with 4 combinations. The point at which the VMX loop begins to win is: 0% VMX aligned 2048 bytes unaligned 2048 bytes 100% VMX aligned 16384 bytes unaligned 8192 bytes Considering this is a microbenchmark, the data is hot in cache and the VMX loop has better store queue merging properties we set the breakpoint to 4096 bytes, a little below the unaligned breakpoints. Some future optimisations we can look at: - Looking at the perf data, a significant part of the cost when a task is always using VMX is the extra exception we take to restore the VMX state. As such we should do something similar to the x86 optimisation that restores FPU state for heavy users. ie: /* * If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the * chances of needing FPU soon are obviously high now */ preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; and /* * fpu_counter contains the number of consecutive context switches * that the FPU is used. If this is over a threshold, the lazy fpu * saving becomes unlazy to save the trap. This is an unsigned char * so that after 256 times the counter wraps and the behavior turns * lazy again; this to deal with bursty apps that only use FPU for * a short time */ - We could create a paca bit to mirror the VMX enabled MSR bit and check that first, avoiding multiple calls to calling enable_kernel_altivec. That should help with iovec based system calls like readv. - We could have two VMX breakpoints, one for when we know the user VMX state is loaded into the registers and one when it isn't. This could be a second bit in the paca so we can calculate the break points quickly. - One suggestion from Ben was to save and restore the VSX registers we use inline instead of using enable_kernel_altivec. [BenH: Fixed a problem with preempt and fixed build without CONFIG_ALTIVEC] Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2011-12-08 04:11:45 +08:00
CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY)
#define CPU_FTRS_CELL (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
CPU_FTR_PAUSE_ZERO | CPU_FTR_CELL_TB_BUG | CPU_FTR_CP_USE_DCBTZ | \
CPU_FTR_UNALIGNED_LD_STD)
#define CPU_FTRS_PA6T (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_ALTIVEC_COMP | \
CPU_FTR_PURR | CPU_FTR_REAL_LE)
#define CPU_FTRS_COMPATIBLE (CPU_FTR_USE_TB | CPU_FTR_PPCAS_ARCH_V2)
#define CPU_FTRS_A2 (CPU_FTR_USE_TB | CPU_FTR_SMT | CPU_FTR_DBELL | \
CPU_FTR_NOEXECUTE | CPU_FTR_NODSISRALIGN | CPU_FTR_ICSWX)
#ifdef __powerpc64__
#ifdef CONFIG_PPC_BOOK3E
#define CPU_FTRS_POSSIBLE (CPU_FTRS_E6500 | CPU_FTRS_E5500 | CPU_FTRS_A2)
#else
#define CPU_FTRS_POSSIBLE \
(CPU_FTRS_POWER3 | CPU_FTRS_RS64 | CPU_FTRS_POWER4 | \
CPU_FTRS_PPC970 | CPU_FTRS_POWER5 | CPU_FTRS_POWER6 | \
CPU_FTRS_POWER7 | CPU_FTRS_CELL | CPU_FTRS_PA6T | \
CPU_FTR_VSX)
#endif
#else
enum {
CPU_FTRS_POSSIBLE =
#if CLASSIC_PPC
CPU_FTRS_PPC601 | CPU_FTRS_603 | CPU_FTRS_604 | CPU_FTRS_740_NOTAU |
CPU_FTRS_740 | CPU_FTRS_750 | CPU_FTRS_750FX1 |
CPU_FTRS_750FX2 | CPU_FTRS_750FX | CPU_FTRS_750GX |
CPU_FTRS_7400_NOTAU | CPU_FTRS_7400 | CPU_FTRS_7450_20 |
CPU_FTRS_7450_21 | CPU_FTRS_7450_23 | CPU_FTRS_7455_1 |
CPU_FTRS_7455_20 | CPU_FTRS_7455 | CPU_FTRS_7447_10 |
CPU_FTRS_7447 | CPU_FTRS_7447A | CPU_FTRS_82XX |
CPU_FTRS_G2_LE | CPU_FTRS_E300 | CPU_FTRS_E300C2 |
CPU_FTRS_CLASSIC32 |
#else
CPU_FTRS_GENERIC_32 |
#endif
#ifdef CONFIG_8xx
CPU_FTRS_8XX |
#endif
#ifdef CONFIG_40x
CPU_FTRS_40X |
#endif
#ifdef CONFIG_44x
CPU_FTRS_44X | CPU_FTRS_440x6 |
#endif
#ifdef CONFIG_PPC_47x
CPU_FTRS_47X | CPU_FTR_476_DD2 |
#endif
#ifdef CONFIG_E200
CPU_FTRS_E200 |
#endif
#ifdef CONFIG_E500
CPU_FTRS_E500 | CPU_FTRS_E500_2 |
#endif
#ifdef CONFIG_PPC_E500MC
CPU_FTRS_E500MC | CPU_FTRS_E5500 | CPU_FTRS_E6500 |
#endif
0,
};
#endif /* __powerpc64__ */
#ifdef __powerpc64__
#ifdef CONFIG_PPC_BOOK3E
#define CPU_FTRS_ALWAYS (CPU_FTRS_E6500 & CPU_FTRS_E5500 & CPU_FTRS_A2)
#else
#define CPU_FTRS_ALWAYS \
(CPU_FTRS_POWER3 & CPU_FTRS_RS64 & CPU_FTRS_POWER4 & \
CPU_FTRS_PPC970 & CPU_FTRS_POWER5 & CPU_FTRS_POWER6 & \
CPU_FTRS_POWER7 & CPU_FTRS_CELL & CPU_FTRS_PA6T & CPU_FTRS_POSSIBLE)
#endif
#else
enum {
CPU_FTRS_ALWAYS =
#if CLASSIC_PPC
CPU_FTRS_PPC601 & CPU_FTRS_603 & CPU_FTRS_604 & CPU_FTRS_740_NOTAU &
CPU_FTRS_740 & CPU_FTRS_750 & CPU_FTRS_750FX1 &
CPU_FTRS_750FX2 & CPU_FTRS_750FX & CPU_FTRS_750GX &
CPU_FTRS_7400_NOTAU & CPU_FTRS_7400 & CPU_FTRS_7450_20 &
CPU_FTRS_7450_21 & CPU_FTRS_7450_23 & CPU_FTRS_7455_1 &
CPU_FTRS_7455_20 & CPU_FTRS_7455 & CPU_FTRS_7447_10 &
CPU_FTRS_7447 & CPU_FTRS_7447A & CPU_FTRS_82XX &
CPU_FTRS_G2_LE & CPU_FTRS_E300 & CPU_FTRS_E300C2 &
CPU_FTRS_CLASSIC32 &
#else
CPU_FTRS_GENERIC_32 &
#endif
#ifdef CONFIG_8xx
CPU_FTRS_8XX &
#endif
#ifdef CONFIG_40x
CPU_FTRS_40X &
#endif
#ifdef CONFIG_44x
CPU_FTRS_44X & CPU_FTRS_440x6 &
#endif
#ifdef CONFIG_E200
CPU_FTRS_E200 &
#endif
#ifdef CONFIG_E500
CPU_FTRS_E500 & CPU_FTRS_E500_2 &
#endif
#ifdef CONFIG_PPC_E500MC
CPU_FTRS_E500MC & CPU_FTRS_E5500 & CPU_FTRS_E6500 &
#endif
~CPU_FTR_EMB_HV & /* can be removed at runtime */
CPU_FTRS_POSSIBLE,
};
#endif /* __powerpc64__ */
static inline int cpu_has_feature(unsigned long feature)
{
return (CPU_FTRS_ALWAYS & feature) ||
(CPU_FTRS_POSSIBLE
& cur_cpu_spec->cpu_features
& feature);
}
#ifdef CONFIG_HAVE_HW_BREAKPOINT
#define HBP_NUM 1
#endif /* CONFIG_HAVE_HW_BREAKPOINT */
#endif /* !__ASSEMBLY__ */
#endif /* __KERNEL__ */
#endif /* __ASM_POWERPC_CPUTABLE_H */