Merge branch 'mm-pkeys-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 protection key support from Ingo Molnar:
 "This tree adds support for a new memory protection hardware feature
  that is available in upcoming Intel CPUs: 'protection keys' (pkeys).

  There's a background article at LWN.net:

      https://lwn.net/Articles/643797/

  The gist is that protection keys allow the encoding of
  user-controllable permission masks in the pte.  So instead of having a
  fixed protection mask in the pte (which needs a system call to change
  and works on a per page basis), the user can map a (handful of)
  protection mask variants and can change the masks at runtime relatively
  cheaply, without having to change every single page in the affected
  virtual memory range.

  This allows the dynamic switching of the protection bits of large
  amounts of virtual memory, via user-space instructions.  It also
  allows more precise control of MMU permission bits: for example the
  executable bit is separate from the read bit (see more about that
  below).

  This tree adds the MM infrastructure and low level x86 glue needed for
  that, plus it adds a high level API to make use of protection keys -
  if a user-space application calls:

        mmap(..., PROT_EXEC);

  or

        mprotect(ptr, sz, PROT_EXEC);

  (note PROT_EXEC-only, without PROT_READ/WRITE), the kernel will notice
  this special case, and will set a special protection key on this
  memory range.  It also sets the appropriate bits in the Protection
  Keys User Rights (PKRU) register so that the memory becomes unreadable
  and unwritable.

  So using protection keys the kernel is able to implement 'true'
  PROT_EXEC on x86 CPUs: without protection keys PROT_EXEC implies
  PROT_READ as well.  Unreadable executable mappings have security
  advantages: they cannot be read via information leaks to figure out
  ASLR details, nor can they be scanned for ROP gadgets - and they
  cannot be used by exploits for data purposes either.

  We know about no user-space code that relies on pure PROT_EXEC
  mappings today, but binary loaders could start making use of this new
  feature to map binaries and libraries in a more secure fashion.

  There is other pending pkeys work that offers more high level system
  call APIs to manage protection keys - but those are not part of this
  pull request.

  Right now there's a Kconfig that controls this feature
  (CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) that is default enabled
  (like most x86 CPU feature enablement code that has no runtime
  overhead), but it's not user-configurable at the moment.  If there's
  any serious problem with this then we can make it configurable and/or
  flip the default"

* 'mm-pkeys-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (38 commits)
  x86/mm/pkeys: Fix mismerge of protection keys CPUID bits
  mm/pkeys: Fix siginfo ABI breakage caused by new u64 field
  x86/mm/pkeys: Fix access_error() denial of writes to write-only VMA
  mm/core, x86/mm/pkeys: Add execute-only protection keys support
  x86/mm/pkeys: Create an x86 arch_calc_vm_prot_bits() for VMA flags
  x86/mm/pkeys: Allow kernel to modify user pkey rights register
  x86/fpu: Allow setting of XSAVE state
  x86/mm: Factor out LDT init from context init
  mm/core, x86/mm/pkeys: Add arch_validate_pkey()
  mm/core, arch, powerpc: Pass a protection key in to calc_vm_flag_bits()
  x86/mm/pkeys: Actually enable Memory Protection Keys in the CPU
  x86/mm/pkeys: Add Kconfig prompt to existing config option
  x86/mm/pkeys: Dump pkey from VMA in /proc/pid/smaps
  x86/mm/pkeys: Dump PKRU with other kernel registers
  mm/core, x86/mm/pkeys: Differentiate instruction fetches
  x86/mm/pkeys: Optimize fault handling in access_error()
  mm/core: Do not enforce PKEY permissions on remote mm access
  um, pkeys: Add UML arch_*_access_permitted() methods
  mm/gup, x86/mm/pkeys: Check VMAs and PTEs for protection keys
  x86/mm/gup: Simplify get_user_pages() PTE bit handling
  ...
This commit is contained in:
Linus Torvalds 2016-03-20 19:08:56 -07:00
commit 643ad15d47
85 changed files with 1406 additions and 241 deletions

View File

@ -987,6 +987,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
See Documentation/x86/intel_mpx.txt for more
information about the feature.
nopku [X86] Disable Memory Protection Keys CPU feature found
in some Intel CPUs.
eagerfpu= [X86]
on enable eager fpu restore
off disable eager fpu restore

View File

@ -2719,9 +2719,7 @@ static int cryptocop_ioctl_process(struct inode *inode, struct file *filp, unsig
/* Acquire the mm page semaphore. */
down_read(&current->mm->mmap_sem);
err = get_user_pages(current,
current->mm,
(unsigned long int)(oper.indata + prev_ix),
err = get_user_pages((unsigned long int)(oper.indata + prev_ix),
noinpages,
0, /* read access only for in data */
0, /* no force */
@ -2736,9 +2734,7 @@ static int cryptocop_ioctl_process(struct inode *inode, struct file *filp, unsig
}
noinpages = err;
if (oper.do_cipher){
err = get_user_pages(current,
current->mm,
(unsigned long int)oper.cipher_outdata,
err = get_user_pages((unsigned long int)oper.cipher_outdata,
nooutpages,
1, /* write access for out data */
0, /* no force */

View File

@ -63,10 +63,15 @@ typedef struct siginfo {
unsigned int _flags; /* see below */
unsigned long _isr; /* isr */
short _addr_lsb; /* lsb of faulting address */
struct {
void __user *_lower;
void __user *_upper;
} _addr_bnd;
union {
/* used when si_code=SEGV_BNDERR */
struct {
void __user *_lower;
void __user *_upper;
} _addr_bnd;
/* used when si_code=SEGV_PKUERR */
__u32 _pkey;
};
} _sigfault;
/* SIGPOLL */

View File

@ -142,8 +142,7 @@ store_virtual_to_phys(struct device *dev, struct device_attribute *attr,
u64 virt_addr=simple_strtoull(buf, NULL, 16);
int ret;
ret = get_user_pages(current, current->mm, virt_addr,
1, VM_READ, 0, NULL, NULL);
ret = get_user_pages(virt_addr, 1, VM_READ, 0, NULL, NULL);
if (ret<=0) {
#ifdef ERR_INJ_DEBUG
printk("Virtual address %lx is not existing.\n",virt_addr);

View File

@ -86,10 +86,15 @@ typedef struct siginfo {
int _trapno; /* TRAP # which caused the signal */
#endif
short _addr_lsb;
struct {
void __user *_lower;
void __user *_upper;
} _addr_bnd;
union {
/* used when si_code=SEGV_BNDERR */
struct {
void __user *_lower;
void __user *_upper;
} _addr_bnd;
/* used when si_code=SEGV_PKUERR */
__u32 _pkey;
};
} _sigfault;
/* SIGPOLL, SIGXFSZ (To do ...) */

View File

@ -286,8 +286,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
start += nr << PAGE_SHIFT;
pages += nr;
ret = get_user_pages_unlocked(current, mm, start,
(end - start) >> PAGE_SHIFT,
ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT,
write, 0, pages);
/* Have to be a bit careful with return values */

View File

@ -18,11 +18,12 @@
* This file is included by linux/mman.h, so we can't use calc_vm_prot_bits()
* here. How important is the optimization?
*/
static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot)
static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
unsigned long pkey)
{
return (prot & PROT_SAO) ? VM_SAO : 0;
}
#define arch_calc_vm_prot_bits(prot) arch_calc_vm_prot_bits(prot)
#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags)
{

View File

@ -148,5 +148,17 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm,
{
}
/*
 * Arch hook consulted before an access to @vma is allowed.  This
 * implementation applies no extra restrictions: @write, @execute and
 * @foreign are ignored and every access is permitted.
 */
static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
		bool write, bool execute, bool foreign)
{
	return true;
}
/*
 * Arch hook consulted before an access through @pte is allowed.
 * No extra restrictions here: @write is ignored and the access is
 * always permitted.
 */
static inline bool arch_pte_access_permitted(pte_t pte, bool write)
{
	return true;
}
#endif /* __KERNEL__ */
#endif /* __ASM_POWERPC_MMU_CONTEXT_H */

View File

@ -136,4 +136,16 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm,
{
}
/*
 * Arch hook consulted before an access to @vma is allowed.  Nothing
 * is restricted here, so the answer is "yes" regardless of @write,
 * @execute or @foreign.
 */
static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
		bool write, bool execute, bool foreign)
{
	return true;
}
/*
 * Arch hook consulted before an access through @pte is allowed.
 * Nothing is restricted here; @write does not influence the result.
 */
static inline bool arch_pte_access_permitted(pte_t pte, bool write)
{
	return true;
}
#endif /* __S390_MMU_CONTEXT_H */

View File

@ -210,7 +210,6 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
struct mm_struct *mm = current->mm;
int nr, ret;
might_sleep();
@ -222,8 +221,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
/* Try to get the remaining pages with get_user_pages */
start += nr << PAGE_SHIFT;
pages += nr;
ret = get_user_pages_unlocked(current, mm, start,
nr_pages - nr, write, 0, pages);
ret = get_user_pages_unlocked(start, nr_pages - nr, write, 0, pages);
/* Have to be a bit careful with return values */
if (nr > 0)
ret = (ret < 0) ? nr : ret + nr;

View File

@ -257,7 +257,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
start += nr << PAGE_SHIFT;
pages += nr;
ret = get_user_pages_unlocked(current, mm, start,
ret = get_user_pages_unlocked(start,
(end - start) >> PAGE_SHIFT, write, 0, pages);
/* Have to be a bit careful with return values */

View File

@ -237,7 +237,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
start += nr << PAGE_SHIFT;
pages += nr;
ret = get_user_pages_unlocked(current, mm, start,
ret = get_user_pages_unlocked(start,
(end - start) >> PAGE_SHIFT, write, 0, pages);
/* Have to be a bit careful with return values */

View File

@ -27,6 +27,20 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm,
struct vm_area_struct *vma)
{
}
/*
 * Arch hook consulted before an access to @vma is allowed.  All
 * accesses are permitted; @write, @execute and @foreign are unused.
 */
static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
		bool write, bool execute, bool foreign)
{
	return true;
}
/*
 * Arch hook consulted before an access through @pte is allowed.
 * All accesses are permitted; @write is unused.
 */
static inline bool arch_pte_access_permitted(pte_t pte, bool write)
{
	return true;
}
/*
* end asm-generic/mm_hooks.h functions
*/

View File

@ -97,4 +97,16 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm,
{
}
/*
 * Arch hook consulted before an access to @vma is allowed.
 *
 * The parameter list must match the other architectures' definitions
 * (vma, write, execute, foreign); a version without @execute cannot be
 * called with the four arguments the other definitions accept.
 *
 * No restrictions are applied here: every access is permitted.
 */
static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
		bool write, bool execute, bool foreign)
{
	/* by default, allow everything */
	return true;
}
/*
 * Arch hook consulted before an access through @pte is allowed.
 * Unconditionally permits the access; @write is not examined.
 */
static inline bool arch_pte_access_permitted(pte_t pte, bool write)
{
	return true;
}
#endif

View File

@ -156,6 +156,8 @@ config X86
select X86_DEV_DMA_OPS if X86_64
select X86_FEATURE_NAMES if PROC_FS
select HAVE_STACK_VALIDATION if X86_64
select ARCH_USES_HIGH_VMA_FLAGS if X86_INTEL_MEMORY_PROTECTION_KEYS
select ARCH_HAS_PKEYS if X86_INTEL_MEMORY_PROTECTION_KEYS
config INSTRUCTION_DECODER
def_bool y
@ -1719,6 +1721,20 @@ config X86_INTEL_MPX
If unsure, say N.
config X86_INTEL_MEMORY_PROTECTION_KEYS
prompt "Intel Memory Protection Keys"
def_bool y
# Note: only available in 64-bit mode
depends on CPU_SUP_INTEL && X86_64
---help---
Memory Protection Keys provides a mechanism for enforcing
page-based protections, but without requiring modification of the
page tables when an application changes protection domains.
For details, see Documentation/x86/protection-keys.txt
If unsure, say y.
config EFI
bool "EFI runtime service support"
depends on ACPI

View File

@ -26,6 +26,7 @@ enum cpuid_leafs
CPUID_8000_0008_EBX,
CPUID_6_EAX,
CPUID_8000_000A_EDX,
CPUID_7_ECX,
};
#ifdef CONFIG_X86_FEATURE_NAMES
@ -48,28 +49,42 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
test_bit(bit, (unsigned long *)((c)->x86_capability))
#define REQUIRED_MASK_BIT_SET(bit) \
( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \
(((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \
(((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \
(((bit)>>5)==3 && (1UL<<((bit)&31) & REQUIRED_MASK3)) || \
(((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \
(((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \
(((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
(((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \
(((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \
(((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) )
( (((bit)>>5)==0  && (1UL<<((bit)&31) & REQUIRED_MASK0 )) || \
  (((bit)>>5)==1  && (1UL<<((bit)&31) & REQUIRED_MASK1 )) || \
  (((bit)>>5)==2  && (1UL<<((bit)&31) & REQUIRED_MASK2 )) || \
  (((bit)>>5)==3  && (1UL<<((bit)&31) & REQUIRED_MASK3 )) || \
  (((bit)>>5)==4  && (1UL<<((bit)&31) & REQUIRED_MASK4 )) || \
  (((bit)>>5)==5  && (1UL<<((bit)&31) & REQUIRED_MASK5 )) || \
  (((bit)>>5)==6  && (1UL<<((bit)&31) & REQUIRED_MASK6 )) || \
  (((bit)>>5)==7  && (1UL<<((bit)&31) & REQUIRED_MASK7 )) || \
  (((bit)>>5)==8  && (1UL<<((bit)&31) & REQUIRED_MASK8 )) || \
  (((bit)>>5)==9  && (1UL<<((bit)&31) & REQUIRED_MASK9 )) || \
  (((bit)>>5)==10 && (1UL<<((bit)&31) & REQUIRED_MASK10)) || \
  (((bit)>>5)==11 && (1UL<<((bit)&31) & REQUIRED_MASK11)) || \
  (((bit)>>5)==12 && (1UL<<((bit)&31) & REQUIRED_MASK12)) || \
  (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK13)) || \
  (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK14)) || \
  (((bit)>>5)==15 && (1UL<<((bit)&31) & REQUIRED_MASK15)) || \
  (((bit)>>5)==16 && (1UL<<((bit)&31) & REQUIRED_MASK16)) )
#define DISABLED_MASK_BIT_SET(bit) \
( (((bit)>>5)==0 && (1UL<<((bit)&31) & DISABLED_MASK0)) || \
(((bit)>>5)==1 && (1UL<<((bit)&31) & DISABLED_MASK1)) || \
(((bit)>>5)==2 && (1UL<<((bit)&31) & DISABLED_MASK2)) || \
(((bit)>>5)==3 && (1UL<<((bit)&31) & DISABLED_MASK3)) || \
(((bit)>>5)==4 && (1UL<<((bit)&31) & DISABLED_MASK4)) || \
(((bit)>>5)==5 && (1UL<<((bit)&31) & DISABLED_MASK5)) || \
(((bit)>>5)==6 && (1UL<<((bit)&31) & DISABLED_MASK6)) || \
(((bit)>>5)==7 && (1UL<<((bit)&31) & DISABLED_MASK7)) || \
(((bit)>>5)==8 && (1UL<<((bit)&31) & DISABLED_MASK8)) || \
(((bit)>>5)==9 && (1UL<<((bit)&31) & DISABLED_MASK9)) )
( (((bit)>>5)==0  && (1UL<<((bit)&31) & DISABLED_MASK0 )) || \
  (((bit)>>5)==1  && (1UL<<((bit)&31) & DISABLED_MASK1 )) || \
  (((bit)>>5)==2  && (1UL<<((bit)&31) & DISABLED_MASK2 )) || \
  (((bit)>>5)==3  && (1UL<<((bit)&31) & DISABLED_MASK3 )) || \
  (((bit)>>5)==4  && (1UL<<((bit)&31) & DISABLED_MASK4 )) || \
  (((bit)>>5)==5  && (1UL<<((bit)&31) & DISABLED_MASK5 )) || \
  (((bit)>>5)==6  && (1UL<<((bit)&31) & DISABLED_MASK6 )) || \
  (((bit)>>5)==7  && (1UL<<((bit)&31) & DISABLED_MASK7 )) || \
  (((bit)>>5)==8  && (1UL<<((bit)&31) & DISABLED_MASK8 )) || \
  (((bit)>>5)==9  && (1UL<<((bit)&31) & DISABLED_MASK9 )) || \
  (((bit)>>5)==10 && (1UL<<((bit)&31) & DISABLED_MASK10)) || \
  (((bit)>>5)==11 && (1UL<<((bit)&31) & DISABLED_MASK11)) || \
  (((bit)>>5)==12 && (1UL<<((bit)&31) & DISABLED_MASK12)) || \
  (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK13)) || \
  (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK14)) || \
  (((bit)>>5)==15 && (1UL<<((bit)&31) & DISABLED_MASK15)) || \
  (((bit)>>5)==16 && (1UL<<((bit)&31) & DISABLED_MASK16)) )
#define cpu_has(c, bit) \
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \

View File

@ -12,7 +12,7 @@
/*
* Defines x86 CPU feature bits
*/
#define NCAPINTS 16 /* N 32-bit words worth of info */
#define NCAPINTS 17 /* N 32-bit words worth of info */
#define NBUGINTS 1 /* N 32-bit bug flags */
/*
@ -274,6 +274,10 @@
#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
/*
* BUG word(s)
*/

View File

@ -28,6 +28,14 @@
# define DISABLE_CENTAUR_MCR 0
#endif /* CONFIG_X86_64 */
/*
 * DISABLE_PKU / DISABLE_OSPKE are OR-ed into DISABLED_MASK16, the
 * 32-bit mask for capability word 16.  A set bit marks the feature as
 * compiled out, so the bits must be set only when protection keys are
 * NOT configured in.  The feature number is reduced to its position
 * within the word (& 31): the raw number (16*32 + n) is not a valid
 * shift count for a 32-bit mask.
 */
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
# define DISABLE_PKU		0
# define DISABLE_OSPKE		0
#else
# define DISABLE_PKU		(1 << (X86_FEATURE_PKU & 31))
# define DISABLE_OSPKE		(1 << (X86_FEATURE_OSPKE & 31))
#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
/*
* Make sure to add features to the correct mask
*/
@ -41,5 +49,12 @@
#define DISABLED_MASK7 0
#define DISABLED_MASK8 0
#define DISABLED_MASK9 (DISABLE_MPX)
#define DISABLED_MASK10 0
#define DISABLED_MASK11 0
#define DISABLED_MASK12 0
#define DISABLED_MASK13 0
#define DISABLED_MASK14 0
#define DISABLED_MASK15 0
#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE)
#endif /* _ASM_X86_DISABLED_FEATURES_H */

View File

@ -25,6 +25,8 @@
extern void fpu__activate_curr(struct fpu *fpu);
extern void fpu__activate_fpstate_read(struct fpu *fpu);
extern void fpu__activate_fpstate_write(struct fpu *fpu);
extern void fpu__current_fpstate_write_begin(void);
extern void fpu__current_fpstate_write_end(void);
extern void fpu__save(struct fpu *fpu);
extern void fpu__restore(struct fpu *fpu);
extern int fpu__restore_sig(void __user *buf, int ia32_frame);

View File

@ -108,6 +108,8 @@ enum xfeature {
XFEATURE_OPMASK,
XFEATURE_ZMM_Hi256,
XFEATURE_Hi16_ZMM,
XFEATURE_PT_UNIMPLEMENTED_SO_FAR,
XFEATURE_PKRU,
XFEATURE_MAX,
};
@ -120,6 +122,7 @@ enum xfeature {
#define XFEATURE_MASK_OPMASK (1 << XFEATURE_OPMASK)
#define XFEATURE_MASK_ZMM_Hi256 (1 << XFEATURE_ZMM_Hi256)
#define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM)
#define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU)
#define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE)
#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \
@ -212,6 +215,15 @@ struct avx_512_hi16_state {
struct reg_512_bit hi16_zmm[16];
} __packed;
/*
* State component 9: 32-bit PKRU register. The state is
* 8 bytes long but only 4 bytes is used currently.
*/
/* In-memory image of the PKRU xsave state component. */
struct pkru_state {
/* the 32-bit PKRU register value */
u32 pkru;
/* remaining 4 bytes of the 8-byte component, currently unused */
u32 pad;
} __packed;
struct xstate_header {
u64 xfeatures;
u64 xcomp_bv;

View File

@ -24,7 +24,8 @@
XFEATURE_MASK_YMM | \
XFEATURE_MASK_OPMASK | \
XFEATURE_MASK_ZMM_Hi256 | \
XFEATURE_MASK_Hi16_ZMM)
XFEATURE_MASK_Hi16_ZMM | \
XFEATURE_MASK_PKRU)
/* Supported features which require eager state saving */
#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)

View File

@ -52,15 +52,15 @@ struct ldt_struct {
/*
* Used for LDT copy/destruction.
*/
int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
void destroy_context(struct mm_struct *mm);
int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm);
void destroy_context_ldt(struct mm_struct *mm);
#else /* CONFIG_MODIFY_LDT_SYSCALL */
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
static inline int init_new_context_ldt(struct task_struct *tsk,
struct mm_struct *mm)
{
return 0;
}
static inline void destroy_context(struct mm_struct *mm) {}
static inline void destroy_context_ldt(struct mm_struct *mm) {}
#endif
static inline void load_mm_ldt(struct mm_struct *mm)
@ -104,6 +104,17 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
#endif
}
/*
 * Set up the arch-specific context of a new mm.
 *
 * The only work done here is the LDT setup, which returns an int
 * status.  Propagate that status to the caller instead of discarding
 * it and always reporting success.
 *
 * Returns the result of init_new_context_ldt().
 */
static inline int init_new_context(struct task_struct *tsk,
				   struct mm_struct *mm)
{
	return init_new_context_ldt(tsk, mm);
}
/*
 * Tear down the arch-specific context of @mm.  Currently this is
 * just the LDT teardown.
 */
static inline void destroy_context(struct mm_struct *mm)
{
	destroy_context_ldt(mm);
}
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
@ -275,4 +286,68 @@ static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
mpx_notify_unmap(mm, vma, start, end);
}
/*
 * Return the protection key stored in the VM_PKEY_BIT* bits of
 * @vma's vm_flags, or 0 when protection keys are compiled out.
 */
static inline int vma_pkey(struct vm_area_struct *vma)
{
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
	const unsigned long pkey_bits = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
					VM_PKEY_BIT2 | VM_PKEY_BIT3;
	u16 key = (vma->vm_flags & pkey_bits) >> VM_PKEY_SHIFT;

	return key;
#else
	return 0;
#endif
}
/*
 * Check the current PKRU value against @pkey: reads must be allowed,
 * and when @write is set writes must be allowed as well.
 */
static inline bool __pkru_allows_pkey(u16 pkey, bool write)
{
	const u32 rights = read_pkru();

	return __pkru_allows_read(rights, pkey) &&
	       (!write || __pkru_allows_write(rights, pkey));
}
/*
* We only want to enforce protection keys on the current process
* because we effectively have no access to PKRU for other
* processes or any way to tell *which* PKRU in a threaded
* process we could use.
*
* So do not enforce things if the VMA is not from the current
* mm, or if we are in a kernel thread.
*/
/*
 * A VMA is "foreign" when the caller has no mm of its own
 * (current->mm is NULL) or when the VMA belongs to a different mm
 * than the caller's.  PKRU has no relevance for such VMAs and should
 * not be enforced on them.
 */
static inline bool vma_is_foreign(struct vm_area_struct *vma)
{
	return !current->mm || current->mm != vma->vm_mm;
}
/*
 * Decide whether protection keys permit an access to @vma.
 *
 * Instruction fetches are never affected by pkeys, and accesses that
 * are flagged @foreign or that target a VMA not belonging to this
 * process are not checked.  Everything else is checked against PKRU
 * using the VMA's protection key.
 */
static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
		bool write, bool execute, bool foreign)
{
	if (execute || foreign || vma_is_foreign(vma))
		return true;

	return __pkru_allows_pkey(vma_pkey(vma), write);
}
/*
 * Decide whether PKRU permits an access through @pte, using the
 * protection key encoded in the pte's flags.
 */
static inline bool arch_pte_access_permitted(pte_t pte, bool write)
{
	u16 key = pte_flags_pkey(pte_flags(pte));

	return __pkru_allows_pkey(key, write);
}
#endif /* _ASM_X86_MMU_CONTEXT_H */

View File

@ -99,6 +99,14 @@ static inline int pte_dirty(pte_t pte)
return pte_flags(pte) & _PAGE_DIRTY;
}
/*
 * Return the PKRU value when X86_FEATURE_OSPKE is set for the boot
 * CPU; otherwise report 0.
 */
static inline u32 read_pkru(void)
{
	return boot_cpu_has(X86_FEATURE_OSPKE) ? __read_pkru() : 0;
}
static inline int pte_young(pte_t pte)
{
return pte_flags(pte) & _PAGE_ACCESSED;
@ -911,6 +919,36 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
}
#endif
#define PKRU_AD_BIT 0x1
#define PKRU_WD_BIT 0x2
#define PKRU_BITS_PER_PKEY 2
/*
 * True when the access-disable bit for @pkey is clear in @pkru.
 * Each key owns PKRU_BITS_PER_PKEY consecutive bits of the register.
 */
static inline bool __pkru_allows_read(u32 pkru, u16 pkey)
{
	const int shift = pkey * PKRU_BITS_PER_PKEY;

	return (pkru & (PKRU_AD_BIT << shift)) == 0;
}
/*
 * True when neither the access-disable nor the write-disable bit for
 * @pkey is set in @pkru.  Access-disable blocks writes too, which is
 * why both bits are tested.
 */
static inline bool __pkru_allows_write(u32 pkru, u16 pkey)
{
	const int shift = pkey * PKRU_BITS_PER_PKEY;

	return (pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << shift)) == 0;
}
/*
 * Extract the protection key from a pte flags value.  Yields 0 when
 * protection keys are compiled out.
 */
static inline u16 pte_flags_pkey(unsigned long pte_flags)
{
	u16 key = 0;

#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
	/* kept under ifdef to avoid doing a 59-bit shift on 32-bit values */
	key = (pte_flags & _PAGE_PKEY_MASK) >> _PAGE_BIT_PKEY_BIT0;
#endif
	return key;
}
#include <asm-generic/pgtable.h>
#endif /* __ASSEMBLY__ */

View File

@ -20,13 +20,18 @@
#define _PAGE_BIT_SOFTW2 10 /* " */
#define _PAGE_BIT_SOFTW3 11 /* " */
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
#define _PAGE_BIT_SOFTW4 58 /* available for programmer */
#define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */
#define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */
#define _PAGE_BIT_PKEY_BIT2 61 /* Protection Keys, bit 3/4 */
#define _PAGE_BIT_PKEY_BIT3 62 /* Protection Keys, bit 4/4 */
#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_SOFTW4 58 /* available for programmer */
#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
/* If _PAGE_BIT_PRESENT is clear, we use these: */
/* - if the user mapped it with PROT_NONE; pte_present gives true */
@ -47,8 +52,24 @@
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
#define _PAGE_PKEY_BIT0 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0)
#define _PAGE_PKEY_BIT1 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)
#define _PAGE_PKEY_BIT2 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT2)
#define _PAGE_PKEY_BIT3 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT3)
#else
#define _PAGE_PKEY_BIT0 (_AT(pteval_t, 0))
#define _PAGE_PKEY_BIT1 (_AT(pteval_t, 0))
#define _PAGE_PKEY_BIT2 (_AT(pteval_t, 0))
#define _PAGE_PKEY_BIT3 (_AT(pteval_t, 0))
#endif
#define __HAVE_ARCH_PTE_SPECIAL
#define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \
_PAGE_PKEY_BIT1 | \
_PAGE_PKEY_BIT2 | \
_PAGE_PKEY_BIT3)
#ifdef CONFIG_KMEMCHECK
#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
#else
@ -99,7 +120,12 @@
#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
_PAGE_DIRTY)
/* Set of bits not changed in pte_modify */
/*
* Set of bits not changed in pte_modify. The pte's
* protection key is treated like _PAGE_RW, for
* instance, and is *not* included in this mask since
* pte_modify() does modify it.
*/
#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
_PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
_PAGE_SOFT_DIRTY)
@ -215,7 +241,10 @@ enum page_cache_mode {
/* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */
#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
/* Extracts the flags from a (pte|pmd|pud|pgd)val_t of a 4KB page */
/*
* Extracts the flags from a (pte|pmd|pud|pgd)val_t
* This includes the protection key value.
*/
#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;

View File

@ -0,0 +1,34 @@
#ifndef _ASM_X86_PKEYS_H
#define _ASM_X86_PKEYS_H
/* 16 when X86_FEATURE_OSPKE is set for the boot CPU, 1 otherwise. */
#define arch_max_pkey() (!boot_cpu_has(X86_FEATURE_OSPKE) ? 1 : 16)
extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val);
/*
* Try to dedicate one of the protection keys to be used as an
* execute-only protection key.
*/
#define PKEY_DEDICATED_EXECUTE_ONLY 15
extern int __execute_only_pkey(struct mm_struct *mm);
/*
 * Wrapper around __execute_only_pkey(): returns 0 without calling it
 * when X86_FEATURE_OSPKE is not set for the boot CPU.
 */
static inline int execute_only_pkey(struct mm_struct *mm)
{
	return boot_cpu_has(X86_FEATURE_OSPKE) ? __execute_only_pkey(mm) : 0;
}
extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma,
int prot, int pkey);
/*
 * Wrapper around __arch_override_mprotect_pkey(): returns 0 without
 * calling it when X86_FEATURE_OSPKE is not set for the boot CPU.
 */
static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
		int prot, int pkey)
{
	return boot_cpu_has(X86_FEATURE_OSPKE) ?
		__arch_override_mprotect_pkey(vma, prot, pkey) : 0;
}
#endif /*_ASM_X86_PKEYS_H */

View File

@ -92,5 +92,12 @@
#define REQUIRED_MASK7 0
#define REQUIRED_MASK8 0
#define REQUIRED_MASK9 0
#define REQUIRED_MASK10 0
#define REQUIRED_MASK11 0
#define REQUIRED_MASK12 0
#define REQUIRED_MASK13 0
#define REQUIRED_MASK14 0
#define REQUIRED_MASK15 0
#define REQUIRED_MASK16 0
#endif /* _ASM_X86_REQUIRED_FEATURES_H */

View File

@ -98,6 +98,28 @@ static inline void native_write_cr8(unsigned long val)
}
#endif
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
/*
 * Read the PKRU register with a raw-encoded "rdpkru" instruction and
 * return its value.
 */
static inline u32 __read_pkru(void)
{
u32 ecx = 0;
u32 edx, pkru;
/*
* "rdpkru" instruction. Places PKRU contents in to EAX,
* clears EDX and requires that ecx=0.
*
* The instruction is emitted as raw bytes.  EDX is listed as an
* output because the instruction overwrites it; the value is
* not used afterwards.
*/
asm volatile(".byte 0x0f,0x01,0xee\n\t"
: "=a" (pkru), "=d" (edx)
: "c" (ecx));
return pkru;
}
#else
/* Protection keys compiled out: report a PKRU value of 0. */
static inline u32 __read_pkru(void)
{
return 0;
}
#endif
static inline void native_wbinvd(void)
{
asm volatile("wbinvd": : :"memory");

View File

@ -6,6 +6,28 @@
#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
/*
* Take the 4 protection key bits out of the vma->vm_flags
* value and turn them in to the bits that we can put in
* to a pte.
*
* Only override these if Protection Keys are available
* (which is only on 64-bit).
*/
/*
 * Translate each VM_PKEY_BITn set in @vm_flags into the matching
 * _PAGE_PKEY_BITn and wrap the result with __pgprot().
 */
#define arch_vm_get_page_prot(vm_flags)	__pgprot(			\
		(((vm_flags) & VM_PKEY_BIT0) ? _PAGE_PKEY_BIT0 : 0) |	\
		(((vm_flags) & VM_PKEY_BIT1) ? _PAGE_PKEY_BIT1 : 0) |	\
		(((vm_flags) & VM_PKEY_BIT2) ? _PAGE_PKEY_BIT2 : 0) |	\
		(((vm_flags) & VM_PKEY_BIT3) ? _PAGE_PKEY_BIT3 : 0))
/*
 * Spread the low four bits of @key across VM_PKEY_BIT0..VM_PKEY_BIT3.
 * @prot is accepted but does not take part in the computation.
 */
#define arch_calc_vm_prot_bits(prot, key) (			\
		(((key) & 0x1) ? VM_PKEY_BIT0 : 0) |		\
		(((key) & 0x2) ? VM_PKEY_BIT1 : 0) |		\
		(((key) & 0x4) ? VM_PKEY_BIT2 : 0) |		\
		(((key) & 0x8) ? VM_PKEY_BIT3 : 0))
#endif
#include <asm-generic/mman.h>
#endif /* _ASM_X86_MMAN_H */

View File

@ -118,6 +118,8 @@
#define X86_CR4_SMEP _BITUL(X86_CR4_SMEP_BIT)
#define X86_CR4_SMAP_BIT 21 /* enable SMAP support */
#define X86_CR4_SMAP _BITUL(X86_CR4_SMAP_BIT)
#define X86_CR4_PKE_BIT 22 /* enable Protection Keys support */
#define X86_CR4_PKE _BITUL(X86_CR4_PKE_BIT)
/*
* x86-64 Task Priority Register, CR8

View File

@ -303,6 +303,48 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
}
}
/*
* Protection Keys are not available in 32-bit mode.
*/
static bool pku_disabled;
/*
 * Turn on protection keys for this CPU by setting CR4.PKE, unless the
 * CPU lacks X86_FEATURE_PKU or pku_disabled has been set.
 */
static __always_inline void setup_pku(struct cpuinfo_x86 *c)
{
	if (!cpu_has(c, X86_FEATURE_PKU) || pku_disabled)
		return;

	cr4_set_bits(X86_CR4_PKE);

	/*
	 * Setting X86_CR4_PKE causes the X86_FEATURE_OSPKE cpuid bit
	 * to be set, so the capabilities are read again to get that
	 * bit into this CPU's "cpu_info".
	 */
	get_cpu_cap(c);
}
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
/*
 * Handler for the "nopku" boot option: records that protection keys
 * must stay disabled and logs the fact.  Always returns 1.
 *
 * The X86_FEATURE_PKU bit is deliberately left alone.  All runtime
 * checks are made against OSPKE, so clearing PKU would do nothing.
 * The result is that cpuinfo shows "pku" but not "ospke": the CPU has
 * the feature but the OS has not enabled it, which is also how a
 * system looks with the config option disabled.
 */
static __init int setup_disable_pku(char *arg)
{
	pku_disabled = true;
	pr_info("x86: 'nopku' specified, disabling Memory Protection Keys\n");

	return 1;
}
__setup("nopku", setup_disable_pku);
#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
/*
* Some CPU features depend on higher CPUID levels, which may not always
* be available due to CPUID level capping or broken virtualization
@ -625,6 +667,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[CPUID_7_0_EBX] = ebx;
c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006);
c->x86_capability[CPUID_7_ECX] = ecx;
}
/* Extended state features: level 0x0000000d */
@ -982,6 +1025,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
init_hypervisor(c);
x86_init_rdrand(c);
x86_init_cache_qos(c);
setup_pku(c);
/*
* Clear/Set all flags overriden by options, need do it

View File

@ -353,6 +353,69 @@ void fpu__activate_fpstate_write(struct fpu *fpu)
}
}
/*
* This function must be called before we write the current
* task's fpstate.
*
* This call gets the current FPU register state and moves
* it in to the 'fpstate'. Preemption is disabled so that
* no writes to the 'fpstate' can occur from context
* switches.
*
* Must be followed by a fpu__current_fpstate_write_end().
*/
void fpu__current_fpstate_write_begin(void)
{
	struct fpu *cur = &current->thread.fpu;

	/*
	 * Preemption stays off until the matching _end() call so that
	 * the context-switch code cannot overwrite the fpstate while
	 * it is being updated.
	 */
	preempt_disable();

	/* Bring the fpregs contents into the fpu's 'fpstate'. */
	fpu__activate_fpstate_read(cur);

	/*
	 * The fpstate is about to be written, so no CPU may keep
	 * believing that its fpregs match it.  Resetting last_cpu
	 * makes sure a future XRSTOR is not lazily skipped.
	 */
	cur->last_cpu = -1;
}
/*
* This function must be paired with fpu__current_fpstate_write_begin()
*
* This will ensure that the modified fpstate gets placed back in
* the fpregs if necessary.
*
* Note: This function may be called whether or not an _actual_
* write to the fpstate occurred.
*/
void fpu__current_fpstate_write_end(void)
{
	struct fpu *cur = &current->thread.fpu;

	/*
	 * The fpstate copy is up to date but the registers may be
	 * stale; if they are active, reload them from the fpstate.
	 */
	if (fpregs_active())
		copy_kernel_to_fpregs(&cur->state);

	/*
	 * fpregs and fpstate are back in sync where needed, so
	 * context switches may happen again.
	 */
	preempt_enable();
}
/*
* 'fpu__restore()' is called to copy FPU registers from
* the FPU fpstate to the live hw registers and to activate

View File

@ -5,6 +5,7 @@
*/
#include <linux/compat.h>
#include <linux/cpu.h>
#include <linux/pkeys.h>
#include <asm/fpu/api.h>
#include <asm/fpu/internal.h>
@ -13,6 +14,11 @@
#include <asm/tlbflush.h>
/*
* Although we spell it out in here, the Processor Trace
* xfeature is completely unused. We use other mechanisms
* to save/restore PT state in Linux.
*/
static const char *xfeature_names[] =
{
"x87 floating point registers" ,
@ -23,6 +29,8 @@ static const char *xfeature_names[] =
"AVX-512 opmask" ,
"AVX-512 Hi256" ,
"AVX-512 ZMM_Hi256" ,
"Processor Trace (unused)" ,
"Protection Keys User registers",
"unknown xstate feature" ,
};
@ -56,6 +64,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
setup_clear_cpu_cap(X86_FEATURE_MPX);
setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
setup_clear_cpu_cap(X86_FEATURE_PKU);
}
/*
@ -234,7 +243,7 @@ static void __init print_xstate_feature(u64 xstate_mask)
const char *feature_name;
if (cpu_has_xfeatures(xstate_mask, &feature_name))
pr_info("x86/fpu: Supporting XSAVE feature 0x%02Lx: '%s'\n", xstate_mask, feature_name);
pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
}
/*
@ -250,6 +259,7 @@ static void __init print_xstate_features(void)
print_xstate_feature(XFEATURE_MASK_OPMASK);
print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
print_xstate_feature(XFEATURE_MASK_PKRU);
}
/*
@ -466,6 +476,7 @@ static void check_xstate_against_struct(int nr)
XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state);
XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state);
XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state);
XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state);
/*
* Make *SURE* to add any feature numbers in below if
@ -473,7 +484,8 @@ static void check_xstate_against_struct(int nr)
* numbers.
*/
if ((nr < XFEATURE_YMM) ||
(nr >= XFEATURE_MAX)) {
(nr >= XFEATURE_MAX) ||
(nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) {
WARN_ONCE(1, "no structure for xstate: %d\n", nr);
XSTATE_WARN_ON(1);
}
@ -670,6 +682,19 @@ void fpu__resume_cpu(void)
xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
}
/*
 * Translate an xstate feature mask into the address of that feature's
 * state inside an xsave buffer.
 *
 * The highest bit set in 'xstate_feature_mask' selects the feature and
 * its entry in xstate_comp_offsets[] gives the byte offset from the
 * start of 'xsave'.  No check is made that the feature is actually
 * present in the buffer; callers should ensure that the buffer is
 * valid.
 *
 * Note: does not work for compacted buffers.
 */
void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask)
{
	return (void *)xsave +
		xstate_comp_offsets[fls64(xstate_feature_mask) - 1];
}
/*
* Given the xsave area and a state inside, this function returns the
* address of the state.
@ -690,7 +715,6 @@ void fpu__resume_cpu(void)
*/
void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
{
int feature_nr = fls64(xstate_feature) - 1;
/*
* Do we even *have* xsave state?
*/
@ -718,7 +742,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
if (!(xsave->header.xfeatures & xstate_feature))
return NULL;
return (void *)xsave + xstate_comp_offsets[feature_nr];
return __raw_xsave_addr(xsave, xstate_feature);
}
EXPORT_SYMBOL_GPL(get_xsave_addr);
@ -753,3 +777,156 @@ const void *get_xsave_field_ptr(int xsave_state)
return get_xsave_addr(&fpu->state.xsave, xsave_state);
}
/*
 * Take a feature out of its "init state" by setting its bit in the
 * xsave header's xfeatures field (aka XSTATE_BV).  With the bit set,
 * an XRSTOR actually restores the state held in the buffer.
 */
static void fpu__xfeature_set_non_init(struct xregs_state *xsave,
		int xstate_feature_mask)
{
	xsave->header.xfeatures = xsave->header.xfeatures |
				  xstate_feature_mask;
}
/*
* Copy caller-provided state for one xfeature in to the current
* task's xsave buffer and mark that feature as no longer being in
* its 'init state'.
*
* This function is safe to call whether the FPU is in use or not.
*
* Note that this only works on the current task.
*
* Inputs:
* @xstate_feature_mask: mask of the feature to write, as defined in
* xsave.h (e.g. XFEATURE_MASK_FP, XFEATURE_MASK_SSE, etc...)
* @xstate_feature_src: a pointer to a copy of the state that you would
* like written in to the current task's FPU xsave state. This pointer
* must not be located in the current task's xsave area.
* @len: number of bytes to copy from @xstate_feature_src
* Output:
* none. When the CPU has no XSAVE support, or when the feature's
* bit is set in the buffer's xcomp_bv, a warning is emitted and
* nothing is copied.
*/
static void fpu__xfeature_set_state(int xstate_feature_mask,
void *xstate_feature_src, size_t len)
{
struct xregs_state *xsave = &current->thread.fpu.state.xsave;
struct fpu *fpu = &current->thread.fpu;
void *dst;
if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
WARN_ONCE(1, "%s() attempted with no xsave support", __func__);
return;
}
/*
* Tell the FPU code that we need the FPU state to be in
* 'fpu' (not in the registers), and that we need it to
* be stable while we write to it.
*/
fpu__current_fpstate_write_begin();
/*
* This method *WILL* *NOT* work for compact-format
* buffers. If the 'xstate_feature_mask' is unset in
* xcomp_bv then we may need to move other feature state
* "up" in the buffer.
*/
if (xsave->header.xcomp_bv & xstate_feature_mask) {
WARN_ON_ONCE(1);
goto out;
}
/* find the location in the xsave buffer of the desired state */
dst = __raw_xsave_addr(&fpu->state.xsave, xstate_feature_mask);
/*
* Make sure that the pointer being passed in did not
* come from the xsave buffer itself.
*/
WARN_ONCE(xstate_feature_src == dst, "set from xsave buffer itself");
/* put the caller-provided data in the location */
memcpy(dst, xstate_feature_src, len);
/*
* Mark the xfeature so that the CPU knows there is state
* in the buffer now.
*/
fpu__xfeature_set_non_init(xsave, xstate_feature_mask);
out:
/*
* We are done writing to the 'fpu'. Reenable preemption
* and (possibly) move the fpstate back in to the fpregs.
*/
fpu__current_fpstate_write_end();
}
/* Number of PKRU bits that carry state: two bits per protection key. */
#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
/*
 * Mask covering the low NR_VALID_PKRU_BITS bits of PKRU.  Built in a
 * 64-bit type so that the shift stays defined when all 32 bits of
 * PKRU are valid.
 */
#define PKRU_VALID_MASK ((1ULL << NR_VALID_PKRU_BITS) - 1)
/*
* This will go out and modify the XSAVE buffer so that PKRU is
* set to a particular state for access to 'pkey'.
*
* PKRU state does affect kernel access to user memory. We do
* not modfiy PKRU *itself* here, only the XSAVE state that will
* be restored in to PKRU when we return back to userspace.
*/
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val)
{
struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
struct pkru_state *old_pkru_state;
struct pkru_state new_pkru_state;
int pkey_shift = (pkey * PKRU_BITS_PER_PKEY);
u32 new_pkru_bits = 0;
/*
* This check implies XSAVE support. OSPKE only gets
* set if we enable XSAVE and we enable PKU in XCR0.
*/
if (!boot_cpu_has(X86_FEATURE_OSPKE))
return -EINVAL;
/* Set the bits we need in PKRU */
if (init_val & PKEY_DISABLE_ACCESS)
new_pkru_bits |= PKRU_AD_BIT;
if (init_val & PKEY_DISABLE_WRITE)
new_pkru_bits |= PKRU_WD_BIT;
/* Shift the bits in to the correct place in PKRU for pkey. */
new_pkru_bits <<= pkey_shift;
/* Locate old copy of the state in the xsave buffer */
old_pkru_state = get_xsave_addr(xsave, XFEATURE_MASK_PKRU);
/*
* When state is not in the buffer, it is in the init
* state, set it manually. Otherwise, copy out the old
* state.
*/
if (!old_pkru_state)
new_pkru_state.pkru = 0;
else
new_pkru_state.pkru = old_pkru_state->pkru;
/* mask off any old bits in place */
new_pkru_state.pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
/* Set the newly-requested bits */
new_pkru_state.pkru |= new_pkru_bits;
/*
* We could theoretically live without zeroing pkru.pad.
* The current XSAVE feature state definition says that
* only bytes 0->3 are used. But we do not want to
* chance leaking kernel stack out to userspace in case a
* memcpy() of the whole xsave buffer was done.
*
* They're in the same cacheline anyway.
*/
new_pkru_state.pad = 0;
fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state,
sizeof(new_pkru_state));
return 0;
}

View File

@ -103,7 +103,7 @@ static void free_ldt_struct(struct ldt_struct *ldt)
* we do not have to muck with descriptors here, that is
* done in switch_mm() as needed.
*/
int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
{
struct ldt_struct *new_ldt;
struct mm_struct *old_mm;
@ -144,7 +144,7 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
*
* 64bit: Don't touch the LDT register - we're already in the next thread.
*/
void destroy_context(struct mm_struct *mm)
void destroy_context_ldt(struct mm_struct *mm)
{
free_ldt_struct(mm->context.ldt);
mm->context.ldt = NULL;

View File

@ -116,6 +116,8 @@ void __show_regs(struct pt_regs *regs, int all)
printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
if (boot_cpu_has(X86_FEATURE_OSPKE))
printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
}
void release_thread(struct task_struct *dead_task)

View File

@ -112,6 +112,7 @@
#include <asm/alternative.h>
#include <asm/prom.h>
#include <asm/microcode.h>
#include <asm/mmu_context.h>
/*
* max_low_pfn_mapped: highest direct mapped pfn under 4GB
@ -1282,3 +1283,11 @@ static int __init register_kernel_offset_dumper(void)
return 0;
}
__initcall(register_kernel_offset_dumper);
/*
 * Emit the "ProtectionKey:" line for 'vma' in to the seq_file 'm'.
 * Nothing is printed when OSPKE is not enabled.
 */
void arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
{
	if (boot_cpu_has(X86_FEATURE_OSPKE))
		seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
}

View File

@ -34,3 +34,5 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o

View File

@ -15,12 +15,14 @@
#include <linux/context_tracking.h> /* exception_enter(), ... */
#include <linux/uaccess.h> /* faulthandler_disabled() */
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
#include <asm/fixmap.h> /* VSYSCALL_ADDR */
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
#include <asm/mmu_context.h> /* vma_pkey() */
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@ -33,6 +35,7 @@
* bit 2 == 0: kernel-mode access 1: user-mode access
* bit 3 == 1: use of reserved bit detected
* bit 4 == 1: fault was an instruction fetch
* bit 5 == 1: protection keys block access
*/
enum x86_pf_error_code {
@ -41,6 +44,7 @@ enum x86_pf_error_code {
PF_USER = 1 << 2,
PF_RSVD = 1 << 3,
PF_INSTR = 1 << 4,
PF_PK = 1 << 5,
};
/*
@ -167,9 +171,60 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
return prefetch;
}
/*
 * Fill in siginfo's si_pkey for a protection key fault.
 *
 * A protection key fault means that the PKRU value did not allow
 * access to some PTE.  Userspace can recover PKRU from the XSAVE
 * state; si_pkey tells it which protection key was set on the PTE.
 *
 * By the time we get here the hardware has signaled a PF_PK fault
 * and a VMA existed when the fault handler looked.  That does *not*
 * guarantee that the VMA passed in is the one we faulted on:
 *
 * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
 * 2. T1 : set PKRU to deny access to pkey=4, touches page
 * 3. T1 : faults...
 * 4.    T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
 * 5. T1 : enters fault handler, takes mmap_sem, etc...
 * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
 *         faulted on a pte with its pkey=4.
 *
 * 'info' is left untouched unless OSPKE is enabled and 'si_code' is
 * SEGV_PKUERR.
 */
static void fill_sig_info_pkey(int si_code, siginfo_t *info,
		struct vm_area_struct *vma)
{
	/*
	 * The OSPKE test is effectively an #ifdef; the si_code test
	 * filters out faults that did not come from protection keys.
	 */
	if (!boot_cpu_has(X86_FEATURE_OSPKE) || si_code != SEGV_PKUERR)
		return;

	if (vma) {
		/*
		 * si_pkey should be thought of as a strong hint, but
		 * not absolutely guaranteed to be 100% accurate
		 * because of the race explained above.
		 */
		info->si_pkey = vma_pkey(vma);
		return;
	}

	/*
	 * force_sig_info_fault() is called from a number of contexts,
	 * some of which have a VMA and some of which do not.  The
	 * PF_PK handling happens after we have a valid VMA, so we
	 * should never get here without one.
	 */
	WARN_ONCE(1, "PKU fault with no VMA passed in");
	info->si_pkey = 0;
}
static void
force_sig_info_fault(int si_signo, int si_code, unsigned long address,
struct task_struct *tsk, int fault)
struct task_struct *tsk, struct vm_area_struct *vma,
int fault)
{
unsigned lsb = 0;
siginfo_t info;
@ -184,6 +239,8 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
lsb = PAGE_SHIFT;
info.si_addr_lsb = lsb;
fill_sig_info_pkey(si_code, &info, vma);
force_sig_info(si_signo, &info, tsk);
}
@ -661,6 +718,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
struct task_struct *tsk = current;
unsigned long flags;
int sig;
/* No context means no VMA to pass down */
struct vm_area_struct *vma = NULL;
/* Are we prepared to handle this kernel fault? */
if (fixup_exception(regs, X86_TRAP_PF)) {
@ -684,7 +743,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
tsk->thread.cr2 = address;
/* XXX: hwpoison faults will set the wrong code. */
force_sig_info_fault(signal, si_code, address, tsk, 0);
force_sig_info_fault(signal, si_code, address,
tsk, vma, 0);
}
/*
@ -761,7 +821,8 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
unsigned long address, int si_code)
unsigned long address, struct vm_area_struct *vma,
int si_code)
{
struct task_struct *tsk = current;
@ -804,7 +865,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_PF;
force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0);
return;
}
@ -817,14 +878,14 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
unsigned long address, struct vm_area_struct *vma)
{
__bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
__bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR);
}
static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
unsigned long address, int si_code)
unsigned long address, struct vm_area_struct *vma, int si_code)
{
struct mm_struct *mm = current->mm;
@ -834,25 +895,50 @@ __bad_area(struct pt_regs *regs, unsigned long error_code,
*/
up_read(&mm->mmap_sem);
__bad_area_nosemaphore(regs, error_code, address, si_code);
__bad_area_nosemaphore(regs, error_code, address, vma, si_code);
}
static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
__bad_area(regs, error_code, address, SEGV_MAPERR);
__bad_area(regs, error_code, address, NULL, SEGV_MAPERR);
}
/*
 * Decide whether an access error should be attributed to protection
 * keys.  Returns true when the hardware set PF_PK in 'error_code' or
 * when arch_vma_access_permitted() refuses the access on 'vma';
 * always false when OSPKE is not enabled.
 */
static inline bool bad_area_access_from_pkeys(unsigned long error_code,
		struct vm_area_struct *vma)
{
	if (!boot_cpu_has(X86_FEATURE_OSPKE))
		return false;

	if (error_code & PF_PK)
		return true;

	/*
	 * This checks permission keys on the VMA.  This code is always
	 * called on the current mm, so the access is never "foreign".
	 */
	return !arch_vma_access_permitted(vma, (error_code & PF_WRITE),
					  (error_code & PF_INSTR), false);
}
static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
unsigned long address, struct vm_area_struct *vma)
{
__bad_area(regs, error_code, address, SEGV_ACCERR);
/*
* This OSPKE check is not strictly necessary at runtime.
* But, doing it this way allows compiler optimizations
* if pkeys are compiled out.
*/
if (bad_area_access_from_pkeys(error_code, vma))
__bad_area(regs, error_code, address, vma, SEGV_PKUERR);
else
__bad_area(regs, error_code, address, vma, SEGV_ACCERR);
}
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
unsigned int fault)
struct vm_area_struct *vma, unsigned int fault)
{
struct task_struct *tsk = current;
int code = BUS_ADRERR;
@ -879,12 +965,13 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
code = BUS_MCEERR_AR;
}
#endif
force_sig_info_fault(SIGBUS, code, address, tsk, fault);
force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault);
}
static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
unsigned long address, unsigned int fault)
unsigned long address, struct vm_area_struct *vma,
unsigned int fault)
{
if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
no_context(regs, error_code, address, 0, 0);
@ -908,9 +995,9 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
} else {
if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
VM_FAULT_HWPOISON_LARGE))
do_sigbus(regs, error_code, address, fault);
do_sigbus(regs, error_code, address, vma, fault);
else if (fault & VM_FAULT_SIGSEGV)
bad_area_nosemaphore(regs, error_code, address);
bad_area_nosemaphore(regs, error_code, address, vma);
else
BUG();
}
@ -923,6 +1010,12 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
if ((error_code & PF_INSTR) && !pte_exec(*pte))
return 0;
/*
* Note: We do not do lazy flushing on protection key
* changes, so no spurious fault will ever set PF_PK.
*/
if ((error_code & PF_PK))
return 1;
return 1;
}
@ -1012,6 +1105,17 @@ int show_unhandled_signals = 1;
static inline int
access_error(unsigned long error_code, struct vm_area_struct *vma)
{
/* This is only called for the current mm, so: */
bool foreign = false;
/*
* Make sure to check the VMA so that we do not perform
* faults just to hit a PF_PK as soon as we fill in a
* page.
*/
if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
(error_code & PF_INSTR), foreign))
return 1;
if (error_code & PF_WRITE) {
/* write, present and write, not present: */
if (unlikely(!(vma->vm_flags & VM_WRITE)))
@ -1118,7 +1222,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock:
*/
bad_area_nosemaphore(regs, error_code, address);
bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
@ -1131,7 +1235,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
pgtable_bad(regs, error_code, address);
if (unlikely(smap_violation(error_code, regs))) {
bad_area_nosemaphore(regs, error_code, address);
bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
@ -1140,7 +1244,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* in a region with pagefaults disabled then we must not take the fault
*/
if (unlikely(faulthandler_disabled() || !mm)) {
bad_area_nosemaphore(regs, error_code, address);
bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
@ -1164,6 +1268,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
if (error_code & PF_WRITE)
flags |= FAULT_FLAG_WRITE;
if (error_code & PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
/*
* When running in the kernel we expect faults to occur only to
@ -1184,7 +1290,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
if ((error_code & PF_USER) == 0 &&
!search_exception_tables(regs->ip)) {
bad_area_nosemaphore(regs, error_code, address);
bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
retry:
@ -1232,7 +1338,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
*/
good_area:
if (unlikely(access_error(error_code, vma))) {
bad_area_access_error(regs, error_code, address);
bad_area_access_error(regs, error_code, address, vma);
return;
}
@ -1270,7 +1376,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, fault);
mm_fault_error(regs, error_code, address, vma, fault);
return;
}

View File

@ -11,6 +11,7 @@
#include <linux/swap.h>
#include <linux/memremap.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
static inline pte_t gup_get_pte(pte_t *ptep)
@ -74,6 +75,28 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
}
}
/*
 * Return 1 when a page table entry value permits get_user_pages to
 * use it for the requested kind of access, 0 otherwise.
 *
 * 'pteval' can come from a pte, pmd or pud.  Only _PAGE_PRESENT,
 * _PAGE_USER and _PAGE_RW are examined here, and those have the same
 * value on all 3 types.  The entry's protection key is also checked
 * via __pkru_allows_pkey().
 */
static inline int pte_allows_gup(unsigned long pteval, int write)
{
	unsigned long required = _PAGE_PRESENT | _PAGE_USER |
				 (write ? _PAGE_RW : 0);

	/* Every required bit must be set, and pkeys must agree. */
	return (pteval & required) == required &&
	       __pkru_allows_pkey(pte_flags_pkey(pteval), write);
}
/*
* The performance critical leaf functions are made noinline otherwise gcc
* inlines everything into a single function which results in too much
@ -83,14 +106,9 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
unsigned long mask;
int nr_start = *nr;
pte_t *ptep;
mask = _PAGE_PRESENT|_PAGE_USER;
if (write)
mask |= _PAGE_RW;
ptep = pte_offset_map(&pmd, addr);
do {
pte_t pte = gup_get_pte(ptep);
@ -109,7 +127,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
pte_unmap(ptep);
return 0;
}
} else if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
} else if (!pte_allows_gup(pte_val(pte), write) ||
pte_special(pte)) {
pte_unmap(ptep);
return 0;
}
@ -164,14 +183,10 @@ static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
unsigned long mask;
struct page *head, *page;
int refs;
mask = _PAGE_PRESENT|_PAGE_USER;
if (write)
mask |= _PAGE_RW;
if ((pmd_flags(pmd) & mask) != mask)
if (!pte_allows_gup(pmd_val(pmd), write))
return 0;
VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
@ -231,14 +246,10 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
unsigned long mask;
struct page *head, *page;
int refs;
mask = _PAGE_PRESENT|_PAGE_USER;
if (write)
mask |= _PAGE_RW;
if ((pud_flags(pud) & mask) != mask)
if (!pte_allows_gup(pud_val(pud), write))
return 0;
/* hugepages are never "special" */
VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
@ -422,7 +433,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
start += nr << PAGE_SHIFT;
pages += nr;
ret = get_user_pages_unlocked(current, mm, start,
ret = get_user_pages_unlocked(start,
(end - start) >> PAGE_SHIFT,
write, 0, pages);

View File

@ -546,8 +546,8 @@ static int mpx_resolve_fault(long __user *addr, int write)
int nr_pages = 1;
int force = 0;
gup_ret = get_user_pages(current, current->mm, (unsigned long)addr,
nr_pages, write, force, NULL, NULL);
gup_ret = get_user_pages((unsigned long)addr, nr_pages, write,
force, NULL, NULL);
/*
* get_user_pages() returns number of pages gotten.
* 0 means we failed to fault in and get anything,

101
arch/x86/mm/pkeys.c Normal file
View File

@ -0,0 +1,101 @@
/*
* Intel Memory Protection Keys management
* Copyright (c) 2015, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <linux/mm_types.h> /* mm_struct, vma, etc... */
#include <linux/pkeys.h> /* PKEY_* */
#include <uapi/asm-generic/mman-common.h>
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/mmu_context.h> /* vma_pkey() */
#include <asm/fpu/internal.h> /* fpregs_active() */
int __execute_only_pkey(struct mm_struct *mm)
{
int ret;
/*
* We do not want to go through the relatively costly
* dance to set PKRU if we do not need to. Check it
* first and assume that if the execute-only pkey is
* write-disabled that we do not have to set it
* ourselves. We need preempt off so that nobody
* can make fpregs inactive.
*/
preempt_disable();
if (fpregs_active() &&
!__pkru_allows_read(read_pkru(), PKEY_DEDICATED_EXECUTE_ONLY)) {
preempt_enable();
return PKEY_DEDICATED_EXECUTE_ONLY;
}
preempt_enable();
ret = arch_set_user_pkey_access(current, PKEY_DEDICATED_EXECUTE_ONLY,
PKEY_DISABLE_ACCESS);
/*
* If the PKRU-set operation failed somehow, just return
* 0 and effectively disable execute-only support.
*/
if (ret)
return 0;
return PKEY_DEDICATED_EXECUTE_ONLY;
}
/*
 * True when 'vma' is an execute-only mapping (VM_EXEC set, VM_READ
 * and VM_WRITE clear) that carries the dedicated execute-only
 * protection key.
 */
static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
{
	/* Test vm_flags first since they should be hot */
	return (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) == VM_EXEC &&
	       vma_pkey(vma) == PKEY_DEDICATED_EXECUTE_ONLY;
}
/*
 * Choose the protection key to apply when 'vma' is given the
 * protections in 'prot'.
 *
 * @pkey is -1 for a plain mprotect() call; any other value came from
 * the user and is returned unchanged.
 *
 * Returns the pkey to use: the caller's explicit key, 0 (the default
 * key) when an execute-only mapping gains read or write permission,
 * the execute-only key for a PROT_EXEC-only request when one can be
 * obtained, and otherwise the key 'vma' already has.
 */
int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey)
{
	int exec_pkey;

	/* An explicit key from the user is never overridden. */
	if (pkey != -1)
		return pkey;

	/*
	 * A protection-key-driven execute-only mapping is being given
	 * permissions that are not execute-only.  Move it back to the
	 * default pkey.
	 */
	if (vma_is_pkey_exec_only(vma) &&
	    (prot & (PROT_READ|PROT_WRITE)))
		return 0;

	/*
	 * A vanilla, non-pkey mprotect: inherit the pkey from the VMA
	 * we are working on.
	 */
	if (prot != PROT_EXEC)
		return vma_pkey(vma);

	/*
	 * The mapping is execute-only.  Try to get the execute-only
	 * protection key; if that fails, behave as if execute-only
	 * support did not exist and keep the VMA's current key.
	 */
	exec_pkey = execute_only_pkey(vma->vm_mm);
	return exec_pkey > 0 ? exec_pkey : vma_pkey(vma);
}

View File

@ -156,7 +156,7 @@ static pgprot_t agp_convert_mmap_flags(int prot)
{
unsigned long prot_bits;
prot_bits = calc_vm_prot_bits(prot) | VM_SHARED;
prot_bits = calc_vm_prot_bits(prot, 0) | VM_SHARED;
return vm_get_page_prot(prot_bits);
}

View File

@ -518,8 +518,7 @@ static int amdgpu_ttm_tt_pin_userptr(struct ttm_tt *ttm)
uint64_t userptr = gtt->userptr + pinned * PAGE_SIZE;
struct page **pages = ttm->pages + pinned;
r = get_user_pages(current, current->mm, userptr, num_pages,
write, 0, pages, NULL);
r = get_user_pages(userptr, num_pages, write, 0, pages, NULL);
if (r < 0)
goto release_pages;

View File

@ -753,9 +753,9 @@ static struct page **etnaviv_gem_userptr_do_get_pages(
down_read(&mm->mmap_sem);
while (pinned < npages) {
ret = get_user_pages(task, mm, ptr, npages - pinned,
!etnaviv_obj->userptr.ro, 0,
pvec + pinned, NULL);
ret = get_user_pages_remote(task, mm, ptr, npages - pinned,
!etnaviv_obj->userptr.ro, 0,
pvec + pinned, NULL);
if (ret < 0)
break;

View File

@ -584,11 +584,11 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
down_read(&mm->mmap_sem);
while (pinned < npages) {
ret = get_user_pages(work->task, mm,
obj->userptr.ptr + pinned * PAGE_SIZE,
npages - pinned,
!obj->userptr.read_only, 0,
pvec + pinned, NULL);
ret = get_user_pages_remote(work->task, mm,
obj->userptr.ptr + pinned * PAGE_SIZE,
npages - pinned,
!obj->userptr.read_only, 0,
pvec + pinned, NULL);
if (ret < 0)
break;

View File

@ -554,8 +554,7 @@ static int radeon_ttm_tt_pin_userptr(struct ttm_tt *ttm)
uint64_t userptr = gtt->userptr + pinned * PAGE_SIZE;
struct page **pages = ttm->pages + pinned;
r = get_user_pages(current, current->mm, userptr, num_pages,
write, 0, pages, NULL);
r = get_user_pages(userptr, num_pages, write, 0, pages, NULL);
if (r < 0)
goto release_pages;

View File

@ -239,8 +239,7 @@ via_lock_all_dma_pages(drm_via_sg_info_t *vsg, drm_via_dmablit_t *xfer)
if (NULL == vsg->pages)
return -ENOMEM;
down_read(&current->mm->mmap_sem);
ret = get_user_pages(current, current->mm,
(unsigned long)xfer->mem_addr,
ret = get_user_pages((unsigned long)xfer->mem_addr,
vsg->num_pages,
(vsg->direction == DMA_FROM_DEVICE),
0, vsg->pages, NULL);

View File

@ -188,7 +188,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
sg_list_start = umem->sg_head.sgl;
while (npages) {
ret = get_user_pages(current, current->mm, cur_base,
ret = get_user_pages(cur_base,
min_t(unsigned long, npages,
PAGE_SIZE / sizeof (struct page *)),
1, !umem->writable, page_list, vma_list);

View File

@ -572,10 +572,10 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
* complex (and doesn't gain us much performance in most use
* cases).
*/
npages = get_user_pages(owning_process, owning_mm, user_virt,
gup_num_pages,
access_mask & ODP_WRITE_ALLOWED_BIT, 0,
local_page_list, NULL);
npages = get_user_pages_remote(owning_process, owning_mm,
user_virt, gup_num_pages,
access_mask & ODP_WRITE_ALLOWED_BIT,
0, local_page_list, NULL);
up_read(&owning_mm->mmap_sem);
if (npages < 0)

View File

@ -472,8 +472,7 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
goto out;
}
ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK, 1, 1, 0,
pages, NULL);
ret = get_user_pages(uaddr & PAGE_MASK, 1, 1, 0, pages, NULL);
if (ret < 0)
goto out;

View File

@ -66,8 +66,7 @@ static int __qib_get_user_pages(unsigned long start_page, size_t num_pages,
}
for (got = 0; got < num_pages; got += ret) {
ret = get_user_pages(current, current->mm,
start_page + got * PAGE_SIZE,
ret = get_user_pages(start_page + got * PAGE_SIZE,
num_pages - got, 1, 1,
p + got, NULL);
if (ret < 0)

View File

@ -144,7 +144,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
ret = 0;
while (npages) {
ret = get_user_pages(current, current->mm, cur_base,
ret = get_user_pages(cur_base,
min_t(unsigned long, npages,
PAGE_SIZE / sizeof(struct page *)),
1, !writable, page_list, NULL);

View File

@ -526,6 +526,7 @@ static void do_fault(struct work_struct *work)
flags |= FAULT_FLAG_USER;
if (fault->flags & PPR_FAULT_WRITE)
flags |= FAULT_FLAG_WRITE;
flags |= FAULT_FLAG_REMOTE;
down_read(&mm->mmap_sem);
vma = find_extend_vma(mm, address);

View File

@ -124,8 +124,8 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr,
}
/* Get user pages for DMA Xfer */
err = get_user_pages_unlocked(current, current->mm,
user_dma.uaddr, user_dma.page_count, 0, 1, dma->map);
err = get_user_pages_unlocked(user_dma.uaddr, user_dma.page_count, 0,
1, dma->map);
if (user_dma.page_count != err) {
IVTV_DEBUG_WARN("failed to map user pages, returned %d instead of %d\n",

View File

@ -75,14 +75,12 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma,
ivtv_udma_get_page_info (&uv_dma, (unsigned long)args->uv_source, 360 * uv_decode_height);
/* Get user pages for DMA Xfer */
y_pages = get_user_pages_unlocked(current, current->mm,
y_dma.uaddr, y_dma.page_count, 0, 1,
&dma->map[0]);
y_pages = get_user_pages_unlocked(y_dma.uaddr,
y_dma.page_count, 0, 1, &dma->map[0]);
uv_pages = 0; /* silence gcc. value is set and consumed only if: */
if (y_pages == y_dma.page_count) {
uv_pages = get_user_pages_unlocked(current, current->mm,
uv_dma.uaddr, uv_dma.page_count, 0, 1,
&dma->map[y_pages]);
uv_pages = get_user_pages_unlocked(uv_dma.uaddr,
uv_dma.page_count, 0, 1, &dma->map[y_pages]);
}
if (y_pages != y_dma.page_count || uv_pages != uv_dma.page_count) {

View File

@ -181,8 +181,7 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma,
dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n",
data, size, dma->nr_pages);
err = get_user_pages(current, current->mm,
data & PAGE_MASK, dma->nr_pages,
err = get_user_pages(data & PAGE_MASK, dma->nr_pages,
rw == READ, 1, /* force */
dma->pages, NULL);

View File

@ -1394,8 +1394,6 @@ int __scif_pin_pages(void *addr, size_t len, int *out_prot,
}
pinned_pages->nr_pages = get_user_pages(
current,
mm,
(u64)addr,
nr_pages,
!!(prot & SCIF_PROT_WRITE),

View File

@ -198,8 +198,7 @@ static int non_atomic_pte_lookup(struct vm_area_struct *vma,
#else
*pageshift = PAGE_SHIFT;
#endif
if (get_user_pages
(current, current->mm, vaddr, 1, write, 0, &page, NULL) <= 0)
if (get_user_pages(vaddr, 1, write, 0, &page, NULL) <= 0)
return -EFAULT;
*paddr = page_to_phys(page);
put_page(page);

View File

@ -4917,8 +4917,6 @@ static int sgl_map_user_pages(struct st_buffer *STbp,
/* Try to fault in all of the necessary pages */
/* rw==READ means read from drive, write into memory area */
res = get_user_pages_unlocked(
current,
current->mm,
uaddr,
nr_pages,
rw == READ,

View File

@ -385,8 +385,8 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
}
/* requested protection bits must match our allowed protection mask */
if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask)) &
calc_vm_prot_bits(PROT_MASK))) {
if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask, 0)) &
calc_vm_prot_bits(PROT_MASK, 0))) {
ret = -EPERM;
goto out;
}

View File

@ -686,8 +686,8 @@ static ssize_t pvr2fb_write(struct fb_info *info, const char *buf,
if (!pages)
return -ENOMEM;
ret = get_user_pages_unlocked(current, current->mm, (unsigned long)buf,
nr_pages, WRITE, 0, pages);
ret = get_user_pages_unlocked((unsigned long)buf, nr_pages, WRITE,
0, pages);
if (ret < nr_pages) {
nr_pages = ret;

View File

@ -244,9 +244,8 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p)
/* Get the physical addresses of the source buffer */
down_read(&current->mm->mmap_sem);
num_pinned = get_user_pages(current, current->mm,
param.local_vaddr - lb_offset, num_pages,
(param.source == -1) ? READ : WRITE,
num_pinned = get_user_pages(param.local_vaddr - lb_offset,
num_pages, (param.source == -1) ? READ : WRITE,
0, pages, NULL);
up_read(&current->mm->mmap_sem);

View File

@ -199,8 +199,12 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
return NULL;
}
#endif
ret = get_user_pages(current, bprm->mm, pos,
1, write, 1, &page, NULL);
/*
* We are doing an exec(). 'current' is the process
* doing the exec and bprm->mm is the new process's mm.
*/
ret = get_user_pages_remote(current, bprm->mm, pos, 1, write,
1, &page, NULL);
if (ret <= 0)
return NULL;

View File

@ -660,11 +660,20 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_MERGEABLE)] = "mg",
[ilog2(VM_UFFD_MISSING)]= "um",
[ilog2(VM_UFFD_WP)] = "uw",
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
/* These come out via ProtectionKey: */
[ilog2(VM_PKEY_BIT0)] = "",
[ilog2(VM_PKEY_BIT1)] = "",
[ilog2(VM_PKEY_BIT2)] = "",
[ilog2(VM_PKEY_BIT3)] = "",
#endif
};
size_t i;
seq_puts(m, "VmFlags: ");
for (i = 0; i < BITS_PER_LONG; i++) {
if (!mnemonics[i][0])
continue;
if (vma->vm_flags & (1UL << i)) {
seq_printf(m, "%c%c ",
mnemonics[i][0], mnemonics[i][1]);
@ -702,6 +711,10 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
}
#endif /* HUGETLB_PAGE */
/*
 * Weak default hook for architecture-specific smaps output.
 * This fallback emits nothing; being __weak, it can be replaced by a
 * strong definition elsewhere. show_smap() calls it just before
 * show_smap_vma_flags().
 */
void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
{
}
static int show_smap(struct seq_file *m, void *v, int is_pid)
{
struct vm_area_struct *vma = v;
@ -783,6 +796,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
(vma->vm_flags & VM_LOCKED) ?
(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
arch_show_smap(m, vma);
show_smap_vma_flags(m, vma);
m_cache_vma(m, vma);
return 0;

View File

@ -26,4 +26,16 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm,
{
}
/*
 * Generic fallback: no architecture-level restriction on vma access.
 *
 * @vma:     the mapping being accessed (unused here)
 * @write:   true for a write access (unused here)
 * @execute: true for an instruction fetch (unused here)
 * @foreign: true when acting on a non-current mm (unused here)
 *
 * Always returns true, i.e. every access is permitted.
 */
static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
					     bool write, bool execute,
					     bool foreign)
{
	return true;
}
/*
 * Generic fallback: no architecture-level restriction on pte access.
 *
 * @pte:   the page table entry being checked (unused here)
 * @write: true for a write access (unused here)
 *
 * Always returns true, i.e. every access is permitted.
 */
static inline bool arch_pte_access_permitted(pte_t pte, bool write)
{
	return true;
}
#endif /* _ASM_GENERIC_MM_HOOKS_H */

View File

@ -193,8 +193,26 @@ extern unsigned int kobjsize(const void *objp);
#define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */
#define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */
#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
#define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
#if defined(CONFIG_X86)
# define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */
#if defined (CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS)
# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0
# define VM_PKEY_BIT0 VM_HIGH_ARCH_0 /* A protection key is a 4-bit value */
# define VM_PKEY_BIT1 VM_HIGH_ARCH_1
# define VM_PKEY_BIT2 VM_HIGH_ARCH_2
# define VM_PKEY_BIT3 VM_HIGH_ARCH_3
#endif
#elif defined(CONFIG_PPC)
# define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */
#elif defined(CONFIG_PARISC)
@ -256,6 +274,8 @@ extern pgprot_t protection_map[16];
#define FAULT_FLAG_KILLABLE 0x10 /* The fault task is in SIGKILL killable region */
#define FAULT_FLAG_TRIED 0x20 /* Second try */
#define FAULT_FLAG_USER 0x40 /* The fault originated in userspace */
#define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */
#define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */
/*
* vm_fault is filled by the pagefault handler and passed to the vma's
@ -1224,24 +1244,82 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int foll_flags, struct page **pages,
struct vm_area_struct **vmas, int *nonblocking);
long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
struct vm_area_struct **vmas);
long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
int *locked);
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
struct vm_area_struct **vmas);
long get_user_pages6(unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
struct vm_area_struct **vmas);
long get_user_pages_locked6(unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages, int *locked);
long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
unsigned int gup_flags);
long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
long get_user_pages_unlocked5(unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages);
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages);
/* suppress warnings from use in EXPORT_SYMBOL() */
#ifndef __DISABLE_GUP_DEPRECATED
#define __gup_deprecated __deprecated
#else
#define __gup_deprecated
#endif
/*
* These macros provide backward-compatibility with the old
* get_user_pages() variants which took tsk/mm. These
* functions/macros provide compile-time __deprecated warnings so we
* can catch old-style use without breaking the build. The actual
* functions also have WARN_ON()s to let us know at runtime if
* the get_user_pages() should have been the "remote" variant.
*
* These are hideous, but temporary.
*
* If you run into one of these __deprecated warnings, look
* at how you are calling get_user_pages(). If you are calling
* it with current/current->mm as the first two arguments,
* simply remove those arguments. The behavior will be the same
* as it is now. If you are calling it on another task, use
* get_user_pages_remote() instead.
*
* Any questions? Ask Dave Hansen <dave@sr71.net>
*/
long
__gup_deprecated
get_user_pages8(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
struct vm_area_struct **vmas);
/*
 * Dispatch get_user_pages() on its argument count. GUP_MACRO() expands to
 * its ninth argument, so after __VA_ARGS__ is prepended:
 *   8 arguments -> get_user_pages8 (old tsk/mm form)
 *   6 arguments -> get_user_pages6 (new form)
 * Other counts up to eight select the placeholder 'x', which is not a
 * function name here, so such calls presumably fail to compile.
 */
#define GUP_MACRO(_1, _2, _3, _4, _5, _6, _7, _8, get_user_pages, ...) \
get_user_pages
#define get_user_pages(...) GUP_MACRO(__VA_ARGS__, \
get_user_pages8, x, \
get_user_pages6, x, x, x, x, x)(__VA_ARGS__)
__gup_deprecated
long get_user_pages_locked8(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
int *locked);
/*
 * Dispatch get_user_pages_locked() on its argument count. GUPL_MACRO()
 * expands to its ninth argument, so after __VA_ARGS__ is prepended:
 *   8 arguments -> get_user_pages_locked8 (old tsk/mm form)
 *   6 arguments -> get_user_pages_locked6 (new form)
 * The 'x' entries are placeholders that pad the argument list.
 */
#define GUPL_MACRO(_1, _2, _3, _4, _5, _6, _7, _8, get_user_pages_locked, ...) \
get_user_pages_locked
#define get_user_pages_locked(...) GUPL_MACRO(__VA_ARGS__, \
get_user_pages_locked8, x, \
get_user_pages_locked6, x, x, x, x)(__VA_ARGS__)
__gup_deprecated
long get_user_pages_unlocked7(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages);
/*
 * Dispatch get_user_pages_unlocked() on its argument count. GUPU_MACRO()
 * expands to its eighth argument, so after __VA_ARGS__ is prepended:
 *   7 arguments -> get_user_pages_unlocked7 (old tsk/mm form)
 *   5 arguments -> get_user_pages_unlocked5 (new form)
 * The 'x' entries are placeholders that pad the argument list.
 */
#define GUPU_MACRO(_1, _2, _3, _4, _5, _6, _7, get_user_pages_unlocked, ...) \
get_user_pages_unlocked
#define get_user_pages_unlocked(...) GUPU_MACRO(__VA_ARGS__, \
get_user_pages_unlocked7, x, \
get_user_pages_unlocked5, x, x, x, x)(__VA_ARGS__)
/* Container for pinned pfns / pages */
struct frame_vector {
unsigned int nr_allocated; /* Number of frames we have space for */
@ -2169,6 +2247,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
#define FOLL_MLOCK 0x1000 /* lock present pages */
#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */
typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
void *data);

View File

@ -35,7 +35,7 @@ static inline void vm_unacct_memory(long pages)
*/
#ifndef arch_calc_vm_prot_bits
#define arch_calc_vm_prot_bits(prot) 0
#define arch_calc_vm_prot_bits(prot, pkey) 0
#endif
#ifndef arch_vm_get_page_prot
@ -70,12 +70,12 @@ static inline int arch_validate_prot(unsigned long prot)
* Combine the mmap "prot" argument into "vm_flags" used internally.
*/
static inline unsigned long
calc_vm_prot_bits(unsigned long prot)
calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
{
return _calc_vm_trans(prot, PROT_READ, VM_READ ) |
_calc_vm_trans(prot, PROT_WRITE, VM_WRITE) |
_calc_vm_trans(prot, PROT_EXEC, VM_EXEC) |
arch_calc_vm_prot_bits(prot);
arch_calc_vm_prot_bits(prot, pkey);
}
/*

33
include/linux/pkeys.h Normal file
View File

@ -0,0 +1,33 @@
#ifndef _LINUX_PKEYS_H
#define _LINUX_PKEYS_H
#include <linux/mm_types.h>
#include <asm/mmu_context.h>
#define PKEY_DISABLE_ACCESS 0x1
#define PKEY_DISABLE_WRITE 0x2
#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
PKEY_DISABLE_WRITE)
#ifdef CONFIG_ARCH_HAS_PKEYS
#include <asm/pkeys.h>
#else /* ! CONFIG_ARCH_HAS_PKEYS */
#define arch_max_pkey() (1)
#define execute_only_pkey(mm) (0)
#define arch_override_mprotect_pkey(vma, prot, pkey) (0)
#define PKEY_DEDICATED_EXECUTE_ONLY 0
#endif /* ! CONFIG_ARCH_HAS_PKEYS */
/*
 * Check that a protection key number is usable.
 *
 * This is called from mprotect_pkey().
 *
 * Returns true if @pkey is non-negative and below arch_max_pkey(),
 * false otherwise.
 */
static inline bool validate_pkey(int pkey)
{
	return pkey >= 0 && pkey < arch_max_pkey();
}
#endif /* _LINUX_PKEYS_H */

View File

@ -91,10 +91,15 @@ typedef struct siginfo {
int _trapno; /* TRAP # which caused the signal */
#endif
short _addr_lsb; /* LSB of the reported address */
struct {
void __user *_lower;
void __user *_upper;
} _addr_bnd;
union {
/* used when si_code=SEGV_BNDERR */
struct {
void __user *_lower;
void __user *_upper;
} _addr_bnd;
/* used when si_code=SEGV_PKUERR */
__u32 _pkey;
};
} _sigfault;
/* SIGPOLL */
@ -137,6 +142,7 @@ typedef struct siginfo {
#define si_addr_lsb _sifields._sigfault._addr_lsb
#define si_lower _sifields._sigfault._addr_bnd._lower
#define si_upper _sifields._sigfault._addr_bnd._upper
#define si_pkey _sifields._sigfault._pkey
#define si_band _sifields._sigpoll._band
#define si_fd _sifields._sigpoll._fd
#ifdef __ARCH_SIGSYS
@ -206,7 +212,8 @@ typedef struct siginfo {
#define SEGV_MAPERR (__SI_FAULT|1) /* address not mapped to object */
#define SEGV_ACCERR (__SI_FAULT|2) /* invalid permissions for mapped object */
#define SEGV_BNDERR (__SI_FAULT|3) /* failed address bound checks */
#define NSIGSEGV 3
#define SEGV_PKUERR (__SI_FAULT|4) /* failed protection key checks */
#define NSIGSEGV 4
/*
* SIGBUS si_codes

View File

@ -299,7 +299,7 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
retry:
/* Read the page with vaddr into memory */
ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
if (ret <= 0)
return ret;
@ -1701,7 +1701,13 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
if (likely(result == 0))
goto out;
result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
/*
* The NULL 'tsk' here ensures that any faults that occur here
* will not be accounted to the task. 'mm' *is* current->mm,
* but we treat this as a 'remote' access since it is
* essentially a kernel access to the memory.
*/
result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
if (result < 0)
return result;

View File

@ -2708,6 +2708,10 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
err |= __put_user(from->si_lower, &to->si_lower);
err |= __put_user(from->si_upper, &to->si_upper);
}
#endif
#ifdef SEGV_PKUERR
if (from->si_signo == SIGSEGV && from->si_code == SEGV_PKUERR)
err |= __put_user(from->si_pkey, &to->si_pkey);
#endif
break;
case __SI_CHLD:

View File

@ -667,3 +667,8 @@ config ZONE_DEVICE
config FRAME_VECTOR
bool
config ARCH_USES_HIGH_VMA_FLAGS
bool
config ARCH_HAS_PKEYS
bool

View File

@ -58,7 +58,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
vec->got_ref = true;
vec->is_pfns = false;
ret = get_user_pages_locked(current, mm, start, nr_frames,
ret = get_user_pages_locked(start, nr_frames,
write, force, (struct page **)(vec->ptrs), &locked);
goto out;
}

127
mm/gup.c
View File

@ -1,3 +1,4 @@
#define __DISABLE_GUP_DEPRECATED 1
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/err.h>
@ -14,6 +15,7 @@
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@ -363,6 +365,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
return -ENOENT;
if (*flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
if (*flags & FOLL_REMOTE)
fault_flags |= FAULT_FLAG_REMOTE;
if (nonblocking)
fault_flags |= FAULT_FLAG_ALLOW_RETRY;
if (*flags & FOLL_NOWAIT)
@ -413,11 +417,13 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
vm_flags_t vm_flags = vma->vm_flags;
int write = (gup_flags & FOLL_WRITE);
int foreign = (gup_flags & FOLL_REMOTE);
if (vm_flags & (VM_IO | VM_PFNMAP))
return -EFAULT;
if (gup_flags & FOLL_WRITE) {
if (write) {
if (!(vm_flags & VM_WRITE)) {
if (!(gup_flags & FOLL_FORCE))
return -EFAULT;
@ -443,6 +449,12 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if (!(vm_flags & VM_MAYREAD))
return -EFAULT;
}
/*
* gups are always data accesses, not instruction
* fetches, so execute=false here
*/
if (!arch_vma_access_permitted(vma, write, false, foreign))
return -EFAULT;
return 0;
}
@ -609,6 +621,28 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
}
EXPORT_SYMBOL(__get_user_pages);
/*
 * Decide whether a fault described by @fault_flags may be handled on @vma.
 *
 * @vma:         the mapping the fault would be resolved against
 * @fault_flags: FAULT_FLAG_* bits; only FAULT_FLAG_WRITE and
 *               FAULT_FLAG_REMOTE are examined here
 *
 * Returns false if the vma lacks VM_WRITE (for a write fault) or VM_READ
 * (otherwise), or if the architecture hook refuses the access; true if
 * both checks pass.
 */
bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags)
{
	const bool is_write = (fault_flags & FAULT_FLAG_WRITE) != 0;
	const bool is_remote = (fault_flags & FAULT_FLAG_REMOTE) != 0;
	const vm_flags_t needed = is_write ? VM_WRITE : VM_READ;

	/* The vma itself must grant the basic read or write permission. */
	if ((vma->vm_flags & needed) == 0)
		return false;

	/*
	 * The architecture might have a hardware protection
	 * mechanism other than read/write that can deny access.
	 *
	 * gup always represents data access, not instruction
	 * fetches, so execute=false here:
	 */
	return arch_vma_access_permitted(vma, is_write, false, is_remote);
}
/*
* fixup_user_fault() - manually resolve a user page fault
* @tsk: the task_struct to use for page fault accounting, or
@ -644,7 +678,6 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
bool *unlocked)
{
struct vm_area_struct *vma;
vm_flags_t vm_flags;
int ret, major = 0;
if (unlocked)
@ -655,8 +688,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
if (!vma || address < vma->vm_start)
return -EFAULT;
vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
if (!(vm_flags & vma->vm_flags))
if (!vma_permits_fault(vma, fault_flags))
return -EFAULT;
ret = handle_mm_fault(mm, vma, address, fault_flags);
@ -807,15 +839,15 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
* if (locked)
* up_read(&mm->mmap_sem);
*/
long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
long get_user_pages_locked6(unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
int *locked)
{
return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
pages, NULL, locked, true, FOLL_TOUCH);
return __get_user_pages_locked(current, current->mm, start, nr_pages,
write, force, pages, NULL, locked, true,
FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages_locked);
EXPORT_SYMBOL(get_user_pages_locked6);
/*
* Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to
@ -860,17 +892,16 @@ EXPORT_SYMBOL(__get_user_pages_unlocked);
* or if "force" shall be set to 1 (get_user_pages_fast misses the
* "force" parameter).
*/
long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
long get_user_pages_unlocked5(unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages)
{
return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
force, pages, FOLL_TOUCH);
return __get_user_pages_unlocked(current, current->mm, start, nr_pages,
write, force, pages, FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages_unlocked);
EXPORT_SYMBOL(get_user_pages_unlocked5);
/*
* get_user_pages() - pin user pages in memory
* get_user_pages_remote() - pin user pages in memory
* @tsk: the task_struct to use for page fault accounting, or
* NULL if faults are not to be recorded.
* @mm: mm_struct of target mm
@ -924,14 +955,32 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
* should use get_user_pages because it cannot pass
* FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
*/
long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages, int write,
int force, struct page **pages, struct vm_area_struct **vmas)
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
struct vm_area_struct **vmas)
{
return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
pages, vmas, NULL, false, FOLL_TOUCH);
pages, vmas, NULL, false,
FOLL_TOUCH | FOLL_REMOTE);
}
EXPORT_SYMBOL(get_user_pages);
EXPORT_SYMBOL(get_user_pages_remote);
/*
 * This is the same as get_user_pages_remote(), just with a
 * less-flexible calling convention where we assume that the task
 * and mm being operated on are the current task's. We also
 * obviously don't pass FOLL_REMOTE in here.
 *
 * The caller-supplied arguments are forwarded unchanged to
 * __get_user_pages_locked() together with current, current->mm and
 * FOLL_TOUCH; the return value is passed straight back.
 */
long get_user_pages6(unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
struct vm_area_struct **vmas)
{
return __get_user_pages_locked(current, current->mm, start, nr_pages,
write, force, pages, vmas, NULL, false,
FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages6);
/**
* populate_vma_page_range() - populate a range of pages in the vma.
@ -1144,6 +1193,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
pte_protnone(pte) || (write && !pte_write(pte)))
goto pte_unmap;
if (!arch_pte_access_permitted(pte, write))
goto pte_unmap;
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
head = compound_head(page);
@ -1467,3 +1519,38 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
}
#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */
/*
 * Backward-compatible 8-argument form of get_user_pages() for callers
 * that still pass tsk/mm. Warns once if @tsk is not current or @mm is
 * not current->mm, then forwards the remaining arguments to
 * get_user_pages6(). @tsk and @mm are not passed on, so the lookup
 * always acts on the current task's mm.
 */
long get_user_pages8(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
struct vm_area_struct **vmas)
{
WARN_ONCE(tsk != current, "get_user_pages() called on remote task");
WARN_ONCE(mm != current->mm, "get_user_pages() called on remote mm");
return get_user_pages6(start, nr_pages, write, force, pages, vmas);
}
EXPORT_SYMBOL(get_user_pages8);
/*
 * Backward-compatible 8-argument form of get_user_pages_locked() for
 * callers that still pass tsk/mm. Warns once if @tsk is not current or
 * @mm is not current->mm, then forwards the remaining arguments to
 * get_user_pages_locked6(). @tsk and @mm are not passed on.
 */
long get_user_pages_locked8(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages, int *locked)
{
WARN_ONCE(tsk != current, "get_user_pages_locked() called on remote task");
WARN_ONCE(mm != current->mm, "get_user_pages_locked() called on remote mm");
return get_user_pages_locked6(start, nr_pages, write, force, pages, locked);
}
EXPORT_SYMBOL(get_user_pages_locked8);
/*
 * Backward-compatible 7-argument form of get_user_pages_unlocked() for
 * callers that still pass tsk/mm. Warns once if @tsk is not current or
 * @mm is not current->mm, then forwards the remaining arguments to
 * get_user_pages_unlocked5(). @tsk and @mm are not passed on.
 */
long get_user_pages_unlocked7(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages)
{
WARN_ONCE(tsk != current, "get_user_pages_unlocked() called on remote task");
WARN_ONCE(mm != current->mm, "get_user_pages_unlocked() called on remote mm");
return get_user_pages_unlocked5(start, nr_pages, write, force, pages);
}
EXPORT_SYMBOL(get_user_pages_unlocked7);

View File

@ -352,13 +352,17 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
/*
* We use break_ksm to break COW on a ksm page: it's a stripped down
*
* if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
* if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
* put_page(page);
*
* but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
* in case the application has unmapped and remapped mm,addr meanwhile.
* Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
* mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
*
* FAULT_FLAG/FOLL_REMOTE are used because we do this outside the context
* of the process that owns 'vma'. We also do not want to enforce
* protection keys here anyway.
*/
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
@ -367,12 +371,14 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
do {
cond_resched();
page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);
page = follow_page(vma, addr,
FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
if (IS_ERR_OR_NULL(page))
break;
if (PageKsm(page))
ret = handle_mm_fault(vma->vm_mm, vma, addr,
FAULT_FLAG_WRITE);
FAULT_FLAG_WRITE |
FAULT_FLAG_REMOTE);
else
ret = VM_FAULT_WRITE;
put_page(page);

View File

@ -65,6 +65,7 @@
#include <linux/userfaultfd_k.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
@ -3375,6 +3376,11 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pmd_t *pmd;
pte_t *pte;
if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
flags & FAULT_FLAG_INSTRUCTION,
flags & FAULT_FLAG_REMOTE))
return VM_FAULT_SIGSEGV;
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
@ -3691,7 +3697,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
void *maddr;
struct page *page = NULL;
ret = get_user_pages(tsk, mm, addr, 1,
ret = get_user_pages_remote(tsk, mm, addr, 1,
write, 1, &page, &vma);
if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT

View File

@ -846,12 +846,12 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
}
}
static int lookup_node(struct mm_struct *mm, unsigned long addr)
static int lookup_node(unsigned long addr)
{
struct page *p;
int err;
err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
err = get_user_pages(addr & PAGE_MASK, 1, 0, 0, &p, NULL);
if (err >= 0) {
err = page_to_nid(p);
put_page(p);
@ -906,7 +906,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
if (flags & MPOL_F_NODE) {
if (flags & MPOL_F_ADDR) {
err = lookup_node(mm, addr);
err = lookup_node(addr);
if (err < 0)
goto out;
*policy = err;

View File

@ -42,6 +42,7 @@
#include <linux/printk.h>
#include <linux/userfaultfd_k.h>
#include <linux/moduleparam.h>
#include <linux/pkeys.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@ -1145,6 +1146,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long pgoff, unsigned long *populate)
{
struct mm_struct *mm = current->mm;
int pkey = 0;
*populate = 0;
@ -1184,11 +1186,17 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (offset_in_page(addr))
return addr;
if (prot == PROT_EXEC) {
pkey = execute_only_pkey(mm);
if (pkey < 0)
pkey = 0;
}
/* Do simple checking here so the lower-level routines won't have
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_LOCKED)

View File

@ -24,6 +24,7 @@
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/ksm.h>
#include <linux/pkeys.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
@ -354,7 +355,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
unsigned long, prot)
{
unsigned long vm_flags, nstart, end, tmp, reqprot;
unsigned long nstart, end, tmp, reqprot;
struct vm_area_struct *vma, *prev;
int error = -EINVAL;
const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
@ -380,8 +381,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
prot |= PROT_EXEC;
vm_flags = calc_vm_prot_bits(prot);
down_write(&current->mm->mmap_sem);
vma = find_vma(current->mm, start);
@ -411,10 +410,11 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
for (nstart = start ; ; ) {
unsigned long newflags;
int pkey = arch_override_mprotect_pkey(vma, prot, -1);
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
newflags = vm_flags;
newflags = calc_vm_prot_bits(prot, pkey);
newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
/* newflags >> 4 shift VM_MAY% in place of VM_% */

View File

@ -15,6 +15,8 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#define __DISABLE_GUP_DEPRECATED
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
@ -159,8 +161,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
* slab page or a secondary page from a compound page
* - don't permit access to VMAs that don't support it, such as I/O mappings
*/
long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
long get_user_pages6(unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
struct vm_area_struct **vmas)
{
@ -171,20 +172,18 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (force)
flags |= FOLL_FORCE;
return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
NULL);
return __get_user_pages(current, current->mm, start, nr_pages, flags,
pages, vmas, NULL);
}
EXPORT_SYMBOL(get_user_pages);
EXPORT_SYMBOL(get_user_pages6);
long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
int *locked)
long get_user_pages_locked6(unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
int *locked)
{
return get_user_pages(tsk, mm, start, nr_pages, write, force,
pages, NULL);
return get_user_pages6(start, nr_pages, write, force, pages, NULL);
}
EXPORT_SYMBOL(get_user_pages_locked);
EXPORT_SYMBOL(get_user_pages_locked6);
long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
@ -193,21 +192,20 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
{
long ret;
down_read(&mm->mmap_sem);
ret = get_user_pages(tsk, mm, start, nr_pages, write, force,
pages, NULL);
ret = __get_user_pages(tsk, mm, start, nr_pages, gup_flags, pages,
NULL, NULL);
up_read(&mm->mmap_sem);
return ret;
}
EXPORT_SYMBOL(__get_user_pages_unlocked);
long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
long get_user_pages_unlocked5(unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages)
{
return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
force, pages, 0);
return __get_user_pages_unlocked(current, current->mm, start, nr_pages,
write, force, pages, 0);
}
EXPORT_SYMBOL(get_user_pages_unlocked);
EXPORT_SYMBOL(get_user_pages_unlocked5);
/**
* follow_pfn - look up PFN at a user virtual address
@ -1061,7 +1059,7 @@ static unsigned long determine_vm_flags(struct file *file,
{
unsigned long vm_flags;
vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags);
/* vm_flags |= mm->def_flags; */
if (!(capabilities & NOMMU_MAP_DIRECT)) {
@ -1991,3 +1989,31 @@ static int __meminit init_admin_reserve(void)
return 0;
}
subsys_initcall(init_admin_reserve);
/*
 * Backward-compatible 8-argument form of get_user_pages() for callers
 * that still pass tsk/mm. @tsk and @mm are ignored; the remaining
 * arguments are forwarded to get_user_pages6().
 *
 * NOTE(review): unlike the mm/gup.c variant, no WARN_ONCE() is issued
 * here when tsk/mm are not current/current->mm, so a remote caller is
 * silently redirected to the current mm. Consider adding the same
 * warnings for consistency.
 */
long get_user_pages8(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
struct vm_area_struct **vmas)
{
return get_user_pages6(start, nr_pages, write, force, pages, vmas);
}
EXPORT_SYMBOL(get_user_pages8);
/*
 * Backward-compatible 8-argument form of get_user_pages_locked() for
 * callers that still pass tsk/mm. @tsk and @mm are ignored; the
 * remaining arguments are forwarded to get_user_pages_locked6().
 *
 * NOTE(review): unlike the mm/gup.c variant, no WARN_ONCE() is issued
 * here when tsk/mm are not current/current->mm. Consider adding the
 * same warnings for consistency.
 */
long get_user_pages_locked8(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
int *locked)
{
return get_user_pages_locked6(start, nr_pages, write,
force, pages, locked);
}
EXPORT_SYMBOL(get_user_pages_locked8);
/*
 * Backward-compatible 7-argument form of get_user_pages_unlocked() for
 * callers that still pass tsk/mm. @tsk and @mm are ignored; the
 * remaining arguments are forwarded to get_user_pages_unlocked5().
 *
 * NOTE(review): unlike the mm/gup.c variant, no WARN_ONCE() is issued
 * here when tsk/mm are not current/current->mm. Consider adding the
 * same warnings for consistency.
 */
long get_user_pages_unlocked7(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages)
{
return get_user_pages_unlocked5(start, nr_pages, write, force, pages);
}
EXPORT_SYMBOL(get_user_pages_unlocked7);

View File

@ -98,9 +98,14 @@ static int process_vm_rw_single_vec(unsigned long addr,
int pages = min(nr_pages, max_pages_per_loop);
size_t bytes;
/* Get the pages we're interested in */
pages = get_user_pages_unlocked(task, mm, pa, pages,
vm_write, 0, process_pages);
/*
* Get the pages we're interested in. We must
* add FOLL_REMOTE because task/mm might not be
* current/current->mm
*/
pages = __get_user_pages_unlocked(task, mm, pa, pages,
vm_write, 0, process_pages,
FOLL_REMOTE);
if (pages <= 0)
return -EFAULT;

View File

@ -283,9 +283,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
int __weak get_user_pages_fast(unsigned long start,
int nr_pages, int write, struct page **pages)
{
struct mm_struct *mm = current->mm;
return get_user_pages_unlocked(current, mm, start, nr_pages,
write, 0, pages);
return get_user_pages_unlocked(start, nr_pages, write, 0, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);

View File

@ -24,7 +24,7 @@ struct page **ceph_get_direct_page_vector(const void __user *data,
return ERR_PTR(-ENOMEM);
while (got < num_pages) {
rc = get_user_pages_unlocked(current, current->mm,
rc = get_user_pages_unlocked(
(unsigned long)data + ((unsigned long)got * PAGE_SIZE),
num_pages - got, write_page, 0, pages + got);
if (rc < 0)

View File

@ -874,7 +874,14 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos,
}
/* Same with get_arg_page(bprm, pos, 0) in fs/exec.c */
#ifdef CONFIG_MMU
if (get_user_pages(current, bprm->mm, pos, 1, 0, 1, &page, NULL) <= 0)
/*
* This is called at execve() time in order to dig around
* in the argv/environment of the new process
* (represented by bprm). 'current' is the process doing
* the execve().
*/
if (get_user_pages_remote(current, bprm->mm, pos, 1,
0, 1, &page, NULL) <= 0)
return false;
#else
page = bprm->page[pos / PAGE_SIZE];

View File

@ -79,7 +79,13 @@ static void async_pf_execute(struct work_struct *work)
might_sleep();
get_user_pages_unlocked(NULL, mm, addr, 1, 1, 0, NULL);
/*
* This work is run asynchronously to the task which owns
* mm and might be done in another context, so we must
* use FOLL_REMOTE.
*/
__get_user_pages_unlocked(NULL, mm, addr, 1, 1, 0, NULL, FOLL_REMOTE);
kvm_async_page_present_sync(vcpu, apf);
spin_lock(&vcpu->async_pf.lock);

View File

@ -1260,15 +1260,16 @@ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *w
return gfn_to_hva_memslot_prot(slot, gfn, writable);
}
static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int write, struct page **page)
static int get_user_page_nowait(unsigned long start, int write,
struct page **page)
{
int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET;
if (write)
flags |= FOLL_WRITE;
return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
return __get_user_pages(current, current->mm, start, 1, flags, page,
NULL, NULL);
}
static inline int check_user_page_hwpoison(unsigned long addr)
@ -1330,8 +1331,7 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
if (async) {
down_read(&current->mm->mmap_sem);
npages = get_user_page_nowait(current, current->mm,
addr, write_fault, page);
npages = get_user_page_nowait(addr, write_fault, page);
up_read(&current->mm->mmap_sem);
} else
npages = __get_user_pages_unlocked(current, current->mm, addr, 1,