2017-11-24 22:00:32 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2008-12-25 20:38:36 +08:00
|
|
|
/*
|
|
|
|
* vdso setup for s390
|
|
|
|
*
|
|
|
|
* Copyright IBM Corp. 2008
|
|
|
|
* Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com)
|
|
|
|
*/
|
|
|
|
|
2017-02-10 04:20:23 +08:00
|
|
|
#include <linux/init.h>
|
2008-12-25 20:38:36 +08:00
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/sched.h>
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/smp.h>
|
|
|
|
#include <linux/stddef.h>
|
|
|
|
#include <linux/unistd.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/user.h>
|
|
|
|
#include <linux/elf.h>
|
|
|
|
#include <linux/security.h>
|
2018-10-31 06:09:49 +08:00
|
|
|
#include <linux/memblock.h>
|
2009-06-12 16:26:25 +08:00
|
|
|
#include <linux/compat.h>
|
2010-02-27 05:37:43 +08:00
|
|
|
#include <asm/asm-offsets.h>
|
2008-12-25 20:38:36 +08:00
|
|
|
#include <asm/processor.h>
|
|
|
|
#include <asm/mmu.h>
|
|
|
|
#include <asm/mmu_context.h>
|
|
|
|
#include <asm/sections.h>
|
|
|
|
#include <asm/vdso.h>
|
2012-03-29 01:30:02 +08:00
|
|
|
#include <asm/facility.h>
|
2008-12-25 20:38:36 +08:00
|
|
|
|
|
|
|
extern char vdso64_start, vdso64_end;
|
|
|
|
static void *vdso64_kbase = &vdso64_start;
|
|
|
|
static unsigned int vdso64_pages;
|
|
|
|
static struct page **vdso64_pagelist;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Should the kernel map a VDSO page into processes and pass its
|
|
|
|
* address down to glibc upon exec()?
|
|
|
|
*/
|
|
|
|
unsigned int __read_mostly vdso_enabled = 1;
|
|
|
|
|
2018-07-22 21:41:09 +08:00
|
|
|
static vm_fault_t vdso_fault(const struct vm_special_mapping *sm,
|
2017-05-15 16:23:38 +08:00
|
|
|
struct vm_area_struct *vma, struct vm_fault *vmf)
|
|
|
|
{
|
|
|
|
struct page **vdso_pagelist;
|
|
|
|
unsigned long vdso_pages;
|
|
|
|
|
|
|
|
vdso_pagelist = vdso64_pagelist;
|
|
|
|
vdso_pages = vdso64_pages;
|
|
|
|
|
|
|
|
if (vmf->pgoff >= vdso_pages)
|
|
|
|
return VM_FAULT_SIGBUS;
|
|
|
|
|
|
|
|
vmf->page = vdso_pagelist[vmf->pgoff];
|
|
|
|
get_page(vmf->page);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int vdso_mremap(const struct vm_special_mapping *sm,
|
|
|
|
struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
unsigned long vdso_pages;
|
|
|
|
|
|
|
|
vdso_pages = vdso64_pages;
|
|
|
|
|
|
|
|
if ((vdso_pages << PAGE_SHIFT) != vma->vm_end - vma->vm_start)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (WARN_ON_ONCE(current->mm != vma->vm_mm))
|
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
current->mm->context.vdso_base = vma->vm_start;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct vm_special_mapping vdso_mapping = {
|
|
|
|
.name = "[vdso]",
|
|
|
|
.fault = vdso_fault,
|
|
|
|
.mremap = vdso_mremap,
|
|
|
|
};
|
|
|
|
|
2019-08-19 23:41:17 +08:00
|
|
|
/*
 * Early parameter handler: if the given string is accepted by
 * kstrtobool(), the parsed value overrides vdso_enabled; otherwise the
 * current setting is left alone. Always returns 1.
 */
static int __init vdso_setup(char *str)
{
	bool enabled;
	int err;

	err = kstrtobool(str, &enabled);
	if (err == 0)
		vdso_enabled = enabled;

	return 1;
}
|
|
|
|
__setup("vdso=", vdso_setup);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The vdso data page
|
|
|
|
*/
|
|
|
|
static union {
|
|
|
|
struct vdso_data data;
|
|
|
|
u8 page[PAGE_SIZE];
|
2009-09-21 06:14:15 +08:00
|
|
|
} vdso_data_store __page_aligned_data;
|
2008-12-25 20:38:36 +08:00
|
|
|
struct vdso_data *vdso_data = &vdso_data_store.data;
|
|
|
|
|
2008-12-31 22:11:42 +08:00
|
|
|
/*
|
|
|
|
* Setup vdso data page.
|
|
|
|
*/
|
2016-01-05 20:29:38 +08:00
|
|
|
static void __init vdso_init_data(struct vdso_data *vd)
|
2008-12-31 22:11:42 +08:00
|
|
|
{
|
2013-09-24 15:14:56 +08:00
|
|
|
vd->ectg_available = test_facility(31);
|
2008-12-31 22:11:42 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Allocate/free per cpu vdso data.
|
|
|
|
*/
|
|
|
|
#define SEGMENT_ORDER 2
|
|
|
|
|
2017-10-13 18:59:22 +08:00
|
|
|
/*
|
|
|
|
* The initial vdso_data structure for the boot CPU. Eventually
|
|
|
|
* it is replaced with a properly allocated structure in vdso_init.
|
|
|
|
* This is necessary because a valid S390_lowcore.vdso_per_cpu_data
|
|
|
|
* pointer is required to be able to return from an interrupt or
|
|
|
|
* program check. See the exit paths in entry.S.
|
|
|
|
*/
|
|
|
|
struct vdso_data boot_vdso_data __initdata;
|
|
|
|
|
|
|
|
/*
 * Point the vdso_per_cpu_data field of the given lowcore at the static
 * boot_vdso_data structure.
 */
void __init vdso_alloc_boot_cpu(struct lowcore *lowcore)
{
	struct vdso_data *boot_data = &boot_vdso_data;

	lowcore->vdso_per_cpu_data = (unsigned long) boot_data;
}
|
|
|
|
|
2015-12-31 17:29:00 +08:00
|
|
|
int vdso_alloc_per_cpu(struct lowcore *lowcore)
|
2008-12-31 22:11:42 +08:00
|
|
|
{
|
|
|
|
unsigned long segment_table, page_table, page_frame;
|
2016-01-05 20:29:38 +08:00
|
|
|
struct vdso_per_cpu_data *vd;
|
2008-12-31 22:11:42 +08:00
|
|
|
|
|
|
|
segment_table = __get_free_pages(GFP_KERNEL, SEGMENT_ORDER);
|
s390: remove all code using the access register mode
The vdso code for the getcpu() and the clock_gettime() call use the access
register mode to access the per-CPU vdso data page with the current code.
An alternative to the complicated AR mode is to use the secondary space
mode. This makes the vdso faster and quite a bit simpler. The downside is
that the uaccess code has to be changed quite a bit.
Which instructions are used depends on the machine and what kind of uaccess
operation is requested. The instruction dictates which ASCE value needs
to be loaded into %cr1 and %cr7.
The different cases:
* User copy with MVCOS for z10 and newer machines
The MVCOS instruction can copy between the primary space (aka user) and
the home space (aka kernel) directly. For set_fs(KERNEL_DS) the kernel
ASCE is loaded into %cr1. For set_fs(USER_DS) the user space is already
loaded in %cr1.
* User copy with MVCP/MVCS for older machines
To be able to execute the MVCP/MVCS instructions the kernel needs to
switch to primary mode. The control register %cr1 has to be set to the
kernel ASCE and %cr7 to either the kernel ASCE or the user ASCE dependent
on set_fs(KERNEL_DS) vs set_fs(USER_DS).
* Data access in the user address space for strnlen / futex
To use "normal" instruction with data from the user address space the
secondary space mode is used. The kernel needs to switch to primary mode,
%cr1 has to contain the kernel ASCE and %cr7 either the user ASCE or the
kernel ASCE, dependent on set_fs.
To load a new value into %cr1 or %cr7 is an expensive operation, the kernel
tries to be lazy about it. E.g. for multiple user copies in a row with
MVCP/MVCS the replacement of the vdso ASCE in %cr7 with the user ASCE is
done only once. On return to user space a CPU bit is checked that loads the
vdso ASCE again.
To enable and disable the data access via the secondary space two new
functions are added, enable_sacf_uaccess and disable_sacf_uaccess. The fact
that a context is in secondary space uaccess mode is stored in the
mm_segment_t value for the task. The code of an interrupt may use set_fs
as long as it returns to the previous state it got with get_fs with another
call to set_fs. The code in finish_arch_post_lock_switch simply has to do a
set_fs with the current mm_segment_t value for the task.
For CPUs with MVCOS:
CPU running in | %cr1 ASCE | %cr7 ASCE |
--------------------------------------|-----------|-----------|
user space | user | vdso |
kernel, USER_DS, normal-mode | user | vdso |
kernel, USER_DS, normal-mode, lazy | user | user |
kernel, USER_DS, sacf-mode | kernel | user |
kernel, KERNEL_DS, normal-mode | kernel | vdso |
kernel, KERNEL_DS, normal-mode, lazy | kernel | kernel |
kernel, KERNEL_DS, sacf-mode | kernel | kernel |
For CPUs without MVCOS:
CPU running in | %cr1 ASCE | %cr7 ASCE |
--------------------------------------|-----------|-----------|
user space | user | vdso |
kernel, USER_DS, normal-mode | user | vdso |
kernel, USER_DS, normal-mode lazy | kernel | user |
kernel, USER_DS, sacf-mode | kernel | user |
kernel, KERNEL_DS, normal-mode | kernel | vdso |
kernel, KERNEL_DS, normal-mode, lazy | kernel | kernel |
kernel, KERNEL_DS, sacf-mode | kernel | kernel |
The lines with "lazy" refer to the state after a copy via the secondary
space with a delayed reload of %cr1 and %cr7.
There are three hardware address spaces that can cause a DAT exception,
primary, secondary and home space. The exception can be related to
four different fault types: user space fault, vdso fault, kernel fault,
and the gmap faults.
Dependent on the set_fs state and normal vs. sacf mode there are a number
of fault combinations:
1) user address space fault via the primary ASCE
2) gmap address space fault via the primary ASCE
3) kernel address space fault via the primary ASCE for machines with
MVCOS and set_fs(KERNEL_DS)
4) vdso address space faults via the secondary ASCE with an invalid
address while running in secondary space in problem state
5) user address space fault via the secondary ASCE for user-copy
based on the secondary space mode, e.g. futex_ops or strnlen_user
6) kernel address space fault via the secondary ASCE for user-copy
with secondary space mode with set_fs(KERNEL_DS)
7) kernel address space fault via the primary ASCE for user-copy
with secondary space mode with set_fs(USER_DS) on machines without
MVCOS.
8) kernel address space fault via the home space ASCE
Replace user_space_fault() with a new function get_fault_type() that
can distinguish all four different fault types.
With these changes the futex atomic ops from the kernel and the
strnlen_user will get a little bit slower, as well as the old style
uaccess with MVCP/MVCS. All user accesses based on MVCOS will be as
fast as before. On the positive side, the user space vdso code is a
lot faster and Linux ceases to use the complicated AR mode.
Reviewed-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
2017-08-22 18:08:22 +08:00
|
|
|
page_table = get_zeroed_page(GFP_KERNEL);
|
2008-12-31 22:11:42 +08:00
|
|
|
page_frame = get_zeroed_page(GFP_KERNEL);
|
|
|
|
if (!segment_table || !page_table || !page_frame)
|
|
|
|
goto out;
|
2016-06-14 18:56:01 +08:00
|
|
|
arch_set_page_dat(virt_to_page(segment_table), SEGMENT_ORDER);
|
|
|
|
arch_set_page_dat(virt_to_page(page_table), 0);
|
2008-12-31 22:11:42 +08:00
|
|
|
|
2016-01-05 20:29:38 +08:00
|
|
|
/* Initialize per-cpu vdso data page */
|
|
|
|
vd = (struct vdso_per_cpu_data *) page_frame;
|
|
|
|
vd->cpu_nr = lowcore->cpu_nr;
|
|
|
|
vd->node_id = cpu_to_node(vd->cpu_nr);
|
|
|
|
|
s390: remove all code using the access register mode
The vdso code for the getcpu() and the clock_gettime() call use the access
register mode to access the per-CPU vdso data page with the current code.
An alternative to the complicated AR mode is to use the secondary space
mode. This makes the vdso faster and quite a bit simpler. The downside is
that the uaccess code has to be changed quite a bit.
Which instructions are used depends on the machine and what kind of uaccess
operation is requested. The instruction dictates which ASCE value needs
to be loaded into %cr1 and %cr7.
The different cases:
* User copy with MVCOS for z10 and newer machines
The MVCOS instruction can copy between the primary space (aka user) and
the home space (aka kernel) directly. For set_fs(KERNEL_DS) the kernel
ASCE is loaded into %cr1. For set_fs(USER_DS) the user space is already
loaded in %cr1.
* User copy with MVCP/MVCS for older machines
To be able to execute the MVCP/MVCS instructions the kernel needs to
switch to primary mode. The control register %cr1 has to be set to the
kernel ASCE and %cr7 to either the kernel ASCE or the user ASCE dependent
on set_fs(KERNEL_DS) vs set_fs(USER_DS).
* Data access in the user address space for strnlen / futex
To use "normal" instruction with data from the user address space the
secondary space mode is used. The kernel needs to switch to primary mode,
%cr1 has to contain the kernel ASCE and %cr7 either the user ASCE or the
kernel ASCE, dependent on set_fs.
To load a new value into %cr1 or %cr7 is an expensive operation, the kernel
tries to be lazy about it. E.g. for multiple user copies in a row with
MVCP/MVCS the replacement of the vdso ASCE in %cr7 with the user ASCE is
done only once. On return to user space a CPU bit is checked that loads the
vdso ASCE again.
To enable and disable the data access via the secondary space two new
functions are added, enable_sacf_uaccess and disable_sacf_uaccess. The fact
that a context is in secondary space uaccess mode is stored in the
mm_segment_t value for the task. The code of an interrupt may use set_fs
as long as it returns to the previous state it got with get_fs with another
call to set_fs. The code in finish_arch_post_lock_switch simply has to do a
set_fs with the current mm_segment_t value for the task.
For CPUs with MVCOS:
CPU running in | %cr1 ASCE | %cr7 ASCE |
--------------------------------------|-----------|-----------|
user space | user | vdso |
kernel, USER_DS, normal-mode | user | vdso |
kernel, USER_DS, normal-mode, lazy | user | user |
kernel, USER_DS, sacf-mode | kernel | user |
kernel, KERNEL_DS, normal-mode | kernel | vdso |
kernel, KERNEL_DS, normal-mode, lazy | kernel | kernel |
kernel, KERNEL_DS, sacf-mode | kernel | kernel |
For CPUs without MVCOS:
CPU running in | %cr1 ASCE | %cr7 ASCE |
--------------------------------------|-----------|-----------|
user space | user | vdso |
kernel, USER_DS, normal-mode | user | vdso |
kernel, USER_DS, normal-mode lazy | kernel | user |
kernel, USER_DS, sacf-mode | kernel | user |
kernel, KERNEL_DS, normal-mode | kernel | vdso |
kernel, KERNEL_DS, normal-mode, lazy | kernel | kernel |
kernel, KERNEL_DS, sacf-mode | kernel | kernel |
The lines with "lazy" refer to the state after a copy via the secondary
space with a delayed reload of %cr1 and %cr7.
There are three hardware address spaces that can cause a DAT exception,
primary, secondary and home space. The exception can be related to
four different fault types: user space fault, vdso fault, kernel fault,
and the gmap faults.
Dependent on the set_fs state and normal vs. sacf mode there are a number
of fault combinations:
1) user address space fault via the primary ASCE
2) gmap address space fault via the primary ASCE
3) kernel address space fault via the primary ASCE for machines with
MVCOS and set_fs(KERNEL_DS)
4) vdso address space faults via the secondary ASCE with an invalid
address while running in secondary space in problem state
5) user address space fault via the secondary ASCE for user-copy
based on the secondary space mode, e.g. futex_ops or strnlen_user
6) kernel address space fault via the secondary ASCE for user-copy
with secondary space mode with set_fs(KERNEL_DS)
7) kernel address space fault via the primary ASCE for user-copy
with secondary space mode with set_fs(USER_DS) on machines without
MVCOS.
8) kernel address space fault via the home space ASCE
Replace user_space_fault() with a new function get_fault_type() that
can distinguish all four different fault types.
With these changes the futex atomic ops from the kernel and the
strnlen_user will get a little bit slower, as well as the old style
uaccess with MVCP/MVCS. All user accesses based on MVCOS will be as
fast as before. On the positive side, the user space vdso code is a
lot faster and Linux ceases to use the complicated AR mode.
Reviewed-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
2017-08-22 18:08:22 +08:00
|
|
|
/* Set up page table for the vdso address space */
|
2017-10-05 01:27:07 +08:00
|
|
|
memset64((u64 *)segment_table, _SEGMENT_ENTRY_EMPTY, _CRST_ENTRIES);
|
|
|
|
memset64((u64 *)page_table, _PAGE_INVALID, PTRS_PER_PTE);
|
2008-12-31 22:11:42 +08:00
|
|
|
|
|
|
|
*(unsigned long *) segment_table = _SEGMENT_ENTRY + page_table;
|
2013-07-24 02:57:57 +08:00
|
|
|
*(unsigned long *) page_table = _PAGE_PROTECT + page_frame;
|
2008-12-31 22:11:42 +08:00
|
|
|
|
s390: remove all code using the access register mode
The vdso code for the getcpu() and the clock_gettime() call use the access
register mode to access the per-CPU vdso data page with the current code.
An alternative to the complicated AR mode is to use the secondary space
mode. This makes the vdso faster and quite a bit simpler. The downside is
that the uaccess code has to be changed quite a bit.
Which instructions are used depends on the machine and what kind of uaccess
operation is requested. The instruction dictates which ASCE value needs
to be loaded into %cr1 and %cr7.
The different cases:
* User copy with MVCOS for z10 and newer machines
The MVCOS instruction can copy between the primary space (aka user) and
the home space (aka kernel) directly. For set_fs(KERNEL_DS) the kernel
ASCE is loaded into %cr1. For set_fs(USER_DS) the user space is already
loaded in %cr1.
* User copy with MVCP/MVCS for older machines
To be able to execute the MVCP/MVCS instructions the kernel needs to
switch to primary mode. The control register %cr1 has to be set to the
kernel ASCE and %cr7 to either the kernel ASCE or the user ASCE dependent
on set_fs(KERNEL_DS) vs set_fs(USER_DS).
* Data access in the user address space for strnlen / futex
To use "normal" instruction with data from the user address space the
secondary space mode is used. The kernel needs to switch to primary mode,
%cr1 has to contain the kernel ASCE and %cr7 either the user ASCE or the
kernel ASCE, dependent on set_fs.
To load a new value into %cr1 or %cr7 is an expensive operation, the kernel
tries to be lazy about it. E.g. for multiple user copies in a row with
MVCP/MVCS the replacement of the vdso ASCE in %cr7 with the user ASCE is
done only once. On return to user space a CPU bit is checked that loads the
vdso ASCE again.
To enable and disable the data access via the secondary space two new
functions are added, enable_sacf_uaccess and disable_sacf_uaccess. The fact
that a context is in secondary space uaccess mode is stored in the
mm_segment_t value for the task. The code of an interrupt may use set_fs
as long as it returns to the previous state it got with get_fs with another
call to set_fs. The code in finish_arch_post_lock_switch simply has to do a
set_fs with the current mm_segment_t value for the task.
For CPUs with MVCOS:
CPU running in | %cr1 ASCE | %cr7 ASCE |
--------------------------------------|-----------|-----------|
user space | user | vdso |
kernel, USER_DS, normal-mode | user | vdso |
kernel, USER_DS, normal-mode, lazy | user | user |
kernel, USER_DS, sacf-mode | kernel | user |
kernel, KERNEL_DS, normal-mode | kernel | vdso |
kernel, KERNEL_DS, normal-mode, lazy | kernel | kernel |
kernel, KERNEL_DS, sacf-mode | kernel | kernel |
For CPUs without MVCOS:
CPU running in | %cr1 ASCE | %cr7 ASCE |
--------------------------------------|-----------|-----------|
user space | user | vdso |
kernel, USER_DS, normal-mode | user | vdso |
kernel, USER_DS, normal-mode lazy | kernel | user |
kernel, USER_DS, sacf-mode | kernel | user |
kernel, KERNEL_DS, normal-mode | kernel | vdso |
kernel, KERNEL_DS, normal-mode, lazy | kernel | kernel |
kernel, KERNEL_DS, sacf-mode | kernel | kernel |
The lines with "lazy" refer to the state after a copy via the secondary
space with a delayed reload of %cr1 and %cr7.
There are three hardware address spaces that can cause a DAT exception,
primary, secondary and home space. The exception can be related to
four different fault types: user space fault, vdso fault, kernel fault,
and the gmap faults.
Dependent on the set_fs state and normal vs. sacf mode there are a number
of fault combinations:
1) user address space fault via the primary ASCE
2) gmap address space fault via the primary ASCE
3) kernel address space fault via the primary ASCE for machines with
MVCOS and set_fs(KERNEL_DS)
4) vdso address space faults via the secondary ASCE with an invalid
address while running in secondary space in problem state
5) user address space fault via the secondary ASCE for user-copy
based on the secondary space mode, e.g. futex_ops or strnlen_user
6) kernel address space fault via the secondary ASCE for user-copy
with secondary space mode with set_fs(KERNEL_DS)
7) kernel address space fault via the primary ASCE for user-copy
with secondary space mode with set_fs(USER_DS) on machines without
MVCOS.
8) kernel address space fault via the home space ASCE
Replace user_space_fault() with a new function get_fault_type() that
can distinguish all four different fault types.
With these changes the futex atomic ops from the kernel and the
strnlen_user will get a little bit slower, as well as the old style
uaccess with MVCP/MVCS. All user accesses based on MVCOS will be as
fast as before. On the positive side, the user space vdso code is a
lot faster and Linux ceases to use the complicated AR mode.
Reviewed-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
2017-08-22 18:08:22 +08:00
|
|
|
lowcore->vdso_asce = segment_table +
|
2008-12-31 22:11:42 +08:00
|
|
|
_ASCE_TABLE_LENGTH + _ASCE_USER_BITS + _ASCE_TYPE_SEGMENT;
|
|
|
|
lowcore->vdso_per_cpu_data = page_frame;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
out:
|
|
|
|
free_page(page_frame);
|
|
|
|
free_page(page_table);
|
|
|
|
free_pages(segment_table, SEGMENT_ORDER);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
2015-12-31 17:29:00 +08:00
|
|
|
void vdso_free_per_cpu(struct lowcore *lowcore)
|
2008-12-31 22:11:42 +08:00
|
|
|
{
|
|
|
|
unsigned long segment_table, page_table, page_frame;
|
|
|
|
|
s390: remove all code using the access register mode
The vdso code for the getcpu() and the clock_gettime() call use the access
register mode to access the per-CPU vdso data page with the current code.
An alternative to the complicated AR mode is to use the secondary space
mode. This makes the vdso faster and quite a bit simpler. The downside is
that the uaccess code has to be changed quite a bit.
Which instructions are used depends on the machine and what kind of uaccess
operation is requested. The instruction dictates which ASCE value needs
to be loaded into %cr1 and %cr7.
The different cases:
* User copy with MVCOS for z10 and newer machines
The MVCOS instruction can copy between the primary space (aka user) and
the home space (aka kernel) directly. For set_fs(KERNEL_DS) the kernel
ASCE is loaded into %cr1. For set_fs(USER_DS) the user space is already
loaded in %cr1.
* User copy with MVCP/MVCS for older machines
To be able to execute the MVCP/MVCS instructions the kernel needs to
switch to primary mode. The control register %cr1 has to be set to the
kernel ASCE and %cr7 to either the kernel ASCE or the user ASCE dependent
on set_fs(KERNEL_DS) vs set_fs(USER_DS).
* Data access in the user address space for strnlen / futex
To use "normal" instruction with data from the user address space the
secondary space mode is used. The kernel needs to switch to primary mode,
%cr1 has to contain the kernel ASCE and %cr7 either the user ASCE or the
kernel ASCE, dependent on set_fs.
To load a new value into %cr1 or %cr7 is an expensive operation, the kernel
tries to be lazy about it. E.g. for multiple user copies in a row with
MVCP/MVCS the replacement of the vdso ASCE in %cr7 with the user ASCE is
done only once. On return to user space a CPU bit is checked that loads the
vdso ASCE again.
To enable and disable the data access via the secondary space two new
functions are added, enable_sacf_uaccess and disable_sacf_uaccess. The fact
that a context is in secondary space uaccess mode is stored in the
mm_segment_t value for the task. The code of an interrupt may use set_fs
as long as it returns to the previous state it got with get_fs with another
call to set_fs. The code in finish_arch_post_lock_switch simply has to do a
set_fs with the current mm_segment_t value for the task.
For CPUs with MVCOS:
CPU running in | %cr1 ASCE | %cr7 ASCE |
--------------------------------------|-----------|-----------|
user space | user | vdso |
kernel, USER_DS, normal-mode | user | vdso |
kernel, USER_DS, normal-mode, lazy | user | user |
kernel, USER_DS, sacf-mode | kernel | user |
kernel, KERNEL_DS, normal-mode | kernel | vdso |
kernel, KERNEL_DS, normal-mode, lazy | kernel | kernel |
kernel, KERNEL_DS, sacf-mode | kernel | kernel |
For CPUs without MVCOS:
CPU running in | %cr1 ASCE | %cr7 ASCE |
--------------------------------------|-----------|-----------|
user space | user | vdso |
kernel, USER_DS, normal-mode | user | vdso |
kernel, USER_DS, normal-mode lazy | kernel | user |
kernel, USER_DS, sacf-mode | kernel | user |
kernel, KERNEL_DS, normal-mode | kernel | vdso |
kernel, KERNEL_DS, normal-mode, lazy | kernel | kernel |
kernel, KERNEL_DS, sacf-mode | kernel | kernel |
The lines with "lazy" refer to the state after a copy via the secondary
space with a delayed reload of %cr1 and %cr7.
There are three hardware address spaces that can cause a DAT exception,
primary, secondary and home space. The exception can be related to
four different fault types: user space fault, vdso fault, kernel fault,
and the gmap faults.
Dependent on the set_fs state and normal vs. sacf mode there are a number
of fault combinations:
1) user address space fault via the primary ASCE
2) gmap address space fault via the primary ASCE
3) kernel address space fault via the primary ASCE for machines with
MVCOS and set_fs(KERNEL_DS)
4) vdso address space faults via the secondary ASCE with an invalid
address while running in secondary space in problem state
5) user address space fault via the secondary ASCE for user-copy
based on the secondary space mode, e.g. futex_ops or strnlen_user
6) kernel address space fault via the secondary ASCE for user-copy
with secondary space mode with set_fs(KERNEL_DS)
7) kernel address space fault via the primary ASCE for user-copy
with secondary space mode with set_fs(USER_DS) on machines without
MVCOS.
8) kernel address space fault via the home space ASCE
Replace user_space_fault() with a new function get_fault_type() that
can distinguish all four different fault types.
With these changes the futex atomic ops from the kernel and the
strnlen_user will get a little bit slower, as well as the old style
uaccess with MVCP/MVCS. All user accesses based on MVCOS will be as
fast as before. On the positive side, the user space vdso code is a
lot faster and Linux ceases to use the complicated AR mode.
Reviewed-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
2017-08-22 18:08:22 +08:00
|
|
|
segment_table = lowcore->vdso_asce & PAGE_MASK;
|
2008-12-31 22:11:42 +08:00
|
|
|
page_table = *(unsigned long *) segment_table;
|
|
|
|
page_frame = *(unsigned long *) page_table;
|
|
|
|
|
|
|
|
free_page(page_frame);
|
|
|
|
free_page(page_table);
|
|
|
|
free_pages(segment_table, SEGMENT_ORDER);
|
|
|
|
}
|
|
|
|
|
2008-12-25 20:38:36 +08:00
|
|
|
/*
|
|
|
|
* This is called from binfmt_elf, we create the special vma for the
|
|
|
|
* vDSO and insert it into the mm struct tree
|
|
|
|
*/
|
|
|
|
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
|
|
|
|
{
|
|
|
|
struct mm_struct *mm = current->mm;
|
2017-05-15 16:23:38 +08:00
|
|
|
struct vm_area_struct *vma;
|
2008-12-25 20:38:36 +08:00
|
|
|
unsigned long vdso_pages;
|
|
|
|
unsigned long vdso_base;
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
if (!vdso_enabled)
|
|
|
|
return 0;
|
|
|
|
|
2019-11-18 20:59:25 +08:00
|
|
|
if (is_compat_task())
|
|
|
|
return 0;
|
|
|
|
|
2008-12-25 20:38:36 +08:00
|
|
|
vdso_pages = vdso64_pages;
|
|
|
|
/*
|
|
|
|
* vDSO has a problem and was disabled, just don't "enable" it for
|
|
|
|
* the process
|
|
|
|
*/
|
|
|
|
if (vdso_pages == 0)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* pick a base address for the vDSO in process space. We try to put
|
|
|
|
* it at vdso_base which is the "natural" base for it, but we might
|
|
|
|
* fail and end up putting it elsewhere.
|
|
|
|
*/
|
2020-06-09 12:33:25 +08:00
|
|
|
if (mmap_write_lock_killable(mm))
|
2016-05-24 07:25:54 +08:00
|
|
|
return -EINTR;
|
2011-01-12 16:55:24 +08:00
|
|
|
vdso_base = get_unmapped_area(NULL, 0, vdso_pages << PAGE_SHIFT, 0, 0);
|
2008-12-25 20:38:36 +08:00
|
|
|
if (IS_ERR_VALUE(vdso_base)) {
|
|
|
|
rc = vdso_base;
|
|
|
|
goto out_up;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* our vma flags don't have VM_WRITE so by default, the process
|
|
|
|
* isn't allowed to write those pages.
|
|
|
|
* gdb can break that with ptrace interface, and thus trigger COW
|
|
|
|
* on those pages but it's then your responsibility to never do that
|
|
|
|
* on the "data" page of the vDSO or you'll stop getting kernel
|
|
|
|
* updates and your nice userland gettimeofday will be totally dead.
|
|
|
|
* It's fine to use that for setting breakpoints in the vDSO code
|
coredump: remove VM_ALWAYSDUMP flag
The motivation for this patchset was that I was looking at a way for a
qemu-kvm process, to exclude the guest memory from its core dump, which
can be quite large. There are already a number of filter flags in
/proc/<pid>/coredump_filter, however, these allow one to specify 'types'
of kernel memory, not specific address ranges (which is needed in this
case).
Since there are no more vma flags available, the first patch eliminates
the need for the 'VM_ALWAYSDUMP' flag. The flag is used internally by
the kernel to mark vdso and vsyscall pages. However, it is simple
enough to check if a vma covers a vdso or vsyscall page without the need
for this flag.
The second patch then replaces the 'VM_ALWAYSDUMP' flag with a new
'VM_NODUMP' flag, which can be set by userspace using new madvise flags:
'MADV_DONTDUMP', and unset via 'MADV_DODUMP'. The core dump filters
continue to work the same as before unless 'MADV_DONTDUMP' is set on the
region.
The qemu code which implements this features is at:
http://people.redhat.com/~jbaron/qemu-dump/qemu-dump.patch
In my testing the qemu core dump shrunk from 383MB -> 13MB with this
patch.
I also believe that the 'MADV_DONTDUMP' flag might be useful for
security sensitive apps, which might want to select which areas are
dumped.
This patch:
The VM_ALWAYSDUMP flag is currently used by the coredump code to
indicate that a vma is part of a vsyscall or vdso section. However, we
can determine if a vma is in one these sections by checking it against
the gate_vma and checking for a non-NULL return value from
arch_vma_name(). Thus, freeing a valuable vma bit.
Signed-off-by: Jason Baron <jbaron@redhat.com>
Acked-by: Roland McGrath <roland@hack.frob.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Avi Kivity <avi@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-24 06:02:51 +08:00
|
|
|
* pages though.
|
2008-12-25 20:38:36 +08:00
|
|
|
*/
|
2017-05-15 16:23:38 +08:00
|
|
|
vma = _install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT,
|
|
|
|
VM_READ|VM_EXEC|
|
|
|
|
VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
|
|
|
|
&vdso_mapping);
|
|
|
|
if (IS_ERR(vma)) {
|
|
|
|
rc = PTR_ERR(vma);
|
|
|
|
goto out_up;
|
|
|
|
}
|
|
|
|
|
|
|
|
current->mm->context.vdso_base = vdso_base;
|
|
|
|
rc = 0;
|
|
|
|
|
2008-12-25 20:38:36 +08:00
|
|
|
out_up:
|
2020-06-09 12:33:25 +08:00
|
|
|
mmap_write_unlock(mm);
|
2008-12-25 20:38:36 +08:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int __init vdso_init(void)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2008-12-31 22:11:42 +08:00
|
|
|
vdso_init_data(vdso_data);
|
2008-12-25 20:38:36 +08:00
|
|
|
|
|
|
|
/* Calculate the size of the 64 bit vDSO */
|
|
|
|
vdso64_pages = ((&vdso64_end - &vdso64_start
|
|
|
|
+ PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
|
|
|
|
|
|
|
|
/* Make sure pages are in the correct state */
|
treewide: kzalloc() -> kcalloc()
The kzalloc() function has a 2-factor argument form, kcalloc(). This
patch replaces cases of:
kzalloc(a * b, gfp)
with:
kcalloc(a, b, gfp)
as well as handling cases of:
kzalloc(a * b * c, gfp)
with:
kzalloc(array3_size(a, b, c), gfp)
as it's slightly less ugly than:
kzalloc_array(array_size(a, b), c, gfp)
This does, however, attempt to ignore constant size factors like:
kzalloc(4 * 1024, gfp)
though any constants defined via macros get caught up in the conversion.
Any factors with a sizeof() of "unsigned char", "char", and "u8" were
dropped, since they're redundant.
The Coccinelle script used for this was:
// Fix redundant parens around sizeof().
@@
type TYPE;
expression THING, E;
@@
(
kzalloc(
- (sizeof(TYPE)) * E
+ sizeof(TYPE) * E
, ...)
|
kzalloc(
- (sizeof(THING)) * E
+ sizeof(THING) * E
, ...)
)
// Drop single-byte sizes and redundant parens.
@@
expression COUNT;
typedef u8;
typedef __u8;
@@
(
kzalloc(
- sizeof(u8) * (COUNT)
+ COUNT
, ...)
|
kzalloc(
- sizeof(__u8) * (COUNT)
+ COUNT
, ...)
|
kzalloc(
- sizeof(char) * (COUNT)
+ COUNT
, ...)
|
kzalloc(
- sizeof(unsigned char) * (COUNT)
+ COUNT
, ...)
|
kzalloc(
- sizeof(u8) * COUNT
+ COUNT
, ...)
|
kzalloc(
- sizeof(__u8) * COUNT
+ COUNT
, ...)
|
kzalloc(
- sizeof(char) * COUNT
+ COUNT
, ...)
|
kzalloc(
- sizeof(unsigned char) * COUNT
+ COUNT
, ...)
)
// 2-factor product with sizeof(type/expression) and identifier or constant.
@@
type TYPE;
expression THING;
identifier COUNT_ID;
constant COUNT_CONST;
@@
(
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * (COUNT_ID)
+ COUNT_ID, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * COUNT_ID
+ COUNT_ID, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * (COUNT_CONST)
+ COUNT_CONST, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * COUNT_CONST
+ COUNT_CONST, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * (COUNT_ID)
+ COUNT_ID, sizeof(THING)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * COUNT_ID
+ COUNT_ID, sizeof(THING)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * (COUNT_CONST)
+ COUNT_CONST, sizeof(THING)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * COUNT_CONST
+ COUNT_CONST, sizeof(THING)
, ...)
)
// 2-factor product, only identifiers.
@@
identifier SIZE, COUNT;
@@
- kzalloc
+ kcalloc
(
- SIZE * COUNT
+ COUNT, SIZE
, ...)
// 3-factor product with 1 sizeof(type) or sizeof(expression), with
// redundant parens removed.
@@
expression THING;
identifier STRIDE, COUNT;
type TYPE;
@@
(
kzalloc(
- sizeof(TYPE) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kzalloc(
- sizeof(TYPE) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kzalloc(
- sizeof(TYPE) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kzalloc(
- sizeof(TYPE) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kzalloc(
- sizeof(THING) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kzalloc(
- sizeof(THING) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kzalloc(
- sizeof(THING) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kzalloc(
- sizeof(THING) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
)
// 3-factor product with 2 sizeof(variable), with redundant parens removed.
@@
expression THING1, THING2;
identifier COUNT;
type TYPE1, TYPE2;
@@
(
kzalloc(
- sizeof(TYPE1) * sizeof(TYPE2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
kzalloc(
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
kzalloc(
- sizeof(THING1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
kzalloc(
- sizeof(THING1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
kzalloc(
- sizeof(TYPE1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
|
kzalloc(
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
)
// 3-factor product, only identifiers, with redundant parens removed.
@@
identifier STRIDE, SIZE, COUNT;
@@
(
kzalloc(
- (COUNT) * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- COUNT * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- COUNT * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- (COUNT) * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- COUNT * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- (COUNT) * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- (COUNT) * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- COUNT * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
)
// Any remaining multi-factor products, first at least 3-factor products,
// when they're not all constants...
@@
expression E1, E2, E3;
constant C1, C2, C3;
@@
(
kzalloc(C1 * C2 * C3, ...)
|
kzalloc(
- (E1) * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
|
kzalloc(
- (E1) * (E2) * E3
+ array3_size(E1, E2, E3)
, ...)
|
kzalloc(
- (E1) * (E2) * (E3)
+ array3_size(E1, E2, E3)
, ...)
|
kzalloc(
- E1 * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
)
// And then all remaining 2 factors products when they're not all constants,
// keeping sizeof() as the second factor argument.
@@
expression THING, E1, E2;
type TYPE;
constant C1, C2, C3;
@@
(
kzalloc(sizeof(THING) * C2, ...)
|
kzalloc(sizeof(TYPE) * C2, ...)
|
kzalloc(C1 * C2 * C3, ...)
|
kzalloc(C1 * C2, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * (E2)
+ E2, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * E2
+ E2, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * (E2)
+ E2, sizeof(THING)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * E2
+ E2, sizeof(THING)
, ...)
|
- kzalloc
+ kcalloc
(
- (E1) * E2
+ E1, E2
, ...)
|
- kzalloc
+ kcalloc
(
- (E1) * (E2)
+ E1, E2
, ...)
|
- kzalloc
+ kcalloc
(
- E1 * E2
+ E1, E2
, ...)
)
Signed-off-by: Kees Cook <keescook@chromium.org>
2018-06-13 05:03:40 +08:00
|
|
|
vdso64_pagelist = kcalloc(vdso64_pages + 1, sizeof(struct page *),
|
2008-12-25 20:38:36 +08:00
|
|
|
GFP_KERNEL);
|
|
|
|
BUG_ON(vdso64_pagelist == NULL);
|
|
|
|
for (i = 0; i < vdso64_pages - 1; i++) {
|
|
|
|
struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE);
|
|
|
|
get_page(pg);
|
|
|
|
vdso64_pagelist[i] = pg;
|
|
|
|
}
|
|
|
|
vdso64_pagelist[vdso64_pages - 1] = virt_to_page(vdso_data);
|
|
|
|
vdso64_pagelist[vdso64_pages] = NULL;
|
2012-03-11 23:59:26 +08:00
|
|
|
if (vdso_alloc_per_cpu(&S390_lowcore))
|
2009-01-09 19:14:54 +08:00
|
|
|
BUG();
|
2008-12-25 20:38:36 +08:00
|
|
|
|
|
|
|
get_page(virt_to_page(vdso_data));
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2012-03-11 23:59:26 +08:00
|
|
|
/* Have vdso_init() invoked at the early_initcall stage of boot. */
early_initcall(vdso_init);
|