Merge eccc876724 ("Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs") into android-mainline

Steps on the way to 5.10-rc4

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: I9e0fa89c0f6f306fe802ae95c8d01d9ba558e111
This commit is contained in:
Greg Kroah-Hartman 2020-11-11 19:31:33 +01:00
commit 7f6480e40c
78 changed files with 3317 additions and 874 deletions

View File

@ -256,6 +256,10 @@ which is 1024 bytes long:
- s\_padding2
-
* - 0x54
- \_\_be32
- s\_num\_fc\_blocks
- Number of fast commit blocks in the journal.
* - 0x58
- \_\_u32
- s\_padding[42]
-
@ -310,6 +314,8 @@ The journal incompat features are any combination of the following:
- This journal uses v3 of the checksum on-disk format. This is the same as
v2, but the journal block tag size is fixed regardless of the size of
block numbers. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3)
* - 0x20
- Journal has fast commit blocks. (JBD2\_FEATURE\_INCOMPAT\_FAST\_COMMIT)
.. _jbd2_checksum_type:

View File

@ -596,6 +596,13 @@ following:
- Sparse Super Block, v2. If this flag is set, the SB field s\_backup\_bgs
points to the two block groups that contain backup superblocks
(COMPAT\_SPARSE\_SUPER2).
* - 0x400
- Fast commits supported. Although fast commits blocks are
backward incompatible, fast commit blocks are not always
present in the journal. If fast commit blocks are present in
the journal, JBD2 incompat feature
(JBD2\_FEATURE\_INCOMPAT\_FAST\_COMMIT) gets
set (COMPAT\_FAST\_COMMIT).
.. _super_incompat:

View File

@ -136,10 +136,8 @@ Fast commits
~~~~~~~~~~~~
JBD2 to also allows you to perform file-system specific delta commits known as
fast commits. In order to use fast commits, you first need to call
:c:func:`jbd2_fc_init` and tell how many blocks at the end of journal
area should be reserved for fast commits. Along with that, you will also need
to set following callbacks that perform correspodning work:
fast commits. In order to use fast commits, you will need to set following
callbacks that perform correspodning work:
`journal->j_fc_cleanup_cb`: Cleanup function called after every full commit and
fast commit.

View File

@ -6367,7 +6367,7 @@ accesses that would usually trigger a #GP by KVM into the guest will
instead get bounced to user space through the KVM_EXIT_X86_RDMSR and
KVM_EXIT_X86_WRMSR exit notifications.
8.25 KVM_X86_SET_MSR_FILTER
8.27 KVM_X86_SET_MSR_FILTER
---------------------------
:Architectures: x86
@ -6381,8 +6381,7 @@ In combination with KVM_CAP_X86_USER_SPACE_MSR, this allows user space to
trap and emulate MSRs that are outside of the scope of KVM as well as
limit the attack surface on KVM's MSR emulation code.
8.26 KVM_CAP_ENFORCE_PV_CPUID
8.28 KVM_CAP_ENFORCE_PV_CPUID
-----------------------------
Architectures: x86

View File

@ -6614,6 +6614,7 @@ Q: http://patchwork.ozlabs.org/project/linux-ext4/list/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git
F: Documentation/filesystems/ext4/
F: fs/ext4/
F: include/trace/events/ext4.h
Extended Verification Module (EVM)
M: Mimi Zohar <zohar@linux.ibm.com>

View File

@ -788,10 +788,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
}
switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED
case PUD_SHIFT:
if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
break;
fallthrough;
#endif
case CONT_PMD_SHIFT:
vma_shift = PMD_SHIFT;
fallthrough;

View File

@ -1069,7 +1069,7 @@ static bool trap_ptrauth(struct kvm_vcpu *vcpu,
static unsigned int ptrauth_visibility(const struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd)
{
return vcpu_has_ptrauth(vcpu) ? 0 : REG_HIDDEN_USER | REG_HIDDEN_GUEST;
return vcpu_has_ptrauth(vcpu) ? 0 : REG_HIDDEN;
}
#define __PTRAUTH_KEY(k) \
@ -1162,6 +1162,22 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
return val;
}
static unsigned int id_visibility(const struct kvm_vcpu *vcpu,
const struct sys_reg_desc *r)
{
u32 id = sys_reg((u32)r->Op0, (u32)r->Op1,
(u32)r->CRn, (u32)r->CRm, (u32)r->Op2);
switch (id) {
case SYS_ID_AA64ZFR0_EL1:
if (!vcpu_has_sve(vcpu))
return REG_RAZ;
break;
}
return 0;
}
/* cpufeature ID register access trap handlers */
static bool __access_id_reg(struct kvm_vcpu *vcpu,
@ -1180,7 +1196,9 @@ static bool access_id_reg(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
return __access_id_reg(vcpu, p, r, false);
bool raz = sysreg_visible_as_raz(vcpu, r);
return __access_id_reg(vcpu, p, r, raz);
}
static bool access_raz_id_reg(struct kvm_vcpu *vcpu,
@ -1201,72 +1219,7 @@ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu,
if (vcpu_has_sve(vcpu))
return 0;
return REG_HIDDEN_USER | REG_HIDDEN_GUEST;
}
/* Visibility overrides for SVE-specific ID registers */
static unsigned int sve_id_visibility(const struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd)
{
if (vcpu_has_sve(vcpu))
return 0;
return REG_HIDDEN_USER;
}
/* Generate the emulated ID_AA64ZFR0_EL1 value exposed to the guest */
static u64 guest_id_aa64zfr0_el1(const struct kvm_vcpu *vcpu)
{
if (!vcpu_has_sve(vcpu))
return 0;
return read_sanitised_ftr_reg(SYS_ID_AA64ZFR0_EL1);
}
static bool access_id_aa64zfr0_el1(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *rd)
{
if (p->is_write)
return write_to_read_only(vcpu, p, rd);
p->regval = guest_id_aa64zfr0_el1(vcpu);
return true;
}
static int get_id_aa64zfr0_el1(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd,
const struct kvm_one_reg *reg, void __user *uaddr)
{
u64 val;
if (WARN_ON(!vcpu_has_sve(vcpu)))
return -ENOENT;
val = guest_id_aa64zfr0_el1(vcpu);
return reg_to_user(uaddr, &val, reg->id);
}
static int set_id_aa64zfr0_el1(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd,
const struct kvm_one_reg *reg, void __user *uaddr)
{
const u64 id = sys_reg_to_index(rd);
int err;
u64 val;
if (WARN_ON(!vcpu_has_sve(vcpu)))
return -ENOENT;
err = reg_from_user(&val, uaddr, id);
if (err)
return err;
/* This is what we mean by invariant: you can't change it. */
if (val != guest_id_aa64zfr0_el1(vcpu))
return -EINVAL;
return 0;
return REG_HIDDEN;
}
/*
@ -1308,13 +1261,17 @@ static int __set_id_reg(const struct kvm_vcpu *vcpu,
static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
const struct kvm_one_reg *reg, void __user *uaddr)
{
return __get_id_reg(vcpu, rd, uaddr, false);
bool raz = sysreg_visible_as_raz(vcpu, rd);
return __get_id_reg(vcpu, rd, uaddr, raz);
}
static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
const struct kvm_one_reg *reg, void __user *uaddr)
{
return __set_id_reg(vcpu, rd, uaddr, false);
bool raz = sysreg_visible_as_raz(vcpu, rd);
return __set_id_reg(vcpu, rd, uaddr, raz);
}
static int get_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
@ -1406,6 +1363,7 @@ static bool access_mte_regs(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
.access = access_id_reg, \
.get_user = get_id_reg, \
.set_user = set_id_reg, \
.visibility = id_visibility, \
}
/*
@ -1527,7 +1485,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_SANITISED(ID_AA64PFR1_EL1),
ID_UNALLOCATED(4,2),
ID_UNALLOCATED(4,3),
{ SYS_DESC(SYS_ID_AA64ZFR0_EL1), access_id_aa64zfr0_el1, .get_user = get_id_aa64zfr0_el1, .set_user = set_id_aa64zfr0_el1, .visibility = sve_id_visibility },
ID_SANITISED(ID_AA64ZFR0_EL1),
ID_UNALLOCATED(4,5),
ID_UNALLOCATED(4,6),
ID_UNALLOCATED(4,7),
@ -2194,7 +2152,7 @@ static void perform_access(struct kvm_vcpu *vcpu,
trace_kvm_sys_access(*vcpu_pc(vcpu), params, r);
/* Check for regs disabled by runtime config */
if (sysreg_hidden_from_guest(vcpu, r)) {
if (sysreg_hidden(vcpu, r)) {
kvm_inject_undefined(vcpu);
return;
}
@ -2693,7 +2651,7 @@ int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg
return get_invariant_sys_reg(reg->id, uaddr);
/* Check for regs disabled by runtime config */
if (sysreg_hidden_from_user(vcpu, r))
if (sysreg_hidden(vcpu, r))
return -ENOENT;
if (r->get_user)
@ -2718,7 +2676,7 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg
return set_invariant_sys_reg(reg->id, uaddr);
/* Check for regs disabled by runtime config */
if (sysreg_hidden_from_user(vcpu, r))
if (sysreg_hidden(vcpu, r))
return -ENOENT;
if (r->set_user)
@ -2789,7 +2747,7 @@ static int walk_one_sys_reg(const struct kvm_vcpu *vcpu,
if (!(rd->reg || rd->get_user))
return 0;
if (sysreg_hidden_from_user(vcpu, rd))
if (sysreg_hidden(vcpu, rd))
return 0;
if (!copy_reg_to_user(rd, uind))

View File

@ -59,8 +59,8 @@ struct sys_reg_desc {
const struct sys_reg_desc *rd);
};
#define REG_HIDDEN_USER (1 << 0) /* hidden from userspace ioctls */
#define REG_HIDDEN_GUEST (1 << 1) /* hidden from guest */
#define REG_HIDDEN (1 << 0) /* hidden from userspace and guest */
#define REG_RAZ (1 << 1) /* RAZ from userspace and guest */
static __printf(2, 3)
inline void print_sys_reg_msg(const struct sys_reg_params *p,
@ -111,22 +111,22 @@ static inline void reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r
__vcpu_sys_reg(vcpu, r->reg) = r->val;
}
static inline bool sysreg_hidden_from_guest(const struct kvm_vcpu *vcpu,
const struct sys_reg_desc *r)
static inline bool sysreg_hidden(const struct kvm_vcpu *vcpu,
const struct sys_reg_desc *r)
{
if (likely(!r->visibility))
return false;
return r->visibility(vcpu, r) & REG_HIDDEN_GUEST;
return r->visibility(vcpu, r) & REG_HIDDEN;
}
static inline bool sysreg_hidden_from_user(const struct kvm_vcpu *vcpu,
const struct sys_reg_desc *r)
static inline bool sysreg_visible_as_raz(const struct kvm_vcpu *vcpu,
const struct sys_reg_desc *r)
{
if (likely(!r->visibility))
return false;
return r->visibility(vcpu, r) & REG_HIDDEN_USER;
return r->visibility(vcpu, r) & REG_RAZ;
}
static inline int cmp_sys_reg(const struct sys_reg_desc *i1,

View File

@ -90,6 +90,20 @@ static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent)
return 0;
}
void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0);
/*
* save the feature bitmap to avoid cpuid lookup for every PV
* operation
*/
if (best)
vcpu->arch.pv_cpuid.features = best->eax;
}
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
@ -124,13 +138,6 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
(best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
/*
* save the feature bitmap to avoid cpuid lookup for every PV
* operation
*/
if (best)
vcpu->arch.pv_cpuid.features = best->eax;
if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
if (best)
@ -162,6 +169,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vcpu->arch.guest_supported_xcr0 =
(best->eax | ((u64)best->edx << 32)) & supported_xcr0;
kvm_update_pv_runtime(vcpu);
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
kvm_mmu_reset_context(vcpu);

View File

@ -11,6 +11,7 @@ extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
void kvm_set_cpu_caps(void);
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
void kvm_update_pv_runtime(struct kvm_vcpu *vcpu);
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
u32 function, u32 index);
int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,

View File

@ -856,12 +856,14 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
} else {
rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
desc = desc->more;
while (desc->sptes[PTE_LIST_EXT-1]) {
count += PTE_LIST_EXT;
}
if (desc->sptes[PTE_LIST_EXT-1]) {
desc->more = mmu_alloc_pte_list_desc(vcpu);
if (!desc->more) {
desc->more = mmu_alloc_pte_list_desc(vcpu);
desc = desc->more;
break;
}
desc = desc->more;
}
for (i = 0; desc->sptes[i]; ++i)

View File

@ -255,11 +255,10 @@ static struct kmem_cache *x86_emulator_cache;
/*
* When called, it means the previous get/set msr reached an invalid msr.
* Return 0 if we want to ignore/silent this failed msr access, or 1 if we want
* to fail the caller.
* Return true if we want to ignore/silent this failed msr access.
*/
static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
u64 data, bool write)
static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
u64 data, bool write)
{
const char *op = write ? "wrmsr" : "rdmsr";
@ -268,11 +267,11 @@ static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
op, msr, data);
/* Mask the error */
return 0;
return true;
} else {
kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
op, msr, data);
return -ENOENT;
return false;
}
}
@ -1416,7 +1415,8 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
if (r == KVM_MSR_RET_INVALID) {
/* Unconditionally clear the output for simplicity */
*data = 0;
r = kvm_msr_ignored_check(vcpu, index, 0, false);
if (kvm_msr_ignored_check(vcpu, index, 0, false))
r = 0;
}
if (r)
@ -1540,7 +1540,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
struct msr_data msr;
if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
return -EPERM;
return KVM_MSR_RET_FILTERED;
switch (index) {
case MSR_FS_BASE:
@ -1581,7 +1581,8 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
if (ret == KVM_MSR_RET_INVALID)
ret = kvm_msr_ignored_check(vcpu, index, data, true);
if (kvm_msr_ignored_check(vcpu, index, data, true))
ret = 0;
return ret;
}
@ -1599,7 +1600,7 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
int ret;
if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
return -EPERM;
return KVM_MSR_RET_FILTERED;
msr.index = index;
msr.host_initiated = host_initiated;
@ -1618,7 +1619,8 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
if (ret == KVM_MSR_RET_INVALID) {
/* Unconditionally clear *data for simplicity */
*data = 0;
ret = kvm_msr_ignored_check(vcpu, index, 0, false);
if (kvm_msr_ignored_check(vcpu, index, 0, false))
ret = 0;
}
return ret;
@ -1662,9 +1664,9 @@ static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
static u64 kvm_msr_reason(int r)
{
switch (r) {
case -ENOENT:
case KVM_MSR_RET_INVALID:
return KVM_MSR_EXIT_REASON_UNKNOWN;
case -EPERM:
case KVM_MSR_RET_FILTERED:
return KVM_MSR_EXIT_REASON_FILTER;
default:
return KVM_MSR_EXIT_REASON_INVAL;
@ -1965,7 +1967,7 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
struct kvm_arch *ka = &vcpu->kvm->arch;
if (vcpu->vcpu_id == 0 && !host_initiated) {
if (ka->boot_vcpu_runs_old_kvmclock && old_msr)
if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
ka->boot_vcpu_runs_old_kvmclock = old_msr;
@ -3063,9 +3065,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
/* Values other than LBR and BTF are vendor-specific,
thus reserved and should throw a #GP */
return 1;
}
vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
__func__, data);
} else if (report_ignored_msrs)
vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
__func__, data);
break;
case 0x200 ... 0x2ff:
return kvm_mtrr_set_msr(vcpu, msr, data);
@ -3463,29 +3465,63 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vcpu->arch.efer;
break;
case MSR_KVM_WALL_CLOCK:
if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
return 1;
msr_info->data = vcpu->kvm->arch.wall_clock;
break;
case MSR_KVM_WALL_CLOCK_NEW:
if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
return 1;
msr_info->data = vcpu->kvm->arch.wall_clock;
break;
case MSR_KVM_SYSTEM_TIME:
if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
return 1;
msr_info->data = vcpu->arch.time;
break;
case MSR_KVM_SYSTEM_TIME_NEW:
if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
return 1;
msr_info->data = vcpu->arch.time;
break;
case MSR_KVM_ASYNC_PF_EN:
if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
return 1;
msr_info->data = vcpu->arch.apf.msr_en_val;
break;
case MSR_KVM_ASYNC_PF_INT:
if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
return 1;
msr_info->data = vcpu->arch.apf.msr_int_val;
break;
case MSR_KVM_ASYNC_PF_ACK:
if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
return 1;
msr_info->data = 0;
break;
case MSR_KVM_STEAL_TIME:
if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
return 1;
msr_info->data = vcpu->arch.st.msr_val;
break;
case MSR_KVM_PV_EOI_EN:
if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
return 1;
msr_info->data = vcpu->arch.pv_eoi.msr_val;
break;
case MSR_KVM_POLL_CONTROL:
if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
return 1;
msr_info->data = vcpu->arch.msr_kvm_poll_control;
break;
case MSR_IA32_P5_MC_ADDR:
@ -4575,6 +4611,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
vcpu->arch.pv_cpuid.enforce = cap->args[0];
if (vcpu->arch.pv_cpuid.enforce)
kvm_update_pv_runtime(vcpu);
return 0;

View File

@ -376,7 +376,13 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva);
bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type);
#define KVM_MSR_RET_INVALID 2
/*
* Internal error codes that are used to indicate that MSR emulation encountered
* an error that should result in #GP in the guest, unless userspace
* handles it.
*/
#define KVM_MSR_RET_INVALID 2 /* in-kernel MSR emulation #GP condition */
#define KVM_MSR_RET_FILTERED 3 /* #GP due to userspace MSR filter */
#define __cr4_reserved_bits(__cpu_has, __c) \
({ \

View File

@ -367,9 +367,9 @@ static void create_power_zone_common_attributes(
&dev_attr_max_energy_range_uj.attr;
if (power_zone->ops->get_energy_uj) {
if (power_zone->ops->reset_energy_uj)
dev_attr_energy_uj.attr.mode = S_IWUSR | S_IRUGO;
dev_attr_energy_uj.attr.mode = S_IWUSR | S_IRUSR;
else
dev_attr_energy_uj.attr.mode = S_IRUGO;
dev_attr_energy_uj.attr.mode = S_IRUSR;
power_zone->zone_dev_attrs[count++] =
&dev_attr_energy_uj.attr;
}

View File

@ -511,7 +511,8 @@ struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
/*DEFAULT_RATELIMIT_BURST*/ 1);
if (__ratelimit(&_rs))
WARN(1, KERN_DEBUG
"BTRFS: block rsv returned %d\n", ret);
"BTRFS: block rsv %d returned %d\n",
block_rsv->type, ret);
}
try_reserve:
ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize,

View File

@ -91,6 +91,17 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
if (ret) {
no_valid_dev_replace_entry_found:
/*
* We don't have a replace item or it's corrupted. If there is
* a replace target, fail the mount.
*/
if (btrfs_find_device(fs_info->fs_devices,
BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) {
btrfs_err(fs_info,
"found replace target device without a valid replace item");
ret = -EUCLEAN;
goto out;
}
ret = 0;
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
@ -143,8 +154,19 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
dev_replace->srcdev = NULL;
dev_replace->tgtdev = NULL;
/*
* We don't have an active replace item but if there is a
* replace target, fail the mount.
*/
if (btrfs_find_device(fs_info->fs_devices,
BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) {
btrfs_err(fs_info,
"replace devid present without an active replace item");
ret = -EUCLEAN;
} else {
dev_replace->srcdev = NULL;
dev_replace->tgtdev = NULL;
}
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:

View File

@ -1274,6 +1274,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
u64 page_start;
u64 page_end;
u64 page_cnt;
u64 start = (u64)start_index << PAGE_SHIFT;
int ret;
int i;
int i_done;
@ -1290,8 +1291,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
start_index << PAGE_SHIFT,
page_cnt << PAGE_SHIFT);
start, page_cnt << PAGE_SHIFT);
if (ret)
return ret;
i_done = 0;
@ -1380,8 +1380,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
spin_unlock(&BTRFS_I(inode)->lock);
btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
start_index << PAGE_SHIFT,
(page_cnt - i_done) << PAGE_SHIFT, true);
start, (page_cnt - i_done) << PAGE_SHIFT, true);
}
@ -1408,8 +1407,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
put_page(pages[i]);
}
btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
start_index << PAGE_SHIFT,
page_cnt << PAGE_SHIFT, true);
start, page_cnt << PAGE_SHIFT, true);
btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
extent_changeset_free(data_reserved);
return ret;

View File

@ -3435,24 +3435,20 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode,
{
struct rb_node *node;
struct rb_node *next;
struct ulist_node *entry = NULL;
struct ulist_node *entry;
int ret = 0;
node = reserved->range_changed.root.rb_node;
if (!node)
return 0;
while (node) {
entry = rb_entry(node, struct ulist_node, rb_node);
if (entry->val < start)
node = node->rb_right;
else if (entry)
node = node->rb_left;
else
break;
node = node->rb_left;
}
/* Empty changeset */
if (!entry)
return 0;
if (entry->val > start && rb_prev(&entry->rb_node))
entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
rb_node);

View File

@ -860,6 +860,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
"dropping a ref for a root that doesn't have a ref on the block");
dump_block_entry(fs_info, be);
dump_ref_action(fs_info, ra);
kfree(ref);
kfree(ra);
goto out_unlock;
}

View File

@ -1648,6 +1648,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
struct btrfs_root_item *root_item;
struct btrfs_path *path;
struct extent_buffer *leaf;
int reserve_level;
int level;
int max_level;
int replaced = 0;
@ -1696,7 +1697,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
* Thus the needed metadata size is at most root_level * nodesize,
* and * 2 since we have two trees to COW.
*/
min_reserved = fs_info->nodesize * btrfs_root_level(root_item) * 2;
reserve_level = max_t(int, 1, btrfs_root_level(root_item));
min_reserved = fs_info->nodesize * reserve_level * 2;
memset(&next_key, 0, sizeof(next_key));
while (1) {

View File

@ -3866,8 +3866,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (!is_dev_replace && !readonly &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
rcu_str_deref(dev->name));
btrfs_err_in_rcu(fs_info,
"scrub on devid %llu: filesystem on %s is not writable",
devid, rcu_str_deref(dev->name));
ret = -EROFS;
goto out;
}

View File

@ -1056,22 +1056,13 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
continue;
}
if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
/*
* In the first step, keep the device which has
* the correct fsid and the devid that is used
* for the dev_replace procedure.
* In the second step, the dev_replace state is
* read from the device tree and it is known
* whether the procedure is really active or
* not, which means whether this device is
* used or whether it should be removed.
*/
if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
&device->dev_state)) {
continue;
}
}
/*
* We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
* in btrfs_init_dev_replace() so just continue.
*/
if (device->devid == BTRFS_DEV_REPLACE_DEVID)
continue;
if (device->bdev) {
blkdev_put(device->bdev, device->mode);
device->bdev = NULL;
@ -1080,9 +1071,6 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
list_del_init(&device->dev_alloc_list);
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
&device->dev_state))
fs_devices->rw_devices--;
}
list_del_init(&device->dev_list);
fs_devices->num_devices--;

View File

@ -300,9 +300,7 @@ static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci,
* New inodes may not have an inode number assigned yet.
* Hashing their inode number is delayed until later.
*/
if (ci->ci_inode->i_ino == 0)
WARN_ON(!(ci->ci_inode->i_state & I_CREATING));
else
if (ci->ci_inode->i_ino)
fscrypt_hash_inode_number(ci, mk);
return 0;
}

View File

@ -107,11 +107,9 @@ static struct page *erofs_read_inode(struct inode *inode,
i_gid_write(inode, le32_to_cpu(die->i_gid));
set_nlink(inode, le32_to_cpu(die->i_nlink));
/* ns timestamp */
inode->i_mtime.tv_sec = inode->i_ctime.tv_sec =
le64_to_cpu(die->i_ctime);
inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec =
le32_to_cpu(die->i_ctime_nsec);
/* extended inode has its own timestamp */
inode->i_ctime.tv_sec = le64_to_cpu(die->i_ctime);
inode->i_ctime.tv_nsec = le32_to_cpu(die->i_ctime_nsec);
inode->i_size = le64_to_cpu(die->i_size);
@ -149,11 +147,9 @@ static struct page *erofs_read_inode(struct inode *inode,
i_gid_write(inode, le16_to_cpu(dic->i_gid));
set_nlink(inode, le16_to_cpu(dic->i_nlink));
/* use build time to derive all file time */
inode->i_mtime.tv_sec = inode->i_ctime.tv_sec =
sbi->build_time;
inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec =
sbi->build_time_nsec;
/* use build time for compact inodes */
inode->i_ctime.tv_sec = sbi->build_time;
inode->i_ctime.tv_nsec = sbi->build_time_nsec;
inode->i_size = le32_to_cpu(dic->i_size);
if (erofs_inode_is_data_compressed(vi->datalayout))
@ -167,6 +163,11 @@ static struct page *erofs_read_inode(struct inode *inode,
goto err_out;
}
inode->i_mtime.tv_sec = inode->i_ctime.tv_sec;
inode->i_atime.tv_sec = inode->i_ctime.tv_sec;
inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec;
inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec;
if (!nblks)
/* measure inode.i_blocks as generic filesystems */
inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9;

View File

@ -1078,8 +1078,11 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
cond_resched();
goto repeat;
}
set_page_private(page, (unsigned long)pcl);
SetPagePrivate(page);
if (tocache) {
set_page_private(page, (unsigned long)pcl);
SetPagePrivate(page);
}
out: /* the only exit (for tracing and debugging) */
return page;
}

View File

@ -1028,9 +1028,6 @@ struct ext4_inode_info {
* protected by sbi->s_fc_lock.
*/
/* Fast commit subtid when this inode was committed */
unsigned int i_fc_committed_subtid;
/* Start of lblk range that needs to be committed in this fast commit */
ext4_lblk_t i_fc_lblk_start;
@ -1422,16 +1419,6 @@ struct ext4_super_block {
#ifdef __KERNEL__
/*
* run-time mount flags
*/
#define EXT4_MF_MNTDIR_SAMPLED 0x0001
#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
#define EXT4_MF_FC_INELIGIBLE 0x0004 /* Fast commit ineligible */
#define EXT4_MF_FC_COMMITTING 0x0008 /* File system underoing a fast
* commit.
*/
#ifdef CONFIG_FS_ENCRYPTION
#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_policy.policy != NULL)
#else
@ -1466,7 +1453,7 @@ struct ext4_sb_info {
struct buffer_head * __rcu *s_group_desc;
unsigned int s_mount_opt;
unsigned int s_mount_opt2;
unsigned int s_mount_flags;
unsigned long s_mount_flags;
unsigned int s_def_mount_opt;
ext4_fsblk_t s_sb_block;
atomic64_t s_resv_clusters;
@ -1694,6 +1681,34 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
_v; \
})
/*
* run-time mount flags
*/
enum {
EXT4_MF_MNTDIR_SAMPLED,
EXT4_MF_FS_ABORTED, /* Fatal error detected */
EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */
EXT4_MF_FC_COMMITTING /* File system underoing a fast
* commit.
*/
};
static inline void ext4_set_mount_flag(struct super_block *sb, int bit)
{
set_bit(bit, &EXT4_SB(sb)->s_mount_flags);
}
static inline void ext4_clear_mount_flag(struct super_block *sb, int bit)
{
clear_bit(bit, &EXT4_SB(sb)->s_mount_flags);
}
static inline int ext4_test_mount_flag(struct super_block *sb, int bit)
{
return test_bit(bit, &EXT4_SB(sb)->s_mount_flags);
}
/*
* Simulate_fail codes
*/
@ -1863,6 +1878,13 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010
#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020
#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200
/*
* The reason why "FAST_COMMIT" is a compat feature is that, FS becomes
* incompatible only if fast commit blocks are present in the FS. Since we
* clear the journal (and thus the fast commit blocks), we don't mark FS as
* incompatible. We also have a JBD2 incompat feature, which gets set when
* there are fast commit blocks present in the journal.
*/
#define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400
#define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800
@ -2776,12 +2798,16 @@ extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
int ext4_fc_info_show(struct seq_file *seq, void *v);
void ext4_fc_init(struct super_block *sb, journal_t *journal);
void ext4_fc_init_inode(struct inode *inode);
void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start,
void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
ext4_lblk_t end);
void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry);
void ext4_fc_track_link(struct inode *inode, struct dentry *dentry);
void ext4_fc_track_create(struct inode *inode, struct dentry *dentry);
void ext4_fc_track_inode(struct inode *inode);
void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode,
struct dentry *dentry);
void __ext4_fc_track_link(handle_t *handle, struct inode *inode,
struct dentry *dentry);
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
void ext4_fc_mark_ineligible(struct super_block *sb, int reason);
void ext4_fc_start_ineligible(struct super_block *sb, int reason);
void ext4_fc_stop_ineligible(struct super_block *sb);
@ -3495,7 +3521,7 @@ extern void ext4_initialize_dirent_tail(struct buffer_head *bh,
unsigned int blocksize);
extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name,
struct inode *inode);
extern int __ext4_link(struct inode *dir, struct inode *inode,
struct dentry *dentry);

View File

@ -3724,7 +3724,6 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
out:
ext4_ext_show_leaf(inode, path);
ext4_fc_track_range(inode, ee_block, ee_block + ee_len - 1);
return err;
}
@ -3796,7 +3795,6 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
if (*allocated > map->m_len)
*allocated = map->m_len;
map->m_len = *allocated;
ext4_fc_track_range(inode, ee_block, ee_block + ee_len - 1);
return 0;
}
@ -4329,7 +4327,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
map->m_len = ar.len;
allocated = map->m_len;
ext4_ext_show_leaf(inode, path);
ext4_fc_track_range(inode, map->m_lblk, map->m_lblk + map->m_len - 1);
out:
ext4_ext_drop_refs(path);
kfree(path);
@ -4602,7 +4599,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
ret = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret))
goto out_handle;
ext4_fc_track_range(inode, offset >> inode->i_sb->s_blocksize_bits,
ext4_fc_track_range(handle, inode, offset >> inode->i_sb->s_blocksize_bits,
(offset + len - 1) >> inode->i_sb->s_blocksize_bits);
/* Zero out partial block at the edges of the range */
ret = ext4_zero_partial_blocks(handle, inode, offset, len);
@ -4651,8 +4648,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
FALLOC_FL_INSERT_RANGE))
return -EOPNOTSUPP;
ext4_fc_track_range(inode, offset >> blkbits,
(offset + len - 1) >> blkbits);
ext4_fc_start_update(inode);

View File

@ -83,7 +83,7 @@
*
* Atomicity of commits
* --------------------
* In order to gaurantee atomicity during the commit operation, fast commit
* In order to guarantee atomicity during the commit operation, fast commit
* uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
* tag contains CRC of the contents and TID of the transaction after which
* this fast commit should be applied. Recovery code replays fast commit
@ -152,7 +152,31 @@ void ext4_fc_init_inode(struct inode *inode)
INIT_LIST_HEAD(&ei->i_fc_list);
init_waitqueue_head(&ei->i_fc_wait);
atomic_set(&ei->i_fc_updates, 0);
ei->i_fc_committed_subtid = 0;
}
/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
wait_queue_head_t *wq;
struct ext4_inode_info *ei = EXT4_I(inode);
#if (BITS_PER_LONG < 64)
DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
EXT4_STATE_FC_COMMITTING);
wq = bit_waitqueue(&ei->i_state_flags,
EXT4_STATE_FC_COMMITTING);
#else
DEFINE_WAIT_BIT(wait, &ei->i_flags,
EXT4_STATE_FC_COMMITTING);
wq = bit_waitqueue(&ei->i_flags,
EXT4_STATE_FC_COMMITTING);
#endif
lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
schedule();
finish_wait(wq, &wait.wq_entry);
}
/*
@ -176,22 +200,7 @@ void ext4_fc_start_update(struct inode *inode)
goto out;
if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
wait_queue_head_t *wq;
#if (BITS_PER_LONG < 64)
DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
EXT4_STATE_FC_COMMITTING);
wq = bit_waitqueue(&ei->i_state_flags,
EXT4_STATE_FC_COMMITTING);
#else
DEFINE_WAIT_BIT(wait, &ei->i_flags,
EXT4_STATE_FC_COMMITTING);
wq = bit_waitqueue(&ei->i_flags,
EXT4_STATE_FC_COMMITTING);
#endif
prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
schedule();
finish_wait(wq, &wait.wq_entry);
ext4_fc_wait_committing_inode(inode);
goto restart;
}
out:
@ -234,26 +243,10 @@ void ext4_fc_del(struct inode *inode)
}
if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
wait_queue_head_t *wq;
#if (BITS_PER_LONG < 64)
DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
EXT4_STATE_FC_COMMITTING);
wq = bit_waitqueue(&ei->i_state_flags,
EXT4_STATE_FC_COMMITTING);
#else
DEFINE_WAIT_BIT(wait, &ei->i_flags,
EXT4_STATE_FC_COMMITTING);
wq = bit_waitqueue(&ei->i_flags,
EXT4_STATE_FC_COMMITTING);
#endif
prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
schedule();
finish_wait(wq, &wait.wq_entry);
ext4_fc_wait_committing_inode(inode);
goto restart;
}
if (!list_empty(&ei->i_fc_list))
list_del_init(&ei->i_fc_list);
list_del_init(&ei->i_fc_list);
spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}
@ -269,7 +262,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
return;
sbi->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
WARN_ON(reason >= EXT4_FC_REASON_MAX);
sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}
@ -302,14 +295,14 @@ void ext4_fc_stop_ineligible(struct super_block *sb)
(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
return;
EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
}
static inline int ext4_fc_is_ineligible(struct super_block *sb)
{
return (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FC_INELIGIBLE) ||
atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates);
return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
}
/*
@ -323,13 +316,14 @@ static inline int ext4_fc_is_ineligible(struct super_block *sb)
* If enqueue is set, this function enqueues the inode in fast commit list.
*/
static int ext4_fc_track_template(
struct inode *inode, int (*__fc_track_fn)(struct inode *, void *, bool),
handle_t *handle, struct inode *inode,
int (*__fc_track_fn)(struct inode *, void *, bool),
void *args, int enqueue)
{
tid_t running_txn_tid;
bool update = false;
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
tid_t tid = 0;
int ret;
if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
@ -339,15 +333,13 @@ static int ext4_fc_track_template(
if (ext4_fc_is_ineligible(inode->i_sb))
return -EINVAL;
running_txn_tid = sbi->s_journal ?
sbi->s_journal->j_commit_sequence + 1 : 0;
tid = handle->h_transaction->t_tid;
mutex_lock(&ei->i_fc_lock);
if (running_txn_tid == ei->i_sync_tid) {
if (tid == ei->i_sync_tid) {
update = true;
} else {
ext4_fc_reset_inode(inode);
ei->i_sync_tid = running_txn_tid;
ei->i_sync_tid = tid;
}
ret = __fc_track_fn(inode, args, update);
mutex_unlock(&ei->i_fc_lock);
@ -358,7 +350,7 @@ static int ext4_fc_track_template(
spin_lock(&sbi->s_fc_lock);
if (list_empty(&EXT4_I(inode)->i_fc_list))
list_add_tail(&EXT4_I(inode)->i_fc_list,
(sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) ?
(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
&sbi->s_fc_q[FC_Q_STAGING] :
&sbi->s_fc_q[FC_Q_MAIN]);
spin_unlock(&sbi->s_fc_lock);
@ -384,7 +376,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
mutex_unlock(&ei->i_fc_lock);
node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
if (!node) {
ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MEM);
ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
mutex_lock(&ei->i_fc_lock);
return -ENOMEM;
}
@ -397,7 +389,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
if (!node->fcd_name.name) {
kmem_cache_free(ext4_fc_dentry_cachep, node);
ext4_fc_mark_ineligible(inode->i_sb,
EXT4_FC_REASON_MEM);
EXT4_FC_REASON_NOMEM);
mutex_lock(&ei->i_fc_lock);
return -ENOMEM;
}
@ -411,7 +403,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
node->fcd_name.len = dentry->d_name.len;
spin_lock(&sbi->s_fc_lock);
if (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING)
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
list_add_tail(&node->fcd_list,
&sbi->s_fc_dentry_q[FC_Q_STAGING]);
else
@ -422,7 +414,8 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
return 0;
}
void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry)
void __ext4_fc_track_unlink(handle_t *handle,
struct inode *inode, struct dentry *dentry)
{
struct __track_dentry_update_args args;
int ret;
@ -430,12 +423,18 @@ void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry)
args.dentry = dentry;
args.op = EXT4_FC_TAG_UNLINK;
ret = ext4_fc_track_template(inode, __track_dentry_update,
ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
(void *)&args, 0);
trace_ext4_fc_track_unlink(inode, dentry, ret);
}
void ext4_fc_track_link(struct inode *inode, struct dentry *dentry)
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}
void __ext4_fc_track_link(handle_t *handle,
struct inode *inode, struct dentry *dentry)
{
struct __track_dentry_update_args args;
int ret;
@ -443,20 +442,26 @@ void ext4_fc_track_link(struct inode *inode, struct dentry *dentry)
args.dentry = dentry;
args.op = EXT4_FC_TAG_LINK;
ret = ext4_fc_track_template(inode, __track_dentry_update,
ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
(void *)&args, 0);
trace_ext4_fc_track_link(inode, dentry, ret);
}
void ext4_fc_track_create(struct inode *inode, struct dentry *dentry)
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
struct __track_dentry_update_args args;
struct inode *inode = d_inode(dentry);
int ret;
args.dentry = dentry;
args.op = EXT4_FC_TAG_CREAT;
ret = ext4_fc_track_template(inode, __track_dentry_update,
ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
(void *)&args, 0);
trace_ext4_fc_track_create(inode, dentry, ret);
}
@ -472,14 +477,20 @@ static int __track_inode(struct inode *inode, void *arg, bool update)
return 0;
}
void ext4_fc_track_inode(struct inode *inode)
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
int ret;
if (S_ISDIR(inode->i_mode))
return;
ret = ext4_fc_track_template(inode, __track_inode, NULL, 1);
if (ext4_should_journal_data(inode)) {
ext4_fc_mark_ineligible(inode->i_sb,
EXT4_FC_REASON_INODE_JOURNAL_DATA);
return;
}
ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
trace_ext4_fc_track_inode(inode, ret);
}
@ -515,7 +526,7 @@ static int __track_range(struct inode *inode, void *arg, bool update)
return 0;
}
void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start,
void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
ext4_lblk_t end)
{
struct __track_range_args args;
@ -527,7 +538,7 @@ void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start,
args.start = start;
args.end = end;
ret = ext4_fc_track_template(inode, __track_range, &args, 1);
ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);
trace_ext4_fc_track_range(inode, start, end, ret);
}
@ -537,10 +548,11 @@ static void ext4_fc_submit_bh(struct super_block *sb)
int write_flags = REQ_SYNC;
struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
/* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
if (test_opt(sb, BARRIER))
write_flags |= REQ_FUA | REQ_PREFLUSH;
lock_buffer(bh);
clear_buffer_dirty(bh);
set_buffer_dirty(bh);
set_buffer_uptodate(bh);
bh->b_end_io = ext4_end_buffer_io_sync;
submit_bh(REQ_OP_WRITE, write_flags, bh);
@ -846,7 +858,7 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal)
int ret = 0;
spin_lock(&sbi->s_fc_lock);
sbi->s_mount_flags |= EXT4_MF_FC_COMMITTING;
ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
@ -900,6 +912,8 @@ static int ext4_fc_wait_inode_data_all(journal_t *journal)
/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
struct super_block *sb = (struct super_block *)(journal->j_private);
struct ext4_sb_info *sbi = EXT4_SB(sb);
@ -996,6 +1010,13 @@ static int ext4_fc_perform_commit(journal_t *journal)
if (ret)
return ret;
/*
* If file system device is different from journal device, issue a cache
* flush before we start writing fast commit blocks.
*/
if (journal->j_fs_dev != journal->j_dev)
blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
blk_start_plug(&plug);
if (sbi->s_fc_bytes == 0) {
/*
@ -1031,8 +1052,6 @@ static int ext4_fc_perform_commit(journal_t *journal)
if (ret)
goto out;
spin_lock(&sbi->s_fc_lock);
EXT4_I(inode)->i_fc_committed_subtid =
atomic_read(&sbi->s_fc_subtid);
}
spin_unlock(&sbi->s_fc_lock);
@ -1131,7 +1150,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
"Fast commit ended with blks = %d, reason = %d, subtid - %d",
nblks, reason, subtid);
if (reason == EXT4_FC_REASON_FC_FAILED)
return jbd2_fc_end_commit_fallback(journal, commit_tid);
return jbd2_fc_end_commit_fallback(journal);
if (reason == EXT4_FC_REASON_FC_START_FAILED ||
reason == EXT4_FC_REASON_INELIGIBLE)
return jbd2_complete_transaction(journal, commit_tid);
@ -1190,8 +1209,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full)
list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
&sbi->s_fc_q[FC_Q_STAGING]);
sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING;
sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE;
ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
if (full)
sbi->s_fc_bytes = 0;
@ -1263,7 +1282,7 @@ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
return 0;
}
ret = __ext4_unlink(old_parent, &entry, inode);
ret = __ext4_unlink(NULL, old_parent, &entry, inode);
/* -ENOENT ok coz it might not exist anymore. */
if (ret == -ENOENT)
ret = 0;
@ -2079,8 +2098,6 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
int num_fc_blocks;
/*
* We set replay callback even if fast commit disabled because we may
* could still have fast commit blocks that need to be replayed even if
@ -2090,21 +2107,9 @@ void ext4_fc_init(struct super_block *sb, journal_t *journal)
if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
return;
journal->j_fc_cleanup_callback = ext4_fc_cleanup;
if (!buffer_uptodate(journal->j_sb_buffer)
&& ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO,
true)) {
ext4_msg(sb, KERN_ERR, "I/O error on journal");
return;
}
num_fc_blocks = be32_to_cpu(journal->j_superblock->s_num_fc_blks);
if (jbd2_fc_init(journal, num_fc_blocks ? num_fc_blocks :
EXT4_NUM_FC_BLKS)) {
pr_warn("Error while enabling fast commits, turning off.");
ext4_clear_feature_fast_commit(sb);
}
}
const char *fc_ineligible_reasons[] = {
static const char *fc_ineligible_reasons[] = {
"Extended attributes changed",
"Cross rename",
"Journal flag changed",
@ -2113,6 +2118,7 @@ const char *fc_ineligible_reasons[] = {
"Resize",
"Dir renamed",
"Falloc range op",
"Data journalling",
"FC Commit Failed"
};

View File

@ -3,9 +3,6 @@
#ifndef __FAST_COMMIT_H__
#define __FAST_COMMIT_H__
/* Number of blocks in journal area to allocate for fast commits */
#define EXT4_NUM_FC_BLKS 256
/* Fast commit tags */
#define EXT4_FC_TAG_ADD_RANGE 0x0001
#define EXT4_FC_TAG_DEL_RANGE 0x0002
@ -100,11 +97,12 @@ enum {
EXT4_FC_REASON_XATTR = 0,
EXT4_FC_REASON_CROSS_RENAME,
EXT4_FC_REASON_JOURNAL_FLAG_CHANGE,
EXT4_FC_REASON_MEM,
EXT4_FC_REASON_NOMEM,
EXT4_FC_REASON_SWAP_BOOT,
EXT4_FC_REASON_RESIZE,
EXT4_FC_REASON_RENAME_DIR,
EXT4_FC_REASON_FALLOC_RANGE,
EXT4_FC_REASON_INODE_JOURNAL_DATA,
EXT4_FC_COMMIT_FAILED,
EXT4_FC_REASON_MAX
};

View File

@ -763,7 +763,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
if (!daxdev_mapping_supported(vma, dax_dev))
return -EOPNOTSUPP;
ext4_fc_start_update(inode);
file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
@ -771,7 +770,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
} else {
vma->vm_ops = &ext4_file_vm_ops;
}
ext4_fc_stop_update(inode);
return 0;
}
@ -784,13 +782,13 @@ static int ext4_sample_last_mounted(struct super_block *sb,
handle_t *handle;
int err;
if (likely(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED))
if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))
return 0;
if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
return 0;
sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);
/*
* Sample where the filesystem has been mounted and
* store it in the superblock for sysadmin convenience

View File

@ -280,7 +280,7 @@ static int ext4_getfsmap_logdev(struct super_block *sb, struct ext4_fsmap *keys,
/* Fabricate an rmap entry for the external log device. */
irec.fmr_physical = journal->j_blk_offset;
irec.fmr_length = journal->j_maxlen;
irec.fmr_length = journal->j_total_len;
irec.fmr_owner = EXT4_FMR_OWN_LOG;
irec.fmr_flags = 0;

View File

@ -143,7 +143,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (sb_rdonly(inode->i_sb)) {
/* Make sure that we read updated s_mount_flags value */
smp_rmb();
if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))
ret = -EROFS;
goto out;
}

View File

@ -1899,6 +1899,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
ext4_write_lock_xattr(inode, &no_expand);
if (!ext4_has_inline_data(inode)) {
ext4_write_unlock_xattr(inode, &no_expand);
*has_inline = 0;
ext4_journal_stop(handle);
return 0;

View File

@ -328,6 +328,8 @@ void ext4_evict_inode(struct inode *inode)
ext4_xattr_inode_array_free(ea_inode_array);
return;
no_delete:
if (!list_empty(&EXT4_I(inode)->i_fc_list))
ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
}
@ -731,7 +733,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
if (ret)
return ret;
}
ext4_fc_track_range(inode, map->m_lblk,
ext4_fc_track_range(handle, inode, map->m_lblk,
map->m_lblk + map->m_len - 1);
}
@ -2453,7 +2455,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
struct super_block *sb = inode->i_sb;
if (ext4_forced_shutdown(EXT4_SB(sb)) ||
EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
goto invalidate_dirty_pages;
/*
* Let the uper layers retry transient errors.
@ -2687,7 +2689,7 @@ static int ext4_writepages(struct address_space *mapping,
* the stack trace.
*/
if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) ||
sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) {
ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) {
ret = -EROFS;
goto out_writepages;
}
@ -3334,8 +3336,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
EXT4_I(inode)->i_datasync_tid))
return false;
if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
return atomic_read(&EXT4_SB(inode->i_sb)->s_fc_subtid) <
EXT4_I(inode)->i_fc_committed_subtid;
return !list_empty(&EXT4_I(inode)->i_fc_list);
return true;
}
@ -4133,7 +4134,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
up_write(&EXT4_I(inode)->i_data_sem);
}
ext4_fc_track_range(inode, first_block, stop_block);
ext4_fc_track_range(handle, inode, first_block, stop_block);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
@ -5466,14 +5467,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (shrink)
ext4_fc_track_range(inode,
ext4_fc_track_range(handle, inode,
(attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
inode->i_sb->s_blocksize_bits,
(oldsize > 0 ? oldsize - 1 : 0) >>
inode->i_sb->s_blocksize_bits);
else
ext4_fc_track_range(
inode,
handle, inode,
(oldsize > 0 ? oldsize - 1 : oldsize) >>
inode->i_sb->s_blocksize_bits,
(attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
@ -5723,7 +5724,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
put_bh(iloc->bh);
return -EIO;
}
ext4_fc_track_inode(inode);
ext4_fc_track_inode(handle, inode);
if (IS_I_VERSION(inode))
inode_inc_iversion(inode);

View File

@ -4477,7 +4477,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb)
{
ext4_group_t i, ngroups;
if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
return;
ngroups = ext4_get_groups_count(sb);
@ -4508,7 +4508,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
return;
mb_debug(sb, "Can't allocate:"
@ -5167,7 +5167,7 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
struct super_block *sb = ar->inode->i_sb;
ext4_group_t group;
ext4_grpblk_t blkoff;
int i;
int i = sb->s_blocksize;
ext4_fsblk_t goal, block;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;

View File

@ -2717,7 +2717,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl)
{
handle_t *handle;
struct inode *inode, *inode_save;
struct inode *inode;
int err, credits, retries = 0;
err = dquot_initialize(dir);
@ -2735,11 +2735,9 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
inode_save = inode;
ihold(inode_save);
err = ext4_add_nondir(handle, dentry, &inode);
ext4_fc_track_create(inode_save, dentry);
iput(inode_save);
if (!err)
ext4_fc_track_create(handle, dentry);
}
if (handle)
ext4_journal_stop(handle);
@ -2754,7 +2752,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t rdev)
{
handle_t *handle;
struct inode *inode, *inode_save;
struct inode *inode;
int err, credits, retries = 0;
err = dquot_initialize(dir);
@ -2771,12 +2769,9 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &ext4_special_inode_operations;
inode_save = inode;
ihold(inode_save);
err = ext4_add_nondir(handle, dentry, &inode);
if (!err)
ext4_fc_track_create(inode_save, dentry);
iput(inode_save);
ext4_fc_track_create(handle, dentry);
}
if (handle)
ext4_journal_stop(handle);
@ -2941,7 +2936,6 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
iput(inode);
goto out_retry;
}
ext4_fc_track_create(inode, dentry);
ext4_inc_count(dir);
ext4_update_dx_flag(dir);
@ -2949,6 +2943,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
if (err)
goto out_clear_inode;
d_instantiate_new(dentry, inode);
ext4_fc_track_create(handle, dentry);
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
@ -3286,7 +3281,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
goto end_rmdir;
ext4_dec_count(dir);
ext4_update_dx_flag(dir);
ext4_fc_track_unlink(inode, dentry);
ext4_fc_track_unlink(handle, dentry);
retval = ext4_mark_inode_dirty(handle, dir);
#ifdef CONFIG_UNICODE
@ -3307,13 +3302,12 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
return retval;
}
int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name,
struct inode *inode)
{
int retval = -ENOENT;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
handle_t *handle = NULL;
int skip_remove_dentry = 0;
ext4_lblk_t lblk;
@ -3333,14 +3327,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
skip_remove_dentry = 1;
else
goto out_bh;
}
handle = ext4_journal_start(dir, EXT4_HT_DIR,
EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
goto out_bh;
goto out;
}
if (IS_DIRSYNC(dir))
@ -3349,12 +3336,12 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
if (!skip_remove_dentry) {
retval = ext4_delete_entry(handle, dir, de, lblk, bh);
if (retval)
goto out_handle;
goto out;
dir->i_ctime = dir->i_mtime = current_time(dir);
ext4_update_dx_flag(dir);
retval = ext4_mark_inode_dirty(handle, dir);
if (retval)
goto out_handle;
goto out;
} else {
retval = 0;
}
@ -3368,15 +3355,14 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
inode->i_ctime = current_time(inode);
retval = ext4_mark_inode_dirty(handle, inode);
out_handle:
ext4_journal_stop(handle);
out_bh:
out:
brelse(bh);
return retval;
}
static int ext4_unlink(struct inode *dir, struct dentry *dentry)
{
handle_t *handle;
int retval;
if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
@ -3394,9 +3380,16 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (retval)
goto out_trace;
retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry));
handle = ext4_journal_start(dir, EXT4_HT_DIR,
EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
goto out_trace;
}
retval = __ext4_unlink(handle, dir, &dentry->d_name, d_inode(dentry));
if (!retval)
ext4_fc_track_unlink(d_inode(dentry), dentry);
ext4_fc_track_unlink(handle, dentry);
#ifdef CONFIG_UNICODE
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
@ -3407,6 +3400,8 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (IS_CASEFOLDED(dir))
d_invalidate(dentry);
#endif
if (handle)
ext4_journal_stop(handle);
out_trace:
trace_ext4_unlink_exit(dentry, retval);
@ -3563,7 +3558,6 @@ int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry)
err = ext4_add_entry(handle, dentry, inode);
if (!err) {
ext4_fc_track_link(inode, dentry);
err = ext4_mark_inode_dirty(handle, inode);
/* this can happen only for tmpfile being
* linked the first time
@ -3571,6 +3565,7 @@ int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry)
if (inode->i_nlink == 1)
ext4_orphan_del(handle, inode);
d_instantiate(dentry, inode);
ext4_fc_track_link(handle, dentry);
} else {
drop_nlink(inode);
iput(inode);
@ -4035,9 +4030,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
EXT4_FC_REASON_RENAME_DIR);
} else {
if (new.inode)
ext4_fc_track_unlink(new.inode, new.dentry);
ext4_fc_track_link(old.inode, new.dentry);
ext4_fc_track_unlink(old.inode, old.dentry);
ext4_fc_track_unlink(handle, new.dentry);
__ext4_fc_track_link(handle, old.inode, new.dentry);
__ext4_fc_track_unlink(handle, old.inode, old.dentry);
}
if (new.inode) {

View File

@ -686,7 +686,7 @@ static void ext4_handle_error(struct super_block *sb)
if (!test_opt(sb, ERRORS_CONT)) {
journal_t *journal = EXT4_SB(sb)->s_journal;
EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
if (journal)
jbd2_journal_abort(journal, -EIO);
}
@ -904,7 +904,7 @@ void __ext4_abort(struct super_block *sb, const char *function,
va_end(args);
if (sb_rdonly(sb) == 0) {
EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
if (EXT4_SB(sb)->s_journal)
jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
@ -1716,11 +1716,10 @@ enum {
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
Opt_prefetch_block_bitmaps, Opt_no_fc,
Opt_prefetch_block_bitmaps,
#ifdef CONFIG_EXT4_DEBUG
Opt_fc_debug_max_replay,
Opt_fc_debug_max_replay, Opt_fc_debug_force
#endif
Opt_fc_debug_force
};
static const match_table_t tokens = {
@ -1807,9 +1806,8 @@ static const match_table_t tokens = {
{Opt_init_itable, "init_itable=%u"},
{Opt_init_itable, "init_itable"},
{Opt_noinit_itable, "noinit_itable"},
{Opt_no_fc, "no_fc"},
{Opt_fc_debug_force, "fc_debug_force"},
#ifdef CONFIG_EXT4_DEBUG
{Opt_fc_debug_force, "fc_debug_force"},
{Opt_fc_debug_max_replay, "fc_debug_max_replay=%u"},
#endif
{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
@ -2027,8 +2025,8 @@ static const struct mount_opts {
{Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA),
MOPT_CLEAR | MOPT_Q},
{Opt_usrjquota, 0, MOPT_Q},
{Opt_grpjquota, 0, MOPT_Q},
{Opt_usrjquota, 0, MOPT_Q | MOPT_STRING},
{Opt_grpjquota, 0, MOPT_Q | MOPT_STRING},
{Opt_offusrjquota, 0, MOPT_Q},
{Opt_offgrpjquota, 0, MOPT_Q},
{Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
@ -2039,11 +2037,9 @@ static const struct mount_opts {
{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
{Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS,
MOPT_SET},
{Opt_no_fc, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
MOPT_CLEAR | MOPT_2 | MOPT_EXT4_ONLY},
#ifdef CONFIG_EXT4_DEBUG
{Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
#ifdef CONFIG_EXT4_DEBUG
{Opt_fc_debug_max_replay, 0, MOPT_GTE0},
#endif
{Opt_err, 0, 0}
@ -2153,7 +2149,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);
return 1;
case Opt_abort:
sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
return 1;
case Opt_i_version:
sb->s_flags |= SB_I_VERSION;
@ -3976,7 +3972,7 @@ int ext4_calculate_overhead(struct super_block *sb)
* loaded or not
*/
if (sbi->s_journal && !sbi->s_journal_bdev)
overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len);
else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
/* j_inum for internal journal is non-zero */
j_inode = ext4_get_journal_inode(sb, j_inum);
@ -4334,9 +4330,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
#endif
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, and O_DIRECT support!\n");
printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, O_DIRECT and fast_commit support!\n");
/* can't mount with both data=journal and dioread_nolock. */
clear_opt(sb, DIOREAD_NOLOCK);
clear_opt2(sb, JOURNAL_FAST_COMMIT);
if (test_opt2(sb, EXPLICIT_DELALLOC)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"both data=journal and delalloc");
@ -4771,8 +4768,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
sbi->s_fc_bytes = 0;
sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE;
sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING;
ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
spin_lock_init(&sbi->s_fc_lock);
memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
sbi->s_fc_replay_state.fc_regions = NULL;
@ -4851,6 +4848,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount_wq;
}
if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
!jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) {
ext4_msg(sb, KERN_ERR,
"Failed to set fast commit journal feature");
goto failed_mount_wq;
}
/* We have now updated the journal if required, so we can
* validate the data journaling mode. */
switch (test_opt(sb, DATA_FLAGS)) {
@ -5861,7 +5866,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user");
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
@ -5875,7 +5880,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) {
if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) {
err = -EROFS;
goto restore_opts;
}
@ -6549,10 +6554,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
brelse(bh);
out:
if (inode->i_size < off + len) {
ext4_fc_track_range(inode,
(inode->i_size > 0 ? inode->i_size - 1 : 0)
>> inode->i_sb->s_blocksize_bits,
(off + len) >> inode->i_sb->s_blocksize_bits);
i_size_write(inode, off + len);
EXT4_I(inode)->i_disksize = inode->i_size;
err2 = ext4_mark_inode_dirty(handle, inode);

View File

@ -106,6 +106,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
* for a checkpoint to free up some space in the log.
*/
void __jbd2_log_wait_for_space(journal_t *journal)
__acquires(&journal->j_state_lock)
__releases(&journal->j_state_lock)
{
int nblocks, space_left;
/* assert_spin_locked(&journal->j_state_lock); */

View File

@ -450,6 +450,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)
schedule();
write_lock(&journal->j_state_lock);
finish_wait(&journal->j_fc_wait, &wait);
/*
* TODO: by blocking fast commits here, we are increasing
* fsync() latency slightly. Strictly speaking, we don't need
* to block fast commits until the transaction enters T_FLUSH
* state. So an optimization is possible where we block new fast
* commits here and wait for existing ones to complete
* just before we enter T_FLUSH. That way, the existing fast
* commits and this full commit can proceed parallely.
*/
}
write_unlock(&journal->j_state_lock);
@ -801,7 +810,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
if (first_block < journal->j_tail)
freed += journal->j_last - journal->j_first;
/* Update tail only if we free significant amount of space */
if (freed < journal->j_maxlen / 4)
if (freed < jbd2_journal_get_max_txn_bufs(journal))
update_tail = 0;
}
J_ASSERT(commit_transaction->t_state == T_COMMIT);

View File

@ -727,6 +727,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
*/
int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
{
if (unlikely(is_journal_aborted(journal)))
return -EIO;
/*
* Fast commits only allowed if at least one full commit has
* been processed.
@ -734,10 +736,12 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
if (!journal->j_stats.ts_tid)
return -EINVAL;
if (tid <= journal->j_commit_sequence)
return -EALREADY;
write_lock(&journal->j_state_lock);
if (tid <= journal->j_commit_sequence) {
write_unlock(&journal->j_state_lock);
return -EALREADY;
}
if (journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
(journal->j_flags & JBD2_FAST_COMMIT_ONGOING)) {
DEFINE_WAIT(wait);
@ -777,13 +781,19 @@ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
int jbd2_fc_end_commit(journal_t *journal)
{
return __jbd2_fc_end_commit(journal, 0, 0);
return __jbd2_fc_end_commit(journal, 0, false);
}
EXPORT_SYMBOL(jbd2_fc_end_commit);
int jbd2_fc_end_commit_fallback(journal_t *journal, tid_t tid)
int jbd2_fc_end_commit_fallback(journal_t *journal)
{
return __jbd2_fc_end_commit(journal, tid, 1);
tid_t tid;
read_lock(&journal->j_state_lock);
tid = journal->j_running_transaction ?
journal->j_running_transaction->t_tid : 0;
read_unlock(&journal->j_state_lock);
return __jbd2_fc_end_commit(journal, tid, true);
}
EXPORT_SYMBOL(jbd2_fc_end_commit_fallback);
@ -865,7 +875,6 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out)
int fc_off;
*bh_out = NULL;
write_lock(&journal->j_state_lock);
if (journal->j_fc_off + journal->j_fc_first < journal->j_fc_last) {
fc_off = journal->j_fc_off;
@ -874,7 +883,6 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out)
} else {
ret = -EINVAL;
}
write_unlock(&journal->j_state_lock);
if (ret)
return ret;
@ -887,11 +895,7 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out)
if (!bh)
return -ENOMEM;
lock_buffer(bh);
clear_buffer_uptodate(bh);
set_buffer_dirty(bh);
unlock_buffer(bh);
journal->j_fc_wbuf[fc_off] = bh;
*bh_out = bh;
@ -909,9 +913,7 @@ int jbd2_fc_wait_bufs(journal_t *journal, int num_blks)
struct buffer_head *bh;
int i, j_fc_off;
read_lock(&journal->j_state_lock);
j_fc_off = journal->j_fc_off;
read_unlock(&journal->j_state_lock);
/*
* Wait in reverse order to minimize chances of us being woken up before
@ -939,9 +941,7 @@ int jbd2_fc_release_bufs(journal_t *journal)
struct buffer_head *bh;
int i, j_fc_off;
read_lock(&journal->j_state_lock);
j_fc_off = journal->j_fc_off;
read_unlock(&journal->j_state_lock);
/*
* Wait in reverse order to minimize chances of us being woken up before
@ -1348,23 +1348,16 @@ static journal_t *journal_init_common(struct block_device *bdev,
journal->j_dev = bdev;
journal->j_fs_dev = fs_dev;
journal->j_blk_offset = start;
journal->j_maxlen = len;
journal->j_total_len = len;
/* We need enough buffers to write out full descriptor block. */
n = journal->j_blocksize / jbd2_min_tag_size();
journal->j_wbufsize = n;
journal->j_fc_wbuf = NULL;
journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *),
GFP_KERNEL);
if (!journal->j_wbuf)
goto err_cleanup;
if (journal->j_fc_wbufsize > 0) {
journal->j_fc_wbuf = kmalloc_array(journal->j_fc_wbufsize,
sizeof(struct buffer_head *),
GFP_KERNEL);
if (!journal->j_fc_wbuf)
goto err_cleanup;
}
bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize);
if (!bh) {
pr_err("%s: Cannot get buffer for journal superblock\n",
@ -1378,23 +1371,11 @@ static journal_t *journal_init_common(struct block_device *bdev,
err_cleanup:
kfree(journal->j_wbuf);
kfree(journal->j_fc_wbuf);
jbd2_journal_destroy_revoke(journal);
kfree(journal);
return NULL;
}
int jbd2_fc_init(journal_t *journal, int num_fc_blks)
{
journal->j_fc_wbufsize = num_fc_blks;
journal->j_fc_wbuf = kmalloc_array(journal->j_fc_wbufsize,
sizeof(struct buffer_head *), GFP_KERNEL);
if (!journal->j_fc_wbuf)
return -ENOMEM;
return 0;
}
EXPORT_SYMBOL(jbd2_fc_init);
/* jbd2_journal_init_dev and jbd2_journal_init_inode:
*
* Create a journal structure assigned some fixed set of disk blocks to
@ -1512,16 +1493,7 @@ static int journal_reset(journal_t *journal)
}
journal->j_first = first;
if (jbd2_has_feature_fast_commit(journal) &&
journal->j_fc_wbufsize > 0) {
journal->j_fc_last = last;
journal->j_last = last - journal->j_fc_wbufsize;
journal->j_fc_first = journal->j_last + 1;
journal->j_fc_off = 0;
} else {
journal->j_last = last;
}
journal->j_last = last;
journal->j_head = journal->j_first;
journal->j_tail = journal->j_first;
@ -1531,7 +1503,14 @@ static int journal_reset(journal_t *journal)
journal->j_commit_sequence = journal->j_transaction_sequence - 1;
journal->j_commit_request = journal->j_commit_sequence;
journal->j_max_transaction_buffers = journal->j_maxlen / 4;
journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal);
/*
* Now that journal recovery is done, turn fast commits off here. This
* way, if fast commit was enabled before the crash but if now FS has
* disabled it, we don't enable fast commits.
*/
jbd2_clear_feature_fast_commit(journal);
/*
* As a special case, if the on-disk copy is already marked as needing
@ -1792,15 +1771,15 @@ static int journal_get_superblock(journal_t *journal)
goto out;
}
if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len)
journal->j_total_len = be32_to_cpu(sb->s_maxlen);
else if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) {
printk(KERN_WARNING "JBD2: journal file too short\n");
goto out;
}
if (be32_to_cpu(sb->s_first) == 0 ||
be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
be32_to_cpu(sb->s_first) >= journal->j_total_len) {
printk(KERN_WARNING
"JBD2: Invalid start block of journal: %u\n",
be32_to_cpu(sb->s_first));
@ -1872,6 +1851,7 @@ static int load_superblock(journal_t *journal)
{
int err;
journal_superblock_t *sb;
int num_fc_blocks;
err = journal_get_superblock(journal);
if (err)
@ -1883,15 +1863,17 @@ static int load_superblock(journal_t *journal)
journal->j_tail = be32_to_cpu(sb->s_start);
journal->j_first = be32_to_cpu(sb->s_first);
journal->j_errno = be32_to_cpu(sb->s_errno);
journal->j_last = be32_to_cpu(sb->s_maxlen);
if (jbd2_has_feature_fast_commit(journal) &&
journal->j_fc_wbufsize > 0) {
if (jbd2_has_feature_fast_commit(journal)) {
journal->j_fc_last = be32_to_cpu(sb->s_maxlen);
journal->j_last = journal->j_fc_last - journal->j_fc_wbufsize;
num_fc_blocks = be32_to_cpu(sb->s_num_fc_blks);
if (!num_fc_blocks)
num_fc_blocks = JBD2_MIN_FC_BLOCKS;
if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS)
journal->j_last = journal->j_fc_last - num_fc_blocks;
journal->j_fc_first = journal->j_last + 1;
journal->j_fc_off = 0;
} else {
journal->j_last = be32_to_cpu(sb->s_maxlen);
}
return 0;
@ -1954,9 +1936,6 @@ int jbd2_journal_load(journal_t *journal)
*/
journal->j_flags &= ~JBD2_ABORT;
if (journal->j_fc_wbufsize > 0)
jbd2_journal_set_features(journal, 0, 0,
JBD2_FEATURE_INCOMPAT_FAST_COMMIT);
/* OK, we've finished with the dynamic journal bits:
* reinitialise the dynamic contents of the superblock in memory
* and reset them on disk. */
@ -2040,8 +2019,7 @@ int jbd2_journal_destroy(journal_t *journal)
jbd2_journal_destroy_revoke(journal);
if (journal->j_chksum_driver)
crypto_free_shash(journal->j_chksum_driver);
if (journal->j_fc_wbufsize > 0)
kfree(journal->j_fc_wbuf);
kfree(journal->j_fc_wbuf);
kfree(journal->j_wbuf);
kfree(journal);
@ -2116,6 +2094,37 @@ int jbd2_journal_check_available_features(journal_t *journal, unsigned long comp
return 0;
}
static int
jbd2_journal_initialize_fast_commit(journal_t *journal)
{
journal_superblock_t *sb = journal->j_superblock;
unsigned long long num_fc_blks;
num_fc_blks = be32_to_cpu(sb->s_num_fc_blks);
if (num_fc_blks == 0)
num_fc_blks = JBD2_MIN_FC_BLOCKS;
if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS)
return -ENOSPC;
/* Are we called twice? */
WARN_ON(journal->j_fc_wbuf != NULL);
journal->j_fc_wbuf = kmalloc_array(num_fc_blks,
sizeof(struct buffer_head *), GFP_KERNEL);
if (!journal->j_fc_wbuf)
return -ENOMEM;
journal->j_fc_wbufsize = num_fc_blks;
journal->j_fc_last = journal->j_last;
journal->j_last = journal->j_fc_last - num_fc_blks;
journal->j_fc_first = journal->j_last + 1;
journal->j_fc_off = 0;
journal->j_free = journal->j_last - journal->j_first;
journal->j_max_transaction_buffers =
jbd2_journal_get_max_txn_bufs(journal);
return 0;
}
/**
* int jbd2_journal_set_features() - Mark a given journal feature in the superblock
* @journal: Journal to act on.
@ -2159,6 +2168,13 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
sb = journal->j_superblock;
if (incompat & JBD2_FEATURE_INCOMPAT_FAST_COMMIT) {
if (jbd2_journal_initialize_fast_commit(journal)) {
pr_err("JBD2: Cannot enable fast commits.\n");
return 0;
}
}
/* Load the checksum driver if necessary */
if ((journal->j_chksum_driver == NULL) &&
INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {

View File

@ -74,8 +74,8 @@ static int do_readahead(journal_t *journal, unsigned int start)
/* Do up to 128K of readahead */
max = start + (128 * 1024 / journal->j_blocksize);
if (max > journal->j_maxlen)
max = journal->j_maxlen;
if (max > journal->j_total_len)
max = journal->j_total_len;
/* Do the readahead itself. We'll submit MAXBUF buffer_heads at
* a time to the block device IO layer. */
@ -134,7 +134,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
*bhp = NULL;
if (offset >= journal->j_maxlen) {
if (offset >= journal->j_total_len) {
printk(KERN_ERR "JBD2: corrupted journal superblock\n");
return -EFSCORRUPTED;
}

View File

@ -195,8 +195,10 @@ static void wait_transaction_switching(journal_t *journal)
DEFINE_WAIT(wait);
if (WARN_ON(!journal->j_running_transaction ||
journal->j_running_transaction->t_state != T_SWITCH))
journal->j_running_transaction->t_state != T_SWITCH)) {
read_unlock(&journal->j_state_lock);
return;
}
prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
TASK_UNINTERRUPTIBLE);
read_unlock(&journal->j_state_lock);

View File

@ -316,10 +316,6 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp)
fh_copy(&resp->dirfh, &argp->fh);
fh_init(&resp->fh, NFS3_FHSIZE);
if (argp->ftype == 0 || argp->ftype >= NF3BAD) {
resp->status = nfserr_inval;
goto out;
}
if (argp->ftype == NF3CHR || argp->ftype == NF3BLK) {
rdev = MKDEV(argp->major, argp->minor);
if (MAJOR(rdev) != argp->major ||
@ -328,7 +324,7 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp)
goto out;
}
} else if (argp->ftype != NF3SOCK && argp->ftype != NF3FIFO) {
resp->status = nfserr_inval;
resp->status = nfserr_badtype;
goto out;
}

View File

@ -1114,6 +1114,7 @@ nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_pathconfres *resp = rqstp->rq_resp;
*p++ = resp->status;
*p++ = xdr_zero; /* no post_op_attr */
if (resp->status == 0) {

View File

@ -1299,7 +1299,7 @@ nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src,
struct nfsd_file *dst)
{
nfs42_ssc_close(src->nf_file);
nfsd_file_put(src);
/* 'src' is freed by nfsd4_do_async_copy */
nfsd_file_put(dst);
mntput(ss_mnt);
}
@ -1486,6 +1486,7 @@ static int nfsd4_do_async_copy(void *data)
cb_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL);
if (!cb_copy)
goto out;
refcount_set(&cb_copy->refcount, 1);
memcpy(&cb_copy->cp_res, &copy->cp_res, sizeof(copy->cp_res));
cb_copy->cp_clp = copy->cp_clp;
cb_copy->nfserr = copy->nfserr;

View File

@ -877,7 +877,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
goto done;
}
trace_ocfs2_journal_init_maxlen(j_journal->j_maxlen);
trace_ocfs2_journal_init_maxlen(j_journal->j_total_len);
*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
OCFS2_JOURNAL_DIRTY_FL);

View File

@ -68,6 +68,7 @@ extern void *jbd2_alloc(size_t size, gfp_t flags);
extern void jbd2_free(void *ptr, size_t size);
#define JBD2_MIN_JOURNAL_BLOCKS 1024
#define JBD2_MIN_FC_BLOCKS 256
#ifdef __KERNEL__
@ -944,8 +945,9 @@ struct journal_s
/**
* @j_fc_off:
*
* Number of fast commit blocks currently allocated.
* [j_state_lock].
* Number of fast commit blocks currently allocated. Accessed only
* during fast commit. Currently only process can do fast commit, so
* this field is not protected by any lock.
*/
unsigned long j_fc_off;
@ -988,9 +990,9 @@ struct journal_s
struct block_device *j_fs_dev;
/**
* @j_maxlen: Total maximum capacity of the journal region on disk.
* @j_total_len: Total maximum capacity of the journal region on disk.
*/
unsigned int j_maxlen;
unsigned int j_total_len;
/**
* @j_reserved_credits:
@ -1108,8 +1110,9 @@ struct journal_s
struct buffer_head **j_wbuf;
/**
* @j_fc_wbuf: Array of fast commit bhs for
* jbd2_journal_commit_transaction.
* @j_fc_wbuf: Array of fast commit bhs for fast commit. Accessed only
* during a fast commit. Currently only process can do fast commit, so
* this field is not protected by any lock.
*/
struct buffer_head **j_fc_wbuf;
@ -1614,16 +1617,20 @@ extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *);
extern int jbd2_cleanup_journal_tail(journal_t *);
/* Fast commit related APIs */
int jbd2_fc_init(journal_t *journal, int num_fc_blks);
int jbd2_fc_begin_commit(journal_t *journal, tid_t tid);
int jbd2_fc_end_commit(journal_t *journal);
int jbd2_fc_end_commit_fallback(journal_t *journal, tid_t tid);
int jbd2_fc_end_commit_fallback(journal_t *journal);
int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out);
int jbd2_submit_inode_data(struct jbd2_inode *jinode);
int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode);
int jbd2_fc_wait_bufs(journal_t *journal, int num_blks);
int jbd2_fc_release_bufs(journal_t *journal);
static inline int jbd2_journal_get_max_txn_bufs(journal_t *journal)
{
return (journal->j_total_len - journal->j_fc_wbufsize) / 4;
}
/*
* is_journal_abort
*

View File

@ -100,11 +100,12 @@ TRACE_DEFINE_ENUM(ES_REFERENCED_B);
{ EXT4_FC_REASON_XATTR, "XATTR"}, \
{ EXT4_FC_REASON_CROSS_RENAME, "CROSS_RENAME"}, \
{ EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, "JOURNAL_FLAG_CHANGE"}, \
{ EXT4_FC_REASON_MEM, "NO_MEM"}, \
{ EXT4_FC_REASON_NOMEM, "NO_MEM"}, \
{ EXT4_FC_REASON_SWAP_BOOT, "SWAP_BOOT"}, \
{ EXT4_FC_REASON_RESIZE, "RESIZE"}, \
{ EXT4_FC_REASON_RENAME_DIR, "RENAME_DIR"}, \
{ EXT4_FC_REASON_FALLOC_RANGE, "FALLOC_RANGE"})
{ EXT4_FC_REASON_FALLOC_RANGE, "FALLOC_RANGE"}, \
{ EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"})
TRACE_EVENT(ext4_other_inode_update_time,
TP_PROTO(struct inode *inode, ino_t orig_ino),
@ -2917,17 +2918,18 @@ TRACE_EVENT(ext4_fc_stats,
),
TP_printk("dev %d:%d fc ineligible reasons:\n"
"%s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s,%d; "
"%s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d; "
"num_commits:%ld, ineligible: %ld, numblks: %ld",
MAJOR(__entry->dev), MINOR(__entry->dev),
FC_REASON_NAME_STAT(EXT4_FC_REASON_XATTR),
FC_REASON_NAME_STAT(EXT4_FC_REASON_CROSS_RENAME),
FC_REASON_NAME_STAT(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE),
FC_REASON_NAME_STAT(EXT4_FC_REASON_MEM),
FC_REASON_NAME_STAT(EXT4_FC_REASON_NOMEM),
FC_REASON_NAME_STAT(EXT4_FC_REASON_SWAP_BOOT),
FC_REASON_NAME_STAT(EXT4_FC_REASON_RESIZE),
FC_REASON_NAME_STAT(EXT4_FC_REASON_RENAME_DIR),
FC_REASON_NAME_STAT(EXT4_FC_REASON_FALLOC_RANGE),
FC_REASON_NAME_STAT(EXT4_FC_REASON_INODE_JOURNAL_DATA),
__entry->sbi->s_fc_stats.fc_num_commits,
__entry->sbi->s_fc_stats.fc_ineligible_commits,
__entry->sbi->s_fc_stats.fc_numblks)

View File

@ -655,10 +655,10 @@ TRACE_EVENT(rpc_xdr_overflow,
__field(size_t, tail_len)
__field(unsigned int, page_len)
__field(unsigned int, len)
__string(progname,
xdr->rqst->rq_task->tk_client->cl_program->name)
__string(procedure,
xdr->rqst->rq_task->tk_msg.rpc_proc->p_name)
__string(progname, xdr->rqst ?
xdr->rqst->rq_task->tk_client->cl_program->name : "unknown")
__string(procedure, xdr->rqst ?
xdr->rqst->rq_task->tk_msg.rpc_proc->p_name : "unknown")
),
TP_fast_assign(

View File

@ -454,7 +454,10 @@ static void exit_mm(void)
mmap_read_unlock(mm);
self.task = current;
self.next = xchg(&core_state->dumper.next, &self);
if (self.task->flags & PF_SIGNALED)
self.next = xchg(&core_state->dumper.next, &self);
else
self.task = NULL;
/*
* Implies mb(), the result of xchg() must be visible
* to core_state->dumper.

View File

@ -63,19 +63,20 @@ static int proc_do_xprt(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
char tmpbuf[256];
size_t len;
ssize_t len;
if ((*ppos && !write) || !*lenp) {
if (write || *ppos) {
*lenp = 0;
return 0;
}
len = svc_print_xprts(tmpbuf, sizeof(tmpbuf));
*lenp = memory_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len);
len = memory_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len);
if (*lenp < 0) {
if (len < 0) {
*lenp = 0;
return -EINVAL;
}
*lenp = len;
return 0;
}

View File

@ -12,11 +12,12 @@ turbostat : turbostat.c
override CFLAGS += -O2 -Wall -I../../../include
override CFLAGS += -DMSRHEADER='"../../../../arch/x86/include/asm/msr-index.h"'
override CFLAGS += -DINTEL_FAMILY_HEADER='"../../../../arch/x86/include/asm/intel-family.h"'
override CFLAGS += -D_FILE_OFFSET_BITS=64
override CFLAGS += -D_FORTIFY_SOURCE=2
%: %.c
@mkdir -p $(BUILD_OUTPUT)
$(CC) $(CFLAGS) $< -o $(BUILD_OUTPUT)/$@ $(LDFLAGS) -lcap
$(CC) $(CFLAGS) $< -o $(BUILD_OUTPUT)/$@ $(LDFLAGS) -lcap -lrt
.PHONY : clean
clean :

View File

@ -335,7 +335,7 @@ that they count at TSC rate, which is true on all processors tested to date.
.SH REFERENCES
Volume 3B: System Programming Guide"
http://www.intel.com/products/processor/manuals/
https://www.intel.com/products/processor/manuals/
.SH FILES
.ta

View File

@ -79,6 +79,7 @@ unsigned long long gfx_cur_rc6_ms;
unsigned long long cpuidle_cur_cpu_lpi_us;
unsigned long long cpuidle_cur_sys_lpi_us;
unsigned int gfx_cur_mhz;
unsigned int gfx_act_mhz;
unsigned int tcc_activation_temp;
unsigned int tcc_activation_temp_override;
double rapl_power_units, rapl_time_units;
@ -210,13 +211,14 @@ struct pkg_data {
unsigned long long pkg_both_core_gfxe_c0;
long long gfx_rc6_ms;
unsigned int gfx_mhz;
unsigned int gfx_act_mhz;
unsigned int package_id;
unsigned int energy_pkg; /* MSR_PKG_ENERGY_STATUS */
unsigned int energy_dram; /* MSR_DRAM_ENERGY_STATUS */
unsigned int energy_cores; /* MSR_PP0_ENERGY_STATUS */
unsigned int energy_gfx; /* MSR_PP1_ENERGY_STATUS */
unsigned int rapl_pkg_perf_status; /* MSR_PKG_PERF_STATUS */
unsigned int rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */
unsigned long long energy_pkg; /* MSR_PKG_ENERGY_STATUS */
unsigned long long energy_dram; /* MSR_DRAM_ENERGY_STATUS */
unsigned long long energy_cores; /* MSR_PP0_ENERGY_STATUS */
unsigned long long energy_gfx; /* MSR_PP1_ENERGY_STATUS */
unsigned long long rapl_pkg_perf_status; /* MSR_PKG_PERF_STATUS */
unsigned long long rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */
unsigned int pkg_temp_c;
unsigned long long counter[MAX_ADDED_COUNTERS];
} *package_even, *package_odd;
@ -259,6 +261,113 @@ struct msr_counter {
#define SYSFS_PERCPU (1 << 1)
};
/*
* The accumulated sum of MSR is defined as a monotonic
* increasing MSR, it will be accumulated periodically,
* despite its register's bit width.
*/
enum {
IDX_PKG_ENERGY,
IDX_DRAM_ENERGY,
IDX_PP0_ENERGY,
IDX_PP1_ENERGY,
IDX_PKG_PERF,
IDX_DRAM_PERF,
IDX_COUNT,
};
int get_msr_sum(int cpu, off_t offset, unsigned long long *msr);
struct msr_sum_array {
/* get_msr_sum() = sum + (get_msr() - last) */
struct {
/*The accumulated MSR value is updated by the timer*/
unsigned long long sum;
/*The MSR footprint recorded in last timer*/
unsigned long long last;
} entries[IDX_COUNT];
};
/* The percpu MSR sum array.*/
struct msr_sum_array *per_cpu_msr_sum;
int idx_to_offset(int idx)
{
int offset;
switch (idx) {
case IDX_PKG_ENERGY:
offset = MSR_PKG_ENERGY_STATUS;
break;
case IDX_DRAM_ENERGY:
offset = MSR_DRAM_ENERGY_STATUS;
break;
case IDX_PP0_ENERGY:
offset = MSR_PP0_ENERGY_STATUS;
break;
case IDX_PP1_ENERGY:
offset = MSR_PP1_ENERGY_STATUS;
break;
case IDX_PKG_PERF:
offset = MSR_PKG_PERF_STATUS;
break;
case IDX_DRAM_PERF:
offset = MSR_DRAM_PERF_STATUS;
break;
default:
offset = -1;
}
return offset;
}
int offset_to_idx(int offset)
{
int idx;
switch (offset) {
case MSR_PKG_ENERGY_STATUS:
idx = IDX_PKG_ENERGY;
break;
case MSR_DRAM_ENERGY_STATUS:
idx = IDX_DRAM_ENERGY;
break;
case MSR_PP0_ENERGY_STATUS:
idx = IDX_PP0_ENERGY;
break;
case MSR_PP1_ENERGY_STATUS:
idx = IDX_PP1_ENERGY;
break;
case MSR_PKG_PERF_STATUS:
idx = IDX_PKG_PERF;
break;
case MSR_DRAM_PERF_STATUS:
idx = IDX_DRAM_PERF;
break;
default:
idx = -1;
}
return idx;
}
int idx_valid(int idx)
{
switch (idx) {
case IDX_PKG_ENERGY:
return do_rapl & RAPL_PKG;
case IDX_DRAM_ENERGY:
return do_rapl & RAPL_DRAM;
case IDX_PP0_ENERGY:
return do_rapl & RAPL_CORES_ENERGY_STATUS;
case IDX_PP1_ENERGY:
return do_rapl & RAPL_GFX;
case IDX_PKG_PERF:
return do_rapl & RAPL_PKG_PERF_STATUS;
case IDX_DRAM_PERF:
return do_rapl & RAPL_DRAM_PERF_STATUS;
default:
return 0;
}
}
struct sys_counters {
unsigned int added_thread_counters;
unsigned int added_core_counters;
@ -451,6 +560,7 @@ struct msr_counter bic[] = {
{ 0x0, "APIC" },
{ 0x0, "X2APIC" },
{ 0x0, "Die" },
{ 0x0, "GFXAMHz" },
};
#define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter))
@ -505,6 +615,7 @@ struct msr_counter bic[] = {
#define BIC_APIC (1ULL << 48)
#define BIC_X2APIC (1ULL << 49)
#define BIC_Die (1ULL << 50)
#define BIC_GFXACTMHz (1ULL << 51)
#define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC)
@ -724,6 +835,9 @@ void print_header(char *delim)
if (DO_BIC(BIC_GFXMHz))
outp += sprintf(outp, "%sGFXMHz", (printed++ ? delim : ""));
if (DO_BIC(BIC_GFXACTMHz))
outp += sprintf(outp, "%sGFXAMHz", (printed++ ? delim : ""));
if (DO_BIC(BIC_Totl_c0))
outp += sprintf(outp, "%sTotl%%C0", (printed++ ? delim : ""));
if (DO_BIC(BIC_Any_c0))
@ -858,13 +972,13 @@ int dump_counters(struct thread_data *t, struct core_data *c,
outp += sprintf(outp, "pc10: %016llX\n", p->pc10);
outp += sprintf(outp, "cpu_lpi: %016llX\n", p->cpu_lpi);
outp += sprintf(outp, "sys_lpi: %016llX\n", p->sys_lpi);
outp += sprintf(outp, "Joules PKG: %0X\n", p->energy_pkg);
outp += sprintf(outp, "Joules COR: %0X\n", p->energy_cores);
outp += sprintf(outp, "Joules GFX: %0X\n", p->energy_gfx);
outp += sprintf(outp, "Joules RAM: %0X\n", p->energy_dram);
outp += sprintf(outp, "Throttle PKG: %0X\n",
outp += sprintf(outp, "Joules PKG: %0llX\n", p->energy_pkg);
outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores);
outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx);
outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram);
outp += sprintf(outp, "Throttle PKG: %0llX\n",
p->rapl_pkg_perf_status);
outp += sprintf(outp, "Throttle RAM: %0X\n",
outp += sprintf(outp, "Throttle RAM: %0llX\n",
p->rapl_dram_perf_status);
outp += sprintf(outp, "PTM: %dC\n", p->pkg_temp_c);
@ -1062,14 +1176,7 @@ int format_counters(struct thread_data *t, struct core_data *c,
}
}
/*
* If measurement interval exceeds minimum RAPL Joule Counter range,
* indicate that results are suspect by printing "**" in fraction place.
*/
if (interval_float < rapl_joule_counter_range)
fmt8 = "%s%.2f";
else
fmt8 = "%6.0f**";
fmt8 = "%s%.2f";
if (DO_BIC(BIC_CorWatt) && (do_rapl & RAPL_PER_CORE_ENERGY))
outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units / interval_float);
@ -1098,6 +1205,10 @@ int format_counters(struct thread_data *t, struct core_data *c,
if (DO_BIC(BIC_GFXMHz))
outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->gfx_mhz);
/* GFXACTMHz */
if (DO_BIC(BIC_GFXACTMHz))
outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->gfx_act_mhz);
/* Totl%C0, Any%C0 GFX%C0 CPUGFX% */
if (DO_BIC(BIC_Totl_c0))
outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_wtd_core_c0/tsc);
@ -1210,11 +1321,7 @@ void format_all_counters(struct thread_data *t, struct core_data *c, struct pkg_
}
#define DELTA_WRAP32(new, old) \
if (new > old) { \
old = new - old; \
} else { \
old = 0x100000000 + new - old; \
}
old = ((((unsigned long long)new << 32) - ((unsigned long long)old << 32)) >> 32);
int
delta_package(struct pkg_data *new, struct pkg_data *old)
@ -1253,13 +1360,14 @@ delta_package(struct pkg_data *new, struct pkg_data *old)
old->gfx_rc6_ms = new->gfx_rc6_ms - old->gfx_rc6_ms;
old->gfx_mhz = new->gfx_mhz;
old->gfx_act_mhz = new->gfx_act_mhz;
DELTA_WRAP32(new->energy_pkg, old->energy_pkg);
DELTA_WRAP32(new->energy_cores, old->energy_cores);
DELTA_WRAP32(new->energy_gfx, old->energy_gfx);
DELTA_WRAP32(new->energy_dram, old->energy_dram);
DELTA_WRAP32(new->rapl_pkg_perf_status, old->rapl_pkg_perf_status);
DELTA_WRAP32(new->rapl_dram_perf_status, old->rapl_dram_perf_status);
old->energy_pkg = new->energy_pkg - old->energy_pkg;
old->energy_cores = new->energy_cores - old->energy_cores;
old->energy_gfx = new->energy_gfx - old->energy_gfx;
old->energy_dram = new->energy_dram - old->energy_dram;
old->rapl_pkg_perf_status = new->rapl_pkg_perf_status - old->rapl_pkg_perf_status;
old->rapl_dram_perf_status = new->rapl_dram_perf_status - old->rapl_dram_perf_status;
for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
if (mp->format == FORMAT_RAW)
@ -1469,6 +1577,7 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data
p->gfx_rc6_ms = 0;
p->gfx_mhz = 0;
p->gfx_act_mhz = 0;
for (i = 0, mp = sys.tp; mp; i++, mp = mp->next)
t->counter[i] = 0;
@ -1564,6 +1673,7 @@ int sum_counters(struct thread_data *t, struct core_data *c,
average.packages.gfx_rc6_ms = p->gfx_rc6_ms;
average.packages.gfx_mhz = p->gfx_mhz;
average.packages.gfx_act_mhz = p->gfx_act_mhz;
average.packages.pkg_temp_c = MAX(average.packages.pkg_temp_c, p->pkg_temp_c);
@ -1784,7 +1894,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
int i;
if (cpu_migrate(cpu)) {
fprintf(outf, "Could not migrate to CPU %d\n", cpu);
fprintf(outf, "get_counters: Could not migrate to CPU %d\n", cpu);
return -1;
}
@ -1966,39 +2076,39 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
p->sys_lpi = cpuidle_cur_sys_lpi_us;
if (do_rapl & RAPL_PKG) {
if (get_msr(cpu, MSR_PKG_ENERGY_STATUS, &msr))
if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr))
return -13;
p->energy_pkg = msr & 0xFFFFFFFF;
p->energy_pkg = msr;
}
if (do_rapl & RAPL_CORES_ENERGY_STATUS) {
if (get_msr(cpu, MSR_PP0_ENERGY_STATUS, &msr))
if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr))
return -14;
p->energy_cores = msr & 0xFFFFFFFF;
p->energy_cores = msr;
}
if (do_rapl & RAPL_DRAM) {
if (get_msr(cpu, MSR_DRAM_ENERGY_STATUS, &msr))
if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr))
return -15;
p->energy_dram = msr & 0xFFFFFFFF;
p->energy_dram = msr;
}
if (do_rapl & RAPL_GFX) {
if (get_msr(cpu, MSR_PP1_ENERGY_STATUS, &msr))
if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr))
return -16;
p->energy_gfx = msr & 0xFFFFFFFF;
p->energy_gfx = msr;
}
if (do_rapl & RAPL_PKG_PERF_STATUS) {
if (get_msr(cpu, MSR_PKG_PERF_STATUS, &msr))
if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr))
return -16;
p->rapl_pkg_perf_status = msr & 0xFFFFFFFF;
p->rapl_pkg_perf_status = msr;
}
if (do_rapl & RAPL_DRAM_PERF_STATUS) {
if (get_msr(cpu, MSR_DRAM_PERF_STATUS, &msr))
if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr))
return -16;
p->rapl_dram_perf_status = msr & 0xFFFFFFFF;
p->rapl_dram_perf_status = msr;
}
if (do_rapl & RAPL_AMD_F17H) {
if (get_msr(cpu, MSR_PKG_ENERGY_STAT, &msr))
if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr))
return -13;
p->energy_pkg = msr & 0xFFFFFFFF;
p->energy_pkg = msr;
}
if (DO_BIC(BIC_PkgTmp)) {
if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr))
@ -2012,6 +2122,9 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
if (DO_BIC(BIC_GFXMHz))
p->gfx_mhz = gfx_cur_mhz;
if (DO_BIC(BIC_GFXACTMHz))
p->gfx_act_mhz = gfx_act_mhz;
for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
if (get_mp(cpu, mp, &p->counter[i]))
return -10;
@ -2173,6 +2286,7 @@ int has_turbo_ratio_group_limits(int family, int model)
case INTEL_FAM6_ATOM_GOLDMONT:
case INTEL_FAM6_SKYLAKE_X:
case INTEL_FAM6_ATOM_GOLDMONT_D:
case INTEL_FAM6_ATOM_TREMONT_D:
return 1;
}
return 0;
@ -2650,7 +2764,12 @@ int get_thread_siblings(struct cpu_topology *thiscpu)
sprintf(path,
"/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpu);
filep = fopen_or_die(path, "r");
filep = fopen(path, "r");
if (!filep) {
warnx("%s: open failed", path);
return -1;
}
do {
offset -= BITMASK_SIZE;
if (fscanf(filep, "%lx%c", &map, &character) != 2)
@ -2763,18 +2882,25 @@ void re_initialize(void)
{
free_all_buffers();
setup_all_buffers();
printf("turbostat: re-initialized with num_cpus %d\n", topo.num_cpus);
fprintf(outf, "turbostat: re-initialized with num_cpus %d\n", topo.num_cpus);
}
void set_max_cpu_num(void)
{
FILE *filep;
int base_cpu;
unsigned long dummy;
char pathname[64];
base_cpu = sched_getcpu();
if (base_cpu < 0)
err(1, "cannot find calling cpu ID");
sprintf(pathname,
"/sys/devices/system/cpu/cpu%d/topology/thread_siblings",
base_cpu);
filep = fopen_or_die(pathname, "r");
topo.max_cpu_num = 0;
filep = fopen_or_die(
"/sys/devices/system/cpu/cpu0/topology/thread_siblings",
"r");
while (fscanf(filep, "%lx,", &dummy) == 1)
topo.max_cpu_num += BITMASK_SIZE;
fclose(filep);
@ -2915,6 +3041,33 @@ int snapshot_gfx_mhz(void)
return 0;
}
/*
* snapshot_gfx_cur_mhz()
*
* record snapshot of
* /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz
*
* return 1 if config change requires a restart, else return 0
*/
int snapshot_gfx_act_mhz(void)
{
static FILE *fp;
int retval;
if (fp == NULL)
fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", "r");
else {
rewind(fp);
fflush(fp);
}
retval = fscanf(fp, "%d", &gfx_act_mhz);
if (retval != 1)
err(1, "GFX ACT MHz");
return 0;
}
/*
* snapshot_cpu_lpi()
*
@ -2980,6 +3133,9 @@ int snapshot_proc_sysfs_files(void)
if (DO_BIC(BIC_GFXMHz))
snapshot_gfx_mhz();
if (DO_BIC(BIC_GFXACTMHz))
snapshot_gfx_act_mhz();
if (DO_BIC(BIC_CPU_LPI))
snapshot_cpu_lpi_us();
@ -3057,6 +3213,111 @@ void do_sleep(void)
}
}
int get_msr_sum(int cpu, off_t offset, unsigned long long *msr)
{
int ret, idx;
unsigned long long msr_cur, msr_last;
if (!per_cpu_msr_sum)
return 1;
idx = offset_to_idx(offset);
if (idx < 0)
return idx;
/* get_msr_sum() = sum + (get_msr() - last) */
ret = get_msr(cpu, offset, &msr_cur);
if (ret)
return ret;
msr_last = per_cpu_msr_sum[cpu].entries[idx].last;
DELTA_WRAP32(msr_cur, msr_last);
*msr = msr_last + per_cpu_msr_sum[cpu].entries[idx].sum;
return 0;
}
timer_t timerid;
/* Timer callback, update the sum of MSRs periodically. */
static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
int i, ret;
int cpu = t->cpu_id;
for (i = IDX_PKG_ENERGY; i < IDX_COUNT; i++) {
unsigned long long msr_cur, msr_last;
int offset;
if (!idx_valid(i))
continue;
offset = idx_to_offset(i);
if (offset < 0)
continue;
ret = get_msr(cpu, offset, &msr_cur);
if (ret) {
fprintf(outf, "Can not update msr(0x%x)\n", offset);
continue;
}
msr_last = per_cpu_msr_sum[cpu].entries[i].last;
per_cpu_msr_sum[cpu].entries[i].last = msr_cur & 0xffffffff;
DELTA_WRAP32(msr_cur, msr_last);
per_cpu_msr_sum[cpu].entries[i].sum += msr_last;
}
return 0;
}
static void
msr_record_handler(union sigval v)
{
for_all_cpus(update_msr_sum, EVEN_COUNTERS);
}
void msr_sum_record(void)
{
struct itimerspec its;
struct sigevent sev;
per_cpu_msr_sum = calloc(topo.max_cpu_num + 1, sizeof(struct msr_sum_array));
if (!per_cpu_msr_sum) {
fprintf(outf, "Can not allocate memory for long time MSR.\n");
return;
}
/*
* Signal handler might be restricted, so use thread notifier instead.
*/
memset(&sev, 0, sizeof(struct sigevent));
sev.sigev_notify = SIGEV_THREAD;
sev.sigev_notify_function = msr_record_handler;
sev.sigev_value.sival_ptr = &timerid;
if (timer_create(CLOCK_REALTIME, &sev, &timerid) == -1) {
fprintf(outf, "Can not create timer.\n");
goto release_msr;
}
its.it_value.tv_sec = 0;
its.it_value.tv_nsec = 1;
/*
* A wraparound time has been calculated early.
* Some sources state that the peak power for a
* microprocessor is usually 1.5 times the TDP rating,
* use 2 * TDP for safety.
*/
its.it_interval.tv_sec = rapl_joule_counter_range / 2;
its.it_interval.tv_nsec = 0;
if (timer_settime(timerid, 0, &its, NULL) == -1) {
fprintf(outf, "Can not set timer.\n");
goto release_timer;
}
return;
release_timer:
timer_delete(timerid);
release_msr:
free(per_cpu_msr_sum);
}
void turbostat_loop()
{
@ -3075,7 +3336,7 @@ void turbostat_loop()
if (retval < -1) {
exit(retval);
} else if (retval == -1) {
if (restarted > 1) {
if (restarted > 10) {
exit(retval);
}
re_initialize();
@ -3279,6 +3540,7 @@ int probe_nhm_msrs(unsigned int family, unsigned int model)
case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */
case INTEL_FAM6_ATOM_TREMONT: /* EHL */
case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */
pkg_cstate_limits = glm_pkg_cstate_limits;
break;
default:
@ -3361,6 +3623,17 @@ int is_ehl(unsigned int family, unsigned int model)
}
return 0;
}
int is_jvl(unsigned int family, unsigned int model)
{
if (!genuine_intel)
return 0;
switch (model) {
case INTEL_FAM6_ATOM_TREMONT_D:
return 1;
}
return 0;
}
int has_turbo_ratio_limit(unsigned int family, unsigned int model)
{
@ -3474,6 +3747,20 @@ int has_config_tdp(unsigned int family, unsigned int model)
}
}
static void
remove_underbar(char *s)
{
char *to = s;
while (*s) {
if (*s != '_')
*to++ = *s;
s++;
}
*to = 0;
}
static void
dump_cstate_pstate_config_info(unsigned int family, unsigned int model)
{
@ -3530,9 +3817,6 @@ dump_sysfs_cstate_config(void)
int state;
char *sp;
if (!DO_BIC(BIC_sysfs))
return;
if (access("/sys/devices/system/cpu/cpuidle", R_OK)) {
fprintf(outf, "cpuidle not loaded\n");
return;
@ -3559,6 +3843,8 @@ dump_sysfs_cstate_config(void)
*sp = '\0';
fclose(input);
remove_underbar(name_buf);
sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/desc",
base_cpu, state);
input = fopen(path, "r");
@ -3645,7 +3931,7 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p)
return 0;
if (cpu_migrate(cpu)) {
fprintf(outf, "Could not migrate to CPU %d\n", cpu);
fprintf(outf, "print_epb: Could not migrate to CPU %d\n", cpu);
return -1;
}
@ -3689,7 +3975,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p)
return 0;
if (cpu_migrate(cpu)) {
fprintf(outf, "Could not migrate to CPU %d\n", cpu);
fprintf(outf, "print_hwp: Could not migrate to CPU %d\n", cpu);
return -1;
}
@ -3777,7 +4063,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data
return 0;
if (cpu_migrate(cpu)) {
fprintf(outf, "Could not migrate to CPU %d\n", cpu);
fprintf(outf, "print_perf_limit: Could not migrate to CPU %d\n", cpu);
return -1;
}
@ -3881,13 +4167,8 @@ double get_tdp_intel(unsigned int model)
double get_tdp_amd(unsigned int family)
{
switch (family) {
case 0x17:
case 0x18:
default:
/* This is the max stock TDP of HEDT/Server Fam17h chips */
return 250.0;
}
/* This is the max stock TDP of HEDT/Server Fam17h+ chips */
return 280.0;
}
/*
@ -3959,6 +4240,14 @@ void rapl_probe_intel(unsigned int family, unsigned int model)
BIC_PRESENT(BIC_GFXWatt);
}
break;
case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */
do_rapl = RAPL_PKG | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO;
BIC_PRESENT(BIC_PKG__);
if (rapl_joules)
BIC_PRESENT(BIC_Pkg_J);
else
BIC_PRESENT(BIC_PkgWatt);
break;
case INTEL_FAM6_SKYLAKE_L: /* SKL */
case INTEL_FAM6_CANNONLAKE_L: /* CNL */
do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_GFX | RAPL_PKG_POWER_INFO;
@ -4069,27 +4358,20 @@ void rapl_probe_amd(unsigned int family, unsigned int model)
if (max_extended_level >= 0x80000007) {
__cpuid(0x80000007, eax, ebx, ecx, edx);
/* RAPL (Fam 17h) */
/* RAPL (Fam 17h+) */
has_rapl = edx & (1 << 14);
}
if (!has_rapl)
if (!has_rapl || family < 0x17)
return;
switch (family) {
case 0x17: /* Zen, Zen+ */
case 0x18: /* Hygon Dhyana */
do_rapl = RAPL_AMD_F17H | RAPL_PER_CORE_ENERGY;
if (rapl_joules) {
BIC_PRESENT(BIC_Pkg_J);
BIC_PRESENT(BIC_Cor_J);
} else {
BIC_PRESENT(BIC_PkgWatt);
BIC_PRESENT(BIC_CorWatt);
}
break;
default:
return;
do_rapl = RAPL_AMD_F17H | RAPL_PER_CORE_ENERGY;
if (rapl_joules) {
BIC_PRESENT(BIC_Pkg_J);
BIC_PRESENT(BIC_Cor_J);
} else {
BIC_PRESENT(BIC_PkgWatt);
BIC_PRESENT(BIC_CorWatt);
}
if (get_msr(base_cpu, MSR_RAPL_PWR_UNIT, &msr))
@ -4162,7 +4444,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p
return 0;
if (cpu_migrate(cpu)) {
fprintf(outf, "Could not migrate to CPU %d\n", cpu);
fprintf(outf, "print_thermal: Could not migrate to CPU %d\n", cpu);
return -1;
}
@ -4234,7 +4516,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
cpu = t->cpu_id;
if (cpu_migrate(cpu)) {
fprintf(outf, "Could not migrate to CPU %d\n", cpu);
fprintf(outf, "print_rapl: Could not migrate to CPU %d\n", cpu);
return -1;
}
@ -4361,6 +4643,7 @@ int has_snb_msrs(unsigned int family, unsigned int model)
case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */
case INTEL_FAM6_ATOM_TREMONT: /* EHL */
case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */
return 1;
}
return 0;
@ -4507,12 +4790,33 @@ double discover_bclk(unsigned int family, unsigned int model)
* below this value, including the Digital Thermal Sensor (DTS),
* Package Thermal Management Sensor (PTM), and thermal event thresholds.
*/
int set_temperature_target(struct thread_data *t, struct core_data *c, struct pkg_data *p)
int read_tcc_activation_temp()
{
unsigned long long msr;
unsigned int target_c_local;
int cpu;
unsigned int tcc, target_c, offset_c;
/* Temperature Target MSR is Nehalem and newer only */
if (!do_nhm_platform_info)
return 0;
if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr))
return 0;
target_c = (msr >> 16) & 0xFF;
offset_c = (msr >> 24) & 0xF;
tcc = target_c - offset_c;
if (!quiet)
fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n",
base_cpu, msr, tcc, target_c, offset_c);
return tcc;
}
int set_temperature_target(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
/* tcc_activation_temp is used only for dts or ptm */
if (!(do_dts || do_ptm))
return 0;
@ -4521,43 +4825,18 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk
if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
return 0;
cpu = t->cpu_id;
if (cpu_migrate(cpu)) {
fprintf(outf, "Could not migrate to CPU %d\n", cpu);
return -1;
}
if (tcc_activation_temp_override != 0) {
tcc_activation_temp = tcc_activation_temp_override;
fprintf(outf, "cpu%d: Using cmdline TCC Target (%d C)\n",
cpu, tcc_activation_temp);
fprintf(outf, "Using cmdline TCC Target (%d C)\n", tcc_activation_temp);
return 0;
}
/* Temperature Target MSR is Nehalem and newer only */
if (!do_nhm_platform_info)
goto guess;
tcc_activation_temp = read_tcc_activation_temp();
if (tcc_activation_temp)
return 0;
if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr))
goto guess;
target_c_local = (msr >> 16) & 0xFF;
if (!quiet)
fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n",
cpu, msr, target_c_local);
if (!target_c_local)
goto guess;
tcc_activation_temp = target_c_local;
return 0;
guess:
tcc_activation_temp = TJMAX_DEFAULT;
fprintf(outf, "cpu%d: Guessing tjMax %d C, Please use -T to specify\n",
cpu, tcc_activation_temp);
fprintf(outf, "Guessing tjMax %d C, Please use -T to specify\n", tcc_activation_temp);
return 0;
}
@ -4685,19 +4964,46 @@ unsigned int intel_model_duplicates(unsigned int model)
case INTEL_FAM6_ICELAKE_NNPI:
case INTEL_FAM6_TIGERLAKE_L:
case INTEL_FAM6_TIGERLAKE:
case INTEL_FAM6_ROCKETLAKE:
case INTEL_FAM6_LAKEFIELD:
case INTEL_FAM6_ALDERLAKE:
return INTEL_FAM6_CANNONLAKE_L;
case INTEL_FAM6_ATOM_TREMONT_D:
return INTEL_FAM6_ATOM_GOLDMONT_D;
case INTEL_FAM6_ATOM_TREMONT_L:
return INTEL_FAM6_ATOM_TREMONT;
case INTEL_FAM6_ICELAKE_X:
case INTEL_FAM6_SAPPHIRERAPIDS_X:
return INTEL_FAM6_SKYLAKE_X;
}
return model;
}
void print_dev_latency(void)
{
char *path = "/dev/cpu_dma_latency";
int fd;
int value;
int retval;
fd = open(path, O_RDONLY);
if (fd < 0) {
warn("fopen %s\n", path);
return;
}
retval = read(fd, (void *)&value, sizeof(int));
if (retval != sizeof(int)) {
warn("read %s\n", path);
close(fd);
return;
}
fprintf(outf, "/dev/cpu_dma_latency: %d usec (%s)\n",
value, value == 2000000000 ? "default" : "constrained");
close(fd);
}
void process_cpuid()
{
unsigned int eax, ebx, ecx, edx;
@ -4916,6 +5222,14 @@ void process_cpuid()
BIC_PRESENT(BIC_Mod_c6);
use_c1_residency_msr = 1;
}
if (is_jvl(family, model)) {
BIC_NOT_PRESENT(BIC_CPU_c3);
BIC_NOT_PRESENT(BIC_CPU_c7);
BIC_NOT_PRESENT(BIC_Pkgpc2);
BIC_NOT_PRESENT(BIC_Pkgpc3);
BIC_NOT_PRESENT(BIC_Pkgpc6);
BIC_NOT_PRESENT(BIC_Pkgpc7);
}
if (is_dnv(family, model)) {
BIC_PRESENT(BIC_CPU_c1);
BIC_NOT_PRESENT(BIC_CPU_c3);
@ -4935,9 +5249,12 @@ void process_cpuid()
BIC_NOT_PRESENT(BIC_Pkgpc7);
}
if (has_c8910_msrs(family, model)) {
BIC_PRESENT(BIC_Pkgpc8);
BIC_PRESENT(BIC_Pkgpc9);
BIC_PRESENT(BIC_Pkgpc10);
if (pkg_cstate_limit >= PCL__8)
BIC_PRESENT(BIC_Pkgpc8);
if (pkg_cstate_limit >= PCL__9)
BIC_PRESENT(BIC_Pkgpc9);
if (pkg_cstate_limit >= PCL_10)
BIC_PRESENT(BIC_Pkgpc10);
}
do_irtl_hsw = has_c8910_msrs(family, model);
if (has_skl_msrs(family, model)) {
@ -4966,6 +5283,8 @@ void process_cpuid()
if (!quiet)
dump_cstate_pstate_config_info(family, model);
if (!quiet)
print_dev_latency();
if (!quiet)
dump_sysfs_cstate_config();
if (!quiet)
@ -4980,6 +5299,9 @@ void process_cpuid()
if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK))
BIC_PRESENT(BIC_GFXMHz);
if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK))
BIC_PRESENT(BIC_GFXACTMHz);
if (!access("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", R_OK))
BIC_PRESENT(BIC_CPU_LPI);
else
@ -5390,7 +5712,7 @@ int get_and_dump_counters(void)
}
void print_version() {
fprintf(outf, "turbostat version 20.03.20"
fprintf(outf, "turbostat version 20.09.30"
" - Len Brown <lenb@kernel.org>\n");
}
@ -5597,6 +5919,8 @@ void probe_sysfs(void)
*sp = '%';
*(sp + 1) = '\0';
remove_underbar(name_buf);
fclose(input);
sprintf(path, "cpuidle/state%d/time", state);
@ -5624,6 +5948,8 @@ void probe_sysfs(void)
*sp = '\0';
fclose(input);
remove_underbar(name_buf);
sprintf(path, "cpuidle/state%d/usage", state);
if (is_deferred_skip(name_buf))
@ -5868,6 +6194,7 @@ int main(int argc, char **argv)
return 0;
}
msr_sum_record();
/*
* if any params left, it must be a command to fork
*/

View File

@ -622,6 +622,57 @@ void cmdline(int argc, char **argv)
}
}
/*
* Open a file, and exit on failure
*/
FILE *fopen_or_die(const char *path, const char *mode)
{
FILE *filep = fopen(path, "r");
if (!filep)
err(1, "%s: open failed", path);
return filep;
}
void err_on_hypervisor(void)
{
FILE *cpuinfo;
char *flags, *hypervisor;
char *buffer;
/* On VMs /proc/cpuinfo contains a "flags" entry for hypervisor */
cpuinfo = fopen_or_die("/proc/cpuinfo", "ro");
buffer = malloc(4096);
if (!buffer) {
fclose(cpuinfo);
err(-ENOMEM, "buffer malloc fail");
}
if (!fread(buffer, 1024, 1, cpuinfo)) {
fclose(cpuinfo);
free(buffer);
err(1, "Reading /proc/cpuinfo failed");
}
flags = strstr(buffer, "flags");
rewind(cpuinfo);
fseek(cpuinfo, flags - buffer, SEEK_SET);
if (!fgets(buffer, 4096, cpuinfo)) {
fclose(cpuinfo);
free(buffer);
err(1, "Reading /proc/cpuinfo failed");
}
fclose(cpuinfo);
hypervisor = strstr(buffer, "hypervisor");
free(buffer);
if (hypervisor)
err(-1,
"not supported on this virtual machine");
}
int get_msr(int cpu, int offset, unsigned long long *msr)
{
@ -635,8 +686,10 @@ int get_msr(int cpu, int offset, unsigned long long *msr)
err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, or run as root", pathname);
retval = pread(fd, msr, sizeof(*msr), offset);
if (retval != sizeof(*msr))
if (retval != sizeof(*msr)) {
err_on_hypervisor();
err(-1, "%s offset 0x%llx read failed", pathname, (unsigned long long)offset);
}
if (debug > 1)
fprintf(stderr, "get_msr(cpu%d, 0x%X, 0x%llX)\n", cpu, offset, *msr);
@ -1086,18 +1139,6 @@ int update_cpu_msrs(int cpu)
return 0;
}
/*
* Open a file, and exit on failure
*/
FILE *fopen_or_die(const char *path, const char *mode)
{
FILE *filep = fopen(path, "r");
if (!filep)
err(1, "%s: open failed", path);
return filep;
}
unsigned int get_pkg_num(int cpu)
{
FILE *fp;

View File

@ -1,10 +1,13 @@
# SPDX-License-Identifier: GPL-2.0-only
/aarch64/get-reg-list
/aarch64/get-reg-list-sve
/s390x/memop
/s390x/resets
/s390x/sync_regs_test
/x86_64/cr4_cpuid_sync_test
/x86_64/debug_regs
/x86_64/evmcs_test
/x86_64/kvm_pv_test
/x86_64/hyperv_cpuid
/x86_64/mmio_warning_test
/x86_64/platform_info_test
@ -24,6 +27,7 @@
/clear_dirty_log_test
/demand_paging_test
/dirty_log_test
/dirty_log_perf_test
/kvm_create_max_vcpus
/set_memory_region_test
/steal_time

View File

@ -34,13 +34,14 @@ ifeq ($(ARCH),s390)
endif
LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c lib/test_util.c
LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c
LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c lib/x86_64/handlers.S
LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c
LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c
TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test
TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
@ -58,14 +59,15 @@ TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test
TEST_GEN_PROGS_x86_64 += x86_64/debug_regs
TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test
TEST_GEN_PROGS_x86_64 += x86_64/user_msr_test
TEST_GEN_PROGS_x86_64 += clear_dirty_log_test
TEST_GEN_PROGS_x86_64 += demand_paging_test
TEST_GEN_PROGS_x86_64 += dirty_log_test
TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
TEST_GEN_PROGS_x86_64 += set_memory_region_test
TEST_GEN_PROGS_x86_64 += steal_time
TEST_GEN_PROGS_aarch64 += clear_dirty_log_test
TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list
TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list-sve
TEST_GEN_PROGS_aarch64 += demand_paging_test
TEST_GEN_PROGS_aarch64 += dirty_log_test
TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus
@ -111,14 +113,21 @@ LDFLAGS += -pthread $(no-pie-option) $(pgste-option)
include ../lib.mk
STATIC_LIBS := $(OUTPUT)/libkvm.a
LIBKVM_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM))
EXTRA_CLEAN += $(LIBKVM_OBJ) $(STATIC_LIBS) cscope.*
LIBKVM_C := $(filter %.c,$(LIBKVM))
LIBKVM_S := $(filter %.S,$(LIBKVM))
LIBKVM_C_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_C))
LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S))
EXTRA_CLEAN += $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(STATIC_LIBS) cscope.*
x := $(shell mkdir -p $(sort $(dir $(LIBKVM_OBJ))))
$(LIBKVM_OBJ): $(OUTPUT)/%.o: %.c
x := $(shell mkdir -p $(sort $(dir $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ))))
$(LIBKVM_C_OBJ): $(OUTPUT)/%.o: %.c
$(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
$(OUTPUT)/libkvm.a: $(LIBKVM_OBJ)
$(LIBKVM_S_OBJ): $(OUTPUT)/%.o: %.S
$(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ)
$(OUTPUT)/libkvm.a: $(LIBKVM_OBJS)
$(AR) crs $@ $^
x := $(shell mkdir -p $(sort $(dir $(TEST_GEN_PROGS))))

View File

@ -0,0 +1,3 @@
// SPDX-License-Identifier: GPL-2.0
#define REG_LIST_SVE
#include "get-reg-list.c"

View File

@ -0,0 +1,841 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Check for KVM_GET_REG_LIST regressions.
*
* Copyright (C) 2020, Red Hat, Inc.
*
* When attempting to migrate from a host with an older kernel to a host
* with a newer kernel we allow the newer kernel on the destination to
* list new registers with get-reg-list. We assume they'll be unused, at
* least until the guest reboots, and so they're relatively harmless.
* However, if the destination host with the newer kernel is missing
* registers which the source host with the older kernel has, then that's
* a regression in get-reg-list. This test checks for that regression by
* checking the current list against a blessed list. We should never have
* missing registers, but if new ones appear then they can probably be
* added to the blessed list. A completely new blessed list can be created
* by running the test with the --list command line argument.
*
* Note, the blessed list should be created from the oldest possible
* kernel. We can't go older than v4.15, though, because that's the first
* release to expose the ID system registers in KVM_GET_REG_LIST, see
* commit 93390c0a1b20 ("arm64: KVM: Hide unsupported AArch64 CPU features
* from guests"). Also, one must use the --core-reg-fixup command line
* option when running on an older kernel that doesn't include df205b5c6328
* ("KVM: arm64: Filter out invalid core register IDs in KVM_GET_REG_LIST")
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "kvm_util.h"
#include "test_util.h"
#include "processor.h"
#ifdef REG_LIST_SVE
#define reg_list_sve() (true)
#else
#define reg_list_sve() (false)
#endif
#define REG_MASK (KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_COPROC_MASK)
#define for_each_reg(i) \
for ((i) = 0; (i) < reg_list->n; ++(i))
#define for_each_missing_reg(i) \
for ((i) = 0; (i) < blessed_n; ++(i)) \
if (!find_reg(reg_list->reg, reg_list->n, blessed_reg[i]))
#define for_each_new_reg(i) \
for ((i) = 0; (i) < reg_list->n; ++(i)) \
if (!find_reg(blessed_reg, blessed_n, reg_list->reg[i]))
static struct kvm_reg_list *reg_list;
static __u64 base_regs[], vregs[], sve_regs[], rejects_set[];
static __u64 base_regs_n, vregs_n, sve_regs_n, rejects_set_n;
static __u64 *blessed_reg, blessed_n;
static bool find_reg(__u64 regs[], __u64 nr_regs, __u64 reg)
{
int i;
for (i = 0; i < nr_regs; ++i)
if (reg == regs[i])
return true;
return false;
}
static const char *str_with_index(const char *template, __u64 index)
{
char *str, *p;
int n;
str = strdup(template);
p = strstr(str, "##");
n = sprintf(p, "%lld", index);
strcat(p + n, strstr(template, "##") + 2);
return (const char *)str;
}
#define CORE_REGS_XX_NR_WORDS 2
#define CORE_SPSR_XX_NR_WORDS 2
#define CORE_FPREGS_XX_NR_WORDS 4
static const char *core_id_to_str(__u64 id)
{
__u64 core_off = id & ~REG_MASK, idx;
/*
* core_off is the offset into struct kvm_regs
*/
switch (core_off) {
case KVM_REG_ARM_CORE_REG(regs.regs[0]) ...
KVM_REG_ARM_CORE_REG(regs.regs[30]):
idx = (core_off - KVM_REG_ARM_CORE_REG(regs.regs[0])) / CORE_REGS_XX_NR_WORDS;
TEST_ASSERT(idx < 31, "Unexpected regs.regs index: %lld", idx);
return str_with_index("KVM_REG_ARM_CORE_REG(regs.regs[##])", idx);
case KVM_REG_ARM_CORE_REG(regs.sp):
return "KVM_REG_ARM_CORE_REG(regs.sp)";
case KVM_REG_ARM_CORE_REG(regs.pc):
return "KVM_REG_ARM_CORE_REG(regs.pc)";
case KVM_REG_ARM_CORE_REG(regs.pstate):
return "KVM_REG_ARM_CORE_REG(regs.pstate)";
case KVM_REG_ARM_CORE_REG(sp_el1):
return "KVM_REG_ARM_CORE_REG(sp_el1)";
case KVM_REG_ARM_CORE_REG(elr_el1):
return "KVM_REG_ARM_CORE_REG(elr_el1)";
case KVM_REG_ARM_CORE_REG(spsr[0]) ...
KVM_REG_ARM_CORE_REG(spsr[KVM_NR_SPSR - 1]):
idx = (core_off - KVM_REG_ARM_CORE_REG(spsr[0])) / CORE_SPSR_XX_NR_WORDS;
TEST_ASSERT(idx < KVM_NR_SPSR, "Unexpected spsr index: %lld", idx);
return str_with_index("KVM_REG_ARM_CORE_REG(spsr[##])", idx);
case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ...
KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]):
idx = (core_off - KVM_REG_ARM_CORE_REG(fp_regs.vregs[0])) / CORE_FPREGS_XX_NR_WORDS;
TEST_ASSERT(idx < 32, "Unexpected fp_regs.vregs index: %lld", idx);
return str_with_index("KVM_REG_ARM_CORE_REG(fp_regs.vregs[##])", idx);
case KVM_REG_ARM_CORE_REG(fp_regs.fpsr):
return "KVM_REG_ARM_CORE_REG(fp_regs.fpsr)";
case KVM_REG_ARM_CORE_REG(fp_regs.fpcr):
return "KVM_REG_ARM_CORE_REG(fp_regs.fpcr)";
}
TEST_FAIL("Unknown core reg id: 0x%llx", id);
return NULL;
}
static const char *sve_id_to_str(__u64 id)
{
__u64 sve_off, n, i;
if (id == KVM_REG_ARM64_SVE_VLS)
return "KVM_REG_ARM64_SVE_VLS";
sve_off = id & ~(REG_MASK | ((1ULL << 5) - 1));
i = id & (KVM_ARM64_SVE_MAX_SLICES - 1);
TEST_ASSERT(i == 0, "Currently we don't expect slice > 0, reg id 0x%llx", id);
switch (sve_off) {
case KVM_REG_ARM64_SVE_ZREG_BASE ...
KVM_REG_ARM64_SVE_ZREG_BASE + (1ULL << 5) * KVM_ARM64_SVE_NUM_ZREGS - 1:
n = (id >> 5) & (KVM_ARM64_SVE_NUM_ZREGS - 1);
TEST_ASSERT(id == KVM_REG_ARM64_SVE_ZREG(n, 0),
"Unexpected bits set in SVE ZREG id: 0x%llx", id);
return str_with_index("KVM_REG_ARM64_SVE_ZREG(##, 0)", n);
case KVM_REG_ARM64_SVE_PREG_BASE ...
KVM_REG_ARM64_SVE_PREG_BASE + (1ULL << 5) * KVM_ARM64_SVE_NUM_PREGS - 1:
n = (id >> 5) & (KVM_ARM64_SVE_NUM_PREGS - 1);
TEST_ASSERT(id == KVM_REG_ARM64_SVE_PREG(n, 0),
"Unexpected bits set in SVE PREG id: 0x%llx", id);
return str_with_index("KVM_REG_ARM64_SVE_PREG(##, 0)", n);
case KVM_REG_ARM64_SVE_FFR_BASE:
TEST_ASSERT(id == KVM_REG_ARM64_SVE_FFR(0),
"Unexpected bits set in SVE FFR id: 0x%llx", id);
return "KVM_REG_ARM64_SVE_FFR(0)";
}
return NULL;
}
static void print_reg(__u64 id)
{
unsigned op0, op1, crn, crm, op2;
const char *reg_size = NULL;
TEST_ASSERT((id & KVM_REG_ARCH_MASK) == KVM_REG_ARM64,
"KVM_REG_ARM64 missing in reg id: 0x%llx", id);
switch (id & KVM_REG_SIZE_MASK) {
case KVM_REG_SIZE_U8:
reg_size = "KVM_REG_SIZE_U8";
break;
case KVM_REG_SIZE_U16:
reg_size = "KVM_REG_SIZE_U16";
break;
case KVM_REG_SIZE_U32:
reg_size = "KVM_REG_SIZE_U32";
break;
case KVM_REG_SIZE_U64:
reg_size = "KVM_REG_SIZE_U64";
break;
case KVM_REG_SIZE_U128:
reg_size = "KVM_REG_SIZE_U128";
break;
case KVM_REG_SIZE_U256:
reg_size = "KVM_REG_SIZE_U256";
break;
case KVM_REG_SIZE_U512:
reg_size = "KVM_REG_SIZE_U512";
break;
case KVM_REG_SIZE_U1024:
reg_size = "KVM_REG_SIZE_U1024";
break;
case KVM_REG_SIZE_U2048:
reg_size = "KVM_REG_SIZE_U2048";
break;
default:
TEST_FAIL("Unexpected reg size: 0x%llx in reg id: 0x%llx",
(id & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT, id);
}
switch (id & KVM_REG_ARM_COPROC_MASK) {
case KVM_REG_ARM_CORE:
printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_CORE | %s,\n", reg_size, core_id_to_str(id));
break;
case KVM_REG_ARM_DEMUX:
TEST_ASSERT(!(id & ~(REG_MASK | KVM_REG_ARM_DEMUX_ID_MASK | KVM_REG_ARM_DEMUX_VAL_MASK)),
"Unexpected bits set in DEMUX reg id: 0x%llx", id);
printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | %lld,\n",
reg_size, id & KVM_REG_ARM_DEMUX_VAL_MASK);
break;
case KVM_REG_ARM64_SYSREG:
op0 = (id & KVM_REG_ARM64_SYSREG_OP0_MASK) >> KVM_REG_ARM64_SYSREG_OP0_SHIFT;
op1 = (id & KVM_REG_ARM64_SYSREG_OP1_MASK) >> KVM_REG_ARM64_SYSREG_OP1_SHIFT;
crn = (id & KVM_REG_ARM64_SYSREG_CRN_MASK) >> KVM_REG_ARM64_SYSREG_CRN_SHIFT;
crm = (id & KVM_REG_ARM64_SYSREG_CRM_MASK) >> KVM_REG_ARM64_SYSREG_CRM_SHIFT;
op2 = (id & KVM_REG_ARM64_SYSREG_OP2_MASK) >> KVM_REG_ARM64_SYSREG_OP2_SHIFT;
TEST_ASSERT(id == ARM64_SYS_REG(op0, op1, crn, crm, op2),
"Unexpected bits set in SYSREG reg id: 0x%llx", id);
printf("\tARM64_SYS_REG(%d, %d, %d, %d, %d),\n", op0, op1, crn, crm, op2);
break;
case KVM_REG_ARM_FW:
TEST_ASSERT(id == KVM_REG_ARM_FW_REG(id & 0xffff),
"Unexpected bits set in FW reg id: 0x%llx", id);
printf("\tKVM_REG_ARM_FW_REG(%lld),\n", id & 0xffff);
break;
case KVM_REG_ARM64_SVE:
if (reg_list_sve())
printf("\t%s,\n", sve_id_to_str(id));
else
TEST_FAIL("KVM_REG_ARM64_SVE is an unexpected coproc type in reg id: 0x%llx", id);
break;
default:
TEST_FAIL("Unexpected coproc type: 0x%llx in reg id: 0x%llx",
(id & KVM_REG_ARM_COPROC_MASK) >> KVM_REG_ARM_COPROC_SHIFT, id);
}
}
/*
* Older kernels listed each 32-bit word of CORE registers separately.
* For 64 and 128-bit registers we need to ignore the extra words. We
* also need to fixup the sizes, because the older kernels stated all
* registers were 64-bit, even when they weren't.
*/
static void core_reg_fixup(void)
{
struct kvm_reg_list *tmp;
__u64 id, core_off;
int i;
tmp = calloc(1, sizeof(*tmp) + reg_list->n * sizeof(__u64));
for (i = 0; i < reg_list->n; ++i) {
id = reg_list->reg[i];
if ((id & KVM_REG_ARM_COPROC_MASK) != KVM_REG_ARM_CORE) {
tmp->reg[tmp->n++] = id;
continue;
}
core_off = id & ~REG_MASK;
switch (core_off) {
case 0x52: case 0xd2: case 0xd6:
/*
* These offsets are pointing at padding.
* We need to ignore them too.
*/
continue;
case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ...
KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]):
if (core_off & 3)
continue;
id &= ~KVM_REG_SIZE_MASK;
id |= KVM_REG_SIZE_U128;
tmp->reg[tmp->n++] = id;
continue;
case KVM_REG_ARM_CORE_REG(fp_regs.fpsr):
case KVM_REG_ARM_CORE_REG(fp_regs.fpcr):
id &= ~KVM_REG_SIZE_MASK;
id |= KVM_REG_SIZE_U32;
tmp->reg[tmp->n++] = id;
continue;
default:
if (core_off & 1)
continue;
tmp->reg[tmp->n++] = id;
break;
}
}
free(reg_list);
reg_list = tmp;
}
static void prepare_vcpu_init(struct kvm_vcpu_init *init)
{
if (reg_list_sve())
init->features[0] |= 1 << KVM_ARM_VCPU_SVE;
}
static void finalize_vcpu(struct kvm_vm *vm, uint32_t vcpuid)
{
int feature;
if (reg_list_sve()) {
feature = KVM_ARM_VCPU_SVE;
vcpu_ioctl(vm, vcpuid, KVM_ARM_VCPU_FINALIZE, &feature);
}
}
static void check_supported(void)
{
if (reg_list_sve() && !kvm_check_cap(KVM_CAP_ARM_SVE)) {
fprintf(stderr, "SVE not available, skipping tests\n");
exit(KSFT_SKIP);
}
}
int main(int ac, char **av)
{
struct kvm_vcpu_init init = { .target = -1, };
int new_regs = 0, missing_regs = 0, i;
int failed_get = 0, failed_set = 0, failed_reject = 0;
bool print_list = false, fixup_core_regs = false;
struct kvm_vm *vm;
__u64 *vec_regs;
check_supported();
for (i = 1; i < ac; ++i) {
if (strcmp(av[i], "--core-reg-fixup") == 0)
fixup_core_regs = true;
else if (strcmp(av[i], "--list") == 0)
print_list = true;
else
fprintf(stderr, "Ignoring unknown option: %s\n", av[i]);
}
vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR);
prepare_vcpu_init(&init);
aarch64_vcpu_add_default(vm, 0, &init, NULL);
finalize_vcpu(vm, 0);
reg_list = vcpu_get_reg_list(vm, 0);
if (fixup_core_regs)
core_reg_fixup();
if (print_list) {
putchar('\n');
for_each_reg(i)
print_reg(reg_list->reg[i]);
putchar('\n');
return 0;
}
/*
* We only test that we can get the register and then write back the
* same value. Some registers may allow other values to be written
* back, but others only allow some bits to be changed, and at least
* for ID registers set will fail if the value does not exactly match
* what was returned by get. If registers that allow other values to
* be written need to have the other values tested, then we should
* create a new set of tests for those in a new independent test
* executable.
*/
for_each_reg(i) {
uint8_t addr[2048 / 8];
struct kvm_one_reg reg = {
.id = reg_list->reg[i],
.addr = (__u64)&addr,
};
int ret;
ret = _vcpu_ioctl(vm, 0, KVM_GET_ONE_REG, &reg);
if (ret) {
puts("Failed to get ");
print_reg(reg.id);
putchar('\n');
++failed_get;
}
/* rejects_set registers are rejected after KVM_ARM_VCPU_FINALIZE */
if (find_reg(rejects_set, rejects_set_n, reg.id)) {
ret = _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, &reg);
if (ret != -1 || errno != EPERM) {
printf("Failed to reject (ret=%d, errno=%d) ", ret, errno);
print_reg(reg.id);
putchar('\n');
++failed_reject;
}
continue;
}
ret = _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, &reg);
if (ret) {
puts("Failed to set ");
print_reg(reg.id);
putchar('\n');
++failed_set;
}
}
if (reg_list_sve()) {
blessed_n = base_regs_n + sve_regs_n;
vec_regs = sve_regs;
} else {
blessed_n = base_regs_n + vregs_n;
vec_regs = vregs;
}
blessed_reg = calloc(blessed_n, sizeof(__u64));
for (i = 0; i < base_regs_n; ++i)
blessed_reg[i] = base_regs[i];
for (i = 0; i < blessed_n - base_regs_n; ++i)
blessed_reg[base_regs_n + i] = vec_regs[i];
for_each_new_reg(i)
++new_regs;
for_each_missing_reg(i)
++missing_regs;
if (new_regs || missing_regs) {
printf("Number blessed registers: %5lld\n", blessed_n);
printf("Number registers: %5lld\n", reg_list->n);
}
if (new_regs) {
printf("\nThere are %d new registers.\n"
"Consider adding them to the blessed reg "
"list with the following lines:\n\n", new_regs);
for_each_new_reg(i)
print_reg(reg_list->reg[i]);
putchar('\n');
}
if (missing_regs) {
printf("\nThere are %d missing registers.\n"
"The following lines are missing registers:\n\n", missing_regs);
for_each_missing_reg(i)
print_reg(blessed_reg[i]);
putchar('\n');
}
TEST_ASSERT(!missing_regs && !failed_get && !failed_set && !failed_reject,
"There are %d missing registers; "
"%d registers failed get; %d registers failed set; %d registers failed reject",
missing_regs, failed_get, failed_set, failed_reject);
return 0;
}
/*
* The current blessed list was primed with the output of kernel version
* v4.15 with --core-reg-fixup and then later updated with new registers.
*/
static __u64 base_regs[] = {
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[0]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[1]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[2]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[3]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[4]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[5]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[6]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[7]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[8]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[9]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[10]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[11]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[12]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[13]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[14]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[15]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[16]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[17]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[18]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[19]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[20]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[21]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[22]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[23]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[24]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[25]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[26]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[27]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[28]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[29]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[30]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.sp),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.pc),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.pstate),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(sp_el1),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(elr_el1),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[0]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[1]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[2]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[3]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[4]),
KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.fpsr),
KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.fpcr),
KVM_REG_ARM_FW_REG(0),
KVM_REG_ARM_FW_REG(1),
KVM_REG_ARM_FW_REG(2),
ARM64_SYS_REG(3, 3, 14, 3, 1), /* CNTV_CTL_EL0 */
ARM64_SYS_REG(3, 3, 14, 3, 2), /* CNTV_CVAL_EL0 */
ARM64_SYS_REG(3, 3, 14, 0, 2),
ARM64_SYS_REG(3, 0, 0, 0, 0), /* MIDR_EL1 */
ARM64_SYS_REG(3, 0, 0, 0, 6), /* REVIDR_EL1 */
ARM64_SYS_REG(3, 1, 0, 0, 1), /* CLIDR_EL1 */
ARM64_SYS_REG(3, 1, 0, 0, 7), /* AIDR_EL1 */
ARM64_SYS_REG(3, 3, 0, 0, 1), /* CTR_EL0 */
ARM64_SYS_REG(2, 0, 0, 0, 4),
ARM64_SYS_REG(2, 0, 0, 0, 5),
ARM64_SYS_REG(2, 0, 0, 0, 6),
ARM64_SYS_REG(2, 0, 0, 0, 7),
ARM64_SYS_REG(2, 0, 0, 1, 4),
ARM64_SYS_REG(2, 0, 0, 1, 5),
ARM64_SYS_REG(2, 0, 0, 1, 6),
ARM64_SYS_REG(2, 0, 0, 1, 7),
ARM64_SYS_REG(2, 0, 0, 2, 0), /* MDCCINT_EL1 */
ARM64_SYS_REG(2, 0, 0, 2, 2), /* MDSCR_EL1 */
ARM64_SYS_REG(2, 0, 0, 2, 4),
ARM64_SYS_REG(2, 0, 0, 2, 5),
ARM64_SYS_REG(2, 0, 0, 2, 6),
ARM64_SYS_REG(2, 0, 0, 2, 7),
ARM64_SYS_REG(2, 0, 0, 3, 4),
ARM64_SYS_REG(2, 0, 0, 3, 5),
ARM64_SYS_REG(2, 0, 0, 3, 6),
ARM64_SYS_REG(2, 0, 0, 3, 7),
ARM64_SYS_REG(2, 0, 0, 4, 4),
ARM64_SYS_REG(2, 0, 0, 4, 5),
ARM64_SYS_REG(2, 0, 0, 4, 6),
ARM64_SYS_REG(2, 0, 0, 4, 7),
ARM64_SYS_REG(2, 0, 0, 5, 4),
ARM64_SYS_REG(2, 0, 0, 5, 5),
ARM64_SYS_REG(2, 0, 0, 5, 6),
ARM64_SYS_REG(2, 0, 0, 5, 7),
ARM64_SYS_REG(2, 0, 0, 6, 4),
ARM64_SYS_REG(2, 0, 0, 6, 5),
ARM64_SYS_REG(2, 0, 0, 6, 6),
ARM64_SYS_REG(2, 0, 0, 6, 7),
ARM64_SYS_REG(2, 0, 0, 7, 4),
ARM64_SYS_REG(2, 0, 0, 7, 5),
ARM64_SYS_REG(2, 0, 0, 7, 6),
ARM64_SYS_REG(2, 0, 0, 7, 7),
ARM64_SYS_REG(2, 0, 0, 8, 4),
ARM64_SYS_REG(2, 0, 0, 8, 5),
ARM64_SYS_REG(2, 0, 0, 8, 6),
ARM64_SYS_REG(2, 0, 0, 8, 7),
ARM64_SYS_REG(2, 0, 0, 9, 4),
ARM64_SYS_REG(2, 0, 0, 9, 5),
ARM64_SYS_REG(2, 0, 0, 9, 6),
ARM64_SYS_REG(2, 0, 0, 9, 7),
ARM64_SYS_REG(2, 0, 0, 10, 4),
ARM64_SYS_REG(2, 0, 0, 10, 5),
ARM64_SYS_REG(2, 0, 0, 10, 6),
ARM64_SYS_REG(2, 0, 0, 10, 7),
ARM64_SYS_REG(2, 0, 0, 11, 4),
ARM64_SYS_REG(2, 0, 0, 11, 5),
ARM64_SYS_REG(2, 0, 0, 11, 6),
ARM64_SYS_REG(2, 0, 0, 11, 7),
ARM64_SYS_REG(2, 0, 0, 12, 4),
ARM64_SYS_REG(2, 0, 0, 12, 5),
ARM64_SYS_REG(2, 0, 0, 12, 6),
ARM64_SYS_REG(2, 0, 0, 12, 7),
ARM64_SYS_REG(2, 0, 0, 13, 4),
ARM64_SYS_REG(2, 0, 0, 13, 5),
ARM64_SYS_REG(2, 0, 0, 13, 6),
ARM64_SYS_REG(2, 0, 0, 13, 7),
ARM64_SYS_REG(2, 0, 0, 14, 4),
ARM64_SYS_REG(2, 0, 0, 14, 5),
ARM64_SYS_REG(2, 0, 0, 14, 6),
ARM64_SYS_REG(2, 0, 0, 14, 7),
ARM64_SYS_REG(2, 0, 0, 15, 4),
ARM64_SYS_REG(2, 0, 0, 15, 5),
ARM64_SYS_REG(2, 0, 0, 15, 6),
ARM64_SYS_REG(2, 0, 0, 15, 7),
ARM64_SYS_REG(2, 4, 0, 7, 0), /* DBGVCR32_EL2 */
ARM64_SYS_REG(3, 0, 0, 0, 5), /* MPIDR_EL1 */
ARM64_SYS_REG(3, 0, 0, 1, 0), /* ID_PFR0_EL1 */
ARM64_SYS_REG(3, 0, 0, 1, 1), /* ID_PFR1_EL1 */
ARM64_SYS_REG(3, 0, 0, 1, 2), /* ID_DFR0_EL1 */
ARM64_SYS_REG(3, 0, 0, 1, 3), /* ID_AFR0_EL1 */
ARM64_SYS_REG(3, 0, 0, 1, 4), /* ID_MMFR0_EL1 */
ARM64_SYS_REG(3, 0, 0, 1, 5), /* ID_MMFR1_EL1 */
ARM64_SYS_REG(3, 0, 0, 1, 6), /* ID_MMFR2_EL1 */
ARM64_SYS_REG(3, 0, 0, 1, 7), /* ID_MMFR3_EL1 */
ARM64_SYS_REG(3, 0, 0, 2, 0), /* ID_ISAR0_EL1 */
ARM64_SYS_REG(3, 0, 0, 2, 1), /* ID_ISAR1_EL1 */
ARM64_SYS_REG(3, 0, 0, 2, 2), /* ID_ISAR2_EL1 */
ARM64_SYS_REG(3, 0, 0, 2, 3), /* ID_ISAR3_EL1 */
ARM64_SYS_REG(3, 0, 0, 2, 4), /* ID_ISAR4_EL1 */
ARM64_SYS_REG(3, 0, 0, 2, 5), /* ID_ISAR5_EL1 */
ARM64_SYS_REG(3, 0, 0, 2, 6), /* ID_MMFR4_EL1 */
ARM64_SYS_REG(3, 0, 0, 2, 7), /* ID_ISAR6_EL1 */
ARM64_SYS_REG(3, 0, 0, 3, 0), /* MVFR0_EL1 */
ARM64_SYS_REG(3, 0, 0, 3, 1), /* MVFR1_EL1 */
ARM64_SYS_REG(3, 0, 0, 3, 2), /* MVFR2_EL1 */
ARM64_SYS_REG(3, 0, 0, 3, 3),
ARM64_SYS_REG(3, 0, 0, 3, 4), /* ID_PFR2_EL1 */
ARM64_SYS_REG(3, 0, 0, 3, 5), /* ID_DFR1_EL1 */
ARM64_SYS_REG(3, 0, 0, 3, 6), /* ID_MMFR5_EL1 */
ARM64_SYS_REG(3, 0, 0, 3, 7),
ARM64_SYS_REG(3, 0, 0, 4, 0), /* ID_AA64PFR0_EL1 */
ARM64_SYS_REG(3, 0, 0, 4, 1), /* ID_AA64PFR1_EL1 */
ARM64_SYS_REG(3, 0, 0, 4, 2),
ARM64_SYS_REG(3, 0, 0, 4, 3),
ARM64_SYS_REG(3, 0, 0, 4, 4), /* ID_AA64ZFR0_EL1 */
ARM64_SYS_REG(3, 0, 0, 4, 5),
ARM64_SYS_REG(3, 0, 0, 4, 6),
ARM64_SYS_REG(3, 0, 0, 4, 7),
ARM64_SYS_REG(3, 0, 0, 5, 0), /* ID_AA64DFR0_EL1 */
ARM64_SYS_REG(3, 0, 0, 5, 1), /* ID_AA64DFR1_EL1 */
ARM64_SYS_REG(3, 0, 0, 5, 2),
ARM64_SYS_REG(3, 0, 0, 5, 3),
ARM64_SYS_REG(3, 0, 0, 5, 4), /* ID_AA64AFR0_EL1 */
ARM64_SYS_REG(3, 0, 0, 5, 5), /* ID_AA64AFR1_EL1 */
ARM64_SYS_REG(3, 0, 0, 5, 6),
ARM64_SYS_REG(3, 0, 0, 5, 7),
ARM64_SYS_REG(3, 0, 0, 6, 0), /* ID_AA64ISAR0_EL1 */
ARM64_SYS_REG(3, 0, 0, 6, 1), /* ID_AA64ISAR1_EL1 */
ARM64_SYS_REG(3, 0, 0, 6, 2),
ARM64_SYS_REG(3, 0, 0, 6, 3),
ARM64_SYS_REG(3, 0, 0, 6, 4),
ARM64_SYS_REG(3, 0, 0, 6, 5),
ARM64_SYS_REG(3, 0, 0, 6, 6),
ARM64_SYS_REG(3, 0, 0, 6, 7),
ARM64_SYS_REG(3, 0, 0, 7, 0), /* ID_AA64MMFR0_EL1 */
ARM64_SYS_REG(3, 0, 0, 7, 1), /* ID_AA64MMFR1_EL1 */
ARM64_SYS_REG(3, 0, 0, 7, 2), /* ID_AA64MMFR2_EL1 */
ARM64_SYS_REG(3, 0, 0, 7, 3),
ARM64_SYS_REG(3, 0, 0, 7, 4),
ARM64_SYS_REG(3, 0, 0, 7, 5),
ARM64_SYS_REG(3, 0, 0, 7, 6),
ARM64_SYS_REG(3, 0, 0, 7, 7),
ARM64_SYS_REG(3, 0, 1, 0, 0), /* SCTLR_EL1 */
ARM64_SYS_REG(3, 0, 1, 0, 1), /* ACTLR_EL1 */
ARM64_SYS_REG(3, 0, 1, 0, 2), /* CPACR_EL1 */
ARM64_SYS_REG(3, 0, 2, 0, 0), /* TTBR0_EL1 */
ARM64_SYS_REG(3, 0, 2, 0, 1), /* TTBR1_EL1 */
ARM64_SYS_REG(3, 0, 2, 0, 2), /* TCR_EL1 */
ARM64_SYS_REG(3, 0, 5, 1, 0), /* AFSR0_EL1 */
ARM64_SYS_REG(3, 0, 5, 1, 1), /* AFSR1_EL1 */
ARM64_SYS_REG(3, 0, 5, 2, 0), /* ESR_EL1 */
ARM64_SYS_REG(3, 0, 6, 0, 0), /* FAR_EL1 */
ARM64_SYS_REG(3, 0, 7, 4, 0), /* PAR_EL1 */
ARM64_SYS_REG(3, 0, 9, 14, 1), /* PMINTENSET_EL1 */
ARM64_SYS_REG(3, 0, 9, 14, 2), /* PMINTENCLR_EL1 */
ARM64_SYS_REG(3, 0, 10, 2, 0), /* MAIR_EL1 */
ARM64_SYS_REG(3, 0, 10, 3, 0), /* AMAIR_EL1 */
ARM64_SYS_REG(3, 0, 12, 0, 0), /* VBAR_EL1 */
ARM64_SYS_REG(3, 0, 12, 1, 1), /* DISR_EL1 */
ARM64_SYS_REG(3, 0, 13, 0, 1), /* CONTEXTIDR_EL1 */
ARM64_SYS_REG(3, 0, 13, 0, 4), /* TPIDR_EL1 */
ARM64_SYS_REG(3, 0, 14, 1, 0), /* CNTKCTL_EL1 */
ARM64_SYS_REG(3, 2, 0, 0, 0), /* CSSELR_EL1 */
ARM64_SYS_REG(3, 3, 9, 12, 0), /* PMCR_EL0 */
ARM64_SYS_REG(3, 3, 9, 12, 1), /* PMCNTENSET_EL0 */
ARM64_SYS_REG(3, 3, 9, 12, 2), /* PMCNTENCLR_EL0 */
ARM64_SYS_REG(3, 3, 9, 12, 3), /* PMOVSCLR_EL0 */
ARM64_SYS_REG(3, 3, 9, 12, 4), /* PMSWINC_EL0 */
ARM64_SYS_REG(3, 3, 9, 12, 5), /* PMSELR_EL0 */
ARM64_SYS_REG(3, 3, 9, 13, 0), /* PMCCNTR_EL0 */
ARM64_SYS_REG(3, 3, 9, 14, 0), /* PMUSERENR_EL0 */
ARM64_SYS_REG(3, 3, 9, 14, 3), /* PMOVSSET_EL0 */
ARM64_SYS_REG(3, 3, 13, 0, 2), /* TPIDR_EL0 */
ARM64_SYS_REG(3, 3, 13, 0, 3), /* TPIDRRO_EL0 */
ARM64_SYS_REG(3, 3, 14, 8, 0),
ARM64_SYS_REG(3, 3, 14, 8, 1),
ARM64_SYS_REG(3, 3, 14, 8, 2),
ARM64_SYS_REG(3, 3, 14, 8, 3),
ARM64_SYS_REG(3, 3, 14, 8, 4),
ARM64_SYS_REG(3, 3, 14, 8, 5),
ARM64_SYS_REG(3, 3, 14, 8, 6),
ARM64_SYS_REG(3, 3, 14, 8, 7),
ARM64_SYS_REG(3, 3, 14, 9, 0),
ARM64_SYS_REG(3, 3, 14, 9, 1),
ARM64_SYS_REG(3, 3, 14, 9, 2),
ARM64_SYS_REG(3, 3, 14, 9, 3),
ARM64_SYS_REG(3, 3, 14, 9, 4),
ARM64_SYS_REG(3, 3, 14, 9, 5),
ARM64_SYS_REG(3, 3, 14, 9, 6),
ARM64_SYS_REG(3, 3, 14, 9, 7),
ARM64_SYS_REG(3, 3, 14, 10, 0),
ARM64_SYS_REG(3, 3, 14, 10, 1),
ARM64_SYS_REG(3, 3, 14, 10, 2),
ARM64_SYS_REG(3, 3, 14, 10, 3),
ARM64_SYS_REG(3, 3, 14, 10, 4),
ARM64_SYS_REG(3, 3, 14, 10, 5),
ARM64_SYS_REG(3, 3, 14, 10, 6),
ARM64_SYS_REG(3, 3, 14, 10, 7),
ARM64_SYS_REG(3, 3, 14, 11, 0),
ARM64_SYS_REG(3, 3, 14, 11, 1),
ARM64_SYS_REG(3, 3, 14, 11, 2),
ARM64_SYS_REG(3, 3, 14, 11, 3),
ARM64_SYS_REG(3, 3, 14, 11, 4),
ARM64_SYS_REG(3, 3, 14, 11, 5),
ARM64_SYS_REG(3, 3, 14, 11, 6),
ARM64_SYS_REG(3, 3, 14, 12, 0),
ARM64_SYS_REG(3, 3, 14, 12, 1),
ARM64_SYS_REG(3, 3, 14, 12, 2),
ARM64_SYS_REG(3, 3, 14, 12, 3),
ARM64_SYS_REG(3, 3, 14, 12, 4),
ARM64_SYS_REG(3, 3, 14, 12, 5),
ARM64_SYS_REG(3, 3, 14, 12, 6),
ARM64_SYS_REG(3, 3, 14, 12, 7),
ARM64_SYS_REG(3, 3, 14, 13, 0),
ARM64_SYS_REG(3, 3, 14, 13, 1),
ARM64_SYS_REG(3, 3, 14, 13, 2),
ARM64_SYS_REG(3, 3, 14, 13, 3),
ARM64_SYS_REG(3, 3, 14, 13, 4),
ARM64_SYS_REG(3, 3, 14, 13, 5),
ARM64_SYS_REG(3, 3, 14, 13, 6),
ARM64_SYS_REG(3, 3, 14, 13, 7),
ARM64_SYS_REG(3, 3, 14, 14, 0),
ARM64_SYS_REG(3, 3, 14, 14, 1),
ARM64_SYS_REG(3, 3, 14, 14, 2),
ARM64_SYS_REG(3, 3, 14, 14, 3),
ARM64_SYS_REG(3, 3, 14, 14, 4),
ARM64_SYS_REG(3, 3, 14, 14, 5),
ARM64_SYS_REG(3, 3, 14, 14, 6),
ARM64_SYS_REG(3, 3, 14, 14, 7),
ARM64_SYS_REG(3, 3, 14, 15, 0),
ARM64_SYS_REG(3, 3, 14, 15, 1),
ARM64_SYS_REG(3, 3, 14, 15, 2),
ARM64_SYS_REG(3, 3, 14, 15, 3),
ARM64_SYS_REG(3, 3, 14, 15, 4),
ARM64_SYS_REG(3, 3, 14, 15, 5),
ARM64_SYS_REG(3, 3, 14, 15, 6),
ARM64_SYS_REG(3, 3, 14, 15, 7), /* PMCCFILTR_EL0 */
ARM64_SYS_REG(3, 4, 3, 0, 0), /* DACR32_EL2 */
ARM64_SYS_REG(3, 4, 5, 0, 1), /* IFSR32_EL2 */
ARM64_SYS_REG(3, 4, 5, 3, 0), /* FPEXC32_EL2 */
KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | 0,
KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | 1,
KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | 2,
};
static __u64 base_regs_n = ARRAY_SIZE(base_regs);
static __u64 vregs[] = {
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[1]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[2]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[3]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[4]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[5]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[6]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[7]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[8]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[9]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[10]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[11]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[12]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[13]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[14]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[15]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[16]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[17]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[18]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[19]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[20]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[21]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[22]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[23]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[24]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[25]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[26]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[27]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[28]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[29]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[30]),
KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]),
};
static __u64 vregs_n = ARRAY_SIZE(vregs);
static __u64 sve_regs[] = {
KVM_REG_ARM64_SVE_VLS,
KVM_REG_ARM64_SVE_ZREG(0, 0),
KVM_REG_ARM64_SVE_ZREG(1, 0),
KVM_REG_ARM64_SVE_ZREG(2, 0),
KVM_REG_ARM64_SVE_ZREG(3, 0),
KVM_REG_ARM64_SVE_ZREG(4, 0),
KVM_REG_ARM64_SVE_ZREG(5, 0),
KVM_REG_ARM64_SVE_ZREG(6, 0),
KVM_REG_ARM64_SVE_ZREG(7, 0),
KVM_REG_ARM64_SVE_ZREG(8, 0),
KVM_REG_ARM64_SVE_ZREG(9, 0),
KVM_REG_ARM64_SVE_ZREG(10, 0),
KVM_REG_ARM64_SVE_ZREG(11, 0),
KVM_REG_ARM64_SVE_ZREG(12, 0),
KVM_REG_ARM64_SVE_ZREG(13, 0),
KVM_REG_ARM64_SVE_ZREG(14, 0),
KVM_REG_ARM64_SVE_ZREG(15, 0),
KVM_REG_ARM64_SVE_ZREG(16, 0),
KVM_REG_ARM64_SVE_ZREG(17, 0),
KVM_REG_ARM64_SVE_ZREG(18, 0),
KVM_REG_ARM64_SVE_ZREG(19, 0),
KVM_REG_ARM64_SVE_ZREG(20, 0),
KVM_REG_ARM64_SVE_ZREG(21, 0),
KVM_REG_ARM64_SVE_ZREG(22, 0),
KVM_REG_ARM64_SVE_ZREG(23, 0),
KVM_REG_ARM64_SVE_ZREG(24, 0),
KVM_REG_ARM64_SVE_ZREG(25, 0),
KVM_REG_ARM64_SVE_ZREG(26, 0),
KVM_REG_ARM64_SVE_ZREG(27, 0),
KVM_REG_ARM64_SVE_ZREG(28, 0),
KVM_REG_ARM64_SVE_ZREG(29, 0),
KVM_REG_ARM64_SVE_ZREG(30, 0),
KVM_REG_ARM64_SVE_ZREG(31, 0),
KVM_REG_ARM64_SVE_PREG(0, 0),
KVM_REG_ARM64_SVE_PREG(1, 0),
KVM_REG_ARM64_SVE_PREG(2, 0),
KVM_REG_ARM64_SVE_PREG(3, 0),
KVM_REG_ARM64_SVE_PREG(4, 0),
KVM_REG_ARM64_SVE_PREG(5, 0),
KVM_REG_ARM64_SVE_PREG(6, 0),
KVM_REG_ARM64_SVE_PREG(7, 0),
KVM_REG_ARM64_SVE_PREG(8, 0),
KVM_REG_ARM64_SVE_PREG(9, 0),
KVM_REG_ARM64_SVE_PREG(10, 0),
KVM_REG_ARM64_SVE_PREG(11, 0),
KVM_REG_ARM64_SVE_PREG(12, 0),
KVM_REG_ARM64_SVE_PREG(13, 0),
KVM_REG_ARM64_SVE_PREG(14, 0),
KVM_REG_ARM64_SVE_PREG(15, 0),
KVM_REG_ARM64_SVE_FFR(0),
ARM64_SYS_REG(3, 0, 1, 2, 0), /* ZCR_EL1 */
};
static __u64 sve_regs_n = ARRAY_SIZE(sve_regs);
static __u64 rejects_set[] = {
#ifdef REG_LIST_SVE
KVM_REG_ARM64_SVE_VLS,
#endif
};
static __u64 rejects_set_n = ARRAY_SIZE(rejects_set);

View File

@ -1,6 +0,0 @@
#define USE_CLEAR_DIRTY_LOG
#define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (1 << 0)
#define KVM_DIRTY_LOG_INITIALLY_SET (1 << 1)
#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
KVM_DIRTY_LOG_INITIALLY_SET)
#include "dirty_log_test.c"

View File

@ -21,20 +21,12 @@
#include <linux/bitops.h>
#include <linux/userfaultfd.h>
#include "test_util.h"
#include "kvm_util.h"
#include "perf_test_util.h"
#include "processor.h"
#include "test_util.h"
#ifdef __NR_userfaultfd
/* The memory slot index demand page */
#define TEST_MEM_SLOT_INDEX 1
/* Default guest test virtual memory offset */
#define DEFAULT_GUEST_TEST_MEM 0xc0000000
#define DEFAULT_GUEST_TEST_MEM_SIZE (1 << 30) /* 1G */
#ifdef PRINT_PER_PAGE_UPDATES
#define PER_PAGE_DEBUG(...) printf(__VA_ARGS__)
#else
@ -47,77 +39,17 @@
#define PER_VCPU_DEBUG(...) _no_printf(__VA_ARGS__)
#endif
#define MAX_VCPUS 512
/*
* Guest/Host shared variables. Ensure addr_gva2hva() and/or
* sync_global_to/from_guest() are used when accessing from
* the host. READ/WRITE_ONCE() should also be used with anything
* that may change.
*/
static uint64_t host_page_size;
static uint64_t guest_page_size;
static char *guest_data_prototype;
/*
* Guest physical memory offset of the testing memory slot.
* This will be set to the topmost valid physical address minus
* the test memory size.
*/
static uint64_t guest_test_phys_mem;
/*
* Guest virtual memory offset of the testing memory slot.
* Must not conflict with identity mapped test code.
*/
static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM;
struct vcpu_args {
uint64_t gva;
uint64_t pages;
/* Only used by the host userspace part of the vCPU thread */
int vcpu_id;
struct kvm_vm *vm;
};
static struct vcpu_args vcpu_args[MAX_VCPUS];
/*
* Continuously write to the first 8 bytes of each page in the demand paging
* memory region.
*/
static void guest_code(uint32_t vcpu_id)
{
uint64_t gva;
uint64_t pages;
int i;
/* Make sure vCPU args data structure is not corrupt. */
GUEST_ASSERT(vcpu_args[vcpu_id].vcpu_id == vcpu_id);
gva = vcpu_args[vcpu_id].gva;
pages = vcpu_args[vcpu_id].pages;
for (i = 0; i < pages; i++) {
uint64_t addr = gva + (i * guest_page_size);
addr &= ~(host_page_size - 1);
*(uint64_t *)addr = 0x0123456789ABCDEF;
}
GUEST_SYNC(1);
}
static void *vcpu_worker(void *data)
{
int ret;
struct vcpu_args *args = (struct vcpu_args *)data;
struct kvm_vm *vm = args->vm;
int vcpu_id = args->vcpu_id;
struct vcpu_args *vcpu_args = (struct vcpu_args *)data;
int vcpu_id = vcpu_args->vcpu_id;
struct kvm_vm *vm = perf_test_args.vm;
struct kvm_run *run;
struct timespec start, end, ts_diff;
struct timespec start;
struct timespec ts_diff;
vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
run = vcpu_state(vm, vcpu_id);
@ -133,52 +65,18 @@ static void *vcpu_worker(void *data)
exit_reason_str(run->exit_reason));
}
clock_gettime(CLOCK_MONOTONIC, &end);
ts_diff = timespec_sub(end, start);
ts_diff = timespec_diff_now(start);
PER_VCPU_DEBUG("vCPU %d execution time: %ld.%.9lds\n", vcpu_id,
ts_diff.tv_sec, ts_diff.tv_nsec);
return NULL;
}
#define PAGE_SHIFT_4K 12
#define PTES_PER_4K_PT 512
static struct kvm_vm *create_vm(enum vm_guest_mode mode, int vcpus,
uint64_t vcpu_memory_bytes)
{
struct kvm_vm *vm;
uint64_t pages = DEFAULT_GUEST_PHY_PAGES;
/* Account for a few pages per-vCPU for stacks */
pages += DEFAULT_STACK_PGS * vcpus;
/*
* Reserve twice the ammount of memory needed to map the test region and
* the page table / stacks region, at 4k, for page tables. Do the
* calculation with 4K page size: the smallest of all archs. (e.g., 64K
* page size guest will need even less memory for page tables).
*/
pages += (2 * pages) / PTES_PER_4K_PT;
pages += ((2 * vcpus * vcpu_memory_bytes) >> PAGE_SHIFT_4K) /
PTES_PER_4K_PT;
pages = vm_adjust_num_guest_pages(mode, pages);
pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
vm = _vm_create(mode, pages, O_RDWR);
kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
#ifdef __x86_64__
vm_create_irqchip(vm);
#endif
return vm;
}
static int handle_uffd_page_request(int uffd, uint64_t addr)
{
pid_t tid;
struct timespec start;
struct timespec end;
struct timespec ts_diff;
struct uffdio_copy copy;
int r;
@ -186,7 +84,7 @@ static int handle_uffd_page_request(int uffd, uint64_t addr)
copy.src = (uint64_t)guest_data_prototype;
copy.dst = addr;
copy.len = host_page_size;
copy.len = perf_test_args.host_page_size;
copy.mode = 0;
clock_gettime(CLOCK_MONOTONIC, &start);
@ -198,12 +96,12 @@ static int handle_uffd_page_request(int uffd, uint64_t addr)
return r;
}
clock_gettime(CLOCK_MONOTONIC, &end);
ts_diff = timespec_diff_now(start);
PER_PAGE_DEBUG("UFFDIO_COPY %d \t%ld ns\n", tid,
timespec_to_ns(timespec_sub(end, start)));
timespec_to_ns(ts_diff));
PER_PAGE_DEBUG("Paged in %ld bytes at 0x%lx from thread %d\n",
host_page_size, addr, tid);
perf_test_args.host_page_size, addr, tid);
return 0;
}
@ -223,7 +121,8 @@ static void *uffd_handler_thread_fn(void *arg)
int pipefd = uffd_args->pipefd;
useconds_t delay = uffd_args->delay;
int64_t pages = 0;
struct timespec start, end, ts_diff;
struct timespec start;
struct timespec ts_diff;
clock_gettime(CLOCK_MONOTONIC, &start);
while (!quit_uffd_thread) {
@ -292,8 +191,7 @@ static void *uffd_handler_thread_fn(void *arg)
pages++;
}
clock_gettime(CLOCK_MONOTONIC, &end);
ts_diff = timespec_sub(end, start);
ts_diff = timespec_diff_now(start);
PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n",
pages, ts_diff.tv_sec, ts_diff.tv_nsec,
pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0));
@ -351,99 +249,54 @@ static int setup_demand_paging(struct kvm_vm *vm,
}
static void run_test(enum vm_guest_mode mode, bool use_uffd,
useconds_t uffd_delay, int vcpus,
uint64_t vcpu_memory_bytes)
useconds_t uffd_delay)
{
pthread_t *vcpu_threads;
pthread_t *uffd_handler_threads = NULL;
struct uffd_handler_args *uffd_args = NULL;
struct timespec start, end, ts_diff;
struct timespec start;
struct timespec ts_diff;
int *pipefds = NULL;
struct kvm_vm *vm;
uint64_t guest_num_pages;
int vcpu_id;
int r;
vm = create_vm(mode, vcpus, vcpu_memory_bytes);
vm = create_vm(mode, nr_vcpus, guest_percpu_mem_size);
guest_page_size = vm_get_page_size(vm);
perf_test_args.wr_fract = 1;
TEST_ASSERT(vcpu_memory_bytes % guest_page_size == 0,
"Guest memory size is not guest page size aligned.");
guest_num_pages = (vcpus * vcpu_memory_bytes) / guest_page_size;
guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages);
/*
* If there should be more memory in the guest test region than there
* can be pages in the guest, it will definitely cause problems.
*/
TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm),
"Requested more guest memory than address space allows.\n"
" guest pages: %lx max gfn: %x vcpus: %d wss: %lx]\n",
guest_num_pages, vm_get_max_gfn(vm), vcpus,
vcpu_memory_bytes);
host_page_size = getpagesize();
TEST_ASSERT(vcpu_memory_bytes % host_page_size == 0,
"Guest memory size is not host page size aligned.");
guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) *
guest_page_size;
guest_test_phys_mem &= ~(host_page_size - 1);
#ifdef __s390x__
/* Align to 1M (segment size) */
guest_test_phys_mem &= ~((1 << 20) - 1);
#endif
pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem);
/* Add an extra memory slot for testing demand paging */
vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
guest_test_phys_mem,
TEST_MEM_SLOT_INDEX,
guest_num_pages, 0);
/* Do mapping for the demand paging memory slot */
virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0);
ucall_init(vm, NULL);
guest_data_prototype = malloc(host_page_size);
guest_data_prototype = malloc(perf_test_args.host_page_size);
TEST_ASSERT(guest_data_prototype,
"Failed to allocate buffer for guest data pattern");
memset(guest_data_prototype, 0xAB, host_page_size);
memset(guest_data_prototype, 0xAB, perf_test_args.host_page_size);
vcpu_threads = malloc(vcpus * sizeof(*vcpu_threads));
vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads));
TEST_ASSERT(vcpu_threads, "Memory allocation failed");
add_vcpus(vm, nr_vcpus, guest_percpu_mem_size);
if (use_uffd) {
uffd_handler_threads =
malloc(vcpus * sizeof(*uffd_handler_threads));
malloc(nr_vcpus * sizeof(*uffd_handler_threads));
TEST_ASSERT(uffd_handler_threads, "Memory allocation failed");
uffd_args = malloc(vcpus * sizeof(*uffd_args));
uffd_args = malloc(nr_vcpus * sizeof(*uffd_args));
TEST_ASSERT(uffd_args, "Memory allocation failed");
pipefds = malloc(sizeof(int) * vcpus * 2);
pipefds = malloc(sizeof(int) * nr_vcpus * 2);
TEST_ASSERT(pipefds, "Unable to allocate memory for pipefd");
}
for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) {
vm_paddr_t vcpu_gpa;
void *vcpu_hva;
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
vm_paddr_t vcpu_gpa;
void *vcpu_hva;
vm_vcpu_add_default(vm, vcpu_id, guest_code);
vcpu_gpa = guest_test_phys_mem + (vcpu_id * guest_percpu_mem_size);
PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n",
vcpu_id, vcpu_gpa, vcpu_gpa + guest_percpu_mem_size);
vcpu_gpa = guest_test_phys_mem + (vcpu_id * vcpu_memory_bytes);
PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n",
vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_memory_bytes);
/* Cache the HVA pointer of the region */
vcpu_hva = addr_gpa2hva(vm, vcpu_gpa);
/* Cache the HVA pointer of the region */
vcpu_hva = addr_gpa2hva(vm, vcpu_gpa);
if (use_uffd) {
/*
* Set up user fault fd to handle demand paging
* requests.
@ -456,53 +309,41 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd,
&uffd_handler_threads[vcpu_id],
pipefds[vcpu_id * 2],
uffd_delay, &uffd_args[vcpu_id],
vcpu_hva, vcpu_memory_bytes);
vcpu_hva, guest_percpu_mem_size);
if (r < 0)
exit(-r);
}
#ifdef __x86_64__
vcpu_set_cpuid(vm, vcpu_id, kvm_get_supported_cpuid());
#endif
vcpu_args[vcpu_id].vm = vm;
vcpu_args[vcpu_id].vcpu_id = vcpu_id;
vcpu_args[vcpu_id].gva = guest_test_virt_mem +
(vcpu_id * vcpu_memory_bytes);
vcpu_args[vcpu_id].pages = vcpu_memory_bytes / guest_page_size;
}
/* Export the shared variables to the guest */
sync_global_to_guest(vm, host_page_size);
sync_global_to_guest(vm, guest_page_size);
sync_global_to_guest(vm, vcpu_args);
sync_global_to_guest(vm, perf_test_args);
pr_info("Finished creating vCPUs and starting uffd threads\n");
clock_gettime(CLOCK_MONOTONIC, &start);
for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) {
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker,
&vcpu_args[vcpu_id]);
&perf_test_args.vcpu_args[vcpu_id]);
}
pr_info("Started all vCPUs\n");
/* Wait for the vcpu threads to quit */
for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) {
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
pthread_join(vcpu_threads[vcpu_id], NULL);
PER_VCPU_DEBUG("Joined thread for vCPU %d\n", vcpu_id);
}
pr_info("All vCPU threads joined\n");
ts_diff = timespec_diff_now(start);
clock_gettime(CLOCK_MONOTONIC, &end);
pr_info("All vCPU threads joined\n");
if (use_uffd) {
char c;
/* Tell the user fault fd handler threads to quit */
for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) {
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
r = write(pipefds[vcpu_id * 2 + 1], &c, 1);
TEST_ASSERT(r == 1, "Unable to write to pipefd");
@ -510,11 +351,11 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd,
}
}
ts_diff = timespec_sub(end, start);
pr_info("Total guest execution time: %ld.%.9lds\n",
ts_diff.tv_sec, ts_diff.tv_nsec);
pr_info("Overall demand paging rate: %f pgs/sec\n",
guest_num_pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0));
perf_test_args.vcpu_args[0].pages * nr_vcpus /
((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0));
ucall_uninit(vm);
kvm_vm_free(vm);
@ -568,9 +409,8 @@ static void help(char *name)
int main(int argc, char *argv[])
{
int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
bool mode_selected = false;
uint64_t vcpu_memory_bytes = DEFAULT_GUEST_TEST_MEM_SIZE;
int vcpus = 1;
unsigned int mode;
int opt, i;
bool use_uffd = false;
@ -619,15 +459,12 @@ int main(int argc, char *argv[])
"A negative UFFD delay is not supported.");
break;
case 'b':
vcpu_memory_bytes = parse_size(optarg);
guest_percpu_mem_size = parse_size(optarg);
break;
case 'v':
vcpus = atoi(optarg);
TEST_ASSERT(vcpus > 0,
"Must have a positive number of vCPUs");
TEST_ASSERT(vcpus <= MAX_VCPUS,
"This test does not currently support\n"
"more than %d vCPUs.", MAX_VCPUS);
nr_vcpus = atoi(optarg);
TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus,
"Invalid number of vcpus, must be between 1 and %d", max_vcpus);
break;
case 'h':
default:
@ -642,7 +479,7 @@ int main(int argc, char *argv[])
TEST_ASSERT(guest_modes[i].supported,
"Guest mode ID %d (%s) not supported.",
i, vm_guest_mode_string(i));
run_test(i, use_uffd, uffd_delay, vcpus, vcpu_memory_bytes);
run_test(i, use_uffd, uffd_delay);
}
return 0;

View File

@ -0,0 +1,376 @@
// SPDX-License-Identifier: GPL-2.0
/*
* KVM dirty page logging performance test
*
* Based on dirty_log_test.c
*
* Copyright (C) 2018, Red Hat, Inc.
* Copyright (C) 2020, Google, Inc.
*/
#define _GNU_SOURCE /* for program_invocation_name */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <pthread.h>
#include <linux/bitmap.h>
#include <linux/bitops.h>
#include "kvm_util.h"
#include "perf_test_util.h"
#include "processor.h"
#include "test_util.h"
/* How many host loops to run by default (one KVM_GET_DIRTY_LOG for each loop)*/
#define TEST_HOST_LOOP_N 2UL
/* Host variables */
static bool host_quit;
static uint64_t iteration;
static uint64_t vcpu_last_completed_iteration[MAX_VCPUS];
static void *vcpu_worker(void *data)
{
int ret;
struct kvm_vm *vm = perf_test_args.vm;
uint64_t pages_count = 0;
struct kvm_run *run;
struct timespec start;
struct timespec ts_diff;
struct timespec total = (struct timespec){0};
struct timespec avg;
struct vcpu_args *vcpu_args = (struct vcpu_args *)data;
int vcpu_id = vcpu_args->vcpu_id;
vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
run = vcpu_state(vm, vcpu_id);
while (!READ_ONCE(host_quit)) {
uint64_t current_iteration = READ_ONCE(iteration);
clock_gettime(CLOCK_MONOTONIC, &start);
ret = _vcpu_run(vm, vcpu_id);
ts_diff = timespec_diff_now(start);
TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
TEST_ASSERT(get_ucall(vm, vcpu_id, NULL) == UCALL_SYNC,
"Invalid guest sync status: exit_reason=%s\n",
exit_reason_str(run->exit_reason));
pr_debug("Got sync event from vCPU %d\n", vcpu_id);
vcpu_last_completed_iteration[vcpu_id] = current_iteration;
pr_debug("vCPU %d updated last completed iteration to %lu\n",
vcpu_id, vcpu_last_completed_iteration[vcpu_id]);
if (current_iteration) {
pages_count += vcpu_args->pages;
total = timespec_add(total, ts_diff);
pr_debug("vCPU %d iteration %lu dirty memory time: %ld.%.9lds\n",
vcpu_id, current_iteration, ts_diff.tv_sec,
ts_diff.tv_nsec);
} else {
pr_debug("vCPU %d iteration %lu populate memory time: %ld.%.9lds\n",
vcpu_id, current_iteration, ts_diff.tv_sec,
ts_diff.tv_nsec);
}
while (current_iteration == READ_ONCE(iteration) &&
!READ_ONCE(host_quit)) {}
}
avg = timespec_div(total, vcpu_last_completed_iteration[vcpu_id]);
pr_debug("\nvCPU %d dirtied 0x%lx pages over %lu iterations in %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n",
vcpu_id, pages_count, vcpu_last_completed_iteration[vcpu_id],
total.tv_sec, total.tv_nsec, avg.tv_sec, avg.tv_nsec);
return NULL;
}
#ifdef USE_CLEAR_DIRTY_LOG
static u64 dirty_log_manual_caps;
#endif
static void run_test(enum vm_guest_mode mode, unsigned long iterations,
uint64_t phys_offset, int wr_fract)
{
pthread_t *vcpu_threads;
struct kvm_vm *vm;
unsigned long *bmap;
uint64_t guest_num_pages;
uint64_t host_num_pages;
int vcpu_id;
struct timespec start;
struct timespec ts_diff;
struct timespec get_dirty_log_total = (struct timespec){0};
struct timespec vcpu_dirty_total = (struct timespec){0};
struct timespec avg;
#ifdef USE_CLEAR_DIRTY_LOG
struct kvm_enable_cap cap = {};
struct timespec clear_dirty_log_total = (struct timespec){0};
#endif
vm = create_vm(mode, nr_vcpus, guest_percpu_mem_size);
perf_test_args.wr_fract = wr_fract;
guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm_get_page_shift(vm);
guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages);
host_num_pages = vm_num_host_pages(mode, guest_num_pages);
bmap = bitmap_alloc(host_num_pages);
#ifdef USE_CLEAR_DIRTY_LOG
cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2;
cap.args[0] = dirty_log_manual_caps;
vm_enable_cap(vm, &cap);
#endif
vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads));
TEST_ASSERT(vcpu_threads, "Memory allocation failed");
add_vcpus(vm, nr_vcpus, guest_percpu_mem_size);
sync_global_to_guest(vm, perf_test_args);
/* Start the iterations */
iteration = 0;
host_quit = false;
clock_gettime(CLOCK_MONOTONIC, &start);
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker,
&perf_test_args.vcpu_args[vcpu_id]);
}
/* Allow the vCPU to populate memory */
pr_debug("Starting iteration %lu - Populating\n", iteration);
while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) != iteration)
pr_debug("Waiting for vcpu_last_completed_iteration == %lu\n",
iteration);
ts_diff = timespec_diff_now(start);
pr_info("Populate memory time: %ld.%.9lds\n",
ts_diff.tv_sec, ts_diff.tv_nsec);
/* Enable dirty logging */
clock_gettime(CLOCK_MONOTONIC, &start);
vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX,
KVM_MEM_LOG_DIRTY_PAGES);
ts_diff = timespec_diff_now(start);
pr_info("Enabling dirty logging time: %ld.%.9lds\n\n",
ts_diff.tv_sec, ts_diff.tv_nsec);
while (iteration < iterations) {
/*
* Incrementing the iteration number will start the vCPUs
* dirtying memory again.
*/
clock_gettime(CLOCK_MONOTONIC, &start);
iteration++;
pr_debug("Starting iteration %lu\n", iteration);
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) != iteration)
pr_debug("Waiting for vCPU %d vcpu_last_completed_iteration == %lu\n",
vcpu_id, iteration);
}
ts_diff = timespec_diff_now(start);
vcpu_dirty_total = timespec_add(vcpu_dirty_total, ts_diff);
pr_info("Iteration %lu dirty memory time: %ld.%.9lds\n",
iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
clock_gettime(CLOCK_MONOTONIC, &start);
kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap);
ts_diff = timespec_diff_now(start);
get_dirty_log_total = timespec_add(get_dirty_log_total,
ts_diff);
pr_info("Iteration %lu get dirty log time: %ld.%.9lds\n",
iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
#ifdef USE_CLEAR_DIRTY_LOG
clock_gettime(CLOCK_MONOTONIC, &start);
kvm_vm_clear_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap, 0,
host_num_pages);
ts_diff = timespec_diff_now(start);
clear_dirty_log_total = timespec_add(clear_dirty_log_total,
ts_diff);
pr_info("Iteration %lu clear dirty log time: %ld.%.9lds\n",
iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
#endif
}
/* Tell the vcpu thread to quit */
host_quit = true;
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++)
pthread_join(vcpu_threads[vcpu_id], NULL);
/* Disable dirty logging */
clock_gettime(CLOCK_MONOTONIC, &start);
vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX, 0);
ts_diff = timespec_diff_now(start);
pr_info("Disabling dirty logging time: %ld.%.9lds\n",
ts_diff.tv_sec, ts_diff.tv_nsec);
avg = timespec_div(get_dirty_log_total, iterations);
pr_info("Get dirty log over %lu iterations took %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n",
iterations, get_dirty_log_total.tv_sec,
get_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec);
#ifdef USE_CLEAR_DIRTY_LOG
avg = timespec_div(clear_dirty_log_total, iterations);
pr_info("Clear dirty log over %lu iterations took %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n",
iterations, clear_dirty_log_total.tv_sec,
clear_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec);
#endif
free(bmap);
free(vcpu_threads);
ucall_uninit(vm);
kvm_vm_free(vm);
}
struct guest_mode {
bool supported;
bool enabled;
};
static struct guest_mode guest_modes[NUM_VM_MODES];
#define guest_mode_init(mode, supported, enabled) ({ \
guest_modes[mode] = (struct guest_mode){ supported, enabled }; \
})
static void help(char *name)
{
int i;
puts("");
printf("usage: %s [-h] [-i iterations] [-p offset] "
"[-m mode] [-b vcpu bytes] [-v vcpus]\n", name);
puts("");
printf(" -i: specify iteration counts (default: %"PRIu64")\n",
TEST_HOST_LOOP_N);
printf(" -p: specify guest physical test memory offset\n"
" Warning: a low offset can conflict with the loaded test code.\n");
printf(" -m: specify the guest mode ID to test "
"(default: test all supported modes)\n"
" This option may be used multiple times.\n"
" Guest mode IDs:\n");
for (i = 0; i < NUM_VM_MODES; ++i) {
printf(" %d: %s%s\n", i, vm_guest_mode_string(i),
guest_modes[i].supported ? " (supported)" : "");
}
printf(" -b: specify the size of the memory region which should be\n"
" dirtied by each vCPU. e.g. 10M or 3G.\n"
" (default: 1G)\n");
printf(" -f: specify the fraction of pages which should be written to\n"
" as opposed to simply read, in the form\n"
" 1/<fraction of pages to write>.\n"
" (default: 1 i.e. all pages are written to.)\n");
printf(" -v: specify the number of vCPUs to run.\n");
puts("");
exit(0);
}
int main(int argc, char *argv[])
{
unsigned long iterations = TEST_HOST_LOOP_N;
bool mode_selected = false;
uint64_t phys_offset = 0;
unsigned int mode;
int opt, i;
int wr_fract = 1;
#ifdef USE_CLEAR_DIRTY_LOG
dirty_log_manual_caps =
kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
if (!dirty_log_manual_caps) {
print_skip("KVM_CLEAR_DIRTY_LOG not available");
exit(KSFT_SKIP);
}
dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
KVM_DIRTY_LOG_INITIALLY_SET);
#endif
#ifdef __x86_64__
guest_mode_init(VM_MODE_PXXV48_4K, true, true);
#endif
#ifdef __aarch64__
guest_mode_init(VM_MODE_P40V48_4K, true, true);
guest_mode_init(VM_MODE_P40V48_64K, true, true);
{
unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE);
if (limit >= 52)
guest_mode_init(VM_MODE_P52V48_64K, true, true);
if (limit >= 48) {
guest_mode_init(VM_MODE_P48V48_4K, true, true);
guest_mode_init(VM_MODE_P48V48_64K, true, true);
}
}
#endif
#ifdef __s390x__
guest_mode_init(VM_MODE_P40V48_4K, true, true);
#endif
while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:")) != -1) {
switch (opt) {
case 'i':
iterations = strtol(optarg, NULL, 10);
break;
case 'p':
phys_offset = strtoull(optarg, NULL, 0);
break;
case 'm':
if (!mode_selected) {
for (i = 0; i < NUM_VM_MODES; ++i)
guest_modes[i].enabled = false;
mode_selected = true;
}
mode = strtoul(optarg, NULL, 10);
TEST_ASSERT(mode < NUM_VM_MODES,
"Guest mode ID %d too big", mode);
guest_modes[mode].enabled = true;
break;
case 'b':
guest_percpu_mem_size = parse_size(optarg);
break;
case 'f':
wr_fract = atoi(optarg);
TEST_ASSERT(wr_fract >= 1,
"Write fraction cannot be less than one");
break;
case 'v':
nr_vcpus = atoi(optarg);
TEST_ASSERT(nr_vcpus > 0,
"Must have a positive number of vCPUs");
TEST_ASSERT(nr_vcpus <= MAX_VCPUS,
"This test does not currently support\n"
"more than %d vCPUs.", MAX_VCPUS);
break;
case 'h':
default:
help(argv[0]);
break;
}
}
TEST_ASSERT(iterations >= 2, "The test should have at least two iterations");
pr_info("Test iterations: %"PRIu64"\n", iterations);
for (i = 0; i < NUM_VM_MODES; ++i) {
if (!guest_modes[i].enabled)
continue;
TEST_ASSERT(guest_modes[i].supported,
"Guest mode ID %d (%s) not supported.",
i, vm_guest_mode_string(i));
run_test(i, iterations, phys_offset, wr_fract);
}
return 0;
}

View File

@ -128,6 +128,78 @@ static uint64_t host_dirty_count;
static uint64_t host_clear_count;
static uint64_t host_track_next_count;
enum log_mode_t {
/* Only use KVM_GET_DIRTY_LOG for logging */
LOG_MODE_DIRTY_LOG = 0,
/* Use both KVM_[GET|CLEAR]_DIRTY_LOG for logging */
LOG_MODE_CLEAR_LOG = 1,
LOG_MODE_NUM,
/* Run all supported modes */
LOG_MODE_ALL = LOG_MODE_NUM,
};
/* Mode of logging to test. Default is to run all supported modes */
static enum log_mode_t host_log_mode_option = LOG_MODE_ALL;
/* Logging mode for current run */
static enum log_mode_t host_log_mode;
static bool clear_log_supported(void)
{
return kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
}
static void clear_log_create_vm_done(struct kvm_vm *vm)
{
struct kvm_enable_cap cap = {};
u64 manual_caps;
manual_caps = kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
TEST_ASSERT(manual_caps, "MANUAL_CAPS is zero!");
manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
KVM_DIRTY_LOG_INITIALLY_SET);
cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2;
cap.args[0] = manual_caps;
vm_enable_cap(vm, &cap);
}
static void dirty_log_collect_dirty_pages(struct kvm_vm *vm, int slot,
void *bitmap, uint32_t num_pages)
{
kvm_vm_get_dirty_log(vm, slot, bitmap);
}
static void clear_log_collect_dirty_pages(struct kvm_vm *vm, int slot,
void *bitmap, uint32_t num_pages)
{
kvm_vm_get_dirty_log(vm, slot, bitmap);
kvm_vm_clear_dirty_log(vm, slot, bitmap, 0, num_pages);
}
struct log_mode {
const char *name;
/* Return true if this mode is supported, otherwise false */
bool (*supported)(void);
/* Hook when the vm creation is done (before vcpu creation) */
void (*create_vm_done)(struct kvm_vm *vm);
/* Hook to collect the dirty pages into the bitmap provided */
void (*collect_dirty_pages) (struct kvm_vm *vm, int slot,
void *bitmap, uint32_t num_pages);
} log_modes[LOG_MODE_NUM] = {
{
.name = "dirty-log",
.collect_dirty_pages = dirty_log_collect_dirty_pages,
},
{
.name = "clear-log",
.supported = clear_log_supported,
.create_vm_done = clear_log_create_vm_done,
.collect_dirty_pages = clear_log_collect_dirty_pages,
},
};
/*
* We use this bitmap to track some pages that should have its dirty
* bit set in the _next_ iteration. For example, if we detected the
@ -137,6 +209,44 @@ static uint64_t host_track_next_count;
*/
static unsigned long *host_bmap_track;
static void log_modes_dump(void)
{
int i;
printf("all");
for (i = 0; i < LOG_MODE_NUM; i++)
printf(", %s", log_modes[i].name);
printf("\n");
}
static bool log_mode_supported(void)
{
struct log_mode *mode = &log_modes[host_log_mode];
if (mode->supported)
return mode->supported();
return true;
}
static void log_mode_create_vm_done(struct kvm_vm *vm)
{
struct log_mode *mode = &log_modes[host_log_mode];
if (mode->create_vm_done)
mode->create_vm_done(vm);
}
static void log_mode_collect_dirty_pages(struct kvm_vm *vm, int slot,
void *bitmap, uint32_t num_pages)
{
struct log_mode *mode = &log_modes[host_log_mode];
TEST_ASSERT(mode->collect_dirty_pages != NULL,
"collect_dirty_pages() is required for any log mode!");
mode->collect_dirty_pages(vm, slot, bitmap, num_pages);
}
static void generate_random_array(uint64_t *guest_array, uint64_t size)
{
uint64_t i;
@ -195,7 +305,7 @@ static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap)
page);
}
if (test_bit_le(page, bmap)) {
if (test_and_clear_bit_le(page, bmap)) {
host_dirty_count++;
/*
* If the bit is set, the value written onto
@ -252,11 +362,12 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid,
pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
vm = _vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR);
vm = vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR);
kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
#ifdef __x86_64__
vm_create_irqchip(vm);
#endif
log_mode_create_vm_done(vm);
vm_vcpu_add_default(vm, vcpuid, guest_code);
return vm;
}
@ -264,10 +375,6 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid,
#define DIRTY_MEM_BITS 30 /* 1G */
#define PAGE_SHIFT_4K 12
#ifdef USE_CLEAR_DIRTY_LOG
static u64 dirty_log_manual_caps;
#endif
static void run_test(enum vm_guest_mode mode, unsigned long iterations,
unsigned long interval, uint64_t phys_offset)
{
@ -275,6 +382,12 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
struct kvm_vm *vm;
unsigned long *bmap;
if (!log_mode_supported()) {
print_skip("Log mode '%s' not supported",
log_modes[host_log_mode].name);
return;
}
/*
* We reserve page table for 2 times of extra dirty mem which
* will definitely cover the original (1G+) test range. Here
@ -317,14 +430,6 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
bmap = bitmap_alloc(host_num_pages);
host_bmap_track = bitmap_alloc(host_num_pages);
#ifdef USE_CLEAR_DIRTY_LOG
struct kvm_enable_cap cap = {};
cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2;
cap.args[0] = dirty_log_manual_caps;
vm_enable_cap(vm, &cap);
#endif
/* Add an extra memory slot for testing dirty logging */
vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
guest_test_phys_mem,
@ -362,11 +467,8 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
while (iteration < iterations) {
/* Give the vcpu thread some time to dirty some pages */
usleep(interval * 1000);
kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap);
#ifdef USE_CLEAR_DIRTY_LOG
kvm_vm_clear_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap, 0,
host_num_pages);
#endif
log_mode_collect_dirty_pages(vm, TEST_MEM_SLOT_INDEX,
bmap, host_num_pages);
vm_dirty_log_verify(mode, bmap);
iteration++;
sync_global_to_guest(vm, iteration);
@ -410,6 +512,9 @@ static void help(char *name)
TEST_HOST_LOOP_INTERVAL);
printf(" -p: specify guest physical test memory offset\n"
" Warning: a low offset can conflict with the loaded test code.\n");
printf(" -M: specify the host logging mode "
"(default: run all log modes). Supported modes: \n\t");
log_modes_dump();
printf(" -m: specify the guest mode ID to test "
"(default: test all supported modes)\n"
" This option may be used multiple times.\n"
@ -429,18 +534,7 @@ int main(int argc, char *argv[])
bool mode_selected = false;
uint64_t phys_offset = 0;
unsigned int mode;
int opt, i;
#ifdef USE_CLEAR_DIRTY_LOG
dirty_log_manual_caps =
kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
if (!dirty_log_manual_caps) {
print_skip("KVM_CLEAR_DIRTY_LOG not available");
exit(KSFT_SKIP);
}
dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
KVM_DIRTY_LOG_INITIALLY_SET);
#endif
int opt, i, j;
#ifdef __x86_64__
guest_mode_init(VM_MODE_PXXV48_4K, true, true);
@ -464,7 +558,7 @@ int main(int argc, char *argv[])
guest_mode_init(VM_MODE_P40V48_4K, true, true);
#endif
while ((opt = getopt(argc, argv, "hi:I:p:m:")) != -1) {
while ((opt = getopt(argc, argv, "hi:I:p:m:M:")) != -1) {
switch (opt) {
case 'i':
iterations = strtol(optarg, NULL, 10);
@ -486,6 +580,26 @@ int main(int argc, char *argv[])
"Guest mode ID %d too big", mode);
guest_modes[mode].enabled = true;
break;
case 'M':
if (!strcmp(optarg, "all")) {
host_log_mode_option = LOG_MODE_ALL;
break;
}
for (i = 0; i < LOG_MODE_NUM; i++) {
if (!strcmp(optarg, log_modes[i].name)) {
pr_info("Setting log mode to: '%s'\n",
optarg);
host_log_mode_option = i;
break;
}
}
if (i == LOG_MODE_NUM) {
printf("Log mode '%s' invalid. Please choose "
"from: ", optarg);
log_modes_dump();
exit(1);
}
break;
case 'h':
default:
help(argv[0]);
@ -507,7 +621,18 @@ int main(int argc, char *argv[])
TEST_ASSERT(guest_modes[i].supported,
"Guest mode ID %d (%s) not supported.",
i, vm_guest_mode_string(i));
run_test(i, iterations, interval, phys_offset);
if (host_log_mode_option == LOG_MODE_ALL) {
/* Run each log mode */
for (j = 0; j < LOG_MODE_NUM; j++) {
pr_info("Testing Log Mode '%s'\n",
log_modes[j].name);
host_log_mode = j;
run_test(i, iterations, interval, phys_offset);
}
} else {
host_log_mode = host_log_mode_option;
run_test(i, iterations, interval, phys_offset);
}
}
return 0;

View File

@ -63,9 +63,11 @@ enum vm_mem_backing_src_type {
int kvm_check_cap(long cap);
int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap);
int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id,
struct kvm_enable_cap *cap);
void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size);
struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm);
struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm);
void kvm_vm_free(struct kvm_vm *vmp);
void kvm_vm_restart(struct kvm_vm *vmp, int perm);
void kvm_vm_release(struct kvm_vm *vmp);
@ -149,6 +151,7 @@ void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid,
struct kvm_guest_debug *debug);
void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid,
struct kvm_mp_state *mp_state);
struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid);
void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs);
void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs);
@ -294,6 +297,8 @@ int vm_create_device(struct kvm_vm *vm, struct kvm_create_device *cd);
memcpy(&(g), _p, sizeof(g)); \
})
void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid);
/* Common ucalls */
enum {
UCALL_NONE,

View File

@ -0,0 +1,198 @@
// SPDX-License-Identifier: GPL-2.0
/*
* tools/testing/selftests/kvm/include/perf_test_util.h
*
* Copyright (C) 2020, Google LLC.
*/
#ifndef SELFTEST_KVM_PERF_TEST_UTIL_H
#define SELFTEST_KVM_PERF_TEST_UTIL_H
#include "kvm_util.h"
#include "processor.h"
#define MAX_VCPUS 512
#define PAGE_SHIFT_4K 12
#define PTES_PER_4K_PT 512
#define TEST_MEM_SLOT_INDEX 1
/* Default guest test virtual memory offset */
#define DEFAULT_GUEST_TEST_MEM 0xc0000000
#define DEFAULT_PER_VCPU_MEM_SIZE (1 << 30) /* 1G */
/*
* Guest physical memory offset of the testing memory slot.
* This will be set to the topmost valid physical address minus
* the test memory size.
*/
static uint64_t guest_test_phys_mem;
/*
* Guest virtual memory offset of the testing memory slot.
* Must not conflict with identity mapped test code.
*/
static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM;
static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
/* Number of VCPUs for the test */
static int nr_vcpus = 1;
struct vcpu_args {
uint64_t gva;
uint64_t pages;
/* Only used by the host userspace part of the vCPU thread */
int vcpu_id;
};
struct perf_test_args {
struct kvm_vm *vm;
uint64_t host_page_size;
uint64_t guest_page_size;
int wr_fract;
struct vcpu_args vcpu_args[MAX_VCPUS];
};
static struct perf_test_args perf_test_args;
/*
* Continuously write to the first 8 bytes of each page in the
* specified region.
*/
static void guest_code(uint32_t vcpu_id)
{
struct vcpu_args *vcpu_args = &perf_test_args.vcpu_args[vcpu_id];
uint64_t gva;
uint64_t pages;
int i;
/* Make sure vCPU args data structure is not corrupt. */
GUEST_ASSERT(vcpu_args->vcpu_id == vcpu_id);
gva = vcpu_args->gva;
pages = vcpu_args->pages;
while (true) {
for (i = 0; i < pages; i++) {
uint64_t addr = gva + (i * perf_test_args.guest_page_size);
if (i % perf_test_args.wr_fract == 0)
*(uint64_t *)addr = 0x0123456789ABCDEF;
else
READ_ONCE(*(uint64_t *)addr);
}
GUEST_SYNC(1);
}
}
static struct kvm_vm *create_vm(enum vm_guest_mode mode, int vcpus,
uint64_t vcpu_memory_bytes)
{
struct kvm_vm *vm;
uint64_t pages = DEFAULT_GUEST_PHY_PAGES;
uint64_t guest_num_pages;
/* Account for a few pages per-vCPU for stacks */
pages += DEFAULT_STACK_PGS * vcpus;
/*
* Reserve twice the ammount of memory needed to map the test region and
* the page table / stacks region, at 4k, for page tables. Do the
* calculation with 4K page size: the smallest of all archs. (e.g., 64K
* page size guest will need even less memory for page tables).
*/
pages += (2 * pages) / PTES_PER_4K_PT;
pages += ((2 * vcpus * vcpu_memory_bytes) >> PAGE_SHIFT_4K) /
PTES_PER_4K_PT;
pages = vm_adjust_num_guest_pages(mode, pages);
pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
vm = vm_create(mode, pages, O_RDWR);
kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
#ifdef __x86_64__
vm_create_irqchip(vm);
#endif
perf_test_args.vm = vm;
perf_test_args.guest_page_size = vm_get_page_size(vm);
perf_test_args.host_page_size = getpagesize();
TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0,
"Guest memory size is not guest page size aligned.");
guest_num_pages = (vcpus * vcpu_memory_bytes) /
perf_test_args.guest_page_size;
guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages);
/*
* If there should be more memory in the guest test region than there
* can be pages in the guest, it will definitely cause problems.
*/
TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm),
"Requested more guest memory than address space allows.\n"
" guest pages: %lx max gfn: %x vcpus: %d wss: %lx]\n",
guest_num_pages, vm_get_max_gfn(vm), vcpus,
vcpu_memory_bytes);
TEST_ASSERT(vcpu_memory_bytes % perf_test_args.host_page_size == 0,
"Guest memory size is not host page size aligned.");
guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) *
perf_test_args.guest_page_size;
guest_test_phys_mem &= ~(perf_test_args.host_page_size - 1);
#ifdef __s390x__
/* Align to 1M (segment size) */
guest_test_phys_mem &= ~((1 << 20) - 1);
#endif
pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem);
/* Add an extra memory slot for testing */
vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
guest_test_phys_mem,
TEST_MEM_SLOT_INDEX,
guest_num_pages, 0);
/* Do mapping for the demand paging memory slot */
virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0);
ucall_init(vm, NULL);
return vm;
}
static void add_vcpus(struct kvm_vm *vm, int vcpus, uint64_t vcpu_memory_bytes)
{
vm_paddr_t vcpu_gpa;
struct vcpu_args *vcpu_args;
int vcpu_id;
for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) {
vcpu_args = &perf_test_args.vcpu_args[vcpu_id];
vm_vcpu_add_default(vm, vcpu_id, guest_code);
#ifdef __x86_64__
vcpu_set_cpuid(vm, vcpu_id, kvm_get_supported_cpuid());
#endif
vcpu_args->vcpu_id = vcpu_id;
vcpu_args->gva = guest_test_virt_mem +
(vcpu_id * vcpu_memory_bytes);
vcpu_args->pages = vcpu_memory_bytes /
perf_test_args.guest_page_size;
vcpu_gpa = guest_test_phys_mem + (vcpu_id * vcpu_memory_bytes);
pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n",
vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_memory_bytes);
}
}
#endif /* SELFTEST_KVM_PERF_TEST_UTIL_H */

View File

@ -64,5 +64,7 @@ int64_t timespec_to_ns(struct timespec ts);
struct timespec timespec_add_ns(struct timespec ts, int64_t ns);
struct timespec timespec_add(struct timespec ts1, struct timespec ts2);
struct timespec timespec_sub(struct timespec ts1, struct timespec ts2);
struct timespec timespec_diff_now(struct timespec start);
struct timespec timespec_div(struct timespec ts, int divisor);
#endif /* SELFTEST_KVM_TEST_UTIL_H */

View File

@ -36,6 +36,8 @@
#define X86_CR4_SMAP (1ul << 21)
#define X86_CR4_PKE (1ul << 22)
#define UNEXPECTED_VECTOR_PORT 0xfff0u
/* General Registers in 64-Bit Mode */
struct gpr64_regs {
u64 rax;
@ -59,7 +61,7 @@ struct gpr64_regs {
struct desc64 {
uint16_t limit0;
uint16_t base0;
unsigned base1:8, s:1, type:4, dpl:2, p:1;
unsigned base1:8, type:4, s:1, dpl:2, p:1;
unsigned limit1:4, avl:1, l:1, db:1, g:1, base2:8;
uint32_t base3;
uint32_t zero1;
@ -239,6 +241,11 @@ static inline struct desc_ptr get_idt(void)
return idt;
}
static inline void outl(uint16_t port, uint32_t value)
{
__asm__ __volatile__("outl %%eax, %%dx" : : "d"(port), "a"(value));
}
#define SET_XMM(__var, __xmm) \
asm volatile("movq %0, %%"#__xmm : : "r"(__var) : #__xmm)
@ -338,6 +345,35 @@ uint32_t kvm_get_cpuid_max_basic(void);
uint32_t kvm_get_cpuid_max_extended(void);
void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits);
struct ex_regs {
uint64_t rax, rcx, rdx, rbx;
uint64_t rbp, rsi, rdi;
uint64_t r8, r9, r10, r11;
uint64_t r12, r13, r14, r15;
uint64_t vector;
uint64_t error_code;
uint64_t rip;
uint64_t cs;
uint64_t rflags;
};
void vm_init_descriptor_tables(struct kvm_vm *vm);
void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid);
void vm_handle_exception(struct kvm_vm *vm, int vector,
void (*handler)(struct ex_regs *));
/*
* set_cpuid() - overwrites a matching cpuid entry with the provided value.
* matches based on ent->function && ent->index. returns true
* if a match was found and successfully overwritten.
* @cpuid: the kvm cpuid list to modify.
* @ent: cpuid entry to insert
*/
bool set_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *ent);
uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
uint64_t a3);
/*
* Basic CPU control in CR0
*/

View File

@ -350,3 +350,7 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...)
va_end(ap);
}
void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid)
{
}

View File

@ -94,6 +94,9 @@ uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc)
struct kvm_run *run = vcpu_state(vm, vcpu_id);
struct ucall ucall = {};
if (uc)
memset(uc, 0, sizeof(*uc));
if (run->exit_reason == KVM_EXIT_MMIO &&
run->mmio.phys_addr == (uint64_t)ucall_exit_mmio_addr) {
vm_vaddr_t gva;

View File

@ -86,6 +86,34 @@ int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap)
return ret;
}
/* VCPU Enable Capability
*
* Input Args:
* vm - Virtual Machine
* vcpu_id - VCPU
* cap - Capability
*
* Output Args: None
*
* Return: On success, 0. On failure a TEST_ASSERT failure is produced.
*
* Enables a capability (KVM_CAP_*) on the VCPU.
*/
int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id,
struct kvm_enable_cap *cap)
{
struct vcpu *vcpu = vcpu_find(vm, vcpu_id);
int r;
TEST_ASSERT(vcpu, "cannot find vcpu %d", vcpu_id);
r = ioctl(vcpu->fd, KVM_ENABLE_CAP, cap);
TEST_ASSERT(!r, "KVM_ENABLE_CAP vCPU ioctl failed,\n"
" rc: %i, errno: %i", r, errno);
return r;
}
static void vm_open(struct kvm_vm *vm, int perm)
{
vm->kvm_fd = open(KVM_DEV_PATH, perm);
@ -152,7 +180,7 @@ _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params)
* descriptor to control the created VM is created with the permissions
* given by perm (e.g. O_RDWR).
*/
struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
{
struct kvm_vm *vm;
@ -243,11 +271,6 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
return vm;
}
struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
{
return _vm_create(mode, phy_pages, perm);
}
/*
* VM Restart
*
@ -1204,6 +1227,9 @@ int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid)
do {
rc = ioctl(vcpu->fd, KVM_RUN, NULL);
} while (rc == -1 && errno == EINTR);
assert_on_unhandled_exception(vm, vcpuid);
return rc;
}
@ -1260,6 +1286,35 @@ void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid,
"rc: %i errno: %i", ret, errno);
}
/*
* VM VCPU Get Reg List
*
* Input Args:
* vm - Virtual Machine
* vcpuid - VCPU ID
*
* Output Args:
* None
*
* Return:
* A pointer to an allocated struct kvm_reg_list
*
* Get the list of guest registers which are supported for
* KVM_GET_ONE_REG/KVM_SET_ONE_REG calls
*/
struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid)
{
struct kvm_reg_list reg_list_n = { .n = 0 }, *reg_list;
int ret;
ret = _vcpu_ioctl(vm, vcpuid, KVM_GET_REG_LIST, &reg_list_n);
TEST_ASSERT(ret == -1 && errno == E2BIG, "KVM_GET_REG_LIST n=0");
reg_list = calloc(1, sizeof(*reg_list) + reg_list_n.n * sizeof(__u64));
reg_list->n = reg_list_n.n;
vcpu_ioctl(vm, vcpuid, KVM_GET_REG_LIST, reg_list);
return reg_list;
}
/*
* VM VCPU Regs Get
*

View File

@ -50,6 +50,8 @@ struct kvm_vm {
vm_paddr_t pgd;
vm_vaddr_t gdt;
vm_vaddr_t tss;
vm_vaddr_t idt;
vm_vaddr_t handlers;
};
struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid);

View File

@ -241,3 +241,7 @@ void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent)
fprintf(stream, "%*spstate: psw: 0x%.16llx:0x%.16llx\n",
indent, "", vcpu->state->psw_mask, vcpu->state->psw_addr);
}
void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid)
{
}

View File

@ -38,6 +38,9 @@ uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc)
struct kvm_run *run = vcpu_state(vm, vcpu_id);
struct ucall ucall = {};
if (uc)
memset(uc, 0, sizeof(*uc));
if (run->exit_reason == KVM_EXIT_S390_SIEIC &&
run->s390_sieic.icptcode == 4 &&
(run->s390_sieic.ipa >> 8) == 0x83 && /* 0x83 means DIAGNOSE */

View File

@ -4,10 +4,13 @@
*
* Copyright (C) 2020, Google LLC.
*/
#include <stdlib.h>
#include <assert.h>
#include <ctype.h>
#include <limits.h>
#include <assert.h>
#include <stdlib.h>
#include <time.h>
#include "test_util.h"
/*
@ -81,6 +84,21 @@ struct timespec timespec_sub(struct timespec ts1, struct timespec ts2)
return timespec_add_ns((struct timespec){0}, ns1 - ns2);
}
struct timespec timespec_diff_now(struct timespec start)
{
struct timespec end;
clock_gettime(CLOCK_MONOTONIC, &end);
return timespec_sub(end, start);
}
struct timespec timespec_div(struct timespec ts, int divisor)
{
int64_t ns = timespec_to_ns(ts) / divisor;
return timespec_add_ns((struct timespec){0}, ns);
}
void print_skip(const char *fmt, ...)
{
va_list ap;

View File

@ -0,0 +1,81 @@
handle_exception:
push %r15
push %r14
push %r13
push %r12
push %r11
push %r10
push %r9
push %r8
push %rdi
push %rsi
push %rbp
push %rbx
push %rdx
push %rcx
push %rax
mov %rsp, %rdi
call route_exception
pop %rax
pop %rcx
pop %rdx
pop %rbx
pop %rbp
pop %rsi
pop %rdi
pop %r8
pop %r9
pop %r10
pop %r11
pop %r12
pop %r13
pop %r14
pop %r15
/* Discard vector and error code. */
add $16, %rsp
iretq
/*
* Build the handle_exception wrappers which push the vector/error code on the
* stack and an array of pointers to those wrappers.
*/
.pushsection .rodata
.globl idt_handlers
idt_handlers:
.popsection
.macro HANDLERS has_error from to
vector = \from
.rept \to - \from + 1
.align 8
/* Fetch current address and append it to idt_handlers. */
current_handler = .
.pushsection .rodata
.quad current_handler
.popsection
.if ! \has_error
pushq $0
.endif
pushq $vector
jmp handle_exception
vector = vector + 1
.endr
.endm
.global idt_handler_code
idt_handler_code:
HANDLERS has_error=0 from=0 to=7
HANDLERS has_error=1 from=8 to=8
HANDLERS has_error=0 from=9 to=9
HANDLERS has_error=1 from=10 to=14
HANDLERS has_error=0 from=15 to=16
HANDLERS has_error=1 from=17 to=17
HANDLERS has_error=0 from=18 to=255
.section .note.GNU-stack, "", %progbits

View File

@ -12,9 +12,18 @@
#include "../kvm_util_internal.h"
#include "processor.h"
#ifndef NUM_INTERRUPTS
#define NUM_INTERRUPTS 256
#endif
#define DEFAULT_CODE_SELECTOR 0x8
#define DEFAULT_DATA_SELECTOR 0x10
/* Minimum physical address used for virtual translation tables. */
#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000
vm_vaddr_t exception_handlers;
/* Virtual translation table structure declarations */
struct pageMapL4Entry {
uint64_t present:1;
@ -392,11 +401,12 @@ static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp)
desc->limit0 = segp->limit & 0xFFFF;
desc->base0 = segp->base & 0xFFFF;
desc->base1 = segp->base >> 16;
desc->s = segp->s;
desc->type = segp->type;
desc->s = segp->s;
desc->dpl = segp->dpl;
desc->p = segp->present;
desc->limit1 = segp->limit >> 16;
desc->avl = segp->avl;
desc->l = segp->l;
desc->db = segp->db;
desc->g = segp->g;
@ -556,9 +566,9 @@ static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_m
sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX);
kvm_seg_set_unusable(&sregs.ldt);
kvm_seg_set_kernel_code_64bit(vm, 0x8, &sregs.cs);
kvm_seg_set_kernel_data_64bit(vm, 0x10, &sregs.ds);
kvm_seg_set_kernel_data_64bit(vm, 0x10, &sregs.es);
kvm_seg_set_kernel_code_64bit(vm, DEFAULT_CODE_SELECTOR, &sregs.cs);
kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.ds);
kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.es);
kvm_setup_tss_64bit(vm, &sregs.tr, 0x18, gdt_memslot, pgd_memslot);
break;
@ -1118,3 +1128,131 @@ void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits)
*va_bits = (entry->eax >> 8) & 0xff;
}
}
struct idt_entry {
uint16_t offset0;
uint16_t selector;
uint16_t ist : 3;
uint16_t : 5;
uint16_t type : 4;
uint16_t : 1;
uint16_t dpl : 2;
uint16_t p : 1;
uint16_t offset1;
uint32_t offset2; uint32_t reserved;
};
static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr,
int dpl, unsigned short selector)
{
struct idt_entry *base =
(struct idt_entry *)addr_gva2hva(vm, vm->idt);
struct idt_entry *e = &base[vector];
memset(e, 0, sizeof(*e));
e->offset0 = addr;
e->selector = selector;
e->ist = 0;
e->type = 14;
e->dpl = dpl;
e->p = 1;
e->offset1 = addr >> 16;
e->offset2 = addr >> 32;
}
void kvm_exit_unexpected_vector(uint32_t value)
{
outl(UNEXPECTED_VECTOR_PORT, value);
}
void route_exception(struct ex_regs *regs)
{
typedef void(*handler)(struct ex_regs *);
handler *handlers = (handler *)exception_handlers;
if (handlers && handlers[regs->vector]) {
handlers[regs->vector](regs);
return;
}
kvm_exit_unexpected_vector(regs->vector);
}
void vm_init_descriptor_tables(struct kvm_vm *vm)
{
extern void *idt_handlers;
int i;
vm->idt = vm_vaddr_alloc(vm, getpagesize(), 0x2000, 0, 0);
vm->handlers = vm_vaddr_alloc(vm, 256 * sizeof(void *), 0x2000, 0, 0);
/* Handlers have the same address in both address spaces.*/
for (i = 0; i < NUM_INTERRUPTS; i++)
set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0,
DEFAULT_CODE_SELECTOR);
}
void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid)
{
struct kvm_sregs sregs;
vcpu_sregs_get(vm, vcpuid, &sregs);
sregs.idt.base = vm->idt;
sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1;
sregs.gdt.base = vm->gdt;
sregs.gdt.limit = getpagesize() - 1;
kvm_seg_set_kernel_data_64bit(NULL, DEFAULT_DATA_SELECTOR, &sregs.gs);
vcpu_sregs_set(vm, vcpuid, &sregs);
*(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers;
}
void vm_handle_exception(struct kvm_vm *vm, int vector,
void (*handler)(struct ex_regs *))
{
vm_vaddr_t *handlers = (vm_vaddr_t *)addr_gva2hva(vm, vm->handlers);
handlers[vector] = (vm_vaddr_t)handler;
}
void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid)
{
if (vcpu_state(vm, vcpuid)->exit_reason == KVM_EXIT_IO
&& vcpu_state(vm, vcpuid)->io.port == UNEXPECTED_VECTOR_PORT
&& vcpu_state(vm, vcpuid)->io.size == 4) {
/* Grab pointer to io data */
uint32_t *data = (void *)vcpu_state(vm, vcpuid)
+ vcpu_state(vm, vcpuid)->io.data_offset;
TEST_ASSERT(false,
"Unexpected vectored event in guest (vector:0x%x)",
*data);
}
}
bool set_cpuid(struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 *ent)
{
int i;
for (i = 0; i < cpuid->nent; i++) {
struct kvm_cpuid_entry2 *cur = &cpuid->entries[i];
if (cur->function != ent->function || cur->index != ent->index)
continue;
memcpy(cur, ent, sizeof(struct kvm_cpuid_entry2));
return true;
}
return false;
}
uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
uint64_t a3)
{
uint64_t r;
asm volatile("vmcall"
: "=a"(r)
: "b"(a0), "c"(a1), "d"(a2), "S"(a3));
return r;
}

View File

@ -40,6 +40,9 @@ uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc)
struct kvm_run *run = vcpu_state(vm, vcpu_id);
struct ucall ucall = {};
if (uc)
memset(uc, 0, sizeof(*uc));
if (run->exit_reason == KVM_EXIT_IO && run->io.port == UCALL_PIO_PORT) {
struct kvm_regs regs;

View File

@ -0,0 +1,234 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2020, Google LLC.
*
* Tests for KVM paravirtual feature disablement
*/
#include <asm/kvm_para.h>
#include <linux/kvm_para.h>
#include <stdint.h>
#include "test_util.h"
#include "kvm_util.h"
#include "processor.h"
extern unsigned char rdmsr_start;
extern unsigned char rdmsr_end;
static u64 do_rdmsr(u32 idx)
{
u32 lo, hi;
asm volatile("rdmsr_start: rdmsr;"
"rdmsr_end:"
: "=a"(lo), "=c"(hi)
: "c"(idx));
return (((u64) hi) << 32) | lo;
}
extern unsigned char wrmsr_start;
extern unsigned char wrmsr_end;
static void do_wrmsr(u32 idx, u64 val)
{
u32 lo, hi;
lo = val;
hi = val >> 32;
asm volatile("wrmsr_start: wrmsr;"
"wrmsr_end:"
: : "a"(lo), "c"(idx), "d"(hi));
}
static int nr_gp;
static void guest_gp_handler(struct ex_regs *regs)
{
unsigned char *rip = (unsigned char *)regs->rip;
bool r, w;
r = rip == &rdmsr_start;
w = rip == &wrmsr_start;
GUEST_ASSERT(r || w);
nr_gp++;
if (r)
regs->rip = (uint64_t)&rdmsr_end;
else
regs->rip = (uint64_t)&wrmsr_end;
}
struct msr_data {
uint32_t idx;
const char *name;
};
#define TEST_MSR(msr) { .idx = msr, .name = #msr }
#define UCALL_PR_MSR 0xdeadbeef
#define PR_MSR(msr) ucall(UCALL_PR_MSR, 1, msr)
/*
* KVM paravirtual msrs to test. Expect a #GP if any of these msrs are read or
* written, as the KVM_CPUID_FEATURES leaf is cleared.
*/
static struct msr_data msrs_to_test[] = {
TEST_MSR(MSR_KVM_SYSTEM_TIME),
TEST_MSR(MSR_KVM_SYSTEM_TIME_NEW),
TEST_MSR(MSR_KVM_WALL_CLOCK),
TEST_MSR(MSR_KVM_WALL_CLOCK_NEW),
TEST_MSR(MSR_KVM_ASYNC_PF_EN),
TEST_MSR(MSR_KVM_STEAL_TIME),
TEST_MSR(MSR_KVM_PV_EOI_EN),
TEST_MSR(MSR_KVM_POLL_CONTROL),
TEST_MSR(MSR_KVM_ASYNC_PF_INT),
TEST_MSR(MSR_KVM_ASYNC_PF_ACK),
};
static void test_msr(struct msr_data *msr)
{
PR_MSR(msr);
do_rdmsr(msr->idx);
GUEST_ASSERT(READ_ONCE(nr_gp) == 1);
nr_gp = 0;
do_wrmsr(msr->idx, 0);
GUEST_ASSERT(READ_ONCE(nr_gp) == 1);
nr_gp = 0;
}
struct hcall_data {
uint64_t nr;
const char *name;
};
#define TEST_HCALL(hc) { .nr = hc, .name = #hc }
#define UCALL_PR_HCALL 0xdeadc0de
#define PR_HCALL(hc) ucall(UCALL_PR_HCALL, 1, hc)
/*
* KVM hypercalls to test. Expect -KVM_ENOSYS when called, as the corresponding
* features have been cleared in KVM_CPUID_FEATURES.
*/
static struct hcall_data hcalls_to_test[] = {
TEST_HCALL(KVM_HC_KICK_CPU),
TEST_HCALL(KVM_HC_SEND_IPI),
TEST_HCALL(KVM_HC_SCHED_YIELD),
};
static void test_hcall(struct hcall_data *hc)
{
uint64_t r;
PR_HCALL(hc);
r = kvm_hypercall(hc->nr, 0, 0, 0, 0);
GUEST_ASSERT(r == -KVM_ENOSYS);
}
static void guest_main(void)
{
int i;
for (i = 0; i < ARRAY_SIZE(msrs_to_test); i++) {
test_msr(&msrs_to_test[i]);
}
for (i = 0; i < ARRAY_SIZE(hcalls_to_test); i++) {
test_hcall(&hcalls_to_test[i]);
}
GUEST_DONE();
}
static void clear_kvm_cpuid_features(struct kvm_cpuid2 *cpuid)
{
struct kvm_cpuid_entry2 ent = {0};
ent.function = KVM_CPUID_FEATURES;
TEST_ASSERT(set_cpuid(cpuid, &ent),
"failed to clear KVM_CPUID_FEATURES leaf");
}
static void pr_msr(struct ucall *uc)
{
struct msr_data *msr = (struct msr_data *)uc->args[0];
pr_info("testing msr: %s (%#x)\n", msr->name, msr->idx);
}
static void pr_hcall(struct ucall *uc)
{
struct hcall_data *hc = (struct hcall_data *)uc->args[0];
pr_info("testing hcall: %s (%lu)\n", hc->name, hc->nr);
}
static void handle_abort(struct ucall *uc)
{
TEST_FAIL("%s at %s:%ld", (const char *)uc->args[0],
__FILE__, uc->args[1]);
}
#define VCPU_ID 0
static void enter_guest(struct kvm_vm *vm)
{
struct kvm_run *run;
struct ucall uc;
int r;
run = vcpu_state(vm, VCPU_ID);
while (true) {
r = _vcpu_run(vm, VCPU_ID);
TEST_ASSERT(!r, "vcpu_run failed: %d\n", r);
TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
"unexpected exit reason: %u (%s)",
run->exit_reason, exit_reason_str(run->exit_reason));
switch (get_ucall(vm, VCPU_ID, &uc)) {
case UCALL_PR_MSR:
pr_msr(&uc);
break;
case UCALL_PR_HCALL:
pr_hcall(&uc);
break;
case UCALL_ABORT:
handle_abort(&uc);
return;
case UCALL_DONE:
return;
}
}
}
int main(void)
{
struct kvm_enable_cap cap = {0};
struct kvm_cpuid2 *best;
struct kvm_vm *vm;
if (!kvm_check_cap(KVM_CAP_ENFORCE_PV_FEATURE_CPUID)) {
pr_info("will skip kvm paravirt restriction tests.\n");
return 0;
}
vm = vm_create_default(VCPU_ID, 0, guest_main);
cap.cap = KVM_CAP_ENFORCE_PV_FEATURE_CPUID;
cap.args[0] = 1;
vcpu_enable_cap(vm, VCPU_ID, &cap);
best = kvm_get_supported_cpuid();
clear_kvm_cpuid_features(best);
vcpu_set_cpuid(vm, VCPU_ID, best);
vm_init_descriptor_tables(vm);
vcpu_init_descriptor_tables(vm, VCPU_ID);
vm_handle_exception(vm, GP_VECTOR, guest_gp_handler);
enter_guest(vm);
kvm_vm_free(vm);
}