mirror of https://gitee.com/openkylin/linux.git
Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 RAS updates from Thomas Gleixner: "A small set of changes to the RAS core: - Rework of the MCE bank scanning code - Y2038 converion" * 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/mce: Cleanup __mc_scan_banks() x86/mce: Carve out bank scanning code x86/mce: Remove !banks check x86/mce: Carve out the crashing_cpu check x86/mce: Always use 64-bit timestamps
This commit is contained in:
commit
37a1604680
|
@ -123,8 +123,8 @@ void mce_setup(struct mce *m)
|
|||
{
|
||||
memset(m, 0, sizeof(struct mce));
|
||||
m->cpu = m->extcpu = smp_processor_id();
|
||||
/* We hope get_seconds stays lockless */
|
||||
m->time = get_seconds();
|
||||
/* need the internal __ version to avoid deadlocks */
|
||||
m->time = __ktime_get_real_seconds();
|
||||
m->cpuvendor = boot_cpu_data.x86_vendor;
|
||||
m->cpuid = cpuid_eax(1);
|
||||
m->socketid = cpu_data(m->extcpu).phys_proc_id;
|
||||
|
@ -1104,6 +1104,101 @@ static void mce_unmap_kpfn(unsigned long pfn)
|
|||
}
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
* Cases where we avoid rendezvous handler timeout:
|
||||
* 1) If this CPU is offline.
|
||||
*
|
||||
* 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
|
||||
* skip those CPUs which remain looping in the 1st kernel - see
|
||||
* crash_nmi_callback().
|
||||
*
|
||||
* Note: there still is a small window between kexec-ing and the new,
|
||||
* kdump kernel establishing a new #MC handler where a broadcasted MCE
|
||||
* might not get handled properly.
|
||||
*/
|
||||
static bool __mc_check_crashing_cpu(int cpu)
|
||||
{
|
||||
if (cpu_is_offline(cpu) ||
|
||||
(crashing_cpu != -1 && crashing_cpu != cpu)) {
|
||||
u64 mcgstatus;
|
||||
|
||||
mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
|
||||
if (mcgstatus & MCG_STATUS_RIPV) {
|
||||
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static void __mc_scan_banks(struct mce *m, struct mce *final,
|
||||
unsigned long *toclear, unsigned long *valid_banks,
|
||||
int no_way_out, int *worst)
|
||||
{
|
||||
struct mca_config *cfg = &mca_cfg;
|
||||
int severity, i;
|
||||
|
||||
for (i = 0; i < cfg->banks; i++) {
|
||||
__clear_bit(i, toclear);
|
||||
if (!test_bit(i, valid_banks))
|
||||
continue;
|
||||
|
||||
if (!mce_banks[i].ctl)
|
||||
continue;
|
||||
|
||||
m->misc = 0;
|
||||
m->addr = 0;
|
||||
m->bank = i;
|
||||
|
||||
m->status = mce_rdmsrl(msr_ops.status(i));
|
||||
if (!(m->status & MCI_STATUS_VAL))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Corrected or non-signaled errors are handled by
|
||||
* machine_check_poll(). Leave them alone, unless this panics.
|
||||
*/
|
||||
if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
|
||||
!no_way_out)
|
||||
continue;
|
||||
|
||||
/* Set taint even when machine check was not enabled. */
|
||||
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
|
||||
|
||||
severity = mce_severity(m, cfg->tolerant, NULL, true);
|
||||
|
||||
/*
|
||||
* When machine check was for corrected/deferred handler don't
|
||||
* touch, unless we're panicking.
|
||||
*/
|
||||
if ((severity == MCE_KEEP_SEVERITY ||
|
||||
severity == MCE_UCNA_SEVERITY) && !no_way_out)
|
||||
continue;
|
||||
|
||||
__set_bit(i, toclear);
|
||||
|
||||
/* Machine check event was not enabled. Clear, but ignore. */
|
||||
if (severity == MCE_NO_SEVERITY)
|
||||
continue;
|
||||
|
||||
mce_read_aux(m, i);
|
||||
|
||||
/* assuming valid severity level != 0 */
|
||||
m->severity = severity;
|
||||
|
||||
mce_log(m);
|
||||
|
||||
if (severity > *worst) {
|
||||
*final = *m;
|
||||
*worst = severity;
|
||||
}
|
||||
}
|
||||
|
||||
/* mce_clear_state will clear *final, save locally for use later */
|
||||
*m = *final;
|
||||
}
|
||||
|
||||
/*
|
||||
* The actual machine check handler. This only handles real
|
||||
* exceptions when something got corrupted coming in through int 18.
|
||||
|
@ -1118,68 +1213,45 @@ static void mce_unmap_kpfn(unsigned long pfn)
|
|||
*/
|
||||
void do_machine_check(struct pt_regs *regs, long error_code)
|
||||
{
|
||||
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
|
||||
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
|
||||
struct mca_config *cfg = &mca_cfg;
|
||||
int cpu = smp_processor_id();
|
||||
char *msg = "Unknown";
|
||||
struct mce m, *final;
|
||||
int i;
|
||||
int worst = 0;
|
||||
int severity;
|
||||
|
||||
/*
|
||||
* Establish sequential order between the CPUs entering the machine
|
||||
* check handler.
|
||||
*/
|
||||
int order = -1;
|
||||
|
||||
/*
|
||||
* If no_way_out gets set, there is no safe way to recover from this
|
||||
* MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
|
||||
*/
|
||||
int no_way_out = 0;
|
||||
|
||||
/*
|
||||
* If kill_it gets set, there might be a way to recover from this
|
||||
* error.
|
||||
*/
|
||||
int kill_it = 0;
|
||||
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
|
||||
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
|
||||
char *msg = "Unknown";
|
||||
|
||||
/*
|
||||
* MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
|
||||
* on Intel.
|
||||
*/
|
||||
int lmce = 1;
|
||||
int cpu = smp_processor_id();
|
||||
|
||||
/*
|
||||
* Cases where we avoid rendezvous handler timeout:
|
||||
* 1) If this CPU is offline.
|
||||
*
|
||||
* 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
|
||||
* skip those CPUs which remain looping in the 1st kernel - see
|
||||
* crash_nmi_callback().
|
||||
*
|
||||
* Note: there still is a small window between kexec-ing and the new,
|
||||
* kdump kernel establishing a new #MC handler where a broadcasted MCE
|
||||
* might not get handled properly.
|
||||
*/
|
||||
if (cpu_is_offline(cpu) ||
|
||||
(crashing_cpu != -1 && crashing_cpu != cpu)) {
|
||||
u64 mcgstatus;
|
||||
|
||||
mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
|
||||
if (mcgstatus & MCG_STATUS_RIPV) {
|
||||
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (__mc_check_crashing_cpu(cpu))
|
||||
return;
|
||||
|
||||
ist_enter(regs);
|
||||
|
||||
this_cpu_inc(mce_exception_count);
|
||||
|
||||
if (!cfg->banks)
|
||||
goto out;
|
||||
|
||||
mce_gather_info(&m, regs);
|
||||
m.tsc = rdtsc();
|
||||
|
||||
|
@ -1220,67 +1292,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
|
|||
order = mce_start(&no_way_out);
|
||||
}
|
||||
|
||||
for (i = 0; i < cfg->banks; i++) {
|
||||
__clear_bit(i, toclear);
|
||||
if (!test_bit(i, valid_banks))
|
||||
continue;
|
||||
if (!mce_banks[i].ctl)
|
||||
continue;
|
||||
|
||||
m.misc = 0;
|
||||
m.addr = 0;
|
||||
m.bank = i;
|
||||
|
||||
m.status = mce_rdmsrl(msr_ops.status(i));
|
||||
if ((m.status & MCI_STATUS_VAL) == 0)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Non uncorrected or non signaled errors are handled by
|
||||
* machine_check_poll. Leave them alone, unless this panics.
|
||||
*/
|
||||
if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
|
||||
!no_way_out)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Set taint even when machine check was not enabled.
|
||||
*/
|
||||
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
|
||||
|
||||
severity = mce_severity(&m, cfg->tolerant, NULL, true);
|
||||
|
||||
/*
|
||||
* When machine check was for corrected/deferred handler don't
|
||||
* touch, unless we're panicing.
|
||||
*/
|
||||
if ((severity == MCE_KEEP_SEVERITY ||
|
||||
severity == MCE_UCNA_SEVERITY) && !no_way_out)
|
||||
continue;
|
||||
__set_bit(i, toclear);
|
||||
if (severity == MCE_NO_SEVERITY) {
|
||||
/*
|
||||
* Machine check event was not enabled. Clear, but
|
||||
* ignore.
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
|
||||
mce_read_aux(&m, i);
|
||||
|
||||
/* assuming valid severity level != 0 */
|
||||
m.severity = severity;
|
||||
|
||||
mce_log(&m);
|
||||
|
||||
if (severity > worst) {
|
||||
*final = m;
|
||||
worst = severity;
|
||||
}
|
||||
}
|
||||
|
||||
/* mce_clear_state will clear *final, save locally for use later */
|
||||
m = *final;
|
||||
__mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst);
|
||||
|
||||
if (!no_way_out)
|
||||
mce_clear_state(toclear);
|
||||
|
@ -1319,7 +1331,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
|
|||
if (worst > 0)
|
||||
mce_report_event(regs);
|
||||
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
|
||||
out:
|
||||
|
||||
sync_core();
|
||||
|
||||
if (worst != MCE_AR_SEVERITY && !kill_it)
|
||||
|
|
Loading…
Reference in New Issue