mirror of https://gitee.com/openkylin/linux.git
This has been long in the making - an AMD-specific MCE-severity grading
function. And it is actually readable at a quick glance.

Further error recovery actions will be based on its output. Patches tested
on every relevant AMD family out there.

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1

iQIcBAABAgAGBQJVGQTEAAoJEBLB8Bhh3lVKjFkQAIF9M+wxU6a39bMlzFBj06E4
3k4PcTntkmlonINGODT5nQjKVChSSUkDEZqPQJ4yFV2mO0R0sRfs23pRF27qV3G9
MjbR6BA7g32r4ORQw0sXMp4IIqSE9otj/JqhTG3FmLjGLrUt+kxeLayg558nuxlZ
QRIIjoD1u55fSXRaNLt6k2pHG2BbOd2eREvqbiXCp5Jq48T62fDe4UK3Op4a+zFr
cWqz/Q4FnTNyiTgAPnlYMHU85s19vR1S/LMgC78Rcd3Pq7vKeo/9dMDDbccadpva
ABpvRNvzuAFiusgG9WyK1cWgFA2MUjjHDafbDXA4TC8n8euRhuiAdF5xjWelPoDL
yB/vBqCNE+XueRUPC4FXsUu04612wFY/DjRrRLtWqa12h2ZFvIDl/tUUqmxWKF33
XM/QuwJT/7VKt3aF2TzK1CwDWoYhktry+mF+wPB+TrjNwRg/BcyIMut9qcogvuKQ
CskksS/rn5Vxyq2LE5E1+K/EFcJqzOSVbJYM03RJGi6fFhmP/XDbCE9mxvYzb5sS
wwKmS0xksdAeyEJ8gCp+0rFpvTCW5mQbBo/MccGp7NNMvDGRJxXel02gZOBt2xBb
QScLfMMMdkk+1hcER+7e6j0Xdhedmkla1JcumY8sp66tsHJZ4w8noOIUTszdnWHQ
0OAl/+41wHX1xhAJuOnW
=vZSN
-----END PGP SIGNATURE-----

Merge tag 'amd_severity' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras into x86/ras

Pull RAS update from Borislav Petkov:

"This has been long in the making - an AMD-specific MCE-severity grading
function. And it is actually readable at a quick glance.

Further error recovery actions will be based on its output. Patches tested
on every relevant AMD family out there."

Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
commit
f5c8a10411
|
@ -116,6 +116,12 @@ struct mca_config {
|
||||||
u32 rip_msr;
|
u32 rip_msr;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct mce_vendor_flags {
|
||||||
|
__u64 overflow_recov : 1, /* cpuid_ebx(80000007) */
|
||||||
|
__reserved_0 : 63;
|
||||||
|
};
|
||||||
|
extern struct mce_vendor_flags mce_flags;
|
||||||
|
|
||||||
extern struct mca_config mca_cfg;
|
extern struct mca_config mca_cfg;
|
||||||
extern void mce_register_decode_chain(struct notifier_block *nb);
|
extern void mce_register_decode_chain(struct notifier_block *nb);
|
||||||
extern void mce_unregister_decode_chain(struct notifier_block *nb);
|
extern void mce_unregister_decode_chain(struct notifier_block *nb);
|
||||||
|
@ -128,9 +134,11 @@ extern int mce_p5_enabled;
|
||||||
#ifdef CONFIG_X86_MCE
|
#ifdef CONFIG_X86_MCE
|
||||||
int mcheck_init(void);
|
int mcheck_init(void);
|
||||||
void mcheck_cpu_init(struct cpuinfo_x86 *c);
|
void mcheck_cpu_init(struct cpuinfo_x86 *c);
|
||||||
|
void mcheck_vendor_init_severity(void);
|
||||||
#else
|
#else
|
||||||
static inline int mcheck_init(void) { return 0; }
|
static inline int mcheck_init(void) { return 0; }
|
||||||
static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
|
static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
|
||||||
|
static inline void mcheck_vendor_init_severity(void) {}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef CONFIG_X86_ANCIENT_MCE
|
#ifdef CONFIG_X86_ANCIENT_MCE
|
||||||
|
|
|
@ -24,7 +24,7 @@ struct mce_bank {
|
||||||
char attrname[ATTR_LEN]; /* attribute name */
|
char attrname[ATTR_LEN]; /* attribute name */
|
||||||
};
|
};
|
||||||
|
|
||||||
int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp);
|
extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
|
||||||
struct dentry *mce_get_debugfs_dir(void);
|
struct dentry *mce_get_debugfs_dir(void);
|
||||||
|
|
||||||
extern struct mce_bank *mce_banks;
|
extern struct mce_bank *mce_banks;
|
||||||
|
|
|
@ -186,7 +186,62 @@ static int error_context(struct mce *m)
|
||||||
return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
|
return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
|
||||||
}
|
}
|
||||||
|
|
||||||
int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
|
/*
|
||||||
|
* See AMD Error Scope Hierarchy table in a newer BKDG. For example
|
||||||
|
* 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
|
||||||
|
*/
|
||||||
|
static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp)
|
||||||
|
{
|
||||||
|
enum context ctx = error_context(m);
|
||||||
|
|
||||||
|
/* Processor Context Corrupt, no need to fumble too much, die! */
|
||||||
|
if (m->status & MCI_STATUS_PCC)
|
||||||
|
return MCE_PANIC_SEVERITY;
|
||||||
|
|
||||||
|
if (m->status & MCI_STATUS_UC) {
|
||||||
|
|
||||||
|
/*
|
||||||
|
* On older systems where overflow_recov flag is not present, we
|
||||||
|
* should simply panic if an error overflow occurs. If
|
||||||
|
* overflow_recov flag is present and set, then software can try
|
||||||
|
* to at least kill process to prolong system operation.
|
||||||
|
*/
|
||||||
|
if (mce_flags.overflow_recov) {
|
||||||
|
/* software can try to contain */
|
||||||
|
if (!(m->mcgstatus & MCG_STATUS_RIPV))
|
||||||
|
if (ctx == IN_KERNEL)
|
||||||
|
return MCE_PANIC_SEVERITY;
|
||||||
|
|
||||||
|
/* kill current process */
|
||||||
|
return MCE_AR_SEVERITY;
|
||||||
|
} else {
|
||||||
|
/* at least one error was not logged */
|
||||||
|
if (m->status & MCI_STATUS_OVER)
|
||||||
|
return MCE_PANIC_SEVERITY;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* For any other case, return MCE_UC_SEVERITY so that we log the
|
||||||
|
* error and exit #MC handler.
|
||||||
|
*/
|
||||||
|
return MCE_UC_SEVERITY;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* deferred error: poll handler catches these and adds to mce_ring so
|
||||||
|
* memory-failure can take recovery actions.
|
||||||
|
*/
|
||||||
|
if (m->status & MCI_STATUS_DEFERRED)
|
||||||
|
return MCE_DEFERRED_SEVERITY;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* corrected error: poll handler catches these and passes responsibility
|
||||||
|
* of decoding the error to EDAC
|
||||||
|
*/
|
||||||
|
return MCE_KEEP_SEVERITY;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp)
|
||||||
{
|
{
|
||||||
enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
|
enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
|
||||||
enum context ctx = error_context(m);
|
enum context ctx = error_context(m);
|
||||||
|
@ -216,6 +271,16 @@ int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Default to mce_severity_intel */
|
||||||
|
int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) =
|
||||||
|
mce_severity_intel;
|
||||||
|
|
||||||
|
void __init mcheck_vendor_init_severity(void)
|
||||||
|
{
|
||||||
|
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
|
||||||
|
mce_severity = mce_severity_amd;
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_DEBUG_FS
|
#ifdef CONFIG_DEBUG_FS
|
||||||
static void *s_start(struct seq_file *f, loff_t *pos)
|
static void *s_start(struct seq_file *f, loff_t *pos)
|
||||||
{
|
{
|
||||||
|
|
|
@ -64,6 +64,7 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
|
||||||
DEFINE_PER_CPU(unsigned, mce_exception_count);
|
DEFINE_PER_CPU(unsigned, mce_exception_count);
|
||||||
|
|
||||||
struct mce_bank *mce_banks __read_mostly;
|
struct mce_bank *mce_banks __read_mostly;
|
||||||
|
struct mce_vendor_flags mce_flags __read_mostly;
|
||||||
|
|
||||||
struct mca_config mca_cfg __read_mostly = {
|
struct mca_config mca_cfg __read_mostly = {
|
||||||
.bootlog = -1,
|
.bootlog = -1,
|
||||||
|
@ -1534,6 +1535,13 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
|
||||||
if (c->x86 == 6 && cfg->banks > 0)
|
if (c->x86 == 6 && cfg->banks > 0)
|
||||||
mce_banks[0].ctl = 0;
|
mce_banks[0].ctl = 0;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* overflow_recov is supported for F15h Models 00h-0fh
|
||||||
|
* even though we don't have a CPUID bit for it.
|
||||||
|
*/
|
||||||
|
if (c->x86 == 0x15 && c->x86_model <= 0xf)
|
||||||
|
mce_flags.overflow_recov = 1;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Turn off MC4_MISC thresholding banks on those models since
|
* Turn off MC4_MISC thresholding banks on those models since
|
||||||
* they're not supported there.
|
* they're not supported there.
|
||||||
|
@ -1633,6 +1641,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
|
||||||
break;
|
break;
|
||||||
case X86_VENDOR_AMD:
|
case X86_VENDOR_AMD:
|
||||||
mce_amd_feature_init(c);
|
mce_amd_feature_init(c);
|
||||||
|
mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
|
@ -2017,6 +2026,7 @@ __setup("mce", mcheck_enable);
|
||||||
int __init mcheck_init(void)
|
int __init mcheck_init(void)
|
||||||
{
|
{
|
||||||
mcheck_intel_therm_init();
|
mcheck_intel_therm_init();
|
||||||
|
mcheck_vendor_init_severity();
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue