EDAC, sb_edac: Fix reporting for patrol scrubber errors

sb_edac sometimes reports the wrong DIMM for a memory error found by
the patrol scrubber. That is because, for such errors, the hardware
provides only a 4KB page-aligned address.

This means that the EDAC driver will point at the DIMM matching offset
0x0 in the 4KB page, but because of interleaving across channels and
ranks, the actual DIMM involved may be different if the error is on some
other cache line within the page.

Therefore, reconstruct the socket/iMC/channel information from the "mce"
structure passed to the EDAC driver. The DIMM cannot be determined, so
pass "dimm=-1" to the EDAC core. It will report that all the DIMMs on
that channel may be affected.

Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Cc: Aristeu Rozanski <aris@redhat.com>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/20180907230828.13901-3-tony.luck@intel.com
[ Improve comments on the functions to convert bank number
  to memory controller number. Minor cleanup to commit message. ]
Signed-off-by: Tony Luck <tony.luck@intel.com>
[ Massage commit message more. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
This commit is contained in:
Qiuxu Zhuo 2018-09-10 14:11:45 -07:00 committed by Borislav Petkov
parent dcc960b225
commit 8489b17ce2
1 changed files with 110 additions and 6 deletions

View File

@ -326,6 +326,7 @@ struct sbridge_info {
const struct interleave_pkg *interleave_pkg;
u8 max_sad;
u8 (*get_node_id)(struct sbridge_pvt *pvt);
u8 (*get_ha)(u8 bank);
enum mem_type (*get_memory_type)(struct sbridge_pvt *pvt);
enum dev_type (*get_width)(struct sbridge_pvt *pvt, u32 mtr);
struct pci_dev *pci_vtd;
@ -1002,6 +1003,39 @@ static u8 knl_get_node_id(struct sbridge_pvt *pvt)
return GET_BITFIELD(reg, 0, 2);
}
/*
 * Map a reporting bank number to its memory controller (the
 * "ha", short for "home agent"). A Sandy Bridge socket has a
 * single memory controller, so every bank maps to index zero
 * and the bank argument is not consulted.
 */
static u8 sbridge_get_ha(u8 bank)
{
	const u8 only_ha = 0;

	return only_ha;
}
/*
 * On Ivy Bridge, Haswell and Broadwell the error may be in a
 * home agent bank (7, 8), or one of the per-channel memory
 * controller banks (9 .. 16).
 *
 * Returns the home agent index (0 or 1) for a recognised bank.
 * Any other bank yields 0xff: the return type is u8, so a
 * negative errno cannot be represented and would only be
 * silently truncated. Callers must treat every value other
 * than 0 or 1 as "invalid bank".
 */
static u8 ibridge_get_ha(u8 bank)
{
	switch (bank) {
	case 7 ... 8:
		/* Home agent banks: bank 7 -> ha 0, bank 8 -> ha 1 */
		return bank - 7;
	case 9 ... 16:
		/* Banks 9..12 map to ha 0, banks 13..16 to ha 1 */
		return (bank - 9) / 4;
	default:
		return 0xff;
	}
}
/*
 * Not used, but included for safety/symmetry.
 *
 * Always reports an invalid home agent. The sentinel is 0xff
 * rather than a negative errno because the return type is u8
 * and a negative value would be silently truncated.
 */
static u8 knl_get_ha(u8 bank)
{
	return 0xff;
}
static u64 haswell_get_tolm(struct sbridge_pvt *pvt)
{
@ -2207,6 +2241,60 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
return 0;
}
/*
 * Work out socket, home agent and candidate channel(s) for an error
 * directly from the machine check record, without decoding m->addr
 * down to a rank.
 *
 * @mci:          controller the error was delivered to; only used to
 *                reach the per-generation get_ha() callback
 * @m:            machine check record (status, bank, socketid, addr)
 * @socket:       out: socket id taken from the record
 * @ha:           out: home agent derived from the reporting bank
 * @channel_mask: out: bitmask of channels that may hold the error
 * @msg:          out: short reason text, written on failure only
 *
 * Returns 0 on success or -EINVAL on failure. Note that *ha (and
 * *socket, for the mci lookup failure) may already have been written
 * when an error is returned.
 */
static int get_memory_error_data_from_mce(struct mem_ctl_info *mci,
					  const struct mce *m, u8 *socket,
					  u8 *ha, long *channel_mask,
					  char *msg)
{
	u32 chan = GET_BITFIELD(m->status, 0, 3);
	struct mem_ctl_info *target_mci;
	struct sbridge_pvt *pvt;
	u32 tad_rule;
	bool below_tad0;
	bool mirrored;

	if (chan >= NUM_CHANNELS) {
		sprintf(msg, "Invalid channel 0x%x", chan);
		return -EINVAL;
	}

	pvt = mci->pvt_info;
	if (!pvt->info.get_ha) {
		sprintf(msg, "No get_ha()");
		return -EINVAL;
	}

	/* Only home agents 0 and 1 exist; anything else is a bad bank */
	*ha = pvt->info.get_ha(m->bank);
	if (*ha > 1) {
		sprintf(msg, "Impossible bank %d", m->bank);
		return -EINVAL;
	}

	/* Switch over to the controller that matches this socket/ha pair */
	*socket = m->socketid;
	target_mci = get_mci_for_node_id(*socket, *ha);
	if (!target_mci) {
		strcpy(msg, "mci socket got corrupted!");
		return -EINVAL;
	}
	pvt = target_mci->pvt_info;

	/* Compare the reported address against the limit of TAD rule 0 */
	pci_read_config_dword(pvt->pci_ha, tad_dram_rule[0], &tad_rule);
	below_tad0 = m->addr <= TAD_LIMIT(tad_rule);

	*channel_mask = 1 << chan;

	/* With mirroring active, channel + 2 (mod 4) is flagged as well */
	mirrored = pvt->mirror_mode == FULL_MIRRORING ||
		   (pvt->mirror_mode == ADDR_RANGE_MIRRORING && below_tad0);
	if (mirrored)
		*channel_mask |= 1 << ((chan + 2) % 4);
	pvt->is_cur_addr_mirrored = mirrored;

	/* In lockstep, channel + 1 (mod 4) is flagged as well */
	if (pvt->is_lockstep)
		*channel_mask |= 1 << ((chan + 1) % 4);

	return 0;
}
/****************************************************************************
Device initialization routines: put/get, init/exit
****************************************************************************/
@ -2877,10 +2965,16 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
u32 errcode = GET_BITFIELD(m->status, 0, 15);
u32 channel = GET_BITFIELD(m->status, 0, 3);
u32 optypenum = GET_BITFIELD(m->status, 4, 6);
/*
* Bits 5-0 of MCi_MISC give the least significant bit that is valid.
* A value of 6 means a cache-line-aligned address; a value of 12 means
* a page-aligned address, as reported by the patrol scrubber.
*/
u32 lsb = GET_BITFIELD(m->misc, 0, 5);
long channel_mask, first_channel;
u8 rank, socket, ha;
u8 rank = 0xff, socket, ha;
int rc, dimm;
char *area_type = NULL;
char *area_type = "DRAM";
if (pvt->info.type != SANDY_BRIDGE)
recoverable = true;
@ -2964,9 +3058,13 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
optype, msg);
}
return;
} else {
} else if (lsb < 12) {
rc = get_memory_error_data(mci, m->addr, &socket, &ha,
&channel_mask, &rank, &area_type, msg);
&channel_mask, &rank,
&area_type, msg);
} else {
rc = get_memory_error_data_from_mce(mci, m, &socket, &ha,
&channel_mask, msg);
}
if (rc < 0)
@ -2981,14 +3079,15 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
first_channel = find_first_bit(&channel_mask, NUM_CHANNELS);
if (rank < 4)
if (rank == 0xff)
dimm = -1;
else if (rank < 4)
dimm = 0;
else if (rank < 8)
dimm = 1;
else
dimm = 2;
/*
* FIXME: On some memory configurations (mirror, lockstep), the
* Memory Controller can't point the error to a single DIMM. The
@ -3175,6 +3274,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.dram_rule = ibridge_dram_rule;
pvt->info.get_memory_type = get_memory_type;
pvt->info.get_node_id = get_node_id;
pvt->info.get_ha = ibridge_get_ha;
pvt->info.rir_limit = rir_limit;
pvt->info.sad_limit = sad_limit;
pvt->info.interleave_mode = interleave_mode;
@ -3199,6 +3299,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.dram_rule = sbridge_dram_rule;
pvt->info.get_memory_type = get_memory_type;
pvt->info.get_node_id = get_node_id;
pvt->info.get_ha = sbridge_get_ha;
pvt->info.rir_limit = rir_limit;
pvt->info.sad_limit = sad_limit;
pvt->info.interleave_mode = interleave_mode;
@ -3223,6 +3324,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.dram_rule = ibridge_dram_rule;
pvt->info.get_memory_type = haswell_get_memory_type;
pvt->info.get_node_id = haswell_get_node_id;
pvt->info.get_ha = ibridge_get_ha;
pvt->info.rir_limit = haswell_rir_limit;
pvt->info.sad_limit = sad_limit;
pvt->info.interleave_mode = interleave_mode;
@ -3247,6 +3349,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.dram_rule = ibridge_dram_rule;
pvt->info.get_memory_type = haswell_get_memory_type;
pvt->info.get_node_id = haswell_get_node_id;
pvt->info.get_ha = ibridge_get_ha;
pvt->info.rir_limit = haswell_rir_limit;
pvt->info.sad_limit = sad_limit;
pvt->info.interleave_mode = interleave_mode;
@ -3271,6 +3374,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.dram_rule = knl_dram_rule;
pvt->info.get_memory_type = knl_get_memory_type;
pvt->info.get_node_id = knl_get_node_id;
pvt->info.get_ha = knl_get_ha;
pvt->info.rir_limit = NULL;
pvt->info.sad_limit = knl_sad_limit;
pvt->info.interleave_mode = knl_interleave_mode;