From e35fca4791fcdd43dc1fd769797df40c562ab491 Mon Sep 17 00:00:00 2001 From: Chen Gong Date: Tue, 8 May 2012 20:40:12 -0300 Subject: [PATCH 1/4] edac: avoid mce decoding crash after edac driver unloaded Some edac drivers register themselves as mce decoders via notifier_chain. However, the current notifier_chain implementation does not support registering the same notifier twice; if that happens, the list is handled incorrectly when the element is added or removed. For example, on one SandyBridge platform, removing the sb_edac module and then triggering an error causes an oops, because no mce decoder is registered any more but the related notifier_chain still points to an invalid callback function. Here is an example: Call Trace: [] atomic_notifier_call_chain+0x1a/0x20 [] mce_log+0x46/0x180 [] apei_mce_report_mem_error+0x4a/0x60 [] ghes_do_proc+0x192/0x210 [] ghes_proc+0x46/0x70 [] ghes_notify_sci+0x48/0x80 [] notifier_call_chain+0x55/0x80 [] __blocking_notifier_call_chain+0x5a/0x80 [] ? acpi_os_wait_events_complete+0x23/0x23 [] blocking_notifier_call_chain+0x16/0x20 [] acpi_hed_notify+0x19/0x1b [] acpi_device_notify+0x19/0x1b [] acpi_ev_notify_dispatch+0x67/0x7f [] acpi_os_execute_deferred+0x29/0x36 [] process_one_work+0x132/0x450 [] worker_thread+0x17b/0x3c0 [] ? manage_workers+0x120/0x120 [] kthread+0x9e/0xb0 [] kernel_thread_helper+0x4/0x10 [] ? kthread_freezable_should_stop+0x70/0x70 [] ? 
gs_change+0x13/0x13 Code: f3 49 89 d4 45 85 ed 4d 89 c6 48 8b 0f 74 48 48 85 c9 75 17 eb 41 0f 1f 80 00 00 00 00 41 83 ed 01 4c 89 f9 74 22 4d 85 ff 74 1d <4c> 8b 79 08 4c 89 e2 48 89 de 48 89 cf ff 11 4d 85 f6 74 04 41 RIP [] notifier_call_chain+0x46/0x80 RSP CR2: ffffffffa01af838 ---[ end trace 0100930068e73e6f ]--- BUG: unable to handle kernel paging request at fffffffffffffff8 IP: [] kthread_data+0x10/0x20 PGD 1a0d067 PUD 1a0e067 PMD 0 Oops: 0000 [#2] SMP Only i7core_edac and sb_edac have such issues because they have more than one memory controller which means they have to register mce decoder many times. Cc: # 3.2 and upper Signed-off-by: Chen Gong Signed-off-by: Mauro Carvalho Chehab --- drivers/edac/i7core_edac.c | 15 ++++----------- drivers/edac/sb_edac.c | 8 ++++---- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c index d27778f65a5d..a499c7ed820a 100644 --- a/drivers/edac/i7core_edac.c +++ b/drivers/edac/i7core_edac.c @@ -1814,12 +1814,6 @@ static int i7core_mce_check_error(struct notifier_block *nb, unsigned long val, if (mce->bank != 8) return NOTIFY_DONE; -#ifdef CONFIG_SMP - /* Only handle if it is the right mc controller */ - if (mce->socketid != pvt->i7core_dev->socket) - return NOTIFY_DONE; -#endif - smp_rmb(); if ((pvt->mce_out + 1) % MCE_LOG_LEN == pvt->mce_in) { smp_wmb(); @@ -2116,8 +2110,6 @@ static void i7core_unregister_mci(struct i7core_dev *i7core_dev) if (pvt->enable_scrub) disable_sdram_scrub_setting(mci); - mce_unregister_decode_chain(&i7_mce_dec); - /* Disable EDAC polling */ i7core_pci_ctl_release(pvt); @@ -2222,8 +2214,6 @@ static int i7core_register_mci(struct i7core_dev *i7core_dev) /* DCLK for scrub rate setting */ pvt->dclk_freq = get_dclk_freq(); - mce_register_decode_chain(&i7_mce_dec); - return 0; fail0: @@ -2367,8 +2357,10 @@ static int __init i7core_init(void) pci_rc = pci_register_driver(&i7core_driver); - if (pci_rc >= 0) + if (pci_rc >= 0) { + 
mce_register_decode_chain(&i7_mce_dec); return 0; + } i7core_printk(KERN_ERR, "Failed to register device with error %d.\n", pci_rc); @@ -2384,6 +2376,7 @@ static void __exit i7core_exit(void) { debugf2("MC: " __FILE__ ": %s()\n", __func__); pci_unregister_driver(&i7core_driver); + mce_unregister_decode_chain(&i7_mce_dec); } module_init(i7core_init); diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 4adaf4b7da99..a21ace0b709d 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -1604,8 +1604,6 @@ static void sbridge_unregister_mci(struct sbridge_dev *sbridge_dev) debugf0("MC: " __FILE__ ": %s(): mci = %p, dev = %p\n", __func__, mci, &sbridge_dev->pdev[0]->dev); - mce_unregister_decode_chain(&sbridge_mce_dec); - /* Remove MC sysfs nodes */ edac_mc_del_mc(mci->dev); @@ -1682,7 +1680,6 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev) goto fail0; } - mce_register_decode_chain(&sbridge_mce_dec); return 0; fail0: @@ -1811,8 +1808,10 @@ static int __init sbridge_init(void) pci_rc = pci_register_driver(&sbridge_driver); - if (pci_rc >= 0) + if (pci_rc >= 0) { + mce_register_decode_chain(&sbridge_mce_dec); return 0; + } sbridge_printk(KERN_ERR, "Failed to register device with error %d.\n", pci_rc); @@ -1828,6 +1827,7 @@ static void __exit sbridge_exit(void) { debugf2("MC: " __FILE__ ": %s()\n", __func__); pci_unregister_driver(&sbridge_driver); + mce_unregister_decode_chain(&sbridge_mce_dec); } module_init(sbridge_init); From 2cbb587d3bc41a305168e91b4f3c5b6944a12566 Mon Sep 17 00:00:00 2001 From: Chen Gong Date: Mon, 14 May 2012 05:51:26 -0300 Subject: [PATCH 2/4] edac: fix the error about memory type detection on SandyBridge On SandyBridge, DDRIOA(Dev: 17 Func: 0 Offset: 328) is used to detect whether DIMM is RDIMM/LRDIMM, not TA(Dev: 15 Func: 0). 
Signed-off-by: Chen Gong Signed-off-by: Mauro Carvalho Chehab --- drivers/edac/sb_edac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index a21ace0b709d..36ad17e79d61 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -555,7 +555,7 @@ static int get_dimm_config(struct mem_ctl_info *mci) pvt->is_close_pg = false; } - pci_read_config_dword(pvt->pci_ta, RANK_CFG_A, &reg); + pci_read_config_dword(pvt->pci_ddrio, RANK_CFG_A, &reg); if (IS_RDIMM_ENABLED(reg)) { /* FIXME: Can also be LRDIMM */ debugf0("Memory is registered\n"); From b9bc5ddb1b76d3f7ee14c533300aa95907c6969e Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Wed, 6 Jun 2012 19:49:42 -0500 Subject: [PATCH 3/4] mpc85xx_edac: fix error: too few arguments to function 'edac_mc_alloc' commit ca0907b "edac: Remove the legacy EDAC ABI" broke mpc85xx_edac in the following manner: mpc85xx_edac.c:983:35: error: too few arguments to function 'edac_mc_alloc' This patch puts back the missing 'layers' argument. 
[mchehab@redhat.com: As Ben sent a similar fix, I added his SOB on this patch] Signed-off-by: Kim Phillips Signed-off-by: Ben Collins Signed-off-by: Mauro Carvalho Chehab --- drivers/edac/mpc85xx_edac.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/edac/mpc85xx_edac.c b/drivers/edac/mpc85xx_edac.c index 4c402353ba98..0e374625f6f8 100644 --- a/drivers/edac/mpc85xx_edac.c +++ b/drivers/edac/mpc85xx_edac.c @@ -980,7 +980,8 @@ static int __devinit mpc85xx_mc_err_probe(struct platform_device *op) layers[1].type = EDAC_MC_LAYER_CHANNEL; layers[1].size = 1; layers[1].is_virt_csrow = false; - mci = edac_mc_alloc(edac_mc_idx, ARRAY_SIZE(layers), sizeof(*pdata)); + mci = edac_mc_alloc(edac_mc_idx, ARRAY_SIZE(layers), layers, + sizeof(*pdata)); if (!mci) { devres_release_group(&op->dev, mpc85xx_mc_err_probe); return -ENOMEM; From 8447c4d15e357a458c9051ddc84aa6c8b9c27000 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Wed, 6 Jun 2012 13:11:05 -0400 Subject: [PATCH 4/4] edac: Do alignment logic properly in edac_align_ptr() The logic was checking the sizeof the structure being allocated to determine whether an alignment fixup was required. This isn't right; what we actually care about is the alignment of the actual pointer that's about to be returned. This became an issue recently because struct edac_mc_layer has a size that is not zero modulo eight, so we were taking the correctly-aligned pointer and forcing it to be misaligned. On Tile this caused an alignment exception. 
Signed-off-by: Chris Metcalf Signed-off-by: Mauro Carvalho Chehab --- drivers/edac/edac_mc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 10f375032e96..de5ba86e8b89 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -164,7 +164,7 @@ void *edac_align_ptr(void **p, unsigned size, int n_elems) else return (char *)ptr; - r = size % align; + r = (unsigned long)ptr % align; if (r == 0) return (char *)ptr;