From 628ea92f0928c72d5ab88785e794ab4059f3f17d Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Wed, 8 Jun 2016 18:35:56 +0100 Subject: EDAC: Make dev_attr_sdram_scrub_rate static The dev_attr_sdram_scrub_rate is not declared in a header or used anywhere else, so make it static to fix the following warning: drivers/edac/edac_mc_sysfs.c:816:1: warning: symbol 'dev_attr_sdram_scrub_rate' was not declared. Should it be static? Signed-off-by: Ben Dooks Reviewed-by: Mauro Carvalho Chehab Cc: linux-edac Link: http://lkml.kernel.org/r/1465407356-7357-1-git-send-email-ben.dooks@codethink.co.uk Signed-off-by: Borislav Petkov --- drivers/edac/edac_mc_sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 39dbab7d62f1..cc45b1dc3321 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -831,7 +831,7 @@ static DEVICE_ATTR(ce_count, S_IRUGO, mci_ce_count_show, NULL); static DEVICE_ATTR(max_location, S_IRUGO, mci_max_location_show, NULL); /* memory scrubber attribute file */ -DEVICE_ATTR(sdram_scrub_rate, 0, mci_sdram_scrub_rate_show, +static DEVICE_ATTR(sdram_scrub_rate, 0, mci_sdram_scrub_rate_show, mci_sdram_scrub_rate_store); /* umode set later in is_visible */ static struct attribute *mci_attrs[] = { -- cgit v1.2.3 From eca90a3b3226fcecb4b3bbf4b0b6ff72422674bc Mon Sep 17 00:00:00 2001 From: Alexander Alemayhu Date: Thu, 5 Jan 2017 22:11:50 +0100 Subject: EDAC: Fix typos in enum mem_type comments s/labed/labeled/ s/differenciate/differentiate/ Signed-off-by: Alexander Alemayhu Cc: linux-edac Link: http://lkml.kernel.org/r/20170105211150.24003-1-alexander@alemayhu.com Signed-off-by: Borislav Petkov --- include/linux/edac.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/edac.h b/include/linux/edac.h index 07c52c0af62d..5b6adf964248 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -190,8 +190,8 @@ static inline char 
*mc_event_error_type(const unsigned int err_type) * part of the memory details to the memory controller. * @MEM_RMBS: Rambus DRAM, used on a few Pentium III/IV controllers. * @MEM_DDR2: DDR2 RAM, as described at JEDEC JESD79-2F. - * Those memories are labed as "PC2-" instead of "PC" to - * differenciate from DDR. + * Those memories are labeled as "PC2-" instead of "PC" to + * differentiate from DDR. * @MEM_FB_DDR2: Fully-Buffered DDR2, as described at JEDEC Std No. 205 * and JESD206. * Those memories are accessed per DIMM slot, and not by -- cgit v1.2.3 From f5c61277f67f03df334c4486397ecff54968971b Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Mon, 16 Jan 2017 12:14:52 +0530 Subject: EDAC, i82975x: Add ioremap_nocache() error handling If ioremap_nocache() fails, it will return NULL. Which will then cause a NULL-pointer dereference. Handle the returned value properly. Signed-off-by: Arvind Yadav Cc: "Arvind R." Cc: linux-edac Link: http://lkml.kernel.org/r/1484549092-11349-1-git-send-email-arvind.yadav.cs@gmail.com [ Boris: massage commit message and improve error message. 
] Signed-off-by: Borislav Petkov --- drivers/edac/i82975x_edac.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/edac/i82975x_edac.c b/drivers/edac/i82975x_edac.c index 7baa8ace267b..9dcdab28f665 100644 --- a/drivers/edac/i82975x_edac.c +++ b/drivers/edac/i82975x_edac.c @@ -494,6 +494,10 @@ static int i82975x_probe1(struct pci_dev *pdev, int dev_idx) } mchbar &= 0xffffc000; /* bits 31:14 used for 16K window */ mch_window = ioremap_nocache(mchbar, 0x1000); + if (!mch_window) { + edac_dbg(3, "error ioremapping MCHBAR!\n"); + goto fail0; + } #ifdef i82975x_DEBUG_IOMEM i82975x_printk(KERN_INFO, "MCHBAR real = %0x, remapped = %p\n", -- cgit v1.2.3 From 2287c63643f0f52d9d5452b9dc4079aec0889fe8 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 13 Jan 2017 09:52:19 -0600 Subject: EDAC, amd64: Save and return err code from probe_one_instance() We should save the return code from probe_one_instance() so that it can be returned from the module init function. Otherwise, we'll be returning the -ENOMEM from above. 
Signed-off-by: Yazen Ghannam Cc: linux-edac Link: http://lkml.kernel.org/r/1484322741-41884-1-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 260251177830..4a480da713b9 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3466,14 +3466,16 @@ static int __init amd64_edac_init(void) if (!msrs) goto err_free; - for (i = 0; i < amd_nb_num(); i++) - if (probe_one_instance(i)) { + for (i = 0; i < amd_nb_num(); i++) { + err = probe_one_instance(i); + if (err) { /* unwind properly */ while (--i >= 0) remove_one_instance(i); goto err_pci; } + } setup_pci_device(); -- cgit v1.2.3 From 4fb6fde74d6724dc6d64ec729f950fbdeefd7f07 Mon Sep 17 00:00:00 2001 From: Aaron Miller Date: Thu, 3 Nov 2016 15:01:53 -0700 Subject: EDAC: Expose per-DIMM error counts in sysfs The old csrowX sysfs directories have per-csrow error counters, but the new dimmX directories do not currently expose error counts. EDAC already keeps these counts, add them to sysfs so per-DIMM counts are still available when CONFIG_EDAC_LEGACY_SYSFS=n. Signed-off-by: Aaron Miller Cc: linux-edac Link: http://lkml.kernel.org/r/20161103220153.3997328-1-aaronmiller@fb.com Signed-off-by: Borislav Petkov --- Documentation/ABI/testing/sysfs-devices-edac | 17 +++++++++++++ Documentation/admin-guide/ras.rst | 20 +++++++++++++++ drivers/edac/edac_mc_sysfs.c | 38 ++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-devices-edac b/Documentation/ABI/testing/sysfs-devices-edac index 6568e0010e1a..46ff929fd52a 100644 --- a/Documentation/ABI/testing/sysfs-devices-edac +++ b/Documentation/ABI/testing/sysfs-devices-edac @@ -138,3 +138,20 @@ Contact: Mauro Carvalho Chehab Description: This attribute file will display what type of memory is currently on this csrow. 
Normally, either buffered or unbuffered memory (for example, Unbuffered-DDR3). + +What: /sys/devices/system/edac/mc/mc*/(dimm|rank)*/dimm_ce_count +Date: October 2016 +Contact: linux-edac@vger.kernel.org +Description: This attribute file displays the total count of correctable + errors that have occurred on this DIMM. This count is very important + to examine. CEs provide early indications that a DIMM is beginning + to fail. This count field should be monitored for non-zero values + and report such information to the system administrator. + +What: /sys/devices/system/edac/mc/mc*/(dimm|rank)*/dimm_ue_count +Date: October 2016 +Contact: linux-edac@vger.kernel.org +Description: This attribute file displays the total count of uncorrectable + errors that have occurred on this DIMM. If panic_on_ue is set, this + counter will not have a chance to increment, since EDAC will panic the + system diff --git a/Documentation/admin-guide/ras.rst b/Documentation/admin-guide/ras.rst index d71340e86c27..9939348bd4a3 100644 --- a/Documentation/admin-guide/ras.rst +++ b/Documentation/admin-guide/ras.rst @@ -438,11 +438,13 @@ A typical EDAC system has the following structure under │   │   ├── ce_count │   │   ├── ce_noinfo_count │   │   ├── dimm0 + │   │   │   ├── dimm_ce_count │   │   │   ├── dimm_dev_type │   │   │   ├── dimm_edac_mode │   │   │   ├── dimm_label │   │   │   ├── dimm_location │   │   │   ├── dimm_mem_type + │   │   │   ├── dimm_ue_count │   │   │   ├── size │   │   │   └── uevent │   │   ├── max_location @@ -457,11 +459,13 @@ A typical EDAC system has the following structure under │   │   ├── ce_count │   │   ├── ce_noinfo_count │   │   ├── dimm0 + │   │   │   ├── dimm_ce_count │   │   │   ├── dimm_dev_type │   │   │   ├── dimm_edac_mode │   │   │   ├── dimm_label │   │   │   ├── dimm_location │   │   │   ├── dimm_mem_type + │   │   │   ├── dimm_ue_count │   │   │   ├── size │   │   │   └── uevent │   │   ├── max_location @@ -483,6 +487,22 @@ this ``X`` memory module: 
This attribute file displays, in count of megabytes, the memory that this csrow contains. +- ``dimm_ue_count`` - Uncorrectable Errors count attribute file + + This attribute file displays the total count of uncorrectable + errors that have occurred on this DIMM. If panic_on_ue is set + this counter will not have a chance to increment, since EDAC + will panic the system. + +- ``dimm_ce_count`` - Correctable Errors count attribute file + + This attribute file displays the total count of correctable + errors that have occurred on this DIMM. This count is very + important to examine. CEs provide early indications that a + DIMM is beginning to fail. This count field should be + monitored for non-zero values and report such information + to the system administrator. + - ``dimm_dev_type`` - Device type attribute file This attribute file will display what type of DRAM device is diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index cc45b1dc3321..445862dac273 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -569,6 +569,40 @@ static ssize_t dimmdev_edac_mode_show(struct device *dev, return sprintf(data, "%s\n", edac_caps[dimm->edac_mode]); } +static ssize_t dimmdev_ce_count_show(struct device *dev, + struct device_attribute *mattr, + char *data) +{ + struct dimm_info *dimm = to_dimm(dev); + u32 count; + int off; + + off = EDAC_DIMM_OFF(dimm->mci->layers, + dimm->mci->n_layers, + dimm->location[0], + dimm->location[1], + dimm->location[2]); + count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][off]; + return sprintf(data, "%u\n", count); +} + +static ssize_t dimmdev_ue_count_show(struct device *dev, + struct device_attribute *mattr, + char *data) +{ + struct dimm_info *dimm = to_dimm(dev); + u32 count; + int off; + + off = EDAC_DIMM_OFF(dimm->mci->layers, + dimm->mci->n_layers, + dimm->location[0], + dimm->location[1], + dimm->location[2]); + count = dimm->mci->ue_per_layer[dimm->mci->n_layers-1][off]; + return 
sprintf(data, "%u\n", count); +} + /* dimm/rank attribute files */ static DEVICE_ATTR(dimm_label, S_IRUGO | S_IWUSR, dimmdev_label_show, dimmdev_label_store); @@ -577,6 +611,8 @@ static DEVICE_ATTR(size, S_IRUGO, dimmdev_size_show, NULL); static DEVICE_ATTR(dimm_mem_type, S_IRUGO, dimmdev_mem_type_show, NULL); static DEVICE_ATTR(dimm_dev_type, S_IRUGO, dimmdev_dev_type_show, NULL); static DEVICE_ATTR(dimm_edac_mode, S_IRUGO, dimmdev_edac_mode_show, NULL); +static DEVICE_ATTR(dimm_ce_count, S_IRUGO, dimmdev_ce_count_show, NULL); +static DEVICE_ATTR(dimm_ue_count, S_IRUGO, dimmdev_ue_count_show, NULL); /* attributes of the dimm/rank object */ static struct attribute *dimm_attrs[] = { @@ -586,6 +622,8 @@ static struct attribute *dimm_attrs[] = { &dev_attr_dimm_mem_type.attr, &dev_attr_dimm_dev_type.attr, &dev_attr_dimm_edac_mode.attr, + &dev_attr_dimm_ce_count.attr, + &dev_attr_dimm_ue_count.attr, NULL, }; -- cgit v1.2.3 From 127c1225bf89764976f80e6a0afb4e26df33c2f2 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Sun, 22 Jan 2017 18:28:06 +0100 Subject: EDAC, sb_edac: Get rid of ->show_interleave_mode() Function sbridge_register_mci() sets pvt->info.show_interleave_mode to knl_show_interleave_mode() on Knight's Landing and show_interleave_mode() anywhere else. Merge show_interleave_mode() and knl_show_interleave_mode() in a single implementation and use it without an indirect function pointer. Signed-off-by: Nicolas Iooss Cc: Mauro Carvalho Chehab Cc: linux-edac Link: http://lkml.kernel.org/r/20170122172806.10412-1-nicolas.iooss_linux@m4x.org [ Call it get_intlv_mode_str(). 
] Signed-off-by: Borislav Petkov --- drivers/edac/sb_edac.c | 44 ++++++++++---------------------------------- 1 file changed, 10 insertions(+), 34 deletions(-) diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 54ae6dc45ab2..573be9c0ffb8 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -304,7 +304,6 @@ struct sbridge_info { u64 (*rir_limit)(u32 reg); u64 (*sad_limit)(u32 reg); u32 (*interleave_mode)(u32 reg); - char* (*show_interleave_mode)(u32 reg); u32 (*dram_attr)(u32 reg); const u32 *dram_rule; const u32 *interleave_list; @@ -811,11 +810,6 @@ static u32 interleave_mode(u32 reg) return GET_BITFIELD(reg, 1, 1); } -char *show_interleave_mode(u32 reg) -{ - return interleave_mode(reg) ? "8:6" : "[8:6]XOR[18:16]"; -} - static u32 dram_attr(u32 reg) { return GET_BITFIELD(reg, 2, 3); @@ -831,29 +825,16 @@ static u32 knl_interleave_mode(u32 reg) return GET_BITFIELD(reg, 1, 2); } -static char *knl_show_interleave_mode(u32 reg) -{ - char *s; - - switch (knl_interleave_mode(reg)) { - case 0: - s = "use address bits [8:6]"; - break; - case 1: - s = "use address bits [10:8]"; - break; - case 2: - s = "use address bits [14:12]"; - break; - case 3: - s = "use address bits [32:30]"; - break; - default: - WARN_ON(1); - break; - } +static const char * const knl_intlv_mode[] = { + "[8:6]", "[10:8]", "[14:12]", "[32:30]" +}; - return s; +static const char *get_intlv_mode_str(u32 reg, enum type t) +{ + if (t == KNIGHTS_LANDING) + return knl_intlv_mode[knl_interleave_mode(reg)]; + else + return interleave_mode(reg) ? 
"[8:6]" : "[8:6]XOR[18:16]"; } static u32 dram_attr_knl(u32 reg) @@ -1810,7 +1791,7 @@ static void get_memory_layout(const struct mem_ctl_info *mci) show_dram_attr(pvt->info.dram_attr(reg)), gb, (mb*1000)/1024, ((u64)tmp_mb) << 20L, - pvt->info.show_interleave_mode(reg), + get_intlv_mode_str(reg, pvt->info.type), reg); prv = limit; @@ -3227,7 +3208,6 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type) pvt->info.rir_limit = rir_limit; pvt->info.sad_limit = sad_limit; pvt->info.interleave_mode = interleave_mode; - pvt->info.show_interleave_mode = show_interleave_mode; pvt->info.dram_attr = dram_attr; pvt->info.max_sad = ARRAY_SIZE(ibridge_dram_rule); pvt->info.interleave_list = ibridge_interleave_list; @@ -3251,7 +3231,6 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type) pvt->info.rir_limit = rir_limit; pvt->info.sad_limit = sad_limit; pvt->info.interleave_mode = interleave_mode; - pvt->info.show_interleave_mode = show_interleave_mode; pvt->info.dram_attr = dram_attr; pvt->info.max_sad = ARRAY_SIZE(sbridge_dram_rule); pvt->info.interleave_list = sbridge_interleave_list; @@ -3275,7 +3254,6 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type) pvt->info.rir_limit = haswell_rir_limit; pvt->info.sad_limit = sad_limit; pvt->info.interleave_mode = interleave_mode; - pvt->info.show_interleave_mode = show_interleave_mode; pvt->info.dram_attr = dram_attr; pvt->info.max_sad = ARRAY_SIZE(ibridge_dram_rule); pvt->info.interleave_list = ibridge_interleave_list; @@ -3299,7 +3277,6 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type) pvt->info.rir_limit = haswell_rir_limit; pvt->info.sad_limit = sad_limit; pvt->info.interleave_mode = interleave_mode; - pvt->info.show_interleave_mode = show_interleave_mode; pvt->info.dram_attr = dram_attr; pvt->info.max_sad = ARRAY_SIZE(ibridge_dram_rule); pvt->info.interleave_list = ibridge_interleave_list; @@ -3323,7 
+3300,6 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type) pvt->info.rir_limit = NULL; pvt->info.sad_limit = knl_sad_limit; pvt->info.interleave_mode = knl_interleave_mode; - pvt->info.show_interleave_mode = knl_show_interleave_mode; pvt->info.dram_attr = dram_attr_knl; pvt->info.max_sad = ARRAY_SIZE(knl_dram_rule); pvt->info.interleave_list = knl_interleave_list; -- cgit v1.2.3 From 58fb24cb95562da6e54763463464d168ec5a1caa Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 25 Jan 2017 16:08:27 +0100 Subject: EDAC, i7300: Test for the second channel properly REDMEMB[17] is the ECC_Locator bit, which, when set, identifies the CS[3:2] as the symbols in error. And thus the second channel. The macro computing it was wrong so get rid of it (it was used at one place only) and get rid of the conditional too. Generates better code this way anyway. Signed-off-by: Borislav Petkov Reported-by: David Binderman Reviewed-by: Mauro Carvalho Chehab --- drivers/edac/i7300_edac.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/edac/i7300_edac.c b/drivers/edac/i7300_edac.c index 0a912bf6de00..e391f5a716be 100644 --- a/drivers/edac/i7300_edac.c +++ b/drivers/edac/i7300_edac.c @@ -304,7 +304,6 @@ static const char *ferr_global_lo_name[] = { #define REDMEMA 0xdc #define REDMEMB 0x7c - #define IS_SECOND_CH(v) ((v) * (1 << 17)) #define RECMEMA 0xe0 #define RECMEMA_BANK(v) (((v) >> 12) & 7) @@ -483,8 +482,9 @@ static void i7300_process_fbd_error(struct mem_ctl_info *mci) pci_read_config_dword(pvt->pci_dev_16_1_fsb_addr_map, REDMEMB, &value); channel = (branch << 1); - if (IS_SECOND_CH(value)) - channel++; + + /* Second channel ? 
*/ + channel += !!(value & BIT(17)); /* Clear the error bit */ pci_write_config_dword(pvt->pci_dev_16_1_fsb_addr_map, -- cgit v1.2.3 From 67d7fd306ef2ef1ba5cdb8ce2dfde1339d4c8136 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 24 Jan 2017 16:32:23 -0600 Subject: EDAC, mce_amd: Give more context to deferred error message Users may not be familiar with the concept of deferred errors. There is no action for users to take on this type of error, so give more context in the error message to make this more clear. Signed-off-by: Yazen Ghannam Cc: linux-edac Link: http://lkml.kernel.org/r/1485297149-13733-2-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 34208f38c5b1..3acb3c0231f2 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -937,7 +937,7 @@ static const char *decode_error_status(struct mce *m) } if (m->status & MCI_STATUS_DEFERRED) - return "Deferred error."; + return "Deferred error, no action required."; return "Corrected error, no action required."; } -- cgit v1.2.3 From 2b9b2c465928260f40e2f570c953881bff291bef Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 24 Jan 2017 16:32:24 -0600 Subject: EDAC, amd64: Free unused memory when init_one_instance() fails Jump to memory freeing routines when init_one_instance() fails. 
Signed-off-by: Yazen Ghannam Cc: linux-edac Link: http://lkml.kernel.org/r/1485297149-13733-3-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 4a480da713b9..5fa6e5ea8be3 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3363,6 +3363,8 @@ static int probe_one_instance(unsigned int nid) if (boot_cpu_data.x86 < 0x17) restore_ecc_error_reporting(s, nid, F3); + + goto err_enable; } return ret; -- cgit v1.2.3 From 234365f56e75d91c5bd7fbde2b8a9b02851845b8 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 24 Jan 2017 16:32:25 -0600 Subject: EDAC, amd64: Move global code out of instance functions We have a few functions that register/unregister an ECC error decoding routine. These functions are called when we init/remove instances. However, they are global and so don't need to be registered/unregistered multiple times. So move them out of the init/remove instance functions and into the module init/exit routines. 
Signed-off-by: Yazen Ghannam Cc: linux-edac Link: http://lkml.kernel.org/r/1485297149-13733-4-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 5fa6e5ea8be3..95c6a1440103 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3300,15 +3300,6 @@ static int init_one_instance(unsigned int nid) goto err_add_mc; } - /* register stuff with EDAC MCE */ - if (report_gart_errors) - amd_report_gart_errors(true); - - if (pvt->umc) - amd_register_ecc_decoder(decode_umc_error); - else - amd_register_ecc_decoder(decode_bus_error); - return 0; err_add_mc: @@ -3398,14 +3389,6 @@ static void remove_one_instance(unsigned int nid) free_mc_sibling_devs(pvt); - /* unregister from EDAC MCE */ - amd_report_gart_errors(false); - - if (pvt->umc) - amd_unregister_ecc_decoder(decode_umc_error); - else - amd_unregister_ecc_decoder(decode_bus_error); - kfree(ecc_stngs[nid]); ecc_stngs[nid] = NULL; @@ -3479,6 +3462,15 @@ static int __init amd64_edac_init(void) } } + /* register stuff with EDAC MCE */ + if (report_gart_errors) + amd_report_gart_errors(true); + + if (boot_cpu_data.x86 >= 0x17) + amd_register_ecc_decoder(decode_umc_error); + else + amd_register_ecc_decoder(decode_bus_error); + setup_pci_device(); #ifdef CONFIG_X86_32 @@ -3508,6 +3500,14 @@ static void __exit amd64_edac_exit(void) if (pci_ctl) edac_pci_release_generic_ctl(pci_ctl); + /* unregister from EDAC MCE */ + amd_report_gart_errors(false); + + if (boot_cpu_data.x86 >= 0x17) + amd_unregister_ecc_decoder(decode_umc_error); + else + amd_unregister_ecc_decoder(decode_bus_error); + for (i = 0; i < amd_nb_num(); i++) remove_one_instance(i); -- cgit v1.2.3 From 11ab1cae5881fe47fd4a18e8f6c4982c673bb16c Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 27 Jan 2017 11:24:19 -0600 Subject: EDAC, amd64: 
Rework messages in ecc_enabled() Print the node number when informing that DRAM ECC is disabled so that we can show which nodes have DRAM ECC disabled. Also, print more detailed system information as edac_dbg(), so as to not bother general users. Switch amd64_notice to amd64_info to match the message above it. Signed-off-by: Yazen Ghannam Cc: linux-edac Link: http://lkml.kernel.org/r/1485537863-2707-5-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 95c6a1440103..27246aa8128d 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3065,6 +3065,8 @@ static bool ecc_enabled(struct pci_dev *F3, u16 nid) /* Check whether at least one UMC is enabled: */ if (umc_en_mask) ecc_en = umc_en_mask == ecc_en_mask; + else + edac_dbg(0, "Node %d: No enabled UMCs.\n", nid); /* Assume UMC MCA banks are enabled. */ nb_mce_en = true; @@ -3075,14 +3077,15 @@ static bool ecc_enabled(struct pci_dev *F3, u16 nid) nb_mce_en = nb_mce_bank_enabled_on_node(nid); if (!nb_mce_en) - amd64_notice("NB MCE bank disabled, set MSR 0x%08x[4] on node %d to enable.\n", + edac_dbg(0, "NB MCE bank disabled, set MSR 0x%08x[4] on node %d to enable.\n", MSR_IA32_MCG_CTL, nid); } - amd64_info("DRAM ECC %s.\n", (ecc_en ? "enabled" : "disabled")); + amd64_info("Node %d: DRAM ECC %s.\n", + nid, (ecc_en ? "enabled" : "disabled")); if (!ecc_en || !nb_mce_en) { - amd64_notice("%s", ecc_msg); + amd64_info("%s", ecc_msg); return false; } return true; -- cgit v1.2.3 From df64636fa4816b6d562835475d9846dcfbfd8c7e Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 27 Jan 2017 11:24:20 -0600 Subject: EDAC, amd64: Remove unused printing macros amd64_{debug,notice} don't have any users, so remove them. 
Signed-off-by: Yazen Ghannam Cc: linux-edac Link: http://lkml.kernel.org/r/1485537863-2707-6-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 496603d8f3d2..469506fcc0fc 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -20,15 +20,9 @@ #include "edac_module.h" #include "mce_amd.h" -#define amd64_debug(fmt, arg...) \ - edac_printk(KERN_DEBUG, "amd64", fmt, ##arg) - #define amd64_info(fmt, arg...) \ edac_printk(KERN_INFO, "amd64", fmt, ##arg) -#define amd64_notice(fmt, arg...) \ - edac_printk(KERN_NOTICE, "amd64", fmt, ##arg) - #define amd64_warn(fmt, arg...) \ edac_printk(KERN_WARNING, "amd64", "Warning: " fmt, ##arg) -- cgit v1.2.3 From d7fc9d77acb8651e5fcb77df71dfd11fcf1f08ba Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 27 Jan 2017 11:24:21 -0600 Subject: EDAC: Add routine to check if MC devices list is empty We need to know if any MC devices have been allocated. Signed-off-by: Yazen Ghannam Cc: linux-edac Link: http://lkml.kernel.org/r/1485537863-2707-7-git-send-email-Yazen.Ghannam@amd.com [ Prettify text. 
] Signed-off-by: Borislav Petkov --- drivers/edac/edac_mc.c | 14 ++++++++++++++ drivers/edac/edac_mc.h | 9 +++++++++ 2 files changed, 23 insertions(+) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 750891ea07de..e5573c56b15e 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -453,6 +453,20 @@ void edac_mc_free(struct mem_ctl_info *mci) } EXPORT_SYMBOL_GPL(edac_mc_free); +bool edac_has_mcs(void) +{ + bool ret; + + mutex_lock(&mem_ctls_mutex); + + ret = list_empty(&mc_devices); + + mutex_unlock(&mem_ctls_mutex); + + return !ret; +} +EXPORT_SYMBOL_GPL(edac_has_mcs); + /* Caller must hold mem_ctls_mutex */ static struct mem_ctl_info *__find_mci_by_dev(struct device *dev) { diff --git a/drivers/edac/edac_mc.h b/drivers/edac/edac_mc.h index 50fc1dc9c0d8..5357800e418d 100644 --- a/drivers/edac/edac_mc.h +++ b/drivers/edac/edac_mc.h @@ -148,6 +148,15 @@ extern int edac_mc_add_mc_with_groups(struct mem_ctl_info *mci, */ extern void edac_mc_free(struct mem_ctl_info *mci); +/** + * edac_has_mcs() - Check if any MCs have been allocated. + * + * Returns: + * True if MC instances have been registered successfully. + * False otherwise. + */ +extern bool edac_has_mcs(void); + /** * edac_mc_find() - Search for a mem_ctl_info structure whose index is @idx. * -- cgit v1.2.3 From 4688c9b42dd2040673a1c2208a1008822b07ee4a Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 27 Jan 2017 11:24:22 -0600 Subject: EDAC, amd64: Don't treat ECC disabled as failure Having ECC disabled on a node doesn't necessarily mean that it's disabled for the entire system. So let's return a non-failing code when ECC is disabled on a node. This way we can skip initialization for the node but still continue with the remaining nodes. After probing all instances, make sure we have at least one MC device allocated. This issue is seen and fix tested on Fam15h and Fam17h MCM systems. 
Signed-off-by: Yazen Ghannam Cc: linux-edac Link: http://lkml.kernel.org/r/1485537863-2707-8-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 27246aa8128d..565dc52dbb6f 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3336,7 +3336,7 @@ static int probe_one_instance(unsigned int nid) ecc_stngs[nid] = s; if (!ecc_enabled(F3, nid)) { - ret = -ENODEV; + ret = 0; if (!ecc_enable_override) goto err_enable; @@ -3465,6 +3465,11 @@ static int __init amd64_edac_init(void) } } + if (!edac_has_mcs()) { + err = -ENODEV; + goto err_pci; + } + /* register stuff with EDAC MCE */ if (report_gart_errors) amd_report_gart_errors(true); -- cgit v1.2.3 From 1bd9900b8301fc505f032c90ea487824cf824e99 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 27 Jan 2017 11:24:23 -0600 Subject: EDAC, amd64: Add x86cpuid sanity check during init Match one of the devices in amd64_cpuids[] before loading the module. This is an additional sanity check against users trying to load amd64_edac_mod on unsupported systems. Signed-off-by: Yazen Ghannam Cc: linux-edac Link: http://lkml.kernel.org/r/1485537863-2707-9-git-send-email-Yazen.Ghannam@amd.com [ Get rid of err_ret label, make it a bit more readable this way. 
] Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 6 ++++-- drivers/edac/amd64_edac.h | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 565dc52dbb6f..82dab1692264 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3440,8 +3440,11 @@ static int __init amd64_edac_init(void) int err = -ENODEV; int i; + if (!x86_match_cpu(amd64_cpuids)) + return -ENODEV; + if (amd_cache_northbridges() < 0) - goto err_ret; + return -ENODEV; opstate_init(); @@ -3497,7 +3500,6 @@ err_free: kfree(ecc_stngs); ecc_stngs = NULL; -err_ret: return err; } diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 469506fcc0fc..6acbfd3e0158 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "edac_module.h" #include "mce_amd.h" -- cgit v1.2.3 From 321d17c19bf51a5e0ea19163f7091563c70415a6 Mon Sep 17 00:00:00 2001 From: Chris Packham Date: Thu, 2 Feb 2017 12:16:24 +1300 Subject: EDAC, mpc85xx: Add T2080 l2-cache support The L2 cache controller on the T2080 SoC has similar capabilities to the others already supported by the mpc85xx_edac driver. Add it to the list of compatible devices. 
Signed-off-by: Chris Packham Acked-by: Johannes Thumshirn Acked-by: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: devicetree@vger.kernel.org Cc: linux-edac Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20170201231624.28843-1-chris.packham@alliedtelesis.co.nz Signed-off-by: Borislav Petkov --- arch/powerpc/boot/dts/fsl/t2081si-post.dtsi | 1 + drivers/edac/mpc85xx_edac.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi b/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi index c744569a20e1..a97296c64eb2 100644 --- a/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi @@ -678,5 +678,6 @@ compatible = "fsl,t2080-l2-cache-controller"; reg = <0xc20000 0x40000>; next-level-cache = <&cpc>; + interrupts = <16 2 1 9>; }; }; diff --git a/drivers/edac/mpc85xx_edac.c b/drivers/edac/mpc85xx_edac.c index 8f66cbed70b7..67f7bc3fe5b3 100644 --- a/drivers/edac/mpc85xx_edac.c +++ b/drivers/edac/mpc85xx_edac.c @@ -629,6 +629,7 @@ static const struct of_device_id mpc85xx_l2_err_of_match[] = { { .compatible = "fsl,p1020-l2-cache-controller", }, { .compatible = "fsl,p1021-l2-cache-controller", }, { .compatible = "fsl,p2020-l2-cache-controller", }, + { .compatible = "fsl,t2080-l2-cache-controller", }, {}, }; MODULE_DEVICE_TABLE(of, mpc85xx_l2_err_of_match); -- cgit v1.2.3 From 279fa580356301df27724a4b14ab4a95b65828fb Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 9 Feb 2017 15:04:24 +0000 Subject: EDAC, fsl_ddr: Make locally used symbols static Fix the following sparse warnings: drivers/edac/fsl_ddr_edac.c:148:1: warning: symbol 'dev_attr_inject_data_hi' was not declared. Should it be static? drivers/edac/fsl_ddr_edac.c:150:1: warning: symbol 'dev_attr_inject_data_lo' was not declared. Should it be static? drivers/edac/fsl_ddr_edac.c:152:1: warning: symbol 'dev_attr_inject_ctrl' was not declared. Should it be static? 
Signed-off-by: Wei Yongjun Cc: linux-edac Link: http://lkml.kernel.org/r/20170209150424.15124-1-weiyj.lk@gmail.com Signed-off-by: Borislav Petkov --- drivers/edac/fsl_ddr_edac.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/edac/fsl_ddr_edac.c b/drivers/edac/fsl_ddr_edac.c index 4e9608a958e7..efc8276d1d9c 100644 --- a/drivers/edac/fsl_ddr_edac.c +++ b/drivers/edac/fsl_ddr_edac.c @@ -145,12 +145,12 @@ static ssize_t fsl_mc_inject_ctrl_store(struct device *dev, return 0; } -DEVICE_ATTR(inject_data_hi, S_IRUGO | S_IWUSR, - fsl_mc_inject_data_hi_show, fsl_mc_inject_data_hi_store); -DEVICE_ATTR(inject_data_lo, S_IRUGO | S_IWUSR, - fsl_mc_inject_data_lo_show, fsl_mc_inject_data_lo_store); -DEVICE_ATTR(inject_ctrl, S_IRUGO | S_IWUSR, - fsl_mc_inject_ctrl_show, fsl_mc_inject_ctrl_store); +static DEVICE_ATTR(inject_data_hi, S_IRUGO | S_IWUSR, + fsl_mc_inject_data_hi_show, fsl_mc_inject_data_hi_store); +static DEVICE_ATTR(inject_data_lo, S_IRUGO | S_IWUSR, + fsl_mc_inject_data_lo_show, fsl_mc_inject_data_lo_store); +static DEVICE_ATTR(inject_ctrl, S_IRUGO | S_IWUSR, + fsl_mc_inject_ctrl_show, fsl_mc_inject_ctrl_store); static struct attribute *fsl_ddr_dev_attrs[] = { &dev_attr_inject_data_hi.attr, -- cgit v1.2.3 From 06c177cb12ef25e63da1698f294babf43fbc1451 Mon Sep 17 00:00:00 2001 From: Thor Thayer Date: Mon, 13 Feb 2017 13:30:41 -0600 Subject: MAINTAINERS, EDAC: Update email for Thor Thayer My opensource.altera.com email will be going away soon. Switch to new email address (linux.intel.com). 
Signed-off-by: Thor Thayer Cc: linux-edac Link: http://lkml.kernel.org/r/1487014241-3771-1-git-send-email-thor.thayer@linux.intel.com Signed-off-by: Borislav Petkov --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index cfff2c9e3d94..cc6108f30fce 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -644,7 +644,7 @@ S: Maintained F: drivers/gpio/gpio-altera.c ALTERA SYSTEM RESOURCE DRIVER FOR ARRIA10 DEVKIT -M: Thor Thayer +M: Thor Thayer S: Maintained F: drivers/gpio/gpio-altera-a10sr.c F: drivers/mfd/altera-a10sr.c @@ -1786,7 +1786,7 @@ S: Maintained F: drivers/clk/socfpga/ ARM/SOCFPGA EDAC SUPPORT -M: Thor Thayer +M: Thor Thayer S: Maintained F: drivers/edac/altera_edac. -- cgit v1.2.3 From e62d2ca9d05c9b558d11114c6494e820728f8e9a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 14 Feb 2017 11:58:05 +0100 Subject: EDAC, amd64: Bump driver version Last time we did that was when we enabled Bulldozer. Now, we enabled Zen so it is only natural ... :-) Signed-off-by: Borislav Petkov Cc: Yazen Ghannam --- drivers/edac/amd64_edac.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 6acbfd3e0158..1d4b74e9a037 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -85,7 +85,7 @@ * sections 3.5.4 and 3.5.5 for more information. */ -#define EDAC_AMD64_VERSION "3.4.0" +#define EDAC_AMD64_VERSION "3.5.0" #define EDAC_MOD_STR "amd64_edac" /* Extended Model from CPUID, for CPU Revision numbers */ -- cgit v1.2.3 From 75bf2f6478cab9b0c1d7f5f674a765d1e2ad530e Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Wed, 15 Feb 2017 14:56:22 -0600 Subject: EDAC, mce_amd: Print IPID and Syndrome on a separate line Currently, the IPID and Syndrome are printed on the same line as the Address. There are cases when we can have a valid Syndrome but not a valid Address. 
For example, the MCA_SYND register can be used to hold more detailed error info that the hardware folks can use. It's not just DRAM ECC syndromes. There are some error types that aren't related to memory that may have valid syndromes, like some errors related to links in the Data Fabric, etc. In these cases, the IPID and Syndrome are not printed at the same log level as the rest of the stanza, so users won't see them on the console. Console: [Hardware Error]: CPU:16 (17:1:0) MC22_STATUS[Over|CE|MiscV|-|-|-|-|SyndV|-]: 0xd82000000002080b [Hardware Error]: Power, Interrupts, etc. Extended Error Code: 2 Dmesg: [Hardware Error]: CPU:16 (17:1:0) MC22_STATUS[Over|CE|MiscV|-|-|-|-|SyndV|-]: 0xd82000000002080b , Syndrome: 0x000000010b404000, IPID: 0x0001002e00000002 [Hardware Error]: Power, Interrupts, etc. Extended Error Code: 2 Print the IPID first and on a new line. The IPID should always be printed on SMCA systems. The Syndrome will then be printed with the IPID and at the same log level when valid: [Hardware Error]: CPU:16 (17:1:0) MC22_STATUS[Over|CE|MiscV|-|-|-|-|SyndV|-]: 0xd82000000002080b [Hardware Error]: IPID: 0x0001002e00000002, Syndrome: 0x000000010b404000 [Hardware Error]: Power, Interrupts, etc. 
Extended Error Code: 2 Signed-off-by: Yazen Ghannam Cc: linux-edac Link: http://lkml.kernel.org/r/1487192182-2474-1-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 3acb3c0231f2..27513dca8009 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -991,20 +991,19 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) pr_cont("]: 0x%016llx\n", m->status); if (m->status & MCI_STATUS_ADDRV) - pr_emerg(HW_ERR "Error Addr: 0x%016llx", m->addr); + pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr); if (boot_cpu_has(X86_FEATURE_SMCA)) { + pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid); + if (m->status & MCI_STATUS_SYNDV) pr_cont(", Syndrome: 0x%016llx", m->synd); - pr_cont(", IPID: 0x%016llx", m->ipid); - pr_cont("\n"); decode_smca_errors(m); goto err_code; - } else - pr_cont("\n"); + } if (!fam_ops) goto err_code; -- cgit v1.2.3