From 79040cad3f8235937e229f1b9401ba36dd5ad69b Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Mon, 10 Feb 2014 14:25:43 -0800 Subject: drivers/edac/edac_mc_sysfs.c: poll timeout cannot be zero If you do echo 0 > /sys/module/edac_core/parameters/edac_mc_poll_msec the following stack trace is output because the edac module is not designed to poll with a timeout of zero. WARNING: CPU: 12 PID: 0 at lib/list_debug.c:33 __list_add+0xac/0xc0() list_add corruption. prev->next should be next (ffff8808291dd1b8), but was (null). (prev=ffff8808286fe3f8). Modules linked in: sg nfsv3 rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache cfg80211 rfkill x86_pkg_temp_thermal coretemp kvm_intel kvm ixgbe e1000e crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd iTCO_wdt ptp sb_edac iTCO_vendor_support pps_core mdio ipmi_devintf edac_core ioatdma microcode shpchp lpc_ich pcspkr i2c_i801 dca mfd_core ipmi_si wmi ipmi_msghandler nfsd auth_rpcgss nfs_acl lockd sunrpc xfs libcrc32c sd_mod sr_mod cdrom crc_t10dif crct10dif_common mgag200 syscopyarea sysfillrect sysimgblt isci i2c_algo_bit drm_kms_helper ttm drm libsas ahci libahci scsi_transport_sas libata i2c_core dm_mirror dm_region_hash dm_log dm_mod CPU: 12 PID: 0 Comm: swapper/12 Not tainted 3.13.0+ #1 Hardware name: Intel Corporation LH Pass ........../SVRBD-ROW_T, BIOS SE5C600.86B.01.08.0003.022620131521 02/26/2013 Call Trace: __list_add+0xac/0xc0 __internal_add_timer+0xab/0x130 internal_add_timer+0x17/0x40 mod_timer_pinned+0xca/0x170 intel_pstate_timer_func+0x28a/0x380 call_timer_fn+0x36/0x100 run_timer_softirq+0x1ff/0x2f0 __do_softirq+0xf5/0x2e0 irq_exit+0x10d/0x120 smp_apic_timer_interrupt+0x45/0x60 apic_timer_interrupt+0x6d/0x80 cpuidle_idle_call+0xb9/0x1f0 arch_cpu_idle+0xe/0x30 cpu_startup_entry+0x9e/0x240 start_secondary+0x1e4/0x290 kernel BUG at kernel/timer.c:1084! invalid opcode: 0000 [#1] SMP Modules linked in: sg nfsv3 rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache cfg80211 rfkill x86_pkg_temp_thermal coretemp kvm_intel kvm ixgbe e1000e crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd iTCO_wdt ptp sb_edac iTCO_vendor_support pps_core mdio ipmi_devintf edac_core ioatdma microcode shpchp lpc_ich pcspkr i2c_i801 dca mfd_core ipmi_si wmi ipmi_msghandler nfsd auth_rpcgss nfs_acl lockd sunrpc xfs libcrc32c sd_mod sr_mod cdrom crc_t10dif crct10dif_common mgag200 syscopyarea sysfillrect sysimgblt isci i2c_algo_bit drm_kms_helper ttm drm libsas ahci libahci scsi_transport_sas libata i2c_core dm_mirror dm_region_hash dm_log dm_mod CPU: 12 PID: 0 Comm: swapper/12 Tainted: G W 3.13.0+ #1 Hardware name: Intel Corporation LH Pass ........../SVRBD-ROW_T, BIOS SE5C600.86B.01.08.0003.022620131521 02/26/2013 Call Trace: run_timer_softirq+0x245/0x2f0 __do_softirq+0xf5/0x2e0 irq_exit+0x10d/0x120 smp_apic_timer_interrupt+0x45/0x60 apic_timer_interrupt+0x6d/0x80 cpuidle_idle_call+0xb9/0x1f0 arch_cpu_idle+0xe/0x30 cpu_startup_entry+0x9e/0x240 start_secondary+0x1e4/0x290 RIP cascade+0x93/0xa0 WARNING: CPU: 36 PID: 1154 at kernel/workqueue.c:1461 __queue_delayed_work+0xed/0x1a0() Modules linked in: sg nfsv3 rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache cfg80211 rfkill x86_pkg_temp_thermal coretemp kvm_intel kvm ixgbe e1000e crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd iTCO_wdt ptp sb_edac iTCO_vendor_support pps_core mdio ipmi_devintf edac_core ioatdma microcode shpchp lpc_ich pcspkr i2c_i801 dca mfd_core ipmi_si wmi ipmi_msghandler nfsd auth_rpcgss nfs_acl lockd sunrpc xfs libcrc32c sd_mod sr_mod cdrom crc_t10dif crct10dif_common mgag200 syscopyarea sysfillrect sysimgblt isci i2c_algo_bit drm_kms_helper ttm drm libsas ahci libahci scsi_transport_sas libata i2c_core dm_mirror dm_region_hash dm_log dm_mod CPU: 36 PID: 1154 Comm: kworker/u481:3 Tainted: G W 3.13.0+ #1 Hardware name: Intel Corporation LH Pass ........../SVRBD-ROW_T, BIOS SE5C600.86B.01.08.0003.022620131521 02/26/2013 Workqueue: edac-poller edac_mc_workq_function [edac_core] Call Trace: dump_stack+0x45/0x56 warn_slowpath_common+0x7d/0xa0 warn_slowpath_null+0x1a/0x20 __queue_delayed_work+0xed/0x1a0 queue_delayed_work_on+0x27/0x50 edac_mc_workq_function+0x72/0xa0 [edac_core] process_one_work+0x17b/0x460 worker_thread+0x11b/0x400 kthread+0xd2/0xf0 ret_from_fork+0x7c/0xb0 This patch adds a range check in the edac_mc_poll_msec code to check for 0. Signed-off-by: Prarit Bhargava Cc: Doug Thompson Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/edac/edac_mc_sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/edac') diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 51c0362acf5c..8ec1747b1c39 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -61,7 +61,7 @@ static int edac_set_poll_msec(const char *val, struct kernel_param *kp) ret = kstrtol(val, 0, &l); if (ret) return ret; - if ((int)l != l) + if (!l || ((int)l != l)) return -EINVAL; *((int *)kp->arg) = l; -- cgit v1.2.3 From 9da21b1509d8aa7ab4846722817d16c72d656c91 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 3 Feb 2014 15:05:13 -0500 Subject: EDAC: Poll timeout cannot be zero, p2 Sanitize code even more to accept unsigned longs only and to not allow polling intervals below 1 second as this is unnecessary and doesn't make much sense anyway for polling errors. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1391457913-881-1-git-send-email-prarit@redhat.com Cc: Doug Thompson Cc: --- drivers/edac/edac_mc.c | 4 ++-- drivers/edac/edac_mc_sysfs.c | 10 ++++++---- drivers/edac/edac_module.h | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'drivers/edac') diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index e8c9ef03495b..aef5ec24908e 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -601,7 +601,7 @@ static void edac_mc_workq_teardown(struct mem_ctl_info *mci) * user space has updated our poll period value, need to * reset our workq delays */ -void edac_mc_reset_delay_period(int value) +void edac_mc_reset_delay_period(unsigned long value) { struct mem_ctl_info *mci; struct list_head *item; @@ -611,7 +611,7 @@ void edac_mc_reset_delay_period(int value) list_for_each(item, &mc_devices) { mci = list_entry(item, struct mem_ctl_info, link); - edac_mc_workq_setup(mci, (unsigned long) value); + edac_mc_workq_setup(mci, value); } mutex_unlock(&mem_ctls_mutex); diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 8ec1747b1c39..b335c6ab5efe 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -52,18 +52,20 @@ int edac_mc_get_poll_msec(void) static int edac_set_poll_msec(const char *val, struct kernel_param *kp) { - long l; + unsigned long l; int ret; if (!val) return -EINVAL; - ret = kstrtol(val, 0, &l); + ret = kstrtoul(val, 0, &l); if (ret) return ret; - if (!l || ((int)l != l)) + + if (l < 1000) return -EINVAL; - *((int *)kp->arg) = l; + + *((unsigned long *)kp->arg) = l; /* notify edac_mc engine to reset the poll period */ edac_mc_reset_delay_period(l); diff --git a/drivers/edac/edac_module.h b/drivers/edac/edac_module.h index 3d139c6e7fe3..f2118bfcf8df 100644 --- a/drivers/edac/edac_module.h +++ b/drivers/edac/edac_module.h @@ -52,7 +52,7 @@ extern void edac_device_workq_setup(struct edac_device_ctl_info *edac_dev, extern void edac_device_workq_teardown(struct edac_device_ctl_info *edac_dev); extern void edac_device_reset_delay_period(struct edac_device_ctl_info *edac_dev, unsigned long value); -extern void edac_mc_reset_delay_period(int value); +extern void edac_mc_reset_delay_period(unsigned long value); extern void *edac_align_ptr(void **p, unsigned size, int n_elems); -- cgit v1.2.3 From cb6ef42e516cb8948f15e4b70dc03af8020050a2 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 12 Feb 2014 18:15:00 +0100 Subject: EDAC: Correct workqueue setup path We're using edac_mc_workq_setup() both on the init path, when we load an edac driver and when we change the polling period (edac_mc_reset_delay_period) through /sys/.../edac_mc_poll_msec. On that second path we don't need to init the workqueue which has been initialized already. Thanks to Tejun for workqueue insights. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1391457913-881-1-git-send-email-prarit@redhat.com Cc: --- drivers/edac/edac_mc.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'drivers/edac') diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index aef5ec24908e..33edd6766344 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -559,7 +559,8 @@ static void edac_mc_workq_function(struct work_struct *work_req) * * called with the mem_ctls_mutex held */ -static void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec) +static void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec, + bool init) { edac_dbg(0, "\n"); @@ -567,7 +568,9 @@ static void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec) if (mci->op_state != OP_RUNNING_POLL) return; - INIT_DELAYED_WORK(&mci->work, edac_mc_workq_function); + if (init) + INIT_DELAYED_WORK(&mci->work, edac_mc_workq_function); + mod_delayed_work(edac_workqueue, &mci->work, msecs_to_jiffies(msec)); } @@ -611,7 +614,7 @@ void edac_mc_reset_delay_period(unsigned long value) list_for_each(item, &mc_devices) { mci = list_entry(item, struct mem_ctl_info, link); - edac_mc_workq_setup(mci, value); + edac_mc_workq_setup(mci, value, false); } mutex_unlock(&mem_ctls_mutex); @@ -782,7 +785,7 @@ int edac_mc_add_mc(struct mem_ctl_info *mci) /* This instance is NOW RUNNING */ mci->op_state = OP_RUNNING_POLL; - edac_mc_workq_setup(mci, edac_mc_get_poll_msec()); + edac_mc_workq_setup(mci, edac_mc_get_poll_msec(), true); } else { mci->op_state = OP_RUNNING_INTERRUPT; } -- cgit v1.2.3 From c0f5eeed0f4cef4f05b74883a7160e7edde58b6a Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 24 Feb 2014 09:39:27 +0100 Subject: i7core_edac: Fix PCI device reference count The reference count changes done by pci_get_device can be a little misleading when the usage diverges from the most common scheme. The reference count of the device passed as the last parameter is always decreased, even if the function returns no new device. So if we are going to try alternative device IDs, we must manually increment the device reference count before each retry. If we don't, we end up decreasing the reference count, and after a few modprobe/rmmod cycles the PCI devices will vanish. In other words and as Alan put it: without this fix the EDAC code corrupts the PCI device list. This fixes kernel bug #50491: https://bugzilla.kernel.org/show_bug.cgi?id=50491 Signed-off-by: Jean Delvare Link: http://lkml.kernel.org/r/20140224093927.7659dd9d@endymion.delvare Reviewed-by: Alan Cox Cc: Mauro Carvalho Chehab Cc: Doug Thompson Cc: stable@vger.kernel.org Signed-off-by: Borislav Petkov --- drivers/edac/i7core_edac.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers/edac') diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c index 87533ca7752e..d871275196f6 100644 --- a/drivers/edac/i7core_edac.c +++ b/drivers/edac/i7core_edac.c @@ -1334,14 +1334,19 @@ static int i7core_get_onedevice(struct pci_dev **prev, * is at addr 8086:2c40, instead of 8086:2c41. So, we need * to probe for the alternate address in case of failure */ - if (dev_descr->dev_id == PCI_DEVICE_ID_INTEL_I7_NONCORE && !pdev) + if (dev_descr->dev_id == PCI_DEVICE_ID_INTEL_I7_NONCORE && !pdev) { + pci_dev_get(*prev); /* pci_get_device will put it */ pdev = pci_get_device(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_I7_NONCORE_ALT, *prev); + } - if (dev_descr->dev_id == PCI_DEVICE_ID_INTEL_LYNNFIELD_NONCORE && !pdev) + if (dev_descr->dev_id == PCI_DEVICE_ID_INTEL_LYNNFIELD_NONCORE && + !pdev) { + pci_dev_get(*prev); /* pci_get_device will put it */ pdev = pci_get_device(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_LYNNFIELD_NONCORE_ALT, *prev); + } if (!pdev) { if (*prev) { -- cgit v1.2.3 From 75135da0d68419ef8a925f4c1d5f63d8046e314d Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 25 Feb 2014 09:43:13 +0100 Subject: i7300_edac: Fix device reference count pci_get_device() decrements the reference count of "from" (last argument) so when we break off the loop successfully we have only one device reference - and we don't know which device we have. If we want a reference to each device, we must take them explicitly and let the pci_get_device() walk complete to avoid duplicate references. This is serious, as over-putting device references will cause the device to eventually disappear. Without this fix, the kernel crashes after a few insmod/rmmod cycles. Tested on an Intel S7000FC4UR system with a 7300 chipset. Signed-off-by: Jean Delvare Link: http://lkml.kernel.org/r/20140224111656.09bbb7ed@endymion.delvare Cc: Mauro Carvalho Chehab Cc: Doug Thompson Cc: stable@vger.kernel.org Signed-off-by: Borislav Petkov --- drivers/edac/i7300_edac.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'drivers/edac') diff --git a/drivers/edac/i7300_edac.c b/drivers/edac/i7300_edac.c index d63f4798f7d0..57e96a3350f0 100644 --- a/drivers/edac/i7300_edac.c +++ b/drivers/edac/i7300_edac.c @@ -943,33 +943,35 @@ static int i7300_get_devices(struct mem_ctl_info *mci) /* Attempt to 'get' the MCH register we want */ pdev = NULL; - while (!pvt->pci_dev_16_1_fsb_addr_map || - !pvt->pci_dev_16_2_fsb_err_regs) { - pdev = pci_get_device(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_I7300_MCH_ERR, pdev); - if (!pdev) { - /* End of list, leave */ - i7300_printk(KERN_ERR, - "'system address,Process Bus' " - "device not found:" - "vendor 0x%x device 0x%x ERR funcs " - "(broken BIOS?)\n", - PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_I7300_MCH_ERR); - goto error; - } - + while ((pdev = pci_get_device(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_I7300_MCH_ERR, + pdev))) { /* Store device 16 funcs 1 and 2 */ switch (PCI_FUNC(pdev->devfn)) { case 1: - pvt->pci_dev_16_1_fsb_addr_map = pdev; + if (!pvt->pci_dev_16_1_fsb_addr_map) + pvt->pci_dev_16_1_fsb_addr_map = + pci_dev_get(pdev); break; case 2: - pvt->pci_dev_16_2_fsb_err_regs = pdev; + if (!pvt->pci_dev_16_2_fsb_err_regs) + pvt->pci_dev_16_2_fsb_err_regs = + pci_dev_get(pdev); break; } } + if (!pvt->pci_dev_16_1_fsb_addr_map || + !pvt->pci_dev_16_2_fsb_err_regs) { + /* At least one device was not found */ + i7300_printk(KERN_ERR, + "'system address,Process Bus' device not found:" + "vendor 0x%x device 0x%x ERR funcs (broken BIOS?)\n", + PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_I7300_MCH_ERR); + goto error; + } + edac_dbg(1, "System Address, processor bus- PCI Bus ID: %s %x:%x\n", pci_name(pvt->pci_dev_16_0_fsb_ctlr), pvt->pci_dev_16_0_fsb_ctlr->vendor, -- cgit v1.2.3