From fd079facb3fdd1b0517f0b2087ac05c30ea09cfe Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 25 Jul 2011 11:01:09 -0700 Subject: KVM: fix TASK_DELAY_ACCT kconfig warning Fix kconfig dependency warning: warning: (KVM) selects TASK_DELAY_ACCT which has unmet direct dependencies (TASKSTATS) Signed-off-by: Randy Dunlap Signed-off-by: Avi Kivity --- arch/x86/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 988724b236b6..0a09b58bb1cb 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -31,6 +31,7 @@ config KVM select KVM_ASYNC_PF select USER_RETURN_NOTIFIER select KVM_MMIO + select TASKSTATS select TASK_DELAY_ACCT ---help--- Support hosting fully virtualized guest machines using hardware -- cgit v1.2.3 From b03e7495a862b028294f59fc87286d6d78ee7fa1 Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Wed, 20 Jul 2011 15:20:54 -0500 Subject: PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 9 +++ drivers/pci/hotplug/pcihp_slot.c | 45 +----------- drivers/pci/pci.c | 67 ++++++++++++++++++ drivers/pci/probe.c | 145 +++++++++++++++++++++++++++++++++++++++ include/linux/pci.h | 15 +++- 5 files changed, 236 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index ae3cb23cd89b..c95330267f08 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -360,6 +360,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) } } + /* After the PCI-E bus has been walked and all devices discovered, + * configure any settings of the fabric that might be necessary. + */ + if (bus) { + struct pci_bus *child; + list_for_each_entry(child, &bus->children, node) + pcie_bus_configure_settings(child, child->self->pcie_mpss); + } + if (!bus) kfree(sd); diff --git a/drivers/pci/hotplug/pcihp_slot.c b/drivers/pci/hotplug/pcihp_slot.c index 749fdf070319..753b21aaea61 100644 --- a/drivers/pci/hotplug/pcihp_slot.c +++ b/drivers/pci/hotplug/pcihp_slot.c @@ -158,47 +158,6 @@ static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp) */ } -/* Program PCIE MaxPayload setting on device: ensure parent maxpayload <= device */ -static int pci_set_payload(struct pci_dev *dev) -{ - int pos, ppos; - u16 pctl, psz; - u16 dctl, dsz, dcap, dmax; - struct pci_dev *parent; - - parent = dev->bus->self; - pos = pci_find_capability(dev, PCI_CAP_ID_EXP); - if (!pos) - return 0; - - /* Read Device MaxPayload capability and setting */ - pci_read_config_word(dev, pos + PCI_EXP_DEVCTL, &dctl); - pci_read_config_word(dev, pos + PCI_EXP_DEVCAP, &dcap); - dsz = (dctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5; - dmax = (dcap & PCI_EXP_DEVCAP_PAYLOAD); - - /* Read Parent MaxPayload setting */ - ppos = pci_find_capability(parent, PCI_CAP_ID_EXP); - if (!ppos) - return 0; - pci_read_config_word(parent, ppos + PCI_EXP_DEVCTL, &pctl); - psz = (pctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5; - - /* If parent payload > device max payload -> error - * If parent payload > device payload -> set speed - * If parent payload <= device payload -> do nothing - */ - if (psz > dmax) - return -1; - else if (psz > dsz) { - dev_info(&dev->dev, "Setting MaxPayload to %d\n", 128 << psz); - pci_write_config_word(dev, pos + PCI_EXP_DEVCTL, - (dctl & ~PCI_EXP_DEVCTL_PAYLOAD) + - (psz << 5)); - } - return 0; -} - void pci_configure_slot(struct pci_dev *dev) { struct pci_dev *cdev; @@ -210,9 +169,7 @@ void pci_configure_slot(struct pci_dev *dev) (dev->class >> 8) == PCI_CLASS_BRIDGE_PCI))) return; - ret = pci_set_payload(dev); - if (ret) - dev_warn(&dev->dev, "could not set device max payload\n"); + pcie_bus_configure_settings(dev->bus, dev->bus->self->pcie_mpss); memset(&hpp, 0, sizeof(hpp)); ret = pci_get_hp_params(dev, &hpp); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 08a95b369d85..466fad6e6ee2 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -77,6 +77,8 @@ unsigned long pci_cardbus_mem_size = DEFAULT_CARDBUS_MEM_SIZE; unsigned long pci_hotplug_io_size = DEFAULT_HOTPLUG_IO_SIZE; unsigned long pci_hotplug_mem_size = DEFAULT_HOTPLUG_MEM_SIZE; +enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_PERFORMANCE; + /* * The default CLS is used if arch didn't set CLS explicitly and not * all pci devices agree on the same value. Arch can override either @@ -3222,6 +3224,67 @@ out: } EXPORT_SYMBOL(pcie_set_readrq); +/** + * pcie_get_mps - get PCI Express maximum payload size + * @dev: PCI device to query + * + * Returns maximum payload size in bytes + * or appropriate error value. + */ +int pcie_get_mps(struct pci_dev *dev) +{ + int ret, cap; + u16 ctl; + + cap = pci_pcie_cap(dev); + if (!cap) + return -EINVAL; + + ret = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl); + if (!ret) + ret = 128 << ((ctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5); + + return ret; +} + +/** + * pcie_set_mps - set PCI Express maximum payload size + * @dev: PCI device to query + * @rq: maximum payload size in bytes + * valid values are 128, 256, 512, 1024, 2048, 4096 + * + * If possible sets maximum payload size + */ +int pcie_set_mps(struct pci_dev *dev, int mps) +{ + int cap, err = -EINVAL; + u16 ctl, v; + + if (mps < 128 || mps > 4096 || !is_power_of_2(mps)) + goto out; + + v = ffs(mps) - 8; + if (v > dev->pcie_mpss) + goto out; + v <<= 5; + + cap = pci_pcie_cap(dev); + if (!cap) + goto out; + + err = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl); + if (err) + goto out; + + if ((ctl & PCI_EXP_DEVCTL_PAYLOAD) != v) { + ctl &= ~PCI_EXP_DEVCTL_PAYLOAD; + ctl |= v; + err = pci_write_config_word(dev, cap + PCI_EXP_DEVCTL, ctl); + } +out: + return err; +} + /** * pci_select_bars - Make BAR mask from the type of resource * @dev: the PCI device for which BAR mask is made @@ -3505,6 +3568,10 @@ static int __init pci_setup(char *str) pci_hotplug_io_size = memparse(str + 9, &str); } else if (!strncmp(str, "hpmemsize=", 10)) { pci_hotplug_mem_size = memparse(str + 10, &str); + } else if (!strncmp(str, "pcie_bus_safe", 13)) { + pcie_bus_config = PCIE_BUS_SAFE; + } else if (!strncmp(str, "pcie_bus_perf", 13)) { + pcie_bus_config = PCIE_BUS_PERFORMANCE; } else { printk(KERN_ERR "PCI: Unknown option `%s'\n", str); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 795c9026d55f..5becf7cd50d8 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -856,6 +856,8 @@ void set_pcie_port_type(struct pci_dev *pdev) pdev->pcie_cap = pos; pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, ®16); pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4; + pci_read_config_word(pdev, pos + PCI_EXP_DEVCAP, ®16); + pdev->pcie_mpss = reg16 & PCI_EXP_DEVCAP_PAYLOAD; } void set_pcie_hotplug_bridge(struct pci_dev *pdev) @@ -1326,6 +1328,149 @@ int pci_scan_slot(struct pci_bus *bus, int devfn) return nr; } +static int pcie_find_smpss(struct pci_dev *dev, void *data) +{ + u8 *smpss = data; + + if (!pci_is_pcie(dev)) + return 0; + + /* For PCIE hotplug enabled slots not connected directly to a + * PCI-E root port, there can be problems when hotplugging + * devices. This is due to the possibility of hotplugging a + * device into the fabric with a smaller MPS that the devices + * currently running have configured. Modifying the MPS on the + * running devices could cause a fatal bus error due to an + * incoming frame being larger than the newly configured MPS. + * To work around this, the MPS for the entire fabric must be + * set to the minimum size. Any devices hotplugged into this + * fabric will have the minimum MPS set. If the PCI hotplug + * slot is directly connected to the root port and there are not + * other devices on the fabric (which seems to be the most + * common case), then this is not an issue and MPS discovery + * will occur as normal. + */ + if (dev->is_hotplug_bridge && (!list_is_singular(&dev->bus->devices) || + dev->bus->self->pcie_type != PCI_EXP_TYPE_ROOT_PORT)) + *smpss = 0; + + if (*smpss > dev->pcie_mpss) + *smpss = dev->pcie_mpss; + + return 0; +} + +static void pcie_write_mps(struct pci_dev *dev, int mps) +{ + int rc, dev_mpss; + + dev_mpss = 128 << dev->pcie_mpss; + + if (pcie_bus_config == PCIE_BUS_PERFORMANCE) { + if (dev->bus->self) { + dev_dbg(&dev->bus->dev, "Bus MPSS %d\n", + 128 << dev->bus->self->pcie_mpss); + + /* For "MPS Force Max", the assumption is made that + * downstream communication will never be larger than + * the MRRS. So, the MPS only needs to be configured + * for the upstream communication. This being the case, + * walk from the top down and set the MPS of the child + * to that of the parent bus. + */ + mps = 128 << dev->bus->self->pcie_mpss; + if (mps > dev_mpss) + dev_warn(&dev->dev, "MPS configured higher than" + " maximum supported by the device. If" + " a bus issue occurs, try running with" + " pci=pcie_bus_safe.\n"); + } + + dev->pcie_mpss = ffs(mps) - 8; + } + + rc = pcie_set_mps(dev, mps); + if (rc) + dev_err(&dev->dev, "Failed attempting to set the MPS\n"); +} + +static void pcie_write_mrrs(struct pci_dev *dev, int mps) +{ + int rc, mrrs; + + if (pcie_bus_config == PCIE_BUS_PERFORMANCE) { + int dev_mpss = 128 << dev->pcie_mpss; + + /* For Max performance, the MRRS must be set to the largest + * supported value. However, it cannot be configured larger + * than the MPS the device or the bus can support. This assumes + * that the largest MRRS available on the device cannot be + * smaller than the device MPSS. + */ + mrrs = mps < dev_mpss ? mps : dev_mpss; + } else + /* In the "safe" case, configure the MRRS for fairness on the + * bus by making all devices have the same size + */ + mrrs = mps; + + + /* MRRS is a R/W register. Invalid values can be written, but a + * subsiquent read will verify if the value is acceptable or not. + * If the MRRS value provided is not acceptable (e.g., too large), + * shrink the value until it is acceptable to the HW. + */ + while (mrrs != pcie_get_readrq(dev) && mrrs >= 128) { + rc = pcie_set_readrq(dev, mrrs); + if (rc) + dev_err(&dev->dev, "Failed attempting to set the MRRS\n"); + + mrrs /= 2; + } +} + +static int pcie_bus_configure_set(struct pci_dev *dev, void *data) +{ + int mps = 128 << *(u8 *)data; + + if (!pci_is_pcie(dev)) + return 0; + + dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n", + pcie_get_mps(dev), 128<pcie_mpss, pcie_get_readrq(dev)); + + pcie_write_mps(dev, mps); + pcie_write_mrrs(dev, mps); + + dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n", + pcie_get_mps(dev), 128<pcie_mpss, pcie_get_readrq(dev)); + + return 0; +} + +/* pcie_bus_configure_mps requires that pci_walk_bus work in a top-down, + * parents then children fashion. If this changes, then this code will not + * work as designed. + */ +void pcie_bus_configure_settings(struct pci_bus *bus, u8 mpss) +{ + u8 smpss = mpss; + + if (!bus->self) + return; + + if (!pci_is_pcie(bus->self)) + return; + + if (pcie_bus_config == PCIE_BUS_SAFE) { + pcie_find_smpss(bus->self, &smpss); + pci_walk_bus(bus, pcie_find_smpss, &smpss); + } + + pcie_bus_configure_set(bus->self, &smpss); + pci_walk_bus(bus, pcie_bus_configure_set, &smpss); +} + unsigned int __devinit pci_scan_child_bus(struct pci_bus *bus) { unsigned int devfn, pass, max = bus->secondary; diff --git a/include/linux/pci.h b/include/linux/pci.h index f27893b3b724..1ff9bbafd932 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -251,7 +251,8 @@ struct pci_dev { u8 revision; /* PCI revision, low byte of class word */ u8 hdr_type; /* PCI header type (`multi' flag masked out) */ u8 pcie_cap; /* PCI-E capability offset */ - u8 pcie_type; /* PCI-E device/port type */ + u8 pcie_type:4; /* PCI-E device/port type */ + u8 pcie_mpss:3; /* PCI-E Max Payload Size Supported */ u8 rom_base_reg; /* which config register controls the ROM */ u8 pin; /* which interrupt pin this device uses */ @@ -617,6 +618,16 @@ struct pci_driver { /* these external functions are only available when PCI support is enabled */ #ifdef CONFIG_PCI +extern void pcie_bus_configure_settings(struct pci_bus *bus, u8 smpss); + +enum pcie_bus_config_types { + PCIE_BUS_PERFORMANCE, + PCIE_BUS_SAFE, + PCIE_BUS_PEER2PEER, +}; + +extern enum pcie_bus_config_types pcie_bus_config; + extern struct bus_type pci_bus_type; /* Do NOT directly access these two variables, unless you are arch specific pci @@ -796,6 +807,8 @@ int pcix_get_mmrbc(struct pci_dev *dev); int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc); int pcie_get_readrq(struct pci_dev *dev); int pcie_set_readrq(struct pci_dev *dev, int rq); +int pcie_get_mps(struct pci_dev *dev); +int pcie_set_mps(struct pci_dev *dev, int mps); int __pci_reset_function(struct pci_dev *dev); int pci_reset_function(struct pci_dev *dev); void pci_update_resource(struct pci_dev *dev, int resno); -- cgit v1.2.3 From 1bdfac19b3ecfca545281c15c7aea7ebc2eaef31 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 3 Aug 2011 09:31:49 -0400 Subject: x86-64: Pad vDSO to a page boundary This avoids an information leak to userspace. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/a63380a3c58a0506a2f5a18ba1b12dbde1f25e58.1312378163.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vdso.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S index 1b979c12ba85..01f5e3b4613c 100644 --- a/arch/x86/vdso/vdso.S +++ b/arch/x86/vdso/vdso.S @@ -9,6 +9,7 @@ __PAGE_ALIGNED_DATA vdso_start: .incbin "arch/x86/vdso/vdso.so" vdso_end: + .align PAGE_SIZE /* extra data here leaks to userspace. */ .previous -- cgit v1.2.3 From 9c40818da5b39fca236029059ab839857b1ef56c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 3 Aug 2011 09:31:50 -0400 Subject: x86-64: Move the "user" vsyscall segment out of the data segment. The kernel's loader doesn't seem to care, but gold complains. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/f0716870c297242a841b949953d80c0d87bf3d3f.1312378163.git.luto@mit.edu Reported-by: Arkadiusz Miskiewicz Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vmlinux.lds.S | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4aa9c54a9b76..e79fb3951fce 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -154,6 +154,24 @@ SECTIONS #ifdef CONFIG_X86_64 + . = ALIGN(PAGE_SIZE); + __vvar_page = .; + + .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { + + /* Place all vvars at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) \ + . = offset; \ + *(.vvar_ ## name) +#define __VVAR_KERNEL_LDS +#include +#undef __VVAR_KERNEL_LDS +#undef EMIT_VVAR + + } :data + + . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); + #define VSYSCALL_ADDR (-10*1024*1024) #define VLOAD_OFFSET (VSYSCALL_ADDR - __vsyscall_0 + LOAD_OFFSET) @@ -162,7 +180,6 @@ SECTIONS #define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) - . = ALIGN(4096); __vsyscall_0 = .; . = VSYSCALL_ADDR; @@ -185,23 +202,6 @@ SECTIONS #undef VVIRT_OFFSET #undef VVIRT - __vvar_page = .; - - .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { - - /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) \ - . = offset; \ - *(.vvar_ ## name) -#define __VVAR_KERNEL_LDS -#include -#undef __VVAR_KERNEL_LDS -#undef EMIT_VVAR - - } :data - - . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); - #endif /* CONFIG_X86_64 */ /* Init code and data - will be freed after init */ -- cgit v1.2.3 From f670bb760e7d32ec9c690e748a1d5d04921363ab Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 3 Aug 2011 09:31:51 -0400 Subject: x86-64: Work around gold bug 13023 Gold has trouble assigning numbers to the location counter inside of an output section description. The bug was triggered by 9fd67b4ed0714ab718f1f9bd14c344af336a6df7, which consolidated all of the vsyscall sections into a single section. The workaround is IMO still nicer than the old way of doing it. This produces an apparently valid kernel image and passes my vdso tests on both GNU ld version 2.21.51.0.6-2.fc15 20110118 and GNU gold (version 2.21.51.0.6-2.fc15 20110118) 1.10 as distributed by Fedora 15. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/0b260cb806f1f9a25c00ce8377a5f035d57f557a.1312378163.git.luto@mit.edu Reported-by: Arkadiusz Miskiewicz Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vmlinux.lds.S | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index e79fb3951fce..8f3a265476d7 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -158,10 +158,12 @@ SECTIONS __vvar_page = .; .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { + /* work around gold bug 13023 */ + __vvar_beginning_hack = .; - /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) \ - . = offset; \ + /* Place all vvars at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) \ + . = __vvar_beginning_hack + offset; \ *(.vvar_ ## name) #define __VVAR_KERNEL_LDS #include @@ -184,15 +186,17 @@ SECTIONS . = VSYSCALL_ADDR; .vsyscall : AT(VLOAD(.vsyscall)) { + /* work around gold bug 13023 */ + __vsyscall_beginning_hack = .; *(.vsyscall_0) - . = 1024; + . = __vsyscall_beginning_hack + 1024; *(.vsyscall_1) - . = 2048; + . = __vsyscall_beginning_hack + 2048; *(.vsyscall_2) - . = 4096; /* Pad the whole page. */ + . = __vsyscall_beginning_hack + 4096; /* Pad the whole page. */ } :user =0xcc . = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE); -- cgit v1.2.3 From 5d5791af4c0d4fd32093882357506355c3357503 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 3 Aug 2011 09:31:52 -0400 Subject: x86-64, xen: Enable the vvar mapping Xen needs to handle VVAR_PAGE, introduced in git commit: 9fd67b4ed0714ab718f1f9bd14c344af336a6df7 x86-64: Give vvars their own page Otherwise we die during bootup with a message like: (XEN) mm.c:940:d10 Error getting mfn 1888 (pfn 1e3e48) from L1 entry 8000000001888465 for l1e_owner=10, pg_owner=10 (XEN) mm.c:5049:d10 ptwr_emulate: could not get_page_from_l1e() [ 0.000000] BUG: unable to handle kernel NULL pointer dereference at (null) [ 0.000000] IP: [] xen_set_pte+0x20/0xe0 Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/4659478ed2f3480938f96491c2ecbe2b2e113a23.1312378163.git.luto@mit.edu Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: H. Peter Anvin --- arch/x86/xen/mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 0ccccb67a993..2e78619bc538 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1829,6 +1829,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) # endif #else case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: + case VVAR_PAGE: #endif case FIX_TEXT_POKE0: case FIX_TEXT_POKE1: @@ -1869,7 +1870,8 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #ifdef CONFIG_X86_64 /* Replicate changes to map the vsyscall page into the user pagetable vsyscall mapping. */ - if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) { + if ((idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) || + idx == VVAR_PAGE) { unsigned long vaddr = __fix_to_virt(idx); set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); } -- cgit v1.2.3 From 318f5a2a672152328c9fb4dead504b89ec738a43 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 3 Aug 2011 09:31:53 -0400 Subject: x86-64: Add user_64bit_mode paravirt op Three places in the kernel assume that the only long mode CPL 3 selector is __USER_CS. This is not true on Xen -- Xen's sysretq changes cs to the magic value 0xe033. Two of the places are corner cases, but as of "x86-64: Improve vsyscall emulation CS and RIP handling" (c9712944b2a12373cb6ff8059afcfb7e826a6c54), vsyscalls will segfault if called with Xen's extra CS selector. This causes a panic when older init builds die. It seems impossible to make Xen use __USER_CS reliably without taking a performance hit on every system call, so this fixes the tests instead with a new paravirt op. It's a little ugly because ptrace.h can't include paravirt.h. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/f4fcb3947340d9e96ce1054a432f183f9da9db83.1312378163.git.luto@mit.edu Reported-by: Konrad Rzeszutek Wilk Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/desc.h | 4 ++-- arch/x86/include/asm/paravirt_types.h | 6 ++++++ arch/x86/include/asm/ptrace.h | 19 +++++++++++++++++++ arch/x86/kernel/paravirt.c | 4 ++++ arch/x86/kernel/step.c | 2 +- arch/x86/kernel/vsyscall_64.c | 6 +----- arch/x86/mm/fault.c | 2 +- arch/x86/xen/enlighten.c | 4 ++++ 8 files changed, 38 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 7b439d9aea2a..41935fadfdfc 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -27,8 +27,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in desc->base2 = (info->base_addr & 0xff000000) >> 24; /* - * Don't allow setting of the lm bit. It is useless anyway - * because 64bit system calls require __USER_CS: + * Don't allow setting of the lm bit. It would confuse + * user_64bit_mode and would get overridden by sysret anyway. */ desc->l = 0; } diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 82885099c869..96a0f80407b8 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -41,6 +41,7 @@ #include #include +#include struct page; struct thread_struct; @@ -63,6 +64,11 @@ struct paravirt_callee_save { struct pv_info { unsigned int kernel_rpl; int shared_kernel_pmd; + +#ifdef CONFIG_X86_64 + u16 extra_user_64bit_cs; /* __USER_CS if none */ +#endif + int paravirt_enabled; const char *name; }; diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 94e7618fcac8..35664547125b 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -131,6 +131,9 @@ struct pt_regs { #ifdef __KERNEL__ #include +#ifdef CONFIG_PARAVIRT +#include +#endif struct cpuinfo_x86; struct task_struct; @@ -187,6 +190,22 @@ static inline int v8086_mode(struct pt_regs *regs) #endif } +#ifdef CONFIG_X86_64 +static inline bool user_64bit_mode(struct pt_regs *regs) +{ +#ifndef CONFIG_PARAVIRT + /* + * On non-paravirt systems, this is the only long mode CPL 3 + * selector. We do not allow long mode selectors in the LDT. + */ + return regs->cs == __USER_CS; +#else + /* Headers are too twisted for this to go in paravirt.h. */ + return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs; +#endif +} +#endif + /* * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode * when it traps. The previous stack will be directly underneath the saved diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 869e1aeeb71b..681f15994218 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -299,6 +299,10 @@ struct pv_info pv_info = { .paravirt_enabled = 0, .kernel_rpl = 0, .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ + +#ifdef CONFIG_X86_64 + .extra_user_64bit_cs = __USER_CS, +#endif }; struct pv_init_ops pv_init_ops = { diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 7977f0cfe339..c346d1161488 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -74,7 +74,7 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) #ifdef CONFIG_X86_64 case 0x40 ... 0x4f: - if (regs->cs != __USER_CS) + if (!user_64bit_mode(regs)) /* 32-bit mode: register increment */ return 0; /* 64-bit mode: REX prefix */ diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index dda7dff9cef7..1725930a6f9f 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -127,11 +127,7 @@ void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) local_irq_enable(); - /* - * Real 64-bit user mode code has cs == __USER_CS. Anything else - * is bogus. - */ - if (regs->cs != __USER_CS) { + if (!user_64bit_mode(regs)) { /* * If we trapped from kernel mode, we might as well OOPS now * instead of returning to some random address and OOPSing diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 2dbf6bf4c7e5..c1d018238f32 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -105,7 +105,7 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, * but for now it's good enough to assume that long * mode only uses well known segments or kernel. */ - return (!user_mode(regs)) || (regs->cs == __USER_CS); + return (!user_mode(regs) || user_64bit_mode(regs)); #endif case 0x60: /* 0x64 thru 0x67 are valid prefixes in all modes. */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5525163a0398..78fe33d03565 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -937,6 +937,10 @@ static const struct pv_info xen_info __initconst = { .paravirt_enabled = 1, .shared_kernel_pmd = 0, +#ifdef CONFIG_X86_64 + .extra_user_64bit_cs = FLAT_USER_CS64, +#endif + .name = "Xen", }; -- cgit v1.2.3 From c149a665ac488e0dac22a42287f45ad1bda06ff1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 3 Aug 2011 09:31:54 -0400 Subject: x86-64: Add vsyscall:emulate_vsyscall trace event Vsyscall emulation is slow, so make it easy to track down. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/cdaad7da946a80b200df16647c1700db3e1171e9.1312378163.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vsyscall_64.c | 6 ++++++ arch/x86/kernel/vsyscall_trace.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 arch/x86/kernel/vsyscall_trace.h (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 1725930a6f9f..93a0d46d9569 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -50,6 +50,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include "vsyscall_trace.h" + DEFINE_VVAR(int, vgetcpu_mode); DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = { @@ -146,6 +149,9 @@ void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) * and int 0xcc is two bytes long. */ vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2); + + trace_emulate_vsyscall(vsyscall_nr); + if (vsyscall_nr < 0) { warn_bad_vsyscall(KERN_WARNING, regs, "illegal int 0xcc (exploit attempt?)"); diff --git a/arch/x86/kernel/vsyscall_trace.h b/arch/x86/kernel/vsyscall_trace.h new file mode 100644 index 000000000000..a8b2edec54fe --- /dev/null +++ b/arch/x86/kernel/vsyscall_trace.h @@ -0,0 +1,29 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vsyscall + +#if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define __VSYSCALL_TRACE_H + +#include + +TRACE_EVENT(emulate_vsyscall, + + TP_PROTO(int nr), + + TP_ARGS(nr), + + TP_STRUCT__entry(__field(int, nr)), + + TP_fast_assign( + __entry->nr = nr; + ), + + TP_printk("nr = %d", __entry->nr) +); + +#endif + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../arch/x86/kernel +#define TRACE_INCLUDE_FILE vsyscall_trace +#include -- cgit v1.2.3 From a3ea14df0e383f44dcb2e61badb71180dbffe526 Mon Sep 17 00:00:00 2001 From: Paul Fox Date: Tue, 26 Jul 2011 16:42:26 +0100 Subject: x86, olpc: Wait for last byte of EC command to be accepted When executing EC commands, only waiting when there are still more bytes to write is usually fine. However, if the system suspends very quickly after a call to olpc_ec_cmd(), the last data byte may not yet be transferred to the EC, and the command will not complete. This solves a bug where the SCI wakeup mask was not correctly written when going into suspend. It means that sometimes, on XO-1.5 (but not XO-1), the devices that were marked as wakeup sources can't wake up the system. e.g. you ask for wifi wakeups, suspend, but then incoming wifi frames don't wake up the system as they should. Signed-off-by: Paul Fox Signed-off-by: Daniel Drake Acked-by: Andres Salomon Cc: Signed-off-by: Ingo Molnar --- arch/x86/platform/olpc/olpc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index 8b9940e78e2f..7cce722667b8 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -161,13 +161,13 @@ restart: if (inbuf && inlen) { /* write data to EC */ for (i = 0; i < inlen; i++) { + pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]); + outb(inbuf[i], 0x68); if (wait_on_ibf(0x6c, 0)) { printk(KERN_ERR "olpc-ec: timeout waiting for" " EC accept data!\n"); goto err; } - pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]); - outb(inbuf[i], 0x68); } } if (outbuf && outlen) { -- cgit v1.2.3 From 05e33fc20ea5e493a2a1e7f1d04f43cdf89f83ed Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Fri, 5 Aug 2011 09:09:00 -0500 Subject: x86, UV: Remove UV delay in starting slave cpus Delete the 10 msec delay between the INIT and SIPI when starting slave cpus. I can find no requirement for this delay. BIOS also has similar code sequences without the delay. Removing the delay reduces boot time by 40 sec. Every bit helps. Signed-off-by: Jack Steiner Cc: Link: http://lkml.kernel.org/r/20110805140900.GA6774@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index adc66c3a1fef..34b18594e724 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -207,7 +207,6 @@ static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_ri ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | APIC_DM_INIT; uv_write_global_mmr64(pnode, UVH_IPI_INT, val); - mdelay(10); val = (1UL << UVH_IPI_INT_SEND_SHFT) | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | -- cgit v1.2.3 From f3fb5b7bb70d6e679c15fef85707810a067f5fb6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Aug 2011 11:15:30 -0400 Subject: x86: Remove unnecessary compile flag tweaks for vsyscall code As of commit 98d0ac38ca7b1b7a552c9a2359174ff84decb600 Author: Andy Lutomirski Date: Thu Jul 14 06:47:22 2011 -0400 x86-64: Move vread_tsc and vread_hpet into the vDSO user code no longer directly calls into code in arch/x86/kernel/, so we don't need compile flag hacks to make it safe. All vdso code is in the vdso directory now. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/835cd05a4c7740544d09723d6ba48f4406f9826c.1312988155.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/kernel/Makefile | 13 ------------- arch/x86/kernel/vsyscall_64.c | 3 --- 2 files changed, 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 2deef3d2435a..3d1ac3978707 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -17,19 +17,6 @@ CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg endif -# -# vsyscalls (which work on the user stack) should have -# no stack-protector checks: -# -nostackp := $(call cc-option, -fno-stack-protector) -CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) -CFLAGS_hpet.o := $(nostackp) -CFLAGS_paravirt.o := $(nostackp) -GCOV_PROFILE_vsyscall_64.o := n -GCOV_PROFILE_hpet.o := n -GCOV_PROFILE_tsc.o := n -GCOV_PROFILE_paravirt.o := n - obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 93a0d46d9569..bf8e9ffee6e9 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -18,9 +18,6 @@ * use the vDSO. */ -/* Disable profiling for userspace code: */ -#define DISABLE_BRANCH_PROFILING - #include #include #include -- cgit v1.2.3 From fce8dc06423d6fb2709469dc5c55b04e09c1d126 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Aug 2011 11:15:31 -0400 Subject: x86-64: Wire up getcpu syscall getcpu is available as a vdso entry and an emulated vsyscall. Programs that for some reason don't want to use the vdso should still be able to call getcpu without relying on the slow emulated vsyscall. It costs almost nothing to expose it as a real syscall. We also need this for the following patch in vsyscall=native mode. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/6b19f55bdb06a0c32c2fa6dba9b6f222e1fde999.1312988155.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/unistd_64.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 705bf139288c..d92641cc7acc 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -681,6 +681,8 @@ __SYSCALL(__NR_syncfs, sys_syncfs) __SYSCALL(__NR_sendmmsg, sys_sendmmsg) #define __NR_setns 308 __SYSCALL(__NR_setns, sys_setns) +#define __NR_getcpu 309 +__SYSCALL(__NR_getcpu, sys_getcpu) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR -- cgit v1.2.3 From 3ae36655b97a03fa1decf72f04078ef945647c1a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Aug 2011 11:15:32 -0400 Subject: x86-64: Rework vsyscall emulation and add vsyscall= parameter There are three choices: vsyscall=native: Vsyscalls are native code that issues the corresponding syscalls. vsyscall=emulate (default): Vsyscalls are emulated by instruction fault traps, tested in the bad_area path. The actual contents of the vsyscall page is the same as the vsyscall=native case except that it's marked NX. This way programs that make assumptions about what the code in the page does will not be confused when they read that code. vsyscall=none: Trying to execute a vsyscall will segfault. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/8449fb3abf89851fd6b2260972666a6f82542284.1312988155.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- Documentation/kernel-parameters.txt | 21 ++++++++++ arch/x86/include/asm/irq_vectors.h | 4 -- arch/x86/include/asm/traps.h | 2 - arch/x86/include/asm/vsyscall.h | 6 +++ arch/x86/kernel/entry_64.S | 1 - arch/x86/kernel/traps.c | 6 --- arch/x86/kernel/vmlinux.lds.S | 33 ---------------- arch/x86/kernel/vsyscall_64.c | 79 +++++++++++++++++++++++-------------- arch/x86/kernel/vsyscall_emu_64.S | 36 +++++++++++------ arch/x86/mm/fault.c | 12 ++++++ 10 files changed, 111 insertions(+), 89 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index aa47be71df4c..9cfd6bb9198e 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2657,6 +2657,27 @@ bytes respectively. Such letter suffixes can also be entirely omitted. vmpoff= [KNL,S390] Perform z/VM CP command after power off. Format: + vsyscall= [X86-64] + Controls the behavior of vsyscalls (i.e. calls to + fixed addresses of 0xffffffffff600x00 from legacy + code). Most statically-linked binaries and older + versions of glibc use these calls. Because these + functions are at fixed addresses, they make nice + targets for exploits that can control RIP. + + emulate [default] Vsyscalls turn into traps and are + emulated reasonably safely. + + native Vsyscalls are native syscall instructions. + This is a little bit faster than trapping + and makes a few dynamic recompilers work + better than they would in emulation mode. + It also makes exploits much easier to write. + + none Vsyscalls don't work at all. This makes + them quite hard to use for exploits but + might break your system. + vt.cur_default= [VT] Default cursor shape. Format: 0xCCBBAA, where AA, BB, and CC are the same as the parameters of the [?A;B;Cc escape sequence; diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index a563c509edcb..2c224e183b52 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -17,7 +17,6 @@ * Vectors 0 ... 31 : system traps and exceptions - hardcoded events * Vectors 32 ... 127 : device interrupts * Vector 128 : legacy int80 syscall interface - * Vector 204 : legacy x86_64 vsyscall emulation * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device interrupts * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts * @@ -51,9 +50,6 @@ #ifdef CONFIG_X86_32 # define SYSCALL_VECTOR 0x80 #endif -#ifdef CONFIG_X86_64 -# define VSYSCALL_EMU_VECTOR 0xcc -#endif /* * Vectors 0x30-0x3f are used for ISA interrupts. diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 2bae0a513b40..0012d0902c5f 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -40,7 +40,6 @@ asmlinkage void alignment_check(void); asmlinkage void machine_check(void); #endif /* CONFIG_X86_MCE */ asmlinkage void simd_coprocessor_error(void); -asmlinkage void emulate_vsyscall(void); dotraplinkage void do_divide_error(struct pt_regs *, long); dotraplinkage void do_debug(struct pt_regs *, long); @@ -67,7 +66,6 @@ dotraplinkage void do_alignment_check(struct pt_regs *, long); dotraplinkage void do_machine_check(struct pt_regs *, long); #endif dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long); -dotraplinkage void do_emulate_vsyscall(struct pt_regs *, long); #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *, long); #endif diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index 60107072c28b..eaea1d31f753 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -27,6 +27,12 @@ extern struct timezone sys_tz; extern void map_vsyscall(void); +/* + * Called on instruction fetch fault in vsyscall page. + * Returns true if handled. + */ +extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); + #endif /* __KERNEL__ */ #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e949793d6b93..46792d900018 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1123,7 +1123,6 @@ zeroentry spurious_interrupt_bug do_spurious_interrupt_bug zeroentry coprocessor_error do_coprocessor_error errorentry alignment_check do_alignment_check zeroentry simd_coprocessor_error do_simd_coprocessor_error -zeroentry emulate_vsyscall do_emulate_vsyscall /* Reload gs selector with exception handling */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index fbc097a085ca..b9b67166f9de 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -872,12 +872,6 @@ void __init trap_init(void) set_bit(SYSCALL_VECTOR, used_vectors); #endif -#ifdef CONFIG_X86_64 - BUG_ON(test_bit(VSYSCALL_EMU_VECTOR, used_vectors)); - set_system_intr_gate(VSYSCALL_EMU_VECTOR, &emulate_vsyscall); - set_bit(VSYSCALL_EMU_VECTOR, used_vectors); -#endif - /* * Should be a barrier for any external CPU state: */ diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 8f3a265476d7..0f703f10901a 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -71,7 +71,6 @@ PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(6); /* RW_ */ #ifdef CONFIG_X86_64 - user PT_LOAD FLAGS(5); /* R_E */ #ifdef CONFIG_SMP percpu PT_LOAD FLAGS(6); /* RW_ */ #endif @@ -174,38 +173,6 @@ SECTIONS . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); -#define VSYSCALL_ADDR (-10*1024*1024) - -#define VLOAD_OFFSET (VSYSCALL_ADDR - __vsyscall_0 + LOAD_OFFSET) -#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) - -#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) -#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) - - __vsyscall_0 = .; - - . = VSYSCALL_ADDR; - .vsyscall : AT(VLOAD(.vsyscall)) { - /* work around gold bug 13023 */ - __vsyscall_beginning_hack = .; - *(.vsyscall_0) - - . = __vsyscall_beginning_hack + 1024; - *(.vsyscall_1) - - . = __vsyscall_beginning_hack + 2048; - *(.vsyscall_2) - - . = __vsyscall_beginning_hack + 4096; /* Pad the whole page. */ - } :user =0xcc - . = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE); - -#undef VSYSCALL_ADDR -#undef VLOAD_OFFSET -#undef VLOAD -#undef VVIRT_OFFSET -#undef VVIRT - #endif /* CONFIG_X86_64 */ /* Init code and data - will be freed after init */ diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index bf8e9ffee6e9..18ae83dd1cd7 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -56,6 +56,27 @@ DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), }; +static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; + +static int __init vsyscall_setup(char *str) +{ + if (str) { + if (!strcmp("emulate", str)) + vsyscall_mode = EMULATE; + else if (!strcmp("native", str)) + vsyscall_mode = NATIVE; + else if (!strcmp("none", str)) + vsyscall_mode = NONE; + else + return -EINVAL; + + return 0; + } + + return -EINVAL; +} +early_param("vsyscall", vsyscall_setup); + void update_vsyscall_tz(void) { unsigned long flags; @@ -100,7 +121,7 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", level, tsk->comm, task_pid_nr(tsk), - message, regs->ip - 2, regs->cs, + message, regs->ip, regs->cs, regs->sp, regs->ax, regs->si, regs->di); } @@ -118,45 +139,39 @@ static int addr_to_vsyscall_nr(unsigned long addr) return nr; } -void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) +bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) { struct task_struct *tsk; unsigned long caller; int vsyscall_nr; long ret; - local_irq_enable(); + /* + * No point in checking CS -- the only way to get here is a user mode + * trap to a high address, which means that we're in 64-bit user code. + */ - if (!user_64bit_mode(regs)) { - /* - * If we trapped from kernel mode, we might as well OOPS now - * instead of returning to some random address and OOPSing - * then. - */ - BUG_ON(!user_mode(regs)); + WARN_ON_ONCE(address != regs->ip); - /* Compat mode and non-compat 32-bit CS should both segfault. */ - warn_bad_vsyscall(KERN_WARNING, regs, - "illegal int 0xcc from 32-bit mode"); - goto sigsegv; + if (vsyscall_mode == NONE) { + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall attempted with vsyscall=none"); + return false; } - /* - * x86-ism here: regs->ip points to the instruction after the int 0xcc, - * and int 0xcc is two bytes long. - */ - vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2); + vsyscall_nr = addr_to_vsyscall_nr(address); trace_emulate_vsyscall(vsyscall_nr); if (vsyscall_nr < 0) { warn_bad_vsyscall(KERN_WARNING, regs, - "illegal int 0xcc (exploit attempt?)"); + "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); goto sigsegv; } if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { - warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)"); + warn_bad_vsyscall(KERN_WARNING, regs, + "vsyscall with bad stack (exploit attempt?)"); goto sigsegv; } @@ -201,13 +216,11 @@ void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) regs->ip = caller; regs->sp += 8; - local_irq_disable(); - return; + return true; sigsegv: - regs->ip -= 2; /* The faulting instruction should be the int 0xcc. */ force_sig(SIGSEGV, current); - local_irq_disable(); + return true; } /* @@ -255,15 +268,21 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) void __init map_vsyscall(void) { - extern char __vsyscall_0; - unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); + extern char __vsyscall_page; + unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); extern char __vvar_page; unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page); - /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */ - __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); + __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall, + vsyscall_mode == NATIVE + ? PAGE_KERNEL_VSYSCALL + : PAGE_KERNEL_VVAR); + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) != + (unsigned long)VSYSCALL_START); + __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR); - BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != (unsigned long)VVAR_ADDRESS); + BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != + (unsigned long)VVAR_ADDRESS); } static int __init vsyscall_init(void) diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S index ffa845eae5ca..c9596a9af159 100644 --- a/arch/x86/kernel/vsyscall_emu_64.S +++ b/arch/x86/kernel/vsyscall_emu_64.S @@ -7,21 +7,31 @@ */ #include + #include +#include +#include + +__PAGE_ALIGNED_DATA + .globl __vsyscall_page + .balign PAGE_SIZE, 0xcc + .type __vsyscall_page, @object +__vsyscall_page: + + mov $__NR_gettimeofday, %rax + syscall + ret -/* The unused parts of the page are filled with 0xcc by the linker script. */ + .balign 1024, 0xcc + mov $__NR_time, %rax + syscall + ret -.section .vsyscall_0, "a" -ENTRY(vsyscall_0) - int $VSYSCALL_EMU_VECTOR -END(vsyscall_0) + .balign 1024, 0xcc + mov $__NR_getcpu, %rax + syscall + ret -.section .vsyscall_1, "a" -ENTRY(vsyscall_1) - int $VSYSCALL_EMU_VECTOR -END(vsyscall_1) + .balign 4096, 0xcc -.section .vsyscall_2, "a" -ENTRY(vsyscall_2) - int $VSYSCALL_EMU_VECTOR -END(vsyscall_2) + .size __vsyscall_page, 4096 diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c1d018238f32..e58935c25b94 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -720,6 +720,18 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (is_errata100(regs, address)) return; +#ifdef CONFIG_X86_64 + /* + * Instruction fetch faults in the vsyscall page might need + * emulation. + */ + if (unlikely((error_code & PF_INSTR) && + ((address & ~0xfff) == VSYSCALL_START))) { + if (emulate_vsyscall(regs, address)) + return; + } +#endif + if (unlikely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); -- cgit v1.2.3 From cedf03bd9aa54d1d7a9065dddc9e76505f476b12 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 15 Aug 2011 10:18:46 -0700 Subject: x86: fix mm/fault.c build arch/x86/mm/fault.c needs to include asm/vsyscall.h to fix a build error: arch/x86/mm/fault.c: In function '__bad_area_nosemaphore': arch/x86/mm/fault.c:728: error: 'VSYSCALL_START' undeclared (first use in this function) Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- arch/x86/mm/fault.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 247aae3dc008..0d17c8c50acd 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -17,6 +17,7 @@ #include /* dotraplinkage, ... */ #include /* pgd_*(), ... */ #include /* kmemcheck_*(), ... */ +#include /* * Page fault error code bits: -- cgit v1.2.3 From df3d8ae1f8780166a16dd7d08b4842a4d5b5f2b4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 2 Aug 2011 12:54:31 -0700 Subject: KVM: uses TASKSTATS, depends on NET CONFIG_TASKSTATS just had a change to use netlink, including a change to "depends on NET". Since "select" does not follow dependencies, KVM also needs to depend on NET to prevent build errors when CONFIG_NET is not enabled. Sample of the reported "undefined reference" build errors: taskstats.c:(.text+0x8f686): undefined reference to `nla_put' taskstats.c:(.text+0x8f721): undefined reference to `nla_reserve' taskstats.c:(.text+0x8f8fb): undefined reference to `init_net' taskstats.c:(.text+0x8f905): undefined reference to `netlink_unicast' taskstats.c:(.text+0x8f934): undefined reference to `kfree_skb' taskstats.c:(.text+0x8f9e9): undefined reference to `skb_clone' taskstats.c:(.text+0x90060): undefined reference to `__alloc_skb' taskstats.c:(.text+0x901e9): undefined reference to `skb_put' taskstats.c:(.init.text+0x4665): undefined reference to `genl_register_family' taskstats.c:(.init.text+0x4699): undefined reference to `genl_register_ops' taskstats.c:(.init.text+0x4710): undefined reference to `genl_unregister_ops' taskstats.c:(.init.text+0x471c): undefined reference to `genl_unregister_family' Signed-off-by: Randy Dunlap Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/Kconfig | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 0a09b58bb1cb..ff5790d8e990 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -22,6 +22,8 @@ config KVM depends on HAVE_KVM # for device assignment: depends on PCI + # for TASKSTATS/TASK_DELAY_ACCT: + depends on NET select PREEMPT_NOTIFIERS select MMU_NOTIFIER select ANON_INODES -- cgit v1.2.3 From ccbcdf7cf1b5f6c6db30d84095b9c6c53043af55 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 16 Aug 2011 15:07:41 +0100 Subject: xen/x86: replace order-based range checking of M2P table by linear one The order-based approach is not only less efficient (requiring a shift and a compare, typical generated code looking like this mov eax, [machine_to_phys_order] mov ecx, eax shr ebx, cl test ebx, ebx jnz ... whereas a direct check requires just a compare, like in cmp ebx, [machine_to_phys_nr] jae ... ), but also slightly dangerous in the 32-on-64 case - the element address calculation can wrap if the next power of two boundary is sufficiently far away from the actual upper limit of the table, and hence can result in user space addresses being accessed (with it being unknown what may actually be mapped there). Additionally, the elimination of the mistaken use of fls() here (should have been __fls()) fixes a latent issue on x86-64 that would trigger if the code was run on a system with memory extending beyond the 44-bit boundary. CC: stable@kernel.org Signed-off-by: Jan Beulich [v1: Based on Jeremy's feedback] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/page.h | 4 ++-- arch/x86/xen/enlighten.c | 4 ++-- arch/x86/xen/mmu.c | 12 ++++++++---- 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 64a619d47d34..7ff4669580cf 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -39,7 +39,7 @@ typedef struct xpaddr { ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) extern unsigned long *machine_to_phys_mapping; -extern unsigned int machine_to_phys_order; +extern unsigned long machine_to_phys_nr; extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); @@ -87,7 +87,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) if (xen_feature(XENFEAT_auto_translated_physmap)) return mfn; - if (unlikely((mfn >> machine_to_phys_order) != 0)) { + if (unlikely(mfn >= machine_to_phys_nr)) { pfn = ~0; goto try_override; } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 974a528458a0..b960429d5b65 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -77,8 +77,8 @@ EXPORT_SYMBOL_GPL(xen_domain_type); unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START; EXPORT_SYMBOL(machine_to_phys_mapping); -unsigned int machine_to_phys_order; -EXPORT_SYMBOL(machine_to_phys_order); +unsigned long machine_to_phys_nr; +EXPORT_SYMBOL(machine_to_phys_nr); struct start_info *xen_start_info; EXPORT_SYMBOL_GPL(xen_start_info); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index f987bde77c49..24abc1f50dc5 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1713,15 +1713,19 @@ static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) void __init xen_setup_machphys_mapping(void) { struct xen_machphys_mapping mapping; - unsigned long machine_to_phys_nr_ents; if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { machine_to_phys_mapping = (unsigned long *)mapping.v_start; - machine_to_phys_nr_ents = mapping.max_mfn + 1; + machine_to_phys_nr = mapping.max_mfn + 1; } else { - machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES; + machine_to_phys_nr = MACH2PHYS_NR_ENTRIES; } - machine_to_phys_order = fls(machine_to_phys_nr_ents - 1); +#ifdef CONFIG_X86_32 + if ((machine_to_phys_mapping + machine_to_phys_nr) + < machine_to_phys_mapping) + machine_to_phys_nr = (unsigned long *)NULL + - machine_to_phys_mapping; +#endif } #ifdef CONFIG_X86_64 -- cgit v1.2.3 From 3c05c4bed4ccce3f22f6d7899b308faae24ad198 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Wed, 17 Aug 2011 15:15:00 +0200 Subject: xen: Do not enable PV IPIs when vector callback not present Fix regression for HVM case on older (<4.1.1) hypervisors caused by commit 99bbb3a84a99cd04ab16b998b20f01a72cfa9f4f Author: Stefano Stabellini Date: Thu Dec 2 17:55:10 2010 +0000 xen: PV on HVM: support PV spinlocks and IPIs This change replaced the SMP operations with event based handlers without taking into account that this only works when the hypervisor supports callback vectors. This causes unexplainable hangs early on boot for HVM guests with more than one CPU. BugLink: http://bugs.launchpad.net/bugs/791850 CC: stable@kernel.org Signed-off-by: Stefan Bader Signed-off-by: Stefano Stabellini Tested-and-Reported-by: Stefan Bader Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/smp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index b4533a86d7e4..e79dbb95482b 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -521,8 +521,6 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) native_smp_prepare_cpus(max_cpus); WARN_ON(xen_smp_intr_init(0)); - if (!xen_have_vector_callback) - return; xen_init_lock_cpu(0); xen_init_spinlocks(); } @@ -546,6 +544,8 @@ static void xen_hvm_cpu_die(unsigned int cpu) void __init xen_hvm_smp_init(void) { + if (!xen_have_vector_callback) + return; smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; smp_ops.smp_send_reschedule = xen_smp_send_reschedule; smp_ops.cpu_up = xen_hvm_cpu_up; -- cgit v1.2.3 From 60c5f08e154fd235056645e050f2cd5671b19125 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 11 Aug 2011 13:17:20 -0700 Subject: xen/tracing: Fix tracing config option properly Steven Rostedt says we should use CONFIG_EVENT_TRACING. Cc:Steven Rostedt Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 3326204e251f..add2c2d729ce 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -15,7 +15,7 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ grant-table.o suspend.o platform-pci-unplug.o \ p2m.o -obj-$(CONFIG_FTRACE) += trace.o +obj-$(CONFIG_EVENT_TRACING) += trace.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o -- cgit v1.2.3 From 7ca0758cdb7c241cb4e0490a8d95f0eb5b861daf Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 22 Aug 2011 13:27:06 -0700 Subject: x86-32, vdso: On system call restart after SYSENTER, use int $0x80 When we enter a 32-bit system call via SYSENTER or SYSCALL, we shuffle the arguments to match the int $0x80 calling convention. This was probably a design mistake, but it's what it is now. This causes errors if the system call as to be restarted. For SYSENTER, we have to invoke the instruction from the vdso as the return address is hardcoded. Accordingly, we can simply replace the jump in the vdso with an int $0x80 instruction and use the slower entry point for a post-restart. Suggested-by: Linus Torvalds Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/CA%2B55aFztZ=r5wa0x26KJQxvZOaQq8s2v3u50wCyJcA-Sc4g8gQ@mail.gmail.com Cc: --- arch/x86/vdso/vdso32/sysenter.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vdso32/sysenter.S b/arch/x86/vdso/vdso32/sysenter.S index e2800affa754..e354bceee0e0 100644 --- a/arch/x86/vdso/vdso32/sysenter.S +++ b/arch/x86/vdso/vdso32/sysenter.S @@ -43,7 +43,7 @@ __kernel_vsyscall: .space 7,0x90 /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ - jmp .Lenter_kernel + int $0x80 /* 16: System call normal return point is here! */ VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */ pop %ebp -- cgit v1.2.3 From f1c39625d63c9f8eba8f036429c10a9cb9e32920 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 24 Aug 2011 09:54:24 -0700 Subject: xen: use non-tracing preempt in xen_clocksource_read() The tracing code used sched_clock() to get tracing timestamps, which ends up calling xen_clocksource_read(). xen_clocksource_read() must disable preemption, but if preemption tracing is enabled, this results in infinite recursion. I've only noticed this when boot-time tracing tests are enabled, but it seems like a generic bug. It looks like it would also affect kvm_clocksource_read(). Reported-by: Konrad Rzeszutek Wilk Signed-off-by: Jeremy Fitzhardinge Cc: Avi Kivity Cc: Marcelo Tosatti --- arch/x86/xen/time.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 5158c505bef9..163b4679556e 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -168,9 +168,10 @@ cycle_t xen_clocksource_read(void) struct pvclock_vcpu_time_info *src; cycle_t ret; - src = &get_cpu_var(xen_vcpu)->time; + preempt_disable_notrace(); + src = &__get_cpu_var(xen_vcpu)->time; ret = pvclock_clocksource_read(src); - put_cpu_var(xen_vcpu); + preempt_enable_notrace(); return ret; } -- cgit v1.2.3 From cbbfa38fcb95930babc5233cf6927ec430f38abc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 25 Aug 2011 19:46:56 +0200 Subject: mtrr: fix UP breakage caused during switch to stop_machine While removing custom rendezvous code and switching to stop_machine, commit 192d8857427d ("x86, mtrr: use stop_machine APIs for doing MTRR rendezvous") completely dropped mtrr setting code on !CONFIG_SMP breaking MTRR settting on UP. Fix it by removing the incorrect CONFIG_SMP. Signed-off-by: Tejun Heo Reported-by: Anders Eriksson Tested-and-acked-by: Suresh Siddha Acked-by: H. Peter Anvin Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/mtrr/main.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 08119a37e53c..6b96110bb0c3 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -149,7 +149,6 @@ struct set_mtrr_data { */ static int mtrr_rendezvous_handler(void *info) { -#ifdef CONFIG_SMP struct set_mtrr_data *data = info; /* @@ -171,7 +170,6 @@ static int mtrr_rendezvous_handler(void *info) } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) { mtrr_if->set_all(); } -#endif return 0; } -- cgit v1.2.3 From b4ca46e4e82a0a5976fe5eab85be585d75f8202f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Aug 2011 16:10:33 -0400 Subject: x86-32: Fix boot with CONFIG_X86_INVD_BUG entry_32.S contained a hardcoded alternative instruction entry, and the format changed in commit 59e97e4d6fbc ("x86: Make alternative instruction pointers relative"). Replace the hardcoded entry with the altinstruction_entry macro. This fixes the 32-bit boot with CONFIG_X86_INVD_BUG=y. Reported-and-tested-by: Arnaud Lacombe Signed-off-by: Andy Lutomirski Cc: Peter Anvin Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- arch/x86/kernel/entry_32.S | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 5c1a91974918..f3f6f5344001 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -54,6 +54,7 @@ #include #include #include +#include /* Avoid __ASSEMBLER__'ifying just for this. */ #include @@ -873,12 +874,7 @@ ENTRY(simd_coprocessor_error) 661: pushl_cfi $do_general_protection 662: .section .altinstructions,"a" - .balign 4 - .long 661b - .long 663f - .word X86_FEATURE_XMM - .byte 662b-661b - .byte 664f-663f + altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f .previous .section .altinstr_replacement,"ax" 663: pushl $do_simd_coprocessor_error -- cgit v1.2.3 From a94cc4e6c0a26a7c8f79a432ab2c89534aa674d5 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 26 Aug 2011 12:20:59 +0100 Subject: sfi: table irq 0xFF means 'no interrupt' According to the SFI specification irq number 0xFF means device has no interrupt or interrupt attached via GPIO. Currently, we don't handle this special case and set irq field in *_board_info structs to 255. It leads to confusion in some drivers. Accelerometer driver tries to register interrupt 255, fails and prints "Cannot get IRQ" to dmesg. Signed-off-by: Kirill A. Shutemov Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- arch/x86/platform/mrst/mrst.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 7000e74b3087..58425adc22c6 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -689,7 +689,9 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) irq_attr.trigger = 1; irq_attr.polarity = 1; io_apic_set_pci_routing(NULL, pentry->irq, &irq_attr); - } + } else + pentry->irq = 0; /* No irq */ + switch (pentry->type) { case SFI_DEV_TYPE_IPC: /* ID as IRQ is a hack that will go away */ -- cgit v1.2.3 From f5b940997397229975ea073679b03967932a541b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 26 Aug 2011 18:03:11 -0400 Subject: All Arch: remove linkage for sys_nfsservctl system call The nfsservctl system call is now gone, so we should remove all linkage for it. Signed-off-by: NeilBrown Signed-off-by: J. Bruce Fields Signed-off-by: Linus Torvalds --- arch/alpha/kernel/systbls.S | 2 +- arch/arm/kernel/calls.S | 2 +- arch/avr32/kernel/syscall_table.S | 2 +- arch/blackfin/mach-common/entry.S | 2 +- arch/cris/arch-v10/kernel/entry.S | 2 +- arch/cris/arch-v32/kernel/entry.S | 2 +- arch/frv/kernel/entry.S | 2 +- arch/h8300/kernel/syscalls.S | 2 +- arch/ia64/kernel/entry.S | 2 +- arch/m32r/kernel/syscall_table.S | 2 +- arch/m68k/kernel/syscalltable.S | 2 +- arch/microblaze/kernel/syscall_table.S | 2 +- arch/mips/kernel/scall32-o32.S | 2 +- arch/mips/kernel/scall64-64.S | 2 +- arch/mips/kernel/scall64-n32.S | 2 +- arch/mips/kernel/scall64-o32.S | 2 +- arch/mn10300/kernel/entry.S | 2 +- arch/s390/kernel/compat_wrapper.S | 6 ------ arch/s390/kernel/syscalls.S | 2 +- arch/sh/kernel/syscalls_32.S | 2 +- arch/sh/kernel/syscalls_64.S | 2 +- arch/sparc/kernel/sys32.S | 1 - arch/sparc/kernel/systbls_32.S | 2 +- arch/sparc/kernel/systbls_64.S | 2 +- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/include/asm/unistd_64.h | 2 +- arch/x86/kernel/syscall_table_32.S | 2 +- arch/xtensa/include/asm/unistd.h | 2 +- fs/compat.c | 5 ----- include/asm-generic/unistd.h | 2 +- include/linux/compat.h | 1 - include/linux/syscalls.h | 3 --- kernel/sys_ni.c | 1 - 33 files changed, 27 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/systbls.S b/arch/alpha/kernel/systbls.S index b9c28f3f1956..6acea1f96de3 100644 --- a/arch/alpha/kernel/systbls.S +++ b/arch/alpha/kernel/systbls.S @@ -360,7 +360,7 @@ sys_call_table: .quad sys_newuname .quad sys_nanosleep /* 340 */ .quad sys_mremap - .quad sys_nfsservctl + .quad sys_ni_syscall /* old nfsservctl */ .quad sys_setresuid .quad sys_getresuid .quad sys_pciconfig_read /* 345 */ diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S index 80f7896cc016..9943e9e74a1b 100644 --- a/arch/arm/kernel/calls.S +++ b/arch/arm/kernel/calls.S @@ -178,7 +178,7 @@ CALL(sys_ni_syscall) /* vm86 */ CALL(sys_ni_syscall) /* was sys_query_module */ CALL(sys_poll) - CALL(sys_nfsservctl) + CALL(sys_ni_syscall) /* was nfsservctl */ /* 170 */ CALL(sys_setresgid16) CALL(sys_getresgid16) CALL(sys_prctl) diff --git a/arch/avr32/kernel/syscall_table.S b/arch/avr32/kernel/syscall_table.S index c7fd394d28a4..6eba53530d1c 100644 --- a/arch/avr32/kernel/syscall_table.S +++ b/arch/avr32/kernel/syscall_table.S @@ -158,7 +158,7 @@ sys_call_table: .long sys_sched_rr_get_interval .long sys_nanosleep .long sys_poll - .long sys_nfsservctl /* 145 */ + .long sys_ni_syscall /* 145 was nfsservctl */ .long sys_setresgid .long sys_getresgid .long sys_prctl diff --git a/arch/blackfin/mach-common/entry.S b/arch/blackfin/mach-common/entry.S index 225d311c9701..e4137297b790 100644 --- a/arch/blackfin/mach-common/entry.S +++ b/arch/blackfin/mach-common/entry.S @@ -1543,7 +1543,7 @@ ENTRY(_sys_call_table) .long _sys_ni_syscall /* for vm86 */ .long _sys_ni_syscall /* old "query_module" */ .long _sys_ni_syscall /* sys_poll */ - .long _sys_nfsservctl + .long _sys_ni_syscall /* old nfsservctl */ .long _sys_setresgid /* setresgid16 */ /* 170 */ .long _sys_getresgid /* getresgid16 */ .long _sys_prctl diff --git a/arch/cris/arch-v10/kernel/entry.S b/arch/cris/arch-v10/kernel/entry.S index 1161883eb582..592fbe9dfb62 100644 --- a/arch/cris/arch-v10/kernel/entry.S +++ b/arch/cris/arch-v10/kernel/entry.S @@ -771,7 +771,7 @@ sys_call_table: .long sys_ni_syscall /* sys_vm86 */ .long sys_ni_syscall /* Old sys_query_module */ .long sys_poll - .long sys_nfsservctl + .long sys_ni_syscall /* old nfsservctl */ .long sys_setresgid16 /* 170 */ .long sys_getresgid16 .long sys_prctl diff --git a/arch/cris/arch-v32/kernel/entry.S b/arch/cris/arch-v32/kernel/entry.S index 84fed7e91ada..c3ea4694fbaf 100644 --- a/arch/cris/arch-v32/kernel/entry.S +++ b/arch/cris/arch-v32/kernel/entry.S @@ -714,7 +714,7 @@ sys_call_table: .long sys_ni_syscall /* sys_vm86 */ .long sys_ni_syscall /* Old sys_query_module */ .long sys_poll - .long sys_nfsservctl + .long sys_ni_syscall /* Old nfsservctl */ .long sys_setresgid16 /* 170 */ .long sys_getresgid16 .long sys_prctl diff --git a/arch/frv/kernel/entry.S b/arch/frv/kernel/entry.S index 017d6d7b784f..5ba23f715ea5 100644 --- a/arch/frv/kernel/entry.S +++ b/arch/frv/kernel/entry.S @@ -1358,7 +1358,7 @@ sys_call_table: .long sys_ni_syscall /* for vm86 */ .long sys_ni_syscall /* Old sys_query_module */ .long sys_poll - .long sys_nfsservctl + .long sys_ni_syscall /* Old nfsservctl */ .long sys_setresgid16 /* 170 */ .long sys_getresgid16 .long sys_prctl diff --git a/arch/h8300/kernel/syscalls.S b/arch/h8300/kernel/syscalls.S index f4b2e67bcc34..4be2ea2fbe26 100644 --- a/arch/h8300/kernel/syscalls.S +++ b/arch/h8300/kernel/syscalls.S @@ -183,7 +183,7 @@ SYMBOL_NAME_LABEL(sys_call_table) .long SYMBOL_NAME(sys_ni_syscall) /* for vm86 */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_query_module */ .long SYMBOL_NAME(sys_poll) - .long SYMBOL_NAME(sys_nfsservctl) + .long SYMBOL_NAME(sys_ni_syscall) /* old nfsservctl */ .long SYMBOL_NAME(sys_setresgid16) /* 170 */ .long SYMBOL_NAME(sys_getresgid16) .long SYMBOL_NAME(sys_prctl) diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index 97dd2abdeb1a..198c753d1006 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1614,7 +1614,7 @@ sys_call_table: data8 sys_sched_get_priority_min data8 sys_sched_rr_get_interval data8 sys_nanosleep - data8 sys_nfsservctl + data8 sys_ni_syscall // old nfsservctl data8 sys_prctl // 1170 data8 sys_getpagesize data8 sys_mmap2 diff --git a/arch/m32r/kernel/syscall_table.S b/arch/m32r/kernel/syscall_table.S index 528f2e6ad064..f365c19795ef 100644 --- a/arch/m32r/kernel/syscall_table.S +++ b/arch/m32r/kernel/syscall_table.S @@ -168,7 +168,7 @@ ENTRY(sys_call_table) .long sys_tas /* vm86 syscall holder */ .long sys_ni_syscall /* query_module syscall holder */ .long sys_poll - .long sys_nfsservctl + .long sys_ni_syscall /* was nfsservctl */ .long sys_setresgid /* 170 */ .long sys_getresgid .long sys_prctl diff --git a/arch/m68k/kernel/syscalltable.S b/arch/m68k/kernel/syscalltable.S index 00d1452f9571..c468f2edaa85 100644 --- a/arch/m68k/kernel/syscalltable.S +++ b/arch/m68k/kernel/syscalltable.S @@ -189,7 +189,7 @@ ENTRY(sys_call_table) .long sys_getpagesize .long sys_ni_syscall /* old "query_module" */ .long sys_poll - .long sys_nfsservctl + .long sys_ni_syscall /* old nfsservctl */ .long sys_setresgid16 /* 170 */ .long sys_getresgid16 .long sys_prctl diff --git a/arch/microblaze/kernel/syscall_table.S b/arch/microblaze/kernel/syscall_table.S index d915a122c865..8789daa2a346 100644 --- a/arch/microblaze/kernel/syscall_table.S +++ b/arch/microblaze/kernel/syscall_table.S @@ -173,7 +173,7 @@ ENTRY(sys_call_table) .long sys_ni_syscall /* sys_vm86 */ .long sys_ni_syscall /* Old sys_query_module */ .long sys_poll - .long sys_nfsservctl + .long sys_ni_syscall /* old nfsservctl */ .long sys_setresgid /* 170 */ .long sys_getresgid .long sys_prctl diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index e521420a45a5..865bc7a6f5a1 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -424,7 +424,7 @@ einval: li v0, -ENOSYS sys sys_getresuid 3 sys sys_ni_syscall 0 /* was sys_query_module */ sys sys_poll 3 - sys sys_nfsservctl 3 + sys sys_ni_syscall 0 /* was nfsservctl */ sys sys_setresgid 3 /* 4190 */ sys sys_getresgid 3 sys sys_prctl 5 diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S index 85874d6a8a70..fb7334bea731 100644 --- a/arch/mips/kernel/scall64-64.S +++ b/arch/mips/kernel/scall64-64.S @@ -299,7 +299,7 @@ sys_call_table: PTR sys_ni_syscall /* 5170, was get_kernel_syms */ PTR sys_ni_syscall /* was query_module */ PTR sys_quotactl - PTR sys_nfsservctl + PTR sys_ni_syscall /* was nfsservctl */ PTR sys_ni_syscall /* res. for getpmsg */ PTR sys_ni_syscall /* 5175 for putpmsg */ PTR sys_ni_syscall /* res. for afs_syscall */ diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index b85842fc87ae..f9296e894e46 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -294,7 +294,7 @@ EXPORT(sysn32_call_table) PTR sys_ni_syscall /* 6170, was get_kernel_syms */ PTR sys_ni_syscall /* was query_module */ PTR sys_quotactl - PTR compat_sys_nfsservctl + PTR sys_ni_syscall /* was nfsservctl */ PTR sys_ni_syscall /* res. for getpmsg */ PTR sys_ni_syscall /* 6175 for putpmsg */ PTR sys_ni_syscall /* res. for afs_syscall */ diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index 46c4763edf21..4d7c9827706f 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -392,7 +392,7 @@ sys_call_table: PTR sys_getresuid PTR sys_ni_syscall /* was query_module */ PTR sys_poll - PTR compat_sys_nfsservctl + PTR sys_ni_syscall /* was nfsservctl */ PTR sys_setresgid /* 4190 */ PTR sys_getresgid PTR sys_prctl diff --git a/arch/mn10300/kernel/entry.S b/arch/mn10300/kernel/entry.S index ae435e1d5669..3e3620d9fc45 100644 --- a/arch/mn10300/kernel/entry.S +++ b/arch/mn10300/kernel/entry.S @@ -589,7 +589,7 @@ ENTRY(sys_call_table) .long sys_ni_syscall /* vm86 */ .long sys_ni_syscall /* Old sys_query_module */ .long sys_poll - .long sys_nfsservctl + .long sys_ni_syscall /* was nfsservctl */ .long sys_setresgid16 /* 170 */ .long sys_getresgid16 .long sys_prctl diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S index 08ab9aa6a0d5..7526db6bf501 100644 --- a/arch/s390/kernel/compat_wrapper.S +++ b/arch/s390/kernel/compat_wrapper.S @@ -665,12 +665,6 @@ ENTRY(sys32_poll_wrapper) lgfr %r4,%r4 # long jg sys_poll # branch to system call -ENTRY(compat_sys_nfsservctl_wrapper) - lgfr %r2,%r2 # int - llgtr %r3,%r3 # struct compat_nfsctl_arg* - llgtr %r4,%r4 # union compat_nfsctl_res* - jg compat_sys_nfsservctl # branch to system call - ENTRY(sys32_setresgid16_wrapper) llgfr %r2,%r2 # __kernel_old_gid_emu31_t llgfr %r3,%r3 # __kernel_old_gid_emu31_t diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index 6ee39ef8fe4a..73eb08c874fb 100644 --- a/arch/s390/kernel/syscalls.S +++ b/arch/s390/kernel/syscalls.S @@ -177,7 +177,7 @@ SYSCALL(sys_getresuid16,sys_ni_syscall,sys32_getresuid16_wrapper) /* 165 old get NI_SYSCALL /* for vm86 */ NI_SYSCALL /* old sys_query_module */ SYSCALL(sys_poll,sys_poll,sys32_poll_wrapper) -SYSCALL(sys_nfsservctl,sys_nfsservctl,compat_sys_nfsservctl_wrapper) +NI_SYSCALL /* old nfsservctl */ SYSCALL(sys_setresgid16,sys_ni_syscall,sys32_setresgid16_wrapper) /* 170 old setresgid16 syscall */ SYSCALL(sys_getresgid16,sys_ni_syscall,sys32_getresgid16_wrapper) /* old getresgid16 syscall */ SYSCALL(sys_prctl,sys_prctl,sys32_prctl_wrapper) diff --git a/arch/sh/kernel/syscalls_32.S b/arch/sh/kernel/syscalls_32.S index 39b051de4c7c..293e39c59c00 100644 --- a/arch/sh/kernel/syscalls_32.S +++ b/arch/sh/kernel/syscalls_32.S @@ -185,7 +185,7 @@ ENTRY(sys_call_table) .long sys_ni_syscall /* vm86 */ .long sys_ni_syscall /* old "query_module" */ .long sys_poll - .long sys_nfsservctl + .long sys_ni_syscall /* was nfsservctl */ .long sys_setresgid16 /* 170 */ .long sys_getresgid16 .long sys_prctl diff --git a/arch/sh/kernel/syscalls_64.S b/arch/sh/kernel/syscalls_64.S index 089c4d825d08..ceb34b94afa9 100644 --- a/arch/sh/kernel/syscalls_64.S +++ b/arch/sh/kernel/syscalls_64.S @@ -189,7 +189,7 @@ sys_call_table: .long sys_ni_syscall /* vm86 */ .long sys_ni_syscall /* old "query_module" */ .long sys_poll - .long sys_nfsservctl + .long sys_ni_syscall /* was nfsservctl */ .long sys_setresgid16 /* 170 */ .long sys_getresgid16 .long sys_prctl diff --git a/arch/sparc/kernel/sys32.S b/arch/sparc/kernel/sys32.S index 44e5faf1ad5f..d97f3eb72e06 100644 --- a/arch/sparc/kernel/sys32.S +++ b/arch/sparc/kernel/sys32.S @@ -81,7 +81,6 @@ SIGN2(sys32_fadvise64, compat_sys_fadvise64, %o0, %o4) SIGN2(sys32_fadvise64_64, compat_sys_fadvise64_64, %o0, %o5) SIGN2(sys32_bdflush, sys_bdflush, %o0, %o1) SIGN1(sys32_mlockall, sys_mlockall, %o0) -SIGN1(sys32_nfsservctl, compat_sys_nfsservctl, %o0) SIGN1(sys32_clock_nanosleep, compat_sys_clock_nanosleep, %o1) SIGN1(sys32_timer_settime, compat_sys_timer_settime, %o1) SIGN1(sys32_io_submit, compat_sys_io_submit, %o1) diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S index 6e492d59f6b1..09d8ec454450 100644 --- a/arch/sparc/kernel/systbls_32.S +++ b/arch/sparc/kernel/systbls_32.S @@ -67,7 +67,7 @@ sys_call_table: /*235*/ .long sys_fstatfs64, sys_llseek, sys_mlock, sys_munlock, sys_mlockall /*240*/ .long sys_munlockall, sys_sched_setparam, sys_sched_getparam, sys_sched_setscheduler, sys_sched_getscheduler /*245*/ .long sys_sched_yield, sys_sched_get_priority_max, sys_sched_get_priority_min, sys_sched_rr_get_interval, sys_nanosleep -/*250*/ .long sys_mremap, sys_sysctl, sys_getsid, sys_fdatasync, sys_nfsservctl +/*250*/ .long sys_mremap, sys_sysctl, sys_getsid, sys_fdatasync, sys_ni_syscall /*255*/ .long sys_sync_file_range, sys_clock_settime, sys_clock_gettime, sys_clock_getres, sys_clock_nanosleep /*260*/ .long sys_sched_getaffinity, sys_sched_setaffinity, sys_timer_settime, sys_timer_gettime, sys_timer_getoverrun /*265*/ .long sys_timer_delete, sys_timer_create, sys_nis_syscall, sys_io_setup, sys_io_destroy diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S index f566518483b5..c9296ab0b1f4 100644 --- a/arch/sparc/kernel/systbls_64.S +++ b/arch/sparc/kernel/systbls_64.S @@ -145,7 +145,7 @@ sys_call_table: .word sys_fstatfs64, sys_llseek, sys_mlock, sys_munlock, sys_mlockall /*240*/ .word sys_munlockall, sys_sched_setparam, sys_sched_getparam, sys_sched_setscheduler, sys_sched_getscheduler .word sys_sched_yield, sys_sched_get_priority_max, sys_sched_get_priority_min, sys_sched_rr_get_interval, sys_nanosleep -/*250*/ .word sys_64_mremap, sys_sysctl, sys_getsid, sys_fdatasync, sys_nfsservctl +/*250*/ .word sys_64_mremap, sys_sysctl, sys_getsid, sys_fdatasync, sys_nis_syscall .word sys_sync_file_range, sys_clock_settime, sys_clock_gettime, sys_clock_getres, sys_clock_nanosleep /*260*/ .word sys_sched_getaffinity, sys_sched_setaffinity, sys_timer_settime, sys_timer_gettime, sys_timer_getoverrun .word sys_timer_delete, sys_timer_create, sys_ni_syscall, sys_io_setup, sys_io_destroy diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index a0e866d233ee..54edb207ff3a 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -672,7 +672,7 @@ ia32_sys_call_table: .quad sys32_vm86_warning /* vm86 */ .quad quiet_ni_syscall /* query_module */ .quad sys_poll - .quad compat_sys_nfsservctl + .quad quiet_ni_syscall /* old nfsservctl */ .quad sys_setresgid16 /* 170 */ .quad sys_getresgid16 .quad sys_prctl diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index d92641cc7acc..201040573444 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -414,7 +414,7 @@ __SYSCALL(__NR_query_module, sys_ni_syscall) __SYSCALL(__NR_quotactl, sys_quotactl) #define __NR_nfsservctl 180 -__SYSCALL(__NR_nfsservctl, sys_nfsservctl) +__SYSCALL(__NR_nfsservctl, sys_ni_syscall) /* reserved for LiS/STREAMS */ #define __NR_getpmsg 181 diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index fbb0a045a1a2..bc19be332bc9 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -168,7 +168,7 @@ ENTRY(sys_call_table) .long ptregs_vm86 .long sys_ni_syscall /* Old sys_query_module */ .long sys_poll - .long sys_nfsservctl + .long sys_ni_syscall /* Old nfsservctl */ .long sys_setresgid16 /* 170 */ .long sys_getresgid16 .long sys_prctl diff --git a/arch/xtensa/include/asm/unistd.h b/arch/xtensa/include/asm/unistd.h index a6f934f37f1a..798ee6d285a1 100644 --- a/arch/xtensa/include/asm/unistd.h +++ b/arch/xtensa/include/asm/unistd.h @@ -455,7 +455,7 @@ __SYSCALL(203, sys_reboot, 3) #define __NR_quotactl 204 __SYSCALL(204, sys_quotactl, 4) #define __NR_nfsservctl 205 -__SYSCALL(205, sys_nfsservctl, 3) +__SYSCALL(205, sys_ni_syscall, 0) #define __NR__sysctl 206 __SYSCALL(206, sys_sysctl, 1) #define __NR_bdflush 207 diff --git a/fs/compat.c b/fs/compat.c index 0b48d018e38a..58b1da459893 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1675,11 +1675,6 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, } #endif /* HAVE_SET_RESTORE_SIGMASK */ -long asmlinkage compat_sys_nfsservctl(int cmd, void *notused, void *notused2) -{ - return sys_ni_syscall(); -} - #ifdef CONFIG_EPOLL #ifdef HAVE_SET_RESTORE_SIGMASK diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h index 4f76959397fa..f4c38d8c6674 100644 --- a/include/asm-generic/unistd.h +++ b/include/asm-generic/unistd.h @@ -143,7 +143,7 @@ __SYSCALL(__NR_pivot_root, sys_pivot_root) /* fs/nfsctl.c */ #define __NR_nfsservctl 42 -__SC_COMP(__NR_nfsservctl, sys_nfsservctl, compat_sys_nfsservctl) +__SYSCALL(__NR_nfsservctl, sys_ni_syscall) /* fs/open.c */ #define __NR3264_statfs 43 diff --git a/include/linux/compat.h b/include/linux/compat.h index 8779405e15a8..c6e7523bf765 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -438,7 +438,6 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, struct compat_timespec __user *tsp, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize); -asmlinkage long compat_sys_nfsservctl(int cmd, void *notused, void *notused2); asmlinkage long compat_sys_signalfd4(int ufd, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize, int flags); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 8c03b98df5f9..1ff0ec2a5e8d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -702,9 +702,6 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args); asmlinkage long sys_sysinfo(struct sysinfo __user *info); asmlinkage long sys_sysfs(int option, unsigned long arg1, unsigned long arg2); -asmlinkage long sys_nfsservctl(int cmd, - struct nfsctl_arg __user *arg, - void __user *res); asmlinkage long sys_syslog(int type, char __user *buf, int len); asmlinkage long sys_uselib(const char __user *library); asmlinkage long sys_ni_syscall(void); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 62cbc8877fef..a9a5de07c4f1 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -16,7 +16,6 @@ asmlinkage long sys_ni_syscall(void) return -ENOSYS; } -cond_syscall(sys_nfsservctl); cond_syscall(sys_quotactl); cond_syscall(sys32_quotactl); cond_syscall(sys_acct); -- cgit v1.2.3 From 3b217116edaac634bf31e85c35708298059a8171 Mon Sep 17 00:00:00 2001 From: Duncan Sands Date: Tue, 30 Aug 2011 10:58:22 +0200 Subject: KVM: Fix instruction size issue in pvclock scaling Commit de2d1a524e94 ("KVM: Fix register corruption in pvclock_scale_delta") introduced a mul instruction that may have only a memory operand; the assembler therefore cannot select the correct size: pvclock.s:229: Error: no instruction mnemonic suffix given and no register operands; can't size instruction In this example the assembler is: #APP mul -48(%rbp) ; shrd $32, %rdx, %rax #NO_APP A simple solution is to use mulq. Signed-off-by: Duncan Sands Signed-off-by: Avi Kivity --- arch/x86/include/asm/pvclock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index a518c0a45044..c59cc97fe6c1 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -44,7 +44,7 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); #elif defined(__x86_64__) __asm__ ( - "mul %[mul_frac] ; shrd $32, %[hi], %[lo]" + "mulq %[mul_frac] ; shrd $32, %[hi], %[lo]" : [lo]"=a"(product), [hi]"=d"(tmp) : "0"(delta), -- cgit v1.2.3 From 20afc60f892d285fde179ead4b24e6a7938c2f1b Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Tue, 30 Aug 2011 12:32:36 +0400 Subject: x86, perf: Check that current->mm is alive before getting user callchain An event may occur when an mm is already released. I added an event in dequeue_entity() and caught a panic with the following backtrace: [ 434.421110] BUG: unable to handle kernel NULL pointer dereference at 0000000000000050 [ 434.421258] IP: [] __get_user_pages_fast+0x9c/0x120 ... [ 434.421258] Call Trace: [ 434.421258] [] copy_from_user_nmi+0x51/0xf0 [ 434.421258] [] ? sched_clock_local+0x25/0x90 [ 434.421258] [] perf_callchain_user+0x128/0x170 [ 434.421258] [] ? __perf_event_header__init_id+0xed/0x100 [ 434.421258] [] perf_prepare_sample+0x200/0x280 [ 434.421258] [] __perf_event_overflow+0x1b8/0x290 [ 434.421258] [] ? tg_shares_up+0x0/0x670 [ 434.421258] [] ? walk_tg_tree+0x6a/0xb0 [ 434.421258] [] perf_swevent_overflow+0xc4/0xf0 [ 434.421258] [] do_perf_sw_event+0x1e0/0x250 [ 434.421258] [] perf_tp_event+0x44/0x70 [ 434.421258] [] ftrace_profile_sched_block+0xdf/0x110 [ 434.421258] [] dequeue_entity+0x2ad/0x2d0 [ 434.421258] [] dequeue_task_fair+0x1c/0x60 [ 434.421258] [] dequeue_task+0x9a/0xb0 [ 434.421258] [] deactivate_task+0x42/0xe0 [ 434.421258] [] thread_return+0x191/0x808 [ 434.421258] [] ? switch_task_namespaces+0x24/0x60 [ 434.421258] [] do_exit+0x464/0x910 [ 434.421258] [] do_group_exit+0x58/0xd0 [ 434.421258] [] sys_exit_group+0x17/0x20 [ 434.421258] [] system_call_fastpath+0x16/0x1b Signed-off-by: Andrey Vagin Signed-off-by: Peter Zijlstra Cc: stable@kernel.org Link: http://lkml.kernel.org/r/1314693156-24131-1-git-send-email-avagin@openvz.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4ee3abf20ed6..cfa62ec090ec 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1900,6 +1900,9 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) perf_callchain_store(entry, regs->ip); + if (!current->mm) + return; + if (perf_callchain_user32(regs, entry)) return; -- cgit v1.2.3 From d312ae878b6aed3912e1acaaf5d0b2a9d08a4f11 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Fri, 19 Aug 2011 15:57:16 +0100 Subject: xen: use maximum reservation to limit amount of usable RAM Use the domain's maximum reservation to limit the amount of extra RAM for the memory balloon. This reduces the size of the pages tables and the amount of reserved low memory (which defaults to about 1/32 of the total RAM). On a system with 8 GiB of RAM with the domain limited to 1 GiB the kernel reports: Before: Memory: 627792k/4472000k available After: Memory: 549740k/11132224k available A increase of about 76 MiB (~1.5% of the unused 7 GiB). The reserved low memory is also reduced from 253 MiB to 32 MiB. The total additional usable RAM is 329 MiB. For dom0, this requires at patch to Xen ('x86: use 'dom0_mem' to limit the number of pages for dom0') (c/s 23790) CC: stable@kernel.org Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 02ffd9e48c9f..ff3dfa176814 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -183,6 +183,19 @@ static unsigned long __init xen_set_identity(const struct e820entry *list, PFN_UP(start_pci), PFN_DOWN(last)); return identity; } + +static unsigned long __init xen_get_max_pages(void) +{ + unsigned long max_pages = MAX_DOMAIN_PAGES; + domid_t domid = DOMID_SELF; + int ret; + + ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid); + if (ret > 0) + max_pages = ret; + return min(max_pages, MAX_DOMAIN_PAGES); +} + /** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ @@ -291,6 +304,12 @@ char * __init xen_memory_setup(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + extra_limit = xen_get_max_pages(); + if (extra_limit >= max_pfn) + extra_pages = extra_limit - max_pfn; + else + extra_pages = 0; + extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); /* -- cgit v1.2.3 From d198d499148a0c64a41b3aba9e7dd43772832b91 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Thu, 1 Sep 2011 13:46:55 +0200 Subject: xen: x86_32: do not enable iterrupts when returning from exception in interrupt context If vmalloc page_fault happens inside of interrupt handler with interrupts disabled then on exit path from exception handler when there is no pending interrupts, the following code (arch/x86/xen/xen-asm_32.S:112): cmpw $0x0001, XEN_vcpu_info_pending(%eax) sete XEN_vcpu_info_mask(%eax) will enable interrupts even if they has been previously disabled according to eflags from the bounce frame (arch/x86/xen/xen-asm_32.S:99) testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp) setz XEN_vcpu_info_mask(%eax) Solution is in setting XEN_vcpu_info_mask only when it should be set according to cmpw $0x0001, XEN_vcpu_info_pending(%eax) but not clearing it if there isn't any pending events. Reproducer for bug is attached to RHBZ 707552 CC: stable@kernel.org Signed-off-by: Igor Mammedov Acked-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/xen-asm_32.S | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index 22a2093b5862..b040b0e518ca 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -113,11 +113,13 @@ xen_iret_start_crit: /* * If there's something pending, mask events again so we can - * jump back into xen_hypervisor_callback + * jump back into xen_hypervisor_callback. Otherwise do not + * touch XEN_vcpu_info_mask. */ - sete XEN_vcpu_info_mask(%eax) + jne 1f + movb $1, XEN_vcpu_info_mask(%eax) - popl %eax +1: popl %eax /* * From this point on the registers are restored and the stack -- cgit v1.2.3 From ed467e69f16e6b480e2face7bc5963834d025f91 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 1 Sep 2011 09:48:27 -0400 Subject: xen/smp: Warn user why they keel over - nosmp or noapic and what to use instead. We have hit a couple of customer bugs where they would like to use those parameters to run an UP kernel - but both of those options turn of important sources of interrupt information so we end up not being able to boot. The correct way is to pass in 'dom0_max_vcpus=1' on the Xen hypervisor line and the kernel will patch itself to be a UP kernel. Fixes bug: http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=637308 CC: stable@kernel.org Acked-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/smp.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index e79dbb95482b..d4fc6d454f8d 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -32,6 +32,7 @@ #include #include +#include #include "xen-ops.h" #include "mmu.h" @@ -207,6 +208,15 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) unsigned cpu; unsigned int i; + if (skip_ioapic_setup) { + char *m = (max_cpus == 0) ? + "The nosmp parameter is incompatible with Xen; " \ + "use Xen dom0_max_vcpus=1 parameter" : + "The noapic parameter is incompatible with Xen"; + + xen_raw_printk(m); + panic(m); + } xen_init_lock_cpu(0); smp_store_cpu_info(0); -- cgit v1.2.3 From 5307f6d5fb12fd01f9f321bc4a8fd77e74858647 Mon Sep 17 00:00:00 2001 From: Shyam Iyer Date: Thu, 8 Sep 2011 16:41:17 -0500 Subject: Fix pointer dereference before call to pcie_bus_configure_settings Commit b03e7495a862 ("PCI: Set PCI-E Max Payload Size on fabric") introduced a potential NULL pointer dereference in calls to pcie_bus_configure_settings due to attempts to access pci_bus self variables when the self pointer is NULL. To correct this, verify that the self pointer in pci_bus is non-NULL before dereferencing it. Reported-by: Stanislaw Gruszka Signed-off-by: Shyam Iyer Signed-off-by: Jon Mason Acked-by: Jesse Barnes Signed-off-by: Linus Torvalds --- arch/x86/pci/acpi.c | 9 +++++++-- drivers/pci/hotplug/pcihp_slot.c | 4 +++- drivers/pci/probe.c | 3 --- 3 files changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index c95330267f08..039d91315bc5 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -365,8 +365,13 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) */ if (bus) { struct pci_bus *child; - list_for_each_entry(child, &bus->children, node) - pcie_bus_configure_settings(child, child->self->pcie_mpss); + list_for_each_entry(child, &bus->children, node) { + struct pci_dev *self = child->self; + if (!self) + continue; + + pcie_bus_configure_settings(child, self->pcie_mpss); + } } if (!bus) diff --git a/drivers/pci/hotplug/pcihp_slot.c b/drivers/pci/hotplug/pcihp_slot.c index 753b21aaea61..3ffd9c1acc0a 100644 --- a/drivers/pci/hotplug/pcihp_slot.c +++ b/drivers/pci/hotplug/pcihp_slot.c @@ -169,7 +169,9 @@ void pci_configure_slot(struct pci_dev *dev) (dev->class >> 8) == PCI_CLASS_BRIDGE_PCI))) return; - pcie_bus_configure_settings(dev->bus, dev->bus->self->pcie_mpss); + if (dev->bus && dev->bus->self) + pcie_bus_configure_settings(dev->bus, + dev->bus->self->pcie_mpss); memset(&hpp, 0, sizeof(hpp)); ret = pci_get_hp_params(dev, &hpp); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 8473727b29fa..0820fc1544e8 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1456,9 +1456,6 @@ void pcie_bus_configure_settings(struct pci_bus *bus, u8 mpss) { u8 smpss = mpss; - if (!bus->self) - return; - if (!pci_is_pcie(bus->self)) return; -- cgit v1.2.3