From 665d92e38f65d70796aad2b8e49e42e80815d4a4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 25 Dec 2014 14:31:24 +0900 Subject: kbuild: do not add $(call ...) to invoke cc-version or cc-fullversion The macros cc-version, cc-fullversion and ld-version take no argument. It is not necessary to add $(call ...) to invoke them. Signed-off-by: Masahiro Yamada Acked-by: Helge Deller [parisc] Signed-off-by: Michal Marek --- arch/x86/Makefile.um | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index 36b62bc52638..95eba554baf9 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -30,7 +30,7 @@ cflags-y += -ffreestanding # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use # a lot more stack due to the lack of sharing of stacklots. Also, gcc # 4.3.0 needs -funit-at-a-time for extern inline functions. -KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then \ +KBUILD_CFLAGS += $(shell if [ $(cc-version) -lt 0400 ] ; then \ echo $(call cc-option,-fno-unit-at-a-time); \ else echo $(call cc-option,-funit-at-a-time); fi ;) -- cgit v1.2.3 From e182c570e9953859aee5cb016583217d9e68ea18 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 12 Dec 2014 01:56:04 +0200 Subject: x86/uaccess: fix sparse errors virtio wants to read bitwise types from userspace using get_user. At the moment this triggers sparse errors, since the value is passed through an integer. Fix that up using __force. Signed-off-by: Michael S. Tsirkin Acked-by: Thomas Gleixner --- arch/x86/include/asm/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 0d592e0a5b84..ace9dec050b1 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -179,7 +179,7 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) asm volatile("call __get_user_%P3" \ : "=a" (__ret_gu), "=r" (__val_gu) \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ - (x) = (__typeof__(*(ptr))) __val_gu; \ + (x) = (__force __typeof__(*(ptr))) __val_gu; \ __ret_gu; \ }) -- cgit v1.2.3 From d1c29465b8a52d8fc5a59aac92c6b206b69fe631 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:15:10 +1030 Subject: lguest: don't disable iospace. This no longer speeds up boot (IDE got better, I guess), but it does stop us probing for a PCI bus. Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index c1c1544b8485..47ec7f201d27 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1399,14 +1399,6 @@ __init void lguest_init(void) /* Hook in our special panic hypercall code. */ atomic_notifier_chain_register(&panic_notifier_list, &paniced); - /* - * The IDE code spends about 3 seconds probing for disks: if we reserve - * all the I/O ports up front it can't get them and so doesn't probe. - * Other device drivers are similar (but less severe). This cuts the - * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. - */ - paravirt_disable_iospace(); - /* * This is messy CPU setup stuff which the native boot code does before * start_kernel, so we have to do, too: -- cgit v1.2.3 From ee72576c143d8e9081ae1fe8644122454dd323c5 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:15:10 +1030 Subject: lguest: disable ACPI explicitly. Once we add PCI, it starts trying to manage our interrupts. Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 47ec7f201d27..aa6e3b4ce29c 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -1428,6 +1429,9 @@ __init void lguest_init(void) /* Register our very early console. */ virtio_cons_early_init(early_put_chars); + /* Don't let ACPI try to control our PCI interrupts. */ + disable_acpi(); + /* * Last of all, we set the power management poweroff hook to point to * the Guest routine to power off, and the reboot hook to our restart -- cgit v1.2.3 From e1b83e27881cf3153ce420aea853797fed29a9ea Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:15:10 +1030 Subject: lguest: Override pcibios_enable_irq/pcibios_disable_irq to our stupid PIC This lets us deliver interrupts for our emulated PCI devices using our dumb PIC, and not emulate an 8259 and PCI irq mapping tables or whatever. Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index aa6e3b4ce29c..2943ab931671 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -72,6 +73,7 @@ #include #include /* for struct machine_ops */ #include +#include /*G:010 * Welcome to the Guest! @@ -832,6 +834,24 @@ static struct irq_chip lguest_irq_controller = { .irq_unmask = enable_lguest_irq, }; +static int lguest_enable_irq(struct pci_dev *dev) +{ + u8 line = 0; + + /* We literally use the PCI interrupt line as the irq number. */ + pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line); + irq_set_chip_and_handler_name(line, &lguest_irq_controller, + handle_level_irq, "level"); + dev->irq = line; + return 0; +} + +/* We don't do hotplug PCI, so this shouldn't be called. */ +static void lguest_disable_irq(struct pci_dev *dev) +{ + WARN_ON(1); +} + /* * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware * interrupt (except 128, which is used for system calls), and then tells the @@ -1432,6 +1452,10 @@ __init void lguest_init(void) /* Don't let ACPI try to control our PCI interrupts. */ disable_acpi(); + /* We control them ourselves, by overriding these two hooks. */ + pcibios_enable_irq = lguest_enable_irq; + pcibios_disable_irq = lguest_disable_irq; + /* * Last of all, we set the power management poweroff hook to point to * the Guest routine to power off, and the reboot hook to our restart -- cgit v1.2.3 From a561adfaecc9eb6fb66941b450458801f3f60ca0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:26:01 +1030 Subject: lguest: use the PCI console device's emerg_wr for early boot messages. This involves manually checking the console device (which is always in slot 1 of bus 0) and using the window in VIRTIO_PCI_CAP_PCI_CFG to program it (as we can't map the BAR yet). We could in fact do this much earlier, but we wait for the first write from the virtio_cons_early_init() facility. Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 146 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 134 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 2943ab931671..531b844cb48d 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -74,6 +75,7 @@ #include /* for struct machine_ops */ #include #include +#include /*G:010 * Welcome to the Guest! @@ -1202,25 +1204,145 @@ static __init char *lguest_memory_setup(void) return "LGUEST"; } +/* Offset within PCI config space of BAR access capability. */ +static int console_cfg_offset = 0; +static int console_access_cap; + +/* Set up so that we access off in bar0 (on bus 0, device 1, function 0) */ +static void set_cfg_window(u32 cfg_offset, u32 off) +{ + write_pci_config_byte(0, 1, 0, + cfg_offset + offsetof(struct virtio_pci_cap, bar), + 0); + write_pci_config(0, 1, 0, + cfg_offset + offsetof(struct virtio_pci_cap, length), + 4); + write_pci_config(0, 1, 0, + cfg_offset + offsetof(struct virtio_pci_cap, offset), + off); +} + +static u32 read_bar_via_cfg(u32 cfg_offset, u32 off) +{ + set_cfg_window(cfg_offset, off); + return read_pci_config(0, 1, 0, + cfg_offset + sizeof(struct virtio_pci_cap)); +} + +static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val) +{ + set_cfg_window(cfg_offset, off); + write_pci_config(0, 1, 0, + cfg_offset + sizeof(struct virtio_pci_cap), val); +} + +static void probe_pci_console(void) +{ + u8 cap, common_cap = 0, device_cap = 0; + /* Offsets within BAR0 */ + u32 common_offset, device_offset; + + /* Avoid recursive printk into here. */ + console_cfg_offset = -1; + + if (!early_pci_allowed()) { + printk(KERN_ERR "lguest: early PCI access not allowed!\n"); + return; + } + + /* We expect a console PCI device at BUS0, slot 1. */ + if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) { + printk(KERN_ERR "lguest: PCI device is %#x!\n", + read_pci_config(0, 1, 0, 0)); + return; + } + + /* Find the capabilities we need (must be in bar0) */ + cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST); + while (cap) { + u8 vndr = read_pci_config_byte(0, 1, 0, cap); + if (vndr == PCI_CAP_ID_VNDR) { + u8 type, bar; + u32 offset; + + type = read_pci_config_byte(0, 1, 0, + cap + offsetof(struct virtio_pci_cap, cfg_type)); + bar = read_pci_config_byte(0, 1, 0, + cap + offsetof(struct virtio_pci_cap, bar)); + offset = read_pci_config(0, 1, 0, + cap + offsetof(struct virtio_pci_cap, offset)); + + switch (type) { + case VIRTIO_PCI_CAP_COMMON_CFG: + if (bar == 0) { + common_cap = cap; + common_offset = offset; + } + break; + case VIRTIO_PCI_CAP_DEVICE_CFG: + if (bar == 0) { + device_cap = cap; + device_offset = offset; + } + break; + case VIRTIO_PCI_CAP_PCI_CFG: + console_access_cap = cap; + break; + } + } + cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT); + } + if (!common_cap || !device_cap || !console_access_cap) { + printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n", + common_cap, device_cap, console_access_cap); + return; + } + + +#define write_common_config(reg, val) \ + write_bar_via_cfg(console_access_cap, \ + common_offset+offsetof(struct virtio_pci_common_cfg,reg),\ + val) + +#define read_common_config(reg) \ + read_bar_via_cfg(console_access_cap, \ + common_offset+offsetof(struct virtio_pci_common_cfg,reg)) + + /* Check features: they must offer EMERG_WRITE */ + write_common_config(device_feature_select, 0); + + if (!(read_common_config(device_feature) + & (1 << VIRTIO_CONSOLE_F_EMERG_WRITE))) { + printk(KERN_ERR "lguest: console missing EMERG_WRITE\n"); + return; + } + + console_cfg_offset = device_offset; +} + /* * We will eventually use the virtio console device to produce console output, - * but before that is set up we use LHCALL_NOTIFY on normal memory to produce - * console output. + * but before that is set up we use the virtio PCI console's backdoor mmio + * access and the "emergency" write facility (which is legal even before the + * device is configured). */ static __init int early_put_chars(u32 vtermno, const char *buf, int count) { - char scratch[17]; - unsigned int len = count; + /* If we couldn't find PCI console, forget it. */ + if (console_cfg_offset < 0) + return count; - /* We use a nul-terminated string, so we make a copy. Icky, huh? */ - if (len > sizeof(scratch) - 1) - len = sizeof(scratch) - 1; - scratch[len] = '\0'; - memcpy(scratch, buf, len); - hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0, 0); + if (unlikely(!console_cfg_offset)) { + probe_pci_console(); + if (console_cfg_offset < 0) + return count; + } - /* This routine returns the number of bytes actually written. */ - return len; + write_bar_via_cfg(console_access_cap, + console_cfg_offset + + offsetof(struct virtio_console_config, emerg_wr), + buf[0]); + return 1; } /* -- cgit v1.2.3 From d9bab50aa46ce46dd4537d455eb13b200cdac516 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:28:01 +1030 Subject: lguest: remove NOTIFY call and eventfd facility. Disappointing, as this was kind of neat (especially getting to use RCU to manage the address -> eventfd mapping). But now the devices are PCI handled in userspace, we get rid of both the NOTIFY hypercall and the interface to connect an eventfd. Signed-off-by: Rusty Russell --- arch/x86/include/asm/lguest_hcall.h | 1 - drivers/lguest/core.c | 20 +--- drivers/lguest/hypercalls.c | 4 - drivers/lguest/lg.h | 12 --- drivers/lguest/lguest_user.c | 186 +----------------------------------- include/linux/lguest_launcher.h | 2 +- 6 files changed, 10 insertions(+), 215 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index 879fd7d33877..ef01fef3eebc 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -16,7 +16,6 @@ #define LHCALL_SET_PTE 14 #define LHCALL_SET_PGD 15 #define LHCALL_LOAD_TLS 16 -#define LHCALL_NOTIFY 17 #define LHCALL_LOAD_GDT_ENTRY 18 #define LHCALL_SEND_INTERRUPTS 19 diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 9159dbc583f6..7dc93aa004c8 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -225,22 +225,12 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) if (cpu->hcall) do_hypercalls(cpu); - /* - * It's possible the Guest did a NOTIFY hypercall to the - * Launcher. - */ + /* Do we have to tell the Launcher about a trap? */ if (cpu->pending.trap) { - /* - * Does it just needs to write to a registered - * eventfd (ie. the appropriate virtqueue thread)? - */ - if (!send_notify_to_eventfd(cpu)) { - /* OK, we tell the main Launcher. */ - if (copy_to_user(user, &cpu->pending, - sizeof(cpu->pending))) - return -EFAULT; - return sizeof(cpu->pending); - } + if (copy_to_user(user, &cpu->pending, + sizeof(cpu->pending))) + return -EFAULT; + return sizeof(cpu->pending); } /* diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 5dd1fb8a6610..1219af493c0f 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -117,10 +117,6 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) /* Similarly, this sets the halted flag for run_guest(). */ cpu->halted = 1; break; - case LHCALL_NOTIFY: - cpu->pending.trap = LGUEST_TRAP_ENTRY; - cpu->pending.addr = args->arg1; - break; default: /* It should be an architecture-specific hypercall. */ if (lguest_arch_do_hcall(cpu, args)) diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index eb81abc05995..307e8b39e7d1 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -81,16 +81,6 @@ struct lg_cpu { struct lg_cpu_arch arch; }; -struct lg_eventfd { - unsigned long addr; - struct eventfd_ctx *event; -}; - -struct lg_eventfd_map { - unsigned int num; - struct lg_eventfd map[]; -}; - /* The private info the thread maintains about the guest. */ struct lguest { struct lguest_data __user *lguest_data; @@ -117,8 +107,6 @@ struct lguest { unsigned int stack_pages; u32 tsc_khz; - struct lg_eventfd_map *eventfds; - /* Dead? */ const char *dead; }; diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index c8b0e8575b44..c4c6113eb9a6 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -2,182 +2,20 @@ * launcher controls and communicates with the Guest. For example, * the first write will tell us the Guest's memory layout and entry * point. A read will run the Guest until something happens, such as - * a signal or the Guest doing a NOTIFY out to the Launcher. There is - * also a way for the Launcher to attach eventfds to particular NOTIFY - * values instead of returning from the read() call. + * a signal or the Guest accessing a device. :*/ #include #include #include #include -#include #include #include #include #include "lg.h" -/*L:056 - * Before we move on, let's jump ahead and look at what the kernel does when - * it needs to look up the eventfds. That will complete our picture of how we - * use RCU. - * - * The notification value is in cpu->pending_notify: we return true if it went - * to an eventfd. - */ -bool send_notify_to_eventfd(struct lg_cpu *cpu) -{ - unsigned int i; - struct lg_eventfd_map *map; - - /* We only connect LHCALL_NOTIFY to event fds, not other traps. */ - if (cpu->pending.trap != LGUEST_TRAP_ENTRY) - return false; - - /* - * This "rcu_read_lock()" helps track when someone is still looking at - * the (RCU-using) eventfds array. It's not actually a lock at all; - * indeed it's a noop in many configurations. (You didn't expect me to - * explain all the RCU secrets here, did you?) - */ - rcu_read_lock(); - /* - * rcu_dereference is the counter-side of rcu_assign_pointer(); it - * makes sure we don't access the memory pointed to by - * cpu->lg->eventfds before cpu->lg->eventfds is set. Sounds crazy, - * but Alpha allows this! Paul McKenney points out that a really - * aggressive compiler could have the same effect: - * http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html - * - * So play safe, use rcu_dereference to get the rcu-protected pointer: - */ - map = rcu_dereference(cpu->lg->eventfds); - /* - * Simple array search: even if they add an eventfd while we do this, - * we'll continue to use the old array and just won't see the new one. - */ - for (i = 0; i < map->num; i++) { - if (map->map[i].addr == cpu->pending.addr) { - eventfd_signal(map->map[i].event, 1); - cpu->pending.trap = 0; - break; - } - } - /* We're done with the rcu-protected variable cpu->lg->eventfds. */ - rcu_read_unlock(); - - /* If we cleared the notification, it's because we found a match. */ - return cpu->pending.trap == 0; -} - -/*L:055 - * One of the more tricksy tricks in the Linux Kernel is a technique called - * Read Copy Update. Since one point of lguest is to teach lguest journeyers - * about kernel coding, I use it here. (In case you're curious, other purposes - * include learning about virtualization and instilling a deep appreciation for - * simplicity and puppies). - * - * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we - * add new eventfds without ever blocking readers from accessing the array. - * The current Launcher only does this during boot, so that never happens. But - * Read Copy Update is cool, and adding a lock risks damaging even more puppies - * than this code does. - * - * We allocate a brand new one-larger array, copy the old one and add our new - * element. Then we make the lg eventfd pointer point to the new array. - * That's the easy part: now we need to free the old one, but we need to make - * sure no slow CPU somewhere is still looking at it. That's what - * synchronize_rcu does for us: waits until every CPU has indicated that it has - * moved on to know it's no longer using the old one. - * - * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update. - */ -static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) -{ - struct lg_eventfd_map *new, *old = lg->eventfds; - - /* - * We don't allow notifications on value 0 anyway (pending_notify of - * 0 means "nothing pending"). - */ - if (!addr) - return -EINVAL; - - /* - * Replace the old array with the new one, carefully: others can - * be accessing it at the same time. - */ - new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1), - GFP_KERNEL); - if (!new) - return -ENOMEM; - - /* First make identical copy. */ - memcpy(new->map, old->map, sizeof(old->map[0]) * old->num); - new->num = old->num; - - /* Now append new entry. */ - new->map[new->num].addr = addr; - new->map[new->num].event = eventfd_ctx_fdget(fd); - if (IS_ERR(new->map[new->num].event)) { - int err = PTR_ERR(new->map[new->num].event); - kfree(new); - return err; - } - new->num++; - - /* - * Now put new one in place: rcu_assign_pointer() is a fancy way of - * doing "lg->eventfds = new", but it uses memory barriers to make - * absolutely sure that the contents of "new" written above is nailed - * down before we actually do the assignment. - * - * We have to think about these kinds of things when we're operating on - * live data without locks. - */ - rcu_assign_pointer(lg->eventfds, new); - - /* - * We're not in a big hurry. Wait until no one's looking at old - * version, then free it. - */ - synchronize_rcu(); - kfree(old); - - return 0; -} - /*L:052 - * Receiving notifications from the Guest is usually done by attaching a - * particular LHCALL_NOTIFY value to an event filedescriptor. The eventfd will - * become readable when the Guest does an LHCALL_NOTIFY with that value. - * - * This is really convenient for processing each virtqueue in a separate - * thread. - */ -static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) -{ - unsigned long addr, fd; - int err; - - if (get_user(addr, input) != 0) - return -EFAULT; - input++; - if (get_user(fd, input) != 0) - return -EFAULT; - - /* - * Just make sure two callers don't add eventfds at once. We really - * only need to lock against callers adding to the same Guest, so using - * the Big Lguest Lock is overkill. But this is setup, not a fast path. - */ - mutex_lock(&lguest_lock); - err = add_eventfd(lg, addr, fd); - mutex_unlock(&lguest_lock); - - return err; -} - -/* The Launcher can get the registers, and also set some of them. */ + The Launcher can get the registers, and also set some of them. +*/ static int getreg_setup(struct lg_cpu *cpu, const unsigned long __user *input) { unsigned long which; @@ -409,13 +247,6 @@ static int initialize(struct file *file, const unsigned long __user *input) goto unlock; } - lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL); - if (!lg->eventfds) { - err = -ENOMEM; - goto free_lg; - } - lg->eventfds->num = 0; - /* Populate the easy fields of our "struct lguest" */ lg->mem_base = (void __user *)args[0]; lg->pfn_limit = args[1]; @@ -424,7 +255,7 @@ static int initialize(struct file *file, const unsigned long __user *input) /* This is the first cpu (cpu 0) and it will start booting at args[2] */ err = lg_cpu_start(&lg->cpus[0], 0, args[2]); if (err) - goto free_eventfds; + goto free_lg; /* * Initialize the Guest's shadow page tables. This allocates @@ -445,8 +276,6 @@ static int initialize(struct file *file, const unsigned long __user *input) free_regs: /* FIXME: This should be in free_vcpu */ free_page(lg->cpus[0].regs_page); -free_eventfds: - kfree(lg->eventfds); free_lg: kfree(lg); unlock: @@ -499,8 +328,6 @@ static ssize_t write(struct file *file, const char __user *in, return initialize(file, input); case LHREQ_IRQ: return user_send_irq(cpu, input); - case LHREQ_EVENTFD: - return attach_eventfd(lg, input); case LHREQ_GETREG: return getreg_setup(cpu, input); case LHREQ_SETREG: @@ -551,11 +378,6 @@ static int close(struct inode *inode, struct file *file) mmput(lg->cpus[i].mm); } - /* Release any eventfds they registered. */ - for (i = 0; i < lg->eventfds->num; i++) - eventfd_ctx_put(lg->eventfds->map[i].event); - kfree(lg->eventfds); - /* * If lg->dead doesn't contain an error code it will be NULL or a * kmalloc()ed string, either of which is ok to hand to kfree(). diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index 677cde735d4b..acd5b12565cc 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -23,7 +23,7 @@ enum lguest_req LHREQ_GETDMA, /* No longer used */ LHREQ_IRQ, /* + irq */ LHREQ_BREAK, /* No longer used */ - LHREQ_EVENTFD, /* + address, fd. */ + LHREQ_EVENTFD, /* No longer used. */ LHREQ_GETREG, /* + offset within struct pt_regs (then read value). */ LHREQ_SETREG, /* + offset within struct pt_regs, value. */ LHREQ_TRAP, /* + trap number to deliver to guest. */ -- cgit v1.2.3 From 55c2d7884e9a97c2f2d46d5818f783bf3dcc5314 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Feb 2015 17:13:43 +1030 Subject: lguest: don't look in console features to find emerg_wr. The 1.0 spec clearly states that you must set the ACKNOWLEDGE and DRIVER status bits before accessing the feature bits. This is a problem for the early console code, which doesn't really want to acknowledge the device (the spec specifically excepts writing to the console's emerg_wr from the usual ordering constrains). Instead, we check that the *size* of the device configuration is sufficient to hold emerg_wr: at worst (if the device doesn't support the VIRTIO_CONSOLE_F_EMERG_WRITE feature), it will ignore the writes. Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 57 +++++++++++++++++++++----------------------------- 1 file changed, 24 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 531b844cb48d..ac4453d8520e 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1222,15 +1222,13 @@ static void set_cfg_window(u32 cfg_offset, u32 off) off); } -static u32 read_bar_via_cfg(u32 cfg_offset, u32 off) -{ - set_cfg_window(cfg_offset, off); - return read_pci_config(0, 1, 0, - cfg_offset + sizeof(struct virtio_pci_cap)); -} - static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val) { + /* + * We could set this up once, then leave it; nothing else in the * + * kernel should touch these registers. But if it went wrong, that + * would be a horrible bug to find. + */ set_cfg_window(cfg_offset, off); write_pci_config(0, 1, 0, cfg_offset + sizeof(struct virtio_pci_cap), val); @@ -1239,8 +1237,9 @@ static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val) static void probe_pci_console(void) { u8 cap, common_cap = 0, device_cap = 0; - /* Offsets within BAR0 */ - u32 common_offset, device_offset; + /* Offset within BAR0 */ + u32 device_offset; + u32 device_len; /* Avoid recursive printk into here. */ console_cfg_offset = -1; @@ -1263,7 +1262,7 @@ static void probe_pci_console(void) u8 vndr = read_pci_config_byte(0, 1, 0, cap); if (vndr == PCI_CAP_ID_VNDR) { u8 type, bar; - u32 offset; + u32 offset, length; type = read_pci_config_byte(0, 1, 0, cap + offsetof(struct virtio_pci_cap, cfg_type)); @@ -1271,18 +1270,15 @@ static void probe_pci_console(void) cap + offsetof(struct virtio_pci_cap, bar)); offset = read_pci_config(0, 1, 0, cap + offsetof(struct virtio_pci_cap, offset)); + length = read_pci_config(0, 1, 0, + cap + offsetof(struct virtio_pci_cap, length)); switch (type) { - case VIRTIO_PCI_CAP_COMMON_CFG: - if (bar == 0) { - common_cap = cap; - common_offset = offset; - } - break; case VIRTIO_PCI_CAP_DEVICE_CFG: if (bar == 0) { device_cap = cap; device_offset = offset; + device_len = length; } break; case VIRTIO_PCI_CAP_PCI_CFG: @@ -1292,32 +1288,27 @@ static void probe_pci_console(void) } cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT); } - if (!common_cap || !device_cap || !console_access_cap) { + if (!device_cap || !console_access_cap) { printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n", common_cap, device_cap, console_access_cap); return; } - -#define write_common_config(reg, val) \ - write_bar_via_cfg(console_access_cap, \ - common_offset+offsetof(struct virtio_pci_common_cfg,reg),\ - val) - -#define read_common_config(reg) \ - read_bar_via_cfg(console_access_cap, \ - common_offset+offsetof(struct virtio_pci_common_cfg,reg)) - - /* Check features: they must offer EMERG_WRITE */ - write_common_config(device_feature_select, 0); - - if (!(read_common_config(device_feature) - & (1 << VIRTIO_CONSOLE_F_EMERG_WRITE))) { - printk(KERN_ERR "lguest: console missing EMERG_WRITE\n"); + /* + * Note that we can't check features, until we've set the DRIVER + * status bit. We don't want to do that until we have a real driver, + * so we just check that the device-specific config has room for + * emerg_wr. If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE + * it should ignore the access. + */ + if (device_len < (offsetof(struct virtio_console_config, emerg_wr) + + sizeof(u32))) { + printk(KERN_ERR "lguest: console missing emerg_wr field\n"); return; } console_cfg_offset = device_offset; + printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n"); } /* -- cgit v1.2.3 From 96738c69a7fcdbf0d7c9df0c8a27660011e82a7b Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 13 Jan 2015 15:25:00 +0000 Subject: x86/efi: Avoid triple faults during EFI mixed mode calls Andy pointed out that if an NMI or MCE is received while we're in the middle of an EFI mixed mode call a triple fault will occur. This can happen, for example, when issuing an EFI mixed mode call while running perf. The reason for the triple fault is that we execute the mixed mode call in 32-bit mode with paging disabled but with 64-bit kernel IDT handlers installed throughout the call. At Andy's suggestion, stop playing the games we currently do at runtime, such as disabling paging and installing a 32-bit GDT for __KERNEL_CS. We can simply switch to the __KERNEL32_CS descriptor before invoking firmware services, and run in compatibility mode. This way, if an NMI/MCE does occur the kernel IDT handler will execute correctly, since it'll jump to __KERNEL_CS automatically. However, this change is only possible post-ExitBootServices(). Before then the firmware "owns" the machine and expects for its 32-bit IDT handlers to be left intact to service interrupts, etc. So, we now need to distinguish between early boot and runtime invocations of EFI services. During early boot, we need to restore the GDT that the firmware expects to be present. We can only jump to the __KERNEL32_CS code segment for mixed mode calls after ExitBootServices() has been invoked. A liberal sprinkling of comments in the thunking code should make the differences in early and late environments more apparent. Reported-by: Andy Lutomirski Tested-by: Borislav Petkov Cc: Signed-off-by: Matt Fleming --- arch/x86/boot/compressed/Makefile | 1 + arch/x86/boot/compressed/efi_stub_64.S | 25 ---- arch/x86/boot/compressed/efi_thunk_64.S | 196 ++++++++++++++++++++++++++++++++ arch/x86/platform/efi/efi_stub_64.S | 161 -------------------------- arch/x86/platform/efi/efi_thunk_64.S | 121 +++++++++++++++++--- 5 files changed, 301 insertions(+), 203 deletions(-) create mode 100644 arch/x86/boot/compressed/efi_thunk_64.S (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index ad754b4411f7..8bd44e8ee6e2 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -49,6 +49,7 @@ $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \ $(objtree)/drivers/firmware/efi/libstub/lib.a +vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o $(obj)/vmlinux: $(vmlinux-objs-y) FORCE $(call if_changed,ld) diff --git a/arch/x86/boot/compressed/efi_stub_64.S b/arch/x86/boot/compressed/efi_stub_64.S index 7ff3632806b1..99494dff2113 100644 --- a/arch/x86/boot/compressed/efi_stub_64.S +++ b/arch/x86/boot/compressed/efi_stub_64.S @@ -3,28 +3,3 @@ #include #include "../../platform/efi/efi_stub_64.S" - -#ifdef CONFIG_EFI_MIXED - .code64 - .text -ENTRY(efi64_thunk) - push %rbp - push %rbx - - subq $16, %rsp - leaq efi_exit32(%rip), %rax - movl %eax, 8(%rsp) - leaq efi_gdt64(%rip), %rax - movl %eax, 4(%rsp) - movl %eax, 2(%rax) /* Fixup the gdt base address */ - leaq efi32_boot_gdt(%rip), %rax - movl %eax, (%rsp) - - call __efi64_thunk - - addq $16, %rsp - pop %rbx - pop %rbp - ret -ENDPROC(efi64_thunk) -#endif /* CONFIG_EFI_MIXED */ diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S new file mode 100644 index 000000000000..630384a4c14a --- /dev/null +++ b/arch/x86/boot/compressed/efi_thunk_64.S @@ -0,0 +1,196 @@ +/* + * Copyright (C) 2014, 2015 Intel Corporation; author Matt Fleming + * + * Early support for invoking 32-bit EFI services from a 64-bit kernel. + * + * Because this thunking occurs before ExitBootServices() we have to + * restore the firmware's 32-bit GDT before we make EFI serivce calls, + * since the firmware's 32-bit IDT is still currently installed and it + * needs to be able to service interrupts. + * + * On the plus side, we don't have to worry about mangling 64-bit + * addresses into 32-bits because we're executing with an identify + * mapped pagetable and haven't transitioned to 64-bit virtual addresses + * yet. + */ + +#include +#include +#include +#include +#include + + .code64 + .text +ENTRY(efi64_thunk) + push %rbp + push %rbx + + subq $8, %rsp + leaq efi_exit32(%rip), %rax + movl %eax, 4(%rsp) + leaq efi_gdt64(%rip), %rax + movl %eax, (%rsp) + movl %eax, 2(%rax) /* Fixup the gdt base address */ + + movl %ds, %eax + push %rax + movl %es, %eax + push %rax + movl %ss, %eax + push %rax + + /* + * Convert x86-64 ABI params to i386 ABI + */ + subq $32, %rsp + movl %esi, 0x0(%rsp) + movl %edx, 0x4(%rsp) + movl %ecx, 0x8(%rsp) + movq %r8, %rsi + movl %esi, 0xc(%rsp) + movq %r9, %rsi + movl %esi, 0x10(%rsp) + + sgdt save_gdt(%rip) + + leaq 1f(%rip), %rbx + movq %rbx, func_rt_ptr(%rip) + + /* + * Switch to gdt with 32-bit segments. This is the firmware GDT + * that was installed when the kernel started executing. This + * pointer was saved at the EFI stub entry point in head_64.S. + */ + leaq efi32_boot_gdt(%rip), %rax + lgdt (%rax) + + pushq $__KERNEL_CS + leaq efi_enter32(%rip), %rax + pushq %rax + lretq + +1: addq $32, %rsp + + lgdt save_gdt(%rip) + + pop %rbx + movl %ebx, %ss + pop %rbx + movl %ebx, %es + pop %rbx + movl %ebx, %ds + + /* + * Convert 32-bit status code into 64-bit. + */ + test %rax, %rax + jz 1f + movl %eax, %ecx + andl $0x0fffffff, %ecx + andl $0xf0000000, %eax + shl $32, %rax + or %rcx, %rax +1: + addq $8, %rsp + pop %rbx + pop %rbp + ret +ENDPROC(efi64_thunk) + +ENTRY(efi_exit32) + movq func_rt_ptr(%rip), %rax + push %rax + mov %rdi, %rax + ret +ENDPROC(efi_exit32) + + .code32 +/* + * EFI service pointer must be in %edi. + * + * The stack should represent the 32-bit calling convention. + */ +ENTRY(efi_enter32) + movl $__KERNEL_DS, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + + /* Reload pgtables */ + movl %cr3, %eax + movl %eax, %cr3 + + /* Disable paging */ + movl %cr0, %eax + btrl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + + /* Disable long mode via EFER */ + movl $MSR_EFER, %ecx + rdmsr + btrl $_EFER_LME, %eax + wrmsr + + call *%edi + + /* We must preserve return value */ + movl %eax, %edi + + /* + * Some firmware will return with interrupts enabled. Be sure to + * disable them before we switch GDTs. + */ + cli + + movl 56(%esp), %eax + movl %eax, 2(%eax) + lgdtl (%eax) + + movl %cr4, %eax + btsl $(X86_CR4_PAE_BIT), %eax + movl %eax, %cr4 + + movl %cr3, %eax + movl %eax, %cr3 + + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_LME, %eax + wrmsr + + xorl %eax, %eax + lldt %ax + + movl 60(%esp), %eax + pushl $__KERNEL_CS + pushl %eax + + /* Enable paging */ + movl %cr0, %eax + btsl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + lret +ENDPROC(efi_enter32) + + .data + .balign 8 + .global efi32_boot_gdt +efi32_boot_gdt: .word 0 + .quad 0 + +save_gdt: .word 0 + .quad 0 +func_rt_ptr: .quad 0 + + .global efi_gdt64 +efi_gdt64: + .word efi_gdt64_end - efi_gdt64 + .long 0 /* Filled out by user */ + .word 0 + .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x00af9a000000ffff /* __KERNEL_CS */ + .quad 0x00cf92000000ffff /* __KERNEL_DS */ + .quad 0x0080890000000000 /* TS descriptor */ + .quad 0x0000000000000000 /* TS continued */ +efi_gdt64_end: diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S index 5fcda7272550..86d0f9e08dd9 100644 --- a/arch/x86/platform/efi/efi_stub_64.S +++ b/arch/x86/platform/efi/efi_stub_64.S @@ -91,167 +91,6 @@ ENTRY(efi_call) ret ENDPROC(efi_call) -#ifdef CONFIG_EFI_MIXED - -/* - * We run this function from the 1:1 mapping. - * - * This function must be invoked with a 1:1 mapped stack. - */ -ENTRY(__efi64_thunk) - movl %ds, %eax - push %rax - movl %es, %eax - push %rax - movl %ss, %eax - push %rax - - subq $32, %rsp - movl %esi, 0x0(%rsp) - movl %edx, 0x4(%rsp) - movl %ecx, 0x8(%rsp) - movq %r8, %rsi - movl %esi, 0xc(%rsp) - movq %r9, %rsi - movl %esi, 0x10(%rsp) - - sgdt save_gdt(%rip) - - leaq 1f(%rip), %rbx - movq %rbx, func_rt_ptr(%rip) - - /* Switch to gdt with 32-bit segments */ - movl 64(%rsp), %eax - lgdt (%rax) - - leaq efi_enter32(%rip), %rax - pushq $__KERNEL_CS - pushq %rax - lretq - -1: addq $32, %rsp - - lgdt save_gdt(%rip) - - pop %rbx - movl %ebx, %ss - pop %rbx - movl %ebx, %es - pop %rbx - movl %ebx, %ds - - /* - * Convert 32-bit status code into 64-bit. - */ - test %rax, %rax - jz 1f - movl %eax, %ecx - andl $0x0fffffff, %ecx - andl $0xf0000000, %eax - shl $32, %rax - or %rcx, %rax -1: - ret -ENDPROC(__efi64_thunk) - -ENTRY(efi_exit32) - movq func_rt_ptr(%rip), %rax - push %rax - mov %rdi, %rax - ret -ENDPROC(efi_exit32) - - .code32 -/* - * EFI service pointer must be in %edi. - * - * The stack should represent the 32-bit calling convention. - */ -ENTRY(efi_enter32) - movl $__KERNEL_DS, %eax - movl %eax, %ds - movl %eax, %es - movl %eax, %ss - - /* Reload pgtables */ - movl %cr3, %eax - movl %eax, %cr3 - - /* Disable paging */ - movl %cr0, %eax - btrl $X86_CR0_PG_BIT, %eax - movl %eax, %cr0 - - /* Disable long mode via EFER */ - movl $MSR_EFER, %ecx - rdmsr - btrl $_EFER_LME, %eax - wrmsr - - call *%edi - - /* We must preserve return value */ - movl %eax, %edi - - /* - * Some firmware will return with interrupts enabled. Be sure to - * disable them before we switch GDTs. - */ - cli - - movl 68(%esp), %eax - movl %eax, 2(%eax) - lgdtl (%eax) - - movl %cr4, %eax - btsl $(X86_CR4_PAE_BIT), %eax - movl %eax, %cr4 - - movl %cr3, %eax - movl %eax, %cr3 - - movl $MSR_EFER, %ecx - rdmsr - btsl $_EFER_LME, %eax - wrmsr - - xorl %eax, %eax - lldt %ax - - movl 72(%esp), %eax - pushl $__KERNEL_CS - pushl %eax - - /* Enable paging */ - movl %cr0, %eax - btsl $X86_CR0_PG_BIT, %eax - movl %eax, %cr0 - lret -ENDPROC(efi_enter32) - - .data - .balign 8 - .global efi32_boot_gdt -efi32_boot_gdt: .word 0 - .quad 0 - -save_gdt: .word 0 - .quad 0 -func_rt_ptr: .quad 0 - - .global efi_gdt64 -efi_gdt64: - .word efi_gdt64_end - efi_gdt64 - .long 0 /* Filled out by user */ - .word 0 - .quad 0x0000000000000000 /* NULL descriptor */ - .quad 0x00af9a000000ffff /* __KERNEL_CS */ - .quad 0x00cf92000000ffff /* __KERNEL_DS */ - .quad 0x0080890000000000 /* TS descriptor */ - .quad 0x0000000000000000 /* TS continued */ -efi_gdt64_end: -#endif /* CONFIG_EFI_MIXED */ - .data ENTRY(efi_scratch) .fill 3,8,0 diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S index 8806fa73e6e6..ff85d28c50f2 100644 --- a/arch/x86/platform/efi/efi_thunk_64.S +++ b/arch/x86/platform/efi/efi_thunk_64.S @@ -1,9 +1,26 @@ /* * Copyright (C) 2014 Intel Corporation; author Matt Fleming + * + * Support for invoking 32-bit EFI runtime services from a 64-bit + * kernel. + * + * The below thunking functions are only used after ExitBootServices() + * has been called. This simplifies things considerably as compared with + * the early EFI thunking because we can leave all the kernel state + * intact (GDT, IDT, etc) and simply invoke the the 32-bit EFI runtime + * services from __KERNEL32_CS. This means we can continue to service + * interrupts across an EFI mixed mode call. + * + * We do however, need to handle the fact that we're running in a full + * 64-bit virtual address space. Things like the stack and instruction + * addresses need to be accessible by the 32-bit firmware, so we rely on + * using the identity mappings in the EFI page table to access the stack + * and kernel text (see efi_setup_page_tables()). */ #include #include +#include .text .code64 @@ -33,14 +50,6 @@ ENTRY(efi64_thunk) leaq efi_exit32(%rip), %rbx subq %rax, %rbx movl %ebx, 8(%rsp) - leaq efi_gdt64(%rip), %rbx - subq %rax, %rbx - movl %ebx, 2(%ebx) - movl %ebx, 4(%rsp) - leaq efi_gdt32(%rip), %rbx - subq %rax, %rbx - movl %ebx, 2(%ebx) - movl %ebx, (%rsp) leaq __efi64_thunk(%rip), %rbx subq %rax, %rbx @@ -52,14 +61,92 @@ ENTRY(efi64_thunk) retq ENDPROC(efi64_thunk) - .data -efi_gdt32: - .word efi_gdt32_end - efi_gdt32 - .long 0 /* Filled out above */ - .word 0 - .quad 0x0000000000000000 /* NULL descriptor */ - .quad 0x00cf9a000000ffff /* __KERNEL_CS */ - .quad 0x00cf93000000ffff /* __KERNEL_DS */ -efi_gdt32_end: +/* + * We run this function from the 1:1 mapping. + * + * This function must be invoked with a 1:1 mapped stack. + */ +ENTRY(__efi64_thunk) + movl %ds, %eax + push %rax + movl %es, %eax + push %rax + movl %ss, %eax + push %rax + + subq $32, %rsp + movl %esi, 0x0(%rsp) + movl %edx, 0x4(%rsp) + movl %ecx, 0x8(%rsp) + movq %r8, %rsi + movl %esi, 0xc(%rsp) + movq %r9, %rsi + movl %esi, 0x10(%rsp) + + leaq 1f(%rip), %rbx + movq %rbx, func_rt_ptr(%rip) + + /* Switch to 32-bit descriptor */ + pushq $__KERNEL32_CS + leaq efi_enter32(%rip), %rax + pushq %rax + lretq + +1: addq $32, %rsp + + pop %rbx + movl %ebx, %ss + pop %rbx + movl %ebx, %es + pop %rbx + movl %ebx, %ds + /* + * Convert 32-bit status code into 64-bit. + */ + test %rax, %rax + jz 1f + movl %eax, %ecx + andl $0x0fffffff, %ecx + andl $0xf0000000, %eax + shl $32, %rax + or %rcx, %rax +1: + ret +ENDPROC(__efi64_thunk) + +ENTRY(efi_exit32) + movq func_rt_ptr(%rip), %rax + push %rax + mov %rdi, %rax + ret +ENDPROC(efi_exit32) + + .code32 +/* + * EFI service pointer must be in %edi. + * + * The stack should represent the 32-bit calling convention. + */ +ENTRY(efi_enter32) + movl $__KERNEL_DS, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + + call *%edi + + /* We must preserve return value */ + movl %eax, %edi + + movl 72(%esp), %eax + pushl $__KERNEL_CS + pushl %eax + + lret +ENDPROC(efi_enter32) + + .data + .balign 8 +func_rt_ptr: .quad 0 efi_saved_sp: .quad 0 -- cgit v1.2.3 From d6abfdb2022368d8c6c4be3f11a06656601a6cc2 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Fri, 6 Feb 2015 16:44:11 +0530 Subject: x86/spinlocks/paravirt: Fix memory corruption on unlock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Paravirt spinlock clears slowpath flag after doing unlock. As explained by Linus currently it does: prev = *lock; add_smp(&lock->tickets.head, TICKET_LOCK_INC); /* add_smp() is a full mb() */ if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) __ticket_unlock_slowpath(lock, prev); which is *exactly* the kind of things you cannot do with spinlocks, because after you've done the "add_smp()" and released the spinlock for the fast-path, you can't access the spinlock any more. Exactly because a fast-path lock might come in, and release the whole data structure. Linus suggested that we should not do any writes to lock after unlock(), and we can move slowpath clearing to fastpath lock. So this patch implements the fix with: 1. Moving slowpath flag to head (Oleg): Unlocked locks don't care about the slowpath flag; therefore we can keep it set after the last unlock, and clear it again on the first (try)lock. -- this removes the write after unlock. note that keeping slowpath flag would result in unnecessary kicks. By moving the slowpath flag from the tail to the head ticket we also avoid the need to access both the head and tail tickets on unlock. 2. use xadd to avoid read/write after unlock that checks the need for unlock_kick (Linus): We further avoid the need for a read-after-release by using xadd; the prev head value will include the slowpath flag and indicate if we need to do PV kicking of suspended spinners -- on modern chips xadd isn't (much) more expensive than an add + load. Result: setup: 16core (32 cpu +ht sandy bridge 8GB 16vcpu guest) benchmark overcommit %improve kernbench 1x -0.13 kernbench 2x 0.02 dbench 1x -1.77 dbench 2x -0.63 [Jeremy: Hinted missing TICKET_LOCK_INC for kick] [Oleg: Moved slowpath flag to head, ticket_equals idea] [PeterZ: Added detailed changelog] Suggested-by: Linus Torvalds Reported-by: Sasha Levin Tested-by: Sasha Levin Signed-off-by: Raghavendra K T Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Cc: Andrew Jones Cc: Andrew Morton Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Christian Borntraeger Cc: Christoph Lameter Cc: Dave Hansen Cc: Dave Jones Cc: David Vrabel Cc: Fernando Luis Vázquez Cao Cc: Konrad Rzeszutek Wilk Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Ulrich Obergfell Cc: Waiman Long Cc: a.ryabinin@samsung.com Cc: dave@stgolabs.net Cc: hpa@zytor.com Cc: jasowang@redhat.com Cc: jeremy@goop.org Cc: paul.gortmaker@windriver.com Cc: riel@redhat.com Cc: tglx@linutronix.de Cc: waiman.long@hp.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/20150215173043.GA7471@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/spinlock.h | 94 ++++++++++++++++++++--------------------- arch/x86/kernel/kvm.c | 13 ++++-- arch/x86/xen/spinlock.c | 13 ++++-- 3 files changed, 64 insertions(+), 56 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 625660f8a2fc..cf87de3fc390 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -46,7 +46,7 @@ static __always_inline bool static_key_false(struct static_key *key); static inline void __ticket_enter_slowpath(arch_spinlock_t *lock) { - set_bit(0, (volatile unsigned long *)&lock->tickets.tail); + set_bit(0, (volatile unsigned long *)&lock->tickets.head); } #else /* !CONFIG_PARAVIRT_SPINLOCKS */ @@ -60,10 +60,30 @@ static inline void __ticket_unlock_kick(arch_spinlock_t *lock, } #endif /* CONFIG_PARAVIRT_SPINLOCKS */ +static inline int __tickets_equal(__ticket_t one, __ticket_t two) +{ + return !((one ^ two) & ~TICKET_SLOWPATH_FLAG); +} + +static inline void __ticket_check_and_clear_slowpath(arch_spinlock_t *lock, + __ticket_t head) +{ + if (head & TICKET_SLOWPATH_FLAG) { + arch_spinlock_t old, new; + + old.tickets.head = head; + new.tickets.head = head & ~TICKET_SLOWPATH_FLAG; + old.tickets.tail = new.tickets.head + TICKET_LOCK_INC; + new.tickets.tail = old.tickets.tail; + + /* try to clear slowpath flag when there are no contenders */ + cmpxchg(&lock->head_tail, old.head_tail, new.head_tail); + } +} static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) { - return lock.tickets.head == lock.tickets.tail; + return __tickets_equal(lock.tickets.head, lock.tickets.tail); } /* @@ -87,18 +107,21 @@ static __always_inline void arch_spin_lock(arch_spinlock_t *lock) if (likely(inc.head == inc.tail)) goto out; - inc.tail &= ~TICKET_SLOWPATH_FLAG; for (;;) { unsigned count = SPIN_THRESHOLD; do { - if (READ_ONCE(lock->tickets.head) == inc.tail) - goto out; + inc.head = READ_ONCE(lock->tickets.head); + if (__tickets_equal(inc.head, inc.tail)) + goto clear_slowpath; cpu_relax(); } while (--count); __ticket_lock_spinning(lock, inc.tail); } -out: barrier(); /* make sure nothing creeps before the lock is taken */ +clear_slowpath: + __ticket_check_and_clear_slowpath(lock, inc.head); +out: + barrier(); /* make sure nothing creeps before the lock is taken */ } static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) @@ -106,56 +129,30 @@ static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) arch_spinlock_t old, new; old.tickets = READ_ONCE(lock->tickets); - if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG)) + if (!__tickets_equal(old.tickets.head, old.tickets.tail)) return 0; new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT); + new.head_tail &= ~TICKET_SLOWPATH_FLAG; /* cmpxchg is a full barrier, so nothing can move before it */ return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail; } -static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock, - arch_spinlock_t old) -{ - arch_spinlock_t new; - - BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); - - /* Perform the unlock on the "before" copy */ - old.tickets.head += TICKET_LOCK_INC; - - /* Clear the slowpath flag */ - new.head_tail = old.head_tail & ~(TICKET_SLOWPATH_FLAG << TICKET_SHIFT); - - /* - * If the lock is uncontended, clear the flag - use cmpxchg in - * case it changes behind our back though. - */ - if (new.tickets.head != new.tickets.tail || - cmpxchg(&lock->head_tail, old.head_tail, - new.head_tail) != old.head_tail) { - /* - * Lock still has someone queued for it, so wake up an - * appropriate waiter. - */ - __ticket_unlock_kick(lock, old.tickets.head); - } -} - static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) { if (TICKET_SLOWPATH_FLAG && - static_key_false(¶virt_ticketlocks_enabled)) { - arch_spinlock_t prev; + static_key_false(¶virt_ticketlocks_enabled)) { + __ticket_t head; - prev = *lock; - add_smp(&lock->tickets.head, TICKET_LOCK_INC); + BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); - /* add_smp() is a full mb() */ + head = xadd(&lock->tickets.head, TICKET_LOCK_INC); - if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) - __ticket_unlock_slowpath(lock, prev); + if (unlikely(head & TICKET_SLOWPATH_FLAG)) { + head &= ~TICKET_SLOWPATH_FLAG; + __ticket_unlock_kick(lock, (head + TICKET_LOCK_INC)); + } } else __add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX); } @@ -164,14 +161,15 @@ static inline int arch_spin_is_locked(arch_spinlock_t *lock) { struct __raw_tickets tmp = READ_ONCE(lock->tickets); - return tmp.tail != tmp.head; + return !__tickets_equal(tmp.tail, tmp.head); } static inline int arch_spin_is_contended(arch_spinlock_t *lock) { struct __raw_tickets tmp = READ_ONCE(lock->tickets); - return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC; + tmp.head &= ~TICKET_SLOWPATH_FLAG; + return (tmp.tail - tmp.head) > TICKET_LOCK_INC; } #define arch_spin_is_contended arch_spin_is_contended @@ -183,16 +181,16 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) { - __ticket_t head = ACCESS_ONCE(lock->tickets.head); + __ticket_t head = READ_ONCE(lock->tickets.head); for (;;) { - struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + struct __raw_tickets tmp = READ_ONCE(lock->tickets); /* * We need to check "unlocked" in a loop, tmp.head == head * can be false positive because of overflow. */ - if (tmp.head == (tmp.tail & ~TICKET_SLOWPATH_FLAG) || - tmp.head != head) + if (__tickets_equal(tmp.head, tmp.tail) || + !__tickets_equal(tmp.head, head)) break; cpu_relax(); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 94f643484300..e354cc6446ab 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -609,7 +609,7 @@ static inline void check_zero(void) u8 ret; u8 old; - old = ACCESS_ONCE(zero_stats); + old = READ_ONCE(zero_stats); if (unlikely(old)) { ret = cmpxchg(&zero_stats, old, 0); /* This ensures only one fellow resets the stat */ @@ -727,6 +727,7 @@ __visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) int cpu; u64 start; unsigned long flags; + __ticket_t head; if (in_nmi()) return; @@ -768,11 +769,15 @@ __visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) */ __ticket_enter_slowpath(lock); + /* make sure enter_slowpath, which is atomic does not cross the read */ + smp_mb__after_atomic(); + /* * check again make sure it didn't become free while * we weren't looking. */ - if (ACCESS_ONCE(lock->tickets.head) == want) { + head = READ_ONCE(lock->tickets.head); + if (__tickets_equal(head, want)) { add_stats(TAKEN_SLOW_PICKUP, 1); goto out; } @@ -803,8 +808,8 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) add_stats(RELEASED_SLOW, 1); for_each_cpu(cpu, &waiting_cpus) { const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu); - if (ACCESS_ONCE(w->lock) == lock && - ACCESS_ONCE(w->want) == ticket) { + if (READ_ONCE(w->lock) == lock && + READ_ONCE(w->want) == ticket) { add_stats(RELEASED_SLOW_KICKED, 1); kvm_kick_cpu(cpu); break; diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 23b45eb9a89c..956374c1edbc 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -41,7 +41,7 @@ static u8 zero_stats; static inline void check_zero(void) { u8 ret; - u8 old = ACCESS_ONCE(zero_stats); + u8 old = READ_ONCE(zero_stats); if (unlikely(old)) { ret = cmpxchg(&zero_stats, old, 0); /* This ensures only one fellow resets the stat */ @@ -112,6 +112,7 @@ __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) struct xen_lock_waiting *w = this_cpu_ptr(&lock_waiting); int cpu = smp_processor_id(); u64 start; + __ticket_t head; unsigned long flags; /* If kicker interrupts not initialized yet, just spin */ @@ -159,11 +160,15 @@ __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) */ __ticket_enter_slowpath(lock); + /* make sure enter_slowpath, which is atomic does not cross the read */ + smp_mb__after_atomic(); + /* * check again make sure it didn't become free while * we weren't looking */ - if (ACCESS_ONCE(lock->tickets.head) == want) { + head = READ_ONCE(lock->tickets.head); + if (__tickets_equal(head, want)) { add_stats(TAKEN_SLOW_PICKUP, 1); goto out; } @@ -204,8 +209,8 @@ static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next) const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu); /* Make sure we read lock before want */ - if (ACCESS_ONCE(w->lock) == lock && - ACCESS_ONCE(w->want) == next) { + if (READ_ONCE(w->lock) == lock && + READ_ONCE(w->want) == next) { add_stats(RELEASED_SLOW_KICKED, 1); xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); break; -- cgit v1.2.3 From 1ea76fbadd667b19c4fa4466f3a3b55a505e83d9 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 16 Feb 2015 10:11:13 +0800 Subject: x86/irq: Fix regression caused by commit b568b8601f05 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit b568b8601f05 ("Treat SCI interrupt as normal GSI interrupt") accidently removes support of legacy PIC interrupt when fixing a regression for Xen, which causes a nasty regression on HP/Compaq nc6000 where we fail to register the ACPI interrupt, and thus lose eg. thermal notifications leading a potentially overheated machine. So reintroduce support of legacy PIC based ACPI SCI interrupt. Reported-by: Ville Syrjälä Tested-by: Ville Syrjälä Signed-off-by: Jiang Liu Signed-off-by: Peter Zijlstra (Intel) Acked-by: Pavel Machek Cc: # 3.19+ Cc: H. Peter Anvin Cc: Len Brown Cc: Linus Torvalds Cc: Rafael J. Wysocki Cc: Sander Eikelenboom Cc: linux-pm@vger.kernel.org Link: http://lkml.kernel.org/r/1424052673-22974-1-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a18fff361c7f..8b5916342902 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -613,6 +613,11 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp) { int rc, irq, trigger, polarity; + if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { + *irqp = gsi; + return 0; + } + rc = acpi_get_override_irq(gsi, &trigger, &polarity); if (rc == 0) { trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE; -- cgit v1.2.3 From d97eb8966c91f2c9d05f0a22eb89ed5b76d966d1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 4 Feb 2015 13:33:33 +0100 Subject: x86/irq: Check for valid irq descriptor in check_irq_vectors_for_cpu_disable() When an interrupt is migrated away from a cpu it will stay in its vector_irq array until smp_irq_move_cleanup_interrupt succeeded. The cfg->move_in_progress flag is cleared already when the IPI was sent. When the interrupt is destroyed after migration its 'struct irq_desc' is freed and the vector_irq arrays are cleaned up. But since cfg->move_in_progress is already 0 the references at cpus before the last migration will not be cleared. So this would leave a reference to an already destroyed irq alive. When the cpu is taken down at this point, the check_irq_vectors_for_cpu_disable() function finds a valid irq number in the vector_irq array, but gets NULL for its descriptor and dereferences it, causing a kernel panic. This has been observed on real systems at shutdown. Add a check to check_irq_vectors_for_cpu_disable() for a valid 'struct irq_desc' to prevent this issue. Signed-off-by: Joerg Roedel Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiang Liu Cc: H. Peter Anvin Cc: Jan Beulich Cc: K. Y. Srinivasan Cc: Linus Torvalds Cc: Prarit Bhargava Cc: Rasmus Villemoes Cc: Yinghai Lu Cc: alnovak@suse.com Cc: joro@8bytes.org Link: http://lkml.kernel.org/r/20150204132754.GA10078@suse.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 705ef8d48e2d..67b1cbe0093a 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -302,6 +302,9 @@ int check_irq_vectors_for_cpu_disable(void) irq = __this_cpu_read(vector_irq[vector]); if (irq >= 0) { desc = irq_to_desc(irq); + if (!desc) + continue; + data = irq_desc_get_irq_data(desc); cpumask_copy(&affinity_new, data->affinity); cpu_clear(this_cpu, affinity_new); -- cgit v1.2.3 From 097f4e5e839359021c8f0ea273655031e6ed04ff Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 12 Feb 2015 20:18:50 +0100 Subject: uprobes/x86: Add comment with insn opcodes, mnemonics and why we dont support them After adding these, it's clear we have some awkward choices there. Some valid instructions are prohibited from uprobing while several invalid ones are allowed. Hopefully future edits to the good-opcode tables will fix wrong bits or explain why those bits are not wrong. No actual code changes. Signed-off-by: Denys Vlasenko Cc: Andy Lutomirski Cc: Jim Keniston Cc: Masami Hiramatsu Cc: Oleg Nesterov Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1423768732-32194-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/uprobes.c | 153 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 134 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 8b96a947021f..54e36248e9c0 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -66,6 +66,49 @@ * Good-instruction tables for 32-bit apps. This is non-const and volatile * to keep gcc from statically optimizing it out, as variable_test_bit makes * some versions of gcc to think only *(unsigned long*) is used. + * + * Prefixes. Most marked as "bad", but it doesn't matter, since insn decoder + * won't report *prefixes* as OPCODE1(insn). + * 0f - 2-byte opcode prefix + * 26,2e,36,3e - es:/cs:/ss:/ds: + * 64 - fs: (marked as "good", why?) + * 65 - gs: (marked as "good", why?) + * 66 - operand-size prefix + * 67 - address-size prefix + * f0 - lock prefix + * f2 - repnz (marked as "good", why?) + * f3 - rep/repz (marked as "good", why?) + * + * Opcodes we'll probably never support: + * 6c-6f - ins,outs. SEGVs if used in userspace + * e4-e7 - in,out imm. SEGVs if used in userspace + * ec-ef - in,out acc. SEGVs if used in userspace + * cc - int3. SIGTRAP if used in userspace + * ce - into. Not used in userspace - no kernel support to make it useful. SEGVs + * (why we support bound (62) then? it's similar, and similarly unused...) + * f1 - int1. SIGTRAP if used in userspace + * f4 - hlt. SEGVs if used in userspace + * fa - cli. SEGVs if used in userspace + * fb - sti. SEGVs if used in userspace + * + * Opcodes which need some work to be supported: + * 07,17,1f - pop es/ss/ds + * Normally not used in userspace, but would execute if used. + * Can cause GP or stack exception if tries to load wrong segment descriptor. + * We hesitate to run them under single step since kernel's handling + * of userspace single-stepping (TF flag) is fragile. + * We can easily refuse to support push es/cs/ss/ds (06/0e/16/1e) + * on the same grounds that they are never used. + * cd - int N. + * Used by userspace for "int 80" syscall entry. (Other "int N" + * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3). + * Not supported since kernel's handling of userspace single-stepping + * (TF flag) is fragile. + * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad + * + * Opcodes which can be enabled right away: + * 63 - arpl. This insn has no unusual exceptions (it's basically an arith op). + * d6 - salc. Undocumented "sign-extend carry flag to AL" insn */ #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static volatile u32 good_insns_32[256 / 32] = { @@ -94,7 +137,55 @@ static volatile u32 good_insns_32[256 / 32] = { #define good_insns_32 NULL #endif -/* Good-instruction tables for 64-bit apps */ +/* Good-instruction tables for 64-bit apps. + * + * Prefixes. Most marked as "bad", but it doesn't matter, since insn decoder + * won't report *prefixes* as OPCODE1(insn). + * 0f - 2-byte opcode prefix + * 26,2e,36,3e - es:/cs:/ss:/ds: + * 40-4f - rex prefixes + * 64 - fs: (marked as "good", why?) + * 65 - gs: (marked as "good", why?) + * 66 - operand-size prefix + * 67 - address-size prefix + * f0 - lock prefix + * f2 - repnz (marked as "good", why?) + * f3 - rep/repz (marked as "good", why?) + * + * Genuinely invalid opcodes: + * 06,07 - formerly push/pop es + * 0e - formerly push cs + * 16,17 - formerly push/pop ss + * 1e,1f - formerly push/pop ds + * 27,2f,37,3f - formerly daa/das/aaa/aas + * 60,61 - formerly pusha/popa + * 62 - formerly bound. EVEX prefix for AVX512 + * 82 - formerly redundant encoding of Group1 + * 9a - formerly call seg:ofs (marked as "supported"???) + * c4,c5 - formerly les/lds. VEX prefixes for AVX + * ce - formerly into + * d4,d5 - formerly aam/aad + * d6 - formerly undocumented salc + * ea - formerly jmp seg:ofs (marked as "supported"???) + * + * Opcodes we'll probably never support: + * 6c-6f - ins,outs. SEGVs if used in userspace + * e4-e7 - in,out imm. SEGVs if used in userspace + * ec-ef - in,out acc. SEGVs if used in userspace + * cc - int3. SIGTRAP if used in userspace + * f1 - int1. SIGTRAP if used in userspace + * f4 - hlt. SEGVs if used in userspace + * fa - cli. SEGVs if used in userspace + * fb - sti. SEGVs if used in userspace + * + * Opcodes which need some work to be supported: + * cd - int N. + * Used by userspace for "int 80" syscall entry. (Other "int N" + * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3). + * Not supported since kernel's handling of userspace single-stepping + * (TF flag) is fragile. + * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad + */ #if defined(CONFIG_X86_64) static volatile u32 good_insns_64[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ @@ -122,7 +213,48 @@ static volatile u32 good_insns_64[256 / 32] = { #define good_insns_64 NULL #endif -/* Using this for both 64-bit and 32-bit apps */ +/* Using this for both 64-bit and 32-bit apps. + * Opcodes we don't support: + * 0f 00 - SLDT/STR/LLDT/LTR/VERR/VERW/-/- group. System insns + * 0f 01 - SGDT/SIDT/LGDT/LIDT/SMSW/-/LMSW/INVLPG group. + * Also encodes tons of other system insns if mod=11. + * Some are in fact non-system: xend, xtest, rdtscp, maybe more + * 0f 02 - lar (why? should be safe, it throws no exceptipons) + * 0f 03 - lsl (why? should be safe, it throws no exceptipons) + * 0f 04 - undefined + * 0f 05 - syscall + * 0f 06 - clts (CPL0 insn) + * 0f 07 - sysret + * 0f 08 - invd (CPL0 insn) + * 0f 09 - wbinvd (CPL0 insn) + * 0f 0a - undefined + * 0f 0b - ud2 + * 0f 0c - undefined + * 0f 0d - prefetchFOO (amd prefetch insns) + * 0f 18 - prefetchBAR (intel prefetch insns) + * 0f 24 - mov from test regs (perhaps entire 20-27 area can be disabled (special reg ops)) + * 0f 25 - undefined + * 0f 26 - mov to test regs + * 0f 27 - undefined + * 0f 30 - wrmsr (CPL0 insn) + * 0f 34 - sysenter + * 0f 35 - sysexit + * 0f 36 - undefined + * 0f 37 - getsec + * 0f 38-3f - 3-byte opcodes (why?? all look safe) + * 0f 78 - vmread + * 0f 79 - vmwrite + * 0f 7a - undefined + * 0f 7b - undefined + * 0f 7c - undefined + * 0f 7d - undefined + * 0f a6 - undefined + * 0f a7 - undefined + * 0f b8 - popcnt (why?? it's an ordinary ALU op) + * 0f d0 - undefined + * 0f f0 - lddqu (why?? it's an ordinary vector load op) + * 0f ff - undefined + */ static volatile u32 good_2byte_insns[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ @@ -148,23 +280,6 @@ static volatile u32 good_2byte_insns[256 / 32] = { #undef W /* - * opcodes we'll probably never support: - * - * 6c-6d, e4-e5, ec-ed - in - * 6e-6f, e6-e7, ee-ef - out - * cc, cd - int3, int - * cf - iret - * d6 - illegal instruction - * f1 - int1/icebp - * f4 - hlt - * fa, fb - cli, sti - * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2 - * - * invalid opcodes in 64-bit mode: - * - * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5 - * 63 - we support this opcode in x86_64 but not in i386. - * * opcodes we may need to refine support for: * * 0f - 2-byte instructions: For many of these instructions, the validity -- cgit v1.2.3 From 67fc809217dc7fd793211585b2a8d7b61715d06b Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 12 Feb 2015 20:18:51 +0100 Subject: uprobes/x86: Fix 1-byte opcode tables This change fixes 1-byte opcode tables so that only insns for which we have real reasons to disallow probing are marked with unset bits. To that end: Set bits for all prefix bytes. Their setting is ignored anyway - we check the bitmap against OPCODE1(insn), not against first byte. Keeping them set to 0 only confuses code reader with "why we don't support that opcode" question. Thus: enable bytes c4,c5 in 64-bit mode (VEX prefixes). Byte 62 (EVEX prefix) is not yet enabled since insn decoder does not support that yet. For 32-bit mode, enable probing of opcodes 63 (arpl) and d6 (salc). They don't require any special handling. For 64-bit mode, disable 9a and ea - these undefined opcodes were mistakenly left enabled. Signed-off-by: Denys Vlasenko Cc: Andy Lutomirski Cc: Jim Keniston Cc: Masami Hiramatsu Cc: Oleg Nesterov Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1423768732-32194-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/uprobes.c | 66 +++++++++++++---------------------------------- 1 file changed, 18 insertions(+), 48 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 54e36248e9c0..aa1da96d09ff 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -67,18 +67,6 @@ * to keep gcc from statically optimizing it out, as variable_test_bit makes * some versions of gcc to think only *(unsigned long*) is used. * - * Prefixes. Most marked as "bad", but it doesn't matter, since insn decoder - * won't report *prefixes* as OPCODE1(insn). - * 0f - 2-byte opcode prefix - * 26,2e,36,3e - es:/cs:/ss:/ds: - * 64 - fs: (marked as "good", why?) - * 65 - gs: (marked as "good", why?) - * 66 - operand-size prefix - * 67 - address-size prefix - * f0 - lock prefix - * f2 - repnz (marked as "good", why?) - * f3 - rep/repz (marked as "good", why?) - * * Opcodes we'll probably never support: * 6c-6f - ins,outs. SEGVs if used in userspace * e4-e7 - in,out imm. SEGVs if used in userspace @@ -105,31 +93,27 @@ * Not supported since kernel's handling of userspace single-stepping * (TF flag) is fragile. * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad - * - * Opcodes which can be enabled right away: - * 63 - arpl. This insn has no unusual exceptions (it's basically an arith op). - * d6 - salc. Undocumented "sign-extend carry flag to AL" insn */ #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static volatile u32 good_insns_32[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */ + W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 00 */ W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */ - W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* 20 */ - W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) , /* 30 */ + W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ + W(0x30, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ - W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ + W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ - W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ - W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ + W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ /* ---------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; @@ -138,19 +122,6 @@ static volatile u32 good_insns_32[256 / 32] = { #endif /* Good-instruction tables for 64-bit apps. - * - * Prefixes. Most marked as "bad", but it doesn't matter, since insn decoder - * won't report *prefixes* as OPCODE1(insn). - * 0f - 2-byte opcode prefix - * 26,2e,36,3e - es:/cs:/ss:/ds: - * 40-4f - rex prefixes - * 64 - fs: (marked as "good", why?) - * 65 - gs: (marked as "good", why?) - * 66 - operand-size prefix - * 67 - address-size prefix - * f0 - lock prefix - * f2 - repnz (marked as "good", why?) - * f3 - rep/repz (marked as "good", why?) * * Genuinely invalid opcodes: * 06,07 - formerly push/pop es @@ -159,14 +130,13 @@ static volatile u32 good_insns_32[256 / 32] = { * 1e,1f - formerly push/pop ds * 27,2f,37,3f - formerly daa/das/aaa/aas * 60,61 - formerly pusha/popa - * 62 - formerly bound. EVEX prefix for AVX512 + * 62 - formerly bound. EVEX prefix for AVX512 (not yet supported) * 82 - formerly redundant encoding of Group1 - * 9a - formerly call seg:ofs (marked as "supported"???) - * c4,c5 - formerly les/lds. VEX prefixes for AVX + * 9a - formerly call seg:ofs * ce - formerly into * d4,d5 - formerly aam/aad * d6 - formerly undocumented salc - * ea - formerly jmp seg:ofs (marked as "supported"???) + * ea - formerly jmp seg:ofs * * Opcodes we'll probably never support: * 6c-6f - ins,outs. SEGVs if used in userspace @@ -190,22 +160,22 @@ static volatile u32 good_insns_32[256 / 32] = { static volatile u32 good_insns_64[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */ + W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* 00 */ W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */ - W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */ - W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */ - W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ + W(0x20, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 20 */ + W(0x30, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 30 */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ - W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ + W(0x60, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ - W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1) , /* 90 */ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ - W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ + W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ - W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ - W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ + W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0) | /* e0 */ + W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ /* ---------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; -- cgit v1.2.3 From 5154d4f2adfdabe5aeb247e5b2b6b10fae5f6d50 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 12 Feb 2015 20:18:52 +0100 Subject: uprobes/x86: Fix 2-byte opcode table Enabled probing of lar, lsl, popcnt, lddqu, prefetch insns. They should be safe to probe, they throw no exceptions. Enabled probing of 3-byte opcodes 0f 38-3f xx - these are vector isns, so should be safe. Enabled probing of many currently undefined 0f xx insns. At the rate new vector instructions are getting added, we don't want to constantly enable more bits. We want to only occasionally *disable* ones which for some reason can't be probed. This includes 0f 24,26 opcodes, which are undefined since Pentium. On 486, they were "mov to/from test register". Explained more fully what 0f 78,79 opcodes are. Explained what 0f ae opcode is. (It's unclear why we don't allow probing it, but let's not change it for now). Signed-off-by: Denys Vlasenko Cc: Andy Lutomirski Cc: Jim Keniston Cc: Masami Hiramatsu Cc: Oleg Nesterov Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1423768732-32194-3-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/uprobes.c | 52 ++++++++++++++++------------------------------- 1 file changed, 17 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index aa1da96d09ff..81f8adb0679e 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -189,61 +189,43 @@ static volatile u32 good_insns_64[256 / 32] = { * 0f 01 - SGDT/SIDT/LGDT/LIDT/SMSW/-/LMSW/INVLPG group. * Also encodes tons of other system insns if mod=11. * Some are in fact non-system: xend, xtest, rdtscp, maybe more - * 0f 02 - lar (why? should be safe, it throws no exceptipons) - * 0f 03 - lsl (why? should be safe, it throws no exceptipons) - * 0f 04 - undefined * 0f 05 - syscall * 0f 06 - clts (CPL0 insn) * 0f 07 - sysret * 0f 08 - invd (CPL0 insn) * 0f 09 - wbinvd (CPL0 insn) - * 0f 0a - undefined * 0f 0b - ud2 - * 0f 0c - undefined - * 0f 0d - prefetchFOO (amd prefetch insns) - * 0f 18 - prefetchBAR (intel prefetch insns) - * 0f 24 - mov from test regs (perhaps entire 20-27 area can be disabled (special reg ops)) - * 0f 25 - undefined - * 0f 26 - mov to test regs - * 0f 27 - undefined - * 0f 30 - wrmsr (CPL0 insn) + * 0f 30 - wrmsr (CPL0 insn) (then why rdmsr is allowed, it's also CPL0 insn?) * 0f 34 - sysenter * 0f 35 - sysexit - * 0f 36 - undefined * 0f 37 - getsec - * 0f 38-3f - 3-byte opcodes (why?? all look safe) - * 0f 78 - vmread - * 0f 79 - vmwrite - * 0f 7a - undefined - * 0f 7b - undefined - * 0f 7c - undefined - * 0f 7d - undefined - * 0f a6 - undefined - * 0f a7 - undefined - * 0f b8 - popcnt (why?? it's an ordinary ALU op) - * 0f d0 - undefined - * 0f f0 - lddqu (why?? it's an ordinary vector load op) - * 0f ff - undefined + * 0f 78 - vmread (Intel VMX. CPL0 insn) + * 0f 79 - vmwrite (Intel VMX. CPL0 insn) + * Note: with prefixes, these two opcodes are + * extrq/insertq/AVX512 convert vector ops. + * 0f ae - group15: [f]xsave,[f]xrstor,[v]{ld,st}mxcsr,clflush[opt], + * {rd,wr}{fs,gs}base,{s,l,m}fence. + * Why? They are all user-executable. */ static volatile u32 good_2byte_insns[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ - W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */ - W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ - W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ - W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ + W(0x00, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1) | /* 00 */ + W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ + W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ + W(0x30, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */ - W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* 70 */ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ - W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ - W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ + W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ - W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */ - W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */ + W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) /* f0 */ /* ---------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; -- cgit v1.2.3 From b7e37567d080301d38a302bb93ba79d1ca446dca Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Tue, 10 Feb 2015 09:34:05 +0800 Subject: kprobes/x86: Mark 2 bytes NOP as boostable Currently, x86 kprobes is unable to boost 2 bytes nop like: nopl 0x0(%rax,%rax,1) which is 0x0f 0x1f 0x44 0x00 0x00. Such nops have exactly 5 bytes to hold a relative jmp instruction. Boosting them should be obviously safe. This patch enable boosting such nops by simply updating twobyte_is_boostable[] array. Signed-off-by: Wang Nan Acked-by: Masami Hiramatsu Cc: Link: http://lkml.kernel.org/r/1423532045-41049-1-git-send-email-wangnan0@huawei.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 98f654d466e5..6a1146ea4d4d 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -84,7 +84,7 @@ static volatile u32 twobyte_is_boostable[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */ - W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 10 */ + W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1) , /* 10 */ W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */ W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ -- cgit v1.2.3 From b273c2c2f2d2d13dc0bfa8923d52fbaf8fa56ae8 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Delgado Date: Mon, 2 Feb 2015 20:27:11 +0100 Subject: x86/apic: Fix the devicetree build in certain configs Without this patch: LD init/built-in.o arch/x86/built-in.o: In function `dtb_lapic_setup': kernel/devicetree.c:155: undefined reference to `apic_force_enable' Makefile:923: recipe for target 'vmlinux' failed make: *** [vmlinux] Error 1 Signed-off-by: Ricardo Ribalda Delgado Reviewed-by: Maciej W. Rozycki Cc: David Rientjes Cc: Jan Beulich Link: http://lkml.kernel.org/r/1422905231-16067-1-git-send-email-ricardo.ribalda@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 92003f3c8a42..efc3b22d896e 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -213,7 +213,15 @@ void register_lapic_address(unsigned long address); extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); extern int APIC_init_uniprocessor(void); + +#ifdef CONFIG_X86_64 +static inline int apic_force_enable(unsigned long addr) +{ + return -1; +} +#else extern int apic_force_enable(unsigned long addr); +#endif extern int apic_bsp_setup(bool upmode); extern void apic_ap_setup(void); -- cgit v1.2.3 From 28a375df16c2b6d01227541f3956568995aa5fda Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Fri, 30 Jan 2015 16:29:38 +0000 Subject: x86/intel/quark: Add Isolated Memory Regions for Quark X1000 Intel's Quark X1000 SoC contains a set of registers called Isolated Memory Regions. IMRs are accessed over the IOSF mailbox interface. IMRs are areas carved out of memory that define read/write access rights to the various system agents within the Quark system. For a given agent in the system it is possible to specify if that agent may read or write an area of memory defined by an IMR with a granularity of 1 KiB. Quark_SecureBootPRM_330234_001.pdf section 4.5 details the concept of IMRs quark-x1000-datasheet.pdf section 12.7.4 details the implementation of IMRs in silicon. eSRAM flush, CPU Snoop write-only, CPU SMM Mode, CPU non-SMM mode, RMU and PCIe Virtual Channels (VC0 and VC1) can have individual read/write access masks applied to them for a given memory region in Quark X1000. This enables IMRs to treat each memory transaction type listed above on an individual basis and to filter appropriately based on the IMR access mask for the memory region. Quark supports eight IMRs. Since all of the DMA capable SoC components in the X1000 are mapped to VC0 it is possible to define sections of memory as invalid for DMA write operations originating from Ethernet, USB, SD and any other DMA capable south-cluster component on VC0. Similarly it is possible to mark kernel memory as non-SMM mode read/write only or to mark BIOS runtime memory as SMM mode accessible only depending on the particular memory footprint on a given system. On an IMR violation Quark SoC X1000 systems are configured to reset the system, so ensuring that the IMR memory map is consistent with the EFI provided memory map is critical to ensure no IMR violations reset the system. The API for accessing IMRs is based on MTRR code but doesn't provide a /proc or /sys interface to manipulate IMRs. Defining the size and extent of IMRs is exclusively the domain of in-kernel code. Quark firmware sets up a series of locked IMRs around pieces of memory that firmware owns such as ACPI runtime data. During boot a series of unlocked IMRs are placed around items in memory to guarantee no DMA modification of those items can take place. Grub also places an unlocked IMR around the kernel boot params data structure and compressed kernel image. It is necessary for the kernel to tear down all unlocked IMRs in order to ensure that the kernel's view of memory passed via the EFI memory map is consistent with the IMR memory map. Without tearing down all unlocked IMRs on boot transitory IMRs such as those used to protect the compressed kernel image will cause IMR violations and system reboots. The IMR init code tears down all unlocked IMRs and sets a protective IMR around the kernel .text and .rodata as one contiguous block. This sanitizes the IMR memory map with respect to the EFI memory map and protects the read-only portions of the kernel from unwarranted DMA access. Tested-by: Ong, Boon Leong Signed-off-by: Bryan O'Donoghue Reviewed-by: Andy Shevchenko Reviewed-by: Darren Hart Reviewed-by: Ong, Boon Leong Cc: andy.shevchenko@gmail.com Cc: dvhart@infradead.org Link: http://lkml.kernel.org/r/1422635379-12476-2-git-send-email-pure.logic@nexus-software.ie Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 13 + arch/x86/include/asm/imr.h | 60 +++ arch/x86/platform/intel-quark/Makefile | 2 + arch/x86/platform/intel-quark/imr.c | 668 +++++++++++++++++++++++++++ arch/x86/platform/intel-quark/imr_selftest.c | 129 ++++++ drivers/platform/x86/Kconfig | 25 + 6 files changed, 897 insertions(+) create mode 100644 arch/x86/include/asm/imr.h create mode 100644 arch/x86/platform/intel-quark/Makefile create mode 100644 arch/x86/platform/intel-quark/imr.c create mode 100644 arch/x86/platform/intel-quark/imr_selftest.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 61bd2ad94281..20028da8ae18 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -313,6 +313,19 @@ config DEBUG_NMI_SELFTEST If unsure, say N. +config DEBUG_IMR_SELFTEST + bool "Isolated Memory Region self test" + default n + depends on INTEL_IMR + ---help--- + This option enables automated sanity testing of the IMR code. + Some simple tests are run to verify IMR bounds checking, alignment + and overlapping. This option is really only useful if you are + debugging an IMR memory map or are modifying the IMR code and want to + test your changes. + + If unsure say N here. + config X86_DEBUG_STATIC_CPU_HAS bool "Debug alternatives" depends on DEBUG_KERNEL diff --git a/arch/x86/include/asm/imr.h b/arch/x86/include/asm/imr.h new file mode 100644 index 000000000000..cd2ce4068441 --- /dev/null +++ b/arch/x86/include/asm/imr.h @@ -0,0 +1,60 @@ +/* + * imr.h: Isolated Memory Region API + * + * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2015 Bryan O'Donoghue + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#ifndef _IMR_H +#define _IMR_H + +#include + +/* + * IMR agent access mask bits + * See section 12.7.4.7 from quark-x1000-datasheet.pdf for register + * definitions. + */ +#define IMR_ESRAM_FLUSH BIT(31) +#define IMR_CPU_SNOOP BIT(30) /* Applicable only to write */ +#define IMR_RMU BIT(29) +#define IMR_VC1_SAI_ID3 BIT(15) +#define IMR_VC1_SAI_ID2 BIT(14) +#define IMR_VC1_SAI_ID1 BIT(13) +#define IMR_VC1_SAI_ID0 BIT(12) +#define IMR_VC0_SAI_ID3 BIT(11) +#define IMR_VC0_SAI_ID2 BIT(10) +#define IMR_VC0_SAI_ID1 BIT(9) +#define IMR_VC0_SAI_ID0 BIT(8) +#define IMR_CPU_0 BIT(1) /* SMM mode */ +#define IMR_CPU BIT(0) /* Non SMM mode */ +#define IMR_ACCESS_NONE 0 + +/* + * Read/Write access-all bits here include some reserved bits + * These are the values firmware uses and are accepted by hardware. + * The kernel defines read/write access-all in the same way as firmware + * in order to have a consistent and crisp definition across firmware, + * bootloader and kernel. + */ +#define IMR_READ_ACCESS_ALL 0xBFFFFFFF +#define IMR_WRITE_ACCESS_ALL 0xFFFFFFFF + +/* Number of IMRs provided by Quark X1000 SoC */ +#define QUARK_X1000_IMR_MAX 0x08 +#define QUARK_X1000_IMR_REGBASE 0x40 + +/* IMR alignment bits - only bits 31:10 are checked for IMR validity */ +#define IMR_ALIGN 0x400 +#define IMR_MASK (IMR_ALIGN - 1) + +int imr_add_range(phys_addr_t base, size_t size, + unsigned int rmask, unsigned int wmask, bool lock); + +int imr_remove_range(phys_addr_t base, size_t size); + +#endif /* _IMR_H */ diff --git a/arch/x86/platform/intel-quark/Makefile b/arch/x86/platform/intel-quark/Makefile new file mode 100644 index 000000000000..9cc57ed36022 --- /dev/null +++ b/arch/x86/platform/intel-quark/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_INTEL_IMR) += imr.o +obj-$(CONFIG_DEBUG_IMR_SELFTEST) += imr_selftest.o diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c new file mode 100644 index 000000000000..16e4df1c9290 --- /dev/null +++ b/arch/x86/platform/intel-quark/imr.c @@ -0,0 +1,668 @@ +/** + * imr.c + * + * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2015 Bryan O'Donoghue + * + * IMR registers define an isolated region of memory that can + * be masked to prohibit certain system agents from accessing memory. + * When a device behind a masked port performs an access - snooped or + * not, an IMR may optionally prevent that transaction from changing + * the state of memory or from getting correct data in response to the + * operation. + * + * Write data will be dropped and reads will return 0xFFFFFFFF, the + * system will reset and system BIOS will print out an error message to + * inform the user that an IMR has been violated. + * + * This code is based on the Linux MTRR code and reference code from + * Intel's Quark BSP EFI, Linux and grub code. + * + * See quark-x1000-datasheet.pdf for register definitions. + * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/quark-x1000-datasheet.pdf + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct imr_device { + struct dentry *file; + bool init; + struct mutex lock; + int max_imr; + int reg_base; +}; + +static struct imr_device imr_dev; + +/* + * IMR read/write mask control registers. + * See quark-x1000-datasheet.pdf sections 12.7.4.5 and 12.7.4.6 for + * bit definitions. + * + * addr_hi + * 31 Lock bit + * 30:24 Reserved + * 23:2 1 KiB aligned lo address + * 1:0 Reserved + * + * addr_hi + * 31:24 Reserved + * 23:2 1 KiB aligned hi address + * 1:0 Reserved + */ +#define IMR_LOCK BIT(31) + +struct imr_regs { + u32 addr_lo; + u32 addr_hi; + u32 rmask; + u32 wmask; +}; + +#define IMR_NUM_REGS (sizeof(struct imr_regs)/sizeof(u32)) +#define IMR_SHIFT 8 +#define imr_to_phys(x) ((x) << IMR_SHIFT) +#define phys_to_imr(x) ((x) >> IMR_SHIFT) + +/** + * imr_is_enabled - true if an IMR is enabled false otherwise. + * + * Determines if an IMR is enabled based on address range and read/write + * mask. An IMR set with an address range set to zero and a read/write + * access mask set to all is considered to be disabled. An IMR in any + * other state - for example set to zero but without read/write access + * all is considered to be enabled. This definition of disabled is how + * firmware switches off an IMR and is maintained in kernel for + * consistency. + * + * @imr: pointer to IMR descriptor. + * @return: true if IMR enabled false if disabled. + */ +static inline int imr_is_enabled(struct imr_regs *imr) +{ + return !(imr->rmask == IMR_READ_ACCESS_ALL && + imr->wmask == IMR_WRITE_ACCESS_ALL && + imr_to_phys(imr->addr_lo) == 0 && + imr_to_phys(imr->addr_hi) == 0); +} + +/** + * imr_read - read an IMR at a given index. + * + * Requires caller to hold imr mutex. + * + * @idev: pointer to imr_device structure. + * @imr_id: IMR entry to read. + * @imr: IMR structure representing address and access masks. + * @return: 0 on success or error code passed from mbi_iosf on failure. + */ +static int imr_read(struct imr_device *idev, u32 imr_id, struct imr_regs *imr) +{ + u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base; + int ret; + + ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, + reg++, &imr->addr_lo); + if (ret) + return ret; + + ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, + reg++, &imr->addr_hi); + if (ret) + return ret; + + ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, + reg++, &imr->rmask); + if (ret) + return ret; + + ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, + reg++, &imr->wmask); + if (ret) + return ret; + + return 0; +} + +/** + * imr_write - write an IMR at a given index. + * + * Requires caller to hold imr mutex. + * Note lock bits need to be written independently of address bits. + * + * @idev: pointer to imr_device structure. + * @imr_id: IMR entry to write. + * @imr: IMR structure representing address and access masks. + * @lock: indicates if the IMR lock bit should be applied. + * @return: 0 on success or error code passed from mbi_iosf on failure. + */ +static int imr_write(struct imr_device *idev, u32 imr_id, + struct imr_regs *imr, bool lock) +{ + unsigned long flags; + u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base; + int ret; + + local_irq_save(flags); + + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, reg++, + imr->addr_lo); + if (ret) + goto failed; + + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, + reg++, imr->addr_hi); + if (ret) + goto failed; + + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, + reg++, imr->rmask); + if (ret) + goto failed; + + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, + reg++, imr->wmask); + if (ret) + goto failed; + + /* Lock bit must be set separately to addr_lo address bits. */ + if (lock) { + imr->addr_lo |= IMR_LOCK; + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, + reg - IMR_NUM_REGS, imr->addr_lo); + if (ret) + goto failed; + } + + local_irq_restore(flags); + return 0; +failed: + /* + * If writing to the IOSF failed then we're in an unknown state, + * likely a very bad state. An IMR in an invalid state will almost + * certainly lead to a memory access violation. + */ + local_irq_restore(flags); + WARN(ret, "IOSF-MBI write fail range 0x%08x-0x%08x unreliable\n", + imr_to_phys(imr->addr_lo), imr_to_phys(imr->addr_hi) + IMR_MASK); + + return ret; +} + +/** + * imr_dbgfs_state_show - print state of IMR registers. + * + * @s: pointer to seq_file for output. + * @unused: unused parameter. + * @return: 0 on success or error code passed from mbi_iosf on failure. + */ +static int imr_dbgfs_state_show(struct seq_file *s, void *unused) +{ + phys_addr_t base; + phys_addr_t end; + int i; + struct imr_device *idev = s->private; + struct imr_regs imr; + size_t size; + int ret = -ENODEV; + + mutex_lock(&idev->lock); + + for (i = 0; i < idev->max_imr; i++) { + + ret = imr_read(idev, i, &imr); + if (ret) + break; + + /* + * Remember to add IMR_ALIGN bytes to size to indicate the + * inherent IMR_ALIGN size bytes contained in the masked away + * lower ten bits. + */ + if (imr_is_enabled(&imr)) { + base = imr_to_phys(imr.addr_lo); + end = imr_to_phys(imr.addr_hi) + IMR_MASK; + } else { + base = 0; + end = 0; + } + size = end - base; + seq_printf(s, "imr%02i: base=%pa, end=%pa, size=0x%08zx " + "rmask=0x%08x, wmask=0x%08x, %s, %s\n", i, + &base, &end, size, imr.rmask, imr.wmask, + imr_is_enabled(&imr) ? "enabled " : "disabled", + imr.addr_lo & IMR_LOCK ? "locked" : "unlocked"); + } + + mutex_unlock(&idev->lock); + return ret; +} + +/** + * imr_state_open - debugfs open callback. + * + * @inode: pointer to struct inode. + * @file: pointer to struct file. + * @return: result of single open. + */ +static int imr_state_open(struct inode *inode, struct file *file) +{ + return single_open(file, imr_dbgfs_state_show, inode->i_private); +} + +static const struct file_operations imr_state_ops = { + .open = imr_state_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/** + * imr_debugfs_register - register debugfs hooks. + * + * @idev: pointer to imr_device structure. + * @return: 0 on success - errno on failure. + */ +static int imr_debugfs_register(struct imr_device *idev) +{ + idev->file = debugfs_create_file("imr_state", S_IFREG | S_IRUGO, NULL, + idev, &imr_state_ops); + if (IS_ERR(idev->file)) + return PTR_ERR(idev->file); + + return 0; +} + +/** + * imr_debugfs_unregister - unregister debugfs hooks. + * + * @idev: pointer to imr_device structure. + * @return: + */ +static void imr_debugfs_unregister(struct imr_device *idev) +{ + debugfs_remove(idev->file); +} + +/** + * imr_check_params - check passed address range IMR alignment and non-zero size + * + * @base: base address of intended IMR. + * @size: size of intended IMR. + * @return: zero on valid range -EINVAL on unaligned base/size. + */ +static int imr_check_params(phys_addr_t base, size_t size) +{ + if ((base & IMR_MASK) || (size & IMR_MASK)) { + pr_err("base %pa size 0x%08zx must align to 1KiB\n", + &base, size); + return -EINVAL; + } + if (size == 0) + return -EINVAL; + + return 0; +} + +/** + * imr_raw_size - account for the IMR_ALIGN bytes that addr_hi appends. + * + * IMR addr_hi has a built in offset of plus IMR_ALIGN (0x400) bytes from the + * value in the register. We need to subtract IMR_ALIGN bytes from input sizes + * as a result. + * + * @size: input size bytes. + * @return: reduced size. + */ +static inline size_t imr_raw_size(size_t size) +{ + return size - IMR_ALIGN; +} + +/** + * imr_address_overlap - detects an address overlap. + * + * @addr: address to check against an existing IMR. + * @imr: imr being checked. + * @return: true for overlap false for no overlap. + */ +static inline int imr_address_overlap(phys_addr_t addr, struct imr_regs *imr) +{ + return addr >= imr_to_phys(imr->addr_lo) && addr <= imr_to_phys(imr->addr_hi); +} + +/** + * imr_add_range - add an Isolated Memory Region. + * + * @base: physical base address of region aligned to 1KiB. + * @size: physical size of region in bytes must be aligned to 1KiB. + * @read_mask: read access mask. + * @write_mask: write access mask. + * @lock: indicates whether or not to permanently lock this region. + * @return: zero on success or negative value indicating error. + */ +int imr_add_range(phys_addr_t base, size_t size, + unsigned int rmask, unsigned int wmask, bool lock) +{ + phys_addr_t end; + unsigned int i; + struct imr_device *idev = &imr_dev; + struct imr_regs imr; + size_t raw_size; + int reg; + int ret; + + if (WARN_ONCE(idev->init == false, "driver not initialized")) + return -ENODEV; + + ret = imr_check_params(base, size); + if (ret) + return ret; + + /* Tweak the size value. */ + raw_size = imr_raw_size(size); + end = base + raw_size; + + /* + * Check for reserved IMR value common to firmware, kernel and grub + * indicating a disabled IMR. + */ + imr.addr_lo = phys_to_imr(base); + imr.addr_hi = phys_to_imr(end); + imr.rmask = rmask; + imr.wmask = wmask; + if (!imr_is_enabled(&imr)) + return -ENOTSUPP; + + mutex_lock(&idev->lock); + + /* + * Find a free IMR while checking for an existing overlapping range. + * Note there's no restriction in silicon to prevent IMR overlaps. + * For the sake of simplicity and ease in defining/debugging an IMR + * memory map we exclude IMR overlaps. + */ + reg = -1; + for (i = 0; i < idev->max_imr; i++) { + ret = imr_read(idev, i, &imr); + if (ret) + goto failed; + + /* Find overlap @ base or end of requested range. */ + ret = -EINVAL; + if (imr_is_enabled(&imr)) { + if (imr_address_overlap(base, &imr)) + goto failed; + if (imr_address_overlap(end, &imr)) + goto failed; + } else { + reg = i; + } + } + + /* Error out if we have no free IMR entries. */ + if (reg == -1) { + ret = -ENOMEM; + goto failed; + } + + pr_debug("add %d phys %pa-%pa size %zx mask 0x%08x wmask 0x%08x\n", + reg, &base, &end, raw_size, rmask, wmask); + + /* Enable IMR at specified range and access mask. */ + imr.addr_lo = phys_to_imr(base); + imr.addr_hi = phys_to_imr(end); + imr.rmask = rmask; + imr.wmask = wmask; + + ret = imr_write(idev, reg, &imr, lock); + if (ret < 0) { + /* + * In the highly unlikely event iosf_mbi_write failed + * attempt to rollback the IMR setup skipping the trapping + * of further IOSF write failures. + */ + imr.addr_lo = 0; + imr.addr_hi = 0; + imr.rmask = IMR_READ_ACCESS_ALL; + imr.wmask = IMR_WRITE_ACCESS_ALL; + imr_write(idev, reg, &imr, false); + } +failed: + mutex_unlock(&idev->lock); + return ret; +} +EXPORT_SYMBOL_GPL(imr_add_range); + +/** + * __imr_remove_range - delete an Isolated Memory Region. + * + * This function allows you to delete an IMR by its index specified by reg or + * by address range specified by base and size respectively. If you specify an + * index on its own the base and size parameters are ignored. + * imr_remove_range(0, base, size); delete IMR at index 0 base/size ignored. + * imr_remove_range(-1, base, size); delete IMR from base to base+size. + * + * @reg: imr index to remove. + * @base: physical base address of region aligned to 1 KiB. + * @size: physical size of region in bytes aligned to 1 KiB. + * @return: -EINVAL on invalid range or out or range id + * -ENODEV if reg is valid but no IMR exists or is locked + * 0 on success. + */ +static int __imr_remove_range(int reg, phys_addr_t base, size_t size) +{ + phys_addr_t end; + bool found = false; + unsigned int i; + struct imr_device *idev = &imr_dev; + struct imr_regs imr; + size_t raw_size; + int ret = 0; + + if (WARN_ONCE(idev->init == false, "driver not initialized")) + return -ENODEV; + + /* + * Validate address range if deleting by address, else we are + * deleting by index where base and size will be ignored. + */ + if (reg == -1) { + ret = imr_check_params(base, size); + if (ret) + return ret; + } + + /* Tweak the size value. */ + raw_size = imr_raw_size(size); + end = base + raw_size; + + mutex_lock(&idev->lock); + + if (reg >= 0) { + /* If a specific IMR is given try to use it. */ + ret = imr_read(idev, reg, &imr); + if (ret) + goto failed; + + if (!imr_is_enabled(&imr) || imr.addr_lo & IMR_LOCK) { + ret = -ENODEV; + goto failed; + } + found = true; + } else { + /* Search for match based on address range. */ + for (i = 0; i < idev->max_imr; i++) { + ret = imr_read(idev, i, &imr); + if (ret) + goto failed; + + if (!imr_is_enabled(&imr) || imr.addr_lo & IMR_LOCK) + continue; + + if ((imr_to_phys(imr.addr_lo) == base) && + (imr_to_phys(imr.addr_hi) == end)) { + found = true; + reg = i; + break; + } + } + } + + if (!found) { + ret = -ENODEV; + goto failed; + } + + pr_debug("remove %d phys %pa-%pa size %zx\n", reg, &base, &end, raw_size); + + /* Tear down the IMR. */ + imr.addr_lo = 0; + imr.addr_hi = 0; + imr.rmask = IMR_READ_ACCESS_ALL; + imr.wmask = IMR_WRITE_ACCESS_ALL; + + ret = imr_write(idev, reg, &imr, false); + +failed: + mutex_unlock(&idev->lock); + return ret; +} + +/** + * imr_remove_range - delete an Isolated Memory Region by address + * + * This function allows you to delete an IMR by an address range specified + * by base and size respectively. + * imr_remove_range(base, size); delete IMR from base to base+size. + * + * @base: physical base address of region aligned to 1 KiB. + * @size: physical size of region in bytes aligned to 1 KiB. + * @return: -EINVAL on invalid range or out or range id + * -ENODEV if reg is valid but no IMR exists or is locked + * 0 on success. + */ +int imr_remove_range(phys_addr_t base, size_t size) +{ + return __imr_remove_range(-1, base, size); +} +EXPORT_SYMBOL_GPL(imr_remove_range); + +/** + * imr_clear - delete an Isolated Memory Region by index + * + * This function allows you to delete an IMR by an address range specified + * by the index of the IMR. Useful for initial sanitization of the IMR + * address map. + * imr_ge(base, size); delete IMR from base to base+size. + * + * @reg: imr index to remove. + * @return: -EINVAL on invalid range or out or range id + * -ENODEV if reg is valid but no IMR exists or is locked + * 0 on success. + */ +static inline int imr_clear(int reg) +{ + return __imr_remove_range(reg, 0, 0); +} + +/** + * imr_fixup_memmap - Tear down IMRs used during bootup. + * + * BIOS and Grub both setup IMRs around compressed kernel, initrd memory + * that need to be removed before the kernel hands out one of the IMR + * encased addresses to a downstream DMA agent such as the SD or Ethernet. + * IMRs on Galileo are setup to immediately reset the system on violation. + * As a result if you're running a root filesystem from SD - you'll need + * the boot-time IMRs torn down or you'll find seemingly random resets when + * using your filesystem. + * + * @idev: pointer to imr_device structure. + * @return: + */ +static void __init imr_fixup_memmap(struct imr_device *idev) +{ + phys_addr_t base = virt_to_phys(&_text); + size_t size = virt_to_phys(&__end_rodata) - base; + int i; + int ret; + + /* Tear down all existing unlocked IMRs. */ + for (i = 0; i < idev->max_imr; i++) + imr_clear(i); + + /* + * Setup a locked IMR around the physical extent of the kernel + * from the beginning of the .text secton to the end of the + * .rodata section as one physically contiguous block. + */ + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, true); + if (ret < 0) { + pr_err("unable to setup IMR for kernel: (%p - %p)\n", + &_text, &__end_rodata); + } else { + pr_info("protecting kernel .text - .rodata: %zu KiB (%p - %p)\n", + size / 1024, &_text, &__end_rodata); + } + +} + +static const struct x86_cpu_id imr_ids[] __initconst = { + { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */ + {} +}; +MODULE_DEVICE_TABLE(x86cpu, imr_ids); + +/** + * imr_init - entry point for IMR driver. + * + * return: -ENODEV for no IMR support 0 if good to go. + */ +static int __init imr_init(void) +{ + struct imr_device *idev = &imr_dev; + int ret; + + if (!x86_match_cpu(imr_ids) || !iosf_mbi_available()) + return -ENODEV; + + idev->max_imr = QUARK_X1000_IMR_MAX; + idev->reg_base = QUARK_X1000_IMR_REGBASE; + idev->init = true; + + mutex_init(&idev->lock); + ret = imr_debugfs_register(idev); + if (ret != 0) + pr_warn("debugfs register failed!\n"); + imr_fixup_memmap(idev); + return 0; +} + +/** + * imr_exit - exit point for IMR code. + * + * Deregisters debugfs, leave IMR state as-is. + * + * return: + */ +static void __exit imr_exit(void) +{ + imr_debugfs_unregister(&imr_dev); +} + +module_init(imr_init); +module_exit(imr_exit); + +MODULE_AUTHOR("Bryan O'Donoghue "); +MODULE_DESCRIPTION("Intel Isolated Memory Region driver"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c new file mode 100644 index 000000000000..c9a0838890e2 --- /dev/null +++ b/arch/x86/platform/intel-quark/imr_selftest.c @@ -0,0 +1,129 @@ +/** + * imr_selftest.c + * + * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2015 Bryan O'Donoghue + * + * IMR self test. The purpose of this module is to run a set of tests on the + * IMR API to validate it's sanity. We check for overlapping, reserved + * addresses and setup/teardown sanity. + * + */ + +#include +#include +#include +#include +#include +#include + +#define SELFTEST KBUILD_MODNAME ": " +/** + * imr_self_test_result - Print result string for self test. + * + * @res: result code - true if test passed false otherwise. + * @fmt: format string. + * ... variadic argument list. + */ +static void __init imr_self_test_result(int res, const char *fmt, ...) +{ + va_list vlist; + + /* Print pass/fail. */ + if (res) + pr_info(SELFTEST "pass "); + else + pr_info(SELFTEST "fail "); + + /* Print variable string. */ + va_start(vlist, fmt); + vprintk(fmt, vlist); + va_end(vlist); + + /* Optional warning. */ + WARN(res == 0, "test failed"); +} +#undef SELFTEST + +/** + * imr_self_test + * + * Verify IMR self_test with some simple tests to verify overlap, + * zero sized allocations and 1 KiB sized areas. + * + */ +static void __init imr_self_test(void) +{ + phys_addr_t base = virt_to_phys(&_text); + size_t size = virt_to_phys(&__end_rodata) - base; + const char *fmt_over = "overlapped IMR @ (0x%08lx - 0x%08lx)\n"; + int ret; + + /* Test zero zero. */ + ret = imr_add_range(0, 0, 0, 0, false); + imr_self_test_result(ret < 0, "zero sized IMR\n"); + + /* Test exact overlap. */ + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); + + /* Test overlap with base inside of existing. */ + base += size - IMR_ALIGN; + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); + + /* Test overlap with end inside of existing. */ + base -= size + IMR_ALIGN * 2; + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); + + /* Test that a 1 KiB IMR @ zero with read/write all will bomb out. */ + ret = imr_add_range(0, IMR_ALIGN, IMR_READ_ACCESS_ALL, + IMR_WRITE_ACCESS_ALL, false); + imr_self_test_result(ret < 0, "1KiB IMR @ 0x00000000 - access-all\n"); + + /* Test that a 1 KiB IMR @ zero with CPU only will work. */ + ret = imr_add_range(0, IMR_ALIGN, IMR_CPU, IMR_CPU, false); + imr_self_test_result(ret >= 0, "1KiB IMR @ 0x00000000 - cpu-access\n"); + if (ret >= 0) { + ret = imr_remove_range(0, IMR_ALIGN); + imr_self_test_result(ret == 0, "teardown - cpu-access\n"); + } + + /* Test 2 KiB works. */ + size = IMR_ALIGN * 2; + ret = imr_add_range(0, size, IMR_READ_ACCESS_ALL, + IMR_WRITE_ACCESS_ALL, false); + imr_self_test_result(ret >= 0, "2KiB IMR @ 0x00000000\n"); + if (ret >= 0) { + ret = imr_remove_range(0, size); + imr_self_test_result(ret == 0, "teardown 2KiB\n"); + } +} + +/** + * imr_self_test_init - entry point for IMR driver. + * + * return: -ENODEV for no IMR support 0 if good to go. + */ +static int __init imr_self_test_init(void) +{ + imr_self_test(); + return 0; +} + +/** + * imr_self_test_exit - exit point for IMR code. + * + * return: + */ +static void __exit imr_self_test_exit(void) +{ +} + +module_init(imr_self_test_init); +module_exit(imr_self_test_exit); + +MODULE_AUTHOR("Bryan O'Donoghue "); +MODULE_DESCRIPTION("Intel Isolated Memory Region self-test driver"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 638e797037da..97527614141b 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -735,6 +735,31 @@ config INTEL_IPS functionality. If in doubt, say Y here; it will only load on supported platforms. +config INTEL_IMR + bool "Intel Isolated Memory Region support" + default n + depends on X86_INTEL_QUARK && IOSF_MBI + ---help--- + This option provides a means to manipulate Isolated Memory Regions. + IMRs are a set of registers that define read and write access masks + to prohibit certain system agents from accessing memory with 1 KiB + granularity. + + IMRs make it possible to control read/write access to an address + by hardware agents inside the SoC. Read and write masks can be + defined for: + - eSRAM flush + - Dirty CPU snoop (write only) + - RMU access + - PCI Virtual Channel 0/Virtual Channel 1 + - SMM mode + - Non SMM mode + + Quark contains a set of eight IMR registers and makes use of those + registers during its bootup process. + + If you are running on a Galileo/Quark say Y here. + config IBM_RTL tristate "Device driver to enable PRTL support" depends on X86 && PCI -- cgit v1.2.3 From 8bbc2a135b63bee6b41fa90f415521a97995a49f Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Fri, 30 Jan 2015 16:29:39 +0000 Subject: x86/intel/quark: Add Intel Quark platform support Add Intel Quark platform support. Quark needs to pull down all unlocked IMRs to ensure agreement with the EFI memory map post boot. This patch adds an entry in Kconfig for Quark as a platform and makes IMR support mandatory if selected. Suggested-by: Thomas Gleixner Suggested-by: Andy Shevchenko Tested-by: Ong, Boon Leong Signed-off-by: Bryan O'Donoghue Reviewed-by: Andy Shevchenko Reviewed-by: Darren Hart Reviewed-by: Ong, Boon Leong Cc: dvhart@infradead.org Link: http://lkml.kernel.org/r/1422635379-12476-3-git-send-email-pure.logic@nexus-software.ie Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 16 ++++++++++++++++ arch/x86/platform/Makefile | 1 + 2 files changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5e28e2be3a41..1f97c7f0d049 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -486,6 +486,22 @@ config X86_INTEL_MID Intel MID platforms are based on an Intel processor and chipset which consume less power than most of the x86 derivatives. +config X86_INTEL_QUARK + bool "Intel Quark platform support" + depends on X86_32 + depends on X86_EXTENDED_PLATFORM + depends on X86_PLATFORM_DEVICES + depends on X86_TSC + depends on PCI + depends on PCI_GOANY + depends on X86_IO_APIC + select IOSF_MBI + select INTEL_IMR + ---help--- + Select to include support for Quark X1000 SoC. + Say Y here if you have a Quark based system such as the Arduino + compatible Intel Galileo. + config X86_INTEL_LPSS bool "Intel Low Power Subsystem Support" depends on ACPI diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index 85afde1fa3e5..a62e0be3a2f1 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -5,6 +5,7 @@ obj-y += geode/ obj-y += goldfish/ obj-y += iris/ obj-y += intel-mid/ +obj-y += intel-quark/ obj-y += olpc/ obj-y += scx200/ obj-y += sfi/ -- cgit v1.2.3 From b0bd96fe9a1428a04c82f8c3834db69468869d65 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 19 Feb 2015 12:30:49 +1030 Subject: lguest: now depends on PCI Reported-by: Randy Dunlap Signed-off-by: Rusty Russell --- arch/x86/lguest/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig index 4a0890f815c4..21e89807244c 100644 --- a/arch/x86/lguest/Kconfig +++ b/arch/x86/lguest/Kconfig @@ -1,6 +1,6 @@ config LGUEST_GUEST bool "Lguest guest support" - depends on X86_32 && PARAVIRT + depends on X86_32 && PARAVIRT && PCI select TTY select VIRTUALIZATION select VIRTIO -- cgit v1.2.3 From f476893459318cb2eff3ecd2a05d4ceacf82e73e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 19 Feb 2015 14:43:21 +1030 Subject: lguest: update help text. We now add about 10k, not 6k, when lguest support is compiled in. Signed-off-by: Rusty Russell --- arch/x86/lguest/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig index 21e89807244c..08f41caada45 100644 --- a/arch/x86/lguest/Kconfig +++ b/arch/x86/lguest/Kconfig @@ -8,7 +8,7 @@ config LGUEST_GUEST help Lguest is a tiny in-kernel hypervisor. Selecting this will allow your kernel to boot under lguest. This option will increase - your kernel size by about 6k. If in doubt, say N. + your kernel size by about 10k. If in doubt, say N. If you say Y here, make sure you say Y (or M) to the virtio block and net drivers which lguest needs. -- cgit v1.2.3 From 32d39169d7f56849b8c6c8c51aca7b73194d05f1 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Thu, 19 Feb 2015 16:14:32 +0800 Subject: x86/intel/quark: Fix ptr_ret.cocci warnings arch/x86/platform/intel-quark/imr.c:280:1-3: WARNING: PTR_ERR_OR_ZERO can be used Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR Generated by: scripts/coccinelle/api/ptr_ret.cocci Signed-off-by: Fengguang Wu Cc: Andy Shevchenko Cc: Ong, Boon Leong Cc: Bryan O'Donoghue Cc: Darren Hart Cc: kbuild-all@01.org Link: http://lkml.kernel.org/r/20150219081432.GA21983@waimea Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-quark/imr.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c index 16e4df1c9290..60c01eb47dd6 100644 --- a/arch/x86/platform/intel-quark/imr.c +++ b/arch/x86/platform/intel-quark/imr.c @@ -277,10 +277,7 @@ static int imr_debugfs_register(struct imr_device *idev) { idev->file = debugfs_create_file("imr_state", S_IFREG | S_IRUGO, NULL, idev, &imr_state_ops); - if (IS_ERR(idev->file)) - return PTR_ERR(idev->file); - - return 0; + return PTR_ERR_OR_ZERO(idev->file); } /** -- cgit v1.2.3 From c11a25f443e9bee06fe302b6a78ff44dac554036 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Thu, 19 Feb 2015 16:14:32 +0800 Subject: x86/intel/quark: Fix simple_return.cocci warnings arch/x86/platform/intel-quark/imr.c:129:1-4: WARNING: end returns can be simpified Simplify a trivial if-return sequence. Possibly combine with a preceding function call. Generated by: scripts/coccinelle/misc/simple_return.cocci Signed-off-by: Fengguang Wu Cc: Andy Shevchenko Cc: Ong, Boon Leong Cc: Bryan O'Donoghue Cc: Darren Hart Cc: kbuild-all@01.org Link: http://lkml.kernel.org/r/20150219081432.GA21996@waimea Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-quark/imr.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c index 60c01eb47dd6..0ee619f9fcb7 100644 --- a/arch/x86/platform/intel-quark/imr.c +++ b/arch/x86/platform/intel-quark/imr.c @@ -126,12 +126,8 @@ static int imr_read(struct imr_device *idev, u32 imr_id, struct imr_regs *imr) if (ret) return ret; - ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, + return iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, reg++, &imr->wmask); - if (ret) - return ret; - - return 0; } /** -- cgit v1.2.3 From f47233c2d34f243ecdaac179c3408a39ff9216a7 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 13 Feb 2015 16:04:55 +0100 Subject: x86/mm/ASLR: Propagate base load address calculation Commit: e2b32e678513 ("x86, kaslr: randomize module base load address") makes the base address for module to be unconditionally randomized in case when CONFIG_RANDOMIZE_BASE is defined and "nokaslr" option isn't present on the commandline. This is not consistent with how choose_kernel_location() decides whether it will randomize kernel load base. Namely, CONFIG_HIBERNATION disables kASLR (unless "kaslr" option is explicitly specified on kernel commandline), which makes the state space larger than what module loader is looking at. IOW CONFIG_HIBERNATION && CONFIG_RANDOMIZE_BASE is a valid config option, kASLR wouldn't be applied by default in that case, but module loader is not aware of that. Instead of fixing the logic in module.c, this patch takes more generic aproach. It introduces a new bootparam setup data_type SETUP_KASLR and uses that to pass the information whether kaslr has been applied during kernel decompression, and sets a global 'kaslr_enabled' variable accordingly, so that any kernel code (module loading, livepatching, ...) can make decisions based on its value. x86 module loader is converted to make use of this flag. Signed-off-by: Jiri Kosina Acked-by: Kees Cook Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/alpine.LNX.2.00.1502101411280.10719@pobox.suse.cz [ Always dump correct kaslr status when panicking ] Signed-off-by: Borislav Petkov --- arch/x86/boot/compressed/aslr.c | 34 +++++++++++++++++++++++++++++++++- arch/x86/boot/compressed/misc.c | 3 ++- arch/x86/boot/compressed/misc.h | 6 ++++-- arch/x86/include/asm/page_types.h | 3 +++ arch/x86/include/uapi/asm/bootparam.h | 1 + arch/x86/kernel/module.c | 11 ++--------- arch/x86/kernel/setup.c | 22 ++++++++++++++++++---- 7 files changed, 63 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index bb1376381985..7083c16cccba 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -14,6 +14,13 @@ static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; +struct kaslr_setup_data { + __u64 next; + __u32 type; + __u32 len; + __u8 data[1]; +} kaslr_setup_data; + #define I8254_PORT_CONTROL 0x43 #define I8254_PORT_COUNTER0 0x40 #define I8254_CMD_READBACK 0xC0 @@ -295,7 +302,29 @@ static unsigned long find_random_addr(unsigned long minimum, return slots_fetch_random(); } -unsigned char *choose_kernel_location(unsigned char *input, +static void add_kaslr_setup_data(struct boot_params *params, __u8 enabled) +{ + struct setup_data *data; + + kaslr_setup_data.type = SETUP_KASLR; + kaslr_setup_data.len = 1; + kaslr_setup_data.next = 0; + kaslr_setup_data.data[0] = enabled; + + data = (struct setup_data *)(unsigned long)params->hdr.setup_data; + + while (data && data->next) + data = (struct setup_data *)(unsigned long)data->next; + + if (data) + data->next = (unsigned long)&kaslr_setup_data; + else + params->hdr.setup_data = (unsigned long)&kaslr_setup_data; + +} + +unsigned char *choose_kernel_location(struct boot_params *params, + unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size) @@ -306,14 +335,17 @@ unsigned char *choose_kernel_location(unsigned char *input, #ifdef CONFIG_HIBERNATION if (!cmdline_find_option_bool("kaslr")) { debug_putstr("KASLR disabled by default...\n"); + add_kaslr_setup_data(params, 0); goto out; } #else if (cmdline_find_option_bool("nokaslr")) { debug_putstr("KASLR disabled by cmdline...\n"); + add_kaslr_setup_data(params, 0); goto out; } #endif + add_kaslr_setup_data(params, 1); /* Record the various known unsafe memory ranges. */ mem_avoid_init((unsigned long)input, input_size, diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index a950864a64da..5903089c818f 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -401,7 +401,8 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, * the entire decompressed kernel plus relocation table, or the * entire decompressed kernel plus .bss and .brk sections. */ - output = choose_kernel_location(input_data, input_len, output, + output = choose_kernel_location(real_mode, input_data, input_len, + output, output_len > run_size ? output_len : run_size); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 24e3e569a13c..6d6730743024 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -56,7 +56,8 @@ int cmdline_find_option_bool(const char *option); #if CONFIG_RANDOMIZE_BASE /* aslr.c */ -unsigned char *choose_kernel_location(unsigned char *input, +unsigned char *choose_kernel_location(struct boot_params *params, + unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size); @@ -64,7 +65,8 @@ unsigned char *choose_kernel_location(unsigned char *input, bool has_cpuflag(int flag); #else static inline -unsigned char *choose_kernel_location(unsigned char *input, +unsigned char *choose_kernel_location(struct boot_params *params, + unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index f97fbe3abb67..3d43ce36eaba 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -3,6 +3,7 @@ #include #include +#include /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 @@ -51,6 +52,8 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; +extern bool kaslr_enabled; + static inline phys_addr_t get_max_mapped(void) { return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 225b0988043a..44e6dd7e36a2 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -7,6 +7,7 @@ #define SETUP_DTB 2 #define SETUP_PCI 3 #define SETUP_EFI 4 +#define SETUP_KASLR 5 /* ram_size flags */ #define RAMDISK_IMAGE_START_MASK 0x07FF diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index e69f9882bf95..c3c59a3a14ad 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -32,6 +32,7 @@ #include #include +#include #if 0 #define DEBUGP(fmt, ...) \ @@ -46,21 +47,13 @@ do { \ #ifdef CONFIG_RANDOMIZE_BASE static unsigned long module_load_offset; -static int randomize_modules = 1; /* Mutex protects the module_load_offset. */ static DEFINE_MUTEX(module_kaslr_mutex); -static int __init parse_nokaslr(char *p) -{ - randomize_modules = 0; - return 0; -} -early_param("nokaslr", parse_nokaslr); - static unsigned long int get_module_load_offset(void) { - if (randomize_modules) { + if (kaslr_enabled) { mutex_lock(&module_kaslr_mutex); /* * Calculate the module_load_offset the first time this diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ab4734e5411d..16b6043cb073 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -121,6 +121,8 @@ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; +bool __read_mostly kaslr_enabled = false; + #ifdef CONFIG_DMI RESERVE_BRK(dmi_alloc, 65536); #endif @@ -424,6 +426,11 @@ static void __init reserve_initrd(void) } #endif /* CONFIG_BLK_DEV_INITRD */ +static void __init parse_kaslr_setup(u64 pa_data, u32 data_len) +{ + kaslr_enabled = (bool)(pa_data + sizeof(struct setup_data)); +} + static void __init parse_setup_data(void) { struct setup_data *data; @@ -451,6 +458,9 @@ static void __init parse_setup_data(void) case SETUP_EFI: parse_efi_setup(pa_data, data_len); break; + case SETUP_KASLR: + parse_kaslr_setup(pa_data, data_len); + break; default: break; } @@ -833,10 +843,14 @@ static void __init trim_low_memory_range(void) static int dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) { - pr_emerg("Kernel Offset: 0x%lx from 0x%lx " - "(relocation range: 0x%lx-0x%lx)\n", - (unsigned long)&_text - __START_KERNEL, __START_KERNEL, - __START_KERNEL_map, MODULES_VADDR-1); + if (kaslr_enabled) + pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n", + (unsigned long)&_text - __START_KERNEL, + __START_KERNEL, + __START_KERNEL_map, + MODULES_VADDR-1); + else + pr_emerg("Kernel Offset: disabled\n"); return 0; } -- cgit v1.2.3 From f15e05186c3244e9195378a0a568283a8ccc60b0 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 10 Feb 2015 13:20:30 -0800 Subject: x86/mm/init: Fix incorrect page size in init_memory_mapping() printks With 32-bit non-PAE kernels, we have 2 page sizes available (at most): 4k and 4M. Enabling PAE replaces that 4M size with a 2M one (which 64-bit systems use too). But, when booting a 32-bit non-PAE kernel, in one of our early-boot printouts, we say: init_memory_mapping: [mem 0x00000000-0x000fffff] [mem 0x00000000-0x000fffff] page 4k init_memory_mapping: [mem 0x37000000-0x373fffff] [mem 0x37000000-0x373fffff] page 2M init_memory_mapping: [mem 0x00100000-0x36ffffff] [mem 0x00100000-0x003fffff] page 4k [mem 0x00400000-0x36ffffff] page 2M init_memory_mapping: [mem 0x37400000-0x377fdfff] [mem 0x37400000-0x377fdfff] page 4k Which is obviously wrong. There is no 2M page available. This is probably because of a badly-named variable: in the map_range code: PG_LEVEL_2M. Instead of renaming all the PG_LEVEL_2M's. This patch just fixes the printout: init_memory_mapping: [mem 0x00000000-0x000fffff] [mem 0x00000000-0x000fffff] page 4k init_memory_mapping: [mem 0x37000000-0x373fffff] [mem 0x37000000-0x373fffff] page 4M init_memory_mapping: [mem 0x00100000-0x36ffffff] [mem 0x00100000-0x003fffff] page 4k [mem 0x00400000-0x36ffffff] page 4M init_memory_mapping: [mem 0x37400000-0x377fdfff] [mem 0x37400000-0x377fdfff] page 4k BRK [0x03206000, 0x03206fff] PGTABLE Signed-off-by: Dave Hansen Cc: Pekka Enberg Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20150210212030.665EC267@viggo.jf.intel.com Signed-off-by: Borislav Petkov --- arch/x86/mm/init.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 079c3b6a3ff1..7ff24240d863 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -238,6 +238,31 @@ static void __init_refok adjust_range_page_size_mask(struct map_range *mr, } } +static const char *page_size_string(struct map_range *mr) +{ + static const char str_1g[] = "1G"; + static const char str_2m[] = "2M"; + static const char str_4m[] = "4M"; + static const char str_4k[] = "4k"; + + if (mr->page_size_mask & (1<page_size_mask & (1<page_size_mask & (1< Date: Sat, 14 Feb 2015 09:33:50 -0800 Subject: x86, mm/ASLR: Fix stack randomization on 64-bit systems The issue is that the stack for processes is not properly randomized on 64 bit architectures due to an integer overflow. The affected function is randomize_stack_top() in file "fs/binfmt_elf.c": static unsigned long randomize_stack_top(unsigned long stack_top) { unsigned int random_variable = 0; if ((current->flags & PF_RANDOMIZE) && !(current->personality & ADDR_NO_RANDOMIZE)) { random_variable = get_random_int() & STACK_RND_MASK; random_variable <<= PAGE_SHIFT; } return PAGE_ALIGN(stack_top) + random_variable; return PAGE_ALIGN(stack_top) - random_variable; } Note that, it declares the "random_variable" variable as "unsigned int". Since the result of the shifting operation between STACK_RND_MASK (which is 0x3fffff on x86_64, 22 bits) and PAGE_SHIFT (which is 12 on x86_64): random_variable <<= PAGE_SHIFT; then the two leftmost bits are dropped when storing the result in the "random_variable". This variable shall be at least 34 bits long to hold the (22+12) result. These two dropped bits have an impact on the entropy of process stack. Concretely, the total stack entropy is reduced by four: from 2^28 to 2^30 (One fourth of expected entropy). This patch restores back the entropy by correcting the types involved in the operations in the functions randomize_stack_top() and stack_maxrandom_size(). The successful fix can be tested with: $ for i in `seq 1 10`; do cat /proc/self/maps | grep stack; done 7ffeda566000-7ffeda587000 rw-p 00000000 00:00 0 [stack] 7fff5a332000-7fff5a353000 rw-p 00000000 00:00 0 [stack] 7ffcdb7a1000-7ffcdb7c2000 rw-p 00000000 00:00 0 [stack] 7ffd5e2c4000-7ffd5e2e5000 rw-p 00000000 00:00 0 [stack] ... Once corrected, the leading bytes should be between 7ffc and 7fff, rather than always being 7fff. Signed-off-by: Hector Marco-Gisbert Signed-off-by: Ismael Ripoll [ Rebased, fixed 80 char bugs, cleaned up commit message, added test example and CVE ] Signed-off-by: Kees Cook Cc: Cc: Linus Torvalds Cc: Andrew Morton Cc: Al Viro Fixes: CVE-2015-1593 Link: http://lkml.kernel.org/r/20150214173350.GA18393@www.outflux.net Signed-off-by: Borislav Petkov --- arch/x86/mm/mmap.c | 6 +++--- fs/binfmt_elf.c | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 919b91205cd4..df4552bd239e 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -35,12 +35,12 @@ struct va_alignment __read_mostly va_align = { .flags = -1, }; -static unsigned int stack_maxrandom_size(void) +static unsigned long stack_maxrandom_size(void) { - unsigned int max = 0; + unsigned long max = 0; if ((current->flags & PF_RANDOMIZE) && !(current->personality & ADDR_NO_RANDOMIZE)) { - max = ((-1U) & STACK_RND_MASK) << PAGE_SHIFT; + max = ((-1UL) & STACK_RND_MASK) << PAGE_SHIFT; } return max; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 02b16910f4c9..995986b8e36b 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -645,11 +645,12 @@ out: static unsigned long randomize_stack_top(unsigned long stack_top) { - unsigned int random_variable = 0; + unsigned long random_variable = 0; if ((current->flags & PF_RANDOMIZE) && !(current->personality & ADDR_NO_RANDOMIZE)) { - random_variable = get_random_int() & STACK_RND_MASK; + random_variable = (unsigned long) get_random_int(); + random_variable &= STACK_RND_MASK; random_variable <<= PAGE_SHIFT; } #ifdef CONFIG_STACK_GROWSUP -- cgit v1.2.3 From f84598bd7c851f8b0bf8cd0d7c3be0d73c432ff4 Mon Sep 17 00:00:00 2001 From: Quentin Casasnovas Date: Tue, 3 Feb 2015 13:00:22 +0100 Subject: x86/microcode/intel: Guard against stack overflow in the loader mc_saved_tmp is a static array allocated on the stack, we need to make sure mc_saved_count stays within its bounds, otherwise we're overflowing the stack in _save_mc(). A specially crafted microcode header could lead to a kernel crash or potentially kernel execution. Signed-off-by: Quentin Casasnovas Cc: "H. Peter Anvin" Cc: Fenghua Yu Link: http://lkml.kernel.org/r/1422964824-22056-1-git-send-email-quentin.casasnovas@oracle.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/microcode/intel_early.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index ec9df6f9cd47..5e109a31f62b 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -321,7 +321,7 @@ get_matching_model_microcode(int cpu, unsigned long start, unsigned int mc_saved_count = mc_saved_data->mc_saved_count; int i; - while (leftover) { + while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) { mc_header = (struct microcode_header_intel *)ucode_ptr; mc_size = get_totalsize(mc_header); -- cgit v1.2.3 From 35a9ff4eec7a1725ac4364972fc6c156e4feedd0 Mon Sep 17 00:00:00 2001 From: Quentin Casasnovas Date: Tue, 3 Feb 2015 13:00:24 +0100 Subject: x86/microcode/intel: Handle truncated microcode images more robustly We do not check the input data bounds containing the microcode before copying a struct microcode_intel_header from it. A specially crafted microcode could cause the kernel to read invalid memory and lead to a denial-of-service. Signed-off-by: Quentin Casasnovas Cc: "H. Peter Anvin" Cc: Fenghua Yu Link: http://lkml.kernel.org/r/1422964824-22056-3-git-send-email-quentin.casasnovas@oracle.com [ Made error message differ from the next one and flipped comparison. ] Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/microcode/intel.c | 5 +++++ arch/x86/kernel/cpu/microcode/intel_early.c | 4 ++++ 2 files changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index c6826d1e8082..746e7fd08aad 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -196,6 +196,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, struct microcode_header_intel mc_header; unsigned int mc_size; + if (leftover < sizeof(mc_header)) { + pr_err("error! Truncated header in microcode data file\n"); + break; + } + if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header))) break; diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index 5e109a31f62b..420eb933189c 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -322,6 +322,10 @@ get_matching_model_microcode(int cpu, unsigned long start, int i; while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) { + + if (leftover < sizeof(mc_header)) + break; + mc_header = (struct microcode_header_intel *)ucode_ptr; mc_size = get_totalsize(mc_header); -- cgit v1.2.3 From e3a1f6cac1fe20e7ac01d96c914c25726723a64e Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 19 Feb 2015 13:06:53 +0000 Subject: x86: pte_protnone() and pmd_protnone() must check entry is not present Since _PAGE_PROTNONE aliases _PAGE_GLOBAL it is only valid if _PAGE_PRESENT is clear. Make pte_protnone() and pmd_protnone() check for this. This fixes a 64-bit Xen PV guest regression introduced by 8a0516ed8b90 ("mm: convert p[te|md]_numa users to p[te|md]_protnone_numa"). Any userspace process would endlessly fault. In a 64-bit PV guest, userspace page table entries have _PAGE_GLOBAL set by the hypervisor. This meant that any fault on a present userspace entry (e.g., a write to a read-only mapping) would be misinterpreted as a NUMA hinting fault and the fault would not be correctly handled, resulting in the access endlessly faulting. Signed-off-by: David Vrabel Acked-by: Mel Gorman Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 67fc3d2b0aab..a0c35bf6cb92 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -476,12 +476,14 @@ static inline int pmd_present(pmd_t pmd) */ static inline int pte_protnone(pte_t pte) { - return pte_flags(pte) & _PAGE_PROTNONE; + return (pte_flags(pte) & (_PAGE_PROTNONE | _PAGE_PRESENT)) + == _PAGE_PROTNONE; } static inline int pmd_protnone(pmd_t pmd) { - return pmd_flags(pmd) & _PAGE_PROTNONE; + return (pmd_flags(pmd) & (_PAGE_PROTNONE | _PAGE_PRESENT)) + == _PAGE_PROTNONE; } #endif /* CONFIG_NUMA_BALANCING */ -- cgit v1.2.3 From 570e1aa84c376ff39809442f09c7606ddf62cfd1 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 20 Feb 2015 10:18:59 +0100 Subject: x86/mm/ASLR: Avoid PAGE_SIZE redefinition for UML subarch Commit f47233c2d34 ("x86/mm/ASLR: Propagate base load address calculation") causes PAGE_SIZE redefinition warnings for UML subarch builds. This is caused by added includes that were leftovers from previous patch versions are are not actually needed (especially page_types.h inlcude in module.c). Drop those stray includes. Reported-by: kbuild test robot Signed-off-by: Jiri Kosina Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Kees Cook Link: http://lkml.kernel.org/r/alpine.LNX.2.00.1502201017240.28769@pobox.suse.cz Signed-off-by: Ingo Molnar --- arch/x86/include/asm/page_types.h | 1 - arch/x86/kernel/module.c | 1 - 2 files changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 3d43ce36eaba..95e11f79f123 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -3,7 +3,6 @@ #include #include -#include /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index c3c59a3a14ad..ef00116e8270 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -32,7 +32,6 @@ #include #include -#include #if 0 #define DEBUGP(fmt, ...) \ -- cgit v1.2.3 From 650b7b23cb1e32d77daeefbac1ceb1329abf3b23 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 Feb 2015 15:07:29 +0100 Subject: kprobes/x86: Use 5-byte NOP when the code might be modified by ftrace can_probe() checks if the given address points to the beginning of an instruction. It analyzes all the instructions from the beginning of the function until the given address. The code might be modified by another Kprobe. In this case, the current code is read into a buffer, int3 breakpoint is replaced by the saved opcode in the buffer, and can_probe() analyzes the buffer instead. There is a bug that __recover_probed_insn() tries to restore the original code even for Kprobes using the ftrace framework. But in this case, the opcode is not stored. See the difference between arch_prepare_kprobe() and arch_prepare_kprobe_ftrace(). The opcode is stored by arch_copy_kprobe() only from arch_prepare_kprobe(). This patch makes Kprobe to use the ideal 5-byte NOP when the code can be modified by ftrace. It is the original instruction, see ftrace_make_nop() and ftrace_nop_replace(). Note that we always need to use the NOP for ftrace locations. Kprobes do not block ftrace and the instruction might get modified at anytime. It might even be in an inconsistent state because it is modified step by step using the int3 breakpoint. The patch also fixes indentation of the touched comment. Note that I found this problem when playing with Kprobes. I did it on x86_64 with gcc-4.8.3 that supported -mfentry. I modified samples/kprobes/kprobe_example.c and added offset 5 to put the probe right after the fentry area: static struct kprobe kp = { .symbol_name = "do_fork", + .offset = 5, }; Then I was able to load kprobe_example before jprobe_example but not the other way around: $> modprobe jprobe_example $> modprobe kprobe_example modprobe: ERROR: could not insert 'kprobe_example': Invalid or incomplete multibyte or wide character It did not make much sense and debugging pointed to the bug described above. Signed-off-by: Petr Mladek Acked-by: Masami Hiramatsu Cc: Ananth NMavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Cc: Frederic Weisbecker Cc: Jiri Kosina Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1424441250-27146-2-git-send-email-pmladek@suse.cz Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/core.c | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 6a1146ea4d4d..c3b4b46b4797 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -223,27 +223,41 @@ static unsigned long __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) { struct kprobe *kp; + unsigned long faddr; kp = get_kprobe((void *)addr); - /* There is no probe, return original address */ - if (!kp) + faddr = ftrace_location(addr); + /* + * Use the current code if it is not modified by Kprobe + * and it cannot be modified by ftrace. + */ + if (!kp && !faddr) return addr; /* - * Basically, kp->ainsn.insn has an original instruction. - * However, RIP-relative instruction can not do single-stepping - * at different place, __copy_instruction() tweaks the displacement of - * that instruction. In that case, we can't recover the instruction - * from the kp->ainsn.insn. + * Basically, kp->ainsn.insn has an original instruction. + * However, RIP-relative instruction can not do single-stepping + * at different place, __copy_instruction() tweaks the displacement of + * that instruction. In that case, we can't recover the instruction + * from the kp->ainsn.insn. * - * On the other hand, kp->opcode has a copy of the first byte of - * the probed instruction, which is overwritten by int3. And - * the instruction at kp->addr is not modified by kprobes except - * for the first byte, we can recover the original instruction - * from it and kp->opcode. + * On the other hand, in case on normal Kprobe, kp->opcode has a copy + * of the first byte of the probed instruction, which is overwritten + * by int3. And the instruction at kp->addr is not modified by kprobes + * except for the first byte, we can recover the original instruction + * from it and kp->opcode. + * + * In case of Kprobes using ftrace, we do not have a copy of + * the original instruction. In fact, the ftrace location might + * be modified at anytime and even could be in an inconsistent state. + * Fortunately, we know that the original code is the ideal 5-byte + * long NOP. */ - memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); - buf[0] = kp->opcode; + memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + if (faddr) + memcpy(buf, ideal_nops[NOP_ATOMIC5], 5); + else + buf[0] = kp->opcode; return (unsigned long)buf; } -- cgit v1.2.3 From 2a6730c8b6e075adf826a89a3e2caa705807afdb Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 Feb 2015 15:07:30 +0100 Subject: kprobes/x86: Check for invalid ftrace location in __recover_probed_insn() __recover_probed_insn() should always be called from an address where an instructions starts. The check for ftrace_location() might help to discover a potential inconsistency. This patch adds WARN_ON() when the inconsistency is detected. Also it adds handling of the situation when the original code can not get recovered. Suggested-by: Masami Hiramatsu Signed-off-by: Petr Mladek Cc: Ananth NMavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Cc: Frederic Weisbecker Cc: Jiri Kosina Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1424441250-27146-3-git-send-email-pmladek@suse.cz Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/core.c | 12 ++++++++++++ arch/x86/kernel/kprobes/opt.c | 2 ++ 2 files changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index c3b4b46b4797..4e3d5a9621fe 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -227,6 +227,13 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) kp = get_kprobe((void *)addr); faddr = ftrace_location(addr); + /* + * Addresses inside the ftrace location are refused by + * arch_check_ftrace_location(). Something went terribly wrong + * if such an address is checked here. + */ + if (WARN_ON(faddr && faddr != addr)) + return 0UL; /* * Use the current code if it is not modified by Kprobe * and it cannot be modified by ftrace. @@ -265,6 +272,7 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) * Recover the probed instruction at addr for further analysis. * Caller must lock kprobes by kprobe_mutex, or disable preemption * for preventing to release referencing kprobes. + * Returns zero if the instruction can not get recovered. */ unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) { @@ -299,6 +307,8 @@ static int can_probe(unsigned long paddr) * normally used, we just go through if there is no kprobe. */ __addr = recover_probed_instruction(buf, addr); + if (!__addr) + return 0; kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE); insn_get_length(&insn); @@ -347,6 +357,8 @@ int __copy_instruction(u8 *dest, u8 *src) unsigned long recovered_insn = recover_probed_instruction(buf, (unsigned long)src); + if (!recovered_insn) + return 0; kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); insn_get_length(&insn); /* Another subsystem puts a breakpoint, failed to recover */ diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 7c523bbf3dc8..3aef248ec1ee 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -259,6 +259,8 @@ static int can_optimize(unsigned long paddr) */ return 0; recovered_insn = recover_probed_instruction(buf, addr); + if (!recovered_insn) + return 0; kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); insn_get_length(&insn); /* Another subsystem puts a breakpoint */ -- cgit v1.2.3 From a927792c196f1c24410f3c12ccf45238a353783a Mon Sep 17 00:00:00 2001 From: Yannick Guerrini Date: Sat, 21 Feb 2015 23:41:50 +0100 Subject: x86/cpu/intel: Fix trivial typo in intel_tlb_table[] Change 'ssociative' to 'associative' Signed-off-by: Yannick Guerrini Cc: Borislav Petkov Cc: Bryan O'Donoghue Cc: Chris Bainbridge Cc: Dave Hansen Cc: Steven Honeyman Cc: trivial@kernel.org Link: http://lkml.kernel.org/r/1424558510-1420-1-git-send-email-yguerrini@tomshardware.fr Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 94d7dcb12145..50163fa9034f 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -565,8 +565,8 @@ static const struct _tlb_table intel_tlb_table[] = { { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" }, { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" }, { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" }, - { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set ssociative" }, - { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set ssociative" }, + { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set associative" }, + { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set associative" }, { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" }, { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" }, -- cgit v1.2.3 From 31795b470b0872b66f7fa26f791b695c79821220 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Wed, 11 Feb 2015 14:39:18 -0500 Subject: x86/xen: Make sure X2APIC_ENABLE bit of MSR_IA32_APICBASE is not set Commit d524165cb8db ("x86/apic: Check x2apic early") tests X2APIC_ENABLE bit of MSR_IA32_APICBASE when CONFIG_X86_X2APIC is off and panics the kernel when this bit is set. Xen's PV guests will pass this MSR read to the hypervisor which will return its version of the MSR, where this bit might be set. Make sure we clear it before returning MSR value to the caller. Signed-off-by: Boris Ostrovsky Signed-off-by: David Vrabel --- arch/x86/xen/enlighten.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index bd8b8459c3d0..efee14db009b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1070,6 +1070,23 @@ static inline void xen_write_cr8(unsigned long val) BUG_ON(val); } #endif + +static u64 xen_read_msr_safe(unsigned int msr, int *err) +{ + u64 val; + + val = native_read_msr_safe(msr, err); + switch (msr) { + case MSR_IA32_APICBASE: +#ifdef CONFIG_X86_X2APIC + if (!(cpuid_ecx(1) & (1 << (X86_FEATURE_X2APIC & 31)))) +#endif + val &= ~X2APIC_ENABLE; + break; + } + return val; +} + static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) { int ret; @@ -1240,7 +1257,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .wbinvd = native_wbinvd, - .read_msr = native_read_msr_safe, + .read_msr = xen_read_msr_safe, .write_msr = xen_write_msr_safe, .read_tsc = native_read_tsc, -- cgit v1.2.3 From fdfd811ddde3678247248ca9a27faa999ca4cd51 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 19 Feb 2015 15:23:17 +0000 Subject: x86/xen: allow privcmd hypercalls to be preempted Hypercalls submitted by user space tools via the privcmd driver can take a long time (potentially many 10s of seconds) if the hypercall has many sub-operations. A fully preemptible kernel may deschedule such as task in any upcall called from a hypercall continuation. However, in a kernel with voluntary or no preemption, hypercall continuations in Xen allow event handlers to be run but the task issuing the hypercall will not be descheduled until the hypercall is complete and the ioctl returns to user space. These long running tasks may also trigger the kernel's soft lockup detection. Add xen_preemptible_hcall_begin() and xen_preemptible_hcall_end() to bracket hypercalls that may be preempted. Use these in the privcmd driver. When returning from an upcall, call xen_maybe_preempt_hcall() which adds a schedule point if if the current task was within a preemptible hypercall. Since _cond_resched() can move the task to a different CPU, clear and set xen_in_preemptible_hcall around the call. Signed-off-by: David Vrabel Reviewed-by: Boris Ostrovsky --- arch/x86/kernel/entry_32.S | 3 +++ arch/x86/kernel/entry_64.S | 3 +++ drivers/xen/Makefile | 2 +- drivers/xen/preempt.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ drivers/xen/privcmd.c | 2 ++ include/xen/xen-ops.h | 26 ++++++++++++++++++++++++++ 6 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 drivers/xen/preempt.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 000d4199b03e..31e2d5bf3e38 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -982,6 +982,9 @@ ENTRY(xen_hypervisor_callback) ENTRY(xen_do_upcall) 1: mov %esp, %eax call xen_evtchn_do_upcall +#ifndef CONFIG_PREEMPT + call xen_maybe_preempt_hcall +#endif jmp ret_from_intr CFI_ENDPROC ENDPROC(xen_hypervisor_callback) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index db13655c3a2a..10074ad9ebf8 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1208,6 +1208,9 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) popq %rsp CFI_DEF_CFA_REGISTER rsp decl PER_CPU_VAR(irq_count) +#ifndef CONFIG_PREEMPT + call xen_maybe_preempt_hcall +#endif jmp error_exit CFI_ENDPROC END(xen_do_hypervisor_callback) diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 2140398a2a8c..2ccd3592d41f 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -2,7 +2,7 @@ ifeq ($(filter y, $(CONFIG_ARM) $(CONFIG_ARM64)),) obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o endif obj-$(CONFIG_X86) += fallback.o -obj-y += grant-table.o features.o balloon.o manage.o +obj-y += grant-table.o features.o balloon.o manage.o preempt.o obj-y += events/ obj-y += xenbus/ diff --git a/drivers/xen/preempt.c b/drivers/xen/preempt.c new file mode 100644 index 000000000000..a1800c150839 --- /dev/null +++ b/drivers/xen/preempt.c @@ -0,0 +1,44 @@ +/* + * Preemptible hypercalls + * + * Copyright (C) 2014 Citrix Systems R&D ltd. + * + * This source code is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + */ + +#include +#include + +#ifndef CONFIG_PREEMPT + +/* + * Some hypercalls issued by the toolstack can take many 10s of + * seconds. Allow tasks running hypercalls via the privcmd driver to + * be voluntarily preempted even if full kernel preemption is + * disabled. + * + * Such preemptible hypercalls are bracketed by + * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end() + * calls. + */ + +DEFINE_PER_CPU(bool, xen_in_preemptible_hcall); +EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall); + +asmlinkage __visible void xen_maybe_preempt_hcall(void) +{ + if (unlikely(__this_cpu_read(xen_in_preemptible_hcall) + && should_resched())) { + /* + * Clear flag as we may be rescheduled on a different + * cpu. + */ + __this_cpu_write(xen_in_preemptible_hcall, false); + _cond_resched(); + __this_cpu_write(xen_in_preemptible_hcall, true); + } +} +#endif /* CONFIG_PREEMPT */ diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 569a13b9e856..59ac71c4a043 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -56,10 +56,12 @@ static long privcmd_ioctl_hypercall(void __user *udata) if (copy_from_user(&hypercall, udata, sizeof(hypercall))) return -EFAULT; + xen_preemptible_hcall_begin(); ret = privcmd_call(hypercall.op, hypercall.arg[0], hypercall.arg[1], hypercall.arg[2], hypercall.arg[3], hypercall.arg[4]); + xen_preemptible_hcall_end(); return ret; } diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 7491ee5d8164..83338210ee04 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -46,4 +46,30 @@ static inline efi_system_table_t __init *xen_efi_probe(void) } #endif +#ifdef CONFIG_PREEMPT + +static inline void xen_preemptible_hcall_begin(void) +{ +} + +static inline void xen_preemptible_hcall_end(void) +{ +} + +#else + +DECLARE_PER_CPU(bool, xen_in_preemptible_hcall); + +static inline void xen_preemptible_hcall_begin(void) +{ + __this_cpu_write(xen_in_preemptible_hcall, true); +} + +static inline void xen_preemptible_hcall_end(void) +{ + __this_cpu_write(xen_in_preemptible_hcall, false); +} + +#endif /* CONFIG_PREEMPT */ + #endif /* INCLUDE_XEN_OPS_H */ -- cgit v1.2.3 From 5054daa285beaf706f051fbd395dc36c9f0f907f Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 23 Feb 2015 11:01:00 -0500 Subject: x86/xen: Initialize cr4 shadow for 64-bit PV(H) guests Commit 1e02ce4cccdc ("x86: Store a per-cpu shadow copy of CR4") introduced CR4 shadows. These shadows are initialized in early boot code. The commit missed initialization for 64-bit PV(H) guests that this patch adds. Signed-off-by: Boris Ostrovsky Signed-off-by: David Vrabel --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index efee14db009b..5240f563076d 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1758,6 +1758,7 @@ asmlinkage __visible void __init xen_start_kernel(void) #ifdef CONFIG_X86_32 i386_start_kernel(); #else + cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */ x86_64_start_reservations((char *)__pa_symbol(&boot_params)); #endif } -- cgit v1.2.3 From 579deee571a755c485ad702ef82c77a98a2ccc05 Mon Sep 17 00:00:00 2001 From: Yannick Guerrini Date: Mon, 23 Feb 2015 17:52:38 +0100 Subject: x86/platform/intel-mid: Fix trivial printk message typo in intel_mid_arch_setup() Change 'Uknown' to 'Unknown' Signed-off-by: Yannick Guerrini Cc: trivial@kernel.org Link: http://lkml.kernel.org/r/1424710358-10140-1-git-send-email-yguerrini@tomshardware.fr Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-mid/intel-mid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 1bbedc4b0f88..3005f0c89f2e 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -130,7 +130,7 @@ static void intel_mid_arch_setup(void) intel_mid_ops = get_intel_mid_ops[__intel_mid_cpu_chip](); else { intel_mid_ops = get_intel_mid_ops[INTEL_MID_CPU_CHIP_PENWELL](); - pr_info("ARCH: Uknown SoC, assuming PENWELL!\n"); + pr_info("ARCH: Unknown SoC, assuming PENWELL!\n"); } out: -- cgit v1.2.3 From 5b2bdbc84556774afbe11bcfd24c2f6411cfa92b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Feb 2015 14:50:19 -0500 Subject: x86: Init per-cpu shadow copy of CR4 on 32-bit CPUs too Commit: 1e02ce4cccdc ("x86: Store a per-cpu shadow copy of CR4") added a shadow CR4 such that reads and writes that do not modify the CR4 execute much faster than always reading the register itself. The change modified cpu_init() in common.c, so that the shadow CR4 gets initialized before anything uses it. Unfortunately, there's two cpu_init()s in common.c. There's one for 64-bit and one for 32-bit. The commit only added the shadow init to the 64-bit path, but the 32-bit path needs the init too. Link: http://lkml.kernel.org/r/20150227125208.71c36402@gandalf.local.home Fixes: 1e02ce4cccdc "x86: Store a per-cpu shadow copy of CR4" Signed-off-by: Steven Rostedt Acked-by: Andy Lutomirski Cc: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20150227145019.2bdd4354@gandalf.local.home Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b5c8ff5e9dfc..2346c95c6ab1 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1396,6 +1396,12 @@ void cpu_init(void) wait_for_master_cpu(cpu); + /* + * Initialize the CR4 shadow before doing anything that could + * try to read it. + */ + cr4_init_shadow(); + show_ucode_info_early(); printk(KERN_INFO "Initializing CPU#%d\n", cpu); -- cgit v1.2.3