From 5f35eb0e29ca26da82febe49d7698dbeb8882ea0 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Thu, 1 May 2014 21:15:48 +0800 Subject: x86/efi: earlyprintk=efi,keep fix earlyprintk=efi,keep causes the kernel to hang while freeing initmem, like below: VFS: Mounted root (ext4 filesystem) readonly on device 254:2. devtmpfs: mounted Freeing unused kernel memory: 880K (ffffffff817d4000 - ffffffff818b0000) This is caused by the EFI earlyprintk code using __init functions, which are freed later: early_efi_write() is marked __init, and it also uses early_ioremap(), which is an __init function as well. To fix this, add an early initcall, early_efi_map_fb(), which maps the whole EFI framebuffer for later use, and add a wrapper function, early_efi_map(), which calls early_ioremap() for as long as ioremap() is not yet available. With this patch applied, EFI boots fine with earlyprintk=efi,keep console=efi. Signed-off-by: Dave Young Signed-off-by: Matt Fleming --- arch/x86/platform/efi/early_printk.c | 83 +++++++++++++++++++++++++++--------- 1 file changed, 64 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/early_printk.c b/arch/x86/platform/efi/early_printk.c index 81b506d5befd..524142117296 100644 --- a/arch/x86/platform/efi/early_printk.c +++ b/arch/x86/platform/efi/early_printk.c @@ -14,48 +14,92 @@ static const struct font_desc *font; static u32 efi_x, efi_y; +static void *efi_fb; +static bool early_efi_keep; -static __init void early_efi_clear_scanline(unsigned int y) +/* + * efi earlyprintk need use early_ioremap to map the framebuffer. + * But early_ioremap is not usable for earlyprintk=efi,keep, ioremap should + * be used instead. ioremap will be available after paging_init() which is + * earlier than initcall callbacks. Thus adding this early initcall function + * early_efi_map_fb to map the whole efi framebuffer. + */ +static __init int early_efi_map_fb(void) { - unsigned long base, *dst; - u16 len; + unsigned long base, size; + + if (!early_efi_keep) + return 0; base = boot_params.screen_info.lfb_base; - len = boot_params.screen_info.lfb_linelength; + size = boot_params.screen_info.lfb_size; + efi_fb = ioremap(base, size); + + return efi_fb ? 0 : -ENOMEM; +} +early_initcall(early_efi_map_fb); + +/* + * early_efi_map maps efi framebuffer region [start, start + len -1] + * In case earlyprintk=efi,keep we have the whole framebuffer mapped already + * so just return the offset efi_fb + start. 
+ */ +static __init_refok void *early_efi_map(unsigned long start, unsigned long len) +{ + unsigned long base; + + base = boot_params.screen_info.lfb_base; + + if (efi_fb) + return (efi_fb + start); + else + return early_ioremap(base + start, len); +} - dst = early_ioremap(base + y*len, len); +static __init_refok void early_efi_unmap(void *addr, unsigned long len) +{ + if (!efi_fb) + early_iounmap(addr, len); +} + +static void early_efi_clear_scanline(unsigned int y) +{ + unsigned long *dst; + u16 len; + + len = boot_params.screen_info.lfb_linelength; + dst = early_efi_map(y*len, len); if (!dst) return; memset(dst, 0, len); - early_iounmap(dst, len); + early_efi_unmap(dst, len); } -static __init void early_efi_scroll_up(void) +static void early_efi_scroll_up(void) { - unsigned long base, *dst, *src; + unsigned long *dst, *src; u16 len; u32 i, height; - base = boot_params.screen_info.lfb_base; len = boot_params.screen_info.lfb_linelength; height = boot_params.screen_info.lfb_height; for (i = 0; i < height - font->height; i++) { - dst = early_ioremap(base + i*len, len); + dst = early_efi_map(i*len, len); if (!dst) return; - src = early_ioremap(base + (i + font->height) * len, len); + src = early_efi_map((i + font->height) * len, len); if (!src) { - early_iounmap(dst, len); + early_efi_unmap(dst, len); return; } memmove(dst, src, len); - early_iounmap(src, len); - early_iounmap(dst, len); + early_efi_unmap(src, len); + early_efi_unmap(dst, len); } } @@ -79,16 +123,14 @@ static void early_efi_write_char(u32 *dst, unsigned char c, unsigned int h) } } -static __init void +static void early_efi_write(struct console *con, const char *str, unsigned int num) { struct screen_info *si; - unsigned long base; unsigned int len; const char *s; void *dst; - base = boot_params.screen_info.lfb_base; si = &boot_params.screen_info; len = si->lfb_linelength; @@ -109,7 +151,7 @@ early_efi_write(struct console *con, const char *str, unsigned int num) for (h = 0; h < font->height; h++) { unsigned int n, x; - dst = early_ioremap(base + (efi_y + h) * len, len); + dst = early_efi_map((efi_y + h) * len, len); if (!dst) return; @@ -123,7 +165,7 @@ early_efi_write(struct console *con, const char *str, unsigned int num) s++; } - early_iounmap(dst, len); + early_efi_unmap(dst, len); } num -= count; @@ -179,6 +221,9 @@ static __init int early_efi_setup(struct console *con, char *options) for (i = 0; i < (yres - efi_y) / font->height; i++) early_efi_scroll_up(); + /* early_console_register will unset CON_BOOT in case ,keep */ + if (!(con->flags & CON_BOOT)) + early_efi_keep = true; return 0; } -- cgit v1.2.3 From ac008fe0a3236729751ccde655c215b436dfdaeb Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 5 May 2014 15:23:35 -0700 Subject: x86, build: Don't get confused by local symbols arch/x86/crypto/sha1_avx2_x86_64_asm.S introduced _end as a local symbol, which broke the build under certain circumstances. Although the wisdom of _end as a local symbol can definitely be questioned, the build should not break for that reason. Thus, filter the output of nm to only get global symbols of appropriate type. Reported-by: Andy Lutomirski Cc: Chandramouli Narayanan Cc: Herbert Xu Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/n/tip-uxm3j3w3odglcwhafwq5tjqu@git.kernel.org --- arch/x86/boot/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index abb9eba61b50..dbe8dd2fe247 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -71,7 +71,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -sed-voffset := -e 's/^\([0-9a-fA-F]*\) . \(_text\|_end\)$$/\#define VO_\2 0x\1/p' +sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|_end\)$$/\#define VO_\2 0x\1/p' quiet_cmd_voffset = VOFFSET $@ cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ @@ -80,7 +80,7 @@ targets += voffset.h $(obj)/voffset.h: vmlinux FORCE $(call if_changed,voffset) -sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' quiet_cmd_zoffset = ZOFFSET $@ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ -- cgit v1.2.3 From 2605fc216fa492f9e7c488bdc7f687cd6dcc703b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 2 May 2014 00:44:37 +0200 Subject: asmlinkage, x86: Add explicit __visible to arch/x86/* As requested by Linus add explicit __visible to the asmlinkage users. This marks all functions visible to assembler. Tree sweep for arch/x86/* Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1398984278-29319-3-git-send-email-andi@firstfloor.org Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/misc.c | 2 +- arch/x86/kernel/acpi/sleep.c | 2 +- arch/x86/kernel/apic/io_apic.c | 2 +- arch/x86/kernel/cpu/mcheck/therm_throt.c | 4 ++-- arch/x86/kernel/cpu/mcheck/threshold.c | 4 ++-- arch/x86/kernel/head32.c | 2 +- arch/x86/kernel/head64.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/smp.c | 2 +- arch/x86/kernel/traps.c | 6 +++--- arch/x86/kernel/vsmp_64.c | 6 +++--- arch/x86/kvm/x86.c | 2 +- arch/x86/lguest/boot.c | 4 ++-- arch/x86/math-emu/errors.c | 16 ++++++++-------- arch/x86/platform/olpc/olpc-xo1-pm.c | 2 +- arch/x86/power/hibernate_64.c | 2 +- arch/x86/xen/enlighten.c | 2 +- arch/x86/xen/irq.c | 6 +++--- 18 files changed, 34 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 17684615374b..57ab74df7eea 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -354,7 +354,7 @@ static void parse_elf(void *output) free(phdrs); } -asmlinkage void *decompress_kernel(void *rmode, memptr heap, +asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, unsigned char *input_data, unsigned long input_len, unsigned char *output, diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 3a2ae4c88948..31368207837c 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -31,7 +31,7 @@ static char temp_stack[4096]; * * Wrapper around acpi_enter_sleep_state() to be called by assmebly. 
*/ -acpi_status asmlinkage x86_acpi_enter_sleep_state(u8 state) +acpi_status asmlinkage __visible x86_acpi_enter_sleep_state(u8 state) { return acpi_enter_sleep_state(state); } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d23aa82e7a7b..992060e09897 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2189,7 +2189,7 @@ void send_cleanup_vector(struct irq_cfg *cfg) cfg->move_in_progress = 0; } -asmlinkage void smp_irq_move_cleanup_interrupt(void) +asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index d921b7ee6595..36a1bb6d1ee0 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -429,14 +429,14 @@ static inline void __smp_thermal_interrupt(void) smp_thermal_vector(); } -asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs) { entering_irq(); __smp_thermal_interrupt(); exiting_ack_irq(); } -asmlinkage void smp_trace_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs) { entering_irq(); trace_thermal_apic_entry(THERMAL_APIC_VECTOR); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index fe6b1c86645b..7245980186ee 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -24,14 +24,14 @@ static inline void __smp_threshold_interrupt(void) mce_threshold_vector(); } -asmlinkage void smp_threshold_interrupt(void) +asmlinkage __visible void smp_threshold_interrupt(void) { entering_irq(); __smp_threshold_interrupt(); exiting_ack_irq(); } -asmlinkage void smp_trace_threshold_interrupt(void) +asmlinkage __visible void smp_trace_threshold_interrupt(void) { entering_irq(); trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index c61a14a4a310..d6c1b9836995 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -29,7 +29,7 @@ static void __init i386_default_early_setup(void) reserve_ebda_region(); } -asmlinkage void __init i386_start_kernel(void) +asmlinkage __visible void __init i386_start_kernel(void) { sanitize_boot_params(&boot_params); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 85126ccbdf6b..068054f4bf20 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -137,7 +137,7 @@ static void __init copy_bootdata(char *real_mode_data) } } -asmlinkage void __init x86_64_start_kernel(char * real_mode_data) +asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) { int i; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9c0280f93d05..898d077617a9 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,7 +52,7 @@ asmlinkage extern void ret_from_fork(void); -asmlinkage DEFINE_PER_CPU(unsigned long, old_rsp); +__visible DEFINE_PER_CPU(unsigned long, old_rsp); /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs *regs, int all) diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 7c3a5a61f2e4..be8e1bde07aa 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -168,7 +168,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) * this function calls the 
'stop' function on all other CPUs in the system. */ -asmlinkage void smp_reboot_interrupt(void) +asmlinkage __visible void smp_reboot_interrupt(void) { ack_APIC_irq(); irq_enter(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 57409f6b8c62..f73b5d435bdc 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -357,7 +357,7 @@ exit: * for scheduling or signal handling. The actual stack switch is done in * entry.S */ -asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) +asmlinkage __visible __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = eregs; /* Did already sync */ @@ -601,11 +601,11 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) #endif } -asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) +asmlinkage __visible void __attribute__((weak)) smp_thermal_interrupt(void) { } -asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) +asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void) { } diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 5edc34b5b951..b99b9ad8540c 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -36,7 +36,7 @@ static int irq_routing_comply = 1; * and vice versa. */ -asmlinkage unsigned long vsmp_save_fl(void) +asmlinkage __visible unsigned long vsmp_save_fl(void) { unsigned long flags = native_save_fl(); @@ -56,7 +56,7 @@ __visible void vsmp_restore_fl(unsigned long flags) } PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl); -asmlinkage void vsmp_irq_disable(void) +asmlinkage __visible void vsmp_irq_disable(void) { unsigned long flags = native_save_fl(); @@ -64,7 +64,7 @@ asmlinkage void vsmp_irq_disable(void) } PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable); -asmlinkage void vsmp_irq_enable(void) +asmlinkage __visible void vsmp_irq_enable(void) { unsigned long flags = native_save_fl(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8b8fc0b792ba..b6c0bacca9bd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -280,7 +280,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } EXPORT_SYMBOL_GPL(kvm_set_apic_base); -asmlinkage void kvm_spurious_fault(void) +asmlinkage __visible void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. */ BUG(); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index ad1fb5f53925..aae94132bc24 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -233,13 +233,13 @@ static void lguest_end_context_switch(struct task_struct *next) * flags word contains all kind of stuff, but in practice Linux only cares * about the interrupt flag. Our "save_flags()" just returns that. */ -asmlinkage unsigned long lguest_save_fl(void) +asmlinkage __visible unsigned long lguest_save_fl(void) { return lguest_data.irq_enabled; } /* Interrupts go off... 
*/ -asmlinkage void lguest_irq_disable(void) +asmlinkage __visible void lguest_irq_disable(void) { lguest_data.irq_enabled = 0; } diff --git a/arch/x86/math-emu/errors.c b/arch/x86/math-emu/errors.c index a5449089cd9f..9e6545f269e5 100644 --- a/arch/x86/math-emu/errors.c +++ b/arch/x86/math-emu/errors.c @@ -302,7 +302,7 @@ static struct { 0x242 in div_Xsig.S */ -asmlinkage void FPU_exception(int n) +asmlinkage __visible void FPU_exception(int n) { int i, int_type; @@ -492,7 +492,7 @@ int real_2op_NaN(FPU_REG const *b, u_char tagb, /* Invalid arith operation on Valid registers */ /* Returns < 0 if the exception is unmasked */ -asmlinkage int arith_invalid(int deststnr) +asmlinkage __visible int arith_invalid(int deststnr) { EXCEPTION(EX_Invalid); @@ -507,7 +507,7 @@ asmlinkage int arith_invalid(int deststnr) } /* Divide a finite number by zero */ -asmlinkage int FPU_divide_by_zero(int deststnr, u_char sign) +asmlinkage __visible int FPU_divide_by_zero(int deststnr, u_char sign) { FPU_REG *dest = &st(deststnr); int tag = TAG_Valid; @@ -539,7 +539,7 @@ int set_precision_flag(int flags) } /* This may be called often, so keep it lean */ -asmlinkage void set_precision_flag_up(void) +asmlinkage __visible void set_precision_flag_up(void) { if (control_word & CW_Precision) partial_status |= (SW_Precision | SW_C1); /* The masked response */ @@ -548,7 +548,7 @@ asmlinkage void set_precision_flag_up(void) } /* This may be called often, so keep it lean */ -asmlinkage void set_precision_flag_down(void) +asmlinkage __visible void set_precision_flag_down(void) { if (control_word & CW_Precision) { /* The masked response */ partial_status &= ~SW_C1; @@ -557,7 +557,7 @@ asmlinkage void set_precision_flag_down(void) EXCEPTION(EX_Precision); } -asmlinkage int denormal_operand(void) +asmlinkage __visible int denormal_operand(void) { if (control_word & CW_Denormal) { /* The masked response */ partial_status |= SW_Denorm_Op; @@ -568,7 +568,7 @@ asmlinkage int denormal_operand(void) } } -asmlinkage int arith_overflow(FPU_REG *dest) +asmlinkage __visible int arith_overflow(FPU_REG *dest) { int tag = TAG_Valid; @@ -596,7 +596,7 @@ asmlinkage int arith_overflow(FPU_REG *dest) } -asmlinkage int arith_underflow(FPU_REG *dest) +asmlinkage __visible int arith_underflow(FPU_REG *dest) { int tag = TAG_Valid; diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c index ff0174dda810..a9acde72d4ed 100644 --- a/arch/x86/platform/olpc/olpc-xo1-pm.c +++ b/arch/x86/platform/olpc/olpc-xo1-pm.c @@ -75,7 +75,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state) return 0; } -asmlinkage int xo1_do_sleep(u8 sleep_state) +asmlinkage __visible int xo1_do_sleep(u8 sleep_state) { void *pgd_addr = __va(read_cr3()); diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 304fca20d96e..35e2bb6c0f37 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -23,7 +23,7 @@ extern __visible const void __nosave_begin, __nosave_end; /* Defined in hibernate_asm_64.S */ -extern asmlinkage int restore_image(void); +extern asmlinkage __visible int restore_image(void); /* * Address to jump to in the last phase of restore in order to get to the image diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 201d09a7c46b..c34bfc4bbe7f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1515,7 +1515,7 @@ static void __init xen_pvh_early_guest_init(void) } /* First C function to be called on Xen boot */ -asmlinkage void __init 
xen_start_kernel(void) +asmlinkage __visible void __init xen_start_kernel(void) { struct physdev_set_iopl set_iopl; int rc; diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 08f763de26fe..a1207cb6472a 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -23,7 +23,7 @@ void xen_force_evtchn_callback(void) (void)HYPERVISOR_xen_version(0, NULL); } -asmlinkage unsigned long xen_save_fl(void) +asmlinkage __visible unsigned long xen_save_fl(void) { struct vcpu_info *vcpu; unsigned long flags; @@ -63,7 +63,7 @@ __visible void xen_restore_fl(unsigned long flags) } PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl); -asmlinkage void xen_irq_disable(void) +asmlinkage __visible void xen_irq_disable(void) { /* There's a one instruction preempt window here. We need to make sure we're don't switch CPUs between getting the vcpu @@ -74,7 +74,7 @@ asmlinkage void xen_irq_disable(void) } PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable); -asmlinkage void xen_irq_enable(void) +asmlinkage __visible void xen_irq_enable(void) { struct vcpu_info *vcpu; -- cgit v1.2.3 From aadca6fa4068ad1f92c492bc8507b7ed350825a2 Mon Sep 17 00:00:00 2001 From: Christian Gmeiner Date: Wed, 7 May 2014 09:01:54 +0200 Subject: x86/reboot: Add reboot quirk for Certec BPC600 Certec BPC600 needs reboot=pci to actually reboot. Signed-off-by: Christian Gmeiner Cc: Matthew Garrett Cc: Li Aubrey Cc: Andrew Morton Cc: Dave Jones Cc: Fenghua Yu Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1399446114-2147-1-git-send-email-christian.gmeiner@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 3399d3a99730..52b1157c53eb 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -191,6 +191,16 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { }, }, + /* Certec */ + { /* Handle problems with rebooting on Certec BPC600 */ + .callback = set_pci_reboot, + .ident = "Certec BPC600", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Certec"), + DMI_MATCH(DMI_PRODUCT_NAME, "BPC600"), + }, + }, + /* Dell */ { /* Handle problems with rebooting on Dell DXP061 */ .callback = set_bios_reboot, -- cgit v1.2.3 From 696dfd95ba9838327a7013e5988ff3ba60dcc8c8 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 7 May 2014 11:20:54 +0200 Subject: KVM: vmx: disable APIC virtualization in nested guests While running a nested guest, we should disable APIC virtualization controls (virtualized APIC register accesses, virtual interrupt delivery and posted interrupts), because we do not expose them to the nested guest. 
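Reduced to its essentials, the change masks three APIC-virtualization bits out of the pin-based and secondary controls merged for the nested guest. The standalone C sketch below shows just that masking pattern; the constants are restated here with their architectural bit positions rather than taken from the kernel's arch/x86/include/asm/vmx.h, and all of the surrounding VMCS plumbing is elided.

#include <stdint.h>

/* Architectural bit positions, restated for this sketch. */
#define PIN_BASED_POSTED_INTR                    (1u << 7)
#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES  (1u << 0)
#define SECONDARY_EXEC_APIC_REGISTER_VIRT        (1u << 8)
#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY     (1u << 9)

/*
 * When merging controls for the nested (L2) guest, strip every
 * APIC-virtualization feature that is not emulated for L1, so L2
 * never runs with controls its own hypervisor cannot observe.
 */
static void mask_apicv_for_nested(uint32_t *pin_ctls, uint32_t *sec_ctls)
{
	*pin_ctls &= ~PIN_BASED_POSTED_INTR;
	*sec_ctls &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
		       SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
		       SECONDARY_EXEC_APIC_REGISTER_VIRT);
}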
Reported-by: Hu Yaohui Suggested-by: Abel Gordon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 33e8c028842f..138ceffc6377 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7778,7 +7778,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control = vmcs12->pin_based_vm_exec_control; exec_control |= vmcs_config.pin_based_exec_ctrl; - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER | + PIN_BASED_POSTED_INTR); vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); vmx->nested.preemption_timer_expired = false; @@ -7815,7 +7816,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (!vmx->rdtscp_enabled) exec_control &= ~SECONDARY_EXEC_RDTSCP; /* Take the following fields only from vmcs12 */ - exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | + SECONDARY_EXEC_APIC_REGISTER_VIRT); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; -- cgit v1.2.3 From 14262d67fe348018af368a07430fbc06eadeabb1 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Wed, 7 May 2014 17:05:52 -0400 Subject: x86-64, build: Fix stack protector Makefile breakage with 32-bit userland If you are using a 64-bit kernel with 32-bit userland, then scripts/gcc-x86_64-has-stack-protector.sh invokes 32-bit gcc with -mcmodel=kernel, which produces: <stdin>:1:0: error: code model 'kernel' not supported in the 32 bit mode and trips the "broken compiler" test at arch/x86/Makefile:120. There are several places a fix is possible, but the following seems cleanest. (But it's minimal; it would also be possible to factor out a bunch of stuff from the two branches of the if.) Signed-off-by: George Spelvin Link: http://lkml.kernel.org/r/20140507210552.7581.qmail@ns.horizon.com Cc: # v3.14 Signed-off-by: H. Peter Anvin --- arch/x86/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index ce6ad7e6a7d7..33f71b01fd22 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -79,6 +79,7 @@ else UTS_MACHINE := x86_64 CHECKFLAGS += -D__x86_64__ -m64 + biarch := -m64 KBUILD_AFLAGS += -m64 KBUILD_CFLAGS += -m64 -- cgit v1.2.3 From f10f383d8414bfe3357e24432ed8a26eeb58ffb8 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 24 Apr 2014 16:18:17 +0800 Subject: x86/hpet: Make boot_hpet_disable extern HPET on some platforms has accuracy problems. Make "boot_hpet_disable" extern so that we can disable the HPET timer at runtime from a quirk that checks the platform.
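In isolation, the pattern is just an extern flag plus an early quirk that sets it; a rough standalone sketch follows (quirk_disable_hpet() is a hypothetical hook, and all surrounding code is elided):

/* arch/x86/include/asm/hpet.h: expose the command-line flag */
extern int boot_hpet_disable;

/* arch/x86/kernel/hpet.c: no longer static, so quirk code can set it */
int boot_hpet_disable;

/* A hypothetical early platform quirk can then disable the HPET at
 * runtime, before hpet_enable() ever touches the hardware. */
static void quirk_disable_hpet(void)
{
	boot_hpet_disable = 1;
}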
Signed-off-by: Feng Tang Cc: Clemens Ladisch Cc: John Stultz Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1398327498-13163-1-git-send-email-feng.tang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hpet.h | 1 + arch/x86/kernel/hpet.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index b18df579c0e9..36f7125945e3 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -63,6 +63,7 @@ /* hpet memory map physical address */ extern unsigned long hpet_address; extern unsigned long force_hpet_address; +extern int boot_hpet_disable; extern u8 hpet_blockid; extern int hpet_force_user; extern u8 hpet_msi_disable; diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 8d80ae011603..4177bfbc80b0 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -88,7 +88,7 @@ static inline void hpet_clear_mapping(void) /* * HPET command line enable / disable */ -static int boot_hpet_disable; +int boot_hpet_disable; int hpet_force_user; static int hpet_verbose; -- cgit v1.2.3 From 62187910b0fc7a75cfec9c30fda58ce2f39d689b Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 24 Apr 2014 16:18:18 +0800 Subject: x86/intel: Add quirk to disable HPET for the Baytrail platform HPET on the current Baytrail platform has accuracy problems that keep it from being used as a reliable clocksource/clockevent, so add an early quirk to disable it. Signed-off-by: Feng Tang Cc: Clemens Ladisch Cc: John Stultz Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1398327498-13163-2-git-send-email-feng.tang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/early-quirks.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 6e2537c32190..6cda0baeac9d 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -530,6 +531,15 @@ static void __init intel_graphics_stolen(int num, int slot, int func) } } +static void __init force_disable_hpet(int num, int slot, int func) +{ +#ifdef CONFIG_HPET_TIMER + boot_hpet_disable = 1; + pr_info("x86/hpet: Will disable the HPET for this platform because it's not reliable\n"); +#endif +} + + #define QFLAG_APPLY_ONCE 0x1 #define QFLAG_APPLIED 0x2 #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) @@ -567,6 +577,12 @@ static struct chipset early_qrk[] __initdata = { PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, QFLAG_APPLY_ONCE, intel_graphics_stolen }, + /* + * HPET on current version of Baytrail platform has accuracy + * problems, disable it for now: + */ + { PCI_VENDOR_ID_INTEL, 0x0f00, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, {} }; -- cgit v1.2.3 From 722a0d22d028bd74061cf582de1764884e73674f Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Fri, 9 May 2014 03:29:16 +0200 Subject: x86: Fix typo preventing msr_set/clear_bit from having an effect Due to a typo, the MSR accessor functions introduced in 22085a66c2fab6cf9b9393c056a3600a6b4735de didn't have any lasting effect, because they accidentally wrote the old value back. After c0a639ad0bc6b178b46996bd1f821a04643e2bde this, at the very least, causes CPUID limits not to be lifted on some CPUs, leading to missing capabilities on those.
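The fix is a single character in a read-modify-write helper. A simplified, self-contained version of that helper is sketched below; msr_read()/msr_write() are hypothetical stand-ins for the kernel's safe MSR accessors, backed here by one fake register so the sketch compiles on its own.

#include <stdbool.h>
#include <stdint.h>

struct msr { uint64_t q; };

/* Hypothetical stand-ins for the real MSR read/write primitives. */
static uint64_t fake_msr_storage;
static int msr_read(uint32_t msr, struct msr *m)
{
	(void)msr; m->q = fake_msr_storage; return 0;
}
static int msr_write(uint32_t msr, struct msr *m)
{
	(void)msr; fake_msr_storage = m->q; return 0;
}

static int flip_bit(uint32_t msr, uint8_t bit, bool set)
{
	struct msr m, m1;

	if (msr_read(msr, &m))
		return -1;

	m1 = m;				/* modified copy */
	if (set)
		m1.q |= 1ULL << bit;
	else
		m1.q &= ~(1ULL << bit);

	if (m1.q == m.q)		/* bit already in the wanted state */
		return 0;

	/* The typo fixed above passed &m (the unmodified value) here,
	 * silently restoring the old contents; the modified copy m1 is
	 * what must reach the MSR. */
	return msr_write(msr, &m1);
}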
Signed-off-by: Andres Freund Link: http://lkml.kernel.org/r/1399598957-7011-2-git-send-email-andres@anarazel.de Cc: Borislav Petkov Signed-off-by: H. Peter Anvin --- arch/x86/lib/msr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index db9db446b71a..43623739c7cf 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -76,7 +76,7 @@ static inline int __flip_bit(u32 msr, u8 bit, bool set) if (m1.q == m.q) return 0; - err = msr_write(msr, &m); + err = msr_write(msr, &m1); if (err) return err; -- cgit v1.2.3 From c45f77364ba060395b7eff1bf45e6c537f913380 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Fri, 9 May 2014 03:29:17 +0200 Subject: x86: Fix typo in MSR_IA32_MISC_ENABLE_LIMIT_CPUID macro The spuriously added semicolon didn't have any effect because the macro isn't currently in use. c0a639ad0bc6b178b46996bd1f821a04643e2bde Signed-off-by: Andres Freund Link: http://lkml.kernel.org/r/1399598957-7011-3-git-send-email-andres@anarazel.de Cc: Borislav Petkov Signed-off-by: H. Peter Anvin --- arch/x86/include/uapi/asm/msr-index.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index c827ace3121b..fcf2b3ae1bf0 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -384,7 +384,7 @@ #define MSR_IA32_MISC_ENABLE_MWAIT_BIT 18 #define MSR_IA32_MISC_ENABLE_MWAIT (1ULL << MSR_IA32_MISC_ENABLE_MWAIT_BIT) #define MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT 22 -#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1ULL << MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT); +#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1ULL << MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) #define MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT 23 #define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT) #define MSR_IA32_MISC_ENABLE_XD_DISABLE_BIT 34 -- cgit v1.2.3 From 28b92e09e25bdc0ae864b22eacf195a74f861389 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Fri, 9 May 2014 11:11:27 -0400 Subject: x86, vdso, time: Cast tv_nsec to u64 for proper shifting in update_vsyscall() With tk->wall_to_monotonic.tv_nsec being a 32-bit value on 32-bit systems, (tk->wall_to_monotonic.tv_nsec << tk->shift) in update_vsyscall() may lose upper bits or, worse, add them since compiler will do this: (u64)(tk->wall_to_monotonic.tv_nsec << tk->shift) instead of ((u64)tk->wall_to_monotonic.tv_nsec << tk->shift) So if, for example, tv_nsec is 0x800000 and shift is 8 we will end up with 0xffffffff80000000 instead of 0x80000000. And then we are stuck in the subsequent 'while' loop. We need an explicit cast. Signed-off-by: Boris Ostrovsky Link: http://lkml.kernel.org/r/1399648287-15178-1-git-send-email-boris.ostrovsky@oracle.com Acked-by: Konrad Rzeszutek Wilk Cc: # v3.14 Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/vsyscall_gtod.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c index f9c6e56e14b5..9531fbb123ba 100644 --- a/arch/x86/kernel/vsyscall_gtod.c +++ b/arch/x86/kernel/vsyscall_gtod.c @@ -43,7 +43,7 @@ void update_vsyscall(struct timekeeper *tk) vdata->monotonic_time_sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; vdata->monotonic_time_snsec = tk->xtime_nsec - + (tk->wall_to_monotonic.tv_nsec + + ((u64)tk->wall_to_monotonic.tv_nsec << tk->shift); while (vdata->monotonic_time_snsec >= (((u64)NSEC_PER_SEC) << tk->shift)) { -- cgit v1.2.3 From 7a5091d58419b4e5222abce58a40c072786ea1d6 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 11 May 2014 20:25:20 -0700 Subject: x86, rdrand: When nordrand is specified, disable RDSEED as well One can logically expect that when the user has specified "nordrand", the user doesn't want any use of the CPU random number generator, neither RDRAND nor RDSEED, so disable both. Reported-by: Stephan Mueller Cc: Theodore Ts'o Link: http://lkml.kernel.org/r/21542339.0lFnPSyGRS@myon.chronox.de Signed-off-by: H. Peter Anvin --- Documentation/kernel-parameters.txt | 8 ++++---- arch/x86/kernel/cpu/rdrand.c | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 43842177b771..30a8ad0dae53 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2218,10 +2218,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. noreplace-smp [X86-32,SMP] Don't replace SMP instructions with UP alternatives - nordrand [X86] Disable the direct use of the RDRAND - instruction even if it is supported by the - processor. RDRAND is still available to user - space applications. + nordrand [X86] Disable kernel use of the RDRAND and + RDSEED instructions even if they are supported + by the processor. RDRAND and RDSEED are still + available to user space applications. noresume [SWSUSP] Disables resume and restores original swap space. diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index 384df5105fbc..136ac74dee82 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -27,6 +27,7 @@ static int __init x86_rdrand_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_RDRAND); + setup_clear_cpu_cap(X86_FEATURE_RDSEED); return 1; } __setup("nordrand", x86_rdrand_setup); -- cgit v1.2.3 From 773cd38f40b8834be991dbfed36683acc1dd41ee Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 13 May 2014 15:05:55 -0700 Subject: net: filter: x86: fix JIT address randomization bpf_alloc_binary() adds 128 bytes of room to JITed program image and rounds it up to the nearest page size. If image size is close to page size (like 4000), it is rounded to two pages: round_up(4000 + 4 + 128) == 8192 then 'hole' is computed as 8192 - (4000 + 4) = 4188 If prandom_u32() % hole selects a number >= PAGE_SIZE - sizeof(*header) then kernel will crash during bpf_jit_free(): kernel BUG at arch/x86/mm/pageattr.c:887! Call Trace: [] change_page_attr_set_clr+0x135/0x460 [] ? _raw_spin_unlock_irq+0x30/0x50 [] set_memory_rw+0x2f/0x40 [] bpf_jit_free_deferred+0x2d/0x60 [] process_one_work+0x1d8/0x6a0 [] ? 
process_one_work+0x178/0x6a0 [] worker_thread+0x11c/0x370 since bpf_jit_free() does: unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; struct bpf_binary_header *header = (void *)addr; to compute start address of 'bpf_binary_header' and header->pages will pass junk to: set_memory_rw(addr, header->pages); Fix it by making sure that &header->image[prandom_u32() % hole] and &header are in the same page Fixes: 314beb9bcabfd ("x86: bpf_jit_comp: secure bpf jit against spraying attacks") Signed-off-by: Alexei Starovoitov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- arch/x86/net/bpf_jit_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index dc017735bb91..6d5663a599a7 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -171,7 +171,7 @@ static struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen, memset(header, 0xcc, sz); /* fill whole space with int3 instructions */ header->pages = sz / PAGE_SIZE; - hole = sz - (proglen + sizeof(*header)); + hole = min(sz - (proglen + sizeof(*header)), PAGE_SIZE - sizeof(*header)); /* insert a random number of int3 instructions before BPF code */ *image_ptr = &header->image[prandom_u32() % hole]; -- cgit v1.2.3 From 9844f5462392b53824e8b86726e7c33b5ecbb676 Mon Sep 17 00:00:00 2001 From: Anthony Iliopoulos Date: Wed, 14 May 2014 11:29:48 +0200 Subject: x86, mm, hugetlb: Add missing TLB page invalidation for hugetlb_cow() The invalidation is required in order to maintain proper semantics under CoW conditions. In scenarios where a process clones several threads, a thread operating on a core whose DTLB entry for a particular hugepage has not been invalidated, will be reading from the hugepage that belongs to the forked child process, even after hugetlb_cow(). The thread will not see the updated page as long as the stale DTLB entry remains cached, the thread attempts to write into the page, the child process exits, or the thread gets migrated to a different processor. Signed-off-by: Anthony Iliopoulos Link: http://lkml.kernel.org/r/20140514092948.GA17391@server-36.huawei.corp Suggested-by: Shay Goikhman Acked-by: Dave Hansen Signed-off-by: H. Peter Anvin Cc: # v2.6.16+ (!) --- arch/x86/include/asm/hugetlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index a8091216963b..68c05398bba9 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -52,6 +52,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { + ptep_clear_flush(vma, addr, ptep); } static inline int huge_pte_none(pte_t pte) -- cgit v1.2.3 From 16a9602158861687c78b6de6dc6a79e6e8a9136f Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 14 May 2014 12:43:24 -0300 Subject: KVM: x86: disable master clock if TSC is reset during suspend Updating system_time from the kernel clock once master clock has been enabled can result in time backwards event, in case kernel clock frequency is lower than TSC frequency. Disable master clock in case it is necessary to update it from the resume path. 
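Reduced to its core, the patch adds a sticky flag that is set on the resume path and folded into the master-clock predicate. A condensed standalone sketch (the flag and field names follow the patch; the surrounding KVM state is elided):

#include <stdbool.h>

static bool backwards_tsc_observed;	/* sticky: set once, never cleared */

struct kvm_arch_clock { bool use_master_clock; };

/* Recomputed whenever the pvclock copy is refreshed: the master clock
 * may only be used while the host TSC is the clocksource, all vCPUs
 * match, and no backwards TSC has ever been seen. */
static void update_master_clock(struct kvm_arch_clock *ka,
				bool host_tsc_clocksource, bool vcpus_matched)
{
	ka->use_master_clock = host_tsc_clocksource && vcpus_matched &&
			       !backwards_tsc_observed;
}

/* Resume path: a TSC that went backwards across suspend disqualifies
 * the master clock for the remainder of this boot. */
static void note_backwards_tsc(void)
{
	backwards_tsc_observed = true;
}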
Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8b8fc0b792ba..84a2d4152a63 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -106,6 +106,8 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); static u32 tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); +static bool backwards_tsc_observed = false; + #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -1486,7 +1488,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) &ka->master_kernel_ns, &ka->master_cycle_now); - ka->use_master_clock = host_tsc_clocksource & vcpus_matched; + ka->use_master_clock = host_tsc_clocksource && vcpus_matched + && !backwards_tsc_observed; if (ka->use_master_clock) atomic_set(&kvm_guest_has_master_clock, 1); @@ -6945,6 +6948,7 @@ int kvm_arch_hardware_enable(void *garbage) */ if (backwards_tsc) { u64 delta_cyc = max_tsc - local_tsc; + backwards_tsc_observed = true; list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { vcpu->arch.tsc_offset_adjustment += delta_cyc; -- cgit v1.2.3 From fa81511bb0bbb2b1aace3695ce869da9762624ff Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 14 May 2014 16:33:54 -0700 Subject: x86-64, modify_ldt: Make support for 16-bit segments a runtime option Checkin: b3b42ac2cbae x86-64, modify_ldt: Ban 16-bit segments on 64-bit kernels disabled 16-bit segments on 64-bit kernels due to an information leak. However, it does seem that people are genuinely using Wine to run old 16-bit Windows programs on Linux. A proper fix for this ("espfix64") is coming in the upcoming merge window, but as a temporary fix, create a sysctl to allow the administrator to re-enable support for 16-bit segments. It adds a "/proc/sys/abi/ldt16" sysctl that defaults to zero (off). If you hit this issue and care about your old Windows program more than you care about a kernel stack address information leak, you can do echo 1 > /proc/sys/abi/ldt16 as root (add it to your startup scripts), and you should be ok. The sysctl table is only added if you have COMPAT support enabled on x86-64, but I assume anybody who runs old windows binaries very much does that ;) Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/CA%2B55aFw9BPoD10U1LfHbOMpHWZkvJTkMcfCs9s3urPr1YyWBxw@mail.gmail.com Cc: --- arch/x86/kernel/ldt.c | 4 +++- arch/x86/vdso/vdso32-setup.c | 8 ++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index af1d14a9ebda..dcbbaa165bde 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -20,6 +20,8 @@ #include #include +int sysctl_ldt16 = 0; + #ifdef CONFIG_SMP static void flush_ldt(void *current_mm) { @@ -234,7 +236,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) * IRET leaking the high bits of the kernel stack address. 
*/ #ifdef CONFIG_X86_64 - if (!ldt_info.seg_32bit) { + if (!ldt_info.seg_32bit && !sysctl_ldt16) { error = -EINVAL; goto out_unlock; } diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 00348980a3a6..e1f220e3ca68 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -39,6 +39,7 @@ #ifdef CONFIG_X86_64 #define vdso_enabled sysctl_vsyscall32 #define arch_setup_additional_pages syscall32_setup_pages +extern int sysctl_ldt16; #endif /* @@ -249,6 +250,13 @@ static struct ctl_table abi_table2[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "ldt16", + .data = &sysctl_ldt16, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, {} }; -- cgit v1.2.3 From 6538b8ea886e472f4431db8ca1d60478f838d14b Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 28 May 2014 15:53:59 +0900 Subject: x86_64: expand kernel stack to 16K While testing in-house patches under heavy memory pressure on qemu-kvm, I saw the 3.14 kernel crash randomly. The reason was a kernel stack overflow. When I investigated the problem, the call stack was somewhat deeper because reclaim functions were involved, though not via the direct reclaim path. I tried to put some alloc/reclaim-related functions on a stack diet and saved a few hundred bytes, but the overflow didn't disappear; I just hit another overflow through a different, deeper call stack on the reclaim/allocator path. Of course, we could sweep every site we have found to reduce stack usage, but I'm not sure how long that would save the world (surely, lots of developers will keep adding nice features that use the stack again), and if we consider more complex features in the I/O layer and/or the reclaim path, it might be better to increase the stack size. (Meanwhile, stack usage on 64-bit machines has doubled compared to 32-bit while the stack size has stayed at 8K. Hmm, that doesn't seem fair, and arm64 has already expanded to 16K.) So my simple idea is: let's expand the stack size and keep an eye on the stack consumption of each kernel function via ftrace stack traces. For example, we could set a bar such that no function should exceed 200K, and emit a warning when some function consumes more at runtime. Of course, this could produce false positives, but at least it would create a chance to think it over. I guess this topic has been discussed several times, so there might be a strong reason not to increase the kernel stack size on x86_64; since I don't know it, I'm Ccing the x86_64 maintainers, other MM folks and the virtio maintainers. A minimal sketch of the kind of runtime stack check meant here follows below.
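A userspace-flavored sketch of such a check: it estimates consumed stack from a local variable's address measured against a recorded base, similar in spirit to the kernel's CONFIG_DEBUG_STACK_USAGE accounting. The 2048-byte bar and both helper names are arbitrary placeholders.

#include <stdint.h>
#include <stdio.h>

#define STACK_WARN_BAR	2048		/* arbitrary placeholder budget */

static uintptr_t stack_base;		/* recorded at thread entry */

static void note_stack_base(void)
{
	int anchor;
	stack_base = (uintptr_t)&anchor;
}

/* Call from deep paths, after note_stack_base(): the stack grows down,
 * so the distance from the recorded base to a fresh local variable
 * approximates the bytes consumed so far. */
static void check_stack_usage(const char *where)
{
	int anchor;
	uintptr_t used = stack_base - (uintptr_t)&anchor;

	if (used > STACK_WARN_BAR)
		fprintf(stderr, "%s: ~%lu bytes of stack used\n",
			where, (unsigned long)used);
}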
Here's an example call trace using up the kernel stack:

        Depth    Size   Location    (51 entries)
        -----    ----   --------
  0)     7696      16   lookup_address
  1)     7680      16   _lookup_address_cpa.isra.3
  2)     7664      24   __change_page_attr_set_clr
  3)     7640     392   kernel_map_pages
  4)     7248     256   get_page_from_freelist
  5)     6992     352   __alloc_pages_nodemask
  6)     6640       8   alloc_pages_current
  7)     6632     168   new_slab
  8)     6464       8   __slab_alloc
  9)     6456      80   __kmalloc
 10)     6376     376   vring_add_indirect
 11)     6000     144   virtqueue_add_sgs
 12)     5856     288   __virtblk_add_req
 13)     5568      96   virtio_queue_rq
 14)     5472     128   __blk_mq_run_hw_queue
 15)     5344      16   blk_mq_run_hw_queue
 16)     5328      96   blk_mq_insert_requests
 17)     5232     112   blk_mq_flush_plug_list
 18)     5120     112   blk_flush_plug_list
 19)     5008      64   io_schedule_timeout
 20)     4944     128   mempool_alloc
 21)     4816      96   bio_alloc_bioset
 22)     4720      48   get_swap_bio
 23)     4672     160   __swap_writepage
 24)     4512      32   swap_writepage
 25)     4480     320   shrink_page_list
 26)     4160     208   shrink_inactive_list
 27)     3952     304   shrink_lruvec
 28)     3648      80   shrink_zone
 29)     3568     128   do_try_to_free_pages
 30)     3440     208   try_to_free_pages
 31)     3232     352   __alloc_pages_nodemask
 32)     2880       8   alloc_pages_current
 33)     2872     200   __page_cache_alloc
 34)     2672      80   find_or_create_page
 35)     2592      80   ext4_mb_load_buddy
 36)     2512     176   ext4_mb_regular_allocator
 37)     2336     128   ext4_mb_new_blocks
 38)     2208     256   ext4_ext_map_blocks
 39)     1952     160   ext4_map_blocks
 40)     1792     384   ext4_writepages
 41)     1408      16   do_writepages
 42)     1392      96   __writeback_single_inode
 43)     1296     176   writeback_sb_inodes
 44)     1120      80   __writeback_inodes_wb
 45)     1040     160   wb_writeback
 46)      880     208   bdi_writeback_workfn
 47)      672     144   process_one_work
 48)      528     112   worker_thread
 49)      416     240   kthread
 50)      176     176   ret_from_fork

[ Note: the problem is exacerbated by certain gcc versions that seem to generate much bigger stack frames due to apparently bad coalescing of temporaries and generating too many spills. Rusty saw gcc-4.6.4 using 35% more stack on the virtio path than 4.8.2 does, for example. Minchan not only uses such a bad gcc version (4.6.3 in his case), but some of the stack use is due to debugging (CONFIG_DEBUG_PAGEALLOC is what causes that kernel_map_pages() frame, for example). But we're clearly getting too close. The VM code also seems to have excessive stack frames partly for the same compiler reason, triggered by excessive inlining and lots of function arguments. We need to improve on our stack use, but in the meantime let's do this simple stack increase too. Unlike most earlier reports, there is nothing simple that stands out as being really horribly wrong here, apart from the fact that the stack frames are just bigger than they should need to be. - Linus ]

Signed-off-by: Minchan Kim Cc: Peter Anvin Cc: Dave Chinner Cc: Dave Jones Cc: Jens Axboe Cc: Andrew Morton Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Mel Gorman Cc: Rik van Riel Cc: Johannes Weiner Cc: Hugh Dickins Cc: Rusty Russell Cc: Michael S Tsirkin Cc: Dave Hansen Cc: Steven Rostedt Cc: PJ Waskiewicz Signed-off-by: Linus Torvalds --- arch/x86/include/asm/page_64_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 8de6d9cf3b95..678205195ae1 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_PAGE_64_DEFS_H #define _ASM_X86_PAGE_64_DEFS_H -#define THREAD_SIZE_ORDER 1 +#define THREAD_SIZE_ORDER 2 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) #define CURRENT_MASK (~(THREAD_SIZE - 1)) -- cgit v1.2.3
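For reference, the arithmetic behind that one-line change, spelled out as a standalone program (4 KiB pages assumed, as on x86-64):

#include <stdio.h>

#define PAGE_SIZE          4096UL
#define THREAD_SIZE_ORDER  2		/* was 1 before the patch */
#define THREAD_SIZE        (PAGE_SIZE << THREAD_SIZE_ORDER)
#define CURRENT_MASK       (~(THREAD_SIZE - 1))

int main(void)
{
	/* 16384 bytes (16K), up from 8192 (8K) at order 1: */
	printf("THREAD_SIZE  = %lu\n", THREAD_SIZE);
	/* CURRENT_MASK aligns a stack pointer down to the stack base: */
	printf("CURRENT_MASK = %#lx\n", CURRENT_MASK);
	return 0;
}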