From 4d178f94ebe123d462a51169b53854cb7f198888 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sun, 12 Apr 2015 09:14:45 -0400 Subject: x86/asm: Merge common 32-bit values in asm-offsets.c Merge common values for 32-bit native and compat. Signed-off-by: Brian Gerst Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Denys Vlasenko Link: http://lkml.kernel.org/r/1428844486-6638-1-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/asm-offsets.c | 19 +++++++++++++++++++ arch/x86/kernel/asm-offsets_32.c | 15 --------------- arch/x86/kernel/asm-offsets_64.c | 21 --------------------- 3 files changed, 19 insertions(+), 36 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 9f6b9341950f..b27f6ec90caa 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -41,6 +41,25 @@ void common(void) { OFFSET(pbe_orig_address, pbe, orig_address); OFFSET(pbe_next, pbe, next); +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) + BLANK(); + OFFSET(IA32_SIGCONTEXT_ax, sigcontext_ia32, ax); + OFFSET(IA32_SIGCONTEXT_bx, sigcontext_ia32, bx); + OFFSET(IA32_SIGCONTEXT_cx, sigcontext_ia32, cx); + OFFSET(IA32_SIGCONTEXT_dx, sigcontext_ia32, dx); + OFFSET(IA32_SIGCONTEXT_si, sigcontext_ia32, si); + OFFSET(IA32_SIGCONTEXT_di, sigcontext_ia32, di); + OFFSET(IA32_SIGCONTEXT_bp, sigcontext_ia32, bp); + OFFSET(IA32_SIGCONTEXT_sp, sigcontext_ia32, sp); + OFFSET(IA32_SIGCONTEXT_ip, sigcontext_ia32, ip); + + BLANK(); + OFFSET(TI_sysenter_return, thread_info, sysenter_return); + + BLANK(); + OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); +#endif + #ifdef CONFIG_PARAVIRT BLANK(); OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 47703aed74cf..628bfd4c06bb 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -17,17 +17,6 @@ void foo(void); void foo(void) { - OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax); - OFFSET(IA32_SIGCONTEXT_bx, sigcontext, bx); - OFFSET(IA32_SIGCONTEXT_cx, sigcontext, cx); - OFFSET(IA32_SIGCONTEXT_dx, sigcontext, dx); - OFFSET(IA32_SIGCONTEXT_si, sigcontext, si); - OFFSET(IA32_SIGCONTEXT_di, sigcontext, di); - OFFSET(IA32_SIGCONTEXT_bp, sigcontext, bp); - OFFSET(IA32_SIGCONTEXT_sp, sigcontext, sp); - OFFSET(IA32_SIGCONTEXT_ip, sigcontext, ip); - BLANK(); - OFFSET(CPUINFO_x86, cpuinfo_x86, x86); OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor); OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model); @@ -37,7 +26,6 @@ void foo(void) OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); BLANK(); - OFFSET(TI_sysenter_return, thread_info, sysenter_return); OFFSET(TI_cpu, thread_info, cpu); BLANK(); @@ -60,9 +48,6 @@ void foo(void) OFFSET(PT_OLDSS, pt_regs, ss); BLANK(); - OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); - BLANK(); - OFFSET(saved_context_gdt_desc, saved_context, gdt_desc); BLANK(); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 5ce6f2da8763..dcaab87da629 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -29,27 +29,6 @@ int main(void) BLANK(); #endif -#ifdef CONFIG_IA32_EMULATION - OFFSET(TI_sysenter_return, thread_info, sysenter_return); - BLANK(); - -#define ENTRY(entry) OFFSET(IA32_SIGCONTEXT_ ## entry, sigcontext_ia32, entry) - ENTRY(ax); - ENTRY(bx); - ENTRY(cx); - ENTRY(dx); - ENTRY(si); - ENTRY(di); - ENTRY(bp); - 
ENTRY(sp); - ENTRY(ip); - BLANK(); -#undef ENTRY - - OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); - BLANK(); -#endif - #define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry) ENTRY(bx); ENTRY(cx); -- cgit v1.2.3 From 14434052ffb3b7fe8f491e9d0a7793376fb79155 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sun, 12 Apr 2015 09:14:46 -0400 Subject: x86/asm: Remove unused TI_cpu Signed-off-by: Brian Gerst Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Denys Vlasenko Link: http://lkml.kernel.org/r/1428844486-6638-2-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/asm-offsets_32.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 628bfd4c06bb..6ce39025f467 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -26,9 +26,6 @@ void foo(void) OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); BLANK(); - OFFSET(TI_cpu, thread_info, cpu); - BLANK(); - OFFSET(PT_EBX, pt_regs, bx); OFFSET(PT_ECX, pt_regs, cx); OFFSET(PT_EDX, pt_regs, dx); -- cgit v1.2.3 From c0f6feba784e1087b905ad097d2d9ac0aaf744a5 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 15 Apr 2015 08:50:14 +0200 Subject: x86/asm, x86/acpi/wakeup_64.S: Make global label a local one Make it a local symbol so that it doesn't appear in objdump output. No functionality change - code remains the same, just the global label disappears: ffffffff81039dbe: bf 03 00 00 00 mov $0x3,%edi ffffffff81039dc3: 31 c0 xor %eax,%eax ffffffff81039dc5: e8 b6 fd ff ff callq ffffffff81039b80 <x86_acpi_enter_sleep_state> -ffffffff81039dca: eb 00 jmp ffffffff81039dcc <resume_point> - -ffffffff81039dcc <resume_point>: +ffffffff81039dca: eb 00 jmp ffffffff81039dcc ffffffff81039dcc: 48 c7 c0 80 1a ca 82 mov $0xffffffff82ca1a80,%rax ffffffff81039dd3: 48 8b 98 e2 00 00 00 mov 0xe2(%rax),%rbx ffffffff81039dda: 0f 22 e3 mov %rbx,%cr4 Signed-off-by: Borislav Petkov Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Len Brown Cc: Linus Torvalds Cc: Pavel Machek Cc: Rafael J. Wysocki Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1429080614-22610-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/wakeup_64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index ae693b51ed8e..8c35df468104 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -62,7 +62,7 @@ ENTRY(do_suspend_lowlevel) pushfq popq pt_regs_flags(%rax) - movq $resume_point, saved_rip(%rip) + movq $.Lresume_point, saved_rip(%rip) movq %rsp, saved_rsp movq %rbp, saved_rbp @@ -75,10 +75,10 @@ ENTRY(do_suspend_lowlevel) xorl %eax, %eax call x86_acpi_enter_sleep_state /* in case something went wrong, restore the machine status and go on */ - jmp resume_point + jmp .Lresume_point .align 4 -resume_point: +.Lresume_point: /* We don't restore %rax, it must be 0 anyway */ movq $saved_context, %rax movq saved_context_cr4(%rax), %rbx -- cgit v1.2.3 From aac82d319148c6a84e1bf90b86d3e0ec8bf0ee38 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Apr 2015 15:51:54 -0700 Subject: x86, paravirt, xen: Remove the 64-bit ->irq_enable_sysexit() pvop We don't use irq_enable_sysexit on 64-bit kernels any more. Remove all the paravirt and Xen machinery to support it on 64-bit kernels.
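For reference, a minimal sketch of the resulting pv_cpu_ops shape - simplified from the paravirt_types.h hunk below, with unrelated members and the exact guards elided:

	struct pv_cpu_ops {
		/* ... many other ops ... */
	#ifdef CONFIG_X86_32
		/* 32-bit kernels only: atomically enable IRQs and SYSEXIT */
		void (*irq_enable_sysexit)(void);
	#endif
		/* 64-bit kernels return to usermode via these instead: */
		void (*usergs_sysret32)(void);	/* 32-on-64 compat processes */
		void (*usergs_sysret64)(void);	/* native 64-bit processes */
		/* ... */
	};
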
Tested-by: Boris Ostrovsky Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/8a03355698fe5b94194e9e7360f19f91c1b2cf1f.1428100853.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 6 ------ arch/x86/include/asm/paravirt_types.h | 7 ++++--- arch/x86/kernel/asm-offsets.c | 2 ++ arch/x86/kernel/paravirt.c | 4 +++- arch/x86/kernel/paravirt_patch_64.c | 1 - arch/x86/xen/enlighten.c | 3 ++- arch/x86/xen/xen-asm_64.S | 16 ---------------- arch/x86/xen/xen-ops.h | 2 ++ 8 files changed, 13 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index a821b1cd4fa7..3cdb9eafbf8c 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -77,12 +77,6 @@ ENTRY(native_usergs_sysret32) swapgs sysretl ENDPROC(native_usergs_sysret32) - -ENTRY(native_irq_enable_sysexit) - swapgs - sti - sysexit -ENDPROC(native_irq_enable_sysexit) #endif /* diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 7549b8b369e4..38a0ff9ef06e 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -160,13 +160,14 @@ struct pv_cpu_ops { u64 (*read_pmc)(int counter); unsigned long long (*read_tscp)(unsigned int *aux); +#ifdef CONFIG_X86_32 /* * Atomically enable interrupts and return to userspace. This - * is only ever used to return to 32-bit processes; in a - * 64-bit kernel, it's used for 32-on-64 compat processes, but - * never native 64-bit processes. (Jump, not call.) + * is only used in 32-bit kernels. 64-bit kernels use + * usergs_sysret32 instead. 
*/ void (*irq_enable_sysexit)(void); +#endif /* * Switch to usermode gs and return to 64-bit usermode using diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index b27f6ec90caa..8e3d22a1af94 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -68,7 +68,9 @@ void common(void) { OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); +#ifdef CONFIG_X86_32 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); +#endif OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); #endif diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 548d25f00c90..7563114d9c3a 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -154,7 +154,9 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, ret = paravirt_patch_ident_64(insnbuf, len); else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || +#ifdef CONFIG_X86_32 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || +#endif type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64)) /* If operation requires a jmp, then jmp */ @@ -371,7 +373,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .load_sp0 = native_load_sp0, -#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) +#if defined(CONFIG_X86_32) .irq_enable_sysexit = native_irq_enable_sysexit, #endif #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index a1da6737ba5b..0de21c62c348 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -49,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, save_fl); PATCH_SITE(pv_irq_ops, irq_enable); PATCH_SITE(pv_irq_ops, irq_disable); - PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); PATCH_SITE(pv_cpu_ops, usergs_sysret32); PATCH_SITE(pv_cpu_ops, usergs_sysret64); PATCH_SITE(pv_cpu_ops, swapgs); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 81665c9f2132..3797b6b31f95 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1267,10 +1267,11 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .read_tscp = native_read_tscp, .iret = xen_iret, - .irq_enable_sysexit = xen_sysexit, #ifdef CONFIG_X86_64 .usergs_sysret32 = xen_sysret32, .usergs_sysret64 = xen_sysret64, +#else + .irq_enable_sysexit = xen_sysexit, #endif .load_tr_desc = paravirt_nop, diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 985fc3ee0973..a2cabb8bd6bf 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -47,22 +47,6 @@ ENTRY(xen_iret) ENDPATCH(xen_iret) RELOC(xen_iret, 1b+1) -/* - * sysexit is not used for 64-bit processes, so it's only ever used to - * return to 32-bit compat userspace. 
- */ -ENTRY(xen_sysexit) - pushq $__USER32_DS - pushq %rcx - pushq $X86_EFLAGS_IF - pushq $__USER32_CS - pushq %rdx - - pushq $0 -1: jmp hypercall_iret -ENDPATCH(xen_sysexit) -RELOC(xen_sysexit, 1b+1) - ENTRY(xen_sysret64) /* * We're already on the usermode stack at this point, but diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9e195c683549..c20fe29e65f4 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -134,7 +134,9 @@ DECL_ASM(void, xen_restore_fl_direct, unsigned long); /* These are not functions, and cannot be called normally */ __visible void xen_iret(void); +#ifdef CONFIG_X86_32 __visible void xen_sysexit(void); +#endif __visible void xen_sysret32(void); __visible void xen_sysret64(void); __visible void xen_adjust_exception_frame(void); -- cgit v1.2.3 From 17be0aec74fb036eb4eb32c2268f3420a034762b Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 21 Apr 2015 18:27:29 +0200 Subject: x86/asm/entry/64: Implement better check for canonical addresses This change makes the check exact (no more false positives on "negative" addresses). Andy explains: "Canonical addresses either start with 17 zeros or 17 ones. In the old code, we checked that the top (64-47) = 17 bits were all zero. We did this by shifting right by 47 bits and making sure that nothing was left. In the new code, we're shifting left by (64 - 48) = 16 bits and then signed shifting right by the same amount, thus propagating the 17th highest bit to all positions to its left. If we get the same value we started with, then we're good to go." It isn't really important to be fully correct here - almost all addresses we'll ever see will be userspace ones - but the exact check turns out to be cheap enough: the new code uses two more ALU ops, yet preserves %rcx, so it does not need to be reloaded from pt_regs->cx again. At the disassembly level, the changes are: cmp %rcx,0x80(%rsp) -> mov 0x80(%rsp),%r11; cmp %rcx,%r11 shr $0x2f,%rcx -> shl $0x10,%rcx; sar $0x10,%rcx; cmp %rcx,%r11 mov 0x58(%rsp),%rcx -> (eliminated) Signed-off-by: Denys Vlasenko Acked-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1429633649-20169-1-git-send-email-dvlasenk@redhat.com [ Changelog massage. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c7b238494b31..3c78a15a537d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -410,26 +410,27 @@ syscall_return: * a completely clean 64-bit userspace context. */ movq RCX(%rsp),%rcx - cmpq %rcx,RIP(%rsp) /* RCX == RIP */ + movq RIP(%rsp),%r11 + cmpq %rcx,%r11 /* RCX == RIP */ jne opportunistic_sysret_failed /* * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP * in kernel space. This essentially lets the user take over - * the kernel, since userspace controls RSP. It's not worth - * testing for canonicalness exactly -- this check detects any - * of the 17 high bits set, which is true for non-canonical - * or kernel addresses. (This will pessimize vsyscall=native. - * Big deal.) + * the kernel, since userspace controls RSP.
* - * If virtual addresses ever become wider, this will need + * If width of "canonical tail" ever becomes variable, this will need * to be updated to remain correct on both old and new CPUs. */ .ifne __VIRTUAL_MASK_SHIFT - 47 .error "virtual address width changed -- SYSRET checks need update" .endif - shr $__VIRTUAL_MASK_SHIFT, %rcx - jnz opportunistic_sysret_failed + /* Change top 16 bits to be the sign-extension of 47th bit */ + shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx + sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx + /* If this changed %rcx, it was not canonical */ + cmpq %rcx, %r11 + jne opportunistic_sysret_failed cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ jne opportunistic_sysret_failed @@ -466,8 +467,8 @@ syscall_return: */ syscall_return_via_sysret: CFI_REMEMBER_STATE - /* r11 is already restored (see code above) */ - RESTORE_C_REGS_EXCEPT_R11 + /* rcx and r11 are already restored (see code above) */ + RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp),%rsp USERGS_SYSRET64 CFI_RESTORE_STATE -- cgit v1.2.3 From ac7f5dfb0348a33b2ea92a0c477103c4db45ad4e Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 21 Apr 2015 18:03:13 +0200 Subject: x86/asm/entry/64: Merge 32-bit execve stubs with x32 ones, as they are identical Run-tested. Suggested-by: Brian Gerst Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1429632194-13445-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3c78a15a537d..e952f6bf1d6d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -525,40 +525,27 @@ GLOBAL(stub_execveat) CFI_ENDPROC END(stub_execveat) -#ifdef CONFIG_X86_X32_ABI +#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) .align 8 GLOBAL(stub_x32_execve) +GLOBAL(stub32_execve) CFI_STARTPROC DEFAULT_FRAME 0, 8 call compat_sys_execve jmp return_from_execve CFI_ENDPROC +END(stub32_execve) END(stub_x32_execve) .align 8 GLOBAL(stub_x32_execveat) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 - call compat_sys_execveat - jmp return_from_execve - CFI_ENDPROC -END(stub_x32_execveat) -#endif - -#ifdef CONFIG_IA32_EMULATION - .align 8 -GLOBAL(stub32_execve) - CFI_STARTPROC - call compat_sys_execve - jmp return_from_execve - CFI_ENDPROC -END(stub32_execve) - .align 8 GLOBAL(stub32_execveat) CFI_STARTPROC + DEFAULT_FRAME 0, 8 call compat_sys_execveat jmp return_from_execve CFI_ENDPROC END(stub32_execveat) +END(stub_x32_execveat) #endif /* -- cgit v1.2.3 From 5f0052f9522b84269e1b3b435a806f873d992702 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:23 +0800 Subject: x86/irq: Save destination CPU ID in irq_cfg Cache destination CPU APIC ID into struct irq_cfg when assigning vector for interrupt. Upper layer just needs to read the cached APIC ID instead of calling apic->cpu_mask_to_apicid_and(), it helps to hide APIC driver details from IOAPIC/HPET/MSI drivers.. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-2-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 1 + arch/x86/kernel/apic/vector.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index e9571ddabc4f..cda96954cbbf 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -116,6 +116,7 @@ struct irq_data; struct irq_cfg { cpumask_var_t domain; cpumask_var_t old_domain; + unsigned int dest_apicid; u8 vector; u8 move_in_progress : 1; #ifdef CONFIG_IRQ_REMAP diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 6cedd7914581..c724ef6b218c 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -188,6 +188,12 @@ next: } free_cpumask_var(tmp_mask); + if (!err) { + /* cache destination APIC IDs into cfg->dest_apicid */ + err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, + &cfg->dest_apicid); + } + return err; } -- cgit v1.2.3 From b5dc8e6c21e7ffba0246bf39cea97805c142bf85 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:24 +0800 Subject: x86/irq: Use hierarchical irqdomain to manage CPU interrupt vectors Abstract the CPU local APIC as an interrupt controller and create an irqdomain for it to manage CPU interrupt vectors. It is the foundation for enabling hierarchical irqdomains on x86 systems. The final irqdomain hierarchy will look like this:

IOAPIC domain    ----|
MSI/MSI-x domain ----> [Interrupt Remapping domain] -> CPU vector domain
HPET_IRQ domain  ----|                                        ^
                                                              |
DMAR domain      ---------------------------------------------|
HT_IRQ domain    ---------------------------------------------|

Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Prarit Bhargava Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428905519-23704-3-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 3 +- arch/x86/include/asm/hw_irq.h | 17 +++++ arch/x86/kernel/apic/io_apic.c | 3 - arch/x86/kernel/apic/vector.c | 155 +++++++++++++++++++++++++++++++++++++---- 4 files changed, 160 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6049d587599e..e75a96c38c58 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -465,7 +465,6 @@ config X86_INTEL_CE select X86_REBOOTFIXUPS select OF select OF_EARLY_FLATTREE - select IRQ_DOMAIN ---help--- Select for the Intel CE media processor (CE4100) SOC.
This option compiles in support for the CE4100 SOC for settop @@ -914,11 +913,11 @@ config X86_LOCAL_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ + select IRQ_DOMAIN_HIERARCHY config X86_IO_APIC def_bool y depends on X86_LOCAL_APIC || X86_UP_IOAPIC - select IRQ_DOMAIN config X86_REROUTE_FOR_BROKEN_BOOT_IRQS bool "Reroute for broken boot IRQs" diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index cda96954cbbf..5b951ac56aa1 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -112,6 +112,17 @@ struct irq_2_irte { #ifdef CONFIG_X86_LOCAL_APIC struct irq_data; +struct irq_domain; + +struct irq_alloc_info { + u32 flags; + const struct cpumask *mask; /* CPU mask for vector allocation */ +}; + +enum { + /* Allocate contiguous CPU vectors */ + X86_IRQ_ALLOC_CONTIGUOUS_VECTORS = 0x1, +}; struct irq_cfg { cpumask_var_t domain; @@ -135,6 +146,12 @@ struct irq_cfg { }; }; +extern struct irq_domain *x86_vector_domain; + +extern void init_irq_alloc_info(struct irq_alloc_info *info, + const struct cpumask *mask); +extern void copy_irq_alloc_info(struct irq_alloc_info *dst, + struct irq_alloc_info *src); extern struct irq_cfg *irq_cfg(unsigned int irq); extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data); extern struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f4dc2462a1ac..56d532106ef3 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2356,9 +2356,6 @@ static int mp_irqdomain_create(int ioapic) ioapic_dynirq_base = max(ioapic_dynirq_base, gsi_cfg->gsi_end + 1); - if (gsi_cfg->gsi_base == 0) - irq_set_default_host(ip->irqdomain); - return 0; } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index c724ef6b218c..6358d8d351f5 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -3,6 +3,8 @@ * * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * Moved from arch/x86/kernel/apic/io_apic.c. 
+ * Jiang Liu + * Enable support of hierarchical irqdomains * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -19,7 +21,9 @@ #include #include +struct irq_domain *x86_vector_domain; static DEFINE_RAW_SPINLOCK(vector_lock); +static struct irq_chip lapic_controller; void lock_vector_lock(void) { @@ -36,15 +40,21 @@ void unlock_vector_lock(void) struct irq_cfg *irq_cfg(unsigned int irq) { - return irq_get_chip_data(irq); + return irqd_cfg(irq_get_irq_data(irq)); } struct irq_cfg *irqd_cfg(struct irq_data *irq_data) { + if (!irq_data) + return NULL; + + while (irq_data->parent_data) + irq_data = irq_data->parent_data; + return irq_data->chip_data; } -static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) +static struct irq_cfg *alloc_irq_cfg(int node) { struct irq_cfg *cfg; @@ -79,7 +89,7 @@ struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) return cfg; } - cfg = alloc_irq_cfg(at, node); + cfg = alloc_irq_cfg(node); if (cfg) irq_set_chip_data(at, cfg); else @@ -87,14 +97,13 @@ struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) return cfg; } -static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) +static void free_irq_cfg(struct irq_cfg *cfg) { - if (!cfg) - return; - irq_set_chip_data(at, NULL); - free_cpumask_var(cfg->domain); - free_cpumask_var(cfg->old_domain); - kfree(cfg); + if (cfg) { + free_cpumask_var(cfg->domain); + free_cpumask_var(cfg->old_domain); + kfree(cfg); + } } static int @@ -241,6 +250,90 @@ void clear_irq_vector(int irq, struct irq_cfg *cfg) raw_spin_unlock_irqrestore(&vector_lock, flags); } +void init_irq_alloc_info(struct irq_alloc_info *info, + const struct cpumask *mask) +{ + memset(info, 0, sizeof(*info)); + info->mask = mask; +} + +void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src) +{ + if (src) + *dst = *src; + else + memset(dst, 0, sizeof(*dst)); +} + +static inline const struct cpumask * +irq_alloc_info_get_mask(struct irq_alloc_info *info) +{ + return (!info || !info->mask) ? 
apic->target_cpus() : info->mask; +} + +static void x86_vector_free_irqs(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *irq_data; + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i); + if (irq_data && irq_data->chip_data) { + free_remapped_irq(virq); + clear_irq_vector(virq + i, irq_data->chip_data); + free_irq_cfg(irq_data->chip_data); + irq_domain_reset_irq_data(irq_data); + } + } +} + +static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_alloc_info *info = arg; + const struct cpumask *mask; + struct irq_data *irq_data; + struct irq_cfg *cfg; + int i, err; + + if (disable_apic) + return -ENXIO; + + /* Currently vector allocator can't guarantee contiguous allocations */ + if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1) + return -ENOSYS; + + mask = irq_alloc_info_get_mask(info); + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(domain, virq + i); + BUG_ON(!irq_data); + cfg = alloc_irq_cfg(irq_data->node); + if (!cfg) { + err = -ENOMEM; + goto error; + } + + irq_data->chip = &lapic_controller; + irq_data->chip_data = cfg; + irq_data->hwirq = virq + i; + err = assign_irq_vector(virq, cfg, mask); + if (err) + goto error; + } + + return 0; + +error: + x86_vector_free_irqs(domain, virq, i + 1); + return err; +} + +static struct irq_domain_ops x86_vector_domain_ops = { + .alloc = x86_vector_alloc_irqs, + .free = x86_vector_free_irqs, +}; + int __init arch_probe_nr_irqs(void) { int nr; @@ -266,6 +359,11 @@ int __init arch_probe_nr_irqs(void) int __init arch_early_irq_init(void) { + x86_vector_domain = irq_domain_add_tree(NULL, &x86_vector_domain_ops, + NULL); + BUG_ON(x86_vector_domain == NULL); + irq_set_default_host(x86_vector_domain); + return arch_early_ioapic_init(); } @@ -380,6 +478,36 @@ int apic_set_affinity(struct irq_data *data, const struct cpumask *mask, return 0; } +static int vector_set_affinity(struct irq_data *irq_data, + const struct cpumask *dest, bool force) +{ + struct irq_cfg *cfg = irq_data->chip_data; + int err, irq = irq_data->irq; + + if (!config_enabled(CONFIG_SMP)) + return -EPERM; + + if (!cpumask_intersects(dest, cpu_online_mask)) + return -EINVAL; + + err = assign_irq_vector(irq, cfg, dest); + if (err) { + struct irq_data *top = irq_get_irq_data(irq); + + if (assign_irq_vector(irq, cfg, top->affinity)) + pr_err("Failed to recover vector for irq %d\n", irq); + return err; + } + + return IRQ_SET_MASK_OK; +} + +static struct irq_chip lapic_controller = { + .irq_ack = apic_ack_edge, + .irq_set_affinity = vector_set_affinity, + .irq_retrigger = apic_retrigger_irq, +}; + #ifdef CONFIG_SMP void send_cleanup_vector(struct irq_cfg *cfg) { @@ -497,7 +625,7 @@ int arch_setup_hwirq(unsigned int irq, int node) unsigned long flags; int ret; - cfg = alloc_irq_cfg(irq, node); + cfg = alloc_irq_cfg(node); if (!cfg) return -ENOMEM; @@ -508,7 +636,7 @@ int arch_setup_hwirq(unsigned int irq, int node) if (!ret) irq_set_chip_data(irq, cfg); else - free_irq_cfg(irq, cfg); + free_irq_cfg(cfg); return ret; } @@ -518,7 +646,8 @@ void arch_teardown_hwirq(unsigned int irq) free_remapped_irq(irq); clear_irq_vector(irq, cfg); - free_irq_cfg(irq, cfg); + irq_set_chip_data(irq, NULL); + free_irq_cfg(cfg); } static void __init print_APIC_field(int base) -- cgit v1.2.3 From bd8eb63f8a3907bb477992145cb6ce0064a1e43f Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 
14:11:25 +0800 Subject: x86/hpet: Use new irqdomain interfaces to allocate/free IRQ Use new irqdomain interfaces to allocate/free IRQ for HPET, so we can remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ later. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Srivatsa S. Bhat Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Andy Lutomirski Link: http://lkml.kernel.org/r/1416894816-23245-4-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 3acbff4716b0..ae29554f57ea 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -476,7 +477,7 @@ static int hpet_msi_next_event(unsigned long delta, static int hpet_setup_msi_irq(unsigned int irq) { if (x86_msi.setup_hpet_msi(irq, hpet_blockid)) { - irq_free_hwirq(irq); + irq_domain_free_irqs(irq, 1); return -EINVAL; } return 0; @@ -484,9 +485,10 @@ static int hpet_setup_msi_irq(unsigned int irq) static int hpet_assign_irq(struct hpet_dev *dev) { - unsigned int irq = irq_alloc_hwirq(-1); + int irq; - if (!irq) + irq = irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL); + if (irq <= 0) return -EINVAL; irq_set_handler_data(irq, dev); -- cgit v1.2.3 From 4c8f9960ee497020d0858362c81ece984bc89aa5 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:26 +0800 Subject: x86/MSI: Use new irqdomain interfaces to allocate/free IRQ Use new irqdomain interfaces to allocate/free IRQ for PCI MSI, so we can remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ later. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1416894816-23245-5-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index d6ba2d660dc5..76cc2c902176 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -146,23 +147,20 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { struct msi_desc *msidesc; - unsigned int irq; - int node, ret; + int irq, ret; /* Multiple MSI vectors only supported with interrupt remapping */ if (type == PCI_CAP_ID_MSI && nvec > 1) return 1; - node = dev_to_node(&dev->dev); - list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = irq_alloc_hwirq(node); - if (!irq) + irq = irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL); + if (irq <= 0) return -ENOSPC; ret = setup_msi_irq(dev, msidesc, irq, 0); if (ret < 0) { - irq_free_hwirq(irq); + irq_domain_free_irqs(irq, 1); return ret; } @@ -172,7 +170,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) void native_teardown_msi_irq(unsigned int irq) { - irq_free_hwirq(irq); + irq_domain_free_irqs(irq, 1); } #ifdef CONFIG_DMAR_TABLE -- cgit v1.2.3 From af87baedf2c23b1181f51323339210a26a64f7fc Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:28 +0800 Subject: x86/htirq: Use new irqdomain interfaces to allocate/free IRQ Use new irqdomain interfaces to allocate/free IRQ for HTIRQ, so we can remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ later. This patch changes the interfaces between the arch-independent PCI driver and the arch-specific code. Currently HT_IRQ is only enabled on x86, so it does not affect other architectures. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J.
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-7-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/htirq.c | 26 +++++++++++++------------- drivers/pci/htirq.c | 7 +++---- include/linux/htirq.h | 2 ++ 3 files changed, 18 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c index 816f36e979ad..b307ee7a7148 100644 --- a/arch/x86/kernel/apic/htirq.c +++ b/arch/x86/kernel/apic/htirq.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -61,31 +62,30 @@ static struct irq_chip ht_irq_chip = { .flags = IRQCHIP_SKIP_SET_WAKE, }; +int arch_alloc_ht_irq(struct pci_dev *dev) +{ + return irq_domain_alloc_irqs(NULL, 1, dev_to_node(&dev->dev), NULL); +} + +void arch_free_ht_irq(int irq) +{ + irq_domain_free_irqs(irq, 1); +} + int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) { struct irq_cfg *cfg; struct ht_irq_msg msg; - unsigned dest; - int err; if (disable_apic) return -ENXIO; cfg = irq_cfg(irq); - err = assign_irq_vector(irq, cfg, apic->target_cpus()); - if (err) - return err; - - err = apic->cpu_mask_to_apicid_and(cfg->domain, - apic->target_cpus(), &dest); - if (err) - return err; - - msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); + msg.address_hi = HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); msg.address_lo = HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(dest) | + HT_IRQ_LOW_DEST_ID(cfg->dest_apicid) | HT_IRQ_LOW_VECTOR(cfg->vector) | ((apic->irq_dest_mode == 0) ? HT_IRQ_LOW_DM_PHYSICAL : diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c index a94dd2c4183a..ceb0ebeb7b5f 100644 --- a/drivers/pci/htirq.c +++ b/drivers/pci/htirq.c @@ -117,8 +117,8 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update) cfg->msg.address_lo = 0xffffffff; cfg->msg.address_hi = 0xffffffff; - irq = irq_alloc_hwirq(dev_to_node(&dev->dev)); - if (!irq) { + irq = arch_alloc_ht_irq(dev); + if (irq <= 0) { kfree(cfg); return -EBUSY; } @@ -163,8 +163,7 @@ void ht_destroy_irq(unsigned int irq) cfg = irq_get_handler_data(irq); irq_set_chip(irq, NULL); irq_set_handler_data(irq, NULL); - irq_free_hwirq(irq); - + arch_free_ht_irq(irq); kfree(cfg); } EXPORT_SYMBOL(ht_destroy_irq); diff --git a/include/linux/htirq.h b/include/linux/htirq.h index 70a1dbbf2093..5caa51b7b95c 100644 --- a/include/linux/htirq.h +++ b/include/linux/htirq.h @@ -15,6 +15,8 @@ void unmask_ht_irq(struct irq_data *data); /* The arch hook for getting things started */ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev); +int arch_alloc_ht_irq(struct pci_dev *dev); +void arch_free_ht_irq(int irq); /* For drivers of buggy hardware */ typedef void (ht_irq_update_t)(struct pci_dev *dev, int irq, -- cgit v1.2.3 From a62b32cdd0a6324c959f40b3c9b928b275297066 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:29 +0800 Subject: x86/dmar: Use new irqdomain interfaces to allocate/free IRQ Use new irqdomain interfaces to allocate/free IRQ for DMAR and interrupt remapping, so we can remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ later. The private definitions of irq_alloc_hwirqs()/irq_free_hwirqs() are a temporary solution; they will be removed once the interrupt remapping driver has been converted to the irqdomain framework, as sketched below.
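The conversion pattern repeated across this series, as a minimal sketch (the calls are taken from the hunks in these patches; note the differing failure conventions - 0 for the legacy API vs. <= 0 for the new one - and that a NULL domain selects the default x86 vector domain; example_alloc_one_irq is purely illustrative):

	static int example_alloc_one_irq(int node)
	{
		/* was: irq = irq_alloc_hwirq(node); if (!irq) ... */
		int irq = irq_domain_alloc_irqs(NULL, 1, node, NULL);

		if (irq <= 0)
			return -ENOSPC;
		/* ... program and use the interrupt ... */
		irq_domain_free_irqs(irq, 1);	/* was: irq_free_hwirq(irq) */
		return 0;
	}
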
Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Joerg Roedel Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1428905519-23704-8-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/irq_remapping.h | 4 ++-- arch/x86/kernel/apic/msi.c | 10 ++++++++++ drivers/iommu/irq_remapping.c | 17 +++++++++++++++-- 3 files changed, 27 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 6224d316c405..86d897bc15dd 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -99,7 +99,7 @@ static inline bool setup_remapped_irq(int irq, } #endif /* CONFIG_IRQ_REMAP */ -#define dmar_alloc_hwirq() irq_alloc_hwirq(-1) -#define dmar_free_hwirq irq_free_hwirq +extern int dmar_alloc_hwirq(void); +extern void dmar_free_hwirq(int irq); #endif /* __X86_IRQ_REMAPPING_H */ diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 76cc2c902176..9be7d6d8a579 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -223,6 +223,16 @@ int arch_setup_dmar_msi(unsigned int irq) "edge"); return 0; } + +int dmar_alloc_hwirq(void) +{ + return irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL); +} + +void dmar_free_hwirq(int irq) +{ + irq_domain_free_irqs(irq, 1); +} #endif /* diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index 390079ee1350..5617150fd8fb 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -49,6 +50,18 @@ static void irq_remapping_disable_io_apic(void) disconnect_bsp_APIC(0); } +#ifndef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ +static unsigned int irq_alloc_hwirqs(int cnt, int node) +{ + return irq_domain_alloc_irqs(NULL, -1, cnt, node, NULL); +} + +static void irq_free_hwirqs(unsigned int from, int cnt) +{ + irq_domain_free_irqs(from, cnt); +} +#endif + static int do_setup_msi_irqs(struct pci_dev *dev, int nvec) { int ret, sub_handle, nvec_pow2, index = 0; @@ -104,7 +117,7 @@ static int do_setup_msix_irqs(struct pci_dev *dev, int nvec) list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = irq_alloc_hwirq(node); + irq = irq_alloc_hwirqs(1, node); if (irq == 0) return -1; @@ -127,7 +140,7 @@ static int do_setup_msix_irqs(struct pci_dev *dev, int nvec) return 0; error: - irq_free_hwirq(irq); + irq_free_hwirqs(irq, 1); return ret; } -- cgit v1.2.3 From 3cb96f0c97330834929abe9bd2ca3c252a83def0 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:34 +0800 Subject: x86/hpet: Enhance HPET IRQ to support hierarchical irqdomains Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Srivatsa S. Bhat Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Andy Lutomirski Link: http://lkml.kernel.org/r/1428905519-23704-13-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hpet.h | 7 +- arch/x86/kernel/apic/msi.c | 166 +++++++++++++++++++++++++++++++++++++++----- arch/x86/kernel/hpet.c | 57 ++++----------- 3 files changed, 167 insertions(+), 63 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 36f7125945e3..e87e9faf87a9 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -74,11 +74,16 @@ extern unsigned int hpet_readl(unsigned int a); extern void force_hpet_resume(void); struct irq_data; +struct hpet_dev; +struct irq_domain; + extern void hpet_msi_unmask(struct irq_data *data); extern void hpet_msi_mask(struct irq_data *data); -struct hpet_dev; extern void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg); extern void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg); +extern struct irq_domain *hpet_create_irq_domain(int hpet_id); +extern int hpet_assign_irq(struct irq_domain *domain, + struct hpet_dev *dev, int dev_num); #ifdef CONFIG_PCI_MSI extern int default_setup_hpet_msi(unsigned int irq, unsigned int id); diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 9be7d6d8a579..10d9ae8f2166 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -51,6 +51,44 @@ void native_compose_msi_msg(struct pci_dev *pdev, MSI_DATA_VECTOR(cfg->vector); } +static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct irq_cfg *cfg = irqd_cfg(data); + + msg->address_hi = MSI_ADDR_BASE_HI; + + if (x2apic_enabled()) + msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); + + msg->address_lo = + MSI_ADDR_BASE_LO | + ((apic->irq_dest_mode == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL : + MSI_ADDR_DEST_MODE_LOGICAL) | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU : + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(cfg->dest_apicid); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((apic->irq_delivery_mode != dest_LowestPrio) ? 
+ MSI_DATA_DELIVERY_FIXED : + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(cfg->vector); +} + +static void msi_update_msg(struct msi_msg *msg, struct irq_data *irq_data) +{ + struct irq_cfg *cfg = irqd_cfg(irq_data); + + msg->data &= ~MSI_DATA_VECTOR_MASK; + msg->data |= MSI_DATA_VECTOR(cfg->vector); + msg->address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg->address_lo |= MSI_ADDR_DEST_ID(cfg->dest_apicid); +} + static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg, u8 hpet_id) { @@ -239,44 +277,43 @@ void dmar_free_hwirq(int irq) * MSI message composition */ #ifdef CONFIG_HPET_TIMER +static inline int hpet_dev_id(struct irq_domain *domain) +{ + return (int)(long)domain->host_data; +} static int hpet_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - struct irq_cfg *cfg = irqd_cfg(data); + struct irq_data *parent = data->parent_data; struct msi_msg msg; - unsigned int dest; int ret; - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; - - hpet_msi_read(data->handler_data, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - hpet_msi_write(data->handler_data, &msg); + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { + hpet_msi_read(data->handler_data, &msg); + msi_update_msg(&msg, data); + hpet_msi_write(data->handler_data, &msg); + } - return IRQ_SET_MASK_OK_NOCOPY; + return ret; } -static struct irq_chip hpet_msi_type = { +static struct irq_chip hpet_msi_controller = { .name = "HPET_MSI", .irq_unmask = hpet_msi_unmask, .irq_mask = hpet_msi_mask, - .irq_ack = apic_ack_edge, + .irq_ack = irq_chip_ack_parent, .irq_set_affinity = hpet_msi_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_print_chip = irq_remapping_print_chip, + .irq_compose_msi_msg = irq_msi_compose_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; int default_setup_hpet_msi(unsigned int irq, unsigned int id) { - struct irq_chip *chip = &hpet_msi_type; + struct irq_chip *chip = &hpet_msi_controller; struct msi_msg msg; int ret; @@ -291,4 +328,95 @@ int default_setup_hpet_msi(unsigned int irq, unsigned int id) irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); return 0; } + +static int hpet_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_alloc_info *info = arg; + int ret; + + if (nr_irqs > 1 || !info || info->type != X86_IRQ_ALLOC_TYPE_HPET) + return -EINVAL; + if (irq_find_mapping(domain, info->hpet_index)) { + pr_warn("IRQ for HPET%d already exists.\n", info->hpet_index); + return -EEXIST; + } + + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret >= 0) { + irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_domain_set_hwirq_and_chip(domain, virq, info->hpet_index, + &hpet_msi_controller, NULL); + irq_set_handler_data(virq, info->hpet_data); + __irq_set_handler(virq, handle_edge_irq, 0, "edge"); + } + + return ret; +} + +static void hpet_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + BUG_ON(nr_irqs > 1); + irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_domain_free_irqs_top(domain, virq, nr_irqs); +} + +static void hpet_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct msi_msg msg; + + BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); + 
hpet_msi_write(irq_get_handler_data(irq_data->irq), &msg); +} + +static void hpet_domain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct msi_msg msg; + + memset(&msg, 0, sizeof(msg)); + hpet_msi_write(irq_get_handler_data(irq_data->irq), &msg); +} + +static struct irq_domain_ops hpet_domain_ops = { + .alloc = hpet_domain_alloc, + .free = hpet_domain_free, + .activate = hpet_domain_activate, + .deactivate = hpet_domain_deactivate, +}; + +struct irq_domain *hpet_create_irq_domain(int hpet_id) +{ + struct irq_domain *parent; + struct irq_alloc_info info; + + if (x86_vector_domain == NULL) + return NULL; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_HPET; + info.hpet_id = hpet_id; + parent = irq_remapping_get_ir_irq_domain(&info); + if (parent == NULL) + parent = x86_vector_domain; + + return irq_domain_add_hierarchy(parent, 0, 0, NULL, &hpet_domain_ops, + (void *)(long)hpet_id); +} + +int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev, + int dev_num) +{ + struct irq_alloc_info info; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_HPET; + info.hpet_data = dev; + info.hpet_id = hpet_dev_id(domain); + info.hpet_index = dev_num; + + return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); +} #endif diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ae29554f57ea..e3bc18080052 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -306,8 +306,6 @@ static void hpet_legacy_clockevent_register(void) printk(KERN_DEBUG "hpet clockevent registered\n"); } -static int hpet_setup_msi_irq(unsigned int irq); - static void hpet_set_mode(enum clock_event_mode mode, struct clock_event_device *evt, int timer) { @@ -358,7 +356,7 @@ static void hpet_set_mode(enum clock_event_mode mode, hpet_enable_legacy_int(); } else { struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - hpet_setup_msi_irq(hdev->irq); + irq_domain_activate_irq(irq_get_irq_data(hdev->irq)); disable_irq(hdev->irq); irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); enable_irq(hdev->irq); @@ -424,6 +422,7 @@ static int hpet_legacy_next_event(unsigned long delta, static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); static struct hpet_dev *hpet_devs; +static struct irq_domain *hpet_domain; void hpet_msi_unmask(struct irq_data *data) { @@ -474,32 +473,6 @@ static int hpet_msi_next_event(unsigned long delta, return hpet_next_event(delta, evt, hdev->num); } -static int hpet_setup_msi_irq(unsigned int irq) -{ - if (x86_msi.setup_hpet_msi(irq, hpet_blockid)) { - irq_domain_free_irqs(irq, 1); - return -EINVAL; - } - return 0; -} - -static int hpet_assign_irq(struct hpet_dev *dev) -{ - int irq; - - irq = irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL); - if (irq <= 0) - return -EINVAL; - - irq_set_handler_data(irq, dev); - - if (hpet_setup_msi_irq(irq)) - return -EINVAL; - - dev->irq = irq; - return 0; -} - static irqreturn_t hpet_interrupt_handler(int irq, void *data) { struct hpet_dev *dev = (struct hpet_dev *)data; @@ -542,9 +515,6 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) if (!(hdev->flags & HPET_DEV_VALID)) return; - if (hpet_setup_msi_irq(hdev->irq)) - return; - hdev->cpu = cpu; per_cpu(cpu_hpet_dev, cpu) = hdev; evt->name = hdev->name; @@ -576,7 +546,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) unsigned int id; unsigned int num_timers; unsigned int num_timers_used = 0; - int i; + int i, irq; if (hpet_msi_disable) return; @@ -589,6 +559,10 @@ static void
hpet_msi_capability_lookup(unsigned int start_timer) num_timers++; /* Value read out starts from 0 */ hpet_print_config(); + hpet_domain = hpet_create_irq_domain(hpet_blockid); + if (!hpet_domain) + return; + hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL); if (!hpet_devs) return; @@ -603,15 +577,16 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) if (!(cfg & HPET_TN_FSB_CAP)) continue; + irq = hpet_assign_irq(hpet_domain, hdev, hdev->num); + if (irq < 0) + continue; + + sprintf(hdev->name, "hpet%d", i); + hdev->num = i; + hdev->irq = irq; hdev->flags = 0; if (cfg & HPET_TN_PERIODIC_CAP) hdev->flags |= HPET_DEV_PERI_CAP; - hdev->num = i; - - sprintf(hdev->name, "hpet%d", i); - if (hpet_assign_irq(hdev)) - continue; - hdev->flags |= HPET_DEV_FSB_CAP; hdev->flags |= HPET_DEV_VALID; num_timers_used++; @@ -711,10 +686,6 @@ static int hpet_cpuhp_notify(struct notifier_block *n, } #else -static int hpet_setup_msi_irq(unsigned int irq) -{ - return 0; -} static void hpet_msi_capability_lookup(unsigned int start_timer) { return; -- cgit v1.2.3 From 52f518a3a7c2f80551a38d38be28bc9f335e713c Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:35 +0800 Subject: x86/MSI: Use hierarchical irqdomains to manage MSI interrupts Enhance the MSI code to support hierarchical irqdomains; this helps to make the architecture clearer. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Joerg Roedel Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1428905519-23704-14-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 1 + arch/x86/include/asm/hw_irq.h | 9 ++- arch/x86/include/asm/irq_remapping.h | 6 +- arch/x86/include/asm/msi.h | 7 ++ arch/x86/kernel/apic/msi.c | 141 +++++++++++++++++++---------------- arch/x86/kernel/apic/vector.c | 2 + drivers/iommu/irq_remapping.c | 1 - 7 files changed, 94 insertions(+), 73 deletions(-) create mode 100644 arch/x86/include/asm/msi.h (limited to 'arch/x86/kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e75a96c38c58..26e066579637 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -914,6 +914,7 @@ config X86_LOCAL_APIC depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ select IRQ_DOMAIN_HIERARCHY + select PCI_MSI_IRQ_DOMAIN if PCI_MSI config X86_IO_APIC def_bool y diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 75a97a5bbfa8..05829e973a2a 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -110,9 +110,10 @@ struct irq_2_irte { }; #endif /* CONFIG_IRQ_REMAP */ +struct irq_domain; + #ifdef CONFIG_X86_LOCAL_APIC struct irq_data; -struct irq_domain; struct pci_dev; struct msi_desc; @@ -214,6 +215,12 @@ static inline void lock_vector_lock(void) {} static inline void unlock_vector_lock(void) {} #endif /* CONFIG_X86_LOCAL_APIC */ +#ifdef CONFIG_PCI_MSI +extern void arch_init_msi_domain(struct irq_domain *domain); +#else +static inline void arch_init_msi_domain(struct irq_domain *domain) { } +#endif + /* Statistics */ extern atomic_t irq_err_count; extern atomic_t irq_mis_count; diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index
0cd6195cc375..0d3bbd0e2c42 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -66,11 +66,7 @@ irq_remapping_get_irq_domain(struct irq_alloc_info *info); extern void irq_remapping_print_chip(struct irq_data *data, struct seq_file *p); /* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */ -static inline struct irq_domain * -arch_create_msi_irq_domain(struct irq_domain *parent) -{ - return NULL; -} +extern struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent); /* Get parent irqdomain for interrupt remapping irqdomain */ static inline struct irq_domain *arch_get_ir_parent_domain(void) diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h new file mode 100644 index 000000000000..93724cc62177 --- /dev/null +++ b/arch/x86/include/asm/msi.h @@ -0,0 +1,7 @@ +#ifndef _ASM_X86_MSI_H +#define _ASM_X86_MSI_H +#include + +typedef struct irq_alloc_info msi_alloc_info_t; + +#endif /* _ASM_X86_MSI_H */ diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 10d9ae8f2166..c426cd58844e 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -3,6 +3,8 @@ * * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * Moved from arch/x86/kernel/apic/io_apic.c. + * Jiang Liu + * Convert to hierarchical irqdomain * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -21,6 +23,8 @@ #include #include +static struct irq_domain *msi_default_domain; + void native_compose_msi_msg(struct pci_dev *pdev, unsigned int irq, unsigned int dest, struct msi_msg *msg, u8 hpet_id) @@ -114,102 +118,107 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, return 0; } -static int -msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) -{ - struct irq_cfg *cfg = irqd_cfg(data); - struct msi_msg msg; - unsigned int dest; - int ret; - - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; - - __get_cached_msi_msg(data->msi_desc, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - __pci_write_msi_msg(data->msi_desc, &msg); - - return IRQ_SET_MASK_OK_NOCOPY; -} - /* * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, * which implement the MSI or MSI-X Capability Structure. 
*/ -static struct irq_chip msi_chip = { +static struct irq_chip pci_msi_controller = { .name = "PCI-MSI", .irq_unmask = pci_msi_unmask_irq, .irq_mask = pci_msi_mask_irq, - .irq_ack = apic_ack_edge, - .irq_set_affinity = msi_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_ack = irq_chip_ack_parent, + .irq_set_affinity = msi_domain_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_print_chip = irq_remapping_print_chip, + .irq_compose_msi_msg = irq_msi_compose_msg, + .irq_write_msi_msg = pci_msi_domain_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; -int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, - unsigned int irq_base, unsigned int irq_offset) +int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - struct irq_chip *chip = &msi_chip; - struct msi_msg msg; - unsigned int irq = irq_base + irq_offset; - int ret; + struct irq_domain *domain; + struct irq_alloc_info info; - ret = msi_compose_msg(dev, irq, &msg, -1); - if (ret < 0) - return ret; + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_MSI; + info.msi_dev = dev; - irq_set_msi_desc_off(irq_base, irq_offset, msidesc); + domain = irq_remapping_get_irq_domain(&info); + if (domain == NULL) + domain = msi_default_domain; + if (domain == NULL) + return -ENOSYS; - /* - * MSI-X message is written per-IRQ, the offset is always 0. - * MSI message denotes a contiguous group of IRQs, written for 0th IRQ. - */ - if (!irq_offset) - pci_write_msi_msg(irq, &msg); + return pci_msi_domain_alloc_irqs(domain, dev, nvec, type); +} - setup_remapped_irq(irq, irq_cfg(irq), chip); +void native_teardown_msi_irq(unsigned int irq) +{ + irq_domain_free_irqs(irq, 1); +} - irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); +static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info, + msi_alloc_info_t *arg) +{ + return arg->msi_hwirq; +} - dev_dbg(&dev->dev, "irq %d for MSI/MSI-X\n", irq); +static int pci_msi_prepare(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *arg) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct msi_desc *desc = first_pci_msi_entry(pdev); + + init_irq_alloc_info(arg, NULL); + arg->msi_dev = pdev; + if (desc->msi_attrib.is_msix) { + arg->type = X86_IRQ_ALLOC_TYPE_MSIX; + } else { + arg->type = X86_IRQ_ALLOC_TYPE_MSI; + arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; + } return 0; } -int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +static void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) { - struct msi_desc *msidesc; - int irq, ret; + arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc); +} - /* Multiple MSI vectors only supported with interrupt remapping */ - if (type == PCI_CAP_ID_MSI && nvec > 1) - return 1; +static struct msi_domain_ops pci_msi_domain_ops = { + .get_hwirq = pci_msi_get_hwirq, + .msi_prepare = pci_msi_prepare, + .set_desc = pci_msi_set_desc, +}; - list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL); - if (irq <= 0) - return -ENOSPC; +static struct msi_domain_info pci_msi_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX, + .ops = &pci_msi_domain_ops, + .chip = &pci_msi_controller, + .handler = handle_edge_irq, + .handler_name = "edge", +}; - ret = setup_msi_irq(dev, msidesc, irq, 0); - if (ret < 0) { - irq_domain_free_irqs(irq, 1); - return ret; - } +void arch_init_msi_domain(struct irq_domain *parent) +{ + 
if (disable_apic) + return; - } - return 0; + msi_default_domain = pci_msi_create_irq_domain(NULL, + &pci_msi_domain_info, parent); + if (!msi_default_domain) + pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n"); } -void native_teardown_msi_irq(unsigned int irq) +#ifdef CONFIG_IRQ_REMAP +struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent) { - irq_domain_free_irqs(irq, 1); + return msi_create_irq_domain(NULL, &pci_msi_domain_info, parent); } +#endif #ifdef CONFIG_DMAR_TABLE static int diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 6358d8d351f5..a8d82896be75 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -364,6 +364,8 @@ int __init arch_early_irq_init(void) BUG_ON(x86_vector_domain == NULL); irq_set_default_host(x86_vector_domain); + arch_init_msi_domain(x86_vector_domain); + return arch_early_ioapic_init(); } diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index c306421d86c1..d77e3711c2aa 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -170,7 +170,6 @@ static void __init irq_remapping_modify_x86_ops(void) x86_io_apic_ops.set_affinity = set_remapped_irq_affinity; x86_io_apic_ops.setup_entry = setup_ioapic_remapped_entry; x86_io_apic_ops.eoi_ioapic_pin = eoi_ioapic_pin_remapped; - x86_msi.setup_msi_irqs = irq_remapping_setup_msi_irqs; x86_msi.setup_hpet_msi = setup_hpet_msi_remapped; x86_msi.compose_msi_msg = compose_remapped_msi_msg; } -- cgit v1.2.3 From 80aa283364a17998dceb577bd185e3380b927544 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:36 +0800 Subject: x86/irq: Directly call native_compose_msi_msg() for DMAR IRQ The DMAR interrupt won't be remapped by the interrupt remapping hardware, so directly call native_compose_msi_msg() for the DMAR IRQ to compose the MSI message data. This will help to simplify the MSI code later. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-15-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index c426cd58844e..9adb87100ffe 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -259,12 +259,10 @@ static struct irq_chip dmar_msi_type = { int arch_setup_dmar_msi(unsigned int irq) { - int ret; struct msi_msg msg; + struct irq_cfg *cfg = irq_cfg(irq); - ret = msi_compose_msg(NULL, irq, &msg, -1); - if (ret < 0) - return ret; + native_compose_msi_msg(NULL, irq, cfg->dest_apicid, &msg, -1); dmar_msi_write(irq, &msg); irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, "edge"); -- cgit v1.2.3 From 7a53a12162cbe5feb66380b96cc794a031a8f39a Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:39 +0800 Subject: irq_remapping: Clean up unused MSI related code Now that MSI interrupts have been converted to the new hierarchical irqdomain interfaces, remove the legacy MSI related code and interfaces.
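For illustration, the allocation path that replaces this legacy machinery is roughly the following, condensed from native_setup_msi_irqs() as converted earlier in this series (dev, nvec and type are its usual parameters; this is a simplified sketch, not verbatim kernel code):

	struct irq_alloc_info info;
	struct irq_domain *domain;

	/* Describe the MSI source; the IOMMU may supply a remapping domain. */
	init_irq_alloc_info(&info, NULL);
	info.type = X86_IRQ_ALLOC_TYPE_MSI;
	info.msi_dev = dev;

	domain = irq_remapping_get_irq_domain(&info);
	if (domain == NULL)
		domain = msi_default_domain;	/* non-remapped default */
	if (domain == NULL)
		return -ENOSYS;

	/* The generic PCI/MSI layer allocates through the domain hierarchy. */
	return pci_msi_domain_alloc_irqs(domain, dev, nvec, type);

The per-driver compose and affinity helpers removed below are no longer needed because the hierarchical domains provide that functionality generically.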
Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Rafael J. Wysocki Cc: Joerg Roedel Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Cc: Yijing Wang Link: http://lkml.kernel.org/r/1428905519-23704-18-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/irq_remapping.h | 13 --- arch/x86/include/asm/pci.h | 5 -- arch/x86/kernel/x86_init.c | 2 - drivers/iommu/irq_remapping.c | 151 ----------------------------------- drivers/iommu/irq_remapping.h | 14 ---- 5 files changed, 185 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 0d3bbd0e2c42..b978c68f5d29 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -48,10 +48,6 @@ extern int setup_ioapic_remapped_entry(int irq, int vector, struct io_apic_irq_attr *attr); extern void free_remapped_irq(int irq); -extern void compose_remapped_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id); -extern int setup_hpet_msi_remapped(unsigned int irq, unsigned int id); extern void panic_if_irq_remap(const char *msg); extern bool setup_remapped_irq(int irq, struct irq_cfg *cfg, @@ -91,15 +87,6 @@ static inline int setup_ioapic_remapped_entry(int irq, return -ENODEV; } static inline void free_remapped_irq(int irq) { } -static inline void compose_remapped_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id) -{ -} -static inline int setup_hpet_msi_remapped(unsigned int irq, unsigned int id) -{ - return -ENODEV; -} static inline void panic_if_irq_remap(const char *msg) { diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 4e370a5d8117..d8c80ff32e8c 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -96,15 +96,10 @@ extern void pci_iommu_alloc(void); #ifdef CONFIG_PCI_MSI /* implemented in arch/x86/kernel/apic/io_apic. 
*/ struct msi_desc; -void native_compose_msi_msg(struct pci_dev *pdev, unsigned int irq, - unsigned int dest, struct msi_msg *msg, u8 hpet_id); int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); void native_teardown_msi_irq(unsigned int irq); void native_restore_msi_irqs(struct pci_dev *dev); -int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, - unsigned int irq_base, unsigned int irq_offset); #else -#define native_compose_msi_msg NULL #define native_setup_msi_irqs NULL #define native_teardown_msi_irq NULL #endif diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 234b0722de53..b094d691f2fe 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -111,11 +111,9 @@ EXPORT_SYMBOL_GPL(x86_platform); #if defined(CONFIG_PCI_MSI) struct x86_msi_ops x86_msi = { .setup_msi_irqs = native_setup_msi_irqs, - .compose_msi_msg = native_compose_msi_msg, .teardown_msi_irq = native_teardown_msi_irq, .teardown_msi_irqs = default_teardown_msi_irqs, .restore_msi_irqs = default_restore_msi_irqs, - .setup_hpet_msi = default_setup_hpet_msi, }; /* MSI arch specific hooks */ diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index d77e3711c2aa..3eaa822c30a9 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -25,9 +25,6 @@ int no_x2apic_optout; static int disable_irq_remap; static struct irq_remap_ops *remap_ops; -static int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec); -static int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq, - int index, int sub_handle); static int set_remapped_irq_affinity(struct irq_data *data, const struct cpumask *mask, bool force); @@ -50,109 +47,6 @@ static void irq_remapping_disable_io_apic(void) disconnect_bsp_APIC(0); } -#ifndef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ -static unsigned int irq_alloc_hwirqs(int cnt, int node) -{ - return irq_domain_alloc_irqs(NULL, -1, cnt, node, NULL); -} - -static void irq_free_hwirqs(unsigned int from, int cnt) -{ - irq_domain_free_irqs(from, cnt); -} -#endif - -static int do_setup_msi_irqs(struct pci_dev *dev, int nvec) -{ - int ret, sub_handle, nvec_pow2, index = 0; - unsigned int irq; - struct msi_desc *msidesc; - - msidesc = list_entry(dev->msi_list.next, struct msi_desc, list); - - irq = irq_alloc_hwirqs(nvec, dev_to_node(&dev->dev)); - if (irq == 0) - return -ENOSPC; - - nvec_pow2 = __roundup_pow_of_two(nvec); - for (sub_handle = 0; sub_handle < nvec; sub_handle++) { - if (!sub_handle) { - index = msi_alloc_remapped_irq(dev, irq, nvec_pow2); - if (index < 0) { - ret = index; - goto error; - } - } else { - ret = msi_setup_remapped_irq(dev, irq + sub_handle, - index, sub_handle); - if (ret < 0) - goto error; - } - ret = setup_msi_irq(dev, msidesc, irq, sub_handle); - if (ret < 0) - goto error; - } - return 0; - -error: - irq_free_hwirqs(irq, nvec); - - /* - * Restore altered MSI descriptor fields and prevent just destroyed - * IRQs from tearing down again in default_teardown_msi_irqs() - */ - msidesc->irq = 0; - - return ret; -} - -static int do_setup_msix_irqs(struct pci_dev *dev, int nvec) -{ - int node, ret, sub_handle, index = 0; - struct msi_desc *msidesc; - unsigned int irq; - - node = dev_to_node(&dev->dev); - sub_handle = 0; - - list_for_each_entry(msidesc, &dev->msi_list, list) { - - irq = irq_alloc_hwirqs(1, node); - if (irq == 0) - return -1; - - if (sub_handle == 0) - ret = index = msi_alloc_remapped_irq(dev, irq, nvec); - else - ret = msi_setup_remapped_irq(dev, irq, index, 
sub_handle); - - if (ret < 0) - goto error; - - ret = setup_msi_irq(dev, msidesc, irq, 0); - if (ret < 0) - goto error; - - sub_handle += 1; - irq += 1; - } - - return 0; - -error: - irq_free_hwirqs(irq, 1); - return ret; -} - -static int irq_remapping_setup_msi_irqs(struct pci_dev *dev, - int nvec, int type) -{ - if (type == PCI_CAP_ID_MSI) - return do_setup_msi_irqs(dev, nvec); - else - return do_setup_msix_irqs(dev, nvec); -} - static void eoi_ioapic_pin_remapped(int apic, int pin, int vector) { /* @@ -170,8 +64,6 @@ static void __init irq_remapping_modify_x86_ops(void) x86_io_apic_ops.set_affinity = set_remapped_irq_affinity; x86_io_apic_ops.setup_entry = setup_ioapic_remapped_entry; x86_io_apic_ops.eoi_ioapic_pin = eoi_ioapic_pin_remapped; - x86_msi.setup_hpet_msi = setup_hpet_msi_remapped; - x86_msi.compose_msi_msg = compose_remapped_msi_msg; } static __init int setup_nointremap(char *str) @@ -295,49 +187,6 @@ void free_remapped_irq(int irq) remap_ops->free_irq(irq); } -void compose_remapped_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id) -{ - struct irq_cfg *cfg = irq_cfg(irq); - - if (!irq_remapped(cfg)) - native_compose_msi_msg(pdev, irq, dest, msg, hpet_id); - else if (remap_ops->compose_msi_msg) - remap_ops->compose_msi_msg(pdev, irq, dest, msg, hpet_id); -} - -static int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec) -{ - if (!remap_ops->msi_alloc_irq) - return -ENODEV; - - return remap_ops->msi_alloc_irq(pdev, irq, nvec); -} - -static int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq, - int index, int sub_handle) -{ - if (!remap_ops->msi_setup_irq) - return -ENODEV; - - return remap_ops->msi_setup_irq(pdev, irq, index, sub_handle); -} - -int setup_hpet_msi_remapped(unsigned int irq, unsigned int id) -{ - int ret; - - if (!remap_ops->alloc_hpet_msi) - return -ENODEV; - - ret = remap_ops->alloc_hpet_msi(irq, id); - if (ret) - return -EINVAL; - - return default_setup_hpet_msi(irq, id); -} - void panic_if_irq_remap(const char *msg) { if (irq_remapping_enabled) diff --git a/drivers/iommu/irq_remapping.h b/drivers/iommu/irq_remapping.h index 3e109b1ea688..16b7d814e6fe 100644 --- a/drivers/iommu/irq_remapping.h +++ b/drivers/iommu/irq_remapping.h @@ -66,20 +66,6 @@ struct irq_remap_ops { /* Free an IRQ */ int (*free_irq)(int); - /* Create MSI msg to use for interrupt remapping */ - void (*compose_msi_msg)(struct pci_dev *, - unsigned int, unsigned int, - struct msi_msg *, u8); - - /* Allocate remapping resources for MSI */ - int (*msi_alloc_irq)(struct pci_dev *, int, int); - - /* Setup the remapped MSI irq */ - int (*msi_setup_irq)(struct pci_dev *, unsigned int, int, int); - - /* Setup interrupt remapping for an HPET MSI */ - int (*alloc_hpet_msi)(unsigned int, unsigned int); - /* Get the irqdomain associated the IOMMU device */ struct irq_domain *(*get_ir_irq_domain)(struct irq_alloc_info *); -- cgit v1.2.3 From b1855c752e67d1125d41fadb499014b49a245db8 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:40 +0800 Subject: x86/MSI: Clean up unused MSI related code and interfaces Now that MSI interrupts have been converted to the new hierarchical irqdomain interfaces, remove the legacy MSI related code and interfaces. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J.
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Yijing Wang Link: http://lkml.kernel.org/r/1428905519-23704-19-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hpet.h | 9 ------- arch/x86/include/asm/x86_init.h | 4 --- arch/x86/kernel/apic/msi.c | 55 +++-------------------------------------- 3 files changed, 4 insertions(+), 64 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index e87e9faf87a9..5fa9fb0f8809 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -85,15 +85,6 @@ extern struct irq_domain *hpet_create_irq_domain(int hpet_id); extern int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev, int dev_num); -#ifdef CONFIG_PCI_MSI -extern int default_setup_hpet_msi(unsigned int irq, unsigned int id); -#else -static inline int default_setup_hpet_msi(unsigned int irq, unsigned int id) -{ - return -EINVAL; -} -#endif - #ifdef CONFIG_HPET_EMULATE_RTC #include diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index f58a9c7a3c86..1649bb9ca27c 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -175,13 +175,9 @@ struct msi_msg; struct x86_msi_ops { int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type); - void (*compose_msi_msg)(struct pci_dev *dev, unsigned int irq, - unsigned int dest, struct msi_msg *msg, - u8 hpet_id); void (*teardown_msi_irq)(unsigned int irq); void (*teardown_msi_irqs)(struct pci_dev *dev); void (*restore_msi_irqs)(struct pci_dev *dev); - int (*setup_hpet_msi)(unsigned int irq, unsigned int id); }; struct IO_APIC_route_entry; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 9adb87100ffe..9fe7a08479fa 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -25,16 +25,12 @@ static struct irq_domain *msi_default_domain; -void native_compose_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id) +static void native_compose_msi_msg(struct irq_cfg *cfg, struct msi_msg *msg) { - struct irq_cfg *cfg = irq_cfg(irq); - msg->address_hi = MSI_ADDR_BASE_HI; if (x2apic_enabled()) - msg->address_hi |= MSI_ADDR_EXT_DEST_ID(dest); + msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); msg->address_lo = MSI_ADDR_BASE_LO | @@ -44,7 +40,7 @@ void native_compose_msi_msg(struct pci_dev *pdev, ((apic->irq_delivery_mode != dest_LowestPrio) ? MSI_ADDR_REDIRECTION_CPU : MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); + MSI_ADDR_DEST_ID(cfg->dest_apicid); msg->data = MSI_DATA_TRIGGER_EDGE | @@ -93,31 +89,6 @@ static void msi_update_msg(struct msi_msg *msg, struct irq_data *irq_data) msg->address_lo |= MSI_ADDR_DEST_ID(cfg->dest_apicid); } -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, - struct msi_msg *msg, u8 hpet_id) -{ - struct irq_cfg *cfg; - int err; - unsigned dest; - - if (disable_apic) - return -ENXIO; - - cfg = irq_cfg(irq); - err = assign_irq_vector(irq, cfg, apic->target_cpus()); - if (err) - return err; - - err = apic->cpu_mask_to_apicid_and(cfg->domain, - apic->target_cpus(), &dest); - if (err) - return err; - - x86_msi.compose_msi_msg(pdev, irq, dest, msg, hpet_id); - - return 0; -} - /* * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, * which implement the MSI or MSI-X Capability Structure. 
@@ -262,7 +233,7 @@ int arch_setup_dmar_msi(unsigned int irq) struct msi_msg msg; struct irq_cfg *cfg = irq_cfg(irq); - native_compose_msi_msg(NULL, irq, cfg->dest_apicid, &msg, -1); + native_compose_msi_msg(cfg, &msg); dmar_msi_write(irq, &msg); irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, "edge"); @@ -318,24 +289,6 @@ static struct irq_chip hpet_msi_controller = { .flags = IRQCHIP_SKIP_SET_WAKE, }; -int default_setup_hpet_msi(unsigned int irq, unsigned int id) -{ - struct irq_chip *chip = &hpet_msi_controller; - struct msi_msg msg; - int ret; - - ret = msi_compose_msg(NULL, irq, &msg, id); - if (ret < 0) - return ret; - - hpet_msi_write(irq_get_handler_data(irq), &msg); - irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - setup_remapped_irq(irq, irq_cfg(irq), chip); - - irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); - return 0; -} - static int hpet_domain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { -- cgit v1.2.3 From 34742db8eaf9ff364034f214ee5827701e131d4b Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:41 +0800 Subject: iommu/vt-d: Refine the interfaces to create IRQ for DMAR unit Refine the interfaces to create IRQ for DMAR unit. It's a preparation for converting DMAR IRQ to hierarchical irqdomain on x86. It also moves dmar_alloc_hwirq()/dmar_free_hwirq() from irq_remapping.h to dmar.h. They are not irq_remapping specific. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Vinod Koul Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Tony Luck Cc: Fenghua Yu Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1428905519-23704-20-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/ia64/include/asm/irq_remapping.h | 2 -- arch/ia64/kernel/msi_ia64.c | 30 +++++++++++++++++++----------- arch/x86/include/asm/irq_remapping.h | 4 ---- arch/x86/kernel/apic/msi.c | 24 +++++++++++++----------- drivers/iommu/dmar.c | 19 +++++-------------- include/linux/dmar.h | 3 ++- 6 files changed, 39 insertions(+), 43 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/ia64/include/asm/irq_remapping.h b/arch/ia64/include/asm/irq_remapping.h index e3b3556e2e1b..a8687b1d8906 100644 --- a/arch/ia64/include/asm/irq_remapping.h +++ b/arch/ia64/include/asm/irq_remapping.h @@ -1,6 +1,4 @@ #ifndef __IA64_INTR_REMAPPING_H #define __IA64_INTR_REMAPPING_H #define irq_remapping_enabled 0 -#define dmar_alloc_hwirq create_irq -#define dmar_free_hwirq destroy_irq #endif diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c index 9dd7464f8c17..d70bf15c690a 100644 --- a/arch/ia64/kernel/msi_ia64.c +++ b/arch/ia64/kernel/msi_ia64.c @@ -165,7 +165,7 @@ static struct irq_chip dmar_msi_type = { .irq_retrigger = ia64_msi_retrigger_irq, }; -static int +static void msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) { struct irq_cfg *cfg = irq_cfg + irq; @@ -186,21 +186,29 @@ msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) MSI_DATA_LEVEL_ASSERT | MSI_DATA_DELIVERY_FIXED | MSI_DATA_VECTOR(cfg->vector); - return 0; } -int arch_setup_dmar_msi(unsigned int irq) +int dmar_alloc_hwirq(int id, int node, void *arg) { - int ret; + int irq; struct msi_msg msg; - ret = msi_compose_msg(NULL, irq, &msg); - if (ret < 0) - return 
ret; - dmar_msi_write(irq, &msg); - irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, - "edge"); - return 0; + irq = create_irq(); + if (irq > 0) { + irq_set_handler_data(irq, arg); + irq_set_chip_and_handler_name(irq, &dmar_msi_type, + handle_edge_irq, "edge"); + msi_compose_msg(NULL, irq, &msg); + dmar_msi_write(irq, &msg); + } + + return irq; +} + +void dmar_free_hwirq(int irq) +{ + irq_set_handler_data(irq, NULL); + destroy_irq(irq); } #endif /* CONFIG_INTEL_IOMMU */ diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index b978c68f5d29..bacac1052262 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -117,8 +117,4 @@ irq_remapping_get_irq_domain(struct irq_alloc_info *info) #define irq_remapping_print_chip NULL #endif /* CONFIG_IRQ_REMAP */ - -extern int dmar_alloc_hwirq(void); -extern void dmar_free_hwirq(int irq); - #endif /* __X86_IRQ_REMAPPING_H */ diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 9fe7a08479fa..ca6250439acc 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -228,25 +228,27 @@ static struct irq_chip dmar_msi_type = { .flags = IRQCHIP_SKIP_SET_WAKE, }; -int arch_setup_dmar_msi(unsigned int irq) +int dmar_alloc_hwirq(int id, int node, void *arg) { + int irq; struct msi_msg msg; - struct irq_cfg *cfg = irq_cfg(irq); - native_compose_msi_msg(cfg, &msg); - dmar_msi_write(irq, &msg); - irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, - "edge"); - return 0; -} + irq = irq_domain_alloc_irqs(NULL, 1, node, NULL); + if (irq > 0) { + irq_set_handler_data(irq, arg); + irq_set_chip_and_handler_name(irq, &dmar_msi_type, + handle_edge_irq, "edge"); + native_compose_msi_msg(irq_cfg(irq), &msg); + dmar_msi_write(irq, &msg); + } -int dmar_alloc_hwirq(void) -{ - return irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL); + return irq; } void dmar_free_hwirq(int irq) { + irq_set_handler_data(irq, NULL); + irq_set_handler(irq, NULL); irq_domain_free_irqs(irq, 1); } #endif diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 9847613085e1..536f2d8ea41a 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -1087,8 +1087,8 @@ static void free_iommu(struct intel_iommu *iommu) if (iommu->irq) { free_irq(iommu->irq, iommu); - irq_set_handler_data(iommu->irq, NULL); dmar_free_hwirq(iommu->irq); + iommu->irq = 0; } if (iommu->qi) { @@ -1642,23 +1642,14 @@ int dmar_set_interrupt(struct intel_iommu *iommu) if (iommu->irq) return 0; - irq = dmar_alloc_hwirq(); - if (irq <= 0) { + irq = dmar_alloc_hwirq(iommu->seq_id, iommu->node, iommu); + if (irq > 0) { + iommu->irq = irq; + } else { pr_err("IOMMU: no free vectors\n"); return -EINVAL; } - irq_set_handler_data(irq, iommu); - iommu->irq = irq; - - ret = arch_setup_dmar_msi(irq); - if (ret) { - irq_set_handler_data(irq, NULL); - iommu->irq = 0; - dmar_free_hwirq(irq); - return ret; - } - ret = request_irq(irq, dmar_fault, IRQF_NO_THREAD, iommu->name, iommu); if (ret) pr_err("IOMMU: can't request irq\n"); diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 30624954dec5..84737565c1fd 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -227,6 +227,7 @@ extern void dmar_msi_read(int irq, struct msi_msg *msg); extern void dmar_msi_write(int irq, struct msi_msg *msg); extern int dmar_set_interrupt(struct intel_iommu *iommu); extern irqreturn_t dmar_fault(int irq, void *dev_id); -extern int arch_setup_dmar_msi(unsigned int irq); +extern int 
dmar_alloc_hwirq(int id, int node, void *arg); +extern void dmar_free_hwirq(int irq); #endif /* __DMAR_H__ */ -- cgit v1.2.3 From 0921f1da6425f05a1f56803069124b7ec13b79e2 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:42 +0800 Subject: x86/irq: Use hierarchical irqdomain to manage DMAR interrupts Enhance the DMAR code to support hierarchical irqdomains; this helps to make the architecture clearer. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-21-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 7 ++ arch/x86/kernel/apic/msi.c | 153 ++++++++++++++++++++++++++---------- 2 files changed, 103 insertions(+), 57 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 05829e973a2a..bf1725047c27 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -122,6 +122,7 @@ enum irq_alloc_type { X86_IRQ_ALLOC_TYPE_HPET, X86_IRQ_ALLOC_TYPE_MSI, X86_IRQ_ALLOC_TYPE_MSIX, + X86_IRQ_ALLOC_TYPE_DMAR, }; struct irq_alloc_info { @@ -153,6 +154,12 @@ struct irq_alloc_info { u32 ioapic_valid : 1; struct IO_APIC_route_entry *ioapic_entry; }; +#endif +#ifdef CONFIG_DMAR_TABLE + struct { + int dmar_id; + void *dmar_data; + }; #endif }; }; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index ca6250439acc..f23d17d759b6 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -25,32 +25,6 @@ static struct irq_domain *msi_default_domain; -static void native_compose_msi_msg(struct irq_cfg *cfg, struct msi_msg *msg) -{ - msg->address_hi = MSI_ADDR_BASE_HI; - - if (x2apic_enabled()) - msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); - - msg->address_lo = - MSI_ADDR_BASE_LO | - ((apic->irq_dest_mode == 0) ? - MSI_ADDR_DEST_MODE_PHYSICAL : - MSI_ADDR_DEST_MODE_LOGICAL) | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU : - MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(cfg->dest_apicid); - - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - ((apic->irq_delivery_mode != dest_LowestPrio) ?
- MSI_DATA_DELIVERY_FIXED : - MSI_DATA_DELIVERY_LOWPRI) | - MSI_DATA_VECTOR(cfg->vector); -} - static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) { struct irq_cfg *cfg = irqd_cfg(data); @@ -87,6 +61,9 @@ static void msi_update_msg(struct msi_msg *msg, struct irq_data *irq_data) msg->data |= MSI_DATA_VECTOR(cfg->vector); msg->address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg->address_lo |= MSI_ADDR_DEST_ID(cfg->dest_apicid); + if (x2apic_enabled()) + msg->address_hi = MSI_ADDR_BASE_HI | + MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); } /* @@ -196,59 +173,121 @@ static int dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - struct irq_cfg *cfg = irqd_cfg(data); - unsigned int dest, irq = data->irq; + struct irq_data *parent = data->parent_data; struct msi_msg msg; int ret; - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; - - dmar_msi_read(irq, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest); - - dmar_msi_write(irq, &msg); + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret >= 0) { + dmar_msi_read(data->irq, &msg); + msi_update_msg(&msg, data); + dmar_msi_write(data->irq, &msg); + } - return IRQ_SET_MASK_OK_NOCOPY; + return ret; } -static struct irq_chip dmar_msi_type = { +static struct irq_chip dmar_msi_controller = { .name = "DMAR_MSI", .irq_unmask = dmar_msi_unmask, .irq_mask = dmar_msi_mask, - .irq_ack = apic_ack_edge, + .irq_ack = irq_chip_ack_parent, .irq_set_affinity = dmar_msi_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_compose_msi_msg = irq_msi_compose_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; -int dmar_alloc_hwirq(int id, int node, void *arg) +static int dmar_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_alloc_info *info = arg; + int ret; + + if (nr_irqs > 1 || !info || info->type != X86_IRQ_ALLOC_TYPE_DMAR) + return -EINVAL; + if (irq_find_mapping(domain, info->dmar_id)) { + pr_warn("IRQ for DMAR%d already exists.\n", info->dmar_id); + return -EEXIST; + } + + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret >= 0) { + irq_domain_set_hwirq_and_chip(domain, virq, info->dmar_id, + &dmar_msi_controller, NULL); + irq_set_handler_data(virq, info->dmar_data); + __irq_set_handler(virq, handle_edge_irq, 0, "edge"); + } + + return ret; +} + +static void dmar_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + BUG_ON(nr_irqs > 1); + irq_domain_free_irqs_top(domain, virq, nr_irqs); +} + +static void dmar_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data) { - int irq; struct msi_msg msg; - irq = irq_domain_alloc_irqs(NULL, 1, node, NULL); - if (irq > 0) { - irq_set_handler_data(irq, arg); - irq_set_chip_and_handler_name(irq, &dmar_msi_type, - handle_edge_irq, "edge"); - native_compose_msi_msg(irq_cfg(irq), &msg); - dmar_msi_write(irq, &msg); + BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); + dmar_msi_write(irq_data->irq, &msg); +} + +static void dmar_domain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct msi_msg msg; + + memset(&msg, 0, sizeof(msg)); + dmar_msi_write(irq_data->irq, &msg); +} + +static struct irq_domain_ops dmar_domain_ops = { + .alloc = dmar_domain_alloc, + .free 
= dmar_domain_free, + .activate = dmar_domain_activate, + .deactivate = dmar_domain_deactivate, +}; + +static struct irq_domain *dmar_get_irq_domain(void) +{ + static struct irq_domain *dmar_domain; + static DEFINE_MUTEX(dmar_lock); + + mutex_lock(&dmar_lock); + if (dmar_domain == NULL) { + dmar_domain = irq_domain_add_tree(NULL, &dmar_domain_ops, NULL); + if (dmar_domain) + dmar_domain->parent = x86_vector_domain; } + mutex_unlock(&dmar_lock); + + return dmar_domain; +} + +int dmar_alloc_hwirq(int id, int node, void *arg) +{ + struct irq_domain *domain = dmar_get_irq_domain(); + struct irq_alloc_info info; + + if (!domain) + return -1; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_DMAR; + info.dmar_id = id; + info.dmar_data = arg; - return irq; + return irq_domain_alloc_irqs(domain, 1, node, &info); } void dmar_free_hwirq(int irq) { - irq_set_handler_data(irq, NULL); - irq_set_handler(irq, NULL); irq_domain_free_irqs(irq, 1); } #endif -- cgit v1.2.3 From 49e07d8f28c05347f237146a9ec66f6d958db83e Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:43 +0800 Subject: x86/htirq: Use hierarchical irqdomain to manage Hypertransport interrupts We have slightly changed the architecture interfaces to support the htirq PCI driver. This is safe because HyperTransport interrupts are currently only enabled on x86 platforms. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-22-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 13 ++++ arch/x86/kernel/apic/htirq.c | 161 +++++++++++++++++++++++++++++++----------- arch/x86/kernel/apic/vector.c | 1 + drivers/pci/htirq.c | 47 ++---------- include/linux/htirq.h | 24 +++++-- 5 files changed, 158 insertions(+), 88 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index bf1725047c27..5e0b031d1a7d 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -160,6 +160,14 @@ struct irq_alloc_info { int dmar_id; void *dmar_data; }; +#endif +#ifdef CONFIG_HT_IRQ + struct { + int ht_pos; + int ht_idx; + struct pci_dev *ht_dev; + void *ht_update; + }; #endif }; }; @@ -227,6 +235,11 @@ extern void arch_init_msi_domain(struct irq_domain *domain); #else static inline void arch_init_msi_domain(struct irq_domain *domain) { } #endif +#ifdef CONFIG_HT_IRQ +extern void arch_init_htirq_domain(struct irq_domain *domain); +#else +static inline void arch_init_htirq_domain(struct irq_domain *domain) { } +#endif /* Statistics */ extern atomic_t irq_err_count; diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c index b307ee7a7148..1cae104415ea 100644 --- a/arch/x86/kernel/apic/htirq.c +++ b/arch/x86/kernel/apic/htirq.c @@ -3,6 +3,8 @@ * * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu + * Add support of hierarchical irqdomain * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -19,70 +21,104 @@ #include #include +static struct irq_domain *htirq_domain; + /* * Hypertransport interrupt support */ -static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) -{ - struct ht_irq_msg msg; - - fetch_ht_irq_msg(irq, &msg); - - msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); - msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - - msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); - msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); - - write_ht_irq_msg(irq, &msg); -} - static int ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - struct irq_cfg *cfg = irqd_cfg(data); - unsigned int dest; + struct irq_data *parent = data->parent_data; int ret; - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; - - target_ht_irq(data->irq, dest, cfg->vector); - return IRQ_SET_MASK_OK_NOCOPY; + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret >= 0) { + struct ht_irq_msg msg; + struct irq_cfg *cfg = irqd_cfg(data); + + fetch_ht_irq_msg(data->irq, &msg); + msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | + HT_IRQ_LOW_DEST_ID_MASK); + msg.address_lo |= HT_IRQ_LOW_VECTOR(cfg->vector) | + HT_IRQ_LOW_DEST_ID(cfg->dest_apicid); + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); + write_ht_irq_msg(data->irq, &msg); + } + + return ret; } static struct irq_chip ht_irq_chip = { .name = "PCI-HT", .irq_mask = mask_ht_irq, .irq_unmask = unmask_ht_irq, - .irq_ack = apic_ack_edge, + .irq_ack = irq_chip_ack_parent, .irq_set_affinity = ht_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_retrigger = irq_chip_retrigger_hierarchy, .flags = IRQCHIP_SKIP_SET_WAKE, }; -int arch_alloc_ht_irq(struct pci_dev *dev) +static int htirq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) { - return irq_domain_alloc_irqs(NULL, 1, dev_to_node(&dev->dev), NULL); + struct ht_irq_cfg *ht_cfg; + struct irq_alloc_info *info = arg; + struct pci_dev *dev; + irq_hw_number_t hwirq; + int ret; + + if (nr_irqs > 1 || !info) + return -EINVAL; + + dev = info->ht_dev; + hwirq = (info->ht_idx & 0xFF) | + PCI_DEVID(dev->bus->number, dev->devfn) << 8 | + (pci_domain_nr(dev->bus) & 0xFFFFFFFF) << 24; + if (irq_find_mapping(domain, hwirq) > 0) + return -EEXIST; + + ht_cfg = kmalloc(sizeof(*ht_cfg), GFP_KERNEL); + if (!ht_cfg) + return -ENOMEM; + + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); + if (ret < 0) { + kfree(ht_cfg); + return ret; + } + + /* Initialize msg to a value that will never match the first write. 
*/ + ht_cfg->msg.address_lo = 0xffffffff; + ht_cfg->msg.address_hi = 0xffffffff; + ht_cfg->dev = info->ht_dev; + ht_cfg->update = info->ht_update; + ht_cfg->pos = info->ht_pos; + ht_cfg->idx = 0x10 + (info->ht_idx * 2); + irq_domain_set_info(domain, virq, hwirq, &ht_irq_chip, ht_cfg, + handle_edge_irq, ht_cfg, "edge"); + + return 0; } -void arch_free_ht_irq(int irq) +static void htirq_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) { - irq_domain_free_irqs(irq, 1); + struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); + + BUG_ON(nr_irqs != 1); + kfree(irq_data->chip_data); + irq_domain_free_irqs_top(domain, virq, nr_irqs); } -int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) +static void htirq_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data) { - struct irq_cfg *cfg; struct ht_irq_msg msg; + struct irq_cfg *cfg = irqd_cfg(irq_data); - if (disable_apic) - return -ENXIO; - - cfg = irq_cfg(irq); msg.address_hi = HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); - msg.address_lo = HT_IRQ_LOW_BASE | HT_IRQ_LOW_DEST_ID(cfg->dest_apicid) | @@ -95,13 +131,56 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) HT_IRQ_LOW_MT_FIXED : HT_IRQ_LOW_MT_ARBITRATED) | HT_IRQ_LOW_IRQ_MASKED; + write_ht_irq_msg(irq_data->irq, &msg); +} - write_ht_irq_msg(irq, &msg); +static void htirq_domain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct ht_irq_msg msg; - irq_set_chip_and_handler_name(irq, &ht_irq_chip, - handle_edge_irq, "edge"); + memset(&msg, 0, sizeof(msg)); + write_ht_irq_msg(irq_data->irq, &msg); +} - dev_dbg(&dev->dev, "irq %d for HT\n", irq); +static struct irq_domain_ops htirq_domain_ops = { + .alloc = htirq_domain_alloc, + .free = htirq_domain_free, + .activate = htirq_domain_activate, + .deactivate = htirq_domain_deactivate, +}; - return 0; +void arch_init_htirq_domain(struct irq_domain *parent) +{ + if (disable_apic) + return; + + htirq_domain = irq_domain_add_tree(NULL, &htirq_domain_ops, NULL); + if (!htirq_domain) + pr_warn("failed to initialize irqdomain for HTIRQ.\n"); + else + htirq_domain->parent = parent; +} + +int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev, + ht_irq_update_t *update) +{ + struct irq_alloc_info info; + + if (!htirq_domain) + return -ENOSYS; + + init_irq_alloc_info(&info, NULL); + info.ht_idx = idx; + info.ht_pos = pos; + info.ht_dev = dev; + info.ht_update = update; + + return irq_domain_alloc_irqs(htirq_domain, 1, dev_to_node(&dev->dev), + &info); +} + +void arch_teardown_ht_irq(unsigned int irq) +{ + irq_domain_free_irqs(irq, 1); } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index a8d82896be75..b4b6b5a13440 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -365,6 +365,7 @@ int __init arch_early_irq_init(void) irq_set_default_host(x86_vector_domain); arch_init_msi_domain(x86_vector_domain); + arch_init_htirq_domain(x86_vector_domain); return arch_early_ioapic_init(); } diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c index ceb0ebeb7b5f..7eb4109a3df4 100644 --- a/drivers/pci/htirq.c +++ b/drivers/pci/htirq.c @@ -23,20 +23,11 @@ */ static DEFINE_SPINLOCK(ht_irq_lock); -struct ht_irq_cfg { - struct pci_dev *dev; - /* Update callback used to cope with buggy hardware */ - ht_irq_update_t *update; - unsigned pos; - unsigned idx; - struct ht_irq_msg msg; -}; - - void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg) { struct ht_irq_cfg *cfg = irq_get_handler_data(irq); 
unsigned long flags; + spin_lock_irqsave(&ht_irq_lock, flags); if (cfg->msg.address_lo != msg->address_lo) { pci_write_config_byte(cfg->dev, cfg->pos + 2, cfg->idx); @@ -55,6 +46,7 @@ void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg) void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg) { struct ht_irq_cfg *cfg = irq_get_handler_data(irq); + *msg = cfg->msg; } @@ -86,7 +78,6 @@ void unmask_ht_irq(struct irq_data *data) */ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update) { - struct ht_irq_cfg *cfg; int max_irq, pos, irq; unsigned long flags; u32 data; @@ -105,29 +96,9 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update) if (idx > max_irq) return -EINVAL; - cfg = kmalloc(sizeof(*cfg), GFP_KERNEL); - if (!cfg) - return -ENOMEM; - - cfg->dev = dev; - cfg->update = update; - cfg->pos = pos; - cfg->idx = 0x10 + (idx * 2); - /* Initialize msg to a value that will never match the first write. */ - cfg->msg.address_lo = 0xffffffff; - cfg->msg.address_hi = 0xffffffff; - - irq = arch_alloc_ht_irq(dev); - if (irq <= 0) { - kfree(cfg); - return -EBUSY; - } - irq_set_handler_data(irq, cfg); - - if (arch_setup_ht_irq(irq, dev) < 0) { - ht_destroy_irq(irq); - return -EBUSY; - } + irq = arch_setup_ht_irq(idx, pos, dev, update); + if (irq > 0) + dev_dbg(&dev->dev, "irq %d for HT\n", irq); return irq; } @@ -158,12 +129,6 @@ EXPORT_SYMBOL(ht_create_irq); */ void ht_destroy_irq(unsigned int irq) { - struct ht_irq_cfg *cfg; - - cfg = irq_get_handler_data(irq); - irq_set_chip(irq, NULL); - irq_set_handler_data(irq, NULL); - arch_free_ht_irq(irq); - kfree(cfg); + arch_teardown_ht_irq(irq); } EXPORT_SYMBOL(ht_destroy_irq); diff --git a/include/linux/htirq.h b/include/linux/htirq.h index 5caa51b7b95c..d4a527e58434 100644 --- a/include/linux/htirq.h +++ b/include/linux/htirq.h @@ -1,26 +1,38 @@ #ifndef LINUX_HTIRQ_H #define LINUX_HTIRQ_H +struct pci_dev; +struct irq_data; + struct ht_irq_msg { u32 address_lo; /* low 32 bits of the ht irq message */ u32 address_hi; /* high 32 bits of the it irq message */ }; +typedef void (ht_irq_update_t)(struct pci_dev *dev, int irq, + struct ht_irq_msg *msg); + +struct ht_irq_cfg { + struct pci_dev *dev; + /* Update callback used to cope with buggy hardware */ + ht_irq_update_t *update; + unsigned pos; + unsigned idx; + struct ht_irq_msg msg; +}; + /* Helper functions.. */ void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg); void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg); -struct irq_data; void mask_ht_irq(struct irq_data *data); void unmask_ht_irq(struct irq_data *data); /* The arch hook for getting things started */ -int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev); -int arch_alloc_ht_irq(struct pci_dev *dev); -void arch_free_ht_irq(int irq); +int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev, + ht_irq_update_t *update); +void arch_teardown_ht_irq(unsigned int irq); /* For drivers of buggy hardware */ -typedef void (ht_irq_update_t)(struct pci_dev *dev, int irq, - struct ht_irq_msg *msg); int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update); #endif /* LINUX_HTIRQ_H */ -- cgit v1.2.3 From 81dabe2e739d5e0ad8ca2369738fb84bd64f967d Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:45 +0800 Subject: x86/irq: Normalize x86 irq_chip name Some irq_chip names use underscores, others use hyphens, so normalize them to use a hyphen as the separator.
Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-24-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index f23d17d759b6..d17eb6a52c84 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -188,7 +188,7 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, } static struct irq_chip dmar_msi_controller = { - .name = "DMAR_MSI", + .name = "DMAR-MSI", .irq_unmask = dmar_msi_unmask, .irq_mask = dmar_msi_mask, .irq_ack = irq_chip_ack_parent, @@ -319,7 +319,7 @@ static int hpet_msi_set_affinity(struct irq_data *data, } static struct irq_chip hpet_msi_controller = { - .name = "HPET_MSI", + .name = "HPET-MSI", .irq_unmask = hpet_msi_unmask, .irq_mask = hpet_msi_mask, .irq_ack = irq_chip_ack_parent, -- cgit v1.2.3 From 68682a2687bf7dbe51309d297757a7ea6a96d312 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:46 +0800 Subject: x86/MSI: Simplify the way to deal with remapped MSI interrupts Simplify the way to deal with remapped MSI interrupts, so we can remove irq_chip.irq_print_chip later. We simply change the name when the setup detects that the parent domain is remapping. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-25-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index d17eb6a52c84..87df03ae99ba 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -77,7 +77,6 @@ static struct irq_chip pci_msi_controller = { .irq_ack = irq_chip_ack_parent, .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_print_chip = irq_remapping_print_chip, .irq_compose_msi_msg = irq_msi_compose_msg, .irq_write_msi_msg = pci_msi_domain_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, @@ -143,7 +142,7 @@ static struct msi_domain_ops pci_msi_domain_ops = { static struct msi_domain_info pci_msi_domain_info = { .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | - MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX, + MSI_FLAG_PCI_MSIX, .ops = &pci_msi_domain_ops, .chip = &pci_msi_controller, .handler = handle_edge_irq, @@ -162,9 +161,29 @@ void arch_init_msi_domain(struct irq_domain *parent) } #ifdef CONFIG_IRQ_REMAP +static struct irq_chip pci_msi_ir_controller = { + .name = "IR-PCI-MSI", + .irq_unmask = pci_msi_unmask_irq, + .irq_mask = pci_msi_mask_irq, + .irq_ack = irq_chip_ack_parent, + .irq_set_affinity = msi_domain_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_write_msi_msg = pci_msi_domain_write_msg, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static struct msi_domain_info pci_msi_ir_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX, + .ops = &pci_msi_domain_ops, + .chip = &pci_msi_ir_controller, + .handler = handle_edge_irq, + .handler_name = "edge", +}; + struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent) { - return msi_create_irq_domain(NULL, &pci_msi_domain_info, parent); + return pci_msi_create_irq_domain(NULL, &pci_msi_ir_domain_info, parent); } #endif @@ -325,7 +344,6 @@ static struct irq_chip hpet_msi_controller = { .irq_ack = irq_chip_ack_parent, .irq_set_affinity = hpet_msi_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_print_chip = irq_remapping_print_chip, .irq_compose_msi_msg = irq_msi_compose_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -402,6 +420,8 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id) parent = irq_remapping_get_ir_irq_domain(&info); if (parent == NULL) parent = x86_vector_domain; + else + hpet_msi_controller.name = "IR-HPET-MSI"; return irq_domain_add_hierarchy(parent, 0, 0, NULL, &hpet_domain_ops, (void *)(long)hpet_id); -- cgit v1.2.3 From 90d84fe95dd6b418383aa0e0e5cace8f1b1e7e30 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:47 +0800 Subject: x86/MSI: Replace msi_update_msg() with irq_chip_compose_msi_msg() Function irq_chip_compose_msi_msg() can achieve the same goal as msi_update_msg(), so remove msi_update_msg(). Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-26-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 87df03ae99ba..5b5ef5bd23f5 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -53,19 +53,6 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) MSI_DATA_VECTOR(cfg->vector); } -static void msi_update_msg(struct msi_msg *msg, struct irq_data *irq_data) -{ - struct irq_cfg *cfg = irqd_cfg(irq_data); - - msg->data &= ~MSI_DATA_VECTOR_MASK; - msg->data |= MSI_DATA_VECTOR(cfg->vector); - msg->address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg->address_lo |= MSI_ADDR_DEST_ID(cfg->dest_apicid); - if (x2apic_enabled()) - msg->address_hi = MSI_ADDR_BASE_HI | - MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); -} - /* * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, * which implement the MSI or MSI-X Capability Structure. @@ -198,8 +185,7 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, ret = parent->chip->irq_set_affinity(parent, mask, force); if (ret >= 0) { - dmar_msi_read(data->irq, &msg); - msi_update_msg(&msg, data); + irq_chip_compose_msi_msg(data, &msg); dmar_msi_write(data->irq, &msg); } @@ -329,8 +315,7 @@ static int hpet_msi_set_affinity(struct irq_data *data, ret = parent->chip->irq_set_affinity(parent, mask, force); if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { - hpet_msi_read(data->handler_data, &msg); - msi_update_msg(&msg, data); + irq_chip_compose_msi_msg(data, &msg); hpet_msi_write(data->handler_data, &msg); } -- cgit v1.2.3 From 62ac1780830ed64a9a46f80a03e91de71957d670 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:48 +0800 Subject: x86/irq: Implement irq_chip.irq_write_msi_msg for MSI/DMAR/HPET irq_chips Implement irq_chip.irq_write_msi_msg for the MSI/DMAR/HPET irq_chips; they will be used to replace duplicated code. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J.
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-27-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 5b5ef5bd23f5..3c825867aeb5 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -192,6 +192,11 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, return ret; } +static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg) +{ + dmar_msi_write(data->irq, msg); +} + static struct irq_chip dmar_msi_controller = { .name = "DMAR-MSI", .irq_unmask = dmar_msi_unmask, @@ -200,6 +205,7 @@ static struct irq_chip dmar_msi_controller = { .irq_set_affinity = dmar_msi_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = irq_msi_compose_msg, + .irq_write_msi_msg = dmar_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -322,6 +328,11 @@ static int hpet_msi_set_affinity(struct irq_data *data, return ret; } +static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) +{ + hpet_msi_write(data->handler_data, msg); +} + static struct irq_chip hpet_msi_controller = { .name = "HPET-MSI", .irq_unmask = hpet_msi_unmask, @@ -330,6 +341,7 @@ static struct irq_chip hpet_msi_controller = { .irq_set_affinity = hpet_msi_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = irq_msi_compose_msg, + .irq_write_msi_msg = hpet_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; -- cgit v1.2.3 From e390d895ae14ad655c6b830e62a22a81b69290ef Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:49 +0800 Subject: x86/irq: Simplify MSI/DMAR/HPET implementation by using common code Use common MSI interfaces instead of private implementations of the same functionality to simplify DMAR/HPET driver implementation. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428905519-23704-28-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 192 +++++++++++++-------------------------------- 1 file changed, 54 insertions(+), 138 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 3c825867aeb5..109584261c4e 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -62,10 +62,8 @@ static struct irq_chip pci_msi_controller = { .irq_unmask = pci_msi_unmask_irq, .irq_mask = pci_msi_mask_irq, .irq_ack = irq_chip_ack_parent, - .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = irq_msi_compose_msg, - .irq_write_msi_msg = pci_msi_domain_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -153,9 +151,7 @@ static struct irq_chip pci_msi_ir_controller = { .irq_unmask = pci_msi_unmask_irq, .irq_mask = pci_msi_mask_irq, .irq_ack = irq_chip_ack_parent, - .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_write_msi_msg = pci_msi_domain_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -175,23 +171,6 @@ struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent) #endif #ifdef CONFIG_DMAR_TABLE -static int -dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - struct irq_data *parent = data->parent_data; - struct msi_msg msg; - int ret; - - ret = parent->chip->irq_set_affinity(parent, mask, force); - if (ret >= 0) { - irq_chip_compose_msi_msg(data, &msg); - dmar_msi_write(data->irq, &msg); - } - - return ret; -} - static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg) { dmar_msi_write(data->irq, msg); @@ -202,67 +181,37 @@ static struct irq_chip dmar_msi_controller = { .irq_unmask = dmar_msi_unmask, .irq_mask = dmar_msi_mask, .irq_ack = irq_chip_ack_parent, - .irq_set_affinity = dmar_msi_set_affinity, + .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = irq_msi_compose_msg, .irq_write_msi_msg = dmar_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; -static int dmar_domain_alloc(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs, void *arg) +static irq_hw_number_t dmar_msi_get_hwirq(struct msi_domain_info *info, + msi_alloc_info_t *arg) { - struct irq_alloc_info *info = arg; - int ret; - - if (nr_irqs > 1 || !info || info->type != X86_IRQ_ALLOC_TYPE_DMAR) - return -EINVAL; - if (irq_find_mapping(domain, info->dmar_id)) { - pr_warn("IRQ for DMAR%d already exists.\n", info->dmar_id); - return -EEXIST; - } - - ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); - if (ret >= 0) { - irq_domain_set_hwirq_and_chip(domain, virq, info->dmar_id, - &dmar_msi_controller, NULL); - irq_set_handler_data(virq, info->dmar_data); - __irq_set_handler(virq, handle_edge_irq, 0, "edge"); - } - - return ret; + return arg->dmar_id; } -static void dmar_domain_free(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs) +static int dmar_msi_init(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq, + irq_hw_number_t hwirq, msi_alloc_info_t *arg) { - BUG_ON(nr_irqs > 1); - irq_domain_free_irqs_top(domain, virq, nr_irqs); -} + irq_domain_set_info(domain, virq, arg->dmar_id, info->chip, NULL, + handle_edge_irq, arg->dmar_data, "edge"); -static void 
dmar_domain_activate(struct irq_domain *domain, - struct irq_data *irq_data) -{ - struct msi_msg msg; - - BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); - dmar_msi_write(irq_data->irq, &msg); + return 0; } -static void dmar_domain_deactivate(struct irq_domain *domain, - struct irq_data *irq_data) -{ - struct msi_msg msg; - - memset(&msg, 0, sizeof(msg)); - dmar_msi_write(irq_data->irq, &msg); -} +static struct msi_domain_ops dmar_msi_domain_ops = { + .get_hwirq = dmar_msi_get_hwirq, + .msi_init = dmar_msi_init, +}; -static struct irq_domain_ops dmar_domain_ops = { - .alloc = dmar_domain_alloc, - .free = dmar_domain_free, - .activate = dmar_domain_activate, - .deactivate = dmar_domain_deactivate, +static struct msi_domain_info dmar_msi_domain_info = { + .ops = &dmar_msi_domain_ops, + .chip = &dmar_msi_controller, }; static struct irq_domain *dmar_get_irq_domain(void) @@ -271,11 +220,9 @@ static struct irq_domain *dmar_get_irq_domain(void) static DEFINE_MUTEX(dmar_lock); mutex_lock(&dmar_lock); - if (dmar_domain == NULL) { - dmar_domain = irq_domain_add_tree(NULL, &dmar_domain_ops, NULL); - if (dmar_domain) - dmar_domain->parent = x86_vector_domain; - } + if (dmar_domain == NULL) + dmar_domain = msi_create_irq_domain(NULL, &dmar_msi_domain_info, + x86_vector_domain); mutex_unlock(&dmar_lock); return dmar_domain; @@ -309,23 +256,9 @@ void dmar_free_hwirq(int irq) #ifdef CONFIG_HPET_TIMER static inline int hpet_dev_id(struct irq_domain *domain) { - return (int)(long)domain->host_data; -} + struct msi_domain_info *info = msi_get_domain_info(domain); -static int hpet_msi_set_affinity(struct irq_data *data, - const struct cpumask *mask, bool force) -{ - struct irq_data *parent = data->parent_data; - struct msi_msg msg; - int ret; - - ret = parent->chip->irq_set_affinity(parent, mask, force); - if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { - irq_chip_compose_msi_msg(data, &msg); - hpet_msi_write(data->handler_data, &msg); - } - - return ret; + return (int)(long)info->data; } static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) @@ -338,79 +271,63 @@ static struct irq_chip hpet_msi_controller = { .irq_unmask = hpet_msi_unmask, .irq_mask = hpet_msi_mask, .irq_ack = irq_chip_ack_parent, - .irq_set_affinity = hpet_msi_set_affinity, + .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = irq_msi_compose_msg, .irq_write_msi_msg = hpet_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; -static int hpet_domain_alloc(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs, void *arg) -{ - struct irq_alloc_info *info = arg; - int ret; - - if (nr_irqs > 1 || !info || info->type != X86_IRQ_ALLOC_TYPE_HPET) - return -EINVAL; - if (irq_find_mapping(domain, info->hpet_index)) { - pr_warn("IRQ for HPET%d already exists.\n", info->hpet_index); - return -EEXIST; - } - - ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); - if (ret >= 0) { - irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); - irq_domain_set_hwirq_and_chip(domain, virq, info->hpet_index, - &hpet_msi_controller, NULL); - irq_set_handler_data(virq, info->hpet_data); - __irq_set_handler(virq, handle_edge_irq, 0, "edge"); - } - - return ret; -} - -static void hpet_domain_free(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs) +static irq_hw_number_t hpet_msi_get_hwirq(struct msi_domain_info *info, + msi_alloc_info_t *arg) { - BUG_ON(nr_irqs > 1); - irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); - irq_domain_free_irqs_top(domain, 
virq, nr_irqs); + return arg->hpet_index; } -static void hpet_domain_activate(struct irq_domain *domain, - struct irq_data *irq_data) +static int hpet_msi_init(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq, + irq_hw_number_t hwirq, msi_alloc_info_t *arg) { - struct msi_msg msg; + irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_domain_set_info(domain, virq, arg->hpet_index, info->chip, NULL, + handle_edge_irq, arg->hpet_data, "edge"); - BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); - hpet_msi_write(irq_get_handler_data(irq_data->irq), &msg); + return 0; } -static void hpet_domain_deactivate(struct irq_domain *domain, - struct irq_data *irq_data) +static void hpet_msi_free(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq) { - struct msi_msg msg; - - memset(&msg, 0, sizeof(msg)); - hpet_msi_write(irq_get_handler_data(irq_data->irq), &msg); + irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); } -static struct irq_domain_ops hpet_domain_ops = { - .alloc = hpet_domain_alloc, - .free = hpet_domain_free, - .activate = hpet_domain_activate, - .deactivate = hpet_domain_deactivate, +static struct msi_domain_ops hpet_msi_domain_ops = { + .get_hwirq = hpet_msi_get_hwirq, + .msi_init = hpet_msi_init, + .msi_free = hpet_msi_free, +}; + +static struct msi_domain_info hpet_msi_domain_info = { + .ops = &hpet_msi_domain_ops, + .chip = &hpet_msi_controller, }; struct irq_domain *hpet_create_irq_domain(int hpet_id) { struct irq_domain *parent; struct irq_alloc_info info; + struct msi_domain_info *domain_info; if (x86_vector_domain == NULL) return NULL; + domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL); + if (!domain_info) + return NULL; + + *domain_info = hpet_msi_domain_info; + domain_info->data = (void *)(long)hpet_id; + init_irq_alloc_info(&info, NULL); info.type = X86_IRQ_ALLOC_TYPE_HPET; info.hpet_id = hpet_id; @@ -420,8 +337,7 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id) else hpet_msi_controller.name = "IR-HPET-MSI"; - return irq_domain_add_hierarchy(parent, 0, 0, NULL, &hpet_domain_ops, - (void *)(long)hpet_id); + return msi_create_irq_domain(NULL, domain_info, parent); } int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev, -- cgit v1.2.3 From 6648d1b42c349d748839d7bad91cc8a65c73e262 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 13 Apr 2015 14:11:51 +0800 Subject: x86/intel-mid: Delay initialization of APB timer MID has no PIC, but depending on the platform it requires the APB timer, which is connected to irq0. The timer is set up at late_time_init(). But looking at the MID code, it seems that there is no reason to do so. The only code which might need the timer working is the TSC calibration code, but that's a non-issue on MID as that is using its own empty calibration function. And check_timer() is not invoked either because MID has no PIC and therefore no legacy irqs. So if you look at intel_mid_time_init() then you'll see that in the ARAT case the timer setup is skipped already. So until the point where x86_init.timers.setup_percpu_clockev() is called for the boot cpu nothing really needs a timer on MID. According to the MID code the apbt horror is only used for Moorestown. Medfield and later use the local apic timer without the apbt nonsense. The best thing we can do is to drop Moorestown support and get rid of that apbt nonsense altogether. I don't think anyone deeply cares about it not being supported from 3.18 on. The number of devices which sport a Moorestown should be pretty limited and the only relevant use case of those is to act as a pocket heater with short battery lifetime. It's pretty pointless to update kernels on pocket heaters except for bragging reasons. If someone at Intel really thinks that we need to keep Moorestown alive for other than documentary and sentimental reasons, then we can move the apbt setup to x86_init.timers.setup_percpu_clockev(). At that point the IOAPIC is set up already, so it should just work.
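Schematically, the change moves the apbt bring-up out of the early time init path and into the per-cpu clockevent hook (a condensed call-flow sketch of the intel-mid.c hunk below, not literal kernel code):

    /* before: apbt set up unconditionally behind late_time_init() */
    intel_mid_time_init()
        -> pre_init_apic_IRQ0();
        -> apbt_time_init();

    /* after: apbt set up via the boot-cpu clockevent hook */
    intel_mid_time_init()
        -> x86_init.timers.setup_percpu_clockev = apbt_time_init;
           /* or intel_mid_setup_bp_timer when apbt and lapic are both in use */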
Signed-off-by: Thomas Gleixner Tested-by: Andy Shevchenko Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Kuppuswamy Sathyanarayanan Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Rickard Strandqvist Link: http://lkml.kernel.org/r/1428905519-23704-30-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apb_timer.c | 4 ---- arch/x86/platform/intel-mid/intel-mid.c | 18 +++++++++++++----- arch/x86/platform/intel-mid/sfi.c | 2 -- 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 6a7c23ff21d3..ede92c3364d3 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -171,10 +171,6 @@ static int __init apbt_clockevent_register(void) static void apbt_setup_irq(struct apbt_dev *adev) { - /* timer0 irq has been setup early */ - if (adev->irq == 0) - return; - irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); } diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 3005f0c89f2e..01d54ea766c1 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -81,26 +81,34 @@ static unsigned long __init intel_mid_calibrate_tsc(void) return 0; } +static void __init intel_mid_setup_bp_timer(void) +{ + apbt_time_init(); + setup_boot_APIC_clock(); +} + static void __init intel_mid_time_init(void) { sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); + switch (intel_mid_timer_options) { case INTEL_MID_TIMER_APBT_ONLY: break; case INTEL_MID_TIMER_LAPIC_APBT: - x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; + /* Use apbt and local apic */ + x86_init.timers.setup_percpu_clockev = intel_mid_setup_bp_timer; x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; - break; + return; default: if (!boot_cpu_has(X86_FEATURE_ARAT)) break; + /* Lapic only, no apbt */ x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; return; } - /* we need at least one APB timer */ - pre_init_apic_IRQ0(); - apbt_time_init(); + + x86_init.timers.setup_percpu_clockev = apbt_time_init; } static void intel_mid_arch_setup(void) diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index c14ad34776c4..aa59f88868f8 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -95,8 +95,6 @@ int __init sfi_parse_mtmr(struct sfi_table_header *table) pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz, irq = %d\n", totallen, (u32)pentry->phys_addr, pentry->freq_hz, pentry->irq); - if (!pentry->irq) - continue; mp_irq.type = MP_INTSRC; mp_irq.irqtype = 
mp_INT; /* triggering mode edge bit 2-3, active high polarity bit 0-1 */ -- cgit v1.2.3 From 4e69d7eab4c24aa88fb0ec99fad7feac254d9ece Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:53 +0800 Subject: x86/irq: Remove unused pre_init_apic_IRQ0() Now there's no user of pre_init_apic_IRQ0(), so remove it. Signed-off-by: Jiang Liu Tested-by: Andy Shevchenko Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Jan Beulich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428905519-23704-32-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 1 - arch/x86/kernel/apic/io_apic.c | 17 ----------------- 2 files changed, 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 2f91685fe1cd..c976de126a91 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -204,7 +204,6 @@ extern int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq); extern void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq); extern int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node); -extern void __init pre_init_apic_IRQ0(void); extern void mp_save_irq(struct mpc_intsrc *m); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 56d532106ef3..540598c77e55 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3091,20 +3091,3 @@ int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node) return ret; } - -/* Enable IOAPIC early just for system timer */ -void __init pre_init_apic_IRQ0(void) -{ - struct io_apic_irq_attr attr = { 0, 0, 0, 0 }; - - printk(KERN_INFO "Early APIC setup for system timer0\n"); -#ifndef CONFIG_SMP - physid_set_mask_of_physid(boot_cpu_physical_apicid, - &phys_cpu_present_map); -#endif - setup_local_APIC(); - - io_apic_setup_irq_pin(0, 0, &attr); - irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, - "edge"); -} -- cgit v1.2.3 From c4d05a2c354b15965c9b2a5f46016a5d9f43e224 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:54 +0800 Subject: x86/irq: Prepare IOAPIC interfaces to support hierarchical irqdomains Introduce helper functions to manipulate struct irq_alloc_info for IOAPIC. Also add an extra parameter to IOAPIC interfaces to prepare for hierarchical irqdomain. Function mp_set_gsi_attr() will be removed once we have switched to hierarchical irqdomains. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Len Brown Cc: Pavel Machek Cc: Jan Beulich Cc: Grant Likely Cc: David Cohen Link: http://lkml.kernel.org/r/1428905519-23704-33-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 14 ++++++-- arch/x86/kernel/acpi/boot.c | 6 ++-- arch/x86/kernel/apic/io_apic.c | 39 ++++++++++++++-------- arch/x86/pci/intel_mid_pci.c | 4 ++- .../platform/intel-mid/device_libs/platform_wdt.c | 4 ++- arch/x86/platform/intel-mid/sfi.c | 9 +++-- 6 files changed, 54 insertions(+), 22 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index c976de126a91..1fbeda50a77d 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -95,6 +95,8 @@ struct IR_IO_APIC_route_entry { index : 15; } __attribute__ ((packed)); +struct irq_alloc_info; + #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 #define IOAPIC_LEVEL 1 @@ -194,7 +196,8 @@ extern u32 gsi_top; extern int mp_find_ioapic(u32 gsi); extern int mp_find_ioapic_pin(int ioapic, u32 gsi); extern u32 mp_pin_to_gsi(int ioapic, int pin); -extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags); +extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, + struct irq_alloc_info *info); extern void mp_unmap_irq(int irq); extern int mp_register_ioapic(int id, u32 address, u32 gsi_base, struct ioapic_domain_cfg *cfg); @@ -203,6 +206,8 @@ extern int mp_ioapic_registered(u32 gsi_base); extern int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq); extern void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq); +extern void ioapic_set_alloc_attr(struct irq_alloc_info *info, + int node, int trigger, int polarity); extern int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node); extern void mp_save_irq(struct mpc_intsrc *m); @@ -253,7 +258,12 @@ static inline void print_IO_APICs(void) {} #define gsi_top (NR_IRQS_LEGACY) static inline int mp_find_ioapic(u32 gsi) { return 0; } static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return UINT_MAX; } -static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) { return gsi; } +static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, + struct irq_alloc_info *info) +{ + return gsi; +} + static inline void mp_unmap_irq(int irq) { } static inline int save_ioapic_entries(void) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 803b684676ff..a43a4d3c60e1 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -404,6 +404,7 @@ static int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) { int irq, node; + struct irq_alloc_info info; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; @@ -416,7 +417,8 @@ static int mp_register_gsi(struct device *dev, u32 gsi, int trigger, return -1; } - irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC); + ioapic_set_alloc_attr(&info, node, trigger, polarity); + irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); if (irq < 0) return irq; @@ -434,7 +436,7 @@ static void mp_unregister_gsi(u32 gsi) if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return; - irq = mp_map_gsi_to_irq(gsi, 0); + irq = mp_map_gsi_to_irq(gsi, 0, NULL); if (irq > 0) mp_unmap_irq(irq); } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 540598c77e55..5c953bb96ecf 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ 
b/arch/x86/kernel/apic/io_apic.c @@ -938,7 +938,19 @@ static int irq_trigger(int idx) return trigger; } -static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin) +void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, + int trigger, int polarity) +{ + init_irq_alloc_info(info, NULL); + info->type = X86_IRQ_ALLOC_TYPE_IOAPIC; + info->ioapic_node = node; + info->ioapic_trigger = trigger; + info->ioapic_polarity = polarity; + info->ioapic_valid = 1; +} + +static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin, + struct irq_alloc_info *info) { int irq = -1; int ioapic = (int)(long)domain->host_data; @@ -971,11 +983,11 @@ static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin) } static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, - unsigned int flags) + unsigned int flags, struct irq_alloc_info *info) { int irq; struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); - struct mp_pin_info *info = mp_pin_info(ioapic, pin); + struct mp_pin_info *pinfo = mp_pin_info(ioapic, pin); if (!domain) return -1; @@ -997,30 +1009,30 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; if (flags & IOAPIC_MAP_ALLOC) { - if (info->count == 0 && + if (pinfo->count == 0 && mp_irqdomain_map(domain, irq, pin) != 0) irq = -1; /* special handling for timer IRQ0 */ if (irq == 0) - info->count++; + pinfo->count++; } } else { irq = irq_find_mapping(domain, pin); if (irq <= 0 && (flags & IOAPIC_MAP_ALLOC)) - irq = alloc_irq_from_domain(domain, gsi, pin); + irq = alloc_irq_from_domain(domain, gsi, pin, info); } if (flags & IOAPIC_MAP_ALLOC) { /* special handling for legacy IRQs */ - if (irq < nr_legacy_irqs() && info->count == 1 && + if (irq < nr_legacy_irqs() && pinfo->count == 1 && mp_irqdomain_map(domain, irq, pin) != 0) irq = -1; if (irq > 0) - info->count++; - else if (info->count == 0) - info->set = 0; + pinfo->count++; + else if (pinfo->count == 0) + pinfo->set = 0; } mutex_unlock(&ioapic_mutex); @@ -1058,10 +1070,11 @@ static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags) } #endif - return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags); + return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, NULL); } -int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) +int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, + struct irq_alloc_info *info) { int ioapic, pin, idx; @@ -1074,7 +1087,7 @@ int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) if ((flags & IOAPIC_MAP_CHECK) && idx < 0) return -1; - return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags); + return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info); } void mp_unmap_irq(int irq) diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 852aa4c92da0..57b5719b617b 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -208,6 +208,7 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, static int intel_mid_pci_irq_enable(struct pci_dev *dev) { + struct irq_alloc_info info; int polarity; if (dev->irq_managed && dev->irq > 0) @@ -217,6 +218,7 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) polarity = 0; /* active high */ else polarity = 1; /* active low */ + ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity); /* * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to @@ -224,7 +226,7 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) */ if 
(mp_set_gsi_attr(dev->irq, 1, polarity, dev_to_node(&dev->dev))) return -EBUSY; - if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC) < 0) + if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC, &info) < 0) return -EBUSY; dev->irq_managed = 1; diff --git a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c index 0b283d4d0ad7..de0009f6d555 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c @@ -27,6 +27,7 @@ static struct platform_device wdt_dev = { static int tangier_probe(struct platform_device *pdev) { int gsi; + struct irq_alloc_info info; struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data; if (!pdata) @@ -34,8 +35,9 @@ static int tangier_probe(struct platform_device *pdev) /* IOAPIC builds identity mapping between GSI and IRQ on MID */ gsi = pdata->irq; + ioapic_set_alloc_attr(&info, cpu_to_node(0), 1, 0); if (mp_set_gsi_attr(gsi, 1, 0, cpu_to_node(0)) || - mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC) <= 0) { + mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info) <= 0) { dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n", gsi); return -EINVAL; diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index 9a16749935d4..7d17355d820e 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -104,7 +104,7 @@ int __init sfi_parse_mtmr(struct sfi_table_header *table) mp_irq.dstapic = MP_APIC_ALL; mp_irq.dstirq = pentry->irq; mp_save_irq(&mp_irq); - mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC); + mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC, NULL); } return 0; @@ -175,7 +175,7 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) mp_irq.dstapic = MP_APIC_ALL; mp_irq.dstirq = pentry->irq; mp_save_irq(&mp_irq); - mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC); + mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC, NULL); } return 0; } @@ -434,6 +434,7 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) struct devs_id *dev = NULL; int num, i, ret; int polarity; + struct irq_alloc_info info; sb = (struct sfi_table_simple *)table; num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry); @@ -467,9 +468,11 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) polarity = 1; } + ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, polarity); ret = mp_set_gsi_attr(irq, 1, polarity, NUMA_NO_NODE); if (ret == 0) - ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC); + ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC, + &info); WARN_ON(ret < 0); } -- cgit v1.2.3 From 49c7e60022912d10da88ba67e8eb2927f1143f6a Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:55 +0800 Subject: x86/irq: Implement callbacks to enable hierarchical irqdomains on IOAPICs Implement required callbacks to prepare for enabling hierarchical irqdomains on IOAPICs. After the conversion we can remove quite some code from the old implementation. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Jan Beulich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428905519-23704-34-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 10 +++ arch/x86/kernel/apic/io_apic.c | 159 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 166 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 1fbeda50a77d..ecc192624eaf 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -96,6 +96,7 @@ struct IR_IO_APIC_route_entry { } __attribute__ ((packed)); struct irq_alloc_info; +struct irq_data; #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 @@ -206,6 +207,15 @@ extern int mp_ioapic_registered(u32 gsi_base); extern int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq); extern void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq); +extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg); +extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs); +extern void mp_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irq_data); +extern void mp_irqdomain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data); +extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); extern void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, int trigger, int polarity); extern int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5c953bb96ecf..3406dbec1570 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -78,6 +78,13 @@ static DEFINE_MUTEX(ioapic_mutex); static unsigned int ioapic_dynirq_base; static int ioapic_initialized; +struct mp_chip_data { + struct IO_APIC_route_entry entry; + int trigger; + int polarity; + bool isa_irq; +}; + struct mp_pin_info { int trigger; int polarity; @@ -949,11 +956,28 @@ void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, info->ioapic_valid = 1; } +static void mp_register_handler(unsigned int irq, unsigned long trigger) +{ + irq_flow_handler_t hdl; + bool fasteoi; + + if (trigger) { + irq_set_status_flags(irq, IRQ_LEVEL); + fasteoi = true; + } else { + irq_clear_status_flags(irq, IRQ_LEVEL); + fasteoi = false; + } + + hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq; + __irq_set_handler(irq, hdl, 0, fasteoi ? 
"fasteoi" : "edge"); +} + static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin, struct irq_alloc_info *info) { int irq = -1; - int ioapic = (int)(long)domain->host_data; + int ioapic = mp_irqdomain_ioapic_idx(domain); int type = ioapics[ioapic].irqdomain_cfg.type; switch (type) { @@ -3029,7 +3053,7 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq) { - int ioapic = (int)(long)domain->host_data; + int ioapic = mp_irqdomain_ioapic_idx(domain); struct mp_pin_info *info = mp_pin_info(ioapic, hwirq); struct io_apic_irq_attr attr; @@ -3067,7 +3091,7 @@ void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq) { struct irq_data *data = irq_get_irq_data(virq); struct irq_cfg *cfg = irq_cfg(virq); - int ioapic = (int)(long)domain->host_data; + int ioapic = mp_irqdomain_ioapic_idx(domain); int pin = (int)data->hwirq; ioapic_mask_entry(ioapic, pin); @@ -3076,6 +3100,130 @@ void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq) arch_teardown_hwirq(virq); } +static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, + struct irq_alloc_info *info) +{ + if (info && info->ioapic_valid) { + data->trigger = info->ioapic_trigger; + data->polarity = info->ioapic_polarity; + } else if (acpi_get_override_irq(gsi, &data->trigger, + &data->polarity) < 0) { + /* PCI interrupts are always polarity one level triggered. */ + data->trigger = 1; + data->polarity = 1; + } +} + +static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, + struct IO_APIC_route_entry *entry) +{ + memset(entry, 0, sizeof(*entry)); + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->dest = cfg->dest_apicid; + entry->vector = cfg->vector; + entry->mask = 0; /* enable IRQ */ + entry->trigger = data->trigger; + entry->polarity = data->polarity; + /* + * Mask level triggered irqs. + * Use IRQ_DELAYED_DISABLE for edge triggered irqs. 
+ */ + if (data->trigger) + entry->mask = 1; +} + +int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + int ret, ioapic, pin; + struct irq_cfg *cfg; + struct irq_data *irq_data; + struct mp_chip_data *data; + struct irq_alloc_info *info = arg; + + if (!info || nr_irqs > 1) + return -EINVAL; + irq_data = irq_domain_get_irq_data(domain, virq); + if (!irq_data) + return -EINVAL; + + ioapic = mp_irqdomain_ioapic_idx(domain); + pin = info->ioapic_pin; + if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0) + return -EEXIST; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + info->ioapic_entry = &data->entry; + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); + if (ret < 0) { + kfree(data); + return ret; + } + + irq_data->hwirq = info->ioapic_pin; + irq_data->chip = &ioapic_chip; + irq_data->chip_data = data; + mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info); + + cfg = irqd_cfg(irq_data); + add_pin_to_irq_node(cfg, info->ioapic_node, ioapic, pin); + if (info->ioapic_entry) + mp_setup_entry(cfg, data, info->ioapic_entry); + mp_register_handler(virq, data->trigger); + if (virq < nr_legacy_irqs()) + legacy_pic->mask(virq); + + apic_printk(APIC_VERBOSE, KERN_DEBUG + "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n", + ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector, + virq, data->trigger, data->polarity, cfg->dest_apicid); + + return 0; +} + +void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_cfg *cfg = irq_cfg(virq); + struct irq_data *irq_data; + + BUG_ON(nr_irqs != 1); + irq_data = irq_domain_get_irq_data(domain, virq); + if (irq_data && irq_data->chip_data) { + __remove_pin_from_irq(cfg, mp_irqdomain_ioapic_idx(domain), + (int)irq_data->hwirq); + WARN_ON(!list_empty(&cfg->irq_2_pin)); + kfree(irq_data->chip_data); + } + irq_domain_free_irqs_top(domain, virq, nr_irqs); +} + +void mp_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + unsigned long flags; + struct irq_pin_list *entry; + struct mp_chip_data *data = irq_data->chip_data; + struct irq_cfg *cfg = irqd_cfg(irq_data); + + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, cfg->irq_2_pin) + __ioapic_write_entry(entry->apic, entry->pin, data->entry); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +void mp_irqdomain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + /* It won't be called for IRQ with multiple IOAPIC pins associated */ + ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain), + (int)irq_data->hwirq); +} + int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node) { int ret = 0; @@ -3104,3 +3252,8 @@ int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node) return ret; } + +int mp_irqdomain_ioapic_idx(struct irq_domain *domain) +{ + return (int)(long)domain->host_data; +} -- cgit v1.2.3 From 133153205b263ea9ce4e771876ede544f896e034 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:56 +0800 Subject: x86/irq: Refine the way to allocate irq_cfg for legacy IRQs To support legacy ISA IRQs, we need to preallocate irq_cfg structures for legacy ISA IRQs. Refine the way to allocate irq_cfg for legacy ISA IRQs, so it's more friendly for the hierarchical irqdomain implementation. 
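Schematically, arch_early_irq_init() now calls init_legacy_irqs() to preallocate one irq_cfg per ISA IRQ, carrying its fixed IRQ0_VECTOR..IRQ15_VECTOR vector, and the vector domain's alloc path hands that structure back instead of allocating a fresh one. A minimal sketch of the reuse check, simplified from the x86_vector_alloc_irqs() hunk below (the CONFIG_X86_IO_APIC ifdefs, surrounding loop and error handling are omitted):

    if (virq + i < nr_legacy_irqs() && legacy_irq_cfgs[virq + i])
            cfg = legacy_irq_cfgs[virq + i];     /* set up by init_legacy_irqs() */
    else
            cfg = alloc_irq_cfg(irq_data->node); /* regular dynamic allocation */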
Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428905519-23704-35-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 13 +------------ arch/x86/kernel/apic/vector.c | 42 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3406dbec1570..16d4ba3ac844 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -254,8 +254,7 @@ static void free_ioapic_saved_registers(int idx) int __init arch_early_ioapic_init(void) { - struct irq_cfg *cfg; - int i, node = cpu_to_node(0); + int i; if (!nr_legacy_irqs()) io_apic_irqs = ~0UL; @@ -263,16 +262,6 @@ int __init arch_early_ioapic_init(void) for_each_ioapic(i) alloc_ioapic_saved_registers(i); - /* - * For legacy IRQ's, start with assigning irq0 to irq15 to - * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. - */ - for (i = 0; i < nr_legacy_irqs(); i++) { - cfg = alloc_irq_and_cfg_at(i, node); - cfg->vector = IRQ0_VECTOR + i; - cpumask_setall(cfg->domain); - } - return 0; } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index b4b6b5a13440..633f03268d48 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -24,6 +24,9 @@ struct irq_domain *x86_vector_domain; static DEFINE_RAW_SPINLOCK(vector_lock); static struct irq_chip lapic_controller; +#ifdef CONFIG_X86_IO_APIC +static struct irq_cfg *legacy_irq_cfgs[NR_IRQS_LEGACY]; +#endif void lock_vector_lock(void) { @@ -283,6 +286,10 @@ static void x86_vector_free_irqs(struct irq_domain *domain, free_remapped_irq(virq); clear_irq_vector(virq + i, irq_data->chip_data); free_irq_cfg(irq_data->chip_data); +#ifdef CONFIG_X86_IO_APIC + if (virq + i < nr_legacy_irqs()) + legacy_irq_cfgs[virq + i] = NULL; +#endif irq_domain_reset_irq_data(irq_data); } } @@ -308,7 +315,12 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, for (i = 0; i < nr_irqs; i++) { irq_data = irq_domain_get_irq_data(domain, virq + i); BUG_ON(!irq_data); - cfg = alloc_irq_cfg(irq_data->node); +#ifdef CONFIG_X86_IO_APIC + if (virq + i < nr_legacy_irqs() && legacy_irq_cfgs[virq + i]) + cfg = legacy_irq_cfgs[virq + i]; + else +#endif + cfg = alloc_irq_cfg(irq_data->node); if (!cfg) { err = -ENOMEM; goto error; @@ -357,8 +369,36 @@ int __init arch_probe_nr_irqs(void) return nr_legacy_irqs(); } +#ifdef CONFIG_X86_IO_APIC +static void init_legacy_irqs(void) +{ + int i, node = cpu_to_node(0); + struct irq_cfg *cfg; + + /* + * For legacy IRQ's, start with assigning irq0 to irq15 to + * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. + */ + for (i = 0; i < nr_legacy_irqs(); i++) { + cfg = legacy_irq_cfgs[i] = alloc_irq_cfg(node); + BUG_ON(!cfg); + /* + * For legacy IRQ's, start with assigning irq0 to irq15 to + * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. 
+ */ + cfg->vector = IRQ0_VECTOR + i; + cpumask_setall(cfg->domain); + irq_set_chip_data(i, cfg); + } +} +#else +static void init_legacy_irqs(void) { } +#endif + int __init arch_early_irq_init(void) { + init_legacy_irqs(); + x86_vector_domain = irq_domain_add_tree(NULL, &x86_vector_domain_ops, NULL); BUG_ON(x86_vector_domain == NULL); -- cgit v1.2.3 From a44174ee7b380012cdb63d563617f67bb7757649 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:57 +0800 Subject: x86/irq: Simplify the way to print IOAPIC entry Simplify the way to print IOAPIC entry content, so we can remove native_io_apic_print_entries(), intel_ir_io_apic_print_entries() and x86_io_apic_ops.print_entries() later. Folded a patch from Thomas to fix errors in printed pin attributes, http://www.spinics.net/lists/linux-tip-commits/msg26108.html Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428905519-23704-36-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 16d4ba3ac844..3c6609617306 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1424,6 +1424,33 @@ void ioapic_zap_locks(void) raw_spin_lock_init(&ioapic_lock); } +static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) +{ + int i; + char buf[256]; + struct IO_APIC_route_entry entry; + struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry; + + printk(KERN_DEBUG "IOAPIC %d:\n", apic); + for (i = 0; i <= nr_entries; i++) { + entry = ioapic_read_entry(apic, i); + snprintf(buf, sizeof(buf), + " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)", + i, entry.mask ? "disabled" : "enabled ", + entry.trigger ? "level" : "edge ", + entry.polarity ? "low " : "high", + entry.vector, entry.irr, entry.delivery_status); + if (ir_entry->format) + printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", + buf, (ir_entry->index2 << 15) | ir_entry->index, + ir_entry->zero); + else + printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", + buf, entry.dest_mode ? "logical " : "physical", + entry.dest, entry.delivery_mode); + } +} + static void __init print_IO_APIC(int ioapic_idx) { union IO_APIC_reg_00 reg_00; @@ -1477,8 +1504,7 @@ static void __init print_IO_APIC(int ioapic_idx) } printk(KERN_DEBUG ".... IRQ redirection table:\n"); - - x86_io_apic_ops.print_entries(ioapic_idx, reg_01.bits.entries); + io_apic_print_entries(ioapic_idx, reg_01.bits.entries); } void __init print_IO_APICs(void) -- cgit v1.2.3 From 96ed44b2d5e0e9d6e5b135e84ea5c8cd763ce861 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:58 +0800 Subject: x86/irq: Introduce helper functions to support hierarchical irqdomains for IOAPIC Introduce several helper functions, which will be used to enable hierarchical irqdomain for IOAPIC.
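The trigger/polarity resolution order implemented by ioapic_copy_alloc_attr() is: attributes supplied by the caller first, then an ACPI interrupt override, then the PCI defaults. A condensed restatement of the hunk below, where use() is only a placeholder for storing the pair into the destination irq_alloc_info (schematic, not kernel code):

    if (src && src->ioapic_valid)                       /* caller-provided */
            use(src->ioapic_trigger, src->ioapic_polarity);
    else if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0)
            use(trigger, polarity);                     /* ACPI override */
    else
            use(1, 1);                                  /* PCI default: level, active low */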
Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428905519-23704-37-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 61 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3c6609617306..c8f786b5b91c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -82,6 +82,7 @@ struct mp_chip_data { struct IO_APIC_route_entry entry; int trigger; int polarity; + u32 count; bool isa_irq; }; @@ -945,6 +946,46 @@ void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, info->ioapic_valid = 1; } +#ifndef CONFIG_ACPI +int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity); +#endif + +static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, + struct irq_alloc_info *src, + u32 gsi, int ioapic_idx, int pin) +{ + int trigger, polarity; + + copy_irq_alloc_info(dst, src); + dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC; + dst->ioapic_id = mpc_ioapic_id(ioapic_idx); + dst->ioapic_pin = pin; + dst->ioapic_valid = 1; + if (src && src->ioapic_valid) { + dst->ioapic_node = src->ioapic_node; + dst->ioapic_trigger = src->ioapic_trigger; + dst->ioapic_polarity = src->ioapic_polarity; + } else { + dst->ioapic_node = NUMA_NO_NODE; + if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) { + dst->ioapic_trigger = trigger; + dst->ioapic_polarity = polarity; + } else { + /* + * PCI interrupts are always polarity one level + * triggered. + */ + dst->ioapic_trigger = 1; + dst->ioapic_polarity = 1; + } + } +} + +static int ioapic_alloc_attr_node(struct irq_alloc_info *info) +{ + return (info && info->ioapic_valid) ? info->ioapic_node : NUMA_NO_NODE; +} + static void mp_register_handler(unsigned int irq, unsigned long trigger) { irq_flow_handler_t hdl; @@ -962,6 +1003,26 @@ static void mp_register_handler(unsigned int irq, unsigned long trigger) __irq_set_handler(irq, hdl, 0, fasteoi ? "fasteoi" : "edge"); } +static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) +{ + struct mp_chip_data *data = irq_get_chip_data(irq); + + /* + * setup_IO_APIC_irqs() programs all legacy IRQs with default trigger + * and polarity attributes. So allow the first user to reprogram the + * pin with real trigger and polarity attributes. + */ + if (irq < nr_legacy_irqs() && data->count == 1) { + if (info->ioapic_trigger != data->trigger) + mp_register_handler(irq, data->trigger); + data->entry.trigger = data->trigger = info->ioapic_trigger; + data->entry.polarity = data->polarity = info->ioapic_polarity; + } + + return data->trigger == info->ioapic_trigger && + data->polarity == info->ioapic_polarity; +} + static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin, struct irq_alloc_info *info) { -- cgit v1.2.3 From d32932d02e1869be838cea3ace42467c360db377 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 13 Apr 2015 14:11:59 +0800 Subject: x86/irq: Convert IOAPIC to use hierarchical irqdomain interfaces Convert IOAPIC driver to support and use hierarchical irqdomain interfaces. It's a little big, but splitting it into multiple patches would break bisecting. Fold in a patch from Andy Shevchenko to make it bisectable.
http://lkml.org/lkml/2014/12/10/622 Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Andy Shevchenko Cc: sfi-devel@simplefirmware.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Len Brown Cc: Pavel Machek Cc: Grant Likely Cc: Rob Herring Cc: David Rientjes Cc: David Cohen Link: http://lkml.kernel.org/r/1428905519-23704-38-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/acpi/boot.c | 11 +- arch/x86/kernel/apic/io_apic.c | 308 ++++++++++++++------- arch/x86/kernel/devicetree.c | 37 +-- arch/x86/kernel/mpparse.c | 6 +- arch/x86/pci/intel_mid_pci.c | 2 - .../platform/intel-mid/device_libs/platform_wdt.c | 3 +- arch/x86/platform/intel-mid/sfi.c | 5 +- arch/x86/platform/sfi/sfi.c | 5 +- 8 files changed, 240 insertions(+), 137 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a43a4d3c60e1..21e460b3b360 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -412,11 +412,6 @@ static int mp_register_gsi(struct device *dev, u32 gsi, int trigger, trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1; polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1; node = dev ? dev_to_node(dev) : NUMA_NO_NODE; - if (mp_set_gsi_attr(gsi, trigger, polarity, node)) { - pr_warn("Failed to set pin attr for GSI%d\n", gsi); - return -1; - } - ioapic_set_alloc_attr(&info, node, trigger, polarity); irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); if (irq < 0) @@ -442,8 +437,10 @@ static void mp_unregister_gsi(u32 gsi) } static struct irq_domain_ops acpi_irqdomain_ops = { - .map = mp_irqdomain_map, - .unmap = mp_irqdomain_unmap, + .alloc = mp_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, }; static int __init diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c8f786b5b91c..ba50f8d6f6b0 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -142,6 +142,11 @@ u32 mp_pin_to_gsi(int ioapic, int pin) return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin; } +static inline bool mp_is_legacy_irq(int irq) +{ + return irq >= 0 && irq < nr_legacy_irqs(); +} + /* * Initialize all legacy IRQs and all pins on the first IOAPIC * if we have legacy interrupt controller. 
Kernel boot option "pirq=" @@ -152,7 +157,7 @@ static inline int mp_init_irq_at_boot(int ioapic, int irq) if (!nr_legacy_irqs()) return 0; - return ioapic == 0 || (irq >= 0 && irq < nr_legacy_irqs()); + return ioapic == 0 || mp_is_legacy_irq(irq); } static inline struct mp_pin_info *mp_pin_info(int ioapic_idx, int pin) @@ -231,7 +236,7 @@ struct irq_pin_list { static struct irq_pin_list *alloc_irq_pin_list(int node) { - return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node); + return kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node); } static void alloc_ioapic_saved_registers(int idx) @@ -560,6 +565,17 @@ void native_eoi_ioapic_pin(int apic, int pin, int vector) } } +void eoi_ioapic_pin(int vector, struct irq_cfg *cfg) +{ + unsigned long flags; + struct irq_pin_list *entry; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, cfg->irq_2_pin) + native_eoi_ioapic_pin(entry->apic, entry->pin, vector); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) { struct irq_pin_list *entry; @@ -603,9 +619,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) entry.trigger = IOAPIC_LEVEL; ioapic_write_entry(apic, pin, entry); } - raw_spin_lock_irqsave(&ioapic_lock, flags); - x86_io_apic_ops.eoi_ioapic_pin(apic, pin, entry.vector); + native_eoi_ioapic_pin(apic, pin, entry.vector); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -1023,95 +1038,121 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) data->polarity == info->ioapic_polarity; } -static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin, +static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, struct irq_alloc_info *info) { + bool legacy = false; int irq = -1; - int ioapic = mp_irqdomain_ioapic_idx(domain); int type = ioapics[ioapic].irqdomain_cfg.type; switch (type) { case IOAPIC_DOMAIN_LEGACY: /* - * Dynamically allocate IRQ number for non-ISA IRQs in the first 16 - * GSIs on some weird platforms. + * Dynamically allocate IRQ number for non-ISA IRQs in the first + * 16 GSIs on some weird platforms. */ - if (gsi < nr_legacy_irqs()) - irq = irq_create_mapping(domain, pin); - else if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0) + if (!ioapic_initialized || gsi >= nr_legacy_irqs()) irq = gsi; + legacy = mp_is_legacy_irq(irq); break; case IOAPIC_DOMAIN_STRICT: - if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0) - irq = gsi; + irq = gsi; break; case IOAPIC_DOMAIN_DYNAMIC: - irq = irq_create_mapping(domain, pin); break; default: WARN(1, "ioapic: unknown irqdomain type %d\n", type); - break; + return -1; + } + + return __irq_domain_alloc_irqs(domain, irq, 1, + ioapic_alloc_attr_node(info), + info, legacy); +} + +/* + * Need special handling for ISA IRQs because there may be multiple IOAPIC pins + * sharing the same ISA IRQ number and irqdomain only supports 1:1 mapping + * between IOAPIC pin and IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are + * used for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H). + * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are available, and + * some BIOSes may use MP Interrupt Source records to override IRQ numbers for + * PIRQs instead of reprogramming the interrupt routing logic. Thus there may be + * multiple pins sharing the same legacy IRQ number when ACPI is disabled. 
+ */ +static int alloc_isa_irq_from_domain(struct irq_domain *domain, + int irq, int ioapic, int pin, + struct irq_alloc_info *info) +{ + struct mp_chip_data *data; + struct irq_data *irq_data = irq_get_irq_data(irq); + int node = ioapic_alloc_attr_node(info); + + /* + * Legacy ISA IRQ has already been allocated, just add pin to + * the pin list associated with this IRQ and program the IOAPIC + * entry. The IOAPIC entry + */ + if (irq_data && irq_data->parent_data) { + struct irq_cfg *cfg = irqd_cfg(irq_data); + + if (!mp_check_pin_attr(irq, info)) + return -EBUSY; + if (__add_pin_to_irq_node(cfg, node, ioapic, info->ioapic_pin)) + return -ENOMEM; + } else { + irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true); + if (irq >= 0) { + irq_data = irq_domain_get_irq_data(domain, irq); + data = irq_data->chip_data; + data->isa_irq = true; + } } - return irq > 0 ? irq : -1; + return irq; } static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, unsigned int flags, struct irq_alloc_info *info) { int irq; + bool legacy = false; + struct irq_alloc_info tmp; + struct mp_chip_data *data; struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); - struct mp_pin_info *pinfo = mp_pin_info(ioapic, pin); if (!domain) - return -1; + return -ENOSYS; - mutex_lock(&ioapic_mutex); - - /* - * Don't use irqdomain to manage ISA IRQs because there may be - * multiple IOAPIC pins sharing the same ISA IRQ number and - * irqdomain only supports 1:1 mapping between IOAPIC pin and - * IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are used - * for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H). - * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are - * available, and some BIOSes may use MP Interrupt Source records - * to override IRQ numbers for PIRQs instead of reprogramming - * the interrupt routing logic. Thus there may be multiple pins - * sharing the same legacy IRQ number when ACPI is disabled. - */ if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; - if (flags & IOAPIC_MAP_ALLOC) { - if (pinfo->count == 0 && - mp_irqdomain_map(domain, irq, pin) != 0) - irq = -1; + legacy = mp_is_legacy_irq(irq); + } - /* special handling for timer IRQ0 */ + mutex_lock(&ioapic_mutex); + if (!(flags & IOAPIC_MAP_ALLOC)) { + if (!legacy) { + irq = irq_find_mapping(domain, pin); if (irq == 0) - pinfo->count++; + irq = -ENOENT; } } else { - irq = irq_find_mapping(domain, pin); - if (irq <= 0 && (flags & IOAPIC_MAP_ALLOC)) - irq = alloc_irq_from_domain(domain, gsi, pin, info); - } - - if (flags & IOAPIC_MAP_ALLOC) { - /* special handling for legacy IRQs */ - if (irq < nr_legacy_irqs() && pinfo->count == 1 && - mp_irqdomain_map(domain, irq, pin) != 0) - irq = -1; - - if (irq > 0) - pinfo->count++; - else if (pinfo->count == 0) - pinfo->set = 0; + ioapic_copy_alloc_attr(&tmp, info, gsi, ioapic, pin); + if (legacy) + irq = alloc_isa_irq_from_domain(domain, irq, + ioapic, pin, &tmp); + else if ((irq = irq_find_mapping(domain, pin)) == 0) + irq = alloc_irq_from_domain(domain, ioapic, gsi, &tmp); + else if (!mp_check_pin_attr(irq, &tmp)) + irq = -EBUSY; + if (irq >= 0) { + data = irq_get_chip_data(irq); + data->count++; + } } - mutex_unlock(&ioapic_mutex); - return irq > 0 ? 
irq : -1; + return irq; } static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags) @@ -1166,26 +1207,19 @@ int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, void mp_unmap_irq(int irq) { - struct irq_data *data = irq_get_irq_data(irq); - struct mp_pin_info *info; - int ioapic, pin; + struct irq_data *irq_data = irq_get_irq_data(irq); + struct mp_chip_data *data; - if (!data || !data->domain) + if (!irq_data || !irq_data->domain) return; - ioapic = (int)(long)data->domain->host_data; - pin = (int)data->hwirq; - info = mp_pin_info(ioapic, pin); + data = irq_data->chip_data; + if (!data || data->isa_irq) + return; mutex_lock(&ioapic_mutex); - if (--info->count == 0) { - info->set = 0; - if (irq < nr_legacy_irqs() && - ioapics[ioapic].irqdomain_cfg.type == IOAPIC_DOMAIN_LEGACY) - mp_irqdomain_unmap(data->domain, irq); - else - irq_dispose_mapping(irq); - } + if (--data->count == 0) + irq_domain_free_irqs(irq, 1); mutex_unlock(&ioapic_mutex); } @@ -1252,7 +1286,7 @@ out: } EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); -static struct irq_chip ioapic_chip; +static struct irq_chip ioapic_chip, ioapic_ir_chip; #ifdef CONFIG_X86_32 static inline int IO_APIC_irq_trigger(int irq) @@ -1595,7 +1629,7 @@ void __init print_IO_APICs(void) struct irq_pin_list *entry; chip = irq_get_chip(irq); - if (chip != &ioapic_chip) + if (chip != &ioapic_chip && chip != &ioapic_ir_chip) continue; cfg = irq_cfg(irq); @@ -2057,12 +2091,12 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, } #endif -static void ack_ioapic_level(struct irq_data *data) +static void ioapic_ack_level(struct irq_data *data) { struct irq_cfg *cfg = irqd_cfg(data); - int i, irq = data->irq; unsigned long v; bool masked; + int i; irq_complete_move(cfg); masked = ioapic_irqd_mask(data, cfg); @@ -2117,22 +2151,70 @@ static void ack_ioapic_level(struct irq_data *data) */ if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); - - eoi_ioapic_irq(irq, cfg); + eoi_ioapic_pin(cfg->vector, cfg); } ioapic_irqd_unmask(data, cfg, masked); } +static void ioapic_ir_ack_level(struct irq_data *irq_data) +{ + struct mp_chip_data *data = irq_data->chip_data; + + /* + * Intr-remapping uses pin number as the virtual vector + * in the RTE. Actual vector is programmed in + * intr-remapping table entry. Hence for the io-apic + * EOI we use the pin number. + */ + ack_APIC_irq(); + eoi_ioapic_pin(data->entry.vector, irqd_cfg(irq_data)); +} + +static int ioapic_set_affinity(struct irq_data *irq_data, + const struct cpumask *mask, bool force) +{ + struct irq_data *parent = irq_data->parent_data; + struct mp_chip_data *data = irq_data->chip_data; + unsigned int dest, irq = irq_data->irq; + struct irq_cfg *cfg; + unsigned long flags; + int ret; + + ret = parent->chip->irq_set_affinity(parent, mask, force); + raw_spin_lock_irqsave(&ioapic_lock, flags); + if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { + cfg = irqd_cfg(irq_data); + data->entry.dest = cfg->dest_apicid; + data->entry.vector = cfg->vector; + /* Only the high 8 bits are valid. 
*/ + dest = SET_APIC_LOGICAL_ID(cfg->dest_apicid); + __target_IO_APIC_irq(irq, dest, cfg); + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return ret; +} + static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", .irq_startup = startup_ioapic_irq, .irq_mask = mask_ioapic_irq, .irq_unmask = unmask_ioapic_irq, - .irq_ack = apic_ack_edge, - .irq_eoi = ack_ioapic_level, - .irq_set_affinity = native_ioapic_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_ack = irq_chip_ack_parent, + .irq_eoi = ioapic_ack_level, + .irq_set_affinity = ioapic_set_affinity, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static struct irq_chip ioapic_ir_chip __read_mostly = { + .name = "IR-IO-APIC", + .irq_startup = startup_ioapic_irq, + .irq_mask = mask_ioapic_irq, + .irq_unmask = unmask_ioapic_irq, + .irq_ack = irq_chip_ack_parent, + .irq_eoi = ioapic_ir_ack_level, + .irq_set_affinity = ioapic_set_affinity, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -2265,6 +2347,24 @@ static int __init disable_timer_pin_setup(char *arg) } early_param("disable_timer_pin_1", disable_timer_pin_setup); +static int mp_alloc_timer_irq(int ioapic, int pin) +{ + int irq = -1; + struct irq_alloc_info info; + struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); + + if (domain) { + ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0); + info.ioapic_id = mpc_ioapic_id(ioapic); + info.ioapic_pin = pin; + mutex_lock(&ioapic_mutex); + irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info); + mutex_unlock(&ioapic_mutex); + } + + return irq; +} + /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ @@ -2287,7 +2387,6 @@ static inline void __init check_timer(void) * get/set the timer IRQ vector: */ legacy_pic->mask(0); - assign_irq_vector(0, cfg, apic->target_cpus()); /* * As IRQ0 is to be enabled in the 8259A, the virtual @@ -2328,15 +2427,12 @@ static inline void __init check_timer(void) } if (pin1 != -1) { - /* - * Ok, does IRQ0 through the IOAPIC work? - */ + /* Ok, does IRQ0 through the IOAPIC work? */ if (no_pin1) { - add_pin_to_irq_node(cfg, node, apic1, pin1); - setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); + mp_alloc_timer_irq(apic1, pin1); } else { - /* for edge trigger, setup_ioapic_irq already - * leave it unmasked. + /* + * for edge trigger, it's already unmasked, * so only need to unmask if it is level-trigger * do we really have level trigger timer? */ @@ -2345,6 +2441,7 @@ static inline void __init check_timer(void) if (idx != -1 && irq_trigger(idx)) unmask_ioapic(cfg); } + irq_domain_activate_irq(irq_get_irq_data(0)); if (timer_irq_works()) { if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); @@ -2365,7 +2462,7 @@ static inline void __init check_timer(void) * legacy devices should be connected to IO APIC #0 */ replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); - setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); + irq_domain_activate_irq(irq_get_irq_data(0)); legacy_pic->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... 
works.\n"); @@ -2443,6 +2540,8 @@ out: static int mp_irqdomain_create(int ioapic) { size_t size; + struct irq_alloc_info info; + struct irq_domain *parent; int hwirqs = mp_ioapic_pin_count(ioapic); struct ioapic *ip = &ioapics[ioapic]; struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; @@ -2456,9 +2555,18 @@ static int mp_irqdomain_create(int ioapic) if (cfg->type == IOAPIC_DOMAIN_INVALID) return 0; + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_IOAPIC; + info.ioapic_id = mpc_ioapic_id(ioapic); + parent = irq_remapping_get_ir_irq_domain(&info); + if (!parent) + parent = x86_vector_domain; + ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops, (void *)(long)ioapic); - if(!ip->irqdomain) { + if (ip->irqdomain) { + ip->irqdomain->parent = parent; + } else { kfree(ip->pin_info); ip->pin_info = NULL; return -ENOMEM; @@ -3072,7 +3180,6 @@ int mp_unregister_ioapic(u32 gsi_base) { int ioapic, pin; int found = 0; - struct mp_pin_info *pin_info; for_each_ioapic(ioapic) if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) { @@ -3085,11 +3192,17 @@ int mp_unregister_ioapic(u32 gsi_base) } for_each_pin(ioapic, pin) { - pin_info = mp_pin_info(ioapic, pin); - if (pin_info->count) { - pr_warn("pin%d on IOAPIC%d is still in use.\n", - pin, ioapic); - return -EBUSY; + u32 gsi = mp_pin_to_gsi(ioapic, pin); + int irq = mp_map_gsi_to_irq(gsi, 0, NULL); + struct mp_chip_data *data; + + if (irq >= 0) { + data = irq_get_chip_data(irq); + if (data && data->count) { + pr_warn("pin%d on IOAPIC%d is still in use.\n", + pin, ioapic); + return -EBUSY; + } } } @@ -3241,7 +3354,8 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, } irq_data->hwirq = info->ioapic_pin; - irq_data->chip = &ioapic_chip; + irq_data->chip = (domain->parent == x86_vector_domain) ? 
+ &ioapic_chip : &ioapic_ir_chip; irq_data->chip_data = data; mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info); diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 6367a780cc8c..05103d398ed7 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -196,38 +196,31 @@ static struct of_ioapic_type of_ioapic_type[] = }, }; -static int ioapic_xlate(struct irq_domain *domain, - struct device_node *controller, - const u32 *intspec, u32 intsize, - irq_hw_number_t *out_hwirq, u32 *out_type) +static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) { + struct of_phandle_args *irq_data = (void *)arg; struct of_ioapic_type *it; - u32 line, idx, gsi; + struct irq_alloc_info tmp; - if (WARN_ON(intsize < 2)) + if (WARN_ON(irq_data->args_count < 2)) return -EINVAL; - - line = intspec[0]; - - if (intspec[1] >= ARRAY_SIZE(of_ioapic_type)) + if (irq_data->args[1] >= ARRAY_SIZE(of_ioapic_type)) return -EINVAL; - it = &of_ioapic_type[intspec[1]]; + it = &of_ioapic_type[irq_data->args[1]]; + ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity); + tmp.ioapic_id = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); + tmp.ioapic_pin = irq_data->args[0]; - idx = (u32)(long)domain->host_data; - gsi = mp_pin_to_gsi(idx, line); - if (mp_set_gsi_attr(gsi, it->trigger, it->polarity, cpu_to_node(0))) - return -EBUSY; - - *out_hwirq = line; - *out_type = it->out_type; - return 0; + return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp); } const struct irq_domain_ops ioapic_irq_domain_ops = { - .map = mp_irqdomain_map, - .unmap = mp_irqdomain_unmap, - .xlate = ioapic_xlate, + .alloc = dt_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, }; static void __init dtb_add_ioapic(struct device_node *dn) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 2d2a237f2c73..aa4feee74dbe 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -114,8 +114,10 @@ static void __init MP_bus_info(struct mpc_bus *m) } static struct irq_domain_ops mp_ioapic_irqdomain_ops = { - .map = mp_irqdomain_map, - .unmap = mp_irqdomain_unmap, + .alloc = mp_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, }; static void __init MP_ioapic_info(struct mpc_ioapic *m) diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 57b5719b617b..27062303c881 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -224,8 +224,6 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to * IOAPIC RTE entries, so we just enable RTE for the device. 
*/ - if (mp_set_gsi_attr(dev->irq, 1, polarity, dev_to_node(&dev->dev))) - return -EBUSY; if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC, &info) < 0) return -EBUSY; diff --git a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c index de0009f6d555..de734134bc8d 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c @@ -36,8 +36,7 @@ static int tangier_probe(struct platform_device *pdev) /* IOAPIC builds identity mapping between GSI and IRQ on MID */ gsi = pdata->irq; ioapic_set_alloc_attr(&info, cpu_to_node(0), 1, 0); - if (mp_set_gsi_attr(gsi, 1, 0, cpu_to_node(0)) || - mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info) <= 0) { + if (mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info) <= 0) { dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n", gsi); return -EINVAL; diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index 7d17355d820e..ce992e8cc065 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -469,10 +469,7 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) } ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, polarity); - ret = mp_set_gsi_attr(irq, 1, polarity, NUMA_NO_NODE); - if (ret == 0) - ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC, - &info); + ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC, &info); WARN_ON(ret < 0); } diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c index 2a8a74f3bd76..b66b194f9900 100644 --- a/arch/x86/platform/sfi/sfi.c +++ b/arch/x86/platform/sfi/sfi.c @@ -72,7 +72,10 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table) #ifdef CONFIG_X86_IO_APIC static struct irq_domain_ops sfi_ioapic_irqdomain_ops = { - .map = mp_irqdomain_map, + .alloc = mp_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, }; static int __init sfi_parse_ioapic(struct sfi_table_header *table) -- cgit v1.2.3 From 5ad274d41c1b3f3ccf73591078efaa8ed6828a8d Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:38 +0800 Subject: x86/irq: Remove unused old IOAPIC irqdomain interfaces Now we have converted to hierarchical irqdomain, so remove unused old IOAPIC interfaces and code. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-2-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 4 - arch/x86/kernel/apic/io_apic.c | 202 +---------------------------------------- 2 files changed, 1 insertion(+), 205 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index ecc192624eaf..705c425c9c3d 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -204,9 +204,6 @@ extern int mp_register_ioapic(int id, u32 address, u32 gsi_base, struct ioapic_domain_cfg *cfg); extern int mp_unregister_ioapic(u32 gsi_base); extern int mp_ioapic_registered(u32 gsi_base); -extern int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq); -extern void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq); extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg); extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, @@ -218,7 +215,6 @@ extern void mp_irqdomain_deactivate(struct irq_domain *domain, extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); extern void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, int trigger, int polarity); -extern int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node); extern void mp_save_irq(struct mpc_intsrc *m); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ba50f8d6f6b0..523b326d71c2 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -89,7 +89,6 @@ struct mp_chip_data { struct mp_pin_info { int trigger; int polarity; - int node; int set; u32 count; }; @@ -1310,30 +1309,6 @@ static inline int IO_APIC_irq_trigger(int irq) } #endif -static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg, - unsigned long trigger) -{ - struct irq_chip *chip = &ioapic_chip; - irq_flow_handler_t hdl; - bool fasteoi; - - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) { - irq_set_status_flags(irq, IRQ_LEVEL); - fasteoi = true; - } else { - irq_clear_status_flags(irq, IRQ_LEVEL); - fasteoi = false; - } - - if (setup_remapped_irq(irq, cfg, chip)) - fasteoi = trigger != 0; - - hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq; - irq_set_chip_and_handler_name(irq, chip, hdl, - fasteoi ? 
"fasteoi" : "edge"); -} - int native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, unsigned int destination, int vector, struct io_apic_irq_attr *attr) @@ -1358,48 +1333,6 @@ int native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, return 0; } -static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, - struct io_apic_irq_attr *attr) -{ - struct IO_APIC_route_entry entry; - unsigned int dest; - - if (!IO_APIC_IRQ(irq)) - return; - - if (assign_irq_vector(irq, cfg, apic->target_cpus())) - return; - - if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(), - &dest)) { - pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n", - mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); - clear_irq_vector(irq, cfg); - - return; - } - - apic_printk(APIC_VERBOSE,KERN_DEBUG - "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " - "IRQ %d Mode:%i Active:%i Dest:%d)\n", - attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin, - cfg->vector, irq, attr->trigger, attr->polarity, dest); - - if (x86_io_apic_ops.setup_entry(irq, &entry, dest, cfg->vector, attr)) { - pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", - mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); - clear_irq_vector(irq, cfg); - - return; - } - - ioapic_register_intr(irq, cfg, attr->trigger); - if (irq < nr_legacy_irqs()) - legacy_pic->mask(irq); - - ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry); -} - static void __init setup_IO_APIC_irqs(void) { unsigned int ioapic, pin; @@ -1419,46 +1352,6 @@ static void __init setup_IO_APIC_irqs(void) } } -/* - * Set up the timer pin, possibly with the 8259A-master behind. - */ -static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, - unsigned int pin, int vector) -{ - struct IO_APIC_route_entry entry; - unsigned int dest; - - memset(&entry, 0, sizeof(entry)); - - /* - * We use logical delivery to get the timer IRQ - * to the first CPU. - */ - if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(), - apic->target_cpus(), &dest))) - dest = BAD_APICID; - - entry.dest_mode = apic->irq_dest_mode; - entry.mask = 0; /* don't mask IRQ for edge */ - entry.dest = dest; - entry.delivery_mode = apic->irq_delivery_mode; - entry.polarity = 0; - entry.trigger = 0; - entry.vector = vector; - - /* - * The timer IRQ doesn't have to know that behind the - * scene we may have a 8259A-master in AEOI mode ... 
- */ - irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, - "edge"); - - /* - * Add it to the IO-APIC irq-routing table: - */ - ioapic_write_entry(ioapic_idx, pin, entry); -} - void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries) { int i; @@ -2669,20 +2562,6 @@ static int __init ioapic_init_ops(void) device_initcall(ioapic_init_ops); -static int -io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) -{ - struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node); - int ret; - - if (!cfg) - return -EINVAL; - ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin); - if (!ret) - setup_ioapic_irq(irq, cfg, attr); - return ret; -} - static int io_apic_get_redir_entries(int ioapic) { union IO_APIC_reg_01 reg_01; @@ -3239,58 +3118,8 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, irq_attr->polarity = polarity; } -int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq) -{ - int ioapic = mp_irqdomain_ioapic_idx(domain); - struct mp_pin_info *info = mp_pin_info(ioapic, hwirq); - struct io_apic_irq_attr attr; - - /* Get default attribute if not set by caller yet */ - if (!info->set) { - u32 gsi = mp_pin_to_gsi(ioapic, hwirq); - - if (acpi_get_override_irq(gsi, &info->trigger, - &info->polarity) < 0) { - /* - * PCI interrupts are always polarity one level - * triggered. - */ - info->trigger = 1; - info->polarity = 1; - } - info->node = NUMA_NO_NODE; - - /* - * setup_IO_APIC_irqs() programs all legacy IRQs with default - * trigger and polarity attributes. Don't set the flag for that - * case so the first legacy IRQ user could reprogram the pin - * with real trigger and polarity attributes. - */ - if (virq >= nr_legacy_irqs() || info->count) - info->set = 1; - } - set_io_apic_irq_attr(&attr, ioapic, hwirq, info->trigger, - info->polarity); - - return io_apic_setup_irq_pin(virq, info->node, &attr); -} - -void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq) -{ - struct irq_data *data = irq_get_irq_data(virq); - struct irq_cfg *cfg = irq_cfg(virq); - int ioapic = mp_irqdomain_ioapic_idx(domain); - int pin = (int)data->hwirq; - - ioapic_mask_entry(ioapic, pin); - __remove_pin_from_irq(cfg, ioapic, pin); - WARN_ON(!list_empty(&cfg->irq_2_pin)); - arch_teardown_hwirq(virq); -} - static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, - struct irq_alloc_info *info) + struct irq_alloc_info *info) { if (info && info->ioapic_valid) { data->trigger = info->ioapic_trigger; @@ -3414,35 +3243,6 @@ void mp_irqdomain_deactivate(struct irq_domain *domain, (int)irq_data->hwirq); } -int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node) -{ - int ret = 0; - int ioapic, pin; - struct mp_pin_info *info; - - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return -ENODEV; - - pin = mp_find_ioapic_pin(ioapic, gsi); - info = mp_pin_info(ioapic, pin); - trigger = trigger ? 1 : 0; - polarity = polarity ? 
1 : 0; - - mutex_lock(&ioapic_mutex); - if (!info->set) { - info->trigger = trigger; - info->polarity = polarity; - info->node = node; - info->set = 1; - } else if (info->trigger != trigger || info->polarity != polarity) { - ret = -EBUSY; - } - mutex_unlock(&ioapic_mutex); - - return ret; -} - int mp_irqdomain_ioapic_idx(struct irq_domain *domain) { return (int)(long)domain->host_data; -- cgit v1.2.3 From b75e818f7fc6db153a4ebfba1d31366c1cc531aa Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:39 +0800 Subject: x86/irq: Remove unused struct mp_pin_info Now nobody makes use of struct mp_pin_info, so remove it. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-3-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 30 +++--------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 523b326d71c2..3506e8aeba91 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -86,13 +86,6 @@ struct mp_chip_data { bool isa_irq; }; -struct mp_pin_info { - int trigger; - int polarity; - int set; - u32 count; -}; - static struct ioapic { /* * # of IRQ routing registers @@ -108,7 +101,6 @@ static struct ioapic { struct mp_ioapic_gsi gsi_config; struct ioapic_domain_cfg irqdomain_cfg; struct irq_domain *irqdomain; - struct mp_pin_info *pin_info; struct resource *iomem_res; } ioapics[MAX_IO_APICS]; @@ -159,11 +151,6 @@ static inline int mp_init_irq_at_boot(int ioapic, int irq) return ioapic == 0 || mp_is_legacy_irq(irq); } -static inline struct mp_pin_info *mp_pin_info(int ioapic_idx, int pin) -{ - return ioapics[ioapic_idx].pin_info + pin; -} - static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic) { return ioapics[ioapic].irqdomain; @@ -2432,7 +2419,6 @@ out: static int mp_irqdomain_create(int ioapic) { - size_t size; struct irq_alloc_info info; struct irq_domain *parent; int hwirqs = mp_ioapic_pin_count(ioapic); @@ -2440,11 +2426,6 @@ static int mp_irqdomain_create(int ioapic) struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); - size = sizeof(struct mp_pin_info) * mp_ioapic_pin_count(ioapic); - ip->pin_info = kzalloc(size, GFP_KERNEL); - if (!ip->pin_info) - return -ENOMEM; - if (cfg->type == IOAPIC_DOMAIN_INVALID) return 0; @@ -2457,13 +2438,10 @@ static int mp_irqdomain_create(int ioapic) ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops, (void *)(long)ioapic); - if (ip->irqdomain) { - ip->irqdomain->parent = parent; - } else { - kfree(ip->pin_info); - ip->pin_info = NULL; + if (!ip->irqdomain) return -ENOMEM; - } + + ip->irqdomain->parent = parent; if (cfg->type == IOAPIC_DOMAIN_LEGACY || cfg->type == IOAPIC_DOMAIN_STRICT) @@ -2479,8 +2457,6 @@ static void ioapic_destroy_irqdomain(int idx) irq_domain_remove(ioapics[idx].irqdomain); ioapics[idx].irqdomain = NULL; } - kfree(ioapics[idx].pin_info); - ioapics[idx].pin_info = NULL; } void __init setup_IO_APIC(void) -- cgit v1.2.3 From 84bea5cc7709dffdadfa9885a66efd67d9ffc24c Mon Sep 17 00:00:00 2001 From: Jiang 
Liu Date: Tue, 14 Apr 2015 10:29:40 +0800 Subject: x86/irq: Remove x86_io_apic_ops.print_entries and related interfaces Now there is no user of x86_io_apic_ops.print_entries anymore, so remove it. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Cc: Yijing Wang Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-4-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 3 -- arch/x86/include/asm/x86_init.h | 1 - arch/x86/kernel/apic/io_apic.c | 55 ------------------------------------- arch/x86/kernel/x86_init.c | 1 - drivers/iommu/intel_irq_remapping.c | 7 ----- 5 files changed, 67 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 705c425c9c3d..47af5a7af358 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -225,8 +225,6 @@ extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg); extern void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int val); extern void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val); extern void native_disable_io_apic(void); -extern void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries); -extern void intel_ir_io_apic_print_entries(unsigned int apic, unsigned int nr_entries); extern int native_ioapic_set_affinity(struct irq_data *, const struct cpumask *, bool); @@ -290,7 +288,6 @@ static inline void disable_ioapic_support(void) { } #define native_io_apic_write NULL #define native_io_apic_modify NULL #define native_disable_io_apic NULL -#define native_io_apic_print_entries NULL #define native_ioapic_set_affinity NULL #define native_setup_ioapic_entry NULL #define native_eoi_ioapic_pin NULL diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 1649bb9ca27c..2924bc88034a 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -191,7 +191,6 @@ struct x86_io_apic_ops { void (*write) (unsigned int apic, unsigned int reg, unsigned int value); void (*modify) (unsigned int apic, unsigned int reg, unsigned int value); void (*disable)(void); - void (*print_entries)(unsigned int apic, unsigned int nr_entries); int (*set_affinity)(struct irq_data *data, const struct cpumask *mask, bool force); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3506e8aeba91..acb91c19f318 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1339,61 +1339,6 @@ static void __init setup_IO_APIC_irqs(void) } } -void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries) -{ - int i; - - pr_debug(" NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect:\n"); - - for (i = 0; i <= nr_entries; i++) { - struct IO_APIC_route_entry entry; - - entry = ioapic_read_entry(apic, i); - - pr_debug(" %02x %02X ", i, entry.dest); - pr_cont("%1d %1d %1d %1d %1d " - "%1d %1d %02X\n", - entry.mask, - entry.trigger, - entry.irr, - entry.polarity, - entry.delivery_status, - entry.dest_mode, - entry.delivery_mode, - entry.vector); - } -} - -void intel_ir_io_apic_print_entries(unsigned int apic, - unsigned int nr_entries) -{ 
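[Editor's aside: both printers deleted by this hunk did the same job, walking a range of redirection-table entries and decoding each 64-bit RTE's bitfields into pr_debug() lines. As a self-contained illustration of that decoding, here is a toy user-space sketch; the bit layout (vector in bits 0-7, delivery mode in bits 8-10, destination in the top byte, and so on) is the standard IOAPIC RTE format, but every name below is a hypothetical stand-in, not kernel API.

    #include <stdio.h>
    #include <stdint.h>

    /* Decode one IOAPIC redirection-table entry, roughly as the
     * removed native_io_apic_print_entries() did via pr_debug(). */
    static void print_rte(unsigned idx, uint64_t rte)
    {
        printf("%02x dest=%02x mask=%u trig=%u irr=%u pol=%u "
               "stat=%u dmod=%u deli=%u vect=%02x\n",
               idx,
               (unsigned)(rte >> 56) & 0xff, /* destination     */
               (unsigned)(rte >> 16) & 1,    /* masked?         */
               (unsigned)(rte >> 15) & 1,    /* level vs. edge  */
               (unsigned)(rte >> 14) & 1,    /* remote IRR      */
               (unsigned)(rte >> 13) & 1,    /* polarity        */
               (unsigned)(rte >> 12) & 1,    /* delivery status */
               (unsigned)(rte >> 11) & 1,    /* dest mode       */
               (unsigned)(rte >> 8) & 7,     /* delivery mode   */
               (unsigned)rte & 0xff);        /* vector          */
    }

    int main(void)
    {
        print_rte(0, 0x0100000000018941ull); /* made-up sample RTE */
        return 0;
    }

End of aside; the removed listing continues below.]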
- int i; - - pr_debug(" NR Indx Fmt Mask Trig IRR Pol Stat Indx2 Zero Vect:\n"); - - for (i = 0; i <= nr_entries; i++) { - struct IR_IO_APIC_route_entry *ir_entry; - struct IO_APIC_route_entry entry; - - entry = ioapic_read_entry(apic, i); - - ir_entry = (struct IR_IO_APIC_route_entry *)&entry; - - pr_debug(" %02x %04X ", i, ir_entry->index); - pr_cont("%1d %1d %1d %1d %1d " - "%1d %1d %X %02X\n", - ir_entry->format, - ir_entry->mask, - ir_entry->trigger, - ir_entry->irr, - ir_entry->polarity, - ir_entry->delivery_status, - ir_entry->index2, - ir_entry->zero, - ir_entry->vector); - } -} - void ioapic_zap_locks(void) { raw_spin_lock_init(&ioapic_lock); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index b094d691f2fe..d6f36c7594d7 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -144,7 +144,6 @@ struct x86_io_apic_ops x86_io_apic_ops = { .write = native_io_apic_write, .modify = native_io_apic_modify, .disable = native_disable_io_apic, - .print_entries = native_io_apic_print_entries, .set_affinity = native_ioapic_set_affinity, .setup_entry = native_setup_ioapic_entry, .eoi_ioapic_pin = native_eoi_ioapic_pin, diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index 71a25aeee203..0c8935cd6229 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -696,13 +696,6 @@ static int __init intel_enable_irq_remapping(void) irq_remapping_enabled = 1; - /* - * VT-d has a different layout for IO-APIC entries when - * interrupt remapping is enabled. So it needs a special routine - * to print IO-APIC entries for debugging purposes too. - */ - x86_io_apic_ops.print_entries = intel_ir_io_apic_print_entries; - pr_info("Enabled IRQ remapping in %s mode\n", eim ? "x2apic" : "xapic"); return eim ? IRQ_REMAP_X2APIC_MODE : IRQ_REMAP_XAPIC_MODE; -- cgit v1.2.3 From 35d50d8fd5b8f932b3e71311a4cbd4384501ab9a Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:41 +0800 Subject: x86/irq: Remove x86_io_apic_ops.setup_entry and related interfaces Now there is no user of x86_io_apic_ops.setup_entry anymore, so remove it. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Cc: Yijing Wang Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-5-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 4 ---- arch/x86/include/asm/irq_remapping.h | 13 ------------- arch/x86/include/asm/x86_init.h | 3 --- arch/x86/kernel/apic/io_apic.c | 24 ------------------------ arch/x86/kernel/x86_init.c | 1 - drivers/iommu/irq_remapping.c | 13 ------------- 6 files changed, 58 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 47af5a7af358..d2a34e4718d2 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -150,9 +150,6 @@ struct irq_cfg; extern void ioapic_insert_resources(void); extern int arch_early_ioapic_init(void); -extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *, - unsigned int, int, - struct io_apic_irq_attr *); extern void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg); extern void native_eoi_ioapic_pin(int apic, int pin, int vector); @@ -289,7 +286,6 @@ static inline void disable_ioapic_support(void) { } #define native_io_apic_modify NULL #define native_disable_io_apic NULL #define native_ioapic_set_affinity NULL -#define native_setup_ioapic_entry NULL #define native_eoi_ioapic_pin NULL static inline void setup_IO_APIC(void) { } diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 52e1a1f337b2..4b5bbd104ae2 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -42,11 +42,6 @@ extern int irq_remapping_enable(void); extern void irq_remapping_disable(void); extern int irq_remapping_reenable(int); extern int irq_remap_enable_fault_handling(void); -extern int setup_ioapic_remapped_entry(int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, - int vector, - struct io_apic_irq_attr *attr); extern void free_remapped_irq(int irq); extern void panic_if_irq_remap(const char *msg); extern bool setup_remapped_irq(int irq, @@ -77,14 +72,6 @@ static inline int irq_remapping_enable(void) { return -ENODEV; } static inline void irq_remapping_disable(void) { } static inline int irq_remapping_reenable(int eim) { return -ENODEV; } static inline int irq_remap_enable_fault_handling(void) { return -ENODEV; } -static inline int setup_ioapic_remapped_entry(int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, - int vector, - struct io_apic_irq_attr *attr) -{ - return -ENODEV; -} static inline void free_remapped_irq(int irq) { } static inline void panic_if_irq_remap(const char *msg) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 2924bc88034a..0c690574efae 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -194,9 +194,6 @@ struct x86_io_apic_ops { int (*set_affinity)(struct irq_data *data, const struct cpumask *mask, bool force); - int (*setup_entry)(int irq, struct IO_APIC_route_entry *entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr); void (*eoi_ioapic_pin)(int apic, int pin, int vector); }; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index acb91c19f318..cf5cd19b74e3 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1296,30 +1296,6 @@ static inline int IO_APIC_irq_trigger(int irq) } #endif -int 
native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr) -{ - memset(entry, 0, sizeof(*entry)); - - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->dest = destination; - entry->vector = vector; - entry->mask = 0; /* enable IRQ */ - entry->trigger = attr->trigger; - entry->polarity = attr->polarity; - - /* - * Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. - */ - if (attr->trigger) - entry->mask = 1; - - return 0; -} - static void __init setup_IO_APIC_irqs(void) { unsigned int ioapic, pin; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index d6f36c7594d7..066cdaa6503e 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -145,6 +145,5 @@ struct x86_io_apic_ops x86_io_apic_ops = { .modify = native_io_apic_modify, .disable = native_disable_io_apic, .set_affinity = native_ioapic_set_affinity, - .setup_entry = native_setup_ioapic_entry, .eoi_ioapic_pin = native_eoi_ioapic_pin, }; diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index 558c804dbe11..5bb1a04c91f5 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -62,7 +62,6 @@ static void __init irq_remapping_modify_x86_ops(void) { x86_io_apic_ops.disable = irq_remapping_disable_io_apic; x86_io_apic_ops.set_affinity = set_remapped_irq_affinity; - x86_io_apic_ops.setup_entry = setup_ioapic_remapped_entry; x86_io_apic_ops.eoi_ioapic_pin = eoi_ioapic_pin_remapped; } @@ -158,18 +157,6 @@ int __init irq_remap_enable_fault_handling(void) return remap_ops->enable_faulting(); } -int setup_ioapic_remapped_entry(int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr) -{ - if (!remap_ops->setup_ioapic_entry) - return -ENODEV; - - return remap_ops->setup_ioapic_entry(irq, entry, destination, - vector, attr); -} - static int set_remapped_irq_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { -- cgit v1.2.3 From aa5cb97f14a2dd5aefabed6538c35ebc087d7c24 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:42 +0800 Subject: x86/irq: Remove x86_io_apic_ops.set_affinity and related interfaces Now there is no user of x86_io_apic_ops.set_affinity anymore, so remove it. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Cc: Yijing Wang Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-6-git-send-email-jiang.liu@linux.intel.com --- arch/x86/include/asm/io_apic.h | 4 ---- arch/x86/include/asm/x86_init.h | 3 --- arch/x86/kernel/apic/io_apic.c | 25 +------------------------ arch/x86/kernel/x86_init.c | 1 - drivers/iommu/irq_remapping.c | 15 --------------- 5 files changed, 1 insertion(+), 47 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index d2a34e4718d2..0ff68daa9949 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -222,9 +222,6 @@ extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg); extern void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int val); extern void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val); extern void native_disable_io_apic(void); -extern int native_ioapic_set_affinity(struct irq_data *, - const struct cpumask *, - bool); static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) { @@ -285,7 +282,6 @@ static inline void disable_ioapic_support(void) { } #define native_io_apic_write NULL #define native_io_apic_modify NULL #define native_disable_io_apic NULL -#define native_ioapic_set_affinity NULL #define native_eoi_ioapic_pin NULL static inline void setup_IO_APIC(void) { } diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 0c690574efae..f9f83cfabcaa 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -191,9 +191,6 @@ struct x86_io_apic_ops { void (*write) (unsigned int apic, unsigned int reg, unsigned int value); void (*modify) (unsigned int apic, unsigned int reg, unsigned int value); void (*disable)(void); - int (*set_affinity)(struct irq_data *data, - const struct cpumask *mask, - bool force); void (*eoi_ioapic_pin)(int apic, int pin, int vector); }; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index cf5cd19b74e3..9ef964512b86 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1787,29 +1787,6 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq } } -int native_ioapic_set_affinity(struct irq_data *data, - const struct cpumask *mask, - bool force) -{ - unsigned int dest, irq = data->irq; - unsigned long flags; - int ret; - - if (!config_enabled(CONFIG_SMP)) - return -EPERM; - - raw_spin_lock_irqsave(&ioapic_lock, flags); - ret = apic_set_affinity(data, mask, &dest); - if (!ret) { - /* Only the high 8 bits are valid. 
*/ - dest = SET_APIC_LOGICAL_ID(dest); - __target_IO_APIC_irq(irq, dest, irqd_cfg(data)); - ret = IRQ_SET_MASK_OK_NOCOPY; - } - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return ret; -} - atomic_t irq_mis_count; #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -2686,7 +2663,7 @@ void __init setup_ioapic_dest(void) else mask = apic->target_cpus(); - x86_io_apic_ops.set_affinity(idata, mask, false); + irq_set_affinity(irq, mask); } } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 066cdaa6503e..f7e8eab3a7c4 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -144,6 +144,5 @@ struct x86_io_apic_ops x86_io_apic_ops = { .write = native_io_apic_write, .modify = native_io_apic_modify, .disable = native_disable_io_apic, - .set_affinity = native_ioapic_set_affinity, .eoi_ioapic_pin = native_eoi_ioapic_pin, }; diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index 5bb1a04c91f5..7baa54a13921 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -25,10 +25,6 @@ int no_x2apic_optout; static int disable_irq_remap; static struct irq_remap_ops *remap_ops; -static int set_remapped_irq_affinity(struct irq_data *data, - const struct cpumask *mask, - bool force); - static bool irq_remapped(struct irq_cfg *cfg) { return (cfg->remapped == 1); @@ -61,7 +57,6 @@ static void eoi_ioapic_pin_remapped(int apic, int pin, int vector) static void __init irq_remapping_modify_x86_ops(void) { x86_io_apic_ops.disable = irq_remapping_disable_io_apic; - x86_io_apic_ops.set_affinity = set_remapped_irq_affinity; x86_io_apic_ops.eoi_ioapic_pin = eoi_ioapic_pin_remapped; } @@ -157,15 +152,6 @@ int __init irq_remap_enable_fault_handling(void) return remap_ops->enable_faulting(); } -static int set_remapped_irq_affinity(struct irq_data *data, - const struct cpumask *mask, bool force) -{ - if (!config_enabled(CONFIG_SMP) || !remap_ops->set_affinity) - return 0; - - return remap_ops->set_affinity(data, mask, force); -} - void free_remapped_irq(int irq) { struct irq_cfg *cfg = irq_cfg(irq); @@ -201,7 +187,6 @@ void irq_remap_modify_chip_defaults(struct irq_chip *chip) chip->irq_print_chip = ir_print_prefix; chip->irq_ack = ir_ack_apic_edge; chip->irq_eoi = ir_ack_apic_level; - chip->irq_set_affinity = x86_io_apic_ops.set_affinity; } bool setup_remapped_irq(int irq, struct irq_cfg *cfg, struct irq_chip *chip) -- cgit v1.2.3 From ad66e1efc95e548598b032c1fe5bbc34f6460547 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:43 +0800 Subject: x86/irq: Remove x86_io_apic_ops.eoi_ioapic_pin and related interfaces Now there is no user of x86_io_apic_ops.eoi_ioapic_pin anymore, so remove it. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Cc: Yijing Wang Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-7-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 7 ------- arch/x86/include/asm/x86_init.h | 1 - arch/x86/kernel/apic/io_apic.c | 20 ++++---------------- arch/x86/kernel/x86_init.c | 1 - drivers/iommu/irq_remapping.c | 19 ------------------- 5 files changed, 4 insertions(+), 44 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 0ff68daa9949..fa4b25ebd658 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -150,10 +150,6 @@ struct irq_cfg; extern void ioapic_insert_resources(void); extern int arch_early_ioapic_init(void); -extern void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg); - -extern void native_eoi_ioapic_pin(int apic, int pin, int vector); - extern int save_ioapic_entries(void); extern void mask_ioapic_entries(void); extern int restore_ioapic_entries(void); @@ -237,8 +233,6 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned x86_io_apic_ops.modify(apic, reg, value); } -extern void io_apic_eoi(unsigned int apic, unsigned int vector); - extern void setup_IO_APIC(void); extern void enable_IO_APIC(void); extern void disable_IO_APIC(void); @@ -282,7 +276,6 @@ static inline void disable_ioapic_support(void) { } #define native_io_apic_write NULL #define native_io_apic_modify NULL #define native_disable_io_apic NULL -#define native_eoi_ioapic_pin NULL static inline void setup_IO_APIC(void) { } static inline void enable_IO_APIC(void) { } diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index f9f83cfabcaa..4ada3d3a0e86 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -191,7 +191,6 @@ struct x86_io_apic_ops { void (*write) (unsigned int apic, unsigned int reg, unsigned int value); void (*modify) (unsigned int apic, unsigned int reg, unsigned int value); void (*disable)(void); - void (*eoi_ioapic_pin)(int apic, int pin, int vector); }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9ef964512b86..998fefad820e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -271,7 +271,7 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) + (mpc_ioapic_addr(idx) & ~PAGE_MASK); } -void io_apic_eoi(unsigned int apic, unsigned int vector) +static inline void io_apic_eoi(unsigned int apic, unsigned int vector) { struct io_apic __iomem *io_apic = io_apic_base(apic); writel(vector, &io_apic->eoi); @@ -527,7 +527,7 @@ static void unmask_ioapic_irq(struct irq_data *data) * Otherwise, we simulate the EOI message manually by changing the trigger * mode to edge and then back to level, with RTE being masked during this. 
*/ -void native_eoi_ioapic_pin(int apic, int pin, int vector) +static void __eoi_ioapic_pin(int apic, int pin, int vector) { if (mpc_ioapic_ver(apic) >= 0x20) { io_apic_eoi(apic, vector); @@ -558,19 +558,7 @@ void eoi_ioapic_pin(int vector, struct irq_cfg *cfg) raw_spin_lock_irqsave(&ioapic_lock, flags); for_each_irq_pin(entry, cfg->irq_2_pin) - native_eoi_ioapic_pin(entry->apic, entry->pin, vector); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); -} - -void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) -{ - struct irq_pin_list *entry; - unsigned long flags; - - raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, cfg->irq_2_pin) - x86_io_apic_ops.eoi_ioapic_pin(entry->apic, entry->pin, - cfg->vector); + __eoi_ioapic_pin(entry->apic, entry->pin, vector); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -606,7 +594,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) ioapic_write_entry(apic, pin, entry); } raw_spin_lock_irqsave(&ioapic_lock, flags); - native_eoi_ioapic_pin(apic, pin, entry.vector); + __eoi_ioapic_pin(apic, pin, entry.vector); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index f7e8eab3a7c4..f612dc018fb6 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -144,5 +144,4 @@ struct x86_io_apic_ops x86_io_apic_ops = { .write = native_io_apic_write, .modify = native_io_apic_modify, .disable = native_disable_io_apic, - .eoi_ioapic_pin = native_eoi_ioapic_pin, }; diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index 7baa54a13921..bca42550b1ad 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -43,21 +43,9 @@ static void irq_remapping_disable_io_apic(void) disconnect_bsp_APIC(0); } -static void eoi_ioapic_pin_remapped(int apic, int pin, int vector) -{ - /* - * Intr-remapping uses pin number as the virtual vector - * in the RTE. Actual vector is programmed in - * intr-remapping table entry. Hence for the io-apic - * EOI we use the pin number. - */ - io_apic_eoi(apic, pin); -} - static void __init irq_remapping_modify_x86_ops(void) { x86_io_apic_ops.disable = irq_remapping_disable_io_apic; - x86_io_apic_ops.eoi_ioapic_pin = eoi_ioapic_pin_remapped; } static __init int setup_nointremap(char *str) @@ -171,12 +159,6 @@ void ir_ack_apic_edge(struct irq_data *data) ack_APIC_irq(); } -static void ir_ack_apic_level(struct irq_data *data) -{ - ack_APIC_irq(); - eoi_ioapic_irq(data->irq, irqd_cfg(data)); -} - static void ir_print_prefix(struct irq_data *data, struct seq_file *p) { seq_printf(p, " IR-%s", data->chip->name); @@ -186,7 +168,6 @@ void irq_remap_modify_chip_defaults(struct irq_chip *chip) { chip->irq_print_chip = ir_print_prefix; chip->irq_ack = ir_ack_apic_edge; - chip->irq_eoi = ir_ack_apic_level; } bool setup_remapped_irq(int irq, struct irq_cfg *cfg, struct irq_chip *chip) -- cgit v1.2.3 From baac16952635445addaf397bad74e847db821d6d Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:44 +0800 Subject: x86/irq: Remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ There's no user of irq_alloc_hwirqs(), irq_alloc_hwirq(), irq_free_hwirqs() and irq_free_hwirq() in x86 anymore, so remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ and related code. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428978610-28986-8-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 1 - arch/x86/kernel/apic/vector.c | 34 ---------------------------------- 2 files changed, 35 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 26e066579637..c86fdc13615f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -912,7 +912,6 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI - select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ select IRQ_DOMAIN_HIERARCHY select PCI_MSI_IRQ_DOMAIN if PCI_MSI diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 633f03268d48..d0e5ea0fb947 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -659,40 +659,6 @@ void irq_force_complete_move(int irq) } #endif -/* - * Dynamic irq allocate and deallocation. Should be replaced by irq domains! - */ -int arch_setup_hwirq(unsigned int irq, int node) -{ - struct irq_cfg *cfg; - unsigned long flags; - int ret; - - cfg = alloc_irq_cfg(node); - if (!cfg) - return -ENOMEM; - - raw_spin_lock_irqsave(&vector_lock, flags); - ret = __assign_irq_vector(irq, cfg, apic->target_cpus()); - raw_spin_unlock_irqrestore(&vector_lock, flags); - - if (!ret) - irq_set_chip_data(irq, cfg); - else - free_irq_cfg(cfg); - return ret; -} - -void arch_teardown_hwirq(unsigned int irq) -{ - struct irq_cfg *cfg = irq_cfg(irq); - - free_remapped_irq(irq); - clear_irq_vector(irq, cfg); - irq_set_chip_data(irq, NULL); - free_irq_cfg(cfg); -} - static void __init print_APIC_field(int base) { int i; -- cgit v1.2.3 From 9880534989ba96faad26aebc01dcdb2c1b5793aa Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:46 +0800 Subject: irq_remapping: Clean up unused code to support IOAPIC Now we have converted to hierarchical irqdomains, so clean up unused code. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1428978610-28986-10-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/irq_remapping.h | 23 ----------------------- arch/x86/kernel/apic/vector.c | 1 - drivers/iommu/irq_remapping.c | 33 --------------------------------- 3 files changed, 57 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 4b5bbd104ae2..09efa358c831 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -26,12 +26,7 @@ #include #include -struct IO_APIC_route_entry; -struct io_apic_irq_attr; -struct irq_chip; struct msi_msg; -struct pci_dev; -struct irq_cfg; struct irq_alloc_info; #ifdef CONFIG_IRQ_REMAP @@ -42,13 +37,7 @@ extern int irq_remapping_enable(void); extern void irq_remapping_disable(void); extern int irq_remapping_reenable(int); extern int irq_remap_enable_fault_handling(void); -extern void free_remapped_irq(int irq); extern void panic_if_irq_remap(const char *msg); -extern bool setup_remapped_irq(int irq, - struct irq_cfg *cfg, - struct irq_chip *chip); - -void irq_remap_modify_chip_defaults(struct irq_chip *chip); extern struct irq_domain * irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info); @@ -72,23 +61,11 @@ static inline int irq_remapping_enable(void) { return -ENODEV; } static inline void irq_remapping_disable(void) { } static inline int irq_remapping_reenable(int eim) { return -ENODEV; } static inline int irq_remap_enable_fault_handling(void) { return -ENODEV; } -static inline void free_remapped_irq(int irq) { } static inline void panic_if_irq_remap(const char *msg) { } -static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip) -{ -} - -static inline bool setup_remapped_irq(int irq, - struct irq_cfg *cfg, - struct irq_chip *chip) -{ - return false; -} - static inline struct irq_domain * irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info) { diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index d0e5ea0fb947..37bb9e82b919 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -283,7 +283,6 @@ static void x86_vector_free_irqs(struct irq_domain *domain, for (i = 0; i < nr_irqs; i++) { irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i); if (irq_data && irq_data->chip_data) { - free_remapped_irq(virq); clear_irq_vector(virq + i, irq_data->chip_data); free_irq_cfg(irq_data->chip_data); #ifdef CONFIG_X86_IO_APIC diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index bca42550b1ad..fc78b0d41f71 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -25,11 +25,6 @@ int no_x2apic_optout; static int disable_irq_remap; static struct irq_remap_ops *remap_ops; -static bool irq_remapped(struct irq_cfg *cfg) -{ - return (cfg->remapped == 1); -} - static void irq_remapping_disable_io_apic(void) { /* @@ -140,14 +135,6 @@ int __init irq_remap_enable_fault_handling(void) return remap_ops->enable_faulting(); } -void free_remapped_irq(int irq) -{ - struct irq_cfg *cfg = irq_cfg(irq); - - if (irq_remapped(cfg) && remap_ops->free_irq) - remap_ops->free_irq(irq); -} - void panic_if_irq_remap(const char *msg) { if (irq_remapping_enabled) @@ -159,26 +146,6 @@ void ir_ack_apic_edge(struct irq_data *data) ack_APIC_irq(); } -static void ir_print_prefix(struct irq_data *data, struct 
seq_file *p) -{ - seq_printf(p, " IR-%s", data->chip->name); -} - -void irq_remap_modify_chip_defaults(struct irq_chip *chip) -{ - chip->irq_print_chip = ir_print_prefix; - chip->irq_ack = ir_ack_apic_edge; -} - -bool setup_remapped_irq(int irq, struct irq_cfg *cfg, struct irq_chip *chip) -{ - if (!irq_remapped(cfg)) - return false; - irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - irq_remap_modify_chip_defaults(chip); - return true; -} - /** * irq_remapping_get_ir_irq_domain - Get the irqdomain associated with the IOMMU * device serving request @info -- cgit v1.2.3 From 4467715a44cca2fa41d25f3d32b737bd2331a8d9 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:53 +0800 Subject: x86/irq: Move irq_cfg.irq_2_pin into io_apic.c Now only io_apic.c accesses struct irq_cfg.irq_2_pin, so move irq_2_pin into struct mp_chip_data in io_apic.c to clean up struct irq_cfg further. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-17-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 7 -- arch/x86/kernel/apic/io_apic.c | 164 +++++++++++++++++++---------------------- arch/x86/kernel/apic/vector.c | 3 - 3 files changed, 77 insertions(+), 97 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index e47bc4de5630..f000b58cbc0c 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -176,13 +176,6 @@ struct irq_cfg { unsigned int dest_apicid; u8 vector; u8 move_in_progress : 1; - union { -#ifdef CONFIG_X86_IO_APIC - struct { - struct list_head irq_2_pin; - }; -#endif - }; }; extern struct irq_domain *x86_vector_domain; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 998fefad820e..a1abdcf2cb5f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -78,7 +78,13 @@ static DEFINE_MUTEX(ioapic_mutex); static unsigned int ioapic_dynirq_base; static int ioapic_initialized; +struct irq_pin_list { + struct list_head list; + int apic, pin; +}; + struct mp_chip_data { + struct list_head irq_2_pin; struct IO_APIC_route_entry entry; int trigger; int polarity; @@ -215,16 +221,6 @@ void mp_save_irq(struct mpc_intsrc *m) panic("Max # of irq sources exceeded!!\n"); } -struct irq_pin_list { - struct list_head list; - int apic, pin; -}; - -static struct irq_pin_list *alloc_irq_pin_list(int node) -{ - return kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node); -} - static void alloc_ioapic_saved_registers(int idx) { size_t size; @@ -379,16 +375,17 @@ static void ioapic_mask_entry(int apic, int pin) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. 
*/ -static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +static int __add_pin_to_irq_node(struct mp_chip_data *data, + int node, int apic, int pin) { struct irq_pin_list *entry; /* don't allow duplicates */ - for_each_irq_pin(entry, cfg->irq_2_pin) + for_each_irq_pin(entry, data->irq_2_pin) if (entry->apic == apic && entry->pin == pin) return 0; - entry = alloc_irq_pin_list(node); + entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node); if (!entry) { pr_err("can not alloc irq_pin_list (%d,%d,%d)\n", node, apic, pin); @@ -396,16 +393,16 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi } entry->apic = apic; entry->pin = pin; + list_add_tail(&entry->list, &data->irq_2_pin); - list_add_tail(&entry->list, &cfg->irq_2_pin); return 0; } -static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin) +static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin) { struct irq_pin_list *tmp, *entry; - list_for_each_entry_safe(entry, tmp, &cfg->irq_2_pin, list) + list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list) if (entry->apic == apic && entry->pin == pin) { list_del(&entry->list); kfree(entry); @@ -413,22 +410,23 @@ static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin) } } -static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +static void add_pin_to_irq_node(struct mp_chip_data *data, + int node, int apic, int pin) { - if (__add_pin_to_irq_node(cfg, node, apic, pin)) + if (__add_pin_to_irq_node(data, node, apic, pin)) panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); } /* * Reroute an IRQ to a different pin. */ -static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, +static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node, int oldapic, int oldpin, int newapic, int newpin) { struct irq_pin_list *entry; - for_each_irq_pin(entry, cfg->irq_2_pin) { + for_each_irq_pin(entry, data->irq_2_pin) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; @@ -438,7 +436,7 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, } /* old apic/pin didn't exist, so just add new ones */ - add_pin_to_irq_node(cfg, node, newapic, newpin); + add_pin_to_irq_node(data, node, newapic, newpin); } static void __io_apic_modify_irq(struct irq_pin_list *entry, @@ -456,13 +454,13 @@ static void __io_apic_modify_irq(struct irq_pin_list *entry, final(entry); } -static void io_apic_modify_irq(struct irq_cfg *cfg, +static void io_apic_modify_irq(struct mp_chip_data *data, int mask_and, int mask_or, void (*final)(struct irq_pin_list *entry)) { struct irq_pin_list *entry; - for_each_irq_pin(entry, cfg->irq_2_pin) + for_each_irq_pin(entry, data->irq_2_pin) __io_apic_modify_irq(entry, mask_and, mask_or, final); } @@ -478,39 +476,31 @@ static void io_apic_sync(struct irq_pin_list *entry) readl(&io_apic->data); } -static void mask_ioapic(struct irq_cfg *cfg) +static void mask_ioapic_irq(struct irq_data *irq_data) { + struct mp_chip_data *data = irq_data->chip_data; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void mask_ioapic_irq(struct irq_data *data) -{ - mask_ioapic(irqd_cfg(data)); -} - -static void __unmask_ioapic(struct irq_cfg *cfg) 
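[Editor's aside: the -/+ pair split here is the conversion this patch applies throughout the file: the mask/unmask/EOI helpers stop taking a struct irq_cfg and instead receive the struct mp_chip_data reached through irq_data->chip_data, which now owns the irq_2_pin list. A minimal user-space model of that shape follows; all type and function names are hypothetical stand-ins, not kernel API.

    #include <stdio.h>

    /* Toy stand-ins: the (apic, pin) routing list hangs off per-IRQ
     * chip data rather than off the vector-allocation state. */
    struct pin_entry {
        int apic, pin;
        struct pin_entry *next;
    };

    struct chip_data {            /* models struct mp_chip_data */
        struct pin_entry *pins;   /* models irq_2_pin           */
    };

    struct irq_desc {             /* models struct irq_data     */
        void *chip_data;
    };

    /* Mirrors the new unmask path: fetch the chip data from the
     * irq, then touch every IOAPIC pin routed to it. */
    static void unmask_irq(struct irq_desc *d)
    {
        struct chip_data *data = d->chip_data;
        for (struct pin_entry *e = data->pins; e; e = e->next)
            printf("unmask IOAPIC %d pin %d\n", e->apic, e->pin);
    }

    int main(void)
    {
        struct pin_entry p = { .apic = 0, .pin = 9, .next = 0 };
        struct chip_data cd = { .pins = &p };
        struct irq_desc irq = { .chip_data = &cd };
        unmask_irq(&irq);
        return 0;
    }

End of aside; the hunk continues with the new signature below.]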
+static void __unmask_ioapic(struct mp_chip_data *data) { - io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); + io_apic_modify_irq(data, ~IO_APIC_REDIR_MASKED, 0, NULL); } -static void unmask_ioapic(struct irq_cfg *cfg) +static void unmask_ioapic_irq(struct irq_data *irq_data) { + struct mp_chip_data *data = irq_data->chip_data; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - __unmask_ioapic(cfg); + __unmask_ioapic(data); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void unmask_ioapic_irq(struct irq_data *data) -{ - unmask_ioapic(irqd_cfg(data)); -} - /* * IO-APIC versions below 0x20 don't support EOI register. * For the record, here is the information about various versions: @@ -551,13 +541,13 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector) } } -void eoi_ioapic_pin(int vector, struct irq_cfg *cfg) +void eoi_ioapic_pin(int vector, struct mp_chip_data *data) { unsigned long flags; struct irq_pin_list *entry; raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, cfg->irq_2_pin) + for_each_irq_pin(entry, data->irq_2_pin) __eoi_ioapic_pin(entry->apic, entry->pin, vector); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -1068,11 +1058,10 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, * entry. The IOAPIC entry */ if (irq_data && irq_data->parent_data) { - struct irq_cfg *cfg = irqd_cfg(irq_data); - if (!mp_check_pin_attr(irq, info)) return -EBUSY; - if (__add_pin_to_irq_node(cfg, node, ioapic, info->ioapic_pin)) + if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic, + info->ioapic_pin)) return -ENOMEM; } else { irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true); @@ -1394,9 +1383,7 @@ static void __init print_IO_APIC(int ioapic_idx) void __init print_IO_APICs(void) { int ioapic_idx; - struct irq_cfg *cfg; unsigned int irq; - struct irq_chip *chip; printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for_each_ioapic(ioapic_idx) @@ -1416,18 +1403,20 @@ void __init print_IO_APICs(void) printk(KERN_DEBUG "IRQ to pin mappings:\n"); for_each_active_irq(irq) { struct irq_pin_list *entry; + struct irq_chip *chip; + struct mp_chip_data *data; chip = irq_get_chip(irq); if (chip != &ioapic_chip && chip != &ioapic_ir_chip) continue; - - cfg = irq_cfg(irq); - if (!cfg) + data = irq_get_chip_data(irq); + if (!data) continue; - if (list_empty(&cfg->irq_2_pin)) + if (list_empty(&data->irq_2_pin)) continue; + printk(KERN_DEBUG "IRQ%d ", irq); - for_each_irq_pin(entry, cfg->irq_2_pin) + for_each_irq_pin(entry, data->irq_2_pin) pr_cont("-> %d:%d", entry->apic, entry->pin); pr_cont("\n"); } @@ -1740,7 +1729,7 @@ static unsigned int startup_ioapic_irq(struct irq_data *data) if (legacy_pic->irq_pending(irq)) was_pending = 1; } - __unmask_ioapic(irqd_cfg(data)); + __unmask_ioapic(data->chip_data); raw_spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; @@ -1755,13 +1744,15 @@ static unsigned int startup_ioapic_irq(struct irq_data *data) * races. 
*/ -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) +static void __target_IO_APIC_irq(unsigned int irq, struct irq_cfg *cfg, + struct mp_chip_data *data) { int apic, pin; struct irq_pin_list *entry; u8 vector = cfg->vector; + unsigned int dest = SET_APIC_LOGICAL_ID(cfg->dest_apicid); - for_each_irq_pin(entry, cfg->irq_2_pin) { + for_each_irq_pin(entry, data->irq_2_pin) { unsigned int reg; apic = entry->apic; @@ -1778,13 +1769,13 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq atomic_t irq_mis_count; #ifdef CONFIG_GENERIC_PENDING_IRQ -static bool io_apic_level_ack_pending(struct irq_cfg *cfg) +static bool io_apic_level_ack_pending(struct mp_chip_data *data) { struct irq_pin_list *entry; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, cfg->irq_2_pin) { + for_each_irq_pin(entry, data->irq_2_pin) { unsigned int reg; int pin; @@ -1801,18 +1792,17 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) return false; } -static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) +static inline bool ioapic_irqd_mask(struct irq_data *data) { /* If we are moving the irq we need to mask it */ if (unlikely(irqd_is_setaffinity_pending(data))) { - mask_ioapic(cfg); + mask_ioapic_irq(data); return true; } return false; } -static inline void ioapic_irqd_unmask(struct irq_data *data, - struct irq_cfg *cfg, bool masked) +static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) { if (unlikely(masked)) { /* Only migrate the irq if the ack has been received. @@ -1841,31 +1831,30 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, * accurate and is causing problems then it is a hardware bug * and you can go talk to the chipset vendor about it. */ - if (!io_apic_level_ack_pending(cfg)) + if (!io_apic_level_ack_pending(data->chip_data)) irq_move_masked_irq(data); - unmask_ioapic(cfg); + unmask_ioapic_irq(data); } } #else -static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) +static inline bool ioapic_irqd_mask(struct irq_data *data) { return false; } -static inline void ioapic_irqd_unmask(struct irq_data *data, - struct irq_cfg *cfg, bool masked) +static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) { } #endif -static void ioapic_ack_level(struct irq_data *data) +static void ioapic_ack_level(struct irq_data *irq_data) { - struct irq_cfg *cfg = irqd_cfg(data); + struct irq_cfg *cfg = irqd_cfg(irq_data); unsigned long v; bool masked; int i; irq_complete_move(cfg); - masked = ioapic_irqd_mask(data, cfg); + masked = ioapic_irqd_mask(irq_data); /* * It appears there is an erratum which affects at least version 0x11 @@ -1917,10 +1906,10 @@ static void ioapic_ack_level(struct irq_data *data) */ if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); - eoi_ioapic_pin(cfg->vector, cfg); + eoi_ioapic_pin(cfg->vector, irq_data->chip_data); } - ioapic_irqd_unmask(data, cfg, masked); + ioapic_irqd_unmask(irq_data, masked); } static void ioapic_ir_ack_level(struct irq_data *irq_data) @@ -1934,7 +1923,7 @@ static void ioapic_ir_ack_level(struct irq_data *irq_data) * EOI we use the pin number. 
*/ ack_APIC_irq(); - eoi_ioapic_pin(data->entry.vector, irqd_cfg(irq_data)); + eoi_ioapic_pin(data->entry.vector, data); } static int ioapic_set_affinity(struct irq_data *irq_data, @@ -1942,7 +1931,6 @@ static int ioapic_set_affinity(struct irq_data *irq_data, { struct irq_data *parent = irq_data->parent_data; struct mp_chip_data *data = irq_data->chip_data; - unsigned int dest, irq = irq_data->irq; struct irq_cfg *cfg; unsigned long flags; int ret; @@ -1953,9 +1941,7 @@ static int ioapic_set_affinity(struct irq_data *irq_data, cfg = irqd_cfg(irq_data); data->entry.dest = cfg->dest_apicid; data->entry.vector = cfg->vector; - /* Only the high 8 bits are valid. */ - dest = SET_APIC_LOGICAL_ID(cfg->dest_apicid); - __target_IO_APIC_irq(irq, dest, cfg); + __target_IO_APIC_irq(irq_data->irq, cfg, irq_data->chip_data); } raw_spin_unlock_irqrestore(&ioapic_lock, flags); @@ -2116,10 +2102,11 @@ early_param("disable_timer_pin_1", disable_timer_pin_setup); static int mp_alloc_timer_irq(int ioapic, int pin) { int irq = -1; - struct irq_alloc_info info; struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); if (domain) { + struct irq_alloc_info info; + ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0); info.ioapic_id = mpc_ioapic_id(ioapic); info.ioapic_pin = pin; @@ -2141,7 +2128,9 @@ static int mp_alloc_timer_irq(int ioapic, int pin) */ static inline void __init check_timer(void) { - struct irq_cfg *cfg = irq_cfg(0); + struct irq_data *irq_data = irq_get_irq_data(0); + struct mp_chip_data *data = irq_data->chip_data; + struct irq_cfg *cfg = irqd_cfg(irq_data); int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; unsigned long flags; @@ -2205,9 +2194,9 @@ static inline void __init check_timer(void) int idx; idx = find_irq_entry(apic1, pin1, mp_INT); if (idx != -1 && irq_trigger(idx)) - unmask_ioapic(cfg); + unmask_ioapic_irq(irq_get_chip_data(0)); } - irq_domain_activate_irq(irq_get_irq_data(0)); + irq_domain_activate_irq(irq_data); if (timer_irq_works()) { if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); @@ -2227,8 +2216,8 @@ static inline void __init check_timer(void) /* * legacy devices should be connected to IO APIC #0 */ - replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); - irq_domain_activate_irq(irq_get_irq_data(0)); + replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2); + irq_domain_activate_irq(irq_data); legacy_pic->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); @@ -3044,6 +3033,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, return ret; } + INIT_LIST_HEAD(&data->irq_2_pin); irq_data->hwirq = info->ioapic_pin; irq_data->chip = (domain->parent == x86_vector_domain) ? 
&ioapic_chip : &ioapic_ir_chip; @@ -3051,7 +3041,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info); cfg = irqd_cfg(irq_data); - add_pin_to_irq_node(cfg, info->ioapic_node, ioapic, pin); + add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin); if (info->ioapic_entry) mp_setup_entry(cfg, data, info->ioapic_entry); mp_register_handler(virq, data->trigger); @@ -3069,15 +3059,16 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { - struct irq_cfg *cfg = irq_cfg(virq); struct irq_data *irq_data; + struct mp_chip_data *data; BUG_ON(nr_irqs != 1); irq_data = irq_domain_get_irq_data(domain, virq); if (irq_data && irq_data->chip_data) { - __remove_pin_from_irq(cfg, mp_irqdomain_ioapic_idx(domain), + data = irq_data->chip_data; + __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain), (int)irq_data->hwirq); - WARN_ON(!list_empty(&cfg->irq_2_pin)); + WARN_ON(!list_empty(&data->irq_2_pin)); kfree(irq_data->chip_data); } irq_domain_free_irqs_top(domain, virq, nr_irqs); @@ -3089,10 +3080,9 @@ void mp_irqdomain_activate(struct irq_domain *domain, unsigned long flags; struct irq_pin_list *entry; struct mp_chip_data *data = irq_data->chip_data; - struct irq_cfg *cfg = irqd_cfg(irq_data); raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, cfg->irq_2_pin) + for_each_irq_pin(entry, data->irq_2_pin) __ioapic_write_entry(entry->apic, entry->pin, data->entry); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 37bb9e82b919..af224e6774d8 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -68,9 +68,6 @@ static struct irq_cfg *alloc_irq_cfg(int node) goto out_cfg; if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node)) goto out_domain; -#ifdef CONFIG_X86_IO_APIC - INIT_LIST_HEAD(&cfg->irq_2_pin); -#endif return cfg; out_domain: free_cpumask_var(cfg->domain); -- cgit v1.2.3 From 50a6ad84b2a2c971e76d57884d61a5a55d7c1601 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:54 +0800 Subject: x86/irq: Remove struct io_apic_irq_attr Now there's no user of struct io_apic_irq_attr anymore, so remove it. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-18-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 7 ------- arch/x86/kernel/apic/io_apic.c | 10 ---------- 2 files changed, 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index fa4b25ebd658..4eb4bcc5f219 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -157,13 +157,6 @@ extern int restore_ioapic_entries(void); extern void setup_ioapic_ids_from_mpc(void); extern void setup_ioapic_ids_from_mpc_nocheck(void); -struct io_apic_irq_attr { - int ioapic; - int ioapic_pin; - int trigger; - int polarity; -}; - enum ioapic_domain_type { IOAPIC_DOMAIN_INVALID, IOAPIC_DOMAIN_LEGACY, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index a1abdcf2cb5f..76dc9f5bfdbc 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2959,16 +2959,6 @@ int mp_ioapic_registered(u32 gsi_base) return 0; } -static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, - int ioapic, int ioapic_pin, - int trigger, int polarity) -{ - irq_attr->ioapic = ioapic; - irq_attr->ioapic_pin = ioapic_pin; - irq_attr->trigger = trigger; - irq_attr->polarity = polarity; -} - static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, struct irq_alloc_info *info) { -- cgit v1.2.3 From 9a93d4736ec5ec322ec8f240a292c1a86cd0876d Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:55 +0800 Subject: x86/irq: Remove x86_io_apic_ops.write and x86_io_apic_ops.modify x86_io_apic_ops.write is always set to native_io_apic_write(), and nobody overrides it. So get rid of the indirection by renaming native_io_apic_write() to io_apic_write() and removing x86_io_apic_ops.write. Do the same for x86_io_apic_ops.modify and native_io_apic_modify(). Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J.
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Yijing Wang Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-19-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 13 ------------- arch/x86/include/asm/x86_init.h | 2 -- arch/x86/kernel/apic/io_apic.c | 6 ++++-- arch/x86/kernel/x86_init.c | 2 -- 4 files changed, 4 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 4eb4bcc5f219..c6486dd44418 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -208,8 +208,6 @@ extern void disable_ioapic_support(void); extern void __init native_io_apic_init_mappings(void); extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg); -extern void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int val); -extern void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val); extern void native_disable_io_apic(void); static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) @@ -217,15 +215,6 @@ static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) return x86_io_apic_ops.read(apic, reg); } -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -{ - x86_io_apic_ops.write(apic, reg, value); -} -static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) -{ - x86_io_apic_ops.modify(apic, reg, value); -} - extern void setup_IO_APIC(void); extern void enable_IO_APIC(void); extern void disable_IO_APIC(void); @@ -266,8 +255,6 @@ static inline void mp_save_irq(struct mpc_intsrc *m) { }; static inline void disable_ioapic_support(void) { } #define native_io_apic_init_mappings NULL #define native_io_apic_read NULL -#define native_io_apic_write NULL -#define native_io_apic_modify NULL #define native_disable_io_apic NULL static inline void setup_IO_APIC(void) { } diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 09d4dab9302f..844b37d55a44 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -182,8 +182,6 @@ struct x86_msi_ops { struct x86_io_apic_ops { void (*init) (void); unsigned int (*read) (unsigned int apic, unsigned int reg); - void (*write) (unsigned int apic, unsigned int reg, unsigned int value); - void (*modify) (unsigned int apic, unsigned int reg, unsigned int value); void (*disable)(void); }; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 76dc9f5bfdbc..d687a10ed3a2 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -280,7 +280,8 @@ unsigned int native_io_apic_read(unsigned int apic, unsigned int reg) return readl(&io_apic->data); } -void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +static void io_apic_write(unsigned int apic, unsigned int reg, + unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); @@ -294,7 +295,8 @@ void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int valu * * Older SiS APIC requires we rewrite the index register */ -void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +static void io_apic_modify(unsigned int apic, unsigned int reg, + unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); diff --git a/arch/x86/kernel/x86_init.c 
b/arch/x86/kernel/x86_init.c index f612dc018fb6..633f07845099 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -141,7 +141,5 @@ void arch_restore_msi_irqs(struct pci_dev *dev) struct x86_io_apic_ops x86_io_apic_ops = { .init = native_io_apic_init_mappings, .read = native_io_apic_read, - .write = native_io_apic_write, - .modify = native_io_apic_modify, .disable = native_disable_io_apic, }; -- cgit v1.2.3 From ca1b88622e9c16df7b1e0a57e9c6c2300321bed4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Apr 2015 13:57:48 +0200 Subject: x86: Remove more unmodified io_apic_ops io_apic_ops.init() is either NULL, if IO-APIC support is disabled at compile time or native_io_apic_init_mappings(). No point to have that as we can achieve the same thing with an empty inline. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 6 +++--- arch/x86/include/asm/x86_init.h | 1 - arch/x86/kernel/apic/io_apic.c | 2 +- arch/x86/kernel/setup.c | 3 +-- arch/x86/kernel/x86_init.c | 1 - 5 files changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index c6486dd44418..f80f4efa1717 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -206,7 +206,7 @@ extern void mp_save_irq(struct mpc_intsrc *m); extern void disable_ioapic_support(void); -extern void __init native_io_apic_init_mappings(void); +extern void __init io_apic_init_mappings(void); extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg); extern void native_disable_io_apic(void); @@ -251,9 +251,9 @@ static inline int restore_ioapic_entries(void) return -ENOMEM; } -static inline void mp_save_irq(struct mpc_intsrc *m) { }; +static inline void mp_save_irq(struct mpc_intsrc *m) { } static inline void disable_ioapic_support(void) { } -#define native_io_apic_init_mappings NULL +static inline void io_apic_init_mappings(void) { } #define native_io_apic_read NULL #define native_disable_io_apic NULL diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 844b37d55a44..48d34d28f5a6 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -180,7 +180,6 @@ struct x86_msi_ops { }; struct x86_io_apic_ops { - void (*init) (void); unsigned int (*read) (unsigned int apic, unsigned int reg); void (*disable)(void); }; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d687a10ed3a2..3029502b0a50 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2687,7 +2687,7 @@ static struct resource * __init ioapic_setup_resources(void) return res; } -void __init native_io_apic_init_mappings(void) +void __init io_apic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; struct resource *ioapic_res; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d74ac33290ae..8d04a7594a03 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1222,8 +1222,7 @@ void __init setup_arch(char **cmdline_p) init_cpu_to_node(); init_apic_mappings(); - if (x86_io_apic_ops.init) - x86_io_apic_ops.init(); + io_apic_init_mappings(); kvm_guest_init(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 633f07845099..3cee10abf01d 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -139,7 +139,6 @@ void arch_restore_msi_irqs(struct pci_dev *dev) #endif struct x86_io_apic_ops x86_io_apic_ops = { - .init = 
native_io_apic_init_mappings, .read = native_io_apic_read, .disable = native_disable_io_apic, }; -- cgit v1.2.3 From 154d9e50e413ee144d48ccd6c402633ffbecbfff Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:56 +0800 Subject: x86/irq: Clean up io_apic.h Clean up io_apic.h by: 1) moving the definition of struct mp_ioapic_gsi into io_apic.c 2) making mp_pin_to_gsi() and mp_ioapic_gsi_routing() static 3) removing unused MP_MAX_IOAPIC_PIN 4) removing useless forward declaration 5) removing useless comments Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-20-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 13 ++----------- arch/x86/kernel/apic/io_apic.c | 23 ++++++++--------------- 2 files changed, 10 insertions(+), 26 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index f80f4efa1717..9b9050129ee7 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -113,9 +113,6 @@ extern int nr_ioapics; extern int mpc_ioapic_id(int ioapic); extern unsigned int mpc_ioapic_addr(int ioapic); -extern struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic); - -#define MP_MAX_IOAPIC_PIN 127 /* # of MP IRQ source entries */ extern int mp_irq_entries; @@ -135,6 +132,8 @@ extern int noioapicquirk; /* -1 if "noapic" boot option passed */ extern int noioapicreroute; +extern u32 gsi_top; + extern unsigned long io_apic_irqs; #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1 << (x)) & io_apic_irqs)) @@ -174,15 +173,8 @@ struct ioapic_domain_cfg { struct device_node *dev; }; -struct mp_ioapic_gsi{ - u32 gsi_base; - u32 gsi_end; -}; -extern u32 gsi_top; - extern int mp_find_ioapic(u32 gsi); extern int mp_find_ioapic_pin(int ioapic, u32 gsi); -extern u32 mp_pin_to_gsi(int ioapic, int pin); extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info); extern void mp_unmap_irq(int irq); @@ -231,7 +223,6 @@ static inline int arch_early_ioapic_init(void) { return 0; } static inline void print_IO_APICs(void) {} #define gsi_top (NR_IRQS_LEGACY) static inline int mp_find_ioapic(u32 gsi) { return 0; } -static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return UINT_MAX; } static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info) { diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3029502b0a50..4c7da8483398 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -63,7 +63,6 @@ #define for_each_ioapic_pin(idx, pin) \ for_each_ioapic((idx)) \ for_each_pin((idx), (pin)) - #define for_each_irq_pin(entry, head) \ list_for_each_entry(entry, &head, list) @@ -92,6 +91,11 @@ struct mp_chip_data { bool isa_irq; }; +struct mp_ioapic_gsi { + u32 gsi_base; + u32 gsi_end; +}; + static struct ioapic { /* * # of IRQ routing registers */ @@ -122,7 +126,7 @@ unsigned int mpc_ioapic_addr(int ioapic_idx) return ioapics[ioapic_idx].mp_config.apicaddr; } -struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx) +static inline struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx) { return
&ioapics[ioapic_idx].gsi_config; } @@ -134,7 +138,7 @@ static inline int mp_ioapic_pin_count(int ioapic) return gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1; } -u32 mp_pin_to_gsi(int ioapic, int pin) +static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin; } @@ -1153,8 +1157,7 @@ static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags) return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, NULL); } -int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, - struct irq_alloc_info *info) +int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info) { int ioapic, pin, idx; @@ -1719,7 +1722,6 @@ static int __init timer_irq_works(void) * This is not complete - we should be able to fake * an edge even if it isn't on the 8259A... */ - static unsigned int startup_ioapic_irq(struct irq_data *data) { int was_pending = 0, irq = data->irq; @@ -1737,15 +1739,6 @@ static unsigned int startup_ioapic_irq(struct irq_data *data) return was_pending; } -/* - * Level and edge triggered IO-APIC interrupts need different handling, - * so we use two separate IRQ descriptors. Edge triggered IRQs can be - * handled with the level-triggered descriptor, but that one has slightly - * more overhead. Level-triggered interrupts cannot be handled with the - * edge-triggered handler, without risking IRQ storms and other ugly - * races. - */ - static void __target_IO_APIC_irq(unsigned int irq, struct irq_cfg *cfg, struct mp_chip_data *data) { -- cgit v1.2.3 From 0be275e3a5607b23f5132121bca22a10ee23aa99 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:57 +0800 Subject: x86/irq: Use cached IOAPIC entry instead of reading from hardware Use cached IOAPIC entry instead of reading data from IOAPIC hardware registers to improve performance. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-21-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 78 ++++++++++++------------------------------ 1 file changed, 21 insertions(+), 57 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4c7da8483398..4fb347f01653 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -67,8 +67,13 @@ list_for_each_entry(entry, &head, list) /* - * Is the SiS APIC rmw bug present ? + * Is the SiS APIC rmw bug present ? * -1 = don't know, 0 = no, 1 = yes + * When doing a read-modify-write operation on IOAPIC registers, older SiS APIC + * requires we rewrite the index register again where the read already set up + * the index register. + * The code to make use of sis_apic_bug has been removed, but we don't want to + * lose this knowledge. */ int sis_apic_bug = -1; @@ -293,22 +298,6 @@ static void io_apic_write(unsigned int apic, unsigned int reg, writel(value, &io_apic->data); } -/* - * Re-write a value: to be used for read-modify-write - * cycles where the read already set up the index register. 
- * - * Older SiS APIC requires we rewrite the index register - */ -static void io_apic_modify(unsigned int apic, unsigned int reg, - unsigned int value) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - - if (sis_apic_bug) - writel(reg, &io_apic->index); - writel(value, &io_apic->data); -} - union entry_union { struct { u32 w1, w2; }; struct IO_APIC_route_entry entry; @@ -445,29 +434,23 @@ static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node, add_pin_to_irq_node(data, node, newapic, newpin); } -static void __io_apic_modify_irq(struct irq_pin_list *entry, - int mask_and, int mask_or, - void (*final)(struct irq_pin_list *entry)) -{ - unsigned int reg, pin; - - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin * 2); - reg &= mask_and; - reg |= mask_or; - io_apic_modify(entry->apic, 0x10 + pin * 2, reg); - if (final) - final(entry); -} - static void io_apic_modify_irq(struct mp_chip_data *data, int mask_and, int mask_or, void (*final)(struct irq_pin_list *entry)) { + union entry_union eu; struct irq_pin_list *entry; - for_each_irq_pin(entry, data->irq_2_pin) - __io_apic_modify_irq(entry, mask_and, mask_or, final); + eu.entry = data->entry; + eu.w1 &= mask_and; + eu.w1 |= mask_or; + data->entry = eu.entry; + + for_each_irq_pin(entry, data->irq_2_pin) { + io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1); + if (final) + final(entry); + } } static void io_apic_sync(struct irq_pin_list *entry) @@ -1739,28 +1722,6 @@ static unsigned int startup_ioapic_irq(struct irq_data *data) return was_pending; } -static void __target_IO_APIC_irq(unsigned int irq, struct irq_cfg *cfg, - struct mp_chip_data *data) -{ - int apic, pin; - struct irq_pin_list *entry; - u8 vector = cfg->vector; - unsigned int dest = SET_APIC_LOGICAL_ID(cfg->dest_apicid); - - for_each_irq_pin(entry, data->irq_2_pin) { - unsigned int reg; - - apic = entry->apic; - pin = entry->pin; - - io_apic_write(apic, 0x11 + pin*2, dest); - reg = io_apic_read(apic, 0x10 + pin*2); - reg &= ~IO_APIC_REDIR_VECTOR_MASK; - reg |= vector; - io_apic_modify(apic, 0x10 + pin*2, reg); - } -} - atomic_t irq_mis_count; #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -1926,6 +1887,7 @@ static int ioapic_set_affinity(struct irq_data *irq_data, { struct irq_data *parent = irq_data->parent_data; struct mp_chip_data *data = irq_data->chip_data; + struct irq_pin_list *entry; struct irq_cfg *cfg; unsigned long flags; int ret; @@ -1936,7 +1898,9 @@ static int ioapic_set_affinity(struct irq_data *irq_data, cfg = irqd_cfg(irq_data); data->entry.dest = cfg->dest_apicid; data->entry.vector = cfg->vector; - __target_IO_APIC_irq(irq_data->irq, cfg, irq_data->chip_data); + for_each_irq_pin(entry, data->irq_2_pin) + __ioapic_write_entry(entry->apic, entry->pin, + data->entry); } raw_spin_unlock_irqrestore(&ioapic_lock, flags); -- cgit v1.2.3 From 1f934641294ca2e09016c689862378fbb15da4d4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 10:29:58 +0800 Subject: x86/irq: Remove sis apic bug workaround The SiS apic bug workaround is now obsolete as we cache the register values for performance reasons. Signed-off-by: Thomas Gleixner Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-22-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 3 --- arch/x86/kernel/apic/io_apic.c | 35 ++++++++++------------------------- drivers/pci/quirks.c | 7 ------- 3 files changed, 10 insertions(+), 35 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 9b9050129ee7..d58f1c6b5608 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -120,9 +120,6 @@ extern int mp_irq_entries; /* MP IRQ source entries */ extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; -/* Older SiS APIC requires we rewrite the index register */ -extern int sis_apic_bug; - /* 1 if "noapic" boot option passed */ extern int skip_ioapic_setup; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4fb347f01653..9806f9605bc4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -18,6 +18,16 @@ * and Rolf G. Tews * for testing these extensively * Paul Diefenbaugh : Added full ACPI support + * + * Historical information which is worth to be preserved: + * + * - SiS APIC rmw bug: + * + * We used to have a workaround for a bug in SiS chips which + * required to rewrite the index register for a read-modify-write + * operation as the chip lost the index information which was + * setup for the read already. We cache the data now, so that + * workaround has been removed. */ #include @@ -66,17 +76,6 @@ #define for_each_irq_pin(entry, head) \ list_for_each_entry(entry, &head, list) -/* - * Is the SiS APIC rmw bug present ? - * -1 = don't know, 0 = no, 1 = yes - * When doing a read-modify-write operation on IOAPIC registers, older SiS APIC - * requires we rewrite the index register again where the read already set up - * the index register. - * The code to make use of sis_apic_bug has been removed, but we don't want to - * lose this knowledge. - */ -int sis_apic_bug = -1; - static DEFINE_RAW_SPINLOCK(ioapic_lock); static DEFINE_MUTEX(ioapic_mutex); static unsigned int ioapic_dynirq_base; @@ -2320,20 +2319,6 @@ void __init setup_IO_APIC(void) ioapic_initialized = 1; } -/* - * Called after all the initialization is done. If we didn't find any - * APIC bugs then we can allow the modify fast path - */ - -static int __init io_apic_bug_finalize(void) -{ - if (sis_apic_bug == -1) - sis_apic_bug = 0; - return 0; -} - -late_initcall(io_apic_bug_finalize); - static void resume_ioapic_id(int ioapic_idx) { unsigned long flags; diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index c6dc1dfd25d5..2890ad7cf7c6 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -819,13 +819,6 @@ static void quirk_amd_ioapic(struct pci_dev *dev) } } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_VIPER_7410, quirk_amd_ioapic); - -static void quirk_ioapic_rmw(struct pci_dev *dev) -{ - if (dev->devfn == 0 && dev->bus->number == 0) - sis_apic_bug = 1; -} -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_SI, PCI_ANY_ID, quirk_ioapic_rmw); #endif /* CONFIG_X86_IO_APIC */ /* -- cgit v1.2.3 From a2cbbb47fd90ef1161ce22b099de5c6095f8365f Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:29:59 +0800 Subject: x86/irq: Remove unused alloc_irq_and_cfg_at() There's no caller of alloc_irq_and_cfg_at() anymore, so remove it. 
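For readers skimming the diff below: the removed helper implemented the common genirq allocate-or-reuse idiom. A minimal annotated sketch of that idiom follows (same shape as the removed function, with descriptive comments added; the sketch_ prefix is illustrative, the genirq calls exist with these signatures):

  /* Allocate an irq descriptor at a fixed irq number, or reuse an existing one. */
  static struct irq_cfg *sketch_alloc_irq_and_cfg_at(unsigned int at, int node)
  {
  	int res = irq_alloc_desc_at(at, node);	/* negative errno on failure */
  	struct irq_cfg *cfg;

  	if (res < 0) {
  		if (res != -EEXIST)
  			return NULL;		/* hard allocation failure */
  		cfg = irq_cfg(at);		/* descriptor already exists: reuse its data */
  		if (cfg)
  			return cfg;
  	}

  	cfg = alloc_irq_cfg(node);		/* fresh per-irq chip data */
  	if (cfg)
  		irq_set_chip_data(at, cfg);	/* attach it to the descriptor */
  	else
  		irq_free_desc(at);		/* roll the descriptor back on failure */
  	return cfg;
  }
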
Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428978610-28986-23-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 1 - arch/x86/kernel/apic/vector.c | 21 --------------------- 2 files changed, 22 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index f000b58cbc0c..bb2c990b97ef 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -186,7 +186,6 @@ extern void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src); extern struct irq_cfg *irq_cfg(unsigned int irq); extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data); -extern struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node); extern void lock_vector_lock(void); extern void unlock_vector_lock(void); extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *); diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index af224e6774d8..51cd46bfc46e 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -76,27 +76,6 @@ out_cfg: return NULL; } -struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) -{ - int res = irq_alloc_desc_at(at, node); - struct irq_cfg *cfg; - - if (res < 0) { - if (res != -EEXIST) - return NULL; - cfg = irq_cfg(at); - if (cfg) - return cfg; - } - - cfg = alloc_irq_cfg(node); - if (cfg) - irq_set_chip_data(at, cfg); - else - irq_free_desc(at); - return cfg; -} - static void free_irq_cfg(struct irq_cfg *cfg) { if (cfg) { -- cgit v1.2.3 From f970510cc55e41d21ca30feb56873aaeb57ec18d Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:30:00 +0800 Subject: x86/irq: Make functions only used in vector.c static Function {assign|clear}_irq_vector() and apic_retrigger_irq() are only used in vector.c, so make them static. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428978610-28986-24-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 3 --- arch/x86/kernel/apic/vector.c | 7 ++++--- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index bb2c990b97ef..9a797682fef5 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -188,8 +188,6 @@ extern struct irq_cfg *irq_cfg(unsigned int irq); extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data); extern void lock_vector_lock(void); extern void unlock_vector_lock(void); -extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *); -extern void clear_irq_vector(int irq, struct irq_cfg *cfg); extern void setup_vector_irq(int cpu); #ifdef CONFIG_SMP extern void send_cleanup_vector(struct irq_cfg *); @@ -199,7 +197,6 @@ static inline void send_cleanup_vector(struct irq_cfg *c) { } static inline void irq_complete_move(struct irq_cfg *c) { } #endif -extern int apic_retrigger_irq(struct irq_data *data); extern void apic_ack_edge(struct irq_data *data); extern int apic_set_affinity(struct irq_data *data, const struct cpumask *mask, unsigned int *dest_id); diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 51cd46bfc46e..d52af4d805db 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -185,7 +185,8 @@ next: return err; } -int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +static int assign_irq_vector(int irq, struct irq_cfg *cfg, + const struct cpumask *mask) { int err; unsigned long flags; @@ -196,7 +197,7 @@ int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) return err; } -void clear_irq_vector(int irq, struct irq_cfg *cfg) +static void clear_irq_vector(int irq, struct irq_cfg *cfg) { int cpu, vector; unsigned long flags; @@ -441,7 +442,7 @@ void setup_vector_irq(int cpu) __setup_vector_irq(cpu); } -int apic_retrigger_irq(struct irq_data *data) +static int apic_retrigger_irq(struct irq_data *data) { struct irq_cfg *cfg = irqd_cfg(data); unsigned long flags; -- cgit v1.2.3 From 68f9f4404d74f859dc84973db8731b41a51d929a Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:30:01 +0800 Subject: x86/irq: Remove function apic_set_affinity() Now there's no user of apic_set_affinity(), so remove it. Also rename vector_set_affinity() to apic_set_affinity() for consistency. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428978610-28986-25-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 2 -- arch/x86/kernel/apic/vector.c | 40 +++------------------------------------- 2 files changed, 3 insertions(+), 39 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 9a797682fef5..727c62378a65 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -198,8 +198,6 @@ static inline void irq_complete_move(struct irq_cfg *c) { } #endif extern void apic_ack_edge(struct irq_data *data); -extern int apic_set_affinity(struct irq_data *data, const struct cpumask *mask, - unsigned int *dest_id); #else /* CONFIG_X86_LOCAL_APIC */ static inline void lock_vector_lock(void) {} static inline void unlock_vector_lock(void) {} diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index d52af4d805db..1aea62d60cf2 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -463,42 +463,8 @@ void apic_ack_edge(struct irq_data *data) ack_APIC_irq(); } -/* - * Either sets data->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and - * leaves data->affinity untouched. - */ -int apic_set_affinity(struct irq_data *data, const struct cpumask *mask, - unsigned int *dest_id) -{ - struct irq_cfg *cfg = irqd_cfg(data); - unsigned int irq = data->irq; - int err; - - if (!config_enabled(CONFIG_SMP)) - return -EPERM; - - if (!cpumask_intersects(mask, cpu_online_mask)) - return -EINVAL; - - err = assign_irq_vector(irq, cfg, mask); - if (err) - return err; - - err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id); - if (err) { - if (assign_irq_vector(irq, cfg, data->affinity)) - pr_err("Failed to recover vector for irq %d\n", irq); - return err; - } - - cpumask_copy(data->affinity, mask); - - return 0; -} - -static int vector_set_affinity(struct irq_data *irq_data, - const struct cpumask *dest, bool force) +static int apic_set_affinity(struct irq_data *irq_data, + const struct cpumask *dest, bool force) { struct irq_cfg *cfg = irq_data->chip_data; int err, irq = irq_data->irq; @@ -523,7 +489,7 @@ static int vector_set_affinity(struct irq_data *irq_data, static struct irq_chip lapic_controller = { .irq_ack = apic_ack_edge, - .irq_set_affinity = vector_set_affinity, + .irq_set_affinity = apic_set_affinity, .irq_retrigger = apic_retrigger_irq, }; -- cgit v1.2.3 From c6c2002b744215810c770dd73f45da954bcfa9d5 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:30:02 +0800 Subject: x86/irq: Move check of cfg->move_in_progress into send_cleanup_vector() Move check of cfg->move_in_progress into send_cleanup_vector() to prepare for simplifying struct irq_cfg. Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Greg Kroah-Hartman Cc: iommu@lists.linux-foundation.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1428978610-28986-26-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/vector.c | 10 ++++++++-- arch/x86/platform/uv/uv_irq.c | 3 +-- drivers/iommu/amd_iommu.c | 3 +-- drivers/iommu/intel_irq_remapping.c | 3 +-- 4 files changed, 11 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 1aea62d60cf2..0092a6e0d5ee 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -494,7 +494,7 @@ static struct irq_chip lapic_controller = { }; #ifdef CONFIG_SMP -void send_cleanup_vector(struct irq_cfg *cfg) +static void __send_cleanup_vector(struct irq_cfg *cfg) { cpumask_var_t cleanup_mask; @@ -512,6 +512,12 @@ static void __send_cleanup_vector(struct irq_cfg *cfg) cfg->move_in_progress = 0; } +void send_cleanup_vector(struct irq_cfg *cfg) +{ + if (cfg->move_in_progress) + __send_cleanup_vector(cfg); +} + asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; @@ -582,7 +588,7 @@ static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) me = smp_processor_id(); if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) - send_cleanup_vector(cfg); + __send_cleanup_vector(cfg); } void irq_complete_move(struct irq_cfg *cfg) diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 54af6e388a12..091b36ac44c4 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -63,8 +63,7 @@ uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, ret = parent->chip->irq_set_affinity(parent, mask, force); if (ret >= 0) { uv_program_mmr(cfg, data->chip_data); - if (cfg->move_in_progress) - send_cleanup_vector(cfg); + send_cleanup_vector(cfg); } return ret; diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index b5d903cb2804..cbe8c1f28a95 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -4327,8 +4327,7 @@ static int amd_ir_set_affinity(struct irq_data *data, * at the new destination. So, time to cleanup the previous * vector allocation. */ - if (cfg->move_in_progress) - send_cleanup_vector(cfg); + send_cleanup_vector(cfg); return IRQ_SET_MASK_OK_DONE; } diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index 34642d3f7fbd..14d95694fc1b 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -1003,8 +1003,7 @@ intel_ir_set_affinity(struct irq_data *data, const struct cpumask *mask, * at the new destination. So, time to cleanup the previous * vector allocation. */ - if (cfg->move_in_progress) - send_cleanup_vector(cfg); + send_cleanup_vector(cfg); return IRQ_SET_MASK_OK_DONE; } -- cgit v1.2.3 From 7f3262edcdf623296b514377d52911b115c7ab49 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:30:03 +0800 Subject: x86/irq: Move private data in struct irq_cfg into dedicated data structure Several fields in struct irq_cfg are private to vector.c, so move them into a dedicated data structure. This helps to hide implementation details. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J.
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428978610-28986-27-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1416901802-24211-35-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner Tested-by: Joerg Roedel --- arch/x86/include/asm/hw_irq.h | 3 - arch/x86/kernel/apic/vector.c | 221 +++++++++++++++++++++++------------------- 2 files changed, 119 insertions(+), 105 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 727c62378a65..3b8233a26348 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -171,11 +171,8 @@ enum { }; struct irq_cfg { - cpumask_var_t domain; - cpumask_var_t old_domain; unsigned int dest_apicid; u8 vector; - u8 move_in_progress : 1; }; extern struct irq_domain *x86_vector_domain; diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 0092a6e0d5ee..60047495041c 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -21,11 +21,18 @@ #include #include +struct apic_chip_data { + struct irq_cfg cfg; + cpumask_var_t domain; + cpumask_var_t old_domain; + u8 move_in_progress : 1; +}; + struct irq_domain *x86_vector_domain; static DEFINE_RAW_SPINLOCK(vector_lock); static struct irq_chip lapic_controller; #ifdef CONFIG_X86_IO_APIC -static struct irq_cfg *legacy_irq_cfgs[NR_IRQS_LEGACY]; +static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY]; #endif void lock_vector_lock(void) @@ -41,12 +48,7 @@ void unlock_vector_lock(void) raw_spin_unlock(&vector_lock); } -struct irq_cfg *irq_cfg(unsigned int irq) -{ - return irqd_cfg(irq_get_irq_data(irq)); -} - -struct irq_cfg *irqd_cfg(struct irq_data *irq_data) +static struct apic_chip_data *apic_chip_data(struct irq_data *irq_data) { if (!irq_data) return NULL; @@ -57,36 +59,48 @@ struct irq_cfg *irqd_cfg(struct irq_data *irq_data) return irq_data->chip_data; } -static struct irq_cfg *alloc_irq_cfg(int node) +struct irq_cfg *irqd_cfg(struct irq_data *irq_data) +{ + struct apic_chip_data *data = apic_chip_data(irq_data); + + return data ? 
&data->cfg : NULL; +} + +struct irq_cfg *irq_cfg(unsigned int irq) { - struct irq_cfg *cfg; + return irqd_cfg(irq_get_irq_data(irq)); +} - cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node); - if (!cfg) +static struct apic_chip_data *alloc_apic_chip_data(int node) +{ + struct apic_chip_data *data; + + data = kzalloc_node(sizeof(*data), GFP_KERNEL, node); + if (!data) return NULL; - if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node)) - goto out_cfg; - if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node)) + if (!zalloc_cpumask_var_node(&data->domain, GFP_KERNEL, node)) + goto out_data; + if (!zalloc_cpumask_var_node(&data->old_domain, GFP_KERNEL, node)) goto out_domain; - return cfg; + return data; out_domain: - free_cpumask_var(cfg->domain); -out_cfg: - kfree(cfg); + free_cpumask_var(data->domain); +out_data: + kfree(data); return NULL; } -static void free_irq_cfg(struct irq_cfg *cfg) +static void free_apic_chip_data(struct apic_chip_data *data) { - if (cfg) { - free_cpumask_var(cfg->domain); - free_cpumask_var(cfg->old_domain); - kfree(cfg); + if (data) { + free_cpumask_var(data->domain); + free_cpumask_var(data->old_domain); + kfree(data); } } -static int -__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +static int __assign_irq_vector(int irq, struct apic_chip_data *d, + const struct cpumask *mask) { /* * NOTE! The local APIC isn't very good at handling @@ -104,7 +118,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) int cpu, err; cpumask_var_t tmp_mask; - if (cfg->move_in_progress) + if (d->move_in_progress) return -EBUSY; if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) @@ -112,26 +126,26 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) /* Only try and allocate irqs on cpus that are present */ err = -ENOSPC; - cpumask_clear(cfg->old_domain); + cpumask_clear(d->old_domain); cpu = cpumask_first_and(mask, cpu_online_mask); while (cpu < nr_cpu_ids) { int new_cpu, vector, offset; apic->vector_allocation_domain(cpu, tmp_mask, mask); - if (cpumask_subset(tmp_mask, cfg->domain)) { + if (cpumask_subset(tmp_mask, d->domain)) { err = 0; - if (cpumask_equal(tmp_mask, cfg->domain)) + if (cpumask_equal(tmp_mask, d->domain)) break; /* * New cpumask using the vector is a proper subset of * the current in use mask. So cleanup the vector * allocation for the members that are not used anymore. */ - cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask); - cfg->move_in_progress = - cpumask_intersects(cfg->old_domain, cpu_online_mask); - cpumask_and(cfg->domain, cfg->domain, tmp_mask); + cpumask_andnot(d->old_domain, d->domain, tmp_mask); + d->move_in_progress = + cpumask_intersects(d->old_domain, cpu_online_mask); + cpumask_and(d->domain, d->domain, tmp_mask); break; } @@ -145,8 +159,8 @@ next: } if (unlikely(current_vector == vector)) { - cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask); - cpumask_andnot(tmp_mask, mask, cfg->old_domain); + cpumask_or(d->old_domain, d->old_domain, tmp_mask); + cpumask_andnot(tmp_mask, mask, d->old_domain); cpu = cpumask_first_and(tmp_mask, cpu_online_mask); continue; } @@ -162,15 +176,15 @@ next: /* Found one! 
*/ current_vector = vector; current_offset = offset; - if (cfg->vector) { - cpumask_copy(cfg->old_domain, cfg->domain); - cfg->move_in_progress = - cpumask_intersects(cfg->old_domain, cpu_online_mask); + if (d->cfg.vector) { + cpumask_copy(d->old_domain, d->domain); + d->move_in_progress = + cpumask_intersects(d->old_domain, cpu_online_mask); } for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) per_cpu(vector_irq, new_cpu)[vector] = irq; - cfg->vector = vector; - cpumask_copy(cfg->domain, tmp_mask); + d->cfg.vector = vector; + cpumask_copy(d->domain, tmp_mask); err = 0; break; } @@ -178,46 +192,46 @@ next: if (!err) { /* cache destination APIC IDs into cfg->dest_apicid */ - err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, - &cfg->dest_apicid); + err = apic->cpu_mask_to_apicid_and(mask, d->domain, + &d->cfg.dest_apicid); } return err; } -static int assign_irq_vector(int irq, struct irq_cfg *cfg, +static int assign_irq_vector(int irq, struct apic_chip_data *data, const struct cpumask *mask) { int err; unsigned long flags; raw_spin_lock_irqsave(&vector_lock, flags); - err = __assign_irq_vector(irq, cfg, mask); + err = __assign_irq_vector(irq, data, mask); raw_spin_unlock_irqrestore(&vector_lock, flags); return err; } -static void clear_irq_vector(int irq, struct irq_cfg *cfg) +static void clear_irq_vector(int irq, struct apic_chip_data *data) { int cpu, vector; unsigned long flags; raw_spin_lock_irqsave(&vector_lock, flags); - BUG_ON(!cfg->vector); + BUG_ON(!data->cfg.vector); - vector = cfg->vector; - for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) + vector = data->cfg.vector; + for_each_cpu_and(cpu, data->domain, cpu_online_mask) per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; - cfg->vector = 0; - cpumask_clear(cfg->domain); + data->cfg.vector = 0; + cpumask_clear(data->domain); - if (likely(!cfg->move_in_progress)) { + if (likely(!data->move_in_progress)) { raw_spin_unlock_irqrestore(&vector_lock, flags); return; } - for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { + for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (per_cpu(vector_irq, cpu)[vector] != irq) @@ -226,7 +240,7 @@ static void clear_irq_vector(int irq, struct irq_cfg *cfg) break; } } - cfg->move_in_progress = 0; + data->move_in_progress = 0; raw_spin_unlock_irqrestore(&vector_lock, flags); } @@ -261,10 +275,10 @@ static void x86_vector_free_irqs(struct irq_domain *domain, irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i); if (irq_data && irq_data->chip_data) { clear_irq_vector(virq + i, irq_data->chip_data); - free_irq_cfg(irq_data->chip_data); + free_apic_chip_data(irq_data->chip_data); #ifdef CONFIG_X86_IO_APIC if (virq + i < nr_legacy_irqs()) - legacy_irq_cfgs[virq + i] = NULL; + legacy_irq_data[virq + i] = NULL; #endif irq_domain_reset_irq_data(irq_data); } @@ -275,9 +289,9 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { struct irq_alloc_info *info = arg; + struct apic_chip_data *data; const struct cpumask *mask; struct irq_data *irq_data; - struct irq_cfg *cfg; int i, err; if (disable_apic) @@ -292,20 +306,20 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, irq_data = irq_domain_get_irq_data(domain, virq + i); BUG_ON(!irq_data); #ifdef CONFIG_X86_IO_APIC - if (virq + i < nr_legacy_irqs() && legacy_irq_cfgs[virq + i]) - cfg = legacy_irq_cfgs[virq + i]; + if (virq + i < nr_legacy_irqs() && 
legacy_irq_data[virq + i]) + data = legacy_irq_data[virq + i]; else #endif - cfg = alloc_irq_cfg(irq_data->node); - if (!cfg) { + data = alloc_apic_chip_data(irq_data->node); + if (!data) { err = -ENOMEM; goto error; } irq_data->chip = &lapic_controller; - irq_data->chip_data = cfg; + irq_data->chip_data = data; irq_data->hwirq = virq + i; - err = assign_irq_vector(virq, cfg, mask); + err = assign_irq_vector(virq, data, mask); if (err) goto error; } @@ -349,22 +363,22 @@ int __init arch_probe_nr_irqs(void) static void init_legacy_irqs(void) { int i, node = cpu_to_node(0); - struct irq_cfg *cfg; + struct apic_chip_data *data; /* * For legacy IRQ's, start with assigning irq0 to irq15 to * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. */ for (i = 0; i < nr_legacy_irqs(); i++) { - cfg = legacy_irq_cfgs[i] = alloc_irq_cfg(node); - BUG_ON(!cfg); + data = legacy_irq_data[i] = alloc_apic_chip_data(node); + BUG_ON(!data); /* * For legacy IRQ's, start with assigning irq0 to irq15 to * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. */ - cfg->vector = IRQ0_VECTOR + i; - cpumask_setall(cfg->domain); - irq_set_chip_data(i, cfg); + data->cfg.vector = IRQ0_VECTOR + i; + cpumask_setall(data->domain); + irq_set_chip_data(i, data); } } #else @@ -390,7 +404,7 @@ static void __setup_vector_irq(int cpu) { /* Initialize vector_irq on a new cpu */ int irq, vector; - struct irq_cfg *cfg; + struct apic_chip_data *data; /* * vector_lock will make sure that we don't run into irq vector @@ -400,13 +414,13 @@ static void __setup_vector_irq(int cpu) raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ for_each_active_irq(irq) { - cfg = irq_cfg(irq); - if (!cfg) + data = apic_chip_data(irq_get_irq_data(irq)); + if (!data) continue; - if (!cpumask_test_cpu(cpu, cfg->domain)) + if (!cpumask_test_cpu(cpu, data->domain)) continue; - vector = cfg->vector; + vector = data->cfg.vector; per_cpu(vector_irq, cpu)[vector] = irq; } /* Mark the free vectors */ @@ -415,8 +429,8 @@ static void __setup_vector_irq(int cpu) if (irq <= VECTOR_UNDEFINED) continue; - cfg = irq_cfg(irq); - if (!cpumask_test_cpu(cpu, cfg->domain)) + data = apic_chip_data(irq_get_irq_data(irq)); + if (!cpumask_test_cpu(cpu, data->domain)) per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; } raw_spin_unlock(&vector_lock); @@ -442,15 +456,15 @@ void setup_vector_irq(int cpu) __setup_vector_irq(cpu); } -static int apic_retrigger_irq(struct irq_data *data) +static int apic_retrigger_irq(struct irq_data *irq_data) { - struct irq_cfg *cfg = irqd_cfg(data); + struct apic_chip_data *data = apic_chip_data(irq_data); unsigned long flags; int cpu; raw_spin_lock_irqsave(&vector_lock, flags); - cpu = cpumask_first_and(cfg->domain, cpu_online_mask); - apic->send_IPI_mask(cpumask_of(cpu), cfg->vector); + cpu = cpumask_first_and(data->domain, cpu_online_mask); + apic->send_IPI_mask(cpumask_of(cpu), data->cfg.vector); raw_spin_unlock_irqrestore(&vector_lock, flags); return 1; @@ -466,7 +480,7 @@ void apic_ack_edge(struct irq_data *data) static int apic_set_affinity(struct irq_data *irq_data, const struct cpumask *dest, bool force) { - struct irq_cfg *cfg = irq_data->chip_data; + struct apic_chip_data *data = irq_data->chip_data; int err, irq = irq_data->irq; if (!config_enabled(CONFIG_SMP)) @@ -475,11 +489,11 @@ static int apic_set_affinity(struct irq_data *irq_data, if (!cpumask_intersects(dest, cpu_online_mask)) return -EINVAL; - err = assign_irq_vector(irq, cfg, dest); + err = assign_irq_vector(irq, data, dest); if (err) { struct irq_data *top = irq_get_irq_data(irq); - if 
(assign_irq_vector(irq, cfg, top->affinity)) + if (assign_irq_vector(irq, data, top->affinity)) pr_err("Failed to recover vector for irq %d\n", irq); return err; } @@ -494,28 +508,31 @@ static struct irq_chip lapic_controller = { }; #ifdef CONFIG_SMP -static void __send_cleanup_vector(struct irq_cfg *cfg) +static void __send_cleanup_vector(struct apic_chip_data *data) { cpumask_var_t cleanup_mask; if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { unsigned int i; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + for_each_cpu_and(i, data->old_domain, cpu_online_mask) apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); } else { - cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + cpumask_and(cleanup_mask, data->old_domain, cpu_online_mask); apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); free_cpumask_var(cleanup_mask); } - cfg->move_in_progress = 0; + data->move_in_progress = 0; } void send_cleanup_vector(struct irq_cfg *cfg) { - if (cfg->move_in_progress) - __send_cleanup_vector(cfg); + struct apic_chip_data *data; + + data = container_of(cfg, struct apic_chip_data, cfg); + if (data->move_in_progress) + __send_cleanup_vector(data); } asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) @@ -531,7 +548,7 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) int irq; unsigned int irr; struct irq_desc *desc; - struct irq_cfg *cfg; + struct apic_chip_data *data; irq = __this_cpu_read(vector_irq[vector]); @@ -542,8 +559,8 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) if (!desc) continue; - cfg = irq_cfg(irq); - if (!cfg) + data = apic_chip_data(&desc->irq_data); + if (!data) continue; raw_spin_lock(&desc->lock); @@ -552,10 +569,11 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) * Check if the irq migration is in progress. If so, we * haven't received the cleanup request yet for this irq. */ - if (cfg->move_in_progress) + if (data->move_in_progress) goto unlock; - if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) + if (vector == data->cfg.vector && + cpumask_test_cpu(me, data->domain)) goto unlock; irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); @@ -581,14 +599,15 @@ unlock: static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) { unsigned me; + struct apic_chip_data *data; - if (likely(!cfg->move_in_progress)) + data = container_of(cfg, struct apic_chip_data, cfg); + if (likely(!data->move_in_progress)) return; me = smp_processor_id(); - - if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) - __send_cleanup_vector(cfg); + if (vector == data->cfg.vector && cpumask_test_cpu(me, data->domain)) + __send_cleanup_vector(data); } void irq_complete_move(struct irq_cfg *cfg) @@ -600,10 +619,8 @@ void irq_force_complete_move(int irq) { struct irq_cfg *cfg = irq_cfg(irq); - if (!cfg) - return; - - __irq_complete_move(cfg, cfg->vector); + if (cfg) + __irq_complete_move(cfg, cfg->vector); } #endif -- cgit v1.2.3 From 46176f39b1a6f457eae78999befbdf58e68555e7 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:30:05 +0800 Subject: x86/irq, ACPI: Remove private function mp_register_gsi()/ mp_unregister_gsi() Function mp_register_gsi() is only called once, so fold it into caller acpi_register_gsi_ioapic(). Do the same for mp_unregister_gsi(). 
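Schematically, after the fold the registration path carries the former helper body inline, as in this simplified sketch (the CONFIG_X86_IO_APIC guard and the SCI/mptable special case visible in the real diff below are elided):

  static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
  				    int trigger, int polarity)
  {
  	struct irq_alloc_info info;
  	int irq, node = dev ? dev_to_node(dev) : NUMA_NO_NODE;

  	/* normalize the ACPI attributes into the IOAPIC 0/1 encoding */
  	trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1;
  	polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1;
  	ioapic_set_alloc_attr(&info, node, trigger, polarity);

  	mutex_lock(&acpi_ioapic_lock);
  	/* this call is what the folded mp_register_gsi() wrapped */
  	irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info);
  	mutex_unlock(&acpi_ioapic_lock);

  	return irq;
  }
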
Signed-off-by: Jiang Liu Tested-by: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Len Brown Cc: Pavel Machek Link: http://lkml.kernel.org/r/1428978610-28986-29-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/acpi/boot.c | 57 ++++++++++++++------------------------------- 1 file changed, 18 insertions(+), 39 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 21e460b3b360..91a10120ed10 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -400,42 +400,6 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, return 0; } -static int mp_register_gsi(struct device *dev, u32 gsi, int trigger, - int polarity) -{ - int irq, node; - struct irq_alloc_info info; - - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) - return gsi; - - trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1; - polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1; - node = dev ? dev_to_node(dev) : NUMA_NO_NODE; - ioapic_set_alloc_attr(&info, node, trigger, polarity); - irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); - if (irq < 0) - return irq; - - /* Don't set up the ACPI SCI because it's already set up */ - if (enable_update_mptable && acpi_gbl_FADT.sci_interrupt != gsi) - mp_config_acpi_gsi(dev, gsi, trigger, polarity); - - return irq; -} - -static void mp_unregister_gsi(u32 gsi) -{ - int irq; - - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) - return; - - irq = mp_map_gsi_to_irq(gsi, 0, NULL); - if (irq > 0) - mp_unmap_irq(irq); -} - static struct irq_domain_ops acpi_irqdomain_ops = { .alloc = mp_irqdomain_alloc, .free = mp_irqdomain_free, @@ -662,10 +626,21 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, int trigger, int polarity) { int irq = gsi; - #ifdef CONFIG_X86_IO_APIC + int node; + struct irq_alloc_info info; + + node = dev ? dev_to_node(dev) : NUMA_NO_NODE; + trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1; + polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1; + ioapic_set_alloc_attr(&info, node, trigger, polarity); + mutex_lock(&acpi_ioapic_lock); - irq = mp_register_gsi(dev, gsi, trigger, polarity); + irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); + /* Don't set up the ACPI SCI because it's already set up */ + if (irq >= 0 && enable_update_mptable && + acpi_gbl_FADT.sci_interrupt != gsi) + mp_config_acpi_gsi(dev, gsi, trigger, polarity); mutex_unlock(&acpi_ioapic_lock); #endif @@ -675,8 +650,12 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, static void acpi_unregister_gsi_ioapic(u32 gsi) { #ifdef CONFIG_X86_IO_APIC + int irq; + mutex_lock(&acpi_ioapic_lock); - mp_unregister_gsi(gsi); + irq = mp_map_gsi_to_irq(gsi, 0, NULL); + if (irq > 0) + mp_unmap_irq(irq); mutex_unlock(&acpi_ioapic_lock); #endif } -- cgit v1.2.3 From 335efdf57da39d3949c3ef9338de5737e85cbe52 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 10:30:06 +0800 Subject: x86, ioapic: Use proper defines for the entry fields While looking at the printout issue, I stumbled more than once over the various 0/1 assignments which are either commented in strange ways or force one to look up the meaning. Use proper constants and fix the misleading comments.
While at it remove pointless 0 assignments in native_disable_io_apic() which have no value for understanding the code. Signed-off-by: Thomas Gleixner Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Cc: x86@kernel.org Link: http://lkml.kernel.org/r/1428978610-28986-30-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 16 +++++-- arch/x86/kernel/apic/io_apic.c | 100 ++++++++++++++++++++--------------------- 2 files changed, 63 insertions(+), 53 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index d58f1c6b5608..984afb732efe 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -98,9 +98,19 @@ struct IR_IO_APIC_route_entry { struct irq_alloc_info; struct irq_data; -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 + +#define IOAPIC_MASKED 1 +#define IOAPIC_UNMASKED 0 + +#define IOAPIC_POL_HIGH 0 +#define IOAPIC_POL_LOW 1 + +#define IOAPIC_DEST_MODE_PHYSICAL 0 +#define IOAPIC_DEST_MODE_LOGICAL 1 + #define IOAPIC_MAP_ALLOC 0x1 #define IOAPIC_MAP_CHECK 0x2 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9806f9605bc4..a63167f96126 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -356,7 +356,7 @@ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) static void ioapic_mask_entry(int apic, int pin) { unsigned long flags; - union entry_union eu = { .entry.mask = 1 }; + union entry_union eu = { .entry.mask = IOAPIC_MASKED }; raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0x10 + 2*pin, eu.w1); @@ -517,7 +517,7 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector) /* * Mask the entry and change the trigger mode to edge. */ - entry1.mask = 1; + entry1.mask = IOAPIC_MASKED; entry1.trigger = IOAPIC_EDGE; __ioapic_write_entry(apic, pin, entry1); @@ -553,8 +553,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) * Make sure the entry is masked and re-read the contents to check * if it is a level triggered pin and if the remote-IRR is set. */ - if (!entry.mask) { - entry.mask = 1; + if (entry.mask == IOAPIC_UNMASKED) { + entry.mask = IOAPIC_MASKED; ioapic_write_entry(apic, pin, entry); entry = ioapic_read_entry(apic, pin); } @@ -567,7 +567,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) * doesn't clear the remote-IRR if the trigger mode is not * set to level. */ - if (!entry.trigger) { + if (entry.trigger == IOAPIC_EDGE) { entry.trigger = IOAPIC_LEVEL; ioapic_write_entry(apic, pin, entry); } @@ -670,8 +670,8 @@ void mask_ioapic_entries(void) struct IO_APIC_route_entry entry; entry = ioapics[apic].saved_registers[pin]; - if (!entry.mask) { - entry.mask = 1; + if (entry.mask == IOAPIC_UNMASKED) { + entry.mask = IOAPIC_MASKED; ioapic_write_entry(apic, pin, entry); } } @@ -773,11 +773,11 @@ static int EISA_ELCR(unsigned int irq) #endif -/* ISA interrupts are always polarity zero edge triggered, +/* ISA interrupts are always active high edge triggered, * when listed as conforming in the MP table. 
*/ -#define default_ISA_trigger(idx) (0) -#define default_ISA_polarity(idx) (0) +#define default_ISA_trigger(idx) (IOAPIC_EDGE) +#define default_ISA_polarity(idx) (IOAPIC_POL_HIGH) /* EISA interrupts are always polarity zero and can be edge or level * trigger depending on the ELCR value. If an interrupt is listed as @@ -787,11 +787,11 @@ static int EISA_ELCR(unsigned int irq) #define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq)) #define default_EISA_polarity(idx) default_ISA_polarity(idx) -/* PCI interrupts are always polarity one level triggered, +/* PCI interrupts are always active low level triggered, * when listed as conforming in the MP table. */ -#define default_PCI_trigger(idx) (1) -#define default_PCI_polarity(idx) (1) +#define default_PCI_trigger(idx) (IOAPIC_LEVEL) +#define default_PCI_polarity(idx) (IOAPIC_POL_LOW) static int irq_polarity(int idx) { @@ -811,24 +811,24 @@ static int irq_polarity(int idx) break; case 1: /* high active */ { - polarity = 0; + polarity = IOAPIC_POL_HIGH; break; } case 2: /* reserved */ { pr_warn("broken BIOS!!\n"); - polarity = 1; + polarity = IOAPIC_POL_LOW; break; } case 3: /* low active */ { - polarity = 1; + polarity = IOAPIC_POL_LOW; break; } default: /* invalid */ { pr_warn("broken BIOS!!\n"); - polarity = 1; + polarity = IOAPIC_POL_LOW; break; } } @@ -870,7 +870,7 @@ static int irq_trigger(int idx) default: { pr_warn("broken BIOS!!\n"); - trigger = 1; + trigger = IOAPIC_LEVEL; break; } } @@ -878,24 +878,24 @@ static int irq_trigger(int idx) break; case 1: /* edge */ { - trigger = 0; + trigger = IOAPIC_EDGE; break; } case 2: /* reserved */ { pr_warn("broken BIOS!!\n"); - trigger = 1; + trigger = IOAPIC_LEVEL; break; } case 3: /* level */ { - trigger = 1; + trigger = IOAPIC_LEVEL; break; } default: /* invalid */ { pr_warn("broken BIOS!!\n"); - trigger = 0; + trigger = IOAPIC_EDGE; break; } } @@ -939,11 +939,11 @@ static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, dst->ioapic_polarity = polarity; } else { /* - * PCI interrupts are always polarity one level + * PCI interrupts are always active low level * triggered. */ - dst->ioapic_trigger = 1; - dst->ioapic_polarity = 1; + dst->ioapic_trigger = IOAPIC_LEVEL; + dst->ioapic_polarity = IOAPIC_POL_LOW; } } } @@ -1296,9 +1296,10 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) entry = ioapic_read_entry(apic, i); snprintf(buf, sizeof(buf), " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)", - i, entry.mask ? "disabled" : "enabled ", - entry.trigger ? "level" : "edge ", - entry.polarity ? "low " : "high", + i, + entry.mask == IOAPIC_MASKED ? "disabled" : "enabled ", + entry.trigger == IOAPIC_LEVEL ? "level" : "edge ", + entry.polarity == IOAPIC_POL_LOW ? "low " : "high", entry.vector, entry.irr, entry.delivery_status); if (ir_entry->format) printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", @@ -1306,7 +1307,9 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) ir_entry->zero); else printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", - buf, entry.dest_mode ? "logical " : "physical", + buf, + entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ? 
+ "logical " : "physical", entry.dest, entry.delivery_mode); } } @@ -1476,15 +1479,12 @@ void native_disable_io_apic(void) struct IO_APIC_route_entry entry; memset(&entry, 0, sizeof(entry)); - entry.mask = 0; /* Enabled */ - entry.trigger = 0; /* Edge */ - entry.irr = 0; - entry.polarity = 0; /* High */ - entry.delivery_status = 0; - entry.dest_mode = 0; /* Physical */ - entry.delivery_mode = dest_ExtINT; /* ExtInt */ - entry.vector = 0; - entry.dest = read_apic_id(); + entry.mask = IOAPIC_UNMASKED; + entry.trigger = IOAPIC_EDGE; + entry.polarity = IOAPIC_POL_HIGH; + entry.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; + entry.delivery_mode = dest_ExtINT; + entry.dest = read_apic_id(); /* * Add it to the IO-APIC irq-routing table: @@ -1494,7 +1494,6 @@ void native_disable_io_apic(void) if (cpu_has_apic || apic_from_smp_config()) disconnect_bsp_APIC(ioapic_i8259.pin != -1); - } /* @@ -2018,12 +2017,12 @@ static inline void __init unlock_ExtINT_logic(void) memset(&entry1, 0, sizeof(entry1)); - entry1.dest_mode = 0; /* physical delivery */ - entry1.mask = 0; /* unmask IRQ now */ + entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; + entry1.mask = IOAPIC_UNMASKED; entry1.dest = hard_smp_processor_id(); entry1.delivery_mode = dest_ExtINT; entry1.polarity = entry0.polarity; - entry1.trigger = 0; + entry1.trigger = IOAPIC_EDGE; entry1.vector = 0; ioapic_write_entry(apic, pin, entry1); @@ -2911,9 +2910,9 @@ static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, data->polarity = info->ioapic_polarity; } else if (acpi_get_override_irq(gsi, &data->trigger, &data->polarity) < 0) { - /* PCI interrupts are always polarity one level triggered. */ - data->trigger = 1; - data->polarity = 1; + /* PCI interrupts are always active low level triggered. */ + data->trigger = IOAPIC_LEVEL; + data->polarity = IOAPIC_POL_LOW; } } @@ -2925,15 +2924,16 @@ static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, entry->dest_mode = apic->irq_dest_mode; entry->dest = cfg->dest_apicid; entry->vector = cfg->vector; - entry->mask = 0; /* enable IRQ */ entry->trigger = data->trigger; entry->polarity = data->polarity; /* - * Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. + * Mask level triggered irqs. Edge triggered irqs are masked + * by the irq core code in case they fire. */ - if (data->trigger) - entry->mask = 1; + if (data->trigger == IOAPIC_LEVEL) + entry->mask = IOAPIC_MASKED; + else + entry->mask = IOAPIC_UNMASKED; } int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, -- cgit v1.2.3 From ab76085ec0858d4c2707ea0d036db00ef4aee8fd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 10:30:07 +0800 Subject: x86,ioapic: Cleanup irq_trigger/polarity() These functions are full of pointless indentations, useless comments and even more useless printks. Clean them up. Signed-off-by: Thomas Gleixner Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Grant Likely Link: http://lkml.kernel.org/r/1428978610-28986-31-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner Cc: Jiang Liu Cc: x86@kernel.org Signed-off-by: Jiang Liu --- arch/x86/kernel/apic/io_apic.c | 138 +++++++++++++++-------------------------- 1 file changed, 50 insertions(+), 88 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index a63167f96126..9fcca68b183d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -796,45 +796,47 @@ static int EISA_ELCR(unsigned int irq) static int irq_polarity(int idx) { int bus = mp_irqs[idx].srcbus; - int polarity; /* * Determine IRQ line polarity (high active or low active): */ - switch (mp_irqs[idx].irqflag & 3) - { - case 0: /* conforms, ie. bus-type dependent polarity */ - if (test_bit(bus, mp_bus_not_pci)) - polarity = default_ISA_polarity(idx); - else - polarity = default_PCI_polarity(idx); - break; - case 1: /* high active */ - { - polarity = IOAPIC_POL_HIGH; - break; - } - case 2: /* reserved */ - { - pr_warn("broken BIOS!!\n"); - polarity = IOAPIC_POL_LOW; - break; - } - case 3: /* low active */ - { - polarity = IOAPIC_POL_LOW; - break; - } - default: /* invalid */ - { - pr_warn("broken BIOS!!\n"); - polarity = IOAPIC_POL_LOW; - break; - } + switch (mp_irqs[idx].irqflag & 0x03) { + case 0: + /* conforms to spec, ie. bus-type dependent polarity */ + if (test_bit(bus, mp_bus_not_pci)) + return default_ISA_polarity(idx); + else + return default_PCI_polarity(idx); + case 1: + return IOAPIC_POL_HIGH; + case 2: + pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n"); + case 3: + default: /* Pointless default required due to do gcc stupidity */ + return IOAPIC_POL_LOW; } - return polarity; } +#ifdef CONFIG_EISA +static int eisa_irq_trigger(int idx, int bus, int trigger) +{ + switch (mp_bus_id_to_type[bus]) { + case MP_BUS_PCI: + case MP_BUS_ISA: + return trigger; + case MP_BUS_EISA: + return default_EISA_trigger(idx); + } + pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus); + return IOAPIC_LEVEL; +} +#else +static inline int eisa_irq_trigger(int idx, int bus, int trigger) +{ + return trigger; +} +#endif + static int irq_trigger(int idx) { int bus = mp_irqs[idx].srcbus; @@ -843,63 +845,23 @@ static int irq_trigger(int idx) /* * Determine IRQ trigger mode (edge or level sensitive): */ - switch ((mp_irqs[idx].irqflag>>2) & 3) - { - case 0: /* conforms, ie. bus-type dependent */ - if (test_bit(bus, mp_bus_not_pci)) - trigger = default_ISA_trigger(idx); - else - trigger = default_PCI_trigger(idx); -#ifdef CONFIG_EISA - switch (mp_bus_id_to_type[bus]) { - case MP_BUS_ISA: /* ISA pin */ - { - /* set before the switch */ - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - trigger = default_EISA_trigger(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - /* set before the switch */ - break; - } - default: - { - pr_warn("broken BIOS!!\n"); - trigger = IOAPIC_LEVEL; - break; - } - } -#endif - break; - case 1: /* edge */ - { - trigger = IOAPIC_EDGE; - break; - } - case 2: /* reserved */ - { - pr_warn("broken BIOS!!\n"); - trigger = IOAPIC_LEVEL; - break; - } - case 3: /* level */ - { - trigger = IOAPIC_LEVEL; - break; - } - default: /* invalid */ - { - pr_warn("broken BIOS!!\n"); - trigger = IOAPIC_EDGE; - break; - } + switch ((mp_irqs[idx].irqflag >> 2) & 0x03) { + case 0: + /* conforms to spec, ie. 
bus-type dependent trigger mode */ + if (test_bit(bus, mp_bus_not_pci)) + trigger = default_ISA_trigger(idx); + else + trigger = default_PCI_trigger(idx); + /* Take EISA into account */ + return eisa_irq_trigger(idx, bus, trigger); + case 1: + return IOAPIC_EDGE; + case 2: + pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n"); + case 3: + default: /* Pointless default required due to do gcc stupidity */ + return IOAPIC_LEVEL; } - return trigger; } void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, -- cgit v1.2.3 From f7a0c78669ee79443a91ea89652766c1be8d9e04 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 10:30:08 +0800 Subject: x86: Cleanup irq_domain ops We have 3 identical copies of the ioapic domain ops for acpi, mpparse, and sfi. Have a global one in the io_apic code and be done with it. To avoid include hell in io_apic.h, create a private irqdomain header and include the generic irqdomain header from there. Signed-off-by: Thomas Gleixner Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: sfi-devel@simplefirmware.org Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Len Brown Cc: Pavel Machek Cc: Grant Likely Cc: Rob Herring Cc: x86@kernel.org Link: http://lkml.kernel.org/r/1428978610-28986-32-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 29 ++--------------------------- arch/x86/include/asm/irqdomain.h | 34 ++++++++++++++++++++++++++++++++++ arch/x86/kernel/acpi/boot.c | 13 +++---------- arch/x86/kernel/apic/io_apic.c | 9 ++++++++- arch/x86/kernel/devicetree.c | 12 ++++++------ arch/x86/kernel/mpparse.c | 9 +-------- arch/x86/platform/sfi/sfi.c | 10 ++-------- 7 files changed, 56 insertions(+), 60 deletions(-) create mode 100644 arch/x86/include/asm/irqdomain.h (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 984afb732efe..6cbf2cfb3f8a 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -96,7 +96,7 @@ struct IR_IO_APIC_route_entry { } __attribute__ ((packed)); struct irq_alloc_info; -struct irq_data; +struct ioapic_domain_cfg; #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 @@ -163,23 +163,6 @@ extern int restore_ioapic_entries(void); extern void setup_ioapic_ids_from_mpc(void); extern void setup_ioapic_ids_from_mpc_nocheck(void); -enum ioapic_domain_type { - IOAPIC_DOMAIN_INVALID, - IOAPIC_DOMAIN_LEGACY, - IOAPIC_DOMAIN_STRICT, - IOAPIC_DOMAIN_DYNAMIC, -}; - -struct device_node; -struct irq_domain; -struct irq_domain_ops; - -struct ioapic_domain_cfg { - enum ioapic_domain_type type; - const struct irq_domain_ops *ops; - struct device_node *dev; -}; - extern int mp_find_ioapic(u32 gsi); extern int mp_find_ioapic_pin(int ioapic, u32 gsi); extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, @@ -189,15 +172,7 @@ extern int mp_register_ioapic(int id, u32 address, u32 gsi_base, struct ioapic_domain_cfg *cfg); extern int mp_unregister_ioapic(u32 gsi_base); extern int mp_ioapic_registered(u32 gsi_base); -extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs, void *arg); -extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs); -extern void mp_irqdomain_activate(struct irq_domain *domain, - 
struct irq_data *irq_data); -extern void mp_irqdomain_deactivate(struct irq_domain *domain, - struct irq_data *irq_data); -extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); + extern void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, int trigger, int polarity); diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h new file mode 100644 index 000000000000..fe0d4c6636ec --- /dev/null +++ b/arch/x86/include/asm/irqdomain.h @@ -0,0 +1,34 @@ +#ifndef _ASM_IRQDOMAIN_H +#define _ASM_IRQDOMAIN_H + +#include + +enum ioapic_domain_type { + IOAPIC_DOMAIN_INVALID, + IOAPIC_DOMAIN_LEGACY, + IOAPIC_DOMAIN_STRICT, + IOAPIC_DOMAIN_DYNAMIC, +}; + +struct device_node; +struct irq_data; + +struct ioapic_domain_cfg { + enum ioapic_domain_type type; + const struct irq_domain_ops *ops; + struct device_node *dev; +}; + +extern const struct irq_domain_ops mp_ioapic_irqdomain_ops; + +extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg); +extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs); +extern void mp_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irq_data); +extern void mp_irqdomain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data); +extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); + +#endif diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 91a10120ed10..cb9f6f12246b 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -31,12 +31,12 @@ #include #include #include -#include #include #include #include #include +#include #include #include #include @@ -400,20 +400,13 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, return 0; } -static struct irq_domain_ops acpi_irqdomain_ops = { - .alloc = mp_irqdomain_alloc, - .free = mp_irqdomain_free, - .activate = mp_irqdomain_activate, - .deactivate = mp_irqdomain_deactivate, -}; - static int __init acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) { struct acpi_madt_io_apic *ioapic = NULL; struct ioapic_domain_cfg cfg = { .type = IOAPIC_DOMAIN_DYNAMIC, - .ops = &acpi_irqdomain_ops, + .ops = &mp_ioapic_irqdomain_ops, }; ioapic = (struct acpi_madt_io_apic *)header; @@ -764,7 +757,7 @@ int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base) u64 addr; struct ioapic_domain_cfg cfg = { .type = IOAPIC_DOMAIN_DYNAMIC, - .ops = &acpi_irqdomain_ops, + .ops = &mp_ioapic_irqdomain_ops, }; ioapic_id = acpi_get_ioapic_id(handle, gsi_base, &addr); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9fcca68b183d..845dc0df2002 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -41,13 +41,13 @@ #include #include #include -#include #include #include #include /* time_after() */ #include #include +#include #include #include #include @@ -2995,3 +2995,10 @@ int mp_irqdomain_ioapic_idx(struct irq_domain *domain) { return (int)(long)domain->host_data; } + +const struct irq_domain_ops mp_ioapic_irqdomain_ops = { + .alloc = mp_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, +}; diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 05103d398ed7..5ee771859b6f 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include 
@@ -17,6 +16,7 @@ #include #include +#include #include #include #include @@ -216,11 +216,11 @@ static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp); } -const struct irq_domain_ops ioapic_irq_domain_ops = { - .alloc = dt_irqdomain_alloc, - .free = mp_irqdomain_free, - .activate = mp_irqdomain_activate, - .deactivate = mp_irqdomain_deactivate, +static const struct irq_domain_ops ioapic_irq_domain_ops = { + .alloc = dt_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, }; static void __init dtb_add_ioapic(struct device_node *dn) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index aa4feee74dbe..30ca7607cbbb 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -19,8 +19,8 @@ #include #include #include -#include +#include #include #include #include @@ -113,13 +113,6 @@ static void __init MP_bus_info(struct mpc_bus *m) pr_warn("Unknown bustype %s - ignoring\n", str); } -static struct irq_domain_ops mp_ioapic_irqdomain_ops = { - .alloc = mp_irqdomain_alloc, - .free = mp_irqdomain_free, - .activate = mp_irqdomain_activate, - .deactivate = mp_irqdomain_deactivate, -}; - static void __init MP_ioapic_info(struct mpc_ioapic *m) { struct ioapic_domain_cfg cfg = { diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c index b66b194f9900..6c7111bbd1e9 100644 --- a/arch/x86/platform/sfi/sfi.c +++ b/arch/x86/platform/sfi/sfi.c @@ -25,8 +25,8 @@ #include #include #include -#include +#include #include #include #include @@ -71,12 +71,6 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table) #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC -static struct irq_domain_ops sfi_ioapic_irqdomain_ops = { - .alloc = mp_irqdomain_alloc, - .free = mp_irqdomain_free, - .activate = mp_irqdomain_activate, - .deactivate = mp_irqdomain_deactivate, -}; static int __init sfi_parse_ioapic(struct sfi_table_header *table) { @@ -85,7 +79,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table) int i, num; struct ioapic_domain_cfg cfg = { .type = IOAPIC_DOMAIN_STRICT, - .ops = &sfi_ioapic_irqdomain_ops, + .ops = &mp_ioapic_irqdomain_ops, }; sb = (struct sfi_table_simple *)table; -- cgit v1.2.3 From d746d1ebd30c48562a3fb512ab18d5822f137820 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:30:09 +0800 Subject: x86/irq: Move irqdomain specific code into asm/irqdomain.h Now we have dedicated asm/irqdomain.h, so move irqdomain specific code into it. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Cc: Joerg Roedel Cc: Andy Lutomirski Link: http://lkml.kernel.org/r/1428978610-28986-33-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 24 ------------------------ arch/x86/include/asm/irq_remapping.h | 2 +- arch/x86/include/asm/irqdomain.h | 35 ++++++++++++++++++++++++++++++++--- arch/x86/kernel/apic/htirq.c | 2 +- arch/x86/kernel/apic/msi.c | 2 +- arch/x86/kernel/apic/vector.c | 2 +- arch/x86/kernel/hpet.c | 2 +- arch/x86/platform/uv/uv_irq.c | 2 +- 8 files changed, 38 insertions(+), 33 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 3b8233a26348..1f88e719fa78 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -94,8 +94,6 @@ extern void trace_call_function_single_interrupt(void); #define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi #endif /* CONFIG_TRACING */ -struct irq_domain; - #ifdef CONFIG_X86_LOCAL_APIC struct irq_data; struct pci_dev; @@ -165,22 +163,11 @@ struct irq_alloc_info { }; }; -enum { - /* Allocate contiguous CPU vectors */ - X86_IRQ_ALLOC_CONTIGUOUS_VECTORS = 0x1, -}; - struct irq_cfg { unsigned int dest_apicid; u8 vector; }; -extern struct irq_domain *x86_vector_domain; - -extern void init_irq_alloc_info(struct irq_alloc_info *info, - const struct cpumask *mask); -extern void copy_irq_alloc_info(struct irq_alloc_info *dst, - struct irq_alloc_info *src); extern struct irq_cfg *irq_cfg(unsigned int irq); extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data); extern void lock_vector_lock(void); @@ -200,17 +187,6 @@ static inline void lock_vector_lock(void) {} static inline void unlock_vector_lock(void) {} #endif /* CONFIG_X86_LOCAL_APIC */ -#ifdef CONFIG_PCI_MSI -extern void arch_init_msi_domain(struct irq_domain *domain); -#else -static inline void arch_init_msi_domain(struct irq_domain *domain) { } -#endif -#ifdef CONFIG_HT_IRQ -extern void arch_init_htirq_domain(struct irq_domain *domain); -#else -static inline void arch_init_htirq_domain(struct irq_domain *domain) { } -#endif - /* Statistics */ extern atomic_t irq_err_count; extern atomic_t irq_mis_count; diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 09efa358c831..78974fbc33b4 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -22,7 +22,7 @@ #ifndef __X86_IRQ_REMAPPING_H #define __X86_IRQ_REMAPPING_H -#include +#include #include #include diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h index fe0d4c6636ec..d26075b52885 100644 --- a/arch/x86/include/asm/irqdomain.h +++ b/arch/x86/include/asm/irqdomain.h @@ -2,6 +2,25 @@ #define _ASM_IRQDOMAIN_H #include +#include + +#ifdef CONFIG_X86_LOCAL_APIC +enum { + /* Allocate contiguous CPU vectors */ + X86_IRQ_ALLOC_CONTIGUOUS_VECTORS = 0x1, +}; + +extern struct irq_domain *x86_vector_domain; + +extern void init_irq_alloc_info(struct irq_alloc_info *info, + const struct cpumask *mask); +extern void copy_irq_alloc_info(struct irq_alloc_info *dst, + struct irq_alloc_info *src); +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC +struct device_node; +struct irq_data; enum ioapic_domain_type { IOAPIC_DOMAIN_INVALID, @@ -10,9 +29,6 @@ enum ioapic_domain_type { IOAPIC_DOMAIN_DYNAMIC, }; -struct device_node; -struct irq_data; - struct ioapic_domain_cfg { enum ioapic_domain_type type; const struct 
irq_domain_ops *ops; @@ -30,5 +46,18 @@ extern void mp_irqdomain_activate(struct irq_domain *domain, extern void mp_irqdomain_deactivate(struct irq_domain *domain, struct irq_data *irq_data); extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); +#endif /* CONFIG_X86_IO_APIC */ + +#ifdef CONFIG_PCI_MSI +extern void arch_init_msi_domain(struct irq_domain *domain); +#else +static inline void arch_init_msi_domain(struct irq_domain *domain) { } +#endif + +#ifdef CONFIG_HT_IRQ +extern void arch_init_htirq_domain(struct irq_domain *domain); +#else +static inline void arch_init_htirq_domain(struct irq_domain *domain) { } +#endif #endif diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c index 1cae104415ea..341e99be42b8 100644 --- a/arch/x86/kernel/apic/htirq.c +++ b/arch/x86/kernel/apic/htirq.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 109584261c4e..58fde664e7c0 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 60047495041c..ad786f8a7cc7 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -13,8 +13,8 @@ #include #include #include -#include #include +#include #include #include #include diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index e3bc18080052..e2449cf38b06 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -11,8 +11,8 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 091b36ac44c4..cdf86cd3fd97 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -12,8 +12,8 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From f7fa7aeeecb7a9abdd5f5d069a71ffb3e99a2a07 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Apr 2015 10:30:10 +0800 Subject: x86/irq: Avoid memory allocation in __assign_irq_vector() Function __assign_irq_vector() is protected by vector_lock, so use a global temporary cpu_mask to avoid allocating/freeing cpu_mask. Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: David Cohen Cc: Sander Eikelenboom Cc: David Vrabel Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. 
Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/1428978610-28986-34-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/vector.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index ad786f8a7cc7..1c7dd42b98c1 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -30,6 +30,7 @@ struct apic_chip_data { struct irq_domain *x86_vector_domain; static DEFINE_RAW_SPINLOCK(vector_lock); +static cpumask_var_t vector_cpumask; static struct irq_chip lapic_controller; #ifdef CONFIG_X86_IO_APIC static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY]; @@ -116,14 +117,10 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d, static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; static int current_offset = VECTOR_OFFSET_START % 16; int cpu, err; - cpumask_var_t tmp_mask; if (d->move_in_progress) return -EBUSY; - if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) - return -ENOMEM; - /* Only try and allocate irqs on cpus that are present */ err = -ENOSPC; cpumask_clear(d->old_domain); @@ -131,21 +128,22 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d, while (cpu < nr_cpu_ids) { int new_cpu, vector, offset; - apic->vector_allocation_domain(cpu, tmp_mask, mask); + apic->vector_allocation_domain(cpu, vector_cpumask, mask); - if (cpumask_subset(tmp_mask, d->domain)) { + if (cpumask_subset(vector_cpumask, d->domain)) { err = 0; - if (cpumask_equal(tmp_mask, d->domain)) + if (cpumask_equal(vector_cpumask, d->domain)) break; /* * New cpumask using the vector is a proper subset of * the current in use mask. So cleanup the vector * allocation for the members that are not used anymore. 
*/ - cpumask_andnot(d->old_domain, d->domain, tmp_mask); + cpumask_andnot(d->old_domain, d->domain, + vector_cpumask); d->move_in_progress = cpumask_intersects(d->old_domain, cpu_online_mask); - cpumask_and(d->domain, d->domain, tmp_mask); + cpumask_and(d->domain, d->domain, vector_cpumask); break; } @@ -159,16 +157,18 @@ next: } if (unlikely(current_vector == vector)) { - cpumask_or(d->old_domain, d->old_domain, tmp_mask); - cpumask_andnot(tmp_mask, mask, d->old_domain); - cpu = cpumask_first_and(tmp_mask, cpu_online_mask); + cpumask_or(d->old_domain, d->old_domain, + vector_cpumask); + cpumask_andnot(vector_cpumask, mask, d->old_domain); + cpu = cpumask_first_and(vector_cpumask, + cpu_online_mask); continue; } if (test_bit(vector, used_vectors)) goto next; - for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) { + for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) { if (per_cpu(vector_irq, new_cpu)[vector] > VECTOR_UNDEFINED) goto next; @@ -181,14 +181,13 @@ next: d->move_in_progress = cpumask_intersects(d->old_domain, cpu_online_mask); } - for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) + for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) per_cpu(vector_irq, new_cpu)[vector] = irq; d->cfg.vector = vector; - cpumask_copy(d->domain, tmp_mask); + cpumask_copy(d->domain, vector_cpumask); err = 0; break; } - free_cpumask_var(tmp_mask); if (!err) { /* cache destination APIC IDs into cfg->dest_apicid */ @@ -397,6 +396,8 @@ int __init arch_early_irq_init(void) arch_init_msi_domain(x86_vector_domain); arch_init_htirq_domain(x86_vector_domain); + BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL)); + return arch_early_ioapic_init(); } -- cgit v1.2.3 From eb18cf55c299d2ac5c8b5421c58b6c582a044475 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 May 2015 11:10:11 +0200 Subject: x86: Constify irqdomain ops Nothing changes those ops. Make the initializers readable while at it. 
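Beyond signalling intent, const lets the compiler place such ops tables in .rodata, where a stray write through a dangling pointer faults instead of silently retargeting callbacks. A self-contained sketch of the idiom, with hypothetical names not taken from the patch:

struct ops {
	int  (*alloc)(int arg);
	void (*free)(int arg);
};

static int  ex_alloc(int arg) { return arg; }
static void ex_free(int arg)  { (void)arg; }

/*
 * const: the table is typically emitted into .rodata, and an
 * accidental "example_ops.alloc = NULL;" becomes a compile-time
 * error rather than a latent runtime surprise.
 */
static const struct ops example_ops = {
	.alloc	= ex_alloc,
	.free	= ex_free,
};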
Reported-by: Krzysztof Kozlowski Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/htirq.c | 10 +++++----- arch/x86/kernel/apic/vector.c | 6 +++--- arch/x86/platform/uv/uv_irq.c | 10 +++++----- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c index 341e99be42b8..ae50d3454d78 100644 --- a/arch/x86/kernel/apic/htirq.c +++ b/arch/x86/kernel/apic/htirq.c @@ -143,11 +143,11 @@ static void htirq_domain_deactivate(struct irq_domain *domain, write_ht_irq_msg(irq_data->irq, &msg); } -static struct irq_domain_ops htirq_domain_ops = { - .alloc = htirq_domain_alloc, - .free = htirq_domain_free, - .activate = htirq_domain_activate, - .deactivate = htirq_domain_deactivate, +static const struct irq_domain_ops htirq_domain_ops = { + .alloc = htirq_domain_alloc, + .free = htirq_domain_free, + .activate = htirq_domain_activate, + .deactivate = htirq_domain_deactivate, }; void arch_init_htirq_domain(struct irq_domain *parent) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 1c7dd42b98c1..426496862be0 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -330,9 +330,9 @@ error: return err; } -static struct irq_domain_ops x86_vector_domain_ops = { - .alloc = x86_vector_alloc_irqs, - .free = x86_vector_free_irqs, +static const struct irq_domain_ops x86_vector_domain_ops = { + .alloc = x86_vector_alloc_irqs, + .free = x86_vector_free_irqs, }; int __init arch_probe_nr_irqs(void) diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index cdf86cd3fd97..8570abe68be1 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -149,11 +149,11 @@ static void uv_domain_deactivate(struct irq_domain *domain, uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data); } -static struct irq_domain_ops uv_domain_ops = { - .alloc = uv_domain_alloc, - .free = uv_domain_free, - .activate = uv_domain_activate, - .deactivate = uv_domain_deactivate, +static const struct irq_domain_ops uv_domain_ops = { + .alloc = uv_domain_alloc, + .free = uv_domain_free, + .activate = uv_domain_activate, + .deactivate = uv_domain_deactivate, }; static struct irq_domain *uv_get_irq_domain(void) -- cgit v1.2.3 From 781674fc33adf0d975a361e111bb45804356aa23 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 4 May 2015 17:58:00 +0200 Subject: x86/x2apic: Acpi_gbl_FADT existence depends on CONFIG_ACPI If ACPI is disabled, acpi_gbl_FADT is not available, and the build breaks. Signed-off-by: Jan Kiszka Link: http://lkml.kernel.org/r/55479708.2000104@siemens.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/x2apic_phys.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 6fae733e9194..3ffd925655e0 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -21,11 +21,13 @@ early_param("x2apic_phys", set_x2apic_phys_mode); static bool x2apic_fadt_phys(void) { +#ifdef CONFIG_ACPI if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) && (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { printk(KERN_DEBUG "System requires x2apic physical mode\n"); return true; } +#endif return false; } -- cgit v1.2.3 From f5d6a52f511157c7476590532a23b5664b1ed877 Mon Sep 17 00:00:00 2001 From: "Jan H.
Schönherr" Date: Mon, 4 May 2015 11:42:34 +0200 Subject: x86/smpboot: Skip delays during SMP initialization similar to Xen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the per-CPU delays during SMP initialization, which seems to be possible on newer architectures with an x2APIC. Xen does this since 2011. In fact, this commit is basically a combination of the following Xen commits. The first removes the delays, the second fixes an issue with the removal: commit 68fce206f6dba9981e8322269db49692c95ce250 Author: Tim Deegan Date: Tue Jul 19 14:13:01 2011 +0100 x86: Remove timeouts from INIT-SIPI-SIPI sequence when using x2apic. Some of the timeouts are pointless since they're waiting for the ICR to ack the IPI delivery and that doesn't happen on x2apic. The others should be benign (and are suggested in the SDM) but removing them makes AP bringup much more reliable on some test boxes. Signed-off-by: Tim Deegan commit f12ee533150761df5a7099c83f2a5fa6c07d1187 Author: Gang Wei Date: Thu Dec 29 10:07:54 2011 +0000 X86: Add a delay between INIT & SIPIs for tboot AP bring-up in X2APIC case Without this delay, Xen could not bring APs up while working with TXT/tboot, because tboot needs some time in APs to handle INIT before becoming ready for receiving SIPIs (this delay was removed as part of c/s 23724 by Tim Deegan). Signed-off-by: Gang Wei Acked-by: Keir Fraser Acked-by: Tim Deegan Committed-by: Tim Deegan Signed-off-by: Jan H. Schönherr Cc: Anthony Liguori Cc: Borislav Petkov Cc: Gang Wei Cc: H. Peter Anvin Cc: Len Brown Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Tim Deegan Link: http://lkml.kernel.org/r/1430732554-7294-1-git-send-email-jschoenh@amazon.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 61 +++++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 50e547eac8cd..63b46414c80c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -555,7 +555,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) static int wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { - unsigned long send_status, accept_status = 0; + unsigned long send_status = 0, accept_status = 0; int maxlvt, num_starts, j; maxlvt = lapic_get_maxlvt(); @@ -580,22 +580,34 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, phys_apicid); - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); + if (!cpu_has_x2apic) { + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); - mdelay(10); + mdelay(10); - pr_debug("Deasserting INIT\n"); + pr_debug("Deasserting INIT\n"); - /* Target chip */ - /* Send IPI */ - apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); + /* Target chip */ + /* Send IPI */ + apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); - mb(); - atomic_set(&init_deasserted, 1); + mb(); + atomic_set(&init_deasserted, 1); + } else if (tboot_enabled()) { + /* + * With tboot AP is actually spinning in a mini-guest before + * receiving INIT. 
Upon receiving INIT ipi, AP need time to + * VMExit, update VMCS to tracking SIPIs and VMResume. + * + * While AP is in root mode handling the INIT the CPU will drop + * any SIPIs + */ + udelay(10); + } /* * Should we send STARTUP IPIs ? @@ -637,20 +649,23 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), phys_apicid); - /* - * Give the other CPU some time to accept the IPI. - */ - udelay(300); + if (!cpu_has_x2apic) { + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(300); - pr_debug("Startup point 1\n"); + pr_debug("Startup point 1\n"); - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); + + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(200); + } - /* - * Give the other CPU some time to accept the IPI. - */ - udelay(200); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); accept_status = (apic_read(APIC_ESR) & 0xEF); -- cgit v1.2.3 From afdf344e08fbec28ab2204a626fa1f260dcb68be Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Wed, 6 May 2015 06:58:53 -0500 Subject: x86/mce/amd: Factor out logging mechanism Refactor the code here to setup struct mce and call mce_log() to log the error. We're going to reuse this in a later patch as part of the deferred error interrupt enablement. No functional change is introduced. Suggested-by: Borislav Petkov Signed-off-by: Aravind Gopalakrishnan Cc: Tony Luck Cc: x86-ml Cc: linux-edac Link: http://lkml.kernel.org/r/1430913538-1415-2-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 55ad9b37cae8..5f25de20db36 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -264,6 +264,27 @@ init: } } +static void __log_error(unsigned int bank, bool threshold_err, u64 misc) +{ + struct mce m; + u64 status; + + rdmsrl(MSR_IA32_MCx_STATUS(bank), status); + if (!(status & MCI_STATUS_VAL)) + return; + + mce_setup(&m); + + m.status = status; + m.bank = bank; + if (threshold_err) + m.misc = misc; + + mce_log(&m); + + wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); +} + /* * APIC Interrupt Handler */ @@ -273,12 +294,12 @@ init: * the interrupt goes off when error_count reaches threshold_limit. * the handler will simply log mcelog w/ software defined bank number. 
*/ + static void amd_threshold_interrupt(void) { u32 low = 0, high = 0, address = 0; int cpu = smp_processor_id(); unsigned int bank, block; - struct mce m; /* assume first bank caused it */ for (bank = 0; bank < mca_cfg.banks; ++bank) { @@ -321,15 +342,7 @@ static void amd_threshold_interrupt(void) return; log: - mce_setup(&m); - rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status); - if (!(m.status & MCI_STATUS_VAL)) - return; - m.misc = ((u64)high << 32) | low; - m.bank = bank; - mce_log(&m); - - wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); + __log_error(bank, true, ((u64)high << 32) | low); } /* -- cgit v1.2.3 From 6e6e746e33e9555a7fce159d25314c9df3bcda93 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Wed, 6 May 2015 06:58:54 -0500 Subject: x86/mce/amd: Collect valid address before logging an error amd_decode_mce() needs value in m->addr so it can report the error address correctly. This should be setup in __log_error() before we call mce_log(). We do this because the error address is an important bit of information which should be conveyed to userspace. The correct output then reports proper address, like this: [Hardware Error]: Corrected error, no action required. [Hardware Error]: CPU:0 (15:60:0) MC0_STATUS [-|CE|-|-|AddrV|-|-|CECC]: 0x840041000028017b [Hardware Error]: MC0 Error Address: 0x00001f808f0ff040 Signed-off-by: Aravind Gopalakrishnan Cc: Tony Luck Cc: x86-ml Cc: linux-edac Link: http://lkml.kernel.org/r/1430913538-1415-3-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 5f25de20db36..607075726e10 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -277,11 +277,14 @@ static void __log_error(unsigned int bank, bool threshold_err, u64 misc) m.status = status; m.bank = bank; + if (threshold_err) m.misc = misc; - mce_log(&m); + if (m.status & MCI_STATUS_ADDRV) + rdmsrl(MSR_IA32_MCx_ADDR(bank), m.addr); + mce_log(&m); wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); } -- cgit v1.2.3 From 7559e13fb4abe7880dfaf985d6a1630ca90a67ce Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Wed, 6 May 2015 06:58:55 -0500 Subject: x86/mce: Add support for deferred errors on AMD Deferred errors indicate error conditions that were not corrected, but those errors have not been consumed yet. They require no action from S/W (or action is optional). These errors provide info about a latent uncorrectable MCE that can occur when a poisoned data is consumed by the processor. Newer AMD processors can generate deferred errors and can be configured to generate APIC interrupts on such events. SUCCOR stands for S/W UnCorrectable error COntainment and Recovery. It indicates support for data poisoning in HW and deferred error interrupts. Add new bitfield to mce_vendor_flags for this. We use this to verify presence of deferred error interrupts before we enable them in mce_amd.c While at it, clarify comments in mce_vendor_flags to provide an indication of usages of the bitfields. Signed-off-by: Aravind Gopalakrishnan Cc: Tony Luck Cc: x86-ml Cc: linux-edac Link: http://lkml.kernel.org/r/1430913538-1415-4-git-send-email-Aravind.Gopalakrishnan@amd.com [ beef up commit message, do CPUID(8000_0007) only once. 
] Signed-off-by: Borislav Petkov --- arch/x86/include/asm/mce.h | 15 +++++++++++++-- arch/x86/kernel/cpu/mcheck/mce.c | 10 ++++++++-- 2 files changed, 21 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 1f5a86d518db..407ced642ac1 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -117,8 +117,19 @@ struct mca_config { }; struct mce_vendor_flags { - __u64 overflow_recov : 1, /* cpuid_ebx(80000007) */ - __reserved_0 : 63; + /* + * overflow recovery cpuid bit indicates that overflow + * conditions are not fatal + */ + __u64 overflow_recov : 1, + + /* + * SUCCOR stands for S/W UnCorrectable error COntainment + * and Recovery. It indicates support for data poisoning + * in HW and deferred error interrupts. + */ + succor : 1, + __reserved_0 : 62; }; extern struct mce_vendor_flags mce_flags; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e535533d5ab8..521e5016aca6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1637,10 +1637,16 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) mce_intel_feature_init(c); mce_adjust_timer = cmci_intel_adjust_timer; break; - case X86_VENDOR_AMD: + + case X86_VENDOR_AMD: { + u32 ebx = cpuid_ebx(0x80000007); + mce_amd_feature_init(c); - mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1; + mce_flags.overflow_recov = !!(ebx & BIT(0)); + mce_flags.succor = !!(ebx & BIT(1)); break; + } + default: break; } -- cgit v1.2.3 From 24fd78a81f6d3fe7f7a440c8629f9c52cd5f830e Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Wed, 6 May 2015 06:58:56 -0500 Subject: x86/mce/amd: Introduce deferred error interrupt handler Deferred errors indicate error conditions that were not corrected, but require no action from S/W (or action is optional).These errors provide info about a latent UC MCE that can occur when a poisoned data is consumed by the processor. Processors that report these errors can be configured to generate APIC interrupts to notify OS about the error. Provide an interrupt handler in this patch so that OS can catch these errors as and when they happen. Currently, we simply log the errors and exit the handler as S/W action is not mandated. 
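Stripped of the interrupt entry plumbing, the handler added here reduces to a scan for the first bank holding a valid deferred error; this condenses the code in the diff below rather than adding any logic:

	u64 status;
	unsigned int bank;

	for (bank = 0; bank < mca_cfg.banks; ++bank) {
		rdmsrl(MSR_IA32_MCx_STATUS(bank), status);

		/* only a valid *and* deferred error is of interest */
		if (!(status & MCI_STATUS_VAL) ||
		    !(status & MCI_STATUS_DEFERRED))
			continue;

		/* __log_error() logs the MCE and clears the bank */
		__log_error(bank, false, 0);
		break;
	}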
Signed-off-by: Aravind Gopalakrishnan Cc: Tony Luck Cc: x86-ml Cc: linux-edac Link: http://lkml.kernel.org/r/1430913538-1415-5-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Borislav Petkov --- arch/x86/include/asm/entry_arch.h | 3 ++ arch/x86/include/asm/hardirq.h | 3 ++ arch/x86/include/asm/hw_irq.h | 2 + arch/x86/include/asm/irq_vectors.h | 1 + arch/x86/include/asm/mce.h | 3 ++ arch/x86/include/asm/trace/irq_vectors.h | 6 +++ arch/x86/include/asm/traps.h | 3 +- arch/x86/kernel/cpu/mcheck/mce_amd.c | 93 ++++++++++++++++++++++++++++++++ arch/x86/kernel/entry_64.S | 5 ++ arch/x86/kernel/irq.c | 6 +++ arch/x86/kernel/irqinit.c | 4 ++ arch/x86/kernel/traps.c | 5 ++ 12 files changed, 133 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index dc5fa661465f..6da46dbaac87 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -50,4 +50,7 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) #endif +#ifdef CONFIG_X86_MCE_AMD +BUILD_INTERRUPT(deferred_error_interrupt, DEFERRED_ERROR_VECTOR) +#endif #endif diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 0f5fb6b6567e..db9f536f482f 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -33,6 +33,9 @@ typedef struct { #ifdef CONFIG_X86_MCE_THRESHOLD unsigned int irq_threshold_count; #endif +#ifdef CONFIG_X86_MCE_AMD + unsigned int irq_deferred_error_count; +#endif #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) unsigned int irq_hv_callback_count; #endif diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index e9571ddabc4f..f71e489d7537 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -73,6 +73,7 @@ extern asmlinkage void invalidate_interrupt31(void); extern asmlinkage void irq_move_cleanup_interrupt(void); extern asmlinkage void reboot_interrupt(void); extern asmlinkage void threshold_interrupt(void); +extern asmlinkage void deferred_error_interrupt(void); extern asmlinkage void call_function_interrupt(void); extern asmlinkage void call_function_single_interrupt(void); @@ -87,6 +88,7 @@ extern void trace_spurious_interrupt(void); extern void trace_thermal_interrupt(void); extern void trace_reschedule_interrupt(void); extern void trace_threshold_interrupt(void); +extern void trace_deferred_error_interrupt(void); extern void trace_call_function_interrupt(void); extern void trace_call_function_single_interrupt(void); #define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 666c89ec4bd7..026fc1e1599c 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -113,6 +113,7 @@ #define IRQ_WORK_VECTOR 0xf6 #define UV_BAU_MESSAGE 0xf5 +#define DEFERRED_ERROR_VECTOR 0xf4 /* Vector on which hypervisor callbacks will be delivered */ #define HYPERVISOR_CALLBACK_VECTOR 0xf3 diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 407ced642ac1..6a3034a0a072 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -234,6 +234,9 @@ void do_machine_check(struct pt_regs *, long); extern void (*mce_threshold_vector)(void); extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); +/* Deferred error interrupt handler */ +extern void 
(*deferred_error_int_vector)(void); + /* * Thermal handler */ diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 4cab890007a7..38a09a13a9bc 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -100,6 +100,12 @@ DEFINE_IRQ_VECTOR_EVENT(call_function_single); */ DEFINE_IRQ_VECTOR_EVENT(threshold_apic); +/* + * deferred_error_apic - called when entering/exiting a deferred apic interrupt + * vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic); + /* * thermal_apic - called when entering/exiting a thermal apic interrupt * vector handler diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 4e49d7dff78e..c5380bea2a36 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -108,7 +108,8 @@ extern int panic_on_unrecovered_nmi; void math_emulate(struct math_emu_info *); #ifndef CONFIG_X86_32 asmlinkage void smp_thermal_interrupt(void); -asmlinkage void mce_threshold_interrupt(void); +asmlinkage void smp_threshold_interrupt(void); +asmlinkage void smp_deferred_error_interrupt(void); #endif extern enum ctx_state ist_enter(struct pt_regs *regs); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 607075726e10..2e7ebe7e1e80 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -12,6 +12,8 @@ * - added support for AMD Family 0x10 processors * May 2012 * - major scrubbing + * May 2015 + * - add support for deferred error interrupts (Aravind Gopalakrishnan) * * All MC4_MISCi registers are shared between multi-cores */ @@ -32,6 +34,7 @@ #include #include #include +#include #define NR_BLOCKS 9 #define THRESHOLD_MAX 0xFFF @@ -47,6 +50,13 @@ #define MASK_BLKPTR_LO 0xFF000000 #define MCG_XBLK_ADDR 0xC0000400 +/* Deferred error settings */ +#define MSR_CU_DEF_ERR 0xC0000410 +#define MASK_DEF_LVTOFF 0x000000F0 +#define MASK_DEF_INT_TYPE 0x00000006 +#define DEF_LVT_OFF 0x2 +#define DEF_INT_TYPE_APIC 0x2 + static const char * const th_names[] = { "load_store", "insn_fetch", @@ -60,6 +70,13 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ static void amd_threshold_interrupt(void); +static void amd_deferred_error_interrupt(void); + +static void default_deferred_error_interrupt(void) +{ + pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR); +} +void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; /* * CPU Initialization @@ -205,6 +222,39 @@ static int setup_APIC_mce(int reserved, int new) return reserved; } +static int setup_APIC_deferred_error(int reserved, int new) +{ + if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR, + APIC_EILVT_MSG_FIX, 0)) + return new; + + return reserved; +} + +static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) +{ + u32 low = 0, high = 0; + int def_offset = -1, def_new; + + if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high)) + return; + + def_new = (low & MASK_DEF_LVTOFF) >> 4; + if (!(low & MASK_DEF_LVTOFF)) { + pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n"); + def_new = DEF_LVT_OFF; + low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4); + } + + def_offset = setup_APIC_deferred_error(def_offset, def_new); + if ((def_offset == def_new) && + (deferred_error_int_vector != amd_deferred_error_interrupt)) + deferred_error_int_vector = 
amd_deferred_error_interrupt; + + low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC; + wrmsr(MSR_CU_DEF_ERR, low, high); +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { @@ -262,6 +312,9 @@ init: mce_threshold_block_init(&b, offset); } } + + if (mce_flags.succor) + deferred_error_interrupt_enable(c); } static void __log_error(unsigned int bank, bool threshold_err, u64 misc) @@ -288,6 +341,46 @@ static void __log_error(unsigned int bank, bool threshold_err, u64 misc) wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); } +static inline void __smp_deferred_error_interrupt(void) +{ + inc_irq_stat(irq_deferred_error_count); + deferred_error_int_vector(); +} + +asmlinkage __visible void smp_deferred_error_interrupt(void) +{ + entering_irq(); + __smp_deferred_error_interrupt(); + exiting_ack_irq(); +} + +asmlinkage __visible void smp_trace_deferred_error_interrupt(void) +{ + entering_irq(); + trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); + __smp_deferred_error_interrupt(); + trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); + exiting_ack_irq(); +} + +/* APIC interrupt handler for deferred errors */ +static void amd_deferred_error_interrupt(void) +{ + u64 status; + unsigned int bank; + + for (bank = 0; bank < mca_cfg.banks; ++bank) { + rdmsrl(MSR_IA32_MCx_STATUS(bank), status); + + if (!(status & MCI_STATUS_VAL) || + !(status & MCI_STATUS_DEFERRED)) + continue; + + __log_error(bank, false, 0); + break; + } +} + /* * APIC Interrupt Handler */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 02c2eff7478d..12aea85fe738 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -935,6 +935,11 @@ apicinterrupt THRESHOLD_APIC_VECTOR \ threshold_interrupt smp_threshold_interrupt #endif +#ifdef CONFIG_X86_MCE_AMD +apicinterrupt DEFERRED_ERROR_VECTOR \ + deferred_error_interrupt smp_deferred_error_interrupt +#endif + #ifdef CONFIG_X86_THERMAL_VECTOR apicinterrupt THERMAL_APIC_VECTOR \ thermal_interrupt smp_thermal_interrupt diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index e5952c225532..590ed6c1bf51 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -116,6 +116,12 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_puts(p, " Threshold APIC interrupts\n"); #endif +#ifdef CONFIG_X86_MCE_AMD + seq_printf(p, "%*s: ", prec, "DFR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_deferred_error_count); + seq_puts(p, " Deferred Error APIC interrupts\n"); +#endif #ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index cd10a6437264..d7ec6e7b2b5b 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -135,6 +135,10 @@ static void __init apic_intr_init(void) alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif +#ifdef CONFIG_X86_MCE_AMD + alloc_intr_gate(DEFERRED_ERROR_VECTOR, deferred_error_interrupt); +#endif + #ifdef CONFIG_X86_LOCAL_APIC /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 324ab5247687..68b1d5979a46 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -827,6 +827,11 @@ asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void) { } +asmlinkage __visible void __attribute__((weak)) 
+smp_deferred_error_interrupt(void) +{ +} + /* * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task -- cgit v1.2.3 From 868c00bb5980653c44d931384baa2c7f1bde81ef Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Wed, 6 May 2015 06:58:58 -0500 Subject: x86/mce/amd: Rename setup_APIC_mce 'setup_APIC_mce' doesn't give us an indication of why we are going to program LVT. Make that explicit by renaming it to setup_APIC_mce_threshold so we know. No functional change is introduced. Signed-off-by: Aravind Gopalakrishnan Cc: Tony Luck Cc: x86-ml Cc: linux-edac Link: http://lkml.kernel.org/r/1430913538-1415-7-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 2e7ebe7e1e80..70e1bf6f784d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -213,7 +213,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset) threshold_restart_bank(&tr); }; -static int setup_APIC_mce(int reserved, int new) +static int setup_APIC_mce_threshold(int reserved, int new) { if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR, APIC_EILVT_MSG_FIX, 0)) @@ -302,7 +302,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) b.interrupt_enable = 1; new = (high & MASK_LVTOFF_HI) >> 20; - offset = setup_APIC_mce(offset, new); + offset = setup_APIC_mce_threshold(offset, new); if ((offset == new) && (mce_threshold_vector != amd_threshold_interrupt)) -- cgit v1.2.3 From 8cd161b1f755decd8b7f6c9c7144119281fe11a4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 7 May 2015 11:38:08 +0200 Subject: x86/traps: Remove superfluous weak definitions and dead code Those were leftovers of the x86 merge, see 081f75bbdc86 ("traps: x86: make traps_32.c and traps_64.c equal") for example and are not needed now. Signed-off-by: Borislav Petkov --- arch/x86/kernel/traps.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 68b1d5979a46..2768bb663f94 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -813,23 +813,6 @@ dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { conditional_sti(regs); -#if 0 - /* No need to warn about this any longer. */ - pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); -#endif -} - -asmlinkage __visible void __attribute__((weak)) smp_thermal_interrupt(void) -{ -} - -asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void) -{ -} - -asmlinkage __visible void __attribute__((weak)) -smp_deferred_error_interrupt(void) -{ } /* -- cgit v1.2.3 From 3490c0e45f7e5a5b1e5c62e4c60b0e55b2e75e71 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 7 May 2015 12:06:43 +0200 Subject: x86/mce/amd: Zap changelog It is useless and git history has it all detailed anyway. Update copyright while at it. 
Signed-off-by: Borislav Petkov Cc: Aravind Gopalakrishnan --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 70e1bf6f784d..e99b15077e94 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -1,21 +1,13 @@ /* - * (c) 2005-2012 Advanced Micro Devices, Inc. + * (c) 2005-2015 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. See "COPYING" or * http://www.gnu.org/licenses/gpl.html * * Written by Jacob Shin - AMD, Inc. - * * Maintained by: Borislav Petkov * - * April 2006 - * - added support for AMD Family 0x10 processors - * May 2012 - * - major scrubbing - * May 2015 - * - add support for deferred error interrupts (Aravind Gopalakrishnan) - * - * All MC4_MISCi registers are shared between multi-cores + * All MC4_MISCi registers are shared between cores on a node. */ #include #include -- cgit v1.2.3 From dde74f2e4a4447ef838c57e407f7139de3df68cb Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 27 Apr 2015 15:21:51 +0200 Subject: x86/asm/entry/64: Tidy up JZ insns after TESTs After TESTs, use logically correct JZ/JNZ mnemonics instead of JE/JNE. This doesn't change the generated code. Signed-off-by: Denys Vlasenko Acked-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1430140912-7960-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e952f6bf1d6d..8f8b22a361df 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -666,7 +666,7 @@ END(irq_entries_start) leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ testl $3, CS-RBP(%rsp) - je 1f + jz 1f SWAPGS 1: /* @@ -721,7 +721,7 @@ ret_from_intr: CFI_ADJUST_CFA_OFFSET RBP testl $3,CS(%rsp) - je retint_kernel + jz retint_kernel /* Interrupt came from user space */ GET_THREAD_INFO(%rcx) @@ -1310,7 +1310,7 @@ ENTRY(error_entry) SAVE_EXTRA_REGS 8 xorl %ebx,%ebx testl $3,CS+8(%rsp) - je error_kernelspace + jz error_kernelspace error_swapgs: SWAPGS error_sti: @@ -1361,7 +1361,7 @@ ENTRY(error_exit) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) testl %eax,%eax - jne retint_kernel + jnz retint_kernel LOCKDEP_SYS_EXIT_IRQ movl TI_flags(%rcx),%edx movl $_TIF_WORK_MASK,%edi -- cgit v1.2.3 From 03335e95e27fc1f2b17b05b27342ad76986b3cf0 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 27 Apr 2015 15:21:52 +0200 Subject: x86/asm/entry/64: Clean up usage of TEST insns By the nature of the TEST operation, it is often possible to test a narrower part of the operand: "testl $3, mem" -> "testb $3, mem" This results in shorter insns, because the TEST insn has no sign-extending byte-immediate forms unlike other ALU ops. 
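To make the size difference concrete, here is a minimal stand-alone GAS sketch (an illustration added here, not part of the patch); the byte encodings are the ones quoted in the object-code comparison below:

	testl	$3, 0x68(%rsp)	/* f7 44 24 68 03 00 00 00 - imm32 form, 8 bytes */
	testb	$3, 0x68(%rsp)	/* f6 44 24 68 03          - imm8 form,  5 bytes */

Both forms set ZF identically when the mask fits in the low byte, which is why the narrowing is safe as long as only JZ/JNZ consume the result.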
text data bss dec hex filename 11674 0 0 11674 2d9a entry_64.o.before 11658 0 0 11658 2d8a entry_64.o Changes in object code: - f7 84 24 88 00 00 00 03 00 00 00 testl $0x3,0x88(%rsp) + f6 84 24 88 00 00 00 03 testb $0x3,0x88(%rsp) - f7 44 24 68 03 00 00 00 testl $0x3,0x68(%rsp) + f6 44 24 68 03 testb $0x3,0x68(%rsp) - f7 84 24 90 00 00 00 03 00 00 00 testl $0x3,0x90(%rsp) + f6 84 24 90 00 00 00 03 testb $0x3,0x90(%rsp) Signed-off-by: Denys Vlasenko Acked-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1430140912-7960-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 8f8b22a361df..60705b032521 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -601,7 +601,7 @@ ENTRY(ret_from_fork) RESTORE_EXTRA_REGS - testl $3,CS(%rsp) # from kernel_thread? + testb $3, CS(%rsp) # from kernel_thread? /* * By the time we get here, we have no idea whether our pt_regs, @@ -665,7 +665,7 @@ END(irq_entries_start) leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ - testl $3, CS-RBP(%rsp) + testb $3, CS-RBP(%rsp) jz 1f SWAPGS 1: @@ -720,7 +720,7 @@ ret_from_intr: CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET RBP - testl $3,CS(%rsp) + testb $3, CS(%rsp) jz retint_kernel /* Interrupt came from user space */ @@ -968,7 +968,7 @@ ENTRY(\sym) .if \paranoid .if \paranoid == 1 CFI_REMEMBER_STATE - testl $3, CS(%rsp) /* If coming from userspace, switch */ + testb $3, CS(%rsp) /* If coming from userspace, switch */ jnz 1f /* stacks. */ .endif call paranoid_entry @@ -1309,7 +1309,7 @@ ENTRY(error_entry) SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 xorl %ebx,%ebx - testl $3,CS+8(%rsp) + testb $3, CS+8(%rsp) jz error_kernelspace error_swapgs: SWAPGS @@ -1606,7 +1606,6 @@ end_repeat_nmi: je 1f movq %r12, %cr2 1: - testl %ebx,%ebx /* swapgs needed? */ jnz nmi_restore nmi_swapgs: -- cgit v1.2.3 From 63332a8455d8310b77d38779c6c21a660a8d9feb Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 24 Apr 2015 17:31:33 +0200 Subject: x86/entry: Stop using PER_CPU_VAR(kernel_stack) PER_CPU_VAR(kernel_stack) is redundant: - On the 64-bit build, we can use PER_CPU_VAR(cpu_tss + TSS_sp0). - On the 32-bit build, we can use PER_CPU_VAR(cpu_current_top_of_stack). PER_CPU_VAR(kernel_stack) will be deleted by a separate change. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1429889495-27850-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/include/asm/thread_info.h | 8 +++++++- arch/x86/kernel/entry_64.S | 2 +- arch/x86/xen/xen-asm_64.S | 5 +++-- 4 files changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 2ab0f7182df3..1b1330c07971 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -346,7 +346,7 @@ ENTRY(ia32_cstar_target) SWAPGS_UNSAFE_STACK movl %esp,%r8d CFI_REGISTER rsp,r8 - movq PER_CPU_VAR(kernel_stack),%rsp + movq PER_CPU_VAR(cpu_tss + TSS_sp0),%rsp ENABLE_INTERRUPTS(CLBR_NONE) /* Zero-extending 32-bit regs, do not remove */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index b4bdec3e9523..d656a363e1eb 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -198,9 +198,15 @@ static inline unsigned long current_stack_pointer(void) #else /* !__ASSEMBLY__ */ /* Load thread_info address into "reg" */ +#ifdef CONFIG_X86_32 #define GET_THREAD_INFO(reg) \ - _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \ + _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \ _ASM_SUB $(THREAD_SIZE),reg ; +#else +#define GET_THREAD_INFO(reg) \ + _ASM_MOV PER_CPU_VAR(cpu_tss + TSS_sp0),reg ; \ + _ASM_SUB $(THREAD_SIZE),reg ; +#endif /* * ASM operand which evaluates to a 'thread_info' address of diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 7423e3e2f5c5..c13b86b40176 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -216,7 +216,7 @@ ENTRY(system_call) GLOBAL(system_call_after_swapgs) movq %rsp,PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(kernel_stack),%rsp + movq PER_CPU_VAR(cpu_tss + TSS_sp0),%rsp /* Construct struct pt_regs on stack */ pushq_cfi $__USER_DS /* pt_regs->ss */ diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index a2cabb8bd6bf..5aa7ec607b9e 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -53,7 +54,7 @@ ENTRY(xen_sysret64) * still with the kernel gs, so we can easily switch back */ movq %rsp, PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(kernel_stack), %rsp + movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp pushq $__USER_DS pushq PER_CPU_VAR(rsp_scratch) @@ -72,7 +73,7 @@ ENTRY(xen_sysret32) * still with the kernel gs, so we can easily switch back */ movq %rsp, PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(kernel_stack), %rsp + movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp pushq $__USER32_DS pushq PER_CPU_VAR(rsp_scratch) -- cgit v1.2.3 From fed7c3f0f750f225317828d691e9eb76eec887b3 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 24 Apr 2015 17:31:34 +0200 Subject: x86/entry: Remove unused 'kernel_stack' per-cpu variable Signed-off-by: Denys Vlasenko Acked-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1429889495-27850-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/thread_info.h | 2 -- arch/x86/kernel/cpu/common.c | 4 ---- arch/x86/kernel/process_32.c | 5 +---- arch/x86/kernel/process_64.c | 3 --- arch/x86/kernel/smpboot.c | 2 -- 5 files changed, 1 insertion(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index d656a363e1eb..472288962c99 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -177,8 +177,6 @@ struct thread_info { */ #ifndef __ASSEMBLY__ -DECLARE_PER_CPU(unsigned long, kernel_stack); - static inline struct thread_info *current_thread_info(void) { return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a62cf04dac8a..6bec0b55863e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1155,10 +1155,6 @@ static __init int setup_disablecpuid(char *arg) } __setup("clearcpuid=", setup_disablecpuid); -DEFINE_PER_CPU(unsigned long, kernel_stack) = - (unsigned long)&init_thread_union + THREAD_SIZE; -EXPORT_PER_CPU_SYMBOL(kernel_stack); - #ifdef CONFIG_X86_64 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1, diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 8ed2106b06da..a99900cedc22 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -302,13 +302,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) arch_end_context_switch(next_p); /* - * Reload esp0, kernel_stack, and current_top_of_stack. This changes + * Reload esp0 and cpu_current_top_of_stack. This changes * current_thread_info(). */ load_sp0(tss, next); - this_cpu_write(kernel_stack, - (unsigned long)task_stack_page(next_p) + - THREAD_SIZE); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ddfdbf74f174..82134506faa8 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -409,9 +409,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* Reload esp0 and ss1. This changes current_thread_info(). */ load_sp0(tss, next); - this_cpu_write(kernel_stack, - (unsigned long)task_stack_page(next_p) + THREAD_SIZE); - /* * Now maybe reload the debug registers and handle I/O bitmaps */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 50e547eac8cd..023cccf5a4ae 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -792,8 +792,6 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle) clear_tsk_thread_flag(idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); #endif - per_cpu(kernel_stack, cpu) = - (unsigned long)task_stack_page(idle) + THREAD_SIZE; } /* -- cgit v1.2.3 From 3a23208e69679597e767cf3547b1a30dd845d9b5 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 24 Apr 2015 17:31:35 +0200 Subject: x86/entry: Define 'cpu_current_top_of_stack' for 64-bit code 32-bit code has PER_CPU_VAR(cpu_current_top_of_stack). 64-bit code uses somewhat more obscure: PER_CPU_VAR(cpu_tss + TSS_sp0). 
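As an illustration (mine, not from the patch) of how such a top-of-stack value is consumed, the GET_THREAD_INFO macro visible in this series expands to roughly the following on 64-bit:

	movq	PER_CPU_VAR(cpu_tss + TSS_sp0), %rcx	/* top of current task's stack */
	subq	$THREAD_SIZE, %rcx			/* thread_info sits THREAD_SIZE below it */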
Define the 'cpu_current_top_of_stack' macro on CONFIG_X86_64 as well so that the PER_CPU_VAR(cpu_current_top_of_stack) expression can be used in both 32-bit and 64-bit code. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1429889495-27850-3-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 4 ++-- arch/x86/include/asm/thread_info.h | 10 ++++------ arch/x86/kernel/entry_64.S | 2 +- arch/x86/xen/xen-asm_64.S | 5 +++-- 4 files changed, 10 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 1b1330c07971..63450a596800 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -113,7 +113,7 @@ ENTRY(ia32_sysenter_target) * it is too small to ever cause noticeable irq latency. */ SWAPGS_UNSAFE_STACK - movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp ENABLE_INTERRUPTS(CLBR_NONE) /* Zero-extending 32-bit regs, do not remove */ @@ -346,7 +346,7 @@ ENTRY(ia32_cstar_target) SWAPGS_UNSAFE_STACK movl %esp,%r8d CFI_REGISTER rsp,r8 - movq PER_CPU_VAR(cpu_tss + TSS_sp0),%rsp + movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp ENABLE_INTERRUPTS(CLBR_NONE) /* Zero-extending 32-bit regs, do not remove */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 472288962c99..225ee545e1a0 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -195,16 +195,14 @@ static inline unsigned long current_stack_pointer(void) #else /* !__ASSEMBLY__ */ +#ifdef CONFIG_X86_64 +# define cpu_current_top_of_stack (cpu_tss + TSS_sp0) +#endif + /* Load thread_info address into "reg" */ -#ifdef CONFIG_X86_32 #define GET_THREAD_INFO(reg) \ _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \ _ASM_SUB $(THREAD_SIZE),reg ; -#else -#define GET_THREAD_INFO(reg) \ - _ASM_MOV PER_CPU_VAR(cpu_tss + TSS_sp0),reg ; \ - _ASM_SUB $(THREAD_SIZE),reg ; -#endif /* * ASM operand which evaluates to a 'thread_info' address of diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c13b86b40176..09c3f9e0e07e 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -216,7 +216,7 @@ ENTRY(system_call) GLOBAL(system_call_after_swapgs) movq %rsp,PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(cpu_tss + TSS_sp0),%rsp + movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp /* Construct struct pt_regs on stack */ pushq_cfi $__USER_DS /* pt_regs->ss */ diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 5aa7ec607b9e..04529e620559 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -54,7 +55,7 @@ ENTRY(xen_sysret64) * still with the kernel gs, so we can easily switch back */ movq %rsp, PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp pushq $__USER_DS pushq PER_CPU_VAR(rsp_scratch) @@ -73,7 +74,7 @@ ENTRY(xen_sysret32) * still with the kernel gs, so we can easily switch back */ movq %rsp, PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp pushq $__USER32_DS pushq 
PER_CPU_VAR(rsp_scratch) -- cgit v1.2.3 From c5bde906d2916d214d78cd8b67d665bf09867033 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 9 May 2015 11:36:50 -0400 Subject: x86/irq: Merge irq_regs & irq_stat Move irq_regs and irq_stat definitions to irq.c. Signed-off-by: Brian Gerst Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431185813-15413-2-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 6 ++++++ arch/x86/kernel/irq_32.c | 6 ------ arch/x86/kernel/irq_64.c | 6 ------ 3 files changed, 6 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index e5952c225532..fe2ed8bb507b 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -22,6 +22,12 @@ #define CREATE_TRACE_POINTS #include +DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); +EXPORT_PER_CPU_SYMBOL(irq_stat); + +DEFINE_PER_CPU(struct pt_regs *, irq_regs); +EXPORT_PER_CPU_SYMBOL(irq_regs); + atomic_t irq_err_count; /* Function pointer for generic interrupt vector handling */ diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index f9fd86a7fcc7..cd74f5978ab9 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -21,12 +21,6 @@ #include -DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); -EXPORT_PER_CPU_SYMBOL(irq_stat); - -DEFINE_PER_CPU(struct pt_regs *, irq_regs); -EXPORT_PER_CPU_SYMBOL(irq_regs); - #ifdef CONFIG_DEBUG_STACKOVERFLOW int sysctl_panic_on_stackoverflow __read_mostly; diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 394e643d7830..bc4604e500a3 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -20,12 +20,6 @@ #include #include -DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); -EXPORT_PER_CPU_SYMBOL(irq_stat); - -DEFINE_PER_CPU(struct pt_regs *, irq_regs); -EXPORT_PER_CPU_SYMBOL(irq_regs); - int sysctl_panic_on_stackoverflow; /* -- cgit v1.2.3 From 51bb92843edcba5a58138cad25ced97923048add Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 9 May 2015 11:36:52 -0400 Subject: x86/asm/entry: Remove SYSCALL_VECTOR Use IA32_SYSCALL_VECTOR for both compat and native. Signed-off-by: Brian Gerst Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431185813-15413-4-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq_vectors.h | 3 --- arch/x86/kernel/traps.c | 4 ++-- arch/x86/lguest/boot.c | 4 ++-- 3 files changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 666c89ec4bd7..07f27926d473 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -47,9 +47,6 @@ #define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR #define IA32_SYSCALL_VECTOR 0x80 -#ifdef CONFIG_X86_32 -# define SYSCALL_VECTOR 0x80 -#endif /* * Vectors 0x30-0x3f are used for ISA interrupts. 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 324ab5247687..5e0791f9d3dc 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -997,8 +997,8 @@ void __init trap_init(void) #endif #ifdef CONFIG_X86_32 - set_system_trap_gate(SYSCALL_VECTOR, &system_call); - set_bit(SYSCALL_VECTOR, used_vectors); + set_system_trap_gate(IA32_SYSCALL_VECTOR, &system_call); + set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif /* diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 8f9a133cc099..cab9aaa7802c 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -90,7 +90,7 @@ struct lguest_data lguest_data = { .noirq_iret = (u32)lguest_noirq_iret, .kernel_address = PAGE_OFFSET, .blocked_interrupts = { 1 }, /* Block timer interrupts */ - .syscall_vec = SYSCALL_VECTOR, + .syscall_vec = IA32_SYSCALL_VECTOR, }; /*G:037 @@ -866,7 +866,7 @@ static void __init lguest_init_IRQ(void) for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) { /* Some systems map "vectors" to interrupts weirdly. Not us! */ __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR); - if (i != SYSCALL_VECTOR) + if (i != IA32_SYSCALL_VECTOR) set_intr_gate(i, irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR)); } -- cgit v1.2.3 From 8b455e6577f325289cf2d1b20f493b2fe5c6c316 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 9 May 2015 11:36:53 -0400 Subject: x86/asm/entry/irq: Clean up IRQn_VECTOR macros Since the ISA irqs are in a single block, use ISA_IRQ_VECTOR(irq) instead of individual macros. Signed-off-by: Brian Gerst Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431185813-15413-5-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq_vectors.h | 18 +----------------- arch/x86/kernel/apic/io_apic.c | 4 ++-- arch/x86/kernel/apic/vector.c | 2 +- arch/x86/kernel/i8259.c | 8 ++++---- arch/x86/kernel/irqinit.c | 4 ++-- 5 files changed, 10 insertions(+), 26 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 07f27926d473..117db96ad5fb 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -52,23 +52,7 @@ * Vectors 0x30-0x3f are used for ISA interrupts. 
* round up to the next 16-vector boundary */ -#define IRQ0_VECTOR ((FIRST_EXTERNAL_VECTOR + 16) & ~15) - -#define IRQ1_VECTOR (IRQ0_VECTOR + 1) -#define IRQ2_VECTOR (IRQ0_VECTOR + 2) -#define IRQ3_VECTOR (IRQ0_VECTOR + 3) -#define IRQ4_VECTOR (IRQ0_VECTOR + 4) -#define IRQ5_VECTOR (IRQ0_VECTOR + 5) -#define IRQ6_VECTOR (IRQ0_VECTOR + 6) -#define IRQ7_VECTOR (IRQ0_VECTOR + 7) -#define IRQ8_VECTOR (IRQ0_VECTOR + 8) -#define IRQ9_VECTOR (IRQ0_VECTOR + 9) -#define IRQ10_VECTOR (IRQ0_VECTOR + 10) -#define IRQ11_VECTOR (IRQ0_VECTOR + 11) -#define IRQ12_VECTOR (IRQ0_VECTOR + 12) -#define IRQ13_VECTOR (IRQ0_VECTOR + 13) -#define IRQ14_VECTOR (IRQ0_VECTOR + 14) -#define IRQ15_VECTOR (IRQ0_VECTOR + 15) +#define ISA_IRQ_VECTOR(irq) (((FIRST_EXTERNAL_VECTOR + 16) & ~15) + irq) /* * Special IRQ vectors used by the SMP architecture, 0xf0-0xff diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f4dc2462a1ac..e01e4117188a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -258,11 +258,11 @@ int __init arch_early_ioapic_init(void) /* * For legacy IRQ's, start with assigning irq0 to irq15 to - * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. + * ISA_IRQ_VECTOR(irq) for all cpu's. */ for (i = 0; i < nr_legacy_irqs(); i++) { cfg = alloc_irq_and_cfg_at(i, node); - cfg->vector = IRQ0_VECTOR + i; + cfg->vector = ISA_IRQ_VECTOR(i); cpumask_setall(cfg->domain); } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 6cedd7914581..82d44c314a3f 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -314,7 +314,7 @@ void setup_vector_irq(int cpu) * legacy vector to irq mapping: */ for (irq = 0; irq < nr_legacy_irqs(); irq++) - per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq; + per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq; __setup_vector_irq(cpu); } diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index e7cc5370cd2f..16cb827a5b27 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -329,8 +329,8 @@ static void init_8259A(int auto_eoi) */ outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ - /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ - outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); + /* ICW2: 8259A-1 IR0-7 mapped to ISA_IRQ_VECTOR(0) */ + outb_pic(ISA_IRQ_VECTOR(0), PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); @@ -342,8 +342,8 @@ static void init_8259A(int auto_eoi) outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ - /* ICW2: 8259A-2 IR0-7 mapped to IRQ8_VECTOR */ - outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); + /* ICW2: 8259A-2 IR0-7 mapped to ISA_IRQ_VECTOR(8) */ + outb_pic(ISA_IRQ_VECTOR(8), PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index cd10a6437264..dc1e08d23552 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -86,7 +86,7 @@ void __init init_IRQ(void) int i; /* - * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15. + * On cpu 0, Assign ISA_IRQ_VECTOR(irq) to IRQ 0..15. * If these IRQ's are handled by legacy interrupt-controllers like PIC, * then this configuration will likely be static after the boot. If * these IRQ's are handled by more mordern controllers like IO-APIC, @@ -94,7 +94,7 @@ void __init init_IRQ(void) * irq's migrate etc. 
*/ for (i = 0; i < nr_legacy_irqs(); i++) - per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i; + per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = i; x86_init.irqs.intr_init(); } -- cgit v1.2.3 From f21262b8e092a770e39fbd405cc18a0247c3af68 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 11 May 2015 10:15:46 +0200 Subject: x86/alternatives: Switch AMD F15h and later to the P6 NOPs Software optimization guides for both F15h and F16h cite those NOPs as the optimal ones. A microbenchmark confirms that actually even older families are better with the single-insn NOPs so switch to them for the alternatives. Cycles count below includes the loop overhead of the measurement but that overhead is the same with all runs.

F10h, revE:
-----------
Running NOP tests, 1000 NOPs x 1000000 repetitions
K8:
 90                        288.212282 cycles
 66 90                     288.220840 cycles
 66 66 90                  288.219447 cycles
 66 66 66 90               288.223204 cycles
 66 66 90 66 90            571.393424 cycles
 66 66 90 66 66 90         571.374919 cycles
 66 66 66 90 66 66 90      572.249281 cycles
 66 66 66 90 66 66 66 90   571.388651 cycles
P6:
 90                        288.214193 cycles
 66 90                     288.225550 cycles
 0f 1f 00                  288.224441 cycles
 0f 1f 40 00               288.225030 cycles
 0f 1f 44 00 00            288.233558 cycles
 66 0f 1f 44 00 00         324.792342 cycles
 0f 1f 80 00 00 00 00      325.657462 cycles
 0f 1f 84 00 00 00 00 00   430.246643 cycles

F14h:
----
Running NOP tests, 1000 NOPs x 1000000 repetitions
K8:
 90                        510.404890 cycles
 66 90                     510.432117 cycles
 66 66 90                  510.561858 cycles
 66 66 66 90               510.541865 cycles
 66 66 90 66 90           1014.192782 cycles
 66 66 90 66 66 90        1014.226546 cycles
 66 66 66 90 66 66 90     1014.334299 cycles
 66 66 66 90 66 66 66 90  1014.381205 cycles
P6:
 90                        510.436710 cycles
 66 90                     510.448229 cycles
 0f 1f 00                  510.545100 cycles
 0f 1f 40 00               510.502792 cycles
 0f 1f 44 00 00            510.589517 cycles
 66 0f 1f 44 00 00         510.611462 cycles
 0f 1f 80 00 00 00 00      511.166794 cycles
 0f 1f 84 00 00 00 00 00   511.651641 cycles

F15h:
-----
Running NOP tests, 1000 NOPs x 1000000 repetitions
K8:
 90                        243.128396 cycles
 66 90                     243.129883 cycles
 66 66 90                  243.131631 cycles
 66 66 66 90               242.499324 cycles
 66 66 90 66 90            481.829083 cycles
 66 66 90 66 66 90         481.884413 cycles
 66 66 66 90 66 66 90      481.851446 cycles
 66 66 66 90 66 66 66 90   481.409220 cycles
P6:
 90                        243.127026 cycles
 66 90                     243.130711 cycles
 0f 1f 00                  243.122747 cycles
 0f 1f 40 00               242.497617 cycles
 0f 1f 44 00 00            245.354461 cycles
 66 0f 1f 44 00 00         361.930417 cycles
 0f 1f 80 00 00 00 00      362.844944 cycles
 0f 1f 84 00 00 00 00 00   480.514948 cycles

F16h:
-----
Running NOP tests, 1000 NOPs x 1000000 repetitions
K8:
 90                        507.793298 cycles
 66 90                     507.789636 cycles
 66 66 90                  507.826490 cycles
 66 66 66 90               507.859075 cycles
 66 66 90 66 90           1008.663129 cycles
 66 66 90 66 66 90        1008.696259 cycles
 66 66 66 90 66 66 90     1008.692517 cycles
 66 66 66 90 66 66 66 90  1008.755399 cycles
P6:
 90                        507.795232 cycles
 66 90                     507.794761 cycles
 0f 1f 00                  507.834901 cycles
 0f 1f 40 00               507.822629 cycles
 0f 1f 44 00 00            507.838493 cycles
 66 0f 1f 44 00 00         507.908597 cycles
 0f 1f 80 00 00 00 00      507.946417 cycles
 0f 1f 84 00 00 00 00 00   507.954960 cycles

Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Aravind Gopalakrishnan Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431332153-18566-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index aef653193160..b0932c4341b3 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -227,6 +227,15 @@ void __init arch_init_ideal_nops(void) #endif } break; + + case X86_VENDOR_AMD: + if (boot_cpu_data.x86 > 0xf) { + ideal_nops = p6_nops; + return; + } + + /* fall through */ + default: #ifdef CONFIG_X86_64 ideal_nops = k8_nops; -- cgit v1.2.3 From cd2f6a5a4704a359635eb34919317052e6a96ba7 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Mon, 11 May 2015 10:15:52 +0200 Subject: x86/mm/mtrr: Remove incorrect address check in __mtrr_type_lookup() __mtrr_type_lookup() checks MTRR fixed ranges when mtrr_state.have_fixed is set and start is less than 0x100000. However, the 'else if (start < 0x1000000)' in the code checks with an incorrect address as it has an extra-zero in the address. The code still runs correctly as this check is meaningless, though. This patch replaces the incorrect address check with 'else' with no condition. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave.hansen@intel.com Cc: linux-mm Cc: pebolle@tiscali.nl Link: http://lkml.kernel.org/r/1427234921-19737-4-git-send-email-toshi.kani@hp.com Link: http://lkml.kernel.org/r/1431332153-18566-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 7d74f7b3c6ba..5b239679cfc9 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -137,7 +137,7 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) idx = 1 * 8; idx += ((start - 0x80000) >> 14); return mtrr_state.fixed_ranges[idx]; - } else if (start < 0x1000000) { + } else { idx = 3 * 8; idx += ((start - 0xC0000) >> 12); return mtrr_state.fixed_ranges[idx]; -- cgit v1.2.3 From d68921f9bd148359e6d01c84aaa2e32bfbd82970 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 11 May 2015 17:27:09 -0400 Subject: x86/smp/boot: Add cmdline "cpu_init_udelay=N" to specify cpu_up() delay MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No change to default behavior. Replace the hard-coded mdelay(10) in cpu_up() with a variable udelay, that is set to a defined default -- rather than a magic number. Add a boot-time override, "cpu_init_udelay=N" Signed-off-by: Len Brown Cc: Alan Cox Cc: Arjan van de Ven Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Jan H. 
Schönherr Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2fe8e6c798e8def271122f62df9bbf58dc283e2a.1431379433.git.len.brown@intel.com Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 6 ++++++ arch/x86/kernel/smpboot.c | 23 ++++++++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 61ab1628a057..a320a41e7412 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -746,6 +746,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. cpuidle.off=1 [CPU_IDLE] disable the cpuidle sub-system + cpu_init_udelay=N + [X86] Delay for N microsec between assert and de-assert + of APIC INIT to start processors. This delay occurs + on every CPU online, such as boot, and resume from suspend. + Default: 10000 + cpcihp_generic= [HW,PCI] Generic port I/O CompactPCI driver Format: ,,,[,] diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 51203f60587f..0629a8e513af 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -513,6 +513,27 @@ void __inquire_remote_apic(int apicid) } } +/* + * The Multiprocessor Specification 1.4 (1997) example code suggests + * that there should be a 10ms delay between the BSP asserting INIT + * and de-asserting INIT, when starting a remote processor. + * But that slows boot and resume on modern processors, which include + * many cores and don't require that delay. + * + * Cmdline "init_cpu_udelay=" is available to over-ride this delay. + */ +#define UDELAY_10MS_DEFAULT 10000 + +static unsigned int init_udelay = UDELAY_10MS_DEFAULT; + +static int __init cpu_init_udelay(char *str) +{ + get_option(&str, &init_udelay); + + return 0; +} +early_param("cpu_init_udelay", cpu_init_udelay); + /* * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this @@ -584,7 +605,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); - mdelay(10); + mdelay(init_udelay); pr_debug("Deasserting INIT\n"); -- cgit v1.2.3 From 1a744cb356c57303fc97eb15a298032170f841fa Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 11 May 2015 17:27:10 -0400 Subject: x86/smp/boot: Remove 10ms delay from cpu_up() on modern processors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modern processor familes do not require the 10ms delay in cpu_up() to de-assert INIT. This speeds up boot and resume by 10ms per (application) processor. Signed-off-by: Len Brown Cc: Alan Cox Cc: Arjan van de Ven Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Jan H. Schönherr Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/021ce30c88f216ad39686646421194dc25671e55.1431379433.git.len.brown@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0629a8e513af..85bd6aad8c74 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -521,6 +521,7 @@ void __inquire_remote_apic(int apicid) * many cores and don't require that delay. * * Cmdline "init_cpu_udelay=" is available to over-ride this delay. 
+ * Modern processor families are quirked to remove the delay entirely. */ #define UDELAY_10MS_DEFAULT 10000 @@ -534,6 +535,18 @@ static int __init cpu_init_udelay(char *str) } early_param("cpu_init_udelay", cpu_init_udelay); +static void __init smp_quirk_init_udelay(void) +{ + /* if cmdline changed it from default, leave it alone */ + if (init_udelay != UDELAY_10MS_DEFAULT) + return; + + /* if modern processor, use no delay */ + if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) || + ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) + init_udelay = 0; +} + /* * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this @@ -1210,6 +1223,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) uv_system_init(); set_mtrr_aps_delayed_init(); + + smp_quirk_init_udelay(); } void arch_enable_nonboot_cpus_begin(void) -- cgit v1.2.3 From 853b160aaafbe27d6304c8832bb7340d57c6b04e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 13 May 2015 08:40:49 +0200 Subject: Revert f5d6a52f5111 ("x86/smpboot: Skip delays during SMP initialization similar to Xen") MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Huang Ying reported x86 boot hangs due to this commit. Turns out that the change, despite its changelog, does more than just change timeouts: it also changes the way we assert/deassert INIT via the APIC_DM_INIT IPI, in the x2apic case it skips the deassert step. This is historically fragile code and the patch did not improve it, so revert these changes. This commit: 1a744cb356c5 ("x86/smp/boot: Remove 10ms delay from cpu_up() on modern processors") independently removes the worst of the delays (the 10 msec delay). The remaining delays can be addressed one by one, combined with careful testing. Reported-by: Huang Ying Cc: Anthony Liguori Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Gang Wei Cc: H. Peter Anvin Cc: Jan H. 
Schönherr Cc: Len Brown Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Tim Deegan Link: http://lkml.kernel.org/r/1430732554-7294-1-git-send-email-jschoenh@amazon.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 58 ++++++++++++++++++----------------------------- 1 file changed, 22 insertions(+), 36 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 85bd6aad8c74..b9aaa3930b2f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -614,34 +614,22 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, phys_apicid); - if (!cpu_has_x2apic) { - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); - mdelay(init_udelay); + mdelay(init_udelay); - pr_debug("Deasserting INIT\n"); + pr_debug("Deasserting INIT\n"); - /* Target chip */ - /* Send IPI */ - apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); + /* Target chip */ + /* Send IPI */ + apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); - mb(); - atomic_set(&init_deasserted, 1); - } else if (tboot_enabled()) { - /* - * With tboot AP is actually spinning in a mini-guest before - * receiving INIT. Upon receiving INIT ipi, AP need time to - * VMExit, update VMCS to tracking SIPIs and VMResume. - * - * While AP is in root mode handling the INIT the CPU will drop - * any SIPIs - */ - udelay(10); - } + mb(); + atomic_set(&init_deasserted, 1); /* * Should we send STARTUP IPIs ? @@ -683,22 +671,20 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), phys_apicid); - if (!cpu_has_x2apic) { - /* - * Give the other CPU some time to accept the IPI. - */ - udelay(300); + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(300); - pr_debug("Startup point 1\n"); + pr_debug("Startup point 1\n"); - pr_debug("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); - /* - * Give the other CPU some time to accept the IPI. - */ - udelay(200); - } + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(200); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); -- cgit v1.2.3 From 4a00c95dcdba45c9592af2e908c0816fd54f5544 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 11 May 2015 18:56:49 +0900 Subject: x86/hpet: Pass proper pointer to irq_alloc_info Fix the following oops: hpet_msi_get_hwirq+0x1f/0x27 msi_domain_alloc+0x35/0xfe ? trace_hardirqs_on_caller+0x16c/0x188 irq_domain_alloc_irqs_recursive+0x51/0x95 __irq_domain_alloc_irqs+0x151/0x223 hpet_assign_irq+0x5d/0x68 hpet_msi_capability_lookup+0x121/0x1cb ? hpet_enable+0x2b4/0x2b4 hpet_late_init+0x5f/0xf2 ? hpet_enable+0x2b4/0x2b4 do_one_initcall+0x184/0x199 kernel_init_freeable+0x1af/0x237 ? rest_init+0x13a/0x13a kernel_init+0xe/0xd4 ret_from_fork+0x3f/0x70 ? rest_init+0x13a/0x13a Since 3cb96f0c9733 ('x86/hpet: Enhance HPET IRQ to support hierarchical irqdomains') hpet_msi_capability_lookup() uses hpet_assign_irq(). 
The latter initializes irq_alloc_info on stack, but passes a NULL pointer to irq_domain_alloc_irqs(), which causes a NULL pointer dereference later in hpet_msi_get_hwirq(). Pass the pointer to the irq_alloc_info to irq_domain_alloc_irqs(). Fixes: 3cb96f0c9733 'x86/hpet: Enhance HPET IRQ to support hierarchical irqdomains' Signed-off-by: Sergey Senozhatsky Reviewed-by: Jiang Liu Cc: Sergey Senozhatsky Link: http://lkml.kernel.org/r/20150512041444.GA1094@swordfish Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 58fde664e7c0..ef516afa20bb 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -351,6 +351,6 @@ int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev, info.hpet_id = hpet_dev_id(domain); info.hpet_index = dev_num; - return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, NULL); + return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); } #endif -- cgit v1.2.3 From 486ca539caa082c7f2929c207af1b3ce2a304489 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 7 May 2015 10:53:56 +0800 Subject: x86, irq: Allocate CPU vectors from device local CPUs if possible On NUMA systems, an IO device may be associated with a NUMA node. It may improve IO performance to allocate resources, such as memory and interrupts, from the device's local node. This patch introduces a mechanism to support CPU vector allocation policies. It tries to allocate CPU vectors from CPUs on the device's local node first, and then falls back to all online (global) CPUs. This mechanism may be used to support NumaConnect systems to allocate CPU vectors from the device's local node. Signed-off-by: Jiang Liu Tested-by: Daniel J Blueman Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Bjorn Helgaas Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Yinghai Lu Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1430967244-28905-1-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/vector.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 2766747e1a3b..b590c9d6736a 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -210,6 +210,18 @@ static int assign_irq_vector(int irq, struct apic_chip_data *data, return err; } +static int assign_irq_vector_policy(int irq, int node, + struct apic_chip_data *data, + struct irq_alloc_info *info) +{ + if (info && info->mask) + return assign_irq_vector(irq, data, info->mask); + if (node != NUMA_NO_NODE && + assign_irq_vector(irq, data, cpumask_of_node(node)) == 0) + return 0; + return assign_irq_vector(irq, data, apic->target_cpus()); +} + static void clear_irq_vector(int irq, struct apic_chip_data *data) { int cpu, vector; @@ -258,12 +270,6 @@ void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src) memset(dst, 0, sizeof(*dst)); } -static inline const struct cpumask * -irq_alloc_info_get_mask(struct irq_alloc_info *info) -{ - return (!info || !info->mask) ? 
apic->target_cpus() : info->mask; -} - static void x86_vector_free_irqs(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { @@ -289,7 +295,6 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, { struct irq_alloc_info *info = arg; struct apic_chip_data *data; - const struct cpumask *mask; struct irq_data *irq_data; int i, err; @@ -300,7 +305,6 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1) return -ENOSYS; - mask = irq_alloc_info_get_mask(info); for (i = 0; i < nr_irqs; i++) { irq_data = irq_domain_get_irq_data(domain, virq + i); BUG_ON(!irq_data); @@ -318,7 +322,8 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, irq_data->chip = &lapic_controller; irq_data->chip_data = data; irq_data->hwirq = virq + i; - err = assign_irq_vector(virq, data, mask); + err = assign_irq_vector_policy(virq, irq_data->node, data, + info); if (err) goto error; } -- cgit v1.2.3 From 6af7faf6076697a39438cf38e21b4035e2ebdac9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 May 2015 15:48:25 +0200 Subject: x86: Use entering[_ack]_irq() instead of open coding it Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/vector.c | 6 ++---- arch/x86/kernel/cpu/mshyperv.c | 6 ++---- arch/x86/kernel/irq.c | 16 ++++------------ 3 files changed, 8 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index b590c9d6736a..28eba2d38b15 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -542,9 +542,7 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; - ack_APIC_irq(); - irq_enter(); - exit_idle(); + entering_ack_irq(); me = smp_processor_id(); for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { @@ -596,7 +594,7 @@ unlock: raw_spin_unlock(&desc->lock); } - irq_exit(); + exiting_irq(); } static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 939155ffdece..aad4bd84b475 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -39,14 +39,12 @@ void hyperv_vector_handler(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); - irq_enter(); - exit_idle(); - + entering_irq(); inc_irq_stat(irq_hv_callback_count); if (vmbus_handler) vmbus_handler(); - irq_exit(); + exiting_irq(); set_irq_regs(old_regs); } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index fe2ed8bb507b..be3894512820 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -198,8 +198,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) unsigned vector = ~regs->orig_ax; unsigned irq; - irq_enter(); - exit_idle(); + entering_irq(); irq = __this_cpu_read(vector_irq[vector]); @@ -215,7 +214,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) } } - irq_exit(); + exiting_irq(); set_irq_regs(old_regs); return 1; @@ -250,16 +249,9 @@ __visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); - ack_APIC_irq(); - - irq_enter(); - - exit_idle(); - + entering_ack_irq(); inc_irq_stat(kvm_posted_intr_ipis); - - irq_exit(); - + exiting_irq(); set_irq_regs(old_regs); } #endif -- cgit v1.2.3 From 6dc178760553605c58d78bd403dfcb4e042c5b72 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: 
Fri, 15 May 2015 15:50:45 +0200 Subject: x86: Consolidate irq entering inlines smp.c and irq_work.c implement the same inline helper. Move it to apic.h and use it everywhere. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra --- arch/x86/include/asm/apic.h | 6 ++++++ arch/x86/kernel/irq_work.c | 10 ++-------- arch/x86/kernel/smp.c | 19 ++++++------------- 3 files changed, 14 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 976b86a325e5..c8393634ca0c 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -644,6 +644,12 @@ static inline void entering_ack_irq(void) entering_irq(); } +static inline void ipi_entering_ack_irq(void) +{ + ack_APIC_irq(); + irq_enter(); +} + static inline void exiting_irq(void) { irq_exit(); diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 15d741ddfeeb..dc5fa6a1e8d6 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -10,12 +10,6 @@ #include #include -static inline void irq_work_entering_irq(void) -{ - irq_enter(); - ack_APIC_irq(); -} - static inline void __smp_irq_work_interrupt(void) { inc_irq_stat(apic_irq_work_irqs); @@ -24,14 +18,14 @@ static inline void __smp_irq_work_interrupt(void) __visible void smp_irq_work_interrupt(struct pt_regs *regs) { - irq_work_entering_irq(); + ipi_entering_ack_irq(); __smp_irq_work_interrupt(); exiting_irq(); } __visible void smp_trace_irq_work_interrupt(struct pt_regs *regs) { - irq_work_entering_irq(); + ipi_entering_ack_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); __smp_irq_work_interrupt(); trace_irq_work_exit(IRQ_WORK_VECTOR); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index be8e1bde07aa..15aaa69bbb5e 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -170,8 +170,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) asmlinkage __visible void smp_reboot_interrupt(void) { - ack_APIC_irq(); - irq_enter(); + ipi_entering_ack_irq(); stop_this_cpu(NULL); irq_exit(); } @@ -265,12 +264,6 @@ __visible void smp_reschedule_interrupt(struct pt_regs *regs) */ } -static inline void smp_entering_irq(void) -{ - ack_APIC_irq(); - irq_enter(); -} - __visible void smp_trace_reschedule_interrupt(struct pt_regs *regs) { /* @@ -279,7 +272,7 @@ __visible void smp_trace_reschedule_interrupt(struct pt_regs *regs) * scheduler_ipi(). This is OK, since those functions are allowed * to nest. 
*/ - smp_entering_irq(); + ipi_entering_ack_irq(); trace_reschedule_entry(RESCHEDULE_VECTOR); __smp_reschedule_interrupt(); trace_reschedule_exit(RESCHEDULE_VECTOR); @@ -297,14 +290,14 @@ static inline void __smp_call_function_interrupt(void) __visible void smp_call_function_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); __smp_call_function_interrupt(); exiting_irq(); } __visible void smp_trace_call_function_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); __smp_call_function_interrupt(); trace_call_function_exit(CALL_FUNCTION_VECTOR); @@ -319,14 +312,14 @@ static inline void __smp_call_function_single_interrupt(void) __visible void smp_call_function_single_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); __smp_call_function_single_interrupt(); exiting_irq(); } __visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); __smp_call_function_single_interrupt(); trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); -- cgit v1.2.3 From e839004b49c571e20006092cbe9da8f2c95d2e71 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 16 May 2015 18:17:59 +0200 Subject: x86/asm/head*.S: Change global labels to local Make the disassembly look less confusing: -- head_64.o.before.asm ++ head_64.o.after.asm 0000000000000120 : 120: fc cld 121: 83 3c 24 02 cmpl $0x2,(%rsp) - 125: 0f 84 9d 00 00 00 je 1c8 + 125: 0f 84 9d 00 00 00 je 1c8 12b: 83 3d 00 00 00 00 02 cmpl $0x2,0x0(%rip) # 132 132: 74 7e je 1b2 134: ff 05 00 00 00 00 incl 0x0(%rip) # 13a @@ -1198,9 +1198,7 @@ Disassembly of section .init.text: 1bf: 5a pop %rdx 1c0: 59 pop %rcx 1c1: 58 pop %rax - 1c2: ff 0d 00 00 00 00 decl 0x0(%rip) # 1c8 - -00000000000001c8 : + 1c2: ff 0d 00 00 00 00 decl 0x0(%rip) # 1c8 1c8: 48 83 c4 10 add $0x10,%rsp 1cc: 48 cf iretq -- head_32.o.before.asm ++ head_32.o.after.asm 0000016c : 16c: fc cld 16d: 83 3c 24 02 cmpl $0x2,(%esp) - 171: 74 73 je 1e6 + 171: 74 73 je 1e6 173: 36 83 3d 00 00 00 00 cmpl $0x2,%ss:0x0 17a: 02 17b: 74 5a je 1d7 @@ -483,8 +483,6 @@ Disassembly of section .init.text: 1dd: 59 pop %ecx 1de: 58 pop %eax 1df: 36 ff 0d 00 00 00 00 decl %ss:0x0 - -000001e6 : 1e6: 83 c4 08 add $0x8,%esp 1e9: cf iret 1ea: 66 90 xchg %ax,%ax No functionality change. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431793079-11153-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 4 ++-- arch/x86/kernel/head_64.S | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index d031bad9e07e..02d257256200 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -547,7 +547,7 @@ ENTRY(early_idt_handler) cld cmpl $2,(%esp) # X86_TRAP_NMI - je is_nmi # Ignore NMI + je .Lis_nmi # Ignore NMI cmpl $2,%ss:early_recursion_flag je hlt_loop @@ -600,7 +600,7 @@ ex_entry: pop %ecx pop %eax decl %ss:early_recursion_flag -is_nmi: +.Lis_nmi: addl $8,%esp /* drop vector number and error code */ iret ENDPROC(early_idt_handler) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ae6588b301c2..43eafc8afb69 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -344,7 +344,7 @@ ENTRY(early_idt_handler) cld cmpl $2,(%rsp) # X86_TRAP_NMI - je is_nmi # Ignore NMI + je .Lis_nmi # Ignore NMI cmpl $2,early_recursion_flag(%rip) jz 1f @@ -409,7 +409,7 @@ ENTRY(early_idt_handler) popq %rcx popq %rax decl early_recursion_flag(%rip) -is_nmi: +.Lis_nmi: addq $16,%rsp # drop vector number and error code INTERRUPT_RETURN ENDPROC(early_idt_handler) -- cgit v1.2.3 From adeb5537849d9db428fe0ddc3562e5a765a347e2 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 15 May 2015 22:39:06 +0200 Subject: x86/asm/entry/64: Use shorter MOVs from segment registers The "movw %ds,%cx" instruction needs a 0x66 prefix, while "movl %ds,%ecx" does not. The difference is that latter form (on 64-bit CPUs) overwrites the entire %ecx, not only its lower half. But subsequent code doesn't depend on the value of upper half of %ecx, so we can safely use the shorter instruction. The new code is also faster than the old one - now we don't depend on the old value of %ecx, but this code fragment is not performance-critical so it does not matter much. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1431722346-26585-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 09c3f9e0e07e..47b95813dc37 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1190,17 +1190,17 @@ ENTRY(xen_failsafe_callback) /*CFI_REL_OFFSET ds,DS*/ CFI_REL_OFFSET r11,8 CFI_REL_OFFSET rcx,0 - movw %ds,%cx + movl %ds,%ecx cmpw %cx,0x10(%rsp) CFI_REMEMBER_STATE jne 1f - movw %es,%cx + movl %es,%ecx cmpw %cx,0x18(%rsp) jne 1f - movw %fs,%cx + movl %fs,%ecx cmpw %cx,0x20(%rsp) jne 1f - movw %gs,%cx + movl %gs,%ecx cmpw %cx,0x28(%rsp) jne 1f /* All segments match their saved values => Category 2 (Bad IRET). 
*/ -- cgit v1.2.3 From 7cb685982157567dcc55eb92d1c38d237465203b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 18 May 2015 12:05:13 +0200 Subject: x86/smp/boot: Fix legacy SMP bootup slow-boot bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So while testing kernels using tools/kvm/ (kvmtool) I noticed that it booted super slow: [ 0.142991] Performance Events: no PMU driver, software events only. [ 0.149265] x86: Booting SMP configuration: [ 0.149765] .... node #0, CPUs: #1 [ 0.148304] kvm-clock: cpu 1, msr 2:1bfe9041, secondary cpu clock [ 10.158813] KVM setup async PF for cpu 1 [ 10.159000] #2 [ 10.159000] kvm-stealtime: cpu 1, msr 211a4d400 [ 10.158829] kvm-clock: cpu 2, msr 2:1bfe9081, secondary cpu clock [ 20.167805] KVM setup async PF for cpu 2 [ 20.168000] #3 [ 20.168000] kvm-stealtime: cpu 2, msr 211a8d400 [ 20.167818] kvm-clock: cpu 3, msr 2:1bfe90c1, secondary cpu clock [ 30.176902] KVM setup async PF for cpu 3 [ 30.177000] #4 [ 30.177000] kvm-stealtime: cpu 3, msr 211acd400 One CPU booted up per 10 seconds. With 120 CPUs that takes a while. Bisection pinpointed this commit: 853b160aaafb ("Revert f5d6a52f5111 ("x86/smpboot: Skip delays during SMP initialization similar to Xen")") But that commit just restores previous behavior, so it cannot cause the problem. After some head scratching it turns out that these two commits: 1a744cb356c5 ("x86/smp/boot: Remove 10ms delay from cpu_up() on modern processors") d68921f9bd14 ("x86/smp/boot: Add cmdline "cpu_init_udelay=N" to specify cpu_up() delay") added the following code to smpboot.c: - mdelay(10); + mdelay(init_udelay); Note the mismatch in the units: the delay is called 'udelay' and is set to microseconds - while the function used here is actually 'mdelay', which counts in milliseconds ... So the delay for legacy systems is off by a factor of 1,000, so instead of 10 msecs we waited for 10 seconds ... The reason bisection pointed to 853b160aaafb was that 853b160aaafb removed a (broken) boot-time speedup patch, which masked the factor 1,000 bug. Fix it by using udelay(). This fixes my bootup problems. Cc: Len Brown Cc: Alan Cox Cc: Arjan van de Ven Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Jan H. Schönherr Cc: Linus Torvalds Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b9aaa3930b2f..fd6291c921b6 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -617,7 +617,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); - mdelay(init_udelay); + udelay(init_udelay); pr_debug("Deasserting INIT\n"); -- cgit v1.2.3 From ea6cd25058f39ac69623efdcbd94a7fc7d4d13f0 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sat, 9 May 2015 20:27:37 -0400 Subject: x86: Rename eisa_set_level_irq to elcr_set_level_irq This routine has been around for over a decade, but with EISA being dead and abandoned for about twice that long, the name can be kind of confusing. The function is going at the PIC Edge/Level Configuration Registers (ELCR), so rename it as such and mentally decouple it from the long since dead EISA bus. Signed-off-by: Paul Gortmaker Reviewed-by: Maciej W. Rozycki Acked-by: Pavel Machek Cc: Rafael J. 
Wysocki Cc: Len Brown Cc: Bjorn Helgaas Cc: x86@kernel.org Link: http://lkml.kernel.org/r/1431217657-934-1-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hw_irq.h | 3 +-- arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/pci/irq.c | 13 +++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 9472c9aff26d..9ec5d37d8da3 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -157,8 +157,7 @@ static inline void unlock_vector_lock(void) {} extern atomic_t irq_err_count; extern atomic_t irq_mis_count; -/* EISA */ -extern void eisa_set_level_irq(unsigned int irq); +extern void elcr_set_level_irq(unsigned int irq); /* SMP */ extern __visible void smp_apic_timer_interrupt(struct pt_regs *); diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 271293ad89d7..e49ee24da85e 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -608,7 +608,7 @@ static int acpi_register_gsi_pic(struct device *dev, u32 gsi, * Make sure all (legacy) PCI IRQs are set as level-triggered. */ if (trigger == ACPI_LEVEL_SENSITIVE) - eisa_set_level_irq(gsi); + elcr_set_level_irq(gsi); #endif return gsi; diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 5dc6ca5e1741..9bd115484745 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -146,19 +146,20 @@ static void __init pirq_peer_trick(void) /* * Code for querying and setting of IRQ routes on various interrupt routers. + * PIC Edge/Level Control Registers (ELCR) 0x4d0 & 0x4d1. */ -void eisa_set_level_irq(unsigned int irq) +void elcr_set_level_irq(unsigned int irq) { unsigned char mask = 1 << (irq & 7); unsigned int port = 0x4d0 + (irq >> 3); unsigned char val; - static u16 eisa_irq_mask; + static u16 elcr_irq_mask; - if (irq >= 16 || (1 << irq) & eisa_irq_mask) + if (irq >= 16 || (1 << irq) & elcr_irq_mask) return; - eisa_irq_mask |= (1 << irq); + elcr_irq_mask |= (1 << irq); printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq); val = inb(port); if (!(val & mask)) { @@ -965,11 +966,11 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) { msg = "found"; - eisa_set_level_irq(irq); + elcr_set_level_irq(irq); } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { if (r->set(pirq_router_dev, dev, pirq, newirq)) { - eisa_set_level_irq(newirq); + elcr_set_level_irq(newirq); msg = "assigned"; irq = newirq; } -- cgit v1.2.3 From a2f1c8bdc02bfcaa5a658283b883fdb54e328b36 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Tue, 19 May 2015 17:07:15 +0800 Subject: x86/irq/msi: Implement irq_set_vcpu_affinity for remapped MSI irqs Implement irq_set_vcpu_affinity for pci_msi_ir_controller. 
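Once this callback is wired up, posted-interrupt delivery for a remapped MSI can be requested through the generic irq_set_vcpu_affinity() API, which now reaches the parent remapping domain. A rough usage sketch, not part of this diff: 'pi_data' is a hypothetical, IOMMU-driver-defined descriptor for the target vCPU, and the helper names are placeholders.

    #include <linux/interrupt.h>

    /* Sketch: ask the irqchip hierarchy to post a remapped MSI to a vCPU. */
    static int msi_enable_posting(unsigned int irq, void *pi_data)
    {
            return irq_set_vcpu_affinity(irq, pi_data);
    }

    /* Passing NULL is expected to revert the IRQ to plain remapped delivery. */
    static int msi_disable_posting(unsigned int irq)
    {
            return irq_set_vcpu_affinity(irq, NULL);
    }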
Signed-off-by: Feng Wu Reviewed-by: Jiang Liu Link: http://lkml.kernel.org/r/1432026437-16560-3-git-send-email-feng.wu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/msi.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index ef516afa20bb..1a9d735e09c6 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -152,6 +152,7 @@ static struct irq_chip pci_msi_ir_controller = { .irq_mask = pci_msi_mask_irq, .irq_ack = irq_chip_ack_parent, .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_set_vcpu_affinity = irq_chip_set_vcpu_affinity_parent, .flags = IRQCHIP_SKIP_SET_WAKE, }; -- cgit v1.2.3 From f6b3c72c23661e5534cd2eede16e9bac7ebb761c Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Tue, 19 May 2015 17:07:16 +0800 Subject: x86/irq: Define a global vector for VT-d Posted-Interrupts Currently, we use a global vector as the Posted-Interrupts Notification Event for all the vCPUs in the system. We need to introduce another global vector for VT-d Posted-Interrtups, which will be used to wakeup the sleep vCPU when an external interrupt from a direct-assigned device happens for that vCPU. [ tglx: Removed a gazillion of extra newlines ] Signed-off-by: Feng Wu Cc: jiang.liu@linux.intel.com Link: http://lkml.kernel.org/r/1432026437-16560-4-git-send-email-feng.wu@intel.com Suggested-by: Yang Zhang Acked-by: H. Peter Anvin Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/entry_arch.h | 2 ++ arch/x86/include/asm/hardirq.h | 1 + arch/x86/include/asm/hw_irq.h | 2 ++ arch/x86/include/asm/irq.h | 4 ++++ arch/x86/include/asm/irq_vectors.h | 1 + arch/x86/kernel/entry_64.S | 2 ++ arch/x86/kernel/irq.c | 26 ++++++++++++++++++++++++++ arch/x86/kernel/irqinit.c | 2 ++ 8 files changed, 40 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index dc5fa661465f..27ca0afcccd7 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -23,6 +23,8 @@ BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) #ifdef CONFIG_HAVE_KVM BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR, smp_kvm_posted_intr_ipi) +BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR, + smp_kvm_posted_intr_wakeup_ipi) #endif /* diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 0f5fb6b6567e..986606539395 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -14,6 +14,7 @@ typedef struct { #endif #ifdef CONFIG_HAVE_KVM unsigned int kvm_posted_intr_ipis; + unsigned int kvm_posted_intr_wakeup_ipis; #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 9ec5d37d8da3..10c80d4f8386 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -29,6 +29,7 @@ extern asmlinkage void apic_timer_interrupt(void); extern asmlinkage void x86_platform_ipi(void); extern asmlinkage void kvm_posted_intr_ipi(void); +extern asmlinkage void kvm_posted_intr_wakeup_ipi(void); extern asmlinkage void error_interrupt(void); extern asmlinkage void irq_work_interrupt(void); @@ -58,6 +59,7 @@ extern void trace_call_function_single_interrupt(void); #define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt #define trace_reboot_interrupt reboot_interrupt #define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi +#define 
trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi #endif /* CONFIG_TRACING */ #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index a80cbb88ea91..8008d06581c7 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -30,6 +30,10 @@ extern void fixup_irqs(void); extern void irq_force_complete_move(int); #endif +#ifdef CONFIG_HAVE_KVM +extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)); +#endif + extern void (*x86_platform_ipi_callback)(void); extern void native_init_IRQ(void); extern bool handle_irq(unsigned irq, struct pt_regs *regs); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index bf55235d7772..0ed29ac13a9d 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -86,6 +86,7 @@ /* Vector for KVM to deliver posted interrupt IPI */ #ifdef CONFIG_HAVE_KVM #define POSTED_INTR_VECTOR 0xf2 +#define POSTED_INTR_WAKEUP_VECTOR 0xf1 #endif /* diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 47b95813dc37..22aadc917868 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -916,6 +916,8 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \ #ifdef CONFIG_HAVE_KVM apicinterrupt3 POSTED_INTR_VECTOR \ kvm_posted_intr_ipi smp_kvm_posted_intr_ipi +apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR \ + kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi #endif #ifdef CONFIG_X86_MCE_THRESHOLD diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index be3894512820..90b2f7052f5b 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -242,6 +242,18 @@ __visible void smp_x86_platform_ipi(struct pt_regs *regs) } #ifdef CONFIG_HAVE_KVM +static void dummy_handler(void) {} +static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler; + +void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)) +{ + if (handler) + kvm_posted_intr_wakeup_handler = handler; + else + kvm_posted_intr_wakeup_handler = dummy_handler; +} +EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler); + /* * Handler for POSTED_INTERRUPT_VECTOR. */ @@ -254,6 +266,20 @@ __visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs) exiting_irq(); set_irq_regs(old_regs); } + +/* + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. 
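+ * Raised when a VT-d posted interrupt arrives for a vCPU that is not
+ * currently running: the callback installed through
+ * kvm_set_posted_intr_wakeup_handler() is invoked so that KVM can wake
+ * the sleeping vCPU.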
+ */ +__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + entering_ack_irq(); + inc_irq_stat(kvm_posted_intr_wakeup_ipis); + kvm_posted_intr_wakeup_handler(); + exiting_irq(); + set_irq_regs(old_regs); +} #endif __visible void smp_trace_x86_platform_ipi(struct pt_regs *regs) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index dc1e08d23552..680723a8e4b6 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -144,6 +144,8 @@ static void __init apic_intr_init(void) #ifdef CONFIG_HAVE_KVM /* IPI for KVM to deliver posted interrupt */ alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi); + /* IPI for KVM to deliver interrupt to wake up tasks */ + alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi); #endif /* IPI vectors for APIC spurious and error interrupts */ -- cgit v1.2.3 From 501b32653ebf49114cccb9afbf9150cf18fd8700 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Tue, 19 May 2015 17:07:17 +0800 Subject: x86/irq: Show statistics information for posted-interrupts Show the statistics information for notification event and wakeup event for posted-interrupt in /proc/interrupts. [ tglx: Named the short identifiers PIN and PIW to match the long identifiers ] Signed-off-by: Feng Wu Cc: jiang.liu@linux.intel.com Link: http://lkml.kernel.org/r/1432026437-16560-5-git-send-email-feng.wu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/irq.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 90b2f7052f5b..7e10c8b4b318 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -141,6 +141,18 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); +#endif +#ifdef CONFIG_HAVE_KVM + seq_printf(p, "%*s: ", prec, "PIN"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis); + seq_puts(p, " Posted-interrupt notification event\n"); + + seq_printf(p, "%*s: ", prec, "PIW"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->kvm_posted_intr_wakeup_ipis); + seq_puts(p, " Posted-interrupt wakeup event\n"); #endif return 0; } -- cgit v1.2.3 From cdeb6048940fa4bfb429e2f1cba0d28a11e20cd5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 22 May 2015 16:15:47 -0700 Subject: x86/asm/irq: Stop relying on magic JMP behavior for early_idt_handlers The early_idt_handlers asm code generates an array of entry points spaced nine bytes apart. It's not really clear from that code or from the places that reference it what's going on, and the code only works in the first place because GAS never generates two-byte JMP instructions when jumping to global labels. Clean up the code to generate the correct array stride (member size) explicitly. This should be considerably more robust against screw-ups, as GAS will warn if a .fill directive has a negative count. Using '. =' to advance would have been even more robust (it would generate an actual error if it tried to move backwards), but it would pad with nulls, confusing anyone who tries to disassemble the code. The new scheme should be much clearer to future readers. While we're at it, improve the comments and rename the array and common code. Binutils may start relaxing jumps to non-weak labels. 
If so, this change will fix our build, and we may need to backport this change. Before, on x86_64: 0000000000000000 : 0: 6a 00 pushq $0x0 2: 6a 00 pushq $0x0 4: e9 00 00 00 00 jmpq 9 5: R_X86_64_PC32 early_idt_handler-0x4 ... 48: 66 90 xchg %ax,%ax 4a: 6a 08 pushq $0x8 4c: e9 00 00 00 00 jmpq 51 4d: R_X86_64_PC32 early_idt_handler-0x4 ... 117: 6a 00 pushq $0x0 119: 6a 1f pushq $0x1f 11b: e9 00 00 00 00 jmpq 120 11c: R_X86_64_PC32 early_idt_handler-0x4 After: 0000000000000000 : 0: 6a 00 pushq $0x0 2: 6a 00 pushq $0x0 4: e9 14 01 00 00 jmpq 11d ... 48: 6a 08 pushq $0x8 4a: e9 d1 00 00 00 jmpq 120 4f: cc int3 50: cc int3 ... 117: 6a 00 pushq $0x0 119: 6a 1f pushq $0x1f 11b: eb 03 jmp 120 11d: cc int3 11e: cc int3 11f: cc int3 Signed-off-by: Andy Lutomirski Acked-by: H. Peter Anvin Cc: Binutils Cc: Borislav Petkov Cc: H.J. Lu Cc: Jan Beulich Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ac027962af343b0c599cbfcf50b945ad2ef3d7a8.1432336324.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/segment.h | 14 ++++++++++++-- arch/x86/kernel/head64.c | 2 +- arch/x86/kernel/head_32.S | 33 ++++++++++++++++++--------------- arch/x86/kernel/head_64.S | 20 +++++++++++--------- 4 files changed, 42 insertions(+), 27 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 5a9856eb12ba..7d5a1929d76b 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -231,11 +231,21 @@ #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8) #ifdef __KERNEL__ + +/* + * early_idt_handler_array is an array of entry points referenced in the + * early IDT. For simplicity, it's a real array with one entry point + * every nine bytes. That leaves room for an optional 'push $0' if the + * vector has no error code (two bytes), a 'push $vector_number' (two + * bytes), and a jump to the common entry code (up to five bytes). + */ +#define EARLY_IDT_HANDLER_SIZE 9 + #ifndef __ASSEMBLY__ -extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5]; +extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; #ifdef CONFIG_TRACING -# define trace_early_idt_handlers early_idt_handlers +# define trace_early_idt_handler_array early_idt_handler_array #endif /* diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 2b55ee6db053..5a4668136e98 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -167,7 +167,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_bss(); for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) - set_intr_gate(i, early_idt_handlers[i]); + set_intr_gate(i, early_idt_handler_array[i]); load_idt((const struct desc_ptr *)&idt_descr); copy_bootdata(__va(real_mode_data)); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 02d257256200..544dec4cc605 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -478,21 +478,22 @@ is486: __INIT setup_once: /* - * Set up a idt with 256 entries pointing to ignore_int, - * interrupt gates. It doesn't actually load idt - that needs - * to be done on each CPU. Interrupts are enabled elsewhere, - * when we can be relatively sure everything is ok. + * Set up a idt with 256 interrupt gates that push zero if there + * is no error code and then jump to early_idt_handler_common. + * It doesn't actually load the idt - that needs to be done on + * each CPU. 
Interrupts are enabled elsewhere, when we can be + * relatively sure everything is ok. */ movl $idt_table,%edi - movl $early_idt_handlers,%eax + movl $early_idt_handler_array,%eax movl $NUM_EXCEPTION_VECTORS,%ecx 1: movl %eax,(%edi) movl %eax,4(%edi) /* interrupt gate, dpl=0, present */ movl $(0x8E000000 + __KERNEL_CS),2(%edi) - addl $9,%eax + addl $EARLY_IDT_HANDLER_SIZE,%eax addl $8,%edi loop 1b @@ -524,26 +525,28 @@ setup_once: andl $0,setup_once_ref /* Once is enough, thanks */ ret -ENTRY(early_idt_handlers) +ENTRY(early_idt_handler_array) # 36(%esp) %eflags # 32(%esp) %cs # 28(%esp) %eip # 24(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .if (EXCEPTION_ERRCODE_MASK >> i) & 1 - ASM_NOP2 - .else + .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 pushl $0 # Dummy error code, to make stack frame uniform .endif pushl $i # 20(%esp) Vector number - jmp early_idt_handler + jmp early_idt_handler_common i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr -ENDPROC(early_idt_handlers) +ENDPROC(early_idt_handler_array) - /* This is global to keep gas from relaxing the jumps */ -ENTRY(early_idt_handler) +early_idt_handler_common: + /* + * The stack is the hardware frame, an error code or zero, and the + * vector number. + */ cld cmpl $2,(%esp) # X86_TRAP_NMI @@ -603,7 +606,7 @@ ex_entry: .Lis_nmi: addl $8,%esp /* drop vector number and error code */ iret -ENDPROC(early_idt_handler) +ENDPROC(early_idt_handler_common) /* This is the default interrupt "handler" :-) */ ALIGN diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 43eafc8afb69..e5c27f729a38 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -321,26 +321,28 @@ bad_address: jmp bad_address __INIT - .globl early_idt_handlers -early_idt_handlers: +ENTRY(early_idt_handler_array) # 104(%rsp) %rflags # 96(%rsp) %cs # 88(%rsp) %rip # 80(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .if (EXCEPTION_ERRCODE_MASK >> i) & 1 - ASM_NOP2 - .else + .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 pushq $0 # Dummy error code, to make stack frame uniform .endif pushq $i # 72(%rsp) Vector number - jmp early_idt_handler + jmp early_idt_handler_common i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr +ENDPROC(early_idt_handler_array) -/* This is global to keep gas from relaxing the jumps */ -ENTRY(early_idt_handler) +early_idt_handler_common: + /* + * The stack is the hardware frame, an error code or zero, and the + * vector number. + */ cld cmpl $2,(%rsp) # X86_TRAP_NMI @@ -412,7 +414,7 @@ ENTRY(early_idt_handler) .Lis_nmi: addq $16,%rsp # drop vector number and error code INTERRUPT_RETURN -ENDPROC(early_idt_handler) +ENDPROC(early_idt_handler_common) __INITDATA -- cgit v1.2.3 From 5c31b2800d8d3e735e5ecac8fc13d1cf862fd330 Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Tue, 26 May 2015 10:28:21 +0200 Subject: x86/mce: Fix monarch timeout setting through the mce= cmdline option Using "mce=1,10000000" on the kernel cmdline to change the monarch timeout does not work. The cause is that get_option() does parse a subsequent comma in the option string and signals that with a return value. So we don't need to check for a second comma ourselves. Signed-off-by: Xie XiuQi Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1432120943-25028-1-git-send-email-xiexiuqi@huawei.com Link: http://lkml.kernel.org/r/1432628901-18044-19-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 521e5016aca6..0cbcd3183acf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2014,11 +2014,8 @@ static int __init mcheck_enable(char *str) else if (!strcmp(str, "bios_cmci_threshold")) cfg->bios_cmci_threshold = true; else if (isdigit(str[0])) { - get_option(&str, &(cfg->tolerant)); - if (*str == ',') { - ++str; + if (get_option(&str, &cfg->tolerant) == 2) get_option(&str, &(cfg->monarch_timeout)); - } } else { pr_info("mce argument %s ignored. Please use /sys\n", str); return 0; -- cgit v1.2.3 From 7f0431e3dc8953f41e9433581c1fdd7ee45860b0 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 May 2015 10:28:05 +0200 Subject: x86/mm/mtrr: Fix MTRR lookup to handle an inclusive entry When an MTRR entry is inclusive to a requested range, i.e. the start and end of the request are not within the MTRR entry range but the range contains the MTRR entry entirely: range_start ... [mtrr_start ... mtrr_end] ... range_end __mtrr_type_lookup() ignores such a case because both start_state and end_state are set to zero. This bug can cause the following issues: 1) reserve_memtype() tracks an effective memory type in case a request type is WB (ex. /dev/mem blindly uses WB). Missing to track with its effective type causes a subsequent request to map the same range with the effective type to fail. 2) pud_set_huge() and pmd_set_huge() check if a requested range has any overlap with MTRRs. Missing to detect an overlap may cause a performance penalty or undefined behavior. This patch fixes the bug by adding a new flag, 'inclusive', to detect the inclusive case. This case is then handled in the same way as end_state:1 since the first region is the same. With this fix, __mtrr_type_lookup() handles the inclusive case properly. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave.hansen@intel.com Cc: linux-mm Cc: pebolle@tiscali.nl Link: http://lkml.kernel.org/r/1431714237-880-3-git-send-email-toshi.kani@hp.com Link: http://lkml.kernel.org/r/1432628901-18044-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 5b239679cfc9..e202d26f64a2 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -154,7 +154,7 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) prev_match = 0xFF; for (i = 0; i < num_var_ranges; ++i) { - unsigned short start_state, end_state; + unsigned short start_state, end_state, inclusive; if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11))) continue; @@ -166,19 +166,27 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) start_state = ((start & mask) == (base & mask)); end_state = ((end & mask) == (base & mask)); + inclusive = ((start < base) && (end > base)); - if (start_state != end_state) { + if ((start_state != end_state) || inclusive) { /* * We have start:end spanning across an MTRR. - * We split the region into - * either - * (start:mtrr_end) (mtrr_end:end) - * or - * (start:mtrr_start) (mtrr_start:end) + * We split the region into either + * + * - start_state:1 + * (start:mtrr_end)(mtrr_end:end) + * - end_state:1 + * (start:mtrr_start)(mtrr_start:end) + * - inclusive:1 + * (start:mtrr_start)(mtrr_start:mtrr_end)(mtrr_end:end) + * * depending on kind of overlap. - * Return the type for first region and a pointer to - * the start of second region so that caller will - * lookup again on the second region. + * + * Return the type of the first region and a pointer + * to the start of next region so that caller will be + * advised to lookup again after having adjusted start + * and end. + * * Note: This way we handle multiple overlaps as well. */ if (start_state) -- cgit v1.2.3 From 9b3aca620883fc06636737c82a4d024b22182281 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 May 2015 10:28:06 +0200 Subject: x86/mm/mtrr: Fix MTRR state checks in mtrr_type_lookup() 'mtrr_state.enabled' contains the FE (fixed MTRRs enabled) and E (MTRRs enabled) flags in MSR_MTRRdefType. Intel SDM, section 11.11.2.1, defines these flags as follows: - All MTRRs are disabled when the E flag is clear. The FE flag has no effect when the E flag is clear. - The default type is enabled when the E flag is set. - MTRR variable ranges are enabled when the E flag is set. - MTRR fixed ranges are enabled when both E and FE flags are set. MTRR state checks in __mtrr_type_lookup() do not match the SDM. Hence, this patch makes the following changes: - The current code detects MTRRs disabled when both E and FE flags are clear in mtrr_state.enabled. Fix to detect MTRRs disabled when the E flag is clear. - The current code does not check if the FE bit is set in mtrr_state.enabled when looking at the fixed entries. Fix to check the FE flag. - The current code returns the default type when the E flag is clear in mtrr_state.enabled. However, the default type is UC when the E flag is clear. Remove the code as this case is handled as MTRR disabled with the 1st change. In addition, this patch defines the E and FE flags in mtrr_state.enabled as follows.
- FE flag: MTRR_STATE_MTRR_FIXED_ENABLED - E flag: MTRR_STATE_MTRR_ENABLED print_mtrr_state() and x86_get_mtrr_mem_range() are also updated accordingly. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave.hansen@intel.com Cc: linux-mm Cc: pebolle@tiscali.nl Link: http://lkml.kernel.org/r/1431714237-880-4-git-send-email-toshi.kani@hp.com Link: http://lkml.kernel.org/r/1432628901-18044-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mtrr.h | 4 ++++ arch/x86/kernel/cpu/mtrr/cleanup.c | 3 ++- arch/x86/kernel/cpu/mtrr/generic.c | 15 ++++++++------- 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index f768f6298419..ef927948657c 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -127,4 +127,8 @@ struct mtrr_gentry32 { _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry32) #endif /* CONFIG_COMPAT */ +/* Bit fields for enabled in struct mtrr_state_type */ +#define MTRR_STATE_MTRR_FIXED_ENABLED 0x01 +#define MTRR_STATE_MTRR_ENABLED 0x02 + #endif /* _ASM_X86_MTRR_H */ diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 5f90b85ff22e..70d7c93f4550 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -98,7 +98,8 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, continue; base = range_state[i].base_pfn; if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed && - (mtrr_state.enabled & 1)) { + (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { /* Var MTRR contains UC entry below 1M? Skip it: */ printk(BIOS_BUG_MSG, i); if (base + size <= (1<<(20-PAGE_SHIFT))) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index e202d26f64a2..b0599dbb899a 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -119,14 +119,16 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) if (!mtrr_state_set) return 0xFF; - if (!mtrr_state.enabled) + if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)) return 0xFF; /* Make end inclusive end, instead of exclusive */ end--; /* Look in fixed ranges. Just return the type as per start */ - if (mtrr_state.have_fixed && (start < 0x100000)) { + if ((start < 0x100000) && + (mtrr_state.have_fixed) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { int idx; if (start < 0x80000) { @@ -149,9 +151,6 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) * Look of multiple ranges matching this address and pick type * as per MTRR precedence */ - if (!(mtrr_state.enabled & 2)) - return mtrr_state.def_type; - prev_match = 0xFF; for (i = 0; i < num_var_ranges; ++i) { unsigned short start_state, end_state, inclusive; @@ -355,7 +354,9 @@ static void __init print_mtrr_state(void) mtrr_attrib_to_str(mtrr_state.def_type)); if (mtrr_state.have_fixed) { pr_debug("MTRR fixed ranges %sabled:\n", - mtrr_state.enabled & 1 ? "en" : "dis"); + ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ? 
+ "en" : "dis"); print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); for (i = 0; i < 2; ++i) print_fixed(0x80000 + i * 0x20000, 0x04000, @@ -368,7 +369,7 @@ static void __init print_mtrr_state(void) print_fixed_last(); } pr_debug("MTRR variable ranges %sabled:\n", - mtrr_state.enabled & 2 ? "en" : "dis"); + mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis"); high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { -- cgit v1.2.3 From 3d3ca416d9b0784cfcf244eeeba1bcaf421bc64d Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 May 2015 10:28:07 +0200 Subject: x86/mm/mtrr: Use symbolic define as a retval for disabled MTRRs mtrr_type_lookup() returns verbatim 0xFF when MTRRs are disabled. This patch defines MTRR_TYPE_INVALID to clarify the meaning of this value, and documents its usage. Document the return values of the kernel virtual address mapping helpers pud_set_huge(), pmd_set_huge, pud_clear_huge() and pmd_clear_huge(). There is no functional change in this patch. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave.hansen@intel.com Cc: linux-mm Cc: pebolle@tiscali.nl Link: http://lkml.kernel.org/r/1431714237-880-5-git-send-email-toshi.kani@hp.com Link: http://lkml.kernel.org/r/1432628901-18044-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mtrr.h | 2 +- arch/x86/include/uapi/asm/mtrr.h | 8 +++++++- arch/x86/kernel/cpu/mtrr/generic.c | 14 ++++++------- arch/x86/mm/pgtable.c | 42 +++++++++++++++++++++++++++++--------- 4 files changed, 47 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index ef927948657c..bb03a547c1ab 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -55,7 +55,7 @@ static inline u8 mtrr_type_lookup(u64 addr, u64 end) /* * Return no-MTRRs: */ - return 0xff; + return MTRR_TYPE_INVALID; } #define mtrr_save_fixed_ranges(arg) do {} while (0) #define mtrr_save_state() do {} while (0) diff --git a/arch/x86/include/uapi/asm/mtrr.h b/arch/x86/include/uapi/asm/mtrr.h index d0acb658c8f4..7528dcf59691 100644 --- a/arch/x86/include/uapi/asm/mtrr.h +++ b/arch/x86/include/uapi/asm/mtrr.h @@ -103,7 +103,7 @@ struct mtrr_state_type { #define MTRRIOC_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry) #define MTRRIOC_KILL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry) -/* These are the region types */ +/* MTRR memory types, which are defined in SDM */ #define MTRR_TYPE_UNCACHABLE 0 #define MTRR_TYPE_WRCOMB 1 /*#define MTRR_TYPE_ 2*/ @@ -113,5 +113,11 @@ struct mtrr_state_type { #define MTRR_TYPE_WRBACK 6 #define MTRR_NUM_TYPES 7 +/* + * Invalid MTRR memory type. mtrr_type_lookup() returns this value when + * MTRRs are disabled. Note, this value is allocated from the reserved + * values (0x7-0xff) of the MTRR memory types. 
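+ * Since the value is taken from the reserved range, it can never
+ * collide with a genuine memory type read back from the hardware.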
+ */ +#define MTRR_TYPE_INVALID 0xff #endif /* _UAPI_ASM_X86_MTRR_H */ diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index b0599dbb899a..7b1491c6232d 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -104,7 +104,7 @@ static int check_type_overlap(u8 *prev, u8 *curr) /* * Error/Semi-error returns: - * 0xFF - when MTRR is not enabled + * MTRR_TYPE_INVALID - when MTRR is not enabled * *repeat == 1 implies [start:end] spanned across MTRR range and type returned * corresponds only to [start:*partial_end]. * Caller has to lookup again for [*partial_end:end]. @@ -117,10 +117,10 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) *repeat = 0; if (!mtrr_state_set) - return 0xFF; + return MTRR_TYPE_INVALID; if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)) - return 0xFF; + return MTRR_TYPE_INVALID; /* Make end inclusive end, instead of exclusive */ end--; @@ -151,7 +151,7 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) * Look of multiple ranges matching this address and pick type * as per MTRR precedence */ - prev_match = 0xFF; + prev_match = MTRR_TYPE_INVALID; for (i = 0; i < num_var_ranges; ++i) { unsigned short start_state, end_state, inclusive; @@ -206,7 +206,7 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) continue; curr_match = mtrr_state.var_ranges[i].base_lo & 0xff; - if (prev_match == 0xFF) { + if (prev_match == MTRR_TYPE_INVALID) { prev_match = curr_match; continue; } @@ -220,7 +220,7 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) return MTRR_TYPE_WRBACK; } - if (prev_match != 0xFF) + if (prev_match != MTRR_TYPE_INVALID) return prev_match; return mtrr_state.def_type; @@ -229,7 +229,7 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) /* * Returns the effective MTRR type for the region * Error return: - * 0xFF - when MTRR is not enabled + * MTRR_TYPE_INVALID - when MTRR is not enabled */ u8 mtrr_type_lookup(u64 start, u64 end) { diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 0b97d2c75df3..c30f9819786b 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -563,16 +563,22 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, } #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +/** + * pud_set_huge - setup kernel PUD mapping + * + * MTRR can override PAT memory types with 4KiB granularity. Therefore, + * this function does not set up a huge page when the range is covered + * by a non-WB type of MTRR. MTRR_TYPE_INVALID indicates that MTRR are + * disabled. + * + * Returns 1 on success and 0 on failure. + */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { u8 mtrr; - /* - * Do not use a huge page when the range is covered by non-WB type - * of MTRRs. - */ mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE); - if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF)) + if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != MTRR_TYPE_INVALID)) return 0; prot = pgprot_4k_2_large(prot); @@ -584,16 +590,22 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) return 1; } +/** + * pmd_set_huge - setup kernel PMD mapping + * + * MTRR can override PAT memory types with 4KiB granularity. Therefore, + * this function does not set up a huge page when the range is covered + * by a non-WB type of MTRR. MTRR_TYPE_INVALID indicates that MTRR are + * disabled. + * + * Returns 1 on success and 0 on failure. 
+ */ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { u8 mtrr; - /* - * Do not use a huge page when the range is covered by non-WB type - * of MTRRs. - */ mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE); - if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF)) + if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != MTRR_TYPE_INVALID)) return 0; prot = pgprot_4k_2_large(prot); @@ -605,6 +617,11 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) return 1; } +/** + * pud_clear_huge - clear kernel PUD mapping when it is set + * + * Returns 1 on success and 0 on failure (no PUD map is found). + */ int pud_clear_huge(pud_t *pud) { if (pud_large(*pud)) { @@ -615,6 +632,11 @@ int pud_clear_huge(pud_t *pud) return 0; } +/** + * pmd_clear_huge - clear kernel PMD mapping when it is set + * + * Returns 1 on success and 0 on failure (no PMD map is found). + */ int pmd_clear_huge(pmd_t *pmd) { if (pmd_large(*pmd)) { -- cgit v1.2.3 From 0cc705f56e400764a171055f727d28a48260bb4b Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 May 2015 10:28:08 +0200 Subject: x86/mm/mtrr: Clean up mtrr_type_lookup() MTRRs contain fixed and variable entries. mtrr_type_lookup() may repeatedly call __mtrr_type_lookup() to handle a request that overlaps with variable entries. However, __mtrr_type_lookup() also handles the fixed entries, which do not have to be repeated. Therefore, this patch creates separate functions, mtrr_type_lookup_fixed() and mtrr_type_lookup_variable(), to handle the fixed and variable ranges respectively. The patch also updates the function headers to clarify the return values and output argument. It updates comments to clarify that the repeating is necessary to handle overlaps with the default type, since overlaps with multiple entries alone can be handled without such repeating. There is no functional change in this patch. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave.hansen@intel.com Cc: linux-mm Cc: pebolle@tiscali.nl Link: http://lkml.kernel.org/r/1431714237-880-6-git-send-email-toshi.kani@hp.com Link: http://lkml.kernel.org/r/1432628901-18044-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 138 +++++++++++++++++++++++-------------- 1 file changed, 86 insertions(+), 52 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 7b1491c6232d..e51100c49eea 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -102,55 +102,68 @@ static int check_type_overlap(u8 *prev, u8 *curr) return 0; } -/* - * Error/Semi-error returns: - * MTRR_TYPE_INVALID - when MTRR is not enabled - * *repeat == 1 implies [start:end] spanned across MTRR range and type returned - * corresponds only to [start:*partial_end]. - * Caller has to lookup again for [*partial_end:end]. +/** + * mtrr_type_lookup_fixed - look up memory type in MTRR fixed entries + * + * Return the MTRR fixed memory type of 'start'. 
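+ * ('end' is taken for symmetry with the variable-range lookup; the
+ * fixed ranges do not overlap, so the type is determined by 'start'
+ * alone.)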
+ * + * MTRR fixed entries are divided into the following ways: + * 0x00000 - 0x7FFFF : This range is divided into eight 64KB sub-ranges + * 0x80000 - 0xBFFFF : This range is divided into sixteen 16KB sub-ranges + * 0xC0000 - 0xFFFFF : This range is divided into sixty-four 4KB sub-ranges + * + * Return Values: + * MTRR_TYPE_(type) - Matched memory type + * MTRR_TYPE_INVALID - Unmatched + */ +static u8 mtrr_type_lookup_fixed(u64 start, u64 end) +{ + int idx; + + if (start >= 0x100000) + return MTRR_TYPE_INVALID; + + /* 0x0 - 0x7FFFF */ + if (start < 0x80000) { + idx = 0; + idx += (start >> 16); + return mtrr_state.fixed_ranges[idx]; + /* 0x80000 - 0xBFFFF */ + } else if (start < 0xC0000) { + idx = 1 * 8; + idx += ((start - 0x80000) >> 14); + return mtrr_state.fixed_ranges[idx]; + } + + /* 0xC0000 - 0xFFFFF */ + idx = 3 * 8; + idx += ((start - 0xC0000) >> 12); + return mtrr_state.fixed_ranges[idx]; +} + +/** + * mtrr_type_lookup_variable - look up memory type in MTRR variable entries + * + * Return Value: + * MTRR_TYPE_(type) - Matched memory type or default memory type (unmatched) + * + * Output Argument: + * repeat - Set to 1 when [start:end] spanned across MTRR range and type + * returned corresponds only to [start:*partial_end]. Caller has + * to lookup again for [*partial_end:end]. */ -static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) +static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, + int *repeat) { int i; u64 base, mask; u8 prev_match, curr_match; *repeat = 0; - if (!mtrr_state_set) - return MTRR_TYPE_INVALID; - - if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)) - return MTRR_TYPE_INVALID; - /* Make end inclusive end, instead of exclusive */ + /* Make end inclusive instead of exclusive */ end--; - /* Look in fixed ranges. Just return the type as per start */ - if ((start < 0x100000) && - (mtrr_state.have_fixed) && - (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { - int idx; - - if (start < 0x80000) { - idx = 0; - idx += (start >> 16); - return mtrr_state.fixed_ranges[idx]; - } else if (start < 0xC0000) { - idx = 1 * 8; - idx += ((start - 0x80000) >> 14); - return mtrr_state.fixed_ranges[idx]; - } else { - idx = 3 * 8; - idx += ((start - 0xC0000) >> 12); - return mtrr_state.fixed_ranges[idx]; - } - } - - /* - * Look in variable ranges - * Look of multiple ranges matching this address and pick type - * as per MTRR precedence - */ prev_match = MTRR_TYPE_INVALID; for (i = 0; i < num_var_ranges; ++i) { unsigned short start_state, end_state, inclusive; @@ -186,7 +199,8 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) * advised to lookup again after having adjusted start * and end. * - * Note: This way we handle multiple overlaps as well. + * Note: This way we handle overlaps with multiple + * entries and the default type properly. 
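+ * Example: when only the tail of [start:end] lies inside this entry
+ * (the end_state:1 case), *partial_end is set to mtrr_start and
+ * *repeat is set; this pass resolves (start:mtrr_start) and the
+ * caller then looks up (mtrr_start:end) separately.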
*/ if (start_state) *partial_end = base + get_mtrr_size(mask); @@ -215,21 +229,18 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) return curr_match; } - if (mtrr_tom2) { - if (start >= (1ULL<<32) && (end < mtrr_tom2)) - return MTRR_TYPE_WRBACK; - } - if (prev_match != MTRR_TYPE_INVALID) return prev_match; return mtrr_state.def_type; } -/* - * Returns the effective MTRR type for the region - * Error return: - * MTRR_TYPE_INVALID - when MTRR is not enabled +/** + * mtrr_type_lookup - look up memory type in MTRR + * + * Return Values: + * MTRR_TYPE_(type) - The effective MTRR type for the region + * MTRR_TYPE_INVALID - MTRR is disabled */ u8 mtrr_type_lookup(u64 start, u64 end) { @@ -237,22 +248,45 @@ u8 mtrr_type_lookup(u64 start, u64 end) int repeat; u64 partial_end; - type = __mtrr_type_lookup(start, end, &partial_end, &repeat); + if (!mtrr_state_set) + return MTRR_TYPE_INVALID; + + if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)) + return MTRR_TYPE_INVALID; + + /* + * Look up the fixed ranges first, which take priority over + * the variable ranges. + */ + if ((start < 0x100000) && + (mtrr_state.have_fixed) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) + return mtrr_type_lookup_fixed(start, end); + + /* + * Look up the variable ranges. Look of multiple ranges matching + * this address and pick type as per MTRR precedence. + */ + type = mtrr_type_lookup_variable(start, end, &partial_end, &repeat); /* * Common path is with repeat = 0. * However, we can have cases where [start:end] spans across some - * MTRR range. Do repeated lookups for that case here. + * MTRR ranges and/or the default type. Do repeated lookups for + * that case here. */ while (repeat) { prev_type = type; start = partial_end; - type = __mtrr_type_lookup(start, end, &partial_end, &repeat); + type = mtrr_type_lookup_variable(start, end, &partial_end, &repeat); if (check_type_overlap(&prev_type, &type)) return type; } + if (mtrr_tom2 && (start >= (1ULL<<32)) && (end < mtrr_tom2)) + return MTRR_TYPE_WRBACK; + return type; } -- cgit v1.2.3 From b73522e0c1be58d3c69b124985b8ccf94e3677f7 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 May 2015 10:28:10 +0200 Subject: x86/mm/mtrr: Enhance MTRR checks in kernel mapping helpers This patch adds the argument 'uniform' to mtrr_type_lookup(), which gets set to 1 when a given range is covered uniformly by MTRRs, i.e. the range is fully covered by a single MTRR entry or the default type. Change pud_set_huge() and pmd_set_huge() to honor the 'uniform' flag to see if it is safe to create a huge page mapping in the range. This allows them to create a huge page mapping in a range covered by a single MTRR entry of any memory type. It also detects a non-optimal request properly. They continue to check with the WB type since it does not effectively change the uniform mapping even if a request spans multiple MTRR entries. pmd_set_huge() logs a warning message to a non-optimal request so that driver writers will be aware of such a case. Drivers should make a mapping request aligned to a single MTRR entry when the range is covered by MTRRs. Signed-off-by: Toshi Kani [ Realign, flesh out comments, improve warning message. ] Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Elliott@hp.com Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave.hansen@intel.com Cc: linux-mm Cc: pebolle@tiscali.nl Link: http://lkml.kernel.org/r/1431714237-880-7-git-send-email-toshi.kani@hp.com Link: http://lkml.kernel.org/r/1432628901-18044-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mtrr.h | 4 ++-- arch/x86/kernel/cpu/mtrr/generic.c | 40 ++++++++++++++++++++++++++++---------- arch/x86/mm/pat.c | 4 ++-- arch/x86/mm/pgtable.c | 38 +++++++++++++++++++++++------------- 4 files changed, 58 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index bb03a547c1ab..a31759e1edd9 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -31,7 +31,7 @@ * arch_phys_wc_add and arch_phys_wc_del. */ # ifdef CONFIG_MTRR -extern u8 mtrr_type_lookup(u64 addr, u64 end); +extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform); extern void mtrr_save_fixed_ranges(void *); extern void mtrr_save_state(void); extern int mtrr_add(unsigned long base, unsigned long size, @@ -50,7 +50,7 @@ extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); extern int phys_wc_to_mtrr_index(int handle); # else -static inline u8 mtrr_type_lookup(u64 addr, u64 end) +static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform) { /* * Return no-MTRRs: diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index e51100c49eea..f782d9b62cb3 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -147,19 +147,24 @@ static u8 mtrr_type_lookup_fixed(u64 start, u64 end) * Return Value: * MTRR_TYPE_(type) - Matched memory type or default memory type (unmatched) * - * Output Argument: + * Output Arguments: * repeat - Set to 1 when [start:end] spanned across MTRR range and type * returned corresponds only to [start:*partial_end]. Caller has * to lookup again for [*partial_end:end]. + * + * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the + * region is fully covered by a single MTRR entry or the default + * type. */ static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, - int *repeat) + int *repeat, u8 *uniform) { int i; u64 base, mask; u8 prev_match, curr_match; *repeat = 0; + *uniform = 1; /* Make end inclusive instead of exclusive */ end--; @@ -214,6 +219,7 @@ static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, end = *partial_end - 1; /* end is inclusive */ *repeat = 1; + *uniform = 0; } if ((start & mask) != (base & mask)) @@ -225,6 +231,7 @@ static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, continue; } + *uniform = 0; if (check_type_overlap(&prev_match, &curr_match)) return curr_match; } @@ -241,10 +248,15 @@ static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, * Return Values: * MTRR_TYPE_(type) - The effective MTRR type for the region * MTRR_TYPE_INVALID - MTRR is disabled + * + * Output Argument: + * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the + * region is fully covered by a single MTRR entry or the default + * type. 
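+ * The kernel mapping helpers pud_set_huge()/pmd_set_huge() key off
+ * this flag: a huge page is only set up when MTRRs are disabled, when
+ * the range is covered uniformly, or when the effective type is WB.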
*/ -u8 mtrr_type_lookup(u64 start, u64 end) +u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform) { - u8 type, prev_type; + u8 type, prev_type, is_uniform = 1, dummy; int repeat; u64 partial_end; @@ -260,14 +272,18 @@ u8 mtrr_type_lookup(u64 start, u64 end) */ if ((start < 0x100000) && (mtrr_state.have_fixed) && - (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) - return mtrr_type_lookup_fixed(start, end); + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { + is_uniform = 0; + type = mtrr_type_lookup_fixed(start, end); + goto out; + } /* * Look up the variable ranges. Look of multiple ranges matching * this address and pick type as per MTRR precedence. */ - type = mtrr_type_lookup_variable(start, end, &partial_end, &repeat); + type = mtrr_type_lookup_variable(start, end, &partial_end, + &repeat, &is_uniform); /* * Common path is with repeat = 0. @@ -278,15 +294,19 @@ u8 mtrr_type_lookup(u64 start, u64 end) while (repeat) { prev_type = type; start = partial_end; - type = mtrr_type_lookup_variable(start, end, &partial_end, &repeat); + is_uniform = 0; + type = mtrr_type_lookup_variable(start, end, &partial_end, + &repeat, &dummy); if (check_type_overlap(&prev_type, &type)) - return type; + goto out; } if (mtrr_tom2 && (start >= (1ULL<<32)) && (end < mtrr_tom2)) - return MTRR_TYPE_WRBACK; + type = MTRR_TYPE_WRBACK; +out: + *uniform = is_uniform; return type; } diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 35af6771a95a..372ad422c2c3 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -267,9 +267,9 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, * request is for WB. */ if (req_type == _PAGE_CACHE_MODE_WB) { - u8 mtrr_type; + u8 mtrr_type, uniform; - mtrr_type = mtrr_type_lookup(start, end); + mtrr_type = mtrr_type_lookup(start, end, &uniform); if (mtrr_type != MTRR_TYPE_WRBACK) return _PAGE_CACHE_MODE_UC_MINUS; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index c30f9819786b..fb0a9dd1d6e4 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -566,19 +566,28 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, /** * pud_set_huge - setup kernel PUD mapping * - * MTRR can override PAT memory types with 4KiB granularity. Therefore, - * this function does not set up a huge page when the range is covered - * by a non-WB type of MTRR. MTRR_TYPE_INVALID indicates that MTRR are - * disabled. + * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this + * function sets up a huge page only if any of the following conditions are met: + * + * - MTRRs are disabled, or + * + * - MTRRs are enabled and the range is completely covered by a single MTRR, or + * + * - MTRRs are enabled and the corresponding MTRR memory type is WB, which + * has no effect on the requested PAT memory type. + * + * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger + * page mapping attempt fails. * * Returns 1 on success and 0 on failure. 
*/ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { - u8 mtrr; + u8 mtrr, uniform; - mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE); - if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != MTRR_TYPE_INVALID)) + mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform); + if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) && + (mtrr != MTRR_TYPE_WRBACK)) return 0; prot = pgprot_4k_2_large(prot); @@ -593,20 +602,21 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) /** * pmd_set_huge - setup kernel PMD mapping * - * MTRR can override PAT memory types with 4KiB granularity. Therefore, - * this function does not set up a huge page when the range is covered - * by a non-WB type of MTRR. MTRR_TYPE_INVALID indicates that MTRR are - * disabled. + * See text over pud_set_huge() above. * * Returns 1 on success and 0 on failure. */ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { - u8 mtrr; + u8 mtrr, uniform; - mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE); - if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != MTRR_TYPE_INVALID)) + mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform); + if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) && + (mtrr != MTRR_TYPE_WRBACK)) { + pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n", + __func__, addr, addr + PMD_SIZE); return 0; + } prot = pgprot_4k_2_large(prot); -- cgit v1.2.3 From 2f9e897353fcb99effd6eff22f7b464f8e2a659a Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 26 May 2015 10:28:12 +0200 Subject: x86/mm/mtrr, pat: Document Write Combining MTRR type effects on PAT / non-PAT pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As part of the effort to phase out MTRR use document write-combining MTRR effects on pages with different non-PAT page attributes flags and different PAT entry values. Extend arch_phys_wc_add() documentation to clarify power of two sizes / boundary requirements as we phase out mtrr_add() use. Lastly hint towards ioremap_uc() for corner cases on device drivers working with devices with mixed regions where MTRR size requirements would otherwise not enable write-combining effective memory types. Signed-off-by: Luis R. Rodriguez Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Antonino Daplas Cc: Borislav Petkov Cc: Brian Gerst Cc: Daniel Vetter Cc: Dave Airlie Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jean-Christophe Plagniol-Villard Cc: Jonathan Corbet Cc: Juergen Gross Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Suresh Siddha Cc: Thomas Gleixner Cc: Tomi Valkeinen Cc: Ville Syrjälä Cc: Vlastimil Babka Cc: linux-fbdev@vger.kernel.org Link: http://lkml.kernel.org/r/1430343851-967-3-git-send-email-mcgrof@do-not-panic.com Link: http://lkml.kernel.org/r/1432628901-18044-10-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- Documentation/x86/mtrr.txt | 18 +++++++++++++++--- Documentation/x86/pat.txt | 35 ++++++++++++++++++++++++++++++++++- arch/x86/kernel/cpu/mtrr/main.c | 3 +++ 3 files changed, 52 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/Documentation/x86/mtrr.txt b/Documentation/x86/mtrr.txt index cc071dc333c2..860bc3adc223 100644 --- a/Documentation/x86/mtrr.txt +++ b/Documentation/x86/mtrr.txt @@ -1,7 +1,19 @@ MTRR (Memory Type Range Register) control -3 Jun 1999 -Richard Gooch - + +Richard Gooch - 3 Jun 1999 +Luis R. 
Rodriguez - April 9, 2015 + +=============================================================================== +Phasing out MTRR use + +MTRR use is replaced on modern x86 hardware with PAT. Over time the only type +of effective MTRR that is expected to be supported will be for write-combining. +As MTRR use is phased out, device drivers should use arch_phys_wc_add(), which +makes an MTRR effective on non-PAT systems while being a no-op on PAT-enabled systems. + +For details refer to Documentation/x86/pat.txt. + +=============================================================================== On Intel P6 family processors (Pentium Pro, Pentium II and later) the Memory Type Range Registers (MTRRs) may be used to control diff --git a/Documentation/x86/pat.txt b/Documentation/x86/pat.txt index cf08c9fff3cd..521bd8adc3b8 100644 --- a/Documentation/x86/pat.txt +++ b/Documentation/x86/pat.txt @@ -34,6 +34,8 @@ ioremap | -- | UC- | UC- | | | | | ioremap_cache | -- | WB | WB | | | | | +ioremap_uc | -- | UC | UC | + | | | | ioremap_nocache | -- | UC- | UC- | | | | | ioremap_wc | -- | -- | WC | @@ -102,7 +104,38 @@ wants to export a RAM region, it has to do set_memory_uc() or set_memory_wc() as step 0 above and also track the usage of those pages and use set_memory_wb() before the page is freed to free pool. - +MTRR effects on PAT / non-PAT systems +------------------------------------- + +The following table provides the effects of using write-combining MTRRs when +using ioremap*() calls on x86 for both non-PAT and PAT systems. Ideally +mtrr_add() usage will be phased out in favor of arch_phys_wc_add() which will +be a no-op on PAT enabled systems. The region over which an arch_phys_wc_add() +call is made should already have been ioremapped with WC attributes or PAT +entries; this can be done by using ioremap_wc() / set_memory_wc(). Devices which +combine areas of IO memory desired to remain uncacheable with areas where +write-combining is desirable should consider use of ioremap_uc() followed by +set_memory_wc() to white-list effective write-combined areas. Such use is +nevertheless discouraged as the effective memory type is considered +implementation defined; still, this strategy can be used as a last resort on +devices with size-constrained regions where MTRR write-combining would +otherwise not be effective. + +---------------------------------------------------------------------- +MTRR Non-PAT PAT Linux ioremap value Effective memory type +---------------------------------------------------------------------- + Non-PAT | PAT + PAT + |PCD + ||PWT + ||| +WC 000 WB _PAGE_CACHE_MODE_WB WC | WC +WC 001 WC _PAGE_CACHE_MODE_WC WC* | WC +WC 010 UC- _PAGE_CACHE_MODE_UC_MINUS WC* | UC +WC 011 UC _PAGE_CACHE_MODE_UC UC | UC +---------------------------------------------------------------------- + +(*) denotes implementation defined and is discouraged Notes: diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index ea5f363a1948..04aceb7e6443 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -538,6 +538,9 @@ EXPORT_SYMBOL(mtrr_del); * attempts to add a WC MTRR covering size bytes starting at base and * logs an error if this fails. * + * The caller should provide a power of two size on an equivalent + * power of two boundary. + * * Drivers must store the return value to pass to mtrr_del_wc_if_needed, * but drivers should not try to interpret that return value. -- cgit v1.2.3 From 7d010fdf299929f9583ce5e17da629dcd83c36ef Mon Sep 17 00:00:00 2001 From: "Luis R.
Rodriguez" Date: Tue, 26 May 2015 10:28:13 +0200 Subject: x86/mm/mtrr: Avoid #ifdeffery with phys_wc_to_mtrr_index() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is only one user but since we're going to bury MTRR next out of access to drivers, expose this last piece of API to drivers in a general fashion only needing io.h for access to helpers. Signed-off-by: Luis R. Rodriguez Signed-off-by: Borislav Petkov Cc: Abhilash Kesavan Cc: Andrew Morton Cc: Andy Lutomirski Cc: Antonino Daplas Cc: Borislav Petkov Cc: Brian Gerst Cc: Catalin Marinas Cc: Cristian Stoica Cc: Daniel Vetter Cc: Dave Airlie Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Denys Vlasenko Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Jean-Christophe Plagniol-Villard Cc: Juergen Gross Cc: Linus Torvalds Cc: Matthias Brugger Cc: Mel Gorman Cc: Peter Zijlstra Cc: Suresh Siddha Cc: Thierry Reding Cc: Thomas Gleixner Cc: Tomi Valkeinen Cc: Toshi Kani Cc: Ville Syrjälä Cc: Vlastimil Babka Cc: Will Deacon Cc: dri-devel@lists.freedesktop.org Link: http://lkml.kernel.org/r/1429722736-4473-1-git-send-email-mcgrof@do-not-panic.com Link: http://lkml.kernel.org/r/1432628901-18044-11-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 3 +++ arch/x86/include/asm/mtrr.h | 5 ----- arch/x86/kernel/cpu/mtrr/main.c | 6 +++--- drivers/gpu/drm/drm_ioctl.c | 14 +------------- include/linux/io.h | 7 +++++++ 5 files changed, 14 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 4afc05ffa566..a2b97404922d 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -339,6 +339,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, #define IO_SPACE_LIMIT 0xffff #ifdef CONFIG_MTRR +extern int __must_check arch_phys_wc_index(int handle); +#define arch_phys_wc_index arch_phys_wc_index + extern int __must_check arch_phys_wc_add(unsigned long base, unsigned long size); extern void arch_phys_wc_del(int handle); diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index a31759e1edd9..b94f6f64e23d 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -48,7 +48,6 @@ extern void mtrr_aps_init(void); extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); -extern int phys_wc_to_mtrr_index(int handle); # else static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform) { @@ -84,10 +83,6 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn) static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { } -static inline int phys_wc_to_mtrr_index(int handle) -{ - return -1; -} #define mtrr_ap_init() do {} while (0) #define mtrr_bp_init() do {} while (0) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 04aceb7e6443..81baf5fee0e1 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -580,7 +580,7 @@ void arch_phys_wc_del(int handle) EXPORT_SYMBOL(arch_phys_wc_del); /* - * phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value + * arch_phys_wc_index - translates arch_phys_wc_add's return value * @handle: Return value from arch_phys_wc_add * * This will turn the return value from arch_phys_wc_add into an mtrr @@ -590,14 +590,14 @@ EXPORT_SYMBOL(arch_phys_wc_del); * in printk line. Alas there is an illegitimate use in some ancient * drm ioctls. 
*/ -int phys_wc_to_mtrr_index(int handle) +int arch_phys_wc_index(int handle) { if (handle < MTRR_TO_PHYS_WC_OFFSET) return -1; else return handle - MTRR_TO_PHYS_WC_OFFSET; } -EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index); +EXPORT_SYMBOL_GPL(arch_phys_wc_index); /* * HACK ALERT! diff --git a/drivers/gpu/drm/drm_ioctl.c b/drivers/gpu/drm/drm_ioctl.c index 266dcd6cdf3b..0a957828b3bd 100644 --- a/drivers/gpu/drm/drm_ioctl.c +++ b/drivers/gpu/drm/drm_ioctl.c @@ -36,9 +36,6 @@ #include #include -#ifdef CONFIG_X86 -#include -#endif static int drm_version(struct drm_device *dev, void *data, struct drm_file *file_priv); @@ -197,16 +194,7 @@ static int drm_getmap(struct drm_device *dev, void *data, map->type = r_list->map->type; map->flags = r_list->map->flags; map->handle = (void *)(unsigned long) r_list->user_token; - -#ifdef CONFIG_X86 - /* - * There appears to be exactly one user of the mtrr index: dritest. - * It's easy enough to keep it working on non-PAT systems. - */ - map->mtrr = phys_wc_to_mtrr_index(r_list->map->mtrr); -#else - map->mtrr = -1; -#endif + map->mtrr = arch_phys_wc_index(r_list->map->mtrr); mutex_unlock(&dev->struct_mutex); diff --git a/include/linux/io.h b/include/linux/io.h index 986f2bffea1e..04cce4da3685 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -111,6 +111,13 @@ static inline void arch_phys_wc_del(int handle) } #define arch_phys_wc_add arch_phys_wc_add +#ifndef arch_phys_wc_index +static inline int arch_phys_wc_index(int handle) +{ + return -1; +} +#define arch_phys_wc_index arch_phys_wc_index +#endif #endif #endif /* _LINUX_IO_H */ -- cgit v1.2.3 From f9626104a5b6815ec7d65789dfb900af5fa51e64 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 26 May 2015 10:28:14 +0200 Subject: x86/mm/mtrr: Generalize runtime disabling of MTRRs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is possible to enable CONFIG_MTRR and CONFIG_X86_PAT and end up with a system with MTRR functionality disabled but PAT functionality enabled. This can happen, for instance, when the Xen hypervisor is used where MTRRs are not supported but PAT is. This has been possible on Linux as of commit 47591df50512 ("xen: Support Xen pv-domains using PAT") by Juergen, introduced in v3.19. Technically, we should assume the proper CPU bits would be set to disable MTRRs but we can't always rely on this. At least on the Xen Hypervisor, for instance, only X86_FEATURE_MTRR was disabled as of Xen 4.4 through Xen commit 586ab6a [0], but not X86_FEATURE_K6_MTRR, X86_FEATURE_CENTAUR_MCR, or X86_FEATURE_CYRIX_ARR. Roger Pau Monné has clarified, though, that although this is technically true, we will never support PVH on these CPU types, so Xen has no need to disable these bits on those systems. As per Roger, AMD K6, Centaur and VIA chips don't have the necessary hardware extensions to allow running PVH guests [1]. As per Toshi, it is also possible for the BIOS to disable MTRR support; in such cases get_mtrr_state() would update the MTRR state as per the BIOS, and we need to propagate this information as well. The x86 MTRR code relies on quite a few checks of mtrr_if being set to see if MTRRs did get set up. Instead, let's provide a generic getter for that. This also adds a few checks where there were none before, which could potentially safeguard us against incorrect usage of MTRR where this was not desirable. Where possible, match error codes as if MTRRs were disabled in arch/x86/include/asm/mtrr.h.
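In short, the change boils down to a trivial getter plus early bail-outs in the public entry points; a condensed sketch of the pattern, lifted from the diff below with surrounding code elided (only mtrr_add() is shown here):

	static bool __mtrr_enabled;

	static bool mtrr_enabled(void)
	{
		return __mtrr_enabled;
	}

	int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
		     bool increment)
	{
		if (!mtrr_enabled())
			return -ENODEV;	/* same error code the disabled-MTRR stubs use */
		/* ... rest of mtrr_add() unchanged ... */
	}

mtrr_del(), the AP init paths and the suspend/resume hooks grow the same guard, as the diff below shows.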
Lastly, since disabling MTRRs can happen at run time and we could end up with PAT enabled, best record now in our logs when MTRRs are disabled. [0] ~/devel/xen (git::stable-4.5)$ git describe --contains 586ab6a 4.4.0-rc1~18 [1] http://lists.xenproject.org/archives/html/xen-devel/2015-03/msg03460.html Signed-off-by: Luis R. Rodriguez Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Antonino Daplas Cc: Borislav Petkov Cc: Brian Gerst Cc: Daniel Vetter Cc: Dave Airlie Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jean-Christophe Plagniol-Villard Cc: Juergen Gross Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Roger Pau Monné Cc: Stefan Bader Cc: Suresh Siddha Cc: Thomas Gleixner Cc: Tomi Valkeinen Cc: Toshi Kani Cc: Ville Syrjälä Cc: Vlastimil Babka Cc: bhelgaas@google.com Cc: david.vrabel@citrix.com Cc: jbeulich@suse.com Cc: konrad.wilk@oracle.com Cc: venkatesh.pallipadi@intel.com Cc: ville.syrjala@linux.intel.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1426893517-2511-3-git-send-email-mcgrof@do-not-panic.com Link: http://lkml.kernel.org/r/1432628901-18044-12-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 4 +++- arch/x86/kernel/cpu/mtrr/main.c | 39 ++++++++++++++++++++++++++++++-------- arch/x86/kernel/cpu/mtrr/mtrr.h | 2 +- 3 files changed, 35 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index f782d9b62cb3..3b533cf37c74 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -445,7 +445,7 @@ static void __init print_mtrr_state(void) } /* Grab all of the MTRR state for this CPU into *state */ -void __init get_mtrr_state(void) +bool __init get_mtrr_state(void) { struct mtrr_var_range *vrs; unsigned long flags; @@ -489,6 +489,8 @@ void __init get_mtrr_state(void) post_set(); local_irq_restore(flags); + + return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED); } /* Some BIOS's are messed up and don't set all MTRRs the same! 
*/ diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 81baf5fee0e1..383efb26e516 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -59,6 +59,12 @@ #define MTRR_TO_PHYS_WC_OFFSET 1000 u32 num_var_ranges; +static bool __mtrr_enabled; + +static bool mtrr_enabled(void) +{ + return __mtrr_enabled; +} unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); @@ -286,7 +292,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, int i, replace, error; mtrr_type ltype; - if (!mtrr_if) + if (!mtrr_enabled()) return -ENXIO; error = mtrr_if->validate_add_page(base, size, type); @@ -435,6 +441,8 @@ static int mtrr_check(unsigned long base, unsigned long size) int mtrr_add(unsigned long base, unsigned long size, unsigned int type, bool increment) { + if (!mtrr_enabled()) + return -ENODEV; if (mtrr_check(base, size)) return -EINVAL; return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, @@ -463,8 +471,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) unsigned long lbase, lsize; int error = -EINVAL; - if (!mtrr_if) - return -ENXIO; + if (!mtrr_enabled()) + return -ENODEV; max = num_var_ranges; /* No CPU hotplug when we change MTRR entries */ @@ -523,6 +531,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) */ int mtrr_del(int reg, unsigned long base, unsigned long size) { + if (!mtrr_enabled()) + return -ENODEV; if (mtrr_check(base, size)) return -EINVAL; return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); @@ -548,7 +558,7 @@ int arch_phys_wc_add(unsigned long base, unsigned long size) { int ret; - if (pat_enabled) + if (pat_enabled || !mtrr_enabled()) return 0; /* Success! (We don't need to do anything.) 
*/ ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true); @@ -737,10 +747,12 @@ void __init mtrr_bp_init(void) } if (mtrr_if) { + __mtrr_enabled = true; set_num_var_ranges(); init_table(); if (use_intel()) { - get_mtrr_state(); + /* BIOS may override */ + __mtrr_enabled = get_mtrr_state(); if (mtrr_cleanup(phys_addr)) { changed_by_mtrr_cleanup = 1; @@ -748,10 +760,16 @@ void __init mtrr_bp_init(void) } } } + + if (!mtrr_enabled()) + pr_info("MTRR: Disabled\n"); } void mtrr_ap_init(void) { + if (!mtrr_enabled()) + return; + if (!use_intel() || mtrr_aps_delayed_init) return; /* @@ -777,6 +795,9 @@ void mtrr_save_state(void) { int first_cpu; + if (!mtrr_enabled()) + return; + get_online_cpus(); first_cpu = cpumask_first(cpu_online_mask); smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); @@ -785,6 +806,8 @@ void mtrr_save_state(void) void set_mtrr_aps_delayed_init(void) { + if (!mtrr_enabled()) + return; if (!use_intel()) return; @@ -796,7 +819,7 @@ void set_mtrr_aps_delayed_init(void) */ void mtrr_aps_init(void) { - if (!use_intel()) + if (!use_intel() || !mtrr_enabled()) return; /* @@ -813,7 +836,7 @@ void mtrr_aps_init(void) void mtrr_bp_restore(void) { - if (!use_intel()) + if (!use_intel() || !mtrr_enabled()) return; mtrr_if->set_all(); @@ -821,7 +844,7 @@ void mtrr_bp_restore(void) static int __init mtrr_init_finialize(void) { - if (!mtrr_if) + if (!mtrr_enabled()) return 0; if (use_intel()) { diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index df5e41f31a27..951884dcc433 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -51,7 +51,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); -void get_mtrr_state(void); +bool get_mtrr_state(void); extern void set_mtrr_ops(const struct mtrr_ops *ops); -- cgit v1.2.3 From cb32edf65bf2197a2d2226e94c7602dc92e295bb Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 26 May 2015 10:28:15 +0200 Subject: x86/mm/pat: Wrap pat_enabled into a function API We use pat_enabled in x86-specific code to see if PAT is enabled or not but we're granting full access to it even though readers do not need to set it. If, for instance, we granted access to it to modules later they then could override the variable setting... no bueno. This renames pat_enabled to a new static variable __pat_enabled. Folks are redirected to use pat_enabled() now. Code that sets this can only be internal to pat.c. Apart from the early kernel parameter "nopat" to disable PAT, we also have a few cases that disable it later and make use of a helper pat_disable(). It is wrapped under an ifdef but since that code cannot run unless PAT was enabled its not required to wrap it with ifdefs, unwrap that. Likewise, since "nopat" doesn't really change non-PAT systems just remove that ifdef as well. Although we could add and use an early_param_off(), these helpers don't use __read_mostly but we want to keep __read_mostly for __pat_enabled as this is a hot path -- upon boot, for instance, a simple guest may see ~4k accesses to pat_enabled(). Since __read_mostly early boot params are not that common we don't add a helper for them just yet. Signed-off-by: Luis R. Rodriguez Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Andy Walls Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Brian Gerst Cc: Christoph Lameter Cc: Daniel Vetter Cc: Dave Airlie Cc: Denys Vlasenko Cc: Doug Ledford Cc: H. 
Peter Anvin Cc: Juergen Gross Cc: Kyle McMartin Cc: Linus Torvalds Cc: Michael S. Tsirkin Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1430425520-22275-3-git-send-email-mcgrof@do-not-panic.com Link: http://lkml.kernel.org/r/1432628901-18044-13-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pat.h | 7 +------ arch/x86/kernel/cpu/mtrr/main.c | 2 +- arch/x86/mm/iomap_32.c | 2 +- arch/x86/mm/ioremap.c | 4 ++-- arch/x86/mm/pageattr.c | 2 +- arch/x86/mm/pat.c | 33 +++++++++++++++------------------ arch/x86/pci/i386.c | 6 +++--- 7 files changed, 24 insertions(+), 32 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 91bc4ba95f91..cdcff7f7f694 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -4,12 +4,7 @@ #include #include -#ifdef CONFIG_X86_PAT -extern int pat_enabled; -#else -static const int pat_enabled; -#endif - +bool pat_enabled(void); extern void pat_init(void); void pat_init_cache_modes(void); diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 383efb26e516..e7ed0d8ebacb 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -558,7 +558,7 @@ int arch_phys_wc_add(unsigned long base, unsigned long size) { int ret; - if (pat_enabled || !mtrr_enabled()) + if (pat_enabled() || !mtrr_enabled()) return 0; /* Success! (We don't need to do anything.) */ ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true); diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 9ca35fc60cfe..3a2ec8790ca7 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -82,7 +82,7 @@ iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) * MTRR is UC or WC. UC_MINUS gets the real intention, of the * user, which is "WC if the MTRR is WC, UC if you can't do that." */ - if (!pat_enabled && pgprot_val(prot) == + if (!pat_enabled() && pgprot_val(prot) == (__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_WC))) prot = __pgprot(__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index a493bb83aa89..82d63ed70045 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -234,7 +234,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) { /* * Ideally, this should be: - * pat_enabled ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS; + * pat_enabled() ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS; * * Till we fix all X drivers to use ioremap_wc(), we will use * UC MINUS. 
Drivers that are certain they need or can already @@ -292,7 +292,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc); */ void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) { - if (pat_enabled) + if (pat_enabled()) return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC, __builtin_return_address(0)); else diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 397838eb292b..70d221fe2eb4 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1571,7 +1571,7 @@ int set_memory_wc(unsigned long addr, int numpages) { int ret; - if (!pat_enabled) + if (!pat_enabled()) return set_memory_uc(addr, numpages); ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 8c50b9bfa996..484dce7f759b 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -36,12 +36,11 @@ #undef pr_fmt #define pr_fmt(fmt) "" fmt -#ifdef CONFIG_X86_PAT -int __read_mostly pat_enabled = 1; +static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT); static inline void pat_disable(const char *reason) { - pat_enabled = 0; + __pat_enabled = 0; pr_info("x86/PAT: %s\n", reason); } @@ -51,13 +50,11 @@ static int __init nopat(char *str) return 0; } early_param("nopat", nopat); -#else -static inline void pat_disable(const char *reason) + +bool pat_enabled(void) { - (void)reason; + return !!__pat_enabled; } -#endif - int pat_debug_enable; @@ -201,7 +198,7 @@ void pat_init(void) u64 pat; bool boot_cpu = !boot_pat_state; - if (!pat_enabled) + if (!pat_enabled()) return; if (!cpu_has_pat) { @@ -402,7 +399,7 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, BUG_ON(start >= end); /* end is exclusive */ - if (!pat_enabled) { + if (!pat_enabled()) { /* This is identical to page table setting without PAT */ if (new_type) { if (req_type == _PAGE_CACHE_MODE_WC) @@ -477,7 +474,7 @@ int free_memtype(u64 start, u64 end) int is_range_ram; struct memtype *entry; - if (!pat_enabled) + if (!pat_enabled()) return 0; /* Low ISA region is always mapped WB. No need to track */ @@ -625,7 +622,7 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) u64 to = from + size; u64 cursor = from; - if (!pat_enabled) + if (!pat_enabled()) return 1; while (cursor < to) { @@ -661,7 +658,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, * caching for the high addresses through the KEN pin, but * we maintain the tradition of paranoia in this code. */ - if (!pat_enabled && + if (!pat_enabled() && !(boot_cpu_has(X86_FEATURE_MTRR) || boot_cpu_has(X86_FEATURE_K6_MTRR) || boot_cpu_has(X86_FEATURE_CYRIX_ARR) || @@ -730,7 +727,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, * the type requested matches the type of first page in the range. 
*/ if (is_ram) { - if (!pat_enabled) + if (!pat_enabled()) return 0; pcm = lookup_memtype(paddr); @@ -844,7 +841,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, return ret; } - if (!pat_enabled) + if (!pat_enabled()) return 0; /* @@ -872,7 +869,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, { enum page_cache_mode pcm; - if (!pat_enabled) + if (!pat_enabled()) return 0; /* Set prot based on lookup */ @@ -913,7 +910,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, pgprot_t pgprot_writecombine(pgprot_t prot) { - if (pat_enabled) + if (pat_enabled()) return __pgprot(pgprot_val(prot) | cachemode2protval(_PAGE_CACHE_MODE_WC)); else @@ -996,7 +993,7 @@ static const struct file_operations memtype_fops = { static int __init pat_memtype_list_init(void) { - if (pat_enabled) { + if (pat_enabled()) { debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir, NULL, &memtype_fops); } diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 349c0d32cc0b..0a9f2caf358f 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -429,12 +429,12 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, * Caller can followup with UC MINUS request and add a WC mtrr if there * is a free mtrr slot. */ - if (!pat_enabled && write_combine) + if (!pat_enabled() && write_combine) return -EINVAL; - if (pat_enabled && write_combine) + if (pat_enabled() && write_combine) prot |= cachemode2protval(_PAGE_CACHE_MODE_WC); - else if (pat_enabled || boot_cpu_data.x86 > 3) + else if (pat_enabled() || boot_cpu_data.x86 > 3) /* * ioremap() and ioremap_nocache() defaults to UC MINUS for now. * To avoid attribute conflicts, request UC MINUS here -- cgit v1.2.3 From 131484c8da97ed600c18dd9d03b661e8ae052df6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 May 2015 12:21:47 +0200 Subject: x86/debug: Remove perpetually broken, unmaintainable dwarf annotations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So the dwarf2 annotations in low level assembly code have become an increasing hindrance: unreadable, messy macros mixed into some of the most security sensitive code paths of the Linux kernel. These debug info annotations don't even buy the upstream kernel anything: dwarf driven stack unwinding has caused problems in the past so it's out of tree, and the upstream kernel only uses the much more robust framepointers based stack unwinding method. In addition to that there's a steady, slow bitrot going on with these annotations, requiring frequent fixups. There's no tooling and no functionality upstream that keeps it correct. So burn down the sick forest, allowing new, healthier growth: 27 files changed, 350 insertions(+), 1101 deletions(-) Someone who has the willingness and time to do this properly can attempt to reintroduce dwarf debuginfo in x86 assembly code plus dwarf unwinding from first principles, with the following conditions: - it should be maximally readable, and maximally low-key to 'ordinary' code reading and maintenance. - find a build time method to insert dwarf annotations automatically in the most common cases, for pop/push instructions that manipulate the stack pointer. This could be done for example via a preprocessing step that just looks for common patterns - plus special annotations for the few cases where we want to depart from the default. We have hundreds of CFI annotations, so automating most of that makes sense. 
- it should come with build tooling checks that ensure that CFI annotations are sensible. We've seen such efforts from the framepointer side, and there's no reason it couldn't be done on the dwarf side. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frédéric Weisbecker Cc: Jan Beulich Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Makefile | 10 +- arch/x86/ia32/ia32entry.S | 133 ++++----------- arch/x86/include/asm/calling.h | 94 +++++------ arch/x86/include/asm/dwarf2.h | 170 ------------------- arch/x86/include/asm/frame.h | 7 +- arch/x86/kernel/entry_32.S | 368 ++++++++++++----------------------------- arch/x86/kernel/entry_64.S | 288 ++++++-------------------------- arch/x86/lib/atomic64_386_32.S | 7 +- arch/x86/lib/atomic64_cx8_32.S | 61 +++---- arch/x86/lib/checksum_32.S | 52 +++--- arch/x86/lib/clear_page_64.S | 7 - arch/x86/lib/cmpxchg16b_emu.S | 12 +- arch/x86/lib/cmpxchg8b_emu.S | 11 +- arch/x86/lib/copy_page_64.S | 11 -- arch/x86/lib/copy_user_64.S | 15 -- arch/x86/lib/csum-copy_64.S | 17 -- arch/x86/lib/getuser.S | 13 -- arch/x86/lib/iomap_copy_64.S | 3 - arch/x86/lib/memcpy_64.S | 3 - arch/x86/lib/memmove_64.S | 3 - arch/x86/lib/memset_64.S | 5 - arch/x86/lib/msr-reg.S | 44 ++--- arch/x86/lib/putuser.S | 8 +- arch/x86/lib/rwsem.S | 49 +++--- arch/x86/lib/thunk_32.S | 15 +- arch/x86/lib/thunk_64.S | 44 +++-- arch/x86/net/bpf_jit.S | 1 - 27 files changed, 350 insertions(+), 1101 deletions(-) delete mode 100644 arch/x86/include/asm/dwarf2.h (limited to 'arch/x86/kernel') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 57996ee840dd..43e8328a23e4 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -149,12 +149,6 @@ endif sp-$(CONFIG_X86_32) := esp sp-$(CONFIG_X86_64) := rsp -# do binutils support CFI? -cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1) -# is .cfi_signal_frame supported too? -cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1) -cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1) - # does binutils support specific instructions? asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1) @@ -162,8 +156,8 @@ asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1) avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1) -KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) -KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) +KBUILD_AFLAGS += $(asinstr) $(avx_instr) $(avx2_instr) +KBUILD_CFLAGS += $(asinstr) $(avx_instr) $(avx2_instr) LDFLAGS := -m elf_$(UTS_MACHINE) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 63450a596800..2be23c734db5 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -4,7 +4,6 @@ * Copyright 2000-2002 Andi Kleen, SuSE Labs. 
*/ -#include #include #include #include @@ -60,17 +59,6 @@ movl %eax,%eax /* zero extension */ .endm - .macro CFI_STARTPROC32 simple - CFI_STARTPROC \simple - CFI_UNDEFINED r8 - CFI_UNDEFINED r9 - CFI_UNDEFINED r10 - CFI_UNDEFINED r11 - CFI_UNDEFINED r12 - CFI_UNDEFINED r13 - CFI_UNDEFINED r14 - CFI_UNDEFINED r15 - .endm #ifdef CONFIG_PARAVIRT ENTRY(native_usergs_sysret32) @@ -102,11 +90,6 @@ ENDPROC(native_usergs_sysret32) * with the int 0x80 path. */ ENTRY(ia32_sysenter_target) - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,0 - CFI_REGISTER rsp,rbp - /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -121,25 +104,21 @@ ENTRY(ia32_sysenter_target) movl %eax, %eax movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d - CFI_REGISTER rip,r10 /* Construct struct pt_regs on stack */ - pushq_cfi $__USER32_DS /* pt_regs->ss */ - pushq_cfi %rbp /* pt_regs->sp */ - CFI_REL_OFFSET rsp,0 - pushfq_cfi /* pt_regs->flags */ - pushq_cfi $__USER32_CS /* pt_regs->cs */ - pushq_cfi %r10 /* pt_regs->ip = thread_info->sysenter_return */ - CFI_REL_OFFSET rip,0 - pushq_cfi_reg rax /* pt_regs->orig_ax */ - pushq_cfi_reg rdi /* pt_regs->di */ - pushq_cfi_reg rsi /* pt_regs->si */ - pushq_cfi_reg rdx /* pt_regs->dx */ - pushq_cfi_reg rcx /* pt_regs->cx */ - pushq_cfi $-ENOSYS /* pt_regs->ax */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %rbp /* pt_regs->sp */ + pushfq /* pt_regs->flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ cld sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ - CFI_ADJUST_CFA_OFFSET 10*8 /* * no need to do an access_ok check here because rbp has been @@ -161,8 +140,8 @@ sysenter_flags_fixed: orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - CFI_REMEMBER_STATE jnz sysenter_tracesys + sysenter_do_call: /* 32bit syscall -> 64bit C ABI argument conversion */ movl %edi,%r8d /* arg5 */ @@ -193,14 +172,12 @@ sysexit_from_sys_call: */ andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) movl RIP(%rsp),%ecx /* User %eip */ - CFI_REGISTER rip,rcx RESTORE_RSI_RDI xorl %edx,%edx /* avoid info leaks */ xorq %r8,%r8 xorq %r9,%r9 xorq %r10,%r10 movl EFLAGS(%rsp),%r11d /* User eflags */ - /*CFI_RESTORE rflags*/ TRACE_IRQS_ON /* @@ -231,8 +208,6 @@ sysexit_from_sys_call: */ USERGS_SYSRET32 - CFI_RESTORE_STATE - #ifdef CONFIG_AUDITSYSCALL .macro auditsys_entry_common movl %esi,%r8d /* 5th arg: 4th syscall arg */ @@ -282,8 +257,8 @@ sysexit_audit: #endif sysenter_fix_flags: - pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) - popfq_cfi + pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) + popfq jmp sysenter_flags_fixed sysenter_tracesys: @@ -298,7 +273,6 @@ sysenter_tracesys: LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ RESTORE_EXTRA_REGS jmp sysenter_do_call - CFI_ENDPROC ENDPROC(ia32_sysenter_target) /* @@ -332,12 +306,6 @@ ENDPROC(ia32_sysenter_target) * with the int 0x80 path. */ ENTRY(ia32_cstar_target) - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,0 - CFI_REGISTER rip,rcx - /*CFI_REGISTER rflags,r11*/ - /* * Interrupts are off on entry. 
* We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -345,7 +313,6 @@ ENTRY(ia32_cstar_target) */ SWAPGS_UNSAFE_STACK movl %esp,%r8d - CFI_REGISTER rsp,r8 movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp ENABLE_INTERRUPTS(CLBR_NONE) @@ -353,22 +320,19 @@ ENTRY(ia32_cstar_target) movl %eax,%eax /* Construct struct pt_regs on stack */ - pushq_cfi $__USER32_DS /* pt_regs->ss */ - pushq_cfi %r8 /* pt_regs->sp */ - CFI_REL_OFFSET rsp,0 - pushq_cfi %r11 /* pt_regs->flags */ - pushq_cfi $__USER32_CS /* pt_regs->cs */ - pushq_cfi %rcx /* pt_regs->ip */ - CFI_REL_OFFSET rip,0 - pushq_cfi_reg rax /* pt_regs->orig_ax */ - pushq_cfi_reg rdi /* pt_regs->di */ - pushq_cfi_reg rsi /* pt_regs->si */ - pushq_cfi_reg rdx /* pt_regs->dx */ - pushq_cfi_reg rbp /* pt_regs->cx */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %r8 /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rbp /* pt_regs->cx */ movl %ebp,%ecx - pushq_cfi $-ENOSYS /* pt_regs->ax */ + pushq $-ENOSYS /* pt_regs->ax */ sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ - CFI_ADJUST_CFA_OFFSET 10*8 /* * no need to do an access_ok check here because r8 has been @@ -380,8 +344,8 @@ ENTRY(ia32_cstar_target) ASM_CLAC orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - CFI_REMEMBER_STATE jnz cstar_tracesys + cstar_do_call: /* 32bit syscall -> 64bit C ABI argument conversion */ movl %edi,%r8d /* arg5 */ @@ -403,15 +367,12 @@ sysretl_from_sys_call: andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) RESTORE_RSI_RDI_RDX movl RIP(%rsp),%ecx - CFI_REGISTER rip,rcx movl EFLAGS(%rsp),%r11d - /*CFI_REGISTER rflags,r11*/ xorq %r10,%r10 xorq %r9,%r9 xorq %r8,%r8 TRACE_IRQS_ON movl RSP(%rsp),%esp - CFI_RESTORE rsp /* * 64bit->32bit SYSRET restores eip from ecx, * eflags from r11 (but RF and VM bits are forced to 0), @@ -430,7 +391,6 @@ sysretl_from_sys_call: #ifdef CONFIG_AUDITSYSCALL cstar_auditsys: - CFI_RESTORE_STATE movl %r9d,R9(%rsp) /* register to be clobbered by call */ auditsys_entry_common movl R9(%rsp),%r9d /* reload 6th syscall arg */ @@ -460,7 +420,6 @@ ia32_badarg: ASM_CLAC movq $-EFAULT,%rax jmp ia32_sysret - CFI_ENDPROC /* * Emulated IA32 system calls via int 0x80. @@ -484,15 +443,6 @@ ia32_badarg: */ ENTRY(ia32_syscall) - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,5*8 - /*CFI_REL_OFFSET ss,4*8 */ - CFI_REL_OFFSET rsp,3*8 - /*CFI_REL_OFFSET rflags,2*8 */ - /*CFI_REL_OFFSET cs,1*8 */ - CFI_REL_OFFSET rip,0*8 - /* * Interrupts are off on entry. 
* We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -506,15 +456,14 @@ ENTRY(ia32_syscall) movl %eax,%eax /* Construct struct pt_regs on stack (iret frame is already on stack) */ - pushq_cfi_reg rax /* pt_regs->orig_ax */ - pushq_cfi_reg rdi /* pt_regs->di */ - pushq_cfi_reg rsi /* pt_regs->si */ - pushq_cfi_reg rdx /* pt_regs->dx */ - pushq_cfi_reg rcx /* pt_regs->cx */ - pushq_cfi $-ENOSYS /* pt_regs->ax */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ cld sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ - CFI_ADJUST_CFA_OFFSET 10*8 orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) @@ -544,7 +493,6 @@ ia32_tracesys: LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ RESTORE_EXTRA_REGS jmp ia32_do_call - CFI_ENDPROC END(ia32_syscall) .macro PTREGSCALL label, func @@ -554,8 +502,6 @@ GLOBAL(\label) jmp ia32_ptregs_common .endm - CFI_STARTPROC32 - PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn PTREGSCALL stub32_sigreturn, sys32_sigreturn PTREGSCALL stub32_fork, sys_fork @@ -569,23 +515,8 @@ GLOBAL(stub32_clone) ALIGN ia32_ptregs_common: - CFI_ENDPROC - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SIZEOF_PTREGS - CFI_REL_OFFSET rax,RAX - CFI_REL_OFFSET rcx,RCX - CFI_REL_OFFSET rdx,RDX - CFI_REL_OFFSET rsi,RSI - CFI_REL_OFFSET rdi,RDI - CFI_REL_OFFSET rip,RIP -/* CFI_REL_OFFSET cs,CS*/ -/* CFI_REL_OFFSET rflags,EFLAGS*/ - CFI_REL_OFFSET rsp,RSP -/* CFI_REL_OFFSET ss,SS*/ SAVE_EXTRA_REGS 8 call *%rax RESTORE_EXTRA_REGS 8 ret - CFI_ENDPROC END(ia32_ptregs_common) diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 1c8b50edb2db..0d76accde45b 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -46,8 +46,6 @@ For 32-bit we have the following conventions - kernel is built with */ -#include - #ifdef CONFIG_X86_64 /* @@ -92,27 +90,26 @@ For 32-bit we have the following conventions - kernel is built with .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 subq $15*8+\addskip, %rsp - CFI_ADJUST_CFA_OFFSET 15*8+\addskip .endm .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1 .if \r11 - movq_cfi r11, 6*8+\offset + movq %r11, 6*8+\offset(%rsp) .endif .if \r8910 - movq_cfi r10, 7*8+\offset - movq_cfi r9, 8*8+\offset - movq_cfi r8, 9*8+\offset + movq %r10, 7*8+\offset(%rsp) + movq %r9, 8*8+\offset(%rsp) + movq %r8, 9*8+\offset(%rsp) .endif .if \rax - movq_cfi rax, 10*8+\offset + movq %rax, 10*8+\offset(%rsp) .endif .if \rcx - movq_cfi rcx, 11*8+\offset + movq %rcx, 11*8+\offset(%rsp) .endif - movq_cfi rdx, 12*8+\offset - movq_cfi rsi, 13*8+\offset - movq_cfi rdi, 14*8+\offset + movq %rdx, 12*8+\offset(%rsp) + movq %rsi, 13*8+\offset(%rsp) + movq %rdi, 14*8+\offset(%rsp) .endm .macro SAVE_C_REGS offset=0 SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 @@ -131,24 +128,24 @@ For 32-bit we have the following conventions - kernel is built with .endm .macro SAVE_EXTRA_REGS offset=0 - movq_cfi r15, 0*8+\offset - movq_cfi r14, 1*8+\offset - movq_cfi r13, 2*8+\offset - movq_cfi r12, 3*8+\offset - movq_cfi rbp, 4*8+\offset - movq_cfi rbx, 5*8+\offset + movq %r15, 0*8+\offset(%rsp) + movq %r14, 1*8+\offset(%rsp) + movq %r13, 2*8+\offset(%rsp) + movq %r12, 3*8+\offset(%rsp) + movq %rbp, 4*8+\offset(%rsp) + movq %rbx, 5*8+\offset(%rsp) .endm .macro 
SAVE_EXTRA_REGS_RBP offset=0 - movq_cfi rbp, 4*8+\offset + movq %rbp, 4*8+\offset(%rsp) .endm .macro RESTORE_EXTRA_REGS offset=0 - movq_cfi_restore 0*8+\offset, r15 - movq_cfi_restore 1*8+\offset, r14 - movq_cfi_restore 2*8+\offset, r13 - movq_cfi_restore 3*8+\offset, r12 - movq_cfi_restore 4*8+\offset, rbp - movq_cfi_restore 5*8+\offset, rbx + movq 0*8+\offset(%rsp), %r15 + movq 1*8+\offset(%rsp), %r14 + movq 2*8+\offset(%rsp), %r13 + movq 3*8+\offset(%rsp), %r12 + movq 4*8+\offset(%rsp), %rbp + movq 5*8+\offset(%rsp), %rbx .endm .macro ZERO_EXTRA_REGS @@ -162,24 +159,24 @@ For 32-bit we have the following conventions - kernel is built with .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 .if \rstor_r11 - movq_cfi_restore 6*8, r11 + movq 6*8(%rsp), %r11 .endif .if \rstor_r8910 - movq_cfi_restore 7*8, r10 - movq_cfi_restore 8*8, r9 - movq_cfi_restore 9*8, r8 + movq 7*8(%rsp), %r10 + movq 8*8(%rsp), %r9 + movq 9*8(%rsp), %r8 .endif .if \rstor_rax - movq_cfi_restore 10*8, rax + movq 10*8(%rsp), %rax .endif .if \rstor_rcx - movq_cfi_restore 11*8, rcx + movq 11*8(%rsp), %rcx .endif .if \rstor_rdx - movq_cfi_restore 12*8, rdx + movq 12*8(%rsp), %rdx .endif - movq_cfi_restore 13*8, rsi - movq_cfi_restore 14*8, rdi + movq 13*8(%rsp), %rsi + movq 14*8(%rsp), %rdi .endm .macro RESTORE_C_REGS RESTORE_C_REGS_HELPER 1,1,1,1,1 @@ -205,7 +202,6 @@ For 32-bit we have the following conventions - kernel is built with .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 addq $15*8+\addskip, %rsp - CFI_ADJUST_CFA_OFFSET -(15*8+\addskip) .endm .macro icebp @@ -224,23 +220,23 @@ For 32-bit we have the following conventions - kernel is built with */ .macro SAVE_ALL - pushl_cfi_reg eax - pushl_cfi_reg ebp - pushl_cfi_reg edi - pushl_cfi_reg esi - pushl_cfi_reg edx - pushl_cfi_reg ecx - pushl_cfi_reg ebx + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx .endm .macro RESTORE_ALL - popl_cfi_reg ebx - popl_cfi_reg ecx - popl_cfi_reg edx - popl_cfi_reg esi - popl_cfi_reg edi - popl_cfi_reg ebp - popl_cfi_reg eax + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax .endm #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h deleted file mode 100644 index de1cdaf4d743..000000000000 --- a/arch/x86/include/asm/dwarf2.h +++ /dev/null @@ -1,170 +0,0 @@ -#ifndef _ASM_X86_DWARF2_H -#define _ASM_X86_DWARF2_H - -#ifndef __ASSEMBLY__ -#warning "asm/dwarf2.h should be only included in pure assembly files" -#endif - -/* - * Macros for dwarf2 CFI unwind table entries. - * See "as.info" for details on these pseudo ops. Unfortunately - * they are only supported in very new binutils, so define them - * away for older version. 
- */ - -#ifdef CONFIG_AS_CFI - -#define CFI_STARTPROC .cfi_startproc -#define CFI_ENDPROC .cfi_endproc -#define CFI_DEF_CFA .cfi_def_cfa -#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register -#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset -#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset -#define CFI_OFFSET .cfi_offset -#define CFI_REL_OFFSET .cfi_rel_offset -#define CFI_REGISTER .cfi_register -#define CFI_RESTORE .cfi_restore -#define CFI_REMEMBER_STATE .cfi_remember_state -#define CFI_RESTORE_STATE .cfi_restore_state -#define CFI_UNDEFINED .cfi_undefined -#define CFI_ESCAPE .cfi_escape - -#ifdef CONFIG_AS_CFI_SIGNAL_FRAME -#define CFI_SIGNAL_FRAME .cfi_signal_frame -#else -#define CFI_SIGNAL_FRAME -#endif - -#if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__) - /* - * Emit CFI data in .debug_frame sections, not .eh_frame sections. - * The latter we currently just discard since we don't do DWARF - * unwinding at runtime. So only the offline DWARF information is - * useful to anyone. Note we should not use this directive if this - * file is used in the vDSO assembly, or if vmlinux.lds.S gets - * changed so it doesn't discard .eh_frame. - */ - .cfi_sections .debug_frame -#endif - -#else - -/* - * Due to the structure of pre-exisiting code, don't use assembler line - * comment character # to ignore the arguments. Instead, use a dummy macro. - */ -.macro cfi_ignore a=0, b=0, c=0, d=0 -.endm - -#define CFI_STARTPROC cfi_ignore -#define CFI_ENDPROC cfi_ignore -#define CFI_DEF_CFA cfi_ignore -#define CFI_DEF_CFA_REGISTER cfi_ignore -#define CFI_DEF_CFA_OFFSET cfi_ignore -#define CFI_ADJUST_CFA_OFFSET cfi_ignore -#define CFI_OFFSET cfi_ignore -#define CFI_REL_OFFSET cfi_ignore -#define CFI_REGISTER cfi_ignore -#define CFI_RESTORE cfi_ignore -#define CFI_REMEMBER_STATE cfi_ignore -#define CFI_RESTORE_STATE cfi_ignore -#define CFI_UNDEFINED cfi_ignore -#define CFI_ESCAPE cfi_ignore -#define CFI_SIGNAL_FRAME cfi_ignore - -#endif - -/* - * An attempt to make CFI annotations more or less - * correct and shorter. It is implied that you know - * what you're doing if you use them. 
- */ -#ifdef __ASSEMBLY__ -#ifdef CONFIG_X86_64 - .macro pushq_cfi reg - pushq \reg - CFI_ADJUST_CFA_OFFSET 8 - .endm - - .macro pushq_cfi_reg reg - pushq %\reg - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET \reg, 0 - .endm - - .macro popq_cfi reg - popq \reg - CFI_ADJUST_CFA_OFFSET -8 - .endm - - .macro popq_cfi_reg reg - popq %\reg - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE \reg - .endm - - .macro pushfq_cfi - pushfq - CFI_ADJUST_CFA_OFFSET 8 - .endm - - .macro popfq_cfi - popfq - CFI_ADJUST_CFA_OFFSET -8 - .endm - - .macro movq_cfi reg offset=0 - movq %\reg, \offset(%rsp) - CFI_REL_OFFSET \reg, \offset - .endm - - .macro movq_cfi_restore offset reg - movq \offset(%rsp), %\reg - CFI_RESTORE \reg - .endm -#else /*!CONFIG_X86_64*/ - .macro pushl_cfi reg - pushl \reg - CFI_ADJUST_CFA_OFFSET 4 - .endm - - .macro pushl_cfi_reg reg - pushl %\reg - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET \reg, 0 - .endm - - .macro popl_cfi reg - popl \reg - CFI_ADJUST_CFA_OFFSET -4 - .endm - - .macro popl_cfi_reg reg - popl %\reg - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE \reg - .endm - - .macro pushfl_cfi - pushfl - CFI_ADJUST_CFA_OFFSET 4 - .endm - - .macro popfl_cfi - popfl - CFI_ADJUST_CFA_OFFSET -4 - .endm - - .macro movl_cfi reg offset=0 - movl %\reg, \offset(%esp) - CFI_REL_OFFSET \reg, \offset - .endm - - .macro movl_cfi_restore offset reg - movl \offset(%esp), %\reg - CFI_RESTORE \reg - .endm -#endif /*!CONFIG_X86_64*/ -#endif /*__ASSEMBLY__*/ - -#endif /* _ASM_X86_DWARF2_H */ diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h index 3b629f47eb65..793179cf8e21 100644 --- a/arch/x86/include/asm/frame.h +++ b/arch/x86/include/asm/frame.h @@ -1,20 +1,17 @@ #ifdef __ASSEMBLY__ #include -#include /* The annotation hides the frame from the unwinder and makes it look like a ordinary ebp save/restore. 
This avoids some special cases for frame pointer later */ #ifdef CONFIG_FRAME_POINTER .macro FRAME - __ASM_SIZE(push,_cfi) %__ASM_REG(bp) - CFI_REL_OFFSET __ASM_REG(bp), 0 + __ASM_SIZE(push,) %__ASM_REG(bp) __ASM_SIZE(mov) %__ASM_REG(sp), %__ASM_REG(bp) .endm .macro ENDFRAME - __ASM_SIZE(pop,_cfi) %__ASM_REG(bp) - CFI_RESTORE __ASM_REG(bp) + __ASM_SIZE(pop,) %__ASM_REG(bp) .endm #else .macro FRAME diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 1c309763e321..0ac73de925d1 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -50,7 +50,6 @@ #include #include #include -#include #include #include #include @@ -113,11 +112,10 @@ /* unfortunately push/pop can't be no-op */ .macro PUSH_GS - pushl_cfi $0 + pushl $0 .endm .macro POP_GS pop=0 addl $(4 + \pop), %esp - CFI_ADJUST_CFA_OFFSET -(4 + \pop) .endm .macro POP_GS_EX .endm @@ -137,16 +135,13 @@ #else /* CONFIG_X86_32_LAZY_GS */ .macro PUSH_GS - pushl_cfi %gs - /*CFI_REL_OFFSET gs, 0*/ + pushl %gs .endm .macro POP_GS pop=0 -98: popl_cfi %gs - /*CFI_RESTORE gs*/ +98: popl %gs .if \pop <> 0 add $\pop, %esp - CFI_ADJUST_CFA_OFFSET -\pop .endif .endm .macro POP_GS_EX @@ -170,11 +165,9 @@ .macro GS_TO_REG reg movl %gs, \reg - /*CFI_REGISTER gs, \reg*/ .endm .macro REG_TO_PTGS reg movl \reg, PT_GS(%esp) - /*CFI_REL_OFFSET gs, PT_GS*/ .endm .macro SET_KERNEL_GS reg movl $(__KERNEL_STACK_CANARY), \reg @@ -186,26 +179,16 @@ .macro SAVE_ALL cld PUSH_GS - pushl_cfi %fs - /*CFI_REL_OFFSET fs, 0;*/ - pushl_cfi %es - /*CFI_REL_OFFSET es, 0;*/ - pushl_cfi %ds - /*CFI_REL_OFFSET ds, 0;*/ - pushl_cfi %eax - CFI_REL_OFFSET eax, 0 - pushl_cfi %ebp - CFI_REL_OFFSET ebp, 0 - pushl_cfi %edi - CFI_REL_OFFSET edi, 0 - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %edx - CFI_REL_OFFSET edx, 0 - pushl_cfi %ecx - CFI_REL_OFFSET ecx, 0 - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx movl $(__USER_DS), %edx movl %edx, %ds movl %edx, %es @@ -215,30 +198,20 @@ .endm .macro RESTORE_INT_REGS - popl_cfi %ebx - CFI_RESTORE ebx - popl_cfi %ecx - CFI_RESTORE ecx - popl_cfi %edx - CFI_RESTORE edx - popl_cfi %esi - CFI_RESTORE esi - popl_cfi %edi - CFI_RESTORE edi - popl_cfi %ebp - CFI_RESTORE ebp - popl_cfi %eax - CFI_RESTORE eax + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax .endm .macro RESTORE_REGS pop=0 RESTORE_INT_REGS -1: popl_cfi %ds - /*CFI_RESTORE ds;*/ -2: popl_cfi %es - /*CFI_RESTORE es;*/ -3: popl_cfi %fs - /*CFI_RESTORE fs;*/ +1: popl %ds +2: popl %es +3: popl %fs POP_GS \pop .pushsection .fixup, "ax" 4: movl $0, (%esp) @@ -254,64 +227,27 @@ POP_GS_EX .endm -.macro RING0_INT_FRAME - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, 3*4 - /*CFI_OFFSET cs, -2*4;*/ - CFI_OFFSET eip, -3*4 -.endm - -.macro RING0_EC_FRAME - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, 4*4 - /*CFI_OFFSET cs, -2*4;*/ - CFI_OFFSET eip, -3*4 -.endm - -.macro RING0_PTREGS_FRAME - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, PT_OLDESP-PT_EBX - /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/ - CFI_OFFSET eip, PT_EIP-PT_OLDESP - /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/ - /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/ - CFI_OFFSET eax, PT_EAX-PT_OLDESP - CFI_OFFSET ebp, PT_EBP-PT_OLDESP - CFI_OFFSET edi, PT_EDI-PT_OLDESP - CFI_OFFSET esi, PT_ESI-PT_OLDESP - CFI_OFFSET edx, PT_EDX-PT_OLDESP - CFI_OFFSET ecx, PT_ECX-PT_OLDESP - CFI_OFFSET ebx, PT_EBX-PT_OLDESP -.endm - 
ENTRY(ret_from_fork) - CFI_STARTPROC - pushl_cfi %eax + pushl %eax call schedule_tail GET_THREAD_INFO(%ebp) - popl_cfi %eax - pushl_cfi $0x0202 # Reset kernel eflags - popfl_cfi + popl %eax + pushl $0x0202 # Reset kernel eflags + popfl jmp syscall_exit - CFI_ENDPROC END(ret_from_fork) ENTRY(ret_from_kernel_thread) - CFI_STARTPROC - pushl_cfi %eax + pushl %eax call schedule_tail GET_THREAD_INFO(%ebp) - popl_cfi %eax - pushl_cfi $0x0202 # Reset kernel eflags - popfl_cfi + popl %eax + pushl $0x0202 # Reset kernel eflags + popfl movl PT_EBP(%esp),%eax call *PT_EBX(%esp) movl $0,PT_EAX(%esp) jmp syscall_exit - CFI_ENDPROC ENDPROC(ret_from_kernel_thread) /* @@ -323,7 +259,6 @@ ENDPROC(ret_from_kernel_thread) # userspace resumption stub bypassing syscall exit tracing ALIGN - RING0_PTREGS_FRAME ret_from_exception: preempt_stop(CLBR_ANY) ret_from_intr: @@ -367,17 +302,12 @@ need_resched: jmp need_resched END(resume_kernel) #endif - CFI_ENDPROC /* SYSENTER_RETURN points to after the "sysenter" instruction in the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ # sysenter call handler stub ENTRY(ia32_sysenter_target) - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, 0 - CFI_REGISTER esp, ebp movl TSS_sysenter_sp0(%esp),%esp sysenter_past_esp: /* @@ -385,14 +315,11 @@ sysenter_past_esp: * enough kernel state to call TRACE_IRQS_OFF can be called - but * we immediately enable interrupts at that point anyway. */ - pushl_cfi $__USER_DS - /*CFI_REL_OFFSET ss, 0*/ - pushl_cfi %ebp - CFI_REL_OFFSET esp, 0 - pushfl_cfi + pushl $__USER_DS + pushl %ebp + pushfl orl $X86_EFLAGS_IF, (%esp) - pushl_cfi $__USER_CS - /*CFI_REL_OFFSET cs, 0*/ + pushl $__USER_CS /* * Push current_thread_info()->sysenter_return to the stack. * A tiny bit of offset fixup is necessary: TI_sysenter_return @@ -401,10 +328,9 @@ sysenter_past_esp: * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; * and THREAD_SIZE takes us to the bottom. 
*/ - pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) - CFI_REL_OFFSET eip, 0 + pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) - pushl_cfi %eax + pushl %eax SAVE_ALL ENABLE_INTERRUPTS(CLBR_NONE) @@ -453,11 +379,11 @@ sysenter_audit: /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */ movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */ /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */ - pushl_cfi PT_ESI(%esp) /* a3: 5th arg */ - pushl_cfi PT_EDX+4(%esp) /* a2: 4th arg */ + pushl PT_ESI(%esp) /* a3: 5th arg */ + pushl PT_EDX+4(%esp) /* a2: 4th arg */ call __audit_syscall_entry - popl_cfi %ecx /* get that remapped edx off the stack */ - popl_cfi %ecx /* get that remapped esi off the stack */ + popl %ecx /* get that remapped edx off the stack */ + popl %ecx /* get that remapped esi off the stack */ movl PT_EAX(%esp),%eax /* reload syscall number */ jmp sysenter_do_call @@ -480,7 +406,6 @@ sysexit_audit: jmp sysenter_exit #endif - CFI_ENDPROC .pushsection .fixup,"ax" 2: movl $0,PT_FS(%esp) jmp 1b @@ -491,9 +416,8 @@ ENDPROC(ia32_sysenter_target) # system call handler stub ENTRY(system_call) - RING0_INT_FRAME # can't unwind into user space anyway ASM_CLAC - pushl_cfi %eax # save orig_eax + pushl %eax # save orig_eax SAVE_ALL GET_THREAD_INFO(%ebp) # system call tracing in operation / emulation @@ -527,7 +451,6 @@ restore_all_notrace: movb PT_CS(%esp), %al andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax - CFI_REMEMBER_STATE je ldt_ss # returning to user-space with LDT SS #endif restore_nocheck: @@ -543,7 +466,6 @@ ENTRY(iret_exc) _ASM_EXTABLE(irq_return,iret_exc) #ifdef CONFIG_X86_ESPFIX32 - CFI_RESTORE_STATE ldt_ss: #ifdef CONFIG_PARAVIRT /* @@ -577,22 +499,19 @@ ldt_ss: shr $16, %edx mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ - pushl_cfi $__ESPFIX_SS - pushl_cfi %eax /* new kernel esp */ + pushl $__ESPFIX_SS + pushl %eax /* new kernel esp */ /* Disable interrupts, but do not irqtrace this section: we * will soon execute iret and the tracer was already set to * the irqstate after the iret */ DISABLE_INTERRUPTS(CLBR_EAX) lss (%esp), %esp /* switch to espfix segment */ - CFI_ADJUST_CFA_OFFSET -8 jmp restore_nocheck #endif - CFI_ENDPROC ENDPROC(system_call) # perform work that needs to be done immediately before resumption ALIGN - RING0_PTREGS_FRAME # can't unwind into user space anyway work_pending: testb $_TIF_NEED_RESCHED, %cl jz work_notifysig @@ -634,9 +553,9 @@ work_notifysig: # deal with pending signals and #ifdef CONFIG_VM86 ALIGN work_notifysig_v86: - pushl_cfi %ecx # save ti_flags for do_notify_resume + pushl %ecx # save ti_flags for do_notify_resume call save_v86_state # %eax contains pt_regs pointer - popl_cfi %ecx + popl %ecx movl %eax, %esp jmp 1b #endif @@ -666,9 +585,7 @@ syscall_exit_work: call syscall_trace_leave jmp resume_userspace END(syscall_exit_work) - CFI_ENDPROC - RING0_INT_FRAME # can't unwind into user space anyway syscall_fault: ASM_CLAC GET_THREAD_INFO(%ebp) @@ -685,7 +602,6 @@ sysenter_badsys: movl $-ENOSYS,%eax jmp sysenter_after_call END(sysenter_badsys) - CFI_ENDPROC .macro FIXUP_ESPFIX_STACK /* @@ -701,10 +617,9 @@ END(sysenter_badsys) mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ shl $16, %eax addl %esp, %eax /* the adjusted stack pointer */ - pushl_cfi $__KERNEL_DS - pushl_cfi %eax + pushl $__KERNEL_DS + pushl %eax lss (%esp), %esp /* 
switch to the normal stack segment */ - CFI_ADJUST_CFA_OFFSET -8 #endif .endm .macro UNWIND_ESPFIX_STACK @@ -728,13 +643,11 @@ END(sysenter_badsys) */ .align 8 ENTRY(irq_entries_start) - RING0_INT_FRAME vector=FIRST_EXTERNAL_VECTOR .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ + pushl $(~vector+0x80) /* Note: always in signed byte range */ vector=vector+1 jmp common_interrupt - CFI_ADJUST_CFA_OFFSET -4 .align 8 .endr END(irq_entries_start) @@ -753,19 +666,16 @@ common_interrupt: call do_IRQ jmp ret_from_intr ENDPROC(common_interrupt) - CFI_ENDPROC #define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ - RING0_INT_FRAME; \ ASM_CLAC; \ - pushl_cfi $~(nr); \ + pushl $~(nr); \ SAVE_ALL; \ TRACE_IRQS_OFF \ movl %esp,%eax; \ call fn; \ jmp ret_from_intr; \ - CFI_ENDPROC; \ ENDPROC(name) @@ -784,37 +694,31 @@ ENDPROC(name) #include ENTRY(coprocessor_error) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_coprocessor_error + pushl $0 + pushl $do_coprocessor_error jmp error_code - CFI_ENDPROC END(coprocessor_error) ENTRY(simd_coprocessor_error) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 + pushl $0 #ifdef CONFIG_X86_INVD_BUG /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ - ALTERNATIVE "pushl_cfi $do_general_protection", \ + ALTERNATIVE "pushl $do_general_protection", \ "pushl $do_simd_coprocessor_error", \ X86_FEATURE_XMM #else - pushl_cfi $do_simd_coprocessor_error + pushl $do_simd_coprocessor_error #endif jmp error_code - CFI_ENDPROC END(simd_coprocessor_error) ENTRY(device_not_available) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $-1 # mark this as an int - pushl_cfi $do_device_not_available + pushl $-1 # mark this as an int + pushl $do_device_not_available jmp error_code - CFI_ENDPROC END(device_not_available) #ifdef CONFIG_PARAVIRT @@ -830,115 +734,89 @@ END(native_irq_enable_sysexit) #endif ENTRY(overflow) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_overflow + pushl $0 + pushl $do_overflow jmp error_code - CFI_ENDPROC END(overflow) ENTRY(bounds) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_bounds + pushl $0 + pushl $do_bounds jmp error_code - CFI_ENDPROC END(bounds) ENTRY(invalid_op) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_invalid_op + pushl $0 + pushl $do_invalid_op jmp error_code - CFI_ENDPROC END(invalid_op) ENTRY(coprocessor_segment_overrun) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_coprocessor_segment_overrun + pushl $0 + pushl $do_coprocessor_segment_overrun jmp error_code - CFI_ENDPROC END(coprocessor_segment_overrun) ENTRY(invalid_TSS) - RING0_EC_FRAME ASM_CLAC - pushl_cfi $do_invalid_TSS + pushl $do_invalid_TSS jmp error_code - CFI_ENDPROC END(invalid_TSS) ENTRY(segment_not_present) - RING0_EC_FRAME ASM_CLAC - pushl_cfi $do_segment_not_present + pushl $do_segment_not_present jmp error_code - CFI_ENDPROC END(segment_not_present) ENTRY(stack_segment) - RING0_EC_FRAME ASM_CLAC - pushl_cfi $do_stack_segment + pushl $do_stack_segment jmp error_code - CFI_ENDPROC END(stack_segment) ENTRY(alignment_check) - RING0_EC_FRAME ASM_CLAC - pushl_cfi $do_alignment_check + pushl $do_alignment_check jmp error_code - CFI_ENDPROC END(alignment_check) ENTRY(divide_error) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 # no error code - pushl_cfi $do_divide_error + pushl $0 # no error code + pushl $do_divide_error jmp error_code - CFI_ENDPROC END(divide_error) #ifdef CONFIG_X86_MCE ENTRY(machine_check) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 - 
pushl_cfi machine_check_vector + pushl $0 + pushl machine_check_vector jmp error_code - CFI_ENDPROC END(machine_check) #endif ENTRY(spurious_interrupt_bug) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_spurious_interrupt_bug + pushl $0 + pushl $do_spurious_interrupt_bug jmp error_code - CFI_ENDPROC END(spurious_interrupt_bug) #ifdef CONFIG_XEN /* Xen doesn't set %esp to be precisely what the normal sysenter entrypoint expects, so fix it up before using the normal path. */ ENTRY(xen_sysenter_target) - RING0_INT_FRAME addl $5*4, %esp /* remove xen-provided frame */ - CFI_ADJUST_CFA_OFFSET -5*4 jmp sysenter_past_esp - CFI_ENDPROC ENTRY(xen_hypervisor_callback) - CFI_STARTPROC - pushl_cfi $-1 /* orig_ax = -1 => not a system call */ + pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL TRACE_IRQS_OFF @@ -962,7 +840,6 @@ ENTRY(xen_do_upcall) call xen_maybe_preempt_hcall #endif jmp ret_from_intr - CFI_ENDPROC ENDPROC(xen_hypervisor_callback) # Hypervisor uses this for application faults while it executes. @@ -976,8 +853,7 @@ ENDPROC(xen_hypervisor_callback) # to pop the stack frame we end up in an infinite loop of failsafe callbacks. # We distinguish between categories by maintaining a status value in EAX. ENTRY(xen_failsafe_callback) - CFI_STARTPROC - pushl_cfi %eax + pushl %eax movl $1,%eax 1: mov 4(%esp),%ds 2: mov 8(%esp),%es @@ -986,15 +862,13 @@ ENTRY(xen_failsafe_callback) /* EAX == 0 => Category 1 (Bad segment) EAX != 0 => Category 2 (Bad IRET) */ testl %eax,%eax - popl_cfi %eax + popl %eax lea 16(%esp),%esp - CFI_ADJUST_CFA_OFFSET -16 jz 5f jmp iret_exc -5: pushl_cfi $-1 /* orig_ax = -1 => not a system call */ +5: pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL jmp ret_from_exception - CFI_ENDPROC .section .fixup,"ax" 6: xorl %eax,%eax @@ -1195,34 +1069,28 @@ return_to_handler: #ifdef CONFIG_TRACING ENTRY(trace_page_fault) - RING0_EC_FRAME ASM_CLAC - pushl_cfi $trace_do_page_fault + pushl $trace_do_page_fault jmp error_code - CFI_ENDPROC END(trace_page_fault) #endif ENTRY(page_fault) - RING0_EC_FRAME ASM_CLAC - pushl_cfi $do_page_fault + pushl $do_page_fault ALIGN error_code: /* the function address is in %gs's slot on the stack */ - pushl_cfi %fs - /*CFI_REL_OFFSET fs, 0*/ - pushl_cfi %es - /*CFI_REL_OFFSET es, 0*/ - pushl_cfi %ds - /*CFI_REL_OFFSET ds, 0*/ - pushl_cfi_reg eax - pushl_cfi_reg ebp - pushl_cfi_reg edi - pushl_cfi_reg esi - pushl_cfi_reg edx - pushl_cfi_reg ecx - pushl_cfi_reg ebx + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx cld movl $(__KERNEL_PERCPU), %ecx movl %ecx, %fs @@ -1240,7 +1108,6 @@ error_code: movl %esp,%eax # pt_regs pointer call *%edi jmp ret_from_exception - CFI_ENDPROC END(page_fault) /* @@ -1261,29 +1128,24 @@ END(page_fault) jne \ok \label: movl TSS_sysenter_sp0 + \offset(%esp), %esp - CFI_DEF_CFA esp, 0 - CFI_UNDEFINED eip - pushfl_cfi - pushl_cfi $__KERNEL_CS - pushl_cfi $sysenter_past_esp - CFI_REL_OFFSET eip, 0 + pushfl + pushl $__KERNEL_CS + pushl $sysenter_past_esp .endm ENTRY(debug) - RING0_INT_FRAME ASM_CLAC cmpl $ia32_sysenter_target,(%esp) jne debug_stack_correct FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn debug_stack_correct: - pushl_cfi $-1 # mark this as an int + pushl $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF xorl %edx,%edx # error code 0 movl %esp,%eax # pt_regs pointer call do_debug jmp ret_from_exception - CFI_ENDPROC END(debug) /* @@ -1295,45 +1157,40 @@ END(debug) * fault happened on the sysenter path. 
*/ ENTRY(nmi) - RING0_INT_FRAME ASM_CLAC #ifdef CONFIG_X86_ESPFIX32 - pushl_cfi %eax + pushl %eax movl %ss, %eax cmpw $__ESPFIX_SS, %ax - popl_cfi %eax + popl %eax je nmi_espfix_stack #endif cmpl $ia32_sysenter_target,(%esp) je nmi_stack_fixup - pushl_cfi %eax + pushl %eax movl %esp,%eax /* Do not access memory above the end of our stack page, * it might not exist. */ andl $(THREAD_SIZE-1),%eax cmpl $(THREAD_SIZE-20),%eax - popl_cfi %eax + popl %eax jae nmi_stack_correct cmpl $ia32_sysenter_target,12(%esp) je nmi_debug_stack_check nmi_stack_correct: - /* We have a RING0_INT_FRAME here */ - pushl_cfi %eax + pushl %eax SAVE_ALL xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_nmi jmp restore_all_notrace - CFI_ENDPROC nmi_stack_fixup: - RING0_INT_FRAME FIX_STACK 12, nmi_stack_correct, 1 jmp nmi_stack_correct nmi_debug_stack_check: - /* We have a RING0_INT_FRAME here */ cmpw $__KERNEL_CS,16(%esp) jne nmi_stack_correct cmpl $debug,(%esp) @@ -1345,57 +1202,48 @@ nmi_debug_stack_check: #ifdef CONFIG_X86_ESPFIX32 nmi_espfix_stack: - /* We have a RING0_INT_FRAME here. - * + /* * create the pointer to lss back */ - pushl_cfi %ss - pushl_cfi %esp + pushl %ss + pushl %esp addl $4, (%esp) /* copy the iret frame of 12 bytes */ .rept 3 - pushl_cfi 16(%esp) + pushl 16(%esp) .endr - pushl_cfi %eax + pushl %eax SAVE_ALL FIXUP_ESPFIX_STACK # %eax == %esp xorl %edx,%edx # zero error code call do_nmi RESTORE_REGS lss 12+4(%esp), %esp # back to espfix stack - CFI_ADJUST_CFA_OFFSET -24 jmp irq_return #endif - CFI_ENDPROC END(nmi) ENTRY(int3) - RING0_INT_FRAME ASM_CLAC - pushl_cfi $-1 # mark this as an int + pushl $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_int3 jmp ret_from_exception - CFI_ENDPROC END(int3) ENTRY(general_protection) - RING0_EC_FRAME - pushl_cfi $do_general_protection + pushl $do_general_protection jmp error_code - CFI_ENDPROC END(general_protection) #ifdef CONFIG_KVM_GUEST ENTRY(async_page_fault) - RING0_EC_FRAME ASM_CLAC - pushl_cfi $do_async_page_fault + pushl $do_async_page_fault jmp error_code - CFI_ENDPROC END(async_page_fault) #endif diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 47b95813dc37..b84cec50c8cf 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -19,8 +19,6 @@ * at the top of the kernel process stack. * * Some macro usage: - * - CFI macros are used to generate dwarf2 unwind information for better - * backtraces. They don't change any code. * - ENTRY/END Define functions in the symbol table. * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. * - idtentry - Define exception entry points. 
@@ -30,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -112,61 +109,6 @@ ENDPROC(native_usergs_sysret64) # define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ #endif -/* - * empty frame - */ - .macro EMPTY_FRAME start=1 offset=0 - .if \start - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,8+\offset - .else - CFI_DEF_CFA_OFFSET 8+\offset - .endif - .endm - -/* - * initial frame state for interrupts (and exceptions without error code) - */ - .macro INTR_FRAME start=1 offset=0 - EMPTY_FRAME \start, 5*8+\offset - /*CFI_REL_OFFSET ss, 4*8+\offset*/ - CFI_REL_OFFSET rsp, 3*8+\offset - /*CFI_REL_OFFSET rflags, 2*8+\offset*/ - /*CFI_REL_OFFSET cs, 1*8+\offset*/ - CFI_REL_OFFSET rip, 0*8+\offset - .endm - -/* - * initial frame state for exceptions with error code (and interrupts - * with vector already pushed) - */ - .macro XCPT_FRAME start=1 offset=0 - INTR_FRAME \start, 1*8+\offset - .endm - -/* - * frame that enables passing a complete pt_regs to a C function. - */ - .macro DEFAULT_FRAME start=1 offset=0 - XCPT_FRAME \start, ORIG_RAX+\offset - CFI_REL_OFFSET rdi, RDI+\offset - CFI_REL_OFFSET rsi, RSI+\offset - CFI_REL_OFFSET rdx, RDX+\offset - CFI_REL_OFFSET rcx, RCX+\offset - CFI_REL_OFFSET rax, RAX+\offset - CFI_REL_OFFSET r8, R8+\offset - CFI_REL_OFFSET r9, R9+\offset - CFI_REL_OFFSET r10, R10+\offset - CFI_REL_OFFSET r11, R11+\offset - CFI_REL_OFFSET rbx, RBX+\offset - CFI_REL_OFFSET rbp, RBP+\offset - CFI_REL_OFFSET r12, R12+\offset - CFI_REL_OFFSET r13, R13+\offset - CFI_REL_OFFSET r14, R14+\offset - CFI_REL_OFFSET r15, R15+\offset - .endm - /* * 64bit SYSCALL instruction entry. Up to 6 arguments in registers. * @@ -196,12 +138,6 @@ ENDPROC(native_usergs_sysret64) */ ENTRY(system_call) - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,0 - CFI_REGISTER rip,rcx - /*CFI_REGISTER rflags,r11*/ - /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -219,8 +155,8 @@ GLOBAL(system_call_after_swapgs) movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp /* Construct struct pt_regs on stack */ - pushq_cfi $__USER_DS /* pt_regs->ss */ - pushq_cfi PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ + pushq $__USER_DS /* pt_regs->ss */ + pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ /* * Re-enable interrupts. 
* We use 'rsp_scratch' as a scratch space, hence irq-off block above @@ -229,22 +165,20 @@ GLOBAL(system_call_after_swapgs) * with using rsp_scratch: */ ENABLE_INTERRUPTS(CLBR_NONE) - pushq_cfi %r11 /* pt_regs->flags */ - pushq_cfi $__USER_CS /* pt_regs->cs */ - pushq_cfi %rcx /* pt_regs->ip */ - CFI_REL_OFFSET rip,0 - pushq_cfi_reg rax /* pt_regs->orig_ax */ - pushq_cfi_reg rdi /* pt_regs->di */ - pushq_cfi_reg rsi /* pt_regs->si */ - pushq_cfi_reg rdx /* pt_regs->dx */ - pushq_cfi_reg rcx /* pt_regs->cx */ - pushq_cfi $-ENOSYS /* pt_regs->ax */ - pushq_cfi_reg r8 /* pt_regs->r8 */ - pushq_cfi_reg r9 /* pt_regs->r9 */ - pushq_cfi_reg r10 /* pt_regs->r10 */ - pushq_cfi_reg r11 /* pt_regs->r11 */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + pushq %r9 /* pt_regs->r9 */ + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ - CFI_ADJUST_CFA_OFFSET 6*8 testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz tracesys @@ -282,13 +216,9 @@ system_call_fastpath: testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ - CFI_REMEMBER_STATE - RESTORE_C_REGS_EXCEPT_RCX_R11 movq RIP(%rsp),%rcx - CFI_REGISTER rip,rcx movq EFLAGS(%rsp),%r11 - /*CFI_REGISTER rflags,r11*/ movq RSP(%rsp),%rsp /* * 64bit SYSRET restores rip from rcx, @@ -307,8 +237,6 @@ system_call_fastpath: */ USERGS_SYSRET64 - CFI_RESTORE_STATE - /* Do syscall entry tracing */ tracesys: movq %rsp, %rdi @@ -374,9 +302,9 @@ int_careful: jnc int_very_careful TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq_cfi %rdi + pushq %rdi SCHEDULE_USER - popq_cfi %rdi + popq %rdi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check @@ -389,10 +317,10 @@ int_very_careful: /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx jz int_signal - pushq_cfi %rdi + pushq %rdi leaq 8(%rsp),%rdi # &ptregs -> arg1 call syscall_trace_leave - popq_cfi %rdi + popq %rdi andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi jmp int_restore_rest @@ -475,27 +403,21 @@ syscall_return: * perf profiles. Nothing jumps here. */ syscall_return_via_sysret: - CFI_REMEMBER_STATE /* rcx and r11 are already restored (see code above) */ RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp),%rsp USERGS_SYSRET64 - CFI_RESTORE_STATE opportunistic_sysret_failed: SWAPGS jmp restore_c_regs_and_iret - CFI_ENDPROC END(system_call) .macro FORK_LIKE func ENTRY(stub_\func) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 /* offset 8: return address */ SAVE_EXTRA_REGS 8 jmp sys_\func - CFI_ENDPROC END(stub_\func) .endm @@ -504,8 +426,6 @@ END(stub_\func) FORK_LIKE vfork ENTRY(stub_execve) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 call sys_execve return_from_execve: testl %eax, %eax @@ -515,11 +435,9 @@ return_from_execve: 1: /* must use IRET code path (pt_regs->cs may have changed) */ addq $8, %rsp - CFI_ADJUST_CFA_OFFSET -8 ZERO_EXTRA_REGS movq %rax,RAX(%rsp) jmp int_ret_from_sys_call - CFI_ENDPROC END(stub_execve) /* * Remaining execve stubs are only 7 bytes long. 
@@ -527,32 +445,23 @@ END(stub_execve) */ .align 8 GLOBAL(stub_execveat) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 call sys_execveat jmp return_from_execve - CFI_ENDPROC END(stub_execveat) #if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) .align 8 GLOBAL(stub_x32_execve) GLOBAL(stub32_execve) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 call compat_sys_execve jmp return_from_execve - CFI_ENDPROC END(stub32_execve) END(stub_x32_execve) .align 8 GLOBAL(stub_x32_execveat) GLOBAL(stub32_execveat) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 call compat_sys_execveat jmp return_from_execve - CFI_ENDPROC END(stub32_execveat) END(stub_x32_execveat) #endif @@ -562,8 +471,6 @@ END(stub_x32_execveat) * This cannot be done with SYSRET, so use the IRET return path instead. */ ENTRY(stub_rt_sigreturn) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 /* * SAVE_EXTRA_REGS result is not normally needed: * sigreturn overwrites all pt_regs->GPREGS. @@ -575,21 +482,16 @@ ENTRY(stub_rt_sigreturn) call sys_rt_sigreturn return_from_stub: addq $8, %rsp - CFI_ADJUST_CFA_OFFSET -8 RESTORE_EXTRA_REGS movq %rax,RAX(%rsp) jmp int_ret_from_sys_call - CFI_ENDPROC END(stub_rt_sigreturn) #ifdef CONFIG_X86_X32_ABI ENTRY(stub_x32_rt_sigreturn) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 SAVE_EXTRA_REGS 8 call sys32_x32_rt_sigreturn jmp return_from_stub - CFI_ENDPROC END(stub_x32_rt_sigreturn) #endif @@ -599,12 +501,11 @@ END(stub_x32_rt_sigreturn) * rdi: prev task we switched from */ ENTRY(ret_from_fork) - DEFAULT_FRAME LOCK ; btr $TIF_FORK,TI_flags(%r8) - pushq_cfi $0x0002 - popfq_cfi # reset kernel eflags + pushq $0x0002 + popfq # reset kernel eflags call schedule_tail # rdi: 'prev' task parameter @@ -628,7 +529,6 @@ ENTRY(ret_from_fork) movl $0, RAX(%rsp) RESTORE_EXTRA_REGS jmp int_ret_from_sys_call - CFI_ENDPROC END(ret_from_fork) /* @@ -637,16 +537,13 @@ END(ret_from_fork) */ .align 8 ENTRY(irq_entries_start) - INTR_FRAME vector=FIRST_EXTERNAL_VECTOR .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ + pushq $(~vector+0x80) /* Note: always in signed byte range */ vector=vector+1 jmp common_interrupt - CFI_ADJUST_CFA_OFFSET -8 .align 8 .endr - CFI_ENDPROC END(irq_entries_start) /* @@ -688,17 +585,7 @@ END(irq_entries_start) movq %rsp, %rsi incl PER_CPU_VAR(irq_count) cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp - CFI_DEF_CFA_REGISTER rsi pushq %rsi - /* - * For debugger: - * "CFA (Current Frame Address) is the value on stack + offset" - */ - CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ - 0x77 /* DW_OP_breg7 (rsp) */, 0, \ - 0x06 /* DW_OP_deref */, \ - 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \ - 0x22 /* DW_OP_plus */ /* We entered an interrupt context - irqs are off: */ TRACE_IRQS_OFF @@ -711,7 +598,6 @@ END(irq_entries_start) */ .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: - XCPT_FRAME ASM_CLAC addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ interrupt do_IRQ @@ -723,11 +609,8 @@ ret_from_intr: /* Restore saved previous stack */ popq %rsi - CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */ /* return code expects complete pt_regs - adjust rsp accordingly: */ leaq -RBP(%rsi),%rsp - CFI_DEF_CFA_REGISTER rsp - CFI_ADJUST_CFA_OFFSET RBP testb $3, CS(%rsp) jz retint_kernel @@ -743,7 +626,6 @@ retint_check: LOCKDEP_SYS_EXIT_IRQ movl TI_flags(%rcx),%edx andl %edi,%edx - CFI_REMEMBER_STATE jnz retint_careful retint_swapgs: /* return to user-space */ @@ -807,8 +689,8 @@ native_irq_return_iret: #ifdef CONFIG_X86_ESPFIX64 
native_irq_return_ldt: - pushq_cfi %rax - pushq_cfi %rdi + pushq %rax + pushq %rdi SWAPGS movq PER_CPU_VAR(espfix_waddr),%rdi movq %rax,(0*8)(%rdi) /* RAX */ @@ -823,24 +705,23 @@ native_irq_return_ldt: movq (5*8)(%rsp),%rax /* RSP */ movq %rax,(4*8)(%rdi) andl $0xffff0000,%eax - popq_cfi %rdi + popq %rdi orq PER_CPU_VAR(espfix_stack),%rax SWAPGS movq %rax,%rsp - popq_cfi %rax + popq %rax jmp native_irq_return_iret #endif /* edi: workmask, edx: work */ retint_careful: - CFI_RESTORE_STATE bt $TIF_NEED_RESCHED,%edx jnc retint_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq_cfi %rdi + pushq %rdi SCHEDULE_USER - popq_cfi %rdi + popq %rdi GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF @@ -862,7 +743,6 @@ retint_signal: GET_THREAD_INFO(%rcx) jmp retint_with_reschedule - CFI_ENDPROC END(common_interrupt) /* @@ -870,13 +750,11 @@ END(common_interrupt) */ .macro apicinterrupt3 num sym do_sym ENTRY(\sym) - INTR_FRAME ASM_CLAC - pushq_cfi $~(\num) + pushq $~(\num) .Lcommon_\sym: interrupt \do_sym jmp ret_from_intr - CFI_ENDPROC END(\sym) .endm @@ -959,24 +837,17 @@ ENTRY(\sym) .error "using shift_ist requires paranoid=1" .endif - .if \has_error_code - XCPT_FRAME - .else - INTR_FRAME - .endif - ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME .ifeq \has_error_code - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + pushq $-1 /* ORIG_RAX: no syscall to restart */ .endif ALLOC_PT_GPREGS_ON_STACK .if \paranoid .if \paranoid == 1 - CFI_REMEMBER_STATE testb $3, CS(%rsp) /* If coming from userspace, switch */ jnz 1f /* stacks. */ .endif @@ -986,8 +857,6 @@ ENTRY(\sym) .endif /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ - DEFAULT_FRAME 0 - .if \paranoid .if \shift_ist != -1 TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ @@ -1023,7 +892,6 @@ ENTRY(\sym) .endif .if \paranoid == 1 - CFI_RESTORE_STATE /* * Paranoid entry from userspace. Switch stacks and treat it * as a normal entry. This means that paranoid handlers @@ -1032,7 +900,6 @@ ENTRY(\sym) 1: call error_entry - DEFAULT_FRAME 0 movq %rsp,%rdi /* pt_regs pointer */ call sync_regs @@ -1051,8 +918,6 @@ ENTRY(\sym) jmp error_exit /* %ebx: no swapgs flag */ .endif - - CFI_ENDPROC END(\sym) .endm @@ -1085,17 +950,15 @@ idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 /* Reload gs selector with exception handling */ /* edi: new selector */ ENTRY(native_load_gs_index) - CFI_STARTPROC - pushfq_cfi + pushfq DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS gs_change: movl %edi,%gs 2: mfence /* workaround */ SWAPGS - popfq_cfi + popfq ret - CFI_ENDPROC END(native_load_gs_index) _ASM_EXTABLE(gs_change,bad_gs) @@ -1110,22 +973,15 @@ bad_gs: /* Call softirq on interrupt stack. Interrupts are off. */ ENTRY(do_softirq_own_stack) - CFI_STARTPROC - pushq_cfi %rbp - CFI_REL_OFFSET rbp,0 + pushq %rbp mov %rsp,%rbp - CFI_DEF_CFA_REGISTER rbp incl PER_CPU_VAR(irq_count) cmove PER_CPU_VAR(irq_stack_ptr),%rsp push %rbp # backlink for old unwinder call __do_softirq leaveq - CFI_RESTORE rbp - CFI_DEF_CFA_REGISTER rsp - CFI_ADJUST_CFA_OFFSET -8 decl PER_CPU_VAR(irq_count) ret - CFI_ENDPROC END(do_softirq_own_stack) #ifdef CONFIG_XEN @@ -1145,28 +1001,22 @@ idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 * activation and restart the handler using the previous one. 
*/ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) - CFI_STARTPROC /* * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will * see the correct pointer to the pt_regs */ movq %rdi, %rsp # we don't return, adjust the stack frame - CFI_ENDPROC - DEFAULT_FRAME 11: incl PER_CPU_VAR(irq_count) movq %rsp,%rbp - CFI_DEF_CFA_REGISTER rbp cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp pushq %rbp # backlink for old unwinder call xen_evtchn_do_upcall popq %rsp - CFI_DEF_CFA_REGISTER rsp decl PER_CPU_VAR(irq_count) #ifndef CONFIG_PREEMPT call xen_maybe_preempt_hcall #endif jmp error_exit - CFI_ENDPROC END(xen_do_hypervisor_callback) /* @@ -1183,16 +1033,8 @@ END(xen_do_hypervisor_callback) * with its current contents: any discrepancy means we in category 1. */ ENTRY(xen_failsafe_callback) - INTR_FRAME 1 (6*8) - /*CFI_REL_OFFSET gs,GS*/ - /*CFI_REL_OFFSET fs,FS*/ - /*CFI_REL_OFFSET es,ES*/ - /*CFI_REL_OFFSET ds,DS*/ - CFI_REL_OFFSET r11,8 - CFI_REL_OFFSET rcx,0 movl %ds,%ecx cmpw %cx,0x10(%rsp) - CFI_REMEMBER_STATE jne 1f movl %es,%ecx cmpw %cx,0x18(%rsp) @@ -1205,29 +1047,21 @@ ENTRY(xen_failsafe_callback) jne 1f /* All segments match their saved values => Category 2 (Bad IRET). */ movq (%rsp),%rcx - CFI_RESTORE rcx movq 8(%rsp),%r11 - CFI_RESTORE r11 addq $0x30,%rsp - CFI_ADJUST_CFA_OFFSET -0x30 - pushq_cfi $0 /* RIP */ - pushq_cfi %r11 - pushq_cfi %rcx + pushq $0 /* RIP */ + pushq %r11 + pushq %rcx jmp general_protection - CFI_RESTORE_STATE 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ movq (%rsp),%rcx - CFI_RESTORE rcx movq 8(%rsp),%r11 - CFI_RESTORE r11 addq $0x30,%rsp - CFI_ADJUST_CFA_OFFSET -0x30 - pushq_cfi $-1 /* orig_ax = -1 => not a system call */ + pushq $-1 /* orig_ax = -1 => not a system call */ ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS SAVE_EXTRA_REGS jmp error_exit - CFI_ENDPROC END(xen_failsafe_callback) apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ @@ -1263,7 +1097,6 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector( * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(paranoid_entry) - XCPT_FRAME 1 15*8 cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 @@ -1275,7 +1108,6 @@ ENTRY(paranoid_entry) SWAPGS xorl %ebx,%ebx 1: ret - CFI_ENDPROC END(paranoid_entry) /* @@ -1290,7 +1122,6 @@ END(paranoid_entry) */ /* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) - DEFAULT_FRAME DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF_DEBUG testl %ebx,%ebx /* swapgs needed? */ @@ -1305,7 +1136,6 @@ paranoid_exit_restore: RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 INTERRUPT_RETURN - CFI_ENDPROC END(paranoid_exit) /* @@ -1313,7 +1143,6 @@ END(paranoid_exit) * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(error_entry) - XCPT_FRAME 1 15*8 cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 @@ -1333,7 +1162,6 @@ error_sti: * for these here too. 
*/ error_kernelspace: - CFI_REL_OFFSET rcx, RCX+8 incl %ebx leaq native_irq_return_iret(%rip),%rcx cmpq %rcx,RIP+8(%rsp) @@ -1357,13 +1185,11 @@ error_bad_iret: mov %rax,%rsp decl %ebx /* Return to usergs */ jmp error_sti - CFI_ENDPROC END(error_entry) /* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(error_exit) - DEFAULT_FRAME movl %ebx,%eax RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) @@ -1377,12 +1203,10 @@ ENTRY(error_exit) andl %edi,%edx jnz retint_careful jmp retint_swapgs - CFI_ENDPROC END(error_exit) /* Runs on exception stack */ ENTRY(nmi) - INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME /* * We allow breakpoints in NMIs. If a breakpoint occurs, then @@ -1417,8 +1241,7 @@ ENTRY(nmi) */ /* Use %rdx as our temp variable throughout */ - pushq_cfi %rdx - CFI_REL_OFFSET rdx, 0 + pushq %rdx /* * If %cs was not the kernel segment, then the NMI triggered in user @@ -1452,8 +1275,6 @@ ENTRY(nmi) jb first_nmi /* Ah, it is within the NMI stack, treat it as nested */ - CFI_REMEMBER_STATE - nested_nmi: /* * Do nothing if we interrupted the fixup in repeat_nmi. @@ -1471,26 +1292,22 @@ nested_nmi: /* Set up the interrupted NMIs stack to jump to repeat_nmi */ leaq -1*8(%rsp), %rdx movq %rdx, %rsp - CFI_ADJUST_CFA_OFFSET 1*8 leaq -10*8(%rsp), %rdx - pushq_cfi $__KERNEL_DS - pushq_cfi %rdx - pushfq_cfi - pushq_cfi $__KERNEL_CS - pushq_cfi $repeat_nmi + pushq $__KERNEL_DS + pushq %rdx + pushfq + pushq $__KERNEL_CS + pushq $repeat_nmi /* Put stack back */ addq $(6*8), %rsp - CFI_ADJUST_CFA_OFFSET -6*8 nested_nmi_out: - popq_cfi %rdx - CFI_RESTORE rdx + popq %rdx /* No need to check faults here */ INTERRUPT_RETURN - CFI_RESTORE_STATE first_nmi: /* * Because nested NMIs will use the pushed location that we @@ -1529,22 +1346,19 @@ first_nmi: */ /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ movq (%rsp), %rdx - CFI_RESTORE rdx /* Set the NMI executing variable on the stack. */ - pushq_cfi $1 + pushq $1 /* * Leave room for the "copied" frame */ subq $(5*8), %rsp - CFI_ADJUST_CFA_OFFSET 5*8 /* Copy the stack frame to the Saved frame */ .rept 5 - pushq_cfi 11*8(%rsp) + pushq 11*8(%rsp) .endr - CFI_DEF_CFA_OFFSET 5*8 /* Everything up to here is safe from nested NMIs */ @@ -1567,12 +1381,10 @@ repeat_nmi: /* Make another copy, this one may be modified by nested NMIs */ addq $(10*8), %rsp - CFI_ADJUST_CFA_OFFSET -10*8 .rept 5 - pushq_cfi -6*8(%rsp) + pushq -6*8(%rsp) .endr subq $(5*8), %rsp - CFI_DEF_CFA_OFFSET 5*8 end_repeat_nmi: /* @@ -1580,7 +1392,7 @@ end_repeat_nmi: * NMI if the first NMI took an exception and reset our iret stack * so that we repeat another NMI. */ - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + pushq $-1 /* ORIG_RAX: no syscall to restart */ ALLOC_PT_GPREGS_ON_STACK /* @@ -1591,7 +1403,6 @@ end_repeat_nmi: * exceptions might do. */ call paranoid_entry - DEFAULT_FRAME 0 /* * Save off the CR2 register. 
If we take a page fault in the NMI then @@ -1628,13 +1439,10 @@ nmi_restore: /* Clear the NMI executing stack variable */ movq $0, 5*8(%rsp) jmp irq_return - CFI_ENDPROC END(nmi) ENTRY(ignore_sysret) - CFI_STARTPROC mov $-ENOSYS,%eax sysret - CFI_ENDPROC END(ignore_sysret) diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S index 00933d5e992f..9b0ca8fe80fc 100644 --- a/arch/x86/lib/atomic64_386_32.S +++ b/arch/x86/lib/atomic64_386_32.S @@ -11,26 +11,23 @@ #include #include -#include /* if you want SMP support, implement these with real spinlocks */ .macro LOCK reg - pushfl_cfi + pushfl cli .endm .macro UNLOCK reg - popfl_cfi + popfl .endm #define BEGIN(op) \ .macro endp; \ - CFI_ENDPROC; \ ENDPROC(atomic64_##op##_386); \ .purgem endp; \ .endm; \ ENTRY(atomic64_##op##_386); \ - CFI_STARTPROC; \ LOCK v; #define ENDP endp diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S index 082a85167a5b..db3ae85440ff 100644 --- a/arch/x86/lib/atomic64_cx8_32.S +++ b/arch/x86/lib/atomic64_cx8_32.S @@ -11,7 +11,6 @@ #include #include -#include .macro read64 reg movl %ebx, %eax @@ -22,16 +21,11 @@ .endm ENTRY(atomic64_read_cx8) - CFI_STARTPROC - read64 %ecx ret - CFI_ENDPROC ENDPROC(atomic64_read_cx8) ENTRY(atomic64_set_cx8) - CFI_STARTPROC - 1: /* we don't need LOCK_PREFIX since aligned 64-bit writes * are atomic on 586 and newer */ @@ -39,28 +33,23 @@ ENTRY(atomic64_set_cx8) jne 1b ret - CFI_ENDPROC ENDPROC(atomic64_set_cx8) ENTRY(atomic64_xchg_cx8) - CFI_STARTPROC - 1: LOCK_PREFIX cmpxchg8b (%esi) jne 1b ret - CFI_ENDPROC ENDPROC(atomic64_xchg_cx8) .macro addsub_return func ins insc ENTRY(atomic64_\func\()_return_cx8) - CFI_STARTPROC - pushl_cfi_reg ebp - pushl_cfi_reg ebx - pushl_cfi_reg esi - pushl_cfi_reg edi + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi movl %eax, %esi movl %edx, %edi @@ -79,12 +68,11 @@ ENTRY(atomic64_\func\()_return_cx8) 10: movl %ebx, %eax movl %ecx, %edx - popl_cfi_reg edi - popl_cfi_reg esi - popl_cfi_reg ebx - popl_cfi_reg ebp + popl %edi + popl %esi + popl %ebx + popl %ebp ret - CFI_ENDPROC ENDPROC(atomic64_\func\()_return_cx8) .endm @@ -93,8 +81,7 @@ addsub_return sub sub sbb .macro incdec_return func ins insc ENTRY(atomic64_\func\()_return_cx8) - CFI_STARTPROC - pushl_cfi_reg ebx + pushl %ebx read64 %esi 1: @@ -109,9 +96,8 @@ ENTRY(atomic64_\func\()_return_cx8) 10: movl %ebx, %eax movl %ecx, %edx - popl_cfi_reg ebx + popl %ebx ret - CFI_ENDPROC ENDPROC(atomic64_\func\()_return_cx8) .endm @@ -119,8 +105,7 @@ incdec_return inc add adc incdec_return dec sub sbb ENTRY(atomic64_dec_if_positive_cx8) - CFI_STARTPROC - pushl_cfi_reg ebx + pushl %ebx read64 %esi 1: @@ -136,18 +121,16 @@ ENTRY(atomic64_dec_if_positive_cx8) 2: movl %ebx, %eax movl %ecx, %edx - popl_cfi_reg ebx + popl %ebx ret - CFI_ENDPROC ENDPROC(atomic64_dec_if_positive_cx8) ENTRY(atomic64_add_unless_cx8) - CFI_STARTPROC - pushl_cfi_reg ebp - pushl_cfi_reg ebx + pushl %ebp + pushl %ebx /* these just push these two parameters on the stack */ - pushl_cfi_reg edi - pushl_cfi_reg ecx + pushl %edi + pushl %ecx movl %eax, %ebp movl %edx, %edi @@ -168,21 +151,18 @@ ENTRY(atomic64_add_unless_cx8) movl $1, %eax 3: addl $8, %esp - CFI_ADJUST_CFA_OFFSET -8 - popl_cfi_reg ebx - popl_cfi_reg ebp + popl %ebx + popl %ebp ret 4: cmpl %edx, 4(%esp) jne 2b xorl %eax, %eax jmp 3b - CFI_ENDPROC ENDPROC(atomic64_add_unless_cx8) ENTRY(atomic64_inc_not_zero_cx8) - CFI_STARTPROC - pushl_cfi_reg ebx + pushl %ebx read64 %esi 1: @@ -199,7 +179,6 @@ ENTRY(atomic64_inc_not_zero_cx8) movl 
$1, %eax 3: - popl_cfi_reg ebx + popl %ebx ret - CFI_ENDPROC ENDPROC(atomic64_inc_not_zero_cx8) diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index 9bc944a91274..c1e623209853 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -26,7 +26,6 @@ */ #include -#include #include #include @@ -50,9 +49,8 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) * alignment for the unrolled loop. */ ENTRY(csum_partial) - CFI_STARTPROC - pushl_cfi_reg esi - pushl_cfi_reg ebx + pushl %esi + pushl %ebx movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len movl 12(%esp),%esi # Function arg: unsigned char *buff @@ -129,10 +127,9 @@ ENTRY(csum_partial) jz 8f roll $8, %eax 8: - popl_cfi_reg ebx - popl_cfi_reg esi + popl %ebx + popl %esi ret - CFI_ENDPROC ENDPROC(csum_partial) #else @@ -140,9 +137,8 @@ ENDPROC(csum_partial) /* Version for PentiumII/PPro */ ENTRY(csum_partial) - CFI_STARTPROC - pushl_cfi_reg esi - pushl_cfi_reg ebx + pushl %esi + pushl %ebx movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len movl 12(%esp),%esi # Function arg: const unsigned char *buf @@ -249,10 +245,9 @@ ENTRY(csum_partial) jz 90f roll $8, %eax 90: - popl_cfi_reg ebx - popl_cfi_reg esi + popl %ebx + popl %esi ret - CFI_ENDPROC ENDPROC(csum_partial) #endif @@ -287,12 +282,10 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst, #define FP 12 ENTRY(csum_partial_copy_generic) - CFI_STARTPROC subl $4,%esp - CFI_ADJUST_CFA_OFFSET 4 - pushl_cfi_reg edi - pushl_cfi_reg esi - pushl_cfi_reg ebx + pushl %edi + pushl %esi + pushl %ebx movl ARGBASE+16(%esp),%eax # sum movl ARGBASE+12(%esp),%ecx # len movl ARGBASE+4(%esp),%esi # src @@ -401,12 +394,11 @@ DST( movb %cl, (%edi) ) .previous - popl_cfi_reg ebx - popl_cfi_reg esi - popl_cfi_reg edi - popl_cfi %ecx # equivalent to addl $4,%esp + popl %ebx + popl %esi + popl %edi + popl %ecx # equivalent to addl $4,%esp ret - CFI_ENDPROC ENDPROC(csum_partial_copy_generic) #else @@ -426,10 +418,9 @@ ENDPROC(csum_partial_copy_generic) #define ARGBASE 12 ENTRY(csum_partial_copy_generic) - CFI_STARTPROC - pushl_cfi_reg ebx - pushl_cfi_reg edi - pushl_cfi_reg esi + pushl %ebx + pushl %edi + pushl %esi movl ARGBASE+4(%esp),%esi #src movl ARGBASE+8(%esp),%edi #dst movl ARGBASE+12(%esp),%ecx #len @@ -489,11 +480,10 @@ DST( movb %dl, (%edi) ) jmp 7b .previous - popl_cfi_reg esi - popl_cfi_reg edi - popl_cfi_reg ebx + popl %esi + popl %edi + popl %ebx ret - CFI_ENDPROC ENDPROC(csum_partial_copy_generic) #undef ROUND diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index e67e579c93bd..a2fe51b00cce 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,5 +1,4 @@ #include -#include #include #include @@ -15,7 +14,6 @@ * %rdi - page */ ENTRY(clear_page) - CFI_STARTPROC ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \ "jmp clear_page_c_e", X86_FEATURE_ERMS @@ -24,11 +22,9 @@ ENTRY(clear_page) xorl %eax,%eax rep stosq ret - CFI_ENDPROC ENDPROC(clear_page) ENTRY(clear_page_orig) - CFI_STARTPROC xorl %eax,%eax movl $4096/64,%ecx @@ -48,14 +44,11 @@ ENTRY(clear_page_orig) jnz .Lloop nop ret - CFI_ENDPROC ENDPROC(clear_page_orig) ENTRY(clear_page_c_e) - CFI_STARTPROC movl $4096,%ecx xorl %eax,%eax rep stosb ret - CFI_ENDPROC ENDPROC(clear_page_c_e) diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S index 40a172541ee2..9b330242e740 100644 --- 
a/arch/x86/lib/cmpxchg16b_emu.S +++ b/arch/x86/lib/cmpxchg16b_emu.S @@ -6,7 +6,6 @@ * */ #include -#include #include .text @@ -21,7 +20,6 @@ * %al : Operation successful */ ENTRY(this_cpu_cmpxchg16b_emu) -CFI_STARTPROC # # Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not @@ -32,7 +30,7 @@ CFI_STARTPROC # *atomic* on a single cpu (as provided by the this_cpu_xx class of # macros). # - pushfq_cfi + pushfq cli cmpq PER_CPU_VAR((%rsi)), %rax @@ -43,17 +41,13 @@ CFI_STARTPROC movq %rbx, PER_CPU_VAR((%rsi)) movq %rcx, PER_CPU_VAR(8(%rsi)) - CFI_REMEMBER_STATE - popfq_cfi + popfq mov $1, %al ret - CFI_RESTORE_STATE .Lnot_same: - popfq_cfi + popfq xor %al,%al ret -CFI_ENDPROC - ENDPROC(this_cpu_cmpxchg16b_emu) diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S index b4807fce5177..ad5349778490 100644 --- a/arch/x86/lib/cmpxchg8b_emu.S +++ b/arch/x86/lib/cmpxchg8b_emu.S @@ -7,7 +7,6 @@ */ #include -#include .text @@ -20,14 +19,13 @@ * %ecx : high 32 bits of new value */ ENTRY(cmpxchg8b_emu) -CFI_STARTPROC # # Emulate 'cmpxchg8b (%esi)' on UP except we don't # set the whole ZF thing (caller will just compare # eax:edx with the expected value) # - pushfl_cfi + pushfl cli cmpl (%esi), %eax @@ -38,18 +36,15 @@ CFI_STARTPROC movl %ebx, (%esi) movl %ecx, 4(%esi) - CFI_REMEMBER_STATE - popfl_cfi + popfl ret - CFI_RESTORE_STATE .Lnot_same: movl (%esi), %eax .Lhalf_same: movl 4(%esi), %edx - popfl_cfi + popfl ret -CFI_ENDPROC ENDPROC(cmpxchg8b_emu) diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 8239dbcbf984..009f98216b7e 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -1,7 +1,6 @@ /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ #include -#include #include #include @@ -13,22 +12,16 @@ */ ALIGN ENTRY(copy_page) - CFI_STARTPROC ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD movl $4096/8, %ecx rep movsq ret - CFI_ENDPROC ENDPROC(copy_page) ENTRY(copy_page_regs) - CFI_STARTPROC subq $2*8, %rsp - CFI_ADJUST_CFA_OFFSET 2*8 movq %rbx, (%rsp) - CFI_REL_OFFSET rbx, 0 movq %r12, 1*8(%rsp) - CFI_REL_OFFSET r12, 1*8 movl $(4096/64)-5, %ecx .p2align 4 @@ -87,11 +80,7 @@ ENTRY(copy_page_regs) jnz .Loop2 movq (%rsp), %rbx - CFI_RESTORE rbx movq 1*8(%rsp), %r12 - CFI_RESTORE r12 addq $2*8, %rsp - CFI_ADJUST_CFA_OFFSET -2*8 ret - CFI_ENDPROC ENDPROC(copy_page_regs) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index e4b3beee83bd..982ce34f4a9b 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -7,7 +7,6 @@ */ #include -#include #include #include #include @@ -18,7 +17,6 @@ /* Standard copy_to_user with segment limit checking */ ENTRY(_copy_to_user) - CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rdi,%rcx addq %rdx,%rcx @@ -30,12 +28,10 @@ ENTRY(_copy_to_user) X86_FEATURE_REP_GOOD, \ "jmp copy_user_enhanced_fast_string", \ X86_FEATURE_ERMS - CFI_ENDPROC ENDPROC(_copy_to_user) /* Standard copy_from_user with segment limit checking */ ENTRY(_copy_from_user) - CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rsi,%rcx addq %rdx,%rcx @@ -47,14 +43,12 @@ ENTRY(_copy_from_user) X86_FEATURE_REP_GOOD, \ "jmp copy_user_enhanced_fast_string", \ X86_FEATURE_ERMS - CFI_ENDPROC ENDPROC(_copy_from_user) .section .fixup,"ax" /* must zero dest */ ENTRY(bad_from_user) bad_from_user: - CFI_STARTPROC movl %edx,%ecx xorl %eax,%eax rep @@ -62,7 +56,6 @@ bad_from_user: bad_to_user: movl %edx,%eax ret - CFI_ENDPROC ENDPROC(bad_from_user) .previous @@ -80,7 +73,6 @@ 
ENDPROC(bad_from_user) * eax uncopied bytes or 0 if successful. */ ENTRY(copy_user_generic_unrolled) - CFI_STARTPROC ASM_STAC cmpl $8,%edx jb 20f /* less then 8 bytes, go to byte copy loop */ @@ -162,7 +154,6 @@ ENTRY(copy_user_generic_unrolled) _ASM_EXTABLE(19b,40b) _ASM_EXTABLE(21b,50b) _ASM_EXTABLE(22b,50b) - CFI_ENDPROC ENDPROC(copy_user_generic_unrolled) /* Some CPUs run faster using the string copy instructions. @@ -184,7 +175,6 @@ ENDPROC(copy_user_generic_unrolled) * eax uncopied bytes or 0 if successful. */ ENTRY(copy_user_generic_string) - CFI_STARTPROC ASM_STAC cmpl $8,%edx jb 2f /* less than 8 bytes, go to byte copy loop */ @@ -209,7 +199,6 @@ ENTRY(copy_user_generic_string) _ASM_EXTABLE(1b,11b) _ASM_EXTABLE(3b,12b) - CFI_ENDPROC ENDPROC(copy_user_generic_string) /* @@ -225,7 +214,6 @@ ENDPROC(copy_user_generic_string) * eax uncopied bytes or 0 if successful. */ ENTRY(copy_user_enhanced_fast_string) - CFI_STARTPROC ASM_STAC movl %edx,%ecx 1: rep @@ -240,7 +228,6 @@ ENTRY(copy_user_enhanced_fast_string) .previous _ASM_EXTABLE(1b,12b) - CFI_ENDPROC ENDPROC(copy_user_enhanced_fast_string) /* @@ -248,7 +235,6 @@ ENDPROC(copy_user_enhanced_fast_string) * This will force destination/source out of cache for more performance. */ ENTRY(__copy_user_nocache) - CFI_STARTPROC ASM_STAC cmpl $8,%edx jb 20f /* less then 8 bytes, go to byte copy loop */ @@ -332,5 +318,4 @@ ENTRY(__copy_user_nocache) _ASM_EXTABLE(19b,40b) _ASM_EXTABLE(21b,50b) _ASM_EXTABLE(22b,50b) - CFI_ENDPROC ENDPROC(__copy_user_nocache) diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S index 9734182966f3..7e48807b2fa1 100644 --- a/arch/x86/lib/csum-copy_64.S +++ b/arch/x86/lib/csum-copy_64.S @@ -6,7 +6,6 @@ * for more details. No warranty for anything given at all. */ #include -#include #include #include @@ -47,23 +46,16 @@ ENTRY(csum_partial_copy_generic) - CFI_STARTPROC cmpl $3*64, %edx jle .Lignore .Lignore: subq $7*8, %rsp - CFI_ADJUST_CFA_OFFSET 7*8 movq %rbx, 2*8(%rsp) - CFI_REL_OFFSET rbx, 2*8 movq %r12, 3*8(%rsp) - CFI_REL_OFFSET r12, 3*8 movq %r14, 4*8(%rsp) - CFI_REL_OFFSET r14, 4*8 movq %r13, 5*8(%rsp) - CFI_REL_OFFSET r13, 5*8 movq %rbp, 6*8(%rsp) - CFI_REL_OFFSET rbp, 6*8 movq %r8, (%rsp) movq %r9, 1*8(%rsp) @@ -206,22 +198,14 @@ ENTRY(csum_partial_copy_generic) addl %ebx, %eax adcl %r9d, %eax /* carry */ - CFI_REMEMBER_STATE .Lende: movq 2*8(%rsp), %rbx - CFI_RESTORE rbx movq 3*8(%rsp), %r12 - CFI_RESTORE r12 movq 4*8(%rsp), %r14 - CFI_RESTORE r14 movq 5*8(%rsp), %r13 - CFI_RESTORE r13 movq 6*8(%rsp), %rbp - CFI_RESTORE rbp addq $7*8, %rsp - CFI_ADJUST_CFA_OFFSET -7*8 ret - CFI_RESTORE_STATE /* Exception handlers. 
Very simple, zeroing is done in the wrappers */ .Lbad_source: @@ -237,5 +221,4 @@ ENTRY(csum_partial_copy_generic) jz .Lende movl $-EFAULT, (%rax) jmp .Lende - CFI_ENDPROC ENDPROC(csum_partial_copy_generic) diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index a4512359656a..46668cda4ffd 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -26,7 +26,6 @@ */ #include -#include #include #include #include @@ -36,7 +35,6 @@ .text ENTRY(__get_user_1) - CFI_STARTPROC GET_THREAD_INFO(%_ASM_DX) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user @@ -45,11 +43,9 @@ ENTRY(__get_user_1) xor %eax,%eax ASM_CLAC ret - CFI_ENDPROC ENDPROC(__get_user_1) ENTRY(__get_user_2) - CFI_STARTPROC add $1,%_ASM_AX jc bad_get_user GET_THREAD_INFO(%_ASM_DX) @@ -60,11 +56,9 @@ ENTRY(__get_user_2) xor %eax,%eax ASM_CLAC ret - CFI_ENDPROC ENDPROC(__get_user_2) ENTRY(__get_user_4) - CFI_STARTPROC add $3,%_ASM_AX jc bad_get_user GET_THREAD_INFO(%_ASM_DX) @@ -75,11 +69,9 @@ ENTRY(__get_user_4) xor %eax,%eax ASM_CLAC ret - CFI_ENDPROC ENDPROC(__get_user_4) ENTRY(__get_user_8) - CFI_STARTPROC #ifdef CONFIG_X86_64 add $7,%_ASM_AX jc bad_get_user @@ -104,28 +96,23 @@ ENTRY(__get_user_8) ASM_CLAC ret #endif - CFI_ENDPROC ENDPROC(__get_user_8) bad_get_user: - CFI_STARTPROC xor %edx,%edx mov $(-EFAULT),%_ASM_AX ASM_CLAC ret - CFI_ENDPROC END(bad_get_user) #ifdef CONFIG_X86_32 bad_get_user_8: - CFI_STARTPROC xor %edx,%edx xor %ecx,%ecx mov $(-EFAULT),%_ASM_AX ASM_CLAC ret - CFI_ENDPROC END(bad_get_user_8) #endif diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S index 05a95e713da8..33147fef3452 100644 --- a/arch/x86/lib/iomap_copy_64.S +++ b/arch/x86/lib/iomap_copy_64.S @@ -16,15 +16,12 @@ */ #include -#include /* * override generic version in lib/iomap_copy.c */ ENTRY(__iowrite32_copy) - CFI_STARTPROC movl %edx,%ecx rep movsd ret - CFI_ENDPROC ENDPROC(__iowrite32_copy) diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index b046664f5a1c..16698bba87de 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -2,7 +2,6 @@ #include #include -#include #include /* @@ -53,7 +52,6 @@ ENTRY(memcpy_erms) ENDPROC(memcpy_erms) ENTRY(memcpy_orig) - CFI_STARTPROC movq %rdi, %rax cmpq $0x20, %rdx @@ -178,5 +176,4 @@ ENTRY(memcpy_orig) .Lend: retq - CFI_ENDPROC ENDPROC(memcpy_orig) diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 0f8a0d0331b9..ca2afdd6d98e 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -6,7 +6,6 @@ * - Copyright 2011 Fenghua Yu */ #include -#include #include #include @@ -27,7 +26,6 @@ ENTRY(memmove) ENTRY(__memmove) - CFI_STARTPROC /* Handle more 32 bytes in loop */ mov %rdi, %rax @@ -207,6 +205,5 @@ ENTRY(__memmove) movb %r11b, (%rdi) 13: retq - CFI_ENDPROC ENDPROC(__memmove) ENDPROC(memmove) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 93118fb23976..2661fad05827 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -1,7 +1,6 @@ /* Copyright 2002 Andi Kleen, SuSE Labs */ #include -#include #include #include @@ -66,7 +65,6 @@ ENTRY(memset_erms) ENDPROC(memset_erms) ENTRY(memset_orig) - CFI_STARTPROC movq %rdi,%r10 /* expand byte value */ @@ -78,7 +76,6 @@ ENTRY(memset_orig) movl %edi,%r9d andl $7,%r9d jnz .Lbad_alignment - CFI_REMEMBER_STATE .Lafter_bad_alignment: movq %rdx,%rcx @@ -128,7 +125,6 @@ ENTRY(memset_orig) movq %r10,%rax ret - CFI_RESTORE_STATE .Lbad_alignment: cmpq $7,%rdx jbe .Lhandle_7 @@ -139,5 +135,4 @@ ENTRY(memset_orig) subq %r8,%rdx jmp 
.Lafter_bad_alignment .Lfinal: - CFI_ENDPROC ENDPROC(memset_orig) diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S index 3ca5218fbece..c81556409bbb 100644 --- a/arch/x86/lib/msr-reg.S +++ b/arch/x86/lib/msr-reg.S @@ -1,6 +1,5 @@ #include #include -#include #include #include @@ -13,9 +12,8 @@ */ .macro op_safe_regs op ENTRY(\op\()_safe_regs) - CFI_STARTPROC - pushq_cfi_reg rbx - pushq_cfi_reg rbp + pushq %rbx + pushq %rbp movq %rdi, %r10 /* Save pointer */ xorl %r11d, %r11d /* Return value */ movl (%rdi), %eax @@ -25,7 +23,6 @@ ENTRY(\op\()_safe_regs) movl 20(%rdi), %ebp movl 24(%rdi), %esi movl 28(%rdi), %edi - CFI_REMEMBER_STATE 1: \op 2: movl %eax, (%r10) movl %r11d, %eax /* Return value */ @@ -35,16 +32,14 @@ ENTRY(\op\()_safe_regs) movl %ebp, 20(%r10) movl %esi, 24(%r10) movl %edi, 28(%r10) - popq_cfi_reg rbp - popq_cfi_reg rbx + popq %rbp + popq %rbx ret 3: - CFI_RESTORE_STATE movl $-EIO, %r11d jmp 2b _ASM_EXTABLE(1b, 3b) - CFI_ENDPROC ENDPROC(\op\()_safe_regs) .endm @@ -52,13 +47,12 @@ ENDPROC(\op\()_safe_regs) .macro op_safe_regs op ENTRY(\op\()_safe_regs) - CFI_STARTPROC - pushl_cfi_reg ebx - pushl_cfi_reg ebp - pushl_cfi_reg esi - pushl_cfi_reg edi - pushl_cfi $0 /* Return value */ - pushl_cfi %eax + pushl %ebx + pushl %ebp + pushl %esi + pushl %edi + pushl $0 /* Return value */ + pushl %eax movl 4(%eax), %ecx movl 8(%eax), %edx movl 12(%eax), %ebx @@ -66,32 +60,28 @@ ENTRY(\op\()_safe_regs) movl 24(%eax), %esi movl 28(%eax), %edi movl (%eax), %eax - CFI_REMEMBER_STATE 1: \op -2: pushl_cfi %eax +2: pushl %eax movl 4(%esp), %eax - popl_cfi (%eax) + popl (%eax) addl $4, %esp - CFI_ADJUST_CFA_OFFSET -4 movl %ecx, 4(%eax) movl %edx, 8(%eax) movl %ebx, 12(%eax) movl %ebp, 20(%eax) movl %esi, 24(%eax) movl %edi, 28(%eax) - popl_cfi %eax - popl_cfi_reg edi - popl_cfi_reg esi - popl_cfi_reg ebp - popl_cfi_reg ebx + popl %eax + popl %edi + popl %esi + popl %ebp + popl %ebx ret 3: - CFI_RESTORE_STATE movl $-EIO, 4(%esp) jmp 2b _ASM_EXTABLE(1b, 3b) - CFI_ENDPROC ENDPROC(\op\()_safe_regs) .endm diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index fc6ba17a7eec..e0817a12d323 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -11,7 +11,6 @@ * return value. */ #include -#include #include #include #include @@ -30,11 +29,9 @@ * as they get called from within inline assembly. */ -#define ENTER CFI_STARTPROC ; \ - GET_THREAD_INFO(%_ASM_BX) +#define ENTER GET_THREAD_INFO(%_ASM_BX) #define EXIT ASM_CLAC ; \ - ret ; \ - CFI_ENDPROC + ret .text ENTRY(__put_user_1) @@ -87,7 +84,6 @@ ENTRY(__put_user_8) ENDPROC(__put_user_8) bad_put_user: - CFI_STARTPROC movl $-EFAULT,%eax EXIT END(bad_put_user) diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S index 2322abe4da3b..40027db99140 100644 --- a/arch/x86/lib/rwsem.S +++ b/arch/x86/lib/rwsem.S @@ -15,7 +15,6 @@ #include #include -#include #define __ASM_HALF_REG(reg) __ASM_SEL(reg, e##reg) #define __ASM_HALF_SIZE(inst) __ASM_SEL(inst##w, inst##l) @@ -34,10 +33,10 @@ */ #define save_common_regs \ - pushl_cfi_reg ecx + pushl %ecx #define restore_common_regs \ - popl_cfi_reg ecx + popl %ecx /* Avoid uglifying the argument copying x86-64 needs to do. 
*/ .macro movq src, dst @@ -64,50 +63,45 @@ */ #define save_common_regs \ - pushq_cfi_reg rdi; \ - pushq_cfi_reg rsi; \ - pushq_cfi_reg rcx; \ - pushq_cfi_reg r8; \ - pushq_cfi_reg r9; \ - pushq_cfi_reg r10; \ - pushq_cfi_reg r11 + pushq %rdi; \ + pushq %rsi; \ + pushq %rcx; \ + pushq %r8; \ + pushq %r9; \ + pushq %r10; \ + pushq %r11 #define restore_common_regs \ - popq_cfi_reg r11; \ - popq_cfi_reg r10; \ - popq_cfi_reg r9; \ - popq_cfi_reg r8; \ - popq_cfi_reg rcx; \ - popq_cfi_reg rsi; \ - popq_cfi_reg rdi + popq %r11; \ + popq %r10; \ + popq %r9; \ + popq %r8; \ + popq %rcx; \ + popq %rsi; \ + popq %rdi #endif /* Fix up special calling conventions */ ENTRY(call_rwsem_down_read_failed) - CFI_STARTPROC save_common_regs - __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx) + __ASM_SIZE(push,) %__ASM_REG(dx) movq %rax,%rdi call rwsem_down_read_failed - __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx) + __ASM_SIZE(pop,) %__ASM_REG(dx) restore_common_regs ret - CFI_ENDPROC ENDPROC(call_rwsem_down_read_failed) ENTRY(call_rwsem_down_write_failed) - CFI_STARTPROC save_common_regs movq %rax,%rdi call rwsem_down_write_failed restore_common_regs ret - CFI_ENDPROC ENDPROC(call_rwsem_down_write_failed) ENTRY(call_rwsem_wake) - CFI_STARTPROC /* do nothing if still outstanding active readers */ __ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx) jnz 1f @@ -116,17 +110,14 @@ ENTRY(call_rwsem_wake) call rwsem_wake restore_common_regs 1: ret - CFI_ENDPROC ENDPROC(call_rwsem_wake) ENTRY(call_rwsem_downgrade_wake) - CFI_STARTPROC save_common_regs - __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx) + __ASM_SIZE(push,) %__ASM_REG(dx) movq %rax,%rdi call rwsem_downgrade_wake - __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx) + __ASM_SIZE(pop,) %__ASM_REG(dx) restore_common_regs ret - CFI_ENDPROC ENDPROC(call_rwsem_downgrade_wake) diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S index 5eb715087b80..e9acf5f4fc92 100644 --- a/arch/x86/lib/thunk_32.S +++ b/arch/x86/lib/thunk_32.S @@ -6,16 +6,14 @@ */ #include #include - #include /* put return address in eax (arg1) */ .macro THUNK name, func, put_ret_addr_in_eax=0 .globl \name \name: - CFI_STARTPROC - pushl_cfi_reg eax - pushl_cfi_reg ecx - pushl_cfi_reg edx + pushl %eax + pushl %ecx + pushl %edx .if \put_ret_addr_in_eax /* Place EIP in the arg1 */ @@ -23,11 +21,10 @@ .endif call \func - popl_cfi_reg edx - popl_cfi_reg ecx - popl_cfi_reg eax + popl %edx + popl %ecx + popl %eax ret - CFI_ENDPROC _ASM_NOKPROBE(\name) .endm diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index f89ba4e93025..10f555e435e1 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -6,7 +6,6 @@ * Subject to the GNU public license, v.2. No warranty of any kind. 
 */
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 #include <asm/calling.h>
 #include <asm/asm.h>
@@ -14,27 +13,25 @@
 .macro THUNK name, func, put_ret_addr_in_rdi=0
 	.globl \name
 \name:
-	CFI_STARTPROC

 	/* this one pushes 9 elems, the next one would be %rIP */
-	pushq_cfi_reg rdi
-	pushq_cfi_reg rsi
-	pushq_cfi_reg rdx
-	pushq_cfi_reg rcx
-	pushq_cfi_reg rax
-	pushq_cfi_reg r8
-	pushq_cfi_reg r9
-	pushq_cfi_reg r10
-	pushq_cfi_reg r11
+	pushq %rdi
+	pushq %rsi
+	pushq %rdx
+	pushq %rcx
+	pushq %rax
+	pushq %r8
+	pushq %r9
+	pushq %r10
+	pushq %r11

 	.if \put_ret_addr_in_rdi
 	/* 9*8(%rsp) is return addr on stack */
-	movq_cfi_restore 9*8, rdi
+	movq 9*8(%rsp), %rdi
 	.endif

 	call \func
 	jmp restore
-	CFI_ENDPROC
 	_ASM_NOKPROBE(\name)
 	.endm
@@ -57,19 +54,16 @@
 #if defined(CONFIG_TRACE_IRQFLAGS) \
  || defined(CONFIG_DEBUG_LOCK_ALLOC) \
  || defined(CONFIG_PREEMPT)
-	CFI_STARTPROC
-	CFI_ADJUST_CFA_OFFSET 9*8
 restore:
-	popq_cfi_reg r11
-	popq_cfi_reg r10
-	popq_cfi_reg r9
-	popq_cfi_reg r8
-	popq_cfi_reg rax
-	popq_cfi_reg rcx
-	popq_cfi_reg rdx
-	popq_cfi_reg rsi
-	popq_cfi_reg rdi
+	popq %r11
+	popq %r10
+	popq %r9
+	popq %r8
+	popq %rax
+	popq %rcx
+	popq %rdx
+	popq %rsi
+	popq %rdi
 	ret
-	CFI_ENDPROC
 	_ASM_NOKPROBE(restore)
 #endif
diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S
index 6440221ced0d..4093216b3791 100644
--- a/arch/x86/net/bpf_jit.S
+++ b/arch/x86/net/bpf_jit.S
@@ -8,7 +8,6 @@
  * of the License.
  */
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>

 /*
  * Calling convention :
-- cgit v1.2.3


From 2f63b9db7260beba3c19d66d6c11b0b78ea84a8c Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Mon, 1 Jun 2015 13:03:59 +0100
Subject: x86/asm/entry/64: Fold identical code paths

retint_kernel doesn't require %rcx to be pointing to thread info
(anymore?), and the code on the two alternative paths is - not really
surprisingly - identical.

Signed-off-by: Jan Beulich
Cc: Andy Lutomirski
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Denys Vlasenko
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/556C664F020000780007FB64@mail.emea.novell.com
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/entry_64.S | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b84cec50c8cf..4ad79e946f5a 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -615,7 +615,7 @@ ret_from_intr:
 	testb $3, CS(%rsp)
 	jz retint_kernel
 	/* Interrupt came from user space */
-
+retint_user:
 	GET_THREAD_INFO(%rcx)
 	/*
 	 * %rcx: thread info. Interrupts off.
@@ -1194,15 +1194,9 @@ ENTRY(error_exit)
 	RESTORE_EXTRA_REGS
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	GET_THREAD_INFO(%rcx)
 	testl %eax,%eax
 	jnz retint_kernel
-	LOCKDEP_SYS_EXIT_IRQ
-	movl TI_flags(%rcx),%edx
-	movl $_TIF_WORK_MASK,%edi
-	andl %edi,%edx
-	jnz retint_careful
-	jmp retint_swapgs
+	jmp retint_user
 END(error_exit)
-- cgit v1.2.3


From d6472302f242559d45dcf4ebace62508dc4d8aeb Mon Sep 17 00:00:00 2001
From: Stephen Rothwell
Date: Tue, 2 Jun 2015 19:01:38 +1000
Subject: x86/mm: Decouple <linux/vmalloc.h> from <asm/io.h>

Nothing in <asm/io.h> uses anything from <linux/vmalloc.h>, so remove
it from there and fix up the resulting build problems triggered on
x86 {64|32}-bit {def|allmod|allno}configs.

The breakages were triggering in places where x86 builds relied on
vmalloc() facilities but did not include <linux/vmalloc.h> explicitly
and relied on the implicit inclusion via <asm/io.h>.

Also add:

 - <linux/vmalloc.h> to <linux/io.h>

 - <asm/pgtable_types.h> to <asm/io.h>

... which were two other implicit header file dependencies. 
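As a sketch of the breakage pattern repeated across the files fixed up
below (an illustrative fragment with hypothetical names, not one of the
actual hunks): a file that calls vmalloc() but only reached
<linux/vmalloc.h> through the <asm/io.h> include chain now has to spell
the dependency out:

 #include <linux/io.h>		/* no longer pulls in <linux/vmalloc.h> */
+#include <linux/vmalloc.h>	/* now required explicitly for vmalloc()/vfree() */

 static void *example_buf;	/* hypothetical module state */

 static int example_setup(void)
 {
 	example_buf = vmalloc(4096);		/* declared in <linux/vmalloc.h> */
 	return example_buf ? 0 : -ENOMEM;	/* -ENOMEM comes via <linux/errno.h> */
 }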
Suggested-by: David Miller Signed-off-by: Stephen Rothwell [ Tidied up the changelog. ] Acked-by: David Miller Acked-by: Takashi Iwai Acked-by: Viresh Kumar Acked-by: Vinod Koul Cc: Andrew Morton Cc: Anton Vorontsov Cc: Boris Ostrovsky Cc: Colin Cross Cc: David Vrabel Cc: H. Peter Anvin Cc: Haiyang Zhang Cc: James E.J. Bottomley Cc: Jaroslav Kysela Cc: K. Y. Srinivasan Cc: Kees Cook Cc: Konrad Rzeszutek Wilk Cc: Kristen Carlson Accardi Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Suma Ramars Cc: Thomas Gleixner Cc: Tony Luck Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 3 +-- arch/x86/kernel/crash.c | 1 + arch/x86/kernel/machine_kexec_64.c | 1 + arch/x86/mm/pageattr-test.c | 1 + arch/x86/mm/pageattr.c | 1 + arch/x86/xen/p2m.c | 1 + drivers/acpi/apei/erst.c | 1 + drivers/cpufreq/intel_pstate.c | 1 + drivers/dma/mic_x100_dma.c | 1 + drivers/net/hyperv/netvsc.c | 1 + drivers/net/hyperv/rndis_filter.c | 1 + drivers/scsi/fnic/fnic_debugfs.c | 1 + drivers/scsi/fnic/fnic_trace.c | 1 + include/linux/io.h | 1 + sound/pci/asihpi/hpioctl.c | 1 + 15 files changed, 15 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index a2b97404922d..a94463063b46 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -40,6 +40,7 @@ #include #include #include +#include #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -198,8 +199,6 @@ extern void set_iounmap_nonlazy(void); #include -#include - /* * Convert a virtual cached pointer to an uncached pointer */ diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index c76d3e37c6e1..e068d6683dba 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 415480d3ea84..11546b462fa6 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 6629f397b467..8ff686aa7e8c 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 70d221fe2eb4..fae3c5366ac0 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index b47124d4cd67..8b7f18e200aa 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index ed65e9c4b5b0..3670bbab57a3 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include "apei-internal.h" diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 6414661ac1c4..2ba53f4f6af2 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/drivers/dma/mic_x100_dma.c b/drivers/dma/mic_x100_dma.c index 6de2e677be04..74d9db05a5ad 100644 --- a/drivers/dma/mic_x100_dma.c +++ 
b/drivers/dma/mic_x100_dma.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "mic_x100_dma.h" diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index ea091bc5ff09..1e09243d5449 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "hyperv_net.h" diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 9118cea91882..35a482d526d9 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "hyperv_net.h" diff --git a/drivers/scsi/fnic/fnic_debugfs.c b/drivers/scsi/fnic/fnic_debugfs.c index 5980c10c734d..d6498fabe628 100644 --- a/drivers/scsi/fnic/fnic_debugfs.c +++ b/drivers/scsi/fnic/fnic_debugfs.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "fnic.h" static struct dentry *fnic_trace_debugfs_root; diff --git a/drivers/scsi/fnic/fnic_trace.c b/drivers/scsi/fnic/fnic_trace.c index 65a9bde26974..4e15c4bf0795 100644 --- a/drivers/scsi/fnic/fnic_trace.c +++ b/drivers/scsi/fnic/fnic_trace.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "fnic_io.h" #include "fnic.h" diff --git a/include/linux/io.h b/include/linux/io.h index 04cce4da3685..fb5a99800e77 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -19,6 +19,7 @@ #define _LINUX_IO_H #include +#include #include #include diff --git a/sound/pci/asihpi/hpioctl.c b/sound/pci/asihpi/hpioctl.c index 6610bd096fc9..d17937b92331 100644 --- a/sound/pci/asihpi/hpioctl.c +++ b/sound/pci/asihpi/hpioctl.c @@ -32,6 +32,7 @@ #include #include #include +#include #ifdef MODULE_FIRMWARE MODULE_FIRMWARE("asihpi/dsp5000.bin"); -- cgit v1.2.3 From 905a36a2851838bca5a424fb758e201990234e6e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2015 13:37:36 +0200 Subject: x86/asm/entry: Move entry_64.S and entry_32.S to arch/x86/entry/ Create a new directory hierarchy for the low level x86 entry code: arch/x86/entry/* This will host all the low level glue that is currently scattered all across arch/x86/. Start with entry_64.S and entry_32.S. Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Kbuild | 3 + arch/x86/entry/Makefile | 4 + arch/x86/entry/entry_32.S | 1249 ++++++++++++++++++++++++++++++++++++++ arch/x86/entry/entry_64.S | 1442 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/entry_32.S | 1249 -------------------------------------- arch/x86/kernel/entry_64.S | 1442 -------------------------------------------- 7 files changed, 2699 insertions(+), 2692 deletions(-) create mode 100644 arch/x86/entry/Makefile create mode 100644 arch/x86/entry/entry_32.S create mode 100644 arch/x86/entry/entry_64.S delete mode 100644 arch/x86/kernel/entry_32.S delete mode 100644 arch/x86/kernel/entry_64.S (limited to 'arch/x86/kernel') diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 3942f74c92d7..b9b816277e72 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -1,3 +1,6 @@ + +obj-y += entry/ + obj-$(CONFIG_KVM) += kvm/ # Xen paravirtualization support diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile new file mode 100644 index 000000000000..fa7e0cf6d3c4 --- /dev/null +++ b/arch/x86/entry/Makefile @@ -0,0 +1,4 @@ +# +# Makefile for the x86 low level entry code +# +obj-y := entry_$(BITS).o diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S new file mode 100644 index 000000000000..0ac73de925d1 --- /dev/null +++ b/arch/x86/entry/entry_32.S @@ -0,0 +1,1249 @@ +/* + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * entry.S contains the system-call and fault low-level handling routines. + * This also contains the timer-interrupt handler, as well as all interrupts + * and faults that can result in a task-switch. + * + * NOTE: This code handles signal-recognition, which happens every time + * after a timer-interrupt and after each system call. + * + * I changed all the .align's to 4 (16 byte alignment), as that's faster + * on a 486. + * + * Stack layout in 'syscall_exit': + * ptrace needs to have all regs on the stack. + * if the order here is changed, it needs to be + * updated in fork.c:copy_process, signal.c:do_signal, + * ptrace.c and ptrace.h + * + * 0(%esp) - %ebx + * 4(%esp) - %ecx + * 8(%esp) - %edx + * C(%esp) - %esi + * 10(%esp) - %edi + * 14(%esp) - %ebp + * 18(%esp) - %eax + * 1C(%esp) - %ds + * 20(%esp) - %es + * 24(%esp) - %fs + * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS + * 2C(%esp) - orig_eax + * 30(%esp) - %eip + * 34(%esp) - %cs + * 38(%esp) - %eflags + * 3C(%esp) - %oldesp + * 40(%esp) - %oldss + * + * "current" is in register %ebx during any slow entries. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +#define sysenter_audit syscall_trace_entry +#define sysexit_audit syscall_exit_work +#endif + + .section .entry.text, "ax" + +/* + * We use macros for low-level operations which need to be overridden + * for paravirtualization. The following will never clobber any registers: + * INTERRUPT_RETURN (aka. "iret") + * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). 
+ * + * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must + * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). + * Allowing a register to be clobbered can shrink the paravirt replacement + * enough to patch inline, increasing performance. + */ + +#ifdef CONFIG_PREEMPT +#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF +#else +#define preempt_stop(clobbers) +#define resume_kernel restore_all +#endif + +.macro TRACE_IRQS_IRET +#ifdef CONFIG_TRACE_IRQFLAGS + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off? + jz 1f + TRACE_IRQS_ON +1: +#endif +.endm + +/* + * User gs save/restore + * + * %gs is used for userland TLS and kernel only uses it for stack + * canary which is required to be at %gs:20 by gcc. Read the comment + * at the top of stackprotector.h for more info. + * + * Local labels 98 and 99 are used. + */ +#ifdef CONFIG_X86_32_LAZY_GS + + /* unfortunately push/pop can't be no-op */ +.macro PUSH_GS + pushl $0 +.endm +.macro POP_GS pop=0 + addl $(4 + \pop), %esp +.endm +.macro POP_GS_EX +.endm + + /* all the rest are no-op */ +.macro PTGS_TO_GS +.endm +.macro PTGS_TO_GS_EX +.endm +.macro GS_TO_REG reg +.endm +.macro REG_TO_PTGS reg +.endm +.macro SET_KERNEL_GS reg +.endm + +#else /* CONFIG_X86_32_LAZY_GS */ + +.macro PUSH_GS + pushl %gs +.endm + +.macro POP_GS pop=0 +98: popl %gs + .if \pop <> 0 + add $\pop, %esp + .endif +.endm +.macro POP_GS_EX +.pushsection .fixup, "ax" +99: movl $0, (%esp) + jmp 98b +.popsection + _ASM_EXTABLE(98b,99b) +.endm + +.macro PTGS_TO_GS +98: mov PT_GS(%esp), %gs +.endm +.macro PTGS_TO_GS_EX +.pushsection .fixup, "ax" +99: movl $0, PT_GS(%esp) + jmp 98b +.popsection + _ASM_EXTABLE(98b,99b) +.endm + +.macro GS_TO_REG reg + movl %gs, \reg +.endm +.macro REG_TO_PTGS reg + movl \reg, PT_GS(%esp) +.endm +.macro SET_KERNEL_GS reg + movl $(__KERNEL_STACK_CANARY), \reg + movl \reg, %gs +.endm + +#endif /* CONFIG_X86_32_LAZY_GS */ + +.macro SAVE_ALL + cld + PUSH_GS + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + movl $(__USER_DS), %edx + movl %edx, %ds + movl %edx, %es + movl $(__KERNEL_PERCPU), %edx + movl %edx, %fs + SET_KERNEL_GS %edx +.endm + +.macro RESTORE_INT_REGS + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax +.endm + +.macro RESTORE_REGS pop=0 + RESTORE_INT_REGS +1: popl %ds +2: popl %es +3: popl %fs + POP_GS \pop +.pushsection .fixup, "ax" +4: movl $0, (%esp) + jmp 1b +5: movl $0, (%esp) + jmp 2b +6: movl $0, (%esp) + jmp 3b +.popsection + _ASM_EXTABLE(1b,4b) + _ASM_EXTABLE(2b,5b) + _ASM_EXTABLE(3b,6b) + POP_GS_EX +.endm + +ENTRY(ret_from_fork) + pushl %eax + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + pushl $0x0202 # Reset kernel eflags + popfl + jmp syscall_exit +END(ret_from_fork) + +ENTRY(ret_from_kernel_thread) + pushl %eax + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + pushl $0x0202 # Reset kernel eflags + popfl + movl PT_EBP(%esp),%eax + call *PT_EBX(%esp) + movl $0,PT_EAX(%esp) + jmp syscall_exit +ENDPROC(ret_from_kernel_thread) + +/* + * Return to user mode is not as complex as all this looks, + * but we want the default path for a system call return to + * go as quickly as possible which is why some of this is + * less clear than it otherwise should be. 
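+ *
+ * (Common case, roughly: syscall_exit finds no TIF work pending,
+ * falls through to restore_all and irets straight back to user mode;
+ * work_pending below handles the reschedule and signal-delivery
+ * slow paths.)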
+ */ + + # userspace resumption stub bypassing syscall exit tracing + ALIGN +ret_from_exception: + preempt_stop(CLBR_ANY) +ret_from_intr: + GET_THREAD_INFO(%ebp) +#ifdef CONFIG_VM86 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax +#else + /* + * We can be coming here from child spawned by kernel_thread(). + */ + movl PT_CS(%esp), %eax + andl $SEGMENT_RPL_MASK, %eax +#endif + cmpl $USER_RPL, %eax + jb resume_kernel # not returning to v8086 or userspace + +ENTRY(resume_userspace) + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on + # int/exception return? + jne work_pending + jmp restore_all +END(ret_from_exception) + +#ifdef CONFIG_PREEMPT +ENTRY(resume_kernel) + DISABLE_INTERRUPTS(CLBR_ANY) +need_resched: + cmpl $0,PER_CPU_VAR(__preempt_count) + jnz restore_all + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? + jz restore_all + call preempt_schedule_irq + jmp need_resched +END(resume_kernel) +#endif + +/* SYSENTER_RETURN points to after the "sysenter" instruction in + the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ + + # sysenter call handler stub +ENTRY(ia32_sysenter_target) + movl TSS_sysenter_sp0(%esp),%esp +sysenter_past_esp: + /* + * Interrupts are disabled here, but we can't trace it until + * enough kernel state to call TRACE_IRQS_OFF can be called - but + * we immediately enable interrupts at that point anyway. + */ + pushl $__USER_DS + pushl %ebp + pushfl + orl $X86_EFLAGS_IF, (%esp) + pushl $__USER_CS + /* + * Push current_thread_info()->sysenter_return to the stack. + * A tiny bit of offset fixup is necessary: TI_sysenter_return + * is relative to thread_info, which is at the bottom of the + * kernel stack page. 4*4 means the 4 words pushed above; + * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; + * and THREAD_SIZE takes us to the bottom. + */ + pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) + + pushl %eax + SAVE_ALL + ENABLE_INTERRUPTS(CLBR_NONE) + +/* + * Load the potential sixth argument from user stack. + * Careful about security. 
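+ *
+ * (The cmpl below only accepts %ebp <= __PAGE_OFFSET-4, so all four
+ * bytes of the load at (%ebp) lie in user space; the _ASM_EXTABLE
+ * entry covers the case where that page is simply not mapped.)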
+ */ + cmpl $__PAGE_OFFSET-3,%ebp + jae syscall_fault + ASM_STAC +1: movl (%ebp),%ebp + ASM_CLAC + movl %ebp,PT_EBP(%esp) + _ASM_EXTABLE(1b,syscall_fault) + + GET_THREAD_INFO(%ebp) + + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz sysenter_audit +sysenter_do_call: + cmpl $(NR_syscalls), %eax + jae sysenter_badsys + call *sys_call_table(,%eax,4) +sysenter_after_call: + movl %eax,PT_EAX(%esp) + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx + jnz sysexit_audit +sysenter_exit: +/* if something modifies registers it must also disable sysexit */ + movl PT_EIP(%esp), %edx + movl PT_OLDESP(%esp), %ecx + xorl %ebp,%ebp + TRACE_IRQS_ON +1: mov PT_FS(%esp), %fs + PTGS_TO_GS + ENABLE_INTERRUPTS_SYSEXIT + +#ifdef CONFIG_AUDITSYSCALL +sysenter_audit: + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */ + movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */ + /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */ + pushl PT_ESI(%esp) /* a3: 5th arg */ + pushl PT_EDX+4(%esp) /* a2: 4th arg */ + call __audit_syscall_entry + popl %ecx /* get that remapped edx off the stack */ + popl %ecx /* get that remapped esi off the stack */ + movl PT_EAX(%esp),%eax /* reload syscall number */ + jmp sysenter_do_call + +sysexit_audit: + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jnz syscall_exit_work + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) + movl %eax,%edx /* second arg, syscall return value */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + setbe %al /* 1 if so, 0 if not */ + movzbl %al,%eax /* zero-extend that */ + call __audit_syscall_exit + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jnz syscall_exit_work + movl PT_EAX(%esp),%eax /* reload syscall return value */ + jmp sysenter_exit +#endif + +.pushsection .fixup,"ax" +2: movl $0,PT_FS(%esp) + jmp 1b +.popsection + _ASM_EXTABLE(1b,2b) + PTGS_TO_GS_EX +ENDPROC(ia32_sysenter_target) + + # system call handler stub +ENTRY(system_call) + ASM_CLAC + pushl %eax # save orig_eax + SAVE_ALL + GET_THREAD_INFO(%ebp) + # system call tracing in operation / emulation + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(NR_syscalls), %eax + jae syscall_badsys +syscall_call: + call *sys_call_table(,%eax,4) +syscall_after_call: + movl %eax,PT_EAX(%esp) # store the return value +syscall_exit: + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx # current->work + jnz syscall_exit_work + +restore_all: + TRACE_IRQS_IRET +restore_all_notrace: +#ifdef CONFIG_X86_ESPFIX32 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + # Warning: PT_OLDSS(%esp) contains the wrong/random values if we + # are returning to the kernel. + # See comments in process.c:copy_thread() for details. 
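+ # The mix puts SS into %ah and CS into %al, so one 32-bit compare
+ # can test the VM flag, the SS TI (LDT) bit and the CS RPL at once.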
+ movb PT_OLDSS(%esp), %ah + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax + je ldt_ss # returning to user-space with LDT SS +#endif +restore_nocheck: + RESTORE_REGS 4 # skip orig_eax/error_code +irq_return: + INTERRUPT_RETURN +.section .fixup,"ax" +ENTRY(iret_exc) + pushl $0 # no error code + pushl $do_iret_error + jmp error_code +.previous + _ASM_EXTABLE(irq_return,iret_exc) + +#ifdef CONFIG_X86_ESPFIX32 +ldt_ss: +#ifdef CONFIG_PARAVIRT + /* + * The kernel can't run on a non-flat stack if paravirt mode + * is active. Rather than try to fixup the high bits of + * ESP, bypass this code entirely. This may break DOSemu + * and/or Wine support in a paravirt VM, although the option + * is still available to implement the setting of the high + * 16-bits in the INTERRUPT_RETURN paravirt-op. + */ + cmpl $0, pv_info+PARAVIRT_enabled + jne restore_nocheck +#endif + +/* + * Setup and switch to ESPFIX stack + * + * We're returning to userspace with a 16 bit stack. The CPU will not + * restore the high word of ESP for us on executing iret... This is an + * "official" bug of all the x86-compatible CPUs, which we can work + * around to make dosemu and wine happy. We do this by preloading the + * high word of ESP with the high word of the userspace ESP while + * compensating for the offset by changing to the ESPFIX segment with + * a base address that matches for the difference. + */ +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) + mov %esp, %edx /* load kernel esp */ + mov PT_OLDESP(%esp), %eax /* load userspace esp */ + mov %dx, %ax /* eax: new kernel esp */ + sub %eax, %edx /* offset (low word is 0) */ + shr $16, %edx + mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ + mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ + pushl $__ESPFIX_SS + pushl %eax /* new kernel esp */ + /* Disable interrupts, but do not irqtrace this section: we + * will soon execute iret and the tracer was already set to + * the irqstate after the iret */ + DISABLE_INTERRUPTS(CLBR_EAX) + lss (%esp), %esp /* switch to espfix segment */ + jmp restore_nocheck +#endif +ENDPROC(system_call) + + # perform work that needs to be done immediately before resumption + ALIGN +work_pending: + testb $_TIF_NEED_RESCHED, %cl + jz work_notifysig +work_resched: + call schedule + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other + # than syscall tracing? 
+ jz restore_all + testb $_TIF_NEED_RESCHED, %cl + jnz work_resched + +work_notifysig: # deal with pending signals and + # notify-resume requests +#ifdef CONFIG_VM86 + testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) + movl %esp, %eax + jnz work_notifysig_v86 # returning to kernel-space or + # vm86-space +1: +#else + movl %esp, %eax +#endif + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + movb PT_CS(%esp), %bl + andb $SEGMENT_RPL_MASK, %bl + cmpb $USER_RPL, %bl + jb resume_kernel + xorl %edx, %edx + call do_notify_resume + jmp resume_userspace + +#ifdef CONFIG_VM86 + ALIGN +work_notifysig_v86: + pushl %ecx # save ti_flags for do_notify_resume + call save_v86_state # %eax contains pt_regs pointer + popl %ecx + movl %eax, %esp + jmp 1b +#endif +END(work_pending) + + # perform syscall exit tracing + ALIGN +syscall_trace_entry: + movl $-ENOSYS,PT_EAX(%esp) + movl %esp, %eax + call syscall_trace_enter + /* What it returned is what we'll actually use. */ + cmpl $(NR_syscalls), %eax + jnae syscall_call + jmp syscall_exit +END(syscall_trace_entry) + + # perform syscall exit tracing + ALIGN +syscall_exit_work: + testl $_TIF_WORK_SYSCALL_EXIT, %ecx + jz work_pending + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call + # schedule() instead + movl %esp, %eax + call syscall_trace_leave + jmp resume_userspace +END(syscall_exit_work) + +syscall_fault: + ASM_CLAC + GET_THREAD_INFO(%ebp) + movl $-EFAULT,PT_EAX(%esp) + jmp resume_userspace +END(syscall_fault) + +syscall_badsys: + movl $-ENOSYS,%eax + jmp syscall_after_call +END(syscall_badsys) + +sysenter_badsys: + movl $-ENOSYS,%eax + jmp sysenter_after_call +END(sysenter_badsys) + +.macro FIXUP_ESPFIX_STACK +/* + * Switch back for ESPFIX stack to the normal zerobased stack + * + * We can't call C functions using the ESPFIX stack. This code reads + * the high word of the segment base from the GDT and swiches to the + * normal stack and adjusts ESP with the matching offset. + */ +#ifdef CONFIG_X86_ESPFIX32 + /* fixup the stack */ + mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ + mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ + shl $16, %eax + addl %esp, %eax /* the adjusted stack pointer */ + pushl $__KERNEL_DS + pushl %eax + lss (%esp), %esp /* switch to the normal stack segment */ +#endif +.endm +.macro UNWIND_ESPFIX_STACK +#ifdef CONFIG_X86_ESPFIX32 + movl %ss, %eax + /* see if on espfix stack */ + cmpw $__ESPFIX_SS, %ax + jne 27f + movl $__KERNEL_DS, %eax + movl %eax, %ds + movl %eax, %es + /* switch to normal stack */ + FIXUP_ESPFIX_STACK +27: +#endif +.endm + +/* + * Build the entry stubs with some assembler magic. + * We pack 1 stub into every 8-byte block. 
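+ *
+ * E.g. vector 0x20 pushes $(~0x20+0x80) = $0x5f, a sign-extended byte
+ * immediate; common_interrupt adds -0x80 to recover ~0x20, and do_IRQ
+ * takes ~orig_eax to get the vector number back.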
+ */ + .align 8 +ENTRY(irq_entries_start) + vector=FIRST_EXTERNAL_VECTOR + .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + pushl $(~vector+0x80) /* Note: always in signed byte range */ + vector=vector+1 + jmp common_interrupt + .align 8 + .endr +END(irq_entries_start) + +/* + * the CPU automatically disables interrupts when executing an IRQ vector, + * so IRQ-flags tracing has to follow that: + */ + .p2align CONFIG_X86_L1_CACHE_SHIFT +common_interrupt: + ASM_CLAC + addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ + SAVE_ALL + TRACE_IRQS_OFF + movl %esp,%eax + call do_IRQ + jmp ret_from_intr +ENDPROC(common_interrupt) + +#define BUILD_INTERRUPT3(name, nr, fn) \ +ENTRY(name) \ + ASM_CLAC; \ + pushl $~(nr); \ + SAVE_ALL; \ + TRACE_IRQS_OFF \ + movl %esp,%eax; \ + call fn; \ + jmp ret_from_intr; \ +ENDPROC(name) + + +#ifdef CONFIG_TRACING +#define TRACE_BUILD_INTERRUPT(name, nr) \ + BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) +#else +#define TRACE_BUILD_INTERRUPT(name, nr) +#endif + +#define BUILD_INTERRUPT(name, nr) \ + BUILD_INTERRUPT3(name, nr, smp_##name); \ + TRACE_BUILD_INTERRUPT(name, nr) + +/* The include is where all of the SMP etc. interrupts come from */ +#include + +ENTRY(coprocessor_error) + ASM_CLAC + pushl $0 + pushl $do_coprocessor_error + jmp error_code +END(coprocessor_error) + +ENTRY(simd_coprocessor_error) + ASM_CLAC + pushl $0 +#ifdef CONFIG_X86_INVD_BUG + /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ + ALTERNATIVE "pushl $do_general_protection", \ + "pushl $do_simd_coprocessor_error", \ + X86_FEATURE_XMM +#else + pushl $do_simd_coprocessor_error +#endif + jmp error_code +END(simd_coprocessor_error) + +ENTRY(device_not_available) + ASM_CLAC + pushl $-1 # mark this as an int + pushl $do_device_not_available + jmp error_code +END(device_not_available) + +#ifdef CONFIG_PARAVIRT +ENTRY(native_iret) + iret + _ASM_EXTABLE(native_iret, iret_exc) +END(native_iret) + +ENTRY(native_irq_enable_sysexit) + sti + sysexit +END(native_irq_enable_sysexit) +#endif + +ENTRY(overflow) + ASM_CLAC + pushl $0 + pushl $do_overflow + jmp error_code +END(overflow) + +ENTRY(bounds) + ASM_CLAC + pushl $0 + pushl $do_bounds + jmp error_code +END(bounds) + +ENTRY(invalid_op) + ASM_CLAC + pushl $0 + pushl $do_invalid_op + jmp error_code +END(invalid_op) + +ENTRY(coprocessor_segment_overrun) + ASM_CLAC + pushl $0 + pushl $do_coprocessor_segment_overrun + jmp error_code +END(coprocessor_segment_overrun) + +ENTRY(invalid_TSS) + ASM_CLAC + pushl $do_invalid_TSS + jmp error_code +END(invalid_TSS) + +ENTRY(segment_not_present) + ASM_CLAC + pushl $do_segment_not_present + jmp error_code +END(segment_not_present) + +ENTRY(stack_segment) + ASM_CLAC + pushl $do_stack_segment + jmp error_code +END(stack_segment) + +ENTRY(alignment_check) + ASM_CLAC + pushl $do_alignment_check + jmp error_code +END(alignment_check) + +ENTRY(divide_error) + ASM_CLAC + pushl $0 # no error code + pushl $do_divide_error + jmp error_code +END(divide_error) + +#ifdef CONFIG_X86_MCE +ENTRY(machine_check) + ASM_CLAC + pushl $0 + pushl machine_check_vector + jmp error_code +END(machine_check) +#endif + +ENTRY(spurious_interrupt_bug) + ASM_CLAC + pushl $0 + pushl $do_spurious_interrupt_bug + jmp error_code +END(spurious_interrupt_bug) + +#ifdef CONFIG_XEN +/* Xen doesn't set %esp to be precisely what the normal sysenter + entrypoint expects, so fix it up before using the normal path. 
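+   (Xen hands us a complete five-word iret frame - ss, esp, eflags,
+   cs, eip - which is what the addl $5*4 below discards.)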
*/ +ENTRY(xen_sysenter_target) + addl $5*4, %esp /* remove xen-provided frame */ + jmp sysenter_past_esp + +ENTRY(xen_hypervisor_callback) + pushl $-1 /* orig_ax = -1 => not a system call */ + SAVE_ALL + TRACE_IRQS_OFF + + /* Check to see if we got the event in the critical + region in xen_iret_direct, after we've reenabled + events and checked for pending events. This simulates + iret instruction's behaviour where it delivers a + pending interrupt when enabling interrupts. */ + movl PT_EIP(%esp),%eax + cmpl $xen_iret_start_crit,%eax + jb 1f + cmpl $xen_iret_end_crit,%eax + jae 1f + + jmp xen_iret_crit_fixup + +ENTRY(xen_do_upcall) +1: mov %esp, %eax + call xen_evtchn_do_upcall +#ifndef CONFIG_PREEMPT + call xen_maybe_preempt_hcall +#endif + jmp ret_from_intr +ENDPROC(xen_hypervisor_callback) + +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we fix up by reattempting the load, and zeroing the segment +# register if the load fails. +# Category 2 we fix up by jumping to do_iret_error. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by maintaining a status value in EAX. +ENTRY(xen_failsafe_callback) + pushl %eax + movl $1,%eax +1: mov 4(%esp),%ds +2: mov 8(%esp),%es +3: mov 12(%esp),%fs +4: mov 16(%esp),%gs + /* EAX == 0 => Category 1 (Bad segment) + EAX != 0 => Category 2 (Bad IRET) */ + testl %eax,%eax + popl %eax + lea 16(%esp),%esp + jz 5f + jmp iret_exc +5: pushl $-1 /* orig_ax = -1 => not a system call */ + SAVE_ALL + jmp ret_from_exception + +.section .fixup,"ax" +6: xorl %eax,%eax + movl %eax,4(%esp) + jmp 1b +7: xorl %eax,%eax + movl %eax,8(%esp) + jmp 2b +8: xorl %eax,%eax + movl %eax,12(%esp) + jmp 3b +9: xorl %eax,%eax + movl %eax,16(%esp) + jmp 4b +.previous + _ASM_EXTABLE(1b,6b) + _ASM_EXTABLE(2b,7b) + _ASM_EXTABLE(3b,8b) + _ASM_EXTABLE(4b,9b) +ENDPROC(xen_failsafe_callback) + +BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, + xen_evtchn_do_upcall) + +#endif /* CONFIG_XEN */ + +#if IS_ENABLED(CONFIG_HYPERV) + +BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, + hyperv_vector_handler) + +#endif /* CONFIG_HYPERV */ + +#ifdef CONFIG_FUNCTION_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(mcount) + ret +END(mcount) + +ENTRY(ftrace_caller) + pushl %eax + pushl %ecx + pushl %edx + pushl $0 /* Pass NULL as regs pointer */ + movl 4*4(%esp), %eax + movl 0x4(%ebp), %edx + movl function_trace_op, %ecx + subl $MCOUNT_INSN_SIZE, %eax + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + addl $4,%esp /* skip NULL pointer */ + popl %edx + popl %ecx + popl %eax +ftrace_ret: +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif + +.globl ftrace_stub +ftrace_stub: + ret +END(ftrace_caller) + +ENTRY(ftrace_regs_caller) + pushf /* push flags before compare (in cs location) */ + + /* + * i386 does not save SS and ESP when coming from kernel. + * Instead, to get sp, ®s->sp is used (see ptrace.h). + * Unfortunately, that means eflags must be at the same location + * as the current return ip is. We move the return ip into the + * ip location, and move flags into the return ip location. 
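+ *
+ * (On entry the stack holds only the return ip; after the pushf
+ * above and the pushl below it reads: copied ip, flags, original
+ * ip - so the flags sit in the future pt_regs->cs slot until the
+ * fixup further down moves them into regs->flags.)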
+ */ + pushl 4(%esp) /* save return ip into ip slot */ + + pushl $0 /* Load 0 into orig_ax */ + pushl %gs + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + + movl 13*4(%esp), %eax /* Get the saved flags */ + movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */ + /* clobbering return ip */ + movl $__KERNEL_CS,13*4(%esp) + + movl 12*4(%esp), %eax /* Load ip (1st parameter) */ + subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ + movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ + movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ + pushl %esp /* Save pt_regs as 4th parameter */ + +GLOBAL(ftrace_regs_call) + call ftrace_stub + + addl $4, %esp /* Skip pt_regs */ + movl 14*4(%esp), %eax /* Move flags back into cs */ + movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ + movl 12*4(%esp), %eax /* Get return ip from regs->ip */ + movl %eax, 14*4(%esp) /* Put return ip back for ret */ + + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax + popl %ds + popl %es + popl %fs + popl %gs + addl $8, %esp /* Skip orig_ax and ip */ + popf /* Pop flags at end (no addl to corrupt flags) */ + jmp ftrace_ret + + popf + jmp ftrace_stub +#else /* ! CONFIG_DYNAMIC_FTRACE */ + +ENTRY(mcount) + cmpl $__PAGE_OFFSET, %esp + jb ftrace_stub /* Paging not enabled yet? */ + + cmpl $ftrace_stub, ftrace_trace_function + jnz trace +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif +.globl ftrace_stub +ftrace_stub: + ret + + /* taken from glibc */ +trace: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + lea 0x4(%ebp), %edx + movl (%ebp), %ecx + subl $MCOUNT_INSN_SIZE, %eax + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax + ret +END(ftrace_graph_caller) + +.globl return_to_handler +return_to_handler: + pushl %eax + pushl %edx + movl %ebp, %eax + call ftrace_return_to_handler + movl %eax, %ecx + popl %edx + popl %eax + jmp *%ecx +#endif + +#ifdef CONFIG_TRACING +ENTRY(trace_page_fault) + ASM_CLAC + pushl $trace_do_page_fault + jmp error_code +END(trace_page_fault) +#endif + +ENTRY(page_fault) + ASM_CLAC + pushl $do_page_fault + ALIGN +error_code: + /* the function address is in %gs's slot on the stack */ + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + cld + movl $(__KERNEL_PERCPU), %ecx + movl %ecx, %fs + UNWIND_ESPFIX_STACK + GS_TO_REG %ecx + movl PT_GS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + REG_TO_PTGS %ecx + SET_KERNEL_GS %ecx + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + TRACE_IRQS_OFF + movl %esp,%eax # pt_regs pointer + call *%edi + jmp ret_from_exception +END(page_fault) + +/* + * Debug traps and NMI can happen at the one SYSENTER instruction + * that sets up the real kernel stack. 
Check here, since we can't + * allow the wrong stack to be used. + * + * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. + */ +.macro FIX_STACK offset ok label + cmpw $__KERNEL_CS, 4(%esp) + jne \ok +\label: + movl TSS_sysenter_sp0 + \offset(%esp), %esp + pushfl + pushl $__KERNEL_CS + pushl $sysenter_past_esp +.endm + +ENTRY(debug) + ASM_CLAC + cmpl $ia32_sysenter_target,(%esp) + jne debug_stack_correct + FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn +debug_stack_correct: + pushl $-1 # mark this as an int + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # error code 0 + movl %esp,%eax # pt_regs pointer + call do_debug + jmp ret_from_exception +END(debug) + +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +ENTRY(nmi) + ASM_CLAC +#ifdef CONFIG_X86_ESPFIX32 + pushl %eax + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl %eax + je nmi_espfix_stack +#endif + cmpl $ia32_sysenter_target,(%esp) + je nmi_stack_fixup + pushl %eax + movl %esp,%eax + /* Do not access memory above the end of our stack page, + * it might not exist. + */ + andl $(THREAD_SIZE-1),%eax + cmpl $(THREAD_SIZE-20),%eax + popl %eax + jae nmi_stack_correct + cmpl $ia32_sysenter_target,12(%esp) + je nmi_debug_stack_check +nmi_stack_correct: + pushl %eax + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + jmp restore_all_notrace + +nmi_stack_fixup: + FIX_STACK 12, nmi_stack_correct, 1 + jmp nmi_stack_correct + +nmi_debug_stack_check: + cmpw $__KERNEL_CS,16(%esp) + jne nmi_stack_correct + cmpl $debug,(%esp) + jb nmi_stack_correct + cmpl $debug_esp_fix_insn,(%esp) + ja nmi_stack_correct + FIX_STACK 24, nmi_stack_correct, 1 + jmp nmi_stack_correct + +#ifdef CONFIG_X86_ESPFIX32 +nmi_espfix_stack: + /* + * create the pointer to lss back + */ + pushl %ss + pushl %esp + addl $4, (%esp) + /* copy the iret frame of 12 bytes */ + .rept 3 + pushl 16(%esp) + .endr + pushl %eax + SAVE_ALL + FIXUP_ESPFIX_STACK # %eax == %esp + xorl %edx,%edx # zero error code + call do_nmi + RESTORE_REGS + lss 12+4(%esp), %esp # back to espfix stack + jmp irq_return +#endif +END(nmi) + +ENTRY(int3) + ASM_CLAC + pushl $-1 # mark this as an int + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_int3 + jmp ret_from_exception +END(int3) + +ENTRY(general_protection) + pushl $do_general_protection + jmp error_code +END(general_protection) + +#ifdef CONFIG_KVM_GUEST +ENTRY(async_page_fault) + ASM_CLAC + pushl $do_async_page_fault + jmp error_code +END(async_page_fault) +#endif + diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S new file mode 100644 index 000000000000..4ad79e946f5a --- /dev/null +++ b/arch/x86/entry/entry_64.S @@ -0,0 +1,1442 @@ +/* + * linux/arch/x86_64/entry.S + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs + * Copyright (C) 2000 Pavel Machek + */ + +/* + * entry.S 
contains the system-call and fault low-level handling routines. + * + * Some of this is documented in Documentation/x86/entry_64.txt + * + * NOTE: This code handles signal-recognition, which happens every time + * after an interrupt and after each system call. + * + * A note on terminology: + * - iret frame: Architecture defined interrupt frame from SS to RIP + * at the top of the kernel process stack. + * + * Some macro usage: + * - ENTRY/END Define functions in the symbol table. + * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. + * - idtentry - Define exception entry points. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_64BIT 0x80000000 +#define __AUDIT_ARCH_LE 0x40000000 + + .code64 + .section .entry.text, "ax" + + +#ifdef CONFIG_PARAVIRT +ENTRY(native_usergs_sysret64) + swapgs + sysretq +ENDPROC(native_usergs_sysret64) +#endif /* CONFIG_PARAVIRT */ + + +.macro TRACE_IRQS_IRETQ +#ifdef CONFIG_TRACE_IRQFLAGS + bt $9,EFLAGS(%rsp) /* interrupts off? */ + jnc 1f + TRACE_IRQS_ON +1: +#endif +.endm + +/* + * When dynamic function tracer is enabled it will add a breakpoint + * to all locations that it is about to modify, sync CPUs, update + * all the code, sync CPUs, then remove the breakpoints. In this time + * if lockdep is enabled, it might jump back into the debug handler + * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF). + * + * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to + * make sure the stack pointer does not get reset back to the top + * of the debug stack, and instead just reuses the current stack. + */ +#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS) + +.macro TRACE_IRQS_OFF_DEBUG + call debug_stack_set_zero + TRACE_IRQS_OFF + call debug_stack_reset +.endm + +.macro TRACE_IRQS_ON_DEBUG + call debug_stack_set_zero + TRACE_IRQS_ON + call debug_stack_reset +.endm + +.macro TRACE_IRQS_IRETQ_DEBUG + bt $9,EFLAGS(%rsp) /* interrupts off? */ + jnc 1f + TRACE_IRQS_ON_DEBUG +1: +.endm + +#else +# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF +# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON +# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ +#endif + +/* + * 64bit SYSCALL instruction entry. Up to 6 arguments in registers. + * + * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * then loads new ss, cs, and rip from previously programmed MSRs. + * rflags gets masked by a value from another MSR (so CLD and CLAC + * are not needed). SYSCALL does not save anything on the stack + * and does not change rsp. + * + * Registers on entry: + * rax system call number + * rcx return address + * r11 saved rflags (note: r11 is callee-clobbered register in C ABI) + * rdi arg0 + * rsi arg1 + * rdx arg2 + * r10 arg3 (needs to be moved to rcx to conform to C ABI) + * r8 arg4 + * r9 arg5 + * (note: r12-r15,rbp,rbx are callee-preserved in C ABI) + * + * Only called from user space. + * + * When user can change pt_regs->foo always force IRET. That is because + * it deals with uncanonical addresses better. SYSRET has trouble + * with them due to bugs in both AMD and Intel CPUs. + */ + +ENTRY(system_call) + /* + * Interrupts are off on entry. 
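+ * (SYSCALL itself cleared IF: the kernel programs X86_EFLAGS_IF into
+ * MSR_SYSCALL_MASK, so no explicit cli is needed here.)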
+ * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + SWAPGS_UNSAFE_STACK + /* + * A hypervisor implementation might want to use a label + * after the swapgs, so that it can do the swapgs + * for the guest and jump here on syscall. + */ +GLOBAL(system_call_after_swapgs) + + movq %rsp,PER_CPU_VAR(rsp_scratch) + movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp + + /* Construct struct pt_regs on stack */ + pushq $__USER_DS /* pt_regs->ss */ + pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ + /* + * Re-enable interrupts. + * We use 'rsp_scratch' as a scratch space, hence irq-off block above + * must execute atomically in the face of possible interrupt-driven + * task preemption. We must enable interrupts only after we're done + * with using rsp_scratch: + */ + ENABLE_INTERRUPTS(CLBR_NONE) + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + pushq %r9 /* pt_regs->r9 */ + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ + sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ + + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz tracesys +system_call_fastpath: +#if __SYSCALL_MASK == ~0 + cmpq $__NR_syscall_max,%rax +#else + andl $__SYSCALL_MASK,%eax + cmpl $__NR_syscall_max,%eax +#endif + ja 1f /* return -ENOSYS (already in pt_regs->ax) */ + movq %r10,%rcx + call *sys_call_table(,%rax,8) + movq %rax,RAX(%rsp) +1: +/* + * Syscall return path ending with SYSRET (fast path). + * Has incompletely filled pt_regs. + */ + LOCKDEP_SYS_EXIT + /* + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + DISABLE_INTERRUPTS(CLBR_NONE) + + /* + * We must check ti flags with interrupts (or at least preemption) + * off because we must *never* return to userspace without + * processing exit work that is enqueued if we're preempted here. + * In particular, returning to userspace with any of the one-shot + * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is + * very bad. + */ + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ + + RESTORE_C_REGS_EXCEPT_RCX_R11 + movq RIP(%rsp),%rcx + movq EFLAGS(%rsp),%r11 + movq RSP(%rsp),%rsp + /* + * 64bit SYSRET restores rip from rcx, + * rflags from r11 (but RF and VM bits are forced to 0), + * cs and ss are loaded from MSRs. + * Restoration of rflags re-enables interrupts. + * + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss + * descriptor is not reinitialized. This means that we should + * avoid SYSRET with SS == NULL, which could happen if we schedule, + * exit the kernel, and re-enter using an interrupt vector. (All + * interrupt entries on x86_64 set SS to NULL.) We prevent that + * from happening by reloading SS in __switch_to. (Actually + * detecting the failure in 64-bit userspace is tricky but can be + * done.) 
+ */ + USERGS_SYSRET64 + + /* Do syscall entry tracing */ +tracesys: + movq %rsp, %rdi + movl $AUDIT_ARCH_X86_64, %esi + call syscall_trace_enter_phase1 + test %rax, %rax + jnz tracesys_phase2 /* if needed, run the slow path */ + RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ + movq ORIG_RAX(%rsp), %rax + jmp system_call_fastpath /* and return to the fast path */ + +tracesys_phase2: + SAVE_EXTRA_REGS + movq %rsp, %rdi + movl $AUDIT_ARCH_X86_64, %esi + movq %rax,%rdx + call syscall_trace_enter_phase2 + + /* + * Reload registers from stack in case ptrace changed them. + * We don't reload %rax because syscall_trace_entry_phase2() returned + * the value it wants us to use in the table lookup. + */ + RESTORE_C_REGS_EXCEPT_RAX + RESTORE_EXTRA_REGS +#if __SYSCALL_MASK == ~0 + cmpq $__NR_syscall_max,%rax +#else + andl $__SYSCALL_MASK,%eax + cmpl $__NR_syscall_max,%eax +#endif + ja 1f /* return -ENOSYS (already in pt_regs->ax) */ + movq %r10,%rcx /* fixup for C */ + call *sys_call_table(,%rax,8) + movq %rax,RAX(%rsp) +1: + /* Use IRET because user could have changed pt_regs->foo */ + +/* + * Syscall return path ending with IRET. + * Has correct iret frame. + */ +GLOBAL(int_ret_from_sys_call) + DISABLE_INTERRUPTS(CLBR_NONE) +int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */ + TRACE_IRQS_OFF + movl $_TIF_ALLWORK_MASK,%edi + /* edi: mask to check */ +GLOBAL(int_with_check) + LOCKDEP_SYS_EXIT_IRQ + GET_THREAD_INFO(%rcx) + movl TI_flags(%rcx),%edx + andl %edi,%edx + jnz int_careful + andl $~TS_COMPAT,TI_status(%rcx) + jmp syscall_return + + /* Either reschedule or signal or syscall exit tracking needed. */ + /* First do a reschedule test. */ + /* edx: work, edi: workmask */ +int_careful: + bt $TIF_NEED_RESCHED,%edx + jnc int_very_careful + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + pushq %rdi + SCHEDULE_USER + popq %rdi + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp int_with_check + + /* handle signals and tracing -- both require a full pt_regs */ +int_very_careful: + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + SAVE_EXTRA_REGS + /* Check for syscall exit trace */ + testl $_TIF_WORK_SYSCALL_EXIT,%edx + jz int_signal + pushq %rdi + leaq 8(%rsp),%rdi # &ptregs -> arg1 + call syscall_trace_leave + popq %rdi + andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi + jmp int_restore_rest + +int_signal: + testl $_TIF_DO_NOTIFY_MASK,%edx + jz 1f + movq %rsp,%rdi # &ptregs -> arg1 + xorl %esi,%esi # oldset -> arg2 + call do_notify_resume +1: movl $_TIF_WORK_MASK,%edi +int_restore_rest: + RESTORE_EXTRA_REGS + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp int_with_check + +syscall_return: + /* The IRETQ could re-enable interrupts: */ + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_IRETQ + + /* + * Try to use SYSRET instead of IRET if we're returning to + * a completely clean 64-bit userspace context. + */ + movq RCX(%rsp),%rcx + movq RIP(%rsp),%r11 + cmpq %rcx,%r11 /* RCX == RIP */ + jne opportunistic_sysret_failed + + /* + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP + * in kernel space. This essentially lets the user take over + * the kernel, since userspace controls RSP. + * + * If width of "canonical tail" ever becomes variable, this will need + * to be updated to remain correct on both old and new CPUs. 
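+ *
+ * E.g. with __VIRTUAL_MASK_SHIFT == 47 the shifts below are by 16:
+ * 0x00007fffffffffff survives the shl/sar round trip unchanged,
+ * while 0x0000800000000000 comes back as 0xffff800000000000 and
+ * the cmpq catches the difference.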
+ */ + .ifne __VIRTUAL_MASK_SHIFT - 47 + .error "virtual address width changed -- SYSRET checks need update" + .endif + /* Change top 16 bits to be the sign-extension of 47th bit */ + shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx + sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx + /* If this changed %rcx, it was not canonical */ + cmpq %rcx, %r11 + jne opportunistic_sysret_failed + + cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ + jne opportunistic_sysret_failed + + movq R11(%rsp),%r11 + cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */ + jne opportunistic_sysret_failed + + /* + * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, + * restoring TF results in a trap from userspace immediately after + * SYSRET. This would cause an infinite loop whenever #DB happens + * with register state that satisfies the opportunistic SYSRET + * conditions. For example, single-stepping this user code: + * + * movq $stuck_here,%rcx + * pushfq + * popq %r11 + * stuck_here: + * + * would never get past 'stuck_here'. + */ + testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 + jnz opportunistic_sysret_failed + + /* nothing to check for RSP */ + + cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */ + jne opportunistic_sysret_failed + + /* + * We win! This label is here just for ease of understanding + * perf profiles. Nothing jumps here. + */ +syscall_return_via_sysret: + /* rcx and r11 are already restored (see code above) */ + RESTORE_C_REGS_EXCEPT_RCX_R11 + movq RSP(%rsp),%rsp + USERGS_SYSRET64 + +opportunistic_sysret_failed: + SWAPGS + jmp restore_c_regs_and_iret +END(system_call) + + + .macro FORK_LIKE func +ENTRY(stub_\func) + SAVE_EXTRA_REGS 8 + jmp sys_\func +END(stub_\func) + .endm + + FORK_LIKE clone + FORK_LIKE fork + FORK_LIKE vfork + +ENTRY(stub_execve) + call sys_execve +return_from_execve: + testl %eax, %eax + jz 1f + /* exec failed, can use fast SYSRET code path in this case */ + ret +1: + /* must use IRET code path (pt_regs->cs may have changed) */ + addq $8, %rsp + ZERO_EXTRA_REGS + movq %rax,RAX(%rsp) + jmp int_ret_from_sys_call +END(stub_execve) +/* + * Remaining execve stubs are only 7 bytes long. + * ENTRY() often aligns to 16 bytes, which in this case has no benefits. + */ + .align 8 +GLOBAL(stub_execveat) + call sys_execveat + jmp return_from_execve +END(stub_execveat) + +#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) + .align 8 +GLOBAL(stub_x32_execve) +GLOBAL(stub32_execve) + call compat_sys_execve + jmp return_from_execve +END(stub32_execve) +END(stub_x32_execve) + .align 8 +GLOBAL(stub_x32_execveat) +GLOBAL(stub32_execveat) + call compat_sys_execveat + jmp return_from_execve +END(stub32_execveat) +END(stub_x32_execveat) +#endif + +/* + * sigreturn is special because it needs to restore all registers on return. + * This cannot be done with SYSRET, so use the IRET return path instead. + */ +ENTRY(stub_rt_sigreturn) + /* + * SAVE_EXTRA_REGS result is not normally needed: + * sigreturn overwrites all pt_regs->GPREGS. + * But sigreturn can fail (!), and there is no easy way to detect that. + * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error, + * we SAVE_EXTRA_REGS here. 
+ */ + SAVE_EXTRA_REGS 8 + call sys_rt_sigreturn +return_from_stub: + addq $8, %rsp + RESTORE_EXTRA_REGS + movq %rax,RAX(%rsp) + jmp int_ret_from_sys_call +END(stub_rt_sigreturn) + +#ifdef CONFIG_X86_X32_ABI +ENTRY(stub_x32_rt_sigreturn) + SAVE_EXTRA_REGS 8 + call sys32_x32_rt_sigreturn + jmp return_from_stub +END(stub_x32_rt_sigreturn) +#endif + +/* + * A newly forked process directly context switches into this address. + * + * rdi: prev task we switched from + */ +ENTRY(ret_from_fork) + + LOCK ; btr $TIF_FORK,TI_flags(%r8) + + pushq $0x0002 + popfq # reset kernel eflags + + call schedule_tail # rdi: 'prev' task parameter + + RESTORE_EXTRA_REGS + + testb $3, CS(%rsp) # from kernel_thread? + + /* + * By the time we get here, we have no idea whether our pt_regs, + * ti flags, and ti status came from the 64-bit SYSCALL fast path, + * the slow path, or one of the ia32entry paths. + * Use IRET code path to return, since it can safely handle + * all of the above. + */ + jnz int_ret_from_sys_call + + /* We came from kernel_thread */ + /* nb: we depend on RESTORE_EXTRA_REGS above */ + movq %rbp, %rdi + call *%rbx + movl $0, RAX(%rsp) + RESTORE_EXTRA_REGS + jmp int_ret_from_sys_call +END(ret_from_fork) + +/* + * Build the entry stubs with some assembler magic. + * We pack 1 stub into every 8-byte block. + */ + .align 8 +ENTRY(irq_entries_start) + vector=FIRST_EXTERNAL_VECTOR + .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + pushq $(~vector+0x80) /* Note: always in signed byte range */ + vector=vector+1 + jmp common_interrupt + .align 8 + .endr +END(irq_entries_start) + +/* + * Interrupt entry/exit. + * + * Interrupt entry points save only callee clobbered registers in fast path. + * + * Entry runs with interrupts off. + */ + +/* 0(%rsp): ~(interrupt number) */ + .macro interrupt func + cld + /* + * Since nothing in interrupt handling code touches r12...r15 members + * of "struct pt_regs", and since interrupts can nest, we can save + * four stack slots and simultaneously provide + * an unwind-friendly stack layout by saving "truncated" pt_regs + * exactly up to rbp slot, without these members. + */ + ALLOC_PT_GPREGS_ON_STACK -RBP + SAVE_C_REGS -RBP + /* this goes to 0(%rsp) for unwinder, not for saving the value: */ + SAVE_EXTRA_REGS_RBP -RBP + + leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ + + testb $3, CS-RBP(%rsp) + jz 1f + SWAPGS +1: + /* + * Save previous stack pointer, optionally switch to interrupt stack. + * irq_count is used to check if a CPU is already on an interrupt stack + * or not. While this is essentially redundant with preempt_count it is + * a little cheaper to use a separate counter in the PDA (short of + * moving irq_enter into assembly, which would be too much work) + */ + movq %rsp, %rsi + incl PER_CPU_VAR(irq_count) + cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp + pushq %rsi + /* We entered an interrupt context - irqs are off: */ + TRACE_IRQS_OFF + + call \func + .endm + + /* + * The interrupt stubs push (~vector+0x80) onto the stack and + * then jump to common_interrupt. 
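+ * The 0x80 bias keeps the immediate in signed-byte range: "pushq
+ * $imm8" encodes in two bytes, leaving room for even a rel32 jmp
+ * (five bytes) inside each 8-byte stub slot.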
+ */ + .p2align CONFIG_X86_L1_CACHE_SHIFT +common_interrupt: + ASM_CLAC + addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ + interrupt do_IRQ + /* 0(%rsp): old RSP */ +ret_from_intr: + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + decl PER_CPU_VAR(irq_count) + + /* Restore saved previous stack */ + popq %rsi + /* return code expects complete pt_regs - adjust rsp accordingly: */ + leaq -RBP(%rsi),%rsp + + testb $3, CS(%rsp) + jz retint_kernel + /* Interrupt came from user space */ +retint_user: + GET_THREAD_INFO(%rcx) + /* + * %rcx: thread info. Interrupts off. + */ +retint_with_reschedule: + movl $_TIF_WORK_MASK,%edi +retint_check: + LOCKDEP_SYS_EXIT_IRQ + movl TI_flags(%rcx),%edx + andl %edi,%edx + jnz retint_careful + +retint_swapgs: /* return to user-space */ + /* + * The iretq could re-enable interrupts: + */ + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_IRETQ + + SWAPGS + jmp restore_c_regs_and_iret + +/* Returning to kernel space */ +retint_kernel: +#ifdef CONFIG_PREEMPT + /* Interrupts are off */ + /* Check if we need preemption */ + bt $9,EFLAGS(%rsp) /* interrupts were off? */ + jnc 1f +0: cmpl $0,PER_CPU_VAR(__preempt_count) + jnz 1f + call preempt_schedule_irq + jmp 0b +1: +#endif + /* + * The iretq could re-enable interrupts: + */ + TRACE_IRQS_IRETQ + +/* + * At this label, code paths which return to kernel and to user, + * which come from interrupts/exception and from syscalls, merge. + */ +restore_c_regs_and_iret: + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 + +irq_return: + INTERRUPT_RETURN + +ENTRY(native_iret) + /* + * Are we returning to a stack segment from the LDT? Note: in + * 64-bit mode SS:RSP on the exception stack is always valid. + */ +#ifdef CONFIG_X86_ESPFIX64 + testb $4,(SS-RIP)(%rsp) + jnz native_irq_return_ldt +#endif + +.global native_irq_return_iret +native_irq_return_iret: + /* + * This may fault. Non-paranoid faults on return to userspace are + * handled by fixup_bad_iret. These include #SS, #GP, and #NP. + * Double-faults due to espfix64 are handled in do_double_fault. + * Other faults here are fatal. + */ + iretq + +#ifdef CONFIG_X86_ESPFIX64 +native_irq_return_ldt: + pushq %rax + pushq %rdi + SWAPGS + movq PER_CPU_VAR(espfix_waddr),%rdi + movq %rax,(0*8)(%rdi) /* RAX */ + movq (2*8)(%rsp),%rax /* RIP */ + movq %rax,(1*8)(%rdi) + movq (3*8)(%rsp),%rax /* CS */ + movq %rax,(2*8)(%rdi) + movq (4*8)(%rsp),%rax /* RFLAGS */ + movq %rax,(3*8)(%rdi) + movq (6*8)(%rsp),%rax /* SS */ + movq %rax,(5*8)(%rdi) + movq (5*8)(%rsp),%rax /* RSP */ + movq %rax,(4*8)(%rdi) + andl $0xffff0000,%eax + popq %rdi + orq PER_CPU_VAR(espfix_stack),%rax + SWAPGS + movq %rax,%rsp + popq %rax + jmp native_irq_return_iret +#endif + + /* edi: workmask, edx: work */ +retint_careful: + bt $TIF_NEED_RESCHED,%edx + jnc retint_signal + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + pushq %rdi + SCHEDULE_USER + popq %rdi + GET_THREAD_INFO(%rcx) + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp retint_check + +retint_signal: + testl $_TIF_DO_NOTIFY_MASK,%edx + jz retint_swapgs + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + SAVE_EXTRA_REGS + movq $-1,ORIG_RAX(%rsp) + xorl %esi,%esi # oldset + movq %rsp,%rdi # &pt_regs + call do_notify_resume + RESTORE_EXTRA_REGS + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + jmp retint_with_reschedule + +END(common_interrupt) + +/* + * APIC interrupts. 
+ */ +.macro apicinterrupt3 num sym do_sym +ENTRY(\sym) + ASM_CLAC + pushq $~(\num) +.Lcommon_\sym: + interrupt \do_sym + jmp ret_from_intr +END(\sym) +.endm + +#ifdef CONFIG_TRACING +#define trace(sym) trace_##sym +#define smp_trace(sym) smp_trace_##sym + +.macro trace_apicinterrupt num sym +apicinterrupt3 \num trace(\sym) smp_trace(\sym) +.endm +#else +.macro trace_apicinterrupt num sym do_sym +.endm +#endif + +.macro apicinterrupt num sym do_sym +apicinterrupt3 \num \sym \do_sym +trace_apicinterrupt \num \sym +.endm + +#ifdef CONFIG_SMP +apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR \ + irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt +apicinterrupt3 REBOOT_VECTOR \ + reboot_interrupt smp_reboot_interrupt +#endif + +#ifdef CONFIG_X86_UV +apicinterrupt3 UV_BAU_MESSAGE \ + uv_bau_message_intr1 uv_bau_message_interrupt +#endif +apicinterrupt LOCAL_TIMER_VECTOR \ + apic_timer_interrupt smp_apic_timer_interrupt +apicinterrupt X86_PLATFORM_IPI_VECTOR \ + x86_platform_ipi smp_x86_platform_ipi + +#ifdef CONFIG_HAVE_KVM +apicinterrupt3 POSTED_INTR_VECTOR \ + kvm_posted_intr_ipi smp_kvm_posted_intr_ipi +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD +apicinterrupt THRESHOLD_APIC_VECTOR \ + threshold_interrupt smp_threshold_interrupt +#endif + +#ifdef CONFIG_X86_THERMAL_VECTOR +apicinterrupt THERMAL_APIC_VECTOR \ + thermal_interrupt smp_thermal_interrupt +#endif + +#ifdef CONFIG_SMP +apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ + call_function_single_interrupt smp_call_function_single_interrupt +apicinterrupt CALL_FUNCTION_VECTOR \ + call_function_interrupt smp_call_function_interrupt +apicinterrupt RESCHEDULE_VECTOR \ + reschedule_interrupt smp_reschedule_interrupt +#endif + +apicinterrupt ERROR_APIC_VECTOR \ + error_interrupt smp_error_interrupt +apicinterrupt SPURIOUS_APIC_VECTOR \ + spurious_interrupt smp_spurious_interrupt + +#ifdef CONFIG_IRQ_WORK +apicinterrupt IRQ_WORK_VECTOR \ + irq_work_interrupt smp_irq_work_interrupt +#endif + +/* + * Exception entry points. + */ +#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) + +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 +ENTRY(\sym) + /* Sanity check */ + .if \shift_ist != -1 && \paranoid == 0 + .error "using shift_ist requires paranoid=1" + .endif + + ASM_CLAC + PARAVIRT_ADJUST_EXCEPTION_FRAME + + .ifeq \has_error_code + pushq $-1 /* ORIG_RAX: no syscall to restart */ + .endif + + ALLOC_PT_GPREGS_ON_STACK + + .if \paranoid + .if \paranoid == 1 + testb $3, CS(%rsp) /* If coming from userspace, switch */ + jnz 1f /* stacks. */ + .endif + call paranoid_entry + .else + call error_entry + .endif + /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ + + .if \paranoid + .if \shift_ist != -1 + TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ + .else + TRACE_IRQS_OFF + .endif + .endif + + movq %rsp,%rdi /* pt_regs pointer */ + + .if \has_error_code + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + .else + xorl %esi,%esi /* no error code */ + .endif + + .if \shift_ist != -1 + subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + .endif + + call \do_sym + + .if \shift_ist != -1 + addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + .endif + + /* these procedures expect "no swapgs" flag in ebx */ + .if \paranoid + jmp paranoid_exit + .else + jmp error_exit + .endif + + .if \paranoid == 1 + /* + * Paranoid entry from userspace. Switch stacks and treat it + * as a normal entry. 
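+ * (This is the \paranoid == 1 case used by e.g. debug and int3;
+ * double_fault passes paranoid=2 and skips the user-mode test.)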
This means that paranoid handlers + * run in real process context if user_mode(regs). + */ +1: + call error_entry + + + movq %rsp,%rdi /* pt_regs pointer */ + call sync_regs + movq %rax,%rsp /* switch stack */ + + movq %rsp,%rdi /* pt_regs pointer */ + + .if \has_error_code + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + .else + xorl %esi,%esi /* no error code */ + .endif + + call \do_sym + + jmp error_exit /* %ebx: no swapgs flag */ + .endif +END(\sym) +.endm + +#ifdef CONFIG_TRACING +.macro trace_idtentry sym do_sym has_error_code:req +idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code +idtentry \sym \do_sym has_error_code=\has_error_code +.endm +#else +.macro trace_idtentry sym do_sym has_error_code:req +idtentry \sym \do_sym has_error_code=\has_error_code +.endm +#endif + +idtentry divide_error do_divide_error has_error_code=0 +idtentry overflow do_overflow has_error_code=0 +idtentry bounds do_bounds has_error_code=0 +idtentry invalid_op do_invalid_op has_error_code=0 +idtentry device_not_available do_device_not_available has_error_code=0 +idtentry double_fault do_double_fault has_error_code=1 paranoid=2 +idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 +idtentry invalid_TSS do_invalid_TSS has_error_code=1 +idtentry segment_not_present do_segment_not_present has_error_code=1 +idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 +idtentry coprocessor_error do_coprocessor_error has_error_code=0 +idtentry alignment_check do_alignment_check has_error_code=1 +idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 + + + /* Reload gs selector with exception handling */ + /* edi: new selector */ +ENTRY(native_load_gs_index) + pushfq + DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) + SWAPGS +gs_change: + movl %edi,%gs +2: mfence /* workaround */ + SWAPGS + popfq + ret +END(native_load_gs_index) + + _ASM_EXTABLE(gs_change,bad_gs) + .section .fixup,"ax" + /* running with kernelgs */ +bad_gs: + SWAPGS /* switch back to user gs */ + xorl %eax,%eax + movl %eax,%gs + jmp 2b + .previous + +/* Call softirq on interrupt stack. Interrupts are off. */ +ENTRY(do_softirq_own_stack) + pushq %rbp + mov %rsp,%rbp + incl PER_CPU_VAR(irq_count) + cmove PER_CPU_VAR(irq_stack_ptr),%rsp + push %rbp # backlink for old unwinder + call __do_softirq + leaveq + decl PER_CPU_VAR(irq_count) + ret +END(do_softirq_own_stack) + +#ifdef CONFIG_XEN +idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 + +/* + * A note on the "critical region" in our callback handler. + * We want to avoid stacking callback handlers due to events occurring + * during handling of the last event. To do this, we keep events disabled + * until we've done all processing. HOWEVER, we must enable events before + * popping the stack frame (can't be done atomically) and so it would still + * be possible to get enough handler activations to overflow the stack. + * Although unlikely, bugs of that kind are hard to track down, so we'd + * like to avoid the possibility. + * So, on entry to the handler we detect whether we interrupted an + * existing activation in its critical region -- if so, we pop the current + * activation and restart the handler using the previous one. 
+ */ +ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) +/* + * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will + * see the correct pointer to the pt_regs + */ + movq %rdi, %rsp # we don't return, adjust the stack frame +11: incl PER_CPU_VAR(irq_count) + movq %rsp,%rbp + cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp + pushq %rbp # backlink for old unwinder + call xen_evtchn_do_upcall + popq %rsp + decl PER_CPU_VAR(irq_count) +#ifndef CONFIG_PREEMPT + call xen_maybe_preempt_hcall +#endif + jmp error_exit +END(xen_do_hypervisor_callback) + +/* + * Hypervisor uses this for application faults while it executes. + * We get here for two reasons: + * 1. Fault while reloading DS, ES, FS or GS + * 2. Fault while executing IRET + * Category 1 we do not need to fix up as Xen has already reloaded all segment + * registers that could be reloaded and zeroed the others. + * Category 2 we fix up by killing the current process. We cannot use the + * normal Linux return path in this case because if we use the IRET hypercall + * to pop the stack frame we end up in an infinite loop of failsafe callbacks. + * We distinguish between categories by comparing each saved segment register + * with its current contents: any discrepancy means we in category 1. + */ +ENTRY(xen_failsafe_callback) + movl %ds,%ecx + cmpw %cx,0x10(%rsp) + jne 1f + movl %es,%ecx + cmpw %cx,0x18(%rsp) + jne 1f + movl %fs,%ecx + cmpw %cx,0x20(%rsp) + jne 1f + movl %gs,%ecx + cmpw %cx,0x28(%rsp) + jne 1f + /* All segments match their saved values => Category 2 (Bad IRET). */ + movq (%rsp),%rcx + movq 8(%rsp),%r11 + addq $0x30,%rsp + pushq $0 /* RIP */ + pushq %r11 + pushq %rcx + jmp general_protection +1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ + movq (%rsp),%rcx + movq 8(%rsp),%r11 + addq $0x30,%rsp + pushq $-1 /* orig_ax = -1 => not a system call */ + ALLOC_PT_GPREGS_ON_STACK + SAVE_C_REGS + SAVE_EXTRA_REGS + jmp error_exit +END(xen_failsafe_callback) + +apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ + xen_hvm_callback_vector xen_evtchn_do_upcall + +#endif /* CONFIG_XEN */ + +#if IS_ENABLED(CONFIG_HYPERV) +apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ + hyperv_callback_vector hyperv_vector_handler +#endif /* CONFIG_HYPERV */ + +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry stack_segment do_stack_segment has_error_code=1 +#ifdef CONFIG_XEN +idtentry xen_debug do_debug has_error_code=0 +idtentry xen_int3 do_int3 has_error_code=0 +idtentry xen_stack_segment do_stack_segment has_error_code=1 +#endif +idtentry general_protection do_general_protection has_error_code=1 +trace_idtentry page_fault do_page_fault has_error_code=1 +#ifdef CONFIG_KVM_GUEST +idtentry async_page_fault do_async_page_fault has_error_code=1 +#endif +#ifdef CONFIG_X86_MCE +idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) +#endif + +/* + * Save all registers in pt_regs, and switch gs if needed. + * Use slow, but surefire "are we in kernel?" check. + * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + */ +ENTRY(paranoid_entry) + cld + SAVE_C_REGS 8 + SAVE_EXTRA_REGS 8 + movl $1,%ebx + movl $MSR_GS_BASE,%ecx + rdmsr + testl %edx,%edx + js 1f /* negative -> in kernel */ + SWAPGS + xorl %ebx,%ebx +1: ret +END(paranoid_entry) + +/* + * "Paranoid" exit path from exception stack. This is invoked + * only on return from non-NMI IST interrupts that came + * from kernel space. 
+/*
+ * "Paranoid" exit path from exception stack. This is invoked
+ * only on return from non-NMI IST interrupts that came
+ * from kernel space.
+ *
+ * We may be returning to very strange contexts (e.g. very early
+ * in syscall entry), so checking for preemption here would
+ * be complicated. Fortunately, there's no good reason
+ * to try to handle preemption here.
+ */
+/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
+ENTRY(paranoid_exit)
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF_DEBUG
+ testl %ebx,%ebx /* swapgs needed? */
+ jnz paranoid_exit_no_swapgs
+ TRACE_IRQS_IRETQ
+ SWAPGS_UNSAFE_STACK
+ jmp paranoid_exit_restore
+paranoid_exit_no_swapgs:
+ TRACE_IRQS_IRETQ_DEBUG
+paranoid_exit_restore:
+ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
+ INTERRUPT_RETURN
+END(paranoid_exit)
+
+/*
+ * Save all registers in pt_regs, and switch gs if needed.
+ * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ */
+ENTRY(error_entry)
+ cld
+ SAVE_C_REGS 8
+ SAVE_EXTRA_REGS 8
+ xorl %ebx,%ebx
+ testb $3, CS+8(%rsp)
+ jz error_kernelspace
+error_swapgs:
+ SWAPGS
+error_sti:
+ TRACE_IRQS_OFF
+ ret
+
+ /*
+ * There are two places in the kernel that can potentially fault with
+ * usergs. Handle them here. B stepping K8s sometimes report a
+ * truncated RIP for IRET exceptions returning to compat mode. Check
+ * for these here too.
+ */
+error_kernelspace:
+ incl %ebx
+ leaq native_irq_return_iret(%rip),%rcx
+ cmpq %rcx,RIP+8(%rsp)
+ je error_bad_iret
+ movl %ecx,%eax /* zero extend */
+ cmpq %rax,RIP+8(%rsp)
+ je bstep_iret
+ cmpq $gs_change,RIP+8(%rsp)
+ je error_swapgs
+ jmp error_sti
+
+bstep_iret:
+ /* Fix truncated RIP */
+ movq %rcx,RIP+8(%rsp)
+ /* fall through */
+
+error_bad_iret:
+ SWAPGS
+ mov %rsp,%rdi
+ call fixup_bad_iret
+ mov %rax,%rsp
+ decl %ebx /* Return to usergs */
+ jmp error_sti
+END(error_entry)
+
+
+/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
+ENTRY(error_exit)
+ movl %ebx,%eax
+ RESTORE_EXTRA_REGS
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ testl %eax,%eax
+ jnz retint_kernel
+ jmp retint_user
+END(error_exit)
+
+/* Runs on exception stack */
+ENTRY(nmi)
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
+ /*
+ * We allow breakpoints in NMIs. If a breakpoint occurs, then
+ * the iretq it performs will take us out of NMI context.
+ * This means that we can have nested NMIs where the next
+ * NMI is using the top of the stack of the previous NMI. We
+ * can't let it execute because the nested NMI will corrupt the
+ * stack of the previous NMI. NMI handlers are not re-entrant
+ * anyway.
+ *
+ * To handle this case we do the following:
+ * Check a special location on the stack that contains
+ * a variable that is set when NMIs are executing.
+ * The interrupted task's stack is also checked to see if it
+ * is an NMI stack.
+ * If the variable is not set and the stack is not the NMI
+ * stack then:
+ * o Set the special variable on the stack
+ * o Copy the interrupt frame into a "saved" location on the stack
+ * o Copy the interrupt frame into a "copy" location on the stack
+ * o Continue processing the NMI
+ * If the variable is set or the previous stack is the NMI stack:
+ * o Modify the "copy" location to jump to repeat_nmi
+ * o Return to the first NMI
+ *
+ * Now on exit of the first NMI, we first clear the stack variable.
+ * The NMI stack will tell any nested NMIs at that point that it is
+ * nested. Then we pop the stack normally with iret, and if there was
+ * a nested NMI that updated the copy interrupt stack frame, a
+ * jump will be made to the repeat_nmi code that will handle the second
+ * NMI.
+ */
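As a reading aid, the nesting policy spelled out in the comment above condenses to a few lines of C. This is only a model of the control flow, with invented names; the real code keeps all of this state in slots on the NMI stack and in the copied iret frames:

    enum nmi_kind { FIRST_NMI, NESTED_NMI };

    struct nmi_model {
        int executing;        /* the "NMI executing" stack variable    */
        int copy_redirected;  /* "copy" frame now points at repeat_nmi */
    };

    static enum nmi_kind classify_nmi(struct nmi_model *s,
                                      int from_kernel, int on_nmi_stack)
    {
        if (!from_kernel)
            return FIRST_NMI;         /* user space: definitely not nested */

        if (s->executing || on_nmi_stack) {
            s->copy_redirected = 1;   /* make the first NMI run repeat_nmi */
            return NESTED_NMI;        /* then simply iret back to it       */
        }

        s->executing = 1;             /* set the special variable          */
        return FIRST_NMI;             /* copy the frames, handle the NMI   */
    }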
+
+ /* Use %rdx as our temp variable throughout */
+ pushq %rdx
+
+ /*
+ * If %cs was not the kernel segment, then the NMI triggered in user
+ * space, which means it is definitely not nested.
+ */
+ cmpl $__KERNEL_CS, 16(%rsp)
+ jne first_nmi
+
+ /*
+ * Check the special variable on the stack to see if NMIs are
+ * executing.
+ */
+ cmpl $1, -8(%rsp)
+ je nested_nmi
+
+ /*
+ * Now test if the previous stack was an NMI stack.
+ * We need the double check. We check the NMI stack to satisfy the
+ * race when the first NMI clears the variable before returning.
+ * We check the variable because the first NMI could be in a
+ * breakpoint routine using a breakpoint stack.
+ */
+ lea 6*8(%rsp), %rdx
+ /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
+ cmpq %rdx, 4*8(%rsp)
+ /* If the stack pointer is above the NMI stack, this is a normal NMI */
+ ja first_nmi
+ subq $EXCEPTION_STKSZ, %rdx
+ cmpq %rdx, 4*8(%rsp)
+ /* If it is below the NMI stack, it is a normal NMI */
+ jb first_nmi
+ /* Ah, it is within the NMI stack, treat it as nested */
+
+nested_nmi:
+ /*
+ * Do nothing if we interrupted the fixup in repeat_nmi.
+ * It's about to repeat the NMI handler, so we are fine
+ * with ignoring this one.
+ */
+ movq $repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja 1f
+ movq $end_repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja nested_nmi_out
+
+1:
+ /* Set up the interrupted NMI's stack to jump to repeat_nmi */
+ leaq -1*8(%rsp), %rdx
+ movq %rdx, %rsp
+ leaq -10*8(%rsp), %rdx
+ pushq $__KERNEL_DS
+ pushq %rdx
+ pushfq
+ pushq $__KERNEL_CS
+ pushq $repeat_nmi
+
+ /* Put stack back */
+ addq $(6*8), %rsp
+
+nested_nmi_out:
+ popq %rdx
+
+ /* No need to check faults here */
+ INTERRUPT_RETURN
+
+first_nmi:
+ /*
+ * Because nested NMIs will use the pushed location that we
+ * stored in rdx, we must keep that space available.
+ * Here's what our stack frame will look like:
+ * +-------------------------+
+ * | original SS             |
+ * | original Return RSP     |
+ * | original RFLAGS         |
+ * | original CS             |
+ * | original RIP            |
+ * +-------------------------+
+ * | temp storage for rdx    |
+ * +-------------------------+
+ * | NMI executing variable  |
+ * +-------------------------+
+ * | copied SS               |
+ * | copied Return RSP       |
+ * | copied RFLAGS           |
+ * | copied CS               |
+ * | copied RIP              |
+ * +-------------------------+
+ * | Saved SS                |
+ * | Saved Return RSP        |
+ * | Saved RFLAGS            |
+ * | Saved CS                |
+ * | Saved RIP               |
+ * +-------------------------+
+ * | pt_regs                 |
+ * +-------------------------+
+ *
+ * The saved stack frame is used to fix up the copied stack frame
+ * that a nested NMI may change to make the interrupted NMI iret jump
+ * to repeat_nmi. The original stack frame and the temp storage
+ * are also used by nested NMIs and cannot be trusted on exit.
+ */
+ /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
+ movq (%rsp), %rdx
+
+ /* Set the NMI executing variable on the stack. */
+ pushq $1
+
+ /*
+ * Leave room for the "copied" frame
+ */
+ subq $(5*8), %rsp
+
+ /* Copy the stack frame to the Saved frame */
+ .rept 5
+ pushq 11*8(%rsp)
+ .endr
+
+ /* Everything up to here is safe from nested NMIs */
+
+ /*
+ * If there was a nested NMI, the first NMI's iret will return
+ * here. But NMIs are still enabled and we can take another
+ * nested NMI. The nested NMI checks the interrupted RIP to see
+ * if it is between repeat_nmi and end_repeat_nmi, and if so
+ * it will just return, as we are about to repeat an NMI anyway.
+ * This makes it safe to copy to the stack frame that a nested
+ * NMI will update.
+ */
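One test from earlier in this path deserves restating: the "previous stack was an NMI stack" check is a plain interval test, with %rdx first pointed at the top of this CPU's NMI stack and the interrupted RSP accepted as nested only if it lies within one exception stack of that top. The same check in C; the EXCEPTION_STKSZ value here is illustrative and configuration-dependent:

    #include <stdint.h>

    #define EXCEPTION_STKSZ 4096u   /* illustrative size only */

    static int on_nmi_stack(uint64_t interrupted_rsp, uint64_t nmi_stack_top)
    {
        return interrupted_rsp <= nmi_stack_top &&                 /* 'ja first_nmi' */
               interrupted_rsp >= nmi_stack_top - EXCEPTION_STKSZ; /* 'jb first_nmi' */
    }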
+repeat_nmi:
+ /*
+ * Update the stack variable to say we are still in NMI (the update
+ * is benign for the non-repeat case, where 1 was pushed just above
+ * to this very stack slot).
+ */
+ movq $1, 10*8(%rsp)
+
+ /* Make another copy, this one may be modified by nested NMIs */
+ addq $(10*8), %rsp
+ .rept 5
+ pushq -6*8(%rsp)
+ .endr
+ subq $(5*8), %rsp
+end_repeat_nmi:
+
+ /*
+ * Everything below this point can be preempted by a nested
+ * NMI if the first NMI took an exception and reset our iret stack
+ * so that we repeat another NMI.
+ */
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
+ ALLOC_PT_GPREGS_ON_STACK
+
+ /*
+ * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
+ * as we should not be calling schedule in NMI context, even with
+ * normal interrupts enabled. An NMI should not be
+ * setting NEED_RESCHED or anything that normal interrupts and
+ * exceptions might do.
+ */
+ call paranoid_entry
+
+ /*
+ * Save off the CR2 register. If we take a page fault in the NMI then
+ * it could corrupt the CR2 value. If the NMI preempts a page fault
+ * handler before it was able to read the CR2 register, and then the
+ * NMI itself takes a page fault, the page fault that was preempted
+ * will read the information from the NMI page fault and not the
+ * original fault. Save it off and restore it if it changes.
+ * Use the r12 callee-saved register.
+ */
+ movq %cr2, %r12
+
+ /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
+ movq %rsp,%rdi
+ movq $-1,%rsi
+ call do_nmi
+
+ /* Did the NMI take a page fault? Restore cr2 if it did */
+ movq %cr2, %rcx
+ cmpq %rcx, %r12
+ je 1f
+ movq %r12, %cr2
+1:
+ testl %ebx,%ebx /* swapgs needed? */
+ jnz nmi_restore
+nmi_swapgs:
+ SWAPGS_UNSAFE_STACK
+nmi_restore:
+ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS
+ /* Pop the extra iret frame at once */
+ REMOVE_PT_GPREGS_FROM_STACK 6*8
+
+ /* Clear the NMI executing stack variable */
+ movq $0, 5*8(%rsp)
+ jmp irq_return
+END(nmi)
+
+ENTRY(ignore_sysret)
+ mov $-ENOSYS,%eax
+ sysret
+END(ignore_sysret)
+
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 9bcd0b56ca17..9d3ee054453d 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -22,7 +22,7 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n
 CFLAGS_irq.o := -I$(src)/../include/asm/trace
 
-obj-y := process_$(BITS).o signal.o entry_$(BITS).o
+obj-y := process_$(BITS).o signal.o
 obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o
 obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
deleted file mode 100644
index 0ac73de925d1..000000000000
--- a/arch/x86/kernel/entry_32.S
+++ /dev/null
@@ -1,1249 +0,0 @@
-/*
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- */
-
-/*
- * entry.S contains the system-call and fault low-level handling routines.
- * This also contains the timer-interrupt handler, as well as all interrupts
- * and faults that can result in a task-switch.
- *
- * NOTE: This code handles signal-recognition, which happens every time
- * after a timer-interrupt and after each system call.
- *
- * I changed all the .align's to 4 (16 byte alignment), as that's faster
- * on a 486.
- *
- * Stack layout in 'syscall_exit':
- * ptrace needs to have all regs on the stack.
- * if the order here is changed, it needs to be - * updated in fork.c:copy_process, signal.c:do_signal, - * ptrace.c and ptrace.h - * - * 0(%esp) - %ebx - * 4(%esp) - %ecx - * 8(%esp) - %edx - * C(%esp) - %esi - * 10(%esp) - %edi - * 14(%esp) - %ebp - * 18(%esp) - %eax - * 1C(%esp) - %ds - * 20(%esp) - %es - * 24(%esp) - %fs - * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS - * 2C(%esp) - orig_eax - * 30(%esp) - %eip - * 34(%esp) - %cs - * 38(%esp) - %eflags - * 3C(%esp) - %oldesp - * 40(%esp) - %oldss - * - * "current" is in register %ebx during any slow entries. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Avoid __ASSEMBLER__'ifying just for this. */ -#include -#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_LE 0x40000000 - -#ifndef CONFIG_AUDITSYSCALL -#define sysenter_audit syscall_trace_entry -#define sysexit_audit syscall_exit_work -#endif - - .section .entry.text, "ax" - -/* - * We use macros for low-level operations which need to be overridden - * for paravirtualization. The following will never clobber any registers: - * INTERRUPT_RETURN (aka. "iret") - * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") - * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). - * - * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must - * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). - * Allowing a register to be clobbered can shrink the paravirt replacement - * enough to patch inline, increasing performance. - */ - -#ifdef CONFIG_PREEMPT -#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF -#else -#define preempt_stop(clobbers) -#define resume_kernel restore_all -#endif - -.macro TRACE_IRQS_IRET -#ifdef CONFIG_TRACE_IRQFLAGS - testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off? - jz 1f - TRACE_IRQS_ON -1: -#endif -.endm - -/* - * User gs save/restore - * - * %gs is used for userland TLS and kernel only uses it for stack - * canary which is required to be at %gs:20 by gcc. Read the comment - * at the top of stackprotector.h for more info. - * - * Local labels 98 and 99 are used. 
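The stack-layout table above maps one-to-one onto the 32-bit pt_regs image these entry paths build. For reference, the same layout written as a C structure (a sketch mirroring the listed offsets, not the kernel's exact declaration):

    struct pt_regs_i386 {
        unsigned long bx;       /* 0x00 */
        unsigned long cx;       /* 0x04 */
        unsigned long dx;       /* 0x08 */
        unsigned long si;       /* 0x0c */
        unsigned long di;       /* 0x10 */
        unsigned long bp;       /* 0x14 */
        unsigned long ax;       /* 0x18 */
        unsigned long ds;       /* 0x1c */
        unsigned long es;       /* 0x20 */
        unsigned long fs;       /* 0x24 */
        unsigned long gs;       /* 0x28, saved iff !CONFIG_X86_32_LAZY_GS */
        unsigned long orig_ax;  /* 0x2c */
        unsigned long ip;       /* 0x30 */
        unsigned long cs;       /* 0x34 */
        unsigned long flags;    /* 0x38 */
        unsigned long sp;       /* 0x3c, "oldesp" */
        unsigned long ss;       /* 0x40, "oldss"  */
    };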
- */ -#ifdef CONFIG_X86_32_LAZY_GS - - /* unfortunately push/pop can't be no-op */ -.macro PUSH_GS - pushl $0 -.endm -.macro POP_GS pop=0 - addl $(4 + \pop), %esp -.endm -.macro POP_GS_EX -.endm - - /* all the rest are no-op */ -.macro PTGS_TO_GS -.endm -.macro PTGS_TO_GS_EX -.endm -.macro GS_TO_REG reg -.endm -.macro REG_TO_PTGS reg -.endm -.macro SET_KERNEL_GS reg -.endm - -#else /* CONFIG_X86_32_LAZY_GS */ - -.macro PUSH_GS - pushl %gs -.endm - -.macro POP_GS pop=0 -98: popl %gs - .if \pop <> 0 - add $\pop, %esp - .endif -.endm -.macro POP_GS_EX -.pushsection .fixup, "ax" -99: movl $0, (%esp) - jmp 98b -.popsection - _ASM_EXTABLE(98b,99b) -.endm - -.macro PTGS_TO_GS -98: mov PT_GS(%esp), %gs -.endm -.macro PTGS_TO_GS_EX -.pushsection .fixup, "ax" -99: movl $0, PT_GS(%esp) - jmp 98b -.popsection - _ASM_EXTABLE(98b,99b) -.endm - -.macro GS_TO_REG reg - movl %gs, \reg -.endm -.macro REG_TO_PTGS reg - movl \reg, PT_GS(%esp) -.endm -.macro SET_KERNEL_GS reg - movl $(__KERNEL_STACK_CANARY), \reg - movl \reg, %gs -.endm - -#endif /* CONFIG_X86_32_LAZY_GS */ - -.macro SAVE_ALL - cld - PUSH_GS - pushl %fs - pushl %es - pushl %ds - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - movl $(__USER_DS), %edx - movl %edx, %ds - movl %edx, %es - movl $(__KERNEL_PERCPU), %edx - movl %edx, %fs - SET_KERNEL_GS %edx -.endm - -.macro RESTORE_INT_REGS - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %eax -.endm - -.macro RESTORE_REGS pop=0 - RESTORE_INT_REGS -1: popl %ds -2: popl %es -3: popl %fs - POP_GS \pop -.pushsection .fixup, "ax" -4: movl $0, (%esp) - jmp 1b -5: movl $0, (%esp) - jmp 2b -6: movl $0, (%esp) - jmp 3b -.popsection - _ASM_EXTABLE(1b,4b) - _ASM_EXTABLE(2b,5b) - _ASM_EXTABLE(3b,6b) - POP_GS_EX -.endm - -ENTRY(ret_from_fork) - pushl %eax - call schedule_tail - GET_THREAD_INFO(%ebp) - popl %eax - pushl $0x0202 # Reset kernel eflags - popfl - jmp syscall_exit -END(ret_from_fork) - -ENTRY(ret_from_kernel_thread) - pushl %eax - call schedule_tail - GET_THREAD_INFO(%ebp) - popl %eax - pushl $0x0202 # Reset kernel eflags - popfl - movl PT_EBP(%esp),%eax - call *PT_EBX(%esp) - movl $0,PT_EAX(%esp) - jmp syscall_exit -ENDPROC(ret_from_kernel_thread) - -/* - * Return to user mode is not as complex as all this looks, - * but we want the default path for a system call return to - * go as quickly as possible which is why some of this is - * less clear than it otherwise should be. - */ - - # userspace resumption stub bypassing syscall exit tracing - ALIGN -ret_from_exception: - preempt_stop(CLBR_ANY) -ret_from_intr: - GET_THREAD_INFO(%ebp) -#ifdef CONFIG_VM86 - movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS - movb PT_CS(%esp), %al - andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax -#else - /* - * We can be coming here from child spawned by kernel_thread(). - */ - movl PT_CS(%esp), %eax - andl $SEGMENT_RPL_MASK, %eax -#endif - cmpl $USER_RPL, %eax - jb resume_kernel # not returning to v8086 or userspace - -ENTRY(resume_userspace) - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - andl $_TIF_WORK_MASK, %ecx # is there any work to be done on - # int/exception return? 
- jne work_pending - jmp restore_all -END(ret_from_exception) - -#ifdef CONFIG_PREEMPT -ENTRY(resume_kernel) - DISABLE_INTERRUPTS(CLBR_ANY) -need_resched: - cmpl $0,PER_CPU_VAR(__preempt_count) - jnz restore_all - testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? - jz restore_all - call preempt_schedule_irq - jmp need_resched -END(resume_kernel) -#endif - -/* SYSENTER_RETURN points to after the "sysenter" instruction in - the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ - - # sysenter call handler stub -ENTRY(ia32_sysenter_target) - movl TSS_sysenter_sp0(%esp),%esp -sysenter_past_esp: - /* - * Interrupts are disabled here, but we can't trace it until - * enough kernel state to call TRACE_IRQS_OFF can be called - but - * we immediately enable interrupts at that point anyway. - */ - pushl $__USER_DS - pushl %ebp - pushfl - orl $X86_EFLAGS_IF, (%esp) - pushl $__USER_CS - /* - * Push current_thread_info()->sysenter_return to the stack. - * A tiny bit of offset fixup is necessary: TI_sysenter_return - * is relative to thread_info, which is at the bottom of the - * kernel stack page. 4*4 means the 4 words pushed above; - * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; - * and THREAD_SIZE takes us to the bottom. - */ - pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) - - pushl %eax - SAVE_ALL - ENABLE_INTERRUPTS(CLBR_NONE) - -/* - * Load the potential sixth argument from user stack. - * Careful about security. - */ - cmpl $__PAGE_OFFSET-3,%ebp - jae syscall_fault - ASM_STAC -1: movl (%ebp),%ebp - ASM_CLAC - movl %ebp,PT_EBP(%esp) - _ASM_EXTABLE(1b,syscall_fault) - - GET_THREAD_INFO(%ebp) - - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) - jnz sysenter_audit -sysenter_do_call: - cmpl $(NR_syscalls), %eax - jae sysenter_badsys - call *sys_call_table(,%eax,4) -sysenter_after_call: - movl %eax,PT_EAX(%esp) - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $_TIF_ALLWORK_MASK, %ecx - jnz sysexit_audit -sysenter_exit: -/* if something modifies registers it must also disable sysexit */ - movl PT_EIP(%esp), %edx - movl PT_OLDESP(%esp), %ecx - xorl %ebp,%ebp - TRACE_IRQS_ON -1: mov PT_FS(%esp), %fs - PTGS_TO_GS - ENABLE_INTERRUPTS_SYSEXIT - -#ifdef CONFIG_AUDITSYSCALL -sysenter_audit: - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) - jnz syscall_trace_entry - /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */ - movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */ - /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */ - pushl PT_ESI(%esp) /* a3: 5th arg */ - pushl PT_EDX+4(%esp) /* a2: 4th arg */ - call __audit_syscall_entry - popl %ecx /* get that remapped edx off the stack */ - popl %ecx /* get that remapped esi off the stack */ - movl PT_EAX(%esp),%eax /* reload syscall number */ - jmp sysenter_do_call - -sysexit_audit: - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jnz syscall_exit_work - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) - movl %eax,%edx /* second arg, syscall return value */ - cmpl $-MAX_ERRNO,%eax /* is it an error ? 
*/ - setbe %al /* 1 if so, 0 if not */ - movzbl %al,%eax /* zero-extend that */ - call __audit_syscall_exit - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jnz syscall_exit_work - movl PT_EAX(%esp),%eax /* reload syscall return value */ - jmp sysenter_exit -#endif - -.pushsection .fixup,"ax" -2: movl $0,PT_FS(%esp) - jmp 1b -.popsection - _ASM_EXTABLE(1b,2b) - PTGS_TO_GS_EX -ENDPROC(ia32_sysenter_target) - - # system call handler stub -ENTRY(system_call) - ASM_CLAC - pushl %eax # save orig_eax - SAVE_ALL - GET_THREAD_INFO(%ebp) - # system call tracing in operation / emulation - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) - jnz syscall_trace_entry - cmpl $(NR_syscalls), %eax - jae syscall_badsys -syscall_call: - call *sys_call_table(,%eax,4) -syscall_after_call: - movl %eax,PT_EAX(%esp) # store the return value -syscall_exit: - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $_TIF_ALLWORK_MASK, %ecx # current->work - jnz syscall_exit_work - -restore_all: - TRACE_IRQS_IRET -restore_all_notrace: -#ifdef CONFIG_X86_ESPFIX32 - movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS - # Warning: PT_OLDSS(%esp) contains the wrong/random values if we - # are returning to the kernel. - # See comments in process.c:copy_thread() for details. - movb PT_OLDSS(%esp), %ah - movb PT_CS(%esp), %al - andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax - cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax - je ldt_ss # returning to user-space with LDT SS -#endif -restore_nocheck: - RESTORE_REGS 4 # skip orig_eax/error_code -irq_return: - INTERRUPT_RETURN -.section .fixup,"ax" -ENTRY(iret_exc) - pushl $0 # no error code - pushl $do_iret_error - jmp error_code -.previous - _ASM_EXTABLE(irq_return,iret_exc) - -#ifdef CONFIG_X86_ESPFIX32 -ldt_ss: -#ifdef CONFIG_PARAVIRT - /* - * The kernel can't run on a non-flat stack if paravirt mode - * is active. Rather than try to fixup the high bits of - * ESP, bypass this code entirely. This may break DOSemu - * and/or Wine support in a paravirt VM, although the option - * is still available to implement the setting of the high - * 16-bits in the INTERRUPT_RETURN paravirt-op. - */ - cmpl $0, pv_info+PARAVIRT_enabled - jne restore_nocheck -#endif - -/* - * Setup and switch to ESPFIX stack - * - * We're returning to userspace with a 16 bit stack. The CPU will not - * restore the high word of ESP for us on executing iret... This is an - * "official" bug of all the x86-compatible CPUs, which we can work - * around to make dosemu and wine happy. We do this by preloading the - * high word of ESP with the high word of the userspace ESP while - * compensating for the offset by changing to the ESPFIX segment with - * a base address that matches for the difference. 
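The ESPFIX arithmetic in that comment checks out numerically. Below is a self-contained C sketch with made-up example values; the real code stores only bits 16..31 of the computed base into the per-CPU GDT entry, and FIXUP_ESPFIX_STACK later reads them back to undo the switch:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t kernel_esp = 0xc1234f00;  /* example kernel stack pointer  */
        uint32_t user_esp   = 0xbf80ff00;  /* example 16-bit-stack user ESP */

        /* mov %dx,%ax: kernel low word spliced under the user high word */
        uint32_t new_esp = (user_esp & 0xffff0000u) | (kernel_esp & 0xffffu);

        /* sub + shr: segment base compensating for the spliced high word
         * (the low word of the difference is zero by construction) */
        uint32_t base = kernel_esp - new_esp;
        assert((base & 0xffffu) == 0);

        /* lss: linear address = segment base + new ESP = kernel stack */
        assert(base + new_esp == kernel_esp);
        return 0;
    }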
- */ -#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) - mov %esp, %edx /* load kernel esp */ - mov PT_OLDESP(%esp), %eax /* load userspace esp */ - mov %dx, %ax /* eax: new kernel esp */ - sub %eax, %edx /* offset (low word is 0) */ - shr $16, %edx - mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ - mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ - pushl $__ESPFIX_SS - pushl %eax /* new kernel esp */ - /* Disable interrupts, but do not irqtrace this section: we - * will soon execute iret and the tracer was already set to - * the irqstate after the iret */ - DISABLE_INTERRUPTS(CLBR_EAX) - lss (%esp), %esp /* switch to espfix segment */ - jmp restore_nocheck -#endif -ENDPROC(system_call) - - # perform work that needs to be done immediately before resumption - ALIGN -work_pending: - testb $_TIF_NEED_RESCHED, %cl - jz work_notifysig -work_resched: - call schedule - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - andl $_TIF_WORK_MASK, %ecx # is there any work to be done other - # than syscall tracing? - jz restore_all - testb $_TIF_NEED_RESCHED, %cl - jnz work_resched - -work_notifysig: # deal with pending signals and - # notify-resume requests -#ifdef CONFIG_VM86 - testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) - movl %esp, %eax - jnz work_notifysig_v86 # returning to kernel-space or - # vm86-space -1: -#else - movl %esp, %eax -#endif - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - movb PT_CS(%esp), %bl - andb $SEGMENT_RPL_MASK, %bl - cmpb $USER_RPL, %bl - jb resume_kernel - xorl %edx, %edx - call do_notify_resume - jmp resume_userspace - -#ifdef CONFIG_VM86 - ALIGN -work_notifysig_v86: - pushl %ecx # save ti_flags for do_notify_resume - call save_v86_state # %eax contains pt_regs pointer - popl %ecx - movl %eax, %esp - jmp 1b -#endif -END(work_pending) - - # perform syscall exit tracing - ALIGN -syscall_trace_entry: - movl $-ENOSYS,PT_EAX(%esp) - movl %esp, %eax - call syscall_trace_enter - /* What it returned is what we'll actually use. */ - cmpl $(NR_syscalls), %eax - jnae syscall_call - jmp syscall_exit -END(syscall_trace_entry) - - # perform syscall exit tracing - ALIGN -syscall_exit_work: - testl $_TIF_WORK_SYSCALL_EXIT, %ecx - jz work_pending - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call - # schedule() instead - movl %esp, %eax - call syscall_trace_leave - jmp resume_userspace -END(syscall_exit_work) - -syscall_fault: - ASM_CLAC - GET_THREAD_INFO(%ebp) - movl $-EFAULT,PT_EAX(%esp) - jmp resume_userspace -END(syscall_fault) - -syscall_badsys: - movl $-ENOSYS,%eax - jmp syscall_after_call -END(syscall_badsys) - -sysenter_badsys: - movl $-ENOSYS,%eax - jmp sysenter_after_call -END(sysenter_badsys) - -.macro FIXUP_ESPFIX_STACK -/* - * Switch back for ESPFIX stack to the normal zerobased stack - * - * We can't call C functions using the ESPFIX stack. This code reads - * the high word of the segment base from the GDT and swiches to the - * normal stack and adjusts ESP with the matching offset. 
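Stepping back to the work_pending/work_resched path above: it is the usual return-to-user work loop, re-sampling the thread-info flags with interrupts disabled after each piece of work so nothing slips in between the check and the iret. A C-flavoured sketch with assumed helper names:

    extern unsigned long read_ti_flags(void);   /* movl TI_flags(%ebp),%ecx */
    extern void schedule(void);
    extern void do_notify_resume(void *regs);

    static void exit_to_user_loop(void *regs, unsigned long work_mask,
                                  unsigned long need_resched_bit)
    {
        unsigned long flags;

        while ((flags = read_ti_flags()) & work_mask) {  /* any work left? */
            if (flags & need_resched_bit)
                schedule();               /* work_resched            */
            else
                do_notify_resume(regs);   /* signals / notify-resume */
            /* interrupts are disabled again before re-sampling */
        }
        /* fall through to restore_all */
    }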
- */ -#ifdef CONFIG_X86_ESPFIX32 - /* fixup the stack */ - mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ - mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ - shl $16, %eax - addl %esp, %eax /* the adjusted stack pointer */ - pushl $__KERNEL_DS - pushl %eax - lss (%esp), %esp /* switch to the normal stack segment */ -#endif -.endm -.macro UNWIND_ESPFIX_STACK -#ifdef CONFIG_X86_ESPFIX32 - movl %ss, %eax - /* see if on espfix stack */ - cmpw $__ESPFIX_SS, %ax - jne 27f - movl $__KERNEL_DS, %eax - movl %eax, %ds - movl %eax, %es - /* switch to normal stack */ - FIXUP_ESPFIX_STACK -27: -#endif -.endm - -/* - * Build the entry stubs with some assembler magic. - * We pack 1 stub into every 8-byte block. - */ - .align 8 -ENTRY(irq_entries_start) - vector=FIRST_EXTERNAL_VECTOR - .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushl $(~vector+0x80) /* Note: always in signed byte range */ - vector=vector+1 - jmp common_interrupt - .align 8 - .endr -END(irq_entries_start) - -/* - * the CPU automatically disables interrupts when executing an IRQ vector, - * so IRQ-flags tracing has to follow that: - */ - .p2align CONFIG_X86_L1_CACHE_SHIFT -common_interrupt: - ASM_CLAC - addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ - SAVE_ALL - TRACE_IRQS_OFF - movl %esp,%eax - call do_IRQ - jmp ret_from_intr -ENDPROC(common_interrupt) - -#define BUILD_INTERRUPT3(name, nr, fn) \ -ENTRY(name) \ - ASM_CLAC; \ - pushl $~(nr); \ - SAVE_ALL; \ - TRACE_IRQS_OFF \ - movl %esp,%eax; \ - call fn; \ - jmp ret_from_intr; \ -ENDPROC(name) - - -#ifdef CONFIG_TRACING -#define TRACE_BUILD_INTERRUPT(name, nr) \ - BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) -#else -#define TRACE_BUILD_INTERRUPT(name, nr) -#endif - -#define BUILD_INTERRUPT(name, nr) \ - BUILD_INTERRUPT3(name, nr, smp_##name); \ - TRACE_BUILD_INTERRUPT(name, nr) - -/* The include is where all of the SMP etc. 
interrupts come from */ -#include - -ENTRY(coprocessor_error) - ASM_CLAC - pushl $0 - pushl $do_coprocessor_error - jmp error_code -END(coprocessor_error) - -ENTRY(simd_coprocessor_error) - ASM_CLAC - pushl $0 -#ifdef CONFIG_X86_INVD_BUG - /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ - ALTERNATIVE "pushl $do_general_protection", \ - "pushl $do_simd_coprocessor_error", \ - X86_FEATURE_XMM -#else - pushl $do_simd_coprocessor_error -#endif - jmp error_code -END(simd_coprocessor_error) - -ENTRY(device_not_available) - ASM_CLAC - pushl $-1 # mark this as an int - pushl $do_device_not_available - jmp error_code -END(device_not_available) - -#ifdef CONFIG_PARAVIRT -ENTRY(native_iret) - iret - _ASM_EXTABLE(native_iret, iret_exc) -END(native_iret) - -ENTRY(native_irq_enable_sysexit) - sti - sysexit -END(native_irq_enable_sysexit) -#endif - -ENTRY(overflow) - ASM_CLAC - pushl $0 - pushl $do_overflow - jmp error_code -END(overflow) - -ENTRY(bounds) - ASM_CLAC - pushl $0 - pushl $do_bounds - jmp error_code -END(bounds) - -ENTRY(invalid_op) - ASM_CLAC - pushl $0 - pushl $do_invalid_op - jmp error_code -END(invalid_op) - -ENTRY(coprocessor_segment_overrun) - ASM_CLAC - pushl $0 - pushl $do_coprocessor_segment_overrun - jmp error_code -END(coprocessor_segment_overrun) - -ENTRY(invalid_TSS) - ASM_CLAC - pushl $do_invalid_TSS - jmp error_code -END(invalid_TSS) - -ENTRY(segment_not_present) - ASM_CLAC - pushl $do_segment_not_present - jmp error_code -END(segment_not_present) - -ENTRY(stack_segment) - ASM_CLAC - pushl $do_stack_segment - jmp error_code -END(stack_segment) - -ENTRY(alignment_check) - ASM_CLAC - pushl $do_alignment_check - jmp error_code -END(alignment_check) - -ENTRY(divide_error) - ASM_CLAC - pushl $0 # no error code - pushl $do_divide_error - jmp error_code -END(divide_error) - -#ifdef CONFIG_X86_MCE -ENTRY(machine_check) - ASM_CLAC - pushl $0 - pushl machine_check_vector - jmp error_code -END(machine_check) -#endif - -ENTRY(spurious_interrupt_bug) - ASM_CLAC - pushl $0 - pushl $do_spurious_interrupt_bug - jmp error_code -END(spurious_interrupt_bug) - -#ifdef CONFIG_XEN -/* Xen doesn't set %esp to be precisely what the normal sysenter - entrypoint expects, so fix it up before using the normal path. */ -ENTRY(xen_sysenter_target) - addl $5*4, %esp /* remove xen-provided frame */ - jmp sysenter_past_esp - -ENTRY(xen_hypervisor_callback) - pushl $-1 /* orig_ax = -1 => not a system call */ - SAVE_ALL - TRACE_IRQS_OFF - - /* Check to see if we got the event in the critical - region in xen_iret_direct, after we've reenabled - events and checked for pending events. This simulates - iret instruction's behaviour where it delivers a - pending interrupt when enabling interrupts. */ - movl PT_EIP(%esp),%eax - cmpl $xen_iret_start_crit,%eax - jb 1f - cmpl $xen_iret_end_crit,%eax - jae 1f - - jmp xen_iret_crit_fixup - -ENTRY(xen_do_upcall) -1: mov %esp, %eax - call xen_evtchn_do_upcall -#ifndef CONFIG_PREEMPT - call xen_maybe_preempt_hcall -#endif - jmp ret_from_intr -ENDPROC(xen_hypervisor_callback) - -# Hypervisor uses this for application faults while it executes. -# We get here for two reasons: -# 1. Fault while reloading DS, ES, FS or GS -# 2. Fault while executing IRET -# Category 1 we fix up by reattempting the load, and zeroing the segment -# register if the load fails. -# Category 2 we fix up by jumping to do_iret_error. 
We cannot use the -# normal Linux return path in this case because if we use the IRET hypercall -# to pop the stack frame we end up in an infinite loop of failsafe callbacks. -# We distinguish between categories by maintaining a status value in EAX. -ENTRY(xen_failsafe_callback) - pushl %eax - movl $1,%eax -1: mov 4(%esp),%ds -2: mov 8(%esp),%es -3: mov 12(%esp),%fs -4: mov 16(%esp),%gs - /* EAX == 0 => Category 1 (Bad segment) - EAX != 0 => Category 2 (Bad IRET) */ - testl %eax,%eax - popl %eax - lea 16(%esp),%esp - jz 5f - jmp iret_exc -5: pushl $-1 /* orig_ax = -1 => not a system call */ - SAVE_ALL - jmp ret_from_exception - -.section .fixup,"ax" -6: xorl %eax,%eax - movl %eax,4(%esp) - jmp 1b -7: xorl %eax,%eax - movl %eax,8(%esp) - jmp 2b -8: xorl %eax,%eax - movl %eax,12(%esp) - jmp 3b -9: xorl %eax,%eax - movl %eax,16(%esp) - jmp 4b -.previous - _ASM_EXTABLE(1b,6b) - _ASM_EXTABLE(2b,7b) - _ASM_EXTABLE(3b,8b) - _ASM_EXTABLE(4b,9b) -ENDPROC(xen_failsafe_callback) - -BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - xen_evtchn_do_upcall) - -#endif /* CONFIG_XEN */ - -#if IS_ENABLED(CONFIG_HYPERV) - -BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - hyperv_vector_handler) - -#endif /* CONFIG_HYPERV */ - -#ifdef CONFIG_FUNCTION_TRACER -#ifdef CONFIG_DYNAMIC_FTRACE - -ENTRY(mcount) - ret -END(mcount) - -ENTRY(ftrace_caller) - pushl %eax - pushl %ecx - pushl %edx - pushl $0 /* Pass NULL as regs pointer */ - movl 4*4(%esp), %eax - movl 0x4(%ebp), %edx - movl function_trace_op, %ecx - subl $MCOUNT_INSN_SIZE, %eax - -.globl ftrace_call -ftrace_call: - call ftrace_stub - - addl $4,%esp /* skip NULL pointer */ - popl %edx - popl %ecx - popl %eax -ftrace_ret: -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -.globl ftrace_graph_call -ftrace_graph_call: - jmp ftrace_stub -#endif - -.globl ftrace_stub -ftrace_stub: - ret -END(ftrace_caller) - -ENTRY(ftrace_regs_caller) - pushf /* push flags before compare (in cs location) */ - - /* - * i386 does not save SS and ESP when coming from kernel. - * Instead, to get sp, ®s->sp is used (see ptrace.h). - * Unfortunately, that means eflags must be at the same location - * as the current return ip is. We move the return ip into the - * ip location, and move flags into the return ip location. 
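The slot shuffling this comment describes is easier to follow with pt_regs slot indices: counting in 4-byte words up from the saved ebx, ip is slot 12, cs is slot 13 and flags is slot 14, and the initial pushf lands in what becomes the cs slot. A sketch of the net effect, as an array model with an illustrative __KERNEL_CS value rather than real kernel code:

    #define MODEL_KERNEL_CS 0x60u   /* illustrative selector value */

    static void model_ftrace_regs_entry(unsigned long regs[17],
                                        unsigned long return_ip,
                                        unsigned long pushed_eflags)
    {
        regs[12] = return_ip;        /* pushl 4(%esp): fill the ip slot    */
        regs[14] = pushed_eflags;    /* flags moved out of the cs slot...  */
        regs[13] = MODEL_KERNEL_CS;  /* ...and a kernel cs written instead */
    }

On exit the same dance runs in reverse so that popf executes last, and nothing after it can corrupt the restored flags.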
- */ - pushl 4(%esp) /* save return ip into ip slot */ - - pushl $0 /* Load 0 into orig_ax */ - pushl %gs - pushl %fs - pushl %es - pushl %ds - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - - movl 13*4(%esp), %eax /* Get the saved flags */ - movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */ - /* clobbering return ip */ - movl $__KERNEL_CS,13*4(%esp) - - movl 12*4(%esp), %eax /* Load ip (1st parameter) */ - subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ - movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ - movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ - pushl %esp /* Save pt_regs as 4th parameter */ - -GLOBAL(ftrace_regs_call) - call ftrace_stub - - addl $4, %esp /* Skip pt_regs */ - movl 14*4(%esp), %eax /* Move flags back into cs */ - movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ - movl 12*4(%esp), %eax /* Get return ip from regs->ip */ - movl %eax, 14*4(%esp) /* Put return ip back for ret */ - - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %eax - popl %ds - popl %es - popl %fs - popl %gs - addl $8, %esp /* Skip orig_ax and ip */ - popf /* Pop flags at end (no addl to corrupt flags) */ - jmp ftrace_ret - - popf - jmp ftrace_stub -#else /* ! CONFIG_DYNAMIC_FTRACE */ - -ENTRY(mcount) - cmpl $__PAGE_OFFSET, %esp - jb ftrace_stub /* Paging not enabled yet? */ - - cmpl $ftrace_stub, ftrace_trace_function - jnz trace -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - cmpl $ftrace_stub, ftrace_graph_return - jnz ftrace_graph_caller - - cmpl $ftrace_graph_entry_stub, ftrace_graph_entry - jnz ftrace_graph_caller -#endif -.globl ftrace_stub -ftrace_stub: - ret - - /* taken from glibc */ -trace: - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - movl 0x4(%ebp), %edx - subl $MCOUNT_INSN_SIZE, %eax - - call *ftrace_trace_function - - popl %edx - popl %ecx - popl %eax - jmp ftrace_stub -END(mcount) -#endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FUNCTION_TRACER */ - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(ftrace_graph_caller) - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - lea 0x4(%ebp), %edx - movl (%ebp), %ecx - subl $MCOUNT_INSN_SIZE, %eax - call prepare_ftrace_return - popl %edx - popl %ecx - popl %eax - ret -END(ftrace_graph_caller) - -.globl return_to_handler -return_to_handler: - pushl %eax - pushl %edx - movl %ebp, %eax - call ftrace_return_to_handler - movl %eax, %ecx - popl %edx - popl %eax - jmp *%ecx -#endif - -#ifdef CONFIG_TRACING -ENTRY(trace_page_fault) - ASM_CLAC - pushl $trace_do_page_fault - jmp error_code -END(trace_page_fault) -#endif - -ENTRY(page_fault) - ASM_CLAC - pushl $do_page_fault - ALIGN -error_code: - /* the function address is in %gs's slot on the stack */ - pushl %fs - pushl %es - pushl %ds - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - cld - movl $(__KERNEL_PERCPU), %ecx - movl %ecx, %fs - UNWIND_ESPFIX_STACK - GS_TO_REG %ecx - movl PT_GS(%esp), %edi # get the function address - movl PT_ORIG_EAX(%esp), %edx # get the error code - movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart - REG_TO_PTGS %ecx - SET_KERNEL_GS %ecx - movl $(__USER_DS), %ecx - movl %ecx, %ds - movl %ecx, %es - TRACE_IRQS_OFF - movl %esp,%eax # pt_regs pointer - call *%edi - jmp ret_from_exception -END(page_fault) - -/* - * Debug traps and NMI can happen at the one SYSENTER instruction - * that sets up the real kernel stack. 
Check here, since we can't - * allow the wrong stack to be used. - * - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have - * already pushed 3 words if it hits on the sysenter instruction: - * eflags, cs and eip. - * - * We just load the right stack, and push the three (known) values - * by hand onto the new stack - while updating the return eip past - * the instruction that would have done it for sysenter. - */ -.macro FIX_STACK offset ok label - cmpw $__KERNEL_CS, 4(%esp) - jne \ok -\label: - movl TSS_sysenter_sp0 + \offset(%esp), %esp - pushfl - pushl $__KERNEL_CS - pushl $sysenter_past_esp -.endm - -ENTRY(debug) - ASM_CLAC - cmpl $ia32_sysenter_target,(%esp) - jne debug_stack_correct - FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn -debug_stack_correct: - pushl $-1 # mark this as an int - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # error code 0 - movl %esp,%eax # pt_regs pointer - call do_debug - jmp ret_from_exception -END(debug) - -/* - * NMI is doubly nasty. It can happen _while_ we're handling - * a debug fault, and the debug fault hasn't yet been able to - * clear up the stack. So we first check whether we got an - * NMI on the sysenter entry path, but after that we need to - * check whether we got an NMI on the debug path where the debug - * fault happened on the sysenter path. - */ -ENTRY(nmi) - ASM_CLAC -#ifdef CONFIG_X86_ESPFIX32 - pushl %eax - movl %ss, %eax - cmpw $__ESPFIX_SS, %ax - popl %eax - je nmi_espfix_stack -#endif - cmpl $ia32_sysenter_target,(%esp) - je nmi_stack_fixup - pushl %eax - movl %esp,%eax - /* Do not access memory above the end of our stack page, - * it might not exist. - */ - andl $(THREAD_SIZE-1),%eax - cmpl $(THREAD_SIZE-20),%eax - popl %eax - jae nmi_stack_correct - cmpl $ia32_sysenter_target,12(%esp) - je nmi_debug_stack_check -nmi_stack_correct: - pushl %eax - SAVE_ALL - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_nmi - jmp restore_all_notrace - -nmi_stack_fixup: - FIX_STACK 12, nmi_stack_correct, 1 - jmp nmi_stack_correct - -nmi_debug_stack_check: - cmpw $__KERNEL_CS,16(%esp) - jne nmi_stack_correct - cmpl $debug,(%esp) - jb nmi_stack_correct - cmpl $debug_esp_fix_insn,(%esp) - ja nmi_stack_correct - FIX_STACK 24, nmi_stack_correct, 1 - jmp nmi_stack_correct - -#ifdef CONFIG_X86_ESPFIX32 -nmi_espfix_stack: - /* - * create the pointer to lss back - */ - pushl %ss - pushl %esp - addl $4, (%esp) - /* copy the iret frame of 12 bytes */ - .rept 3 - pushl 16(%esp) - .endr - pushl %eax - SAVE_ALL - FIXUP_ESPFIX_STACK # %eax == %esp - xorl %edx,%edx # zero error code - call do_nmi - RESTORE_REGS - lss 12+4(%esp), %esp # back to espfix stack - jmp irq_return -#endif -END(nmi) - -ENTRY(int3) - ASM_CLAC - pushl $-1 # mark this as an int - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_int3 - jmp ret_from_exception -END(int3) - -ENTRY(general_protection) - pushl $do_general_protection - jmp error_code -END(general_protection) - -#ifdef CONFIG_KVM_GUEST -ENTRY(async_page_fault) - ASM_CLAC - pushl $do_async_page_fault - jmp error_code -END(async_page_fault) -#endif - diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S deleted file mode 100644 index 4ad79e946f5a..000000000000 --- a/arch/x86/kernel/entry_64.S +++ /dev/null @@ -1,1442 +0,0 @@ -/* - * linux/arch/x86_64/entry.S - * - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs - * Copyright (C) 2000 Pavel Machek - */ - -/* - * 
entry.S contains the system-call and fault low-level handling routines. - * - * Some of this is documented in Documentation/x86/entry_64.txt - * - * NOTE: This code handles signal-recognition, which happens every time - * after an interrupt and after each system call. - * - * A note on terminology: - * - iret frame: Architecture defined interrupt frame from SS to RIP - * at the top of the kernel process stack. - * - * Some macro usage: - * - ENTRY/END Define functions in the symbol table. - * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. - * - idtentry - Define exception entry points. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Avoid __ASSEMBLER__'ifying just for this. */ -#include -#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_64BIT 0x80000000 -#define __AUDIT_ARCH_LE 0x40000000 - - .code64 - .section .entry.text, "ax" - - -#ifdef CONFIG_PARAVIRT -ENTRY(native_usergs_sysret64) - swapgs - sysretq -ENDPROC(native_usergs_sysret64) -#endif /* CONFIG_PARAVIRT */ - - -.macro TRACE_IRQS_IRETQ -#ifdef CONFIG_TRACE_IRQFLAGS - bt $9,EFLAGS(%rsp) /* interrupts off? */ - jnc 1f - TRACE_IRQS_ON -1: -#endif -.endm - -/* - * When dynamic function tracer is enabled it will add a breakpoint - * to all locations that it is about to modify, sync CPUs, update - * all the code, sync CPUs, then remove the breakpoints. In this time - * if lockdep is enabled, it might jump back into the debug handler - * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF). - * - * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to - * make sure the stack pointer does not get reset back to the top - * of the debug stack, and instead just reuses the current stack. - */ -#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS) - -.macro TRACE_IRQS_OFF_DEBUG - call debug_stack_set_zero - TRACE_IRQS_OFF - call debug_stack_reset -.endm - -.macro TRACE_IRQS_ON_DEBUG - call debug_stack_set_zero - TRACE_IRQS_ON - call debug_stack_reset -.endm - -.macro TRACE_IRQS_IRETQ_DEBUG - bt $9,EFLAGS(%rsp) /* interrupts off? */ - jnc 1f - TRACE_IRQS_ON_DEBUG -1: -.endm - -#else -# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF -# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON -# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ -#endif - -/* - * 64bit SYSCALL instruction entry. Up to 6 arguments in registers. - * - * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, - * then loads new ss, cs, and rip from previously programmed MSRs. - * rflags gets masked by a value from another MSR (so CLD and CLAC - * are not needed). SYSCALL does not save anything on the stack - * and does not change rsp. - * - * Registers on entry: - * rax system call number - * rcx return address - * r11 saved rflags (note: r11 is callee-clobbered register in C ABI) - * rdi arg0 - * rsi arg1 - * rdx arg2 - * r10 arg3 (needs to be moved to rcx to conform to C ABI) - * r8 arg4 - * r9 arg5 - * (note: r12-r15,rbp,rbx are callee-preserved in C ABI) - * - * Only called from user space. - * - * When user can change pt_regs->foo always force IRET. That is because - * it deals with uncanonical addresses better. SYSRET has trouble - * with them due to bugs in both AMD and Intel CPUs. - */ - -ENTRY(system_call) - /* - * Interrupts are off on entry. 
- * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - SWAPGS_UNSAFE_STACK - /* - * A hypervisor implementation might want to use a label - * after the swapgs, so that it can do the swapgs - * for the guest and jump here on syscall. - */ -GLOBAL(system_call_after_swapgs) - - movq %rsp,PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp - - /* Construct struct pt_regs on stack */ - pushq $__USER_DS /* pt_regs->ss */ - pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ - /* - * Re-enable interrupts. - * We use 'rsp_scratch' as a scratch space, hence irq-off block above - * must execute atomically in the face of possible interrupt-driven - * task preemption. We must enable interrupts only after we're done - * with using rsp_scratch: - */ - ENABLE_INTERRUPTS(CLBR_NONE) - pushq %r11 /* pt_regs->flags */ - pushq $__USER_CS /* pt_regs->cs */ - pushq %rcx /* pt_regs->ip */ - pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx */ - pushq $-ENOSYS /* pt_regs->ax */ - pushq %r8 /* pt_regs->r8 */ - pushq %r9 /* pt_regs->r9 */ - pushq %r10 /* pt_regs->r10 */ - pushq %r11 /* pt_regs->r11 */ - sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ - - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz tracesys -system_call_fastpath: -#if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max,%rax -#else - andl $__SYSCALL_MASK,%eax - cmpl $__NR_syscall_max,%eax -#endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ - movq %r10,%rcx - call *sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) -1: -/* - * Syscall return path ending with SYSRET (fast path). - * Has incompletely filled pt_regs. - */ - LOCKDEP_SYS_EXIT - /* - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - DISABLE_INTERRUPTS(CLBR_NONE) - - /* - * We must check ti flags with interrupts (or at least preemption) - * off because we must *never* return to userspace without - * processing exit work that is enqueued if we're preempted here. - * In particular, returning to userspace with any of the one-shot - * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is - * very bad. - */ - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ - - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RIP(%rsp),%rcx - movq EFLAGS(%rsp),%r11 - movq RSP(%rsp),%rsp - /* - * 64bit SYSRET restores rip from rcx, - * rflags from r11 (but RF and VM bits are forced to 0), - * cs and ss are loaded from MSRs. - * Restoration of rflags re-enables interrupts. - * - * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss - * descriptor is not reinitialized. This means that we should - * avoid SYSRET with SS == NULL, which could happen if we schedule, - * exit the kernel, and re-enter using an interrupt vector. (All - * interrupt entries on x86_64 set SS to NULL.) We prevent that - * from happening by reloading SS in __switch_to. (Actually - * detecting the failure in 64-bit userspace is tricky but can be - * done.) 
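For orientation, the pushes at the top of system_call assemble the pt_regs image sketched below as a C struct whose declaration order equals ascending stack addresses (a model, not the kernel's exact declaration). The final sub $(6*8),%rsp simply leaves the first six fields unwritten on the fast path:

    struct pt_regs_model_64 {
        unsigned long r15, r14, r13, r12, bp, bx;  /* not saved on fast path  */
        unsigned long r11, r10, r9, r8;
        unsigned long ax;                          /* pre-filled with -ENOSYS */
        unsigned long cx, dx, si, di;
        unsigned long orig_ax;                     /* syscall number          */
        unsigned long ip, cs, flags, sp, ss;       /* hardware-style frame    */
    };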
- */ - USERGS_SYSRET64 - - /* Do syscall entry tracing */ -tracesys: - movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - call syscall_trace_enter_phase1 - test %rax, %rax - jnz tracesys_phase2 /* if needed, run the slow path */ - RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ - movq ORIG_RAX(%rsp), %rax - jmp system_call_fastpath /* and return to the fast path */ - -tracesys_phase2: - SAVE_EXTRA_REGS - movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - movq %rax,%rdx - call syscall_trace_enter_phase2 - - /* - * Reload registers from stack in case ptrace changed them. - * We don't reload %rax because syscall_trace_entry_phase2() returned - * the value it wants us to use in the table lookup. - */ - RESTORE_C_REGS_EXCEPT_RAX - RESTORE_EXTRA_REGS -#if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max,%rax -#else - andl $__SYSCALL_MASK,%eax - cmpl $__NR_syscall_max,%eax -#endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ - movq %r10,%rcx /* fixup for C */ - call *sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) -1: - /* Use IRET because user could have changed pt_regs->foo */ - -/* - * Syscall return path ending with IRET. - * Has correct iret frame. - */ -GLOBAL(int_ret_from_sys_call) - DISABLE_INTERRUPTS(CLBR_NONE) -int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */ - TRACE_IRQS_OFF - movl $_TIF_ALLWORK_MASK,%edi - /* edi: mask to check */ -GLOBAL(int_with_check) - LOCKDEP_SYS_EXIT_IRQ - GET_THREAD_INFO(%rcx) - movl TI_flags(%rcx),%edx - andl %edi,%edx - jnz int_careful - andl $~TS_COMPAT,TI_status(%rcx) - jmp syscall_return - - /* Either reschedule or signal or syscall exit tracking needed. */ - /* First do a reschedule test. */ - /* edx: work, edi: workmask */ -int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - SCHEDULE_USER - popq %rdi - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp int_with_check - - /* handle signals and tracing -- both require a full pt_regs */ -int_very_careful: - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_EXTRA_REGS - /* Check for syscall exit trace */ - testl $_TIF_WORK_SYSCALL_EXIT,%edx - jz int_signal - pushq %rdi - leaq 8(%rsp),%rdi # &ptregs -> arg1 - call syscall_trace_leave - popq %rdi - andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi - jmp int_restore_rest - -int_signal: - testl $_TIF_DO_NOTIFY_MASK,%edx - jz 1f - movq %rsp,%rdi # &ptregs -> arg1 - xorl %esi,%esi # oldset -> arg2 - call do_notify_resume -1: movl $_TIF_WORK_MASK,%edi -int_restore_rest: - RESTORE_EXTRA_REGS - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp int_with_check - -syscall_return: - /* The IRETQ could re-enable interrupts: */ - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_IRETQ - - /* - * Try to use SYSRET instead of IRET if we're returning to - * a completely clean 64-bit userspace context. - */ - movq RCX(%rsp),%rcx - movq RIP(%rsp),%r11 - cmpq %rcx,%r11 /* RCX == RIP */ - jne opportunistic_sysret_failed - - /* - * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP - * in kernel space. This essentially lets the user take over - * the kernel, since userspace controls RSP. - * - * If width of "canonical tail" ever becomes variable, this will need - * to be updated to remain correct on both old and new CPUs. 
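The shl/sar pair implementing this check shifts bit 47 up to the sign bit and arithmetic-shifts it back, sign-extending it through the upper bits; the value survives unchanged only if the address was canonical to begin with. The same test as a small C function, with 47 standing in for __VIRTUAL_MASK_SHIFT (the arithmetic right shift on a signed type matches what sar does on every mainstream compiler):

    #include <stdint.h>

    static int is_canonical_47(uint64_t addr)
    {
        int shift = 64 - (47 + 1);
        return (uint64_t)((int64_t)(addr << shift) >> shift) == addr;
    }
    /* is_canonical_47(0x00007fffffffffffULL) == 1
     * is_canonical_47(0x0000800000000000ULL) == 0 */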
- */ - .ifne __VIRTUAL_MASK_SHIFT - 47 - .error "virtual address width changed -- SYSRET checks need update" - .endif - /* Change top 16 bits to be the sign-extension of 47th bit */ - shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx - sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx - /* If this changed %rcx, it was not canonical */ - cmpq %rcx, %r11 - jne opportunistic_sysret_failed - - cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ - jne opportunistic_sysret_failed - - movq R11(%rsp),%r11 - cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */ - jne opportunistic_sysret_failed - - /* - * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, - * restoring TF results in a trap from userspace immediately after - * SYSRET. This would cause an infinite loop whenever #DB happens - * with register state that satisfies the opportunistic SYSRET - * conditions. For example, single-stepping this user code: - * - * movq $stuck_here,%rcx - * pushfq - * popq %r11 - * stuck_here: - * - * would never get past 'stuck_here'. - */ - testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 - jnz opportunistic_sysret_failed - - /* nothing to check for RSP */ - - cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */ - jne opportunistic_sysret_failed - - /* - * We win! This label is here just for ease of understanding - * perf profiles. Nothing jumps here. - */ -syscall_return_via_sysret: - /* rcx and r11 are already restored (see code above) */ - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RSP(%rsp),%rsp - USERGS_SYSRET64 - -opportunistic_sysret_failed: - SWAPGS - jmp restore_c_regs_and_iret -END(system_call) - - - .macro FORK_LIKE func -ENTRY(stub_\func) - SAVE_EXTRA_REGS 8 - jmp sys_\func -END(stub_\func) - .endm - - FORK_LIKE clone - FORK_LIKE fork - FORK_LIKE vfork - -ENTRY(stub_execve) - call sys_execve -return_from_execve: - testl %eax, %eax - jz 1f - /* exec failed, can use fast SYSRET code path in this case */ - ret -1: - /* must use IRET code path (pt_regs->cs may have changed) */ - addq $8, %rsp - ZERO_EXTRA_REGS - movq %rax,RAX(%rsp) - jmp int_ret_from_sys_call -END(stub_execve) -/* - * Remaining execve stubs are only 7 bytes long. - * ENTRY() often aligns to 16 bytes, which in this case has no benefits. - */ - .align 8 -GLOBAL(stub_execveat) - call sys_execveat - jmp return_from_execve -END(stub_execveat) - -#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) - .align 8 -GLOBAL(stub_x32_execve) -GLOBAL(stub32_execve) - call compat_sys_execve - jmp return_from_execve -END(stub32_execve) -END(stub_x32_execve) - .align 8 -GLOBAL(stub_x32_execveat) -GLOBAL(stub32_execveat) - call compat_sys_execveat - jmp return_from_execve -END(stub32_execveat) -END(stub_x32_execveat) -#endif - -/* - * sigreturn is special because it needs to restore all registers on return. - * This cannot be done with SYSRET, so use the IRET return path instead. - */ -ENTRY(stub_rt_sigreturn) - /* - * SAVE_EXTRA_REGS result is not normally needed: - * sigreturn overwrites all pt_regs->GPREGS. - * But sigreturn can fail (!), and there is no easy way to detect that. - * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error, - * we SAVE_EXTRA_REGS here. 
- */ - SAVE_EXTRA_REGS 8 - call sys_rt_sigreturn -return_from_stub: - addq $8, %rsp - RESTORE_EXTRA_REGS - movq %rax,RAX(%rsp) - jmp int_ret_from_sys_call -END(stub_rt_sigreturn) - -#ifdef CONFIG_X86_X32_ABI -ENTRY(stub_x32_rt_sigreturn) - SAVE_EXTRA_REGS 8 - call sys32_x32_rt_sigreturn - jmp return_from_stub -END(stub_x32_rt_sigreturn) -#endif - -/* - * A newly forked process directly context switches into this address. - * - * rdi: prev task we switched from - */ -ENTRY(ret_from_fork) - - LOCK ; btr $TIF_FORK,TI_flags(%r8) - - pushq $0x0002 - popfq # reset kernel eflags - - call schedule_tail # rdi: 'prev' task parameter - - RESTORE_EXTRA_REGS - - testb $3, CS(%rsp) # from kernel_thread? - - /* - * By the time we get here, we have no idea whether our pt_regs, - * ti flags, and ti status came from the 64-bit SYSCALL fast path, - * the slow path, or one of the ia32entry paths. - * Use IRET code path to return, since it can safely handle - * all of the above. - */ - jnz int_ret_from_sys_call - - /* We came from kernel_thread */ - /* nb: we depend on RESTORE_EXTRA_REGS above */ - movq %rbp, %rdi - call *%rbx - movl $0, RAX(%rsp) - RESTORE_EXTRA_REGS - jmp int_ret_from_sys_call -END(ret_from_fork) - -/* - * Build the entry stubs with some assembler magic. - * We pack 1 stub into every 8-byte block. - */ - .align 8 -ENTRY(irq_entries_start) - vector=FIRST_EXTERNAL_VECTOR - .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushq $(~vector+0x80) /* Note: always in signed byte range */ - vector=vector+1 - jmp common_interrupt - .align 8 - .endr -END(irq_entries_start) - -/* - * Interrupt entry/exit. - * - * Interrupt entry points save only callee clobbered registers in fast path. - * - * Entry runs with interrupts off. - */ - -/* 0(%rsp): ~(interrupt number) */ - .macro interrupt func - cld - /* - * Since nothing in interrupt handling code touches r12...r15 members - * of "struct pt_regs", and since interrupts can nest, we can save - * four stack slots and simultaneously provide - * an unwind-friendly stack layout by saving "truncated" pt_regs - * exactly up to rbp slot, without these members. - */ - ALLOC_PT_GPREGS_ON_STACK -RBP - SAVE_C_REGS -RBP - /* this goes to 0(%rsp) for unwinder, not for saving the value: */ - SAVE_EXTRA_REGS_RBP -RBP - - leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ - - testb $3, CS-RBP(%rsp) - jz 1f - SWAPGS -1: - /* - * Save previous stack pointer, optionally switch to interrupt stack. - * irq_count is used to check if a CPU is already on an interrupt stack - * or not. While this is essentially redundant with preempt_count it is - * a little cheaper to use a separate counter in the PDA (short of - * moving irq_enter into assembly, which would be too much work) - */ - movq %rsp, %rsi - incl PER_CPU_VAR(irq_count) - cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp - pushq %rsi - /* We entered an interrupt context - irqs are off: */ - TRACE_IRQS_OFF - - call \func - .endm - - /* - * The interrupt stubs push (~vector+0x80) onto the stack and - * then jump to common_interrupt. 
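The (~vector + 0x80) encoding is worth unpacking: for every vector in [0x20, 0xff] it produces a value in [-128, 95], so each stub's push fits a sign-extended byte and the stubs stay small enough to pack on 8-byte boundaries; common_interrupt's addq $-0x80 then recovers ~vector, which the C handler complements back into the vector number. A quick C check of both properties, assuming 0x20 for FIRST_EXTERNAL_VECTOR:

    #include <assert.h>

    int main(void)
    {
        for (int vector = 0x20; vector <= 0xff; vector++) {
            int pushed = ~vector + 0x80;              /* what the stub pushes */
            assert(pushed >= -128 && pushed <= 127);  /* fits a signed byte   */

            int orig_ax = pushed - 0x80;              /* addq $-0x80,(%rsp)   */
            assert(~orig_ax == vector);               /* do_IRQ recovers it   */
        }
        return 0;
    }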
- */ - .p2align CONFIG_X86_L1_CACHE_SHIFT -common_interrupt: - ASM_CLAC - addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ - interrupt do_IRQ - /* 0(%rsp): old RSP */ -ret_from_intr: - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - decl PER_CPU_VAR(irq_count) - - /* Restore saved previous stack */ - popq %rsi - /* return code expects complete pt_regs - adjust rsp accordingly: */ - leaq -RBP(%rsi),%rsp - - testb $3, CS(%rsp) - jz retint_kernel - /* Interrupt came from user space */ -retint_user: - GET_THREAD_INFO(%rcx) - /* - * %rcx: thread info. Interrupts off. - */ -retint_with_reschedule: - movl $_TIF_WORK_MASK,%edi -retint_check: - LOCKDEP_SYS_EXIT_IRQ - movl TI_flags(%rcx),%edx - andl %edi,%edx - jnz retint_careful - -retint_swapgs: /* return to user-space */ - /* - * The iretq could re-enable interrupts: - */ - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_IRETQ - - SWAPGS - jmp restore_c_regs_and_iret - -/* Returning to kernel space */ -retint_kernel: -#ifdef CONFIG_PREEMPT - /* Interrupts are off */ - /* Check if we need preemption */ - bt $9,EFLAGS(%rsp) /* interrupts were off? */ - jnc 1f -0: cmpl $0,PER_CPU_VAR(__preempt_count) - jnz 1f - call preempt_schedule_irq - jmp 0b -1: -#endif - /* - * The iretq could re-enable interrupts: - */ - TRACE_IRQS_IRETQ - -/* - * At this label, code paths which return to kernel and to user, - * which come from interrupts/exception and from syscalls, merge. - */ -restore_c_regs_and_iret: - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 - -irq_return: - INTERRUPT_RETURN - -ENTRY(native_iret) - /* - * Are we returning to a stack segment from the LDT? Note: in - * 64-bit mode SS:RSP on the exception stack is always valid. - */ -#ifdef CONFIG_X86_ESPFIX64 - testb $4,(SS-RIP)(%rsp) - jnz native_irq_return_ldt -#endif - -.global native_irq_return_iret -native_irq_return_iret: - /* - * This may fault. Non-paranoid faults on return to userspace are - * handled by fixup_bad_iret. These include #SS, #GP, and #NP. - * Double-faults due to espfix64 are handled in do_double_fault. - * Other faults here are fatal. - */ - iretq - -#ifdef CONFIG_X86_ESPFIX64 -native_irq_return_ldt: - pushq %rax - pushq %rdi - SWAPGS - movq PER_CPU_VAR(espfix_waddr),%rdi - movq %rax,(0*8)(%rdi) /* RAX */ - movq (2*8)(%rsp),%rax /* RIP */ - movq %rax,(1*8)(%rdi) - movq (3*8)(%rsp),%rax /* CS */ - movq %rax,(2*8)(%rdi) - movq (4*8)(%rsp),%rax /* RFLAGS */ - movq %rax,(3*8)(%rdi) - movq (6*8)(%rsp),%rax /* SS */ - movq %rax,(5*8)(%rdi) - movq (5*8)(%rsp),%rax /* RSP */ - movq %rax,(4*8)(%rdi) - andl $0xffff0000,%eax - popq %rdi - orq PER_CPU_VAR(espfix_stack),%rax - SWAPGS - movq %rax,%rsp - popq %rax - jmp native_irq_return_iret -#endif - - /* edi: workmask, edx: work */ -retint_careful: - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - SCHEDULE_USER - popq %rdi - GET_THREAD_INFO(%rcx) - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp retint_check - -retint_signal: - testl $_TIF_DO_NOTIFY_MASK,%edx - jz retint_swapgs - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_EXTRA_REGS - movq $-1,ORIG_RAX(%rsp) - xorl %esi,%esi # oldset - movq %rsp,%rdi # &pt_regs - call do_notify_resume - RESTORE_EXTRA_REGS - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - GET_THREAD_INFO(%rcx) - jmp retint_with_reschedule - -END(common_interrupt) - -/* - * APIC interrupts. 
- */ -.macro apicinterrupt3 num sym do_sym -ENTRY(\sym) - ASM_CLAC - pushq $~(\num) -.Lcommon_\sym: - interrupt \do_sym - jmp ret_from_intr -END(\sym) -.endm - -#ifdef CONFIG_TRACING -#define trace(sym) trace_##sym -#define smp_trace(sym) smp_trace_##sym - -.macro trace_apicinterrupt num sym -apicinterrupt3 \num trace(\sym) smp_trace(\sym) -.endm -#else -.macro trace_apicinterrupt num sym do_sym -.endm -#endif - -.macro apicinterrupt num sym do_sym -apicinterrupt3 \num \sym \do_sym -trace_apicinterrupt \num \sym -.endm - -#ifdef CONFIG_SMP -apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR \ - irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt -apicinterrupt3 REBOOT_VECTOR \ - reboot_interrupt smp_reboot_interrupt -#endif - -#ifdef CONFIG_X86_UV -apicinterrupt3 UV_BAU_MESSAGE \ - uv_bau_message_intr1 uv_bau_message_interrupt -#endif -apicinterrupt LOCAL_TIMER_VECTOR \ - apic_timer_interrupt smp_apic_timer_interrupt -apicinterrupt X86_PLATFORM_IPI_VECTOR \ - x86_platform_ipi smp_x86_platform_ipi - -#ifdef CONFIG_HAVE_KVM -apicinterrupt3 POSTED_INTR_VECTOR \ - kvm_posted_intr_ipi smp_kvm_posted_intr_ipi -#endif - -#ifdef CONFIG_X86_MCE_THRESHOLD -apicinterrupt THRESHOLD_APIC_VECTOR \ - threshold_interrupt smp_threshold_interrupt -#endif - -#ifdef CONFIG_X86_THERMAL_VECTOR -apicinterrupt THERMAL_APIC_VECTOR \ - thermal_interrupt smp_thermal_interrupt -#endif - -#ifdef CONFIG_SMP -apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ - call_function_single_interrupt smp_call_function_single_interrupt -apicinterrupt CALL_FUNCTION_VECTOR \ - call_function_interrupt smp_call_function_interrupt -apicinterrupt RESCHEDULE_VECTOR \ - reschedule_interrupt smp_reschedule_interrupt -#endif - -apicinterrupt ERROR_APIC_VECTOR \ - error_interrupt smp_error_interrupt -apicinterrupt SPURIOUS_APIC_VECTOR \ - spurious_interrupt smp_spurious_interrupt - -#ifdef CONFIG_IRQ_WORK -apicinterrupt IRQ_WORK_VECTOR \ - irq_work_interrupt smp_irq_work_interrupt -#endif - -/* - * Exception entry points. - */ -#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) - -.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 -ENTRY(\sym) - /* Sanity check */ - .if \shift_ist != -1 && \paranoid == 0 - .error "using shift_ist requires paranoid=1" - .endif - - ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - - .ifeq \has_error_code - pushq $-1 /* ORIG_RAX: no syscall to restart */ - .endif - - ALLOC_PT_GPREGS_ON_STACK - - .if \paranoid - .if \paranoid == 1 - testb $3, CS(%rsp) /* If coming from userspace, switch */ - jnz 1f /* stacks. */ - .endif - call paranoid_entry - .else - call error_entry - .endif - /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ - - .if \paranoid - .if \shift_ist != -1 - TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ - .else - TRACE_IRQS_OFF - .endif - .endif - - movq %rsp,%rdi /* pt_regs pointer */ - - .if \has_error_code - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ - .else - xorl %esi,%esi /* no error code */ - .endif - - .if \shift_ist != -1 - subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) - .endif - - call \do_sym - - .if \shift_ist != -1 - addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) - .endif - - /* these procedures expect "no swapgs" flag in ebx */ - .if \paranoid - jmp paranoid_exit - .else - jmp error_exit - .endif - - .if \paranoid == 1 - /* - * Paranoid entry from userspace. Switch stacks and treat it - * as a normal entry. 
This means that paranoid handlers - * run in real process context if user_mode(regs). - */ -1: - call error_entry - - - movq %rsp,%rdi /* pt_regs pointer */ - call sync_regs - movq %rax,%rsp /* switch stack */ - - movq %rsp,%rdi /* pt_regs pointer */ - - .if \has_error_code - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ - .else - xorl %esi,%esi /* no error code */ - .endif - - call \do_sym - - jmp error_exit /* %ebx: no swapgs flag */ - .endif -END(\sym) -.endm - -#ifdef CONFIG_TRACING -.macro trace_idtentry sym do_sym has_error_code:req -idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code -idtentry \sym \do_sym has_error_code=\has_error_code -.endm -#else -.macro trace_idtentry sym do_sym has_error_code:req -idtentry \sym \do_sym has_error_code=\has_error_code -.endm -#endif - -idtentry divide_error do_divide_error has_error_code=0 -idtentry overflow do_overflow has_error_code=0 -idtentry bounds do_bounds has_error_code=0 -idtentry invalid_op do_invalid_op has_error_code=0 -idtentry device_not_available do_device_not_available has_error_code=0 -idtentry double_fault do_double_fault has_error_code=1 paranoid=2 -idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 -idtentry invalid_TSS do_invalid_TSS has_error_code=1 -idtentry segment_not_present do_segment_not_present has_error_code=1 -idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 -idtentry coprocessor_error do_coprocessor_error has_error_code=0 -idtentry alignment_check do_alignment_check has_error_code=1 -idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 - - - /* Reload gs selector with exception handling */ - /* edi: new selector */ -ENTRY(native_load_gs_index) - pushfq - DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) - SWAPGS -gs_change: - movl %edi,%gs -2: mfence /* workaround */ - SWAPGS - popfq - ret -END(native_load_gs_index) - - _ASM_EXTABLE(gs_change,bad_gs) - .section .fixup,"ax" - /* running with kernelgs */ -bad_gs: - SWAPGS /* switch back to user gs */ - xorl %eax,%eax - movl %eax,%gs - jmp 2b - .previous - -/* Call softirq on interrupt stack. Interrupts are off. */ -ENTRY(do_softirq_own_stack) - pushq %rbp - mov %rsp,%rbp - incl PER_CPU_VAR(irq_count) - cmove PER_CPU_VAR(irq_stack_ptr),%rsp - push %rbp # backlink for old unwinder - call __do_softirq - leaveq - decl PER_CPU_VAR(irq_count) - ret -END(do_softirq_own_stack) - -#ifdef CONFIG_XEN -idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 - -/* - * A note on the "critical region" in our callback handler. - * We want to avoid stacking callback handlers due to events occurring - * during handling of the last event. To do this, we keep events disabled - * until we've done all processing. HOWEVER, we must enable events before - * popping the stack frame (can't be done atomically) and so it would still - * be possible to get enough handler activations to overflow the stack. - * Although unlikely, bugs of that kind are hard to track down, so we'd - * like to avoid the possibility. - * So, on entry to the handler we detect whether we interrupted an - * existing activation in its critical region -- if so, we pop the current - * activation and restart the handler using the previous one. 
- */
-ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct pt_regs *)
-/*
- * Since we don't modify %rdi, xen_evtchn_do_upcall(struct pt_regs *) will
- * see the correct pointer to the pt_regs
- */
- movq %rdi, %rsp # we don't return, adjust the stack frame
-11: incl PER_CPU_VAR(irq_count)
- movq %rsp,%rbp
- cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
- pushq %rbp # backlink for old unwinder
- call xen_evtchn_do_upcall
- popq %rsp
- decl PER_CPU_VAR(irq_count)
-#ifndef CONFIG_PREEMPT
- call xen_maybe_preempt_hcall
-#endif
- jmp error_exit
-END(xen_do_hypervisor_callback)
-
-/*
- * Hypervisor uses this for application faults while it executes.
- * We get here for two reasons:
- *  1. Fault while reloading DS, ES, FS or GS
- *  2. Fault while executing IRET
- * Category 1 we do not need to fix up as Xen has already reloaded all segment
- * registers that could be reloaded and zeroed the others.
- * Category 2 we fix up by killing the current process. We cannot use the
- * normal Linux return path in this case because if we use the IRET hypercall
- * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
- * We distinguish between categories by comparing each saved segment register
- * with its current contents: any discrepancy means we are in category 1.
- */
-ENTRY(xen_failsafe_callback)
- movl %ds,%ecx
- cmpw %cx,0x10(%rsp)
- jne 1f
- movl %es,%ecx
- cmpw %cx,0x18(%rsp)
- jne 1f
- movl %fs,%ecx
- cmpw %cx,0x20(%rsp)
- jne 1f
- movl %gs,%ecx
- cmpw %cx,0x28(%rsp)
- jne 1f
- /* All segments match their saved values => Category 2 (Bad IRET). */
- movq (%rsp),%rcx
- movq 8(%rsp),%r11
- addq $0x30,%rsp
- pushq $0 /* RIP */
- pushq %r11
- pushq %rcx
- jmp general_protection
-1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
- movq (%rsp),%rcx
- movq 8(%rsp),%r11
- addq $0x30,%rsp
- pushq $-1 /* orig_ax = -1 => not a system call */
- ALLOC_PT_GPREGS_ON_STACK
- SAVE_C_REGS
- SAVE_EXTRA_REGS
- jmp error_exit
-END(xen_failsafe_callback)
-
-apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
- xen_hvm_callback_vector xen_evtchn_do_upcall
-
-#endif /* CONFIG_XEN */
-
-#if IS_ENABLED(CONFIG_HYPERV)
-apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
- hyperv_callback_vector hyperv_vector_handler
-#endif /* CONFIG_HYPERV */
-
-idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
-idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
-idtentry stack_segment do_stack_segment has_error_code=1
-#ifdef CONFIG_XEN
-idtentry xen_debug do_debug has_error_code=0
-idtentry xen_int3 do_int3 has_error_code=0
-idtentry xen_stack_segment do_stack_segment has_error_code=1
-#endif
-idtentry general_protection do_general_protection has_error_code=1
-trace_idtentry page_fault do_page_fault has_error_code=1
-#ifdef CONFIG_KVM_GUEST
-idtentry async_page_fault do_async_page_fault has_error_code=1
-#endif
-#ifdef CONFIG_X86_MCE
-idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
-#endif
-
-/*
- * Save all registers in pt_regs, and switch gs if needed.
- * Use slow, but surefire "are we in kernel?" check.
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
- */
-ENTRY(paranoid_entry)
- cld
- SAVE_C_REGS 8
- SAVE_EXTRA_REGS 8
- movl $1,%ebx
- movl $MSR_GS_BASE,%ecx
- rdmsr
- testl %edx,%edx
- js 1f /* negative -> in kernel */
- SWAPGS
- xorl %ebx,%ebx
-1: ret
-END(paranoid_entry)
-
-/*
- * "Paranoid" exit path from exception stack. This is invoked
- * only on return from non-NMI IST interrupts that came
- * from kernel space.
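The sign test in paranoid_entry above works because a kernel GSBASE is a percpu pointer in the upper half of the address space, so bit 63 (and hence bit 31 of the MSR's high word in %edx) is set. A C sketch of the same heuristic, assuming x86-64 canonical addresses:

    #include <stdbool.h>
    #include <stdint.h>

    /* sketch only; the kernel does this with rdmsr + "js 1f" */
    static bool gsbase_is_kernel(uint64_t gsbase)
    {
        uint32_t high = (uint32_t)(gsbase >> 32);  /* %edx after rdmsr */

        return (int32_t)high < 0;  /* negative -> kernel half */
    }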
- *
- * We may be returning to very strange contexts (e.g. very early
- * in syscall entry), so checking for preemption here would
- * be complicated. Fortunately, there's no good reason
- * to try to handle preemption here.
- */
-/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
-ENTRY(paranoid_exit)
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF_DEBUG
- testl %ebx,%ebx /* swapgs needed? */
- jnz paranoid_exit_no_swapgs
- TRACE_IRQS_IRETQ
- SWAPGS_UNSAFE_STACK
- jmp paranoid_exit_restore
-paranoid_exit_no_swapgs:
- TRACE_IRQS_IRETQ_DEBUG
-paranoid_exit_restore:
- RESTORE_EXTRA_REGS
- RESTORE_C_REGS
- REMOVE_PT_GPREGS_FROM_STACK 8
- INTERRUPT_RETURN
-END(paranoid_exit)
-
-/*
- * Save all registers in pt_regs, and switch gs if needed.
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
- */
-ENTRY(error_entry)
- cld
- SAVE_C_REGS 8
- SAVE_EXTRA_REGS 8
- xorl %ebx,%ebx
- testb $3, CS+8(%rsp)
- jz error_kernelspace
-error_swapgs:
- SWAPGS
-error_sti:
- TRACE_IRQS_OFF
- ret
-
- /*
- * There are two places in the kernel that can potentially fault with
- * usergs. Handle them here. B stepping K8s sometimes report a
- * truncated RIP for IRET exceptions returning to compat mode. Check
- * for these here too.
- */
-error_kernelspace:
- incl %ebx
- leaq native_irq_return_iret(%rip),%rcx
- cmpq %rcx,RIP+8(%rsp)
- je error_bad_iret
- movl %ecx,%eax /* zero extend */
- cmpq %rax,RIP+8(%rsp)
- je bstep_iret
- cmpq $gs_change,RIP+8(%rsp)
- je error_swapgs
- jmp error_sti
-
-bstep_iret:
- /* Fix truncated RIP */
- movq %rcx,RIP+8(%rsp)
- /* fall through */
-
-error_bad_iret:
- SWAPGS
- mov %rsp,%rdi
- call fixup_bad_iret
- mov %rax,%rsp
- decl %ebx /* Return to usergs */
- jmp error_sti
-END(error_entry)
-
-
-/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
-ENTRY(error_exit)
- movl %ebx,%eax
- RESTORE_EXTRA_REGS
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- testl %eax,%eax
- jnz retint_kernel
- jmp retint_user
-END(error_exit)
-
-/* Runs on exception stack */
-ENTRY(nmi)
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- /*
- * We allow breakpoints in NMIs. If a breakpoint occurs, then
- * the iretq it performs will take us out of NMI context.
- * This means that we can have nested NMIs where the next
- * NMI is using the top of the stack of the previous NMI. We
- * can't let it execute because the nested NMI will corrupt the
- * stack of the previous NMI. NMI handlers are not re-entrant
- * anyway.
- *
- * To handle this case we do the following:
- * Check a special location on the stack that contains
- * a variable that is set when NMIs are executing.
- * The interrupted task's stack is also checked to see if it
- * is an NMI stack.
- * If the variable is not set and the stack is not the NMI
- * stack then:
- * o Set the special variable on the stack
- * o Copy the interrupt frame into a "saved" location on the stack
- * o Copy the interrupt frame into a "copy" location on the stack
- * o Continue processing the NMI
- * If the variable is set or the previous stack is the NMI stack:
- * o Modify the "copy" location to jump to repeat_nmi
- * o return back to the first NMI
- *
- * Now on exit of the first NMI, we first clear the stack variable.
- * The NMI stack will tell any nested NMIs at that point that it is
- * nested. Then we pop the stack normally with iret, and if there was
- * a nested NMI that updated the copy interrupt stack frame, a
- * jump will be made to the repeat_nmi code that will handle the second
- * NMI.
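The nesting test this comment describes reduces to two checks: the on-stack "NMI executing" variable, and whether the interrupted stack pointer already lies inside this CPU's NMI stack. A minimal C model, with EXCEPTION_STKSZ assumed to be 4 KiB (it is configuration-dependent):

    #include <stdbool.h>
    #include <stdint.h>

    #define EXCEPTION_STKSZ 4096UL  /* assumption; config-dependent */

    /* model of the cmpq/ja/jb sequence below, not kernel code */
    static bool nmi_is_nested(bool nmi_executing, uint64_t interrupted_rsp,
                              uint64_t nmi_stack_top)
    {
        if (nmi_executing)
            return true;

        /* stacks grow down: [top - EXCEPTION_STKSZ, top] is the NMI stack */
        return interrupted_rsp <= nmi_stack_top &&
               interrupted_rsp >= nmi_stack_top - EXCEPTION_STKSZ;
    }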
- */ - - /* Use %rdx as our temp variable throughout */ - pushq %rdx - - /* - * If %cs was not the kernel segment, then the NMI triggered in user - * space, which means it is definitely not nested. - */ - cmpl $__KERNEL_CS, 16(%rsp) - jne first_nmi - - /* - * Check the special variable on the stack to see if NMIs are - * executing. - */ - cmpl $1, -8(%rsp) - je nested_nmi - - /* - * Now test if the previous stack was an NMI stack. - * We need the double check. We check the NMI stack to satisfy the - * race when the first NMI clears the variable before returning. - * We check the variable because the first NMI could be in a - * breakpoint routine using a breakpoint stack. - */ - lea 6*8(%rsp), %rdx - /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ - cmpq %rdx, 4*8(%rsp) - /* If the stack pointer is above the NMI stack, this is a normal NMI */ - ja first_nmi - subq $EXCEPTION_STKSZ, %rdx - cmpq %rdx, 4*8(%rsp) - /* If it is below the NMI stack, it is a normal NMI */ - jb first_nmi - /* Ah, it is within the NMI stack, treat it as nested */ - -nested_nmi: - /* - * Do nothing if we interrupted the fixup in repeat_nmi. - * It's about to repeat the NMI handler, so we are fine - * with ignoring this one. - */ - movq $repeat_nmi, %rdx - cmpq 8(%rsp), %rdx - ja 1f - movq $end_repeat_nmi, %rdx - cmpq 8(%rsp), %rdx - ja nested_nmi_out - -1: - /* Set up the interrupted NMIs stack to jump to repeat_nmi */ - leaq -1*8(%rsp), %rdx - movq %rdx, %rsp - leaq -10*8(%rsp), %rdx - pushq $__KERNEL_DS - pushq %rdx - pushfq - pushq $__KERNEL_CS - pushq $repeat_nmi - - /* Put stack back */ - addq $(6*8), %rsp - -nested_nmi_out: - popq %rdx - - /* No need to check faults here */ - INTERRUPT_RETURN - -first_nmi: - /* - * Because nested NMIs will use the pushed location that we - * stored in rdx, we must keep that space available. - * Here's what our stack frame will look like: - * +-------------------------+ - * | original SS | - * | original Return RSP | - * | original RFLAGS | - * | original CS | - * | original RIP | - * +-------------------------+ - * | temp storage for rdx | - * +-------------------------+ - * | NMI executing variable | - * +-------------------------+ - * | copied SS | - * | copied Return RSP | - * | copied RFLAGS | - * | copied CS | - * | copied RIP | - * +-------------------------+ - * | Saved SS | - * | Saved Return RSP | - * | Saved RFLAGS | - * | Saved CS | - * | Saved RIP | - * +-------------------------+ - * | pt_regs | - * +-------------------------+ - * - * The saved stack frame is used to fix up the copied stack frame - * that a nested NMI may change to make the interrupted NMI iret jump - * to the repeat_nmi. The original stack frame and the temp storage - * is also used by nested NMIs and can not be trusted on exit. - */ - /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ - movq (%rsp), %rdx - - /* Set the NMI executing variable on the stack. */ - pushq $1 - - /* - * Leave room for the "copied" frame - */ - subq $(5*8), %rsp - - /* Copy the stack frame to the Saved frame */ - .rept 5 - pushq 11*8(%rsp) - .endr - - /* Everything up to here is safe from nested NMIs */ - - /* - * If there was a nested NMI, the first NMI's iret will return - * here. But NMIs are still enabled and we can take another - * nested NMI. The nested NMI checks the interrupted RIP to see - * if it is between repeat_nmi and end_repeat_nmi, and if so - * it will just return, as we are about to repeat an NMI anyway. 
- * This makes it safe to copy to the stack frame that a nested
- * NMI will update.
- */
-repeat_nmi:
- /*
- * Update the stack variable to say we are still in NMI (the update
- * is benign for the non-repeat case, where 1 was pushed just above
- * to this very stack slot).
- */
- movq $1, 10*8(%rsp)
-
- /* Make another copy, this one may be modified by nested NMIs */
- addq $(10*8), %rsp
- .rept 5
- pushq -6*8(%rsp)
- .endr
- subq $(5*8), %rsp
-end_repeat_nmi:
-
- /*
- * Everything below this point can be preempted by a nested
- * NMI if the first NMI took an exception and reset our iret stack
- * so that we repeat another NMI.
- */
- pushq $-1 /* ORIG_RAX: no syscall to restart */
- ALLOC_PT_GPREGS_ON_STACK
-
- /*
- * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
- * as we should not be calling schedule in NMI context,
- * even with normal interrupts enabled. An NMI should not be
- * setting NEED_RESCHED or anything that normal interrupts and
- * exceptions might do.
- */
- call paranoid_entry
-
- /*
- * Save off the CR2 register. If we take a page fault in the NMI then
- * it could corrupt the CR2 value. If the NMI preempts a page fault
- * handler before it was able to read the CR2 register, and then the
- * NMI itself takes a page fault, the page fault that was preempted
- * will read the information from the NMI page fault and not the
- * original fault. Save it off and restore it if it changes.
- * Use the r12 callee-saved register.
- */
- movq %cr2, %r12
-
- /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
- movq %rsp,%rdi
- movq $-1,%rsi
- call do_nmi
-
- /* Did the NMI take a page fault? Restore cr2 if it did */
- movq %cr2, %rcx
- cmpq %rcx, %r12
- je 1f
- movq %r12, %cr2
-1:
- testl %ebx,%ebx /* swapgs needed? */
- jnz nmi_restore
-nmi_swapgs:
- SWAPGS_UNSAFE_STACK
-nmi_restore:
- RESTORE_EXTRA_REGS
- RESTORE_C_REGS
- /* Pop the extra iret frame at once */
- REMOVE_PT_GPREGS_FROM_STACK 6*8
-
- /* Clear the NMI executing stack variable */
- movq $0, 5*8(%rsp)
- jmp irq_return
-END(nmi)
-
-ENTRY(ignore_sysret)
- mov $-ENOSYS,%eax
- sysret
-END(ignore_sysret)
-
-- cgit v1.2.3


From 00398a0018d1334fedabfeaabd0fa563121de612 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Wed, 3 Jun 2015 18:41:06 +0200
Subject: x86/asm/entry: Move the vsyscall code to arch/x86/entry/vsyscall/

The vsyscall code is entry code too, so move it to arch/x86/entry/vsyscall/.

Cc: Borislav Petkov
Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/Makefile | 6 +- arch/x86/entry/syscall_32.c | 33 +++ arch/x86/entry/syscall_64.c | 32 +++ arch/x86/entry/vsyscall/Makefile | 7 + arch/x86/entry/vsyscall/vsyscall_64.c | 335 ++++++++++++++++++++++++++++++ arch/x86/entry/vsyscall/vsyscall_emu_64.S | 37 ++++ arch/x86/entry/vsyscall/vsyscall_gtod.c | 70 +++++++ arch/x86/entry/vsyscall/vsyscall_trace.h | 29 +++ arch/x86/kernel/Makefile | 3 - arch/x86/kernel/syscall_32.c | 33 --- arch/x86/kernel/syscall_64.c | 32 --- arch/x86/kernel/vsyscall_64.c | 335 ------------------------------ arch/x86/kernel/vsyscall_emu_64.S | 37 ---- arch/x86/kernel/vsyscall_gtod.c | 70 ------- arch/x86/kernel/vsyscall_trace.h | 29 --- 15 files changed, 547 insertions(+), 541 deletions(-) create mode 100644 arch/x86/entry/syscall_32.c create mode 100644 arch/x86/entry/syscall_64.c create mode 100644 arch/x86/entry/vsyscall/Makefile create mode 100644 arch/x86/entry/vsyscall/vsyscall_64.c create mode 100644 arch/x86/entry/vsyscall/vsyscall_emu_64.S create mode 100644 arch/x86/entry/vsyscall/vsyscall_gtod.c create mode 100644 arch/x86/entry/vsyscall/vsyscall_trace.h delete mode 100644 arch/x86/kernel/syscall_32.c delete mode 100644 arch/x86/kernel/syscall_64.c delete mode 100644 arch/x86/kernel/vsyscall_64.c delete mode 100644 arch/x86/kernel/vsyscall_emu_64.S delete mode 100644 arch/x86/kernel/vsyscall_gtod.c delete mode 100644 arch/x86/kernel/vsyscall_trace.h (limited to 'arch/x86/kernel') diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 9df72c8a0ac2..b93cce1c6bb2 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -1,8 +1,10 @@ # # Makefile for the x86 low level entry code # -obj-y := entry_$(BITS).o thunk_$(BITS).o +obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o + obj-y += vdso/ +obj-y += vsyscall/ -obj-$(CONFIG_IA32_EMULATION) += ia32entry.o +obj-$(CONFIG_IA32_EMULATION) += ia32entry.o syscall_32.o diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c new file mode 100644 index 000000000000..3777189c4a19 --- /dev/null +++ b/arch/x86/entry/syscall_32.c @@ -0,0 +1,33 @@ +/* System call table for i386. */ + +#include +#include +#include +#include + +#ifdef CONFIG_IA32_EMULATION +#define SYM(sym, compat) compat +#else +#define SYM(sym, compat) sym +#define ia32_sys_call_table sys_call_table +#define __NR_ia32_syscall_max __NR_syscall_max +#endif + +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; +#include +#undef __SYSCALL_I386 + +#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), + +typedef asmlinkage void (*sys_call_ptr_t)(void); + +extern asmlinkage void sys_ni_syscall(void); + +__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall, +#include +}; diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c new file mode 100644 index 000000000000..4ac730b37f0b --- /dev/null +++ b/arch/x86/entry/syscall_64.c @@ -0,0 +1,32 @@ +/* System call table for x86-64. 
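Both syscall tables in this patch lean on GCC's range-designator extension: every slot is first defaulted to sys_ni_syscall, and the generated include then overrides the populated slots. A small non-kernel equivalent (the slot numbers are made up):

    #include <stdio.h>

    typedef void (*call_ptr_t)(void);

    static void ni_call(void)   { puts("nosys"); }
    static void real_call(void) { puts("real"); }

    static const call_ptr_t table[16] = {
        [0 ... 15] = &ni_call,  /* GCC range designator: default all slots */
        [3] = real_call,        /* later designators override the range */
    };

    int main(void)
    {
        table[3]();  /* real  */
        table[9]();  /* nosys */
        return 0;
    }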
*/ + +#include +#include +#include +#include +#include + +#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) + +#ifdef CONFIG_X86_X32_ABI +# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat) +#else +# define __SYSCALL_X32(nr, sym, compat) /* nothing */ +#endif + +#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; +#include +#undef __SYSCALL_64 + +#define __SYSCALL_64(nr, sym, compat) [nr] = sym, + +extern void sys_ni_syscall(void); + +asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_syscall_max] = &sys_ni_syscall, +#include +}; diff --git a/arch/x86/entry/vsyscall/Makefile b/arch/x86/entry/vsyscall/Makefile new file mode 100644 index 000000000000..a9f4856f622a --- /dev/null +++ b/arch/x86/entry/vsyscall/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the x86 low level vsyscall code +# +obj-y := vsyscall_gtod.o + +obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o + diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c new file mode 100644 index 000000000000..2dcc6ff6fdcc --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2012-2014 Andy Lutomirski + * + * Based on the original implementation which is: + * Copyright (C) 2001 Andrea Arcangeli SuSE + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * Parts of the original code have been moved to arch/x86/vdso/vma.c + * + * This file implements vsyscall emulation. vsyscalls are a legacy ABI: + * Userspace can request certain kernel services by calling fixed + * addresses. This concept is problematic: + * + * - It interferes with ASLR. + * - It's awkward to write code that lives in kernel addresses but is + * callable by userspace at fixed addresses. + * - The whole concept is impossible for 32-bit compat userspace. + * - UML cannot easily virtualize a vsyscall. + * + * As of mid-2014, I believe that there is no new userspace code that + * will use a vsyscall if the vDSO is present. I hope that there will + * soon be no new userspace code that will ever use a vsyscall. + * + * The code in this file emulates vsyscalls when notified of a page + * fault to a vsyscall address. 
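For context, the legacy ABI being emulated looks like this from userspace: three entry points at a fixed address, 1024 bytes apart. A hypothetical (and deliberately obsolete) caller; it only works with vsyscall=emulate or =native, and new code should use the vDSO instead:

    #include <stdio.h>
    #include <time.h>

    #define VSYSCALL_ADDR 0xffffffffff600000UL  /* the historical fixed address */

    typedef time_t (*vtime_t)(time_t *);

    int main(void)
    {
        /* entry 1 (time) sits at VSYSCALL_ADDR + 1 * 1024 */
        vtime_t vtime = (vtime_t)(VSYSCALL_ADDR + 0x400);

        /* under emulation this faults and the kernel fakes call + ret */
        printf("%ld\n", (long)vtime(NULL));
        return 0;
    }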
+ */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include "vsyscall_trace.h" + +static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; + +static int __init vsyscall_setup(char *str) +{ + if (str) { + if (!strcmp("emulate", str)) + vsyscall_mode = EMULATE; + else if (!strcmp("native", str)) + vsyscall_mode = NATIVE; + else if (!strcmp("none", str)) + vsyscall_mode = NONE; + else + return -EINVAL; + + return 0; + } + + return -EINVAL; +} +early_param("vsyscall", vsyscall_setup); + +static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, + const char *message) +{ + if (!show_unhandled_signals) + return; + + printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + level, current->comm, task_pid_nr(current), + message, regs->ip, regs->cs, + regs->sp, regs->ax, regs->si, regs->di); +} + +static int addr_to_vsyscall_nr(unsigned long addr) +{ + int nr; + + if ((addr & ~0xC00UL) != VSYSCALL_ADDR) + return -EINVAL; + + nr = (addr & 0xC00UL) >> 10; + if (nr >= 3) + return -EINVAL; + + return nr; +} + +static bool write_ok_or_segv(unsigned long ptr, size_t size) +{ + /* + * XXX: if access_ok, get_user, and put_user handled + * sig_on_uaccess_error, this could go away. + */ + + if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { + siginfo_t info; + struct thread_struct *thread = ¤t->thread; + + thread->error_code = 6; /* user fault, no page, write */ + thread->cr2 = ptr; + thread->trap_nr = X86_TRAP_PF; + + memset(&info, 0, sizeof(info)); + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_MAPERR; + info.si_addr = (void __user *)ptr; + + force_sig_info(SIGSEGV, &info, current); + return false; + } else { + return true; + } +} + +bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) +{ + struct task_struct *tsk; + unsigned long caller; + int vsyscall_nr, syscall_nr, tmp; + int prev_sig_on_uaccess_error; + long ret; + + /* + * No point in checking CS -- the only way to get here is a user mode + * trap to a high address, which means that we're in 64-bit user code. + */ + + WARN_ON_ONCE(address != regs->ip); + + if (vsyscall_mode == NONE) { + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall attempted with vsyscall=none"); + return false; + } + + vsyscall_nr = addr_to_vsyscall_nr(address); + + trace_emulate_vsyscall(vsyscall_nr); + + if (vsyscall_nr < 0) { + warn_bad_vsyscall(KERN_WARNING, regs, + "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); + goto sigsegv; + } + + if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { + warn_bad_vsyscall(KERN_WARNING, regs, + "vsyscall with bad stack (exploit attempt?)"); + goto sigsegv; + } + + tsk = current; + + /* + * Check for access_ok violations and find the syscall nr. + * + * NULL is a valid user pointer (in the access_ok sense) on 32-bit and + * 64-bit, so we don't need to special-case it here. For all the + * vsyscalls, NULL means "don't write anything" not "write it at + * address 0". 
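The seccomp hand-off below is worth restating: the emulated call is presented to the filter as if it were a real syscall, and since the emulation cannot honor a filter that rewrites the syscall number or ip, any such rewrite is fatal. A simplified sketch with a stand-in filter (secure_computing() is the real hook):

    struct vregs { long orig_ax, ax; unsigned long ip; };

    /* stand-in for secure_computing(); nonzero means "skip the call" */
    static int filter(struct vregs *regs) { (void)regs; return 0; }

    /* -1: kill the task, 1: skip the call (emulate ret only), 0: proceed */
    static int seccomp_check(struct vregs *regs, long nr, unsigned long addr)
    {
        regs->orig_ax = nr;  /* present the real syscall nr to the filter */
        regs->ax = -38;      /* -ENOSYS on Linux, as a real entry would set */

        int skip = filter(regs);

        /* a rewritten nr or ip cannot be honored by the emulation */
        if ((!skip && regs->orig_ax != nr) || regs->ip != addr)
            return -1;

        return skip ? 1 : 0;
    }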
+ */ + switch (vsyscall_nr) { + case 0: + if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || + !write_ok_or_segv(regs->si, sizeof(struct timezone))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_gettimeofday; + break; + + case 1: + if (!write_ok_or_segv(regs->di, sizeof(time_t))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_time; + break; + + case 2: + if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || + !write_ok_or_segv(regs->si, sizeof(unsigned))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_getcpu; + break; + } + + /* + * Handle seccomp. regs->ip must be the original value. + * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt. + * + * We could optimize the seccomp disabled case, but performance + * here doesn't matter. + */ + regs->orig_ax = syscall_nr; + regs->ax = -ENOSYS; + tmp = secure_computing(); + if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { + warn_bad_vsyscall(KERN_DEBUG, regs, + "seccomp tried to change syscall nr or ip"); + do_exit(SIGSYS); + } + regs->orig_ax = -1; + if (tmp) + goto do_ret; /* skip requested */ + + /* + * With a real vsyscall, page faults cause SIGSEGV. We want to + * preserve that behavior to make writing exploits harder. + */ + prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; + current_thread_info()->sig_on_uaccess_error = 1; + + ret = -EFAULT; + switch (vsyscall_nr) { + case 0: + ret = sys_gettimeofday( + (struct timeval __user *)regs->di, + (struct timezone __user *)regs->si); + break; + + case 1: + ret = sys_time((time_t __user *)regs->di); + break; + + case 2: + ret = sys_getcpu((unsigned __user *)regs->di, + (unsigned __user *)regs->si, + NULL); + break; + } + + current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; + +check_fault: + if (ret == -EFAULT) { + /* Bad news -- userspace fed a bad pointer to a vsyscall. */ + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall fault (exploit attempt?)"); + + /* + * If we failed to generate a signal for any reason, + * generate one here. (This should be impossible.) + */ + if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && + !sigismember(&tsk->pending.signal, SIGSEGV))) + goto sigsegv; + + return true; /* Don't emulate the ret. */ + } + + regs->ax = ret; + +do_ret: + /* Emulate a ret instruction. */ + regs->ip = caller; + regs->sp += 8; + return true; + +sigsegv: + force_sig(SIGSEGV, current); + return true; +} + +/* + * A pseudo VMA to allow ptrace access for the vsyscall page. This only + * covers the 64bit vsyscall page now. 
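The do_ret tail above is the whole trick that makes the emulation transparent: a real vsyscall was entered by a call, so faking the matching ret is just rewriting ip and popping 8 bytes. As a sketch with abbreviated field names:

    struct fake_regs { unsigned long ip, sp; long ax; };

    /* mirrors the regs->ax / regs->ip / regs->sp updates above */
    static void emulate_ret(struct fake_regs *regs, unsigned long caller,
                            long retval)
    {
        regs->ax = retval;  /* syscall return value */
        regs->ip = caller;  /* resume at the saved return address */
        regs->sp += 8;      /* pop that return address */
    }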
32bit has a real VMA now and does + * not need special handling anymore: + */ +static const char *gate_vma_name(struct vm_area_struct *vma) +{ + return "[vsyscall]"; +} +static struct vm_operations_struct gate_vma_ops = { + .name = gate_vma_name, +}; +static struct vm_area_struct gate_vma = { + .vm_start = VSYSCALL_ADDR, + .vm_end = VSYSCALL_ADDR + PAGE_SIZE, + .vm_page_prot = PAGE_READONLY_EXEC, + .vm_flags = VM_READ | VM_EXEC, + .vm_ops = &gate_vma_ops, +}; + +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) +{ +#ifdef CONFIG_IA32_EMULATION + if (!mm || mm->context.ia32_compat) + return NULL; +#endif + if (vsyscall_mode == NONE) + return NULL; + return &gate_vma; +} + +int in_gate_area(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma = get_gate_vma(mm); + + if (!vma) + return 0; + + return (addr >= vma->vm_start) && (addr < vma->vm_end); +} + +/* + * Use this when you have no reliable mm, typically from interrupt + * context. It is less reliable than using a task's mm and may give + * false positives. + */ +int in_gate_area_no_mm(unsigned long addr) +{ + return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; +} + +void __init map_vsyscall(void) +{ + extern char __vsyscall_page; + unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); + + if (vsyscall_mode != NONE) + __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, + vsyscall_mode == NATIVE + ? PAGE_KERNEL_VSYSCALL + : PAGE_KERNEL_VVAR); + + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != + (unsigned long)VSYSCALL_ADDR); +} diff --git a/arch/x86/entry/vsyscall/vsyscall_emu_64.S b/arch/x86/entry/vsyscall/vsyscall_emu_64.S new file mode 100644 index 000000000000..c9596a9af159 --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_emu_64.S @@ -0,0 +1,37 @@ +/* + * vsyscall_emu_64.S: Vsyscall emulation page + * + * Copyright (c) 2011 Andy Lutomirski + * + * Subject to the GNU General Public License, version 2 + */ + +#include + +#include +#include +#include + +__PAGE_ALIGNED_DATA + .globl __vsyscall_page + .balign PAGE_SIZE, 0xcc + .type __vsyscall_page, @object +__vsyscall_page: + + mov $__NR_gettimeofday, %rax + syscall + ret + + .balign 1024, 0xcc + mov $__NR_time, %rax + syscall + ret + + .balign 1024, 0xcc + mov $__NR_getcpu, %rax + syscall + ret + + .balign 4096, 0xcc + + .size __vsyscall_page, 4096 diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c new file mode 100644 index 000000000000..51e330416995 --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2001 Andrea Arcangeli SuSE + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * Modified for x86 32 bit architecture by + * Stefani Seibold + * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany + * + * Thanks to hpa@transmeta.com for some useful hint. + * Special thanks to Ingo Molnar for his early experience with + * a different vsyscall implementation for Linux/IA32 and for the name. 
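update_vsyscall() below keeps monotonic time as seconds plus nanoseconds left-shifted by the clocksource shift, so its carry loop compares against NSEC_PER_SEC << shift. The normalization in isolation, as a toy helper:

    #include <stdint.h>

    /* toy version of the while loop in update_vsyscall() */
    static void normalize_snsec(uint64_t *sec, uint64_t *snsec,
                                unsigned int shift)
    {
        const uint64_t nsec_per_sec = 1000000000ULL;

        while (*snsec >= (nsec_per_sec << shift)) {
            *snsec -= nsec_per_sec << shift;
            (*sec)++;
        }
    }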
+ * + */ + +#include +#include +#include + +DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); + +void update_vsyscall_tz(void) +{ + vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest; + vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime; +} + +void update_vsyscall(struct timekeeper *tk) +{ + struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; + + gtod_write_begin(vdata); + + /* copy vsyscall data */ + vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + vdata->cycle_last = tk->tkr_mono.cycle_last; + vdata->mask = tk->tkr_mono.mask; + vdata->mult = tk->tkr_mono.mult; + vdata->shift = tk->tkr_mono.shift; + + vdata->wall_time_sec = tk->xtime_sec; + vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec; + + vdata->monotonic_time_sec = tk->xtime_sec + + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec + + ((u64)tk->wall_to_monotonic.tv_nsec + << tk->tkr_mono.shift); + while (vdata->monotonic_time_snsec >= + (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { + vdata->monotonic_time_snsec -= + ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; + vdata->monotonic_time_sec++; + } + + vdata->wall_time_coarse_sec = tk->xtime_sec; + vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >> + tk->tkr_mono.shift); + + vdata->monotonic_time_coarse_sec = + vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_coarse_nsec = + vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec; + + while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) { + vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC; + vdata->monotonic_time_coarse_sec++; + } + + gtod_write_end(vdata); +} diff --git a/arch/x86/entry/vsyscall/vsyscall_trace.h b/arch/x86/entry/vsyscall/vsyscall_trace.h new file mode 100644 index 000000000000..9dd7359a38a8 --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_trace.h @@ -0,0 +1,29 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vsyscall + +#if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define __VSYSCALL_TRACE_H + +#include + +TRACE_EVENT(emulate_vsyscall, + + TP_PROTO(int nr), + + TP_ARGS(nr), + + TP_STRUCT__entry(__field(int, nr)), + + TP_fast_assign( + __entry->nr = nr; + ), + + TP_printk("nr = %d", __entry->nr) +); + +#endif + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../arch/x86/entry/vsyscall/ +#define TRACE_INCLUDE_FILE vsyscall_trace +#include diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9d3ee054453d..01663ee5f1b7 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -31,9 +31,6 @@ obj-y += probe_roms.o obj-$(CONFIG_X86_32) += i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += mcount_64.o -obj-y += syscall_$(BITS).o vsyscall_gtod.o -obj-$(CONFIG_IA32_EMULATION) += syscall_32.o -obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c deleted file mode 100644 index 3777189c4a19..000000000000 --- a/arch/x86/kernel/syscall_32.c +++ /dev/null @@ -1,33 +0,0 @@ -/* System call table for i386. 
*/ - -#include -#include -#include -#include - -#ifdef CONFIG_IA32_EMULATION -#define SYM(sym, compat) compat -#else -#define SYM(sym, compat) sym -#define ia32_sys_call_table sys_call_table -#define __NR_ia32_syscall_max __NR_syscall_max -#endif - -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; -#include -#undef __SYSCALL_I386 - -#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), - -typedef asmlinkage void (*sys_call_ptr_t)(void); - -extern asmlinkage void sys_ni_syscall(void); - -__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall, -#include -}; diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c deleted file mode 100644 index 4ac730b37f0b..000000000000 --- a/arch/x86/kernel/syscall_64.c +++ /dev/null @@ -1,32 +0,0 @@ -/* System call table for x86-64. */ - -#include -#include -#include -#include -#include - -#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) - -#ifdef CONFIG_X86_X32_ABI -# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat) -#else -# define __SYSCALL_X32(nr, sym, compat) /* nothing */ -#endif - -#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; -#include -#undef __SYSCALL_64 - -#define __SYSCALL_64(nr, sym, compat) [nr] = sym, - -extern void sys_ni_syscall(void); - -asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_syscall_max] = &sys_ni_syscall, -#include -}; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c deleted file mode 100644 index 2dcc6ff6fdcc..000000000000 --- a/arch/x86/kernel/vsyscall_64.c +++ /dev/null @@ -1,335 +0,0 @@ -/* - * Copyright (c) 2012-2014 Andy Lutomirski - * - * Based on the original implementation which is: - * Copyright (C) 2001 Andrea Arcangeli SuSE - * Copyright 2003 Andi Kleen, SuSE Labs. - * - * Parts of the original code have been moved to arch/x86/vdso/vma.c - * - * This file implements vsyscall emulation. vsyscalls are a legacy ABI: - * Userspace can request certain kernel services by calling fixed - * addresses. This concept is problematic: - * - * - It interferes with ASLR. - * - It's awkward to write code that lives in kernel addresses but is - * callable by userspace at fixed addresses. - * - The whole concept is impossible for 32-bit compat userspace. - * - UML cannot easily virtualize a vsyscall. - * - * As of mid-2014, I believe that there is no new userspace code that - * will use a vsyscall if the vDSO is present. I hope that there will - * soon be no new userspace code that will ever use a vsyscall. - * - * The code in this file emulates vsyscalls when notified of a page - * fault to a vsyscall address. 
- */ - -#include -#include -#include -#include - -#include -#include -#include -#include - -#define CREATE_TRACE_POINTS -#include "vsyscall_trace.h" - -static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; - -static int __init vsyscall_setup(char *str) -{ - if (str) { - if (!strcmp("emulate", str)) - vsyscall_mode = EMULATE; - else if (!strcmp("native", str)) - vsyscall_mode = NATIVE; - else if (!strcmp("none", str)) - vsyscall_mode = NONE; - else - return -EINVAL; - - return 0; - } - - return -EINVAL; -} -early_param("vsyscall", vsyscall_setup); - -static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, - const char *message) -{ - if (!show_unhandled_signals) - return; - - printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", - level, current->comm, task_pid_nr(current), - message, regs->ip, regs->cs, - regs->sp, regs->ax, regs->si, regs->di); -} - -static int addr_to_vsyscall_nr(unsigned long addr) -{ - int nr; - - if ((addr & ~0xC00UL) != VSYSCALL_ADDR) - return -EINVAL; - - nr = (addr & 0xC00UL) >> 10; - if (nr >= 3) - return -EINVAL; - - return nr; -} - -static bool write_ok_or_segv(unsigned long ptr, size_t size) -{ - /* - * XXX: if access_ok, get_user, and put_user handled - * sig_on_uaccess_error, this could go away. - */ - - if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { - siginfo_t info; - struct thread_struct *thread = ¤t->thread; - - thread->error_code = 6; /* user fault, no page, write */ - thread->cr2 = ptr; - thread->trap_nr = X86_TRAP_PF; - - memset(&info, 0, sizeof(info)); - info.si_signo = SIGSEGV; - info.si_errno = 0; - info.si_code = SEGV_MAPERR; - info.si_addr = (void __user *)ptr; - - force_sig_info(SIGSEGV, &info, current); - return false; - } else { - return true; - } -} - -bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) -{ - struct task_struct *tsk; - unsigned long caller; - int vsyscall_nr, syscall_nr, tmp; - int prev_sig_on_uaccess_error; - long ret; - - /* - * No point in checking CS -- the only way to get here is a user mode - * trap to a high address, which means that we're in 64-bit user code. - */ - - WARN_ON_ONCE(address != regs->ip); - - if (vsyscall_mode == NONE) { - warn_bad_vsyscall(KERN_INFO, regs, - "vsyscall attempted with vsyscall=none"); - return false; - } - - vsyscall_nr = addr_to_vsyscall_nr(address); - - trace_emulate_vsyscall(vsyscall_nr); - - if (vsyscall_nr < 0) { - warn_bad_vsyscall(KERN_WARNING, regs, - "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); - goto sigsegv; - } - - if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { - warn_bad_vsyscall(KERN_WARNING, regs, - "vsyscall with bad stack (exploit attempt?)"); - goto sigsegv; - } - - tsk = current; - - /* - * Check for access_ok violations and find the syscall nr. - * - * NULL is a valid user pointer (in the access_ok sense) on 32-bit and - * 64-bit, so we don't need to special-case it here. For all the - * vsyscalls, NULL means "don't write anything" not "write it at - * address 0". 
- */ - switch (vsyscall_nr) { - case 0: - if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || - !write_ok_or_segv(regs->si, sizeof(struct timezone))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_gettimeofday; - break; - - case 1: - if (!write_ok_or_segv(regs->di, sizeof(time_t))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_time; - break; - - case 2: - if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || - !write_ok_or_segv(regs->si, sizeof(unsigned))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_getcpu; - break; - } - - /* - * Handle seccomp. regs->ip must be the original value. - * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt. - * - * We could optimize the seccomp disabled case, but performance - * here doesn't matter. - */ - regs->orig_ax = syscall_nr; - regs->ax = -ENOSYS; - tmp = secure_computing(); - if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { - warn_bad_vsyscall(KERN_DEBUG, regs, - "seccomp tried to change syscall nr or ip"); - do_exit(SIGSYS); - } - regs->orig_ax = -1; - if (tmp) - goto do_ret; /* skip requested */ - - /* - * With a real vsyscall, page faults cause SIGSEGV. We want to - * preserve that behavior to make writing exploits harder. - */ - prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; - current_thread_info()->sig_on_uaccess_error = 1; - - ret = -EFAULT; - switch (vsyscall_nr) { - case 0: - ret = sys_gettimeofday( - (struct timeval __user *)regs->di, - (struct timezone __user *)regs->si); - break; - - case 1: - ret = sys_time((time_t __user *)regs->di); - break; - - case 2: - ret = sys_getcpu((unsigned __user *)regs->di, - (unsigned __user *)regs->si, - NULL); - break; - } - - current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; - -check_fault: - if (ret == -EFAULT) { - /* Bad news -- userspace fed a bad pointer to a vsyscall. */ - warn_bad_vsyscall(KERN_INFO, regs, - "vsyscall fault (exploit attempt?)"); - - /* - * If we failed to generate a signal for any reason, - * generate one here. (This should be impossible.) - */ - if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && - !sigismember(&tsk->pending.signal, SIGSEGV))) - goto sigsegv; - - return true; /* Don't emulate the ret. */ - } - - regs->ax = ret; - -do_ret: - /* Emulate a ret instruction. */ - regs->ip = caller; - regs->sp += 8; - return true; - -sigsegv: - force_sig(SIGSEGV, current); - return true; -} - -/* - * A pseudo VMA to allow ptrace access for the vsyscall page. This only - * covers the 64bit vsyscall page now. 
32bit has a real VMA now and does - * not need special handling anymore: - */ -static const char *gate_vma_name(struct vm_area_struct *vma) -{ - return "[vsyscall]"; -} -static struct vm_operations_struct gate_vma_ops = { - .name = gate_vma_name, -}; -static struct vm_area_struct gate_vma = { - .vm_start = VSYSCALL_ADDR, - .vm_end = VSYSCALL_ADDR + PAGE_SIZE, - .vm_page_prot = PAGE_READONLY_EXEC, - .vm_flags = VM_READ | VM_EXEC, - .vm_ops = &gate_vma_ops, -}; - -struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ -#ifdef CONFIG_IA32_EMULATION - if (!mm || mm->context.ia32_compat) - return NULL; -#endif - if (vsyscall_mode == NONE) - return NULL; - return &gate_vma; -} - -int in_gate_area(struct mm_struct *mm, unsigned long addr) -{ - struct vm_area_struct *vma = get_gate_vma(mm); - - if (!vma) - return 0; - - return (addr >= vma->vm_start) && (addr < vma->vm_end); -} - -/* - * Use this when you have no reliable mm, typically from interrupt - * context. It is less reliable than using a task's mm and may give - * false positives. - */ -int in_gate_area_no_mm(unsigned long addr) -{ - return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; -} - -void __init map_vsyscall(void) -{ - extern char __vsyscall_page; - unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); - - if (vsyscall_mode != NONE) - __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, - vsyscall_mode == NATIVE - ? PAGE_KERNEL_VSYSCALL - : PAGE_KERNEL_VVAR); - - BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != - (unsigned long)VSYSCALL_ADDR); -} diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S deleted file mode 100644 index c9596a9af159..000000000000 --- a/arch/x86/kernel/vsyscall_emu_64.S +++ /dev/null @@ -1,37 +0,0 @@ -/* - * vsyscall_emu_64.S: Vsyscall emulation page - * - * Copyright (c) 2011 Andy Lutomirski - * - * Subject to the GNU General Public License, version 2 - */ - -#include - -#include -#include -#include - -__PAGE_ALIGNED_DATA - .globl __vsyscall_page - .balign PAGE_SIZE, 0xcc - .type __vsyscall_page, @object -__vsyscall_page: - - mov $__NR_gettimeofday, %rax - syscall - ret - - .balign 1024, 0xcc - mov $__NR_time, %rax - syscall - ret - - .balign 1024, 0xcc - mov $__NR_getcpu, %rax - syscall - ret - - .balign 4096, 0xcc - - .size __vsyscall_page, 4096 diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c deleted file mode 100644 index 51e330416995..000000000000 --- a/arch/x86/kernel/vsyscall_gtod.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (C) 2001 Andrea Arcangeli SuSE - * Copyright 2003 Andi Kleen, SuSE Labs. - * - * Modified for x86 32 bit architecture by - * Stefani Seibold - * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany - * - * Thanks to hpa@transmeta.com for some useful hint. - * Special thanks to Ingo Molnar for his early experience with - * a different vsyscall implementation for Linux/IA32 and for the name. 
- * - */ - -#include -#include -#include - -DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); - -void update_vsyscall_tz(void) -{ - vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest; - vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime; -} - -void update_vsyscall(struct timekeeper *tk) -{ - struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; - - gtod_write_begin(vdata); - - /* copy vsyscall data */ - vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; - vdata->cycle_last = tk->tkr_mono.cycle_last; - vdata->mask = tk->tkr_mono.mask; - vdata->mult = tk->tkr_mono.mult; - vdata->shift = tk->tkr_mono.shift; - - vdata->wall_time_sec = tk->xtime_sec; - vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec; - - vdata->monotonic_time_sec = tk->xtime_sec - + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec - + ((u64)tk->wall_to_monotonic.tv_nsec - << tk->tkr_mono.shift); - while (vdata->monotonic_time_snsec >= - (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { - vdata->monotonic_time_snsec -= - ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; - vdata->monotonic_time_sec++; - } - - vdata->wall_time_coarse_sec = tk->xtime_sec; - vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >> - tk->tkr_mono.shift); - - vdata->monotonic_time_coarse_sec = - vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_coarse_nsec = - vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec; - - while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) { - vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC; - vdata->monotonic_time_coarse_sec++; - } - - gtod_write_end(vdata); -} diff --git a/arch/x86/kernel/vsyscall_trace.h b/arch/x86/kernel/vsyscall_trace.h deleted file mode 100644 index a8b2edec54fe..000000000000 --- a/arch/x86/kernel/vsyscall_trace.h +++ /dev/null @@ -1,29 +0,0 @@ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM vsyscall - -#if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) -#define __VSYSCALL_TRACE_H - -#include - -TRACE_EVENT(emulate_vsyscall, - - TP_PROTO(int nr), - - TP_ARGS(nr), - - TP_STRUCT__entry(__field(int, nr)), - - TP_fast_assign( - __entry->nr = nr; - ), - - TP_printk("nr = %d", __entry->nr) -); - -#endif - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../arch/x86/kernel -#define TRACE_INCLUDE_FILE vsyscall_trace -#include -- cgit v1.2.3 From 88d538672ea26223bca08225bc49f4e65e71683d Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Thu, 4 Jun 2015 18:55:23 +0200 Subject: x86/mce: Add infrastructure to support Local MCE Initialize and prepare for handling LMCEs. Add a boot-time option to disable LMCEs. Signed-off-by: Ashok Raj [ Simplify stuff, align statements for better readability, reflow comments; kill unused lmce_clear(); save us an MSR write if LMCE is already enabled. ] Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1433436928-31903-16-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- Documentation/x86/x86_64/boot-options.txt | 3 +++ arch/x86/include/asm/mce.h | 5 ++++ arch/x86/kernel/cpu/mcheck/mce.c | 3 +++ arch/x86/kernel/cpu/mcheck/mce_intel.c | 43 +++++++++++++++++++++++++++++++ 4 files changed, 54 insertions(+) (limited to 'arch/x86/kernel') diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 5223479291a2..68ed3114c363 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -31,6 +31,9 @@ Machine check (e.g. BIOS or hardware monitoring applications), conflicting with OS's error handling, and you cannot deactivate the agent, then this option will be a help. + mce=no_lmce + Do not opt-in to Local MCE delivery. Use legacy method + to broadcast MCEs. mce=bootlog Enable logging of machine checks left over from booting. Disabled by default on AMD because some BIOS leave bogus ones. diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index ae2bfb895994..982dfc3679ad 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -109,6 +109,7 @@ struct mce_log { struct mca_config { bool dont_log_ce; bool cmci_disabled; + bool lmce_disabled; bool ignore_ce; bool disabled; bool ser; @@ -184,12 +185,16 @@ void cmci_clear(void); void cmci_reenable(void); void cmci_rediscover(void); void cmci_recheck(void); +void lmce_clear(void); +void lmce_enable(void); #else static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } static inline void cmci_clear(void) {} static inline void cmci_reenable(void) {} static inline void cmci_rediscover(void) {} static inline void cmci_recheck(void) {} +static inline void lmce_clear(void) {} +static inline void lmce_enable(void) {} #endif #ifdef CONFIG_X86_MCE_AMD diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 0cbcd3183acf..c8c6577b4ada 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1982,6 +1982,7 @@ void mce_disable_bank(int bank) /* * mce=off Disables machine check * mce=no_cmci Disables CMCI + * mce=no_lmce Disables LMCE * mce=dont_log_ce Clears corrected events silently, no log created for CEs. * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) @@ -2005,6 +2006,8 @@ static int __init mcheck_enable(char *str) cfg->disabled = true; else if (!strcmp(str, "no_cmci")) cfg->cmci_disabled = true; + else if (!strcmp(str, "no_lmce")) + cfg->lmce_disabled = true; else if (!strcmp(str, "dont_log_ce")) cfg->dont_log_ce = true; else if (!strcmp(str, "ignore_ce")) diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index b4a41cf030ed..2d872deb2c50 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -91,6 +91,36 @@ static int cmci_supported(int *banks) return !!(cap & MCG_CMCI_P); } +static bool lmce_supported(void) +{ + u64 tmp; + + if (mca_cfg.lmce_disabled) + return false; + + rdmsrl(MSR_IA32_MCG_CAP, tmp); + + /* + * LMCE depends on recovery support in the processor. Hence both + * MCG_SER_P and MCG_LMCE_P should be present in MCG_CAP. 
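Note the shape of the test that follows: both capability bits must be present, so the mask is applied and then compared against itself rather than merely tested for being nonzero. Illustrated below with assumed bit positions (check the SDM for the real ones):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define MCG_SER_P  (1ULL << 24)  /* assumed positions, for the demo */
    #define MCG_LMCE_P (1ULL << 27)

    static bool has_all_bits(uint64_t cap, uint64_t required)
    {
        return (cap & required) == required;  /* not just "(cap & required)" */
    }

    int main(void)
    {
        assert(has_all_bits(MCG_SER_P | MCG_LMCE_P,
                            MCG_SER_P | MCG_LMCE_P));
        assert(!has_all_bits(MCG_SER_P, MCG_SER_P | MCG_LMCE_P));
        return 0;
    }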
+ */
+ if ((tmp & (MCG_SER_P | MCG_LMCE_P)) !=
+ (MCG_SER_P | MCG_LMCE_P))
+ return false;
+
+ /*
+ * BIOS should indicate support for LMCE by setting bit 20 in
+ * IA32_FEATURE_CONTROL without which touching MCG_EXT_CTL will
+ * generate a #GP fault.
+ */
+ rdmsrl(MSR_IA32_FEATURE_CONTROL, tmp);
+ if ((tmp & (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) ==
+ (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE))
+ return true;
+
+ return false;
+}
+
 bool mce_intel_cmci_poll(void)
 {
 	if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
@@ -405,6 +435,19 @@ static void intel_init_cmci(void)
 	cmci_recheck();
 }

+void intel_init_lmce(void)
+{
+ u64 val;
+
+ if (!lmce_supported())
+ return;
+
+ rdmsrl(MSR_IA32_MCG_EXT_CTL, val);
+
+ if (!(val & MCG_EXT_CTL_LMCE_EN))
+ wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
+}
+
 void mce_intel_feature_init(struct cpuinfo_x86 *c)
 {
 	intel_init_thermal(c);
-- cgit v1.2.3


From 243d657eaf540db882f73497060da5a4f7d86a90 Mon Sep 17 00:00:00 2001
From: Ashok Raj
Date: Thu, 4 Jun 2015 18:55:24 +0200
Subject: x86/mce: Handle Local MCE events

Add the necessary changes to do_machine_check() to be able to
process MCEs signaled as local MCEs. Typically, only recoverable
errors (SRAR type) will be signaled as LMCE. The architecture
does not restrict to only those errors, however.

When errors are signaled as LMCE, there is no need for the MCE
handler to perform rendezvous with other logical processors
unlike earlier processors that would broadcast machine check
errors.

Signed-off-by: Ashok Raj
Signed-off-by: Borislav Petkov
Cc: Andrew Morton
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Tony Luck
Cc: linux-edac
Link: http://lkml.kernel.org/r/1433436928-31903-17-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/cpu/mcheck/mce.c       | 32 ++++++++++++++++++++++++++------
 arch/x86/kernel/cpu/mcheck/mce_intel.c |  1 +
 2 files changed, 27 insertions(+), 6 deletions(-)
(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index c8c6577b4ada..ddc46d67d93e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1047,6 +1047,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	char *msg = "Unknown";
 	u64 recover_paddr = ~0ull;
 	int flags = MF_ACTION_REQUIRED;
+	int lmce = 0;

 	prev_state = ist_enter(regs);
@@ -1074,11 +1075,20 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		kill_it = 1;

 	/*
-	 * Go through all the banks in exclusion of the other CPUs.
-	 * This way we don't report duplicated events on shared banks
-	 * because the first one to see it will clear it.
+	 * Check if this MCE is signaled to only this logical processor
 	 */
-	order = mce_start(&no_way_out);
+	if (m.mcgstatus & MCG_STATUS_LMCES)
+		lmce = 1;
+	else {
+		/*
+		 * Go through all the banks in exclusion of the other CPUs.
+		 * This way we don't report duplicated events on shared banks
+		 * because the first one to see it will clear it.
+		 * If this is a Local MCE, then no need to perform rendezvous.
+		 */
+		order = mce_start(&no_way_out);
+	}
+
 	for (i = 0; i < cfg->banks; i++) {
 		__clear_bit(i, toclear);
 		if (!test_bit(i, valid_banks))
@@ -1155,8 +1165,18 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	 * Do most of the synchronization with other CPUs.
 	 * When there's any problem use only local no_way_out state.
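The decision added above boils down to a single status bit: if MCG_STATUS reports the event as local, the handler skips the cross-CPU rendezvous entirely. As a one-function sketch (bit position assumed):

    #include <stdbool.h>
    #include <stdint.h>

    #define MCG_STATUS_LMCES (1ULL << 3)  /* assumed bit position */

    /* LMCES set => delivered only to this logical CPU, no rendezvous */
    static bool needs_rendezvous(uint64_t mcgstatus)
    {
        return !(mcgstatus & MCG_STATUS_LMCES);
    }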
*/ - if (mce_end(order) < 0) - no_way_out = worst >= MCE_PANIC_SEVERITY; + if (!lmce) { + if (mce_end(order) < 0) + no_way_out = worst >= MCE_PANIC_SEVERITY; + } else { + /* + * Local MCE skipped calling mce_reign() + * If we found a fatal error, we need to panic here. + */ + if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) + mce_panic("Machine check from unknown source", + NULL, NULL); + } /* * At insane "tolerant" levels we take no action. Otherwise diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 2d872deb2c50..844f56c5616d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -452,4 +452,5 @@ void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); intel_init_cmci(); + intel_init_lmce(); } -- cgit v1.2.3 From c8e56d20f2d190d54c0615775dcb6a23c1091681 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 4 Jun 2015 18:55:25 +0200 Subject: x86: Kill CONFIG_X86_HT In talking to Aravind recently about making certain AMD topology attributes available to the MCE injection module, it seemed like that CONFIG_X86_HT thing is more or less superfluous. It is def_bool y, depends on SMP and gets enabled in the majority of .configs - distro and otherwise - out there. So let's kill it and make code behind it depend directly on SMP. Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Aravind Gopalakrishnan Cc: Bartosz Golaszewski Cc: Catalin Marinas Cc: Daniel Walter Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Igor Mammedov Cc: Jacob Shin Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1433436928-31903-18-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 8 ++------ arch/x86/include/asm/topology.h | 2 +- arch/x86/kernel/cpu/amd.c | 6 +++--- arch/x86/kernel/cpu/common.c | 4 ++-- arch/x86/kernel/cpu/intel_cacheinfo.c | 8 ++++---- 5 files changed, 12 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8a5cca39e74f..7e39f9b22705 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -261,10 +261,6 @@ config X86_64_SMP def_bool y depends on X86_64 && SMP -config X86_HT - def_bool y - depends on SMP - config X86_32_LAZY_GS def_bool y depends on X86_32 && !CC_STACKPROTECTOR @@ -865,7 +861,7 @@ config NR_CPUS config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" - depends on X86_HT + depends on SMP ---help--- SMT scheduler support improves the CPU scheduler's decision making when dealing with Intel Pentium 4 chips with HyperThreading at a @@ -875,7 +871,7 @@ config SCHED_SMT config SCHED_MC def_bool y prompt "Multi-core scheduler support" - depends on X86_HT + depends on SMP ---help--- Multi-core scheduler support improves the CPU scheduler's decision making when dealing with multi-core CPU chips at a cost of slightly diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 0e8f04f2c26f..8d717faeed22 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -26,7 +26,7 @@ #define _ASM_X86_TOPOLOGY_H #ifdef CONFIG_X86_32 -# ifdef CONFIG_X86_HT +# ifdef CONFIG_SMP # define ENABLE_TOPO_DEFINES # endif #else diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e4cf63301ff4..eb4f01269b5d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -288,7 +288,7 @@ static int 
nearby_node(int apicid) * Assumption: Number of cores in each internal node is the same. * (2) AMD processors supporting compute units */ -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP static void amd_get_topology(struct cpuinfo_x86 *c) { u32 nodes, cores_per_cu = 1; @@ -341,7 +341,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c) */ static void amd_detect_cmp(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP unsigned bits; int cpu = smp_processor_id(); @@ -420,7 +420,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c) static void early_init_amd_mc(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP unsigned bits, ecx; /* Multi core CPU? */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6bec0b55863e..b6fe2e47f7f1 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -508,7 +508,7 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c) void detect_ht(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP u32 eax, ebx, ecx, edx; int index_msb, core_bits; static bool printed; @@ -844,7 +844,7 @@ static void generic_identify(struct cpuinfo_x86 *c) if (c->cpuid_level >= 0x00000001) { c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; #ifdef CONFIG_X86_32 -# ifdef CONFIG_X86_HT +# ifdef CONFIG_SMP c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); # else c->apicid = c->initial_apicid; diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index edcb0e28c336..be4febc58b94 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -654,7 +654,7 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP unsigned int cpu = c->cpu_index; #endif @@ -773,19 +773,19 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) if (new_l2) { l2 = new_l2; -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP per_cpu(cpu_llc_id, cpu) = l2_id; #endif } if (new_l3) { l3 = new_l3; -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP per_cpu(cpu_llc_id, cpu) = l3_id; #endif } -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP /* * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in * turns means that the only possibility is SMT (as indicated in -- cgit v1.2.3 From 2cd23553b488589f287457b7396470f5e3c40698 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jun 2015 08:28:07 +0200 Subject: x86/asm/entry: Rename compat syscall entry points Rename the following system call entry points: ia32_cstar_target -> entry_SYSCALL_compat ia32_syscall -> entry_INT80_compat The generic naming scheme for x86 system call entry points is: entry_MNEMONIC_qualifier where 'qualifier' is one of _32, _64 or _compat. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- Documentation/x86/entry_64.txt | 4 ++-- arch/x86/entry/entry_64_compat.S | 8 ++++---- arch/x86/entry/syscall_32.c | 6 +++--- arch/x86/include/asm/proto.h | 4 ++-- arch/x86/kernel/asm-offsets_64.c | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/traps.c | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- 8 files changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/Documentation/x86/entry_64.txt b/Documentation/x86/entry_64.txt index 9132b86176a3..33884d156125 100644 --- a/Documentation/x86/entry_64.txt +++ b/Documentation/x86/entry_64.txt @@ -18,10 +18,10 @@ Some of these entries are: - system_call: syscall instruction from 64-bit code. - - ia32_syscall: int 0x80 from 32-bit or 64-bit code; compat syscall + - entry_INT80_compat: int 0x80 from 32-bit or 64-bit code; compat syscall either way. - - ia32_syscall, ia32_sysenter: syscall and sysenter from 32-bit + - entry_INT80_compat, ia32_sysenter: syscall and sysenter from 32-bit code - interrupt: An array of entries. Every IDT vector that doesn't diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 9558dacf32b9..8058892fb5ff 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -288,7 +288,7 @@ ENDPROC(ia32_sysenter_target) * path below. We set up a complete hardware stack frame to share code * with the int 0x80 path. */ -ENTRY(ia32_cstar_target) +ENTRY(entry_SYSCALL_compat) /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -409,7 +409,7 @@ cstar_tracesys: RESTORE_EXTRA_REGS jmp cstar_do_call -END(ia32_cstar_target) +END(entry_SYSCALL_compat) ia32_badarg: ASM_CLAC @@ -445,7 +445,7 @@ ia32_ret_from_sys_call: * Assumes it is only called from user space and entered with interrupts off. */ -ENTRY(ia32_syscall) +ENTRY(entry_INT80_compat) /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -511,7 +511,7 @@ ia32_tracesys: movl %eax, %eax /* zero extension */ RESTORE_EXTRA_REGS jmp ia32_do_call -END(ia32_syscall) +END(entry_INT80_compat) .macro PTREGSCALL label, func ALIGN diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 3777189c4a19..e398d033673f 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -10,7 +10,7 @@ #else #define SYM(sym, compat) sym #define ia32_sys_call_table sys_call_table -#define __NR_ia32_syscall_max __NR_syscall_max +#define __NR_entry_INT80_compat_max __NR_syscall_max #endif #define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; @@ -23,11 +23,11 @@ typedef asmlinkage void (*sys_call_ptr_t)(void); extern asmlinkage void sys_ni_syscall(void); -__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { +__visible const sys_call_ptr_t ia32_sys_call_table[__NR_entry_INT80_compat_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ - [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall, + [0 ... 
__NR_entry_INT80_compat_max] = &sys_ni_syscall, #include }; diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index a90f8972dad5..7d2961a231f1 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -8,8 +8,8 @@ void system_call(void); void syscall_init(void); -void ia32_syscall(void); -void ia32_cstar_target(void); +void entry_INT80_compat(void); +void entry_SYSCALL_compat(void); void ia32_sysenter_target(void); void x86_configure_nx(void); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index dcaab87da629..599afcf0005f 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -66,7 +66,7 @@ int main(void) DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); DEFINE(NR_syscalls, sizeof(syscalls_64)); - DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1); + DEFINE(__NR_entry_INT80_compat_max, sizeof(syscalls_ia32) - 1); DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32)); return 0; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6bec0b55863e..f0b85c401014 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1207,7 +1207,7 @@ void syscall_init(void) wrmsrl(MSR_LSTAR, system_call); #ifdef CONFIG_IA32_EMULATION - wrmsrl(MSR_CSTAR, ia32_cstar_target); + wrmsrl(MSR_CSTAR, entry_SYSCALL_compat); /* * This only works on Intel CPUs. * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5e0791f9d3dc..edf97986a53d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -992,7 +992,7 @@ void __init trap_init(void) set_bit(i, used_vectors); #ifdef CONFIG_IA32_EMULATION - set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); + set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_compat); set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 04529e620559..3c43c03a499c 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -122,7 +122,7 @@ ENDPROC(xen_syscall_target) /* 32-bit compat syscall target */ ENTRY(xen_syscall32_target) undo_xen_syscall - jmp ia32_cstar_target + jmp entry_SYSCALL_compat ENDPROC(xen_syscall32_target) /* 32-bit compat sysenter target */ -- cgit v1.2.3 From 4c8cd0c50d0b1559727bf0ec7ff27caeba2dfe09 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jun 2015 08:33:56 +0200 Subject: x86/asm/entry: Untangle 'ia32_sysenter_target' into two entry points: entry_SYSENTER_32 and entry_SYSENTER_compat So the SYSENTER instruction is pretty quirky and it has different behavior depending on bitness and CPU maker. Yet we create a false sense of coherency by naming it 'ia32_sysenter_target' in both of the cases. Split the name into its two uses: ia32_sysenter_target (32) -> entry_SYSENTER_32 ia32_sysenter_target (64) -> entry_SYSENTER_compat As per the generic naming scheme for x86 system call entry points: entry_MNEMONIC_qualifier where 'qualifier' is one of _32, _64 or _compat. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 10 +++++----- arch/x86/entry/entry_64_compat.S | 4 ++-- arch/x86/include/asm/proto.h | 3 ++- arch/x86/kernel/cpu/common.c | 4 ++-- arch/x86/xen/xen-asm_64.S | 2 +- 5 files changed, 12 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 0ac73de925d1..a65f46c3b8e1 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -307,7 +307,7 @@ END(resume_kernel) the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ # sysenter call handler stub -ENTRY(ia32_sysenter_target) +ENTRY(entry_SYSENTER_32) movl TSS_sysenter_sp0(%esp),%esp sysenter_past_esp: /* @@ -412,7 +412,7 @@ sysexit_audit: .popsection _ASM_EXTABLE(1b,2b) PTGS_TO_GS_EX -ENDPROC(ia32_sysenter_target) +ENDPROC(entry_SYSENTER_32) # system call handler stub ENTRY(system_call) @@ -1135,7 +1135,7 @@ END(page_fault) ENTRY(debug) ASM_CLAC - cmpl $ia32_sysenter_target,(%esp) + cmpl $entry_SYSENTER_32,(%esp) jne debug_stack_correct FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn debug_stack_correct: @@ -1165,7 +1165,7 @@ ENTRY(nmi) popl %eax je nmi_espfix_stack #endif - cmpl $ia32_sysenter_target,(%esp) + cmpl $entry_SYSENTER_32,(%esp) je nmi_stack_fixup pushl %eax movl %esp,%eax @@ -1176,7 +1176,7 @@ ENTRY(nmi) cmpl $(THREAD_SIZE-20),%eax popl %eax jae nmi_stack_correct - cmpl $ia32_sysenter_target,12(%esp) + cmpl $entry_SYSENTER_32,12(%esp) je nmi_debug_stack_check nmi_stack_correct: pushl %eax diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 8058892fb5ff..59840e33d203 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -57,7 +57,7 @@ ENDPROC(native_usergs_sysret32) * path below. We set up a complete hardware stack frame to share code * with the int 0x80 path. */ -ENTRY(ia32_sysenter_target) +ENTRY(entry_SYSENTER_compat) /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -256,7 +256,7 @@ sysenter_tracesys: RESTORE_EXTRA_REGS jmp sysenter_do_call -ENDPROC(ia32_sysenter_target) +ENDPROC(entry_SYSENTER_compat) /* * 32-bit SYSCALL instruction entry. 
diff --git a/arch/x86/include/asm/proto.h index 7d2961a231f1..83a7f8227949 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -10,7 +10,8 @@ void syscall_init(void); void entry_INT80_compat(void); void entry_SYSCALL_compat(void); -void ia32_sysenter_target(void); +void entry_SYSENTER_32(void); +void entry_SYSENTER_compat(void); void x86_configure_nx(void); void x86_report_nx(void); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f0b85c401014..b2ae7cec33ca 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1026,7 +1026,7 @@ void enable_sep_cpu(void) (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack), 0); - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0); + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); out: put_cpu(); @@ -1216,7 +1216,7 @@ void syscall_init(void) */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else wrmsrl(MSR_CSTAR, ignore_sysret); wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 3c43c03a499c..ccac1b1e6e93 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -128,7 +128,7 @@ ENDPROC(xen_syscall32_target) /* 32-bit compat sysenter target */ ENTRY(xen_sysenter_target) undo_xen_syscall - jmp ia32_sysenter_target + jmp entry_SYSENTER_compat ENDPROC(xen_sysenter_target) #else /* !CONFIG_IA32_EMULATION */ -- cgit v1.2.3 From b2502b418e63fcde0fe1857732a476b5aa3789b1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jun 2015 08:42:03 +0200 Subject: x86/asm/entry: Untangle 'system_call' into two entry points: entry_SYSCALL_64 and entry_INT80_32 The 'system_call' entry points differ starkly between native 32-bit and 64-bit kernels: on 32-bit kernels the name denotes the INT 0x80 entry point, while on 64-bit kernels it denotes the SYSCALL entry point. This is pretty confusing when looking at generic code, and it also obscures the nature of the entry point at the assembly level. So untangle this by splitting the name into its two uses: system_call (32) -> entry_INT80_32 system_call (64) -> entry_SYSCALL_64 As per the generic naming scheme for x86 system call entry points: entry_MNEMONIC_qualifier where 'qualifier' is one of _32, _64 or _compat. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 4 ++-- arch/x86/entry/entry_64.S | 10 +++++----- arch/x86/include/asm/proto.h | 5 +++-- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/traps.c | 5 ++--- arch/x86/xen/xen-asm_64.S | 2 +- 6 files changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index a65f46c3b8e1..d59461032625 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -415,7 +415,7 @@ sysexit_audit: ENDPROC(entry_SYSENTER_32) # system call handler stub -ENTRY(system_call) +ENTRY(entry_INT80_32) ASM_CLAC pushl %eax # save orig_eax SAVE_ALL @@ -508,7 +508,7 @@ ldt_ss: lss (%esp), %esp /* switch to espfix segment */ jmp restore_nocheck #endif -ENDPROC(system_call) +ENDPROC(entry_INT80_32) # perform work that needs to be done immediately before resumption ALIGN diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 4cf3dd36aa0d..e1852c407155 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -137,7 +137,7 @@ ENDPROC(native_usergs_sysret64) * with them due to bugs in both AMD and Intel CPUs. */ -ENTRY(system_call) +ENTRY(entry_SYSCALL_64) /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -149,7 +149,7 @@ ENTRY(system_call) * after the swapgs, so that it can do the swapgs * for the guest and jump here on syscall. */ -GLOBAL(system_call_after_swapgs) +GLOBAL(entry_SYSCALL_64_after_swapgs) movq %rsp,PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp @@ -182,7 +182,7 @@ GLOBAL(system_call_after_swapgs) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz tracesys -system_call_fastpath: +entry_SYSCALL_64_fastpath: #if __SYSCALL_MASK == ~0 cmpq $__NR_syscall_max,%rax #else @@ -246,7 +246,7 @@ tracesys: jnz tracesys_phase2 /* if needed, run the slow path */ RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ movq ORIG_RAX(%rsp), %rax - jmp system_call_fastpath /* and return to the fast path */ + jmp entry_SYSCALL_64_fastpath /* and return to the fast path */ tracesys_phase2: SAVE_EXTRA_REGS @@ -411,7 +411,7 @@ syscall_return_via_sysret: opportunistic_sysret_failed: SWAPGS jmp restore_c_regs_and_iret -END(system_call) +END(entry_SYSCALL_64) .macro FORK_LIKE func diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 83a7f8227949..a4a77286cb1d 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -5,11 +5,12 @@ /* misc architecture specific prototypes */ -void system_call(void); void syscall_init(void); -void entry_INT80_compat(void); +void entry_SYSCALL_64(void); void entry_SYSCALL_compat(void); +void entry_INT80_32(void); +void entry_INT80_compat(void); void entry_SYSENTER_32(void); void entry_SYSENTER_compat(void); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b2ae7cec33ca..914be4bbc2e5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1204,7 +1204,7 @@ void syscall_init(void) * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. 
*/ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); - wrmsrl(MSR_LSTAR, system_call); + wrmsrl(MSR_LSTAR, entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION wrmsrl(MSR_CSTAR, entry_SYSCALL_compat); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index edf97986a53d..001ddac221a1 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -72,8 +72,7 @@ gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss; #else #include #include - -asmlinkage int system_call(void); +#include #endif /* Must be page-aligned because the real IDT is used in a fixmap. */ @@ -997,7 +996,7 @@ void __init trap_init(void) #endif #ifdef CONFIG_X86_32 - set_system_trap_gate(IA32_SYSCALL_VECTOR, &system_call); + set_system_trap_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index ccac1b1e6e93..f22667abf7b9 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -114,7 +114,7 @@ RELOC(xen_sysret32, 1b+1) /* Normal 64-bit system call target */ ENTRY(xen_syscall_target) undo_xen_syscall - jmp system_call_after_swapgs + jmp entry_SYSCALL_64_after_swapgs ENDPROC(xen_syscall_target) #ifdef CONFIG_IA32_EMULATION -- cgit v1.2.3 From bace7117d3fb59a6ed7ea1aa6c8994df6a28a72a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jun 2015 21:20:26 +0200 Subject: x86/asm/entry: (Re-)rename __NR_entry_INT80_compat_max to __NR_syscall_compat_max Brian Gerst noticed that I did a weird rename in the following commit: 2cd23553b488 ("x86/asm/entry: Rename compat syscall entry points") which renamed __NR_ia32_syscall_max to __NR_entry_INT80_compat_max. Now the original name was a misnomer, but the new one is a misnomer as well, as all the 32-bit compat syscall entry points (sysenter, syscall) share the system call table, not just the INT80-based one. Rename it to __NR_syscall_compat_max. Reported-by: Brian Gerst Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/syscall_32.c | 6 +++--- arch/x86/kernel/asm-offsets_64.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index e398d033673f..8ea34f94e973 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -10,7 +10,7 @@ #else #define SYM(sym, compat) sym #define ia32_sys_call_table sys_call_table -#define __NR_entry_INT80_compat_max __NR_syscall_max +#define __NR_syscall_compat_max __NR_syscall_max #endif #define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; @@ -23,11 +23,11 @@ typedef asmlinkage void (*sys_call_ptr_t)(void); extern asmlinkage void sys_ni_syscall(void); -__visible const sys_call_ptr_t ia32_sys_call_table[__NR_entry_INT80_compat_max+1] = { +__visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ - [0 ... __NR_entry_INT80_compat_max] = &sys_ni_syscall, + [0 ...
__NR_syscall_compat_max] = &sys_ni_syscall, #include }; diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 599afcf0005f..d8f42f902a0f 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -66,7 +66,7 @@ int main(void) DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); DEFINE(NR_syscalls, sizeof(syscalls_64)); - DEFINE(__NR_entry_INT80_compat_max, sizeof(syscalls_ia32) - 1); + DEFINE(__NR_syscall_compat_max, sizeof(syscalls_ia32) - 1); DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32)); return 0; -- cgit v1.2.3 From b58d930750135d6c5b8e5aa084c0e9303c78c286 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 15 Jun 2015 17:40:01 +0800 Subject: x86/platform/intel/baytrail: Add comments about why we disabled HPET on Baytrail This question has been asked many times, and I finally found the official document that explains the problem with HPET on Baytrail: it will halt in deep idle states. Signed-off-by: Feng Tang Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: john.stultz@linaro.org Cc: len.brown@intel.com Cc: matthew.lee@intel.com Link: http://lkml.kernel.org/r/1434361201-31743-1-git-send-email-feng.tang@intel.com [ Prettified things a bit. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/early-quirks.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index fe9f0b79a18b..5cb9a4d6f623 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -627,8 +627,12 @@ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, QFLAG_APPLY_ONCE, intel_graphics_stolen }, /* - * HPET on current version of Baytrail platform has accuracy - * problems, disable it for now: + * HPET on the current version of the Baytrail platform has accuracy + * problems: it will halt in deep idle state - so we disable it. + * + * More details can be found in section 18.10.1.3 of the datasheet: + * + * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, -- cgit v1.2.3 From bafac298fb20e9ae1305c710d4fd8d20c5911afa Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Sat, 20 Jun 2015 11:50:50 +0200 Subject: x86/hpet: Check for irq==0 when allocating hpet MSI interrupts irq == 0 is not a valid irq for an irqdomain MSI allocation, but the hpet code checks only for negative return values.
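A minimal illustration of the intended check, using the names from the patch below (a sketch for clarity, not the patch itself): with hierarchical irqdomains a return value of 0 means "no interrupt allocated", so zero has to be rejected along with the negative error codes:

	irq = hpet_assign_irq(hpet_domain, hdev, hdev->num);
	if (irq <= 0)		/* 0 == nothing allocated, < 0 == error */
		continue;	/* skip this timer, try the next one */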
Reported-by: Sergey Senozhatsky Cc: Borislav Petkov Link: http://lkml.kernel.org/r/558447AF.30703@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index e2449cf38b06..c47aab35a17e 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -578,7 +578,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) continue; irq = hpet_assign_irq(hpet_domain, hdev, hdev->num); - if (irq < 0) + if (irq <= 0) continue; sprintf(hdev->name, "hpet%d", i); -- cgit v1.2.3 From cb17b2a674f2059343f997599b4b001e64eec516 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 21 Jun 2015 16:21:50 +0200 Subject: x86/hpet: Use proper hpet device number for MSI allocation hpet_assign_irq() is called with hpet_device->num as the "hardware interrupt number", but hpet_device->num is initialized after the interrupt has been assigned, so it's always 0. As a consequence, only the first MSI allocation succeeds; the following ones fail because the "hardware interrupt number" already exists. Move the initialization of dev->num and the other fields back before the call to hpet_assign_irq(), i.e. restore the ordering that existed before the offending commit introduced this regression. Fixes: 3cb96f0c9733 ("x86/hpet: Enhance HPET IRQ to support hierarchical irqdomains") Reported-by: Sergey Senozhatsky Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1506211635010.4107@nanos Cc: Jiang Liu Cc: Borislav Petkov --- arch/x86/kernel/hpet.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index c47aab35a17e..10757d0a3fcf 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -577,16 +577,17 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) if (!(cfg & HPET_TN_FSB_CAP)) continue; + hdev->flags = 0; + if (cfg & HPET_TN_PERIODIC_CAP) + hdev->flags |= HPET_DEV_PERI_CAP; + sprintf(hdev->name, "hpet%d", i); + hdev->num = i; + irq = hpet_assign_irq(hpet_domain, hdev, hdev->num); if (irq <= 0) continue; - sprintf(hdev->name, "hpet%d", i); - hdev->num = i; hdev->irq = irq; - hdev->flags = 0; - if (cfg & HPET_TN_PERIODIC_CAP) - hdev->flags |= HPET_DEV_PERI_CAP; hdev->flags |= HPET_DEV_FSB_CAP; hdev->flags |= HPET_DEV_VALID; num_timers_used++; -- cgit v1.2.3
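To make the ordering issue above concrete, here is a condensed sketch of the loop body before and after the fix (field names as in the patch; all unrelated setup elided):

	/*
	 * Broken: hdev->num is still 0 when it is passed as the hardware
	 * interrupt number, so every timer after the first collides with
	 * the already-allocated number 0 and its allocation fails.
	 */
	irq = hpet_assign_irq(hpet_domain, hdev, hdev->num);
	hdev->num = i;

	/* Fixed: assign the per-timer number first, then allocate. */
	hdev->num = i;
	irq = hpet_assign_irq(hpet_domain, hdev, hdev->num);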