From 248fbcd5aee00f6519a12c5ed3bc3dc0f5e84de5 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 7 Aug 2015 09:36:55 +0100 Subject: x86/efi-bgrt: Switch pr_err() to pr_debug() for invalid BGRT It's totally legitimate, per the ACPI spec, for the firmware to set the BGRT 'status' field to zero to indicate that the BGRT image isn't being displayed, and we shouldn't be printing an error message in that case because it's just noise for users. So swap pr_err() for pr_debug(). However, Josh points out that it still makes sense to test the validity of the upper 7 bits of the 'status' field, since they're marked as "reserved" in the spec and must be zero. If firmware violates this it really *is* an error. Reported-by: Tom Yan Tested-by: Tom Yan Signed-off-by: Matt Fleming Reviewed-by: Josh Triplett Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Matthew Garrett Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1438936621-5215-2-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi-bgrt.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c index d7f997f7c26d..ea48449b2e63 100644 --- a/arch/x86/platform/efi/efi-bgrt.c +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -50,11 +50,16 @@ void __init efi_bgrt_init(void) bgrt_tab->version); return; } - if (bgrt_tab->status != 1) { - pr_err("Ignoring BGRT: invalid status %u (expected 1)\n", + if (bgrt_tab->status & 0xfe) { + pr_err("Ignoring BGRT: reserved status bits are non-zero %u\n", bgrt_tab->status); return; } + if (bgrt_tab->status != 1) { + pr_debug("Ignoring BGRT: invalid status %u (expected 1)\n", + bgrt_tab->status); + return; + } if (bgrt_tab->image_type != 0) { pr_err("Ignoring BGRT: invalid image type %u (expected 0)\n", bgrt_tab->image_type); -- cgit v1.2.3 From fa5c35011a8d5f3d0c597a6336107eafd1b6046c Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 7 Aug 2015 09:36:56 +0100 Subject: Revert "x86/efi: Request desired alignment via the PE/COFF headers" This reverts commit: aeffc4928ea2 ("x86/efi: Request desired alignment via the PE/COFF headers") Linn reports that Signtool complains that kernels built with CONFIG_EFI_STUB=y are violating the PE/COFF specification because the 'SizeOfImage' field is not a multiple of 'SectionAlignment'. This violation was introduced as an optimisation to skip having the kernel relocate itself during boot and instead have the firmware place it at a correctly aligned address. No one else has complained and I'm not aware of any firmware implementations that refuse to boot with commit aeffc4928ea2, but it's a real bug, so revert the offending commit. Reported-by: Linn Crosetto Signed-off-by: Matt Fleming Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Michael Brown Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1438936621-5215-3-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/boot/header.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 16ef02596db2..7a6d43a554d7 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -154,7 +154,7 @@ extra_header_fields: #else .quad 0 # ImageBase #endif - .long CONFIG_PHYSICAL_ALIGN # SectionAlignment + .long 0x20 # SectionAlignment .long 0x20 # FileAlignment .word 0 # MajorOperatingSystemVersion .word 0 # MinorOperatingSystemVersion -- cgit v1.2.3 From 7bf793115dd96ce9bd8ed1665fc187d961a95dba Mon Sep 17 00:00:00 2001 From: "Jonathan (Zhixiong) Zhang" Date: Fri, 7 Aug 2015 09:36:57 +0100 Subject: efi, x86: Rearrange efi_mem_attributes() x86 and ia64 implement efi_mem_attributes() differently. This function needs to be available to other architectures (such as arm64) as well, for purposes such as ACPI/APEI. ia64 EFI does not set up a 'memmap' variable and does not set the EFI_MEMMAP flag, so it needs its own implementation of efi_mem_attributes(). Move the efi_mem_attributes() implementation from x86 to the core EFI code, and declare it with __weak. It is recommended that other architectures not override the default implementation. Signed-off-by: Jonathan (Zhixiong) Zhang Signed-off-by: Matt Fleming Reviewed-by: Matt Fleming Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1438936621-5215-4-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi.c | 18 ------------------ drivers/firmware/efi/efi.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index e4308fe6afe8..2f61fcddcddb 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -952,24 +952,6 @@ u32 efi_mem_type(unsigned long phys_addr) return 0; } -u64 efi_mem_attributes(unsigned long phys_addr) -{ - efi_memory_desc_t *md; - void *p; - - if (!efi_enabled(EFI_MEMMAP)) - return 0; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if ((md->phys_addr <= phys_addr) && - (phys_addr < (md->phys_addr + - (md->num_pages << EFI_PAGE_SHIFT)))) - return md->attribute; - } - return 0; -} - static int __init arch_parse_efi_cmdline(char *str) { if (!str) { diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index d7a9160008d3..afee2880e0fd 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -607,3 +607,34 @@ char * __init efi_md_typeattr_format(char *buf, size_t size, attr & EFI_MEMORY_UC ? "UC" : ""); return buf; } + +/* + * efi_mem_attributes - lookup memmap attributes for physical address + * @phys_addr: the physical address to lookup + * + * Search in the EFI memory map for the region covering + * @phys_addr. Returns the EFI memory attributes if the region + * was found in the memory map, 0 otherwise. + * + * Despite being marked __weak, most architectures should *not* + * override this function. It is __weak solely for the benefit + * of ia64 which has a funky EFI memory map that doesn't work + * the same way as other architectures.
+ */ +u64 __weak efi_mem_attributes(unsigned long phys_addr) +{ + efi_memory_desc_t *md; + void *p; + + if (!efi_enabled(EFI_MEMMAP)) + return 0; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if ((md->phys_addr <= phys_addr) && + (phys_addr < (md->phys_addr + + (md->num_pages << EFI_PAGE_SHIFT)))) + return md->attribute; + } + return 0; +} -- cgit v1.2.3 From b40227fbfb1f98614e6f9bdc4cb3a54ab31d48a5 Mon Sep 17 00:00:00 2001 From: "Jonathan (Zhixiong) Zhang" Date: Fri, 7 Aug 2015 09:36:58 +0100 Subject: acpi, x86: Implement arch_apei_get_mem_attribute() ... to allow an arch-specific implementation of getting the page protection type associated with a physical address. On x86, we currently have no way to look up the EFI memory map attributes for a region in a consistent way, because the memmap is discarded after efi_free_boot_services(). So if you call efi_mem_attributes() during boot and at runtime, you could theoretically see different attributes. Since we are yet to see any x86 platforms that require anything other than PAGE_KERNEL (some arm64 platforms require the equivalent of PAGE_KERNEL_NOCACHE), return that until we know differently. Signed-off-by: Jonathan (Zhixiong) Zhang Signed-off-by: Matt Fleming Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1438936621-5215-5-git-send-email-matt@codeblueprint.co.uk [ Small fixes to spelling. ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/acpi.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 3a45668f6dc3..94c18ebfd68c 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -32,6 +32,10 @@ #include #include +#ifdef CONFIG_ACPI_APEI +# include +#endif + #ifdef CONFIG_ACPI extern int acpi_lapic; extern int acpi_ioapic; @@ -147,4 +151,23 @@ extern int x86_acpi_numa_init(void); #define acpi_unlazy_tlb(x) leave_mm(x) +#ifdef CONFIG_ACPI_APEI +static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) +{ + /* + * We currently have no way to look up the EFI memory map + * attributes for a region in a consistent way, because the + * memmap is discarded after efi_free_boot_services(). So if + * you call efi_mem_attributes() during boot and at runtime, + * you could theoretically see different attributes. + * + * Since we are yet to see any x86 platforms that require + * anything other than PAGE_KERNEL (some arm64 platforms + * require the equivalent of PAGE_KERNEL_NOCACHE), return that + * until we know differently. + */ + return PAGE_KERNEL; +} +#endif + #endif /* _ASM_X86_ACPI_H */ -- cgit v1.2.3 From 78173ec6311a22ca9f42cf949cf37754a8b71633 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 2 Sep 2015 15:15:27 +0300 Subject: x86/insn: perf tools: Pedantically tweak opcode map for MPX instructions The MPX instructions are presently not described in the SDM opcode maps, and there are no encoding characters for bnd registers, address method or operand type. So the kernel opcode map is using 'Gv' for bnd registers and 'Ev' for everything else. That is fine because the instruction decoder does not use that information anyway, except as an indication that there is a ModR/M byte. Nevertheless, in some cases the 'Gv' and 'Ev' are the wrong way around, BNDLDX and BNDSTX have 2 operands, not 3, and it wouldn't hurt to identify the mandatory prefixes.
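As a side note, the mandatory prefixes being identified are what distinguish the four encodings sharing opcode 0f 1a. A minimal illustrative sketch of how that selection reads (a hypothetical helper, not the in-kernel decoder), following the new map line "1a: BNDCL Gv,Ev (F3) | BNDCU Gv,Ev (F2) | BNDMOV Gv,Ev (66) | BNDLDX Gv,Ev":

static const char *mpx_0f1a_mnemonic(unsigned char mandatory_prefix)
{
	/* Prefix values are taken from the opcode map entries below. */
	switch (mandatory_prefix) {
	case 0xf3: return "bndcl";	/* BNDCL Gv,Ev (F3) */
	case 0xf2: return "bndcu";	/* BNDCU Gv,Ev (F2) */
	case 0x66: return "bndmov";	/* BNDMOV Gv,Ev (66) */
	default:   return "bndldx";	/* BNDLDX Gv,Ev (no mandatory prefix) */
	}
}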
This has no effect on the decoding of valid instructions, but the addition of the mandatory prefixes will cause some invalid instructions to error out that wouldn't have previously. Note that perf tools has a copy of the instruction decoder and provides a test for new instructions which includes MPX instructions, e.g. $ perf test "x86 ins" 39: Test x86 instruction decoder - new instructions : Ok Or to see the details: $ perf test -v "x86 ins" Committer notes: And to see these MPX instructions specifically: $ perf test -v "x86 ins" 2>&1 | grep bndldx | head -3 Decoded ok: 0f 1a 00 bndldx (%eax),%bnd0 Decoded ok: 0f 1a 05 78 56 34 12 bndldx 0x12345678,%bnd0 Decoded ok: 0f 1a 18 bndldx (%eax),%bnd3 $ perf test -v "x86 ins" 2>&1 | grep bndstx | head -3 Decoded ok: 0f 1b 00 bndstx %bnd0,(%eax) Decoded ok: 0f 1b 05 78 56 34 12 bndstx %bnd0,0x12345678 Decoded ok: 0f 1b 18 bndstx %bnd3,(%eax) $ Signed-off-by: Adrian Hunter Acked-by: Masami Hiramatsu Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Denys Vlasenko Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Qiaowei Ren Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1441196131-20632-4-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/lib/x86-opcode-map.txt | 8 ++++++-- tools/perf/util/intel-pt-decoder/x86-opcode-map.txt | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 816488c0b97e..a02a195d219c 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -353,8 +353,12 @@ AVXcode: 1 17: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1) 18: Grp16 (1A) 19: -1a: BNDCL Ev,Gv | BNDCU Ev,Gv | BNDMOV Gv,Ev | BNDLDX Gv,Ev,Gv -1b: BNDCN Ev,Gv | BNDMOV Ev,Gv | BNDMK Gv,Ev | BNDSTX Ev,GV,Gv +# Intel SDM opcode map does not list MPX instructions. For now using Gv for +# bnd registers and Ev for everything else is OK because the instruction +# decoder does not use the information except as an indication that there is +# a ModR/M byte. +1a: BNDCL Gv,Ev (F3) | BNDCU Gv,Ev (F2) | BNDMOV Gv,Ev (66) | BNDLDX Gv,Ev +1b: BNDCN Gv,Ev (F2) | BNDMOV Ev,Gv (66) | BNDMK Gv,Ev (F3) | BNDSTX Ev,Gv 1c: 1d: 1e: diff --git a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt index 816488c0b97e..a02a195d219c 100644 --- a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt +++ b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt @@ -353,8 +353,12 @@ AVXcode: 1 17: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1) 18: Grp16 (1A) 19: -1a: BNDCL Ev,Gv | BNDCU Ev,Gv | BNDMOV Gv,Ev | BNDLDX Gv,Ev,Gv -1b: BNDCN Ev,Gv | BNDMOV Ev,Gv | BNDMK Gv,Ev | BNDSTX Ev,GV,Gv +# Intel SDM opcode map does not list MPX instructions. For now using Gv for +# bnd registers and Ev for everything else is OK because the instruction +# decoder does not use the information except as an indication that there is +# a ModR/M byte. +1a: BNDCL Gv,Ev (F3) | BNDCU Gv,Ev (F2) | BNDMOV Gv,Ev (66) | BNDLDX Gv,Ev +1b: BNDCN Gv,Ev (F2) | BNDMOV Ev,Gv (66) | BNDMK Gv,Ev (F3) | BNDSTX Ev,Gv 1c: 1d: 1e: -- cgit v1.2.3 From 3fe78d6af9e2f08c4014fd3ccbf9e1ff312dedf1 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 2 Sep 2015 15:15:28 +0300 Subject: x86/insn: perf tools: Add new SHA instructions Intel SHA Extensions are explained in the Intel Architecture Instruction Set Extensions Programming Reference (Oct 2014). There are 7 new instructions.
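For reference, the seven, with the opcodes they occupy in the maps below, are: sha1rnds4 (0f 3a cc, taking an imm8), sha1nexte (0f 38 c8), sha1msg1 (0f 38 c9), sha1msg2 (0f 38 ca), sha256rnds2 (0f 38 cb, with XMM0 as an implicit operand), sha256msg1 (0f 38 cc) and sha256msg2 (0f 38 cd).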
Add them to the op code map and the perf tools new instructions test. e.g. $ tools/perf/perf test "x86 ins" 39: Test x86 instruction decoder - new instructions : Ok Or to see the details: $ tools/perf/perf test -v "x86 ins" 2>&1 | grep sha Committer note: 3 lines of details, for the curious: $ perf test -v "x86 ins" 2>&1 | grep sha256msg1 | tail -3 Decoded ok: 0f 38 cc 84 08 78 56 34 12 sha256msg1 0x12345678(%rax,%rcx,1),%xmm0 Decoded ok: 0f 38 cc 84 c8 78 56 34 12 sha256msg1 0x12345678(%rax,%rcx,8),%xmm0 Decoded ok: 44 0f 38 cc bc c8 78 56 34 12 sha256msg1 0x12345678(%rax,%rcx,8),%xmm15 $ Signed-off-by: Adrian Hunter Acked-by: Masami Hiramatsu Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Denys Vlasenko Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Qiaowei Ren Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1441196131-20632-5-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/lib/x86-opcode-map.txt | 7 + tools/perf/tests/insn-x86-dat-32.c | 294 ++++++++++++++++ tools/perf/tests/insn-x86-dat-64.c | 364 ++++++++++++++++++++ tools/perf/tests/insn-x86-dat-src.c | 373 +++++++++++++++++++++ .../perf/util/intel-pt-decoder/x86-opcode-map.txt | 7 + 5 files changed, 1045 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index a02a195d219c..25dad388b371 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -736,6 +736,12 @@ bd: vfnmadd231ss/d Vx,Hx,Wx (66),(v),(v1) be: vfnmsub231ps/d Vx,Hx,Wx (66),(v) bf: vfnmsub231ss/d Vx,Hx,Wx (66),(v),(v1) # 0x0f 0x38 0xc0-0xff +c8: sha1nexte Vdq,Wdq +c9: sha1msg1 Vdq,Wdq +ca: sha1msg2 Vdq,Wdq +cb: sha256rnds2 Vdq,Wdq +cc: sha256msg1 Vdq,Wdq +cd: sha256msg2 Vdq,Wdq db: VAESIMC Vdq,Wdq (66),(v1) dc: VAESENC Vdq,Hdq,Wdq (66),(v1) dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1) @@ -794,6 +800,7 @@ AVXcode: 3 61: vpcmpestri Vdq,Wdq,Ib (66),(v1) 62: vpcmpistrm Vdq,Wdq,Ib (66),(v1) 63: vpcmpistri Vdq,Wdq,Ib (66),(v1) +cc: sha1rnds4 Vdq,Wdq,Ib df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1) f0: RORX Gy,Ey,Ib (F2),(v) EndTable diff --git a/tools/perf/tests/insn-x86-dat-32.c b/tools/perf/tests/insn-x86-dat-32.c index 6a38a34a5a49..83f5078e74e1 100644 --- a/tools/perf/tests/insn-x86-dat-32.c +++ b/tools/perf/tests/insn-x86-dat-32.c @@ -322,3 +322,297 @@ "f2 ff 21 \tbnd jmp *(%ecx)",}, {{0xf2, 0x0f, 0x85, 0xfc, 0xff, 0xff, 0xff, }, 7, 0xfffffffc, "jcc", "conditional", "f2 0f 85 fc ff ff ff \tbnd jne 3de ",}, +{{0x0f, 0x3a, 0xcc, 0xc1, 0x00, }, 5, 0, "", "", +"0f 3a cc c1 00 \tsha1rnds4 $0x0,%xmm1,%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0xd7, 0x91, }, 5, 0, "", "", +"0f 3a cc d7 91 \tsha1rnds4 $0x91,%xmm7,%xmm2",}, +{{0x0f, 0x3a, 0xcc, 0x00, 0x91, }, 5, 0, "", "", +"0f 3a cc 00 91 \tsha1rnds4 $0x91,(%eax),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x05, 0x78, 0x56, 0x34, 0x12, 0x91, }, 9, 0, "", "", +"0f 3a cc 05 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678,%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x18, 0x91, }, 5, 0, "", "", +"0f 3a cc 18 91 \tsha1rnds4 $0x91,(%eax),%xmm3",}, +{{0x0f, 0x3a, 0xcc, 0x04, 0x01, 0x91, }, 6, 0, "", "", +"0f 3a cc 04 01 91 \tsha1rnds4 $0x91,(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x04, 0x05, 0x78, 0x56, 0x34, 0x12, 0x91, }, 10, 0, "", "", +"0f 3a cc 04 05 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(,%eax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x04, 0x08, 0x91, }, 6, 0, "", "", +"0f 3a cc 04 08 91 \tsha1rnds4 $0x91,(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x04, 0xc8, 0x91, }, 6, 0, "", "", +"0f 3a cc 04 c8 91 \tsha1rnds4 $0x91,(%eax,%ecx,8),%xmm0",}, 
+{{0x0f, 0x3a, 0xcc, 0x40, 0x12, 0x91, }, 6, 0, "", "", +"0f 3a cc 40 12 91 \tsha1rnds4 $0x91,0x12(%eax),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x45, 0x12, 0x91, }, 6, 0, "", "", +"0f 3a cc 45 12 91 \tsha1rnds4 $0x91,0x12(%ebp),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x44, 0x01, 0x12, 0x91, }, 7, 0, "", "", +"0f 3a cc 44 01 12 91 \tsha1rnds4 $0x91,0x12(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x44, 0x05, 0x12, 0x91, }, 7, 0, "", "", +"0f 3a cc 44 05 12 91 \tsha1rnds4 $0x91,0x12(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x44, 0x08, 0x12, 0x91, }, 7, 0, "", "", +"0f 3a cc 44 08 12 91 \tsha1rnds4 $0x91,0x12(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x44, 0xc8, 0x12, 0x91, }, 7, 0, "", "", +"0f 3a cc 44 c8 12 91 \tsha1rnds4 $0x91,0x12(%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x80, 0x78, 0x56, 0x34, 0x12, 0x91, }, 9, 0, "", "", +"0f 3a cc 80 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%eax),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x85, 0x78, 0x56, 0x34, 0x12, 0x91, }, 9, 0, "", "", +"0f 3a cc 85 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%ebp),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, 0x91, }, 10, 0, "", "", +"0f 3a cc 84 01 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, 0x91, }, 10, 0, "", "", +"0f 3a cc 84 05 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, 0x91, }, 10, 0, "", "", +"0f 3a cc 84 08 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, 0x91, }, 10, 0, "", "", +"0f 3a cc 84 c8 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0xc1, }, 4, 0, "", "", +"0f 38 c8 c1 \tsha1nexte %xmm1,%xmm0",}, +{{0x0f, 0x38, 0xc8, 0xd7, }, 4, 0, "", "", +"0f 38 c8 d7 \tsha1nexte %xmm7,%xmm2",}, +{{0x0f, 0x38, 0xc8, 0x00, }, 4, 0, "", "", +"0f 38 c8 00 \tsha1nexte (%eax),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x05, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 c8 05 78 56 34 12 \tsha1nexte 0x12345678,%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x18, }, 4, 0, "", "", +"0f 38 c8 18 \tsha1nexte (%eax),%xmm3",}, +{{0x0f, 0x38, 0xc8, 0x04, 0x01, }, 5, 0, "", "", +"0f 38 c8 04 01 \tsha1nexte (%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x04, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c8 04 05 78 56 34 12 \tsha1nexte 0x12345678(,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x04, 0x08, }, 5, 0, "", "", +"0f 38 c8 04 08 \tsha1nexte (%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x04, 0xc8, }, 5, 0, "", "", +"0f 38 c8 04 c8 \tsha1nexte (%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x40, 0x12, }, 5, 0, "", "", +"0f 38 c8 40 12 \tsha1nexte 0x12(%eax),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x45, 0x12, }, 5, 0, "", "", +"0f 38 c8 45 12 \tsha1nexte 0x12(%ebp),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x44, 0x01, 0x12, }, 6, 0, "", "", +"0f 38 c8 44 01 12 \tsha1nexte 0x12(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x44, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 c8 44 05 12 \tsha1nexte 0x12(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x44, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 c8 44 08 12 \tsha1nexte 0x12(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x44, 0xc8, 0x12, }, 6, 0, "", "", +"0f 38 c8 44 c8 12 \tsha1nexte 0x12(%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x80, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 c8 80 78 56 34 12 \tsha1nexte 0x12345678(%eax),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x85, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 c8 85 78 56 34 12 \tsha1nexte 
0x12345678(%ebp),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c8 84 01 78 56 34 12 \tsha1nexte 0x12345678(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c8 84 05 78 56 34 12 \tsha1nexte 0x12345678(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c8 84 08 78 56 34 12 \tsha1nexte 0x12345678(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c8 84 c8 78 56 34 12 \tsha1nexte 0x12345678(%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0xc1, }, 4, 0, "", "", +"0f 38 c9 c1 \tsha1msg1 %xmm1,%xmm0",}, +{{0x0f, 0x38, 0xc9, 0xd7, }, 4, 0, "", "", +"0f 38 c9 d7 \tsha1msg1 %xmm7,%xmm2",}, +{{0x0f, 0x38, 0xc9, 0x00, }, 4, 0, "", "", +"0f 38 c9 00 \tsha1msg1 (%eax),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x05, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 c9 05 78 56 34 12 \tsha1msg1 0x12345678,%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x18, }, 4, 0, "", "", +"0f 38 c9 18 \tsha1msg1 (%eax),%xmm3",}, +{{0x0f, 0x38, 0xc9, 0x04, 0x01, }, 5, 0, "", "", +"0f 38 c9 04 01 \tsha1msg1 (%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x04, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c9 04 05 78 56 34 12 \tsha1msg1 0x12345678(,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x04, 0x08, }, 5, 0, "", "", +"0f 38 c9 04 08 \tsha1msg1 (%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x04, 0xc8, }, 5, 0, "", "", +"0f 38 c9 04 c8 \tsha1msg1 (%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x40, 0x12, }, 5, 0, "", "", +"0f 38 c9 40 12 \tsha1msg1 0x12(%eax),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x45, 0x12, }, 5, 0, "", "", +"0f 38 c9 45 12 \tsha1msg1 0x12(%ebp),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x44, 0x01, 0x12, }, 6, 0, "", "", +"0f 38 c9 44 01 12 \tsha1msg1 0x12(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x44, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 c9 44 05 12 \tsha1msg1 0x12(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x44, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 c9 44 08 12 \tsha1msg1 0x12(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x44, 0xc8, 0x12, }, 6, 0, "", "", +"0f 38 c9 44 c8 12 \tsha1msg1 0x12(%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x80, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 c9 80 78 56 34 12 \tsha1msg1 0x12345678(%eax),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x85, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 c9 85 78 56 34 12 \tsha1msg1 0x12345678(%ebp),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c9 84 01 78 56 34 12 \tsha1msg1 0x12345678(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c9 84 05 78 56 34 12 \tsha1msg1 0x12345678(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c9 84 08 78 56 34 12 \tsha1msg1 0x12345678(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c9 84 c8 78 56 34 12 \tsha1msg1 0x12345678(%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xca, 0xc1, }, 4, 0, "", "", +"0f 38 ca c1 \tsha1msg2 %xmm1,%xmm0",}, +{{0x0f, 0x38, 0xca, 0xd7, }, 4, 0, "", "", +"0f 38 ca d7 \tsha1msg2 %xmm7,%xmm2",}, +{{0x0f, 0x38, 0xca, 0x00, }, 4, 0, "", "", +"0f 38 ca 00 \tsha1msg2 (%eax),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x05, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 ca 05 78 56 34 12 \tsha1msg2 0x12345678,%xmm0",}, +{{0x0f, 0x38, 0xca, 0x18, }, 4, 0, "", "", +"0f 38 ca 18 \tsha1msg2 (%eax),%xmm3",}, +{{0x0f, 0x38, 0xca, 0x04, 
0x01, }, 5, 0, "", "", +"0f 38 ca 04 01 \tsha1msg2 (%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x04, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 ca 04 05 78 56 34 12 \tsha1msg2 0x12345678(,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x04, 0x08, }, 5, 0, "", "", +"0f 38 ca 04 08 \tsha1msg2 (%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x04, 0xc8, }, 5, 0, "", "", +"0f 38 ca 04 c8 \tsha1msg2 (%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x40, 0x12, }, 5, 0, "", "", +"0f 38 ca 40 12 \tsha1msg2 0x12(%eax),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x45, 0x12, }, 5, 0, "", "", +"0f 38 ca 45 12 \tsha1msg2 0x12(%ebp),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x44, 0x01, 0x12, }, 6, 0, "", "", +"0f 38 ca 44 01 12 \tsha1msg2 0x12(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x44, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 ca 44 05 12 \tsha1msg2 0x12(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x44, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 ca 44 08 12 \tsha1msg2 0x12(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x44, 0xc8, 0x12, }, 6, 0, "", "", +"0f 38 ca 44 c8 12 \tsha1msg2 0x12(%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x80, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 ca 80 78 56 34 12 \tsha1msg2 0x12345678(%eax),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x85, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 ca 85 78 56 34 12 \tsha1msg2 0x12345678(%ebp),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 ca 84 01 78 56 34 12 \tsha1msg2 0x12345678(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 ca 84 05 78 56 34 12 \tsha1msg2 0x12345678(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 ca 84 08 78 56 34 12 \tsha1msg2 0x12345678(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 ca 84 c8 78 56 34 12 \tsha1msg2 0x12345678(%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xcb, 0xcc, }, 4, 0, "", "", +"0f 38 cb cc \tsha256rnds2 %xmm0,%xmm4,%xmm1",}, +{{0x0f, 0x38, 0xcb, 0xd7, }, 4, 0, "", "", +"0f 38 cb d7 \tsha256rnds2 %xmm0,%xmm7,%xmm2",}, +{{0x0f, 0x38, 0xcb, 0x08, }, 4, 0, "", "", +"0f 38 cb 08 \tsha256rnds2 %xmm0,(%eax),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x0d, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cb 0d 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678,%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x18, }, 4, 0, "", "", +"0f 38 cb 18 \tsha256rnds2 %xmm0,(%eax),%xmm3",}, +{{0x0f, 0x38, 0xcb, 0x0c, 0x01, }, 5, 0, "", "", +"0f 38 cb 0c 01 \tsha256rnds2 %xmm0,(%ecx,%eax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x0c, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cb 0c 05 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(,%eax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x0c, 0x08, }, 5, 0, "", "", +"0f 38 cb 0c 08 \tsha256rnds2 %xmm0,(%eax,%ecx,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x0c, 0xc8, }, 5, 0, "", "", +"0f 38 cb 0c c8 \tsha256rnds2 %xmm0,(%eax,%ecx,8),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x48, 0x12, }, 5, 0, "", "", +"0f 38 cb 48 12 \tsha256rnds2 %xmm0,0x12(%eax),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x4d, 0x12, }, 5, 0, "", "", +"0f 38 cb 4d 12 \tsha256rnds2 %xmm0,0x12(%ebp),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x4c, 0x01, 0x12, }, 6, 0, "", "", +"0f 38 cb 4c 01 12 \tsha256rnds2 %xmm0,0x12(%ecx,%eax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x4c, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 cb 4c 05 12 \tsha256rnds2 %xmm0,0x12(%ebp,%eax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x4c, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 cb 4c 08 12 \tsha256rnds2 %xmm0,0x12(%eax,%ecx,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x4c, 
0xc8, 0x12, }, 6, 0, "", "", +"0f 38 cb 4c c8 12 \tsha256rnds2 %xmm0,0x12(%eax,%ecx,8),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x88, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cb 88 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%eax),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x8d, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cb 8d 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%ebp),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x8c, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cb 8c 01 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%ecx,%eax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x8c, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cb 8c 05 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%ebp,%eax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x8c, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cb 8c 08 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%eax,%ecx,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x8c, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cb 8c c8 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%eax,%ecx,8),%xmm1",}, +{{0x0f, 0x38, 0xcc, 0xc1, }, 4, 0, "", "", +"0f 38 cc c1 \tsha256msg1 %xmm1,%xmm0",}, +{{0x0f, 0x38, 0xcc, 0xd7, }, 4, 0, "", "", +"0f 38 cc d7 \tsha256msg1 %xmm7,%xmm2",}, +{{0x0f, 0x38, 0xcc, 0x00, }, 4, 0, "", "", +"0f 38 cc 00 \tsha256msg1 (%eax),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x05, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cc 05 78 56 34 12 \tsha256msg1 0x12345678,%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x18, }, 4, 0, "", "", +"0f 38 cc 18 \tsha256msg1 (%eax),%xmm3",}, +{{0x0f, 0x38, 0xcc, 0x04, 0x01, }, 5, 0, "", "", +"0f 38 cc 04 01 \tsha256msg1 (%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x04, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cc 04 05 78 56 34 12 \tsha256msg1 0x12345678(,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x04, 0x08, }, 5, 0, "", "", +"0f 38 cc 04 08 \tsha256msg1 (%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x04, 0xc8, }, 5, 0, "", "", +"0f 38 cc 04 c8 \tsha256msg1 (%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x40, 0x12, }, 5, 0, "", "", +"0f 38 cc 40 12 \tsha256msg1 0x12(%eax),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x45, 0x12, }, 5, 0, "", "", +"0f 38 cc 45 12 \tsha256msg1 0x12(%ebp),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x44, 0x01, 0x12, }, 6, 0, "", "", +"0f 38 cc 44 01 12 \tsha256msg1 0x12(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x44, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 cc 44 05 12 \tsha256msg1 0x12(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x44, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 cc 44 08 12 \tsha256msg1 0x12(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x44, 0xc8, 0x12, }, 6, 0, "", "", +"0f 38 cc 44 c8 12 \tsha256msg1 0x12(%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x80, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cc 80 78 56 34 12 \tsha256msg1 0x12345678(%eax),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x85, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cc 85 78 56 34 12 \tsha256msg1 0x12345678(%ebp),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cc 84 01 78 56 34 12 \tsha256msg1 0x12345678(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cc 84 05 78 56 34 12 \tsha256msg1 0x12345678(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cc 84 08 78 56 34 12 \tsha256msg1 0x12345678(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cc 84 c8 78 56 34 12 \tsha256msg1 0x12345678(%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0xc1, }, 4, 0, "", "", +"0f 38 cd c1 \tsha256msg2 
%xmm1,%xmm0",}, +{{0x0f, 0x38, 0xcd, 0xd7, }, 4, 0, "", "", +"0f 38 cd d7 \tsha256msg2 %xmm7,%xmm2",}, +{{0x0f, 0x38, 0xcd, 0x00, }, 4, 0, "", "", +"0f 38 cd 00 \tsha256msg2 (%eax),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x05, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cd 05 78 56 34 12 \tsha256msg2 0x12345678,%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x18, }, 4, 0, "", "", +"0f 38 cd 18 \tsha256msg2 (%eax),%xmm3",}, +{{0x0f, 0x38, 0xcd, 0x04, 0x01, }, 5, 0, "", "", +"0f 38 cd 04 01 \tsha256msg2 (%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x04, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cd 04 05 78 56 34 12 \tsha256msg2 0x12345678(,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x04, 0x08, }, 5, 0, "", "", +"0f 38 cd 04 08 \tsha256msg2 (%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x04, 0xc8, }, 5, 0, "", "", +"0f 38 cd 04 c8 \tsha256msg2 (%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x40, 0x12, }, 5, 0, "", "", +"0f 38 cd 40 12 \tsha256msg2 0x12(%eax),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x45, 0x12, }, 5, 0, "", "", +"0f 38 cd 45 12 \tsha256msg2 0x12(%ebp),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x44, 0x01, 0x12, }, 6, 0, "", "", +"0f 38 cd 44 01 12 \tsha256msg2 0x12(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x44, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 cd 44 05 12 \tsha256msg2 0x12(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x44, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 cd 44 08 12 \tsha256msg2 0x12(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x44, 0xc8, 0x12, }, 6, 0, "", "", +"0f 38 cd 44 c8 12 \tsha256msg2 0x12(%eax,%ecx,8),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x80, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cd 80 78 56 34 12 \tsha256msg2 0x12345678(%eax),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x85, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cd 85 78 56 34 12 \tsha256msg2 0x12345678(%ebp),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cd 84 01 78 56 34 12 \tsha256msg2 0x12345678(%ecx,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cd 84 05 78 56 34 12 \tsha256msg2 0x12345678(%ebp,%eax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cd 84 08 78 56 34 12 \tsha256msg2 0x12345678(%eax,%ecx,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cd 84 c8 78 56 34 12 \tsha256msg2 0x12345678(%eax,%ecx,8),%xmm0",}, diff --git a/tools/perf/tests/insn-x86-dat-64.c b/tools/perf/tests/insn-x86-dat-64.c index 01122421a776..13f008588590 100644 --- a/tools/perf/tests/insn-x86-dat-64.c +++ b/tools/perf/tests/insn-x86-dat-64.c @@ -338,3 +338,367 @@ "67 f2 ff 21 \tbnd jmpq *(%ecx)",}, {{0xf2, 0x0f, 0x85, 0x00, 0x00, 0x00, 0x00, }, 7, 0, "jcc", "conditional", "f2 0f 85 00 00 00 00 \tbnd jne 413 ",}, +{{0x0f, 0x3a, 0xcc, 0xc1, 0x00, }, 5, 0, "", "", +"0f 3a cc c1 00 \tsha1rnds4 $0x0,%xmm1,%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0xd7, 0x91, }, 5, 0, "", "", +"0f 3a cc d7 91 \tsha1rnds4 $0x91,%xmm7,%xmm2",}, +{{0x41, 0x0f, 0x3a, 0xcc, 0xc0, 0x91, }, 6, 0, "", "", +"41 0f 3a cc c0 91 \tsha1rnds4 $0x91,%xmm8,%xmm0",}, +{{0x44, 0x0f, 0x3a, 0xcc, 0xc7, 0x91, }, 6, 0, "", "", +"44 0f 3a cc c7 91 \tsha1rnds4 $0x91,%xmm7,%xmm8",}, +{{0x45, 0x0f, 0x3a, 0xcc, 0xc7, 0x91, }, 6, 0, "", "", +"45 0f 3a cc c7 91 \tsha1rnds4 $0x91,%xmm15,%xmm8",}, +{{0x0f, 0x3a, 0xcc, 0x00, 0x91, }, 5, 0, "", "", +"0f 3a cc 00 91 \tsha1rnds4 $0x91,(%rax),%xmm0",}, +{{0x41, 0x0f, 0x3a, 0xcc, 0x00, 0x91, }, 6, 0, "", "", +"41 0f 3a cc 00 91 \tsha1rnds4 $0x91,(%r8),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x04, 
0x25, 0x78, 0x56, 0x34, 0x12, 0x91, }, 10, 0, "", "", +"0f 3a cc 04 25 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678,%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x18, 0x91, }, 5, 0, "", "", +"0f 3a cc 18 91 \tsha1rnds4 $0x91,(%rax),%xmm3",}, +{{0x0f, 0x3a, 0xcc, 0x04, 0x01, 0x91, }, 6, 0, "", "", +"0f 3a cc 04 01 91 \tsha1rnds4 $0x91,(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x04, 0x05, 0x78, 0x56, 0x34, 0x12, 0x91, }, 10, 0, "", "", +"0f 3a cc 04 05 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(,%rax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x04, 0x08, 0x91, }, 6, 0, "", "", +"0f 3a cc 04 08 91 \tsha1rnds4 $0x91,(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x04, 0xc8, 0x91, }, 6, 0, "", "", +"0f 3a cc 04 c8 91 \tsha1rnds4 $0x91,(%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x40, 0x12, 0x91, }, 6, 0, "", "", +"0f 3a cc 40 12 91 \tsha1rnds4 $0x91,0x12(%rax),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x45, 0x12, 0x91, }, 6, 0, "", "", +"0f 3a cc 45 12 91 \tsha1rnds4 $0x91,0x12(%rbp),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x44, 0x01, 0x12, 0x91, }, 7, 0, "", "", +"0f 3a cc 44 01 12 91 \tsha1rnds4 $0x91,0x12(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x44, 0x05, 0x12, 0x91, }, 7, 0, "", "", +"0f 3a cc 44 05 12 91 \tsha1rnds4 $0x91,0x12(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x44, 0x08, 0x12, 0x91, }, 7, 0, "", "", +"0f 3a cc 44 08 12 91 \tsha1rnds4 $0x91,0x12(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x44, 0xc8, 0x12, 0x91, }, 7, 0, "", "", +"0f 3a cc 44 c8 12 91 \tsha1rnds4 $0x91,0x12(%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x80, 0x78, 0x56, 0x34, 0x12, 0x91, }, 9, 0, "", "", +"0f 3a cc 80 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%rax),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x85, 0x78, 0x56, 0x34, 0x12, 0x91, }, 9, 0, "", "", +"0f 3a cc 85 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%rbp),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, 0x91, }, 10, 0, "", "", +"0f 3a cc 84 01 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, 0x91, }, 10, 0, "", "", +"0f 3a cc 84 05 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, 0x91, }, 10, 0, "", "", +"0f 3a cc 84 08 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x3a, 0xcc, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, 0x91, }, 10, 0, "", "", +"0f 3a cc 84 c8 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%rax,%rcx,8),%xmm0",}, +{{0x44, 0x0f, 0x3a, 0xcc, 0xbc, 0xc8, 0x78, 0x56, 0x34, 0x12, 0x91, }, 11, 0, "", "", +"44 0f 3a cc bc c8 78 56 34 12 91 \tsha1rnds4 $0x91,0x12345678(%rax,%rcx,8),%xmm15",}, +{{0x0f, 0x38, 0xc8, 0xc1, }, 4, 0, "", "", +"0f 38 c8 c1 \tsha1nexte %xmm1,%xmm0",}, +{{0x0f, 0x38, 0xc8, 0xd7, }, 4, 0, "", "", +"0f 38 c8 d7 \tsha1nexte %xmm7,%xmm2",}, +{{0x41, 0x0f, 0x38, 0xc8, 0xc0, }, 5, 0, "", "", +"41 0f 38 c8 c0 \tsha1nexte %xmm8,%xmm0",}, +{{0x44, 0x0f, 0x38, 0xc8, 0xc7, }, 5, 0, "", "", +"44 0f 38 c8 c7 \tsha1nexte %xmm7,%xmm8",}, +{{0x45, 0x0f, 0x38, 0xc8, 0xc7, }, 5, 0, "", "", +"45 0f 38 c8 c7 \tsha1nexte %xmm15,%xmm8",}, +{{0x0f, 0x38, 0xc8, 0x00, }, 4, 0, "", "", +"0f 38 c8 00 \tsha1nexte (%rax),%xmm0",}, +{{0x41, 0x0f, 0x38, 0xc8, 0x00, }, 5, 0, "", "", +"41 0f 38 c8 00 \tsha1nexte (%r8),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x04, 0x25, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c8 04 25 78 56 34 12 \tsha1nexte 0x12345678,%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x18, }, 4, 0, "", "", +"0f 38 c8 18 \tsha1nexte (%rax),%xmm3",}, +{{0x0f, 0x38, 0xc8, 0x04, 0x01, }, 5, 0, "", "", +"0f 38 c8 04 
01 \tsha1nexte (%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x04, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c8 04 05 78 56 34 12 \tsha1nexte 0x12345678(,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x04, 0x08, }, 5, 0, "", "", +"0f 38 c8 04 08 \tsha1nexte (%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x04, 0xc8, }, 5, 0, "", "", +"0f 38 c8 04 c8 \tsha1nexte (%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x40, 0x12, }, 5, 0, "", "", +"0f 38 c8 40 12 \tsha1nexte 0x12(%rax),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x45, 0x12, }, 5, 0, "", "", +"0f 38 c8 45 12 \tsha1nexte 0x12(%rbp),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x44, 0x01, 0x12, }, 6, 0, "", "", +"0f 38 c8 44 01 12 \tsha1nexte 0x12(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x44, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 c8 44 05 12 \tsha1nexte 0x12(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x44, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 c8 44 08 12 \tsha1nexte 0x12(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x44, 0xc8, 0x12, }, 6, 0, "", "", +"0f 38 c8 44 c8 12 \tsha1nexte 0x12(%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x80, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 c8 80 78 56 34 12 \tsha1nexte 0x12345678(%rax),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x85, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 c8 85 78 56 34 12 \tsha1nexte 0x12345678(%rbp),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c8 84 01 78 56 34 12 \tsha1nexte 0x12345678(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c8 84 05 78 56 34 12 \tsha1nexte 0x12345678(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c8 84 08 78 56 34 12 \tsha1nexte 0x12345678(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xc8, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c8 84 c8 78 56 34 12 \tsha1nexte 0x12345678(%rax,%rcx,8),%xmm0",}, +{{0x44, 0x0f, 0x38, 0xc8, 0xbc, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", +"44 0f 38 c8 bc c8 78 56 34 12 \tsha1nexte 0x12345678(%rax,%rcx,8),%xmm15",}, +{{0x0f, 0x38, 0xc9, 0xc1, }, 4, 0, "", "", +"0f 38 c9 c1 \tsha1msg1 %xmm1,%xmm0",}, +{{0x0f, 0x38, 0xc9, 0xd7, }, 4, 0, "", "", +"0f 38 c9 d7 \tsha1msg1 %xmm7,%xmm2",}, +{{0x41, 0x0f, 0x38, 0xc9, 0xc0, }, 5, 0, "", "", +"41 0f 38 c9 c0 \tsha1msg1 %xmm8,%xmm0",}, +{{0x44, 0x0f, 0x38, 0xc9, 0xc7, }, 5, 0, "", "", +"44 0f 38 c9 c7 \tsha1msg1 %xmm7,%xmm8",}, +{{0x45, 0x0f, 0x38, 0xc9, 0xc7, }, 5, 0, "", "", +"45 0f 38 c9 c7 \tsha1msg1 %xmm15,%xmm8",}, +{{0x0f, 0x38, 0xc9, 0x00, }, 4, 0, "", "", +"0f 38 c9 00 \tsha1msg1 (%rax),%xmm0",}, +{{0x41, 0x0f, 0x38, 0xc9, 0x00, }, 5, 0, "", "", +"41 0f 38 c9 00 \tsha1msg1 (%r8),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x04, 0x25, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c9 04 25 78 56 34 12 \tsha1msg1 0x12345678,%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x18, }, 4, 0, "", "", +"0f 38 c9 18 \tsha1msg1 (%rax),%xmm3",}, +{{0x0f, 0x38, 0xc9, 0x04, 0x01, }, 5, 0, "", "", +"0f 38 c9 04 01 \tsha1msg1 (%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x04, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c9 04 05 78 56 34 12 \tsha1msg1 0x12345678(,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x04, 0x08, }, 5, 0, "", "", +"0f 38 c9 04 08 \tsha1msg1 (%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x04, 0xc8, }, 5, 0, "", "", +"0f 38 c9 04 c8 \tsha1msg1 (%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x40, 0x12, }, 5, 0, "", "", +"0f 38 c9 40 12 \tsha1msg1 0x12(%rax),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x45, 0x12, }, 5, 0, "", "", +"0f 38 c9 45 12 
\tsha1msg1 0x12(%rbp),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x44, 0x01, 0x12, }, 6, 0, "", "", +"0f 38 c9 44 01 12 \tsha1msg1 0x12(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x44, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 c9 44 05 12 \tsha1msg1 0x12(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x44, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 c9 44 08 12 \tsha1msg1 0x12(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x44, 0xc8, 0x12, }, 6, 0, "", "", +"0f 38 c9 44 c8 12 \tsha1msg1 0x12(%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x80, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 c9 80 78 56 34 12 \tsha1msg1 0x12345678(%rax),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x85, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 c9 85 78 56 34 12 \tsha1msg1 0x12345678(%rbp),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c9 84 01 78 56 34 12 \tsha1msg1 0x12345678(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c9 84 05 78 56 34 12 \tsha1msg1 0x12345678(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c9 84 08 78 56 34 12 \tsha1msg1 0x12345678(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xc9, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 c9 84 c8 78 56 34 12 \tsha1msg1 0x12345678(%rax,%rcx,8),%xmm0",}, +{{0x44, 0x0f, 0x38, 0xc9, 0xbc, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", +"44 0f 38 c9 bc c8 78 56 34 12 \tsha1msg1 0x12345678(%rax,%rcx,8),%xmm15",}, +{{0x0f, 0x38, 0xca, 0xc1, }, 4, 0, "", "", +"0f 38 ca c1 \tsha1msg2 %xmm1,%xmm0",}, +{{0x0f, 0x38, 0xca, 0xd7, }, 4, 0, "", "", +"0f 38 ca d7 \tsha1msg2 %xmm7,%xmm2",}, +{{0x41, 0x0f, 0x38, 0xca, 0xc0, }, 5, 0, "", "", +"41 0f 38 ca c0 \tsha1msg2 %xmm8,%xmm0",}, +{{0x44, 0x0f, 0x38, 0xca, 0xc7, }, 5, 0, "", "", +"44 0f 38 ca c7 \tsha1msg2 %xmm7,%xmm8",}, +{{0x45, 0x0f, 0x38, 0xca, 0xc7, }, 5, 0, "", "", +"45 0f 38 ca c7 \tsha1msg2 %xmm15,%xmm8",}, +{{0x0f, 0x38, 0xca, 0x00, }, 4, 0, "", "", +"0f 38 ca 00 \tsha1msg2 (%rax),%xmm0",}, +{{0x41, 0x0f, 0x38, 0xca, 0x00, }, 5, 0, "", "", +"41 0f 38 ca 00 \tsha1msg2 (%r8),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x04, 0x25, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 ca 04 25 78 56 34 12 \tsha1msg2 0x12345678,%xmm0",}, +{{0x0f, 0x38, 0xca, 0x18, }, 4, 0, "", "", +"0f 38 ca 18 \tsha1msg2 (%rax),%xmm3",}, +{{0x0f, 0x38, 0xca, 0x04, 0x01, }, 5, 0, "", "", +"0f 38 ca 04 01 \tsha1msg2 (%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x04, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 ca 04 05 78 56 34 12 \tsha1msg2 0x12345678(,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x04, 0x08, }, 5, 0, "", "", +"0f 38 ca 04 08 \tsha1msg2 (%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x04, 0xc8, }, 5, 0, "", "", +"0f 38 ca 04 c8 \tsha1msg2 (%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x40, 0x12, }, 5, 0, "", "", +"0f 38 ca 40 12 \tsha1msg2 0x12(%rax),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x45, 0x12, }, 5, 0, "", "", +"0f 38 ca 45 12 \tsha1msg2 0x12(%rbp),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x44, 0x01, 0x12, }, 6, 0, "", "", +"0f 38 ca 44 01 12 \tsha1msg2 0x12(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x44, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 ca 44 05 12 \tsha1msg2 0x12(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x44, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 ca 44 08 12 \tsha1msg2 0x12(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x44, 0xc8, 0x12, }, 6, 0, "", "", +"0f 38 ca 44 c8 12 \tsha1msg2 0x12(%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x80, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 ca 80 78 
56 34 12 \tsha1msg2 0x12345678(%rax),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x85, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 ca 85 78 56 34 12 \tsha1msg2 0x12345678(%rbp),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 ca 84 01 78 56 34 12 \tsha1msg2 0x12345678(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 ca 84 05 78 56 34 12 \tsha1msg2 0x12345678(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 ca 84 08 78 56 34 12 \tsha1msg2 0x12345678(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xca, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 ca 84 c8 78 56 34 12 \tsha1msg2 0x12345678(%rax,%rcx,8),%xmm0",}, +{{0x44, 0x0f, 0x38, 0xca, 0xbc, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", +"44 0f 38 ca bc c8 78 56 34 12 \tsha1msg2 0x12345678(%rax,%rcx,8),%xmm15",}, +{{0x0f, 0x38, 0xcb, 0xcc, }, 4, 0, "", "", +"0f 38 cb cc \tsha256rnds2 %xmm0,%xmm4,%xmm1",}, +{{0x0f, 0x38, 0xcb, 0xd7, }, 4, 0, "", "", +"0f 38 cb d7 \tsha256rnds2 %xmm0,%xmm7,%xmm2",}, +{{0x41, 0x0f, 0x38, 0xcb, 0xc8, }, 5, 0, "", "", +"41 0f 38 cb c8 \tsha256rnds2 %xmm0,%xmm8,%xmm1",}, +{{0x44, 0x0f, 0x38, 0xcb, 0xc7, }, 5, 0, "", "", +"44 0f 38 cb c7 \tsha256rnds2 %xmm0,%xmm7,%xmm8",}, +{{0x45, 0x0f, 0x38, 0xcb, 0xc7, }, 5, 0, "", "", +"45 0f 38 cb c7 \tsha256rnds2 %xmm0,%xmm15,%xmm8",}, +{{0x0f, 0x38, 0xcb, 0x08, }, 4, 0, "", "", +"0f 38 cb 08 \tsha256rnds2 %xmm0,(%rax),%xmm1",}, +{{0x41, 0x0f, 0x38, 0xcb, 0x08, }, 5, 0, "", "", +"41 0f 38 cb 08 \tsha256rnds2 %xmm0,(%r8),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x0c, 0x25, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cb 0c 25 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678,%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x18, }, 4, 0, "", "", +"0f 38 cb 18 \tsha256rnds2 %xmm0,(%rax),%xmm3",}, +{{0x0f, 0x38, 0xcb, 0x0c, 0x01, }, 5, 0, "", "", +"0f 38 cb 0c 01 \tsha256rnds2 %xmm0,(%rcx,%rax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x0c, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cb 0c 05 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(,%rax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x0c, 0x08, }, 5, 0, "", "", +"0f 38 cb 0c 08 \tsha256rnds2 %xmm0,(%rax,%rcx,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x0c, 0xc8, }, 5, 0, "", "", +"0f 38 cb 0c c8 \tsha256rnds2 %xmm0,(%rax,%rcx,8),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x48, 0x12, }, 5, 0, "", "", +"0f 38 cb 48 12 \tsha256rnds2 %xmm0,0x12(%rax),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x4d, 0x12, }, 5, 0, "", "", +"0f 38 cb 4d 12 \tsha256rnds2 %xmm0,0x12(%rbp),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x4c, 0x01, 0x12, }, 6, 0, "", "", +"0f 38 cb 4c 01 12 \tsha256rnds2 %xmm0,0x12(%rcx,%rax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x4c, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 cb 4c 05 12 \tsha256rnds2 %xmm0,0x12(%rbp,%rax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x4c, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 cb 4c 08 12 \tsha256rnds2 %xmm0,0x12(%rax,%rcx,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x4c, 0xc8, 0x12, }, 6, 0, "", "", +"0f 38 cb 4c c8 12 \tsha256rnds2 %xmm0,0x12(%rax,%rcx,8),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x88, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cb 88 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%rax),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x8d, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cb 8d 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%rbp),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x8c, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cb 8c 01 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%rcx,%rax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x8c, 0x05, 0x78, 0x56, 0x34, 
0x12, }, 9, 0, "", "", +"0f 38 cb 8c 05 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%rbp,%rax,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x8c, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cb 8c 08 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%rax,%rcx,1),%xmm1",}, +{{0x0f, 0x38, 0xcb, 0x8c, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cb 8c c8 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%rax,%rcx,8),%xmm1",}, +{{0x44, 0x0f, 0x38, 0xcb, 0xbc, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", +"44 0f 38 cb bc c8 78 56 34 12 \tsha256rnds2 %xmm0,0x12345678(%rax,%rcx,8),%xmm15",}, +{{0x0f, 0x38, 0xcc, 0xc1, }, 4, 0, "", "", +"0f 38 cc c1 \tsha256msg1 %xmm1,%xmm0",}, +{{0x0f, 0x38, 0xcc, 0xd7, }, 4, 0, "", "", +"0f 38 cc d7 \tsha256msg1 %xmm7,%xmm2",}, +{{0x41, 0x0f, 0x38, 0xcc, 0xc0, }, 5, 0, "", "", +"41 0f 38 cc c0 \tsha256msg1 %xmm8,%xmm0",}, +{{0x44, 0x0f, 0x38, 0xcc, 0xc7, }, 5, 0, "", "", +"44 0f 38 cc c7 \tsha256msg1 %xmm7,%xmm8",}, +{{0x45, 0x0f, 0x38, 0xcc, 0xc7, }, 5, 0, "", "", +"45 0f 38 cc c7 \tsha256msg1 %xmm15,%xmm8",}, +{{0x0f, 0x38, 0xcc, 0x00, }, 4, 0, "", "", +"0f 38 cc 00 \tsha256msg1 (%rax),%xmm0",}, +{{0x41, 0x0f, 0x38, 0xcc, 0x00, }, 5, 0, "", "", +"41 0f 38 cc 00 \tsha256msg1 (%r8),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x04, 0x25, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cc 04 25 78 56 34 12 \tsha256msg1 0x12345678,%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x18, }, 4, 0, "", "", +"0f 38 cc 18 \tsha256msg1 (%rax),%xmm3",}, +{{0x0f, 0x38, 0xcc, 0x04, 0x01, }, 5, 0, "", "", +"0f 38 cc 04 01 \tsha256msg1 (%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x04, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cc 04 05 78 56 34 12 \tsha256msg1 0x12345678(,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x04, 0x08, }, 5, 0, "", "", +"0f 38 cc 04 08 \tsha256msg1 (%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x04, 0xc8, }, 5, 0, "", "", +"0f 38 cc 04 c8 \tsha256msg1 (%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x40, 0x12, }, 5, 0, "", "", +"0f 38 cc 40 12 \tsha256msg1 0x12(%rax),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x45, 0x12, }, 5, 0, "", "", +"0f 38 cc 45 12 \tsha256msg1 0x12(%rbp),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x44, 0x01, 0x12, }, 6, 0, "", "", +"0f 38 cc 44 01 12 \tsha256msg1 0x12(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x44, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 cc 44 05 12 \tsha256msg1 0x12(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x44, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 cc 44 08 12 \tsha256msg1 0x12(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x44, 0xc8, 0x12, }, 6, 0, "", "", +"0f 38 cc 44 c8 12 \tsha256msg1 0x12(%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x80, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cc 80 78 56 34 12 \tsha256msg1 0x12345678(%rax),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x85, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cc 85 78 56 34 12 \tsha256msg1 0x12345678(%rbp),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cc 84 01 78 56 34 12 \tsha256msg1 0x12345678(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cc 84 05 78 56 34 12 \tsha256msg1 0x12345678(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cc 84 08 78 56 34 12 \tsha256msg1 0x12345678(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xcc, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cc 84 c8 78 56 34 12 \tsha256msg1 0x12345678(%rax,%rcx,8),%xmm0",}, +{{0x44, 0x0f, 0x38, 0xcc, 0xbc, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", +"44 0f 38 cc bc 
c8 78 56 34 12 \tsha256msg1 0x12345678(%rax,%rcx,8),%xmm15",}, +{{0x0f, 0x38, 0xcd, 0xc1, }, 4, 0, "", "", +"0f 38 cd c1 \tsha256msg2 %xmm1,%xmm0",}, +{{0x0f, 0x38, 0xcd, 0xd7, }, 4, 0, "", "", +"0f 38 cd d7 \tsha256msg2 %xmm7,%xmm2",}, +{{0x41, 0x0f, 0x38, 0xcd, 0xc0, }, 5, 0, "", "", +"41 0f 38 cd c0 \tsha256msg2 %xmm8,%xmm0",}, +{{0x44, 0x0f, 0x38, 0xcd, 0xc7, }, 5, 0, "", "", +"44 0f 38 cd c7 \tsha256msg2 %xmm7,%xmm8",}, +{{0x45, 0x0f, 0x38, 0xcd, 0xc7, }, 5, 0, "", "", +"45 0f 38 cd c7 \tsha256msg2 %xmm15,%xmm8",}, +{{0x0f, 0x38, 0xcd, 0x00, }, 4, 0, "", "", +"0f 38 cd 00 \tsha256msg2 (%rax),%xmm0",}, +{{0x41, 0x0f, 0x38, 0xcd, 0x00, }, 5, 0, "", "", +"41 0f 38 cd 00 \tsha256msg2 (%r8),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x04, 0x25, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cd 04 25 78 56 34 12 \tsha256msg2 0x12345678,%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x18, }, 4, 0, "", "", +"0f 38 cd 18 \tsha256msg2 (%rax),%xmm3",}, +{{0x0f, 0x38, 0xcd, 0x04, 0x01, }, 5, 0, "", "", +"0f 38 cd 04 01 \tsha256msg2 (%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x04, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cd 04 05 78 56 34 12 \tsha256msg2 0x12345678(,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x04, 0x08, }, 5, 0, "", "", +"0f 38 cd 04 08 \tsha256msg2 (%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x04, 0xc8, }, 5, 0, "", "", +"0f 38 cd 04 c8 \tsha256msg2 (%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x40, 0x12, }, 5, 0, "", "", +"0f 38 cd 40 12 \tsha256msg2 0x12(%rax),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x45, 0x12, }, 5, 0, "", "", +"0f 38 cd 45 12 \tsha256msg2 0x12(%rbp),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x44, 0x01, 0x12, }, 6, 0, "", "", +"0f 38 cd 44 01 12 \tsha256msg2 0x12(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x44, 0x05, 0x12, }, 6, 0, "", "", +"0f 38 cd 44 05 12 \tsha256msg2 0x12(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x44, 0x08, 0x12, }, 6, 0, "", "", +"0f 38 cd 44 08 12 \tsha256msg2 0x12(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x44, 0xc8, 0x12, }, 6, 0, "", "", +"0f 38 cd 44 c8 12 \tsha256msg2 0x12(%rax,%rcx,8),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x80, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cd 80 78 56 34 12 \tsha256msg2 0x12345678(%rax),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x85, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f 38 cd 85 78 56 34 12 \tsha256msg2 0x12345678(%rbp),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x84, 0x01, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cd 84 01 78 56 34 12 \tsha256msg2 0x12345678(%rcx,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x84, 0x05, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cd 84 05 78 56 34 12 \tsha256msg2 0x12345678(%rbp,%rax,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x84, 0x08, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cd 84 08 78 56 34 12 \tsha256msg2 0x12345678(%rax,%rcx,1),%xmm0",}, +{{0x0f, 0x38, 0xcd, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"0f 38 cd 84 c8 78 56 34 12 \tsha256msg2 0x12345678(%rax,%rcx,8),%xmm0",}, +{{0x44, 0x0f, 0x38, 0xcd, 0xbc, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", +"44 0f 38 cd bc c8 78 56 34 12 \tsha256msg2 0x12345678(%rax,%rcx,8),%xmm15",}, diff --git a/tools/perf/tests/insn-x86-dat-src.c b/tools/perf/tests/insn-x86-dat-src.c index b506830f33a8..7d06c9b22070 100644 --- a/tools/perf/tests/insn-x86-dat-src.c +++ b/tools/perf/tests/insn-x86-dat-src.c @@ -217,6 +217,210 @@ int main(void) asm volatile("bnd jmp *(%ecx)"); /* Expecting: jmp indirect 0 */ asm volatile("bnd jne label1"); /* Expecting: jcc conditional 0 */ + /* sha1rnds4 imm8, xmm2/m128, xmm1 */ + + asm volatile("sha1rnds4 $0x0, %xmm1, 
%xmm0"); + asm volatile("sha1rnds4 $0x91, %xmm7, %xmm2"); + asm volatile("sha1rnds4 $0x91, %xmm8, %xmm0"); + asm volatile("sha1rnds4 $0x91, %xmm7, %xmm8"); + asm volatile("sha1rnds4 $0x91, %xmm15, %xmm8"); + asm volatile("sha1rnds4 $0x91, (%rax), %xmm0"); + asm volatile("sha1rnds4 $0x91, (%r8), %xmm0"); + asm volatile("sha1rnds4 $0x91, (0x12345678), %xmm0"); + asm volatile("sha1rnds4 $0x91, (%rax), %xmm3"); + asm volatile("sha1rnds4 $0x91, (%rcx,%rax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(,%rax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, (%rax,%rcx,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, (%rax,%rcx,8), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12(%rax), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12(%rbp), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12(%rcx,%rax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12(%rbp,%rax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12(%rax,%rcx,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12(%rax,%rcx,8), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%rax), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%rbp), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%rcx,%rax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%rbp,%rax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%rax,%rcx,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%rax,%rcx,8), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%rax,%rcx,8), %xmm15"); + + /* sha1nexte xmm2/m128, xmm1 */ + + asm volatile("sha1nexte %xmm1, %xmm0"); + asm volatile("sha1nexte %xmm7, %xmm2"); + asm volatile("sha1nexte %xmm8, %xmm0"); + asm volatile("sha1nexte %xmm7, %xmm8"); + asm volatile("sha1nexte %xmm15, %xmm8"); + asm volatile("sha1nexte (%rax), %xmm0"); + asm volatile("sha1nexte (%r8), %xmm0"); + asm volatile("sha1nexte (0x12345678), %xmm0"); + asm volatile("sha1nexte (%rax), %xmm3"); + asm volatile("sha1nexte (%rcx,%rax,1), %xmm0"); + asm volatile("sha1nexte 0x12345678(,%rax,1), %xmm0"); + asm volatile("sha1nexte (%rax,%rcx,1), %xmm0"); + asm volatile("sha1nexte (%rax,%rcx,8), %xmm0"); + asm volatile("sha1nexte 0x12(%rax), %xmm0"); + asm volatile("sha1nexte 0x12(%rbp), %xmm0"); + asm volatile("sha1nexte 0x12(%rcx,%rax,1), %xmm0"); + asm volatile("sha1nexte 0x12(%rbp,%rax,1), %xmm0"); + asm volatile("sha1nexte 0x12(%rax,%rcx,1), %xmm0"); + asm volatile("sha1nexte 0x12(%rax,%rcx,8), %xmm0"); + asm volatile("sha1nexte 0x12345678(%rax), %xmm0"); + asm volatile("sha1nexte 0x12345678(%rbp), %xmm0"); + asm volatile("sha1nexte 0x12345678(%rcx,%rax,1), %xmm0"); + asm volatile("sha1nexte 0x12345678(%rbp,%rax,1), %xmm0"); + asm volatile("sha1nexte 0x12345678(%rax,%rcx,1), %xmm0"); + asm volatile("sha1nexte 0x12345678(%rax,%rcx,8), %xmm0"); + asm volatile("sha1nexte 0x12345678(%rax,%rcx,8), %xmm15"); + + /* sha1msg1 xmm2/m128, xmm1 */ + + asm volatile("sha1msg1 %xmm1, %xmm0"); + asm volatile("sha1msg1 %xmm7, %xmm2"); + asm volatile("sha1msg1 %xmm8, %xmm0"); + asm volatile("sha1msg1 %xmm7, %xmm8"); + asm volatile("sha1msg1 %xmm15, %xmm8"); + asm volatile("sha1msg1 (%rax), %xmm0"); + asm volatile("sha1msg1 (%r8), %xmm0"); + asm volatile("sha1msg1 (0x12345678), %xmm0"); + asm volatile("sha1msg1 (%rax), %xmm3"); + asm volatile("sha1msg1 (%rcx,%rax,1), %xmm0"); + asm volatile("sha1msg1 0x12345678(,%rax,1), %xmm0"); + asm volatile("sha1msg1 (%rax,%rcx,1), %xmm0"); + asm volatile("sha1msg1 (%rax,%rcx,8), %xmm0"); + asm volatile("sha1msg1 0x12(%rax), %xmm0"); + asm volatile("sha1msg1 0x12(%rbp), %xmm0"); + asm volatile("sha1msg1 
0x12(%rcx,%rax,1), %xmm0"); + asm volatile("sha1msg1 0x12(%rbp,%rax,1), %xmm0"); + asm volatile("sha1msg1 0x12(%rax,%rcx,1), %xmm0"); + asm volatile("sha1msg1 0x12(%rax,%rcx,8), %xmm0"); + asm volatile("sha1msg1 0x12345678(%rax), %xmm0"); + asm volatile("sha1msg1 0x12345678(%rbp), %xmm0"); + asm volatile("sha1msg1 0x12345678(%rcx,%rax,1), %xmm0"); + asm volatile("sha1msg1 0x12345678(%rbp,%rax,1), %xmm0"); + asm volatile("sha1msg1 0x12345678(%rax,%rcx,1), %xmm0"); + asm volatile("sha1msg1 0x12345678(%rax,%rcx,8), %xmm0"); + asm volatile("sha1msg1 0x12345678(%rax,%rcx,8), %xmm15"); + + /* sha1msg2 xmm2/m128, xmm1 */ + + asm volatile("sha1msg2 %xmm1, %xmm0"); + asm volatile("sha1msg2 %xmm7, %xmm2"); + asm volatile("sha1msg2 %xmm8, %xmm0"); + asm volatile("sha1msg2 %xmm7, %xmm8"); + asm volatile("sha1msg2 %xmm15, %xmm8"); + asm volatile("sha1msg2 (%rax), %xmm0"); + asm volatile("sha1msg2 (%r8), %xmm0"); + asm volatile("sha1msg2 (0x12345678), %xmm0"); + asm volatile("sha1msg2 (%rax), %xmm3"); + asm volatile("sha1msg2 (%rcx,%rax,1), %xmm0"); + asm volatile("sha1msg2 0x12345678(,%rax,1), %xmm0"); + asm volatile("sha1msg2 (%rax,%rcx,1), %xmm0"); + asm volatile("sha1msg2 (%rax,%rcx,8), %xmm0"); + asm volatile("sha1msg2 0x12(%rax), %xmm0"); + asm volatile("sha1msg2 0x12(%rbp), %xmm0"); + asm volatile("sha1msg2 0x12(%rcx,%rax,1), %xmm0"); + asm volatile("sha1msg2 0x12(%rbp,%rax,1), %xmm0"); + asm volatile("sha1msg2 0x12(%rax,%rcx,1), %xmm0"); + asm volatile("sha1msg2 0x12(%rax,%rcx,8), %xmm0"); + asm volatile("sha1msg2 0x12345678(%rax), %xmm0"); + asm volatile("sha1msg2 0x12345678(%rbp), %xmm0"); + asm volatile("sha1msg2 0x12345678(%rcx,%rax,1), %xmm0"); + asm volatile("sha1msg2 0x12345678(%rbp,%rax,1), %xmm0"); + asm volatile("sha1msg2 0x12345678(%rax,%rcx,1), %xmm0"); + asm volatile("sha1msg2 0x12345678(%rax,%rcx,8), %xmm0"); + asm volatile("sha1msg2 0x12345678(%rax,%rcx,8), %xmm15"); + + /* sha256rnds2 , xmm2/m128, xmm1 */ + /* Note sha256rnds2 has an implicit operand 'xmm0' */ + + asm volatile("sha256rnds2 %xmm4, %xmm1"); + asm volatile("sha256rnds2 %xmm7, %xmm2"); + asm volatile("sha256rnds2 %xmm8, %xmm1"); + asm volatile("sha256rnds2 %xmm7, %xmm8"); + asm volatile("sha256rnds2 %xmm15, %xmm8"); + asm volatile("sha256rnds2 (%rax), %xmm1"); + asm volatile("sha256rnds2 (%r8), %xmm1"); + asm volatile("sha256rnds2 (0x12345678), %xmm1"); + asm volatile("sha256rnds2 (%rax), %xmm3"); + asm volatile("sha256rnds2 (%rcx,%rax,1), %xmm1"); + asm volatile("sha256rnds2 0x12345678(,%rax,1), %xmm1"); + asm volatile("sha256rnds2 (%rax,%rcx,1), %xmm1"); + asm volatile("sha256rnds2 (%rax,%rcx,8), %xmm1"); + asm volatile("sha256rnds2 0x12(%rax), %xmm1"); + asm volatile("sha256rnds2 0x12(%rbp), %xmm1"); + asm volatile("sha256rnds2 0x12(%rcx,%rax,1), %xmm1"); + asm volatile("sha256rnds2 0x12(%rbp,%rax,1), %xmm1"); + asm volatile("sha256rnds2 0x12(%rax,%rcx,1), %xmm1"); + asm volatile("sha256rnds2 0x12(%rax,%rcx,8), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%rax), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%rbp), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%rcx,%rax,1), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%rbp,%rax,1), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%rax,%rcx,1), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%rax,%rcx,8), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%rax,%rcx,8), %xmm15"); + + /* sha256msg1 xmm2/m128, xmm1 */ + + asm volatile("sha256msg1 %xmm1, %xmm0"); + asm volatile("sha256msg1 %xmm7, %xmm2"); + asm volatile("sha256msg1 %xmm8, %xmm0"); + asm 
volatile("sha256msg1 %xmm7, %xmm8"); + asm volatile("sha256msg1 %xmm15, %xmm8"); + asm volatile("sha256msg1 (%rax), %xmm0"); + asm volatile("sha256msg1 (%r8), %xmm0"); + asm volatile("sha256msg1 (0x12345678), %xmm0"); + asm volatile("sha256msg1 (%rax), %xmm3"); + asm volatile("sha256msg1 (%rcx,%rax,1), %xmm0"); + asm volatile("sha256msg1 0x12345678(,%rax,1), %xmm0"); + asm volatile("sha256msg1 (%rax,%rcx,1), %xmm0"); + asm volatile("sha256msg1 (%rax,%rcx,8), %xmm0"); + asm volatile("sha256msg1 0x12(%rax), %xmm0"); + asm volatile("sha256msg1 0x12(%rbp), %xmm0"); + asm volatile("sha256msg1 0x12(%rcx,%rax,1), %xmm0"); + asm volatile("sha256msg1 0x12(%rbp,%rax,1), %xmm0"); + asm volatile("sha256msg1 0x12(%rax,%rcx,1), %xmm0"); + asm volatile("sha256msg1 0x12(%rax,%rcx,8), %xmm0"); + asm volatile("sha256msg1 0x12345678(%rax), %xmm0"); + asm volatile("sha256msg1 0x12345678(%rbp), %xmm0"); + asm volatile("sha256msg1 0x12345678(%rcx,%rax,1), %xmm0"); + asm volatile("sha256msg1 0x12345678(%rbp,%rax,1), %xmm0"); + asm volatile("sha256msg1 0x12345678(%rax,%rcx,1), %xmm0"); + asm volatile("sha256msg1 0x12345678(%rax,%rcx,8), %xmm0"); + asm volatile("sha256msg1 0x12345678(%rax,%rcx,8), %xmm15"); + + /* sha256msg2 xmm2/m128, xmm1 */ + + asm volatile("sha256msg2 %xmm1, %xmm0"); + asm volatile("sha256msg2 %xmm7, %xmm2"); + asm volatile("sha256msg2 %xmm8, %xmm0"); + asm volatile("sha256msg2 %xmm7, %xmm8"); + asm volatile("sha256msg2 %xmm15, %xmm8"); + asm volatile("sha256msg2 (%rax), %xmm0"); + asm volatile("sha256msg2 (%r8), %xmm0"); + asm volatile("sha256msg2 (0x12345678), %xmm0"); + asm volatile("sha256msg2 (%rax), %xmm3"); + asm volatile("sha256msg2 (%rcx,%rax,1), %xmm0"); + asm volatile("sha256msg2 0x12345678(,%rax,1), %xmm0"); + asm volatile("sha256msg2 (%rax,%rcx,1), %xmm0"); + asm volatile("sha256msg2 (%rax,%rcx,8), %xmm0"); + asm volatile("sha256msg2 0x12(%rax), %xmm0"); + asm volatile("sha256msg2 0x12(%rbp), %xmm0"); + asm volatile("sha256msg2 0x12(%rcx,%rax,1), %xmm0"); + asm volatile("sha256msg2 0x12(%rbp,%rax,1), %xmm0"); + asm volatile("sha256msg2 0x12(%rax,%rcx,1), %xmm0"); + asm volatile("sha256msg2 0x12(%rax,%rcx,8), %xmm0"); + asm volatile("sha256msg2 0x12345678(%rax), %xmm0"); + asm volatile("sha256msg2 0x12345678(%rbp), %xmm0"); + asm volatile("sha256msg2 0x12345678(%rcx,%rax,1), %xmm0"); + asm volatile("sha256msg2 0x12345678(%rbp,%rax,1), %xmm0"); + asm volatile("sha256msg2 0x12345678(%rax,%rcx,1), %xmm0"); + asm volatile("sha256msg2 0x12345678(%rax,%rcx,8), %xmm0"); + asm volatile("sha256msg2 0x12345678(%rax,%rcx,8), %xmm15"); + #else /* #ifdef __x86_64__ */ /* bndmk m32, bnd */ @@ -407,6 +611,175 @@ int main(void) asm volatile("bnd jmp *(%ecx)"); /* Expecting: jmp indirect 0 */ asm volatile("bnd jne label1"); /* Expecting: jcc conditional 0xfffffffc */ + /* sha1rnds4 imm8, xmm2/m128, xmm1 */ + + asm volatile("sha1rnds4 $0x0, %xmm1, %xmm0"); + asm volatile("sha1rnds4 $0x91, %xmm7, %xmm2"); + asm volatile("sha1rnds4 $0x91, (%eax), %xmm0"); + asm volatile("sha1rnds4 $0x91, (0x12345678), %xmm0"); + asm volatile("sha1rnds4 $0x91, (%eax), %xmm3"); + asm volatile("sha1rnds4 $0x91, (%ecx,%eax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(,%eax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, (%eax,%ecx,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, (%eax,%ecx,8), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12(%eax), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12(%ebp), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12(%ecx,%eax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 
0x12(%ebp,%eax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12(%eax,%ecx,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12(%eax,%ecx,8), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%eax), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%ebp), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%ecx,%eax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%ebp,%eax,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%eax,%ecx,1), %xmm0"); + asm volatile("sha1rnds4 $0x91, 0x12345678(%eax,%ecx,8), %xmm0"); + + /* sha1nexte xmm2/m128, xmm1 */ + + asm volatile("sha1nexte %xmm1, %xmm0"); + asm volatile("sha1nexte %xmm7, %xmm2"); + asm volatile("sha1nexte (%eax), %xmm0"); + asm volatile("sha1nexte (0x12345678), %xmm0"); + asm volatile("sha1nexte (%eax), %xmm3"); + asm volatile("sha1nexte (%ecx,%eax,1), %xmm0"); + asm volatile("sha1nexte 0x12345678(,%eax,1), %xmm0"); + asm volatile("sha1nexte (%eax,%ecx,1), %xmm0"); + asm volatile("sha1nexte (%eax,%ecx,8), %xmm0"); + asm volatile("sha1nexte 0x12(%eax), %xmm0"); + asm volatile("sha1nexte 0x12(%ebp), %xmm0"); + asm volatile("sha1nexte 0x12(%ecx,%eax,1), %xmm0"); + asm volatile("sha1nexte 0x12(%ebp,%eax,1), %xmm0"); + asm volatile("sha1nexte 0x12(%eax,%ecx,1), %xmm0"); + asm volatile("sha1nexte 0x12(%eax,%ecx,8), %xmm0"); + asm volatile("sha1nexte 0x12345678(%eax), %xmm0"); + asm volatile("sha1nexte 0x12345678(%ebp), %xmm0"); + asm volatile("sha1nexte 0x12345678(%ecx,%eax,1), %xmm0"); + asm volatile("sha1nexte 0x12345678(%ebp,%eax,1), %xmm0"); + asm volatile("sha1nexte 0x12345678(%eax,%ecx,1), %xmm0"); + asm volatile("sha1nexte 0x12345678(%eax,%ecx,8), %xmm0"); + + /* sha1msg1 xmm2/m128, xmm1 */ + + asm volatile("sha1msg1 %xmm1, %xmm0"); + asm volatile("sha1msg1 %xmm7, %xmm2"); + asm volatile("sha1msg1 (%eax), %xmm0"); + asm volatile("sha1msg1 (0x12345678), %xmm0"); + asm volatile("sha1msg1 (%eax), %xmm3"); + asm volatile("sha1msg1 (%ecx,%eax,1), %xmm0"); + asm volatile("sha1msg1 0x12345678(,%eax,1), %xmm0"); + asm volatile("sha1msg1 (%eax,%ecx,1), %xmm0"); + asm volatile("sha1msg1 (%eax,%ecx,8), %xmm0"); + asm volatile("sha1msg1 0x12(%eax), %xmm0"); + asm volatile("sha1msg1 0x12(%ebp), %xmm0"); + asm volatile("sha1msg1 0x12(%ecx,%eax,1), %xmm0"); + asm volatile("sha1msg1 0x12(%ebp,%eax,1), %xmm0"); + asm volatile("sha1msg1 0x12(%eax,%ecx,1), %xmm0"); + asm volatile("sha1msg1 0x12(%eax,%ecx,8), %xmm0"); + asm volatile("sha1msg1 0x12345678(%eax), %xmm0"); + asm volatile("sha1msg1 0x12345678(%ebp), %xmm0"); + asm volatile("sha1msg1 0x12345678(%ecx,%eax,1), %xmm0"); + asm volatile("sha1msg1 0x12345678(%ebp,%eax,1), %xmm0"); + asm volatile("sha1msg1 0x12345678(%eax,%ecx,1), %xmm0"); + asm volatile("sha1msg1 0x12345678(%eax,%ecx,8), %xmm0"); + + /* sha1msg2 xmm2/m128, xmm1 */ + + asm volatile("sha1msg2 %xmm1, %xmm0"); + asm volatile("sha1msg2 %xmm7, %xmm2"); + asm volatile("sha1msg2 (%eax), %xmm0"); + asm volatile("sha1msg2 (0x12345678), %xmm0"); + asm volatile("sha1msg2 (%eax), %xmm3"); + asm volatile("sha1msg2 (%ecx,%eax,1), %xmm0"); + asm volatile("sha1msg2 0x12345678(,%eax,1), %xmm0"); + asm volatile("sha1msg2 (%eax,%ecx,1), %xmm0"); + asm volatile("sha1msg2 (%eax,%ecx,8), %xmm0"); + asm volatile("sha1msg2 0x12(%eax), %xmm0"); + asm volatile("sha1msg2 0x12(%ebp), %xmm0"); + asm volatile("sha1msg2 0x12(%ecx,%eax,1), %xmm0"); + asm volatile("sha1msg2 0x12(%ebp,%eax,1), %xmm0"); + asm volatile("sha1msg2 0x12(%eax,%ecx,1), %xmm0"); + asm volatile("sha1msg2 0x12(%eax,%ecx,8), %xmm0"); + asm volatile("sha1msg2 0x12345678(%eax), 
%xmm0"); + asm volatile("sha1msg2 0x12345678(%ebp), %xmm0"); + asm volatile("sha1msg2 0x12345678(%ecx,%eax,1), %xmm0"); + asm volatile("sha1msg2 0x12345678(%ebp,%eax,1), %xmm0"); + asm volatile("sha1msg2 0x12345678(%eax,%ecx,1), %xmm0"); + asm volatile("sha1msg2 0x12345678(%eax,%ecx,8), %xmm0"); + + /* sha256rnds2 , xmm2/m128, xmm1 */ + /* Note sha256rnds2 has an implicit operand 'xmm0' */ + + asm volatile("sha256rnds2 %xmm4, %xmm1"); + asm volatile("sha256rnds2 %xmm7, %xmm2"); + asm volatile("sha256rnds2 (%eax), %xmm1"); + asm volatile("sha256rnds2 (0x12345678), %xmm1"); + asm volatile("sha256rnds2 (%eax), %xmm3"); + asm volatile("sha256rnds2 (%ecx,%eax,1), %xmm1"); + asm volatile("sha256rnds2 0x12345678(,%eax,1), %xmm1"); + asm volatile("sha256rnds2 (%eax,%ecx,1), %xmm1"); + asm volatile("sha256rnds2 (%eax,%ecx,8), %xmm1"); + asm volatile("sha256rnds2 0x12(%eax), %xmm1"); + asm volatile("sha256rnds2 0x12(%ebp), %xmm1"); + asm volatile("sha256rnds2 0x12(%ecx,%eax,1), %xmm1"); + asm volatile("sha256rnds2 0x12(%ebp,%eax,1), %xmm1"); + asm volatile("sha256rnds2 0x12(%eax,%ecx,1), %xmm1"); + asm volatile("sha256rnds2 0x12(%eax,%ecx,8), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%eax), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%ebp), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%ecx,%eax,1), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%ebp,%eax,1), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%eax,%ecx,1), %xmm1"); + asm volatile("sha256rnds2 0x12345678(%eax,%ecx,8), %xmm1"); + + /* sha256msg1 xmm2/m128, xmm1 */ + + asm volatile("sha256msg1 %xmm1, %xmm0"); + asm volatile("sha256msg1 %xmm7, %xmm2"); + asm volatile("sha256msg1 (%eax), %xmm0"); + asm volatile("sha256msg1 (0x12345678), %xmm0"); + asm volatile("sha256msg1 (%eax), %xmm3"); + asm volatile("sha256msg1 (%ecx,%eax,1), %xmm0"); + asm volatile("sha256msg1 0x12345678(,%eax,1), %xmm0"); + asm volatile("sha256msg1 (%eax,%ecx,1), %xmm0"); + asm volatile("sha256msg1 (%eax,%ecx,8), %xmm0"); + asm volatile("sha256msg1 0x12(%eax), %xmm0"); + asm volatile("sha256msg1 0x12(%ebp), %xmm0"); + asm volatile("sha256msg1 0x12(%ecx,%eax,1), %xmm0"); + asm volatile("sha256msg1 0x12(%ebp,%eax,1), %xmm0"); + asm volatile("sha256msg1 0x12(%eax,%ecx,1), %xmm0"); + asm volatile("sha256msg1 0x12(%eax,%ecx,8), %xmm0"); + asm volatile("sha256msg1 0x12345678(%eax), %xmm0"); + asm volatile("sha256msg1 0x12345678(%ebp), %xmm0"); + asm volatile("sha256msg1 0x12345678(%ecx,%eax,1), %xmm0"); + asm volatile("sha256msg1 0x12345678(%ebp,%eax,1), %xmm0"); + asm volatile("sha256msg1 0x12345678(%eax,%ecx,1), %xmm0"); + asm volatile("sha256msg1 0x12345678(%eax,%ecx,8), %xmm0"); + + /* sha256msg2 xmm2/m128, xmm1 */ + + asm volatile("sha256msg2 %xmm1, %xmm0"); + asm volatile("sha256msg2 %xmm7, %xmm2"); + asm volatile("sha256msg2 (%eax), %xmm0"); + asm volatile("sha256msg2 (0x12345678), %xmm0"); + asm volatile("sha256msg2 (%eax), %xmm3"); + asm volatile("sha256msg2 (%ecx,%eax,1), %xmm0"); + asm volatile("sha256msg2 0x12345678(,%eax,1), %xmm0"); + asm volatile("sha256msg2 (%eax,%ecx,1), %xmm0"); + asm volatile("sha256msg2 (%eax,%ecx,8), %xmm0"); + asm volatile("sha256msg2 0x12(%eax), %xmm0"); + asm volatile("sha256msg2 0x12(%ebp), %xmm0"); + asm volatile("sha256msg2 0x12(%ecx,%eax,1), %xmm0"); + asm volatile("sha256msg2 0x12(%ebp,%eax,1), %xmm0"); + asm volatile("sha256msg2 0x12(%eax,%ecx,1), %xmm0"); + asm volatile("sha256msg2 0x12(%eax,%ecx,8), %xmm0"); + asm volatile("sha256msg2 0x12345678(%eax), %xmm0"); + asm volatile("sha256msg2 0x12345678(%ebp), 
%xmm0"); + asm volatile("sha256msg2 0x12345678(%ecx,%eax,1), %xmm0"); + asm volatile("sha256msg2 0x12345678(%ebp,%eax,1), %xmm0"); + asm volatile("sha256msg2 0x12345678(%eax,%ecx,1), %xmm0"); + asm volatile("sha256msg2 0x12345678(%eax,%ecx,8), %xmm0"); + #endif /* #ifndef __x86_64__ */ /* Following line is a marker for the awk script - do not change */ diff --git a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt index a02a195d219c..25dad388b371 100644 --- a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt +++ b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt @@ -736,6 +736,12 @@ bd: vfnmadd231ss/d Vx,Hx,Wx (66),(v),(v1) be: vfnmsub231ps/d Vx,Hx,Wx (66),(v) bf: vfnmsub231ss/d Vx,Hx,Wx (66),(v),(v1) # 0x0f 0x38 0xc0-0xff +c8: sha1nexte Vdq,Wdq +c9: sha1msg1 Vdq,Wdq +ca: sha1msg2 Vdq,Wdq +cb: sha256rnds2 Vdq,Wdq +cc: sha256msg1 Vdq,Wdq +cd: sha256msg2 Vdq,Wdq db: VAESIMC Vdq,Wdq (66),(v1) dc: VAESENC Vdq,Hdq,Wdq (66),(v1) dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1) @@ -794,6 +800,7 @@ AVXcode: 3 61: vpcmpestri Vdq,Wdq,Ib (66),(v1) 62: vpcmpistrm Vdq,Wdq,Ib (66),(v1) 63: vpcmpistri Vdq,Wdq,Ib (66),(v1) +cc: sha1rnds4 Vdq,Wdq,Ib df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1) f0: RORX Gy,Ey,Ib (F2),(v) EndTable -- cgit v1.2.3 From ac1c8859a81e2fc45db1dbff30bdc572005734ca Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 2 Sep 2015 15:15:29 +0300 Subject: x86/insn: perf tools: Add new memory instructions Intel Architecture Instruction Set Extensions Programing Reference (Oct 2014) describes 3 new memory instructions, namely clflushopt, clwb and pcommit. Add them to the op code map and the perf tools new instructions test. e.g. $ tools/perf/perf test "x86 ins" 39: Test x86 instruction decoder - new instructions : Ok Or to see the details: $ tools/perf/perf test -v "x86 ins" Signed-off-by: Adrian Hunter Acked-by: Masami Hiramatsu Cc: "H. 
Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Denys Vlasenko Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Qiaowei Ren Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1441196131-20632-6-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/lib/x86-opcode-map.txt | 4 +- tools/perf/tests/insn-x86-dat-32.c | 22 +++++++++++ tools/perf/tests/insn-x86-dat-64.c | 34 ++++++++++++++++ tools/perf/tests/insn-x86-dat-src.c | 46 ++++++++++++++++++++++ .../perf/util/intel-pt-decoder/x86-opcode-map.txt | 4 +- 5 files changed, 106 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 25dad388b371..f4f0451a301e 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -943,8 +943,8 @@ GrpTable: Grp15 3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B) 4: XSAVE 5: XRSTOR | lfence (11B) -6: XSAVEOPT | mfence (11B) -7: clflush | sfence (11B) +6: XSAVEOPT | clwb (66) | mfence (11B) +7: clflush | clflushopt (66) | sfence (11B) | pcommit (66),(11B) EndTable GrpTable: Grp16 diff --git a/tools/perf/tests/insn-x86-dat-32.c b/tools/perf/tests/insn-x86-dat-32.c index 83f5078e74e1..4b09b7e130a0 100644 --- a/tools/perf/tests/insn-x86-dat-32.c +++ b/tools/perf/tests/insn-x86-dat-32.c @@ -616,3 +616,25 @@ "0f 38 cd 84 08 78 56 34 12 \tsha256msg2 0x12345678(%eax,%ecx,1),%xmm0",}, {{0x0f, 0x38, 0xcd, 0x84, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", "0f 38 cd 84 c8 78 56 34 12 \tsha256msg2 0x12345678(%eax,%ecx,8),%xmm0",}, +{{0x66, 0x0f, 0xae, 0x38, }, 4, 0, "", "", +"66 0f ae 38 \tclflushopt (%eax)",}, +{{0x66, 0x0f, 0xae, 0x3d, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"66 0f ae 3d 78 56 34 12 \tclflushopt 0x12345678",}, +{{0x66, 0x0f, 0xae, 0xbc, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"66 0f ae bc c8 78 56 34 12 \tclflushopt 0x12345678(%eax,%ecx,8)",}, +{{0x0f, 0xae, 0x38, }, 3, 0, "", "", +"0f ae 38 \tclflush (%eax)",}, +{{0x0f, 0xae, 0xf8, }, 3, 0, "", "", +"0f ae f8 \tsfence ",}, +{{0x66, 0x0f, 0xae, 0x30, }, 4, 0, "", "", +"66 0f ae 30 \tclwb (%eax)",}, +{{0x66, 0x0f, 0xae, 0x35, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"66 0f ae 35 78 56 34 12 \tclwb 0x12345678",}, +{{0x66, 0x0f, 0xae, 0xb4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"66 0f ae b4 c8 78 56 34 12 \tclwb 0x12345678(%eax,%ecx,8)",}, +{{0x0f, 0xae, 0x30, }, 3, 0, "", "", +"0f ae 30 \txsaveopt (%eax)",}, +{{0x0f, 0xae, 0xf0, }, 3, 0, "", "", +"0f ae f0 \tmfence ",}, +{{0x66, 0x0f, 0xae, 0xf8, }, 4, 0, "", "", +"66 0f ae f8 \tpcommit ",}, diff --git a/tools/perf/tests/insn-x86-dat-64.c b/tools/perf/tests/insn-x86-dat-64.c index 13f008588590..5da235a4414f 100644 --- a/tools/perf/tests/insn-x86-dat-64.c +++ b/tools/perf/tests/insn-x86-dat-64.c @@ -702,3 +702,37 @@ "0f 38 cd 84 c8 78 56 34 12 \tsha256msg2 0x12345678(%rax,%rcx,8),%xmm0",}, {{0x44, 0x0f, 0x38, 0xcd, 0xbc, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", "44 0f 38 cd bc c8 78 56 34 12 \tsha256msg2 0x12345678(%rax,%rcx,8),%xmm15",}, +{{0x66, 0x0f, 0xae, 0x38, }, 4, 0, "", "", +"66 0f ae 38 \tclflushopt (%rax)",}, +{{0x66, 0x41, 0x0f, 0xae, 0x38, }, 5, 0, "", "", +"66 41 0f ae 38 \tclflushopt (%r8)",}, +{{0x66, 0x0f, 0xae, 0x3c, 0x25, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"66 0f ae 3c 25 78 56 34 12 \tclflushopt 0x12345678",}, +{{0x66, 0x0f, 0xae, 0xbc, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"66 0f ae bc c8 78 56 34 12 \tclflushopt 0x12345678(%rax,%rcx,8)",}, +{{0x66, 0x41, 0x0f, 0xae, 0xbc, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 
10, 0, "", "", +"66 41 0f ae bc c8 78 56 34 12 \tclflushopt 0x12345678(%r8,%rcx,8)",}, +{{0x0f, 0xae, 0x38, }, 3, 0, "", "", +"0f ae 38 \tclflush (%rax)",}, +{{0x41, 0x0f, 0xae, 0x38, }, 4, 0, "", "", +"41 0f ae 38 \tclflush (%r8)",}, +{{0x0f, 0xae, 0xf8, }, 3, 0, "", "", +"0f ae f8 \tsfence ",}, +{{0x66, 0x0f, 0xae, 0x30, }, 4, 0, "", "", +"66 0f ae 30 \tclwb (%rax)",}, +{{0x66, 0x41, 0x0f, 0xae, 0x30, }, 5, 0, "", "", +"66 41 0f ae 30 \tclwb (%r8)",}, +{{0x66, 0x0f, 0xae, 0x34, 0x25, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"66 0f ae 34 25 78 56 34 12 \tclwb 0x12345678",}, +{{0x66, 0x0f, 0xae, 0xb4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"66 0f ae b4 c8 78 56 34 12 \tclwb 0x12345678(%rax,%rcx,8)",}, +{{0x66, 0x41, 0x0f, 0xae, 0xb4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "", +"66 41 0f ae b4 c8 78 56 34 12 \tclwb 0x12345678(%r8,%rcx,8)",}, +{{0x0f, 0xae, 0x30, }, 3, 0, "", "", +"0f ae 30 \txsaveopt (%rax)",}, +{{0x41, 0x0f, 0xae, 0x30, }, 4, 0, "", "", +"41 0f ae 30 \txsaveopt (%r8)",}, +{{0x0f, 0xae, 0xf0, }, 3, 0, "", "", +"0f ae f0 \tmfence ",}, +{{0x66, 0x0f, 0xae, 0xf8, }, 4, 0, "", "", +"66 0f ae f8 \tpcommit ",}, diff --git a/tools/perf/tests/insn-x86-dat-src.c b/tools/perf/tests/insn-x86-dat-src.c index 7d06c9b22070..482637f44245 100644 --- a/tools/perf/tests/insn-x86-dat-src.c +++ b/tools/perf/tests/insn-x86-dat-src.c @@ -421,6 +421,30 @@ int main(void) asm volatile("sha256msg2 0x12345678(%rax,%rcx,8), %xmm0"); asm volatile("sha256msg2 0x12345678(%rax,%rcx,8), %xmm15"); + /* clflushopt m8 */ + + asm volatile("clflushopt (%rax)"); + asm volatile("clflushopt (%r8)"); + asm volatile("clflushopt (0x12345678)"); + asm volatile("clflushopt 0x12345678(%rax,%rcx,8)"); + asm volatile("clflushopt 0x12345678(%r8,%rcx,8)"); + /* Also check instructions in the same group encoding as clflushopt */ + asm volatile("clflush (%rax)"); + asm volatile("clflush (%r8)"); + asm volatile("sfence"); + + /* clwb m8 */ + + asm volatile("clwb (%rax)"); + asm volatile("clwb (%r8)"); + asm volatile("clwb (0x12345678)"); + asm volatile("clwb 0x12345678(%rax,%rcx,8)"); + asm volatile("clwb 0x12345678(%r8,%rcx,8)"); + /* Also check instructions in the same group encoding as clwb */ + asm volatile("xsaveopt (%rax)"); + asm volatile("xsaveopt (%r8)"); + asm volatile("mfence"); + #else /* #ifdef __x86_64__ */ /* bndmk m32, bnd */ @@ -780,8 +804,30 @@ int main(void) asm volatile("sha256msg2 0x12345678(%eax,%ecx,1), %xmm0"); asm volatile("sha256msg2 0x12345678(%eax,%ecx,8), %xmm0"); + /* clflushopt m8 */ + + asm volatile("clflushopt (%eax)"); + asm volatile("clflushopt (0x12345678)"); + asm volatile("clflushopt 0x12345678(%eax,%ecx,8)"); + /* Also check instructions in the same group encoding as clflushopt */ + asm volatile("clflush (%eax)"); + asm volatile("sfence"); + + /* clwb m8 */ + + asm volatile("clwb (%eax)"); + asm volatile("clwb (0x12345678)"); + asm volatile("clwb 0x12345678(%eax,%ecx,8)"); + /* Also check instructions in the same group encoding as clwb */ + asm volatile("xsaveopt (%eax)"); + asm volatile("mfence"); + #endif /* #ifndef __x86_64__ */ + /* pcommit */ + + asm volatile("pcommit"); + /* Following line is a marker for the awk script - do not change */ asm volatile("rdtsc"); /* Stop here */ diff --git a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt index 25dad388b371..f4f0451a301e 100644 --- a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt +++ b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt @@ -943,8 +943,8 @@ 
GrpTable: Grp15 3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B) 4: XSAVE 5: XRSTOR | lfence (11B) -6: XSAVEOPT | mfence (11B) -7: clflush | sfence (11B) +6: XSAVEOPT | clwb (66) | mfence (11B) +7: clflush | clflushopt (66) | sfence (11B) | pcommit (66),(11B) EndTable GrpTable: Grp16 -- cgit v1.2.3 From 978260cdbec3e34a3dfb2277ffc0aa1809457362 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 2 Sep 2015 15:15:30 +0300 Subject: x86/insn: perf tools: Add new memory protection keys instructions Add rdpkru and wrpkru to the op code map and the perf tools new instructions test. In the case of the test, only the bytes can be tested at the moment since binutils doesn't support the instructions yet. To run the test: $ tools/perf/perf test "x86 ins" 39: Test x86 instruction decoder - new instructions : Ok Or to see the details: $ tools/perf/perf test -v "x86 ins" 2>&1 | grep pkru For information about rdpkru and wrpkru, refer the Intel SDM. Signed-off-by: Adrian Hunter Acked-by: Masami Hiramatsu Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Denys Vlasenko Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Qiaowei Ren Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1441196131-20632-7-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/lib/x86-opcode-map.txt | 2 +- tools/perf/tests/insn-x86.c | 4 ++++ tools/perf/util/intel-pt-decoder/x86-opcode-map.txt | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index f4f0451a301e..5a9705ed9139 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -885,7 +885,7 @@ GrpTable: Grp7 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) 3: LIDT Ms 4: SMSW Mw/Rv -5: +5: rdpkru (110),(11B) | wrpkru (111),(11B) 6: LMSW Ew 7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B) EndTable diff --git a/tools/perf/tests/insn-x86.c b/tools/perf/tests/insn-x86.c index 0e126a099874..5c49eec81349 100644 --- a/tools/perf/tests/insn-x86.c +++ b/tools/perf/tests/insn-x86.c @@ -17,11 +17,15 @@ struct test_data { struct test_data test_data_32[] = { #include "insn-x86-dat-32.c" + {{0x0f, 0x01, 0xee}, 3, 0, NULL, NULL, "0f 01 ee \trdpkru"}, + {{0x0f, 0x01, 0xef}, 3, 0, NULL, NULL, "0f 01 ef \twrpkru"}, {{0}, 0, 0, NULL, NULL, NULL}, }; struct test_data test_data_64[] = { #include "insn-x86-dat-64.c" + {{0x0f, 0x01, 0xee}, 3, 0, NULL, NULL, "0f 01 ee \trdpkru"}, + {{0x0f, 0x01, 0xef}, 3, 0, NULL, NULL, "0f 01 ef \twrpkru"}, {{0}, 0, 0, NULL, NULL, NULL}, }; diff --git a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt index f4f0451a301e..5a9705ed9139 100644 --- a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt +++ b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt @@ -885,7 +885,7 @@ GrpTable: Grp7 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) 3: LIDT Ms 4: SMSW Mw/Rv -5: +5: rdpkru (110),(11B) | wrpkru (111),(11B) 6: LMSW Ew 7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B) EndTable -- cgit v1.2.3 From f83b6b64eba155cfb43ab8a5d9c422c3e7f603e6 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 2 Sep 2015 15:15:31 +0300 Subject: x86/insn: perf tools: Add new xsave instructions Add xsavec, xsaves and xrstors to the op code map and the perf tools new instructions test. 
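Each row in the generated test tables pairs raw opcode bytes with the objdump-style disassembly that the kernel instruction decoder is expected to reproduce. For orientation, a minimal sketch of the entry layout the tables compile into; the struct name is visible in the insn-x86.c hunk above, but the field names and the MAX_INSN_SIZE value here are assumptions made for illustration, not a quote of the real header:

	#define MAX_INSN_SIZE 16	/* assumed; large enough for any x86 instruction */
	typedef unsigned char u8;	/* local stand-in for the perf/kernel u8 type    */

	struct test_data {
		u8 data[MAX_INSN_SIZE];		/* raw instruction bytes to decode        */
		int expected_length;		/* byte count the decoder must report     */
		int expected_rel;		/* expected relative/branch offset        */
		const char *op_str;		/* expected op attribute ("" if none)     */
		const char *branch_str;		/* expected branch attribute ("" if none) */
		const char *asm_rep;		/* the objdump line the bytes came from   */
	};

Read this way, an entry such as the xsavec row further below says: decode the three bytes 0f c7 20, expect length 3 and offset 0, no op or branch attributes, and the disassembly "xsavec (%eax)".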
To run the test: $ tools/perf/perf test "x86 ins" 39: Test x86 instruction decoder - new instructions : Ok Or to see the details: $ tools/perf/perf test -v "x86 ins" 2>&1 | grep 'xsave\|xrst' For information about xsavec, xsaves and xrstors, refer the Intel SDM. Signed-off-by: Adrian Hunter Acked-by: Masami Hiramatsu Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Denys Vlasenko Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Qiaowei Ren Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1441196131-20632-8-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/lib/x86-opcode-map.txt | 3 ++ tools/perf/tests/insn-x86-dat-32.c | 18 ++++++++++ tools/perf/tests/insn-x86-dat-64.c | 30 ++++++++++++++++ tools/perf/tests/insn-x86-dat-src.c | 42 ++++++++++++++++++++++ .../perf/util/intel-pt-decoder/x86-opcode-map.txt | 3 ++ 5 files changed, 96 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 5a9705ed9139..d388de72eaca 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -899,6 +899,9 @@ EndTable GrpTable: Grp9 1: CMPXCHG8B/16B Mq/Mdq +3: xrstors +4: xsavec +5: xsaves 6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B) 7: VMPTRST Mq | VMPTRST Mq (F3) | RDSEED Rv (11B) EndTable diff --git a/tools/perf/tests/insn-x86-dat-32.c b/tools/perf/tests/insn-x86-dat-32.c index 4b09b7e130a0..3b491cfe204e 100644 --- a/tools/perf/tests/insn-x86-dat-32.c +++ b/tools/perf/tests/insn-x86-dat-32.c @@ -636,5 +636,23 @@ "0f ae 30 \txsaveopt (%eax)",}, {{0x0f, 0xae, 0xf0, }, 3, 0, "", "", "0f ae f0 \tmfence ",}, +{{0x0f, 0xc7, 0x20, }, 3, 0, "", "", +"0f c7 20 \txsavec (%eax)",}, +{{0x0f, 0xc7, 0x25, 0x78, 0x56, 0x34, 0x12, }, 7, 0, "", "", +"0f c7 25 78 56 34 12 \txsavec 0x12345678",}, +{{0x0f, 0xc7, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f c7 a4 c8 78 56 34 12 \txsavec 0x12345678(%eax,%ecx,8)",}, +{{0x0f, 0xc7, 0x28, }, 3, 0, "", "", +"0f c7 28 \txsaves (%eax)",}, +{{0x0f, 0xc7, 0x2d, 0x78, 0x56, 0x34, 0x12, }, 7, 0, "", "", +"0f c7 2d 78 56 34 12 \txsaves 0x12345678",}, +{{0x0f, 0xc7, 0xac, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f c7 ac c8 78 56 34 12 \txsaves 0x12345678(%eax,%ecx,8)",}, +{{0x0f, 0xc7, 0x18, }, 3, 0, "", "", +"0f c7 18 \txrstors (%eax)",}, +{{0x0f, 0xc7, 0x1d, 0x78, 0x56, 0x34, 0x12, }, 7, 0, "", "", +"0f c7 1d 78 56 34 12 \txrstors 0x12345678",}, +{{0x0f, 0xc7, 0x9c, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f c7 9c c8 78 56 34 12 \txrstors 0x12345678(%eax,%ecx,8)",}, {{0x66, 0x0f, 0xae, 0xf8, }, 4, 0, "", "", "66 0f ae f8 \tpcommit ",}, diff --git a/tools/perf/tests/insn-x86-dat-64.c b/tools/perf/tests/insn-x86-dat-64.c index 5da235a4414f..4fe7cce179c4 100644 --- a/tools/perf/tests/insn-x86-dat-64.c +++ b/tools/perf/tests/insn-x86-dat-64.c @@ -734,5 +734,35 @@ "41 0f ae 30 \txsaveopt (%r8)",}, {{0x0f, 0xae, 0xf0, }, 3, 0, "", "", "0f ae f0 \tmfence ",}, +{{0x0f, 0xc7, 0x20, }, 3, 0, "", "", +"0f c7 20 \txsavec (%rax)",}, +{{0x41, 0x0f, 0xc7, 0x20, }, 4, 0, "", "", +"41 0f c7 20 \txsavec (%r8)",}, +{{0x0f, 0xc7, 0x24, 0x25, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f c7 24 25 78 56 34 12 \txsavec 0x12345678",}, +{{0x0f, 0xc7, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f c7 a4 c8 78 56 34 12 \txsavec 0x12345678(%rax,%rcx,8)",}, +{{0x41, 0x0f, 0xc7, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"41 0f c7 a4 c8 78 56 34 12 \txsavec 0x12345678(%r8,%rcx,8)",}, +{{0x0f, 0xc7, 0x28, }, 3, 0, 
"", "", +"0f c7 28 \txsaves (%rax)",}, +{{0x41, 0x0f, 0xc7, 0x28, }, 4, 0, "", "", +"41 0f c7 28 \txsaves (%r8)",}, +{{0x0f, 0xc7, 0x2c, 0x25, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f c7 2c 25 78 56 34 12 \txsaves 0x12345678",}, +{{0x0f, 0xc7, 0xac, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f c7 ac c8 78 56 34 12 \txsaves 0x12345678(%rax,%rcx,8)",}, +{{0x41, 0x0f, 0xc7, 0xac, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"41 0f c7 ac c8 78 56 34 12 \txsaves 0x12345678(%r8,%rcx,8)",}, +{{0x0f, 0xc7, 0x18, }, 3, 0, "", "", +"0f c7 18 \txrstors (%rax)",}, +{{0x41, 0x0f, 0xc7, 0x18, }, 4, 0, "", "", +"41 0f c7 18 \txrstors (%r8)",}, +{{0x0f, 0xc7, 0x1c, 0x25, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f c7 1c 25 78 56 34 12 \txrstors 0x12345678",}, +{{0x0f, 0xc7, 0x9c, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "", +"0f c7 9c c8 78 56 34 12 \txrstors 0x12345678(%rax,%rcx,8)",}, +{{0x41, 0x0f, 0xc7, 0x9c, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "", +"41 0f c7 9c c8 78 56 34 12 \txrstors 0x12345678(%r8,%rcx,8)",}, {{0x66, 0x0f, 0xae, 0xf8, }, 4, 0, "", "", "66 0f ae f8 \tpcommit ",}, diff --git a/tools/perf/tests/insn-x86-dat-src.c b/tools/perf/tests/insn-x86-dat-src.c index 482637f44245..41b1b1c62660 100644 --- a/tools/perf/tests/insn-x86-dat-src.c +++ b/tools/perf/tests/insn-x86-dat-src.c @@ -445,6 +445,30 @@ int main(void) asm volatile("xsaveopt (%r8)"); asm volatile("mfence"); + /* xsavec mem */ + + asm volatile("xsavec (%rax)"); + asm volatile("xsavec (%r8)"); + asm volatile("xsavec (0x12345678)"); + asm volatile("xsavec 0x12345678(%rax,%rcx,8)"); + asm volatile("xsavec 0x12345678(%r8,%rcx,8)"); + + /* xsaves mem */ + + asm volatile("xsaves (%rax)"); + asm volatile("xsaves (%r8)"); + asm volatile("xsaves (0x12345678)"); + asm volatile("xsaves 0x12345678(%rax,%rcx,8)"); + asm volatile("xsaves 0x12345678(%r8,%rcx,8)"); + + /* xrstors mem */ + + asm volatile("xrstors (%rax)"); + asm volatile("xrstors (%r8)"); + asm volatile("xrstors (0x12345678)"); + asm volatile("xrstors 0x12345678(%rax,%rcx,8)"); + asm volatile("xrstors 0x12345678(%r8,%rcx,8)"); + #else /* #ifdef __x86_64__ */ /* bndmk m32, bnd */ @@ -822,6 +846,24 @@ int main(void) asm volatile("xsaveopt (%eax)"); asm volatile("mfence"); + /* xsavec mem */ + + asm volatile("xsavec (%eax)"); + asm volatile("xsavec (0x12345678)"); + asm volatile("xsavec 0x12345678(%eax,%ecx,8)"); + + /* xsaves mem */ + + asm volatile("xsaves (%eax)"); + asm volatile("xsaves (0x12345678)"); + asm volatile("xsaves 0x12345678(%eax,%ecx,8)"); + + /* xrstors mem */ + + asm volatile("xrstors (%eax)"); + asm volatile("xrstors (0x12345678)"); + asm volatile("xrstors 0x12345678(%eax,%ecx,8)"); + #endif /* #ifndef __x86_64__ */ /* pcommit */ diff --git a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt index 5a9705ed9139..d388de72eaca 100644 --- a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt +++ b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt @@ -899,6 +899,9 @@ EndTable GrpTable: Grp9 1: CMPXCHG8B/16B Mq/Mdq +3: xrstors +4: xsavec +5: xsaves 6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B) 7: VMPTRST Mq | VMPTRST Mq (F3) | RDSEED Rv (11B) EndTable -- cgit v1.2.3 From b76cb6c869b966b4c991251136b45b3e1babc4e8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 5 Sep 2015 09:32:29 +0200 Subject: x86/headers: Fix (old) header file dependency bug in uapi/asm/sigcontext32.h Mikko Rapeli reported that the following standalone user-space header does not compile: 
#include <asm/sigcontext32.h> Due to undefined 'struct _fpx_sw_bytes' which is defined in asm/sigcontext.h. The following header order works: #include <asm/sigcontext.h> #include <asm/sigcontext32.h> and that's probably how everyone's been using these headers for the past decade or so, but it's a legit header file dependency bug, so include asm/sigcontext.h in sigcontext32.h to allow it to be built standalone. Reported-by: Mikko Rapeli Acked-by: Mikko Rapeli Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1441438363-9999-2-git-send-email-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/uapi/asm/sigcontext32.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/sigcontext32.h b/arch/x86/include/uapi/asm/sigcontext32.h index ad1478c4ae12..ff7826c41a1c 100644 --- a/arch/x86/include/uapi/asm/sigcontext32.h +++ b/arch/x86/include/uapi/asm/sigcontext32.h @@ -3,6 +3,8 @@ #include <linux/types.h> +#include <asm/sigcontext.h> + /* signal context for 32bit programs. */ #define X86_FXSR_MAGIC 0x0000 -- cgit v1.2.3 From c3f4986fb0774bb13feb4bade1276bed2ba8a3db Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 5 Sep 2015 09:32:30 +0200 Subject: x86/headers: Clean up uapi/asm/sigcontext32.h Clean up sigcontext32.h a bit: - use consistent and readable vertical spacing - fix, harmonize and extend comments No field name has been changed, user-space might be relying on them. Acked-by: Mikko Rapeli Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1441438363-9999-3-git-send-email-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/uapi/asm/sigcontext32.h | 110 ++++++++++++++++--------------- 1 file changed, 58 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/sigcontext32.h b/arch/x86/include/uapi/asm/sigcontext32.h index ff7826c41a1c..6ff4fbab650e 100644 --- a/arch/x86/include/uapi/asm/sigcontext32.h +++ b/arch/x86/include/uapi/asm/sigcontext32.h @@ -1,79 +1,85 @@ #ifndef _ASM_X86_SIGCONTEXT32_H #define _ASM_X86_SIGCONTEXT32_H +/* Signal context definitions for compat 32-bit programs: */ + #include <linux/types.h> #include <asm/sigcontext.h> -/* signal context for 32bit programs. 
*/ - -#define X86_FXSR_MAGIC 0x0000 - +/* 10-byte legacy floating point register: */ struct _fpreg { - unsigned short significand[4]; - unsigned short exponent; + unsigned short significand[4]; + unsigned short exponent; }; +/* 16-byte floating point register: */ struct _fpxreg { - unsigned short significand[4]; - unsigned short exponent; - unsigned short padding[3]; + unsigned short significand[4]; + unsigned short exponent; + unsigned short padding[3]; }; +/* 16-byte XMM vector register: */ struct _xmmreg { __u32 element[4]; }; -/* FSAVE frame with extensions */ +#define X86_FXSR_MAGIC 0x0000 + +/* FXSAVE frame: FSAVE frame with extensions */ struct _fpstate_ia32 { - /* Regular FPU environment */ - __u32 cw; - __u32 sw; - __u32 tag; /* not compatible to 64bit twd */ - __u32 ipoff; - __u32 cssel; - __u32 dataoff; - __u32 datasel; - struct _fpreg _st[8]; - unsigned short status; - unsigned short magic; /* 0xffff = regular FPU data only */ + /* Regular FPU environment: */ + __u32 cw; + __u32 sw; + __u32 tag; /* Not compatible with the 64-bit frame */ + __u32 ipoff; + __u32 cssel; + __u32 dataoff; + __u32 datasel; + struct _fpreg _st[8]; + unsigned short status; + unsigned short magic; /* 0xffff: regular FPU data only */ + /* 0x0000: FXSR data */ - /* FXSR FPU environment */ - __u32 _fxsr_env[6]; - __u32 mxcsr; - __u32 reserved; - struct _fpxreg _fxsr_st[8]; - struct _xmmreg _xmm[8]; /* It's actually 16 */ - __u32 padding[44]; + /* Extended FXSR FPU environment: */ + __u32 _fxsr_env[6]; + __u32 mxcsr; + __u32 reserved; + struct _fpxreg _fxsr_st[8]; + struct _xmmreg _xmm[8]; /* The first 8 XMM registers */ + __u32 padding[44]; /* The second 8 XMM registers plus padding */ union { - __u32 padding2[12]; - struct _fpx_sw_bytes sw_reserved; + __u32 padding2[12]; + /* Might encode xstate extensions, see asm/sigcontext.h: */ + struct _fpx_sw_bytes sw_reserved; }; }; +/* 32-bit compat sigcontext: */ struct sigcontext_ia32 { - unsigned short gs, __gsh; - unsigned short fs, __fsh; - unsigned short es, __esh; - unsigned short ds, __dsh; - unsigned int di; - unsigned int si; - unsigned int bp; - unsigned int sp; - unsigned int bx; - unsigned int dx; - unsigned int cx; - unsigned int ax; - unsigned int trapno; - unsigned int err; - unsigned int ip; - unsigned short cs, __csh; - unsigned int flags; - unsigned int sp_at_signal; - unsigned short ss, __ssh; - unsigned int fpstate; /* really (struct _fpstate_ia32 *) */ - unsigned int oldmask; - unsigned int cr2; + unsigned short gs, __gsh; + unsigned short fs, __fsh; + unsigned short es, __esh; + unsigned short ds, __dsh; + unsigned int di; + unsigned int si; + unsigned int bp; + unsigned int sp; + unsigned int bx; + unsigned int dx; + unsigned int cx; + unsigned int ax; + unsigned int trapno; + unsigned int err; + unsigned int ip; + unsigned short cs, __csh; + unsigned int flags; + unsigned int sp_at_signal; + unsigned short ss, __ssh; + unsigned int fpstate; /* Pointer to 'struct _fpstate_ia32' */ + unsigned int oldmask; + unsigned int cr2; }; #endif /* _ASM_X86_SIGCONTEXT32_H */ -- cgit v1.2.3 From cbf5f4fbf435e871e59d49653d555266b8796efc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 5 Sep 2015 09:32:31 +0200 Subject: x86/headers: Clean up and better document uapi/asm/sigcontext.h Clean up sigcontext.h: - the explanations were full of typos and were hard to read in general - use consistent and readable vertical spacing - fix, harmonize and extend comments No field name has been changed, user-space might be relying on them. 
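The magic1/magic2 scheme that the reworked comments in the diff below describe can be exercised from user-space. A minimal sketch for x86-64, assuming glibc's ucontext layout and re-deriving the constants and the 464-byte offset from those comments (illustrative only, not part of the patch):

	#include <signal.h>
	#include <stdint.h>
	#include <string.h>
	#include <ucontext.h>

	/* Values from uapi/asm/sigcontext.h, redefined locally so this
	 * sketch does not have to mix kernel uapi and libc signal headers: */
	#define FP_XSTATE_MAGIC1	0x46505853U
	#define FP_XSTATE_MAGIC2	0x46505845U
	#define FPX_SW_BYTES_OFFSET	464	/* bytes 464..511 hold _fpx_sw_bytes */

	static void handler(int sig, siginfo_t *info, void *uc_void)
	{
		ucontext_t *uc = uc_void;
		const uint8_t *fp = (const uint8_t *)uc->uc_mcontext.fpregs;
		uint32_t magic1, extended_size, magic2;

		(void)sig; (void)info;

		if (!fp)		/* fpstate is zero when there is no FPU context */
			return;

		memcpy(&magic1, fp + FPX_SW_BYTES_OFFSET, sizeof(magic1));
		if (magic1 != FP_XSTATE_MAGIC1)
			return;		/* legacy FXSAVE-only frame, no extended area */

		/* _fpx_sw_bytes.extended_size immediately follows magic1: */
		memcpy(&extended_size, fp + FPX_SW_BYTES_OFFSET + 4, sizeof(extended_size));

		/* The last 32-bit word of the extended area must be MAGIC2: */
		memcpy(&magic2, fp + extended_size - sizeof(magic2), sizeof(magic2));
		if (magic2 == FP_XSTATE_MAGIC2) {
			/* The frame really is a 'struct _xstate': the xstate
			 * header and e.g. YMM state can be located behind it. */
		}
	}

Registered via sigaction() with SA_SIGINFO, such a handler can tell whether extended register state is present before touching it.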
Acked-by: Mikko Rapeli Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1441438363-9999-4-git-send-email-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/uapi/asm/sigcontext.h | 330 ++++++++++++++++++--------------- 1 file changed, 182 insertions(+), 148 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index 40836a9a7250..f89b2f1abe7c 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -1,221 +1,255 @@ #ifndef _UAPI_ASM_X86_SIGCONTEXT_H #define _UAPI_ASM_X86_SIGCONTEXT_H +/* + * Linux signal context definitions. The sigcontext includes a complex hierarchy of CPU + * and FPU state, available to user-space (on the stack) when a signal handler is + * executed. + * + * As over the years this ABI grew from its very simple roots towards supporting more and + * more CPU state organically, some of the details (which were rather clever hacks back + * in the days) became a bit quirky by today. + * + * The current ABI includes flexible provisions for future extensions, so we won't have + * to grow new quirks for quite some time. Promise! + */ + #include #include -#define FP_XSTATE_MAGIC1 0x46505853U -#define FP_XSTATE_MAGIC2 0x46505845U -#define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2) +#define FP_XSTATE_MAGIC1 0x46505853U +#define FP_XSTATE_MAGIC2 0x46505845U +#define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2) /* - * bytes 464..511 in the current 512byte layout of fxsave/fxrstor frame - * are reserved for SW usage. On cpu's supporting xsave/xrstor, these bytes - * are used to extended the fpstate pointer in the sigcontext, which now + * Bytes 464..511 in the current 512-byte layout of the FXSAVE/FXRSTOR frame + * are reserved for SW usage. On CPUs supporting XSAVE/XRSTOR, these bytes + * are used to extend the fpstate pointer in the sigcontext, which now * includes the extended state information along with fpstate information. * - * Presence of FP_XSTATE_MAGIC1 at the beginning of this SW reserved - * area and FP_XSTATE_MAGIC2 at the end of memory layout - * (extended_size - FP_XSTATE_MAGIC2_SIZE) indicates the presence of the - * extended state information in the memory layout pointed by the fpstate - * pointer in sigcontext. + * If sw_reserved.magic1 == FP_XSTATE_MAGIC1 then there's a sw_reserved.extended_size + * bytes large extended context area present. (The last 32-bit word of this extended + * area (at the fpstate+extended_size-FP_XSTATE_MAGIC2_SIZE address) is set to + * FP_XSTATE_MAGIC2 so that you can sanity check your size calculations.) + * + * This extended area typically grows with newer CPUs that have larger and larger + * XSAVE areas. */ struct _fpx_sw_bytes { - __u32 magic1; /* FP_XSTATE_MAGIC1 */ - __u32 extended_size; /* total size of the layout referred by - * fpstate pointer in the sigcontext. - */ - __u64 xfeatures; - /* feature bit mask (including fp/sse/extended - * state) that is present in the memory - * layout. - */ - __u32 xstate_size; /* actual xsave state size, based on the - * features saved in the layout. - * 'extended_size' will be greater than - * 'xstate_size'. - */ - __u32 padding[7]; /* for future use. */ + /* If set to FP_XSTATE_MAGIC1 then this is an xstate context. 0 if a legacy frame. 
*/ + __u32 magic1; + + /* + * Total size of the fpstate area: + * + * - if magic1 == 0 then it's sizeof(struct _fpstate) + * - if magic1 == FP_XSTATE_MAGIC1 then it's sizeof(struct _xstate) plus extensions (if any) + */ + __u32 extended_size; + + /* + * Feature bit mask (including FP/SSE/extended state) that is present + * in the memory layout: + */ + __u64 xfeatures; + + /* + * Actual XSAVE state size, based on the xfeatures saved in the layout. + * 'extended_size' is greater than 'xstate_size': + */ + __u32 xstate_size; + + /* For future use: */ + __u32 padding[7]; }; #ifdef __i386__ /* - * As documented in the iBCS2 standard.. + * As documented in the iBCS2 standard: * * The first part of "struct _fpstate" is just the normal i387 * hardware setup, the extra "status" word is used to save the * coprocessor status word before entering the handler. * - * Pentium III FXSR, SSE support - * Gareth Hughes , May 2000 - * * The FPU state data structure has had to grow to accommodate the * extended FPU state required by the Streaming SIMD Extensions. * There is no documented standard to accomplish this at the moment. */ + +/* 10-byte legacy floating point register: */ struct _fpreg { - unsigned short significand[4]; - unsigned short exponent; + unsigned short significand[4]; + unsigned short exponent; }; +/* 16-byte floating point register: */ struct _fpxreg { - unsigned short significand[4]; - unsigned short exponent; - unsigned short padding[3]; + unsigned short significand[4]; + unsigned short exponent; + unsigned short padding[3]; }; +/* 16-byte XMM register: */ struct _xmmreg { - unsigned long element[4]; + unsigned long element[4]; }; +#define X86_FXSR_MAGIC 0x0000 + struct _fpstate { - /* Regular FPU environment */ - unsigned long cw; - unsigned long sw; - unsigned long tag; - unsigned long ipoff; - unsigned long cssel; - unsigned long dataoff; - unsigned long datasel; - struct _fpreg _st[8]; - unsigned short status; - unsigned short magic; /* 0xffff = regular FPU data only */ + /* Legacy FPU environment: */ + unsigned long cw; + unsigned long sw; + unsigned long tag; + unsigned long ipoff; + unsigned long cssel; + unsigned long dataoff; + unsigned long datasel; + struct _fpreg _st[8]; + unsigned short status; + unsigned short magic; /* 0xffff: regular FPU data only */ + /* 0x0000: FXSR FPU data */ /* FXSR FPU environment */ - unsigned long _fxsr_env[6]; /* FXSR FPU env is ignored */ - unsigned long mxcsr; - unsigned long reserved; - struct _fpxreg _fxsr_st[8]; /* FXSR FPU reg data is ignored */ - struct _xmmreg _xmm[8]; - unsigned long padding1[44]; + unsigned long _fxsr_env[6]; /* FXSR FPU env is ignored */ + unsigned long mxcsr; + unsigned long reserved; + struct _fpxreg _fxsr_st[8]; /* FXSR FPU reg data is ignored */ + struct _xmmreg _xmm[8]; /* First 8 XMM registers */ + unsigned long padding1[44]; /* Second 8 XMM registers plus padding */ union { - unsigned long padding2[12]; - struct _fpx_sw_bytes sw_reserved; /* represents the extended - * state info */ + unsigned long padding2[12]; + struct _fpx_sw_bytes sw_reserved; /* Potential extended state is encoded here */ }; }; -#define X86_FXSR_MAGIC 0x0000 - -#ifndef __KERNEL__ +# ifndef __KERNEL__ /* * User-space might still rely on the old definition: */ struct sigcontext { - unsigned short gs, __gsh; - unsigned short fs, __fsh; - unsigned short es, __esh; - unsigned short ds, __dsh; - unsigned long edi; - unsigned long esi; - unsigned long ebp; - unsigned long esp; - unsigned long ebx; - unsigned long edx; - unsigned long ecx; - 
unsigned long eax; - unsigned long trapno; - unsigned long err; - unsigned long eip; - unsigned short cs, __csh; - unsigned long eflags; - unsigned long esp_at_signal; - unsigned short ss, __ssh; - struct _fpstate __user *fpstate; - unsigned long oldmask; - unsigned long cr2; + unsigned short gs, __gsh; + unsigned short fs, __fsh; + unsigned short es, __esh; + unsigned short ds, __dsh; + unsigned long edi; + unsigned long esi; + unsigned long ebp; + unsigned long esp; + unsigned long ebx; + unsigned long edx; + unsigned long ecx; + unsigned long eax; + unsigned long trapno; + unsigned long err; + unsigned long eip; + unsigned short cs, __csh; + unsigned long eflags; + unsigned long esp_at_signal; + unsigned short ss, __ssh; + struct _fpstate __user *fpstate; + unsigned long oldmask; + unsigned long cr2; }; -#endif /* !__KERNEL__ */ +# endif /* !__KERNEL__ */ -#else /* __i386__ */ +#else /* __x86_64__: */ -/* FXSAVE frame */ -/* Note: reserved1/2 may someday contain valuable data. Always save/restore - them when you change signal frames. */ +/* + * The FXSAVE frame. + * + * Note1: If sw_reserved.magic1 == FP_XSTATE_MAGIC1 then the structure is + * larger: 'struct _xstate'. Note that 'struct _xstate' embedds + * 'struct _fpstate' so that you can always assume the _fpstate portion + * exists so that you can check the magic value. + * + * Note2: Reserved fields may someday contain valuable data. Always save/restore + * them when you change signal frames. + */ struct _fpstate { - __u16 cwd; - __u16 swd; - __u16 twd; /* Note this is not the same as the - 32bit/x87/FSAVE twd */ - __u16 fop; - __u64 rip; - __u64 rdp; - __u32 mxcsr; - __u32 mxcsr_mask; - __u32 st_space[32]; /* 8*16 bytes for each FP-reg */ - __u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg */ - __u32 reserved2[12]; + __u16 cwd; + __u16 swd; + /* Note this is not the same as the 32-bit/x87/FSAVE twd: */ + __u16 twd; + __u16 fop; + __u64 rip; + __u64 rdp; + __u32 mxcsr; + __u32 mxcsr_mask; + __u32 st_space[32]; /* 8x FP registers, 16 bytes each */ + __u32 xmm_space[64]; /* 16x XMM registers, 16 bytes each */ + __u32 reserved2[12]; union { - __u32 reserved3[12]; - struct _fpx_sw_bytes sw_reserved; /* represents the extended - * state information */ + __u32 reserved3[12]; + struct _fpx_sw_bytes sw_reserved; /* Potential extended state is encoded here */ }; }; -#ifndef __KERNEL__ +# ifndef __KERNEL__ /* * User-space might still rely on the old definition: */ struct sigcontext { - __u64 r8; - __u64 r9; - __u64 r10; - __u64 r11; - __u64 r12; - __u64 r13; - __u64 r14; - __u64 r15; - __u64 rdi; - __u64 rsi; - __u64 rbp; - __u64 rbx; - __u64 rdx; - __u64 rax; - __u64 rcx; - __u64 rsp; - __u64 rip; - __u64 eflags; /* RFLAGS */ - __u16 cs; - __u16 gs; - __u16 fs; - __u16 __pad0; - __u64 err; - __u64 trapno; - __u64 oldmask; - __u64 cr2; - struct _fpstate __user *fpstate; /* zero when no FPU context */ -#ifdef __ILP32__ - __u32 __fpstate_pad; -#endif - __u64 reserved1[8]; + __u64 r8; + __u64 r9; + __u64 r10; + __u64 r11; + __u64 r12; + __u64 r13; + __u64 r14; + __u64 r15; + __u64 rdi; + __u64 rsi; + __u64 rbp; + __u64 rbx; + __u64 rdx; + __u64 rax; + __u64 rcx; + __u64 rsp; + __u64 rip; + __u64 eflags; /* RFLAGS */ + __u16 cs; + __u16 gs; + __u16 fs; + __u16 __pad0; + __u64 err; + __u64 trapno; + __u64 oldmask; + __u64 cr2; + struct _fpstate __user *fpstate; /* Zero when no FPU context */ +# ifdef __ILP32__ + __u32 __fpstate_pad; +# endif + __u64 reserved1[8]; }; -#endif /* !__KERNEL__ */ +# endif /* !__KERNEL__ */ -#endif /* !__i386__ */ 
+#endif /* __x86_64__ */ struct _header { - __u64 xfeatures; - __u64 reserved1[2]; - __u64 reserved2[5]; + __u64 xfeatures; + __u64 reserved1[2]; + __u64 reserved2[5]; }; struct _ymmh_state { - /* 16 * 16 bytes for each YMMH-reg */ - __u32 ymmh_space[64]; + /* 16x YMM registers, 16 bytes each: */ + __u32 ymmh_space[64]; }; /* - * Extended state pointed by the fpstate pointer in the sigcontext. - * In addition to the fpstate, information encoded in the xstate_hdr - * indicates the presence of other extended state information - * supported by the processor and OS. + * Extended state pointed to by sigcontext::fpstate. + * + * In addition to the fpstate, information encoded in _xstate::xstate_hdr + * indicates the presence of other extended state information supported + * by the CPU and kernel: */ struct _xstate { - struct _fpstate fpstate; - struct _header xstate_hdr; - struct _ymmh_state ymmh; - /* new processor state extensions go here */ + struct _fpstate fpstate; + struct _header xstate_hdr; + struct _ymmh_state ymmh; + /* New processor state extensions go here: */ }; #endif /* _UAPI_ASM_X86_SIGCONTEXT_H */ -- cgit v1.2.3 From 128f8257a17a47b9a40f550cc2f36458cd8c07b0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 5 Sep 2015 09:32:32 +0200 Subject: x86/headers: Separate out legacy user-space structure definitions Better separate the user-space struct sigcontext definitions from the kernel definitions, so that we can unify the kernel definitions with sigcontext32.h. Acked-by: Mikko Rapeli Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1441438363-9999-5-git-send-email-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/uapi/asm/sigcontext.h | 119 ++++++++++++++++----------------- 1 file changed, 59 insertions(+), 60 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index f89b2f1abe7c..40d6cbac08c6 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -124,36 +124,6 @@ struct _fpstate { }; }; -# ifndef __KERNEL__ -/* - * User-space might still rely on the old definition: - */ -struct sigcontext { - unsigned short gs, __gsh; - unsigned short fs, __fsh; - unsigned short es, __esh; - unsigned short ds, __dsh; - unsigned long edi; - unsigned long esi; - unsigned long ebp; - unsigned long esp; - unsigned long ebx; - unsigned long edx; - unsigned long ecx; - unsigned long eax; - unsigned long trapno; - unsigned long err; - unsigned long eip; - unsigned short cs, __csh; - unsigned long eflags; - unsigned long esp_at_signal; - unsigned short ss, __ssh; - struct _fpstate __user *fpstate; - unsigned long oldmask; - unsigned long cr2; -}; -# endif /* !__KERNEL__ */ - #else /* __x86_64__: */ /* @@ -186,10 +156,65 @@ struct _fpstate { }; }; -# ifndef __KERNEL__ +#endif /* __x86_64__ */ + +struct _header { + __u64 xfeatures; + __u64 reserved1[2]; + __u64 reserved2[5]; +}; + +struct _ymmh_state { + /* 16x YMM registers, 16 bytes each: */ + __u32 ymmh_space[64]; +}; + /* - * User-space might still rely on the old definition: + * Extended state pointed to by sigcontext::fpstate. 
+ * + * In addition to the fpstate, information encoded in _xstate::xstate_hdr + * indicates the presence of other extended state information supported + * by the CPU and kernel: */ +struct _xstate { + struct _fpstate fpstate; + struct _header xstate_hdr; + struct _ymmh_state ymmh; + /* New processor state extensions go here: */ +}; + +/* + * The old user-space sigcontext definition, just in case user-space still + * relies on it. The kernel definition (in asm/sigcontext.h) has unified + * field names but otherwise the same layout. + */ +#ifndef __KERNEL__ +# ifdef __i386__ +struct sigcontext { + unsigned short gs, __gsh; + unsigned short fs, __fsh; + unsigned short es, __esh; + unsigned short ds, __dsh; + unsigned long edi; + unsigned long esi; + unsigned long ebp; + unsigned long esp; + unsigned long ebx; + unsigned long edx; + unsigned long ecx; + unsigned long eax; + unsigned long trapno; + unsigned long err; + unsigned long eip; + unsigned short cs, __csh; + unsigned long eflags; + unsigned long esp_at_signal; + unsigned short ss, __ssh; + struct _fpstate __user *fpstate; + unsigned long oldmask; + unsigned long cr2; +}; +# else /* __x86_64__: */ struct sigcontext { __u64 r8; __u64 r9; @@ -223,33 +248,7 @@ struct sigcontext { # endif __u64 reserved1[8]; }; -# endif /* !__KERNEL__ */ - -#endif /* __x86_64__ */ - -struct _header { - __u64 xfeatures; - __u64 reserved1[2]; - __u64 reserved2[5]; -}; - -struct _ymmh_state { - /* 16x YMM registers, 16 bytes each: */ - __u32 ymmh_space[64]; -}; - -/* - * Extended state pointed to by sigcontext::fpstate. - * - * In addition to the fpstate, information encoded in _xstate::xstate_hdr - * indicates the presence of other extended state information supported - * by the CPU and kernel: - */ -struct _xstate { - struct _fpstate fpstate; - struct _header xstate_hdr; - struct _ymmh_state ymmh; - /* New processor state extensions go here: */ -}; +# endif /* __x86_64__ */ +#endif /* !__KERNEL__ */ #endif /* _UAPI_ASM_X86_SIGCONTEXT_H */ -- cgit v1.2.3 From 3f623a5b27c150451387e358774131780dbd2407 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 5 Sep 2015 09:32:33 +0200 Subject: x86/headers: Use ABI types consistently in sigcontext*.h Use the __u16/32/64 types we standardized on in ABI definitions - and which most of this header was already using. This will allow us to more obviously unify the compat header into the main header. No change in functionality. Acked-by: Mikko Rapeli Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1441438363-9999-6-git-send-email-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/uapi/asm/sigcontext.h | 82 ++++++++++++++++---------------- arch/x86/include/uapi/asm/sigcontext32.h | 58 +++++++++++----------- 2 files changed, 70 insertions(+), 70 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index 40d6cbac08c6..07b0e32a1d23 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -78,48 +78,48 @@ struct _fpx_sw_bytes { /* 10-byte legacy floating point register: */ struct _fpreg { - unsigned short significand[4]; - unsigned short exponent; + __u16 significand[4]; + __u16 exponent; }; /* 16-byte floating point register: */ struct _fpxreg { - unsigned short significand[4]; - unsigned short exponent; - unsigned short padding[3]; + __u16 significand[4]; + __u16 exponent; + __u16 padding[3]; }; /* 16-byte XMM register: */ struct _xmmreg { - unsigned long element[4]; + __u32 element[4]; }; #define X86_FXSR_MAGIC 0x0000 struct _fpstate { /* Legacy FPU environment: */ - unsigned long cw; - unsigned long sw; - unsigned long tag; - unsigned long ipoff; - unsigned long cssel; - unsigned long dataoff; - unsigned long datasel; + __u32 cw; + __u32 sw; + __u32 tag; + __u32 ipoff; + __u32 cssel; + __u32 dataoff; + __u32 datasel; struct _fpreg _st[8]; - unsigned short status; - unsigned short magic; /* 0xffff: regular FPU data only */ + __u16 status; + __u16 magic; /* 0xffff: regular FPU data only */ /* 0x0000: FXSR FPU data */ /* FXSR FPU environment */ - unsigned long _fxsr_env[6]; /* FXSR FPU env is ignored */ - unsigned long mxcsr; - unsigned long reserved; + __u32 _fxsr_env[6]; /* FXSR FPU env is ignored */ + __u32 mxcsr; + __u32 reserved; struct _fpxreg _fxsr_st[8]; /* FXSR FPU reg data is ignored */ struct _xmmreg _xmm[8]; /* First 8 XMM registers */ - unsigned long padding1[44]; /* Second 8 XMM registers plus padding */ + __u32 padding1[44]; /* Second 8 XMM registers plus padding */ union { - unsigned long padding2[12]; + __u32 padding2[12]; struct _fpx_sw_bytes sw_reserved; /* Potential extended state is encoded here */ }; }; @@ -191,28 +191,28 @@ struct _xstate { #ifndef __KERNEL__ # ifdef __i386__ struct sigcontext { - unsigned short gs, __gsh; - unsigned short fs, __fsh; - unsigned short es, __esh; - unsigned short ds, __dsh; - unsigned long edi; - unsigned long esi; - unsigned long ebp; - unsigned long esp; - unsigned long ebx; - unsigned long edx; - unsigned long ecx; - unsigned long eax; - unsigned long trapno; - unsigned long err; - unsigned long eip; - unsigned short cs, __csh; - unsigned long eflags; - unsigned long esp_at_signal; - unsigned short ss, __ssh; + __u16 gs, __gsh; + __u16 fs, __fsh; + __u16 es, __esh; + __u16 ds, __dsh; + __u32 edi; + __u32 esi; + __u32 ebp; + __u32 esp; + __u32 ebx; + __u32 edx; + __u32 ecx; + __u32 eax; + __u32 trapno; + __u32 err; + __u32 eip; + __u16 cs, __csh; + __u32 eflags; + __u32 esp_at_signal; + __u16 ss, __ssh; struct _fpstate __user *fpstate; - unsigned long oldmask; - unsigned long cr2; + __u32 oldmask; + __u32 cr2; }; # else /* __x86_64__: */ struct sigcontext { diff --git a/arch/x86/include/uapi/asm/sigcontext32.h b/arch/x86/include/uapi/asm/sigcontext32.h index 6ff4fbab650e..939a84885673 100644 --- a/arch/x86/include/uapi/asm/sigcontext32.h +++ 
b/arch/x86/include/uapi/asm/sigcontext32.h @@ -9,15 +9,15 @@ /* 10-byte legacy floating point register: */ struct _fpreg { - unsigned short significand[4]; - unsigned short exponent; + __u16 significand[4]; + __u16 exponent; }; /* 16-byte floating point register: */ struct _fpxreg { - unsigned short significand[4]; - unsigned short exponent; - unsigned short padding[3]; + __u16 significand[4]; + __u16 exponent; + __u16 padding[3]; }; /* 16-byte XMM vector register: */ @@ -38,8 +38,8 @@ struct _fpstate_ia32 { __u32 dataoff; __u32 datasel; struct _fpreg _st[8]; - unsigned short status; - unsigned short magic; /* 0xffff: regular FPU data only */ + __u16 status; + __u16 magic; /* 0xffff: regular FPU data only */ /* 0x0000: FXSR data */ /* Extended FXSR FPU environment: */ @@ -58,28 +58,28 @@ struct _fpstate_ia32 { /* 32-bit compat sigcontext: */ struct sigcontext_ia32 { - unsigned short gs, __gsh; - unsigned short fs, __fsh; - unsigned short es, __esh; - unsigned short ds, __dsh; - unsigned int di; - unsigned int si; - unsigned int bp; - unsigned int sp; - unsigned int bx; - unsigned int dx; - unsigned int cx; - unsigned int ax; - unsigned int trapno; - unsigned int err; - unsigned int ip; - unsigned short cs, __csh; - unsigned int flags; - unsigned int sp_at_signal; - unsigned short ss, __ssh; - unsigned int fpstate; /* Pointer to 'struct _fpstate_ia32' */ - unsigned int oldmask; - unsigned int cr2; + __u16 gs, __gsh; + __u16 fs, __fsh; + __u16 es, __esh; + __u16 ds, __dsh; + __u32 di; + __u32 si; + __u32 bp; + __u32 sp; + __u32 bx; + __u32 dx; + __u32 cx; + __u32 ax; + __u32 trapno; + __u32 err; + __u32 ip; + __u16 cs, __csh; + __u32 flags; + __u32 sp_at_signal; + __u16 ss, __ssh; + __u32 fpstate; /* Pointer to 'struct _fpstate_ia32' */ + __u32 oldmask; + __u32 cr2; }; #endif /* _ASM_X86_SIGCONTEXT32_H */ -- cgit v1.2.3 From 337a167d1a5b2704414679d1a993220a4613ec13 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 5 Sep 2015 09:32:34 +0200 Subject: x86/headers: Unify register type definitions between 32-bit compat and i386 The following sigcontext related types were duplicated across native 32-bit and compat 32-bit headers: struct _fpreg; struct _fpxreg; struct _xmmreg; X86_FXSR_MAGIC Unify them. Acked-by: Mikko Rapeli Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1441438363-9999-7-git-send-email-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/uapi/asm/sigcontext.h | 3 ++- arch/x86/include/uapi/asm/sigcontext32.h | 22 ---------------------- 2 files changed, 2 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index 07b0e32a1d23..9df4df3e40ef 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -63,7 +63,6 @@ struct _fpx_sw_bytes { __u32 padding[7]; }; -#ifdef __i386__ /* * As documented in the iBCS2 standard: * @@ -96,6 +95,8 @@ struct _xmmreg { #define X86_FXSR_MAGIC 0x0000 +#ifdef __i386__ + struct _fpstate { /* Legacy FPU environment: */ __u32 cw; diff --git a/arch/x86/include/uapi/asm/sigcontext32.h b/arch/x86/include/uapi/asm/sigcontext32.h index 939a84885673..356caab997e7 100644 --- a/arch/x86/include/uapi/asm/sigcontext32.h +++ b/arch/x86/include/uapi/asm/sigcontext32.h @@ -3,30 +3,8 @@ /* Signal context definitions for compat 32-bit programs: */ -#include - #include -/* 10-byte legacy floating point register: */ -struct _fpreg { - __u16 significand[4]; - __u16 exponent; -}; - -/* 16-byte floating point register: */ -struct _fpxreg { - __u16 significand[4]; - __u16 exponent; - __u16 padding[3]; -}; - -/* 16-byte XMM vector register: */ -struct _xmmreg { - __u32 element[4]; -}; - -#define X86_FXSR_MAGIC 0x0000 - /* FXSAVE frame: FSAVE frame with extensions */ struct _fpstate_ia32 { /* Regular FPU environment: */ -- cgit v1.2.3 From 7bb0dc2222779b4cbf1ec9ad651e500e62fa5b11 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 5 Sep 2015 09:32:35 +0200 Subject: x86/headers: Unify 'struct _fpstate_ia32' and i386 struct _fpstate 'struct _fpstate_ia32' and 'struct _fpstate' on i386 are identical in all fields, except 'padding1' being named 'padding'. We unify the two structures and add a union that is both named 'padding1' and 'padding', in the (unlikely) case there's user-space code that relies on the padding field name. We rename the two main types to be: struct _fpstate_32 struct _fpstate_64 for the 32-bit and 64-bit frame, and map them to the main and compat structure names (_fpstate) depending on whether we are on 32-bit or on 64-bit kernels. We also keep the old _fpstate_ia32 name as a legacy name. Acked-by: Mikko Rapeli Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1441438363-9999-8-git-send-email-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/sigframe.h | 1 - arch/x86/include/uapi/asm/sigcontext.h | 26 +++++++++++++++++--------- arch/x86/include/uapi/asm/sigcontext32.h | 29 ----------------------------- 3 files changed, 17 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index 1f3175bb994e..a27c73d6863a 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -10,7 +10,6 @@ #define sigframe_ia32 sigframe #define rt_sigframe_ia32 rt_sigframe #define sigcontext_ia32 sigcontext -#define _fpstate_ia32 _fpstate #define ucontext_ia32 ucontext #else /* !CONFIG_X86_32 */ diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index 9df4df3e40ef..85811167821f 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -95,9 +95,10 @@ struct _xmmreg { #define X86_FXSR_MAGIC 0x0000 -#ifdef __i386__ - -struct _fpstate { +/* + * The 32-bit FPU frame: + */ +struct _fpstate_32 { /* Legacy FPU environment: */ __u32 cw; __u32 sw; @@ -117,7 +118,10 @@ struct _fpstate { __u32 reserved; struct _fpxreg _fxsr_st[8]; /* FXSR FPU reg data is ignored */ struct _xmmreg _xmm[8]; /* First 8 XMM registers */ - __u32 padding1[44]; /* Second 8 XMM registers plus padding */ + union { + __u32 padding1[44]; /* Second 8 XMM registers plus padding */ + __u32 padding[44]; /* Alias name for old user-space */ + }; union { __u32 padding2[12]; @@ -125,10 +129,8 @@ struct _fpstate { }; }; -#else /* __x86_64__: */ - /* - * The FXSAVE frame. + * The 64-bit FPU frame. (FXSAVE format and later) * * Note1: If sw_reserved.magic1 == FP_XSTATE_MAGIC1 then the structure is * larger: 'struct _xstate'. Note that 'struct _xstate' embedds @@ -138,7 +140,7 @@ struct _fpstate { * Note2: Reserved fields may someday contain valuable data. Always save/restore * them when you change signal frames. 
*/ -struct _fpstate { +struct _fpstate_64 { __u16 cwd; __u16 swd; /* Note this is not the same as the 32-bit/x87/FSAVE twd: */ @@ -157,7 +159,13 @@ struct _fpstate { }; }; -#endif /* __x86_64__ */ +#ifdef __i386__ +# define _fpstate _fpstate_32 +#else +# define _fpstate _fpstate_64 +#endif + +#define _fpstate_ia32 _fpstate_32 struct _header { __u64 xfeatures; diff --git a/arch/x86/include/uapi/asm/sigcontext32.h b/arch/x86/include/uapi/asm/sigcontext32.h index 356caab997e7..19a89165ea1d 100644 --- a/arch/x86/include/uapi/asm/sigcontext32.h +++ b/arch/x86/include/uapi/asm/sigcontext32.h @@ -5,35 +5,6 @@ #include -/* FXSAVE frame: FSAVE frame with extensions */ -struct _fpstate_ia32 { - /* Regular FPU environment: */ - __u32 cw; - __u32 sw; - __u32 tag; /* Not compatible with the 64-bit frame */ - __u32 ipoff; - __u32 cssel; - __u32 dataoff; - __u32 datasel; - struct _fpreg _st[8]; - __u16 status; - __u16 magic; /* 0xffff: regular FPU data only */ - /* 0x0000: FXSR data */ - - /* Extended FXSR FPU environment: */ - __u32 _fxsr_env[6]; - __u32 mxcsr; - __u32 reserved; - struct _fpxreg _fxsr_st[8]; - struct _xmmreg _xmm[8]; /* The first 8 XMM registers */ - __u32 padding[44]; /* The second 8 XMM registers plus padding */ - union { - __u32 padding2[12]; - /* Might encode xstate extensions, see asm/sigcontext.h: */ - struct _fpx_sw_bytes sw_reserved; - }; -}; - /* 32-bit compat sigcontext: */ struct sigcontext_ia32 { __u16 gs, __gsh; -- cgit v1.2.3 From 86e9fc3a0ecdd3c21a938e325daf462ca1724f68 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 5 Sep 2015 09:32:36 +0200 Subject: x86/headers: Convert uses of _fpstate_ia32 to _fpstate_32 Remove uses of _fpstate_ia32 from the kernel, and move the legacy _fpstate_ia32 definition to the user-space only portion of the header. Acked-by: Mikko Rapeli Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1441438363-9999-9-git-send-email-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/include/asm/sigframe.h | 2 +- arch/x86/include/uapi/asm/sigcontext.h | 5 +++-- arch/x86/include/uapi/asm/sigcontext32.h | 2 +- arch/x86/kernel/fpu/signal.c | 4 ++-- 5 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index a0a19b7ba22d..b22015e847dd 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -234,7 +234,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, unsigned long fx_aligned, math_size; sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size); - *fpstate = (struct _fpstate_ia32 __user *) sp; + *fpstate = (struct _fpstate_32 __user *) sp; if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, math_size) < 0) return (void __user *) -1L; diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index a27c73d6863a..5ff020a65e1b 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -32,7 +32,7 @@ struct sigframe_ia32 { * the offset of extramask[] in the sigframe and thus prevent any * legacy application accessing/modifying it. 
*/ - struct _fpstate_ia32 fpstate_unused; + struct _fpstate_32 fpstate_unused; #ifdef CONFIG_IA32_EMULATION unsigned int extramask[_COMPAT_NSIG_WORDS-1]; #else /* !CONFIG_IA32_EMULATION */ diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index 85811167821f..ca542e37c783 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -165,8 +165,6 @@ struct _fpstate_64 { # define _fpstate _fpstate_64 #endif -#define _fpstate_ia32 _fpstate_32 - struct _header { __u64 xfeatures; __u64 reserved1[2]; @@ -198,6 +196,9 @@ struct _xstate { * field names but otherwise the same layout. */ #ifndef __KERNEL__ + +#define _fpstate_ia32 _fpstate_32 + # ifdef __i386__ struct sigcontext { __u16 gs, __gsh; diff --git a/arch/x86/include/uapi/asm/sigcontext32.h b/arch/x86/include/uapi/asm/sigcontext32.h index 19a89165ea1d..06c749b40263 100644 --- a/arch/x86/include/uapi/asm/sigcontext32.h +++ b/arch/x86/include/uapi/asm/sigcontext32.h @@ -26,7 +26,7 @@ struct sigcontext_ia32 { __u32 flags; __u32 sp_at_signal; __u16 ss, __ssh; - __u32 fpstate; /* Pointer to 'struct _fpstate_ia32' */ + __u32 fpstate; /* Pointer to 'struct _fpstate_32' */ __u32 oldmask; __u32 cr2; }; diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 50ec9af1bd51..24aac16603a2 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -56,7 +56,7 @@ static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) if (use_fxsr()) { struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; struct user_i387_ia32_struct env; - struct _fpstate_ia32 __user *fp = buf; + struct _fpstate_32 __user *fp = buf; convert_from_fxsr(&env, tsk); @@ -165,7 +165,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_get(current, NULL, 0, sizeof(struct user_i387_ia32_struct), NULL, - (struct _fpstate_ia32 __user *) buf) ? -1 : 1; + (struct _fpstate_32 __user *) buf) ? -1 : 1; if (fpregs_active()) { /* Save the live register state to the user directly. */ -- cgit v1.2.3 From 2d057c69e743bfb88f1259ddbf1b1160bdd850e5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 5 Sep 2015 09:32:37 +0200 Subject: x86/headers: Clean up the kernel's struct sigcontext types to be ABI-clean Use the __u16/32/64 types we standardized on in ABI definitions and which other sigcontext related types are already using. This will help unify struct sigcontext types between native 32-bit, compat and 64-bit kernels. Acked-by: Mikko Rapeli Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1441438363-9999-10-git-send-email-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/sigcontext.h | 104 +++++++++++++++++++------------------- 1 file changed, 52 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h index 9dfce4e0417d..b9c2bd6402df 100644 --- a/arch/x86/include/asm/sigcontext.h +++ b/arch/x86/include/asm/sigcontext.h @@ -5,25 +5,25 @@ #ifdef __i386__ struct sigcontext { - unsigned short gs, __gsh; - unsigned short fs, __fsh; - unsigned short es, __esh; - unsigned short ds, __dsh; - unsigned long di; - unsigned long si; - unsigned long bp; - unsigned long sp; - unsigned long bx; - unsigned long dx; - unsigned long cx; - unsigned long ax; - unsigned long trapno; - unsigned long err; - unsigned long ip; - unsigned short cs, __csh; - unsigned long flags; - unsigned long sp_at_signal; - unsigned short ss, __ssh; + __u16 gs, __gsh; + __u16 fs, __fsh; + __u16 es, __esh; + __u16 ds, __dsh; + __u32 di; + __u32 si; + __u32 bp; + __u32 sp; + __u32 bx; + __u32 dx; + __u32 cx; + __u32 ax; + __u32 trapno; + __u32 err; + __u32 ip; + __u16 cs, __csh; + __u32 flags; + __u32 sp_at_signal; + __u16 ss, __ssh; /* * fpstate is really (struct _fpstate *) or (struct _xstate *) @@ -32,38 +32,38 @@ struct sigcontext { * of extended memory layout. See comments at the definition of * (struct _fpx_sw_bytes) */ - void __user *fpstate; /* zero when no FPU/extended context */ - unsigned long oldmask; - unsigned long cr2; + void __user *fpstate; /* Zero when no FPU/extended context */ + __u32 oldmask; + __u32 cr2; }; -#else /* __i386__ */ +#else /* __x86_64__: */ struct sigcontext { - unsigned long r8; - unsigned long r9; - unsigned long r10; - unsigned long r11; - unsigned long r12; - unsigned long r13; - unsigned long r14; - unsigned long r15; - unsigned long di; - unsigned long si; - unsigned long bp; - unsigned long bx; - unsigned long dx; - unsigned long ax; - unsigned long cx; - unsigned long sp; - unsigned long ip; - unsigned long flags; - unsigned short cs; - unsigned short gs; - unsigned short fs; - unsigned short __pad0; - unsigned long err; - unsigned long trapno; - unsigned long oldmask; - unsigned long cr2; + __u64 r8; + __u64 r9; + __u64 r10; + __u64 r11; + __u64 r12; + __u64 r13; + __u64 r14; + __u64 r15; + __u64 di; + __u64 si; + __u64 bp; + __u64 bx; + __u64 dx; + __u64 ax; + __u64 cx; + __u64 sp; + __u64 ip; + __u64 flags; + __u16 cs; + __u16 gs; + __u16 fs; + __u16 __pad0; + __u64 err; + __u64 trapno; + __u64 oldmask; + __u64 cr2; /* * fpstate is really (struct _fpstate *) or (struct _xstate *) @@ -72,8 +72,8 @@ struct sigcontext { * of extended memory layout. 
See comments at the definition of * (struct _fpx_sw_bytes) */ - void __user *fpstate; /* zero when no FPU/extended context */ - unsigned long reserved1[8]; + void __user *fpstate; /* Zero when no FPU/extended context */ + __u64 reserved1[8]; }; -#endif /* !__i386__ */ +#endif /* !__x86_64__ */ #endif /* _ASM_X86_SIGCONTEXT_H */ -- cgit v1.2.3 From f2c609bca0b6526810fa22330ce4d173cc023ef8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 5 Sep 2015 09:32:38 +0200 Subject: x86/headers: Move the 'struct sigcontext' definitions into the UAPI header Our goal is to eliminate the duplicate struct sigcontext_ia32 definition, so move the kernel's primary sigcontext type into the UAPI header, defining these two variants: struct sigcontext_32 struct sigcontext_64 ... and map them to 'struct sigcontext'. Acked-by: Mikko Rapeli Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1441438363-9999-11-git-send-email-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/sigcontext.h | 73 ------------------------------ arch/x86/include/uapi/asm/sigcontext.h | 83 ++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 73 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h index b9c2bd6402df..25815f00b4ff 100644 --- a/arch/x86/include/asm/sigcontext.h +++ b/arch/x86/include/asm/sigcontext.h @@ -3,77 +3,4 @@ #include -#ifdef __i386__ -struct sigcontext { - __u16 gs, __gsh; - __u16 fs, __fsh; - __u16 es, __esh; - __u16 ds, __dsh; - __u32 di; - __u32 si; - __u32 bp; - __u32 sp; - __u32 bx; - __u32 dx; - __u32 cx; - __u32 ax; - __u32 trapno; - __u32 err; - __u32 ip; - __u16 cs, __csh; - __u32 flags; - __u32 sp_at_signal; - __u16 ss, __ssh; - - /* - * fpstate is really (struct _fpstate *) or (struct _xstate *) - * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved - * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end - * of extended memory layout. See comments at the definition of - * (struct _fpx_sw_bytes) - */ - void __user *fpstate; /* Zero when no FPU/extended context */ - __u32 oldmask; - __u32 cr2; -}; -#else /* __x86_64__: */ -struct sigcontext { - __u64 r8; - __u64 r9; - __u64 r10; - __u64 r11; - __u64 r12; - __u64 r13; - __u64 r14; - __u64 r15; - __u64 di; - __u64 si; - __u64 bp; - __u64 bx; - __u64 dx; - __u64 ax; - __u64 cx; - __u64 sp; - __u64 ip; - __u64 flags; - __u16 cs; - __u16 gs; - __u16 fs; - __u16 __pad0; - __u64 err; - __u64 trapno; - __u64 oldmask; - __u64 cr2; - - /* - * fpstate is really (struct _fpstate *) or (struct _xstate *) - * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved - * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end - * of extended memory layout. 
See comments at the definition of - * (struct _fpx_sw_bytes) - */ - void __user *fpstate; /* Zero when no FPU/extended context */ - __u64 reserved1[8]; -}; -#endif /* !__x86_64__ */ #endif /* _ASM_X86_SIGCONTEXT_H */ diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index ca542e37c783..3591cef6d7d2 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -190,6 +190,89 @@ struct _xstate { /* New processor state extensions go here: */ }; +struct sigcontext_32 { + __u16 gs, __gsh; + __u16 fs, __fsh; + __u16 es, __esh; + __u16 ds, __dsh; + __u32 di; + __u32 si; + __u32 bp; + __u32 sp; + __u32 bx; + __u32 dx; + __u32 cx; + __u32 ax; + __u32 trapno; + __u32 err; + __u32 ip; + __u16 cs, __csh; + __u32 flags; + __u32 sp_at_signal; + __u16 ss, __ssh; + + /* + * fpstate is really (struct _fpstate *) or (struct _xstate *) + * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved + * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end + * of extended memory layout. See comments at the definition of + * (struct _fpx_sw_bytes) + */ + void __user *fpstate; /* Zero when no FPU/extended context */ + __u32 oldmask; + __u32 cr2; +}; + +struct sigcontext_64 { + __u64 r8; + __u64 r9; + __u64 r10; + __u64 r11; + __u64 r12; + __u64 r13; + __u64 r14; + __u64 r15; + __u64 di; + __u64 si; + __u64 bp; + __u64 bx; + __u64 dx; + __u64 ax; + __u64 cx; + __u64 sp; + __u64 ip; + __u64 flags; + __u16 cs; + __u16 gs; + __u16 fs; + __u16 __pad0; + __u64 err; + __u64 trapno; + __u64 oldmask; + __u64 cr2; + + /* + * fpstate is really (struct _fpstate *) or (struct _xstate *) + * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved + * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end + * of extended memory layout. See comments at the definition of + * (struct _fpx_sw_bytes) + */ + void __user *fpstate; /* Zero when no FPU/extended context */ + __u64 reserved1[8]; +}; + +/* + * Create the real 'struct sigcontext' type: + */ +#ifdef __KERNEL__ +# ifdef __i386__ +# define sigcontext sigcontext_32 +# else +# define sigcontext sigcontext_64 +# endif +#endif + /* * The old user-space sigcontext definition, just in case user-space still * relies on it. The kernel definition (in asm/sigcontext.h) has unified -- cgit v1.2.3 From 530e5c827182a7a5322c55276b0617fd06874c24 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 5 Sep 2015 09:32:39 +0200 Subject: x86/headers: Make sigcontext pointers bit independent Before we can eliminate the duplication between 'struct sigcontext_32' and 'struct sigcontext_ia32', make the 'fpstate' pointer field in 'struct sigcontext_32' bit independent. Acked-by: Mikko Rapeli Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1441438363-9999-12-git-send-email-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/uapi/asm/sigcontext.h | 4 ++-- arch/x86/kernel/signal.c | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index 3591cef6d7d2..d0def259d545 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -218,7 +218,7 @@ struct sigcontext_32 { * of extended memory layout. 
See comments at the definition of * (struct _fpx_sw_bytes) */ - void __user *fpstate; /* Zero when no FPU/extended context */ + __u32 fpstate; /* Zero when no FPU/extended context */ __u32 oldmask; __u32 cr2; }; @@ -258,7 +258,7 @@ struct sigcontext_64 { * of extended memory layout. See comments at the definition of * (struct _fpx_sw_bytes) */ - void __user *fpstate; /* Zero when no FPU/extended context */ + __u64 fpstate; /* Zero when no FPU/extended context */ __u64 reserved1[8]; }; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index da52e6bb5c7f..3724ff38033e 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -63,6 +63,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) { + unsigned long buf_val; void __user *buf; unsigned int tmpflags; unsigned int err = 0; @@ -107,7 +108,8 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); regs->orig_ax = -1; /* disable syscall checks */ - get_user_ex(buf, &sc->fpstate); + get_user_ex(buf_val, &sc->fpstate); + buf = (void __user *)buf_val; } get_user_catch(err); err |= fpu__restore_sig(buf, config_enabled(CONFIG_X86_32)); -- cgit v1.2.3 From db1e031401c6abab983919e882916d028f3b385e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 5 Sep 2015 09:32:40 +0200 Subject: x86/headers: Unify 'struct sigcontext_ia32' and 'struct sigcontext_32' The two structures are identical - merge them and keep the legacy name as a define. Acked-by: Mikko Rapeli Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1441438363-9999-13-git-send-email-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/sigframe.h | 1 - arch/x86/include/uapi/asm/sigcontext.h | 2 ++ arch/x86/include/uapi/asm/sigcontext32.h | 26 -------------------------- 3 files changed, 2 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index 5ff020a65e1b..98f04018663a 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -9,7 +9,6 @@ #ifdef CONFIG_X86_32 #define sigframe_ia32 sigframe #define rt_sigframe_ia32 rt_sigframe -#define sigcontext_ia32 sigcontext #define ucontext_ia32 ucontext #else /* !CONFIG_X86_32 */ diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index d0def259d545..592bfafd5cb2 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -223,6 +223,8 @@ struct sigcontext_32 { __u32 cr2; }; +#define sigcontext_ia32 sigcontext_32 + struct sigcontext_64 { __u64 r8; __u64 r9; diff --git a/arch/x86/include/uapi/asm/sigcontext32.h b/arch/x86/include/uapi/asm/sigcontext32.h index 06c749b40263..257fbb3baaa7 100644 --- a/arch/x86/include/uapi/asm/sigcontext32.h +++ b/arch/x86/include/uapi/asm/sigcontext32.h @@ -5,30 +5,4 @@ #include -/* 32-bit compat sigcontext: */ -struct sigcontext_ia32 { - __u16 gs, __gsh; - __u16 fs, __fsh; - __u16 es, __esh; - __u16 ds, __dsh; - __u32 di; - __u32 si; - __u32 bp; - __u32 sp; - __u32 bx; - __u32 dx; - __u32 cx; - __u32 ax; - __u32 trapno; - __u32 err; - __u32 ip; - __u16 cs, __csh; - __u32 flags; - __u32 sp_at_signal; - __u16 ss, __ssh; - __u32 fpstate; /* Pointer to 'struct _fpstate_32' */ - __u32 oldmask; - 
__u32 cr2; -}; - #endif /* _ASM_X86_SIGCONTEXT32_H */ -- cgit v1.2.3 From 8fcb346b910e860d2525457742ae984c4ddc64b5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 5 Sep 2015 09:32:41 +0200 Subject: x86/headers: Convert sigcontext_ia32 uses to sigcontext_32 Use the new name in kernel code, and move the old name to the user-space-only legacy section of the UAPI header. Acked-by: Mikko Rapeli Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1441438363-9999-14-git-send-email-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 4 ++-- arch/x86/include/asm/ia32.h | 2 +- arch/x86/include/asm/sigframe.h | 2 +- arch/x86/include/uapi/asm/sigcontext.h | 12 +++++++++--- arch/x86/kernel/asm-offsets.c | 18 +++++++++--------- 5 files changed, 22 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index b22015e847dd..b42103086a78 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -68,7 +68,7 @@ } static int ia32_restore_sigcontext(struct pt_regs *regs, - struct sigcontext_ia32 __user *sc) + struct sigcontext_32 __user *sc) { unsigned int tmpflags, err = 0; void __user *buf; @@ -170,7 +170,7 @@ badframe: * Set up a signal frame. */ -static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, +static int ia32_setup_sigcontext(struct sigcontext_32 __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned int mask) { diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index 28019765442e..3173b48c3c56 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -18,7 +18,7 @@ struct ucontext_ia32 { unsigned int uc_flags; unsigned int uc_link; compat_stack_t uc_stack; - struct sigcontext_ia32 uc_mcontext; + struct sigcontext_32 uc_mcontext; compat_sigset_t uc_sigmask; /* mask last for extensibility */ }; diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index 98f04018663a..867a7577fbeb 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -22,7 +22,7 @@ struct sigframe_ia32 { u32 pretcode; int sig; - struct sigcontext_ia32 sc; + struct sigcontext_32 sc; /* * fpstate is unused. fpstate is moved/allocated after * retcode[] below. 
This movement allows to have the FP state and the diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index 592bfafd5cb2..4411e6a8f8e7 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -190,6 +190,9 @@ struct _xstate { /* New processor state extensions go here: */ }; +/* + * The 32-bit signal frame: + */ struct sigcontext_32 { __u16 gs, __gsh; __u16 fs, __fsh; @@ -223,8 +226,9 @@ struct sigcontext_32 { __u32 cr2; }; -#define sigcontext_ia32 sigcontext_32 - +/* + * The 64-bit signal frame: + */ struct sigcontext_64 { __u64 r8; __u64 r9; @@ -282,7 +286,9 @@ struct sigcontext_64 { */ #ifndef __KERNEL__ -#define _fpstate_ia32 _fpstate_32 +#define _fpstate_ia32 _fpstate_32 +#define sigcontext_ia32 sigcontext_32 + # ifdef __i386__ struct sigcontext { diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 8e3d22a1af94..bc9f4afa889a 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -43,15 +43,15 @@ void common(void) { #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) BLANK(); - OFFSET(IA32_SIGCONTEXT_ax, sigcontext_ia32, ax); - OFFSET(IA32_SIGCONTEXT_bx, sigcontext_ia32, bx); - OFFSET(IA32_SIGCONTEXT_cx, sigcontext_ia32, cx); - OFFSET(IA32_SIGCONTEXT_dx, sigcontext_ia32, dx); - OFFSET(IA32_SIGCONTEXT_si, sigcontext_ia32, si); - OFFSET(IA32_SIGCONTEXT_di, sigcontext_ia32, di); - OFFSET(IA32_SIGCONTEXT_bp, sigcontext_ia32, bp); - OFFSET(IA32_SIGCONTEXT_sp, sigcontext_ia32, sp); - OFFSET(IA32_SIGCONTEXT_ip, sigcontext_ia32, ip); + OFFSET(IA32_SIGCONTEXT_ax, sigcontext_32, ax); + OFFSET(IA32_SIGCONTEXT_bx, sigcontext_32, bx); + OFFSET(IA32_SIGCONTEXT_cx, sigcontext_32, cx); + OFFSET(IA32_SIGCONTEXT_dx, sigcontext_32, dx); + OFFSET(IA32_SIGCONTEXT_si, sigcontext_32, si); + OFFSET(IA32_SIGCONTEXT_di, sigcontext_32, di); + OFFSET(IA32_SIGCONTEXT_bp, sigcontext_32, bp); + OFFSET(IA32_SIGCONTEXT_sp, sigcontext_32, sp); + OFFSET(IA32_SIGCONTEXT_ip, sigcontext_32, ip); BLANK(); OFFSET(TI_sysenter_return, thread_info, sysenter_return); -- cgit v1.2.3 From 711531f4f3cb881f4598cddfcc36663101dee411 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 5 Sep 2015 09:32:42 +0200 Subject: x86/headers: Remove direct sigcontext32.h uses Now that all sigcontext types are defined in asm/sigcontext.h, remove the various sigcontext32.h uses in the kernel. We still keep the header itself, which includes sigcontext.h, in case user-space relies on it. Acked-by: Mikko Rapeli Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1441438363-9999-15-git-send-email-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/include/asm/fpu/signal.h | 2 +- arch/x86/include/asm/ia32.h | 2 +- arch/x86/include/uapi/asm/sigcontext32.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index b42103086a78..955ca03910ee 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 7358e9d61f1e..856f4b3cf1e3 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -5,7 +5,7 @@ #define _ASM_X86_FPU_SIGNAL_H #ifdef CONFIG_X86_64 -# include +# include # include struct ksignal; int ia32_setup_rt_frame(int sig, struct ksignal *ksig, diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index 3173b48c3c56..16edda2816be 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -10,7 +10,7 @@ * 32 bit structures for IA32 support. */ -#include +#include /* signal.h */ diff --git a/arch/x86/include/uapi/asm/sigcontext32.h b/arch/x86/include/uapi/asm/sigcontext32.h index 257fbb3baaa7..a92b0f0dc09e 100644 --- a/arch/x86/include/uapi/asm/sigcontext32.h +++ b/arch/x86/include/uapi/asm/sigcontext32.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_SIGCONTEXT32_H #define _ASM_X86_SIGCONTEXT32_H -/* Signal context definitions for compat 32-bit programs: */ +/* This is a legacy file - all the type definitions are in sigcontext.h: */ #include -- cgit v1.2.3 From decb4c41159e1511197f2964da758fa7f2eeb741 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 5 Sep 2015 09:32:43 +0200 Subject: x86/headers: Remove references on the kernel side Now that all type definitions are in the UAPI header, include it directly, instead of through . [ We still keep asm/sigcontext.h, so that uapi/asm/sigcontext32.h can include . ] Acked-by: Mikko Rapeli Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1441438363-9999-16-git-send-email-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/include/asm/fpu/signal.h | 2 +- arch/x86/include/asm/ia32.h | 2 +- arch/x86/include/asm/processor.h | 2 +- arch/x86/include/asm/sigcontext.h | 2 ++ arch/x86/include/asm/sigframe.h | 2 +- arch/x86/include/asm/signal.h | 2 +- arch/x86/math-emu/fpu_emu.h | 2 +- 8 files changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 955ca03910ee..5959042fae8d 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 856f4b3cf1e3..0e970d00dfcd 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -5,7 +5,7 @@ #define _ASM_X86_FPU_SIGNAL_H #ifdef CONFIG_X86_64 -# include +# include # include struct ksignal; int ia32_setup_rt_frame(int sig, struct ksignal *ksig, diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index 16edda2816be..a9bdf5569ab3 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -10,7 +10,7 @@ * 32 bit structures for IA32 support. */ -#include +#include /* signal.h */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 19577dd325fa..bb911e718330 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -11,7 +11,7 @@ struct vm86; #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h index 25815f00b4ff..e6cd2c489dbb 100644 --- a/arch/x86/include/asm/sigcontext.h +++ b/arch/x86/include/asm/sigcontext.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_SIGCONTEXT_H #define _ASM_X86_SIGCONTEXT_H +/* This is a legacy header - all kernel code includes directly. */ + #include #endif /* _ASM_X86_SIGCONTEXT_H */ diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index 867a7577fbeb..34edd1650bae 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_SIGFRAME_H #define _ASM_X86_SIGFRAME_H -#include +#include #include #include #include diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index c481be78fcf1..2138c9ae19ee 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -34,7 +34,7 @@ extern void do_signal(struct pt_regs *regs); #define __ARCH_HAS_SA_RESTORER -#include +#include #ifdef __i386__ diff --git a/arch/x86/math-emu/fpu_emu.h b/arch/x86/math-emu/fpu_emu.h index 4dae511c85ad..afbc4d805d66 100644 --- a/arch/x86/math-emu/fpu_emu.h +++ b/arch/x86/math-emu/fpu_emu.h @@ -71,7 +71,7 @@ #include "fpu_system.h" -#include /* for struct _fpstate */ +#include /* for struct _fpstate */ #include #include -- cgit v1.2.3 From 7c52198b9c36001d47e11d50f7f1556805c5c3e3 Mon Sep 17 00:00:00 2001 From: George Beshers Date: Sat, 12 Sep 2015 21:51:05 -0500 Subject: x86/platform/uv: Insert per_cpu accessor function on uv_hub_nmi UV: NMI: insert this_cpu_read accessor function on uv_hub_nmi. On SGI UV systems a 'power nmi' command from the CMC causes all processors to drop into uv_handle_nmi(). 
With the 4.0 kernel this results in BUG: unable to handle kernel paging request The bug is caused by the current code trying to use the PER_CPU variable uv_cpu_nmi.hub without an appropriate accessor function. That oversight occurred in commit e16321709c82 ("uv: Replace __get_cpu_var") Author: Christoph Lameter Date: Sun Aug 17 12:30:41 2014 -0500 This patch inserts this_cpu_read() in the uv_hub_nmi macro restoring the intended functionality. Signed-off-by: George Beshers Acked-by: Mike Travis Cc: Alex Thorlton Cc: Dimitri Sivanich Cc: Hedi Berriche Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russ Anderson Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index a00ad8f2a657..ea7074784cc4 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -609,7 +609,7 @@ struct uv_cpu_nmi_s { DECLARE_PER_CPU(struct uv_cpu_nmi_s, uv_cpu_nmi); -#define uv_hub_nmi (uv_cpu_nmi.hub) +#define uv_hub_nmi this_cpu_read(uv_cpu_nmi.hub) #define uv_cpu_nmi_per(cpu) (per_cpu(uv_cpu_nmi, cpu)) #define uv_hub_nmi_per(cpu) (uv_cpu_nmi_per(cpu).hub) -- cgit v1.2.3 From b20112edeadf0b8a1416de061caa4beb11539902 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 21 Aug 2015 12:05:18 +0300 Subject: perf/x86: Improve accuracy of perf/sched clock When TSC is stable perf/sched clock is based on it. However the conversion from cycles to nanoseconds is not as accurate as it could be. Because CYC2NS_SCALE_FACTOR is 10, the accuracy is +/- 1/2048 The change is to calculate the maximum shift that results in a multiplier that is still a 32-bit number. For example all frequencies over 1 GHz will have a shift of 32, making the accuracy of the conversion +/- 1/(2^33). That is achieved by using the 'clocks_calc_mult_shift()' function. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1440147918-22250-1-git-send-email-adrian.hunter@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index c8d52cb4cb6e..39ec3d07affd 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -167,21 +167,20 @@ static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data) * ns = cycles * cyc2ns_scale / SC * * And since SC is a constant power of two, we can convert the div - * into a shift. + * into a shift. The larger SC is, the more accurate the conversion, but + * cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication + * (64-bit result) can be used. * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * We can use khz divisor instead of mhz to keep a better precision. * (mathieu.desnoyers@polymtl.ca) * * -johnstul@us.ibm.com "math is hard, lets go shopping!" 
*/ -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - static void cyc2ns_data_init(struct cyc2ns_data *data) { data->cyc2ns_mul = 0; - data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; + data->cyc2ns_shift = 0; data->cyc2ns_offset = 0; data->__count = 0; } @@ -215,14 +214,14 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) if (likely(data == tail)) { ns = data->cyc2ns_offset; - ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); } else { data->__count++; barrier(); ns = data->cyc2ns_offset; - ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); barrier(); @@ -256,12 +255,11 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) * time function is continuous; see the comment near struct * cyc2ns_data. */ - data->cyc2ns_mul = - DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, - cpu_khz); - data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; + clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, cpu_khz, + NSEC_PER_MSEC, 0); + data->cyc2ns_offset = ns_now - - mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift); cyc2ns_write_end(cpu, data); -- cgit v1.2.3 From a09d31f45224c219a4d5c728fa40150dc9d3e3e5 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Mon, 31 Aug 2015 17:09:27 +0300 Subject: perf/x86/intel/ds: Work around BTS leaking kernel addresses BTS leaks kernel addresses even in userspace-only mode due to imprecise IP sampling, so sometimes syscall entry points or page fault handler addresses end up in a userspace trace. Since this driver uses a relatively small buffer for BTS records and it has to iterate through them anyway, it can also take on the additional job of filtering out the records that contain kernel addresses when kernel space tracing is not enabled. This patch changes the bts code to skip the offending records from perf output. In order to request the exact amount of space on the ring buffer, we need to do an extra pass through the records to know how many there are of the valid ones, but considering the small size of the buffer, this extra pass adds very little overhead to the nmi handler. This way we won't end up with awkward IP samples with zero IPs in the perf stream. 
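In essence, the drain path becomes a count-then-copy loop. Below is a minimal sketch of that two-pass pattern, condensed from the function above and not standalone; bts_record_is_invalid() is a hypothetical helper standing in for the open-coded exclude_kernel/kernel_ip() test in the actual patch, and the surrounding setup (base, top, skip, header, data) is as in intel_pmu_drain_bts_buffer():

	/* Hypothetical helper; the patch open-codes this test. */
	static bool bts_record_is_invalid(struct perf_event *event,
					  struct bts_record *at)
	{
		return event->attr.exclude_kernel &&
		       (kernel_ip(at->from) || kernel_ip(at->to));
	}

	/* Pass 1: count the records that will be dropped... */
	for (at = base; at < top; at++)
		if (bts_record_is_invalid(event, at))
			skip++;

	/* ...so that the ring-buffer reservation is exact. */
	if (perf_output_begin(&handle, event,
			      header.size * (top - base - skip)))
		return 1;

	/* Pass 2: emit only the records that survive the filter. */
	for (at = base; at < top; at++) {
		if (bts_record_is_invalid(event, at))
			continue;

		data.ip = at->from;
		data.addr = at->to;
		perf_output_sample(&handle, &header, &data, event);
	}
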
Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1441030168-6853-2-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 40 ++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 84f236ab96b0..5db1c7755548 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -510,10 +510,11 @@ int intel_pmu_drain_bts_buffer(void) u64 flags; }; struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; - struct bts_record *at, *top; + struct bts_record *at, *base, *top; struct perf_output_handle handle; struct perf_event_header header; struct perf_sample_data data; + unsigned long skip = 0; struct pt_regs regs; if (!event) @@ -522,10 +523,10 @@ int intel_pmu_drain_bts_buffer(void) if (!x86_pmu.bts_active) return 0; - at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; - top = (struct bts_record *)(unsigned long)ds->bts_index; + base = (struct bts_record *)(unsigned long)ds->bts_buffer_base; + top = (struct bts_record *)(unsigned long)ds->bts_index; - if (top <= at) + if (top <= base) return 0; memset(®s, 0, sizeof(regs)); @@ -534,6 +535,27 @@ int intel_pmu_drain_bts_buffer(void) perf_sample_data_init(&data, 0, event->hw.last_period); + /* + * BTS leaks kernel addresses in branches across the cpl boundary, + * such as traps or system calls, so unless the user is asking for + * kernel tracing (and right now it's not possible), we'd need to + * filter them out. But first we need to count how many of those we + * have in the current batch. This is an extra O(n) pass, however, + * it's much faster than the other one especially considering that + * n <= 2560 (BTS_BUFFER_SIZE / BTS_RECORD_SIZE * 15/16; see the + * alloc_bts_buffer()). + */ + for (at = base; at < top; at++) { + /* + * Note that right now *this* BTS code only works if + * attr::exclude_kernel is set, but let's keep this extra + * check here in case that changes. + */ + if (event->attr.exclude_kernel && + (kernel_ip(at->from) || kernel_ip(at->to))) + skip++; + } + /* * Prepare a generic sample, i.e. fill in the invariant fields. * We will overwrite the from and to address before we output @@ -541,10 +563,16 @@ int intel_pmu_drain_bts_buffer(void) */ perf_prepare_sample(&header, &data, event, ®s); - if (perf_output_begin(&handle, event, header.size * (top - at))) + if (perf_output_begin(&handle, event, header.size * + (top - base - skip))) return 1; - for (; at < top; at++) { + for (at = base; at < top; at++) { + /* Filter out any records that contain kernel addresses. */ + if (event->attr.exclude_kernel && + (kernel_ip(at->from) || kernel_ip(at->to))) + continue; + data.ip = at->from; data.addr = at->to; -- cgit v1.2.3 From d2878d642a4edd1d57c691dc3e4d7847cbf9d442 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Mon, 31 Aug 2015 17:09:28 +0300 Subject: perf/x86/intel/bts: Disallow use by unprivileged users on paranoid systems BTS leaks kernel addresses even in userspace-only mode due to imprecise IP sampling, so sometimes syscall entry points or page fault handler addresses end up in a userspace trace. 
Now, the intel_bts driver exports trace data zero-copy; it does not scan through it to filter out the kernel addresses, and doing so would be an O(n) job. To work around this situation, this patch forbids the use of the intel_bts driver by unprivileged users on systems with the paranoid setting above the (kernel's) default "1", which still allows kernel profiling. In other words, using the intel_bts driver implies kernel tracing, regardless of the "exclude_kernel" attribute setting. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1441030168-6853-3-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_bts.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c index d1c0f254afbe..2cad71d1b14c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_bts.c +++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c @@ -495,6 +495,19 @@ static int bts_event_init(struct perf_event *event) if (x86_add_exclusive(x86_lbr_exclusive_bts)) return -EBUSY; + /* + * BTS leaks kernel addresses even when CPL0 tracing is + * disabled, so disallow intel_bts driver for unprivileged + * users on paranoid systems since it provides trace data + * to the user in a zero-copy fashion. + * + * Note that the default paranoia setting permits unprivileged + * users to profile the kernel. + */ + if (event->attr.exclude_kernel && perf_paranoid_kernel() && + !capable(CAP_SYS_ADMIN)) + return -EACCES; + ret = x86_reserve_hardware(); if (ret) { x86_del_exclusive(x86_lbr_exclusive_bts); -- cgit v1.2.3 From deb27519bf1f4b21a761c0675dbdf1196df7d72a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 17 Aug 2015 08:37:31 -0400 Subject: perf/x86/intel: Fix LBR callstack issue caused by FREEZE_LBRS_ON_PMI This patch fixes an issue which was introduced by commit 1a78d93750bb5f61abdc59a91fc3bd06a214542a ("perf/x86/intel: Streamline LBR MSR handling in PMI"). That patch not only avoids writing the LBR_SELECT MSR in a PMI, but also avoids updating the lbr_select variable. So in a PMI, the FREEZE_LBRS_ON_PMI bit is always mistakenly set for the IA32_DEBUGCTLMSR MSR, which causes a superfluous increase/decrease of LBR_TOS when collecting LBR callstacks. Reported-by: Milian Wolff Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1439815051-8616-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index b2c9475b7ff2..a1d07c71d3b6 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -151,10 +151,9 @@ static void __intel_pmu_lbr_enable(bool pmi) * No need to reprogram LBR_SELECT in a PMI, as it * did not change.
*/ - if (cpuc->lbr_sel && !pmi) { - lbr_select = cpuc->lbr_sel->config; + lbr_select = cpuc->lbr_sel->config; + if (cpuc->lbr_sel && !pmi) wrmsrl(MSR_LBR_SELECT, lbr_select); - } rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); orig_debugctl = debugctl; -- cgit v1.2.3 From 73fdeb66592ee80dffb16fb8a9b7378a00c1a826 Mon Sep 17 00:00:00 2001 From: Huaitong Han Date: Mon, 31 Aug 2015 16:21:02 +0800 Subject: perf/x86/intel/pt: Fix KVM warning due to doing rdmsr() before the CPUID test If KVM does not support INTEL_PT, a guest read of MSR_IA32_RTIT_CTL will produce a host warning like "kvm [2469]: vcpu0 unhandled rdmsr: 0x570". The guest can determine whether the CPU supports Intel PT according to CPUID, so a test_cpu_cap() check is added before the rdmsr, which is also more in line with the code style. Signed-off-by: Huaitong Han Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Link: http://lkml.kernel.org/r/1441009262-9792-1-git-send-email-huaitong.han@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_pt.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index 42169283448b..868e1194337f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -139,9 +139,6 @@ static int __init pt_pmu_hw_init(void) long i; attrs = NULL; - ret = -ENODEV; - if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) - goto fail; for (i = 0; i < PT_CPUID_LEAVES; i++) { cpuid_count(20, i, @@ -1130,6 +1127,10 @@ static __init int pt_init(void) int ret, cpu, prior_warn = 0; BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE); + + if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) + return -ENODEV; + get_online_cpus(); for_each_online_cpu(cpu) { u64 ctl; -- cgit v1.2.3 From fbbe07011581990ef74dfac06dc8511b1a14badb Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 3 Sep 2015 20:07:45 -0700 Subject: perf/core: Add a 'flags' parameter to the PMU transactional interfaces Currently, the PMU interface allows reading only one counter at a time. But some PMUs, like the 24x7 counters in Power, support reading several counters at once. To leverage this functionality, extend the transaction interface to support a "transaction type". The first type, PERF_PMU_TXN_ADD, refers to the existing transactions, i.e. those used to _schedule_ all the events on the PMU as a group. A second transaction type, PERF_PMU_TXN_READ, will be used in a follow-on patch by the 24x7 counters to read several counters at once. Extend the transaction interfaces to the PMU to accept a 'txn_flags' parameter and use this parameter to ignore any transactions that are not of type PERF_PMU_TXN_ADD. Thanks to Peter Zijlstra for his input.
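All of the per-architecture conversions below follow the same skeleton. As a rough sketch (for a hypothetical foo_pmu; none of the drivers is copied verbatim), the start/commit pair becomes:

	static void foo_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
	{
		struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);

		WARN_ON_ONCE(cpuhw->txn_flags);	/* txn already in flight */

		cpuhw->txn_flags = txn_flags;
		if (txn_flags & ~PERF_PMU_TXN_ADD)
			return;	/* not an ADD transaction: nothing to set up */

		perf_pmu_disable(pmu);
		/* ...start collecting events for the group... */
	}

	static int foo_pmu_commit_txn(struct pmu *pmu)
	{
		struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);

		WARN_ON_ONCE(!cpuhw->txn_flags);	/* no txn in flight */

		if (cpuhw->txn_flags & ~PERF_PMU_TXN_ADD) {
			cpuhw->txn_flags = 0;
			return 0;	/* non-ADD transactions always succeed */
		}

		/* ...perform the deferred schedulability test... */

		cpuhw->txn_flags = 0;
		perf_pmu_enable(pmu);
		return 0;
	}
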
Signed-off-by: Sukadev Bhattiprolu [peterz: s390 compile fix] Signed-off-by: Peter Zijlstra (Intel) Acked-by: Michael Ellerman Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1441336073-22750-3-git-send-email-sukadev@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/powerpc/perf/core-book3s.c | 30 +++++++++++++++++++++++++++++- arch/s390/kernel/perf_cpum_cf.c | 30 +++++++++++++++++++++++++++++- arch/sparc/kernel/perf_event.c | 25 ++++++++++++++++++++++++- arch/x86/kernel/cpu/perf_event.c | 32 +++++++++++++++++++++++++++++++- arch/x86/kernel/cpu/perf_event.h | 1 + include/linux/perf_event.h | 14 +++++++++++--- kernel/events/core.c | 31 ++++++++++++++++++++++++++++--- 7 files changed, 153 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index b0382f3f1095..c84074185c80 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -49,6 +49,7 @@ struct cpu_hw_events { unsigned long avalues[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; unsigned int group_flag; + unsigned int txn_flags; int n_txn_start; /* BHRB bits */ @@ -1586,11 +1587,21 @@ static void power_pmu_stop(struct perf_event *event, int ef_flags) * Start group events scheduling transaction * Set the flag to make pmu::enable() not perform the * schedulability test, it will be performed at commit time + * + * We only support PERF_PMU_TXN_ADD transactions. Save the + * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD + * transactions. */ -static void power_pmu_start_txn(struct pmu *pmu) +static void power_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) { struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + WARN_ON_ONCE(cpuhw->txn_flags); /* txn already in flight */ + + cpuhw->txn_flags = txn_flags; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_disable(pmu); cpuhw->group_flag |= PERF_EVENT_TXN; cpuhw->n_txn_start = cpuhw->n_events; @@ -1604,6 +1615,14 @@ static void power_pmu_start_txn(struct pmu *pmu) static void power_pmu_cancel_txn(struct pmu *pmu) { struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + unsigned int txn_flags; + + WARN_ON_ONCE(!cpuhw->txn_flags); /* no txn in flight */ + + txn_flags = cpuhw->txn_flags; + cpuhw->txn_flags = 0; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; cpuhw->group_flag &= ~PERF_EVENT_TXN; perf_pmu_enable(pmu); @@ -1621,7 +1640,15 @@ static int power_pmu_commit_txn(struct pmu *pmu) if (!ppmu) return -EAGAIN; + cpuhw = this_cpu_ptr(&cpu_hw_events); + WARN_ON_ONCE(!cpuhw->txn_flags); /* no txn in flight */ + + if (cpuhw->txn_flags & ~PERF_PMU_TXN_ADD) { + cpuhw->txn_flags = 0; + return 0; + } + n = cpuhw->n_events; if (check_excludes(cpuhw->event, cpuhw->flags, 0, n)) return -EAGAIN; @@ -1633,6 +1660,7 @@ static int power_pmu_commit_txn(struct pmu *pmu) cpuhw->event[i]->hw.config = cpuhw->events[i]; cpuhw->group_flag &= ~PERF_EVENT_TXN; + cpuhw->txn_flags = 0; perf_pmu_enable(pmu); return 0; } diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 56fdad479115..19138be412d6 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -72,6 +72,7 @@ struct cpu_hw_events { atomic_t ctr_set[CPUMF_CTR_SET_MAX]; u64 state, tx_state; unsigned int flags; + unsigned int txn_flags; }; static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .ctr_set 
= { @@ -82,6 +83,7 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { }, .state = 0, .flags = 0, + .txn_flags = 0, }; static int get_counter_set(u64 event) @@ -572,11 +574,21 @@ static void cpumf_pmu_del(struct perf_event *event, int flags) /* * Start group events scheduling transaction. * Set flags to perform a single test at commit time. + * + * We only support PERF_PMU_TXN_ADD transactions. Save the + * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD + * transactions. */ -static void cpumf_pmu_start_txn(struct pmu *pmu) +static void cpumf_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) { struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + WARN_ON_ONCE(cpuhw->txn_flags); /* txn already in flight */ + + cpuhw->txn_flags = txn_flags; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_disable(pmu); cpuhw->flags |= PERF_EVENT_TXN; cpuhw->tx_state = cpuhw->state; @@ -589,8 +601,16 @@ static void cpumf_pmu_start_txn(struct pmu *pmu) */ static void cpumf_pmu_cancel_txn(struct pmu *pmu) { + unsigned int txn_flags; struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + WARN_ON_ONCE(!cpuhw->txn_flags); /* no txn in flight */ + + txn_flags = cpuhw->txn_flags; + cpuhw->txn_flags = 0; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + WARN_ON(cpuhw->tx_state != cpuhw->state); cpuhw->flags &= ~PERF_EVENT_TXN; @@ -607,6 +627,13 @@ static int cpumf_pmu_commit_txn(struct pmu *pmu) struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); u64 state; + WARN_ON_ONCE(!cpuhw->txn_flags); /* no txn in flight */ + + if (cpuhw->txn_flags & ~PERF_PMU_TXN_ADD) { + cpuhw->txn_flags = 0; + return 0; + } + /* check if the updated state can be scheduled */ state = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1); state >>= CPUMF_LCCTL_ENABLE_SHIFT; @@ -614,6 +641,7 @@ static int cpumf_pmu_commit_txn(struct pmu *pmu) return -EPERM; cpuhw->flags &= ~PERF_EVENT_TXN; + cpuhw->txn_flags = 0; perf_pmu_enable(pmu); return 0; } diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index e3f89c7e5062..2c0984d146ec 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -109,6 +109,7 @@ struct cpu_hw_events { int enabled; unsigned int group_flag; + unsigned int txn_flags; }; static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; @@ -1494,10 +1495,16 @@ static int sparc_pmu_event_init(struct perf_event *event) * Set the flag to make pmu::enable() not perform the * schedulability test, it will be performed at commit time */ -static void sparc_pmu_start_txn(struct pmu *pmu) +static void sparc_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) { struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + WARN_ON_ONCE(cpuhw->txn_flags); /* txn already in flight */ + + cpuhw->txn_flags = txn_flags; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_disable(pmu); cpuhw->group_flag |= PERF_EVENT_TXN; } @@ -1510,6 +1517,14 @@ static void sparc_pmu_start_txn(struct pmu *pmu) static void sparc_pmu_cancel_txn(struct pmu *pmu) { struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); + unsigned int txn_flags; + + WARN_ON_ONCE(!cpuhw->txn_flags); /* no txn in flight */ + + txn_flags = cpuhw->txn_flags; + cpuhw->txn_flags = 0; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; cpuhw->group_flag &= ~PERF_EVENT_TXN; perf_pmu_enable(pmu); @@ -1528,6 +1543,13 @@ static int sparc_pmu_commit_txn(struct pmu *pmu) if (!sparc_pmu) return -EINVAL; + WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ 
+ + if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) { + cpuc->txn_flags = 0; + return 0; + } + n = cpuc->n_events; if (check_excludes(cpuc->event, 0, n)) return -EINVAL; @@ -1535,6 +1557,7 @@ static int sparc_pmu_commit_txn(struct pmu *pmu) return -EAGAIN; cpuc->group_flag &= ~PERF_EVENT_TXN; + cpuc->txn_flags = 0; perf_pmu_enable(pmu); return 0; } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 66dd3fe99b82..dc77ef470e0e 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1748,9 +1748,21 @@ static inline void x86_pmu_read(struct perf_event *event) * Start group events scheduling transaction * Set the flag to make pmu::enable() not perform the * schedulability test, it will be performed at commit time + * + * We only support PERF_PMU_TXN_ADD transactions. Save the + * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD + * transactions. */ -static void x86_pmu_start_txn(struct pmu *pmu) +static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */ + + cpuc->txn_flags = txn_flags; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_disable(pmu); __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN); __this_cpu_write(cpu_hw_events.n_txn, 0); @@ -1763,6 +1775,16 @@ static void x86_pmu_start_txn(struct pmu *pmu) */ static void x86_pmu_cancel_txn(struct pmu *pmu) { + unsigned int txn_flags; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ + + txn_flags = cpuc->txn_flags; + cpuc->txn_flags = 0; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN); /* * Truncate collected array by the number of events added in this @@ -1786,6 +1808,13 @@ static int x86_pmu_commit_txn(struct pmu *pmu) int assign[X86_PMC_IDX_MAX]; int n, ret; + WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ + + if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) { + cpuc->txn_flags = 0; + return 0; + } + n = cpuc->n_events; if (!x86_pmu_initialized()) @@ -1802,6 +1831,7 @@ static int x86_pmu_commit_txn(struct pmu *pmu) memcpy(cpuc->assign, assign, n*sizeof(int)); cpuc->group_flag &= ~PERF_EVENT_TXN; + cpuc->txn_flags = 0; perf_pmu_enable(pmu); return 0; } diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 5edf6d868fc1..c99c93b9e348 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -196,6 +196,7 @@ struct cpu_hw_events { int n_excl; /* the number of exclusive events */ unsigned int group_flag; + unsigned int txn_flags; int is_fake; /* diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d61ee4459eee..ea3b5dd21a4c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -201,6 +201,8 @@ struct perf_event; */ #define PERF_EVENT_TXN 0x1 +#define PERF_PMU_TXN_ADD 0x1 /* txn to add/schedule event on PMU */ + /** * pmu::capabilities flags */ @@ -331,20 +333,26 @@ struct pmu { * * Start the transaction, after this ->add() doesn't need to * do schedulability tests. + * + * Optional. */ - void (*start_txn) (struct pmu *pmu); /* optional */ + void (*start_txn) (struct pmu *pmu, unsigned int txn_flags); /* * If ->start_txn() disabled the ->add() schedulability test * then ->commit_txn() is required to perform one. On success * the transaction is closed. 
On error the transaction is kept * open until ->cancel_txn() is called. + * + * Optional. */ - int (*commit_txn) (struct pmu *pmu); /* optional */ + int (*commit_txn) (struct pmu *pmu); /* * Will cancel the transaction, assumes ->del() is called * for each successful ->add() during the transaction. + * + * Optional. */ - void (*cancel_txn) (struct pmu *pmu); /* optional */ + void (*cancel_txn) (struct pmu *pmu); /* * Will return the value for perf_event_mmap_page::index for this event, diff --git a/kernel/events/core.c b/kernel/events/core.c index 76e64be9bfb5..c80cee82959f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1914,7 +1914,7 @@ group_sched_in(struct perf_event *group_event, if (group_event->state == PERF_EVENT_STATE_OFF) return 0; - pmu->start_txn(pmu); + pmu->start_txn(pmu, PERF_PMU_TXN_ADD); if (event_sched_in(group_event, cpuctx, ctx)) { pmu->cancel_txn(pmu); @@ -7267,24 +7267,49 @@ static void perf_pmu_nop_void(struct pmu *pmu) { } +static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags) +{ +} + static int perf_pmu_nop_int(struct pmu *pmu) { return 0; } -static void perf_pmu_start_txn(struct pmu *pmu) +DEFINE_PER_CPU(unsigned int, nop_txn_flags); + +static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) { + __this_cpu_write(nop_txn_flags, flags); + + if (flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_disable(pmu); } static int perf_pmu_commit_txn(struct pmu *pmu) { + unsigned int flags = __this_cpu_read(nop_txn_flags); + + __this_cpu_write(nop_txn_flags, 0); + + if (flags & ~PERF_PMU_TXN_ADD) + return 0; + perf_pmu_enable(pmu); return 0; } static void perf_pmu_cancel_txn(struct pmu *pmu) { + unsigned int flags = __this_cpu_read(nop_txn_flags); + + __this_cpu_write(nop_txn_flags, 0); + + if (flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_enable(pmu); } @@ -7523,7 +7548,7 @@ got_cpu_context: pmu->commit_txn = perf_pmu_commit_txn; pmu->cancel_txn = perf_pmu_cancel_txn; } else { - pmu->start_txn = perf_pmu_nop_void; + pmu->start_txn = perf_pmu_nop_txn; pmu->commit_txn = perf_pmu_nop_int; pmu->cancel_txn = perf_pmu_nop_void; } -- cgit v1.2.3 From 8f3e5684d3fbd91ead283916676fa3dac22615e5 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 3 Sep 2015 20:07:53 -0700 Subject: perf/core: Drop PERF_EVENT_TXN We currently use PERF_EVENT_TXN flag to determine if we are in the middle of a transaction. If in a transaction, we defer the schedulability checks from pmu->add() operation to the pmu->commit() operation. Now that we have "transaction types" (PERF_PMU_TXN_ADD, PERF_PMU_TXN_READ) we can use the type to determine if we are in a transaction and drop the PERF_EVENT_TXN flag. When PERF_EVENT_TXN is dropped, the cpuhw->group_flag on some architectures becomes unused, so drop that field as well. This is an extension of the Powerpc patch from Peter Zijlstra to s390, Sparc and x86 architectures. 
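(A condensed sketch of the ->start_txn()/->commit_txn() handshake that the hunks above converge on. This is a minimal sketch only: my_pmu_start_txn(), my_pmu_commit_txn() and my_pmu_schedule_events() are illustrative placeholder names, not functions from any of the patched drivers; the cpuhw bookkeeping simply mirrors the per-architecture hunks.)

static void my_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
{
	struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);

	WARN_ON_ONCE(cpuhw->txn_flags);		/* txn already in flight */

	cpuhw->txn_flags = txn_flags;
	if (txn_flags & ~PERF_PMU_TXN_ADD)
		return;				/* not an ADD txn: nothing to defer */

	perf_pmu_disable(pmu);			/* defer the schedulability test to commit time */
}

static int my_pmu_commit_txn(struct pmu *pmu)
{
	struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);

	WARN_ON_ONCE(!cpuhw->txn_flags);	/* no txn in flight */

	if (cpuhw->txn_flags & ~PERF_PMU_TXN_ADD) {
		cpuhw->txn_flags = 0;		/* non-ADD txns trivially succeed */
		return 0;
	}

	if (my_pmu_schedule_events())		/* the deferred schedulability test */
		return -EAGAIN;			/* txn stays open for ->cancel_txn() */

	cpuhw->txn_flags = 0;
	perf_pmu_enable(pmu);
	return 0;
}

(->cancel_txn() follows the same shape: clear txn_flags first, and only call perf_pmu_enable() when the cancelled transaction was an ADD transaction.)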
Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1441336073-22750-11-git-send-email-sukadev@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/powerpc/perf/core-book3s.c | 6 +----- arch/s390/kernel/perf_cpum_cf.c | 5 +---- arch/sparc/kernel/perf_event.c | 6 +----- arch/x86/kernel/cpu/perf_event.c | 7 ++----- arch/x86/kernel/cpu/perf_event.h | 1 - include/linux/perf_event.h | 2 -- 6 files changed, 5 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index c84074185c80..d1e65ce545b3 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -48,7 +48,6 @@ struct cpu_hw_events { unsigned long amasks[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; unsigned long avalues[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; - unsigned int group_flag; unsigned int txn_flags; int n_txn_start; @@ -1442,7 +1441,7 @@ static int power_pmu_add(struct perf_event *event, int ef_flags) * skip the schedulability test here, it will be performed * at commit time(->commit_txn) as a whole */ - if (cpuhw->group_flag & PERF_EVENT_TXN) + if (cpuhw->txn_flags & PERF_PMU_TXN_ADD) goto nocheck; if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1)) @@ -1603,7 +1602,6 @@ static void power_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) return; perf_pmu_disable(pmu); - cpuhw->group_flag |= PERF_EVENT_TXN; cpuhw->n_txn_start = cpuhw->n_events; } @@ -1624,7 +1622,6 @@ static void power_pmu_cancel_txn(struct pmu *pmu) if (txn_flags & ~PERF_PMU_TXN_ADD) return; - cpuhw->group_flag &= ~PERF_EVENT_TXN; perf_pmu_enable(pmu); } @@ -1659,7 +1656,6 @@ static int power_pmu_commit_txn(struct pmu *pmu) for (i = cpuhw->n_txn_start; i < n; ++i) cpuhw->event[i]->hw.config = cpuhw->events[i]; - cpuhw->group_flag &= ~PERF_EVENT_TXN; cpuhw->txn_flags = 0; perf_pmu_enable(pmu); return 0; diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 19138be412d6..cb774ff6e749 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -536,7 +536,7 @@ static int cpumf_pmu_add(struct perf_event *event, int flags) * For group events transaction, the authorization check is * done in cpumf_pmu_commit_txn(). */ - if (!(cpuhw->flags & PERF_EVENT_TXN)) + if (!(cpuhw->txn_flags & PERF_PMU_TXN_ADD)) if (validate_ctr_auth(&event->hw)) return -EPERM; @@ -590,7 +590,6 @@ static void cpumf_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) return; perf_pmu_disable(pmu); - cpuhw->flags |= PERF_EVENT_TXN; cpuhw->tx_state = cpuhw->state; } @@ -613,7 +612,6 @@ static void cpumf_pmu_cancel_txn(struct pmu *pmu) WARN_ON(cpuhw->tx_state != cpuhw->state); - cpuhw->flags &= ~PERF_EVENT_TXN; perf_pmu_enable(pmu); } @@ -640,7 +638,6 @@ static int cpumf_pmu_commit_txn(struct pmu *pmu) if ((state & cpuhw->info.auth_ctl) != state) return -EPERM; - cpuhw->flags &= ~PERF_EVENT_TXN; cpuhw->txn_flags = 0; perf_pmu_enable(pmu); return 0; diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 2c0984d146ec..b0da5aedb336 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -108,7 +108,6 @@ struct cpu_hw_events { /* Enabled/disable state. 
*/ int enabled; - unsigned int group_flag; unsigned int txn_flags; }; static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; @@ -1380,7 +1379,7 @@ static int sparc_pmu_add(struct perf_event *event, int ef_flags) * skip the schedulability test here, it will be performed * at commit time(->commit_txn) as a whole */ - if (cpuc->group_flag & PERF_EVENT_TXN) + if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto nocheck; if (check_excludes(cpuc->event, n0, 1)) @@ -1506,7 +1505,6 @@ static void sparc_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) return; perf_pmu_disable(pmu); - cpuhw->group_flag |= PERF_EVENT_TXN; } /* @@ -1526,7 +1524,6 @@ static void sparc_pmu_cancel_txn(struct pmu *pmu) if (txn_flags & ~PERF_PMU_TXN_ADD) return; - cpuhw->group_flag &= ~PERF_EVENT_TXN; perf_pmu_enable(pmu); } @@ -1556,7 +1553,6 @@ static int sparc_pmu_commit_txn(struct pmu *pmu) if (sparc_check_constraints(cpuc->event, cpuc->events, n)) return -EAGAIN; - cpuc->group_flag &= ~PERF_EVENT_TXN; cpuc->txn_flags = 0; perf_pmu_enable(pmu); return 0; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index dc77ef470e0e..4562cf070c27 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1175,7 +1175,7 @@ static int x86_pmu_add(struct perf_event *event, int flags) * skip the schedulability test here, it will be performed * at commit time (->commit_txn) as a whole. */ - if (cpuc->group_flag & PERF_EVENT_TXN) + if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto done_collect; ret = x86_pmu.schedule_events(cpuc, n, assign); @@ -1326,7 +1326,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) * XXX assumes any ->del() called during a TXN will only be on * an event added during that same TXN. */ - if (cpuc->group_flag & PERF_EVENT_TXN) + if (cpuc->txn_flags & PERF_PMU_TXN_ADD) return; /* @@ -1764,7 +1764,6 @@ static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) return; perf_pmu_disable(pmu); - __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN); __this_cpu_write(cpu_hw_events.n_txn, 0); } @@ -1785,7 +1784,6 @@ static void x86_pmu_cancel_txn(struct pmu *pmu) if (txn_flags & ~PERF_PMU_TXN_ADD) return; - __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN); /* * Truncate collected array by the number of events added in this * transaction. See x86_pmu_add() and x86_pmu_*_txn(). 
@@ -1830,7 +1828,6 @@ static int x86_pmu_commit_txn(struct pmu *pmu) */ memcpy(cpuc->assign, assign, n*sizeof(int)); - cpuc->group_flag &= ~PERF_EVENT_TXN; cpuc->txn_flags = 0; perf_pmu_enable(pmu); return 0; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index c99c93b9e348..953a0e4e3284 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -195,7 +195,6 @@ struct cpu_hw_events { int n_excl; /* the number of exclusive events */ - unsigned int group_flag; unsigned int txn_flags; int is_fake; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b83cea932f74..d841d33bcdc9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -199,8 +199,6 @@ struct perf_event; /* * Common implementation detail of pmu::{start,commit,cancel}_txn */ -#define PERF_EVENT_TXN 0x1 - #define PERF_PMU_TXN_ADD 0x1 /* txn to add/schedule event on PMU */ #define PERF_PMU_TXN_READ 0x2 /* txn to read event group from PMU */ -- cgit v1.2.3 From 0e2815de552a638295cfdaf0865e575573bf263e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Sep 2015 09:12:44 +0200 Subject: x86/headers: Clean up too long lines Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Thomas Gleixner Cc: bp@alien8.de Cc: brgerst@gmail.com Cc: dvlasenk@redhat.com Cc: luto@amacapital.net Cc: mikko.rapeli@iki.fi Cc: oleg@redhat.com Link: http://lkml.kernel.org/r/20150909071244.GM3644@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/include/uapi/asm/sigcontext.h | 57 ++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index 4411e6a8f8e7..d485232f1e9f 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -2,16 +2,16 @@ #define _UAPI_ASM_X86_SIGCONTEXT_H /* - * Linux signal context definitions. The sigcontext includes a complex hierarchy of CPU - * and FPU state, available to user-space (on the stack) when a signal handler is - * executed. + * Linux signal context definitions. The sigcontext includes a complex + * hierarchy of CPU and FPU state, available to user-space (on the stack) when + * a signal handler is executed. * - * As over the years this ABI grew from its very simple roots towards supporting more and - * more CPU state organically, some of the details (which were rather clever hacks back - * in the days) became a bit quirky by today. + * As over the years this ABI grew from its very simple roots towards + * supporting more and more CPU state organically, some of the details (which + * were rather clever hacks back in the days) became a bit quirky by today. * - * The current ABI includes flexible provisions for future extensions, so we won't have - * to grow new quirks for quite some time. Promise! + * The current ABI includes flexible provisions for future extensions, so we + * won't have to grow new quirks for quite some time. Promise! */ #include @@ -23,27 +23,32 @@ /* * Bytes 464..511 in the current 512-byte layout of the FXSAVE/FXRSTOR frame - * are reserved for SW usage. On CPUs supporting XSAVE/XRSTOR, these bytes - * are used to extend the fpstate pointer in the sigcontext, which now - * includes the extended state information along with fpstate information. + * are reserved for SW usage. 
On CPUs supporting XSAVE/XRSTOR, these bytes are + * used to extend the fpstate pointer in the sigcontext, which now includes the + * extended state information along with fpstate information. * - * If sw_reserved.magic1 == FP_XSTATE_MAGIC1 then there's a sw_reserved.extended_size - * bytes large extended context area present. (The last 32-bit word of this extended - * area (at the fpstate+extended_size-FP_XSTATE_MAGIC2_SIZE address) is set to + * If sw_reserved.magic1 == FP_XSTATE_MAGIC1 then there's a + * sw_reserved.extended_size bytes large extended context area present. (The + * last 32-bit word of this extended area (at the + * fpstate+extended_size-FP_XSTATE_MAGIC2_SIZE address) is set to * FP_XSTATE_MAGIC2 so that you can sanity check your size calculations.) * - * This extended area typically grows with newer CPUs that have larger and larger - * XSAVE areas. + * This extended area typically grows with newer CPUs that have larger and + * larger XSAVE areas. */ struct _fpx_sw_bytes { - /* If set to FP_XSTATE_MAGIC1 then this is an xstate context. 0 if a legacy frame. */ + /* + * If set to FP_XSTATE_MAGIC1 then this is an xstate context. + * 0 if a legacy frame. + */ __u32 magic1; /* * Total size of the fpstate area: * * - if magic1 == 0 then it's sizeof(struct _fpstate) - * - if magic1 == FP_XSTATE_MAGIC1 then it's sizeof(struct _xstate) plus extensions (if any) + * - if magic1 == FP_XSTATE_MAGIC1 then it's sizeof(struct _xstate) + * plus extensions (if any) */ __u32 extended_size; @@ -66,13 +71,13 @@ struct _fpx_sw_bytes { /* * As documented in the iBCS2 standard: * - * The first part of "struct _fpstate" is just the normal i387 - * hardware setup, the extra "status" word is used to save the - * coprocessor status word before entering the handler. + * The first part of "struct _fpstate" is just the normal i387 hardware setup, + * the extra "status" word is used to save the coprocessor status word before + * entering the handler. * - * The FPU state data structure has had to grow to accommodate the - * extended FPU state required by the Streaming SIMD Extensions. - * There is no documented standard to accomplish this at the moment. + * The FPU state data structure has had to grow to accommodate the extended FPU + * state required by the Streaming SIMD Extensions. There is no documented + * standard to accomplish this at the moment. */ /* 10-byte legacy floating point register: */ @@ -137,8 +142,8 @@ struct _fpstate_32 { * 'struct _fpstate' so that you can always assume the _fpstate portion * exists so that you can check the magic value. * - * Note2: Reserved fields may someday contain valuable data. Always save/restore - * them when you change signal frames. + * Note2: Reserved fields may someday contain valuable data. Always + * save/restore them when you change signal frames. */ struct _fpstate_64 { __u16 cwd; -- cgit v1.2.3 From b0815359590f496f80e355a74319b6dc77010951 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 2 Sep 2015 16:31:24 -0700 Subject: x86/fpu: Print xfeature buffer size in decimal This is utterly a personal taste thing, but I find it way easier to read structure sizes in decimal than in hex. Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: dave@sr71.net Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20150902233124.1A8B04A8@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 62fc001c7846..b790dcbd3d2a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -187,7 +187,7 @@ static void __init setup_xstate_features(void) xstate_offsets[leaf] = ebx; xstate_sizes[leaf] = eax; - printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %04x, xstate_sizes[%d]: %04x\n", leaf, ebx, leaf, eax); + printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", leaf, ebx, leaf, eax); } } @@ -357,7 +357,7 @@ void __init fpu__init_system_xstate(void) setup_init_fpu_buf(); setup_xstate_comp(); - pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is 0x%x bytes, using '%s' format.\n", + pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", xfeatures_mask, xstate_size, cpu_has_xsaves ? "compacted" : "standard"); -- cgit v1.2.3 From 0a265375028b241a9173b7c569dd2368ba97fcd4 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 2 Sep 2015 16:31:24 -0700 Subject: x86/fpu: Move XSAVE-disabling code to a helper When we want to _completely_ disable XSAVE support as far as the kernel is concerned, we have a big set of feature flags to clear. We currently only do this in cases where the user asks for it to be disabled, but we are about to expand the places where we do it to handle errors too. Move the code into xstate.c, and put its declaration in the xstate.h header. We will use it in the next patch too. Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: dave@sr71.net Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20150902233124.EA9A70E5@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 1 + arch/x86/kernel/fpu/init.c | 12 +----------- arch/x86/kernel/fpu/xstate.c | 19 +++++++++++++++++++ 3 files changed, 21 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 4656b25bb9a7..d5a9b736553c 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -40,6 +40,7 @@ extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); +void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index d14e9ac3235a..0a250afc6cdf 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -354,17 +354,7 @@ static int __init x86_noxsave_setup(char *s) if (strlen(s)) return 0; - setup_clear_cpu_cap(X86_FEATURE_XSAVE); - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - setup_clear_cpu_cap(X86_FEATURE_XSAVEC); - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - setup_clear_cpu_cap(X86_FEATURE_AVX); - setup_clear_cpu_cap(X86_FEATURE_AVX2); - setup_clear_cpu_cap(X86_FEATURE_AVX512F); - setup_clear_cpu_cap(X86_FEATURE_AVX512PF); - setup_clear_cpu_cap(X86_FEATURE_AVX512ER); - setup_clear_cpu_cap(X86_FEATURE_AVX512CD); - setup_clear_cpu_cap(X86_FEATURE_MPX); + fpu__xstate_clear_all_cpu_caps(); return 1; } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b790dcbd3d2a..2ada11c0f8d7 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -38,6 +38,25 @@ static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; /* The number of supported xfeatures in xfeatures_mask: */ static unsigned int xfeatures_nr; +/* + * Clear all of the X86_FEATURE_* bits that are unavailable + * when the CPU has no XSAVE support. + */ +void fpu__xstate_clear_all_cpu_caps(void) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + setup_clear_cpu_cap(X86_FEATURE_XSAVEC); + setup_clear_cpu_cap(X86_FEATURE_XSAVES); + setup_clear_cpu_cap(X86_FEATURE_AVX); + setup_clear_cpu_cap(X86_FEATURE_AVX2); + setup_clear_cpu_cap(X86_FEATURE_AVX512F); + setup_clear_cpu_cap(X86_FEATURE_AVX512PF); + setup_clear_cpu_cap(X86_FEATURE_AVX512ER); + setup_clear_cpu_cap(X86_FEATURE_AVX512CD); + setup_clear_cpu_cap(X86_FEATURE_MPX); +} + /* * Return whether the system supports a given xfeature. * -- cgit v1.2.3 From 4109ca066b6b899ac7549bf3aac94b178ac95891 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 2 Sep 2015 16:31:25 -0700 Subject: x86/fpu: Remove XSTATE_RESERVE The original purpose of XSTATE_RESERVE was to carve out space to store all of the possible extended state components that get saved with the XSAVE instruction(s). However, we are now almost entirely dynamically allocating the buffers we use for XSAVE by placing them at the end of the task_struct and sizing them at boot. The one exception to that is the init_task. The maximum extended state component size that we have today is on systems with space for AVX-512 and Memory Protection Keys: 2696 bytes.
We have reserved a PAGE_SIZE buffer in the init_task via fpregs_state->__padding. This check ensures that even if the component sizes or layout were changed (which we do not expect), we will still not overflow the init_task's buffer. In the case that we detect we might overflow the buffer, we completely disable XSAVE support in the kernel and try to boot as if we had 'legacy x87 FPU' support in place. This is a crippled state without any of the XSAVE-enabled features (MPX, AVX, etc...). But, it at least lets us boot safely. Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: dave@sr71.net Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20150902233125.D948D475@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 15 +++++----- arch/x86/kernel/fpu/xstate.c | 60 ++++++++++++++++++++++++++++++++++------ 2 files changed, 59 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index c49c5173158e..6aaafe07338d 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -159,22 +159,19 @@ struct xstate_header { u64 reserved[6]; } __attribute__((packed)); -/* New processor state extensions should be added here: */ -#define XSTATE_RESERVE (sizeof(struct ymmh_struct) + \ - sizeof(struct lwp_struct) + \ - sizeof(struct mpx_struct) ) /* * This is our most modern FPU state format, as saved by the XSAVE * and restored by the XRSTOR instructions. * * It consists of a legacy fxregs portion, an xstate header and - * subsequent fixed size areas as defined by the xstate header. - * Not all CPUs support all the extensions. + * subsequent areas as defined by the xstate header. Not all CPUs + * support all the extensions, so the size of the extended area + * can vary quite a bit between CPUs. */ struct xregs_state { struct fxregs_state i387; struct xstate_header header; - u8 __reserved[XSTATE_RESERVE]; + u8 extended_state_area[0]; } __attribute__ ((packed, aligned (64))); /* @@ -182,7 +179,9 @@ struct xregs_state { * put together, so that we can pick the right one runtime. * * The size of the structure is determined by the largest - * member - which is the xsave area: + * member - which is the xsave area. The padding is there + * to ensure that statically-allocated task_structs (just + * the init_task today) have enough space. */ union fpregs_state { struct fregs_state fsave; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 2ada11c0f8d7..769603abae63 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -312,24 +312,64 @@ static void __init setup_init_fpu_buf(void) /* * Calculate total size of enabled xstates in XCR0/xfeatures_mask.
*/ -static void __init init_xstate_size(void) +static unsigned int __init calculate_xstate_size(void) { unsigned int eax, ebx, ecx, edx; + unsigned int calculated_xstate_size; int i; if (!cpu_has_xsaves) { cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - xstate_size = ebx; - return; + calculated_xstate_size = ebx; + return calculated_xstate_size; } - xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; + calculated_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; for (i = 2; i < 64; i++) { if (test_bit(i, (unsigned long *)&xfeatures_mask)) { cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); - xstate_size += eax; + calculated_xstate_size += eax; } } + return calculated_xstate_size; +} + +/* + * Will the runtime-enumerated 'xstate_size' fit in the init + * task's statically-allocated buffer? + */ +static bool is_supported_xstate_size(unsigned int test_xstate_size) +{ + if (test_xstate_size <= sizeof(union fpregs_state)) + return true; + + pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n", + sizeof(union fpregs_state), test_xstate_size); + return false; +} + +static int init_xstate_size(void) +{ + /* Recompute the context size for enabled features: */ + unsigned int possible_xstate_size = calculate_xstate_size(); + + /* Ensure we have the space to store all enabled: */ + if (!is_supported_xstate_size(possible_xstate_size)) + return -EINVAL; + + /* + * The size is OK, we are definitely going to use xsave, + * make it known to the world that we need more space. + */ + xstate_size = possible_xstate_size; + return 0; +} + +void fpu__init_disable_system_xstate(void) +{ + xfeatures_mask = 0; + cr4_clear_bits(X86_CR4_OSXSAVE); + fpu__xstate_clear_all_cpu_caps(); } /* @@ -340,6 +380,7 @@ void __init fpu__init_system_xstate(void) { unsigned int eax, ebx, ecx, edx; static int on_boot_cpu = 1; + int err; WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; @@ -367,9 +408,12 @@ void __init fpu__init_system_xstate(void) /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); - - /* Recompute the context size for enabled features: */ - init_xstate_size(); + err = init_xstate_size(); + if (err) { + /* something went wrong, boot without any XSAVE support */ + fpu__init_disable_system_xstate(); + return; + } update_regset_xstate_info(xstate_size, xfeatures_mask); fpu__init_prepare_fx_sw_frame(); -- cgit v1.2.3 From 75933433d666c2ab13a7a93f4ec1e6f000a94ffc Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 2 Sep 2015 16:31:25 -0700 Subject: x86/fpu: Remove partial LWP support definitions LightWeight Profiling was evidently an AMD profiling feature that we never got around to implementing. Remove the references to it. Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: dave@sr71.net Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20150902233125.7E602284@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 6aaafe07338d..5dc1a18ef11c 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -132,11 +132,6 @@ struct ymmh_struct { u8 ymmh_space[256]; }; -/* We don't support LWP yet: */ -struct lwp_struct { - u8 reserved[128]; -}; - /* Intel MPX support: */ struct bndreg { u64 lower_bound; -- cgit v1.2.3 From d91cab78133d33b1dfd3d3fa7167fcbf74fb5f99 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 2 Sep 2015 16:31:26 -0700 Subject: x86/fpu: Rename XSAVE macros There are two concepts that have some confusing naming: 1. Extended State Component numbers (currently called XFEATURE_BIT_*) 2. Extended State Component masks (currently called XSTATE_*) The numbers are (currently) from 0-9. State component 3 is the bounds registers for MPX, for instance. But when we want to enable "state component 3", we go set a bit in XCR0. The bit we set is 1<<3. We can check to see if a state component feature is enabled by looking at its bit. The current 'xfeature_bit's are at best xfeature bit _numbers_. Calling them bits is at best inconsistent with ending the enum list with 'XFEATURES_NR_MAX'. This patch renames the enum to be 'xfeature'. These also happen to be what the Intel documentation calls a "state component". We also want to differentiate these from the "XSTATE_*" macros. The "XSTATE_*" macros are masks, and we rename them to match. These macros are reasonably widely used so this patch is a wee bit big, but this really is just a rename. The only non-mechanical part of this is the s/XSTATE_EXTEND_MASK/XFEATURE_MASK_EXTEND/. We need a better name for it, but that's another patch. Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: dave@sr71.net Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20150902233126.38653250@viggo.jf.intel.com [ Ported to v4.3-rc1.
] Signed-off-by: Ingo Molnar --- arch/x86/crypto/camellia_aesni_avx2_glue.c | 3 +- arch/x86/crypto/camellia_aesni_avx_glue.c | 3 +- arch/x86/crypto/cast5_avx_glue.c | 3 +- arch/x86/crypto/cast6_avx_glue.c | 3 +- arch/x86/crypto/chacha20_glue.c | 2 +- arch/x86/crypto/poly1305_glue.c | 2 +- arch/x86/crypto/serpent_avx2_glue.c | 3 +- arch/x86/crypto/serpent_avx_glue.c | 3 +- arch/x86/crypto/sha1_ssse3_glue.c | 2 +- arch/x86/crypto/sha256_ssse3_glue.c | 2 +- arch/x86/crypto/sha512_ssse3_glue.c | 2 +- arch/x86/crypto/twofish_avx_glue.c | 2 +- arch/x86/include/asm/fpu/types.h | 44 +++++++++++++++++------------- arch/x86/include/asm/fpu/xstate.h | 14 ++++++---- arch/x86/kernel/fpu/init.c | 6 ++-- arch/x86/kernel/fpu/regset.c | 4 +-- arch/x86/kernel/fpu/signal.c | 6 ++-- arch/x86/kernel/fpu/xstate.c | 36 +++++++++++++----------- arch/x86/kernel/traps.c | 2 +- arch/x86/kvm/cpuid.c | 4 +-- arch/x86/kvm/x86.c | 27 +++++++++--------- arch/x86/kvm/x86.h | 6 ++-- arch/x86/mm/mpx.c | 6 ++-- 23 files changed, 103 insertions(+), 82 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index 4c65c70e628b..d84456924563 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -567,7 +567,8 @@ static int __init camellia_aesni_init(void) return -ENODEV; } - if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 80a0e4389c9a..12e729bfe71b 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -554,7 +554,8 @@ static int __init camellia_aesni_init(void) { const char *feature_name; - if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index be00aa48b2b5..8648158f3916 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -469,7 +469,8 @@ static int __init cast5_init(void) { const char *feature_name; - if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 5dbba7224221..fca459578c35 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -591,7 +591,8 @@ static int __init cast6_init(void) { const char *feature_name; - if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index effe2160b7c5..722bacea040e 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -130,7 +130,7 @@ static int __init chacha20_simd_mod_init(void) #ifdef CONFIG_AS_AVX2 chacha20_use_avx2 = cpu_has_avx && cpu_has_avx2 && - 
cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL); + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); #endif return crypto_register_alg(&alg); } diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index f7170d764f32..4264a3d59589 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -184,7 +184,7 @@ static int __init poly1305_simd_mod_init(void) #ifdef CONFIG_AS_AVX2 poly1305_use_avx2 = cpu_has_avx && cpu_has_avx2 && - cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL); + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); alg.descsize = sizeof(struct poly1305_simd_desc_ctx); if (poly1305_use_avx2) alg.descsize += 10 * sizeof(u32); diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index 7d838dc4d888..6d198342e2de 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -542,7 +542,8 @@ static int __init init(void) pr_info("AVX2 instructions are not detected.\n"); return -ENODEV; } - if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index da7dafc9b16d..5dc37026c7ce 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -597,7 +597,8 @@ static int __init serpent_init(void) { const char *feature_name; - if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 7c48e8b20848..00212c32d4db 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -121,7 +121,7 @@ static struct shash_alg alg = { #ifdef CONFIG_AS_AVX static bool __init avx_usable(void) { - if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) { + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { if (cpu_has_avx) pr_info("AVX detected but unusable.\n"); return false; diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index f8097fc0d1d1..0e0e85aea634 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -130,7 +130,7 @@ static struct shash_alg algs[] = { { #ifdef CONFIG_AS_AVX static bool __init avx_usable(void) { - if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) { + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { if (cpu_has_avx) pr_info("AVX detected but unusable.\n"); return false; diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 2edad7b81870..0c8c38c101ac 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -129,7 +129,7 @@ static struct shash_alg algs[] = { { #ifdef CONFIG_AS_AVX static bool __init avx_usable(void) { - if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) { + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { if (cpu_has_avx) pr_info("AVX detected but unusable.\n"); return false; diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index c2bd0ce718ee..6f3738ced95e 100644 --- a/arch/x86/crypto/twofish_avx_glue.c 
+++ b/arch/x86/crypto/twofish_avx_glue.c @@ -558,7 +558,7 @@ static int __init twofish_init(void) { const char *feature_name; - if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 5dc1a18ef11c..9f20d10af3b1 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -95,30 +95,36 @@ struct swregs_state { /* * List of XSAVE features Linux knows about: */ -enum xfeature_bit { - XSTATE_BIT_FP, - XSTATE_BIT_SSE, - XSTATE_BIT_YMM, - XSTATE_BIT_BNDREGS, - XSTATE_BIT_BNDCSR, - XSTATE_BIT_OPMASK, - XSTATE_BIT_ZMM_Hi256, - XSTATE_BIT_Hi16_ZMM, +enum xfeature { + XFEATURE_FP, + XFEATURE_SSE, + /* + * Values above here are "legacy states". + * Those below are "extended states". + */ + XFEATURE_YMM, + XFEATURE_BNDREGS, + XFEATURE_BNDCSR, + XFEATURE_OPMASK, + XFEATURE_ZMM_Hi256, + XFEATURE_Hi16_ZMM, XFEATURES_NR_MAX, }; -#define XSTATE_FP (1 << XSTATE_BIT_FP) -#define XSTATE_SSE (1 << XSTATE_BIT_SSE) -#define XSTATE_YMM (1 << XSTATE_BIT_YMM) -#define XSTATE_BNDREGS (1 << XSTATE_BIT_BNDREGS) -#define XSTATE_BNDCSR (1 << XSTATE_BIT_BNDCSR) -#define XSTATE_OPMASK (1 << XSTATE_BIT_OPMASK) -#define XSTATE_ZMM_Hi256 (1 << XSTATE_BIT_ZMM_Hi256) -#define XSTATE_Hi16_ZMM (1 << XSTATE_BIT_Hi16_ZMM) +#define XFEATURE_MASK_FP (1 << XFEATURE_FP) +#define XFEATURE_MASK_SSE (1 << XFEATURE_SSE) +#define XFEATURE_MASK_YMM (1 << XFEATURE_YMM) +#define XFEATURE_MASK_BNDREGS (1 << XFEATURE_BNDREGS) +#define XFEATURE_MASK_BNDCSR (1 << XFEATURE_BNDCSR) +#define XFEATURE_MASK_OPMASK (1 << XFEATURE_OPMASK) +#define XFEATURE_MASK_ZMM_Hi256 (1 << XFEATURE_ZMM_Hi256) +#define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM) -#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) -#define XSTATE_AVX512 (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) +#define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) +#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \ + | XFEATURE_MASK_ZMM_Hi256 \ + | XFEATURE_MASK_Hi16_ZMM) /* * There are 16x 256-bit AVX registers named YMM0-YMM15. 
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index d5a9b736553c..3a6c89b70307 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -6,7 +6,7 @@ #include /* Bit 63 of XCR0 is reserved for future expansion */ -#define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63))) +#define XFEATURE_MASK_EXTEND (~(XFEATURE_MASK_FPSSE | (1ULL << 63))) #define XSTATE_CPUID 0x0000000d @@ -19,14 +19,18 @@ #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) /* Supported features which support lazy state saving */ -#define XSTATE_LAZY (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ - | XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) +#define XFEATURE_MASK_LAZY (XFEATURE_MASK_FP | \ + XFEATURE_MASK_SSE | \ + XFEATURE_MASK_YMM | \ + XFEATURE_MASK_OPMASK | \ + XFEATURE_MASK_ZMM_Hi256 | \ + XFEATURE_MASK_Hi16_ZMM) /* Supported features which require eager state saving */ -#define XSTATE_EAGER (XSTATE_BNDREGS | XSTATE_BNDCSR) +#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR) /* All currently supported features */ -#define XCNTXT_MASK (XSTATE_LAZY | XSTATE_EAGER) +#define XCNTXT_MASK (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER) #ifdef CONFIG_X86_64 #define REX_PREFIX "0x48, " diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 0a250afc6cdf..be39b5fde4b9 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -290,11 +290,11 @@ static void __init fpu__init_system_ctx_switch(void) if (cpu_has_xsaveopt && eagerfpu != DISABLE) eagerfpu = ENABLE; - if (xfeatures_mask & XSTATE_EAGER) { + if (xfeatures_mask & XFEATURE_MASK_EAGER) { if (eagerfpu == DISABLE) { pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n", - xfeatures_mask & XSTATE_EAGER); - xfeatures_mask &= ~XSTATE_EAGER; + xfeatures_mask & XFEATURE_MASK_EAGER); + xfeatures_mask &= ~XFEATURE_MASK_EAGER; } else { eagerfpu = ENABLE; } diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index dc60810c1c74..0bc3490420c5 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -66,7 +66,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, * presence of FP and SSE state. */ if (cpu_has_xsave) - fpu->state.xsave.header.xfeatures |= XSTATE_FPSSE; + fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; return ret; } @@ -326,7 +326,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, * presence of FP. */ if (cpu_has_xsave) - fpu->state.xsave.header.xfeatures |= XSTATE_FP; + fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP; return ret; } diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 50ec9af1bd51..eb032677f939 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -107,7 +107,7 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) * header as well as change any contents in the memory layout. * xrestore as part of sigreturn will capture all the changes. */ - xfeatures |= XSTATE_FPSSE; + xfeatures |= XFEATURE_MASK_FPSSE; err |= __put_user(xfeatures, (__u32 *)&x->header.xfeatures); @@ -207,7 +207,7 @@ sanitize_restored_xstate(struct task_struct *tsk, * layout and not enabled by the OS. 
*/ if (fx_only) - header->xfeatures = XSTATE_FPSSE; + header->xfeatures = XFEATURE_MASK_FPSSE; else header->xfeatures &= (xfeatures_mask & xfeatures); } @@ -230,7 +230,7 @@ static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_ { if (use_xsave()) { if ((unsigned long)buf % 64 || fx_only) { - u64 init_bv = xfeatures_mask & ~XSTATE_FPSSE; + u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); return copy_user_to_fxregs(buf); } else { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 769603abae63..d6f0be9c2f5a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -72,7 +72,7 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) /* * So we use FLS here to be able to print the most advanced * feature that was requested but is missing. So if a driver - * asks about "XSTATE_SSE | XSTATE_YMM" we'll print the + * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the * missing AVX feature - this is the most informative message * to users: */ @@ -131,7 +131,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu) /* * FP is in init state */ - if (!(xfeatures & XSTATE_FP)) { + if (!(xfeatures & XFEATURE_MASK_FP)) { fx->cwd = 0x37f; fx->swd = 0; fx->twd = 0; @@ -144,7 +144,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu) /* * SSE is in init state */ - if (!(xfeatures & XSTATE_SSE)) + if (!(xfeatures & XFEATURE_MASK_SSE)) memset(&fx->xmm_space[0], 0, 256); /* @@ -223,14 +223,14 @@ static void __init print_xstate_feature(u64 xstate_mask) */ static void __init print_xstate_features(void) { - print_xstate_feature(XSTATE_FP); - print_xstate_feature(XSTATE_SSE); - print_xstate_feature(XSTATE_YMM); - print_xstate_feature(XSTATE_BNDREGS); - print_xstate_feature(XSTATE_BNDCSR); - print_xstate_feature(XSTATE_OPMASK); - print_xstate_feature(XSTATE_ZMM_Hi256); - print_xstate_feature(XSTATE_Hi16_ZMM); + print_xstate_feature(XFEATURE_MASK_FP); + print_xstate_feature(XFEATURE_MASK_SSE); + print_xstate_feature(XFEATURE_MASK_YMM); + print_xstate_feature(XFEATURE_MASK_BNDREGS); + print_xstate_feature(XFEATURE_MASK_BNDCSR); + print_xstate_feature(XFEATURE_MASK_OPMASK); + print_xstate_feature(XFEATURE_MASK_ZMM_Hi256); + print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); } /* @@ -365,7 +365,11 @@ static int init_xstate_size(void) return 0; } -void fpu__init_disable_system_xstate(void) +/* + * We enabled the XSAVE hardware, but something went wrong and + * we can not use it. Disable it. + */ +static void fpu__init_disable_system_xstate(void) { xfeatures_mask = 0; cr4_clear_bits(X86_CR4_OSXSAVE); @@ -398,7 +402,7 @@ void __init fpu__init_system_xstate(void) cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); xfeatures_mask = eax + ((u64)edx << 32); - if ((xfeatures_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { + if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask); BUG(); } @@ -451,7 +455,7 @@ void fpu__resume_cpu(void) * Inputs: * xstate: the thread's storage area for all FPU data * xstate_feature: state which is defined in xsave.h (e.g. - * XSTATE_FP, XSTATE_SSE, etc...) + * XFEATURE_MASK_FP, XFEATURE_MASK_SSE, etc...) * Output: * address of the state in the xsave area, or NULL if the * field is not present in the xsave buffer. @@ -502,8 +506,8 @@ EXPORT_SYMBOL_GPL(get_xsave_addr); * Note that this only works on the current task. 
* * Inputs: - * @xsave_state: state which is defined in xsave.h (e.g. XSTATE_FP, - * XSTATE_SSE, etc...) + * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP, + * XFEATURE_MASK_SSE, etc...) * Output: * address of the state in the xsave area or NULL if the state * is not present or is in its 'init state'. diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 346eec73f7db..0cd2ac5c0f28 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -384,7 +384,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) * which is all zeros which indicates MPX was not * responsible for the exception. */ - bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR); + bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); if (!bndcsr) goto exit_trap; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2fbea2544f24..156441bcaac8 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -30,7 +30,7 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) int feature_bit = 0; u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; - xstate_bv &= XSTATE_EXTEND_MASK; + xstate_bv &= XFEATURE_MASK_EXTEND; while (xstate_bv) { if (xstate_bv & 0x1) { u32 eax, ebx, ecx, edx, offset; @@ -51,7 +51,7 @@ u64 kvm_supported_xcr0(void) u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0; if (!kvm_x86_ops->mpx_supported()) - xcr0 &= ~(XSTATE_BNDREGS | XSTATE_BNDCSR); + xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); return xcr0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a60bdbccff51..2d4e54db49af 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -662,9 +662,9 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ if (index != XCR_XFEATURE_ENABLED_MASK) return 1; - if (!(xcr0 & XSTATE_FP)) + if (!(xcr0 & XFEATURE_MASK_FP)) return 1; - if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) + if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE)) return 1; /* @@ -672,23 +672,24 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) * saving. However, xcr0 bit 0 is always set, even if the * emulated CPU does not support XSAVE (see fx_init). */ - valid_bits = vcpu->arch.guest_supported_xcr0 | XSTATE_FP; + valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP; if (xcr0 & ~valid_bits) return 1; - if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR))) + if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) != + (!(xcr0 & XFEATURE_MASK_BNDCSR))) return 1; - if (xcr0 & XSTATE_AVX512) { - if (!(xcr0 & XSTATE_YMM)) + if (xcr0 & XFEATURE_MASK_AVX512) { + if (!(xcr0 & XFEATURE_MASK_YMM)) return 1; - if ((xcr0 & XSTATE_AVX512) != XSTATE_AVX512) + if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512) return 1; } kvm_put_guest_xcr0(vcpu); vcpu->arch.xcr0 = xcr0; - if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK) + if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) kvm_update_cpuid(vcpu); return 0; } @@ -2906,7 +2907,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) * Copy each region from the possibly compacted offset to the * non-compacted offset. */ - valid = xstate_bv & ~XSTATE_FPSSE; + valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { u64 feature = valid & -valid; int index = fls64(feature) - 1; @@ -2944,7 +2945,7 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) * Copy each region from the non-compacted offset to the * possibly compacted offset. 
*/ - valid = xstate_bv & ~XSTATE_FPSSE; + valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { u64 feature = valid & -valid; int index = fls64(feature) - 1; @@ -2972,7 +2973,7 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, &vcpu->arch.guest_fpu.state.fxsave, sizeof(struct fxregs_state)); *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = - XSTATE_FPSSE; + XFEATURE_MASK_FPSSE; } } @@ -2992,7 +2993,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, return -EINVAL; load_xsave(vcpu, (u8 *)guest_xsave->region); } else { - if (xstate_bv & ~XSTATE_FPSSE) + if (xstate_bv & ~XFEATURE_MASK_FPSSE) return -EINVAL; memcpy(&vcpu->arch.guest_fpu.state.fxsave, guest_xsave->region, sizeof(struct fxregs_state)); @@ -7001,7 +7002,7 @@ static void fx_init(struct kvm_vcpu *vcpu) /* * Ensure guest xcr0 is valid for loading */ - vcpu->arch.xcr0 = XSTATE_FP; + vcpu->arch.xcr0 = XFEATURE_MASK_FP; vcpu->arch.cr0 |= X86_CR0_ET; } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 2f822cd886c2..f2afa5fe48a6 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -180,9 +180,9 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int page_num); -#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ - | XSTATE_BNDREGS | XSTATE_BNDCSR \ - | XSTATE_AVX512) +#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ + | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ + | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512) extern u64 host_xcr0; extern u64 kvm_supported_xcr0(void); diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 134948b0926f..f35fc9c6ed50 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -258,7 +258,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) goto err_out; } /* get bndregs field from current task's xsave area */ - bndregs = get_xsave_field_ptr(XSTATE_BNDREGS); + bndregs = get_xsave_field_ptr(XFEATURE_MASK_BNDREGS); if (!bndregs) { err = -EINVAL; goto err_out; @@ -315,7 +315,7 @@ static __user void *mpx_get_bounds_dir(void) * The bounds directory pointer is stored in a register * only accessible if we first do an xsave. */ - bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR); + bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); if (!bndcsr) return MPX_INVALID_BOUNDS_DIR; @@ -492,7 +492,7 @@ static int do_mpx_bt_fault(void) const struct bndcsr *bndcsr; struct mm_struct *mm = current->mm; - bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR); + bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); if (!bndcsr) return -EINVAL; /* -- cgit v1.2.3 From dad8c4fe8530f28dde73dadd1d588e3aaa507562 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 2 Sep 2015 16:31:27 -0700 Subject: x86/fpu: Rename XFEATURES_NR_MAX This is a logical follow-on to the last patch. It makes the XFEATURE_MAX naming consistent with the other enum values. This is what Ingo suggested. Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: dave@sr71.net Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20150902233127.A541448F@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 2 +- arch/x86/kernel/fpu/xstate.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 9f20d10af3b1..8764df32f115 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -109,7 +109,7 @@ enum xfeature { XFEATURE_ZMM_Hi256, XFEATURE_Hi16_ZMM, - XFEATURES_NR_MAX, + XFEATURE_MAX, }; #define XFEATURE_MASK_FP (1 << XFEATURE_FP) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index d6f0be9c2f5a..f94e236e7e08 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -31,8 +31,8 @@ static const char *xfeature_names[] = */ u64 xfeatures_mask __read_mostly; -static unsigned int xstate_offsets[XFEATURES_NR_MAX] = { [ 0 ... XFEATURES_NR_MAX - 1] = -1}; -static unsigned int xstate_sizes[XFEATURES_NR_MAX] = { [ 0 ... XFEATURES_NR_MAX - 1] = -1}; +static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; /* The number of supported xfeatures in xfeatures_mask: */ -- cgit v1.2.3 From 8a93c9e0dca131a0bf330ea9d1e57c1bcf3824ad Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 2 Sep 2015 16:31:28 -0700 Subject: x86/fpu: Rework XSTATE_* macros to remove magic '2' The 'xstate.c' code has a bunch of references to '2'. This is because we have a lot more work to do for the "extended" xstates than the "legacy" ones and state component 2 is the first "extended" state. This patch replaces all of the instances of '2' with FIRST_EXTENDED_XFEATURE, which clearly explains what is going on. Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: dave@sr71.net Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20150902233128.A8C0BF51@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 2 ++ arch/x86/kernel/fpu/xstate.c | 13 +++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 8764df32f115..9f579305dd11 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -126,6 +126,8 @@ enum xfeature { | XFEATURE_MASK_ZMM_Hi256 \ | XFEATURE_MASK_Hi16_ZMM) +#define FIRST_EXTENDED_XFEATURE XFEATURE_YMM + /* * There are 16x 256-bit AVX registers named YMM0-YMM15. 
* The low 128 bits are aliased to the 16 SSE registers (XMM0-XMM15) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index f94e236e7e08..c1a0bf4668de 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -200,7 +200,7 @@ static void __init setup_xstate_features(void) xfeatures_nr = fls64(xfeatures_mask); - for (leaf = 2; leaf < xfeatures_nr; leaf++) { + for (leaf = FIRST_EXTENDED_XFEATURE; leaf < xfeatures_nr; leaf++) { cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx); xstate_offsets[leaf] = ebx; @@ -252,7 +252,7 @@ static void __init setup_xstate_comp(void) xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space); if (!cpu_has_xsaves) { - for (i = 2; i < xfeatures_nr; i++) { + for (i = FIRST_EXTENDED_XFEATURE; i < xfeatures_nr; i++) { if (test_bit(i, (unsigned long *)&xfeatures_mask)) { xstate_comp_offsets[i] = xstate_offsets[i]; xstate_comp_sizes[i] = xstate_sizes[i]; @@ -261,15 +261,16 @@ static void __init setup_xstate_comp(void) return; } - xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE; + xstate_comp_offsets[FIRST_EXTENDED_XFEATURE] = + FXSAVE_SIZE + XSAVE_HDR_SIZE; - for (i = 2; i < xfeatures_nr; i++) { + for (i = FIRST_EXTENDED_XFEATURE; i < xfeatures_nr; i++) { if (test_bit(i, (unsigned long *)&xfeatures_mask)) xstate_comp_sizes[i] = xstate_sizes[i]; else xstate_comp_sizes[i] = 0; - if (i > 2) + if (i > FIRST_EXTENDED_XFEATURE) xstate_comp_offsets[i] = xstate_comp_offsets[i-1] + xstate_comp_sizes[i-1]; @@ -325,7 +326,7 @@ static unsigned int __init calculate_xstate_size(void) } calculated_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for (i = 2; i < 64; i++) { + for (i = FIRST_EXTENDED_XFEATURE; i < 64; i++) { if (test_bit(i, (unsigned long *)&xfeatures_mask)) { cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); calculated_xstate_size += eax; -- cgit v1.2.3 From ee9ae257eb17d3426ee9ab91449a3aa443298b36 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 2 Sep 2015 16:31:28 -0700 Subject: x86/fpu: Remove 'xfeature_nr' xfeature_nr ended up being initialized too late for me to use it in the "xsave size sanity check" patch which is later in the series. I tried to move around its initialization but realized that it was just as easy to get rid of it. We only have 9 XFEATURES. Instead of dynamically calculating and storing the last feature, just use the compile-time max: XFEATURE_MAX. Note that even with 'xfeatures_nr' we could have "holes" in the xfeatures_mask that we had to deal with. We also change a 'leaf' variable to be a plain 'i'. Although it is used to grab a cpuid leaf in this one loop, all of the other loops just use an 'i' and I find it much more obvious to keep the naming consistent across all the similar loops. Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: dave@sr71.net Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20150902233128.3F30DF5A@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c1a0bf4668de..61ec60bcbb63 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -35,9 +35,6 @@ static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ...
XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; -/* The number of supported xfeatures in xfeatures_mask: */ -static unsigned int xfeatures_nr; - /* * Clear all of the X86_FEATURE_* bits that are unavailable * when the CPU has no XSAVE support. @@ -190,23 +187,18 @@ void fpu__init_cpu_xstate(void) /* * Record the offsets and sizes of various xstates contained * in the XSAVE state memory layout. - * - * ( Note that certain features might be non-present, for them * we'll have 0 offset and 0 size. ) */ static void __init setup_xstate_features(void) { - u32 eax, ebx, ecx, edx, leaf; - - xfeatures_nr = fls64(xfeatures_mask); + u32 eax, ebx, ecx, edx, i; - for (leaf = FIRST_EXTENDED_XFEATURE; leaf < xfeatures_nr; leaf++) { - cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx); + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); - xstate_offsets[leaf] = ebx; - xstate_sizes[leaf] = eax; + xstate_offsets[i] = ebx; + xstate_sizes[i] = eax; - printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", leaf, ebx, leaf, eax); + printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, ebx, i, eax); } } @@ -252,7 +244,7 @@ static void __init setup_xstate_comp(void) xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space); if (!cpu_has_xsaves) { - for (i = FIRST_EXTENDED_XFEATURE; i < xfeatures_nr; i++) { + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { if (test_bit(i, (unsigned long *)&xfeatures_mask)) { xstate_comp_offsets[i] = xstate_offsets[i]; xstate_comp_sizes[i] = xstate_sizes[i]; @@ -264,7 +256,7 @@ static void __init setup_xstate_comp(void) xstate_comp_offsets[FIRST_EXTENDED_XFEATURE] = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for (i = FIRST_EXTENDED_XFEATURE; i < xfeatures_nr; i++) { + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { if (test_bit(i, (unsigned long *)&xfeatures_mask)) xstate_comp_sizes[i] = xstate_sizes[i]; else -- cgit v1.2.3 From 633d54c47a5bedfb42f10e6a63eeeebd35abdb4c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 2 Sep 2015 16:31:29 -0700 Subject: x86/fpu: Add xfeature_enabled() helper instead of test_bit() We currently use test_bit() in a few places to see if an xfeature is enabled. It ends up being a bit ugly because 'xfeatures_mask' is a u64 and test_bit wants an 'unsigned long' so it requires a cast. The *_bit() functions are also technically atomic, which we have no need for here. So, remove the test_bit()s and replace with the new xfeature_enabled() helper. This also provides a central place to add a comment about the future need to support 'system xstates'. Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: dave@sr71.net Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20150902233129.B1534F86@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 61ec60bcbb63..ba6b6e0baf88 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -225,6 +225,16 @@ static void __init print_xstate_features(void) print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); } +/* + * Note that in the future we will likely need a pair of + * functions here: one for user xstates and the other for + * system xstates. For now, they are the same. + */ +static int xfeature_enabled(enum xfeature xfeature) +{ + return !!(xfeatures_mask & (1UL << xfeature)); +} + /* * This function sets up offsets and sizes of all extended states in * xsave area. This supports both standard format and compacted format @@ -245,7 +255,7 @@ static void __init setup_xstate_comp(void) if (!cpu_has_xsaves) { for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (test_bit(i, (unsigned long *)&xfeatures_mask)) { + if (xfeature_enabled(i)) { xstate_comp_offsets[i] = xstate_offsets[i]; xstate_comp_sizes[i] = xstate_sizes[i]; } @@ -257,7 +267,7 @@ static void __init setup_xstate_comp(void) FXSAVE_SIZE + XSAVE_HDR_SIZE; for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (test_bit(i, (unsigned long *)&xfeatures_mask)) + if (xfeature_enabled(i)) xstate_comp_sizes[i] = xstate_sizes[i]; else xstate_comp_sizes[i] = 0; @@ -319,7 +329,7 @@ static unsigned int __init calculate_xstate_size(void) calculated_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; for (i = FIRST_EXTENDED_XFEATURE; i < 64; i++) { - if (test_bit(i, (unsigned long *)&xfeatures_mask)) { + if (xfeature_enabled(i)) { cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); calculated_xstate_size += eax; } -- cgit v1.2.3 From 1126cb4535c4ff172c37a412a6bd25d6b47a1901 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 2 Sep 2015 16:31:29 -0700 Subject: x86/fpu/mpx: Rework MPX 'xstate' types MPX includes two separate "extended state components". There is no real need to have an 'mpx_struct' because we never really manage the states together. We also separate out the actual data in 'mpx_bndcsr_state' from the padding. We will shortly be checking the state sizes against our structures and need them to match. For consistency, we also ensure to prefix these types with 'mpx_'. Lastly, we add some comments to mirror some of the descriptions in the Intel documents (SDM) of the various state components. Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: dave@sr71.net Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20150902233129.384B73EB@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 29 +++++++++++++++++++++++------ arch/x86/include/asm/trace/mpx.h | 7 ++++--- arch/x86/kernel/traps.c | 2 +- arch/x86/mm/mpx.c | 9 +++++---- 4 files changed, 33 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 9f579305dd11..4d8c2009b94e 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -141,20 +141,37 @@ struct ymmh_struct { }; /* Intel MPX support: */ -struct bndreg { + +struct mpx_bndreg { u64 lower_bound; u64 upper_bound; } __packed; +/* + * State component 3 is used for the 4 128-bit bounds registers + */ +struct mpx_bndreg_state { + struct mpx_bndreg bndreg[4]; +} __packed; -struct bndcsr { +/* + * State component 4 is used for the 64-bit user-mode MPX + * configuration register BNDCFGU and the 64-bit MPX status + * register BNDSTATUS. We call the pair "BNDCSR". + */ +struct mpx_bndcsr { u64 bndcfgu; u64 bndstatus; } __packed; -struct mpx_struct { - struct bndreg bndreg[4]; - struct bndcsr bndcsr; -}; +/* + * The BNDCSR state is padded out to be 64-bytes in size. + */ +struct mpx_bndcsr_state { + union { + struct mpx_bndcsr bndcsr; + u8 pad_to_64_bytes[64]; + }; +} __packed; struct xstate_header { u64 xfeatures; diff --git a/arch/x86/include/asm/trace/mpx.h b/arch/x86/include/asm/trace/mpx.h index 173dd3ba108c..0f492fc50bce 100644 --- a/arch/x86/include/asm/trace/mpx.h +++ b/arch/x86/include/asm/trace/mpx.h @@ -11,7 +11,7 @@ TRACE_EVENT(mpx_bounds_register_exception, TP_PROTO(void *addr_referenced, - const struct bndreg *bndreg), + const struct mpx_bndreg *bndreg), TP_ARGS(addr_referenced, bndreg), TP_STRUCT__entry( @@ -44,7 +44,7 @@ TRACE_EVENT(mpx_bounds_register_exception, TRACE_EVENT(bounds_exception_mpx, - TP_PROTO(const struct bndcsr *bndcsr), + TP_PROTO(const struct mpx_bndcsr *bndcsr), TP_ARGS(bndcsr), TP_STRUCT__entry( @@ -116,7 +116,8 @@ TRACE_EVENT(mpx_new_bounds_table, /* * This gets used outside of MPX-specific code, so we need a stub. 
*/ -static inline void trace_bounds_exception_mpx(const struct bndcsr *bndcsr) +static inline +void trace_bounds_exception_mpx(const struct mpx_bndcsr *bndcsr) { } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 0cd2ac5c0f28..ade185a46b1d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -361,7 +361,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) { - const struct bndcsr *bndcsr; + const struct mpx_bndcsr *bndcsr; siginfo_t *info; RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index f35fc9c6ed50..b0ae85f90f10 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -237,7 +237,8 @@ bad_opcode: */ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) { - const struct bndreg *bndregs, *bndreg; + const struct mpx_bndreg_state *bndregs; + const struct mpx_bndreg *bndreg; siginfo_t *info = NULL; struct insn insn; uint8_t bndregno; @@ -264,7 +265,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) goto err_out; } /* now go select the individual register in the set of 4 */ - bndreg = &bndregs[bndregno]; + bndreg = &bndregs->bndreg[bndregno]; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) { @@ -306,7 +307,7 @@ err_out: static __user void *mpx_get_bounds_dir(void) { - const struct bndcsr *bndcsr; + const struct mpx_bndcsr *bndcsr; if (!cpu_feature_enabled(X86_FEATURE_MPX)) return MPX_INVALID_BOUNDS_DIR; @@ -489,7 +490,7 @@ out_unmap: static int do_mpx_bt_fault(void) { unsigned long bd_entry, bd_base; - const struct bndcsr *bndcsr; + const struct mpx_bndcsr *bndcsr; struct mm_struct *mm = current->mm; bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); -- cgit v1.2.3 From 83aa3c45307228af4329cbb915a2f2142e5479ad Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 2 Sep 2015 16:31:29 -0700 Subject: x86/fpu: Rework YMM definition We are about to rework all of the "extended state" definitions. This makes the 'ymm' naming consistent with the AVX-512 types we will introduce later. We also add a convenience type: "reg_128_bit" so that we do not have to spell out our arithmetic. Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: dave@sr71.net Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20150902233129.B4EB045F@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 4d8c2009b94e..eb0c9a237afa 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -128,17 +128,23 @@ enum xfeature { #define FIRST_EXTENDED_XFEATURE XFEATURE_YMM +struct reg_128_bit { + u8 regbytes[128/8]; +}; + /* + * State component 2: + * * There are 16x 256-bit AVX registers named YMM0-YMM15. * The low 128 bits are aliased to the 16 SSE registers (XMM0-XMM15) - * and are stored in 'struct fxregs_state::xmm_space[]'. + * and are stored in 'struct fxregs_state::xmm_space[]' in the + * "legacy" area. * - * The high 128 bits are stored here: - * 16x 128 bits == 256 bytes. + * The high 128 bits are stored here. 
*/ struct ymmh_struct { - u8 ymmh_space[256]; -}; + struct reg_128_bit hi_ymm[16]; +} __packed; /* Intel MPX support: */ -- cgit v1.2.3 From 060dd0f712562925662c65b90d225d82304764f7 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 2 Sep 2015 16:31:29 -0700 Subject: x86/fpu: Add C structures for AVX-512 state components AVX-512 has 3 separate state components: 1. opmask registers 2. zmm upper half of registers 0-15 3. new zmm registers (16-31) This patch adds C structures for the three components along with a few comments mostly lifted from the SDM to explain what they do. This will allow us to check our structures against what the hardware tells us about the sizes of the components. Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: dave@sr71.net Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20150902233129.F2433B98@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index eb0c9a237afa..1c6f6ac52ad0 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -131,6 +131,12 @@ enum xfeature { struct reg_128_bit { u8 regbytes[128/8]; }; +struct reg_256_bit { + u8 regbytes[256/8]; +}; +struct reg_512_bit { + u8 regbytes[512/8]; +}; /* * State component 2: @@ -179,6 +185,33 @@ struct mpx_bndcsr_state { }; } __packed; +/* AVX-512 Components: */ + +/* + * State component 5 is used for the 8 64-bit opmask registers + * k0-k7 (opmask state). + */ +struct avx_512_opmask_state { + u64 opmask_reg[8]; +} __packed; + +/* + * State component 6 is used for the upper 256 bits of the + * registers ZMM0-ZMM15. These 16 256-bit values are denoted + * ZMM0_H-ZMM15_H (ZMM_Hi256 state). + */ +struct avx_512_zmm_uppers_state { + struct reg_256_bit zmm_upper[16]; +} __packed; + +/* + * State component 7 is used for the 16 512-bit registers + * ZMM16-ZMM31 (Hi16_ZMM state). + */ +struct avx_512_hi16_state { + struct reg_512_bit hi16_zmm[16]; +} __packed; + struct xstate_header { u64 xfeatures; u64 xcomp_bv; -- cgit v1.2.3 From 65ac2e9baa7deebe3e9588769d44d85555e05619 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 2 Sep 2015 16:31:30 -0700 Subject: x86/fpu: Correct and check XSAVE xstate size calculations Note: our xsaves support is currently broken and disabled. This patch does not fix it, but it is an incremental improvement. This might be useful to someone backporting the entire set of XSAVES patches at some point, but it should not be backported alone. Ingo said he wanted something like this (bullets 2 and 3): http://lkml.kernel.org/r/20150808091508.GB32641@gmail.com There are currently two xsave buffer formats: standard and compacted. The standard format is what 'XSAVE' and 'XSAVEOPT' produce while 'XSAVES' and 'XSAVEC' produce a compacted-format buffer. (The kernel never uses XSAVEC.) But the XSAVES buffer *ALSO* contains "system state components" which are never saved by a plain XSAVE. So, XSAVES has two things that might make its buffer differently-sized from an XSAVE-produced one. The current code assumes that an XSAVES buffer's size is simply the sum of the sizes of the (user) states which are supported. (A rough user-space rendering of that naive calculation is sketched below.)
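The sketch below is an illustration only, not the kernel implementation: it reads the same CPUID 0xD leaves the kernel uses, via the __cpuid_count() helper from GCC/clang's <cpuid.h>. One deliberate simplification: sub-leaf 0's EBX reflects the user states currently enabled in XCR0, while EAX:EDX enumerate all supported states, so the two totals can legitimately differ on a system that has not enabled every feature.

/* naive_xsave_size.c -- compare CPUID's reported XSAVE size with the
 * naive per-component sum described above. Sketch only. */
#include <cpuid.h>
#include <stdio.h>

#define FXSAVE_SIZE	512
#define XSAVE_HDR_SIZE	64

int main(void)
{
	unsigned int eax, ebx, ecx, edx, i, naive;
	unsigned long long supported;

	/* CPUID(0xD, 0): EBX = XSAVE size for user states enabled in XCR0 */
	__cpuid_count(0xd, 0, eax, ebx, ecx, edx);
	supported = eax | ((unsigned long long)edx << 32);
	printf("CPU-reported size: %u bytes\n", ebx);

	/* the naive sum: legacy area + header + each extended state's size */
	naive = FXSAVE_SIZE + XSAVE_HDR_SIZE;
	for (i = 2; i < 63; i++) {
		if (!(supported & (1ULL << i)))
			continue;
		__cpuid_count(0xd, i, eax, ebx, ecx, edx);
		naive += eax;	/* EAX = size of state component i */
	}
	printf("naive summed size: %u bytes\n", naive);
	return 0;
}

On hardware that 64-byte-aligns a component, or once system states are enabled, the two numbers diverge -- which is precisely the failure mode described next.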
This seems to work in most cases, but it is not consistent with what the SDM says, and it breaks if we 'align' a component in the buffer. The calculation is also unnecessary work since the CPU *tells* us the size of the buffer directly. This patch just reads the size of the buffer right out of the CPUID leaf instead of trying to derive it. But, blindly trusting the CPU like this is dangerous. We add a verification pass in do_extra_xstate_size_checks() to ensure that the size we calculate matches what we see from the hardware. When it comes down to it, we trust but verify the CPU. Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: dave@sr71.net Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20150902233130.234FE1EC@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 184 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 174 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index ba6b6e0baf88..470495504403 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -312,27 +312,190 @@ static void __init setup_init_fpu_buf(void) copy_xregs_to_kernel_booting(&init_fpstate.xsave); } +static int xfeature_is_supervisor(int xfeature_nr) +{ + /* + * We currently do not support supervisor states, but if + * we did, we could find out like this. + * + * SDM says: If state component i is a user state component, + * ECX[0] returns 0; if state component i is a supervisor + * state component, ECX[0] returns 1. + u32 eax, ebx, ecx, edx; + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + return !!(ecx & 1); + */ + return 0; +} +/* +static int xfeature_is_user(int xfeature_nr) +{ + return !xfeature_is_supervisor(xfeature_nr); +} +*/ + +/* + * This check is important because it is easy to get XSTATE_* + * confused with XSTATE_BIT_*. + */ +#define CHECK_XFEATURE(nr) do { \ + WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \ + WARN_ON(nr >= XFEATURE_MAX); \ +} while (0) + +/* + * We could cache this like xstate_size[], but we only use + * it here, so it would be a waste of space. + */ +static int xfeature_is_aligned(int xfeature_nr) +{ + u32 eax, ebx, ecx, edx; + + CHECK_XFEATURE(xfeature_nr); + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + /* + * The value returned by ECX[1] indicates the alignment + * of state component i when the compacted format + * of the extended region of an XSAVE area is used + */ + return !!(ecx & 2); +} + +static int xfeature_uncompacted_offset(int xfeature_nr) +{ + u32 eax, ebx, ecx, edx; + + CHECK_XFEATURE(xfeature_nr); + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + return ebx; +} + +static int xfeature_size(int xfeature_nr) +{ + u32 eax, ebx, ecx, edx; + + CHECK_XFEATURE(xfeature_nr); + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + return eax; +} + +/* + * 'XSAVES' implies two different things: + * 1. saving of supervisor/system state + * 2. using the compacted format + * + * Use this function when dealing with the compacted format so + * that it is obvious which aspect of 'XSAVES' is being handled + * by the calling code.
+ */ +static int using_compacted_format(void) +{ + return cpu_has_xsaves; +} + +static void __xstate_dump_leaves(void) +{ + int i; + u32 eax, ebx, ecx, edx; + static int should_dump = 1; + + if (!should_dump) + return; + should_dump = 0; + /* + * Dump out a few leaves past the ones that we support + * just in case there are some goodies up there + */ + for (i = 0; i < XFEATURE_MAX + 10; i++) { + cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", + XSTATE_CPUID, i, eax, ebx, ecx, edx); + } +} + +#define XSTATE_WARN_ON(x) do { \ + if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves")) { \ + __xstate_dump_leaves(); \ + } \ +} while (0) + +/* + * This essentially double-checks what the cpu told us about + * how large the XSAVE buffer needs to be. We are recalculating + * it to be safe. + */ +static void do_extra_xstate_size_checks(void) +{ + int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; + int i; + + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (!xfeature_enabled(i)) + continue; + /* + * Supervisor state components can be managed only by + * XSAVES, which is compacted-format only. + */ + if (!using_compacted_format()) + XSTATE_WARN_ON(xfeature_is_supervisor(i)); + + /* Align from the end of the previous feature */ + if (xfeature_is_aligned(i)) + paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64); + /* + * The offset of a given state in the non-compacted + * format is given to us in a CPUID leaf. We check + * them for being ordered (increasing offsets) in + * setup_xstate_features(). + */ + if (!using_compacted_format()) + paranoid_xstate_size = xfeature_uncompacted_offset(i); + /* + * The compacted-format offset always depends on where + * the previous state ended. + */ + paranoid_xstate_size += xfeature_size(i); + } + XSTATE_WARN_ON(paranoid_xstate_size != xstate_size); +} + /* * Calculate total size of enabled xstates in XCR0/xfeatures_mask. + * + * Note the SDM's wording here. "sub-function 0" only enumerates + * the size of the *user* states. If we use it to size a buffer + * that we use 'XSAVES' on, we could potentially overflow the + * buffer because 'XSAVES' saves system states too. + * + * Note that we do not currently set any bits on IA32_XSS so + * 'XCR0 | IA32_XSS == XCR0' for now. */ static unsigned int __init calculate_xstate_size(void) { unsigned int eax, ebx, ecx, edx; unsigned int calculated_xstate_size; - int i; if (!cpu_has_xsaves) { + /* + * - CPUID function 0DH, sub-function 0: + * EBX enumerates the size (in bytes) required by + * the XSAVE instruction for an XSAVE area + * containing all the *user* state components + * corresponding to bits currently set in XCR0. + */ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); calculated_xstate_size = ebx; - return calculated_xstate_size; - } - - calculated_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for (i = FIRST_EXTENDED_XFEATURE; i < 64; i++) { - if (xfeature_enabled(i)) { - cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); - calculated_xstate_size += eax; - } + } else { + /* + * - CPUID function 0DH, sub-function 1: + * EBX enumerates the size (in bytes) required by + * the XSAVES instruction for an XSAVE area + * containing all the state components + * corresponding to bits currently set in + * XCR0 | IA32_XSS. 
+ */ + cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + calculated_xstate_size = ebx; } return calculated_xstate_size; } @@ -365,6 +528,7 @@ static int init_xstate_size(void) * make it known to the world that we need more space. */ xstate_size = possible_xstate_size; + do_extra_xstate_size_checks(); return 0; } -- cgit v1.2.3 From e6e888f96b4a531886f3bf29ba9af0b6f1026365 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 2 Sep 2015 16:31:30 -0700 Subject: x86/fpu: Check to ensure increasing-offset xstate offsets The xstate CPUID leaves enumerate where each state component is inside the XSAVE buffer, along with the size of the entire buffer. Our new XSAVE sanity-checking code extrapolates an expected _total_ buffer size by looking at the last component that it encounters. That method requires that the highest-numbered component also be the one with the highest offset. This is a pretty safe assumption, but let's add some code to ensure it stays true. To make this check work correctly, we also need to ensure we only consider the offsets from enabled features because the offset register (ebx) will return 0 on unsupported features. This also means that we will preserve the -1's that we initialized xstate_offsets/sizes[] with. That will help find bugs. Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: dave@sr71.net Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20150902233130.0843AB15@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 35 ++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 470495504403..a8297f2ca2d1 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -184,6 +184,16 @@ void fpu__init_cpu_xstate(void) xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); } +/* + * Note that in the future we will likely need a pair of + * functions here: one for user xstates and the other for + * system xstates. For now, they are the same. + */ +static int xfeature_enabled(enum xfeature xfeature) +{ + return !!(xfeatures_mask & (1UL << xfeature)); +} + /* * Record the offsets and sizes of various xstates contained * in the XSAVE state memory layout. @@ -191,12 +201,25 @@ void fpu__init_cpu_xstate(void) static void __init setup_xstate_features(void) { u32 eax, ebx, ecx, edx, i; + /* start at the beginning of the "extended state" */ + unsigned int last_good_offset = offsetof(struct xregs_state, + extended_state_area); for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (!xfeature_enabled(i)) + continue; + cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); xstate_offsets[i] = ebx; xstate_sizes[i] = eax; + /* + * In our xstate size checks, we assume that the + * highest-numbered xstate feature has the + * highest offset in the buffer. Ensure it does.
+ */ + WARN_ONCE(last_good_offset > xstate_offsets[i], + "x86/fpu: misordered xstate at %d\n", last_good_offset); + last_good_offset = xstate_offsets[i]; printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, ebx, i, eax); } @@ -225,16 +248,6 @@ static void __init print_xstate_features(void) print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); } -/* - * Note that in the future we will likely need a pair of - * functions here: one for user xstates and the other for - * system xstates. For now, they are the same. - */ -static int xfeature_enabled(enum xfeature xfeature) -{ - return !!(xfeatures_mask & (1UL << xfeature)); -} - /* * This function sets up offsets and sizes of all extended states in * xsave area. This supports both standard format and compacted format -- cgit v1.2.3 From ef78f2a4bf84d8db9f36868decca2dc24e02a6af Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 2 Sep 2015 16:31:31 -0700 Subject: x86/fpu: Check CPU-provided sizes against struct declarations We now have C structures defined for each of the XSAVE state components that we support. This patch adds checks during our verification pass to ensure that the CPU-provided data enumerated in CPUID leaves matches our C structures. If not, we warn and dump all the XSAVE CPUID leaves. Note: this *actually* found an inconsistency with the MPX 'bndcsr' state. The hardware pads it out differently from our C structures. This patch caught it and warned. Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: dave@sr71.net Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20150902233131.A8DB36DA@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 45 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index a8297f2ca2d1..6454f2731b56 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -432,6 +432,49 @@ static void __xstate_dump_leaves(void) } \ } while (0) +#define XCHECK_SZ(sz, nr, nr_macro, __struct) do { \ + if ((nr == nr_macro) && \ + WARN_ONCE(sz != sizeof(__struct), \ + "%s: struct is %zu bytes, cpu state %d bytes\n", \ + __stringify(nr_macro), sizeof(__struct), sz)) { \ + __xstate_dump_leaves(); \ + } \ +} while (0) + +/* + * We have a C struct for each 'xstate'. We need to ensure + * that our software representation matches what the CPU + * tells us about the state's size. + */ +static void check_xstate_against_struct(int nr) +{ + /* + * Ask the CPU for the size of the state. + */ + int sz = xfeature_size(nr); + /* + * Match each CPU state with the corresponding software + * structure. + */ + XCHECK_SZ(sz, nr, XFEATURE_YMM, struct ymmh_struct); + XCHECK_SZ(sz, nr, XFEATURE_BNDREGS, struct mpx_bndreg_state); + XCHECK_SZ(sz, nr, XFEATURE_BNDCSR, struct mpx_bndcsr_state); + XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state); + XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state); + XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); + + /* + * Make *SURE* to add any feature numbers in below if + * there are "holes" in the xsave state component + * numbers. 
+ */ + if ((nr < XFEATURE_YMM) || + (nr >= XFEATURE_MAX)) { + WARN_ONCE(1, "no structure for xstate: %d\n", nr); + XSTATE_WARN_ON(1); + } +} + /* * This essentially double-checks what the cpu told us about * how large the XSAVE buffer needs to be. We are recalculating @@ -445,6 +488,8 @@ static void do_extra_xstate_size_checks(void) for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { if (!xfeature_enabled(i)) continue; + + check_xstate_against_struct(i); /* * Supervisor state components can be managed only by * XSAVES, which is compacted-format only. -- cgit v1.2.3 From d0a9964e98731c708500a2e712f28f9d39183647 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 12 Sep 2015 21:51:10 -0500 Subject: x86/platform/uv: Implement simple dump failover if kdump fails The ability to trigger a kdump using the system NMI command was added by commit 12ba6c990fab ("x86/UV: Add kdump to UV NMI handler") Author: Mike Travis Date: Mon Sep 23 16:25:03 2013 -0500 This is useful because when kdump is working the information gathered is more informative than the original per CPU stack traces or "dump" option. However, a number of things can go wrong with kdump and then the stack traces are more useful than nothing. The two most common reasons for kdump to not be available are: 1) if a problem occurs during boot before the kdump service is started, or 2) the kdump daemon failed to start. In either case the call to crash_kexec() returns unexpectedly. When this happens uv_nmi_kdump() also sets the uv_nmi_kexec_failed flag which causes the slave CPUs to also return to the NMI handler. Upon this unexpected return to the NMI handler, the NMI handler will revert to the "dump" action which uses show_regs() to obtain a process trace dump for all the CPUs. Other minor changes: The "dump" action now generates both the show_regs() stack trace and shows instruction pointer information, whereas the "ips" action only shows instruction pointers for non-idle CPUs. This is more like an abbreviated "ps" display. Change printk(KERN_DEFAULT...) --> pr_info() Signed-off-by: Mike Travis Signed-off-by: George Beshers Cc: Alex Thorlton Cc: Dimitri Sivanich Cc: Hedi Berriche Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russ Anderson Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/uv_nmi.c | 53 ++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index 5c9f63fa6abf..327f21c3bde1 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -376,38 +376,42 @@ static void uv_nmi_wait(int master) atomic_read(&uv_nmi_cpus_in_nmi), num_online_cpus()); } +/* Dump Instruction Pointer header */ static void uv_nmi_dump_cpu_ip_hdr(void) { - printk(KERN_DEFAULT - "\nUV: %4s %6s %-32s %s (Note: PID 0 not listed)\n", + pr_info("\nUV: %4s %6s %-32s %s (Note: PID 0 not listed)\n", "CPU", "PID", "COMMAND", "IP"); } +/* Dump Instruction Pointer info */ static void uv_nmi_dump_cpu_ip(int cpu, struct pt_regs *regs) { - printk(KERN_DEFAULT "UV: %4d %6d %-32.32s ", - cpu, current->pid, current->comm); + pr_info("UV: %4d %6d %-32.32s ", cpu, current->pid, current->comm); printk_address(regs->ip); } -/* Dump this cpu's state */ +/* + * Dump this CPU's state. If action was set to "kdump" and the crash_kexec + * failed, then we provide "dump" as an alternate action.
Action "dump" now + * also includes the show "ips" (instruction pointers) action whereas the + * action "ips" only displays instruction pointers for the non-idle CPU's. + * This is an abbreviated form of the "ps" command. + */ static void uv_nmi_dump_state_cpu(int cpu, struct pt_regs *regs) { const char *dots = " ................................. "; - if (uv_nmi_action_is("ips")) { - if (cpu == 0) - uv_nmi_dump_cpu_ip_hdr(); + if (cpu == 0) + uv_nmi_dump_cpu_ip_hdr(); - if (current->pid != 0) - uv_nmi_dump_cpu_ip(cpu, regs); + if (current->pid != 0 || !uv_nmi_action_is("ips")) + uv_nmi_dump_cpu_ip(cpu, regs); - } else if (uv_nmi_action_is("dump")) { - printk(KERN_DEFAULT - "UV:%sNMI process trace for CPU %d\n", dots, cpu); + if (uv_nmi_action_is("dump")) { + pr_info("UV:%sNMI process trace for CPU %d\n", dots, cpu); show_regs(regs); } + this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_DUMP_DONE); } @@ -469,8 +473,7 @@ static void uv_nmi_dump_state(int cpu, struct pt_regs *regs, int master) uv_nmi_trigger_dump(tcpu); } if (ignored) - printk(KERN_DEFAULT "UV: %d CPUs ignored NMI\n", - ignored); + pr_alert("UV: %d CPUs ignored NMI\n", ignored); console_loglevel = saved_console_loglevel; pr_alert("UV: process trace complete\n"); @@ -492,8 +495,9 @@ static void uv_nmi_touch_watchdogs(void) touch_nmi_watchdog(); } -#if defined(CONFIG_KEXEC_CORE) static atomic_t uv_nmi_kexec_failed; + +#if defined(CONFIG_KEXEC_CORE) static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) { /* Call crash to dump system state */ @@ -502,10 +506,9 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) crash_kexec(regs); pr_emerg("UV: crash_kexec unexpectedly returned, "); + atomic_set(&uv_nmi_kexec_failed, 1); if (!kexec_crash_image) { pr_cont("crash kernel not loaded\n"); - atomic_set(&uv_nmi_kexec_failed, 1); - uv_nmi_sync_exit(1); return; } pr_cont("kexec busy, stalling cpus while waiting\n"); @@ -514,9 +517,6 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) /* If crash exec fails the slaves should return, otherwise stall */ while (atomic_read(&uv_nmi_kexec_failed) == 0) mdelay(10); - - /* Crash kernel most likely not loaded, return in an orderly fashion */ - uv_nmi_sync_exit(0); } #else /* !CONFIG_KEXEC_CORE */ @@ -524,6 +524,7 @@ static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) { if (master) pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n"); + atomic_set(&uv_nmi_kexec_failed, 1); } #endif /* !CONFIG_KEXEC_CORE */ @@ -613,9 +614,14 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) master = (atomic_read(&uv_nmi_cpu) == cpu); /* If NMI action is "kdump", then attempt to do it */ - if (uv_nmi_action_is("kdump")) + if (uv_nmi_action_is("kdump")) { uv_nmi_kdump(cpu, master, regs); + /* Unexpected return, revert action to "dump" */ + if (master) + strncpy(uv_nmi_action, "dump", strlen(uv_nmi_action)); + } + /* Pause as all cpus enter the NMI handler */ uv_nmi_wait(master); @@ -640,6 +646,7 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) atomic_set(&uv_nmi_cpus_in_nmi, -1); atomic_set(&uv_nmi_cpu, -1); atomic_set(&uv_in_nmi, 0); + atomic_set(&uv_nmi_kexec_failed, 0); } uv_nmi_touch_watchdogs(); -- cgit v1.2.3 From 73477bbb09e7022063d1737c7322ad2e08968c23 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 27 Aug 2015 20:15:28 +0200 Subject: x86/fpu/math-emu: Remove !NO_UNDOC_CODE We always want to support all FPU opcodes, including undocumented ones. 
That define was fully justified ~20 years ago but not today. Let's not complicate the code with it. Signed-off-by: Denys Vlasenko Cc: Andy Lutomirski Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1440699330-1305-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/math-emu/fpu_entry.c | 36 ------------------------------------ 1 file changed, 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 3d8f2e421466..929b1d500823 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -40,8 +40,6 @@ #define __BAD__ FPU_illegal /* Illegal on an 80486, causes SIGILL */ -#ifndef NO_UNDOC_CODE /* Un-documented FPU op-codes supported by default. */ - /* WARNING: These codes are not documented by Intel in their 80486 manual and may not work on FPU clones or later Intel FPUs. */ @@ -68,21 +66,6 @@ static FUNC const st_instr_table[64] = { fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__, }; -#else /* Support only documented FPU op-codes */ - -static FUNC const st_instr_table[64] = { - fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, __BAD__, - fmul__, fxch_i, __BAD__, __BAD__, fmul_i, __BAD__, fmulp_, __BAD__, - fcom_st, fp_nop, __BAD__, __BAD__, __BAD__, fst_i_, __BAD__, __BAD__, - fcompst, __BAD__, __BAD__, __BAD__, __BAD__, fstp_i, fcompp, __BAD__, - fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_, - fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__, - fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__, - fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__, -}; - -#endif /* NO_UNDOC_CODE */ - #define _NONE_ 0 /* Take no special action */ #define _REG0_ 1 /* Need to check for not empty st(0) */ #define _REGI_ 2 /* Need to check for not empty st(0) and st(rm) */ @@ -94,10 +77,6 @@ static FUNC const st_instr_table[64] = { #define _REGIc 0 /* Compare st(0) and st(rm) */ #define _REGIn 0 /* Uses st(0) and st(rm), but handle checks later */ -#ifndef NO_UNDOC_CODE - -/* Un-documented FPU op-codes supported by default. 
(see above) */ - static u_char const type_table[64] = { _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _REGi_, _REGI_, _REGIn, _null_, _null_, _REGIi, _REGI_, _REGIp, _REGI_, @@ -109,21 +88,6 @@ static u_char const type_table[64] = { _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_ }; -#else /* Support only documented FPU op-codes */ - -static u_char const type_table[64] = { - _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _null_, - _REGI_, _REGIn, _null_, _null_, _REGIi, _null_, _REGIp, _null_, - _REGIc, _NONE_, _null_, _null_, _null_, _REG0_, _null_, _null_, - _REGIc, _null_, _null_, _null_, _null_, _REG0_, _REGIc, _null_, - _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, - _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_, - _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_, - _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_ -}; - -#endif /* NO_UNDOC_CODE */ - #ifdef RE_ENTRANT_CHECKING u_char emulating = 0; #endif /* RE_ENTRANT_CHECKING */ -- cgit v1.2.3 From c9e69c8c58eb8671e9f6cee728088e4c5abc9115 Mon Sep 17 00:00:00 2001 From: Andrzej Hajda Date: Fri, 7 Aug 2015 09:59:14 +0200 Subject: arch/x86/intel-mid: Use kmemdup rather than duplicating its implementation The patch was generated using a fixed coccinelle semantic patch scripts/coccinelle/api/memdup.cocci [1]. [1]: http://permalink.gmane.org/gmane.linux.kernel/2014320 Signed-off-by: Andrzej Hajda Cc: Bartlomiej Zolnierkiewicz Cc: Marek Szyprowski Cc: Julia Lawall Link: http://lkml.kernel.org/r/1438934377-4922-9-git-send-email-a.hajda@samsung.com Signed-off-by: Thomas Gleixner --- arch/x86/platform/intel-mid/sfi.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index ce992e8cc065..5ee360a951ce 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -197,10 +197,9 @@ static int __init sfi_parse_gpio(struct sfi_table_header *table) num = SFI_GET_NUM_ENTRIES(sb, struct sfi_gpio_table_entry); pentry = (struct sfi_gpio_table_entry *)sb->pentry; - gpio_table = kmalloc(num * sizeof(*pentry), GFP_KERNEL); + gpio_table = kmemdup(pentry, num * sizeof(*pentry), GFP_KERNEL); if (!gpio_table) return -1; - memcpy(gpio_table, pentry, num * sizeof(*pentry)); gpio_num_entry = num; pr_debug("GPIO pin info:\n"); -- cgit v1.2.3 From 5e176213a6b2bc5146820c79542d37290434a3c4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 14 Sep 2015 14:47:02 -0700 Subject: perf/x86/intel: Make the CYCLE_ACTIVITY.* constraint on Broadwell more specific The counter constraint for CYCLE_ACTIVITY.* on Broadwell covered all CYCLE_ACTIVITY.* sub events, and forced them on counter 2. But actually only one sub event (umask 8) needs to be on counter 2; all others do not have any constraint. Only force that subevent (see the sketch below).
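For illustration, here is a simplified, self-contained sketch of the difference between the two constraint flavors. The types and helper below are stand-ins for the kernel's internals, not its actual code; in a raw event config, the event select occupies bits 0-7 and the umask bits 8-15.

#include <stdint.h>
#include <stdio.h>

/* stand-ins for the kernel's event_constraint machinery */
struct event_constraint {
	uint64_t code;   /* config value to match                */
	uint64_t cmask;  /* which config bits participate        */
	uint64_t idxmsk; /* bitmap of counters the event may use */
};

/* match on event select only (INTEL_EVENT_CONSTRAINT-style) */
#define EVENT_CONSTRAINT(c, n)	{ (c), 0xffULL, (n) }
/* match on event select + umask (INTEL_UEVENT_CONSTRAINT-style) */
#define UEVENT_CONSTRAINT(c, n)	{ (c), 0xffffULL, (n) }

static const struct event_constraint bdw_constraints[] = {
	/* old: EVENT_CONSTRAINT(0xa3, 0x4) forced every 0xa3 umask here */
	UEVENT_CONSTRAINT(0x8a3, 0x4),	/* only umask 0x08 on counter 2 */
	{ 0, 0, 0 },			/* sentinel */
};

static uint64_t allowed_counters(uint64_t config)
{
	const struct event_constraint *c;

	for (c = bdw_constraints; c->cmask; c++)
		if ((config & c->cmask) == c->code)
			return c->idxmsk;
	return 0xf;	/* unconstrained: any of counters 0-3 */
}

int main(void)
{
	printf("event 0xa3 umask 0x08 -> counter mask %#llx\n",
	       (unsigned long long)allowed_counters(0x08a3));
	printf("event 0xa3 umask 0x06 -> counter mask %#llx\n",
	       (unsigned long long)allowed_counters(0x06a3));
	return 0;
}

With the old event-only constraint, both lookups would have matched on the 0xa3 event code and every CYCLE_ACTIVITY.* sub-event would have competed for counter 2; matching on event-plus-umask constrains only the 0x8a3 sub-event and leaves the rest free.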
This fixes groups with multiple CYCLE_ACTIVITY.* events, for example: % perf stat -x, -e '{cpu/event=0xa3,umask=0x6,cmask=6/,\ cpu/event=0xa2,umask=0x8/,\ cpu/event=0xa3,umask=0x4,cmask=4/,cpu/event=0xb1,umask=0x1,cmask=1/}' true 122150,,cpu/event=0xa3,umask=0x6,cmask=6/,846486,100.00 16483,,cpu/event=0xa2,umask=0x8/,846486,100.00 252280,,cpu/event=0xa3,umask=0x4,cmask=4/,846486,100.00 233604,,cpu/event=0xb1,umask=0x1,cmask=1/,846486,100.00 % Without this patch the third result would be <not counted>. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1442267222-16464-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 3fefebfbdf4b..1d84b41ba932 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -250,7 +250,7 @@ struct event_constraint intel_bdw_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_EVENT_CONSTRAINT(0xa3, 0x4), /* CYCLE_ACTIVITY.* */ + INTEL_UEVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */ EVENT_CONSTRAINT_END }; -- cgit v1.2.3 From d0dc8494cd6904f8ad035d9ad97f313948f35d0c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 9 Sep 2015 14:53:59 -0700 Subject: perf/x86/intel/pebs: Add PEBS frontend profiling for Skylake Skylake has a new FRONTEND_LATENCY PEBS event to accurately profile frontend problems (like ITLB or decoding issues). The new event is configured through a separate MSR, which selects a range of sub events. Define the extra MSR as an extra reg and export support for it through sysfs. To avoid duplicating the existing tables, use a new function to add new entries to existing tables.
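To make the extra-reg mechanism concrete, here is a minimal sketch of the lookup it implies. The struct and function are simplified stand-ins for the kernel's internals; only the 0x01c6 event and the 0x3fff17 valid-bits mask are taken from the table entry added in the diff below.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define MSR_PEBS_FRONTEND 0x000003f7

/* simplified stand-in for the kernel's 'struct extra_reg' */
struct extra_reg {
	uint64_t event;       /* eventsel+umask value to match      */
	uint64_t config_mask; /* config bits compared against event */
	uint64_t valid_mask;  /* bits a user may set in config1     */
	uint32_t msr;         /* extra MSR programmed from config1  */
};

static const struct extra_reg skl_extra_regs[] = {
	/* mirrors INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x3fff17, FE) */
	{ .event = 0x01c6, .config_mask = 0xffff,
	  .valid_mask = 0x3fff17, .msr = MSR_PEBS_FRONTEND },
};

/*
 * Return the extra MSR for a raw event, or 0 when no entry matches or
 * when config1 sets bits outside the valid mask (such an event must be
 * rejected rather than risk a #GP on the wrmsr).
 */
static uint32_t extra_msr_for_event(uint64_t config, uint64_t config1)
{
	size_t i;

	for (i = 0; i < sizeof(skl_extra_regs) / sizeof(skl_extra_regs[0]); i++) {
		const struct extra_reg *er = &skl_extra_regs[i];

		if ((config & er->config_mask) != er->event)
			continue;
		if (config1 & ~er->valid_mask)
			return 0;
		return er->msr;
	}
	return 0;
}

int main(void)
{
	/* event 0xc6, umask 0x01, with a frontend sub-event in config1 */
	printf("extra MSR: %#x\n", extra_msr_for_event(0x01c6, 0x17));
	return 0;
}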
Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1435707205-6676-4-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_intel.c | 11 ++++++++++- 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index c1c0a1c14344..54390bc140dd 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -141,6 +141,8 @@ #define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10) #define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11) +#define MSR_PEBS_FRONTEND 0x000003f7 + #define MSR_IA32_POWER_CTL 0x000001fc #define MSR_IA32_MC0_CTL 0x00000400 diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 5edf6d868fc1..165be83a7fa4 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -47,6 +47,7 @@ enum extra_reg_type { EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ EXTRA_REG_LBR = 2, /* lbr_select */ EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */ + EXTRA_REG_FE = 4, /* fe_* */ EXTRA_REG_MAX /* number of entries needed */ }; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 1d84b41ba932..ef74c9d05f82 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -205,6 +205,7 @@ static struct extra_reg intel_skl_extra_regs[] __read_mostly = { INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x3fff17, FE), EVENT_EXTRA_END }; @@ -2891,6 +2892,8 @@ PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); PMU_FORMAT_ATTR(ldlat, "config1:0-15"); +PMU_FORMAT_ATTR(frontend, "config1:0-23"); + static struct attribute *intel_arch3_formats_attr[] = { &format_attr_event.attr, &format_attr_umask.attr, @@ -2907,6 +2910,11 @@ static struct attribute *intel_arch3_formats_attr[] = { NULL, }; +static struct attribute *skl_format_attr[] = { + &format_attr_frontend.attr, + NULL, +}; + static __initconst const struct x86_pmu core_pmu = { .name = "core", .handle_irq = x86_pmu_handle_irq, @@ -3516,7 +3524,8 @@ __init int intel_pmu_init(void) x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.cpu_events = hsw_events_attrs; + x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr, + skl_format_attr); WARN_ON(!x86_pmu.format_attrs); x86_pmu.cpu_events = hsw_events_attrs; pr_cont("Skylake events, "); -- cgit v1.2.3 From dfe1f3cb312624928052413928d88b0ee3492216 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 9 Sep 2015 14:54:00 -0700 Subject: perf/x86/intel: Fix Skylake FRONTEND MSR extrareg mask Stephane pointed out that the extrareg mask was one bit too short. The bubble width field was truncated by one bit. Fix that here. Also add some extra comments on the reserved bits inside the event select code. 
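The off-by-one-bit nature of the bug is easy to demonstrate in isolation. In the sketch below, treating bit 22 as the top bit of the bubble-width field is inferred from the two masks in the diff that follows; only the mask values themselves come from the patch.

#include <stdint.h>
#include <stdio.h>

#define FE_VALID_MASK_OLD 0x3fff17ULL	/* bits 0-2, 4, 8-21: drops bit 22 */
#define FE_VALID_MASK_NEW 0x7fff17ULL	/* bits 0-2, 4, 8-22: keeps bit 22 */

int main(void)
{
	uint64_t config1 = 1ULL << 22;	/* highest bubble-width bit */

	printf("old mask passes bit 22: %s\n",
	       (config1 & FE_VALID_MASK_OLD) ? "yes" : "no");	/* no  */
	printf("new mask passes bit 22: %s\n",
	       (config1 & FE_VALID_MASK_NEW) ? "yes" : "no");	/* yes */
	return 0;
}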
Reported-by: Stephane Eranian Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1441835640-21347-3-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index ef74c9d05f82..f63360be2238 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -205,7 +205,11 @@ static struct extra_reg intel_skl_extra_regs[] __read_mostly = { INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), - INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x3fff17, FE), + /* + * Note the low 8 bits eventsel code is not a continuous field, containing + * some #GPing bits. These are masked out. + */ + INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE), EVENT_EXTRA_END }; -- cgit v1.2.3 From 96f3eda67fcf2598e9d2794398e0e7ab35138ea6 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 14 Sep 2015 10:14:07 -0400 Subject: perf/x86/intel: Fix static checker warning in lbr enable Commit deb27519bf1f ("perf/x86/intel: Fix LBR callstack issue caused by FREEZE_LBRS_ON_PMI") leads to the following Smatch complaint: warn: variable dereferenced before check 'cpuc->lbr_sel' (see line 154) Fix the warning. Reported-by: Dan Carpenter Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: deb27519bf1f ("perf/x86/intel: Fix LBR callstack issue caused by FREEZE_LBRS_ON_PMI") Link: http://lkml.kernel.org/r/1442240047-48149-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index a1d07c71d3b6..ad0b8b0490a0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -151,8 +151,9 @@ static void __intel_pmu_lbr_enable(bool pmi) * No need to reprogram LBR_SELECT in a PMI, as it * did not change. */ - lbr_select = cpuc->lbr_sel->config; - if (cpuc->lbr_sel && !pmi) + if (cpuc->lbr_sel) + lbr_select = cpuc->lbr_sel->config; + if (!pmi) wrmsrl(MSR_LBR_SELECT, lbr_select); rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); -- cgit v1.2.3 From 4aef363e48177d451b4d263c69dd2c86437e988b Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 18 Sep 2015 16:53:28 +0200 Subject: x86/fpu/math-emu: Remove define layer for undocumented opcodes No code changes. Signed-off-by: Denys Vlasenko Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1442588010-20055-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/math-emu/fpu_entry.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 929b1d500823..65afd46643f0 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -40,30 +40,29 @@ #define __BAD__ FPU_illegal /* Illegal on an 80486, causes SIGILL */ -/* WARNING: These codes are not documented by Intel in their 80486 manual - and may not work on FPU clones or later Intel FPUs. */ - -/* Changes to support the un-doc codes provided by Linus Torvalds. */ - -#define _d9_d8_ fstp_i /* unofficial code (19) */ -#define _dc_d0_ fcom_st /* unofficial code (14) */ -#define _dc_d8_ fcompst /* unofficial code (1c) */ -#define _dd_c8_ fxch_i /* unofficial code (0d) */ -#define _de_d0_ fcompst /* unofficial code (16) */ -#define _df_c0_ ffreep /* unofficial code (07) ffree + pop */ -#define _df_c8_ fxch_i /* unofficial code (0f) */ -#define _df_d0_ fstp_i /* unofficial code (17) */ -#define _df_d8_ fstp_i /* unofficial code (1f) */ +/* WARNING: "u" entries are not documented by Intel in their 80486 manual + and may not work on FPU clones or later Intel FPUs. + Changes to support them provided by Linus Torvalds. */ static FUNC const st_instr_table[64] = { - fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, _df_c0_, - fmul__, fxch_i, __BAD__, __BAD__, fmul_i, _dd_c8_, fmulp_, _df_c8_, - fcom_st, fp_nop, __BAD__, __BAD__, _dc_d0_, fst_i_, _de_d0_, _df_d0_, - fcompst, _d9_d8_, __BAD__, __BAD__, _dc_d8_, fstp_i, fcompp, _df_d8_, - fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_, - fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__, - fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__, - fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__, +/* Opcode: d8 d9 da db */ +/* dc dd de df */ +/* c0..7 */ fadd__, fld_i_, __BAD__, __BAD__, +/* c0..7 */ fadd_i, ffree_, faddp_, ffreep,/*u*/ +/* c8..f */ fmul__, fxch_i, __BAD__, __BAD__, +/* c8..f */ fmul_i, fxch_i,/*u*/ fmulp_, fxch_i,/*u*/ +/* d0..7 */ fcom_st, fp_nop, __BAD__, __BAD__, +/* d0..7 */ fcom_st,/*u*/ fst_i_, fcompst,/*u*/ fstp_i,/*u*/ +/* d8..f */ fcompst, fstp_i,/*u*/ __BAD__, __BAD__, +/* d8..f */ fcompst,/*u*/ fstp_i, fcompp, fstp_i,/*u*/ +/* e0..7 */ fsub__, FPU_etc, __BAD__, finit_, +/* e0..7 */ fsubri, fucom_, fsubrp, fstsw_, +/* e8..f */ fsubr_, fconst, fucompp, __BAD__, +/* e8..f */ fsub_i, fucomp, fsubp_, __BAD__, +/* f0..7 */ fdiv__, FPU_triga, __BAD__, __BAD__, +/* f0..7 */ fdivri, __BAD__, fdivrp, __BAD__, +/* f8..f */ fdivr_, FPU_trigb, __BAD__, __BAD__, +/* f8..f */ fdiv_i, __BAD__, fdivp_, __BAD__, }; #define _NONE_ 0 /* Take no special action */ -- cgit v1.2.3 From b8e4a910e576961009a87d07f6b7eff67c5c2e34 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 18 Sep 2015 16:53:29 +0200 Subject: x86/fpu/math-emu: Add support for F[U]COMI[P] insns Run-tested by booting with "no387 nofxsr" and running test program: [RUN] Testing f[u]comi[p] instructions [OK] f[u]comi[p] Signed-off-by: Denys Vlasenko Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1442588010-20055-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/math-emu/fpu_entry.c | 27 +++++---- arch/x86/math-emu/fpu_proto.h | 4 ++ arch/x86/math-emu/reg_compare.c | 128 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 147 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 65afd46643f0..4ecf68342d7b 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -40,6 +40,8 @@ #define __BAD__ FPU_illegal /* Illegal on an 80486, causes SIGILL */ +/* f(u)comi(p) are enabled if CPUID(1).EDX(15) "cmov" is set */ + /* WARNING: "u" entries are not documented by Intel in their 80486 manual and may not work on FPU clones or later Intel FPUs. Changes to support them provided by Linus Torvalds. */ @@ -57,10 +59,10 @@ static FUNC const st_instr_table[64] = { /* d8..f */ fcompst,/*u*/ fstp_i, fcompp, fstp_i,/*u*/ /* e0..7 */ fsub__, FPU_etc, __BAD__, finit_, /* e0..7 */ fsubri, fucom_, fsubrp, fstsw_, -/* e8..f */ fsubr_, fconst, fucompp, __BAD__, -/* e8..f */ fsub_i, fucomp, fsubp_, __BAD__, -/* f0..7 */ fdiv__, FPU_triga, __BAD__, __BAD__, -/* f0..7 */ fdivri, __BAD__, fdivrp, __BAD__, +/* e8..f */ fsubr_, fconst, fucompp, fucomi_, +/* e8..f */ fsub_i, fucomp, fsubp_, fucomip, +/* f0..7 */ fdiv__, FPU_triga, __BAD__, fcomi_, +/* f0..7 */ fdivri, __BAD__, fdivrp, fcomip, /* f8..f */ fdivr_, FPU_trigb, __BAD__, __BAD__, /* f8..f */ fdiv_i, __BAD__, fdivp_, __BAD__, }; @@ -77,14 +79,15 @@ static FUNC const st_instr_table[64] = { #define _REGIn 0 /* Uses st(0) and st(rm), but handle checks later */ static u_char const type_table[64] = { - _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _REGi_, - _REGI_, _REGIn, _null_, _null_, _REGIi, _REGI_, _REGIp, _REGI_, - _REGIc, _NONE_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, - _REGIc, _REG0_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, - _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, - _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_, - _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_, - _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_ +/* Opcode: d8 d9 da db dc dd de df */ +/* c0..7 */ _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _REGi_, +/* c8..f */ _REGI_, _REGIn, _null_, _null_, _REGIi, _REGI_, _REGIp, _REGI_, +/* d0..7 */ _REGIc, _NONE_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, +/* d8..f */ _REGIc, _REG0_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, +/* e0..7 */ _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, +/* e8..f */ _REGI_, _NONE_, _REGIc, _REGIc, _REGIi, _REGIc, _REGIp, _REGIc, +/* f0..7 */ _REGI_, _NONE_, _null_, _REGIc, _REGIi, _null_, _REGIp, _REGIc, +/* f8..f */ _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_, }; #ifdef RE_ENTRANT_CHECKING diff --git a/arch/x86/math-emu/fpu_proto.h b/arch/x86/math-emu/fpu_proto.h index 9779df436b7d..1f8d130663ad 100644 --- a/arch/x86/math-emu/fpu_proto.h +++ b/arch/x86/math-emu/fpu_proto.h @@ -108,6 +108,10 @@ extern void fcompp(void); extern void fucom_(void); extern void fucomp(void); extern void fucompp(void); +extern void fcomi_(void); +extern void fcomip(void); +extern void fucomi_(void); +extern void fucomip(void); /* reg_constant.c */ extern void fconst(void); /* reg_ld_str.c */ diff 
--git a/arch/x86/math-emu/reg_compare.c b/arch/x86/math-emu/reg_compare.c index ecce55fc2e2e..b77360fdbf4a 100644 --- a/arch/x86/math-emu/reg_compare.c +++ b/arch/x86/math-emu/reg_compare.c @@ -249,6 +249,54 @@ static int compare_st_st(int nr) return 0; } +static int compare_i_st_st(int nr) +{ + int f, c; + FPU_REG *st_ptr; + + if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) { + FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF); + /* Stack fault */ + EXCEPTION(EX_StackUnder); + return !(control_word & CW_Invalid); + } + + partial_status &= ~SW_C0; + st_ptr = &st(nr); + c = compare(st_ptr, FPU_gettagi(nr)); + if (c & COMP_NaN) { + FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF); + EXCEPTION(EX_Invalid); + return !(control_word & CW_Invalid); + } + + switch (c & 7) { + case COMP_A_lt_B: + f = X86_EFLAGS_CF; + break; + case COMP_A_eq_B: + f = X86_EFLAGS_ZF; + break; + case COMP_A_gt_B: + f = 0; + break; + case COMP_No_Comp: + f = X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF; + break; +#ifdef PARANOID + default: + EXCEPTION(EX_INTERNAL | 0x122); + f = 0; + break; +#endif /* PARANOID */ + } + FPU_EFLAGS = (FPU_EFLAGS & ~(X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF)) | f; + if (c & COMP_Denormal) { + return denormal_operand() < 0; + } + return 0; +} + static int compare_u_st_st(int nr) { int f = 0, c; @@ -299,6 +347,58 @@ static int compare_u_st_st(int nr) return 0; } +static int compare_ui_st_st(int nr) +{ + int f = 0, c; + FPU_REG *st_ptr; + + if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) { + FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF); + /* Stack fault */ + EXCEPTION(EX_StackUnder); + return !(control_word & CW_Invalid); + } + + partial_status &= ~SW_C0; + st_ptr = &st(nr); + c = compare(st_ptr, FPU_gettagi(nr)); + if (c & COMP_NaN) { + FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF); + if (c & COMP_SNaN) { /* This is the only difference between + un-ordered and ordinary comparisons */ + EXCEPTION(EX_Invalid); + return !(control_word & CW_Invalid); + } + return 0; + } + + switch (c & 7) { + case COMP_A_lt_B: + f = X86_EFLAGS_CF; + break; + case COMP_A_eq_B: + f = X86_EFLAGS_ZF; + break; + case COMP_A_gt_B: + f = 0; + break; + case COMP_No_Comp: + f = X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF; + break; +#ifdef PARANOID + default: + EXCEPTION(EX_INTERNAL | 0x123); + f = 0; + break; +#endif /* PARANOID */ + } + FPU_EFLAGS = (FPU_EFLAGS & ~(X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF)) | f; + if (c & COMP_Denormal) { + return denormal_operand() < 0; + } + return 0; +} + /*---------------------------------------------------------------------------*/ void fcom_st(void) @@ -348,3 +448,31 @@ void fucompp(void) } else FPU_illegal(); } + +/* P6+ compare-to-EFLAGS ops */ + +void fcomi_(void) +{ + /* fcomi st(i) */ + compare_i_st_st(FPU_rm); +} + +void fcomip(void) +{ + /* fcomip st(i) */ + if (!compare_i_st_st(FPU_rm)) + FPU_pop(); +} + +void fucomi_(void) +{ + /* fucomi st(i) */ + compare_ui_st_st(FPU_rm); +} + +void fucomip(void) +{ + /* fucomip st(i) */ + if (!compare_ui_st_st(FPU_rm)) + FPU_pop(); +} -- cgit v1.2.3 From 9a9d8642d03a7512f78cbe7ed6a2011fad3cbca3 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 18 Sep 2015 16:53:30 +0200 Subject: x86/fpu/math-emu: Add support for FCMOVcc insns Run-tested by booting with "no387 nofxsr" and running test program: [RUN] Testing fcmovCC instructions [OK] fcmovCC Signed-off-by: Denys Vlasenko Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1442588010-20055-3-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/math-emu/fpu_aux.c | 70 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/math-emu/fpu_entry.c | 18 +++++------ arch/x86/math-emu/fpu_proto.h | 8 +++++ 3 files changed, 87 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c index dd76a05729b0..024f6e971174 100644 --- a/arch/x86/math-emu/fpu_aux.c +++ b/arch/x86/math-emu/fpu_aux.c @@ -169,6 +169,76 @@ void fxch_i(void) fpu_tag_word = tag_word; } +static void fcmovCC(void) +{ + /* fcmovCC st(i) */ + int i = FPU_rm; + FPU_REG *st0_ptr = &st(0); + FPU_REG *sti_ptr = &st(i); + long tag_word = fpu_tag_word; + int regnr = top & 7; + int regnri = (top + i) & 7; + u_char sti_tag = (tag_word >> (regnri * 2)) & 3; + + if (sti_tag == TAG_Empty) { + FPU_stack_underflow(); + clear_C1(); + return; + } + reg_copy(sti_ptr, st0_ptr); + tag_word &= ~(3 << (regnr * 2)); + tag_word |= (sti_tag << (regnr * 2)); + fpu_tag_word = tag_word; +} + +void fcmovb(void) +{ + if (FPU_EFLAGS & X86_EFLAGS_CF) + fcmovCC(); +} + +void fcmove(void) +{ + if (FPU_EFLAGS & X86_EFLAGS_ZF) + fcmovCC(); +} + +void fcmovbe(void) +{ + if (FPU_EFLAGS & (X86_EFLAGS_CF|X86_EFLAGS_ZF)) + fcmovCC(); +} + +void fcmovu(void) +{ + if (FPU_EFLAGS & X86_EFLAGS_PF) + fcmovCC(); +} + +void fcmovnb(void) +{ + if (!(FPU_EFLAGS & X86_EFLAGS_CF)) + fcmovCC(); +} + +void fcmovne(void) +{ + if (!(FPU_EFLAGS & X86_EFLAGS_ZF)) + fcmovCC(); +} + +void fcmovnbe(void) +{ + if (!(FPU_EFLAGS & (X86_EFLAGS_CF|X86_EFLAGS_ZF))) + fcmovCC(); +} + +void fcmovnu(void) +{ + if (!(FPU_EFLAGS & X86_EFLAGS_PF)) + fcmovCC(); +} + void ffree_(void) { /* ffree st(i) */ diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 4ecf68342d7b..e945fedf1de2 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -40,7 +40,7 @@ #define __BAD__ FPU_illegal /* Illegal on an 80486, causes SIGILL */ -/* f(u)comi(p) are enabled if CPUID(1).EDX(15) "cmov" is set */ +/* fcmovCC and f(u)comi(p) are enabled if CPUID(1).EDX(15) "cmov" is set */ /* WARNING: "u" entries are not documented by Intel in their 80486 manual and may not work on FPU clones or later Intel FPUs. 
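Both of these math-emu additions come down to bookkeeping on the same three EFLAGS bits: the f[u]comi[p] patch encodes the comparison outcome into ZF/PF/CF (unordered sets all three), and the fcmovCC patch tests those bits to decide whether st(i) is copied into st(0). The stand-alone C sketch below is purely illustrative — it is not the emulator's code, and fcomi_flags()/fcmov_taken() are invented names — but it captures the flag mapping, assuming the architectural bit positions CF=bit 0, PF=bit 2, ZF=bit 6:

#include <stdio.h>
#include <string.h>

#define X86_EFLAGS_CF 0x0001	/* carry */
#define X86_EFLAGS_PF 0x0004	/* parity */
#define X86_EFLAGS_ZF 0x0040	/* zero */

/* fcomi/fucomi: encode a floating-point comparison into ZF/PF/CF */
static unsigned int fcomi_flags(double a, double b)
{
	if (a != a || b != b)	/* NaN involved: unordered */
		return X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF;
	if (a < b)
		return X86_EFLAGS_CF;
	if (a == b)
		return X86_EFLAGS_ZF;
	return 0;		/* a > b: all three bits clear */
}

/* fcmovb/e/be/u: copy st(i) to st(0) only if the condition holds;
   the nb/ne/nbe/nu variants are the exact negations */
static int fcmov_taken(const char *cc, unsigned int eflags)
{
	if (!strcmp(cc, "b"))
		return (eflags & X86_EFLAGS_CF) != 0;
	if (!strcmp(cc, "e"))
		return (eflags & X86_EFLAGS_ZF) != 0;
	if (!strcmp(cc, "be"))
		return (eflags & (X86_EFLAGS_CF | X86_EFLAGS_ZF)) != 0;
	if (!strcmp(cc, "u"))
		return (eflags & X86_EFLAGS_PF) != 0;
	return 0;
}

int main(void)
{
	unsigned int fl = fcomi_flags(1.0, 2.0);

	printf("1.0 vs 2.0 -> flags %#x, fcmovb taken: %d\n",
	       fl, fcmov_taken("b", fl));
	return 0;
}

Unlike the sketch, compare_i_st_st() above must also raise the proper FPU exceptions: a stack underflow triggers EXCEPTION(EX_StackUnder), and a NaN operand (any NaN for fcomi, only a signalling NaN for fucomi) triggers EXCEPTION(EX_Invalid) in addition to setting ZF/PF/CF.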
@@ -49,13 +49,13 @@ static FUNC const st_instr_table[64] = { /* Opcode: d8 d9 da db */ /* dc dd de df */ -/* c0..7 */ fadd__, fld_i_, __BAD__, __BAD__, +/* c0..7 */ fadd__, fld_i_, fcmovb, fcmovnb, /* c0..7 */ fadd_i, ffree_, faddp_, ffreep,/*u*/ -/* c8..f */ fmul__, fxch_i, __BAD__, __BAD__, +/* c8..f */ fmul__, fxch_i, fcmove, fcmovne, /* c8..f */ fmul_i, fxch_i,/*u*/ fmulp_, fxch_i,/*u*/ -/* d0..7 */ fcom_st, fp_nop, __BAD__, __BAD__, +/* d0..7 */ fcom_st, fp_nop, fcmovbe, fcmovnbe, /* d0..7 */ fcom_st,/*u*/ fst_i_, fcompst,/*u*/ fstp_i,/*u*/ -/* d8..f */ fcompst, fstp_i,/*u*/ __BAD__, __BAD__, +/* d8..f */ fcompst, fstp_i,/*u*/ fcmovu, fcmovnu, /* d8..f */ fcompst,/*u*/ fstp_i, fcompp, fstp_i,/*u*/ /* e0..7 */ fsub__, FPU_etc, __BAD__, finit_, /* e0..7 */ fsubri, fucom_, fsubrp, fstsw_, @@ -80,10 +80,10 @@ static FUNC const st_instr_table[64] = { static u_char const type_table[64] = { /* Opcode: d8 d9 da db dc dd de df */ -/* c0..7 */ _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _REGi_, -/* c8..f */ _REGI_, _REGIn, _null_, _null_, _REGIi, _REGI_, _REGIp, _REGI_, -/* d0..7 */ _REGIc, _NONE_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, -/* d8..f */ _REGIc, _REG0_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, +/* c0..7 */ _REGI_, _NONE_, _REGIn, _REGIn, _REGIi, _REGi_, _REGIp, _REGi_, +/* c8..f */ _REGI_, _REGIn, _REGIn, _REGIn, _REGIi, _REGI_, _REGIp, _REGI_, +/* d0..7 */ _REGIc, _NONE_, _REGIn, _REGIn, _REGIc, _REG0_, _REGIc, _REG0_, +/* d8..f */ _REGIc, _REG0_, _REGIn, _REGIn, _REGIc, _REG0_, _REGIc, _REG0_, /* e0..7 */ _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, /* e8..f */ _REGI_, _NONE_, _REGIc, _REGIc, _REGIi, _REGIc, _REGIp, _REGIc, /* f0..7 */ _REGI_, _NONE_, _null_, _REGIc, _REGIi, _null_, _REGIp, _REGIc, diff --git a/arch/x86/math-emu/fpu_proto.h b/arch/x86/math-emu/fpu_proto.h index 1f8d130663ad..caff438b9c1d 100644 --- a/arch/x86/math-emu/fpu_proto.h +++ b/arch/x86/math-emu/fpu_proto.h @@ -46,6 +46,14 @@ extern void fstsw_(void); extern void fp_nop(void); extern void fld_i_(void); extern void fxch_i(void); +extern void fcmovb(void); +extern void fcmove(void); +extern void fcmovbe(void); +extern void fcmovu(void); +extern void fcmovnb(void); +extern void fcmovne(void); +extern void fcmovnbe(void); +extern void fcmovnu(void); extern void ffree_(void); extern void ffreep(void); extern void fst_i_(void); -- cgit v1.2.3 From 3dc33bd30f3e1c1bcaaafa3482737694debf0f0b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 12 Aug 2015 17:55:19 -0700 Subject: x86/entry/vsyscall: Add CONFIG to control default Most modern systems can run with vsyscall=none. In an effort to provide a way for build-time defaults to lack legacy settings, this adds a new CONFIG to select the type of vsyscall mapping to use, similar to the existing "vsyscall" command line parameter. Signed-off-by: Kees Cook Acked-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Triplett Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150813005519.GA11696@www.outflux.net Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 49 +++++++++++++++++++++++++++++++++++ arch/x86/entry/vsyscall/vsyscall_64.c | 9 ++++++- 2 files changed, 57 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 328c8352480c..9bfb9e15a6e8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2042,6 +2042,55 @@ config COMPAT_VDSO If unsure, say N: if you are compiling your own kernel, you are unlikely to be using a buggy version of glibc. +choice + prompt "vsyscall table for legacy applications" + depends on X86_64 + default LEGACY_VSYSCALL_EMULATE + help + Legacy user code that does not know how to find the vDSO expects + to be able to issue three syscalls by calling fixed addresses in + kernel space. Since this location is not randomized with ASLR, + it can be used to assist security vulnerability exploitation. + + This setting can be changed at boot time via the kernel command + line parameter vsyscall=[native|emulate|none]. + + On a system with recent enough glibc (2.14 or newer) and no + static binaries, you can say None without a performance penalty + to improve security. + + If unsure, select "Emulate". + + config LEGACY_VSYSCALL_NATIVE + bool "Native" + help + Actual executable code is located in the fixed vsyscall + address mapping, implementing time() efficiently. Since + this makes the mapping executable, it can be used during + security vulnerability exploitation (traditionally as + ROP gadgets). This configuration is not recommended. + + config LEGACY_VSYSCALL_EMULATE + bool "Emulate" + help + The kernel traps and emulates calls into the fixed + vsyscall address mapping. This makes the mapping + non-executable, but it still contains known contents, + which could be used in certain rare security vulnerability + exploits. This configuration is recommended when userspace + still uses the vsyscall area. + + config LEGACY_VSYSCALL_NONE + bool "None" + help + There will be no vsyscall mapping at all. This will + eliminate any risk of ASLR bypass due to the vsyscall + fixed address mapping. Attempts to use the vsyscalls + will be reported to dmesg, so that either old or + malicious userspace programs can be identified. + +endchoice + config CMDLINE_BOOL bool "Built-in kernel command line" ---help--- diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index b160c0c6baed..76e0fd3ea1fb 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -38,7 +38,14 @@ #define CREATE_TRACE_POINTS #include "vsyscall_trace.h" -static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; +static enum { EMULATE, NATIVE, NONE } vsyscall_mode = +#ifdef CONFIG_LEGACY_VSYSCALL_NATIVE + NATIVE; +#elif CONFIG_LEGACY_VSYSCALL_NONE + NONE; +#else + EMULATE; +#endif static int __init vsyscall_setup(char *str) { -- cgit v1.2.3 From e4877d64f00964d86a6e4a023011cccc73899018 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 18 Sep 2015 20:23:34 +0200 Subject: x86/fpu/math-emu: Add support for FISTTP instructions These FPU instructions were added in SSE3-enabled CPUs. Run-tested by booting with "no387 nofxsr" and running test program: [RUN] Testing fisttp instructions [OK] fisttp Signed-off-by: Denys Vlasenko Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1442600614-28428-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/math-emu/load_store.c | 63 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c index 2931ff355218..95228ff042c0 100644 --- a/arch/x86/math-emu/load_store.c +++ b/arch/x86/math-emu/load_store.c @@ -33,11 +33,12 @@ #define pop_0() { FPU_settag0(TAG_Empty); top++; } +/* index is a 5-bit value: (3-bit FPU_modrm.reg field | opcode[2,1]) */ static u_char const type_table[32] = { - _PUSH_, _PUSH_, _PUSH_, _PUSH_, - _null_, _null_, _null_, _null_, - _REG0_, _REG0_, _REG0_, _REG0_, - _REG0_, _REG0_, _REG0_, _REG0_, + _PUSH_, _PUSH_, _PUSH_, _PUSH_, /* /0: d9:fld f32, db:fild m32, dd:fld f64, df:fild m16 */ + _null_, _REG0_, _REG0_, _REG0_, /* /1: d9:undef, db,dd,df:fisttp m32/64/16 */ + _REG0_, _REG0_, _REG0_, _REG0_, /* /2: d9:fst f32, db:fist m32, dd:fst f64, df:fist m16 */ + _REG0_, _REG0_, _REG0_, _REG0_, /* /3: d9:fstp f32, db:fistp m32, dd:fstp f64, df:fistp m16 */ _NONE_, _null_, _NONE_, _PUSH_, _NONE_, _PUSH_, _null_, _PUSH_, _NONE_, _null_, _NONE_, _REG0_, @@ -45,15 +46,19 @@ static u_char const type_table[32] = { }; u_char const data_sizes_16[32] = { - 4, 4, 8, 2, 0, 0, 0, 0, - 4, 4, 8, 2, 4, 4, 8, 2, + 4, 4, 8, 2, + 0, 4, 8, 2, /* /1: d9:undef, db,dd,df:fisttp */ + 4, 4, 8, 2, + 4, 4, 8, 2, 14, 0, 94, 10, 2, 10, 0, 8, 14, 0, 94, 10, 2, 10, 2, 8 }; static u_char const data_sizes_32[32] = { - 4, 4, 8, 2, 0, 0, 0, 0, - 4, 4, 8, 2, 4, 4, 8, 2, + 4, 4, 8, 2, + 0, 4, 8, 2, /* /1: d9:undef, db,dd,df:fisttp */ + 4, 4, 8, 2, + 4, 4, 8, 2, 28, 0, 108, 10, 2, 10, 0, 8, 28, 0, 108, 10, 2, 10, 2, 8 }; @@ -65,6 +70,7 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes, FPU_REG *st0_ptr; u_char st0_tag = TAG_Empty; /* This is just to stop a gcc warning. */ u_char loaded_tag; + int sv_cw; st0_ptr = NULL; /* Initialized just to stop compiler warnings. 
*/ @@ -111,7 +117,8 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes, } switch (type) { - case 000: /* fld m32real */ + /* type is a 5-bit value: (3-bit FPU_modrm.reg field | opcode[2,1]) */ + case 000: /* fld m32real (d9 /0) */ clear_C1(); loaded_tag = FPU_load_single((float __user *)data_address, &loaded_data); @@ -123,13 +130,13 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes, } FPU_copy_to_reg0(&loaded_data, loaded_tag); break; - case 001: /* fild m32int */ + case 001: /* fild m32int (db /0) */ clear_C1(); loaded_tag = FPU_load_int32((long __user *)data_address, &loaded_data); FPU_copy_to_reg0(&loaded_data, loaded_tag); break; - case 002: /* fld m64real */ + case 002: /* fld m64real (dd /0) */ clear_C1(); loaded_tag = FPU_load_double((double __user *)data_address, @@ -142,12 +149,44 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes, } FPU_copy_to_reg0(&loaded_data, loaded_tag); break; - case 003: /* fild m16int */ + case 003: /* fild m16int (df /0) */ clear_C1(); loaded_tag = FPU_load_int16((short __user *)data_address, &loaded_data); FPU_copy_to_reg0(&loaded_data, loaded_tag); break; + /* case 004: undefined (d9 /1) */ + /* fisttp are enabled if CPUID(1).ECX(0) "sse3" is set */ + case 005: /* fisttp m32int (db /1) */ + clear_C1(); + sv_cw = control_word; + control_word |= RC_CHOP; + if (FPU_store_int32 + (st0_ptr, st0_tag, (long __user *)data_address)) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + control_word = sv_cw; + break; + case 006: /* fisttp m64int (dd /1) */ + clear_C1(); + sv_cw = control_word; + control_word |= RC_CHOP; + if (FPU_store_int64 + (st0_ptr, st0_tag, (long long __user *)data_address)) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + control_word = sv_cw; + break; + case 007: /* fisttp m16int (df /1) */ + clear_C1(); + sv_cw = control_word; + control_word |= RC_CHOP; + if (FPU_store_int16 + (st0_ptr, st0_tag, (short __user *)data_address)) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + control_word = sv_cw; + break; case 010: /* fst m32real */ clear_C1(); FPU_store_single(st0_ptr, st0_tag, -- cgit v1.2.3 From 93f13a9f96771a064c716364aebc6e283b186eb8 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 21 Sep 2015 09:48:29 +0200 Subject: x86/entry/vsyscall: Fix undefined symbol warning Commit: 3dc33bd30f3e1 ("x86/entry/vsyscall: Add CONFIG to control default") did the ifdef/elif thing but GCC doesn't like that: arch/x86/entry/vsyscall/vsyscall_64.c:44:7: warning: "CONFIG_LEGACY_VSYSCALL_NONE" is not defined [-Wundef] #elif CONFIG_LEGACY_VSYSCALL_NONE ^ Use defined() instead. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Triplett Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150921074829.GA3550@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/entry/vsyscall/vsyscall_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 76e0fd3ea1fb..174c2549939d 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -39,9 +39,9 @@ #include "vsyscall_trace.h" static enum { EMULATE, NATIVE, NONE } vsyscall_mode = -#ifdef CONFIG_LEGACY_VSYSCALL_NATIVE +#if defined(CONFIG_LEGACY_VSYSCALL_NATIVE) NATIVE; -#elif CONFIG_LEGACY_VSYSCALL_NONE +#elif defined(CONFIG_LEGACY_VSYSCALL_NONE) NONE; #else EMULATE; -- cgit v1.2.3 From c356a7e975a25e8867961c1b7a4a965d506f0a04 Mon Sep 17 00:00:00 2001 From: tim Date: Thu, 10 Sep 2015 15:26:59 -0700 Subject: crypto: x86/sha - Intel SHA Extensions optimized SHA1 transform function This patch includes the Intel SHA Extensions optimized implementation of the SHA-1 update function. This function has been tested on the Broxton platform and measured a speedup of 3.6x over the SSSE3 implementation for 4K blocks. Originally-by: Chandramouli Narayanan Signed-off-by: Tim Chen Acked-by: David S. Miller Signed-off-by: Herbert Xu --- arch/x86/crypto/sha1_ni_asm.S | 302 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 302 insertions(+) create mode 100644 arch/x86/crypto/sha1_ni_asm.S (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S new file mode 100644 index 000000000000..874a651b9e7d --- /dev/null +++ b/arch/x86/crypto/sha1_ni_asm.S @@ -0,0 +1,302 @@ +/* + * Intel SHA Extensions optimized implementation of a SHA-1 update function + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Sean Gulley + * Tim Chen + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include + +#define DIGEST_PTR %rdi /* 1st arg */ +#define DATA_PTR %rsi /* 2nd arg */ +#define NUM_BLKS %rdx /* 3rd arg */ + +#define RSPSAVE %rax + +/* gcc conversion */ +#define FRAME_SIZE 32 /* space for 2x16 bytes */ + +#define ABCD %xmm0 +#define E0 %xmm1 /* Need two E's b/c they ping pong */ +#define E1 %xmm2 +#define MSG0 %xmm3 +#define MSG1 %xmm4 +#define MSG2 %xmm5 +#define MSG3 %xmm6 +#define SHUF_MASK %xmm7 + + +/* + * Intel SHA Extensions optimized implementation of a SHA-1 update function + * + * The function takes a pointer to the current hash values, a pointer to the + * input data, and a number of 64 byte blocks to process. Once all blocks have + * been processed, the digest pointer is updated with the resulting hash value. + * The function only processes complete blocks, there is no functionality to + * store partial blocks. All message padding and hash value initialization must + * be done outside the update function. + * + * The indented lines in the loop are instructions related to rounds processing. + * The non-indented lines are instructions related to the message schedule. 
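The calling contract spelled out in this header comment — whole 64-byte blocks only, with padding, hash initialization and tail buffering left to the caller — is worth making concrete. Below is a hypothetical, user-space-flavoured C sketch (sha1_feed() and struct sha1_tail are invented for the example; the extern declaration follows the prototype given just below):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

extern void sha1_ni_transform(uint32_t *digest, const void *data,
			      uint32_t numBlocks);

struct sha1_tail {
	uint8_t buf[64];	/* held-over partial block */
	unsigned int used;
};

static void sha1_feed(uint32_t digest[5], struct sha1_tail *t,
		      const uint8_t *data, size_t len)
{
	/* First top up any partial block left from the previous call. */
	if (t->used) {
		size_t take = 64 - t->used;

		if (take > len)
			take = len;
		memcpy(t->buf + t->used, data, take);
		t->used += take;
		data += take;
		len -= take;
		if (t->used == 64) {
			sha1_ni_transform(digest, t->buf, 1);
			t->used = 0;
		}
	}
	/* Hand every complete 64-byte block to the transform at once. */
	if (len >= 64) {
		sha1_ni_transform(digest, data, (uint32_t)(len / 64));
		data += len - (len % 64);
		len %= 64;
	}
	/* Buffer the tail; partial blocks never reach the transform. */
	memcpy(t->buf + t->used, data, len);
	t->used += (unsigned int)len;
}

A finalize step would then append the usual SHA-1 padding (0x80, zero fill, 64-bit big-endian bit count) to the buffered tail and make one last call — exactly the work this comment assigns to code outside the update function.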
+ * + * void sha1_ni_transform(uint32_t *digest, const void *data, + uint32_t numBlocks) + * digest : pointer to digest + * data: pointer to input data + * numBlocks: Number of blocks to process + */ +.text +.align 32 +ENTRY(sha1_ni_transform) + mov %rsp, RSPSAVE + sub $FRAME_SIZE, %rsp + and $~0xF, %rsp + + shl $6, NUM_BLKS /* convert to bytes */ + jz .Ldone_hash + add DATA_PTR, NUM_BLKS /* pointer to end of data */ + + /* load initial hash values */ + pinsrd $3, 1*16(DIGEST_PTR), E0 + movdqu 0*16(DIGEST_PTR), ABCD + pand UPPER_WORD_MASK(%rip), E0 + pshufd $0x1B, ABCD, ABCD + + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK + +.Lloop0: + /* Save hash values for addition after rounds */ + movdqa E0, (0*16)(%rsp) + movdqa ABCD, (1*16)(%rsp) + + /* Rounds 0-3 */ + movdqu 0*16(DATA_PTR), MSG0 + pshufb SHUF_MASK, MSG0 + paddd MSG0, E0 + movdqa ABCD, E1 + sha1rnds4 $0, E0, ABCD + + /* Rounds 4-7 */ + movdqu 1*16(DATA_PTR), MSG1 + pshufb SHUF_MASK, MSG1 + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG1, MSG0 + + /* Rounds 8-11 */ + movdqu 2*16(DATA_PTR), MSG2 + pshufb SHUF_MASK, MSG2 + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG2, MSG1 + pxor MSG2, MSG0 + + /* Rounds 12-15 */ + movdqu 3*16(DATA_PTR), MSG3 + pshufb SHUF_MASK, MSG3 + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 16-19 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 20-23 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + pxor MSG1, MSG3 + + /* Rounds 24-27 */ + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG2, MSG1 + pxor MSG2, MSG0 + + /* Rounds 28-31 */ + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 32-35 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 36-39 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + pxor MSG1, MSG3 + + /* Rounds 40-43 */ + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + pxor MSG2, MSG0 + + /* Rounds 44-47 */ + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 48-51 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 52-55 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG1, MSG0 + pxor MSG1, MSG3 + + /* Rounds 56-59 */ + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + pxor MSG2, MSG0 + + /* Rounds 60-63 */ + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $3, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 64-67 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $3, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 68-71 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $3, E1, ABCD + pxor MSG1, MSG3 + + /* Rounds 72-75 */ + sha1nexte MSG2, E0 + movdqa 
ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $3, E0, ABCD + + /* Rounds 76-79 */ + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1rnds4 $3, E1, ABCD + + /* Add current hash values with previously saved */ + sha1nexte (0*16)(%rsp), E0 + paddd (1*16)(%rsp), ABCD + + /* Increment data pointer and loop if more to process */ + add $64, DATA_PTR + cmp NUM_BLKS, DATA_PTR + jne .Lloop0 + + /* Write hash values back in the correct order */ + pshufd $0x1B, ABCD, ABCD + movdqu ABCD, 0*16(DIGEST_PTR) + pextrd $3, E0, 1*16(DIGEST_PTR) + +.Ldone_hash: + mov RSPSAVE, %rsp + + ret +ENDPROC(sha1_ni_transform) + +.data + +.align 64 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x000102030405060708090a0b0c0d0e0f +UPPER_WORD_MASK: + .octa 0xFFFFFFFF000000000000000000000000 -- cgit v1.2.3 From 600a2334e83d22e5c3f7ff2581f545bfc354d206 Mon Sep 17 00:00:00 2001 From: tim Date: Thu, 10 Sep 2015 15:27:13 -0700 Subject: crypto: x86/sha - Intel SHA Extensions optimized SHA256 transform function This patch includes the Intel SHA Extensions optimized implementation of the SHA-256 update function. This function has been tested on the Broxton platform and measured a speedup of 3.6x over the SSSE3 implementation for 4K blocks. Originally-by: Chandramouli Narayanan Signed-off-by: Tim Chen Acked-by: David S. Miller Signed-off-by: Herbert Xu --- arch/x86/crypto/sha256_ni_asm.S | 353 ++++++++++++++++++++++++++++++++++ 1 file changed, 353 insertions(+) create mode 100644 arch/x86/crypto/sha256_ni_asm.S (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S new file mode 100644 index 000000000000..748cdf21a938 --- /dev/null +++ b/arch/x86/crypto/sha256_ni_asm.S @@ -0,0 +1,353 @@ +/* + * Intel SHA Extensions optimized implementation of a SHA-256 update function + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Sean Gulley + * Tim Chen + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include + +#define DIGEST_PTR %rdi /* 1st arg */ +#define DATA_PTR %rsi /* 2nd arg */ +#define NUM_BLKS %rdx /* 3rd arg */ + +#define SHA256CONSTANTS %rax + +#define MSG %xmm0 +#define STATE0 %xmm1 +#define STATE1 %xmm2 +#define MSGTMP0 %xmm3 +#define MSGTMP1 %xmm4 +#define MSGTMP2 %xmm5 +#define MSGTMP3 %xmm6 +#define MSGTMP4 %xmm7 + +#define SHUF_MASK %xmm8 + +#define ABEF_SAVE %xmm9 +#define CDGH_SAVE %xmm10 + +/* + * Intel SHA Extensions optimized implementation of a SHA-256 update function + * + * The function takes a pointer to the current hash values, a pointer to the + * input data, and a number of 64 byte blocks to process. Once all blocks have + * been processed, the digest pointer is updated with the resulting hash value. + * The function only processes complete blocks, there is no functionality to + * store partial blocks. All message padding and hash value initialization must + * be done outside the update function. + * + * The indented lines in the loop are instructions related to rounds processing. + * The non-indented lines are instructions related to the message schedule. + * + * void sha256_ni_transform(uint32_t *digest, const void *data, + uint32_t numBlocks); + * digest : pointer to digest + * data: pointer to input data + * numBlocks: Number of blocks to process + */ + +.text +.align 32 +ENTRY(sha256_ni_transform) + + shl $6, NUM_BLKS /* convert to bytes */ + jz .Ldone_hash + add DATA_PTR, NUM_BLKS /* pointer to end of data */ + + /* + * load initial hash values + * Need to reorder these appropriately + * DCBA, HGFE -> ABEF, CDGH + */ + movdqu 0*16(DIGEST_PTR), STATE0 + movdqu 1*16(DIGEST_PTR), STATE1 + + pshufd $0xB1, STATE0, STATE0 /* CDAB */ + pshufd $0x1B, STATE1, STATE1 /* EFGH */ + movdqa STATE0, MSGTMP4 + palignr $8, STATE1, STATE0 /* ABEF */ + pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ + + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK + lea K256(%rip), SHA256CONSTANTS + +.Lloop0: + /* Save hash values for addition after rounds */ + movdqa STATE0, ABEF_SAVE + movdqa STATE1, CDGH_SAVE + + /* Rounds 0-3 */ + movdqu 0*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP0 + paddd 0*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 4-7 */ + movdqu 1*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP1 + paddd 1*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 8-11 */ + movdqu 2*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP2 + paddd 2*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 12-15 */ + movdqu 3*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP3 + paddd 3*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + 
sha256msg2 MSGTMP3, MSGTMP0 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 16-19 */ + movdqa MSGTMP0, MSG + paddd 4*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 20-23 */ + movdqa MSGTMP1, MSG + paddd 5*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 24-27 */ + movdqa MSGTMP2, MSG + paddd 6*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 28-31 */ + movdqa MSGTMP3, MSG + paddd 7*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 32-35 */ + movdqa MSGTMP0, MSG + paddd 8*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 36-39 */ + movdqa MSGTMP1, MSG + paddd 9*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 40-43 */ + movdqa MSGTMP2, MSG + paddd 10*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 44-47 */ + movdqa MSGTMP3, MSG + paddd 11*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 48-51 */ + movdqa MSGTMP0, MSG + paddd 12*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 52-55 */ + movdqa MSGTMP1, MSG + paddd 13*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 56-59 */ + movdqa MSGTMP2, MSG + paddd 14*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 60-63 */ + movdqa MSGTMP3, MSG + paddd 15*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + 
sha256rnds2 STATE1, STATE0 + + /* Add current hash values with previously saved */ + paddd ABEF_SAVE, STATE0 + paddd CDGH_SAVE, STATE1 + + /* Increment data pointer and loop if more to process */ + add $64, DATA_PTR + cmp NUM_BLKS, DATA_PTR + jne .Lloop0 + + /* Write hash values back in the correct order */ + pshufd $0x1B, STATE0, STATE0 /* FEBA */ + pshufd $0xB1, STATE1, STATE1 /* DCHG */ + movdqa STATE0, MSGTMP4 + pblendw $0xF0, STATE1, STATE0 /* DCBA */ + palignr $8, MSGTMP4, STATE1 /* HGFE */ + + movdqu STATE0, 0*16(DIGEST_PTR) + movdqu STATE1, 1*16(DIGEST_PTR) + +.Ldone_hash: + + ret +ENDPROC(sha256_ni_transform) + +.data +.align 64 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 -- cgit v1.2.3 From 95fca7df0b4964fbe3fe159e3d6e681e6b5b7a53 Mon Sep 17 00:00:00 2001 From: tim Date: Thu, 10 Sep 2015 15:27:20 -0700 Subject: crypto: x86/sha - glue code for Intel SHA extensions optimized SHA1 & SHA256 This patch adds the glue code to detect and utilize the Intel SHA extensions optimized SHA1 and SHA256 update transforms when available. This code has been tested on Broxton for functionality. Originally-by: Chandramouli Narayanan Signed-off-by: Tim Chen Acked-by: David S. 
Miller Signed-off-by: Herbert Xu --- arch/x86/crypto/sha1_ssse3_glue.c | 12 +++++++++++- arch/x86/crypto/sha256_ssse3_glue.c | 38 ++++++++++++++++++++++--------------- 2 files changed, 34 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 7c48e8b20848..98be8cc17ca2 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -44,6 +44,10 @@ asmlinkage void sha1_transform_avx(u32 *digest, const char *data, asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, unsigned int rounds); #endif +#ifdef CONFIG_AS_SHA1_NI +asmlinkage void sha1_ni_transform(u32 *digest, const char *data, + unsigned int rounds); +#endif static void (*sha1_transform_asm)(u32 *, const char *, unsigned int); @@ -166,12 +170,18 @@ static int __init sha1_ssse3_mod_init(void) #endif } #endif +#ifdef CONFIG_AS_SHA1_NI + if (boot_cpu_has(X86_FEATURE_SHA_NI)) { + sha1_transform_asm = sha1_ni_transform; + algo_name = "SHA-NI"; + } +#endif if (sha1_transform_asm) { pr_info("Using %s optimized SHA-1 implementation\n", algo_name); return crypto_register_shash(&alg); } - pr_info("Neither AVX nor AVX2 nor SSSE3 is available/usable.\n"); + pr_info("Neither AVX nor AVX2 nor SSSE3/SHA-NI is available/usable.\n"); return -ENODEV; } diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index f8097fc0d1d1..9c7b22c489f6 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -50,6 +50,10 @@ asmlinkage void sha256_transform_avx(u32 *digest, const char *data, asmlinkage void sha256_transform_rorx(u32 *digest, const char *data, u64 rounds); #endif +#ifdef CONFIG_AS_SHA256_NI +asmlinkage void sha256_ni_transform(u32 *digest, const char *data, + u64 rounds); /*unsigned int rounds);*/ +#endif static void (*sha256_transform_asm)(u32 *, const char *, u64); @@ -142,36 +146,40 @@ static bool __init avx_usable(void) static int __init sha256_ssse3_mod_init(void) { + char *algo; + /* test for SSSE3 first */ - if (cpu_has_ssse3) + if (cpu_has_ssse3) { sha256_transform_asm = sha256_transform_ssse3; + algo = "SSSE3"; + } #ifdef CONFIG_AS_AVX /* allow AVX to override SSSE3, it's a little faster */ if (avx_usable()) { + sha256_transform_asm = sha256_transform_avx; + algo = "AVX"; #ifdef CONFIG_AS_AVX2 - if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2)) + if (boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_BMI2)) { sha256_transform_asm = sha256_transform_rorx; - else + algo = "AVX2"; + } +#endif + } #endif - sha256_transform_asm = sha256_transform_avx; +#ifdef CONFIG_AS_SHA256_NI + if (boot_cpu_has(X86_FEATURE_SHA_NI)) { + sha256_transform_asm = sha256_ni_transform; + algo = "SHA-256-NI"; } #endif if (sha256_transform_asm) { -#ifdef CONFIG_AS_AVX - if (sha256_transform_asm == sha256_transform_avx) - pr_info("Using AVX optimized SHA-256 implementation\n"); -#ifdef CONFIG_AS_AVX2 - else if (sha256_transform_asm == sha256_transform_rorx) - pr_info("Using AVX2 optimized SHA-256 implementation\n"); -#endif - else -#endif - pr_info("Using SSSE3 optimized SHA-256 implementation\n"); + pr_info("Using %s optimized SHA-256 implementation\n", algo); return crypto_register_shashes(algs, ARRAY_SIZE(algs)); } - pr_info("Neither AVX nor SSSE3 is available/usable.\n"); + pr_info("Neither AVX nor SSSE3/SHA-NI is available/usable.\n"); return -ENODEV; } -- cgit v1.2.3 From e38b6b7fcfd11fb83dcac54a33cbca3739c45a09 Mon Sep 17 00:00:00 2001 
From: tim Date: Thu, 10 Sep 2015 15:27:26 -0700 Subject: crypto: x86/sha - Add build support for Intel SHA Extensions optimized SHA1 and SHA256 This patch provides the configuration and build support to include and build the optimized SHA1 and SHA256 update transforms for the kernel's crypto library. Originally-by: Chandramouli Narayanan Signed-off-by: Tim Chen Acked-by: David S. Miller Signed-off-by: Herbert Xu --- arch/x86/Makefile | 6 ++++-- arch/x86/crypto/Makefile | 8 ++++++++ crypto/Kconfig | 10 ++++++---- 3 files changed, 18 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 747860c696e1..a8009c77918a 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -165,9 +165,11 @@ asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1) asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1) avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1) +sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1) +sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1) -KBUILD_AFLAGS += $(asinstr) $(avx_instr) $(avx2_instr) -KBUILD_CFLAGS += $(asinstr) $(avx_instr) $(avx2_instr) +KBUILD_AFLAGS += $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr) +KBUILD_CFLAGS += $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr) LDFLAGS := -m elf_$(UTS_MACHINE) diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 9a2838cf0591..b9b912a44d61 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -5,6 +5,8 @@ avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no) avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ $(comma)4)$(comma)%ymm2,yes,no) +sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no) +sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no) obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o @@ -91,9 +93,15 @@ ifeq ($(avx2_supported),yes) sha1-ssse3-y += sha1_avx2_x86_64_asm.o poly1305-x86_64-y += poly1305-avx2-x86_64.o endif +ifeq ($(sha1_ni_supported),yes) +sha1-ssse3-y += sha1_ni_asm.o +endif crc32c-intel-y := crc32c-intel_glue.o crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o +ifeq ($(sha256_ni_supported),yes) +sha256-ssse3-y += sha256_ni_asm.o +endif sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o diff --git a/crypto/Kconfig b/crypto/Kconfig index 48ee3e175dac..fc934444d3a2 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -597,17 +597,18 @@ config CRYPTO_SHA1 SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2). config CRYPTO_SHA1_SSSE3 - tristate "SHA1 digest algorithm (SSSE3/AVX/AVX2)" + tristate "SHA1 digest algorithm (SSSE3/AVX/AVX2/SHA-NI)" depends on X86 && 64BIT select CRYPTO_SHA1 select CRYPTO_HASH help SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2) implemented using Supplemental SSE3 (SSSE3) instructions or Advanced Vector - Extensions (AVX/AVX2), when available. + Extensions (AVX/AVX2) or SHA-NI(SHA Extensions New Instructions), + when available. 
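The "when available" in these help texts is decided at runtime: the glue code registers the SHA-NI driver only if boot_cpu_has(X86_FEATURE_SHA_NI) is true, which corresponds to CPUID.(EAX=7,ECX=0):EBX bit 29. As a sketch, the same bit can be probed from user space — this assumes a compiler recent enough that <cpuid.h> provides __get_cpuid_count():

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 7, subleaf 0: structured extended feature flags. */
	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
		puts("CPUID leaf 7 not supported");
		return 1;
	}
	/* EBX bit 29 is the SHA extensions flag (X86_FEATURE_SHA_NI). */
	printf("SHA extensions: %s\n", (ebx & (1u << 29)) ? "yes" : "no");
	return 0;
}

Note the division of labour: the as-instr probes in the Makefile above only decide at build time whether the assembler can emit sha1msg1/sha256msg1, while the CPUID feature bit decides at boot whether the resulting code may actually run.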
config CRYPTO_SHA256_SSSE3 - tristate "SHA256 digest algorithm (SSSE3/AVX/AVX2)" + tristate "SHA256 digest algorithm (SSSE3/AVX/AVX2/SHA-NI)" depends on X86 && 64BIT select CRYPTO_SHA256 select CRYPTO_HASH @@ -615,7 +616,8 @@ config CRYPTO_SHA256_SSSE3 SHA-256 secure hash standard (DFIPS 180-2) implemented using Supplemental SSE3 (SSSE3) instructions, or Advanced Vector Extensions version 1 (AVX1), or Advanced Vector Extensions - version 2 (AVX2) instructions, when available. + version 2 (AVX2) instructions, or SHA-NI (SHA Extensions New + Instructions) when available. config CRYPTO_SHA512_SSSE3 tristate "SHA512 digest algorithm (SSSE3/AVX/AVX2)" -- cgit v1.2.3 From 85c66ecd6f2144c075044292359e179b20af1f2d Mon Sep 17 00:00:00 2001 From: tim Date: Wed, 16 Sep 2015 16:34:53 -0700 Subject: crypto: x86/sha - Restructure x86 sha1 glue code to expose all the available sha1 transforms Restructure the x86 sha1 glue code so we will expose sha1 transforms based on SSSE3, AVX, AVX2 or SHA-NI extension as separate individual drivers when cpu provides such support. This will make it easy for alternative algorithms to be used if desired and makes the code cleaner and easier to maintain. Signed-off-by: Tim Chen Signed-off-by: Herbert Xu --- arch/x86/crypto/sha1_ssse3_glue.c | 324 +++++++++++++++++++++++++++++--------- 1 file changed, 250 insertions(+), 74 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 98be8cc17ca2..c934197fe84a 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -31,28 +31,11 @@ #include #include +typedef void (sha1_transform_fn)(u32 *digest, const char *data, + unsigned int rounds); -asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, - unsigned int rounds); -#ifdef CONFIG_AS_AVX -asmlinkage void sha1_transform_avx(u32 *digest, const char *data, - unsigned int rounds); -#endif -#ifdef CONFIG_AS_AVX2 -#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ - -asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, - unsigned int rounds); -#endif -#ifdef CONFIG_AS_SHA1_NI -asmlinkage void sha1_ni_transform(u32 *digest, const char *data, - unsigned int rounds); -#endif - -static void (*sha1_transform_asm)(u32 *, const char *, unsigned int); - -static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) +static int sha1_update(struct shash_desc *desc, const u8 *data, + unsigned int len, sha1_transform_fn *sha1_xform) { struct sha1_state *sctx = shash_desc_ctx(desc); @@ -65,14 +48,14 @@ static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, kernel_fpu_begin(); sha1_base_do_update(desc, data, len, - (sha1_block_fn *)sha1_transform_asm); + (sha1_block_fn *)sha1_xform); kernel_fpu_end(); return 0; } -static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) +static int sha1_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out, sha1_transform_fn *sha1_xform) { if (!irq_fpu_usable()) return crypto_sha1_finup(desc, data, len, out); @@ -80,32 +63,37 @@ static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, kernel_fpu_begin(); if (len) sha1_base_do_update(desc, data, len, - (sha1_block_fn *)sha1_transform_asm); - sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_transform_asm); + (sha1_block_fn *)sha1_xform); + sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_xform); kernel_fpu_end(); return sha1_base_finish(desc, out); 
} -/* Add padding and return the message digest. */ -static int sha1_ssse3_final(struct shash_desc *desc, u8 *out) +asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, + unsigned int rounds); + +static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) { - return sha1_ssse3_finup(desc, NULL, 0, out); + return sha1_update(desc, data, len, + (sha1_transform_fn *) sha1_transform_ssse3); } -#ifdef CONFIG_AS_AVX2 -static void sha1_apply_transform_avx2(u32 *digest, const char *data, - unsigned int rounds) +static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) { - /* Select the optimal transform based on data block size */ - if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE) - sha1_transform_avx2(digest, data, rounds); - else - sha1_transform_avx(digest, data, rounds); + return sha1_finup(desc, data, len, out, + (sha1_transform_fn *) sha1_transform_ssse3); +} + +/* Add padding and return the message digest. */ +static int sha1_ssse3_final(struct shash_desc *desc, u8 *out) +{ + return sha1_ssse3_finup(desc, NULL, 0, out); } -#endif -static struct shash_alg alg = { +static struct shash_alg sha1_ssse3_alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_base_init, .update = sha1_ssse3_update, @@ -114,7 +102,7 @@ static struct shash_alg alg = { .descsize = sizeof(struct sha1_state), .base = { .cra_name = "sha1", - .cra_driver_name= "sha1-ssse3", + .cra_driver_name = "sha1-ssse3", .cra_priority = 150, .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA1_BLOCK_SIZE, @@ -122,8 +110,60 @@ static struct shash_alg alg = { } }; +static int register_sha1_ssse3(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + return crypto_register_shash(&sha1_ssse3_alg); + return 0; +} + +static void unregister_sha1_ssse3(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + crypto_unregister_shash(&sha1_ssse3_alg); +} + #ifdef CONFIG_AS_AVX -static bool __init avx_usable(void) +asmlinkage void sha1_transform_avx(u32 *digest, const char *data, + unsigned int rounds); + +static int sha1_avx_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha1_update(desc, data, len, + (sha1_transform_fn *) sha1_transform_avx); +} + +static int sha1_avx_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha1_finup(desc, data, len, out, + (sha1_transform_fn *) sha1_transform_avx); +} + +static int sha1_avx_final(struct shash_desc *desc, u8 *out) +{ + return sha1_avx_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha1_avx_alg = { + .digestsize = SHA1_DIGEST_SIZE, + .init = sha1_base_init, + .update = sha1_avx_update, + .final = sha1_avx_final, + .finup = sha1_avx_finup, + .descsize = sizeof(struct sha1_state), + .base = { + .cra_name = "sha1", + .cra_driver_name = "sha1-avx", + .cra_priority = 160, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static bool avx_usable(void) { if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) { if (cpu_has_avx) @@ -134,61 +174,197 @@ static bool __init avx_usable(void) return true; } -#ifdef CONFIG_AS_AVX2 -static bool __init avx2_usable(void) +static int register_sha1_avx(void) +{ + if (avx_usable()) + return crypto_register_shash(&sha1_avx_alg); + return 0; +} + +static void unregister_sha1_avx(void) +{ + if (avx_usable()) + crypto_unregister_shash(&sha1_avx_alg); +} + +#else /* CONFIG_AS_AVX */ +static inline int register_sha1_avx(void) { return 0; } +static inline void 
unregister_sha1_avx(void) { } +#endif /* CONFIG_AS_AVX */ + + +#if defined(CONFIG_AS_AVX2) && (CONFIG_AS_AVX) +#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ + +asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, + unsigned int rounds); + +static bool avx2_usable(void) { - if (avx_usable() && cpu_has_avx2 && boot_cpu_has(X86_FEATURE_BMI1) && - boot_cpu_has(X86_FEATURE_BMI2)) + if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) + && boot_cpu_has(X86_FEATURE_BMI1) + && boot_cpu_has(X86_FEATURE_BMI2)) return true; return false; } -#endif -#endif -static int __init sha1_ssse3_mod_init(void) +static void sha1_apply_transform_avx2(u32 *digest, const char *data, + unsigned int rounds) { - char *algo_name; + /* Select the optimal transform based on data block size */ + if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE) + sha1_transform_avx2(digest, data, rounds); + else + sha1_transform_avx(digest, data, rounds); +} - /* test for SSSE3 first */ - if (cpu_has_ssse3) { - sha1_transform_asm = sha1_transform_ssse3; - algo_name = "SSSE3"; - } +static int sha1_avx2_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha1_update(desc, data, len, + (sha1_transform_fn *) sha1_apply_transform_avx2); +} -#ifdef CONFIG_AS_AVX - /* allow AVX to override SSSE3, it's a little faster */ - if (avx_usable()) { - sha1_transform_asm = sha1_transform_avx; - algo_name = "AVX"; -#ifdef CONFIG_AS_AVX2 - /* allow AVX2 to override AVX, it's a little faster */ - if (avx2_usable()) { - sha1_transform_asm = sha1_apply_transform_avx2; - algo_name = "AVX2"; - } -#endif +static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha1_finup(desc, data, len, out, + (sha1_transform_fn *) sha1_apply_transform_avx2); +} + +static int sha1_avx2_final(struct shash_desc *desc, u8 *out) +{ + return sha1_avx2_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha1_avx2_alg = { + .digestsize = SHA1_DIGEST_SIZE, + .init = sha1_base_init, + .update = sha1_avx2_update, + .final = sha1_avx2_final, + .finup = sha1_avx2_finup, + .descsize = sizeof(struct sha1_state), + .base = { + .cra_name = "sha1", + .cra_driver_name = "sha1-avx2", + .cra_priority = 170, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, } +}; + +static int register_sha1_avx2(void) +{ + if (avx2_usable()) + return crypto_register_shash(&sha1_avx2_alg); + return 0; +} + +static void unregister_sha1_avx2(void) +{ + if (avx2_usable()) + crypto_unregister_shash(&sha1_avx2_alg); +} + +#else +static inline int register_sha1_avx2(void) { return 0; } +static inline void unregister_sha1_avx2(void) { } #endif + #ifdef CONFIG_AS_SHA1_NI - if (boot_cpu_has(X86_FEATURE_SHA_NI)) { - sha1_transform_asm = sha1_ni_transform; - algo_name = "SHA-NI"; +asmlinkage void sha1_ni_transform(u32 *digest, const char *data, + unsigned int rounds); + +static int sha1_ni_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha1_update(desc, data, len, + (sha1_transform_fn *) sha1_ni_transform); +} + +static int sha1_ni_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha1_finup(desc, data, len, out, + (sha1_transform_fn *) sha1_ni_transform); +} + +static int sha1_ni_final(struct shash_desc *desc, u8 *out) +{ + return sha1_ni_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha1_ni_alg = { + .digestsize = SHA1_DIGEST_SIZE, + .init = sha1_base_init, + .update = 
sha1_ni_update, + .final = sha1_ni_final, + .finup = sha1_ni_finup, + .descsize = sizeof(struct sha1_state), + .base = { + .cra_name = "sha1", + .cra_driver_name = "sha1-ni", + .cra_priority = 250, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, } +}; + +static int register_sha1_ni(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) + return crypto_register_shash(&sha1_ni_alg); + return 0; +} + +static void unregister_sha1_ni(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) + crypto_unregister_shash(&sha1_ni_alg); +} + +#else +static inline int register_sha1_ni(void) { return 0; } +static inline void unregister_sha1_ni(void) { } #endif - if (sha1_transform_asm) { - pr_info("Using %s optimized SHA-1 implementation\n", algo_name); - return crypto_register_shash(&alg); +static int __init sha1_ssse3_mod_init(void) +{ + if (register_sha1_ssse3()) + goto fail; + + if (register_sha1_avx()) { + unregister_sha1_ssse3(); + goto fail; } - pr_info("Neither AVX nor AVX2 nor SSSE3/SHA-NI is available/usable.\n"); + if (register_sha1_avx2()) { + unregister_sha1_avx(); + unregister_sha1_ssse3(); + goto fail; + } + + if (register_sha1_ni()) { + unregister_sha1_avx2(); + unregister_sha1_avx(); + unregister_sha1_ssse3(); + goto fail; + } + + return 0; +fail: return -ENODEV; } static void __exit sha1_ssse3_mod_fini(void) { - crypto_unregister_shash(&alg); + unregister_sha1_ni(); + unregister_sha1_avx2(); + unregister_sha1_avx(); + unregister_sha1_ssse3(); } module_init(sha1_ssse3_mod_init); -- cgit v1.2.3 From 5dda42fc89f26fb3b6312076b17feda8c397d2b8 Mon Sep 17 00:00:00 2001 From: tim Date: Wed, 16 Sep 2015 16:35:23 -0700 Subject: crypto: x86/sha - Restructure x86 sha256 glue code to expose all the available sha256 transforms Restructure the x86 sha256 glue code so we will expose sha256 transforms based on SSSE3, AVX, AVX2 or SHA-NI extension as separate individual drivers when cpu provides such support. This will make it easy for alternative algorithms to be used if desired and makes the code cleaner and easier to maintain. 
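The payoff of the per-ISA drivers is visible at allocation time: a request for the generic cra_name "sha256" resolves to whichever registered driver carries the highest cra_priority, while a cra_driver_name pins one specific implementation. A minimal kernel-side sketch follows (error handling trimmed; "sha256-avx2" is inferred by analogy with the "sha1-avx2" name registered above):

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/printk.h>

static void sha256_driver_demo(void)
{
	struct crypto_shash *best, *pinned;

	/* Picks the highest-priority "sha256" provider on this CPU. */
	best = crypto_alloc_shash("sha256", 0, 0);
	if (!IS_ERR(best)) {
		pr_info("sha256 resolved to %s\n",
			crypto_tfm_alg_driver_name(crypto_shash_tfm(best)));
		crypto_free_shash(best);
	}

	/* Pin one specific implementation by its driver name. */
	pinned = crypto_alloc_shash("sha256-avx2", 0, 0);
	if (!IS_ERR(pinned))
		crypto_free_shash(pinned);
}

This is also why the priorities climb monotonically across the variants (150 for SSSE3, 160 for AVX, 170 for AVX2, 250 for SHA-NI in the sha1 patch): registration order stops mattering, and the crypto API arbitrates purely on cra_priority.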
Signed-off-by: Tim Chen Signed-off-by: Herbert Xu --- arch/x86/crypto/sha256_ssse3_glue.c | 331 ++++++++++++++++++++++++++++++------ 1 file changed, 281 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 9c7b22c489f6..863e2f6aad13 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -42,23 +42,10 @@ asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data, u64 rounds); -#ifdef CONFIG_AS_AVX -asmlinkage void sha256_transform_avx(u32 *digest, const char *data, - u64 rounds); -#endif -#ifdef CONFIG_AS_AVX2 -asmlinkage void sha256_transform_rorx(u32 *digest, const char *data, - u64 rounds); -#endif -#ifdef CONFIG_AS_SHA256_NI -asmlinkage void sha256_ni_transform(u32 *digest, const char *data, - u64 rounds); /*unsigned int rounds);*/ -#endif +typedef void (sha256_transform_fn)(u32 *digest, const char *data, u64 rounds); -static void (*sha256_transform_asm)(u32 *, const char *, u64); - -static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) +static int sha256_update(struct shash_desc *desc, const u8 *data, + unsigned int len, sha256_transform_fn *sha256_xform) { struct sha256_state *sctx = shash_desc_ctx(desc); @@ -71,14 +58,14 @@ static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, kernel_fpu_begin(); sha256_base_do_update(desc, data, len, - (sha256_block_fn *)sha256_transform_asm); + (sha256_block_fn *)sha256_xform); kernel_fpu_end(); return 0; } -static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) +static int sha256_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out, sha256_transform_fn *sha256_xform) { if (!irq_fpu_usable()) return crypto_sha256_finup(desc, data, len, out); @@ -86,20 +73,32 @@ static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data, kernel_fpu_begin(); if (len) sha256_base_do_update(desc, data, len, - (sha256_block_fn *)sha256_transform_asm); - sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_transform_asm); + (sha256_block_fn *)sha256_xform); + sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_xform); kernel_fpu_end(); return sha256_base_finish(desc, out); } +static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha256_update(desc, data, len, sha256_transform_ssse3); +} + +static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha256_finup(desc, data, len, out, sha256_transform_ssse3); +} + /* Add padding and return the message digest. 
*/ static int sha256_ssse3_final(struct shash_desc *desc, u8 *out) { return sha256_ssse3_finup(desc, NULL, 0, out); } -static struct shash_alg algs[] = { { +static struct shash_alg sha256_ssse3_algs[] = { { .digestsize = SHA256_DIGEST_SIZE, .init = sha256_base_init, .update = sha256_ssse3_update, @@ -131,8 +130,75 @@ static struct shash_alg algs[] = { { } } }; +static int register_sha256_ssse3(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + return crypto_register_shashes(sha256_ssse3_algs, + ARRAY_SIZE(sha256_ssse3_algs)); + return 0; +} + +static void unregister_sha256_ssse3(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + crypto_unregister_shashes(sha256_ssse3_algs, + ARRAY_SIZE(sha256_ssse3_algs)); +} + #ifdef CONFIG_AS_AVX -static bool __init avx_usable(void) +asmlinkage void sha256_transform_avx(u32 *digest, const char *data, + u64 rounds); + +static int sha256_avx_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha256_update(desc, data, len, sha256_transform_avx); +} + +static int sha256_avx_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha256_finup(desc, data, len, out, sha256_transform_avx); +} + +static int sha256_avx_final(struct shash_desc *desc, u8 *out) +{ + return sha256_avx_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha256_avx_algs[] = { { + .digestsize = SHA256_DIGEST_SIZE, + .init = sha256_base_init, + .update = sha256_avx_update, + .final = sha256_avx_final, + .finup = sha256_avx_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha256", + .cra_driver_name = "sha256-avx", + .cra_priority = 160, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = sha224_base_init, + .update = sha256_avx_update, + .final = sha256_avx_final, + .finup = sha256_avx_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha224", + .cra_driver_name = "sha224-avx", + .cra_priority = 160, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA224_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; + +static bool avx_usable(void) { if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) { if (cpu_has_avx) @@ -142,51 +208,216 @@ static bool __init avx_usable(void) return true; } -#endif -static int __init sha256_ssse3_mod_init(void) +static int register_sha256_avx(void) { - char *algo; + if (avx_usable()) + return crypto_register_shashes(sha256_avx_algs, + ARRAY_SIZE(sha256_avx_algs)); + return 0; +} - /* test for SSSE3 first */ - if (cpu_has_ssse3) { - sha256_transform_asm = sha256_transform_ssse3; - algo = "SSSE3"; - } +static void unregister_sha256_avx(void) +{ + if (avx_usable()) + crypto_unregister_shashes(sha256_avx_algs, + ARRAY_SIZE(sha256_avx_algs)); +} -#ifdef CONFIG_AS_AVX - /* allow AVX to override SSSE3, it's a little faster */ - if (avx_usable()) { - sha256_transform_asm = sha256_transform_avx; - algo = "AVX"; -#ifdef CONFIG_AS_AVX2 - if (boot_cpu_has(X86_FEATURE_AVX2) && - boot_cpu_has(X86_FEATURE_BMI2)) { - sha256_transform_asm = sha256_transform_rorx; - algo = "AVX2"; - } +#else +static inline int register_sha256_avx(void) { return 0; } +static inline void unregister_sha256_avx(void) { } #endif + +#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX) +asmlinkage void sha256_transform_rorx(u32 *digest, const char *data, + u64 rounds); + +static int sha256_avx2_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + 
return sha256_update(desc, data, len, sha256_transform_rorx); +} + +static int sha256_avx2_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha256_finup(desc, data, len, out, sha256_transform_rorx); +} + +static int sha256_avx2_final(struct shash_desc *desc, u8 *out) +{ + return sha256_avx2_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha256_avx2_algs[] = { { + .digestsize = SHA256_DIGEST_SIZE, + .init = sha256_base_init, + .update = sha256_avx2_update, + .final = sha256_avx2_final, + .finup = sha256_avx2_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha256", + .cra_driver_name = "sha256-avx2", + .cra_priority = 170, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, } +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = sha224_base_init, + .update = sha256_avx2_update, + .final = sha256_avx2_final, + .finup = sha256_avx2_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha224", + .cra_driver_name = "sha224-avx2", + .cra_priority = 170, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA224_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; + +static bool avx2_usable(void) +{ + if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_BMI2)) + return true; + + return false; +} + +static int register_sha256_avx2(void) +{ + if (avx2_usable()) + return crypto_register_shashes(sha256_avx2_algs, + ARRAY_SIZE(sha256_avx2_algs)); + return 0; +} + +static void unregister_sha256_avx2(void) +{ + if (avx2_usable()) + crypto_unregister_shashes(sha256_avx2_algs, + ARRAY_SIZE(sha256_avx2_algs)); +} + +#else +static inline int register_sha256_avx2(void) { return 0; } +static inline void unregister_sha256_avx2(void) { } #endif + #ifdef CONFIG_AS_SHA256_NI - if (boot_cpu_has(X86_FEATURE_SHA_NI)) { - sha256_transform_asm = sha256_ni_transform; - algo = "SHA-256-NI"; +asmlinkage void sha256_ni_transform(u32 *digest, const char *data, + u64 rounds); /*unsigned int rounds);*/ + +static int sha256_ni_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha256_update(desc, data, len, sha256_ni_transform); +} + +static int sha256_ni_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha256_finup(desc, data, len, out, sha256_ni_transform); +} + +static int sha256_ni_final(struct shash_desc *desc, u8 *out) +{ + return sha256_ni_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha256_ni_algs[] = { { + .digestsize = SHA256_DIGEST_SIZE, + .init = sha256_base_init, + .update = sha256_ni_update, + .final = sha256_ni_final, + .finup = sha256_ni_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha256", + .cra_driver_name = "sha256-ni", + .cra_priority = 250, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = sha224_base_init, + .update = sha256_ni_update, + .final = sha256_ni_final, + .finup = sha256_ni_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha224", + .cra_driver_name = "sha224-ni", + .cra_priority = 250, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA224_BLOCK_SIZE, + .cra_module = THIS_MODULE, } +} }; + +static int register_sha256_ni(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) + return crypto_register_shashes(sha256_ni_algs, + ARRAY_SIZE(sha256_ni_algs)); + return 0; 
+} + +static void unregister_sha256_ni(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) + crypto_unregister_shashes(sha256_ni_algs, + ARRAY_SIZE(sha256_ni_algs)); +} + +#else +static inline int register_sha256_ni(void) { return 0; } +static inline void unregister_sha256_ni(void) { } #endif - if (sha256_transform_asm) { - pr_info("Using %s optimized SHA-256 implementation\n", algo); - return crypto_register_shashes(algs, ARRAY_SIZE(algs)); +static int __init sha256_ssse3_mod_init(void) +{ + if (register_sha256_ssse3()) + goto fail; + + if (register_sha256_avx()) { + unregister_sha256_ssse3(); + goto fail; } - pr_info("Neither AVX nor SSSE3/SHA-NI is available/usable.\n"); + if (register_sha256_avx2()) { + unregister_sha256_avx(); + unregister_sha256_ssse3(); + goto fail; + } + + if (register_sha256_ni()) { + unregister_sha256_avx2(); + unregister_sha256_avx(); + unregister_sha256_ssse3(); + goto fail; + } + + return 0; +fail: return -ENODEV; } static void __exit sha256_ssse3_mod_fini(void) { - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); + unregister_sha256_ni(); + unregister_sha256_avx2(); + unregister_sha256_avx(); + unregister_sha256_ssse3(); } module_init(sha256_ssse3_mod_init); -- cgit v1.2.3 From be6ec98ddb6749bba0fc7f67bd2f89a2396805de Mon Sep 17 00:00:00 2001 From: tim Date: Wed, 16 Sep 2015 16:35:53 -0700 Subject: crypto: x86/sha - Restructure x86 sha512 glue code to expose all the available sha512 transforms Restructure the x86 sha512 glue code so we will expose sha512 transforms based on SSSE3, AVX or AVX2 as separate individual drivers when the CPU provides support. This will make it easy for alternative algorithms to be used if desired and makes the code cleaner and easier to maintain. Signed-off-by: Tim Chen Signed-off-by: Herbert Xu --- arch/x86/crypto/sha512_ssse3_glue.c | 249 +++++++++++++++++++++++++++++------- 1 file changed, 204 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 2edad7b81870..0dfe9a2ba64b 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -41,19 +41,11 @@ asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data, u64 rounds); -#ifdef CONFIG_AS_AVX -asmlinkage void sha512_transform_avx(u64 *digest, const char *data, - u64 rounds); -#endif -#ifdef CONFIG_AS_AVX2 -asmlinkage void sha512_transform_rorx(u64 *digest, const char *data, - u64 rounds); -#endif -static void (*sha512_transform_asm)(u64 *, const char *, u64); +typedef void (sha512_transform_fn)(u64 *digest, const char *data, u64 rounds); -static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) +static int sha512_update(struct shash_desc *desc, const u8 *data, + unsigned int len, sha512_transform_fn *sha512_xform) { struct sha512_state *sctx = shash_desc_ctx(desc); @@ -66,14 +58,14 @@ static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, kernel_fpu_begin(); sha512_base_do_update(desc, data, len, - (sha512_block_fn *)sha512_transform_asm); + (sha512_block_fn *)sha512_xform); kernel_fpu_end(); return 0; } -static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) +static int sha512_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out, sha512_transform_fn *sha512_xform) { if (!irq_fpu_usable()) return crypto_sha512_finup(desc, data, len, out); @@ -81,20 +73,32 @@ static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
kernel_fpu_begin(); if (len) sha512_base_do_update(desc, data, len, - (sha512_block_fn *)sha512_transform_asm); - sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_transform_asm); + (sha512_block_fn *)sha512_xform); + sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_xform); kernel_fpu_end(); return sha512_base_finish(desc, out); } +static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha512_update(desc, data, len, sha512_transform_ssse3); +} + +static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha512_finup(desc, data, len, out, sha512_transform_ssse3); +} + /* Add padding and return the message digest. */ static int sha512_ssse3_final(struct shash_desc *desc, u8 *out) { return sha512_ssse3_finup(desc, NULL, 0, out); } -static struct shash_alg algs[] = { { +static struct shash_alg sha512_ssse3_algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_ssse3_update, @@ -126,8 +130,25 @@ static struct shash_alg algs[] = { { } } }; +static int register_sha512_ssse3(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + return crypto_register_shashes(sha512_ssse3_algs, + ARRAY_SIZE(sha512_ssse3_algs)); + return 0; +} + +static void unregister_sha512_ssse3(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + crypto_unregister_shashes(sha512_ssse3_algs, + ARRAY_SIZE(sha512_ssse3_algs)); +} + #ifdef CONFIG_AS_AVX -static bool __init avx_usable(void) +asmlinkage void sha512_transform_avx(u64 *digest, const char *data, + u64 rounds); +static bool avx_usable(void) { if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) { if (cpu_has_avx) @@ -137,47 +158,185 @@ static bool __init avx_usable(void) return true; } -#endif -static int __init sha512_ssse3_mod_init(void) +static int sha512_avx_update(struct shash_desc *desc, const u8 *data, + unsigned int len) { - /* test for SSSE3 first */ - if (cpu_has_ssse3) - sha512_transform_asm = sha512_transform_ssse3; + return sha512_update(desc, data, len, sha512_transform_avx); +} -#ifdef CONFIG_AS_AVX - /* allow AVX to override SSSE3, it's a little faster */ - if (avx_usable()) { -#ifdef CONFIG_AS_AVX2 - if (boot_cpu_has(X86_FEATURE_AVX2)) - sha512_transform_asm = sha512_transform_rorx; - else -#endif - sha512_transform_asm = sha512_transform_avx; +static int sha512_avx_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha512_finup(desc, data, len, out, sha512_transform_avx); +} + +/* Add padding and return the message digest. 
*/ +static int sha512_avx_final(struct shash_desc *desc, u8 *out) +{ + return sha512_avx_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha512_avx_algs[] = { { + .digestsize = SHA512_DIGEST_SIZE, + .init = sha512_base_init, + .update = sha512_avx_update, + .final = sha512_avx_final, + .finup = sha512_avx_finup, + .descsize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha512", + .cra_driver_name = "sha512-avx", + .cra_priority = 160, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA512_BLOCK_SIZE, + .cra_module = THIS_MODULE, } -#endif +}, { + .digestsize = SHA384_DIGEST_SIZE, + .init = sha384_base_init, + .update = sha512_avx_update, + .final = sha512_avx_final, + .finup = sha512_avx_finup, + .descsize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha384", + .cra_driver_name = "sha384-avx", + .cra_priority = 160, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA384_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; - if (sha512_transform_asm) { -#ifdef CONFIG_AS_AVX - if (sha512_transform_asm == sha512_transform_avx) - pr_info("Using AVX optimized SHA-512 implementation\n"); -#ifdef CONFIG_AS_AVX2 - else if (sha512_transform_asm == sha512_transform_rorx) - pr_info("Using AVX2 optimized SHA-512 implementation\n"); +static int register_sha512_avx(void) +{ + if (avx_usable()) + return crypto_register_shashes(sha512_avx_algs, + ARRAY_SIZE(sha512_avx_algs)); + return 0; +} + +static void unregister_sha512_avx(void) +{ + if (avx_usable()) + crypto_unregister_shashes(sha512_avx_algs, + ARRAY_SIZE(sha512_avx_algs)); +} +#else +static inline int register_sha512_avx(void) { return 0; } +static inline void unregister_sha512_avx(void) { } #endif - else + +#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX) +asmlinkage void sha512_transform_rorx(u64 *digest, const char *data, + u64 rounds); + +static int sha512_avx2_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha512_update(desc, data, len, sha512_transform_rorx); +} + +static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha512_finup(desc, data, len, out, sha512_transform_rorx); +} + +/* Add padding and return the message digest. 
*/ +static int sha512_avx2_final(struct shash_desc *desc, u8 *out) +{ + return sha512_avx2_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha512_avx2_algs[] = { { + .digestsize = SHA512_DIGEST_SIZE, + .init = sha512_base_init, + .update = sha512_avx2_update, + .final = sha512_avx2_final, + .finup = sha512_avx2_finup, + .descsize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha512", + .cra_driver_name = "sha512-avx2", + .cra_priority = 170, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA512_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}, { + .digestsize = SHA384_DIGEST_SIZE, + .init = sha384_base_init, + .update = sha512_avx2_update, + .final = sha512_avx2_final, + .finup = sha512_avx2_finup, + .descsize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha384", + .cra_driver_name = "sha384-avx2", + .cra_priority = 170, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA384_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; + +static bool avx2_usable(void) +{ + if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_BMI2)) + return true; + + return false; +} + +static int register_sha512_avx2(void) +{ + if (avx2_usable()) + return crypto_register_shashes(sha512_avx2_algs, + ARRAY_SIZE(sha512_avx2_algs)); + return 0; +} + +static void unregister_sha512_avx2(void) +{ + if (avx2_usable()) + crypto_unregister_shashes(sha512_avx2_algs, + ARRAY_SIZE(sha512_avx2_algs)); +} +#else +static inline int register_sha512_avx2(void) { return 0; } +static inline void unregister_sha512_avx2(void) { } #endif - pr_info("Using SSSE3 optimized SHA-512 implementation\n"); - return crypto_register_shashes(algs, ARRAY_SIZE(algs)); + +static int __init sha512_ssse3_mod_init(void) +{ + + if (register_sha512_ssse3()) + goto fail; + + if (register_sha512_avx()) { + unregister_sha512_ssse3(); + goto fail; } - pr_info("Neither AVX nor SSSE3 is available/usable.\n"); + if (register_sha512_avx2()) { + unregister_sha512_avx(); + unregister_sha512_ssse3(); + goto fail; + } + + return 0; +fail: return -ENODEV; } static void __exit sha512_ssse3_mod_fini(void) { - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); + unregister_sha512_avx2(); + unregister_sha512_avx(); + unregister_sha512_ssse3(); } module_init(sha512_ssse3_mod_init); -- cgit v1.2.3 From 97bce7e0b58dfc7d159ded329f57961868fb060b Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Sun, 20 Sep 2015 16:42:36 +0200 Subject: crypto: crc32c-pclmul - use .rodata instead of .rotata Module crc32c-intel uses a special read-only data section named .rotata. This section is defined for K_table, and its name seems to be a spelling mistake for .rodata. 
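For contrast, a standalone C illustration (hypothetical code, not from the patch) of the same placement issue: const-qualified tables land in .rodata automatically, while an explicit section attribute with a misspelled name is accepted silently and produces a separately named section, which is how a typo like this goes unnoticed:

#include <stdint.h>

/* emitted into .rodata by the compiler, like the fixed K_table */
static const uint32_t k_table[2] = { 0x493c7d27, 0x00000001 };

/* a typoed section name draws no diagnostic from the toolchain */
static const uint32_t k_table_typo[2]
	__attribute__((section(".rotata"))) = { 0x493c7d27, 0x00000001 };

int main(void)
{
	/* reference both tables so they are kept */
	return (int)(k_table[0] ^ k_table_typo[0]);
}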
Fixes: 473946e674eb ("crypto: crc32c-pclmul - Shrink K_table to 32-bit words") Signed-off-by: Nicolas Iooss Signed-off-by: Herbert Xu --- arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 225be06edc80..4fe27e074194 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -330,7 +330,7 @@ ENDPROC(crc_pcl) ## PCLMULQDQ tables ## Table is 128 entries x 2 words (8 bytes) each ################################################################ -.section .rotata, "a", %progbits +.section .rodata, "a", %progbits .align 8 K_table: .long 0x493c7d27, 0x00000001 -- cgit v1.2.3 From fb535ccb30845fe0b7bd09caa37a838985b72ff9 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 17 Sep 2015 12:24:14 -0600 Subject: x86/vdso32: Define PGTABLE_LEVELS to 32bit VDSO In case of CONFIG_X86_64, vdso32/vclock_gettime.c fakes a 32-bit non-PAE kernel configuration by re-defining it to CONFIG_X86_32. However, it does not re-define CONFIG_PGTABLE_LEVELS leaving it as 4 levels. This mismatch leads <asm/pgtable_types.h> to NOT include <asm-generic/pgtable-nopud.h> and <asm-generic/pgtable-nopmd.h>, which will cause compile errors when a later patch enhances <asm/pgtable_types.h> to use PUD_SHIFT and PMD_SHIFT. These -nopud & -nopmd headers define these SHIFTs for the 32-bit non-PAE kernel. Fix it by re-defining CONFIG_PGTABLE_LEVELS to 2 levels. Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: Juergen Gross Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: Konrad Wilk Cc: Robert Elliot Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1442514264-12475-2-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/entry/vdso/vdso32/vclock_gettime.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c index 175cc72c0f68..87a86e017f0e 100644 --- a/arch/x86/entry/vdso/vdso32/vclock_gettime.c +++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c @@ -14,11 +14,13 @@ */ #undef CONFIG_64BIT #undef CONFIG_X86_64 +#undef CONFIG_PGTABLE_LEVELS #undef CONFIG_ILLEGAL_POINTER_VALUE #undef CONFIG_SPARSEMEM_VMEMMAP #undef CONFIG_NR_CPUS #define CONFIG_X86_32 1 +#define CONFIG_PGTABLE_LEVELS 2 #define CONFIG_PAGE_OFFSET 0 #define CONFIG_ILLEGAL_POINTER_VALUE 0 #define CONFIG_NR_CPUS 1 -- cgit v1.2.3 From 832102671855f73962e7a04fdafd48b9385ea5c6 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 17 Sep 2015 12:24:15 -0600 Subject: x86/asm: Move PUD_PAGE macros to page_types.h PUD_SHIFT is defined according to a given kernel configuration, which allows it to be commonly used by any x86 kernels. However, PUD_PAGE_SIZE and PUD_PAGE_MASK, which are set from PUD_SHIFT, are defined in page_64_types.h, which can be used by the 64-bit kernel only. Move PUD_PAGE_SIZE and PUD_PAGE_MASK to page_types.h so that they can be used by any x86 kernels as well. Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: Juergen Gross Cc: H.
Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: Konrad Wilk Cc: Robert Elliot Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1442514264-12475-3-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/page_64_types.h | 3 --- arch/x86/include/asm/page_types.h | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 4edd53b79a81..4928cf0d5af0 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -26,9 +26,6 @@ #define MCE_STACK 4 #define N_EXCEPTION_STACKS 4 /* hw limit: 7 */ -#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) -#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) - /* * Set __PAGE_OFFSET to the most negative possible address + * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index c7c712f2648b..c5b7fb2774d0 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -20,6 +20,9 @@ #define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) #define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) +#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) +#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) + #define HPAGE_SHIFT PMD_SHIFT #define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) -- cgit v1.2.3 From 4be4c1fb9a754b100466ebaec50f825be0b2050b Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 17 Sep 2015 12:24:16 -0600 Subject: x86/asm: Add pud/pmd mask interfaces to handle large PAT bit The PAT bit gets relocated to bit 12 when PUD and PMD mappings are used. This bit 12, however, is not covered by PTE_FLAGS_MASK, which is used for masking pfn and flags for all levels. Add pud/pmd mask interfaces to handle pfn and flags properly by using P?D_PAGE_MASK when PUD/PMD mappings are used, i.e. PSE bit is set. Suggested-by: Juergen Gross Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: Juergen Gross Cc: H. 
Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: Konrad Wilk Cc: Robert Elliot Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1442514264-12475-4-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/pgtable_types.h | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 13f310bfc09a..7c8f79780038 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -209,10 +209,10 @@ enum page_cache_mode { #include -/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */ +/* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */ #define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) -/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */ +/* Extracts the flags from a (pte|pmd|pud|pgd)val_t of a 4KB page */ #define PTE_FLAGS_MASK (~PTE_PFN_MASK) typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; @@ -276,11 +276,43 @@ static inline pmdval_t native_pmd_val(pmd_t pmd) } #endif +static inline pudval_t pud_pfn_mask(pud_t pud) +{ + if (native_pud_val(pud) & _PAGE_PSE) + return PUD_PAGE_MASK & PHYSICAL_PAGE_MASK; + else + return PTE_PFN_MASK; +} + +static inline pudval_t pud_flags_mask(pud_t pud) +{ + if (native_pud_val(pud) & _PAGE_PSE) + return ~(PUD_PAGE_MASK & (pudval_t)PHYSICAL_PAGE_MASK); + else + return ~PTE_PFN_MASK; +} + static inline pudval_t pud_flags(pud_t pud) { return native_pud_val(pud) & PTE_FLAGS_MASK; } +static inline pmdval_t pmd_pfn_mask(pmd_t pmd) +{ + if (native_pmd_val(pmd) & _PAGE_PSE) + return PMD_PAGE_MASK & PHYSICAL_PAGE_MASK; + else + return PTE_PFN_MASK; +} + +static inline pmdval_t pmd_flags_mask(pmd_t pmd) +{ + if (native_pmd_val(pmd) & _PAGE_PSE) + return ~(PMD_PAGE_MASK & (pmdval_t)PHYSICAL_PAGE_MASK); + else + return ~PTE_PFN_MASK; +} + static inline pmdval_t pmd_flags(pmd_t pmd) { return native_pmd_val(pmd) & PTE_FLAGS_MASK; -- cgit v1.2.3 From f70abb0fc3da1b2945c92751ccda2744081bf2b7 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 17 Sep 2015 12:24:17 -0600 Subject: x86/asm: Fix pud/pmd interfaces to handle large PAT bit Now that we have pud/pmd mask interfaces, which handle the pfn & flags masks properly for the large PAT bit, fix the pud/pmd pfn & flags interfaces by replacing PTE_PFN_MASK and PTE_FLAGS_MASK with the pud/pmd mask interfaces. Suggested-by: Juergen Gross Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: Juergen Gross Cc: H.
Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: Konrad Wilk Cc: Robert Elliot Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1442514264-12475-5-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/pgtable.h | 14 ++++++++------ arch/x86/include/asm/pgtable_types.h | 4 ++-- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 867da5bbb4a3..0733ec7c8036 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -142,12 +142,12 @@ static inline unsigned long pte_pfn(pte_t pte) static inline unsigned long pmd_pfn(pmd_t pmd) { - return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; + return (pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; } static inline unsigned long pud_pfn(pud_t pud) { - return (pud_val(pud) & PTE_PFN_MASK) >> PAGE_SHIFT; + return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT; } #define pte_page(pte) pfn_to_page(pte_pfn(pte)) @@ -502,14 +502,15 @@ static inline int pmd_none(pmd_t pmd) static inline unsigned long pmd_page_vaddr(pmd_t pmd) { - return (unsigned long)__va(pmd_val(pmd) & PTE_PFN_MASK); + return (unsigned long)__va(pmd_val(pmd) & pmd_pfn_mask(pmd)); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT) +#define pmd_page(pmd) \ + pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT) /* * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] @@ -570,14 +571,15 @@ static inline int pud_present(pud_t pud) static inline unsigned long pud_page_vaddr(pud_t pud) { - return (unsigned long)__va((unsigned long)pud_val(pud) & PTE_PFN_MASK); + return (unsigned long)__va(pud_val(pud) & pud_pfn_mask(pud)); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT) +#define pud_page(pud) \ + pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT) /* Find an entry in the second-level page table.. */ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 7c8f79780038..dd5b0aa9dd2f 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -294,7 +294,7 @@ static inline pudval_t pud_flags_mask(pud_t pud) static inline pudval_t pud_flags(pud_t pud) { - return native_pud_val(pud) & PTE_FLAGS_MASK; + return native_pud_val(pud) & pud_flags_mask(pud); } static inline pmdval_t pmd_pfn_mask(pmd_t pmd) @@ -315,7 +315,7 @@ static inline pmdval_t pmd_flags_mask(pmd_t pmd) static inline pmdval_t pmd_flags(pmd_t pmd) { - return native_pmd_val(pmd) & PTE_FLAGS_MASK; + return native_pmd_val(pmd) & pmd_flags_mask(pmd); } static inline pte_t native_make_pte(pteval_t val) -- cgit v1.2.3 From bbac8c6deadab921f4b7d00ce675ffa4f358ec7f Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 17 Sep 2015 12:24:18 -0600 Subject: x86/asm: Add pud_pgprot() and pmd_pgprot() pte_pgprot() returns a pgprot_t value by calling pte_flags(). Now that pud_flags() and pmd_flags() work specifically for the pud/pmd levels, define pud_pgprot() and pmd_pgprot() for PUD/PMD. Also update pte_pgprot() to remove the unnecessary mask with PTE_FLAGS_MASK as pte_flags() takes care of it. 
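A worked example of what the level-aware masks fix (standalone C; the 46-bit physical-address width and the bit positions are hard-coded here for illustration): in a 2MB PMD entry the PSE bit occupies bit 7, so the PAT bit moves to bit 12, exactly where a 4KB-page pfn mask starts, and a 4KB-style flags mask therefore drops it:

#include <stdio.h>
#include <stdint.h>

#define _PAGE_PSE	(1ULL << 7)	/* large-page bit */
#define _PAGE_PAT_LARGE	(1ULL << 12)	/* PAT bit in 2MB/1GB entries */

/* pfn masks for an example 46-bit physical address width */
#define PTE_PFN_MASK	(((1ULL << 46) - 1) & ~((1ULL << 12) - 1))	/* bits 12..45 */
#define PMD_PFN_MASK	(PTE_PFN_MASK & ~((1ULL << 21) - 1))		/* bits 21..45 */

int main(void)
{
	/* a 2MB entry: pfn bits, PSE, large PAT, present/rw/accessed/dirty */
	uint64_t pmd = 0x40000000ULL | _PAGE_PSE | _PAGE_PAT_LARGE | 0x63;

	printf("4KB-style flags: %#llx (PAT dropped)\n",
	       (unsigned long long)(pmd & ~PTE_PFN_MASK));	/* 0xe3 */
	printf("2MB-aware flags: %#llx (PAT kept)\n",
	       (unsigned long long)(pmd & ~PMD_PFN_MASK));	/* 0x10e3 */
	return 0;
}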
Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: Juergen Gross Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: Konrad Wilk Cc: Robert Elliot Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1442514264-12475-6-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/pgtable.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 0733ec7c8036..59fc3414c68b 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -379,7 +379,9 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) return __pgprot(preservebits | addbits); } -#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK) +#define pte_pgprot(x) __pgprot(pte_flags(x)) +#define pmd_pgprot(x) __pgprot(pmd_flags(x)) +#define pud_pgprot(x) __pgprot(pud_flags(x)) #define canon_pgprot(p) __pgprot(massage_pgprot(p)) -- cgit v1.2.3 From da25e628c4c231a281b1c1de3168a36ab9bfe473 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 17 Sep 2015 12:24:19 -0600 Subject: x86/mm: Fix page table dump to show PAT bit /sys/kernel/debug/kernel_page_tables does not show the PAT bit for PUD/PMD mappings. This is because walk_pud_level(), walk_pmd_level() and note_page() mask the flags with PTE_FLAGS_MASK, which does not cover their PAT bit, _PAGE_PAT_LARGE. Fix it by replacing the use of PTE_FLAGS_MASK with p?d_flags(), which masks the flags properly. Also change to show the PAT bit as "PAT" to be consistent with other bits. Reported-by: Robert Elliott Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: Juergen Gross Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: Konrad Wilk Cc: Robert Elliot Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1442514264-12475-7-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/dump_pagetables.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index f0cedf3395af..71ab2d741024 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -155,7 +155,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) pt_dump_cont_printf(m, dmsg, " "); if ((level == 4 && pr & _PAGE_PAT) || ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE)) - pt_dump_cont_printf(m, dmsg, "pat "); + pt_dump_cont_printf(m, dmsg, "PAT "); else pt_dump_cont_printf(m, dmsg, " "); if (pr & _PAGE_GLOBAL) @@ -198,8 +198,8 @@ static void note_page(struct seq_file *m, struct pg_state *st, * we have now. "break" is either changing perms, levels or * address space marker. 
*/ - prot = pgprot_val(new_prot) & PTE_FLAGS_MASK; - cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK; + prot = pgprot_val(new_prot); + cur = pgprot_val(st->current_prot); if (!st->level) { /* First entry */ @@ -269,13 +269,13 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, { int i; pte_t *start; + pgprotval_t prot; start = (pte_t *) pmd_page_vaddr(addr); for (i = 0; i < PTRS_PER_PTE; i++) { - pgprot_t prot = pte_pgprot(*start); - + prot = pte_flags(*start); st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); - note_page(m, st, prot, 4); + note_page(m, st, __pgprot(prot), 4); start++; } } @@ -287,18 +287,19 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, { int i; pmd_t *start; + pgprotval_t prot; start = (pmd_t *) pud_page_vaddr(addr); for (i = 0; i < PTRS_PER_PMD; i++) { st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); if (!pmd_none(*start)) { - pgprotval_t prot = pmd_val(*start) & PTE_FLAGS_MASK; - - if (pmd_large(*start) || !pmd_present(*start)) + if (pmd_large(*start) || !pmd_present(*start)) { + prot = pmd_flags(*start); note_page(m, st, __pgprot(prot), 3); - else + } else { walk_pte_level(m, st, *start, P + i * PMD_LEVEL_MULT); + } } else note_page(m, st, __pgprot(0), 3); start++; @@ -318,19 +319,20 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, { int i; pud_t *start; + pgprotval_t prot; start = (pud_t *) pgd_page_vaddr(addr); for (i = 0; i < PTRS_PER_PUD; i++) { st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); if (!pud_none(*start)) { - pgprotval_t prot = pud_val(*start) & PTE_FLAGS_MASK; - - if (pud_large(*start) || !pud_present(*start)) + if (pud_large(*start) || !pud_present(*start)) { + prot = pud_flags(*start); note_page(m, st, __pgprot(prot), 2); - else + } else { walk_pmd_level(m, st, *start, P + i * PUD_LEVEL_MULT); + } } else note_page(m, st, __pgprot(0), 2); @@ -351,6 +353,7 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) #else pgd_t *start = swapper_pg_dir; #endif + pgprotval_t prot; int i; struct pg_state st = {}; @@ -362,13 +365,13 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) for (i = 0; i < PTRS_PER_PGD; i++) { st.current_address = normalize_addr(i * PGD_LEVEL_MULT); if (!pgd_none(*start)) { - pgprotval_t prot = pgd_val(*start) & PTE_FLAGS_MASK; - - if (pgd_large(*start) || !pgd_present(*start)) + if (pgd_large(*start) || !pgd_present(*start)) { + prot = pgd_flags(*start); note_page(m, &st, __pgprot(prot), 1); - else + } else { walk_pud_level(m, &st, *start, i * PGD_LEVEL_MULT); + } } else note_page(m, &st, __pgprot(0), 1); -- cgit v1.2.3 From 34437e67a6727885bdf6cbfd8441b1ac43a1ee65 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 17 Sep 2015 12:24:20 -0600 Subject: x86/mm: Fix slow_virt_to_phys() to handle large PAT bit slow_virt_to_phys() calls lookup_address() to obtain *pte and its level. It then calls pte_pfn() to obtain a physical address for any level. However, this physical address is not correct when the large PAT bit is set because pte_pfn() does not mask the large PAT bit properly for PUD/PMD. Fix slow_virt_to_phys() to use pud_pfn() and pmd_pfn() for 1GB and 2MB mapping levels. Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: Juergen Gross Cc: H. 
Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: Konrad Wilk Cc: Robert Elliot Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1442514264-12475-8-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 2c44c0792301..1441368a7931 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -414,18 +414,28 @@ pmd_t *lookup_pmd_address(unsigned long address) phys_addr_t slow_virt_to_phys(void *__virt_addr) { unsigned long virt_addr = (unsigned long)__virt_addr; - phys_addr_t phys_addr; - unsigned long offset; + unsigned long phys_addr, offset; enum pg_level level; - unsigned long pmask; pte_t *pte; pte = lookup_address(virt_addr, &level); BUG_ON(!pte); - pmask = page_level_mask(level); - offset = virt_addr & ~pmask; - phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; - return (phys_addr | offset); + + switch (level) { + case PG_LEVEL_1G: + phys_addr = pud_pfn(*(pud_t *)pte) << PAGE_SHIFT; + offset = virt_addr & ~PUD_PAGE_MASK; + break; + case PG_LEVEL_2M: + phys_addr = pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT; + offset = virt_addr & ~PMD_PAGE_MASK; + break; + default: + phys_addr = pte_pfn(*pte) << PAGE_SHIFT; + offset = virt_addr & ~PAGE_MASK; + } + + return (phys_addr_t)(phys_addr | offset); } EXPORT_SYMBOL_GPL(slow_virt_to_phys); -- cgit v1.2.3 From daf3e35c5888e8bd6a2f5ed15ed392b2df362ecf Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 17 Sep 2015 12:24:21 -0600 Subject: x86/mm: Fix gup_huge_p?d() to handle large PAT bit gup_huge_pud() and gup_huge_pmd() cast *pud and *pmd to *pte, and use pte_xxx() interfaces to obtain the flags and PFN. However, the pte_xxx() interface does not handle the large PAT bit properly for PUD/PMD. Fix gup_huge_pud() and gup_huge_pmd() to use pud_xxx() and pmd_xxx() interfaces according to their type. Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: Juergen Gross Cc: H. 
Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: Konrad Wilk Cc: Robert Elliot Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1442514264-12475-9-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/gup.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 81bf3d2af3eb..ae9a37bf1371 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -118,21 +118,20 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long mask; - pte_t pte = *(pte_t *)&pmd; struct page *head, *page; int refs; mask = _PAGE_PRESENT|_PAGE_USER; if (write) mask |= _PAGE_RW; - if ((pte_flags(pte) & mask) != mask) + if ((pmd_flags(pmd) & mask) != mask) return 0; /* hugepages are never "special" */ - VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL); - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL); + VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); refs = 0; - head = pte_page(pte); + head = pmd_page(pmd); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); do { VM_BUG_ON_PAGE(compound_head(page) != head, page); @@ -195,21 +194,20 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long mask; - pte_t pte = *(pte_t *)&pud; struct page *head, *page; int refs; mask = _PAGE_PRESENT|_PAGE_USER; if (write) mask |= _PAGE_RW; - if ((pte_flags(pte) & mask) != mask) + if ((pud_flags(pud) & mask) != mask) return 0; /* hugepages are never "special" */ - VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL); - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL); + VM_BUG_ON(!pfn_valid(pud_pfn(pud))); refs = 0; - head = pte_page(pte); + head = pud_page(pud); page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); do { VM_BUG_ON_PAGE(compound_head(page) != head, page); -- cgit v1.2.3 From 3a19109efbfa7d887996a74257556a46e00525c2 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 17 Sep 2015 12:24:22 -0600 Subject: x86/mm: Fix try_preserve_large_page() to handle large PAT bit try_preserve_large_page() is called from __change_page_attr() to change the mapping attribute of a given large page. This function uses pte_pfn() and pte_pgprot() for PUD/PMD, which do not handle the large PAT bit properly. Fix try_preserve_large_page() by using the corresponding pud/pmd prot/pfn interfaces. Also remove '#ifdef CONFIG_X86_64', which is not necessary. Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: Juergen Gross Cc: H. 
Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: Konrad Wilk Cc: Robert Elliot Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1442514264-12475-10-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1441368a7931..3f612713159a 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -468,7 +468,7 @@ static int try_preserve_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { - unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn; + unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn; pte_t new_pte, old_pte, *tmp; pgprot_t old_prot, new_prot, req_prot; int i, do_split = 1; @@ -488,17 +488,21 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, switch (level) { case PG_LEVEL_2M: -#ifdef CONFIG_X86_64 + old_prot = pmd_pgprot(*(pmd_t *)kpte); + old_pfn = pmd_pfn(*(pmd_t *)kpte); + break; case PG_LEVEL_1G: -#endif - psize = page_level_size(level); - pmask = page_level_mask(level); + old_prot = pud_pgprot(*(pud_t *)kpte); + old_pfn = pud_pfn(*(pud_t *)kpte); break; default: do_split = -EINVAL; goto out_unlock; } + psize = page_level_size(level); + pmask = page_level_mask(level); + /* * Calculate the number of pages, which fit into this large * page starting at address: @@ -514,7 +518,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * up accordingly. */ old_pte = *kpte; - old_prot = req_prot = pgprot_large_2_4k(pte_pgprot(old_pte)); + old_prot = req_prot = pgprot_large_2_4k(old_prot); pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); @@ -540,10 +544,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, req_prot = canon_pgprot(req_prot); /* - * old_pte points to the large page base address. So we need + * old_pfn points to the large page base pfn. So we need * to add the offset of the virtual address: */ - pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); + pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT); cpa->pfn = pfn; new_prot = static_protections(req_prot, address, pfn); @@ -554,7 +558,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * the pages in the range we try to preserve: */ addr = address & pmask; - pfn = pte_pfn(old_pte); + pfn = old_pfn; for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { pgprot_t chk_prot = static_protections(req_prot, addr, pfn); @@ -584,7 +588,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * The address is aligned and the number of pages * covers the full page. */ - new_pte = pfn_pte(pte_pfn(old_pte), new_prot); + new_pte = pfn_pte(old_pfn, new_prot); __set_pmd_pte(kpte, address, new_pte); cpa->flags |= CPA_FLUSHTLB; do_split = 0; -- cgit v1.2.3 From d551aaa2f7e1387fa66093ce9914c2e91f283a50 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 17 Sep 2015 12:24:23 -0600 Subject: x86/mm: Fix __split_large_page() to handle large PAT bit __split_large_page() is called from __change_page_attr() to change the mapping attribute by splitting a given large page into smaller pages. This function uses pte_pfn() and pte_pgprot() for PUD/PMD, which do not handle the large PAT bit properly. Fix __split_large_page() by using the corresponding pud/pmd pfn/ pgprot interfaces. Also remove '#ifdef CONFIG_X86_64', which is not necessary. 
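The pfn arithmetic behind the split, reduced to a standalone sketch (the base address is illustrative; the increments mirror the pfninc logic in the patch): a 2MB page splits into 512 4KB PTEs whose pfns advance by 1, while a 1GB page splits into 512 2MB PMDs whose pfns advance by PMD_PAGE_SIZE >> PAGE_SHIFT:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21

int main(void)
{
	uint64_t ref_pfn = 0x40000000ULL >> PAGE_SHIFT;	/* base pfn of the large page */
	uint64_t pfninc_2m = 1;					/* 2MB -> 512 x 4KB PTEs */
	uint64_t pfninc_1g = 1ULL << (PMD_SHIFT - PAGE_SHIFT);	/* 1GB -> 512 x 2MB PMDs */

	for (int i = 0; i < 3; i++)	/* first few of the 512 new entries */
		printf("pte[%d] pfn=%#llx   pmd[%d] pfn=%#llx\n",
		       i, (unsigned long long)(ref_pfn + i * pfninc_2m),
		       i, (unsigned long long)(ref_pfn + i * pfninc_1g));
	return 0;
}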
Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: Juergen Gross Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: Konrad Wilk Cc: Robert Elliot Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1442514264-12475-11-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 3f612713159a..ab9465b9160f 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -605,7 +605,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, struct page *base) { pte_t *pbase = (pte_t *)page_address(base); - unsigned long pfn, pfninc = 1; + unsigned long ref_pfn, pfn, pfninc = 1; unsigned int i, level; pte_t *tmp; pgprot_t ref_prot; @@ -622,26 +622,33 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, } paravirt_alloc_pte(&init_mm, page_to_pfn(base)); - ref_prot = pte_pgprot(pte_clrhuge(*kpte)); - /* promote PAT bit to correct position */ - if (level == PG_LEVEL_2M) + switch (level) { + case PG_LEVEL_2M: + ref_prot = pmd_pgprot(*(pmd_t *)kpte); + /* clear PSE and promote PAT bit to correct position */ ref_prot = pgprot_large_2_4k(ref_prot); + ref_pfn = pmd_pfn(*(pmd_t *)kpte); + break; -#ifdef CONFIG_X86_64 - if (level == PG_LEVEL_1G) { + case PG_LEVEL_1G: + ref_prot = pud_pgprot(*(pud_t *)kpte); + ref_pfn = pud_pfn(*(pud_t *)kpte); pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; + /* - * Set the PSE flags only if the PRESENT flag is set + * Clear the PSE flags if the PRESENT flag is not set * otherwise pmd_present/pmd_huge will return true * even on a non present pmd. */ - if (pgprot_val(ref_prot) & _PAGE_PRESENT) - pgprot_val(ref_prot) |= _PAGE_PSE; - else + if (!(pgprot_val(ref_prot) & _PAGE_PRESENT)) pgprot_val(ref_prot) &= ~_PAGE_PSE; + break; + + default: + spin_unlock(&pgd_lock); + return 1; } -#endif /* * Set the GLOBAL flags only if the PRESENT flag is set @@ -657,7 +664,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, /* * Get the target pfn from the original entry: */ - pfn = pte_pfn(*kpte); + pfn = ref_pfn; for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot))); -- cgit v1.2.3 From 55696b1f664e52b3036f21631f9c2247b667f587 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 17 Sep 2015 12:24:24 -0600 Subject: x86/mm: Fix no-change case in try_preserve_large_page() try_preserve_large_page() checks if new_prot is the same as old_prot. If so, it simply sets do_split to 0, and returns with no-operation. However, old_prot is set as a 4KB pgprot value while new_prot is a large page pgprot value. Now that old_prot is initially set from p?d_pgprot() as a large page pgprot value, fix it by not overwriting old_prot with a 4KB pgprot value. Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: Juergen Gross Cc: H. 
Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: Konrad Wilk Cc: Robert Elliot Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1442514264-12475-12-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index ab9465b9160f..e2621a8e8213 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -518,7 +518,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * up accordingly. */ old_pte = *kpte; - old_prot = req_prot = pgprot_large_2_4k(old_prot); + req_prot = pgprot_large_2_4k(old_prot); pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); -- cgit v1.2.3 From db1003a719d75cebe5843a7906c02c29bec9922c Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Mon, 21 Sep 2015 01:01:59 +0800 Subject: x86/numachip: Cleanup Numachip support Drop unused code and includes in Numachip header files and APIC driver. Additionally, use the 'numachip1' prefix on Numachip1-specific functions; this prepares for adding Numachip2 support in later patches. Signed-off-by: Daniel J Blueman Acked-by: Steffen Persvold Cc: Daniel Lezcano Link: http://lkml.kernel.org/r/1442768522-19217-1-git-send-email-daniel@numascale.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/numachip/numachip_csr.h | 118 +-------------------------- arch/x86/kernel/apic/apic_numachip.c | 104 ++++++++++------------- 2 files changed, 44 insertions(+), 178 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/numachip/numachip_csr.h b/arch/x86/include/asm/numachip/numachip_csr.h index 660f843df928..7469b13a9cfa 100644 --- a/arch/x86/include/asm/numachip/numachip_csr.h +++ b/arch/x86/include/asm/numachip/numachip_csr.h @@ -14,12 +14,7 @@ #ifndef _ASM_X86_NUMACHIP_NUMACHIP_CSR_H #define _ASM_X86_NUMACHIP_NUMACHIP_CSR_H -#include -#include #include -#include -#include -#include #define CSR_NODE_SHIFT 16 #define CSR_NODE_BITS(p) (((unsigned long)(p)) << CSR_NODE_SHIFT) @@ -27,11 +22,8 @@ /* 32K CSR space, b15 indicates geo/non-geo */ #define CSR_OFFSET_MASK 0x7fffUL - -/* Global CSR space covers all 4K possible nodes with 64K CSR space per node */ -#define NUMACHIP_GCSR_BASE 0x3fff00000000ULL -#define NUMACHIP_GCSR_LIM 0x3fff0fffffffULL -#define NUMACHIP_GCSR_SIZE (NUMACHIP_GCSR_LIM - NUMACHIP_GCSR_BASE + 1) +#define CSR_G0_NODE_IDS (0x008 + (0 << 12)) +#define CSR_G3_EXT_IRQ_GEN (0x030 + (3 << 12)) /* * Local CSR space starts in global CSR space with "nodeid" = 0xfff0, however @@ -42,28 +34,12 @@ #define NUMACHIP_LCSR_LIM 0x3fffffffffffULL #define NUMACHIP_LCSR_SIZE (NUMACHIP_LCSR_LIM - NUMACHIP_LCSR_BASE + 1) -static inline void *gcsr_address(int node, unsigned long offset) -{ - return __va(NUMACHIP_GCSR_BASE | (1UL << 15) | - CSR_NODE_BITS(node & CSR_NODE_MASK) | (offset & CSR_OFFSET_MASK)); -} - static inline void *lcsr_address(unsigned long offset) { return __va(NUMACHIP_LCSR_BASE | (1UL << 15) | CSR_NODE_BITS(0xfff0) | (offset & CSR_OFFSET_MASK)); } -static inline unsigned int read_gcsr(int node, unsigned long offset) -{ - return swab32(readl(gcsr_address(node, offset))); -} - -static inline void write_gcsr(int node, unsigned long offset, unsigned int val) -{ - writel(swab32(val), gcsr_address(node, offset)); -} - static inline unsigned int read_lcsr(unsigned long offset) { return swab32(readl(lcsr_address(offset))); @@ -74,94 +50,4 @@ static inline void 
write_lcsr(unsigned long offset, unsigned int val) writel(swab32(val), lcsr_address(offset)); } -/* ========================================================================= */ -/* CSR_G0_STATE_CLEAR */ -/* ========================================================================= */ - -#define CSR_G0_STATE_CLEAR (0x000 + (0 << 12)) -union numachip_csr_g0_state_clear { - unsigned int v; - struct numachip_csr_g0_state_clear_s { - unsigned int _state:2; - unsigned int _rsvd_2_6:5; - unsigned int _lost:1; - unsigned int _rsvd_8_31:24; - } s; -}; - -/* ========================================================================= */ -/* CSR_G0_NODE_IDS */ -/* ========================================================================= */ - -#define CSR_G0_NODE_IDS (0x008 + (0 << 12)) -union numachip_csr_g0_node_ids { - unsigned int v; - struct numachip_csr_g0_node_ids_s { - unsigned int _initialid:16; - unsigned int _nodeid:12; - unsigned int _rsvd_28_31:4; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_EXT_IRQ_GEN */ -/* ========================================================================= */ - -#define CSR_G3_EXT_IRQ_GEN (0x030 + (3 << 12)) -union numachip_csr_g3_ext_irq_gen { - unsigned int v; - struct numachip_csr_g3_ext_irq_gen_s { - unsigned int _vector:8; - unsigned int _msgtype:3; - unsigned int _index:5; - unsigned int _destination_apic_id:16; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_EXT_IRQ_STATUS */ -/* ========================================================================= */ - -#define CSR_G3_EXT_IRQ_STATUS (0x034 + (3 << 12)) -union numachip_csr_g3_ext_irq_status { - unsigned int v; - struct numachip_csr_g3_ext_irq_status_s { - unsigned int _result:32; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_EXT_IRQ_DEST */ -/* ========================================================================= */ - -#define CSR_G3_EXT_IRQ_DEST (0x038 + (3 << 12)) -union numachip_csr_g3_ext_irq_dest { - unsigned int v; - struct numachip_csr_g3_ext_irq_dest_s { - unsigned int _irq:8; - unsigned int _rsvd_8_31:24; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_NC_ATT_MAP_SELECT */ -/* ========================================================================= */ - -#define CSR_G3_NC_ATT_MAP_SELECT (0x7fc + (3 << 12)) -union numachip_csr_g3_nc_att_map_select { - unsigned int v; - struct numachip_csr_g3_nc_att_map_select_s { - unsigned int _upper_address_bits:4; - unsigned int _select_ram:4; - unsigned int _rsvd_8_31:24; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_NC_ATT_MAP_SELECT_0-255 */ -/* ========================================================================= */ - -#define CSR_G3_NC_ATT_MAP_SELECT_0 (0x800 + (3 << 12)) - #endif /* _ASM_X86_NUMACHIP_NUMACHIP_CSR_H */ - diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index b548fd3b764b..eeefbb11ec79 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -11,30 +11,20 @@ * */ -#include -#include -#include -#include -#include -#include -#include #include -#include -#include #include #include -#include -#include #include #include #include +#include -static int numachip_system __read_mostly; +u8 numachip_system __read_mostly; +static const struct apic apic_numachip1; +static void 
(*numachip_apic_icr_write)(int apicid, unsigned int val) __read_mostly; -static const struct apic apic_numachip; - -static unsigned int get_apic_id(unsigned long x) +static unsigned int numachip1_get_apic_id(unsigned long x) { unsigned long value; unsigned int id = (x >> 24) & 0xff; @@ -47,7 +37,7 @@ static unsigned int get_apic_id(unsigned long x) return id; } -static unsigned long set_apic_id(unsigned int id) +static unsigned long numachip1_set_apic_id(unsigned int id) { unsigned long x; @@ -55,11 +45,6 @@ static unsigned long set_apic_id(unsigned int id) return x; } -static unsigned int read_xapic_id(void) -{ - return get_apic_id(apic_read(APIC_ID)); -} - static int numachip_apic_id_valid(int apicid) { /* Trust what bootloader passes in MADT */ @@ -68,7 +53,7 @@ static int numachip_apic_id_valid(int apicid) static int numachip_apic_id_registered(void) { - return physid_isset(read_xapic_id(), phys_cpu_present_map); + return 1; } static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) @@ -76,36 +61,27 @@ static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) return initial_apic_id >> index_msb; } -static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) +static void numachip1_apic_icr_write(int apicid, unsigned int val) { - union numachip_csr_g3_ext_irq_gen int_gen; - - int_gen.s._destination_apic_id = phys_apicid; - int_gen.s._vector = 0; - int_gen.s._msgtype = APIC_DM_INIT >> 8; - int_gen.s._index = 0; - - write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); - - int_gen.s._msgtype = APIC_DM_STARTUP >> 8; - int_gen.s._vector = start_rip >> 12; + write_lcsr(CSR_G3_EXT_IRQ_GEN, (apicid << 16) | val); +} - write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); +static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) +{ + numachip_apic_icr_write(phys_apicid, APIC_DM_INIT); + numachip_apic_icr_write(phys_apicid, APIC_DM_STARTUP | + (start_rip >> 12)); return 0; } static void numachip_send_IPI_one(int cpu, int vector) { - union numachip_csr_g3_ext_irq_gen int_gen; int apicid = per_cpu(x86_cpu_to_apicid, cpu); + unsigned int dmode; - int_gen.s._destination_apic_id = apicid; - int_gen.s._vector = vector; - int_gen.s._msgtype = (vector == NMI_VECTOR ? APIC_DM_NMI : APIC_DM_FIXED) >> 8; - int_gen.s._index = 0; - - write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); + dmode = (vector == NMI_VECTOR) ? 
APIC_DM_NMI : APIC_DM_FIXED; + numachip_apic_icr_write(apicid, dmode | vector); } static void numachip_send_IPI_mask(const struct cpumask *mask, int vector) @@ -149,9 +125,9 @@ static void numachip_send_IPI_self(int vector) apic_write(APIC_SELF_IPI, vector); } -static int __init numachip_probe(void) +static int __init numachip1_probe(void) { - return apic == &apic_numachip; + return apic == &apic_numachip1; } static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) @@ -172,34 +148,38 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) static int __init numachip_system_init(void) { - if (!numachip_system) + /* Map the LCSR area and set up the apic_icr_write function */ + switch (numachip_system) { + case 1: + init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE); + numachip_apic_icr_write = numachip1_apic_icr_write; + x86_init.pci.arch_init = pci_numachip_init; + break; + default: return 0; - - init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE); - init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE); + } x86_cpuinit.fixup_cpu_id = fixup_cpu_id; - x86_init.pci.arch_init = pci_numachip_init; return 0; } early_initcall(numachip_system_init); -static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int numachip1_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - if (!strncmp(oem_id, "NUMASC", 6)) { - numachip_system = 1; - return 1; - } + if ((strncmp(oem_id, "NUMASC", 6) != 0) || + (strncmp(oem_table_id, "NCONNECT", 8) != 0)) + return 0; - return 0; -} + numachip_system = 1; -static const struct apic apic_numachip __refconst = { + return 1; +} +static const struct apic apic_numachip1 __refconst = { .name = "NumaConnect system", - .probe = numachip_probe, - .acpi_madt_oem_check = numachip_acpi_madt_oem_check, + .probe = numachip1_probe, + .acpi_madt_oem_check = numachip1_acpi_madt_oem_check, .apic_id_valid = numachip_apic_id_valid, .apic_id_registered = numachip_apic_id_registered, @@ -221,8 +201,8 @@ static const struct apic apic_numachip __refconst = { .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = numachip_phys_pkg_id, - .get_apic_id = get_apic_id, - .set_apic_id = set_apic_id, + .get_apic_id = numachip1_get_apic_id, + .set_apic_id = numachip1_set_apic_id, .apic_id_mask = 0xffU << 24, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, @@ -244,5 +224,5 @@ static const struct apic apic_numachip __refconst = { .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, }; -apic_driver(apic_numachip); +apic_driver(apic_numachip1); -- cgit v1.2.3 From d9d4dee6cedfa17e5eedcba242dca3091bf73bc3 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Mon, 21 Sep 2015 01:02:00 +0800 Subject: x86/numachip: Add Numachip2 APIC support Introduce support for Numachip2 remote interrupts via detecting the right ACPI SRAT signature. Access is performed via a fixed mapping in the x86 physical address space. 
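As an aside, the fixed-mapping scheme is easy to model outside the kernel; here is a minimal stand-alone sketch (the constants match the patch below, while the helper name and the printing harness are invented for illustration):

  #include <stdio.h>

  #define NUMACHIP2_LCSR_BASE 0xf0000000UL  /* fixed 16MB window below 4G */
  #define NUMACHIP2_LCSR_SIZE 0x1000000UL
  #define NUMACHIP2_APIC_ICR  0x100000

  /* Physical address of a CSR: mask the offset into the window */
  static unsigned long numachip2_csr_phys(unsigned long offset)
  {
          return NUMACHIP2_LCSR_BASE | (offset & (NUMACHIP2_LCSR_SIZE - 1));
  }

  int main(void)
  {
          printf("APIC ICR CSR at 0x%lx\n", numachip2_csr_phys(NUMACHIP2_APIC_ICR));
          return 0;
  }

The kernel variant returns __va() of this physical address, since the window is mapped uncached once at init time.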
Signed-off-by: Daniel J Blueman Acked-by: Steffen Persvold Cc: Daniel Lezcano Link: http://lkml.kernel.org/r/1442768522-19217-2-git-send-email-daniel@numascale.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/numachip/numachip.h | 1 + arch/x86/include/asm/numachip/numachip_csr.h | 35 +++++++++++ arch/x86/kernel/apic/apic_numachip.c | 93 ++++++++++++++++++++++++++++ 3 files changed, 129 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/numachip/numachip.h b/arch/x86/include/asm/numachip/numachip.h index 1c6f7f6212c1..c64373a2d731 100644 --- a/arch/x86/include/asm/numachip/numachip.h +++ b/arch/x86/include/asm/numachip/numachip.h @@ -14,6 +14,7 @@ #ifndef _ASM_X86_NUMACHIP_NUMACHIP_H #define _ASM_X86_NUMACHIP_NUMACHIP_H +extern u8 numachip_system; extern int __init pci_numachip_init(void); #endif /* _ASM_X86_NUMACHIP_NUMACHIP_H */ diff --git a/arch/x86/include/asm/numachip/numachip_csr.h b/arch/x86/include/asm/numachip/numachip_csr.h index 7469b13a9cfa..e08b803f8491 100644 --- a/arch/x86/include/asm/numachip/numachip_csr.h +++ b/arch/x86/include/asm/numachip/numachip_csr.h @@ -14,6 +14,7 @@ #ifndef _ASM_X86_NUMACHIP_NUMACHIP_CSR_H #define _ASM_X86_NUMACHIP_NUMACHIP_CSR_H +#include #include #define CSR_NODE_SHIFT 16 @@ -50,4 +51,38 @@ static inline void write_lcsr(unsigned long offset, unsigned int val) writel(swab32(val), lcsr_address(offset)); } +/* + * On NumaChip2, local CSR space is 16MB and starts at fixed offset below 4G + */ + +#define NUMACHIP2_LCSR_BASE 0xf0000000UL +#define NUMACHIP2_LCSR_SIZE 0x1000000UL +#define NUMACHIP2_APIC_ICR 0x100000 + +static inline void __iomem *numachip2_lcsr_address(unsigned long offset) +{ + return (void __iomem *)__va(NUMACHIP2_LCSR_BASE | + (offset & (NUMACHIP2_LCSR_SIZE - 1))); +} + +static inline u32 numachip2_read32_lcsr(unsigned long offset) +{ + return readl(numachip2_lcsr_address(offset)); +} + +static inline u64 numachip2_read64_lcsr(unsigned long offset) +{ + return readq(numachip2_lcsr_address(offset)); +} + +static inline void numachip2_write32_lcsr(unsigned long offset, u32 val) +{ + writel(val, numachip2_lcsr_address(offset)); +} + +static inline void numachip2_write64_lcsr(unsigned long offset, u64 val) +{ + writeq(val, numachip2_lcsr_address(offset)); +} + #endif /* _ASM_X86_NUMACHIP_NUMACHIP_CSR_H */ diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index eeefbb11ec79..3cb929481d46 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -22,6 +22,7 @@ u8 numachip_system __read_mostly; static const struct apic apic_numachip1; +static const struct apic apic_numachip2; static void (*numachip_apic_icr_write)(int apicid, unsigned int val) __read_mostly; static unsigned int numachip1_get_apic_id(unsigned long x) @@ -45,6 +46,19 @@ static unsigned long numachip1_set_apic_id(unsigned int id) return x; } +static unsigned int numachip2_get_apic_id(unsigned long x) +{ + u64 mcfg; + + rdmsrl(MSR_FAM10H_MMIO_CONF_BASE, mcfg); + return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24); +} + +static unsigned long numachip2_set_apic_id(unsigned int id) +{ + return id << 24; +} + static int numachip_apic_id_valid(int apicid) { /* Trust what bootloader passes in MADT */ @@ -66,6 +80,11 @@ static void numachip1_apic_icr_write(int apicid, unsigned int val) write_lcsr(CSR_G3_EXT_IRQ_GEN, (apicid << 16) | val); } +static void numachip2_apic_icr_write(int apicid, unsigned int val) +{ + numachip2_write32_lcsr(NUMACHIP2_APIC_ICR, (apicid << 12) | val); +} + 
static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) { numachip_apic_icr_write(phys_apicid, APIC_DM_INIT); @@ -130,6 +149,11 @@ static int __init numachip1_probe(void) return apic == &apic_numachip1; } +static int __init numachip2_probe(void) +{ + return apic == &apic_numachip2; +} + static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) { u64 val; @@ -155,6 +179,13 @@ static int __init numachip_system_init(void) numachip_apic_icr_write = numachip1_apic_icr_write; x86_init.pci.arch_init = pci_numachip_init; break; + case 2: + init_extra_mapping_uc(NUMACHIP2_LCSR_BASE, NUMACHIP2_LCSR_SIZE); + numachip_apic_icr_write = numachip2_apic_icr_write; + + /* Use MCFG config cycles rather than locked CF8 cycles */ + raw_pci_ops = &pci_mmcfg; + break; default: return 0; } @@ -176,6 +207,17 @@ static int numachip1_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 1; } +static int numachip2_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + if ((strncmp(oem_id, "NUMASC", 6) != 0) || + (strncmp(oem_table_id, "NCONECT2", 8) != 0)) + return 0; + + numachip_system = 2; + + return 1; +} + static const struct apic apic_numachip1 __refconst = { .name = "NumaConnect system", .probe = numachip1_probe, @@ -226,3 +268,54 @@ }; apic_driver(apic_numachip1); + +static const struct apic apic_numachip2 __refconst = { + .name = "NumaConnect2 system", + .probe = numachip2_probe, + .acpi_madt_oem_check = numachip2_acpi_madt_oem_check, + .apic_id_valid = numachip_apic_id_valid, + .apic_id_registered = numachip_apic_id_registered, + + .irq_delivery_mode = dest_Fixed, + .irq_dest_mode = 0, /* physical */ + + .target_cpus = online_target_cpus, + .disable_esr = 0, + .dest_logical = 0, + .check_apicid_used = NULL, + + .vector_allocation_domain = default_vector_allocation_domain, + .init_apic_ldr = flat_init_apic_ldr, + + .ioapic_phys_id_map = NULL, + .setup_apic_routing = NULL, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .phys_pkg_id = numachip_phys_pkg_id, + + .get_apic_id = numachip2_get_apic_id, + .set_apic_id = numachip2_set_apic_id, + .apic_id_mask = 0xffU << 24, + + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + + .send_IPI_mask = numachip_send_IPI_mask, + .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, + .send_IPI_allbutself = numachip_send_IPI_allbutself, + .send_IPI_all = numachip_send_IPI_all, + .send_IPI_self = numachip_send_IPI_self, + + .wakeup_secondary_cpu = numachip_wakeup_secondary, + .inquire_remote_apic = NULL, /* REMRD not supported */ + + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, + .icr_read = native_apic_icr_read, + .icr_write = native_apic_icr_write, + .wait_icr_idle = native_apic_wait_icr_idle, + .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, +}; + +apic_driver(apic_numachip2); -- cgit v1.2.3 From ad03a9c25d258641556c7198e26fd882c741987a Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Mon, 21 Sep 2015 01:02:01 +0800 Subject: x86/numachip: Add Numachip IPI optimisations When sending IPIs, first check if the non-local part of the source and destination APIC IDs match; if so, send via the local APIC for efficiency. Secondly, since the AMD BIOS-kernel developer guide states IPI delivery will occur irrespective of prior delivery status, avoid polling the delivery status bit for efficiency.
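The node-match test used for this is a single XOR and shift; a stand-alone model (NUMACHIP_LAPIC_BITS is taken from the patch, the sample APIC IDs are invented):

  #include <stdbool.h>
  #include <stdio.h>

  #define NUMACHIP_LAPIC_BITS 8

  /* True when both APIC IDs carry the same non-local (node) part */
  static bool same_node(unsigned int apicid, unsigned int local_apicid)
  {
          /* XOR zeroes matching bits; the shift discards the local APIC bits */
          return !((apicid ^ local_apicid) >> NUMACHIP_LAPIC_BITS);
  }

  int main(void)
  {
          printf("%d\n", same_node(0x10f, 0x102));  /* both node 1 -> 1 */
          printf("%d\n", same_node(0x10f, 0x20f));  /* node 1 vs node 2 -> 0 */
          return 0;
  }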
Signed-off-by: Daniel J Blueman Acked-by: Steffen Persvold Cc: Daniel Lezcano Link: http://lkml.kernel.org/r/1442768522-19217-3-git-send-email-daniel@numascale.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/numachip/numachip_csr.h | 1 + arch/x86/kernel/apic/apic_numachip.c | 37 ++++++++++++++++++++++++---- 2 files changed, 33 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/numachip/numachip_csr.h b/arch/x86/include/asm/numachip/numachip_csr.h index e08b803f8491..e09d845ce406 100644 --- a/arch/x86/include/asm/numachip/numachip_csr.h +++ b/arch/x86/include/asm/numachip/numachip_csr.h @@ -34,6 +34,7 @@ #define NUMACHIP_LCSR_BASE 0x3ffffe000000ULL #define NUMACHIP_LCSR_LIM 0x3fffffffffffULL #define NUMACHIP_LCSR_SIZE (NUMACHIP_LCSR_LIM - NUMACHIP_LCSR_BASE + 1) +#define NUMACHIP_LAPIC_BITS 8 static inline void *lcsr_address(unsigned long offset) { diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 3cb929481d46..38dd5efdd04c 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -96,9 +96,25 @@ static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) static void numachip_send_IPI_one(int cpu, int vector) { - int apicid = per_cpu(x86_cpu_to_apicid, cpu); + int local_apicid, apicid = per_cpu(x86_cpu_to_apicid, cpu); unsigned int dmode; + preempt_disable(); + local_apicid = __this_cpu_read(x86_cpu_to_apicid); + + /* Send via local APIC where non-local part matches */ + if (!((apicid ^ local_apicid) >> NUMACHIP_LAPIC_BITS)) { + unsigned long flags; + + local_irq_save(flags); + __default_send_IPI_dest_field(apicid, vector, + APIC_DEST_PHYSICAL); + local_irq_restore(flags); + preempt_enable(); + return; + } + preempt_enable(); + dmode = (vector == NMI_VECTOR) ? APIC_DM_NMI : APIC_DM_FIXED; numachip_apic_icr_write(apicid, dmode | vector); } @@ -218,6 +234,17 @@ static int numachip2_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 1; } +/* APIC IPIs are queued */ +static void numachip_apic_wait_icr_idle(void) +{ +} + +/* APIC NMI IPIs are queued */ +static u32 numachip_safe_apic_wait_icr_idle(void) +{ + return 0; +} + static const struct apic apic_numachip1 __refconst = { .name = "NumaConnect system", .probe = numachip1_probe, @@ -263,8 +290,8 @@ static const struct apic apic_numachip1 __refconst = { .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + .wait_icr_idle = numachip_apic_wait_icr_idle, + .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle, }; apic_driver(apic_numachip1); @@ -314,8 +341,8 @@ static const struct apic apic_numachip2 __refconst = { .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + .wait_icr_idle = numachip_apic_wait_icr_idle, + .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle, }; apic_driver(apic_numachip2); -- cgit v1.2.3 From ce2e572cfe7b2fc3f0e9da4aa7bc61a2c2c51fc7 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Mon, 21 Sep 2015 18:02:25 +0800 Subject: x86/numachip: Introduce Numachip2 timer mechanisms Add 1GHz 64-bit Numachip2 clocksource timer support for accurate system-wide timekeeping, as core TSCs are unsynchronised. 
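With a 1GHz counter the cycles-to-nanoseconds conversion is the identity, which is why the clocksource added below can use mult = 1 and shift = 0; a quick stand-alone check of the generic conversion formula:

  #include <stdint.h>
  #include <stdio.h>

  /* The clocksource core converts as: ns = (cycles * mult) >> shift */
  static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
  {
          return (cycles * mult) >> shift;
  }

  int main(void)
  {
          /* At 1GHz one cycle is one nanosecond: 10^9 cycles -> 10^9 ns */
          printf("%llu\n", (unsigned long long)cyc2ns(1000000000ULL, 1, 0));
          return 0;
  }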
Additionally, add a per-core clockevent mechanism that interrupts via the platform IPI vector after a programmed period. [ tglx: Taking it through x86 due to dependencies ] Signed-off-by: Daniel J Blueman Acked-by: Steffen Persvold Cc: Daniel Lezcano Link: http://lkml.kernel.org/r/1442829745-29311-1-git-send-email-daniel@numascale.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/numachip/numachip_csr.h | 9 +++ drivers/clocksource/Makefile | 1 + drivers/clocksource/numachip.c | 95 ++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+) create mode 100644 drivers/clocksource/numachip.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/numachip/numachip_csr.h b/arch/x86/include/asm/numachip/numachip_csr.h index e09d845ce406..29719eecdc2e 100644 --- a/arch/x86/include/asm/numachip/numachip_csr.h +++ b/arch/x86/include/asm/numachip/numachip_csr.h @@ -59,6 +59,10 @@ static inline void write_lcsr(unsigned long offset, unsigned int val) #define NUMACHIP2_LCSR_BASE 0xf0000000UL #define NUMACHIP2_LCSR_SIZE 0x1000000UL #define NUMACHIP2_APIC_ICR 0x100000 +#define NUMACHIP2_TIMER_DEADLINE 0x200000 +#define NUMACHIP2_TIMER_INT 0x200008 +#define NUMACHIP2_TIMER_NOW 0x200018 +#define NUMACHIP2_TIMER_RESET 0x200020 static inline void __iomem *numachip2_lcsr_address(unsigned long offset) { @@ -86,4 +90,9 @@ static inline void numachip2_write64_lcsr(unsigned long offset, u64 val) writeq(val, numachip2_lcsr_address(offset)); } +static inline unsigned int numachip2_timer(void) +{ + return (smp_processor_id() % 48) << 6; +} + #endif /* _ASM_X86_NUMACHIP_NUMACHIP_CSR_H */ diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index 5c00863c3e33..57dfad312341 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -62,3 +62,4 @@ obj-$(CONFIG_H8300) += h8300_timer8.o obj-$(CONFIG_H8300_TMR16) += h8300_timer16.o obj-$(CONFIG_H8300_TPU) += h8300_tpu.o obj-$(CONFIG_CLKSRC_ST_LPC) += clksrc_st_lpc.o +obj-$(CONFIG_X86_NUMACHIP) += numachip.o diff --git a/drivers/clocksource/numachip.c b/drivers/clocksource/numachip.c new file mode 100644 index 000000000000..088e5fac8b3f --- /dev/null +++ b/drivers/clocksource/numachip.c @@ -0,0 +1,95 @@ +/* + * + * Copyright (C) 2015 Numascale AS. All rights reserved. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#include + +#include +#include +#include + +static DEFINE_PER_CPU(struct clock_event_device, cpu_ced); + +static cycles_t numachip2_timer_read(struct clocksource *cs) +{ + return numachip2_read64_lcsr(NUMACHIP2_TIMER_NOW); +} + +static struct clocksource numachip2_clocksource = { + .name = "numachip2", + .rating = 295, + .read = numachip2_timer_read, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .mult = 1, + .shift = 0, +}; + +static int numachip2_set_next_event(unsigned long delta, struct clock_event_device *ced) +{ + numachip2_write64_lcsr(NUMACHIP2_TIMER_DEADLINE + numachip2_timer(), + delta); + return 0; +} + +static struct clock_event_device numachip2_clockevent = { + .name = "numachip2", + .rating = 400, + .set_next_event = numachip2_set_next_event, + .features = CLOCK_EVT_FEAT_ONESHOT, + .mult = 1, + .shift = 0, + .min_delta_ns = 1250, + .max_delta_ns = LONG_MAX, +}; + +static void numachip_timer_interrupt(void) +{ + struct clock_event_device *ced = this_cpu_ptr(&cpu_ced); + + ced->event_handler(ced); +} + +static __init void numachip_timer_each(struct work_struct *work) +{ + unsigned local_apicid = __this_cpu_read(x86_cpu_to_apicid) & 0xff; + struct clock_event_device *ced = this_cpu_ptr(&cpu_ced); + + /* Setup IPI vector to local core and relative timing mode */ + numachip2_write64_lcsr(NUMACHIP2_TIMER_INT + numachip2_timer(), + (3 << 22) | (X86_PLATFORM_IPI_VECTOR << 14) | + (local_apicid << 6)); + + *ced = numachip2_clockevent; + ced->cpumask = cpumask_of(smp_processor_id()); + clockevents_register_device(ced); +} + +static int __init numachip_timer_init(void) +{ + if (numachip_system != 2) + return -ENODEV; + + /* Reset timer */ + numachip2_write64_lcsr(NUMACHIP2_TIMER_RESET, 0); + clocksource_register_hz(&numachip2_clocksource, NSEC_PER_SEC); + + /* Setup per-cpu clockevents */ + x86_platform_ipi_callback = numachip_timer_interrupt; + schedule_on_each_cpu(&numachip_timer_each); + + return 0; +} + +arch_initcall(numachip_timer_init); -- cgit v1.2.3 From 62e8a3258bda118f24ff462fe04cfbe75b8189b5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 18 Sep 2015 11:13:10 +0200 Subject: atomic, arch: Audit atomic_{read,set}() This patch makes sure that atomic_{read,set}() are at least {READ,WRITE}_ONCE(). We already had the 'requirement' that atomic_read() should use ACCESS_ONCE(), and most archs had this, but a few were lacking. All are now converted to use READ_ONCE(). And, by a symmetry and general paranoia argument, upgrade atomic_set() to use WRITE_ONCE(). Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Dmitry Vyukov Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: james.hogan@imgtec.com Cc: linux-kernel@vger.kernel.org Cc: oleg@redhat.com Cc: will.deacon@arm.com Signed-off-by: Ingo Molnar --- arch/alpha/include/asm/atomic.h | 8 ++++---- arch/arc/include/asm/atomic.h | 8 ++++---- arch/arm/include/asm/atomic.h | 4 ++-- arch/arm64/include/asm/atomic.h | 2 +- arch/avr32/include/asm/atomic.h | 4 ++-- arch/frv/include/asm/atomic.h | 4 ++-- arch/h8300/include/asm/atomic.h | 4 ++-- arch/hexagon/include/asm/atomic.h | 2 +- arch/ia64/include/asm/atomic.h | 8 ++++---- arch/m32r/include/asm/atomic.h | 4 ++-- arch/m68k/include/asm/atomic.h | 4 ++-- arch/metag/include/asm/atomic_lnkget.h | 2 +- arch/metag/include/asm/atomic_lock1.h | 2 +- arch/mips/include/asm/atomic.h | 8 ++++---- arch/mn10300/include/asm/atomic.h | 4 ++-- arch/parisc/include/asm/atomic.h | 2 +- arch/sh/include/asm/atomic.h | 4 ++-- arch/sparc/include/asm/atomic_64.h | 8 ++++---- arch/tile/include/asm/atomic.h | 2 +- arch/tile/include/asm/atomic_64.h | 6 +++--- arch/x86/include/asm/atomic.h | 4 ++-- arch/x86/include/asm/atomic64_64.h | 4 ++-- arch/xtensa/include/asm/atomic.h | 4 ++-- include/asm-generic/atomic.h | 4 ++-- 24 files changed, 53 insertions(+), 53 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h index e8c956098424..572b228c44c7 100644 --- a/arch/alpha/include/asm/atomic.h +++ b/arch/alpha/include/asm/atomic.h @@ -17,11 +17,11 @@ #define ATOMIC_INIT(i) { (i) } #define ATOMIC64_INIT(i) { (i) } -#define atomic_read(v) ACCESS_ONCE((v)->counter) -#define atomic64_read(v) ACCESS_ONCE((v)->counter) +#define atomic_read(v) READ_ONCE((v)->counter) +#define atomic64_read(v) READ_ONCE((v)->counter) -#define atomic_set(v,i) ((v)->counter = (i)) -#define atomic64_set(v,i) ((v)->counter = (i)) +#define atomic_set(v,i) WRITE_ONCE((v)->counter, (i)) +#define atomic64_set(v,i) WRITE_ONCE((v)->counter, (i)) /* * To get proper branch prediction for the main line, we must branch diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h index c3ecda023e3a..7730d302cadb 100644 --- a/arch/arc/include/asm/atomic.h +++ b/arch/arc/include/asm/atomic.h @@ -17,11 +17,11 @@ #include #include -#define atomic_read(v) ((v)->counter) +#define atomic_read(v) READ_ONCE((v)->counter) #ifdef CONFIG_ARC_HAS_LLSC -#define atomic_set(v, i) (((v)->counter) = (i)) +#define atomic_set(v, i) WRITE_ONCE(((v)->counter), (i)) #ifdef CONFIG_ARC_STAR_9000923308 @@ -107,7 +107,7 @@ static inline int atomic_##op##_return(int i, atomic_t *v) \ #ifndef CONFIG_SMP /* violating atomic_xxx API locking protocol in UP for optimization sake */ -#define atomic_set(v, i) (((v)->counter) = (i)) +#define atomic_set(v, i) WRITE_ONCE(((v)->counter), (i)) #else @@ -125,7 +125,7 @@ static inline void atomic_set(atomic_t *v, int i) unsigned long flags; atomic_ops_lock(flags); - v->counter = i; + WRITE_ONCE(v->counter, i); atomic_ops_unlock(flags); } diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h index fe3ef397f5a4..2bf80afb7841 100644 --- a/arch/arm/include/asm/atomic.h +++ b/arch/arm/include/asm/atomic.h @@ -27,8 +27,8 @@ * strex/ldrex monitor on some implementations. The reason we can use it for * atomic_set() is the clrex or dummy strex done on every exception return. 
*/ -#define atomic_read(v) ACCESS_ONCE((v)->counter) -#define atomic_set(v,i) (((v)->counter) = (i)) +#define atomic_read(v) READ_ONCE((v)->counter) +#define atomic_set(v,i) WRITE_ONCE(((v)->counter), (i)) #if __LINUX_ARM_ARCH__ >= 6 diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h index 35a67783cfa0..1e247ac2601a 100644 --- a/arch/arm64/include/asm/atomic.h +++ b/arch/arm64/include/asm/atomic.h @@ -54,7 +54,7 @@ #define ATOMIC_INIT(i) { (i) } #define atomic_read(v) READ_ONCE((v)->counter) -#define atomic_set(v, i) (((v)->counter) = (i)) +#define atomic_set(v, i) WRITE_ONCE(((v)->counter), (i)) #define atomic_xchg(v, new) xchg(&((v)->counter), (new)) #define atomic_cmpxchg(v, old, new) cmpxchg(&((v)->counter), (old), (new)) diff --git a/arch/avr32/include/asm/atomic.h b/arch/avr32/include/asm/atomic.h index 97c9bdf83409..d74fd8ce980a 100644 --- a/arch/avr32/include/asm/atomic.h +++ b/arch/avr32/include/asm/atomic.h @@ -19,8 +19,8 @@ #define ATOMIC_INIT(i) { (i) } -#define atomic_read(v) ACCESS_ONCE((v)->counter) -#define atomic_set(v, i) (((v)->counter) = i) +#define atomic_read(v) READ_ONCE((v)->counter) +#define atomic_set(v, i) WRITE_ONCE(((v)->counter), (i)) #define ATOMIC_OP_RETURN(op, asm_op, asm_con) \ static inline int __atomic_##op##_return(int i, atomic_t *v) \ diff --git a/arch/frv/include/asm/atomic.h b/arch/frv/include/asm/atomic.h index 0da689def4cc..64f02d451aa8 100644 --- a/arch/frv/include/asm/atomic.h +++ b/arch/frv/include/asm/atomic.h @@ -32,8 +32,8 @@ */ #define ATOMIC_INIT(i) { (i) } -#define atomic_read(v) ACCESS_ONCE((v)->counter) -#define atomic_set(v, i) (((v)->counter) = (i)) +#define atomic_read(v) READ_ONCE((v)->counter) +#define atomic_set(v, i) WRITE_ONCE(((v)->counter), (i)) static inline int atomic_inc_return(atomic_t *v) { diff --git a/arch/h8300/include/asm/atomic.h b/arch/h8300/include/asm/atomic.h index 702ee539f87d..4435a445ae7e 100644 --- a/arch/h8300/include/asm/atomic.h +++ b/arch/h8300/include/asm/atomic.h @@ -11,8 +11,8 @@ #define ATOMIC_INIT(i) { (i) } -#define atomic_read(v) ACCESS_ONCE((v)->counter) -#define atomic_set(v, i) (((v)->counter) = i) +#define atomic_read(v) READ_ONCE((v)->counter) +#define atomic_set(v, i) WRITE_ONCE(((v)->counter), (i)) #include diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h index 811d61f6422d..55696c4100d4 100644 --- a/arch/hexagon/include/asm/atomic.h +++ b/arch/hexagon/include/asm/atomic.h @@ -48,7 +48,7 @@ static inline void atomic_set(atomic_t *v, int new) * * Assumes all word reads on our architecture are atomic. 
*/ -#define atomic_read(v) ((v)->counter) +#define atomic_read(v) READ_ONCE((v)->counter) /** * atomic_xchg - atomic diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h index be4beeb77d57..8dfb5f6f6c35 100644 --- a/arch/ia64/include/asm/atomic.h +++ b/arch/ia64/include/asm/atomic.h @@ -21,11 +21,11 @@ #define ATOMIC_INIT(i) { (i) } #define ATOMIC64_INIT(i) { (i) } -#define atomic_read(v) ACCESS_ONCE((v)->counter) -#define atomic64_read(v) ACCESS_ONCE((v)->counter) +#define atomic_read(v) READ_ONCE((v)->counter) +#define atomic64_read(v) READ_ONCE((v)->counter) -#define atomic_set(v,i) (((v)->counter) = (i)) -#define atomic64_set(v,i) (((v)->counter) = (i)) +#define atomic_set(v,i) WRITE_ONCE(((v)->counter), (i)) +#define atomic64_set(v,i) WRITE_ONCE(((v)->counter), (i)) #define ATOMIC_OP(op, c_op) \ static __inline__ int \ diff --git a/arch/m32r/include/asm/atomic.h b/arch/m32r/include/asm/atomic.h index 025e2a170493..ea35160d632b 100644 --- a/arch/m32r/include/asm/atomic.h +++ b/arch/m32r/include/asm/atomic.h @@ -28,7 +28,7 @@ * * Atomically reads the value of @v. */ -#define atomic_read(v) ACCESS_ONCE((v)->counter) +#define atomic_read(v) READ_ONCE((v)->counter) /** * atomic_set - set atomic variable @@ -37,7 +37,7 @@ * * Atomically sets the value of @v to @i. */ -#define atomic_set(v,i) (((v)->counter) = (i)) +#define atomic_set(v,i) WRITE_ONCE(((v)->counter), (i)) #ifdef CONFIG_CHIP_M32700_TS1 #define __ATOMIC_CLOBBER , "r4" diff --git a/arch/m68k/include/asm/atomic.h b/arch/m68k/include/asm/atomic.h index 039fac120cc0..4858178260f9 100644 --- a/arch/m68k/include/asm/atomic.h +++ b/arch/m68k/include/asm/atomic.h @@ -17,8 +17,8 @@ #define ATOMIC_INIT(i) { (i) } -#define atomic_read(v) ACCESS_ONCE((v)->counter) -#define atomic_set(v, i) (((v)->counter) = i) +#define atomic_read(v) READ_ONCE((v)->counter) +#define atomic_set(v, i) WRITE_ONCE(((v)->counter), (i)) /* * The ColdFire parts cannot do some immediate to memory operations, diff --git a/arch/metag/include/asm/atomic_lnkget.h b/arch/metag/include/asm/atomic_lnkget.h index 21c4c268b86c..a62581815624 100644 --- a/arch/metag/include/asm/atomic_lnkget.h +++ b/arch/metag/include/asm/atomic_lnkget.h @@ -3,7 +3,7 @@ #define ATOMIC_INIT(i) { (i) } -#define atomic_set(v, i) ((v)->counter = (i)) +#define atomic_set(v, i) WRITE_ONCE((v)->counter, (i)) #include diff --git a/arch/metag/include/asm/atomic_lock1.h b/arch/metag/include/asm/atomic_lock1.h index f8efe380fe8b..0295d9b8d5bf 100644 --- a/arch/metag/include/asm/atomic_lock1.h +++ b/arch/metag/include/asm/atomic_lock1.h @@ -10,7 +10,7 @@ static inline int atomic_read(const atomic_t *v) { - return (v)->counter; + return READ_ONCE((v)->counter); } /* diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h index 4c42fd9af777..f82d3af07931 100644 --- a/arch/mips/include/asm/atomic.h +++ b/arch/mips/include/asm/atomic.h @@ -30,7 +30,7 @@ * * Atomically reads the value of @v. */ -#define atomic_read(v) ACCESS_ONCE((v)->counter) +#define atomic_read(v) READ_ONCE((v)->counter) /* * atomic_set - set atomic variable @@ -39,7 +39,7 @@ * * Atomically sets the value of @v to @i. 
*/ -#define atomic_set(v, i) ((v)->counter = (i)) +#define atomic_set(v, i) WRITE_ONCE((v)->counter, (i)) #define ATOMIC_OP(op, c_op, asm_op) \ static __inline__ void atomic_##op(int i, atomic_t * v) \ @@ -315,14 +315,14 @@ static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u) * @v: pointer of type atomic64_t * */ -#define atomic64_read(v) ACCESS_ONCE((v)->counter) +#define atomic64_read(v) READ_ONCE((v)->counter) /* * atomic64_set - set atomic variable * @v: pointer of type atomic64_t * @i: required value */ -#define atomic64_set(v, i) ((v)->counter = (i)) +#define atomic64_set(v, i) WRITE_ONCE((v)->counter, (i)) #define ATOMIC64_OP(op, c_op, asm_op) \ static __inline__ void atomic64_##op(long i, atomic64_t * v) \ diff --git a/arch/mn10300/include/asm/atomic.h b/arch/mn10300/include/asm/atomic.h index 375e59140c9c..ce318d5ab23b 100644 --- a/arch/mn10300/include/asm/atomic.h +++ b/arch/mn10300/include/asm/atomic.h @@ -34,7 +34,7 @@ * * Atomically reads the value of @v. Note that the guaranteed */ -#define atomic_read(v) (ACCESS_ONCE((v)->counter)) +#define atomic_read(v) READ_ONCE((v)->counter) /** * atomic_set - set atomic variable @@ -43,7 +43,7 @@ * * Atomically sets the value of @v to @i. Note that the guaranteed */ -#define atomic_set(v, i) (((v)->counter) = (i)) +#define atomic_set(v, i) WRITE_ONCE(((v)->counter), (i)) #define ATOMIC_OP(op) \ static inline void atomic_##op(int i, atomic_t *v) \ diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h index 2536965d00ea..1d109990a022 100644 --- a/arch/parisc/include/asm/atomic.h +++ b/arch/parisc/include/asm/atomic.h @@ -67,7 +67,7 @@ static __inline__ void atomic_set(atomic_t *v, int i) static __inline__ int atomic_read(const atomic_t *v) { - return ACCESS_ONCE((v)->counter); + return READ_ONCE((v)->counter); } /* exported interface */ diff --git a/arch/sh/include/asm/atomic.h b/arch/sh/include/asm/atomic.h index 05b9f74ce2d5..c399e1c55685 100644 --- a/arch/sh/include/asm/atomic.h +++ b/arch/sh/include/asm/atomic.h @@ -14,8 +14,8 @@ #define ATOMIC_INIT(i) { (i) } -#define atomic_read(v) ACCESS_ONCE((v)->counter) -#define atomic_set(v,i) ((v)->counter = (i)) +#define atomic_read(v) READ_ONCE((v)->counter) +#define atomic_set(v,i) WRITE_ONCE((v)->counter, (i)) #if defined(CONFIG_GUSA_RB) #include diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h index 917084ace49d..f2fbf9e16faf 100644 --- a/arch/sparc/include/asm/atomic_64.h +++ b/arch/sparc/include/asm/atomic_64.h @@ -14,11 +14,11 @@ #define ATOMIC_INIT(i) { (i) } #define ATOMIC64_INIT(i) { (i) } -#define atomic_read(v) ACCESS_ONCE((v)->counter) -#define atomic64_read(v) ACCESS_ONCE((v)->counter) +#define atomic_read(v) READ_ONCE((v)->counter) +#define atomic64_read(v) READ_ONCE((v)->counter) -#define atomic_set(v, i) (((v)->counter) = i) -#define atomic64_set(v, i) (((v)->counter) = i) +#define atomic_set(v, i) WRITE_ONCE(((v)->counter), (i)) +#define atomic64_set(v, i) WRITE_ONCE(((v)->counter), (i)) #define ATOMIC_OP(op) \ void atomic_##op(int, atomic_t *); \ diff --git a/arch/tile/include/asm/atomic.h b/arch/tile/include/asm/atomic.h index 709798460763..9fc0107a9c5e 100644 --- a/arch/tile/include/asm/atomic.h +++ b/arch/tile/include/asm/atomic.h @@ -34,7 +34,7 @@ */ static inline int atomic_read(const atomic_t *v) { - return ACCESS_ONCE(v->counter); + return READ_ONCE(v->counter); } /** diff --git a/arch/tile/include/asm/atomic_64.h b/arch/tile/include/asm/atomic_64.h index 096a56d6ead4..51cabc26e387 100644 
--- a/arch/tile/include/asm/atomic_64.h +++ b/arch/tile/include/asm/atomic_64.h @@ -24,7 +24,7 @@ /* First, the 32-bit atomic ops that are "real" on our 64-bit platform. */ -#define atomic_set(v, i) ((v)->counter = (i)) +#define atomic_set(v, i) WRITE_ONCE((v)->counter, (i)) /* * The smp_mb() operations throughout are to support the fact that @@ -82,8 +82,8 @@ static inline void atomic_xor(int i, atomic_t *v) #define ATOMIC64_INIT(i) { (i) } -#define atomic64_read(v) ((v)->counter) -#define atomic64_set(v, i) ((v)->counter = (i)) +#define atomic64_read(v) READ_ONCE((v)->counter) +#define atomic64_set(v, i) WRITE_ONCE((v)->counter, (i)) static inline void atomic64_add(long i, atomic64_t *v) { diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index fb52aa644aab..ae5fb83e6d91 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -24,7 +24,7 @@ */ static __always_inline int atomic_read(const atomic_t *v) { - return ACCESS_ONCE((v)->counter); + return READ_ONCE((v)->counter); } /** @@ -36,7 +36,7 @@ static __always_inline int atomic_read(const atomic_t *v) */ static __always_inline void atomic_set(atomic_t *v, int i) { - v->counter = i; + WRITE_ONCE(v->counter, i); } /** diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 50e33eff58de..037351022f54 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -18,7 +18,7 @@ */ static inline long atomic64_read(const atomic64_t *v) { - return ACCESS_ONCE((v)->counter); + return READ_ONCE((v)->counter); } /** @@ -30,7 +30,7 @@ static inline long atomic64_read(const atomic64_t *v) */ static inline void atomic64_set(atomic64_t *v, long i) { - v->counter = i; + WRITE_ONCE(v->counter, i); } /** diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h index 93795d047303..fd8017ce298a 100644 --- a/arch/xtensa/include/asm/atomic.h +++ b/arch/xtensa/include/asm/atomic.h @@ -47,7 +47,7 @@ * * Atomically reads the value of @v. */ -#define atomic_read(v) ACCESS_ONCE((v)->counter) +#define atomic_read(v) READ_ONCE((v)->counter) /** * atomic_set - set atomic variable @@ -56,7 +56,7 @@ * * Atomically sets the value of @v to @i. */ -#define atomic_set(v,i) ((v)->counter = (i)) +#define atomic_set(v,i) WRITE_ONCE((v)->counter, (i)) #if XCHAL_HAVE_S32C1I #define ATOMIC_OP(op) \ diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h index d4d7e337fdcb..74f1a3704d7a 100644 --- a/include/asm-generic/atomic.h +++ b/include/asm-generic/atomic.h @@ -127,7 +127,7 @@ ATOMIC_OP(xor, ^) * Atomically reads the value of @v. */ #ifndef atomic_read -#define atomic_read(v) ACCESS_ONCE((v)->counter) +#define atomic_read(v) READ_ONCE((v)->counter) #endif /** @@ -137,7 +137,7 @@ ATOMIC_OP(xor, ^) * * Atomically sets the value of @v to @i. */ -#define atomic_set(v, i) (((v)->counter) = (i)) +#define atomic_set(v, i) WRITE_ONCE(((v)->counter), (i)) #include -- cgit v1.2.3 From a7adb91b13c104e5ad950fbe1795aa2722f2ea0a Mon Sep 17 00:00:00 2001 From: Kristen Carlson Accardi Date: Tue, 22 Sep 2015 10:51:36 -0700 Subject: x86/cpufeatures: Correct spelling of the HWP_NOTIFY flag Because noitification just isn't right. Signed-off-by: Kristen Carlson Accardi Acked-by: Rafael J. 
Wysocki Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: rjw@rjwysocki.net Link: http://lkml.kernel.org/r/1442944296-11737-1-git-send-email-kristen@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 2 +- arch/x86/kernel/cpu/scattered.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index e6cf2ad350d1..9727b3b48bd1 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -193,7 +193,7 @@ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_HWP ( 7*32+ 10) /* "hwp" Intel HWP */ -#define X86_FEATURE_HWP_NOITFY ( 7*32+ 11) /* Intel HWP_NOTIFY */ +#define X86_FEATURE_HWP_NOTIFY ( 7*32+ 11) /* Intel HWP_NOTIFY */ #define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */ #define X86_FEATURE_HWP_EPP ( 7*32+13) /* Intel HWP_EPP */ #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 3d423a101fae..608fb26c7254 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -37,7 +37,7 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 }, { X86_FEATURE_HWP, CR_EAX, 7, 0x00000006, 0 }, - { X86_FEATURE_HWP_NOITFY, CR_EAX, 8, 0x00000006, 0 }, + { X86_FEATURE_HWP_NOTIFY, CR_EAX, 8, 0x00000006, 0 }, { X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 }, { X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 }, { X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 }, -- cgit v1.2.3 From 158ecc39185b885420e5136b803b29be2bbec7fb Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 23 Sep 2015 12:49:01 +0200 Subject: x86/fpu: Fixup uninitialized feature_name warning Hand in &feature_name to cpu_has_xfeatures() as it is supposed to. Fixes an uninitialized warning. 
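The bug class is easy to reproduce in a few lines of stand-alone C (the helper below is a toy stand-in for cpu_has_xfeatures(), which likewise only fills the name slot it is handed):

  #include <stdbool.h>
  #include <stdio.h>

  static bool has_features(unsigned int mask, const char **name)
  {
          if (name)
                  *name = "ymm";  /* only written when a slot is passed in */
          return false;           /* pretend the feature is absent */
  }

  int main(void)
  {
          const char *feature_name;  /* starts out uninitialized */

          /* Passing NULL (as the old code did) would leave feature_name
           * garbage when the message below prints; passing the slot fixes it. */
          if (!has_features(0x6, &feature_name))
                  printf("CPU feature '%s' is not supported.\n", feature_name);
          return 0;
  }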
Signed-off-by: Borislav Petkov Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: brgerst@gmail.com Cc: dvlasenk@redhat.com Cc: fenghua.yu@intel.com Cc: luto@amacapital.net Cc: tim.c.chen@linux.intel.com Fixes: d91cab78133d ("x86/fpu: Rename XSAVE macros") Link: http://lkml.kernel.org/r/20150923104901.GA3538@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/crypto/twofish_avx_glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 6f3738ced95e..b7a3904b953c 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -558,7 +558,7 @@ static int __init twofish_init(void) { const char *feature_name; - if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } -- cgit v1.2.3 From 3f2c5085ed99b6ad233cf77009c2f4f898b2f7c8 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 1 Sep 2015 15:41:06 -0700 Subject: x86/sched/64: Don't save flags on context switch (reinstated) This reinstates the following commit: 2c7577a75837 ("sched/x86_64: Don't save flags on context switch") which was reverted in: 512255a2ad2c ("Revert 'sched/x86_64: Don't save flags on context switch'") Historically, Linux has always saved and restored EFLAGS across context switches. As far as I know, the only reason to do this is because of the NT flag. In particular, if something calls switch_to() with the NT flag set, then we don't want to leak the NT flag into a different task that might try to IRET and fail because NT is set. Before this commit: 8c7aa698baca ("x86_64, entry: Filter RFLAGS.NT on entry from userspace") we could run system call bodies with NT set. This would be a DoS or possibly privilege escalation hole if scheduling in such a system call would leak NT into a different task. Importantly, we don't need to worry about NT being set while preemptible or across page faults. The only way we can schedule due to preemption or a page fault is in an interrupt entry that nests inside the SYSENTER prologue. The CPU will clear NT when entering through an interrupt gate, so we won't schedule with NT set. The only other interesting flags are IOPL and AC. Allowing switch_to() to change IOPL has no effect, as the value loaded during kernel execution doesn't matter at all except between a SYSENTER entry and the subsequent PUSHF, and anything that interrupts in that window will restore IOPL on return. If we call __switch_to() with AC set, we have bigger problems. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/d4440fdc2a89247bffb7c003d2a9a2952bd46827.1441146105.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/switch_to.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index d7f3b3b78ac3..751bf4b7bf11 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -79,12 +79,12 @@ do { \ #else /* CONFIG_X86_32 */ /* frame pointer must be last for get_wchan */ -#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" +#define SAVE_CONTEXT "pushq %%rbp ; movq %%rsi,%%rbp\n\t" +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\t" #define __EXTRA_CLOBBER \ , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ - "r12", "r13", "r14", "r15" + "r12", "r13", "r14", "r15", "flags" #ifdef CONFIG_CC_STACKPROTECTOR #define __switch_canary \ @@ -100,7 +100,11 @@ do { \ #define __switch_canary_iparam #endif /* CC_STACKPROTECTOR */ -/* Save restore flags to clear handle leaking NT */ +/* + * There is no need to save or restore flags, because flags are always + * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL + * has no effect. + */ #define switch_to(prev, next, last) \ asm volatile(SAVE_CONTEXT \ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ -- cgit v1.2.3 From 7e5560a5648ab2bce7199c73b9c2a51b846f5541 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 24 Sep 2015 04:48:53 -0700 Subject: perf/x86: Change test_aperfmperf() and test_intel() to static Fixes the following sparse warnings: arch/x86/kernel/cpu/perf_event_msr.c:13:6: warning: symbol 'test_aperfmperf' was not declared. Should it be static? arch/x86/kernel/cpu/perf_event_msr.c:18:6: warning: symbol 'test_intel' was not declared. Should it be static? Signed-off-by: Geliang Tang Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/4588e8ab09638458f2451af572827108be3b4a36.1443123796.git.geliangtang@163.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_msr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c index 086b12eae794..f32ac13934f2 100644 --- a/arch/x86/kernel/cpu/perf_event_msr.c +++ b/arch/x86/kernel/cpu/perf_event_msr.c @@ -10,12 +10,12 @@ enum perf_msr_id { PERF_MSR_EVENT_MAX, }; -bool test_aperfmperf(int idx) +static bool test_aperfmperf(int idx) { return boot_cpu_has(X86_FEATURE_APERFMPERF); } -bool test_intel(int idx) +static bool test_intel(int idx) { if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || boot_cpu_data.x86 != 6) -- cgit v1.2.3 From 0b101e62afe626ecae60173f92f1e0ec72151653 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 24 Sep 2015 14:02:29 +0200 Subject: x86/asm: Force inlining of cpu_relax() On x86, cpu_relax() simply calls rep_nop(), which generates one instruction, PAUSE (aka REP NOP). 
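As background, "pause" and "rep; nop" are the same instruction (both assemble to the byte pair f3 90), so the two spellings are interchangeable; a stand-alone illustration:

  /* Both helpers emit f3 90; the "memory" clobber matches the kernel's use */
  static inline void relax_pause(void)
  {
          __asm__ __volatile__("pause" ::: "memory");
  }

  static inline void relax_repnop(void)
  {
          __asm__ __volatile__("rep; nop" ::: "memory");
  }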
With this config: http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os gcc-4.7.2 does not always inline rep_nop(): it generates several copies of this: (16 copies, 194 calls): 55 push %rbp 48 89 e5 mov %rsp,%rbp f3 90 pause 5d pop %rbp c3 retq See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122 This patch fixes this via s/inline/__always_inline/ on rep_nop() and cpu_relax(). ( Forcing inlining only on rep_nop() causes GCC to deinline cpu_relax(), with almost no change in generated code). text data bss dec hex filename 88118971 19905208 36421632 144445811 89c1173 vmlinux.before 88118139 19905208 36421632 144444979 89c0e33 vmlinux Signed-off-by: Denys Vlasenko Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1443096149-27291-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 19577dd325fa..b55f30960554 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -556,12 +556,12 @@ static inline unsigned int cpuid_edx(unsigned int op) } /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ -static inline void rep_nop(void) +static __always_inline void rep_nop(void) { asm volatile("rep; nop" ::: "memory"); } -static inline void cpu_relax(void) +static __always_inline void cpu_relax(void) { rep_nop(); } -- cgit v1.2.3 From 18ab2cd3ee9d52dc64c5ae984146a261a328c4e8 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sun, 27 Sep 2015 23:25:50 +0800 Subject: perf/core, perf/x86: Change needlessly global functions and a variable to static Fixes various sparse warnings. Signed-off-by: Geliang Tang Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/70c14234da1bed6e3e67b9c419e2d5e376ab4f32.1443367286.git.geliangtang@163.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 8 ++++---- kernel/events/core.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index be4febc58b94..e38d338a6447 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -157,7 +157,7 @@ struct _cpuid4_info_regs { struct amd_northbridge *nb; }; -unsigned short num_cache_leaves; +static unsigned short num_cache_leaves; /* AMD doesn't have CPUID4. Emulate it here to report the same information to the user. This makes some assumptions about the machine: @@ -326,7 +326,7 @@ static void amd_calc_l3_indices(struct amd_northbridge *nb) * * @returns: the disabled index if used or negative value if slot free. 
*/ -int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) +static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) { unsigned int reg = 0; @@ -403,8 +403,8 @@ static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, * * @return: 0 on success, error status on failure */ -int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot, - unsigned long index) +static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, + unsigned slot, unsigned long index) { int ret = 0; diff --git a/kernel/events/core.c b/kernel/events/core.c index f87b434c3c1e..ea02109aee77 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -196,7 +196,7 @@ static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; static int perf_sample_allowed_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; -void update_perf_cpu_limits(void) +static void update_perf_cpu_limits(void) { u64 tmp = perf_sample_period_ns; @@ -472,7 +472,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, * mode SWOUT : schedule out everything * mode SWIN : schedule in based on cgroup for next */ -void perf_cgroup_switch(struct task_struct *task, int mode) +static void perf_cgroup_switch(struct task_struct *task, int mode) { struct perf_cpu_context *cpuctx; struct pmu *pmu; @@ -7390,7 +7390,7 @@ static int perf_pmu_nop_int(struct pmu *pmu) return 0; } -DEFINE_PER_CPU(unsigned int, nop_txn_flags); +static DEFINE_PER_CPU(unsigned int, nop_txn_flags); static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) { @@ -7750,7 +7750,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) return ret; } -struct pmu *perf_init_event(struct perf_event *event) +static struct pmu *perf_init_event(struct perf_event *event) { struct pmu *pmu = NULL; int idx; -- cgit v1.2.3 From 0d44975d1e2791f6df2b84b182f49d815ba3c9e0 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 26 Sep 2015 14:47:17 +0200 Subject: x86/kgdb: Replace bool_int_array[NR_CPUS] with bitmap Straightforward conversion from: int was_in_debug_nmi[NR_CPUS] to: DECLARE_BITMAP(was_in_debug_nmi, NR_CPUS) Saves about 2 kbytes in BSS for NR_CPUS=512. Signed-off-by: Denys Vlasenko Cc: H. Peter Anvin Cc: Jason Wessel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1443271638-2568-1-git-send-email-dvlasenk@redhat.com [ Tidied up the code a bit.
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/kgdb.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index d6178d9791db..44256a62702b 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -511,26 +511,31 @@ single_step_cont(struct pt_regs *regs, struct die_args *args) return NOTIFY_STOP; } -static int was_in_debug_nmi[NR_CPUS]; +static DECLARE_BITMAP(was_in_debug_nmi, NR_CPUS); static int kgdb_nmi_handler(unsigned int cmd, struct pt_regs *regs) { + int cpu; + switch (cmd) { case NMI_LOCAL: if (atomic_read(&kgdb_active) != -1) { /* KGDB CPU roundup */ - kgdb_nmicallback(raw_smp_processor_id(), regs); - was_in_debug_nmi[raw_smp_processor_id()] = 1; + cpu = raw_smp_processor_id(); + kgdb_nmicallback(cpu, regs); + set_bit(cpu, was_in_debug_nmi); touch_nmi_watchdog(); + return NMI_HANDLED; } break; case NMI_UNKNOWN: - if (was_in_debug_nmi[raw_smp_processor_id()]) { - was_in_debug_nmi[raw_smp_processor_id()] = 0; + cpu = raw_smp_processor_id(); + + if (__test_and_clear_bit(cpu, was_in_debug_nmi)) return NMI_HANDLED; - } + break; default: /* do nothing */ -- cgit v1.2.3 From 6e06780a98f149f131d46c1108d4ae27f05a9357 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Mon, 28 Sep 2015 09:21:43 +0200 Subject: x86/mce: Don't clear shared banks on Intel when offlining CPUs It is not safe to clear global MCi_CTL banks during CPU offline or suspend/resume operations. These MSRs are either thread-scoped (meaning private to a thread), or core-scoped (private to threads in that core only), or with a socket scope: visible and controllable from all threads in the socket. When we offline a single CPU, clearing those MCi_CTL bits will stop signaling for all the shared, i.e., socket-wide resources, such as LLC, iMC, etc. In addition, it might be possible to compromise the integrity of an Intel Software Guard eXtensions (SGX) system if the attacker has control of the host system and is able to inject errors which would be otherwise ignored when MCi_CTL bits are cleared. Hence on SGX enabled systems, if MCi_CTL is cleared, SGX gets disabled. Tested-by: Serge Ayoun Signed-off-by: Ashok Raj [ Cleanup text. ] Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-edac Link: http://lkml.kernel.org/r/1441391390-16985-1-git-send-email-ashok.raj@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 9d014b82a124..17b5ec6edcb6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2042,7 +2042,7 @@ int __init mcheck_init(void) * Disable machine checks on suspend and shutdown. We can't really handle * them later. */ -static int mce_disable_error_reporting(void) +static void mce_disable_error_reporting(void) { int i; @@ -2052,17 +2052,32 @@ static int mce_disable_error_reporting(void) if (b->init) wrmsrl(MSR_IA32_MCx_CTL(i), 0); } - return 0; + return; +} + +static void vendor_disable_error_reporting(void) +{ + /* + * Don't clear on Intel CPUs. Some of these MSRs are socket-wide.
+ * Disabling them for just a single offlined CPU is bad, since it will + * inhibit reporting for all shared resources on the socket like the + * last level cache (LLC), the integrated memory controller (iMC), etc. + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return; + + mce_disable_error_reporting(); } static int mce_syscore_suspend(void) { - return mce_disable_error_reporting(); + vendor_disable_error_reporting(); + return 0; } static void mce_syscore_shutdown(void) { - mce_disable_error_reporting(); + vendor_disable_error_reporting(); } /* @@ -2342,19 +2357,14 @@ static void mce_device_remove(unsigned int cpu) static void mce_disable_cpu(void *h) { unsigned long action = *(unsigned long *)h; - int i; if (!mce_available(raw_cpu_ptr(&cpu_info))) return; if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); - for (i = 0; i < mca_cfg.banks; i++) { - struct mce_bank *b = &mce_banks[i]; - if (b->init) - wrmsrl(MSR_IA32_MCx_CTL(i), 0); - } + vendor_disable_error_reporting(); } static void mce_reenable_cpu(void *h) -- cgit v1.2.3 From 9bac175d8ed0b1dd3d3611c0713666b724eeace3 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 18 Sep 2015 17:54:30 +0200 Subject: Revert "KVM: x86: zero kvmclock_offset when vcpu0 initializes kvmclock system MSR" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shifting pvclock_vcpu_time_info.system_time on write to KVM system time MSR is a change of ABI. Probably only 2.6.16 based SLES 10 breaks due to its custom enhancements to kvmclock, but KVM never declared the MSR only for one-shot initialization. (Doc says that only one write is needed.) This reverts commit b7e60c5aedd2b63f16ef06fde4f81ca032211bc5. And adds a note to the definition of PVCLOCK_COUNTS_FROM_ZERO. Signed-off-by: Radim Krčmář Acked-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/pvclock-abi.h | 1 + arch/x86/kvm/x86.c | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h index 655e07a48f6c..67f08230103a 100644 --- a/arch/x86/include/asm/pvclock-abi.h +++ b/arch/x86/include/asm/pvclock-abi.h @@ -41,6 +41,7 @@ struct pvclock_wall_clock { #define PVCLOCK_TSC_STABLE_BIT (1 << 0) #define PVCLOCK_GUEST_STOPPED (1 << 1) +/* PVCLOCK_COUNTS_FROM_ZERO broke ABI and can't be used anymore. */ #define PVCLOCK_COUNTS_FROM_ZERO (1 << 2) #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PVCLOCK_ABI_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 991466bf8dee..92511d4b7236 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1708,8 +1708,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->pvclock_set_guest_stopped_request = false; } - pvclock_flags |= PVCLOCK_COUNTS_FROM_ZERO; - /* If the host uses TSC clocksource, then it is stable */ if (use_master_clock) pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; @@ -2007,8 +2005,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) &vcpu->requests); ka->boot_vcpu_runs_old_kvmclock = tmp; - - ka->kvmclock_offset = -get_kernel_ns(); } vcpu->arch.time = data; -- cgit v1.2.3 From 24f775a6605a8ffc697c0767fc7ea85656ddb958 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 4 Sep 2015 14:50:33 +0200 Subject: xen: use correct type for HYPERVISOR_memory_op() HYPERVISOR_memory_op() is defined to return an "int" value. This is wrong, as the Xen hypervisor will return "long". 
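A stand-alone check of where the truncation starts (assuming 4 KiB pages and a 64-bit long):

  #include <limits.h>
  #include <stdio.h>

  int main(void)
  {
          long pages = (8UL << 40) / 4096;  /* pages in 8TB: exactly 2^31 */
          int truncated = (int)pages;       /* one past INT_MAX; result is
                                               implementation-defined */

          printf("long: %ld  int: %d  INT_MAX: %d\n", pages, truncated, INT_MAX);
          return 0;
  }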
The sub-function XENMEM_maximum_reservation returns the maximum number of pages for the current domain. An int will overflow for a domain configured with 8TB of memory or more. Correct this by using the correct type. Signed-off-by: Juergen Gross Signed-off-by: David Vrabel --- arch/x86/include/asm/xen/hypercall.h | 4 ++-- arch/x86/xen/setup.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 83aea8055119..4c20dd333412 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -336,10 +336,10 @@ HYPERVISOR_update_descriptor(u64 ma, u64 desc) return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32); } -static inline int +static inline long HYPERVISOR_memory_op(unsigned int cmd, void *arg) { - return _hypercall2(int, memory_op, cmd, arg); + return _hypercall2(long, memory_op, cmd, arg); } static inline int diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index f5ef6746d47a..4ebfcecc2a8b 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -548,7 +548,7 @@ static unsigned long __init xen_get_max_pages(void) { unsigned long max_pages, limit; domid_t domid = DOMID_SELF; - int ret; + long ret; limit = xen_get_pages_limit(); max_pages = limit; -- cgit v1.2.3 From 2ecf91b6d8b0ee8ef38aa7ea2a0fe0cd57b6ca50 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 21 Sep 2015 09:09:28 -0400 Subject: xen/x86: Don't try to write syscall-related MSRs for PV guests For PV guests these registers are set up by hypervisor and thus should not be written by the guest. The comment in xen_write_msr_safe() says so but we still write the MSRs, causing the hypervisor to print a warning. Signed-off-by: Boris Ostrovsky Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 30d12afe52ed..1f1dbd2949a4 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1077,6 +1077,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) /* Fast syscall setup is all done in hypercalls, so these are all ignored. Stub them out here to stop Xen console noise. */ + break; default: if (!pmu_msr_write(msr, low, high, &ret)) -- cgit v1.2.3 From 0b34a166f291d255755be46e43ed5497cdd194f2 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 25 Sep 2015 11:59:52 +0200 Subject: x86/xen: Support kexec/kdump in HVM guests by doing a soft reset Currently there is a number of issues preventing PVHVM Xen guests from doing successful kexec/kdump: - Bound event channels. - Registered vcpu_info. - PIRQ/emuirq mappings. - shared_info frame after XENMAPSPACE_shared_info operation. - Active grant mappings. Basically, newly booted kernel stumbles upon already set up Xen interfaces and there is no way to reestablish them. In Xen-4.7 a new feature called 'soft reset' is coming. A guest performing kexec/kdump operation is supposed to call SCHEDOP_shutdown hypercall with SHUTDOWN_soft_reset reason before jumping to new kernel. Hypervisor (with some help from toolstack) will do full domain cleanup (but keeping its memory and vCPU contexts intact) returning the guest to the state it had when it was first booted and thus allowing it to start over. 
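For reference, the reset request itself is just a SCHEDOP_shutdown hypercall with the new reason code; a minimal sketch of what xen_reboot(SHUTDOWN_soft_reset) in the patch below boils down to:

  struct sched_shutdown r = { .reason = SHUTDOWN_soft_reset };

  /* Ask the hypervisor to tear down and re-create the domain's
   * Xen-facing state while keeping memory and vCPU contexts intact */
  HYPERVISOR_sched_op(SCHEDOP_shutdown, &r);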
Doing SHUTDOWN_soft_reset on Xen hypervisors which don't support it is probably OK as by default all unknown shutdown reasons cause domain destroy with a message in the toolstack log: 'Unknown shutdown reason code 5. Destroying domain.' which gives a clue to what the problem is and eliminates false expectations. Signed-off-by: Vitaly Kuznetsov Cc: Signed-off-by: David Vrabel --- arch/x86/xen/enlighten.c | 23 +++++++++++++++++++++++ include/xen/interface/sched.h | 8 ++++++++ 2 files changed, 31 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 1f1dbd2949a4..993b7a71386d 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -33,6 +33,10 @@ #include #include +#ifdef CONFIG_KEXEC_CORE +#include +#endif + #include #include #include @@ -1808,6 +1812,21 @@ static struct notifier_block xen_hvm_cpu_notifier = { .notifier_call = xen_hvm_cpu_notify, }; +#ifdef CONFIG_KEXEC_CORE +static void xen_hvm_shutdown(void) +{ + native_machine_shutdown(); + if (kexec_in_progress) + xen_reboot(SHUTDOWN_soft_reset); +} + +static void xen_hvm_crash_shutdown(struct pt_regs *regs) +{ + native_machine_crash_shutdown(regs); + xen_reboot(SHUTDOWN_soft_reset); +} +#endif + static void __init xen_hvm_guest_init(void) { if (xen_pv_domain()) @@ -1827,6 +1846,10 @@ static void __init xen_hvm_guest_init(void) x86_init.irqs.intr_init = xen_init_IRQ; xen_hvm_init_time_ops(); xen_hvm_init_mmu_ops(); +#ifdef CONFIG_KEXEC_CORE + machine_ops.shutdown = xen_hvm_shutdown; + machine_ops.crash_shutdown = xen_hvm_crash_shutdown; +#endif } #endif diff --git a/include/xen/interface/sched.h b/include/xen/interface/sched.h index 9ce083960a25..f18490985fc8 100644 --- a/include/xen/interface/sched.h +++ b/include/xen/interface/sched.h @@ -107,5 +107,13 @@ struct sched_watchdog { #define SHUTDOWN_suspend 2 /* Clean up, save suspend info, kill. */ #define SHUTDOWN_crash 3 /* Tell controller we've crashed. */ #define SHUTDOWN_watchdog 4 /* Restart because watchdog time expired. */ +/* + * Domain asked to perform 'soft reset' for it. The expected behavior is to + * reset internal Xen state for the domain returning it to the point where it + * was created but leaving the domain's memory contents and vCPU contexts + * intact. This will allow the domain to start over and set up all Xen specific + * interfaces again. + */ +#define SHUTDOWN_soft_reset 5 #endif /* __XEN_PUBLIC_SCHED_H__ */ -- cgit v1.2.3 From 64c98e7f49100b637cd20a6c63508caed6bbba7a Mon Sep 17 00:00:00 2001 From: Malcolm Crossley Date: Mon, 28 Sep 2015 11:36:52 +0100 Subject: x86/xen: Do not clip xen_e820_map to xen_e820_map_entries when sanitizing map Sanitizing the e820 map may produce extra E820 entries which would result in the topmost E820 entries being removed. The removed entries would typically include the top E820 usable RAM region and thus result in the domain having significantly less RAM available to it. Fix by allowing sanitize_e820_map to use the full size of the allocated E820 array. Signed-off-by: Malcolm Crossley Reviewed-by: Boris Ostrovsky Cc: Signed-off-by: David Vrabel --- arch/x86/xen/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 4ebfcecc2a8b..1c30e4ab1022 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -798,7 +798,7 @@ char * __init xen_memory_setup(void) xen_ignore_unusable(); /* Make sure the Xen-supplied memory map is well-ordered.
*/ - sanitize_e820_map(xen_e820_map, xen_e820_map_entries, + sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map), &xen_e820_map_entries); max_pages = xen_get_max_pages(); -- cgit v1.2.3 From 1e034743e918d195d339af340ae933727c072bce Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 23 Sep 2015 12:02:57 +0200 Subject: x86/hyperv: Fix the build in the !CONFIG_KEXEC_CORE case Recent changes in the Hyper-V driver: b4370df2b1f5 ("Drivers: hv: vmbus: add special crash handler") broke the build when CONFIG_KEXEC_CORE is not set: arch/x86/built-in.o: In function `hv_machine_crash_shutdown': arch/x86/kernel/cpu/mshyperv.c:112: undefined reference to `native_machine_crash_shutdown' Decorate all kexec related code with #ifdef CONFIG_KEXEC_CORE. Reported-by: Jim Davis Reported-by: Stephen Hemminger Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Cc: devel@linuxdriverproject.org Cc: K. Y. Srinivasan Cc: Haiyang Zhang Cc: Greg Kroah-Hartman Cc: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1443002577-25370-1-git-send-email-vkuznets@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mshyperv.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 381c8b9b3a33..20e242ea1bc4 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -34,11 +34,10 @@ struct ms_hyperv_info ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); -static void (*hv_kexec_handler)(void); -static void (*hv_crash_handler)(struct pt_regs *regs); - #if IS_ENABLED(CONFIG_HYPERV) static void (*vmbus_handler)(void); +static void (*hv_kexec_handler)(void); +static void (*hv_crash_handler)(struct pt_regs *regs); void hyperv_vector_handler(struct pt_regs *regs) { @@ -96,8 +95,8 @@ void hv_remove_crash_handler(void) hv_crash_handler = NULL; } EXPORT_SYMBOL_GPL(hv_remove_crash_handler); -#endif +#ifdef CONFIG_KEXEC_CORE static void hv_machine_shutdown(void) { if (kexec_in_progress && hv_kexec_handler) @@ -111,7 +110,8 @@ static void hv_machine_crash_shutdown(struct pt_regs *regs) hv_crash_handler(regs); native_machine_crash_shutdown(regs); } - +#endif /* CONFIG_KEXEC_CORE */ +#endif /* CONFIG_HYPERV */ static uint32_t __init ms_hyperv_platform(void) { @@ -186,8 +186,10 @@ static void __init ms_hyperv_init_platform(void) no_timer_check = 1; #endif +#if IS_ENABLED(CONFIG_HYPERV) && defined(CONFIG_KEXEC_CORE) machine_ops.shutdown = hv_machine_shutdown; machine_ops.crash_shutdown = hv_machine_crash_shutdown; +#endif mark_tsc_unstable("running on Hyper-V"); } -- cgit v1.2.3 From 4ac86a6dcec1c3878de9747bf5a2aa4455be69e3 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 29 Sep 2015 19:40:14 +0300 Subject: x86, efi, kasan: Fix build failure on !KASAN && KMEMCHECK=y kernels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With KMEMCHECK=y, KASAN=n we get this build failure: arch/x86/platform/efi/efi.c:673:3: error: implicit declaration of function ‘memcpy’ [-Werror=implicit-function-declaration] arch/x86/platform/efi/efi_64.c:139:2: error: implicit declaration of function ‘memcpy’ [-Werror=implicit-function-declaration] arch/x86/include/asm/desc.h:121:2: error: implicit declaration of function ‘memcpy’ [-Werror=implicit-function-declaration] Don't #undef memcpy if KASAN=n. 
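The interaction is purely a preprocessor one; a sketch of the failing configuration (the KMEMCHECK macro body in asm/string_64.h is quoted from memory and should be treated as an assumption):

        /* asm/string_64.h with KMEMCHECK=y, KASAN=n: memcpy is only a macro */
        #define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len))

        /* asm/efi.h used to do this unconditionally */
        #undef memcpy

        /* any later caller now names a function with no visible declaration */
        memcpy(dst, src, len);  /* -> implicit declaration of 'memcpy' */

With the #undef kept under #ifdef CONFIG_KASAN, the KMEMCHECK build keeps its macro while the KASAN build still gets the plain symbols the EFI stub needs.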
Reported-by: Ingo Molnar Reported-by: Sedat Dilek Signed-off-by: Andrey Ryabinin Cc: Linus Torvalds Cc: Matt Fleming Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 769a8089c1fd ("x86, efi, kasan: #undef memset/memcpy/memmove per arch") Link: http://lkml.kernel.org/r/1443544814-20122-1-git-send-email-ryabinin.a.a@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/efi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index ab5f1d447ef9..ae68be92f755 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -86,6 +86,7 @@ extern u64 asmlinkage efi_call(void *fp, ...); extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size, u32 type, u64 attribute); +#ifdef CONFIG_KASAN /* * CONFIG_KASAN may redefine memset to __memset. __memset function is present * only in kernel binary. Since the EFI stub linked into a separate binary it @@ -95,6 +96,7 @@ extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size, #undef memcpy #undef memset #undef memmove +#endif #endif /* CONFIG_X86_32 */ -- cgit v1.2.3 From d786ad32c305ca0f6be1924558866fe9f901e291 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 29 Sep 2015 22:37:02 +0200 Subject: x86/apic: Deinline various functions __x2apic_disable: 178 bytes, 3 calls __x2apic_enable: 117 bytes, 3 calls __smp_spurious_interrupt: 110 bytes, 2 calls __smp_error_interrupt: 208 bytes, 2 calls Reduces code size by about 850 bytes. Signed-off-by: Denys Vlasenko Cc: Borislav Petkov Cc: Andy Lutomirski Link: http://lkml.kernel.org/r/1443559022-23793-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/apic.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 24e94ce454e2..2f69e3b184f6 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1431,7 +1431,7 @@ enum { }; static int x2apic_state; -static inline void __x2apic_disable(void) +static void __x2apic_disable(void) { u64 msr; @@ -1447,7 +1447,7 @@ static inline void __x2apic_disable(void) printk_once(KERN_INFO "x2apic disabled\n"); } -static inline void __x2apic_enable(void) +static void __x2apic_enable(void) { u64 msr; @@ -1807,7 +1807,7 @@ int apic_version[MAX_LOCAL_APIC]; /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -static inline void __smp_spurious_interrupt(u8 vector) +static void __smp_spurious_interrupt(u8 vector) { u32 v; @@ -1848,7 +1848,7 @@ __visible void smp_trace_spurious_interrupt(struct pt_regs *regs) /* * This interrupt should never happen with our APIC/SMP architecture */ -static inline void __smp_error_interrupt(struct pt_regs *regs) +static void __smp_error_interrupt(struct pt_regs *regs) { u32 v; u32 i = 0; -- cgit v1.2.3 From e02ae3871355194a61b03a07d96fd71e81d7eff9 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 28 Sep 2015 12:26:31 +0200 Subject: x86/x2apic: Make stub functions available even if !CONFIG_X86_LOCAL_APIC Some CONFIG_X86_X2APIC functions, especially x2apic_enabled(), are not declared if !CONFIG_X86_LOCAL_APIC. However, the same stubs that work for !CONFIG_X86_X2APIC are okay even if there is no local APIC support at all. Avoid the introduction of #ifdefs by moving the x2apic declarations completely outside the CONFIG_X86_LOCAL_APIC block. 
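The net layout of <asm/apic.h> after the move, in outline (a sketch, not the literal header):

        #ifdef CONFIG_X86_LOCAL_APIC
        /* ... local APIC declarations ... */
        #else   /* !CONFIG_X86_LOCAL_APIC */
        /* ... local APIC stubs ... */
        #endif

        #ifdef CONFIG_X86_X2APIC
        /* ... x2apic declarations, e.g. x2apic_enabled() ... */
        #else   /* !CONFIG_X86_X2APIC */
        /* ... x2apic stubs, now visible even when the local APIC is off ... */
        #endif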
(Unfortunately, diff generation messes up the actual change that this patch makes). There is no semantic change because CONFIG_X86_X2APIC depends on CONFIG_X86_LOCAL_APIC. Reported-by: Fengguang Wu Signed-off-by: Paolo Bonzini Cc: Feng Wu Link: http://lkml.kernel.org/r/1443435991-35750-1-git-send-email-pbonzini@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/apic.h | 110 ++++++++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 55 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index ebf6d5e5668c..a30316bf801a 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -115,6 +115,59 @@ static inline bool apic_is_x2apic_enabled(void) return msr & X2APIC_ENABLE; } +extern void enable_IR_x2apic(void); + +extern int get_physical_broadcast(void); + +extern int lapic_get_maxlvt(void); +extern void clear_local_APIC(void); +extern void disconnect_bsp_APIC(int virt_wire_setup); +extern void disable_local_APIC(void); +extern void lapic_shutdown(void); +extern void sync_Arb_IDs(void); +extern void init_bsp_APIC(void); +extern void setup_local_APIC(void); +extern void init_apic_mappings(void); +void register_lapic_address(unsigned long address); +extern void setup_boot_APIC_clock(void); +extern void setup_secondary_APIC_clock(void); +extern int APIC_init_uniprocessor(void); + +#ifdef CONFIG_X86_64 +static inline int apic_force_enable(unsigned long addr) +{ + return -1; +} +#else +extern int apic_force_enable(unsigned long addr); +#endif + +extern int apic_bsp_setup(bool upmode); +extern void apic_ap_setup(void); + +/* + * On 32bit this is mach-xxx local + */ +#ifdef CONFIG_X86_64 +extern int apic_is_clustered_box(void); +#else +static inline int apic_is_clustered_box(void) +{ + return 0; +} +#endif + +extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask); + +#else /* !CONFIG_X86_LOCAL_APIC */ +static inline void lapic_shutdown(void) { } +#define local_apic_timer_c2_ok 1 +static inline void init_apic_mappings(void) { } +static inline void disable_local_APIC(void) { } +# define setup_boot_APIC_clock x86_init_noop +# define setup_secondary_APIC_clock x86_init_noop +#endif /* !CONFIG_X86_LOCAL_APIC */ + #ifdef CONFIG_X86_X2APIC /* * Make previous memory operations globally visible before @@ -186,67 +239,14 @@ static inline int x2apic_enabled(void) } #define x2apic_supported() (cpu_has_x2apic) -#else +#else /* !CONFIG_X86_X2APIC */ static inline void check_x2apic(void) { } static inline void x2apic_setup(void) { } static inline int x2apic_enabled(void) { return 0; } #define x2apic_mode (0) #define x2apic_supported() (0) -#endif - -extern void enable_IR_x2apic(void); - -extern int get_physical_broadcast(void); - -extern int lapic_get_maxlvt(void); -extern void clear_local_APIC(void); -extern void disconnect_bsp_APIC(int virt_wire_setup); -extern void disable_local_APIC(void); -extern void lapic_shutdown(void); -extern void sync_Arb_IDs(void); -extern void init_bsp_APIC(void); -extern void setup_local_APIC(void); -extern void init_apic_mappings(void); -void register_lapic_address(unsigned long address); -extern void setup_boot_APIC_clock(void); -extern void setup_secondary_APIC_clock(void); -extern int APIC_init_uniprocessor(void); - -#ifdef CONFIG_X86_64 -static inline int apic_force_enable(unsigned long addr) -{ - return -1; -} -#else -extern int apic_force_enable(unsigned long addr); -#endif - -extern int apic_bsp_setup(bool upmode); -extern void apic_ap_setup(void); - 
-/* - * On 32bit this is mach-xxx local - */ -#ifdef CONFIG_X86_64 -extern int apic_is_clustered_box(void); -#else -static inline int apic_is_clustered_box(void) -{ - return 0; -} -#endif - -extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask); - -#else /* !CONFIG_X86_LOCAL_APIC */ -static inline void lapic_shutdown(void) { } -#define local_apic_timer_c2_ok 1 -static inline void init_apic_mappings(void) { } -static inline void disable_local_APIC(void) { } -# define setup_boot_APIC_clock x86_init_noop -# define setup_secondary_APIC_clock x86_init_noop -#endif /* !CONFIG_X86_LOCAL_APIC */ +#endif /* !CONFIG_X86_X2APIC */ #ifdef CONFIG_X86_64 #define SET_APIC_ID(x) (apic->set_apic_id(x)) -- cgit v1.2.3 From eddd3826a1a0190e5235703d1e666affa4d13b96 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Sep 2015 08:38:22 +0000 Subject: x86/process: Add proper bound checks in 64bit get_wchan() Dmitry Vyukov reported the following using trinity and the memory error detector AddressSanitizer (https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerForKernel). [ 124.575597] ERROR: AddressSanitizer: heap-buffer-overflow on address ffff88002e280000 [ 124.576801] ffff88002e280000 is located 131938492886538 bytes to the left of 28857600-byte region [ffffffff81282e0a, ffffffff82e0830a) [ 124.578633] Accessed by thread T10915: [ 124.579295] inlined in describe_heap_address ./arch/x86/mm/asan/report.c:164 [ 124.579295] #0 ffffffff810dd277 in asan_report_error ./arch/x86/mm/asan/report.c:278 [ 124.580137] #1 ffffffff810dc6a0 in asan_check_region ./arch/x86/mm/asan/asan.c:37 [ 124.581050] #2 ffffffff810dd423 in __tsan_read8 ??:0 [ 124.581893] #3 ffffffff8107c093 in get_wchan ./arch/x86/kernel/process_64.c:444 The address checks in the 64bit implementation of get_wchan() are wrong in several ways: - The lower bound of the stack is not the start of the stack page. It's the start of the stack page plus sizeof (struct thread_info) - The upper bound must be: top_of_stack - TOP_OF_KERNEL_STACK_PADDING - 2 * sizeof(unsigned long). The 2 * sizeof(unsigned long) is required because the stack pointer points at the frame pointer. The layout on the stack is: ... IP FP ... IP FP. So we need to make sure that both IP and FP are in the bounds. Fix the bound checks and get rid of the mix of numeric constants, u64 and unsigned long. Making all unsigned long allows us to use the same function for 32bit as well. Use READ_ONCE() when accessing the stack. This does not prevent a concurrent wakeup of the task and the stack changing, but at least it avoids TOCTOU. Also check task state at the end of the loop. Again that does not prevent concurrent changes, but it avoids walking for nothing. Add proper comments while at it. 
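The READ_ONCE() accesses are the load-bearing part: each stack word is sampled exactly once, and that snapshot is both bounds-checked and dereferenced, as in this excerpt from the patch below:

        fp = READ_ONCE(*(unsigned long *)sp);
        do {
                if (fp < bottom || fp > top)
                        return 0;

Without it the compiler would be free to reload the (possibly changing) stack slot between the check and the use.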
Reported-by: Dmitry Vyukov Reported-by: Sasha Levin Based-on-patch-from: Wolfram Gloger Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andrey Konovalov Cc: Kostya Serebryany Cc: Alexander Potapenko Cc: kasan-dev Cc: Denys Vlasenko Cc: Andi Kleen Cc: Wolfram Gloger Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20150930083302.694788319@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process_64.c | 52 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3c1bbcf12924..f1fd0889e8af 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -499,27 +499,59 @@ void set_personality_ia32(bool x32) } EXPORT_SYMBOL_GPL(set_personality_ia32); +/* + * Called from fs/proc with a reference on @p to find the function + * which called into schedule(). This needs to be done carefully + * because the task might wake up and we might look at a stack + * changing under us. + */ unsigned long get_wchan(struct task_struct *p) { - unsigned long stack; - u64 fp, ip; + unsigned long start, bottom, top, sp, fp, ip; int count = 0; if (!p || p == current || p->state == TASK_RUNNING) return 0; - stack = (unsigned long)task_stack_page(p); - if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE) + + start = (unsigned long)task_stack_page(p); + if (!start) + return 0; + + /* + * Layout of the stack page: + * + * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long) + * PADDING + * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING + * stack + * ----------- bottom = start + sizeof(thread_info) + * thread_info + * ----------- start + * + * The tasks stack pointer points at the location where the + * framepointer is stored. The data on the stack is: + * ... IP FP ... IP FP + * + * We need to read FP and IP, so we need to adjust the upper + * bound by another unsigned long. + */ + top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; + top -= 2 * sizeof(unsigned long); + bottom = start + sizeof(struct thread_info); + + sp = READ_ONCE(p->thread.sp); + if (sp < bottom || sp > top) return 0; - fp = *(u64 *)(p->thread.sp); + + fp = READ_ONCE(*(unsigned long *)sp); do { - if (fp < (unsigned long)stack || - fp >= (unsigned long)stack+THREAD_SIZE) + if (fp < bottom || fp > top) return 0; - ip = *(u64 *)(fp+8); + ip = READ_ONCE(*(unsigned long *)(fp + sizeof(unsigned long))); if (!in_sched_functions(ip)) return ip; - fp = *(u64 *)fp; - } while (count++ < 16); + fp = READ_ONCE(*(unsigned long *)fp); + } while (count++ < 16 && p->state != TASK_RUNNING); return 0; } -- cgit v1.2.3 From 7ba78053aacb89998a052843e3c56983c31d57f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Sep 2015 08:38:23 +0000 Subject: x86/process: Unify 32bit and 64bit implementations of get_wchan() The stack layout and the functionality is identical. Use the 64bit version for all of x86. 
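The equivalence is visible in a single line: the removed 32-bit walker fetched the return address with

        ip = *(unsigned long *) (bp+4);

while the unified version reads fp + sizeof(unsigned long), which is the same offset on a 32-bit kernel; the 32-bit path additionally gains the READ_ONCE() sampling and the TASK_RUNNING re-check.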
Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andrey Konovalov Cc: Kostya Serebryany Cc: Alexander Potapenko Cc: kasan-dev Cc: Denys Vlasenko Cc: Andi Kleen Cc: Sasha Levin Cc: Wolfram Gloger Link: http://lkml.kernel.org/r/20150930083302.779694618@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process.c | 55 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/process_32.c | 28 ---------------------- arch/x86/kernel/process_64.c | 56 -------------------------------------------- 3 files changed, 55 insertions(+), 84 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 6d0e62ae8516..39e585a554b7 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -506,3 +506,58 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) return randomize_range(mm->brk, range_end, 0) ? : mm->brk; } +/* + * Called from fs/proc with a reference on @p to find the function + * which called into schedule(). This needs to be done carefully + * because the task might wake up and we might look at a stack + * changing under us. + */ +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long start, bottom, top, sp, fp, ip; + int count = 0; + + if (!p || p == current || p->state == TASK_RUNNING) + return 0; + + start = (unsigned long)task_stack_page(p); + if (!start) + return 0; + + /* + * Layout of the stack page: + * + * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long) + * PADDING + * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING + * stack + * ----------- bottom = start + sizeof(thread_info) + * thread_info + * ----------- start + * + * The tasks stack pointer points at the location where the + * framepointer is stored. The data on the stack is: + * ... IP FP ... IP FP + * + * We need to read FP and IP, so we need to adjust the upper + * bound by another unsigned long. + */ + top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; + top -= 2 * sizeof(unsigned long); + bottom = start + sizeof(struct thread_info); + + sp = READ_ONCE(p->thread.sp); + if (sp < bottom || sp > top) + return 0; + + fp = READ_ONCE(*(unsigned long *)sp); + do { + if (fp < bottom || fp > top) + return 0; + ip = READ_ONCE(*(unsigned long *)(fp + sizeof(unsigned long))); + if (!in_sched_functions(ip)) + return ip; + fp = READ_ONCE(*(unsigned long *)fp); + } while (count++ < 16 && p->state != TASK_RUNNING); + return 0; +} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index c13df2c735f8..737527b40e5b 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -324,31 +324,3 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) return prev_p; } - -#define top_esp (THREAD_SIZE - sizeof(unsigned long)) -#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) - -unsigned long get_wchan(struct task_struct *p) -{ - unsigned long bp, sp, ip; - unsigned long stack_page; - int count = 0; - if (!p || p == current || p->state == TASK_RUNNING) - return 0; - stack_page = (unsigned long)task_stack_page(p); - sp = p->thread.sp; - if (!stack_page || sp < stack_page || sp > top_esp+stack_page) - return 0; - /* include/asm-i386/system.h:switch_to() pushes bp last. 
*/ - bp = *(unsigned long *) sp; - do { - if (bp < stack_page || bp > top_ebp+stack_page) - return 0; - ip = *(unsigned long *) (bp+4); - if (!in_sched_functions(ip)) - return ip; - bp = *(unsigned long *) bp; - } while (count++ < 16); - return 0; -} - diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index f1fd0889e8af..b35921a670b2 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -499,62 +499,6 @@ void set_personality_ia32(bool x32) } EXPORT_SYMBOL_GPL(set_personality_ia32); -/* - * Called from fs/proc with a reference on @p to find the function - * which called into schedule(). This needs to be done carefully - * because the task might wake up and we might look at a stack - * changing under us. - */ -unsigned long get_wchan(struct task_struct *p) -{ - unsigned long start, bottom, top, sp, fp, ip; - int count = 0; - - if (!p || p == current || p->state == TASK_RUNNING) - return 0; - - start = (unsigned long)task_stack_page(p); - if (!start) - return 0; - - /* - * Layout of the stack page: - * - * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long) - * PADDING - * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING - * stack - * ----------- bottom = start + sizeof(thread_info) - * thread_info - * ----------- start - * - * The tasks stack pointer points at the location where the - * framepointer is stored. The data on the stack is: - * ... IP FP ... IP FP - * - * We need to read FP and IP, so we need to adjust the upper - * bound by another unsigned long. - */ - top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; - top -= 2 * sizeof(unsigned long); - bottom = start + sizeof(struct thread_info); - - sp = READ_ONCE(p->thread.sp); - if (sp < bottom || sp > top) - return 0; - - fp = READ_ONCE(*(unsigned long *)sp); - do { - if (fp < bottom || fp > top) - return 0; - ip = READ_ONCE(*(unsigned long *)(fp + sizeof(unsigned long))); - if (!in_sched_functions(ip)) - return ip; - fp = READ_ONCE(*(unsigned long *)fp); - } while (count++ < 16 && p->state != TASK_RUNNING); - return 0; -} - long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) { int ret = 0; -- cgit v1.2.3 From e6e5f84092b1f3a8733c20e26838af4e21a4854f Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 28 Sep 2015 14:23:55 +0200 Subject: x86/e820: Deinline e820_type_to_string, save 126 bytes This function compiles to 102 bytes of machine code. It has two callsites. Signed-off-by: Denys Vlasenko Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Link: http://lkml.kernel.org/r/1443443037-22077-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/e820.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index a102564d08eb..569c1e4f96fe 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -911,7 +911,7 @@ void __init finish_e820_parsing(void) } } -static inline const char *e820_type_to_string(int e820_type) +static const char *e820_type_to_string(int e820_type) { switch (e820_type) { case E820_RESERVED_KERN: -- cgit v1.2.3 From c368ef2866adbfc0e90fdecc09e3b3fe2cddcd14 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 28 Sep 2015 14:23:56 +0200 Subject: x86: Deinline early_console_register, save 403 bytes This function compiles to 60 bytes of machine code. 
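The saving is simple arithmetic (a back-of-the-envelope sketch assuming a ~5-byte call instruction): a function of size S inlined at K call sites costs about K*S bytes of text, versus S + 5*K when out of line, so

        K*60 - (60 + 5*K) = 403  =>  K ~= 8

which is consistent with early_console_register() being invoked once per supported early console type.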
Signed-off-by: Denys Vlasenko Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Link: http://lkml.kernel.org/r/1443443037-22077-3-git-send-email-dvlasenk@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/early_printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index eec40f595ab9..076a4a739aa6 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -316,7 +316,7 @@ static struct console early_serial_console = { .index = -1, }; -static inline void early_console_register(struct console *con, int keep_early) +static void early_console_register(struct console *con, int keep_early) { if (con->index != -1) { printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n", -- cgit v1.2.3 From dae0f305d61b07933a129dfe975342f3177277d5 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 28 Sep 2015 14:23:57 +0200 Subject: x86/signal: Deinline get_sigframe, save 240 bytes This function compiles to 277 bytes of machine code and has 4 callsites. Signed-off-by: Denys Vlasenko Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Link: http://lkml.kernel.org/r/1443443037-22077-4-git-send-email-dvlasenk@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index da52e6bb5c7f..2c3336b6b264 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -196,7 +196,7 @@ static unsigned long align_sigframe(unsigned long sp) return sp; } -static inline void __user * +static void __user * get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, void __user **fpstate) { -- cgit v1.2.3 From c2365b9388e8ec19305e3f449c1826e7493d156d Mon Sep 17 00:00:00 2001 From: Vaishali Thakkar Date: Thu, 1 Oct 2015 14:22:02 +0530 Subject: perf/x86/intel/uncore: Do not use macro DEFINE_PCI_DEVICE_TABLE() The DEFINE_PCI_DEVICE_TABLE() macro is deprecated. Use 'struct pci_device_id' instead of DEFINE_PCI_DEVICE_TABLE(), with the goal of getting rid of this macro completely. 
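(By this point in the tree the macro had become mere shorthand; from memory of include/linux/pci.h, and so an assumption, it expanded to roughly:

        #define DEFINE_PCI_DEVICE_TABLE(_table) \
                const struct pci_device_id _table[]

so spelling the type out costs nothing and is clearer.)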
This Coccinelle semantic patch performs this transformation: @@ identifier a; declarer name DEFINE_PCI_DEVICE_TABLE; initializer i; @@ - DEFINE_PCI_DEVICE_TABLE(a) + const struct pci_device_id a[] = i; Signed-off-by: Vaishali Thakkar Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20151001085201.GA16939@localhost Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index 694510a887dc..8cbb3f386b07 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -2444,7 +2444,7 @@ static struct intel_uncore_type *bdx_pci_uncores[] = { NULL, }; -static DEFINE_PCI_DEVICE_TABLE(bdx_uncore_pci_ids) = { +static const struct pci_device_id bdx_uncore_pci_ids[] = { { /* Home Agent 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30), .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0), -- cgit v1.2.3 From a7e705af524d165fe7bc303aee82225c66734885 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 1 Oct 2015 10:55:29 +0800 Subject: x86/irq: Drop unlikely before IS_ERR_OR_NULL IS_ERR_OR_NULL() already contains an unlikely() compiler hint. Drop it. Signed-off-by: Geliang Tang Cc: Brian Gerst Cc: Andy Lutomirski Link: http://lkml.kernel.org/r/03d18502ed7ed417f136c091f417d2d88c147ec6.1443667610.git.geliangtang@163.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/irq_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index c767cf2bc80a..206d0b90a3ab 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -72,7 +72,7 @@ bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) { stack_overflow_check(regs); - if (unlikely(IS_ERR_OR_NULL(desc))) + if (IS_ERR_OR_NULL(desc)) return false; generic_handle_irq_desc(desc); -- cgit v1.2.3 From a5caa209ba9c29c6421292e7879d2387a2ef39c9 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 25 Sep 2015 23:02:18 +0100 Subject: x86/efi: Fix boot crash by mapping EFI memmap entries bottom-up at runtime, instead of top-down UEFI v2.5 introduced EFI_PROPERTIES_TABLE, which signals that the firmware PE/COFF loader supports splitting code and data sections of PE/COFF images into separate EFI memory map entries. This allows the kernel to map those regions with strict memory protections, e.g. EFI_MEMORY_RO for code, EFI_MEMORY_XP for data, etc. Unfortunately, an unwritten requirement of this new feature is that the regions need to be mapped with the same offsets relative to each other as observed in the EFI memory map. If this is not done, crashes like this may occur: BUG: unable to handle kernel paging request at fffffffefe6086dd IP: [] 0xfffffffefe6086dd Call Trace: [] efi_call+0x7e/0x100 [] ? virt_efi_set_variable+0x61/0x90 [] efi_delete_dummy_variable+0x63/0x70 [] efi_enter_virtual_mode+0x383/0x392 [] start_kernel+0x38a/0x417 [] x86_64_start_reservations+0x2a/0x2c [] x86_64_start_kernel+0xeb/0xef Here 0xfffffffefe6086dd refers to an address the firmware expects to be mapped but which the OS never claimed was mapped.
The issue is that included in these regions are relative addresses to other regions which were emitted by the firmware toolchain before the "splitting" of sections occurred at runtime. Needless to say, we don't satisfy this unwritten requirement on x86_64 and instead map the EFI memory map entries in reverse order. The above crash is almost certainly triggerable with any kernel newer than v3.13 because that's when we rewrote the EFI runtime region mapping code, in commit d2f7cbe7b26a ("x86/efi: Runtime services virtual mapping"). For kernel versions before v3.13 things may work by pure luck depending on the fragmentation of the kernel virtual address space at the time we map the EFI regions. Instead of mapping the EFI memory map entries in reverse order, where entry N has a higher virtual address than entry N+1, map them in the same order as they appear in the EFI memory map to preserve this relative offset between regions. This patch has been kept as small as possible with the intention that it should be applied aggressively to stable and distribution kernels. It is very much a bugfix rather than support for a new feature, since when EFI_PROPERTIES_TABLE is enabled we must map things as outlined above to even boot - we have no way of asking the firmware not to split the code/data regions. In fact, this patch doesn't even make use of the more strict memory protections available in UEFI v2.5. That will come later. Suggested-by: Ard Biesheuvel Reported-by: Ard Biesheuvel Signed-off-by: Matt Fleming Cc: Cc: Borislav Petkov Cc: Chun-Yi Cc: Dave Young Cc: H. Peter Anvin Cc: James Bottomley Cc: Lee, Chun-Yi Cc: Leif Lindholm Cc: Linus Torvalds Cc: Matthew Garrett Cc: Mike Galbraith Cc: Peter Jones Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1443218539-7610-2-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi.c | 67 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 1db84c0758b7..6a28ded74211 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -704,6 +704,70 @@ out: return ret; } +/* + * Iterate the EFI memory map in reverse order because the regions + * will be mapped top-down. The end result is the same as if we had + * mapped things forward, but doesn't require us to change the + * existing implementation of efi_map_region(). + */ +static inline void *efi_map_next_entry_reverse(void *entry) +{ + /* Initial call */ + if (!entry) + return memmap.map_end - memmap.desc_size; + + entry -= memmap.desc_size; + if (entry < memmap.map) + return NULL; + + return entry; +} + +/* + * efi_map_next_entry - Return the next EFI memory map descriptor + * @entry: Previous EFI memory map descriptor + * + * This is a helper function to iterate over the EFI memory map, which + * we do in different orders depending on the current configuration. + * + * To begin traversing the memory map @entry must be %NULL. + * + * Returns %NULL when we reach the end of the memory map. + */ +static void *efi_map_next_entry(void *entry) +{ + if (!efi_enabled(EFI_OLD_MEMMAP) && efi_enabled(EFI_64BIT)) { + /* + * Starting in UEFI v2.5 the EFI_PROPERTIES_TABLE + * config table feature requires us to map all entries + * in the same order as they appear in the EFI memory + * map. That is to say, entry N must have a lower + * virtual address than entry N+1. 
This is because the + * firmware toolchain leaves relative references in + * the code/data sections, which are split and become + * separate EFI memory regions. Mapping things + * out-of-order leads to the firmware accessing + * unmapped addresses. + * + * Since we need to map things this way whether or not + * the kernel actually makes use of + * EFI_PROPERTIES_TABLE, let's just switch to this + * scheme by default for 64-bit. + */ + return efi_map_next_entry_reverse(entry); + } + + /* Initial call */ + if (!entry) + return memmap.map; + + entry += memmap.desc_size; + if (entry >= memmap.map_end) + return NULL; + + return entry; +} + /* * Map the efi memory ranges of the runtime services and update new_mmap with * virtual addresses. @@ -714,7 +778,8 @@ static void * __init efi_map_regions(int *count, int *pg_shift) unsigned long left = 0; efi_memory_desc_t *md; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + p = NULL; + while ((p = efi_map_next_entry(p))) { md = p; if (!(md->attribute & EFI_MEMORY_RUNTIME)) { #ifdef CONFIG_X86_64 -- cgit v1.2.3 From 606decd67049217684e3cb5a54104d51ddd4ef35 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 1 Oct 2015 13:12:47 +0200 Subject: Revert "KVM: x86: apply guest MTRR virtualization on host reserved pages" This reverts commit fd717f11015f673487ffc826e59b2bad69d20fe5. It was reported to cause Machine Check Exceptions (bug 104091). Reported-by: harn-solo@gmx.de Cc: stable@vger.kernel.org # 4.2+ Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 7 +++++-- arch/x86/kvm/vmx.c | 11 ++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 94b7d15db3fc..9e88078f4146 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1167,11 +1167,14 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) u8 mtrr; /* - * 1. MMIO: trust guest MTRR, so same as item 3. + * 1. MMIO: always map as UC * 2. No passthrough: always map as WB, and force guest PAT to WB as well * 3. Passthrough: can't guarantee the result, try to trust guest. */ - if (!is_mmio && !kvm_arch_has_assigned_device(vcpu->kvm)) + if (is_mmio) + return _PAGE_NOCACHE; + + if (!kvm_arch_has_assigned_device(vcpu->kvm)) return 0; if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED) && diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 64076740251e..06ef4908ba61 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8617,17 +8617,22 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) u64 ipat = 0; /* For VT-d and EPT combination - * 1. MMIO: guest may want to apply WC, trust it. + * 1. MMIO: always map as UC * 2. EPT with VT-d: * a. VT-d without snooping control feature: can't guarantee the - * result, try to trust guest. So the same as item 1. + * result, try to trust guest. * b. VT-d with snooping control feature: snooping control feature of * VT-d engine can guarantee the cache correctness. Just set it * to WB to keep consistent with host. So the same as item 3. * 3. 
EPT without VT-d: always map as WB and set IPAT=1 to keep * consistent with host MTRR */ - if (!is_mmio && !kvm_arch_has_noncoherent_dma(vcpu->kvm)) { + if (is_mmio) { + cache = MTRR_TYPE_UNCACHABLE; + goto exit; + } + + if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { ipat = VMX_EPT_IPAT_BIT; cache = MTRR_TYPE_WRBACK; goto exit; -- cgit v1.2.3 From 625422f60c55bbc368b8568ff925770b36bfc189 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 1 Oct 2015 13:28:15 +0200 Subject: Revert "KVM: SVM: Sync g_pat with guest-written PAT value" This reverts commit e098223b789b4a618dacd79e5e0dad4a9d5018d1, which has a dependency on other commits being reverted. Cc: stable@vger.kernel.org # 4.2+ Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9e88078f4146..22601206ffd7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3354,16 +3354,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_VM_IGNNE: vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; - case MSR_IA32_CR_PAT: - if (npt_enabled) { - if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) - return 1; - vcpu->arch.pat = data; - svm_set_guest_pat(svm, &svm->vmcb->save.g_pat); - mark_dirty(svm->vmcb, VMCB_NPT); - break; - } - /* fall through */ default: return kvm_set_msr_common(vcpu, msr); } -- cgit v1.2.3 From bcf166a9942c3aabd8b752a7c38a49f57c54cfb8 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 1 Oct 2015 13:19:55 +0200 Subject: Revert "KVM: svm: handle KVM_X86_QUIRK_CD_NW_CLEARED in svm_get_mt_mask" This reverts commit 5492830370171b6a4ede8a3bfba687a8d0f25fa5. It builds on the commit that is being reverted next. Cc: stable@vger.kernel.org # 4.2+ Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 22601206ffd7..7c242b7007ac 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1177,10 +1177,6 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) if (!kvm_arch_has_assigned_device(vcpu->kvm)) return 0; - if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED) && - kvm_read_cr0(vcpu) & X86_CR0_CD) - return _PAGE_NOCACHE; - mtrr = kvm_mtrr_get_guest_memory_type(vcpu, gfn); return mtrr2protval[mtrr]; } @@ -1676,10 +1672,13 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!vcpu->fpu_active) cr0 |= X86_CR0_TS; - - /* These are emulated via page tables. */ - cr0 &= ~(X86_CR0_CD | X86_CR0_NW); - + /* + * re-enable caching here because the QEMU bios + * does not do it - this results in some delay at + * reboot + */ + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) + cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; mark_dirty(svm->vmcb, VMCB_CR); update_cr0_intercept(svm); -- cgit v1.2.3 From fc07e76ac7ffa3afd621a1c3858a503386a14281 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 1 Oct 2015 13:20:22 +0200 Subject: Revert "KVM: SVM: use NPT page attributes" This reverts commit 3c2e7f7de3240216042b61073803b61b9b3cfb22. Initializing the mapping from MTRR to PAT values was reported to fail nondeterministically, and it also caused extremely slow boot (due to caching getting disabled---bug 103321) with assigned devices. 
Reported-by: Markus Trippelsdorf Reported-by: Sebastian Schuette Cc: stable@vger.kernel.org # 4.2+ Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 101 +++-------------------------------------------------- 1 file changed, 5 insertions(+), 96 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7c242b7007ac..99b3c5f40bf7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -866,64 +866,6 @@ static void svm_disable_lbrv(struct vcpu_svm *svm) set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); } -#define MTRR_TYPE_UC_MINUS 7 -#define MTRR2PROTVAL_INVALID 0xff - -static u8 mtrr2protval[8]; - -static u8 fallback_mtrr_type(int mtrr) -{ - /* - * WT and WP aren't always available in the host PAT. Treat - * them as UC and UC- respectively. Everything else should be - * there. - */ - switch (mtrr) - { - case MTRR_TYPE_WRTHROUGH: - return MTRR_TYPE_UNCACHABLE; - case MTRR_TYPE_WRPROT: - return MTRR_TYPE_UC_MINUS; - default: - BUG(); - } -} - -static void build_mtrr2protval(void) -{ - int i; - u64 pat; - - for (i = 0; i < 8; i++) - mtrr2protval[i] = MTRR2PROTVAL_INVALID; - - /* Ignore the invalid MTRR types. */ - mtrr2protval[2] = 0; - mtrr2protval[3] = 0; - - /* - * Use host PAT value to figure out the mapping from guest MTRR - * values to nested page table PAT/PCD/PWT values. We do not - * want to change the host PAT value every time we enter the - * guest. - */ - rdmsrl(MSR_IA32_CR_PAT, pat); - for (i = 0; i < 8; i++) { - u8 mtrr = pat >> (8 * i); - - if (mtrr2protval[mtrr] == MTRR2PROTVAL_INVALID) - mtrr2protval[mtrr] = __cm_idx2pte(i); - } - - for (i = 0; i < 8; i++) { - if (mtrr2protval[i] == MTRR2PROTVAL_INVALID) { - u8 fallback = fallback_mtrr_type(i); - mtrr2protval[i] = mtrr2protval[fallback]; - BUG_ON(mtrr2protval[i] == MTRR2PROTVAL_INVALID); - } - } -} - static __init int svm_hardware_setup(void) { int cpu; @@ -990,7 +932,6 @@ static __init int svm_hardware_setup(void) } else kvm_disable_tdp(); - build_mtrr2protval(); return 0; err: @@ -1145,42 +1086,6 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) return target_tsc - tsc; } -static void svm_set_guest_pat(struct vcpu_svm *svm, u64 *g_pat) -{ - struct kvm_vcpu *vcpu = &svm->vcpu; - - /* Unlike Intel, AMD takes the guest's CR0.CD into account. - * - * AMD does not have IPAT. To emulate it for the case of guests - * with no assigned devices, just set everything to WB. If guests - * have assigned devices, however, we cannot force WB for RAM - * pages only, so use the guest PAT directly. - */ - if (!kvm_arch_has_assigned_device(vcpu->kvm)) - *g_pat = 0x0606060606060606; - else - *g_pat = vcpu->arch.pat; -} - -static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) -{ - u8 mtrr; - - /* - * 1. MMIO: always map as UC - * 2. No passthrough: always map as WB, and force guest PAT to WB as well - * 3. Passthrough: can't guarantee the result, try to trust guest. 
- */ - if (is_mmio) - return _PAGE_NOCACHE; - - if (!kvm_arch_has_assigned_device(vcpu->kvm)) - return 0; - - mtrr = kvm_mtrr_get_guest_memory_type(vcpu, gfn); - return mtrr2protval[mtrr]; -} - static void init_vmcb(struct vcpu_svm *svm, bool init_event) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -1277,7 +1182,6 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event) clr_cr_intercept(svm, INTERCEPT_CR3_READ); clr_cr_intercept(svm, INTERCEPT_CR3_WRITE); save->g_pat = svm->vcpu.arch.pat; - svm_set_guest_pat(svm, &save->g_pat); save->cr3 = 0; save->cr4 = 0; } @@ -4187,6 +4091,11 @@ static bool svm_has_high_real_mode_segbase(void) return true; } +static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) +{ + return 0; +} + static void svm_cpuid_update(struct kvm_vcpu *vcpu) { } -- cgit v1.2.3 From d2922422c48df93f3edff7d872ee4f3191fefb08 Mon Sep 17 00:00:00 2001 From: Dirk Müller Date: Thu, 1 Oct 2015 13:43:42 +0200 Subject: Use WARN_ON_ONCE for missing X86_FEATURE_NRIPS The cpu feature flags are not ever going to change, so warning every time can cause a lot of kernel log spam (in our case more than 10GB/hour). The warning seems to only occur when nested virtualization is enabled, so it's probably triggered by a KVM bug. This is a sensible and safe change in any case, and the KVM bug fix might not be suitable for stable releases anyway. Cc: stable@vger.kernel.org Signed-off-by: Dirk Mueller Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 99b3c5f40bf7..2f9ed1ff0632 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -514,7 +514,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); if (svm->vmcb->control.next_rip != 0) { - WARN_ON(!static_cpu_has(X86_FEATURE_NRIPS)); + WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS)); svm->next_rip = svm->vmcb->control.next_rip; } -- cgit v1.2.3 From bdaffe1d93e7eddbcc71d074a5d49eba7fe1c765 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 29 Jul 2015 15:03:06 +0200 Subject: KVM: x86: set TMR when the interrupt is accepted Do not compute the TMR in advance. Instead, set the TMR just before the interrupt is accepted into the IRR. This limits the coupling between IOAPIC and LAPIC.
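(For background: the TMR is the local APIC's Trigger Mode Register; a set bit marks a vector as level-triggered, which is what the EOI path keys off to decide whether the IOAPIC must be notified, essentially:

        if (apic_test_vector(vector, apic->regs + APIC_TMR))
                trigger_mode = IOAPIC_LEVEL_TRIG;       /* EOI must reach the IOAPIC */
        else
                trigger_mode = IOAPIC_EDGE_TRIG;

as in kvm_ioapic_send_eoi() below. Keeping the TMR bit in sync at acceptance time removes the need for the IOAPIC to precompute it.)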
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 9 ++------- arch/x86/kvm/ioapic.h | 3 +-- arch/x86/kvm/lapic.c | 19 ++++++++++--------- arch/x86/kvm/lapic.h | 1 - arch/x86/kvm/x86.c | 5 +---- 5 files changed, 14 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 856f79105bb5..eaf4ec38d980 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -246,8 +246,7 @@ static void update_handled_vectors(struct kvm_ioapic *ioapic) smp_wmb(); } -void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, - u32 *tmr) +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; union kvm_ioapic_redirect_entry *e; @@ -260,13 +259,9 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) || index == RTC_GSI) { if (kvm_apic_match_dest(vcpu, NULL, 0, - e->fields.dest_id, e->fields.dest_mode)) { + e->fields.dest_id, e->fields.dest_mode)) __set_bit(e->fields.vector, (unsigned long *)eoi_exit_bitmap); - if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG) - __set_bit(e->fields.vector, - (unsigned long *)tmr); - } } } spin_unlock(&ioapic->lock); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index ca0b0b4e6256..3dbd0e2aac4e 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -120,7 +120,6 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, unsigned long *dest_map); int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); -void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, - u32 *tmr); +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8d9013c5e1ee..5693dd9fc163 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -551,15 +551,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } -void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - int i; - - for (i = 0; i < 8; i++) - apic_set_reg(apic, APIC_TMR + 0x10 * i, tmr[i]); -} - static void apic_update_ppr(struct kvm_lapic *apic) { u32 tpr, isrv, ppr, old_ppr; @@ -781,6 +772,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_LOWEST: vcpu->arch.apic_arb_prio++; case APIC_DM_FIXED: + if (unlikely(trig_mode && !level)) + break; + /* FIXME add logic for vcpu on reset */ if (unlikely(!apic_enabled(apic))) break; @@ -790,6 +784,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (dest_map) __set_bit(vcpu->vcpu_id, dest_map); + if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) { + if (trig_mode) + apic_set_vector(vector, apic->regs + APIC_TMR); + else + apic_clear_vector(vector, apic->regs + APIC_TMR); + } + if (kvm_x86_ops->deliver_posted_interrupt) kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); else { diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 764037991d26..eb46d6bcaa75 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -57,7 +57,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_apic_set_version(struct kvm_vcpu *vcpu); -void 
kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); void __kvm_apic_update_irr(u32 *pir, void *regs); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 92511d4b7236..c1ed74ebc502 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6144,17 +6144,14 @@ static void process_smi(struct kvm_vcpu *vcpu) static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { u64 eoi_exit_bitmap[4]; - u32 tmr[8]; if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; memset(eoi_exit_bitmap, 0, 32); - memset(tmr, 0, 32); - kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr); + kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap); kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); - kvm_apic_update_tmr(vcpu, tmr); } static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 3bb345f387dd26beb097cf776e342bc0d96d805a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 29 Jul 2015 10:43:18 +0200 Subject: KVM: x86: store IOAPIC-handled vectors in each VCPU We can reuse the algorithm that computes the EOI exit bitmap to figure out which vectors are handled by the IOAPIC. The only difference between the two is for edge-triggered interrupts other than IRQ8 that have no notifiers active; however, the IOAPIC does not have to do anything special for these interrupts anyway. This again limits the interactions between the IOAPIC and the LAPIC, making it easier to move the former to userspace. Inspired by a patch from Steve Rutherford. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/ioapic.c | 18 ++---------------- arch/x86/kvm/ioapic.h | 8 -------- arch/x86/kvm/lapic.c | 10 ++++++++-- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 3 ++- arch/x86/kvm/x86.c | 8 +++----- 7 files changed, 18 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2beee0382088..33609c2c743b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -396,6 +396,7 @@ struct kvm_vcpu_arch { u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ + u64 eoi_exit_bitmap[4]; unsigned long apic_attention; int32_t apic_arb_prio; int mp_state; @@ -822,7 +823,7 @@ struct kvm_x86_ops { int (*vm_has_apicv)(struct kvm *kvm); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm *kvm, int isr); - void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); + void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu); void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index eaf4ec38d980..2dcda0f188ba 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -233,19 +233,6 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr) } -static void update_handled_vectors(struct kvm_ioapic *ioapic) -{ - DECLARE_BITMAP(handled_vectors, 256); - int i; - - memset(handled_vectors, 0, sizeof(handled_vectors)); - for (i = 0; i < IOAPIC_NUM_PINS; ++i) - __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors); - memcpy(ioapic->handled_vectors, handled_vectors, - sizeof(handled_vectors)); - smp_wmb(); -} - void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 
*eoi_exit_bitmap) { struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; @@ -310,7 +297,6 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) e->bits |= (u32) val; e->fields.remote_irr = 0; } - update_handled_vectors(ioapic); mask_after = e->fields.mask; if (mask_before != mask_after) kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); @@ -594,7 +580,6 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic) ioapic->id = 0; memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS); rtc_irq_eoi_tracking_reset(ioapic); - update_handled_vectors(ioapic); } static const struct kvm_io_device_ops ioapic_mmio_ops = { @@ -623,8 +608,10 @@ int kvm_ioapic_init(struct kvm *kvm) if (ret < 0) { kvm->arch.vioapic = NULL; kfree(ioapic); + return ret; } + kvm_vcpu_request_scan_ioapic(kvm); return ret; } @@ -661,7 +648,6 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); ioapic->irr = 0; ioapic->irr_delivered = 0; - update_handled_vectors(ioapic); kvm_vcpu_request_scan_ioapic(kvm); kvm_ioapic_inject_all(ioapic, state->irr); spin_unlock(&ioapic->lock); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 3dbd0e2aac4e..bf36d66a1951 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -73,7 +73,6 @@ struct kvm_ioapic { struct kvm *kvm; void (*ack_notifier)(void *opaque, int irq); spinlock_t lock; - DECLARE_BITMAP(handled_vectors, 256); struct rtc_status rtc_status; struct delayed_work eoi_inject; u32 irq_eoi[IOAPIC_NUM_PINS]; @@ -98,13 +97,6 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) return kvm->arch.vioapic; } -static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector) -{ - struct kvm_ioapic *ioapic = kvm->arch.vioapic; - smp_rmb(); - return test_bit(vector, ioapic->handled_vectors); -} - void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5693dd9fc163..4c30fb0a48a1 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -869,14 +869,20 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; } +static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector) +{ + return test_bit(vector, (ulong *)apic->vcpu->arch.eoi_exit_bitmap); +} + static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) { - if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { + if (kvm_ioapic_handles_vector(apic, vector)) { int trigger_mode; if (apic_test_vector(vector, apic->regs + APIC_TMR)) trigger_mode = IOAPIC_LEVEL_TRIG; else trigger_mode = IOAPIC_EDGE_TRIG; + kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); } } @@ -1923,7 +1929,7 @@ static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu, /* Cache not set: could be safe but we don't bother. */ apic->highest_isr_cache == -1 || /* Need EOI to update ioapic. */ - kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) { + kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) { /* * PV EOI was disabled by apic_sync_pv_eoi_from_guest * so we need not do anything here. 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2f9ed1ff0632..09582081276e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3664,7 +3664,7 @@ static int svm_vm_has_apicv(struct kvm *kvm) return 0; } -static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu) { return; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 06ef4908ba61..9381b1d34313 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8043,8 +8043,9 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) } } -static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu) { + u64 *eoi_exit_bitmap = vcpu->arch.eoi_exit_bitmap; if (!vmx_vm_has_apicv(vcpu->kvm)) return; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c1ed74ebc502..c4adb8847b23 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6143,15 +6143,13 @@ static void process_smi(struct kvm_vcpu *vcpu) static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { - u64 eoi_exit_bitmap[4]; - if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; - memset(eoi_exit_bitmap, 0, 32); + memset(vcpu->arch.eoi_exit_bitmap, 0, 256 / 8); - kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap); - kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); + kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap); + kvm_x86_ops->load_eoi_exitmap(vcpu); } static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From d50ab6c1a2b24e12d3012d7beb343eba5b94a6ca Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 29 Jul 2015 11:49:59 +0200 Subject: KVM: x86: replace vm_has_apicv hook with cpu_uses_apicv This will avoid an unnecessary trip to ->kvm and from there to the VPIC. 
Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/irq.c | 2 +- arch/x86/kvm/lapic.c | 4 ++-- arch/x86/kvm/lapic.h | 4 ++-- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 8 +++++++- 6 files changed, 15 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 33609c2c743b..a0ef289d5a86 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -820,7 +820,7 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - int (*vm_has_apicv)(struct kvm *kvm); + int (*cpu_uses_apicv)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm *kvm, int isr); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index a1ec6a50a05a..c0dad893dc59 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -63,7 +63,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) if (kvm_cpu_has_extint(v)) return 1; - if (kvm_apic_vid_enabled(v->kvm)) + if (kvm_vcpu_apic_vid_enabled(v)) return 0; return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4c30fb0a48a1..c568d69c7060 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -390,7 +390,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) vcpu = apic->vcpu; - if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) { + if (unlikely(kvm_vcpu_apic_vid_enabled(vcpu))) { /* try to update RVI */ apic_clear_vector(vec, apic->regs + APIC_IRR); kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -1622,7 +1622,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } - apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm); + apic->irr_pending = kvm_vcpu_apic_vid_enabled(vcpu); apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 
1 : 0; apic->highest_isr_cache = -1; update_divide_count(apic); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index eb46d6bcaa75..7259d272416f 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -143,9 +143,9 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic) return apic->vcpu->arch.apic_base & X2APIC_ENABLE; } -static inline bool kvm_apic_vid_enabled(struct kvm *kvm) +static inline bool kvm_vcpu_apic_vid_enabled(struct kvm_vcpu *vcpu) { - return kvm_x86_ops->vm_has_apicv(kvm); + return kvm_x86_ops->cpu_uses_apicv(vcpu); } static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 09582081276e..9a37e487aca0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3659,7 +3659,7 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) return; } -static int svm_vm_has_apicv(struct kvm *kvm) +static int svm_cpu_uses_apicv(struct kvm_vcpu *vcpu) { return 0; } @@ -4425,7 +4425,7 @@ static struct kvm_x86_ops svm_x86_ops = { .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, - .vm_has_apicv = svm_vm_has_apicv, + .cpu_uses_apicv = svm_cpu_uses_apicv, .load_eoi_exitmap = svm_load_eoi_exitmap, .sync_pir_to_irr = svm_sync_pir_to_irr, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9381b1d34313..b8a90c4f676f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -810,6 +810,7 @@ static void kvm_cpu_vmxoff(void); static bool vmx_mpx_supported(void); static bool vmx_xsaves_supported(void); static int vmx_vm_has_apicv(struct kvm *kvm); +static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -4337,6 +4338,11 @@ static int vmx_vm_has_apicv(struct kvm *kvm) return enable_apicv && irqchip_in_kernel(kvm); } +static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu) +{ + return vmx_vm_has_apicv(vcpu->kvm); +} + static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -10362,7 +10368,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, .set_apic_access_page_addr = vmx_set_apic_access_page_addr, - .vm_has_apicv = vmx_vm_has_apicv, + .cpu_uses_apicv = vmx_cpu_uses_apicv, .load_eoi_exitmap = vmx_load_eoi_exitmap, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, -- cgit v1.2.3 From 35754c987f252e859bfa390a6816e85563afe79d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 29 Jul 2015 12:05:37 +0200 Subject: KVM: x86: introduce lapic_in_kernel Avoid pointer chasing and memory barriers, and simplify the code when split irqchip (LAPIC in kernel, IOAPIC/PIC in userspace) is introduced. 
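[Note: for illustration, a standalone C model of the two checks follows. This is not the kernel code itself; the struct layout is invented to mirror the patch, and the real irqchip_in_kernel() additionally issues an smp_rmb().]

        #include <stddef.h>

        struct kvm_lapic;
        struct kvm_pic;
        struct kvm_arch { struct kvm_pic *vpic; };
        struct kvm { struct kvm_arch arch; };
        struct kvm_vcpu_arch { struct kvm_lapic *apic; };
        struct kvm_vcpu { struct kvm *kvm; struct kvm_vcpu_arch arch; };

        /* Old check: two dependent loads, vcpu->kvm then kvm->arch.vpic. */
        static int irqchip_in_kernel_model(struct kvm *kvm)
        {
                return kvm->arch.vpic != NULL;
        }

        /* New check: a single load from the vcpu itself, no barrier. */
        static int lapic_in_kernel_model(struct kvm_vcpu *vcpu)
        {
                return vcpu->arch.apic != NULL;
        }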
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq.c | 6 +++--- arch/x86/kvm/irq.h | 8 ++++++++ arch/x86/kvm/lapic.c | 4 ++-- arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 46 ++++++++++++++++++++-------------------------- arch/x86/kvm/x86.c | 18 +++++++++--------- 7 files changed, 45 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index c0dad893dc59..b653ae202c8e 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -57,7 +57,7 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v) */ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) { - if (!irqchip_in_kernel(v->kvm)) + if (!lapic_in_kernel(v)) return v->arch.interrupt.pending; if (kvm_cpu_has_extint(v)) @@ -75,7 +75,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) */ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) { - if (!irqchip_in_kernel(v->kvm)) + if (!lapic_in_kernel(v)) return v->arch.interrupt.pending; if (kvm_cpu_has_extint(v)) @@ -103,7 +103,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) { int vector; - if (!irqchip_in_kernel(v->kvm)) + if (!lapic_in_kernel(v)) return v->arch.interrupt.nr; vector = kvm_cpu_get_extint(v); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 3d782a2c336a..9e6e7e04de98 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -92,6 +92,14 @@ static inline int irqchip_in_kernel(struct kvm *kvm) return vpic != NULL; } +static inline int lapic_in_kernel(struct kvm_vcpu *vcpu) +{ + /* Same as irqchip_in_kernel(vcpu->kvm), but with less + * pointer chasing and no unnecessary memory barriers. + */ + return vcpu->arch.apic != NULL; +} + void kvm_pic_reset(struct kvm_kpic_state *s); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c568d69c7060..c4bcc86d6dc4 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1985,7 +1985,7 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) struct kvm_lapic *apic = vcpu->arch.apic; u32 reg = (msr - APIC_BASE_MSR) << 4; - if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; if (reg == APIC_ICR2) @@ -2002,7 +2002,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) struct kvm_lapic *apic = vcpu->arch.apic; u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0; - if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; if (reg == APIC_DFR || reg == APIC_ICR2) { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff606f507913..c3f39aa9b9cb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3427,7 +3427,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) static bool can_do_async_pf(struct kvm_vcpu *vcpu) { - if (unlikely(!irqchip_in_kernel(vcpu->kvm) || + if (unlikely(!lapic_in_kernel(vcpu) || kvm_event_needs_reinjection(vcpu))) return false; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9a37e487aca0..34c089023827 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3060,7 +3060,7 @@ static int cr8_write_interception(struct vcpu_svm *svm) u8 cr8_prev = kvm_get_cr8(&svm->vcpu); /* instruction emulation calls kvm_set_cr8() */ r = cr_interception(svm); - if (irqchip_in_kernel(svm->vcpu.kvm)) + if (lapic_in_kernel(&svm->vcpu)) return r; if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) return r; @@ -3305,7 +3305,7 @@ static int 
interrupt_window_interception(struct vcpu_svm *svm) * If the user space waits to inject interrupts, exit as soon as * possible */ - if (!irqchip_in_kernel(svm->vcpu.kvm) && + if (!lapic_in_kernel(&svm->vcpu) && kvm_run->request_interrupt_window && !kvm_cpu_has_interrupt(&svm->vcpu)) { kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b8a90c4f676f..4729b7f6e5f2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -809,7 +809,6 @@ static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static bool vmx_mpx_supported(void); static bool vmx_xsaves_supported(void); -static int vmx_vm_has_apicv(struct kvm *kvm); static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, @@ -947,9 +946,9 @@ static inline bool cpu_has_vmx_tpr_shadow(void) return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; } -static inline bool vm_need_tpr_shadow(struct kvm *kvm) +static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu) { - return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); + return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu); } static inline bool cpu_has_secondary_exec_ctrls(void) @@ -1063,9 +1062,9 @@ static inline bool cpu_has_vmx_ple(void) SECONDARY_EXEC_PAUSE_LOOP_EXITING; } -static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm) +static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) { - return flexpriority_enabled && irqchip_in_kernel(kvm); + return flexpriority_enabled && lapic_in_kernel(vcpu); } static inline bool cpu_has_vmx_vpid(void) @@ -2378,7 +2377,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; - if (vmx_vm_has_apicv(vmx->vcpu.kvm)) + if (vmx_cpu_uses_apicv(&vmx->vcpu)) vmx->nested.nested_vmx_pinbased_ctls_high |= PIN_BASED_POSTED_INTR; @@ -4333,14 +4332,9 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr) msr, MSR_TYPE_W); } -static int vmx_vm_has_apicv(struct kvm *kvm) -{ - return enable_apicv && irqchip_in_kernel(kvm); -} - static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu) { - return vmx_vm_has_apicv(vcpu->kvm); + return enable_apicv && lapic_in_kernel(vcpu); } static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) @@ -4520,7 +4514,7 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) { u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; - if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) + if (!vmx_cpu_uses_apicv(&vmx->vcpu)) pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; return pin_based_exec_ctrl; } @@ -4532,7 +4526,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) exec_control &= ~CPU_BASED_MOV_DR_EXITING; - if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { + if (!cpu_need_tpr_shadow(&vmx->vcpu)) { exec_control &= ~CPU_BASED_TPR_SHADOW; #ifdef CONFIG_X86_64 exec_control |= CPU_BASED_CR8_STORE_EXITING | @@ -4549,7 +4543,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; - if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) + if (!cpu_need_virtualize_apic_accesses(&vmx->vcpu)) exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 
@@ -4563,7 +4557,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; if (!ple_gap) exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; - if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) + if (!vmx_cpu_uses_apicv(&vmx->vcpu)) exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; @@ -4624,7 +4618,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx_secondary_exec_control(vmx)); } - if (vmx_vm_has_apicv(vmx->vcpu.kvm)) { + if (vmx_cpu_uses_apicv(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); vmcs_write64(EOI_EXIT_BITMAP2, 0); @@ -4768,7 +4762,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (cpu_has_vmx_tpr_shadow() && !init_event) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); - if (vm_need_tpr_shadow(vcpu->kvm)) + if (cpu_need_tpr_shadow(vcpu)) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, __pa(vcpu->arch.apic->regs)); vmcs_write32(TPR_THRESHOLD, 0); @@ -4776,7 +4770,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - if (vmx_vm_has_apicv(vcpu->kvm)) + if (vmx_cpu_uses_apicv(vcpu)) memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); if (vmx->vpid != 0) @@ -5316,7 +5310,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) u8 cr8 = (u8)val; err = kvm_set_cr8(vcpu, cr8); kvm_complete_insn_gp(vcpu, err); - if (irqchip_in_kernel(vcpu->kvm)) + if (lapic_in_kernel(vcpu)) return 1; if (cr8_prev <= cr8) return 1; @@ -5535,7 +5529,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) * If the user space waits to inject interrupts, exit as soon as * possible */ - if (!irqchip_in_kernel(vcpu->kvm) && + if (!lapic_in_kernel(vcpu) && vcpu->run->request_interrupt_window && !kvm_cpu_has_interrupt(vcpu)) { vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; @@ -7944,10 +7938,10 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) * apicv */ if (!cpu_has_vmx_virtualize_x2apic_mode() || - !vmx_vm_has_apicv(vcpu->kvm)) + !vmx_cpu_uses_apicv(vcpu)) return; - if (!vm_need_tpr_shadow(vcpu->kvm)) + if (!cpu_need_tpr_shadow(vcpu)) return; sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); @@ -8052,7 +8046,7 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu) { u64 *eoi_exit_bitmap = vcpu->arch.eoi_exit_bitmap; - if (!vmx_vm_has_apicv(vcpu->kvm)) + if (!vmx_cpu_uses_apicv(vcpu)) return; vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); @@ -8551,7 +8545,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) put_cpu(); if (err) goto free_vmcs; - if (vm_need_virtualize_apic_accesses(kvm)) { + if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { err = alloc_apic_access_page(kvm); if (err) goto free_vmcs; @@ -9344,7 +9338,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->nested.apic_access_page)); } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && - (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) { + cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; kvm_vcpu_reload_apic_access_page(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c4adb8847b23..88fb3bf2ae6d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -788,7 +788,7 @@ int kvm_set_cr8(struct 
kvm_vcpu *vcpu, unsigned long cr8) { if (cr8 & CR8_RESERVED_BITS) return 1; - if (irqchip_in_kernel(vcpu->kvm)) + if (lapic_in_kernel(vcpu)) kvm_lapic_set_tpr(vcpu, cr8); else vcpu->arch.cr8 = cr8; @@ -798,7 +798,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr8); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) { - if (irqchip_in_kernel(vcpu->kvm)) + if (lapic_in_kernel(vcpu)) return kvm_lapic_get_cr8(vcpu); else return vcpu->arch.cr8; @@ -3175,7 +3175,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_vapic_addr va; r = -EINVAL; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!lapic_in_kernel(vcpu)) goto out; r = -EFAULT; if (copy_from_user(&va, argp, sizeof va)) @@ -5666,7 +5666,7 @@ void kvm_arch_exit(void) int kvm_vcpu_halt(struct kvm_vcpu *vcpu) { ++vcpu->stat.halt_exits; - if (irqchip_in_kernel(vcpu->kvm)) { + if (lapic_in_kernel(vcpu)) { vcpu->arch.mp_state = KVM_MP_STATE_HALTED; return 1; } else { @@ -6162,7 +6162,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) { struct page *page = NULL; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!lapic_in_kernel(vcpu)) return; if (!kvm_x86_ops->set_apic_access_page_addr) @@ -6200,7 +6200,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; - bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && + bool req_int_win = !lapic_in_kernel(vcpu) && vcpu->run->request_interrupt_window; bool req_immediate_exit = false; @@ -6597,7 +6597,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } /* re-sync apic's tpr */ - if (!irqchip_in_kernel(vcpu->kvm)) { + if (!lapic_in_kernel(vcpu)) { if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { r = -EINVAL; goto out; @@ -7297,7 +7297,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { - return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); + return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu); } struct static_key kvm_no_apic_vcpu __read_mostly; @@ -7391,7 +7391,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) kvm_mmu_destroy(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); - if (!irqchip_in_kernel(vcpu->kvm)) + if (!lapic_in_kernel(vcpu)) static_key_slow_dec(&kvm_no_apic_vcpu); } -- cgit v1.2.3 From 4ca7dd8ce4b24e18f94eed90e80c6eb80fb48c9a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 30 Jul 2015 10:32:16 +0200 Subject: KVM: x86: unify handling of interrupt window The interrupt window is currently checked twice, once in vmx.c/svm.c and once in dm_request_for_irq_injection. The only difference is the extra check for kvm_arch_interrupt_allowed in dm_request_for_irq_injection, and the different return value (0/KVM_EXIT_IRQ_WINDOW_OPEN for vmx.c/svm.c vs. -EINTR/KVM_EXIT_INTR for dm_request_for_irq_injection). However, dm_request_for_irq_injection is basically dead code! Revive it by removing the checks in vmx.c and svm.c's vmexit handlers, and fixing the returned values for the dm_request_for_irq_injection case.
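[Note: a minimal userspace sketch of the run loop this affects, assuming vcpu_fd is a vCPU file descriptor and run points at its mmap'ed struct kvm_run; error handling is elided. After this patch, a VMM that sets request_interrupt_window sees a plain 0/KVM_EXIT_IRQ_WINDOW_OPEN return instead of -EINTR/KVM_EXIT_INTR.]

        #include <linux/kvm.h>
        #include <sys/ioctl.h>

        static int wait_for_irq_window(int vcpu_fd, struct kvm_run *run)
        {
                run->request_interrupt_window = 1;

                if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
                        return -1;      /* real errors still surface as -1/errno */

                /* Nonzero means the guest can now accept an injection. */
                return run->exit_reason == KVM_EXIT_IRQ_WINDOW_OPEN;
        }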
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 13 ------------- arch/x86/kvm/vmx.c | 11 ----------- arch/x86/kvm/x86.c | 4 ++-- 3 files changed, 2 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 34c089023827..54a86183e5d3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3294,24 +3294,11 @@ static int msr_interception(struct vcpu_svm *svm) static int interrupt_window_interception(struct vcpu_svm *svm) { - struct kvm_run *kvm_run = svm->vcpu.run; - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; mark_dirty(svm->vmcb, VMCB_INTR); ++svm->vcpu.stat.irq_window_exits; - /* - * If the user space waits to inject interrupts, exit as soon as - * possible - */ - if (!lapic_in_kernel(&svm->vcpu) && - kvm_run->request_interrupt_window && - !kvm_cpu_has_interrupt(&svm->vcpu)) { - kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - return 0; - } - return 1; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4729b7f6e5f2..6ee0dc69675b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5524,17 +5524,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); ++vcpu->stat.irq_window_exits; - - /* - * If the user space waits to inject interrupts, exit as soon as - * possible - */ - if (!lapic_in_kernel(vcpu) && - vcpu->run->request_interrupt_window && - !kvm_cpu_has_interrupt(vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - return 0; - } return 1; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 88fb3bf2ae6d..28c98c8f9d1c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6469,8 +6469,8 @@ static int vcpu_run(struct kvm_vcpu *vcpu) kvm_inject_pending_timer_irqs(vcpu); if (dm_request_for_irq_injection(vcpu)) { - r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; + r = 0; + vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; ++vcpu->stat.request_irq_exits; break; } -- cgit v1.2.3 From 49df6397edfc5a8ba8ca813b51fb9729d8e94b40 Mon Sep 17 00:00:00 2001 From: Steve Rutherford Date: Wed, 29 Jul 2015 23:21:40 -0700 Subject: KVM: x86: Split the APIC from the rest of IRQCHIP. First patch in a series which enables the relocation of the PIC/IOAPIC to userspace. Adds capability KVM_CAP_SPLIT_IRQCHIP; KVM_CAP_SPLIT_IRQCHIP enables the construction of LAPICs without the rest of the irqchip. Compile tested for x86. Signed-off-by: Steve Rutherford Suggested-by: Andrew Honig Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 17 +++++++++++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/i8254.c | 4 +++- arch/x86/kvm/ioapic.h | 8 ++++++++ arch/x86/kvm/irq.h | 11 ++++++++++- arch/x86/kvm/irq_comm.c | 9 ++++++++- arch/x86/kvm/lapic.c | 6 ++++-- arch/x86/kvm/x86.c | 23 +++++++++++++++++++++-- include/linux/kvm_host.h | 1 + include/uapi/linux/kvm.h | 1 + 10 files changed, 75 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index d9ecceea5a02..43e0816d0de1 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3627,6 +3627,23 @@ struct { KVM handlers should exit to userspace with rc = -EREMOTE. +7.5 KVM_CAP_SPLIT_IRQCHIP + +Architectures: x86 +Parameters: None +Returns: 0 on success, -1 on error + +Create a local apic for each processor in the kernel. 
This can be used +instead of KVM_CREATE_IRQCHIP if the userspace VMM wishes to emulate the +IOAPIC and PIC (and also the PIT, even though this has to be enabled +separately). + +This supersedes KVM_CREATE_IRQCHIP, creating only local APICs, but no in kernel +IOAPIC or PIC. This also enables in kernel routing of interrupt requests. + +Fails if VCPU has already been created, or if the irqchip is already in the +kernel (i.e. KVM_CREATE_IRQCHIP has already been called). + 8. Other capabilities. ---------------------- diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a0ef289d5a86..befcf555bddc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -684,6 +684,8 @@ struct kvm_arch { u32 bsp_vcpu_id; u64 disabled_quirks; + + bool irqchip_split; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index f90952f64e79..08116ff227cc 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -35,6 +35,7 @@ #include #include +#include "ioapic.h" #include "irq.h" #include "i8254.h" #include "x86.h" @@ -333,7 +334,8 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; s64 interval; - if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) + if (!ioapic_in_kernel(kvm) || + ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) return; interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index bf36d66a1951..a8842c0dee73 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -97,6 +97,14 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) return kvm->arch.vioapic; } +static inline int ioapic_in_kernel(struct kvm *kvm) +{ + int ret; + + ret = (ioapic_irqchip(kvm) != NULL); + return ret; +} + void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 9e6e7e04de98..2f9703dcd913 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -83,13 +83,22 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) return kvm->arch.vpic; } +static inline int irqchip_split(struct kvm *kvm) +{ + return kvm->arch.irqchip_split; +} + static inline int irqchip_in_kernel(struct kvm *kvm) { struct kvm_pic *vpic = pic_irqchip(kvm); + bool ret; + + ret = (vpic != NULL); + ret |= irqchip_split(kvm); /* Read vpic before kvm->irq_routing. 
*/ smp_rmb(); - return vpic != NULL; + return ret; } static inline int lapic_in_kernel(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 9efff9e5b58c..67f6b62a6814 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -208,7 +208,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) goto unlock; } clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); - if (!irqchip_in_kernel(kvm)) + if (!ioapic_in_kernel(kvm)) goto unlock; kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); @@ -328,3 +328,10 @@ int kvm_setup_default_irq_routing(struct kvm *kvm) return kvm_set_irq_routing(kvm, default_routing, ARRAY_SIZE(default_routing), 0); } + +static const struct kvm_irq_routing_entry empty_routing[] = {}; + +int kvm_setup_empty_irq_routing(struct kvm *kvm) +{ + return kvm_set_irq_routing(kvm, empty_routing, 0, 0); +} diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c4bcc86d6dc4..e05946c36b87 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -209,7 +209,8 @@ out: if (old) kfree_rcu(old, rcu); - kvm_vcpu_request_scan_ioapic(kvm); + if (ioapic_in_kernel(kvm)) + kvm_vcpu_request_scan_ioapic(kvm); } static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) @@ -1845,7 +1846,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_rtc_eoi_tracking_restore_one(vcpu); + if (ioapic_in_kernel(vcpu->kvm)) + kvm_rtc_eoi_tracking_restore_one(vcpu); } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 28c98c8f9d1c..f720774a4797 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2448,6 +2448,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_DISABLE_QUIRKS: case KVM_CAP_SET_BOOT_CPU_ID: + case KVM_CAP_SPLIT_IRQCHIP: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: @@ -3555,6 +3556,24 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, kvm->arch.disabled_quirks = cap->args[0]; r = 0; break; + case KVM_CAP_SPLIT_IRQCHIP: { + mutex_lock(&kvm->lock); + r = -EEXIST; + if (irqchip_in_kernel(kvm)) + goto split_irqchip_unlock; + if (atomic_read(&kvm->online_vcpus)) + goto split_irqchip_unlock; + r = kvm_setup_empty_irq_routing(kvm); + if (r) + goto split_irqchip_unlock; + /* Pairs with irqchip_in_kernel. 
*/ + smp_wmb(); + kvm->arch.irqchip_split = true; + r = 0; +split_irqchip_unlock: + mutex_unlock(&kvm->lock); + break; + } default: r = -EINVAL; break; @@ -3668,7 +3687,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm)) + if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) goto get_irqchip_out; r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) @@ -3692,7 +3711,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm)) + if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); if (r) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1bef9e21e725..354f147647ab 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1002,6 +1002,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) #endif int kvm_setup_default_irq_routing(struct kvm *kvm); +int kvm_setup_empty_irq_routing(struct kvm *kvm); int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *entries, unsigned nr, diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index a9256f0331ae..ed00f8fc9ea2 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -824,6 +824,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_MULTI_ADDRESS_SPACE 118 #define KVM_CAP_GUEST_DEBUG_HW_BPS 119 #define KVM_CAP_GUEST_DEBUG_HW_WPS 120 +#define KVM_CAP_SPLIT_IRQCHIP 121 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 7543a635aa09eb138b2cbf60ac3ff19503ae6954 Mon Sep 17 00:00:00 2001 From: Steve Rutherford Date: Wed, 29 Jul 2015 23:21:41 -0700 Subject: KVM: x86: Add KVM exit for IOAPIC EOIs Adds KVM_EXIT_IOAPIC_EOI which allows the kernel to EOI level-triggered IOAPIC interrupts. Uses a per VCPU exit bitmap to decide whether or not the IOAPIC needs to be informed (which is identical to the EOI_EXIT_BITMAP field used by modern x86 processors, but can also be used to elide kvm IOAPIC EOI exits on older processors). [Note: A prototype using ResampleFDs found that decoupling the EOI from the VCPU's thread made it possible for the VCPU to not see a recent EOI after reentering the guest. This does not match real hardware.] Compile tested for Intel x86. Signed-off-by: Steve Rutherford Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 12 ++++++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/lapic.c | 24 +++++++++++++++++------- arch/x86/kvm/x86.c | 11 +++++++++++ include/linux/kvm_host.h | 2 +- include/uapi/linux/kvm.h | 5 +++++ 6 files changed, 48 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 43e0816d0de1..0d14bf5db534 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3309,6 +3309,18 @@ Valid values for 'type' are: to ignore the request, or to gather VM memory core dump and/or reset/shutdown of the VM. + /* KVM_EXIT_IOAPIC_EOI */ + struct { + __u8 vector; + } eoi; + +Indicates that the VCPU's in-kernel local APIC received an EOI for a +level-triggered IOAPIC interrupt. This exit only triggers when the +IOAPIC is implemented in userspace (i.e. KVM_CAP_SPLIT_IRQCHIP is enabled); +the userspace IOAPIC should process the EOI and retrigger the interrupt if +it is still asserted. Vector is the LAPIC interrupt vector for which the +EOI was received. + /* Fix the size of the union. 
*/ char padding[256]; }; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index befcf555bddc..af09fa1d1be7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -574,6 +574,8 @@ struct kvm_vcpu_arch { struct { bool pv_unhalted; } pv; + + int pending_ioapic_eoi; }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e05946c36b87..ef70f6f3a37a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -877,15 +877,25 @@ static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector) static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) { - if (kvm_ioapic_handles_vector(apic, vector)) { - int trigger_mode; - if (apic_test_vector(vector, apic->regs + APIC_TMR)) - trigger_mode = IOAPIC_LEVEL_TRIG; - else - trigger_mode = IOAPIC_EDGE_TRIG; + int trigger_mode; + + /* Eoi the ioapic only if the ioapic doesn't own the vector. */ + if (!kvm_ioapic_handles_vector(apic, vector)) + return; - kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); + /* Request a KVM exit to inform the userspace IOAPIC. */ + if (irqchip_split(apic->vcpu->kvm)) { + apic->vcpu->arch.pending_ioapic_eoi = vector; + kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu); + return; } + + if (apic_test_vector(vector, apic->regs + APIC_TMR)) + trigger_mode = IOAPIC_LEVEL_TRIG; + else + trigger_mode = IOAPIC_EDGE_TRIG; + + kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); } static int apic_set_eoi(struct kvm_lapic *apic) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f720774a4797..27429daa054e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6271,6 +6271,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_pmu_handle_event(vcpu); if (kvm_check_request(KVM_REQ_PMI, vcpu)) kvm_pmu_deliver_pmi(vcpu); + if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) { + BUG_ON(vcpu->arch.pending_ioapic_eoi > 255); + if (test_bit(vcpu->arch.pending_ioapic_eoi, + (void *) vcpu->arch.eoi_exit_bitmap)) { + vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI; + vcpu->run->eoi.vector = + vcpu->arch.pending_ioapic_eoi; + r = 0; + goto out; + } + } if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) vcpu_scan_ioapic(vcpu); if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 354f147647ab..3b33215ed447 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -140,6 +140,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_APIC_PAGE_RELOAD 25 #define KVM_REQ_SMI 26 #define KVM_REQ_HV_CRASH 27 +#define KVM_REQ_IOAPIC_EOI_EXIT 28 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 @@ -1146,4 +1147,3 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) } #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ #endif - diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ed00f8fc9ea2..12e3afbf0f47 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -183,6 +183,7 @@ struct kvm_s390_skeys { #define KVM_EXIT_EPR 23 #define KVM_EXIT_SYSTEM_EVENT 24 #define KVM_EXIT_S390_STSI 25 +#define KVM_EXIT_IOAPIC_EOI 26 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -333,6 +334,10 @@ struct kvm_run { __u8 sel1; __u16 sel2; } s390_stsi; + /* KVM_EXIT_IOAPIC_EOI */ + struct { + __u8 vector; + } eoi; /* Fix the size of the union. 
*/ char padding[256]; }; -- cgit v1.2.3 From b053b2aef25d00773fa6762dcd4b7f5c9c42d171 Mon Sep 17 00:00:00 2001 From: Steve Rutherford Date: Wed, 29 Jul 2015 23:32:35 -0700 Subject: KVM: x86: Add EOI exit bitmap inference In order to support a userspace IOAPIC interacting with an in kernel APIC, the EOI exit bitmaps need to be configurable. If the IOAPIC is in userspace (i.e. the irqchip has been split), the EOI exit bitmaps will be set whenever the GSI Routes are configured. In particular, the low MSI routes are reservable for userspace IOAPICs. For these MSI routes, the EOI Exit bit corresponding to the destination vector of the route will be set for the destination VCPU. The intention is for the userspace IOAPICs to use the reservable MSI routes to inject interrupts into the guest. This is a slight abuse of the notion of an MSI Route, given that MSIs classically bypass the IOAPIC. It might be worthwhile to add an additional route type to improve clarity. Compile tested for Intel x86. Signed-off-by: Steve Rutherford Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 9 ++++++--- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/ioapic.h | 2 ++ arch/x86/kvm/irq_comm.c | 42 +++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/lapic.c | 3 +-- arch/x86/kvm/x86.c | 9 ++++++++- include/linux/kvm_host.h | 16 +++++++++++++++ virt/kvm/irqchip.c | 12 ++--------- 8 files changed, 78 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 0d14bf5db534..89e71648d748 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3642,7 +3642,7 @@ KVM handlers should exit to userspace with rc = -EREMOTE. 7.5 KVM_CAP_SPLIT_IRQCHIP Architectures: x86 -Parameters: None +Parameters: args[0] - number of routes reserved for userspace IOAPICs Returns: 0 on success, -1 on error Create a local apic for each processor in the kernel. This can be used @@ -3650,8 +3650,11 @@ instead of KVM_CREATE_IRQCHIP if the userspace VMM wishes to emulate the IOAPIC and PIC (and also the PIT, even though this has to be enabled separately). -This supersedes KVM_CREATE_IRQCHIP, creating only local APICs, but no in kernel -IOAPIC or PIC. This also enables in kernel routing of interrupt requests. +This capability also enables in kernel routing of interrupt requests; +when KVM_CAP_SPLIT_IRQCHIP is enabled, only routes of KVM_IRQ_ROUTING_MSI type are +used in the IRQ routing table. The first args[0] MSI routes are reserved +for the IOAPIC pins. Whenever the LAPIC receives an EOI for these routes, +a KVM_EXIT_IOAPIC_EOI vmexit will be reported to userspace. Fails if VCPU has already been created, or if the irqchip is already in the kernel (i.e. KVM_CREATE_IRQCHIP has already been called).
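[Note: a usage sketch of the section 7.5 text above, assuming vm_fd is a freshly created VM with no vCPUs yet; the pin count of 24 is an arbitrary example value.]

        #include <linux/kvm.h>
        #include <sys/ioctl.h>

        static int enable_split_irqchip(int vm_fd)
        {
                struct kvm_enable_cap cap = {
                        .cap = KVM_CAP_SPLIT_IRQCHIP,
                        .args = { 24 }, /* MSI routes reserved for IOAPIC pins */
                };

                /* Fails once a vCPU or an in-kernel irqchip already exists. */
                return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
        }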
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index af09fa1d1be7..7a5f9debbcd8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -688,6 +688,7 @@ struct kvm_arch { u64 disabled_quirks; bool irqchip_split; + u8 nr_reserved_ioapic_pins; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index a8842c0dee73..084617d37c74 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -9,6 +9,7 @@ struct kvm; struct kvm_vcpu; #define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS +#define MAX_NR_RESERVED_IOAPIC_PINS KVM_MAX_IRQ_ROUTES #define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ #define IOAPIC_EDGE_TRIG 0 #define IOAPIC_LEVEL_TRIG 1 @@ -121,5 +122,6 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); #endif diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 67f6b62a6814..177460998bb0 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -335,3 +335,45 @@ int kvm_setup_empty_irq_routing(struct kvm *kvm) { return kvm_set_irq_routing(kvm, empty_routing, 0, 0); } + +void kvm_arch_irq_routing_update(struct kvm *kvm) +{ + if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm)) + return; + kvm_make_scan_ioapic_request(kvm); +} + +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_kernel_irq_routing_entry *entry; + struct kvm_irq_routing_table *table; + u32 i, nr_ioapic_pins; + int idx; + + /* kvm->irq_routing must be read after clearing + * KVM_SCAN_IOAPIC. */ + smp_mb(); + idx = srcu_read_lock(&kvm->irq_srcu); + table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + nr_ioapic_pins = min_t(u32, table->nr_rt_entries, + kvm->arch.nr_reserved_ioapic_pins); + for (i = 0; i < nr_ioapic_pins; ++i) { + hlist_for_each_entry(entry, &table->map[i], link) { + u32 dest_id, dest_mode; + + if (entry->type != KVM_IRQ_ROUTING_MSI) + continue; + dest_id = (entry->msi.address_lo >> 12) & 0xff; + dest_mode = (entry->msi.address_lo >> 2) & 0x1; + if (kvm_apic_match_dest(vcpu, NULL, 0, dest_id, + dest_mode)) { + u32 vector = entry->msi.data & 0xff; + + __set_bit(vector, + (unsigned long *) eoi_exit_bitmap); + } + } + } + srcu_read_unlock(&kvm->irq_srcu, idx); +} diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ef70f6f3a37a..2f4c0d0cbe0a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -209,8 +209,7 @@ out: if (old) kfree_rcu(old, rcu); - if (ioapic_in_kernel(kvm)) - kvm_vcpu_request_scan_ioapic(kvm); + kvm_make_scan_ioapic_request(kvm); } static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 27429daa054e..4aeed2086c5e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3558,6 +3558,9 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, break; case KVM_CAP_SPLIT_IRQCHIP: { mutex_lock(&kvm->lock); + r = -EINVAL; + if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS) + goto split_irqchip_unlock; r = -EEXIST; if (irqchip_in_kernel(kvm)) goto split_irqchip_unlock; @@ -3569,6 +3572,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, /* Pairs with irqchip_in_kernel. 
*/ smp_wmb(); kvm->arch.irqchip_split = true; kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; r = 0; split_irqchip_unlock: mutex_unlock(&kvm->lock); break; @@ -6167,7 +6171,10 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) memset(vcpu->arch.eoi_exit_bitmap, 0, 256 / 8); - kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap); + if (irqchip_split(vcpu->kvm)) + kvm_scan_ioapic_routes(vcpu, vcpu->arch.eoi_exit_bitmap); + else + kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap); kvm_x86_ops->load_eoi_exitmap(vcpu); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3b33215ed447..d754f2d826e9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -330,6 +330,18 @@ struct kvm_kernel_irq_routing_entry { struct hlist_node link; }; +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING +struct kvm_irq_routing_table { + int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS]; + u32 nr_rt_entries; + /* + * Array indexed by gsi. Each entry contains list of irq chips + * the gsi is connected to. + */ + struct hlist_head map[0]; +}; +#endif + #ifndef KVM_PRIVATE_MEM_SLOTS #define KVM_PRIVATE_MEM_SLOTS 0 #endif @@ -456,10 +468,14 @@ void vcpu_put(struct kvm_vcpu *vcpu); #ifdef __KVM_HAVE_IOAPIC void kvm_vcpu_request_scan_ioapic(struct kvm *kvm); +void kvm_arch_irq_routing_update(struct kvm *kvm); #else static inline void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) { } +static inline void kvm_arch_irq_routing_update(struct kvm *kvm) +{ +} #endif #ifdef CONFIG_HAVE_KVM_IRQFD diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index d7ea8e20dae4..716a1c4db528 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -31,16 +31,6 @@ #include #include "irq.h" -struct kvm_irq_routing_table { - int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS]; - u32 nr_rt_entries; - /* - * Array indexed by gsi. Each entry contains list of irq chips - * the gsi is connected to. - */ - struct hlist_head map[0]; -}; - int kvm_irq_map_gsi(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *entries, int gsi) { @@ -231,6 +221,8 @@ int kvm_set_irq_routing(struct kvm *kvm, kvm_irq_routing_update(kvm); mutex_unlock(&kvm->irq_lock); + kvm_arch_irq_routing_update(kvm); + synchronize_srcu_expedited(&kvm->irq_srcu); new = old; -- cgit v1.2.3 From 1c1a9ce973a7863dd46767226bce2a5f12d48bc6 Mon Sep 17 00:00:00 2001 From: Steve Rutherford Date: Thu, 30 Jul 2015 11:27:16 +0200 Subject: KVM: x86: Add support for local interrupt requests from userspace In order to enable userspace PIC support, the userspace PIC needs to be able to inject local interrupts even when the APICs are in the kernel. KVM_INTERRUPT now supports sending local interrupts to an APIC when APICs are in the kernel. The ready_for_interrupt_injection flag is now only set when the CPU/APIC will immediately accept and inject an interrupt (i.e. APIC has not masked the PIC). When the PIC wishes to initiate an INTA cycle with, say, CPU0, it kicks CPU0 out of the guest, and rendezvouses with CPU0 once it arrives in userspace. When the CPU/APIC unmasks the PIC, a KVM_EXIT_IRQ_WINDOW_OPEN is triggered, so that userspace has a chance to inject a PIC interrupt if it had been pending. Overall, this design can lead to a small number of spurious userspace rendezvous. In particular, whenever the PIC transitions from low to high while it is masked and whenever the PIC becomes unmasked while it is low.
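[Note: a hypothetical sketch of the userspace-PIC side of the rendezvous described above, assuming vcpu_fd and the mmap'ed struct kvm_run already exist and that vector is whatever the emulated PIC would put on the bus for its INTA cycle.]

        #include <linux/kvm.h>
        #include <sys/ioctl.h>

        static int pic_try_inject(int vcpu_fd, struct kvm_run *run, __u32 vector)
        {
                struct kvm_interrupt irq = { .irq = vector };

                /* Only valid while the APIC is not masking the PIC; otherwise
                 * wait for the next KVM_EXIT_IRQ_WINDOW_OPEN rendezvous. */
                if (!run->ready_for_interrupt_injection)
                        return -1;

                /* Returns -1/EEXIST if a local interrupt is already buffered. */
                return ioctl(vcpu_fd, KVM_INTERRUPT, &irq);
        }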
Note: this does not buffer more than one local interrupt in the kernel, so the VMM needs to enter the guest in order to complete interrupt injection before injecting an additional interrupt. Compiles for x86. Can pass the KVM Unit Tests. Signed-off-by: Steve Rutherford Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 14 +++++++++---- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/irq.c | 32 +++++++++++++++++++++++------ arch/x86/kvm/irq.h | 8 ++++++++ arch/x86/kvm/x86.c | 42 ++++++++++++++++++++++++++++++--------- 5 files changed, 78 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 89e71648d748..e3e9c41721a2 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -401,10 +401,9 @@ Capability: basic Architectures: x86, ppc, mips Type: vcpu ioctl Parameters: struct kvm_interrupt (in) -Returns: 0 on success, -1 on error +Returns: 0 on success, negative on failure. -Queues a hardware interrupt vector to be injected. This is only -useful if in-kernel local APIC or equivalent is not used. +Queues a hardware interrupt vector to be injected. /* for KVM_INTERRUPT */ struct kvm_interrupt { @@ -414,7 +413,14 @@ struct kvm_interrupt { X86: -Note 'irq' is an interrupt vector, not an interrupt pin or line. +Returns: 0 on success, + -EEXIST if an interrupt is already enqueued + -EINVAL if the irq number is invalid + -ENXIO if the PIC is in the kernel + -EFAULT if the pointer is invalid +Note 'irq' is an interrupt vector, not an interrupt pin or line. This +ioctl is useful if the in-kernel PIC is not used. PPC: diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7a5f9debbcd8..76a5b30979b3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -576,6 +576,7 @@ struct kvm_vcpu_arch { } pv; int pending_ioapic_eoi; + int pending_external_vector; }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index b653ae202c8e..097060e33bd6 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -37,15 +37,28 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL(kvm_cpu_has_pending_timer); +/* + * check if there is a pending userspace external interrupt + */ +static int pending_userspace_extint(struct kvm_vcpu *v) +{ + return v->arch.pending_external_vector != -1; +} + /* * check if there is pending interrupt from * non-APIC source without intack.
*/ static int kvm_cpu_has_extint(struct kvm_vcpu *v) { - if (kvm_apic_accept_pic_intr(v)) - return pic_irqchip(v->kvm)->output; /* PIC */ - else + u8 accept = kvm_apic_accept_pic_intr(v); + + if (accept) { + if (irqchip_split(v->kvm)) + return pending_userspace_extint(v); + else + return pic_irqchip(v->kvm)->output; + } else return 0; } @@ -91,9 +104,16 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); */ static int kvm_cpu_get_extint(struct kvm_vcpu *v) { - if (kvm_cpu_has_extint(v)) - return kvm_pic_read_irq(v->kvm); /* PIC */ - return -1; + if (kvm_cpu_has_extint(v)) { + if (irqchip_split(v->kvm)) { + int vector = v->arch.pending_external_vector; + + v->arch.pending_external_vector = -1; + return vector; + } else + return kvm_pic_read_irq(v->kvm); /* PIC */ + } else + return -1; } /* diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 2f9703dcd913..ae5c78f2337d 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -83,6 +83,14 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) return kvm->arch.vpic; } +static inline int pic_in_kernel(struct kvm *kvm) +{ + int ret; + + ret = (pic_irqchip(kvm) != NULL); + return ret; +} + static inline int irqchip_split(struct kvm *kvm) { return kvm->arch.irqchip_split; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4aeed2086c5e..8b97c54edebc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2662,12 +2662,24 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, { if (irq->irq >= KVM_NR_INTERRUPTS) return -EINVAL; - if (irqchip_in_kernel(vcpu->kvm)) + + if (!irqchip_in_kernel(vcpu->kvm)) { + kvm_queue_interrupt(vcpu, irq->irq, false); + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; + } + + /* + * With in-kernel LAPIC, we only use this to inject EXTINT, so + * fail for in-kernel 8259. + */ + if (pic_in_kernel(vcpu->kvm)) return -ENXIO; - kvm_queue_interrupt(vcpu, irq->irq, false); - kvm_make_request(KVM_REQ_EVENT, vcpu); + if (vcpu->arch.pending_external_vector != -1) + return -EEXIST; + vcpu->arch.pending_external_vector = irq->irq; return 0; } @@ -5796,9 +5808,15 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) */ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) { - return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && - vcpu->run->request_interrupt_window && - kvm_arch_interrupt_allowed(vcpu)); + if (!vcpu->run->request_interrupt_window || pic_in_kernel(vcpu->kvm)) + return false; + + if (kvm_cpu_has_interrupt(vcpu)) + return false; + + return (irqchip_split(vcpu->kvm) + ? kvm_apic_accept_pic_intr(vcpu) + : kvm_arch_interrupt_allowed(vcpu)); } static void post_kvm_run_save(struct kvm_vcpu *vcpu) @@ -5809,13 +5827,17 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) kvm_run->flags = is_smm(vcpu) ? 
KVM_RUN_X86_SMM : 0; kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); - if (irqchip_in_kernel(vcpu->kvm)) - kvm_run->ready_for_interrupt_injection = 1; - else + if (!irqchip_in_kernel(vcpu->kvm)) kvm_run->ready_for_interrupt_injection = kvm_arch_interrupt_allowed(vcpu) && !kvm_cpu_has_interrupt(vcpu) && !kvm_event_needs_reinjection(vcpu); + else if (!pic_in_kernel(vcpu->kvm)) + kvm_run->ready_for_interrupt_injection = + kvm_apic_accept_pic_intr(vcpu) && + !kvm_cpu_has_interrupt(vcpu); + else + kvm_run->ready_for_interrupt_injection = 1; } static void update_cr8_intercept(struct kvm_vcpu *vcpu) @@ -7403,6 +7425,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); + vcpu->arch.pending_external_vector = -1; + return 0; fail_free_mce_banks: -- cgit v1.2.3 From 931c33b178b091cced2a6b3f57f04655f8ff5207 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 15 Sep 2015 14:41:58 +0800 Subject: kvm: add tracepoint for fast mmio Cc: Gleb Natapov Cc: Paolo Bonzini Signed-off-by: Jason Wang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/trace.h | 18 ++++++++++++++++++ arch/x86/kvm/vmx.c | 1 + arch/x86/kvm/x86.c | 1 + 3 files changed, 20 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 4eae7c35ddf5..ce4abe333c39 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -128,6 +128,24 @@ TRACE_EVENT(kvm_pio, __entry->count > 1 ? "(...)" : "") ); +/* + * Tracepoint for fast mmio. + */ +TRACE_EVENT(kvm_fast_mmio, + TP_PROTO(u64 gpa), + TP_ARGS(gpa), + + TP_STRUCT__entry( + __field(u64, gpa) + ), + + TP_fast_assign( + __entry->gpa = gpa; + ), + + TP_printk("fast mmio at gpa 0x%llx", __entry->gpa) +); + /* * Tracepoint for cpuid. */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6ee0dc69675b..324b09ff1def 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5756,6 +5756,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { skip_emulated_instruction(vcpu); + trace_kvm_fast_mmio(gpa); return 1; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8b97c54edebc..b90ad01c617d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8070,6 +8070,7 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); -- cgit v1.2.3 From e516cebb4f2b2057a5a421fea589079502acfff6 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Wed, 16 Sep 2015 12:29:48 +0300 Subject: kvm/x86: Hyper-V HV_X64_MSR_RESET msr HV_X64_MSR_RESET msr is used by Hyper-V based Windows guest to reset guest VM by hypervisor. Necessary to support loading of winhv.sys in guest, which in turn is required to support Windows VMBus. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan Signed-off-by: Denis V. 
Lunev CC: Paolo Bonzini CC: Gleb Natapov Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/hyperv.h | 3 +++ arch/x86/kvm/hyperv.c | 10 ++++++++++ arch/x86/kvm/x86.c | 7 +++++++ include/linux/kvm_host.h | 1 + 4 files changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index f0412c50c47b..dab584bf7ddf 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -153,6 +153,9 @@ /* MSR used to provide vcpu index */ #define HV_X64_MSR_VP_INDEX 0x40000002 +/* MSR used to reset the guest OS. */ +#define HV_X64_MSR_RESET 0x40000003 + /* MSR used to read the per-partition time reference counter */ #define HV_X64_MSR_TIME_REF_COUNT 0x40000020 diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index a8160d2ae362..0ad11a232474 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -41,6 +41,7 @@ static bool kvm_hv_msr_partition_wide(u32 msr) case HV_X64_MSR_TIME_REF_COUNT: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + case HV_X64_MSR_RESET: r = true; break; } @@ -163,6 +164,12 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, data); case HV_X64_MSR_CRASH_CTL: return kvm_hv_msr_set_crash_ctl(vcpu, data, host); + case HV_X64_MSR_RESET: + if (data == 1) { + vcpu_debug(vcpu, "hyper-v reset requested\n"); + kvm_make_request(KVM_REQ_HV_RESET, vcpu); + } + break; default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -241,6 +248,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) pdata); case HV_X64_MSR_CRASH_CTL: return kvm_hv_msr_get_crash_ctl(vcpu, pdata); + case HV_X64_MSR_RESET: + data = 0; + break; default: vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b90ad01c617d..297b36fa3b80 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -952,6 +952,7 @@ static u32 emulated_msrs[] = { HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, + HV_X64_MSR_RESET, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, @@ -6321,6 +6322,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 0; goto out; } + if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET; + r = 0; + goto out; + } } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d754f2d826e9..cd0ba2e931e1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -141,6 +141,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_SMI 26 #define KVM_REQ_HV_CRASH 27 #define KVM_REQ_IOAPIC_EOI_EXIT 28 +#define KVM_REQ_HV_RESET 29 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 -- cgit v1.2.3 From 11c4b1ca719eaaa5ca6fe0e80bb009f3f2012fd2 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Wed, 16 Sep 2015 12:29:49 +0300 Subject: kvm/x86: Hyper-V HV_X64_MSR_VP_INDEX export for QEMU. Insert Hyper-V HV_X64_MSR_VP_INDEX into msr's emulated list, so QEMU can set Hyper-V features cpuid HV_X64_MSR_VP_INDEX_AVAILABLE bit correctly. KVM emulation part is in place already. 
Necessary to support loading of winhv.sys in guest, which in turn is required to support Windows VMBus. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan Signed-off-by: Denis V. Lunev CC: Paolo Bonzini CC: Gleb Natapov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 297b36fa3b80..716b4610791c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -953,6 +953,7 @@ static u32 emulated_msrs[] = { HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, HV_X64_MSR_RESET, + HV_X64_MSR_VP_INDEX, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, -- cgit v1.2.3 From 9eec50b8bbe1535c440a1ee88c1958f78fc55957 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Wed, 16 Sep 2015 12:29:50 +0300 Subject: kvm/x86: Hyper-V HV_X64_MSR_VP_RUNTIME support HV_X64_MSR_VP_RUNTIME msr used by guest to get "the time the virtual processor consumes running guest code, and the time the associated logical processor spends running hypervisor code on behalf of that guest." Calculation of this time is performed by task_cputime_adjusted() for vcpu task. Necessary to support loading of winhv.sys in guest, which in turn is required to support Windows VMBus. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan Signed-off-by: Denis V. Lunev CC: Paolo Bonzini CC: Gleb Natapov Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/uapi/asm/hyperv.h | 3 +++ arch/x86/kvm/hyperv.c | 21 +++++++++++++++++++-- arch/x86/kvm/x86.c | 1 + kernel/sched/cputime.c | 2 ++ 5 files changed, 26 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 76a5b30979b3..d064cb2e19e8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -374,6 +374,7 @@ struct kvm_mtrr { /* Hyper-V per vcpu emulation context */ struct kvm_vcpu_hv { u64 hv_vapic; + s64 runtime_offset; }; struct kvm_vcpu_arch { diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index dab584bf7ddf..2677a0aac2cc 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -156,6 +156,9 @@ /* MSR used to reset the guest OS. 
*/ #define HV_X64_MSR_RESET 0x40000003 +/* MSR used to provide vcpu runtime in 100ns units */ +#define HV_X64_MSR_VP_RUNTIME 0x40000010 + /* MSR used to read the per-partition time reference counter */ #define HV_X64_MSR_TIME_REF_COUNT 0x40000020 diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 0ad11a232474..62cf8c915e95 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -178,7 +178,16 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, return 0; } -static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +/* Calculate cpu time spent by current task in 100ns units */ +static u64 current_task_runtime_100ns(void) +{ + cputime_t utime, stime; + + task_cputime_adjusted(current, &utime, &stime); + return div_u64(cputime_to_nsecs(utime + stime), 100); +} + +static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; @@ -212,6 +221,11 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); + case HV_X64_MSR_VP_RUNTIME: + if (!host) + return 1; + hv->runtime_offset = data - current_task_runtime_100ns(); + break; default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -287,6 +301,9 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_APIC_ASSIST_PAGE: data = hv->hv_vapic; break; + case HV_X64_MSR_VP_RUNTIME: + data = current_task_runtime_100ns() + hv->runtime_offset; + break; default: vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; @@ -305,7 +322,7 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) mutex_unlock(&vcpu->kvm->lock); return r; } else - return kvm_hv_set_msr(vcpu, msr, data); + return kvm_hv_set_msr(vcpu, msr, data, host); } int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 716b4610791c..185fc1619c99 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -954,6 +954,7 @@ static u32 emulated_msrs[] = { HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, HV_X64_MSR_RESET, HV_X64_MSR_VP_INDEX, + HV_X64_MSR_VP_RUNTIME, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8cbc3db671df..26a54461bf59 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -444,6 +444,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) *ut = p->utime; *st = p->stime; } +EXPORT_SYMBOL_GPL(task_cputime_adjusted); void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { @@ -652,6 +653,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) task_cputime(p, &cputime.utime, &cputime.stime); cputime_adjust(&cputime, &p->prev_cputime, ut, st); } +EXPORT_SYMBOL_GPL(task_cputime_adjusted); void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { -- cgit v1.2.3 From d6a858d13e5d02b97daab5ba0648c704ae3f9517 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 28 Sep 2015 11:58:14 +0200 Subject: KVM: vmx: disable posted interrupts if no local APIC Uniprocessor 32-bit randconfigs can disable the local APIC, and posted interrupts require reserving a vector on the LAPIC, so they are incompatible. 
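The guard added below relies on the standard IS_ENABLED() pattern; a minimal sketch, with CONFIG_FOO and FOO_CAP_BIT as hypothetical placeholders (only IS_ENABLED() itself is real):

	/*
	 * IS_ENABLED(CONFIG_FOO) expands to a compile-time 0 or 1, so with
	 * CONFIG_FOO=n the && short-circuits at build time and the capability
	 * reads as absent without the hardware bit ever being consulted.
	 */
	static inline bool cpu_has_foo(u32 caps)
	{
		return IS_ENABLED(CONFIG_FOO) && (caps & FOO_CAP_BIT);
	}

The same shape is used below to make posted-interrupt support vanish on !CONFIG_X86_LOCAL_APIC builds.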
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 324b09ff1def..0f15e2382109 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -983,7 +983,8 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void) static inline bool cpu_has_vmx_posted_intr(void) { - return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; + return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && + vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; } static inline bool cpu_has_vmx_apicv(void) -- cgit v1.2.3 From eb1c31b46800a476644cd63ab3bc7ef918b0a917 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:50 +0800 Subject: KVM: x86: allow guest to use clflushopt and clwb Pass these CPU features to the guest to enable them in the guest. They are needed by nvdimm drivers. Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2fbea2544f24..962fc7d7e0d4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -348,7 +348,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | - F(AVX512CD); + F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB); /* cpuid 0xD.1.eax */ const u32 kvm_supported_word10_x86_features = -- cgit v1.2.3 From 8b3e34e46aca9b6d349b331cd9cf71ccbdc91b2e Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:51 +0800 Subject: KVM: x86: add pcommit support Pass the PCOMMIT CPU feature to the guest to enable the PCOMMIT instruction. Currently we do not catch the pcommit instruction for the L1 guest, and we allow L1 to catch this instruction for L2 if, as required by the spec, L1 can enumerate the PCOMMIT instruction via CPUID: | IA32_VMX_PROCBASED_CTLS2[53] (which enumerates support for the | 1-setting of PCOMMIT exiting) is always the same as | CPUID.07H:EBX.PCOMMIT[bit 22].
Thus, software can set PCOMMIT exiting | to 1 if and only if the PCOMMIT instruction is enumerated via CPUID The spec can be found at https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 2 +- arch/x86/include/uapi/asm/vmx.h | 4 +++- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/cpuid.h | 8 ++++++++ arch/x86/kvm/vmx.c | 34 +++++++++++++++++++++++++++++----- 5 files changed, 42 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 448b7ca61aee..d25f32ac9c5c 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -72,7 +72,7 @@ #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 - +#define SECONDARY_EXEC_PCOMMIT 0x00200000 #define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_NMI_EXITING 0x00000008 diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 37fee272618f..5b15d94a33f8 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -78,6 +78,7 @@ #define EXIT_REASON_PML_FULL 62 #define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XRSTORS 64 +#define EXIT_REASON_PCOMMIT 65 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -126,7 +127,8 @@ { EXIT_REASON_INVVPID, "INVVPID" }, \ { EXIT_REASON_INVPCID, "INVPCID" }, \ { EXIT_REASON_XSAVES, "XSAVES" }, \ - { EXIT_REASON_XRSTORS, "XRSTORS" } + { EXIT_REASON_XRSTORS, "XRSTORS" }, \ + { EXIT_REASON_PCOMMIT, "PCOMMIT" } #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 962fc7d7e0d4..faeb0b3bb343 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -348,7 +348,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | - F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB); + F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT); /* cpuid 0xD.1.eax */ const u32 kvm_supported_word10_x86_features = diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index dd05b9cef6ae..aed7bfeeaf0d 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -133,4 +133,12 @@ static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 7, 0); return best && (best->ebx & bit(X86_FEATURE_MPX)); } + +static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_PCOMMIT)); +} #endif diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0f15e2382109..691010604144 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2475,7 +2475,8 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING | - SECONDARY_EXEC_XSAVES; + SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_PCOMMIT; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ @@ -3016,7 +3017,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_SHADOW_VMCS | SECONDARY_EXEC_XSAVES | - SECONDARY_EXEC_ENABLE_PML; + 
SECONDARY_EXEC_ENABLE_PML | + SECONDARY_EXEC_PCOMMIT; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -4571,6 +4573,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) /* PML is enabled/disabled in creating/destorying vcpu */ exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + /* Currently, we allow L1 guest to directly run pcommit instruction. */ + exec_control &= ~SECONDARY_EXEC_PCOMMIT; + return exec_control; } @@ -4614,10 +4619,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); - if (cpu_has_secondary_exec_ctrls()) { + if (cpu_has_secondary_exec_ctrls()) vmcs_write32(SECONDARY_VM_EXEC_CONTROL, vmx_secondary_exec_control(vmx)); - } if (vmx_cpu_uses_apicv(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); @@ -7212,6 +7216,13 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) return 1; } +static int handle_pcommit(struct kvm_vcpu *vcpu) +{ + /* we never catch pcommit instruct for L1 guest. */ + WARN_ON(1); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -7262,6 +7273,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, [EXIT_REASON_PML_FULL] = handle_pml_full, + [EXIT_REASON_PCOMMIT] = handle_pcommit, }; static const int kvm_vmx_max_exit_handlers = @@ -7563,6 +7575,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * the XSS exit bitmap in vmcs12. */ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); + case EXIT_REASON_PCOMMIT: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT); default: return true; } @@ -8697,6 +8711,15 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (best) best->ebx &= ~bit(X86_FEATURE_INVPCID); } + + if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) { + if (guest_cpuid_has_pcommit(vcpu)) + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_PCOMMIT; + else + vmx->nested.nested_vmx_secondary_ctls_high &= + ~SECONDARY_EXEC_PCOMMIT; + } } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -9310,7 +9333,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_APIC_REGISTER_VIRT); + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_PCOMMIT); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; -- cgit v1.2.3 From e2821620c0908593e6cb40f79c7e78b31921ed73 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:52 +0800 Subject: KVM: VMX: drop rdtscp_enabled check in prepare_vmcs02() SECONDARY_EXEC_RDTSCP set for L2 guest comes from vmcs12 Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 691010604144..8f4c13808232 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9327,8 +9327,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (cpu_has_secondary_exec_ctrls()) { exec_control = vmx_secondary_exec_control(vmx); - if (!vmx->rdtscp_enabled) - exec_control &= ~SECONDARY_EXEC_RDTSCP; + /* Take the 
following fields only from vmcs12 */ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | -- cgit v1.2.3 From f36201e5f44b55faf44ddc820929548e51ec841b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:53 +0800 Subject: KVM: VMX: simplify rdtscp handling in vmx_cpuid_update() If vmx_rdtscp_supported() is true, SECONDARY_EXEC_RDTSCP must have already been set in the current vmcs by vmx_secondary_exec_control(). Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8f4c13808232..d53f57aad5eb 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8677,16 +8677,15 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) vmx->rdtscp_enabled = false; if (vmx_rdtscp_supported()) { exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - if (exec_control & SECONDARY_EXEC_RDTSCP) { - best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) - vmx->rdtscp_enabled = true; - else { - exec_control &= ~SECONDARY_EXEC_RDTSCP; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) + vmx->rdtscp_enabled = true; + else { + exec_control &= ~SECONDARY_EXEC_RDTSCP; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); } + if (nested && !vmx->rdtscp_enabled) vmx->nested.nested_vmx_secondary_ctls_high &= ~SECONDARY_EXEC_RDTSCP; -- cgit v1.2.3 From 29541bb8f4a18b2939dea137315e1803b4617c8d Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:54 +0800 Subject: KVM: VMX: simplify invpcid handling in vmx_cpuid_update() If vmx_invpcid_supported() is true, the secondary execution control field must be supported and SECONDARY_EXEC_ENABLE_INVPCID must have already been set in the current vmcs by vmx_secondary_exec_control(). If vmx_invpcid_supported() is false, there is no need to clear SECONDARY_EXEC_ENABLE_INVPCID. Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d53f57aad5eb..a5b26a940df2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8694,19 +8694,12 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) /* Exposing INVPCID only when PCID is exposed */ best = kvm_find_cpuid_entry(vcpu, 0x7, 0); if (vmx_invpcid_supported() && - best && (best->ebx & bit(X86_FEATURE_INVPCID)) && - guest_cpuid_has_pcid(vcpu)) { + (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) || + !guest_cpuid_has_pcid(vcpu))) { exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } else { - if (cpu_has_secondary_exec_ctrls()) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + if (best) best->ebx &= ~bit(X86_FEATURE_INVPCID); } -- cgit v1.2.3 From 8b97265a1516819d54c2d24b3874489a52f269d4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 15 Sep 2015 17:34:42 +0200 Subject: KVM: VMX: align
vmx->nested.nested_vmx_secondary_ctls_high to vmx->rdtscp_enabled The SECONDARY_EXEC_RDTSCP must be available iff RDTSCP is enabled in the guest. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a5b26a940df2..dad471f8ca57 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8686,9 +8686,14 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) exec_control); } - if (nested && !vmx->rdtscp_enabled) - vmx->nested.nested_vmx_secondary_ctls_high &= - ~SECONDARY_EXEC_RDTSCP; + if (nested) { + if (vmx->rdtscp_enabled) + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_RDTSCP; + else + vmx->nested.nested_vmx_secondary_ctls_high &= + ~SECONDARY_EXEC_RDTSCP; + } } /* Exposing INVPCID only when PCID is exposed */ -- cgit v1.2.3 From feda805fe7c4ed9cf78158e73b1218752e3b4314 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:55 +0800 Subject: KVM: VMX: unify SECONDARY_VM_EXEC_CONTROL update Unify the update in vmx_cpuid_update() Signed-off-by: Xiao Guangrong [Rewrite to use vmcs_set_secondary_exec_control. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index dad471f8ca57..a7a0ed6906fe 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8668,23 +8668,38 @@ static int vmx_get_lpage_level(void) return PT_PDPE_LEVEL; } +static void vmcs_set_secondary_exec_control(u32 new_ctl) +{ + /* + * These bits in the secondary execution controls field + * are dynamic, the others are mostly based on the hypervisor + * architecture and the guest's CPUID. Do not touch the + * dynamic bits. 
+ */ + u32 mask = + SECONDARY_EXEC_SHADOW_VMCS | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + + u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + (new_ctl & ~mask) | (cur_ctl & mask)); +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exec_control; + u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx); vmx->rdtscp_enabled = false; if (vmx_rdtscp_supported()) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) vmx->rdtscp_enabled = true; - else { - exec_control &= ~SECONDARY_EXEC_RDTSCP; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } + else + secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP; if (nested) { if (vmx->rdtscp_enabled) @@ -8701,14 +8716,14 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (vmx_invpcid_supported() && (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) || !guest_cpuid_has_pcid(vcpu))) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + secondary_exec_ctl &= ~SECONDARY_EXEC_ENABLE_INVPCID; if (best) best->ebx &= ~bit(X86_FEATURE_INVPCID); } + vmcs_set_secondary_exec_control(secondary_exec_ctl); + if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) { if (guest_cpuid_has_pcommit(vcpu)) vmx->nested.nested_vmx_secondary_ctls_high |= -- cgit v1.2.3 From 7ec362964d6cc228ea68572fdd6d75a59f8b40fa Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:56 +0800 Subject: KVM: VMX: clean up bit operation on SECONDARY_VM_EXEC_CONTROL Use vmcs_set_bits() and vmcs_clear_bits() to clean up the code Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a7a0ed6906fe..2937cf136df6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6636,7 +6636,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) { - u32 exec_control; if (vmx->nested.current_vmptr == -1ull) return; @@ -6649,9 +6648,8 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) they were modified */ copy_shadow_to_vmcs12(vmx); vmx->nested.sync_shadow_vmcs = false; - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, -1ull); } vmx->nested.posted_intr_nv = -1; @@ -7047,7 +7045,6 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); gpa_t vmptr; - u32 exec_control; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -7079,9 +7076,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) vmx->nested.current_vmcs12 = new_vmcs12; vmx->nested.current_vmcs12_page = page; if (enable_shadow_vmcs) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control |= SECONDARY_EXEC_SHADOW_VMCS; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, 
__pa(vmx->nested.current_shadow_vmcs)); vmx->nested.sync_shadow_vmcs = true; @@ -7591,7 +7587,6 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) static int vmx_enable_pml(struct vcpu_vmx *vmx) { struct page *pml_pg; - u32 exec_control; pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!pml_pg) @@ -7602,24 +7597,18 @@ static int vmx_enable_pml(struct vcpu_vmx *vmx) vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control |= SECONDARY_EXEC_ENABLE_PML; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_ENABLE_PML); return 0; } static void vmx_disable_pml(struct vcpu_vmx *vmx) { - u32 exec_control; - ASSERT(vmx->pml_pg); __free_page(vmx->pml_pg); vmx->pml_pg = NULL; - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_ENABLE_PML; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_ENABLE_PML); } static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 1cea0ce68ed76490ffa64a9e2a7a40104efe9352 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:57 +0800 Subject: KVM: VMX: drop rdtscp_enabled field Check cpuid bit instead of it Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.h | 8 ++++++++ arch/x86/kvm/vmx.c | 17 ++++++----------- 2 files changed, 14 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index aed7bfeeaf0d..d434ee952b00 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -141,4 +141,12 @@ static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 7, 0); return best && (best->ebx & bit(X86_FEATURE_PCOMMIT)); } + +static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + return best && (best->edx & bit(X86_FEATURE_RDTSCP)); +} #endif diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2937cf136df6..6f3f97f6e248 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -532,8 +532,6 @@ struct vcpu_vmx { s64 vnmi_blocked_time; u32 exit_reason; - bool rdtscp_enabled; - /* Posted interrupt descriptor */ struct pi_desc pi_desc; @@ -2208,7 +2206,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); index = __find_msr_index(vmx, MSR_TSC_AUX); - if (index >= 0 && vmx->rdtscp_enabled) + if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu)) move_msr_up(vmx, index, save_nmsrs++); /* * MSR_STAR is only needed on long mode guests, and only @@ -2675,7 +2673,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.ia32_xss; break; case MSR_TSC_AUX: - if (!to_vmx(vcpu)->rdtscp_enabled) + if (!guest_cpuid_has_rdtscp(vcpu)) return 1; /* Otherwise falls through */ default: @@ -2781,7 +2779,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) clear_atomic_switch_msr(vmx, MSR_IA32_XSS); break; case MSR_TSC_AUX: - if (!vmx->rdtscp_enabled) + if (!guest_cpuid_has_rdtscp(vcpu)) return 1; /* Check reserved bit, higher 32 bits should be zero */ if ((data >> 32) != 0) @@ -8682,16 +8680,13 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 
secondary_exec_ctl = vmx_secondary_exec_control(vmx); - vmx->rdtscp_enabled = false; if (vmx_rdtscp_supported()) { - best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) - vmx->rdtscp_enabled = true; - else + bool rdtscp_enabled = guest_cpuid_has_rdtscp(vcpu); + if (!rdtscp_enabled) secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP; if (nested) { - if (vmx->rdtscp_enabled) + if (rdtscp_enabled) vmx->nested.nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_RDTSCP; else -- cgit v1.2.3 From 72c930dcfc2b49404ee9e20f6c868402e9c71166 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 18 Sep 2015 17:54:29 +0200 Subject: x86: kvmclock: abolish PVCLOCK_COUNTS_FROM_ZERO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Newer KVM won't be exposing PVCLOCK_COUNTS_FROM_ZERO anymore. The purpose of that flags was to start counting system time from 0 when the KVM clock has been initialized. We can achieve the same by selecting one read as the initial point. A simple subtraction will work unless the KVM clock count overflows earlier (has smaller width) than scheduler's cycle count. We should be safe till x86_128. Because PVCLOCK_COUNTS_FROM_ZERO was enabled only on new hypervisors, setting sched clock as stable based on PVCLOCK_TSC_STABLE_BIT might regress on older ones. I presume we don't need to change kvm_clock_read instead of introducing kvm_sched_clock_read. A problem could arise in case sched_clock is expected to return the same value as get_cycles, but we should have merged those clocks in that case. Signed-off-by: Radim Krčmář Acked-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvmclock.c | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2c7aafa70702..2bd81e302427 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -32,6 +32,7 @@ static int kvmclock = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; +static cycle_t kvm_sched_clock_offset; static int parse_no_kvmclock(char *arg) { @@ -92,6 +93,29 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs) return kvm_clock_read(); } +static cycle_t kvm_sched_clock_read(void) +{ + return kvm_clock_read() - kvm_sched_clock_offset; +} + +static inline void kvm_sched_clock_init(bool stable) +{ + if (!stable) { + pv_time_ops.sched_clock = kvm_clock_read; + return; + } + + kvm_sched_clock_offset = kvm_clock_read(); + pv_time_ops.sched_clock = kvm_sched_clock_read; + set_sched_clock_stable(); + + printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", + kvm_sched_clock_offset); + + BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) > + sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); +} + /* * If we don't do that, there is the possibility that the guest * will calibrate under heavy load - thus, getting a lower lpj - @@ -248,7 +272,17 @@ void __init kvmclock_init(void) memblock_free(mem, size); return; } - pv_time_ops.sched_clock = kvm_clock_read; + + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) + pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); + + cpu = get_cpu(); + vcpu_time = &hv_clock[cpu].pvti; + flags = pvclock_read_flags(vcpu_time); + + kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); + put_cpu(); + x86_platform.calibrate_tsc = kvm_get_tsc_khz; x86_platform.get_wallclock = 
kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; @@ -265,16 +299,6 @@ void __init kvmclock_init(void) kvm_get_preset_lpj(); clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); pv_info.name = "KVM"; - - if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) - pvclock_set_flags(~0); - - cpu = get_cpu(); - vcpu_time = &hv_clock[cpu].pvti; - flags = pvclock_read_flags(vcpu_time); - if (flags & PVCLOCK_COUNTS_FROM_ZERO) - set_sched_clock_stable(); - put_cpu(); } int __init kvm_setup_vsyscall_timeinfo(void) -- cgit v1.2.3 From 18cd52c4d9bfbd081fb643021ba8d048475cd82a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 18 Sep 2015 18:04:17 +0200 Subject: irq_remapping: move structs outside #ifdef This is friendlier to clients of the code, who are going to prepare vcpu_data structs unconditionally, even if CONFIG_IRQ_REMAP is not defined. Reviewed-by: Thomas Gleixner Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/irq_remapping.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 046c7fb1ca43..a210eba2727c 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -33,6 +33,11 @@ enum irq_remap_cap { IRQ_POSTING_CAP = 0, }; +struct vcpu_data { + u64 pi_desc_addr; /* Physical address of PI Descriptor */ + u32 vector; /* Guest vector of the interrupt */ +}; + #ifdef CONFIG_IRQ_REMAP extern bool irq_remapping_cap(enum irq_remap_cap cap); @@ -58,11 +63,6 @@ static inline struct irq_domain *arch_get_ir_parent_domain(void) return x86_vector_domain; } -struct vcpu_data { - u64 pi_desc_addr; /* Physical address of PI Descriptor */ - u32 vector; /* Guest vector of the interrupt */ -}; - #else /* CONFIG_IRQ_REMAP */ static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; } -- cgit v1.2.3 From 6ef1522f7ecc063317dfb5ca63da6e47130a4c50 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:45 +0800 Subject: KVM: Extend struct pi_desc for VT-d Posted-Interrupts Extend struct pi_desc for VT-d Posted-Interrupts. Signed-off-by: Feng Wu Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6f3f97f6e248..4ebdccd882c8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -446,8 +446,24 @@ struct nested_vmx { /* Posted-Interrupt Descriptor */ struct pi_desc { u32 pir[8]; /* Posted interrupt requested */ - u32 control; /* bit 0 of control is outstanding notification bit */ - u32 rsvd[7]; + union { + struct { + /* bit 256 - Outstanding Notification */ + u16 on : 1, + /* bit 257 - Suppress Notification */ + sn : 1, + /* bit 271:258 - Reserved */ + rsvd_1 : 14; + /* bit 279:272 - Notification Vector */ + u8 nv; + /* bit 287:280 - Reserved */ + u8 rsvd_2; + /* bit 319:288 - Notification Destination */ + u32 ndst; + }; + u64 control; + }; + u32 rsvd[6]; } __aligned(64); static bool pi_test_and_set_on(struct pi_desc *pi_desc) -- cgit v1.2.3 From ebbfc765369690cf8fc512615e6b83ec1702f8ac Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:46 +0800 Subject: KVM: Add some helper functions for Posted-Interrupts This patch adds some helper functions to manipulate the Posted-Interrupts Descriptor. 
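As a rough sketch of how these helpers compose with the existing pi_test_and_set_*() ones when a vector is posted (illustrative only: send_notification_event() is a stand-in, not a function from this patch):

	static void pi_post_vector(struct pi_desc *pi_desc, int vector)
	{
		/* mark the vector pending in the 256-bit PIR */
		if (pi_test_and_set_pir(vector, pi_desc))
			return;				/* already pending */

		/* only the 0->1 transition of 'ON' needs a notification */
		if (!pi_test_and_set_on(pi_desc))
			send_notification_event();
	}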
Signed-off-by: Feng Wu Reviewed-by: Paolo Bonzini Reviewed-by: Alex Williamson [Make the new functions inline. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4ebdccd882c8..4739a52874c2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -443,6 +443,8 @@ struct nested_vmx { }; #define POSTED_INTR_ON 0 +#define POSTED_INTR_SN 1 + /* Posted-Interrupt Descriptor */ struct pi_desc { u32 pir[8]; /* Posted interrupt requested */ @@ -483,6 +485,30 @@ static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); } +static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + return clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_set_sn(struct pi_desc *pi_desc) +{ + return set_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_on(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_sn(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; -- cgit v1.2.3 From 8feb4a04dc756002f78df0026e58118669de4851 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:47 +0800 Subject: KVM: Define a new interface kvm_intr_is_single_vcpu() This patch defines a new interface kvm_intr_is_single_vcpu(), which returns whether the interrupt is for a single CPU or not. It is used by VT-d PI, since for now we only support single-CPU interrupts. For lowest-priority interrupts, if the user configures them via /proc/irq or uses irqbalance to make them single-CPU, we can use PI to deliver the interrupts to that CPU. Full functionality of lowest-priority support will be added later.
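A sketch of the intended call pattern, assuming a caller that walks MSI routing entries (it mirrors the use vmx_update_pi_irte() makes of this interface in a later patch of this series; 'e' is a routing entry inside such a walk):

	struct kvm_vcpu *dest_vcpu = NULL;
	struct kvm_lapic_irq irq;

	kvm_set_msi_irq(e, &irq);		/* decode the MSI route */
	if (!kvm_intr_is_single_vcpu(kvm, &irq, &dest_vcpu))
		continue;	/* multicast/broadcast: keep remapping */

	/* dest_vcpu is now the unique target, safe to post to directly */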
Signed-off-by: Feng Wu Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/irq_comm.c | 27 +++++++++++++++++++ arch/x86/kvm/lapic.c | 59 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/lapic.h | 2 ++ 4 files changed, 91 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d064cb2e19e8..ba4e5673e604 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1241,4 +1241,7 @@ int x86_set_memory_region(struct kvm *kvm, bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); +bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu); + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 177460998bb0..39f833f8132e 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -297,6 +297,33 @@ out: return r; } +bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu) +{ + int i, r = 0; + struct kvm_vcpu *vcpu; + + if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu)) + return true; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + + if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand, + irq->dest_id, irq->dest_mode)) + continue; + + if (++r == 2) + return false; + + *dest_vcpu = vcpu; + } + + return r == 1; +} +EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu); + #define IOAPIC_ROUTING_ENTRY(irq) \ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2f4c0d0cbe0a..944b38a56929 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -755,6 +755,65 @@ out: return ret; } +bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu) +{ + struct kvm_apic_map *map; + bool ret = false; + struct kvm_lapic *dst = NULL; + + if (irq->shorthand) + return false; + + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); + + if (!map) + goto out; + + if (irq->dest_mode == APIC_DEST_PHYSICAL) { + if (irq->dest_id == 0xFF) + goto out; + + if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) + goto out; + + dst = map->phys_map[irq->dest_id]; + if (dst && kvm_apic_present(dst->vcpu)) + *dest_vcpu = dst->vcpu; + else + goto out; + } else { + u16 cid; + unsigned long bitmap = 1; + int i, r = 0; + + if (!kvm_apic_logical_map_valid(map)) + goto out; + + apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap); + + if (cid >= ARRAY_SIZE(map->logical_map)) + goto out; + + for_each_set_bit(i, &bitmap, 16) { + dst = map->logical_map[cid][i]; + if (++r == 2) + goto out; + } + + if (dst && kvm_apic_present(dst->vcpu)) + *dest_vcpu = dst->vcpu; + else + goto out; + } + + ret = true; +out: + rcu_read_unlock(); + return ret; +} + /* * Add a pending IRQ into lapic. * Return 1 if successfully added and 0 if discarded. 
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 7259d272416f..fde8e35d5850 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -168,4 +168,6 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); void wait_lapic_expire(struct kvm_vcpu *vcpu); +bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu); #endif -- cgit v1.2.3 From d84f1e0755ba1e87d20d1f90c1e6eb0cbbc0af9d Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:49 +0800 Subject: KVM: make kvm_set_msi_irq() public Make kvm_set_msi_irq() public so that this function can be used outside of irq_comm.c. Signed-off-by: Feng Wu Reviewed-by: Paolo Bonzini Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 ++++ arch/x86/kvm/irq_comm.c | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ba4e5673e604..79fffbbe2348 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -176,6 +176,8 @@ enum { */ #define KVM_APIC_PV_EOI_PENDING 1 +struct kvm_kernel_irq_routing_entry; + /* * We don't want allocation failures within the mmu code, so we preallocate * enough memory for a single page fault in a cache. @@ -1244,4 +1246,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); +void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm_lapic_irq *irq); #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 39f833f8132e..c89228980230 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -91,8 +91,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, return r; } -static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm_lapic_irq *irq) +void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm_lapic_irq *irq) { trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); @@ -108,6 +108,7 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, irq->level = 1; irq->shorthand = 0; } +EXPORT_SYMBOL_GPL(kvm_set_msi_irq); int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) -- cgit v1.2.3 From efc644048ecde54f016011fe10110addd0de348f Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:51 +0800 Subject: KVM: x86: Update IRTE for posted-interrupts This patch adds the routine to update IRTE for posted-interrupts when the guest changes the interrupt configuration.
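Stripped to its essence, the new hook translates a guest MSI into a (descriptor address, vector) pair for the IOMMU driver; a condensed sketch of that hand-off (error handling and the routing-table walk are omitted here, see the full diff below):

	struct vcpu_data vcpu_info = {
		.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)),	/* where to post */
		.vector	      = irq.vector,			/* what to post */
	};

	/*
	 * irq_set_vcpu_affinity() forwards vcpu_info to the IOMMU driver,
	 * which rewrites the IRTE into posted format; passing NULL instead
	 * reverts the entry to ordinary remapped delivery.
	 */
	ret = irq_set_vcpu_affinity(host_irq, set ? &vcpu_info : NULL);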
Signed-off-by: Feng Wu Reviewed-by: Alex Williamson Signed-off-by: Fengguang Wu [Squashed in automatically generated patch from the build robot "KVM: x86: vcpu_to_pi_desc() can be static" - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++ arch/x86/kvm/trace.h | 33 ++++++++++++++++ arch/x86/kvm/vmx.c | 83 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 2 + 4 files changed, 121 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 79fffbbe2348..a4067ecd4376 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -897,6 +897,9 @@ struct kvm_x86_ops { gfn_t offset, unsigned long mask); /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; + + int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set); }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index ce4abe333c39..120302511802 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -992,6 +992,39 @@ TRACE_EVENT(kvm_enter_smm, __entry->smbase) ); +/* + * Tracepoint for VT-d posted-interrupts. + */ +TRACE_EVENT(kvm_pi_irte_update, + TP_PROTO(unsigned int vcpu_id, unsigned int gsi, + unsigned int gvec, u64 pi_desc_addr, bool set), + TP_ARGS(vcpu_id, gsi, gvec, pi_desc_addr, set), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( unsigned int, gsi ) + __field( unsigned int, gvec ) + __field( u64, pi_desc_addr ) + __field( bool, set ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->gsi = gsi; + __entry->gvec = gvec; + __entry->pi_desc_addr = pi_desc_addr; + __entry->set = set; + ), + + TP_printk("VT-d PI is %s for this irq, vcpu %u, gsi: 0x%x, " + "gvec: 0x%x, pi_desc_addr: 0x%llx", + __entry->set ? 
"enabled and being updated" : "disabled", + __entry->vcpu_id, + __entry->gsi, + __entry->gvec, + __entry->pi_desc_addr) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4739a52874c2..cb26b93a2cd7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "trace.h" #include "pmu.h" @@ -603,6 +604,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } +static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) +{ + return &(to_vmx(vcpu)->pi_desc); +} + #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) #define FIELD(number, name) [number] = VMCS12_OFFSET(name) #define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ @@ -10345,6 +10351,81 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } +/* + * vmx_update_pi_irte - set IRTE for Posted-Interrupts + * + * @kvm: kvm + * @host_irq: host irq of the interrupt + * @guest_irq: gsi of the interrupt + * @set: set or unset PI + * returns 0 on success, < 0 on failure + */ +static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) +{ + struct kvm_kernel_irq_routing_entry *e; + struct kvm_irq_routing_table *irq_rt; + struct kvm_lapic_irq irq; + struct kvm_vcpu *vcpu; + struct vcpu_data vcpu_info; + int idx, ret = -EINVAL; + + if (!kvm_arch_has_assigned_device(kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return 0; + + idx = srcu_read_lock(&kvm->irq_srcu); + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + BUG_ON(guest_irq >= irq_rt->nr_rt_entries); + + hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { + if (e->type != KVM_IRQ_ROUTING_MSI) + continue; + /* + * VT-d PI cannot support posting multicast/broadcast + * interrupts to a vCPU, we still use interrupt remapping + * for these kind of interrupts. + * + * For lowest-priority interrupts, we only support + * those with single CPU as the destination, e.g. user + * configures the interrupts via /proc/irq or uses + * irqbalance to make the interrupts single-CPU. + * + * We will support full lowest-priority interrupt later. + */ + + kvm_set_msi_irq(e, &irq); + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) + continue; + + vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); + vcpu_info.vector = irq.vector; + + trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi, + vcpu_info.vector, vcpu_info.pi_desc_addr, set); + + if (set) + ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); + else { + /* suppress notification event before unposting */ + pi_set_sn(vcpu_to_pi_desc(vcpu)); + ret = irq_set_vcpu_affinity(host_irq, NULL); + pi_clear_sn(vcpu_to_pi_desc(vcpu)); + } + + if (ret < 0) { + printk(KERN_INFO "%s: failed to update PI IRTE\n", + __func__); + goto out; + } + } + + ret = 0; +out: + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -10462,6 +10543,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, .pmu_ops = &intel_pmu_ops, + + .update_pi_irte = vmx_update_pi_irte, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 185fc1619c99..3978ace4ddbc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -64,6 +64,7 @@ #include /* Ugh! 
*/ #include #include +#include #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 @@ -8094,3 +8095,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); -- cgit v1.2.3 From 87276880065246ce49ec571130d3d1e4a22e5604 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:40 +0800 Subject: KVM: x86: select IRQ_BYPASS_MANAGER Select IRQ_BYPASS_MANAGER for x86 when CONFIG_KVM is set Signed-off-by: Feng Wu Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/Kconfig | 2 ++ arch/x86/kvm/x86.c | 53 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a4067ecd4376..15664994b6f3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index d8a1d56276e1..639a6e34500c 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -28,6 +28,8 @@ config KVM select ANON_INODES select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD + select IRQ_BYPASS_MANAGER + select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_EVENTFD select KVM_APIC_ARCHITECTURE diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3978ace4ddbc..b8425a769c0a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -51,6 +51,8 @@ #include #include #include +#include +#include #include #define CREATE_TRACE_POINTS @@ -8079,6 +8081,57 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); +int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + if (kvm_x86_ops->update_pi_irte) { + irqfd->producer = prod; + return kvm_x86_ops->update_pi_irte(irqfd->kvm, + prod->irq, irqfd->gsi, 1); + } + + return -EINVAL; +} + +void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + int ret; + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + if (!kvm_x86_ops->update_pi_irte) { + WARN_ON(irqfd->producer != NULL); + return; + } + + WARN_ON(irqfd->producer != prod); + irqfd->producer = NULL; + + /* + * When producer of consumer is unregistered, we change back to + * remapped mode, so we can re-use the current implementation + * when the irq is masked/disabed or the consumer side (KVM + * int this case doesn't want to receive the interrupts. 
+ */ + ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0); + if (ret) + printk(KERN_INFO "irq bypass consumer (token %p) unregistration" + " fails: %d\n", irqfd->consumer.token, ret); +} + +int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) +{ + if (!kvm_x86_ops->update_pi_irte) + return -EINVAL; + + return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set); +} + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); -- cgit v1.2.3 From 28b835d60fcc2498e717cf5e6f0c3691c24546f7 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:54 +0800 Subject: KVM: Update Posted-Interrupts Descriptor when vCPU is preempted This patch updates the Posted-Interrupts Descriptor when a vCPU is preempted. sched out: - Set 'SN' to suppress future non-urgent interrupts posted for the vCPU. sched in: - Clear 'SN' - Change NDST if the vCPU is scheduled to a different CPU - Set 'NV' to POSTED_INTR_VECTOR Signed-off-by: Feng Wu [Include asm/cpu.h to fix !CONFIG_SMP compilation. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cb26b93a2cd7..99f5c61954ea 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -35,6 +35,7 @@ #include "kvm_cache_regs.h" #include "x86.h" +#include #include #include #include @@ -1942,6 +1943,52 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) preempt_enable(); } +static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + do { + old.control = new.control = pi_desc->control; + + /* + * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there + * are two possible cases: + * 1. After running 'pre_block', context switch + * happened. For this case, 'sn' was set in + * vmx_vcpu_put(), so we need to clear it here. + * 2. After running 'pre_block', we were blocked, + * and woken up by some other guy. For this case, + * we don't need to do anything, 'pi_post_block' + * will do everything for us. However, we cannot + * check whether it is case #1 or case #2 here + * (maybe, not needed), so we also clear sn here, + * I think it is not a big deal. + */ + if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) { + if (vcpu->cpu != cpu) { + dest = cpu_physical_id(cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + } + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } + + /* Allow posting non-urgent interrupts */ + new.sn = 0; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); +} /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken.
@@ -1992,10 +2039,27 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ vmx->loaded_vmcs->cpu = cpu; } + + vmx_vcpu_pi_load(vcpu, cpu); +} + +static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + /* Set SN when the vCPU is preempted */ + if (vcpu->preempted) + pi_set_sn(pi_desc); } static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { + vmx_vcpu_pi_put(vcpu); + __vmx_load_host_state(to_vmx(vcpu)); if (!vmm_exclusive) { __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); @@ -4427,6 +4491,22 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) { #ifdef CONFIG_SMP if (vcpu->mode == IN_GUEST_MODE) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * Currently, we don't support urgent interrupt, + * all interrupts are recognized as non-urgent + * interrupt, so we cannot post interrupts when + * 'SN' is set. + * + * If the vcpu is in guest mode, it means it is + * running instead of being scheduled out and + * waiting in the run queue, and that's the only + * case when 'SN' is set currently, warning if + * 'SN' is set. + */ + WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)); + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), POSTED_INTR_VECTOR); return true; -- cgit v1.2.3 From bf9f6ac8d74969690df1485b33b7c238ca9f2269 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:55 +0800 Subject: KVM: Update Posted-Interrupts Descriptor when vCPU is blocked This patch updates the Posted-Interrupts Descriptor when vCPU is blocked. pre-block: - Add the vCPU to the blocked per-CPU list - Set 'NV' to POSTED_INTR_WAKEUP_VECTOR post-block: - Remove the vCPU from the per-CPU list Signed-off-by: Feng Wu [Concentrate invocation of pre/post-block hooks to vcpu_block. - Paolo] Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/locking.txt | 12 +++ arch/x86/include/asm/kvm_host.h | 11 +++ arch/x86/kvm/vmx.c | 153 ++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 34 +++++--- include/linux/kvm_host.h | 3 + virt/kvm/kvm_main.c | 3 + 6 files changed, 206 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt index d68af4dc3006..19f94a6b9bb0 100644 --- a/Documentation/virtual/kvm/locking.txt +++ b/Documentation/virtual/kvm/locking.txt @@ -166,3 +166,15 @@ Comment: The srcu read lock must be held while accessing memslots (e.g. MMIO/PIO address->device structure mapping (kvm->buses). The srcu index can be stored in kvm_vcpu->srcu_idx per vcpu if it is needed by multiple functions. + +Name: blocked_vcpu_on_cpu_lock +Type: spinlock_t +Arch: x86 +Protects: blocked_vcpu_on_cpu +Comment: This is a per-CPU lock and it is used for VT-d posted-interrupts. + When VT-d posted-interrupts is supported and the VM has assigned + devices, we put the blocked vCPU on the list blocked_vcpu_on_cpu + protected by blocked_vcpu_on_cpu_lock, when VT-d hardware issues + wakeup notification event since external interrupts from the + assigned devices happens, we will find the vCPU on the list to + wakeup. 
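Both the pre-block and post-block paths below rewrite the descriptor with the same lock-free retry loop, shown here in isolation as a sketch (the 'control' union, 'nv' field and POSTED_INTR_WAKEUP_VECTOR come from this series): the hardware may set 'ON' in the descriptor at any moment, so the update is retried until the cmpxchg lands on an unmodified control word.

	struct pi_desc old, new;

	do {
		old.control = new.control = pi_desc->control;

		/* edit the copy, e.g. redirect notifications for blocking */
		new.nv = POSTED_INTR_WAKEUP_VECTOR;
	} while (cmpxchg(&pi_desc->control, old.control,
			 new.control) != old.control);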
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 15664994b6f3..cdbdb559ecd2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -899,6 +899,17 @@ struct kvm_x86_ops { /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; + /* + * Architecture specific hooks for vCPU blocking due to + * HLT instruction. + * Returns for .pre_block(): + * - 0 means continue to block the vCPU. + * - 1 means we cannot block the vCPU since some event + * happened during this period, such as the 'ON' bit in + * the posted-interrupts descriptor being set. + */ + int (*pre_block)(struct kvm_vcpu *vcpu); + void (*post_block)(struct kvm_vcpu *vcpu); int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); }; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 99f5c61954ea..c5c22831aee2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -878,6 +878,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); static DEFINE_PER_CPU(struct desc_ptr, host_gdt); +/* + * We maintain a per-CPU linked list of vCPUs, so in wakeup_handler() we + * can find which vCPU should be woken up. + */ +static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); +static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); + static unsigned long *vmx_io_bitmap_a; static unsigned long *vmx_io_bitmap_b; static unsigned long *vmx_msr_bitmap_legacy; @@ -2986,6 +2993,8 @@ static int hardware_enable(void) return -EBUSY; INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); + INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); + spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); /* * Now we can enable the vmclear operation in kdump @@ -6045,6 +6054,25 @@ static void update_ple_window_actual_max(void) ple_window_grow, INT_MIN); } +/* + * Handler for POSTED_INTR_WAKEUP_VECTOR. + */ +static void wakeup_handler(void) +{ + struct kvm_vcpu *vcpu; + int cpu = smp_processor_id(); + + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), + blocked_vcpu_list) { + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (pi_test_on(pi_desc) == 1) + kvm_vcpu_kick(vcpu); + } + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); +} + static __init int hardware_setup(void) { int r = -ENOMEM, i, msr; @@ -6231,6 +6259,8 @@ static __init int hardware_setup(void) kvm_x86_ops->enable_log_dirty_pt_masked = NULL; } + kvm_set_posted_intr_wakeup_handler(wakeup_handler); + return alloc_kvm_area(); out8: @@ -10431,6 +10461,126 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } +/* + * This routine does the following things for a vCPU which is going + * to be blocked if VT-d PI is enabled. + * - Store the vCPU on the wakeup list, so when interrupts happen + * we can find the right vCPU to wake up. + * - Change the Posted-interrupt descriptor as below: + * 'NDST' <-- vcpu->pre_pcpu + * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR + * - If 'ON' is set during this process, which means at least one + * interrupt is posted for this vCPU, we cannot block it; in + * this case, return 1, otherwise return 0.
+ * + */ +static int vmx_pre_block(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + unsigned int dest; + struct pi_desc old, new; + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return 0; + + vcpu->pre_pcpu = vcpu->cpu; + spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_add_tail(&vcpu->blocked_vcpu_list, + &per_cpu(blocked_vcpu_on_cpu, + vcpu->pre_pcpu)); + spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + + do { + old.control = new.control = pi_desc->control; + + /* + * We should not block the vCPU if + * an interrupt is posted for it. + */ + if (pi_test_on(pi_desc) == 1) { + spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock_irqrestore( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + vcpu->pre_pcpu = -1; + + return 1; + } + + WARN((pi_desc->sn == 1), + "Warning: SN field of posted-interrupts " + "is set before blocking\n"); + + /* + * Since the vCPU can be preempted during this process, + * vcpu->cpu could be different from pre_pcpu, so we + * need to set pre_pcpu as the destination of the wakeup + * notification event; then we can find the right vCPU + * to wake up in the wakeup handler if interrupts arrive + * while the vCPU is in blocked state. + */ + dest = cpu_physical_id(vcpu->pre_pcpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* set 'NV' to 'wakeup vector' */ + new.nv = POSTED_INTR_WAKEUP_VECTOR; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); + + return 0; +} + +static void vmx_post_block(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + unsigned long flags; + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + do { + old.control = new.control = pi_desc->control; + + dest = cpu_physical_id(vcpu->cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* Allow posting non-urgent interrupts */ + new.sn = 0; + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); + + if (vcpu->pre_pcpu != -1) { + spin_lock_irqsave( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock_irqrestore( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + vcpu->pre_pcpu = -1; + } +} + /* * vmx_update_pi_irte - set IRTE for Posted-Interrupts * @@ -10622,6 +10772,9 @@ static struct kvm_x86_ops vmx_x86_ops = { .flush_log_dirty = vmx_flush_log_dirty, .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, + .pre_block = vmx_pre_block, + .post_block = vmx_post_block, + .pmu_ops = &intel_pmu_ops, .update_pi_irte = vmx_update_pi_irte, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b8425a769c0a..2d2c9bb0d6d6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6335,6 +6335,20 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } } + /* + * KVM_REQ_EVENT is not set when posted interrupts are set by + * VT-d hardware, so we have to update RVI unconditionally. + */ + if (kvm_lapic_enabled(vcpu)) { + /* + * Update architecture specific hints for APIC + * virtual interrupt delivery.
+ */ + if (kvm_x86_ops->hwapic_irr_update) + kvm_x86_ops->hwapic_irr_update(vcpu, + kvm_lapic_find_highest_irr(vcpu)); + } + if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { kvm_apic_accept_events(vcpu); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { @@ -6351,13 +6365,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->enable_irq_window(vcpu); if (kvm_lapic_enabled(vcpu)) { - /* - * Update architecture specific hints for APIC - * virtual interrupt delivery. - */ - if (kvm_x86_ops->hwapic_irr_update) - kvm_x86_ops->hwapic_irr_update(vcpu, - kvm_lapic_find_highest_irr(vcpu)); update_cr8_intercept(vcpu); kvm_lapic_sync_to_vapic(vcpu); } @@ -6493,10 +6500,15 @@ out: static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { - if (!kvm_arch_vcpu_runnable(vcpu)) { + if (!kvm_arch_vcpu_runnable(vcpu) && + (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + + if (kvm_x86_ops->post_block) + kvm_x86_ops->post_block(vcpu); + if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) return 1; } @@ -6528,10 +6540,12 @@ static int vcpu_run(struct kvm_vcpu *vcpu) for (;;) { if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && - !vcpu->arch.apf.halted) + !vcpu->arch.apf.halted) { r = vcpu_enter_guest(vcpu); - else + } else { r = vcpu_block(kvm, vcpu); + } + if (r <= 0) break; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5c3f4538807f..9596a2f0977b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -234,6 +234,9 @@ struct kvm_vcpu { unsigned long requests; unsigned long guest_debug; + int pre_pcpu; + struct list_head blocked_vcpu_list; + struct mutex mutex; struct kvm_run *run; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index afd7ae6aec65..a75502c93c3e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -230,6 +230,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) init_waitqueue_head(&vcpu->wq); kvm_async_pf_vcpu_init(vcpu); + vcpu->pre_pcpu = -1; + INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); + page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) { r = -ENOMEM; -- cgit v1.2.3 From a523841ee4e506fa1f05ff3a85b1e6d8176a3d4d Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 1 Oct 2015 15:36:48 -0700 Subject: arch/x86/include/asm/efi.h: fix build failure With KMEMCHECK=y, KASAN=n: arch/x86/platform/efi/efi.c:673:3: error: implicit declaration of function `memcpy' [-Werror=implicit-function-declaration] arch/x86/platform/efi/efi_64.c:139:2: error: implicit declaration of function `memcpy' [-Werror=implicit-function-declaration] arch/x86/include/asm/desc.h:121:2: error: implicit declaration of function `memcpy' [-Werror=implicit-function-declaration] Don't #undef memcpy if KASAN=n. 
Fixes: 769a8089c1fd ("x86, efi, kasan: #undef memset/memcpy/memmove per arch") Signed-off-by: Andrey Ryabinin Reported-by: Ingo Molnar Reported-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/efi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index ab5f1d447ef9..ae68be92f755 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -86,6 +86,7 @@ extern u64 asmlinkage efi_call(void *fp, ...); extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size, u32 type, u64 attribute); +#ifdef CONFIG_KASAN /* * CONFIG_KASAN may redefine memset to __memset. __memset function is present * only in kernel binary. Since the EFI stub is linked into a separate binary it * @@ -95,6 +96,7 @@ extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size, #undef memcpy #undef memset #undef memmove +#endif #endif /* CONFIG_X86_32 */ -- cgit v1.2.3 From e3c41e37b0f4b18cbd4dac76cbeece5a7558b909 Mon Sep 17 00:00:00 2001 From: "Lee, Chun-Yi" Date: Tue, 29 Sep 2015 20:58:57 +0800 Subject: x86/kexec: Fix kexec crash in syscall kexec_file_load() The original bug is a page fault crash that sometimes happens on big machines when preparing ELF headers: BUG: unable to handle kernel paging request at ffffc90613fc9000 IP: [] prepare_elf64_ram_headers_callback+0x165/0x260 The bug is caused by us under-counting the number of memory ranges and subsequently not allocating enough ELF header space for them. The bug is typically masked on smaller systems, because the ELF header allocation is rounded up to the next page. This patch modifies the code in fill_up_crash_elf_data() by using walk_system_ram_res() instead of walk_system_ram_range() to correctly count the max number of crash memory ranges. That's because walk_system_ram_range() filters out small memory regions that reside in the same page, but walk_system_ram_res() does not. Here's how I found the bug: After tracing prepare_elf64_headers() and prepare_elf64_ram_headers_callback(), the code uses walk_system_ram_res() to fill-in crash memory regions information to the program header, so it counts those small memory regions that reside in a page area. But, when the kernel was using walk_system_ram_range() in fill_up_crash_elf_data() to count the number of crash memory regions, it filtered out small regions. I printed those small memory regions, for example: kexec: Get nr_ram ranges. vaddr=0xffff880077592258 paddr=0x77592258, sz=0xdc0 Based on the code in walk_system_ram_range(), this memory region will be filtered out: pfn = (0x77592258 + 0x1000 - 1) >> 12 = 0x77593 end_pfn = (0x77592258 + 0xfc0 -1 + 1) >> 12 = 0x77593 end_pfn - pfn = 0x77593 - 0x77593 = 0 <=== if (end_pfn > pfn) is FALSE So, the max_nr_ranges that's counted by the kernel doesn't include small memory regions - causing us to under-allocate the required space. That causes the page fault crash that happens in a later code path when preparing ELF headers. This bug is not easy to reproduce on small machines that have few CPUs, because the allocated page-aligned ELF buffer has more free space to cover those small memory regions' PT_LOAD headers.
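To make the rounding concrete, here is a small stand-alone C sketch of the filtering (the constants come from the example region above; PAGE_SHIFT and the variable names are illustrative, not the kernel's implementation):

	#include <stdio.h>

	#define PAGE_SHIFT 12	/* 4 KiB pages, as on x86 */

	int main(void)
	{
		/* The small region from the example: paddr=0x77592258, sz=0xdc0 */
		unsigned long long start = 0x77592258ULL, size = 0xdc0ULL;

		/* walk_system_ram_range() only visits whole page frames, so it
		 * rounds the start up and the end down to page boundaries: */
		unsigned long long pfn     = (start + (1ULL << PAGE_SHIFT) - 1) >> PAGE_SHIFT;
		unsigned long long end_pfn = (start + size) >> PAGE_SHIFT;

		/* Prints pfn=0x77593 end_pfn=0x77593: end_pfn > pfn is false,
		 * so this region is skipped and never counted. */
		printf("pfn=%#llx end_pfn=%#llx counted=%d\n",
		       pfn, end_pfn, (int)(end_pfn > pfn));
		return 0;
	}

walk_system_ram_res(), by contrast, iterates over the byte-granular resource entries rather than whole page frames, which is why it sees such a region and produces the correct count.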
Signed-off-by: Lee, Chun-Yi Cc: Andy Lutomirski Cc: Baoquan He Cc: Jiang Liu Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Stephen Rothwell Cc: Takashi Iwai Cc: Thomas Gleixner Cc: Viresh Kumar Cc: Vivek Goyal Cc: kexec@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: Link: http://lkml.kernel.org/r/1443531537-29436-1-git-send-email-jlee@suse.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/crash.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index e068d6683dba..74ca2fe7a0b3 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -185,10 +185,9 @@ void native_machine_crash_shutdown(struct pt_regs *regs) } #ifdef CONFIG_KEXEC_FILE -static int get_nr_ram_ranges_callback(unsigned long start_pfn, - unsigned long nr_pfn, void *arg) +static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg) { - int *nr_ranges = arg; + unsigned int *nr_ranges = arg; (*nr_ranges)++; return 0; @@ -214,7 +213,7 @@ static void fill_up_crash_elf_data(struct crash_elf_data *ced, ced->image = image; - walk_system_ram_range(0, -1, &nr_ranges, + walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback); ced->max_nr_ranges = nr_ranges; -- cgit v1.2.3 From ab76f7b4ab2397ffdd2f1eb07c55697d19991d10 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Thu, 1 Oct 2015 09:04:22 -0400 Subject: x86/mm: Set NX on gap between __ex_table and rodata Unused space between the end of __ex_table and the start of rodata can be left W+x in the kernel page tables. Extend the setting of the NX bit to cover this gap by starting from text_end rather than rodata_start. Before: ---[ High Kernel Mapping ]--- 0xffffffff80000000-0xffffffff81000000 16M pmd 0xffffffff81000000-0xffffffff81600000 6M ro PSE GLB x pmd 0xffffffff81600000-0xffffffff81754000 1360K ro GLB x pte 0xffffffff81754000-0xffffffff81800000 688K RW GLB x pte 0xffffffff81800000-0xffffffff81a00000 2M ro PSE GLB NX pmd 0xffffffff81a00000-0xffffffff81b3b000 1260K ro GLB NX pte 0xffffffff81b3b000-0xffffffff82000000 4884K RW GLB NX pte 0xffffffff82000000-0xffffffff82200000 2M RW PSE GLB NX pmd 0xffffffff82200000-0xffffffffa0000000 478M pmd After: ---[ High Kernel Mapping ]--- 0xffffffff80000000-0xffffffff81000000 16M pmd 0xffffffff81000000-0xffffffff81600000 6M ro PSE GLB x pmd 0xffffffff81600000-0xffffffff81754000 1360K ro GLB x pte 0xffffffff81754000-0xffffffff81800000 688K RW GLB NX pte 0xffffffff81800000-0xffffffff81a00000 2M ro PSE GLB NX pmd 0xffffffff81a00000-0xffffffff81b3b000 1260K ro GLB NX pte 0xffffffff81b3b000-0xffffffff82000000 4884K RW GLB NX pte 0xffffffff82000000-0xffffffff82200000 2M RW PSE GLB NX pmd 0xffffffff82200000-0xffffffffa0000000 478M pmd Signed-off-by: Stephen Smalley Acked-by: Kees Cook Cc: Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1443704662-3138-1-git-send-email-sds@tycho.nsa.gov Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 30564e2752d3..df48430c279b 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1132,7 +1132,7 @@ void mark_rodata_ro(void) * has been zapped already via cleanup_highmem(). 
*/ all_end = roundup((unsigned long)_brk_end, PMD_SIZE); - set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT); + set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT); rodata_test(); -- cgit v1.2.3 From f4b4aae1828855db761bf998ce37d3062b1d6446 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 1 Oct 2015 01:40:43 +0100 Subject: x86/headers/uapi: Fix __BITS_PER_LONG value for x32 builds On x32, gcc predefines __x86_64__ but long is only 32-bit. Use __ILP32__ to distinguish x32. Fixes this compiler error in perf: tools/include/asm-generic/bitops/__ffs.h: In function '__ffs': tools/include/asm-generic/bitops/__ffs.h:19:8: error: right shift count >= width of type [-Werror=shift-count-overflow] word >>= 32; ^ This isn't sufficient to build perf for x32, though. Signed-off-by: Ben Hutchings Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1443660043.2730.15.camel@decadent.org.uk Signed-off-by: Ingo Molnar --- arch/x86/include/uapi/asm/bitsperlong.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/bitsperlong.h b/arch/x86/include/uapi/asm/bitsperlong.h index b0ae1c4dc791..217909b4d6f5 100644 --- a/arch/x86/include/uapi/asm/bitsperlong.h +++ b/arch/x86/include/uapi/asm/bitsperlong.h @@ -1,7 +1,7 @@ #ifndef __ASM_X86_BITSPERLONG_H #define __ASM_X86_BITSPERLONG_H -#ifdef __x86_64__ +#if defined(__x86_64__) && !defined(__ILP32__) # define __BITS_PER_LONG 64 #else # define __BITS_PER_LONG 32 -- cgit v1.2.3 From a91263d520246b63c63e75ddfb072ee6a853fe15 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 30 Sep 2015 01:41:50 +0200 Subject: ebpf: migrate bpf_prog's flags to bitfield As we need to add further flags to the bpf_prog structure, let's migrate both bools to a bitfield representation. The size of the base structure (excluding insns) remains unchanged at 40 bytes. Also add kmemcheck annotations, so that it doesn't throw false positives. Even if gcc were to generate suboptimal code, the bitfield is not accessed in performance-critical paths. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- arch/arm/net/bpf_jit_32.c | 2 +- arch/arm64/net/bpf_jit_comp.c | 2 +- arch/mips/net/bpf_jit.c | 2 +- arch/powerpc/net/bpf_jit_comp.c | 2 +- arch/s390/net/bpf_jit_comp.c | 2 +- arch/sparc/net/bpf_jit_comp.c | 2 +- arch/x86/net/bpf_jit_comp.c | 2 +- include/linux/filter.h | 6 ++++-- kernel/bpf/core.c | 4 ++++ kernel/bpf/syscall.c | 4 ++-- net/core/filter.c | 2 +- 11 files changed, 18 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 876060bcceeb..0df5fd561513 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1047,7 +1047,7 @@ void bpf_jit_compile(struct bpf_prog *fp) set_memory_ro((unsigned long)header, header->pages); fp->bpf_func = (void *)ctx.target; - fp->jited = true; + fp->jited = 1; out: kfree(ctx.offsets); return; diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index c047598b09e0..a44e5293c6f5 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -744,7 +744,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog) set_memory_ro((unsigned long)header, header->pages); prog->bpf_func = (void *)ctx.image; - prog->jited = true; + prog->jited = 1; out: kfree(ctx.offset); } diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index 0c4a133f6216..77cb27309db2 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -1251,7 +1251,7 @@ void bpf_jit_compile(struct bpf_prog *fp) bpf_jit_dump(fp->len, alloc_size, 2, ctx.target); fp->bpf_func = (void *)ctx.target; - fp->jited = true; + fp->jited = 1; out: kfree(ctx.offsets); diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 17cea18a09d3..04782164ee67 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -679,7 +679,7 @@ void bpf_jit_compile(struct bpf_prog *fp) ((u64 *)image)[1] = local_paca->kernel_toc; #endif fp->bpf_func = (void *)image; - fp->jited = true; + fp->jited = 1; } out: kfree(addrs); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index eeda051442c3..9a0c4c22e536 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1310,7 +1310,7 @@ void bpf_int_jit_compile(struct bpf_prog *fp) if (jit.prg_buf) { set_memory_ro((unsigned long)header, header->pages); fp->bpf_func = (void *) jit.prg_buf; - fp->jited = true; + fp->jited = 1; } free_addrs: kfree(jit.addrs); diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c index f8b9f71b9a2b..22564f5f2364 100644 --- a/arch/sparc/net/bpf_jit_comp.c +++ b/arch/sparc/net/bpf_jit_comp.c @@ -812,7 +812,7 @@ cond_branch: f_offset = addrs[i + filter[i].jf]; if (image) { bpf_flush_icache(image, image + proglen); fp->bpf_func = (void *)image; - fp->jited = true; + fp->jited = 1; } out: kfree(addrs); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 70efcd0940f9..75991979f667 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1109,7 +1109,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog) bpf_flush_icache(header, image + proglen); set_memory_ro((unsigned long)header, header->pages); prog->bpf_func = (void *)image; - prog->jited = true; + prog->jited = 1; } out: kfree(addrs); diff --git a/include/linux/filter.h b/include/linux/filter.h index fa2cab985e57..bad618f316d7 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -326,8 +326,10 @@ struct bpf_binary_header { struct bpf_prog { u16 pages; /* Number of allocated 
pages */ - bool jited; /* Is our filter JIT'ed? */ - bool gpl_compatible; /* Is our filter GPL compatible? */ + kmemcheck_bitfield_begin(meta); + u16 jited:1, /* Is our filter JIT'ed? */ + gpl_compatible:1; /* Is filter GPL compatible? */ + kmemcheck_bitfield_end(meta); u32 len; /* Number of filter blocks */ enum bpf_prog_type type; /* Type of BPF program */ struct bpf_prog_aux *aux; /* Auxiliary fields */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 67c380cfa9ca..c8855c2a7a48 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -82,6 +82,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) if (fp == NULL) return NULL; + kmemcheck_annotate_bitfield(fp, meta); + aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); if (aux == NULL) { vfree(fp); @@ -110,6 +112,8 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); if (fp != NULL) { + kmemcheck_annotate_bitfield(fp, meta); + memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = size / PAGE_SIZE; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 35bac8e8b071..2190ab14b763 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -553,10 +553,10 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_prog; prog->orig_prog = NULL; - prog->jited = false; + prog->jited = 0; atomic_set(&prog->aux->refcnt, 1); - prog->gpl_compatible = is_gpl; + prog->gpl_compatible = is_gpl ? 1 : 0; /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); diff --git a/net/core/filter.c b/net/core/filter.c index 60e3fe7c59c0..04664acb86ce 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1001,7 +1001,7 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, int err; fp->bpf_func = NULL; - fp->jited = false; + fp->jited = 0; err = bpf_check_classic(fp->insns, fp->len); if (err) { -- cgit v1.2.3 From e1a58320a38dfa72be48a0f1a3a92273663ba6db Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Mon, 5 Oct 2015 12:55:20 -0400 Subject: x86/mm: Warn on W^X mappings Warn on any residual W+X mappings after setting NX if DEBUG_WX is enabled. Introduce a separate X86_PTDUMP_CORE config that enables the code for dumping the page tables without enabling the debugfs interface, so that DEBUG_WX can be enabled without exposing the debugfs interface. Switch EFI_PGT_DUMP to using X86_PTDUMP_CORE so that it also does not require enabling the debugfs interface. On success it prints this to the kernel log: x86/mm: Checked W+X mappings: passed, no W+X pages found. On failure it prints a warning and a count of the failed pages: ------------[ cut here ]------------ WARNING: CPU: 1 PID: 1 at arch/x86/mm/dump_pagetables.c:226 note_page+0x610/0x7b0() x86/mm: Found insecure W+X mapping at address ffffffff81755000/__stop___ex_table+0xfa8/0xabfa8 [...] Call Trace: [] dump_stack+0x44/0x55 [] warn_slowpath_common+0x82/0xc0 [] warn_slowpath_fmt+0x5c/0x80 [] ? note_page+0x5c9/0x7b0 [] note_page+0x610/0x7b0 [] ptdump_walk_pgd_level_core+0x259/0x3c0 [] ptdump_walk_pgd_level_checkwx+0x17/0x20 [] mark_rodata_ro+0xf5/0x100 [] ? rest_init+0x80/0x80 [] kernel_init+0x1d/0xe0 [] ret_from_fork+0x3f/0x70 [] ? rest_init+0x80/0x80 ---[ end trace a1f23a1e42a2ac76 ]--- x86/mm: Checked W+X mappings: FAILED, 171 W+X pages found. Signed-off-by: Stephen Smalley Acked-by: Kees Cook Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1444064120-11450-1-git-send-email-sds@tycho.nsa.gov [ Improved the Kconfig help text and made the new option default-y if CONFIG_DEBUG_RODATA=y, because it already found buggy mappings, so we really want people to have this on by default. ] Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 36 +++++++++++++++++++++++++++++++++++- arch/x86/include/asm/pgtable.h | 7 +++++++ arch/x86/mm/Makefile | 2 +- arch/x86/mm/dump_pagetables.c | 42 +++++++++++++++++++++++++++++++++++++++++- arch/x86/mm/init_32.c | 2 ++ arch/x86/mm/init_64.c | 2 ++ 6 files changed, 88 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index d8c0d3266173..3e0baf726eef 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -65,10 +65,14 @@ config EARLY_PRINTK_EFI This is useful for kernel debugging when your machine crashes very early before the console code is initialized. +config X86_PTDUMP_CORE + def_bool n + config X86_PTDUMP bool "Export kernel pagetable layout to userspace via debugfs" depends on DEBUG_KERNEL select DEBUG_FS + select X86_PTDUMP_CORE ---help--- Say Y here if you want to show the kernel pagetable layout in a debugfs file. This information is only useful for kernel developers @@ -79,7 +83,8 @@ config X86_PTDUMP config EFI_PGT_DUMP bool "Dump the EFI pagetable" - depends on EFI && X86_PTDUMP + depends on EFI + select X86_PTDUMP_CORE ---help--- Enable this if you want to dump the EFI page table before enabling virtual mode. This can be used to debug miscellaneous @@ -105,6 +110,35 @@ config DEBUG_RODATA_TEST feature as well as for the change_page_attr() infrastructure. If in doubt, say "N" +config DEBUG_WX + bool "Warn on W+X mappings at boot" + depends on DEBUG_RODATA + default y + select X86_PTDUMP_CORE + ---help--- + Generate a warning if any W+X mappings are found at boot. + + This is useful for discovering cases where the kernel is leaving + W+X mappings after applying NX, as such mappings are a security risk. + + Look for a message in dmesg output like this: + + x86/mm: Checked W+X mappings: passed, no W+X pages found. + + or like this, if the check failed: + + x86/mm: Checked W+X mappings: FAILED, W+X pages found. + + Note that even if the check fails, your kernel is possibly + still fine, as W+X mappings are not a security hole in + themselves, what they do is that they make the exploitation + of other unfixed kernel bugs easier. + + There is no runtime or memory usage effect of this option + once the kernel has booted up - it's a one time check. + + If in doubt, say "Y". 
+ config DEBUG_SET_MODULE_RONX bool "Set loadable kernel module data as NX and text as RO" depends on MODULES diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 59fc3414c68b..c0b41f111a9a 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -19,6 +19,13 @@ #include void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); +void ptdump_walk_pgd_level_checkwx(void); + +#ifdef CONFIG_DEBUG_WX +#define debug_checkwx() ptdump_walk_pgd_level_checkwx() +#else +#define debug_checkwx() do { } while (0) +#endif /* * ZERO_PAGE is a global shared page that is always zero: used diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index a482d105172b..65c47fda26fc 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_SMP) += tlb.o obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o +obj-$(CONFIG_X86_PTDUMP_CORE) += dump_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 71ab2d741024..1bf417e9cc13 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -32,6 +32,8 @@ struct pg_state { const struct addr_marker *marker; unsigned long lines; bool to_dmesg; + bool check_wx; + unsigned long wx_pages; }; struct addr_marker { @@ -214,6 +216,16 @@ static void note_page(struct seq_file *m, struct pg_state *st, const char *unit = units; unsigned long delta; int width = sizeof(unsigned long) * 2; + pgprotval_t pr = pgprot_val(st->current_prot); + + if (st->check_wx && (pr & _PAGE_RW) && !(pr & _PAGE_NX)) { + WARN_ONCE(1, + "x86/mm: Found insecure W+X mapping at address %p/%pS\n", + (void *)st->start_address, + (void *)st->start_address); + st->wx_pages += (st->current_address - + st->start_address) / PAGE_SIZE; + } /* * Now print the actual finished series @@ -346,7 +358,8 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, #define pgd_none(a) pud_none(__pud(pgd_val(a))) #endif -void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) +static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, + bool checkwx) { #ifdef CONFIG_X86_64 pgd_t *start = (pgd_t *) &init_level4_pgt; @@ -362,6 +375,10 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) st.to_dmesg = true; } + st.check_wx = checkwx; + if (checkwx) + st.wx_pages = 0; + for (i = 0; i < PTRS_PER_PGD; i++) { st.current_address = normalize_addr(i * PGD_LEVEL_MULT); if (!pgd_none(*start)) { @@ -381,8 +398,26 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) /* Flush out the last page */ st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT); note_page(m, &st, __pgprot(0), 0); + if (!checkwx) + return; + if (st.wx_pages) + pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n", + st.wx_pages); + else + pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n"); +} + +void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) +{ + ptdump_walk_pgd_level_core(m, pgd, false); } +void ptdump_walk_pgd_level_checkwx(void) +{ + ptdump_walk_pgd_level_core(NULL, NULL, true); +} + +#ifdef CONFIG_X86_PTDUMP static int ptdump_show(struct seq_file *m, void *v) { ptdump_walk_pgd_level(m, NULL); @@ -400,10 +435,13 @@ static const struct file_operations ptdump_fops = { .llseek = seq_lseek, .release = single_release, }; +#endif static int pt_dump_init(void) { +#ifdef CONFIG_X86_PTDUMP 
struct dentry *pe; +#endif #ifdef CONFIG_X86_32 /* Not a compile-time constant on x86-32 */ @@ -415,10 +453,12 @@ static int pt_dump_init(void) address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; #endif +#ifdef CONFIG_X86_PTDUMP pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, &ptdump_fops); if (!pe) return -ENOMEM; +#endif return 0; } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 7562f42914b4..cb4ef3de61f9 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -957,6 +957,8 @@ void mark_rodata_ro(void) set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); #endif mark_nxdata_nx(); + if (__supported_pte_mask & _PAGE_NX) + debug_checkwx(); } #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 30564e2752d3..f8b157366700 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1150,6 +1150,8 @@ void mark_rodata_ro(void) free_init_pages("unused kernel", (unsigned long) __va(__pa_symbol(rodata_end)), (unsigned long) __va(__pa_symbol(_sdata))); + + debug_checkwx(); } #endif -- cgit v1.2.3 From 98dd166ea3a3c3b57919e20d9b0d1237fcd0349d Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 7 Sep 2015 17:14:08 +0100 Subject: x86/xen/p2m: hint at the last populated P2M entry With commit 633d6f17cd91ad5bf2370265946f716e42d388c6 (x86/xen: prepare p2m list for memory hotplug) the P2M may be sized to accommodate a much larger amount of memory than the domain currently has. When saving a domain, the toolstack must scan the whole P2M looking for populated pages. This results in a performance regression due to the unnecessary scanning. Instead of reporting (via shared_info) the maximum possible size of the P2M, hint at the last PFN which might be populated. This hint is increased as new leaves are added to the P2M (in the expectation that they will be used for populated entries). Signed-off-by: David Vrabel Cc: # 4.0+ --- arch/x86/xen/p2m.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index bfc08b13044b..660b3cfef234 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -112,6 +112,15 @@ static unsigned long *p2m_identity; static pte_t *p2m_missing_pte; static pte_t *p2m_identity_pte; +/* + * Hint at last populated PFN. + * + * Used to set HYPERVISOR_shared_info->arch.max_pfn so the toolstack + * can avoid scanning the whole P2M (which may be sized to account for + * hotplugged memory). + */ +static unsigned long xen_p2m_last_pfn; + static inline unsigned p2m_top_index(unsigned long pfn) { BUG_ON(pfn >= MAX_P2M_PFN); @@ -270,7 +279,7 @@ void xen_setup_mfn_list_list(void) else HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = virt_to_mfn(p2m_top_mfn); - HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; + HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn; HYPERVISOR_shared_info->arch.p2m_generation = 0; HYPERVISOR_shared_info->arch.p2m_vaddr = (unsigned long)xen_p2m_addr; HYPERVISOR_shared_info->arch.p2m_cr3 = @@ -406,6 +415,8 @@ void __init xen_vmalloc_p2m_tree(void) static struct vm_struct vm; unsigned long p2m_limit; + xen_p2m_last_pfn = xen_max_p2m_pfn; + p2m_limit = (phys_addr_t)P2M_LIMIT * 1024 * 1024 * 1024 / PAGE_SIZE; vm.flags = VM_ALLOC; vm.size = ALIGN(sizeof(unsigned long) * max(xen_max_p2m_pfn, p2m_limit), @@ -608,6 +619,12 @@ static bool alloc_p2m(unsigned long pfn) free_p2m_page(p2m); } + /* Expanded the p2m?
*/ + if (pfn > xen_p2m_last_pfn) { + xen_p2m_last_pfn = pfn; + HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn; + } + return true; } -- cgit v1.2.3 From 609ca066386b2e64d4c0b0f55da327654962a0c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 17:52:18 +0200 Subject: sched/core: Create preempt_count invariant Assuming units of PREEMPT_DISABLE_OFFSET for preempt_count() numbers. Now that TASK_DEAD no longer results in preempt_count() == 3 during scheduling, we will always call context_switch() with preempt_count() == 2. However, we don't always end up with preempt_count() == 2 in finish_task_switch() because new tasks get created with preempt_count() == 1. Create FORK_PREEMPT_COUNT and set it to 2 and use that in the right places. Note that we cannot use INIT_PREEMPT_COUNT as that serves another purpose (boot). After this, preempt_count() is invariant across the context switch, with exception of PREEMPT_ACTIVE. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/preempt.h | 2 +- include/asm-generic/preempt.h | 2 +- include/linux/sched.h | 17 ++++++++++++----- kernel/sched/core.c | 23 +++++++++++++++++++++-- 4 files changed, 35 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index b12f81022a6b..01e700d392cb 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -31,7 +31,7 @@ static __always_inline void preempt_count_set(int pc) * must be macros to avoid header recursion hell */ #define init_task_preempt_count(p) do { \ - task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \ + task_thread_info(p)->saved_preempt_count = FORK_PREEMPT_COUNT; \ } while (0) #define init_idle_preempt_count(p, cpu) do { \ diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h index 0bec580a4885..5d8ffa3e6f8c 100644 --- a/include/asm-generic/preempt.h +++ b/include/asm-generic/preempt.h @@ -24,7 +24,7 @@ static __always_inline void preempt_count_set(int pc) * must be macros to avoid header recursion hell */ #define init_task_preempt_count(p) do { \ - task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \ + task_thread_info(p)->preempt_count = FORK_PREEMPT_COUNT; \ } while (0) #define init_idle_preempt_count(p, cpu) do { \ diff --git a/include/linux/sched.h b/include/linux/sched.h index e5b8cbc4b8d6..23ca455d9582 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -599,11 +599,7 @@ struct task_cputime_atomic { .sum_exec_runtime = ATOMIC64_INIT(0), \ } -#ifdef CONFIG_PREEMPT_COUNT -#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED) -#else -#define PREEMPT_DISABLED PREEMPT_ENABLED -#endif +#define PREEMPT_DISABLED (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) /* * Disable preemption until the scheduler is running -- use an unconditional @@ -613,6 +609,17 @@ struct task_cputime_atomic { */ #define INIT_PREEMPT_COUNT PREEMPT_OFFSET +/* + * Initial preempt_count value; reflects the preempt_count schedule invariant + * which states that during context switches: + * + * preempt_count() == 2*PREEMPT_DISABLE_OFFSET + * + * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels. + * Note: See finish_task_switch(). 
+ */ +#define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) + /** * struct thread_group_cputimer - thread group interval timer counts * @cputime_atomic: atomic thread group interval timers. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 530fe8baa645..8d8722b84dee 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2504,6 +2504,18 @@ static struct rq *finish_task_switch(struct task_struct *prev) struct mm_struct *mm = rq->prev_mm; long prev_state; + /* + * The previous task will have left us with a preempt_count of 2 + * because it left us after: + * + * schedule() + * preempt_disable(); // 1 + * __schedule() + * raw_spin_lock_irq(&rq->lock) // 2 + * + * Also, see FORK_PREEMPT_COUNT. + */ + rq->prev_mm = NULL; /* @@ -2588,8 +2600,15 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) { struct rq *rq; - /* finish_task_switch() drops rq->lock and enables preemtion */ - preempt_disable(); + /* + * New tasks start with FORK_PREEMPT_COUNT, see there and + * finish_task_switch() for details. + * + * finish_task_switch() will drop rq->lock() and lower preempt_count + * and the preempt_enable() will end up enabling preemption (on + * PREEMPT_COUNT kernels). + */ + rq = finish_task_switch(prev); balance_callback(rq); preempt_enable(); -- cgit v1.2.3 From d87b7a33794f52226131f93cbc9db03274d9fecf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 18:11:18 +0200 Subject: sched/core, sched/x86: Kill thread_info::saved_preempt_count With the introduction of the context switch preempt_count invariant, and the demise of PREEMPT_ACTIVE, it's pointless to save/restore the per-cpu preemption count; it must always be 2. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/preempt.h | 5 +---- arch/x86/include/asm/thread_info.h | 2 -- arch/x86/kernel/process_32.c | 8 -------- arch/x86/kernel/process_64.c | 8 -------- 4 files changed, 1 insertion(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 01e700d392cb..01bcde84d3e4 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -30,12 +30,9 @@ static __always_inline void preempt_count_set(int pc) /* * must be macros to avoid header recursion hell */ -#define init_task_preempt_count(p) do { \ - task_thread_info(p)->saved_preempt_count = FORK_PREEMPT_COUNT; \ -} while (0) +#define init_task_preempt_count(p) do { } while (0) #define init_idle_preempt_count(p, cpu) do { \ - task_thread_info(p)->saved_preempt_count = PREEMPT_ENABLED; \ per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \ } while (0) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 8afdc3e44247..809877e9030b 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -57,7 +57,6 @@ struct thread_info { __u32 flags; /* low level flags */ __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ - int saved_preempt_count; mm_segment_t addr_limit; void __user *sysenter_return; unsigned int sig_on_uaccess_error:1; @@ -69,7 +68,6 @@ struct thread_info { .task = &tsk, \ .flags = 0, \ .cpu = 0, \ - .saved_preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index
737527b40e5b..9f950917528b 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -279,14 +279,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl)) set_iopl_mask(next->iopl); - /* - * If it were not for PREEMPT_ACTIVE we could guarantee that the - * preempt_count of all tasks was equal here and this would not be - * needed. - */ - task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); - this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); - /* * Now maybe handle debug registers and/or IO bitmaps */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b35921a670b2..d7f1d5c4387d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -401,14 +401,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ this_cpu_write(current_task, next_p); - /* - * If it were not for PREEMPT_ACTIVE we could guarantee that the - * preempt_count of all tasks was equal here and this would not be - * needed. - */ - task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); - this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); - /* Reload esp0 and ss1. This changes current_thread_info(). */ load_sp0(tss, next); -- cgit v1.2.3 From 7ce1346a6842550a3c4c453cdf1c7b81fb60b07e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 28 Sep 2015 08:30:04 -0400 Subject: perf/x86: Add Intel cstate PMUs support This patch adds new PMUs to support cstate related free running (read-only) counters. These counters may be used simultaneously by other tools, such as turbostat. However, it still makes sense to implement them in perf, because we can conveniently collect them together with other events, and they can be used from tools without special MSR access code. These counters include CORE_C*_RESIDENCY and PKG_C*_RESIDENCY. According to counters' scope and category, two PMUs are registered with the perf_event core subsystem. - 'cstate_core': The counter is available for each physical core. The counters include CORE_C*_RESIDENCY. - 'cstate_pkg': The counter is available for each physical package. The counters include PKG_C*_RESIDENCY. The events are exposed in sysfs for use by perf stat and other tools. The files are: /sys/devices/cstate_core/events/c*-residency /sys/devices/cstate_pkg/events/c*-residency These events only support system-wide mode counting. The /sys/devices/cstate_*/cpumask file can be used by tools to figure out which CPUs to monitor by default. The PMU type (attr->type) is dynamically allocated and is available from /sys/devices/core_misc/type and /sys/devices/cstate_*/type. Sampling is not supported. Here is an example. - To calculate the fraction of time when the core is running in C6 state CORE_C6_time% = CORE_C6_RESIDENCY / TSC # perf stat -x, -e"cstate_core/c6-residency/,msr/tsc/" -C0 -- taskset -c 0 sleep 5 11838820015,,cstate_core/c6-residency/,5175919658,100.00 11877130740,,msr/tsc/,5175922010,100.00 For sleep, 99.7% of time we ran in C6 state. # perf stat -x, -e"cstate_core/c6-residency/,msr/tsc/" -C0 -- taskset -c 0 busyloop 1253316,,cstate_core/c6-residency/,4360969154,100.00 10012635248,,msr/tsc/,4360972366,100.00 For busyloop, 0.01% of time we ran in C6 state.
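The percentages above follow directly from the counter deltas; as a quick cross-check, here is a stand-alone C sketch using the raw numbers from the 'sleep 5' run (illustrative only, not part of the patch):

	#include <stdio.h>

	int main(void)
	{
		/* Column 1 of the perf stat -x, output above */
		unsigned long long c6_residency = 11838820015ULL;
		unsigned long long tsc          = 11877130740ULL;

		/* CORE_C6_time% = CORE_C6_RESIDENCY / TSC */
		printf("C6 residency: %.1f%%\n",
		       100.0 * (double)c6_residency / (double)tsc);
		return 0;
	}

which prints "C6 residency: 99.7%", matching the sleep case; plugging in the busyloop numbers (1253316 / 10012635248) gives the quoted 0.01%.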
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Arjan van de Ven Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: eranian@google.com Link: http://lkml.kernel.org/r/1443443404-8581-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/perf_event_intel_cstate.c | 694 ++++++++++++++++++++++++++ 2 files changed, 695 insertions(+) create mode 100644 arch/x86/kernel/cpu/perf_event_intel_cstate.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4eb065c6bed2..58031303e304 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o perf_event_intel_bts.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_cstate.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ perf_event_intel_uncore_snb.o \ diff --git a/arch/x86/kernel/cpu/perf_event_intel_cstate.c b/arch/x86/kernel/cpu/perf_event_intel_cstate.c new file mode 100644 index 000000000000..75a38b5a2e26 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_cstate.c @@ -0,0 +1,694 @@ +/* + * perf_event_intel_cstate.c: support cstate residency counters + * + * Copyright (C) 2015, Intel Corp. + * Author: Kan Liang (kan.liang@intel.com) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + */ + +/* + * This file exports cstate-related free-running (read-only) counters + * for perf. These counters may be used simultaneously by other tools, + * such as turbostat. However, it still makes sense to implement them + * in perf, because we can conveniently collect them together with + * other events, and they can be used from tools without special MSR + * access code. + * + * The events only support system-wide mode counting. There is no + * sampling support because it is not supported by the hardware. + * + * According to counters' scope and category, two PMUs are registered + * with the perf_event core subsystem. + * - 'cstate_core': The counter is available for each physical core. + * The counters include CORE_C*_RESIDENCY. + * - 'cstate_pkg': The counter is available for each physical package. + * The counters include PKG_C*_RESIDENCY. + * + * All of these counters are specified in the Intel® 64 and IA-32 + * Architectures Software Developer's Manual Vol3b.
+ * + * Model specific counters: + * MSR_CORE_C1_RES: CORE C1 Residency Counter + * perf code: 0x00 + * Available model: SLM,AMT + * Scope: Core (each processor core has a MSR) + * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter + * perf code: 0x01 + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Core + * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter + * perf code: 0x02 + * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Core + * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter + * perf code: 0x03 + * Available model: SNB,IVB,HSW,BDW,SKL + * Scope: Core + * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. + * perf code: 0x00 + * Available model: SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. + * perf code: 0x01 + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. + * perf code: 0x02 + * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. + * perf code: 0x03 + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter. + * perf code: 0x04 + * Available model: HSW ULT only + * Scope: Package (physical package) + * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter. + * perf code: 0x05 + * Available model: HSW ULT only + * Scope: Package (physical package) + * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. + * perf code: 0x06 + * Available model: HSW ULT only + * Scope: Package (physical package) + * + */ + +#include +#include +#include +#include +#include "perf_event.h" + +#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __cstate_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __cstate_##_var##_show, NULL) + +static ssize_t cstate_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, + char *buf); + +struct perf_cstate_msr { + u64 msr; + struct perf_pmu_events_attr *attr; + bool (*test)(int idx); +}; + + +/* cstate_core PMU */ + +static struct pmu cstate_core_pmu; +static bool has_cstate_core; + +enum perf_cstate_core_id { + /* + * cstate_core events + */ + PERF_CSTATE_CORE_C1_RES = 0, + PERF_CSTATE_CORE_C3_RES, + PERF_CSTATE_CORE_C6_RES, + PERF_CSTATE_CORE_C7_RES, + + PERF_CSTATE_CORE_EVENT_MAX, +}; + +bool test_core(int idx) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6) + return false; + + switch (boot_cpu_data.x86_model) { + case 30: /* 45nm Nehalem */ + case 26: /* 45nm Nehalem-EP */ + case 46: /* 45nm Nehalem-EX */ + + case 37: /* 32nm Westmere */ + case 44: /* 32nm Westmere-EP */ + case 47: /* 32nm Westmere-EX */ + if (idx == PERF_CSTATE_CORE_C3_RES || + idx == PERF_CSTATE_CORE_C6_RES) + return true; + break; + case 42: /* 32nm SandyBridge */ + case 45: /* 32nm SandyBridge-E/EN/EP */ + + case 58: /* 22nm IvyBridge */ + case 62: /* 22nm IvyBridge-EP/EX */ + + case 60: /* 22nm Haswell Core */ + case 63: /* 22nm Haswell Server */ + case 69: /* 22nm Haswell ULT */ + case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + + case 61: /* 14nm Broadwell Core-M */ + case 86: /* 14nm Broadwell Xeon D */ + case 71: /* 14nm Broadwell + 
GT3e (Intel Iris Pro graphics) */ + case 79: /* 14nm Broadwell Server */ + + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + if (idx == PERF_CSTATE_CORE_C3_RES || + idx == PERF_CSTATE_CORE_C6_RES || + idx == PERF_CSTATE_CORE_C7_RES) + return true; + break; + case 55: /* 22nm Atom "Silvermont" */ + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ + case 76: /* 14nm Atom "Airmont" */ + if (idx == PERF_CSTATE_CORE_C1_RES || + idx == PERF_CSTATE_CORE_C6_RES) + return true; + break; + } + + return false; +} + +PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00"); +PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01"); +PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02"); +PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03"); + +static struct perf_cstate_msr core_msr[] = { + [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1, test_core, }, + [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3, test_core, }, + [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6, test_core, }, + [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7, test_core, }, +}; + +static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = { + NULL, +}; + +static struct attribute_group core_events_attr_group = { + .name = "events", + .attrs = core_events_attrs, +}; + +DEFINE_CSTATE_FORMAT_ATTR(core_event, event, "config:0-63"); +static struct attribute *core_format_attrs[] = { + &format_attr_core_event.attr, + NULL, +}; + +static struct attribute_group core_format_attr_group = { + .name = "format", + .attrs = core_format_attrs, +}; + +static cpumask_t cstate_core_cpu_mask; +static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL); + +static struct attribute *cstate_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group cpumask_attr_group = { + .attrs = cstate_cpumask_attrs, +}; + +static const struct attribute_group *core_attr_groups[] = { + &core_events_attr_group, + &core_format_attr_group, + &cpumask_attr_group, + NULL, +}; + +/* cstate_core PMU end */ + + +/* cstate_pkg PMU */ + +static struct pmu cstate_pkg_pmu; +static bool has_cstate_pkg; + +enum perf_cstate_pkg_id { + /* + * cstate_pkg events + */ + PERF_CSTATE_PKG_C2_RES = 0, + PERF_CSTATE_PKG_C3_RES, + PERF_CSTATE_PKG_C6_RES, + PERF_CSTATE_PKG_C7_RES, + PERF_CSTATE_PKG_C8_RES, + PERF_CSTATE_PKG_C9_RES, + PERF_CSTATE_PKG_C10_RES, + + PERF_CSTATE_PKG_EVENT_MAX, +}; + +bool test_pkg(int idx) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6) + return false; + + switch (boot_cpu_data.x86_model) { + case 30: /* 45nm Nehalem */ + case 26: /* 45nm Nehalem-EP */ + case 46: /* 45nm Nehalem-EX */ + + case 37: /* 32nm Westmere */ + case 44: /* 32nm Westmere-EP */ + case 47: /* 32nm Westmere-EX */ + if (idx == PERF_CSTATE_CORE_C3_RES || + idx == PERF_CSTATE_CORE_C6_RES || + idx == PERF_CSTATE_CORE_C7_RES) + return true; + break; + case 42: /* 32nm SandyBridge */ + case 45: /* 32nm SandyBridge-E/EN/EP */ + + case 58: /* 22nm IvyBridge */ + case 62: /* 22nm IvyBridge-EP/EX */ + + case 60: /* 22nm Haswell Core */ + case 63: /* 22nm Haswell Server */ + case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + + case 61: /* 14nm Broadwell Core-M */ + case 86: /* 14nm Broadwell Xeon D */ + case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ + case 79: /* 14nm Broadwell Server 
*/ + + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + if (idx == PERF_CSTATE_PKG_C2_RES || + idx == PERF_CSTATE_PKG_C3_RES || + idx == PERF_CSTATE_PKG_C6_RES || + idx == PERF_CSTATE_PKG_C7_RES) + return true; + break; + case 55: /* 22nm Atom "Silvermont" */ + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ + case 76: /* 14nm Atom "Airmont" */ + if (idx == PERF_CSTATE_CORE_C6_RES) + return true; + break; + case 69: /* 22nm Haswell ULT */ + if (idx == PERF_CSTATE_PKG_C2_RES || + idx == PERF_CSTATE_PKG_C3_RES || + idx == PERF_CSTATE_PKG_C6_RES || + idx == PERF_CSTATE_PKG_C7_RES || + idx == PERF_CSTATE_PKG_C8_RES || + idx == PERF_CSTATE_PKG_C9_RES || + idx == PERF_CSTATE_PKG_C10_RES) + return true; + break; + } + + return false; +} + +PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00"); +PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01"); +PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02"); +PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_pkg_c7, "event=0x03"); +PMU_EVENT_ATTR_STRING(c8-residency, evattr_cstate_pkg_c8, "event=0x04"); +PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05"); +PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06"); + +static struct perf_cstate_msr pkg_msr[] = { + [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2, test_pkg, }, + [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3, test_pkg, }, + [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &evattr_cstate_pkg_c6, test_pkg, }, + [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7, test_pkg, }, + [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8, test_pkg, }, + [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9, test_pkg, }, + [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10, test_pkg, }, +}; + +static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = { + NULL, +}; + +static struct attribute_group pkg_events_attr_group = { + .name = "events", + .attrs = pkg_events_attrs, +}; + +DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63"); +static struct attribute *pkg_format_attrs[] = { + &format_attr_pkg_event.attr, + NULL, +}; +static struct attribute_group pkg_format_attr_group = { + .name = "format", + .attrs = pkg_format_attrs, +}; + +static cpumask_t cstate_pkg_cpu_mask; + +static const struct attribute_group *pkg_attr_groups[] = { + &pkg_events_attr_group, + &pkg_format_attr_group, + &cpumask_attr_group, + NULL, +}; + +/* cstate_pkg PMU end*/ + +static ssize_t cstate_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + + if (pmu == &cstate_core_pmu) + return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask); + else if (pmu == &cstate_pkg_pmu) + return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask); + else + return 0; +} + +static int cstate_pmu_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config; + int ret = 0; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + if (event->pmu == &cstate_core_pmu) { + if (cfg >= 
PERF_CSTATE_CORE_EVENT_MAX) + return -EINVAL; + if (!core_msr[cfg].attr) + return -EINVAL; + event->hw.event_base = core_msr[cfg].msr; + } else if (event->pmu == &cstate_pkg_pmu) { + if (cfg >= PERF_CSTATE_PKG_EVENT_MAX) + return -EINVAL; + if (!pkg_msr[cfg].attr) + return -EINVAL; + event->hw.event_base = pkg_msr[cfg].msr; + } else + return -ENOENT; + + /* must be done before validate_group */ + event->hw.config = cfg; + event->hw.idx = -1; + + return ret; +} + +static inline u64 cstate_pmu_read_counter(struct perf_event *event) +{ + u64 val; + + rdmsrl(event->hw.event_base, val); + return val; +} + +static void cstate_pmu_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev_raw_count, new_raw_count; + +again: + prev_raw_count = local64_read(&hwc->prev_count); + new_raw_count = cstate_pmu_read_counter(event); + + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + local64_add(new_raw_count - prev_raw_count, &event->count); +} + +static void cstate_pmu_event_start(struct perf_event *event, int mode) +{ + local64_set(&event->hw.prev_count, cstate_pmu_read_counter(event)); +} + +static void cstate_pmu_event_stop(struct perf_event *event, int mode) +{ + cstate_pmu_event_update(event); +} + +static void cstate_pmu_event_del(struct perf_event *event, int mode) +{ + cstate_pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int cstate_pmu_event_add(struct perf_event *event, int mode) +{ + if (mode & PERF_EF_START) + cstate_pmu_event_start(event, mode); + + return 0; +} + +static void cstate_cpu_exit(int cpu) +{ + int i, id, target; + + /* cpu exit for cstate core */ + if (has_cstate_core) { + id = topology_core_id(cpu); + target = -1; + + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (id == topology_core_id(i)) { + target = i; + break; + } + } + if (cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask) && target >= 0) + cpumask_set_cpu(target, &cstate_core_cpu_mask); + WARN_ON(cpumask_empty(&cstate_core_cpu_mask)); + if (target >= 0) + perf_pmu_migrate_context(&cstate_core_pmu, cpu, target); + } + + /* cpu exit for cstate pkg */ + if (has_cstate_pkg) { + id = topology_physical_package_id(cpu); + target = -1; + + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (id == topology_physical_package_id(i)) { + target = i; + break; + } + } + if (cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask) && target >= 0) + cpumask_set_cpu(target, &cstate_pkg_cpu_mask); + WARN_ON(cpumask_empty(&cstate_pkg_cpu_mask)); + if (target >= 0) + perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target); + } +} + +static void cstate_cpu_init(int cpu) +{ + int i, id; + + /* cpu init for cstate core */ + if (has_cstate_core) { + id = topology_core_id(cpu); + for_each_cpu(i, &cstate_core_cpu_mask) { + if (id == topology_core_id(i)) + break; + } + if (i >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cstate_core_cpu_mask); + } + + /* cpu init for cstate pkg */ + if (has_cstate_pkg) { + id = topology_physical_package_id(cpu); + for_each_cpu(i, &cstate_pkg_cpu_mask) { + if (id == topology_physical_package_id(i)) + break; + } + if (i >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); + } +} + +static int cstate_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + break; + case CPU_STARTING: + cstate_cpu_init(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + break; + case 
CPU_ONLINE: + case CPU_DEAD: + break; + case CPU_DOWN_PREPARE: + cstate_cpu_exit(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +/* + * Probe the cstate events and insert the available ones into the sysfs attrs. + * Return false if there are no available events. + */ +static bool cstate_probe_msr(struct perf_cstate_msr *msr, + struct attribute **events_attrs, + int max_event_nr) +{ + int i, j = 0; + u64 val; + + /* Probe the cstate events. */ + for (i = 0; i < max_event_nr; i++) { + if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) + msr[i].attr = NULL; + } + + /* List remaining events in the sysfs attrs. */ + for (i = 0; i < max_event_nr; i++) { + if (msr[i].attr) + events_attrs[j++] = &msr[i].attr->attr.attr; + } + events_attrs[j] = NULL; + + return (j > 0) ? true : false; +} + +static int __init cstate_init(void) +{ + /* SLM has a different MSR for PKG C6 */ + switch (boot_cpu_data.x86_model) { + case 55: + case 76: + case 77: + pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY; + } + + if (cstate_probe_msr(core_msr, core_events_attrs, PERF_CSTATE_CORE_EVENT_MAX)) + has_cstate_core = true; + + if (cstate_probe_msr(pkg_msr, pkg_events_attrs, PERF_CSTATE_PKG_EVENT_MAX)) + has_cstate_pkg = true; + + return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV; +} + +static void __init cstate_cpumask_init(void) +{ + int cpu; + + cpu_notifier_register_begin(); + + for_each_online_cpu(cpu) + cstate_cpu_init(cpu); + + __perf_cpu_notifier(cstate_cpu_notifier); + + cpu_notifier_register_done(); +} + +static struct pmu cstate_core_pmu = { + .attr_groups = core_attr_groups, + .name = "cstate_core", + .task_ctx_nr = perf_invalid_context, + .event_init = cstate_pmu_event_init, + .add = cstate_pmu_event_add, /* must have */ + .del = cstate_pmu_event_del, /* must have */ + .start = cstate_pmu_event_start, + .stop = cstate_pmu_event_stop, + .read = cstate_pmu_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, +}; + +static struct pmu cstate_pkg_pmu = { + .attr_groups = pkg_attr_groups, + .name = "cstate_pkg", + .task_ctx_nr = perf_invalid_context, + .event_init = cstate_pmu_event_init, + .add = cstate_pmu_event_add, /* must have */ + .del = cstate_pmu_event_del, /* must have */ + .start = cstate_pmu_event_start, + .stop = cstate_pmu_event_stop, + .read = cstate_pmu_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, +}; + +static void __init cstate_pmus_register(void) +{ + int err; + + if (has_cstate_core) { + err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1); + if (WARN_ON(err)) + pr_info("Failed to register PMU %s error %d\n", + cstate_core_pmu.name, err); + } + + if (has_cstate_pkg) { + err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1); + if (WARN_ON(err)) + pr_info("Failed to register PMU %s error %d\n", + cstate_pkg_pmu.name, err); + } +} + +static int __init cstate_pmu_init(void) +{ + int err; + + if (cpu_has_hypervisor) + return -ENODEV; + + err = cstate_init(); + if (err) + return err; + + cstate_cpumask_init(); + + cstate_pmus_register(); + + return 0; +} + +device_initcall(cstate_pmu_init); -- cgit v1.2.3 From 712df65ccb63da08a484bf57c40b250dfd4103a7 Mon Sep 17 00:00:00 2001 From: Taku Izumi Date: Thu, 24 Sep 2015 21:10:21 +0900 Subject: perf/x86/intel/uncore: Fix multi-segment problem of perf_event_intel_uncore In a multi-segment system, uncore devices may belong to buses whose segment number is other than 0: .... 0000:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 Scratchpad & Semaphore Registers (rev 03) ... 
0001:7f:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 Scratchpad & Semaphore Registers (rev 03) ... 0001:bf:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 Scratchpad & Semaphore Registers (rev 03) ... 0001:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 Scratchpad & Semaphore Registers (rev 03) ... In that case, the relation between bus number and physical id may be broken because "uncore_pcibus_to_physid" doesn't take the PCI segment into account. For example, buses 0000:ff and 0001:ff use the same entry of the "uncore_pcibus_to_physid" array. This patch fixes this problem by introducing the segment-aware pci2phy_map instead. Signed-off-by: Taku Izumi Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1443096621-4119-1-git-send-email-izumi.taku@jp.fujitsu.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 61 ++++++++++++++++++++-- arch/x86/kernel/cpu/perf_event_intel_uncore.h | 12 ++++- arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 16 ++++-- .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 32 +++++++++--- 4 files changed, 106 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 560e5255b15e..61215a69b03d 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -7,7 +7,8 @@ struct intel_uncore_type **uncore_pci_uncores = empty_uncore; static bool pcidrv_registered; struct pci_driver *uncore_pci_driver; /* pci bus to socket mapping */ -int uncore_pcibus_to_physid[256] = { [0 ... 
255] = -1, }; +DEFINE_RAW_SPINLOCK(pci2phy_map_lock); +struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; static DEFINE_RAW_SPINLOCK(uncore_box_lock); @@ -20,6 +21,59 @@ static struct event_constraint uncore_constraint_fixed = struct event_constraint uncore_constraint_empty = EVENT_CONSTRAINT(0, 0, 0); +int uncore_pcibus_to_physid(struct pci_bus *bus) +{ + struct pci2phy_map *map; + int phys_id = -1; + + raw_spin_lock(&pci2phy_map_lock); + list_for_each_entry(map, &pci2phy_map_head, list) { + if (map->segment == pci_domain_nr(bus)) { + phys_id = map->pbus_to_physid[bus->number]; + break; + } + } + raw_spin_unlock(&pci2phy_map_lock); + + return phys_id; +} + +struct pci2phy_map *__find_pci2phy_map(int segment) +{ + struct pci2phy_map *map, *alloc = NULL; + int i; + + lockdep_assert_held(&pci2phy_map_lock); + +lookup: + list_for_each_entry(map, &pci2phy_map_head, list) { + if (map->segment == segment) + goto end; + } + + if (!alloc) { + raw_spin_unlock(&pci2phy_map_lock); + alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL); + raw_spin_lock(&pci2phy_map_lock); + + if (!alloc) + return NULL; + + goto lookup; + } + + map = alloc; + alloc = NULL; + map->segment = segment; + for (i = 0; i < 256; i++) + map->pbus_to_physid[i] = -1; + list_add_tail(&map->list, &pci2phy_map_head); + +end: + kfree(alloc); + return map; +} + ssize_t uncore_event_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -809,7 +863,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id int phys_id; bool first_box = false; - phys_id = uncore_pcibus_to_physid[pdev->bus->number]; + phys_id = uncore_pcibus_to_physid(pdev->bus); if (phys_id < 0) return -ENODEV; @@ -856,9 +910,10 @@ static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box = pci_get_drvdata(pdev); struct intel_uncore_pmu *pmu; - int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number]; + int i, cpu, phys_id; bool last_box = false; + phys_id = uncore_pcibus_to_physid(pdev->bus); box = pci_get_drvdata(pdev); if (!box) { for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 72c54c2e5b1a..2f0a4a98e16b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -117,6 +117,15 @@ struct uncore_event_desc { const char *config; }; +struct pci2phy_map { + struct list_head list; + int segment; + int pbus_to_physid[256]; +}; + +int uncore_pcibus_to_physid(struct pci_bus *bus); +struct pci2phy_map *__find_pci2phy_map(int segment); + ssize_t uncore_event_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); @@ -317,7 +326,8 @@ u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx); extern struct intel_uncore_type **uncore_msr_uncores; extern struct intel_uncore_type **uncore_pci_uncores; extern struct pci_driver *uncore_pci_driver; -extern int uncore_pcibus_to_physid[256]; +extern raw_spinlock_t pci2phy_map_lock; +extern struct list_head pci2phy_map_head; extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; extern struct event_constraint uncore_constraint_empty; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c index f78574b3cb55..845256158a10 100644 --- 
a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c @@ -420,15 +420,25 @@ static void snb_uncore_imc_event_del(struct perf_event *event, int flags) static int snb_pci2phy_map_init(int devid) { struct pci_dev *dev = NULL; - int bus; + struct pci2phy_map *map; + int bus, segment; dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev); if (!dev) return -ENOTTY; bus = dev->bus->number; - - uncore_pcibus_to_physid[bus] = 0; + segment = pci_domain_nr(dev->bus); + + raw_spin_lock(&pci2phy_map_lock); + map = __find_pci2phy_map(segment); + if (!map) { + raw_spin_unlock(&pci2phy_map_lock); + pci_dev_put(dev); + return -ENOMEM; + } + map->pbus_to_physid[bus] = 0; + raw_spin_unlock(&pci2phy_map_lock); pci_dev_put(dev); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index 8cbb3f386b07..f0f4fcba252e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -1087,7 +1087,8 @@ static struct pci_driver snbep_uncore_pci_driver = { static int snbep_pci2phy_map_init(int devid) { struct pci_dev *ubox_dev = NULL; - int i, bus, nodeid; + int i, bus, nodeid, segment; + struct pci2phy_map *map; int err = 0; u32 config = 0; @@ -1106,16 +1107,27 @@ static int snbep_pci2phy_map_init(int devid) err = pci_read_config_dword(ubox_dev, 0x54, &config); if (err) break; + + segment = pci_domain_nr(ubox_dev->bus); + raw_spin_lock(&pci2phy_map_lock); + map = __find_pci2phy_map(segment); + if (!map) { + raw_spin_unlock(&pci2phy_map_lock); + err = -ENOMEM; + break; + } + /* * every three bits in the Node ID mapping register maps * to a particular node. */ for (i = 0; i < 8; i++) { if (nodeid == ((config >> (3 * i)) & 0x7)) { - uncore_pcibus_to_physid[bus] = i; + map->pbus_to_physid[bus] = i; break; } } + raw_spin_unlock(&pci2phy_map_lock); } if (!err) { @@ -1123,13 +1135,17 @@ static int snbep_pci2phy_map_init(int devid) * For PCI bus with no UBOX device, find the next bus * that has UBOX device and use its mapping. */ - i = -1; - for (bus = 255; bus >= 0; bus--) { - if (uncore_pcibus_to_physid[bus] >= 0) - i = uncore_pcibus_to_physid[bus]; - else - uncore_pcibus_to_physid[bus] = i; + raw_spin_lock(&pci2phy_map_lock); + list_for_each_entry(map, &pci2phy_map_head, list) { + i = -1; + for (bus = 255; bus >= 0; bus--) { + if (map->pbus_to_physid[bus] >= 0) + i = map->pbus_to_physid[bus]; + else + map->pbus_to_physid[bus] = i; + } } + raw_spin_unlock(&pci2phy_map_lock); } pci_dev_put(ubox_dev); -- cgit v1.2.3 From a76cf66e948afbaeda8e3ecc861f29c47a026c27 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:47:49 -0700 Subject: x86/uaccess: Tell the compiler that uaccess is unlikely to fault GCC doesn't realize that get_user(), put_user(), and their __ variants are unlikely to fail. Tell it. I noticed this while playing with the C entry code. Before: text data bss dec filename 21828763 5194760 1277952 28301475 vmlinux.baseline After: text data bss dec filename 21828379 5194760 1277952 28301091 vmlinux.new The generated code shrunk by 384 bytes. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/dc37bed7024319c3004d950d57151fca6aeacf97.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a8df874f3e88..3e911c68876e 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -182,7 +182,7 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) : "=a" (__ret_gu), "=r" (__val_gu) \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ - __ret_gu; \ + __builtin_expect(__ret_gu, 0); \ }) #define __put_user_x(size, x, ptr, __ret_pu) \ @@ -278,7 +278,7 @@ extern void __put_user_8(void); __put_user_x(X, __pu_val, ptr, __ret_pu); \ break; \ } \ - __ret_pu; \ + __builtin_expect(__ret_pu, 0); \ }) #define __put_user_size(x, ptr, size, retval, errret) \ @@ -401,7 +401,7 @@ do { \ ({ \ int __pu_err; \ __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \ - __pu_err; \ + __builtin_expect(__pu_err, 0); \ }) #define __get_user_nocheck(x, ptr, size) \ @@ -410,7 +410,7 @@ do { \ unsigned long __gu_val; \ __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - __gu_err; \ + __builtin_expect(__gu_err, 0); \ }) /* FIXME: this hack is definitely wrong -AK */ -- cgit v1.2.3 From 7e0f51cb445be8d3aee80e433ed8da4a33ad0157 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:47:50 -0700 Subject: x86/uaccess: Add unlikely() to __chk_range_not_ok() failure paths This should improve code quality a bit. It also shrinks the kernel text: Before: text data bss dec filename 21828379 5194760 1277952 28301091 vmlinux After: text data bss dec filename 21827997 5194760 1277952 28300709 vmlinux ... by 382 bytes. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/f427b8002d932e5deab9055e0074bb4e7e80ee39.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 3e911c68876e..09b1b0ab94b7 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -51,13 +51,13 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un * limit, not add it to the address). */ if (__builtin_constant_p(size)) - return addr > limit - size; + return unlikely(addr > limit - size); /* Arbitrary sizes? Be careful about overflow */ addr += size; - if (addr < size) + if (unlikely(addr < size)) return true; - return addr > limit; + return unlikely(addr > limit); } #define __range_not_ok(addr, size, limit) \ -- cgit v1.2.3 From dd27f998f0ed3c797032a82033fa191be7c61e4c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:47:53 -0700 Subject: x86/entry/64/compat: Fix SYSENTER's NT flag before user memory access Clearing NT is part of the prologue, whereas loading up arg6 makes more sense to think about as part of syscall processing. Reorder them. 
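A side note on the two x86/uaccess patches above: the kernel's unlikely() is just __builtin_expect(!!(cond), 0), and the same hint works in ordinary userspace C. The sketch below is illustrative only; the file, function, and macro names are invented for the example and are not taken from the kernel.

    /* hint.c - build with "gcc -O2 -S hint.c" and compare the generated
     * assembly with and without the unlikely() hint. Hypothetical example
     * code, not kernel code. */
    #include <stdio.h>

    #define unlikely(x) __builtin_expect(!!(x), 0)

    /* Stand-in for a call that fails only on rare, exceptional input. */
    static int copy_item(int fd)
    {
            return fd < 0 ? -1 : 0;
    }

    int main(void)
    {
            int err = copy_item(42);

            /* The hint keeps the success path as the straight-line
             * fall-through and moves the error path out of line, the
             * same effect the patches above get for get_user() and
             * put_user() callers. */
            if (unlikely(err)) {
                    fprintf(stderr, "copy failed\n");
                    return 1;
            }

            puts("ok");
            return 0;
    }

With GCC the unlikely() arm is typically emitted after the function's normal return sequence, so the common case takes no branch, which lines up with the text-size reductions quoted above.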
Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/19eb235828b2d2a52c53459e09f2974e15e65a35.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index a9360d40fb7f..e2cca89c1ed3 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -88,15 +88,6 @@ ENTRY(entry_SYSENTER_compat) cld sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ - /* - * no need to do an access_ok check here because rbp has been - * 32-bit zero extended - */ - ASM_STAC -1: movl (%rbp), %ebp - _ASM_EXTABLE(1b, ia32_badarg) - ASM_CLAC - /* * Sysenter doesn't filter flags, so we need to clear NT * ourselves. To save a few cycles, we can check whether @@ -106,6 +97,15 @@ ENTRY(entry_SYSENTER_compat) jnz sysenter_fix_flags sysenter_flags_fixed: + /* + * No need to do an access_ok() check here because RBP has been + * 32-bit zero extended: + */ + ASM_STAC +1: movl (%rbp), %ebp + _ASM_EXTABLE(1b, ia32_badarg) + ASM_CLAC + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz sysenter_tracesys -- cgit v1.2.3 From 72f924783b8a87e4454516520ffb5f35e4930371 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:47:54 -0700 Subject: x86/entry, locking/lockdep: Move lockdep_sys_exit() to prepare_exit_to_usermode() Rather than worrying about exactly where LOCKDEP_SYS_EXIT should go in the asm code, add it to prepare_exit_to_usermode() and remove all of the asm calls that are followed by prepare_exit_to_usermode(). LOCKDEP_SYS_EXIT now appears only in the syscall fast paths. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1736ebe948b845e68120b86b89091f3ec27f5e8e.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 2 ++ arch/x86/entry/entry_32.S | 2 -- arch/x86/entry/entry_64.S | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 80dcc9261ca3..d94a60c16029 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -220,6 +220,8 @@ __visible void prepare_exit_to_usermode(struct pt_regs *regs) if (WARN_ON(!irqs_disabled())) local_irq_disable(); + lockdep_sys_exit(); + /* * In order to return to user mode, we need to have IRQs off with * none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY, diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index b2909bf8cf70..a08ded481aba 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -255,7 +255,6 @@ ret_from_intr: jb resume_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) - LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl %esp, %eax @@ -372,7 +371,6 @@ syscall_call: syscall_after_call: movl %eax, PT_EAX(%esp) # store the return value syscall_exit: - LOCKDEP_SYS_EXIT jmp syscall_exit_work restore_all: diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 055a01de7c8d..7dc285036b6d 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -557,7 +557,6 @@ ret_from_intr: jz retint_kernel /* Interrupt came from user space */ - LOCKDEP_SYS_EXIT_IRQ GLOBAL(retint_user) mov %rsp,%rdi call prepare_exit_to_usermode -- cgit v1.2.3 From b611acf4736b5b00c89dcc238f640337832abcb4 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:47:55 -0700 Subject: x86/entry/64/compat: After SYSENTER, move STI after the NT fixup We eventually want to make it all the way into C code before enabling interrupts. We need to rework our flags handling slightly to delay enabling interrupts. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/35d24d2a9305da3182eab7b2cdfd32902e90962c.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index e2cca89c1ed3..bc678f0c3c91 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -58,14 +58,9 @@ ENDPROC(native_usergs_sysret32) * with the int 0x80 path. */ ENTRY(entry_SYSENTER_compat) - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ + /* Interrupts are off on entry. */ SWAPGS_UNSAFE_STACK movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - ENABLE_INTERRUPTS(CLBR_NONE) /* Zero-extending 32-bit regs, do not remove */ movl %ebp, %ebp @@ -76,7 +71,16 @@ ENTRY(entry_SYSENTER_compat) /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ pushq %rbp /* pt_regs->sp */ - pushfq /* pt_regs->flags */ + + /* + * Push flags. This is nasty. 
First, interrupts are currently + * off, but we need pt_regs->flags to have IF set. Second, even + * if TF was set when SYSENTER started, it's clear by now. We fix + * that later using TIF_SINGLESTEP. + */ + pushfq /* pt_regs->flags (except IF = 0) */ + orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ + pushq $__USER32_CS /* pt_regs->cs */ pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ pushq %rax /* pt_regs->orig_ax */ @@ -92,11 +96,21 @@ ENTRY(entry_SYSENTER_compat) * Sysenter doesn't filter flags, so we need to clear NT * ourselves. To save a few cycles, we can check whether * NT was set instead of doing an unconditional popfq. + * This needs to happen before enabling interrupts so that + * we don't get preempted with NT set. */ testl $X86_EFLAGS_NT, EFLAGS(%rsp) jnz sysenter_fix_flags sysenter_flags_fixed: + /* + * Re-enable interrupts. IRQ tracing already thinks that IRQs are + * on (since we treat user mode as having IRQs on), and the + * prologue above is too short for it to be worth adding a + * tracing round trip. + */ + ENABLE_INTERRUPTS(CLBR_NONE) + /* * No need to do an access_ok() check here because RBP has been * 32-bit zero extended: @@ -244,7 +258,7 @@ sysexit_audit: #endif sysenter_fix_flags: - pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) + pushq $X86_EFLAGS_FIXED popfq jmp sysenter_flags_fixed -- cgit v1.2.3 From 0a6d1fa0d2b48fbae444e46e7f37a4832b2f8bdf Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:47:56 -0700 Subject: x86/vdso: Remove runtime 32-bit vDSO selection 32-bit userspace will now always see the same vDSO, which is exactly what used to be the int80 vDSO. Subsequent patches will clean it up and make it support SYSENTER and SYSCALL using alternatives. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/e7e6b3526fa442502e6125fe69486aab50813c32.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/Makefile | 35 +++------- arch/x86/entry/vdso/vdso2c.c | 1 - arch/x86/entry/vdso/vdso32-setup.c | 28 +------- arch/x86/entry/vdso/vdso32/int80.S | 56 --------------- arch/x86/entry/vdso/vdso32/syscall.S | 75 -------------------- arch/x86/entry/vdso/vdso32/sysenter.S | 116 ------------------------------- arch/x86/entry/vdso/vdso32/system_call.S | 56 +++++++++++++++ arch/x86/entry/vdso/vma.c | 13 +--- arch/x86/ia32/ia32_signal.c | 4 +- arch/x86/include/asm/elf.h | 2 +- arch/x86/include/asm/vdso.h | 9 +-- arch/x86/kernel/signal.c | 4 +- arch/x86/xen/setup.c | 13 +--- 13 files changed, 77 insertions(+), 335 deletions(-) delete mode 100644 arch/x86/entry/vdso/vdso32/int80.S delete mode 100644 arch/x86/entry/vdso/vdso32/syscall.S delete mode 100644 arch/x86/entry/vdso/vdso32/sysenter.S create mode 100644 arch/x86/entry/vdso/vdso32/system_call.S (limited to 'arch/x86') diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index a3d0767a6b29..3bfb39e7b8b2 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -19,9 +19,7 @@ obj-y += vma.o # vDSO images to build vdso_img-$(VDSO64-y) += 64 vdso_img-$(VDSOX32-y) += x32 -vdso_img-$(VDSO32-y) += 32-int80 -vdso_img-$(CONFIG_IA32_EMULATION) += 32-syscall -vdso_img-$(VDSO32-y) += 32-sysenter +vdso_img-$(VDSO32-y) += 32 obj-$(VDSO32-y) += vdso32-setup.o @@ -122,15 +120,6 @@ $(obj)/%.so: $(obj)/%.so.dbg $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE $(call if_changed,vdso) -# -# Build multiple 32-bit vDSO images to choose from at boot time. 
-# -vdso32.so-$(VDSO32-y) += int80 -vdso32.so-$(CONFIG_IA32_EMULATION) += syscall -vdso32.so-$(VDSO32-y) += sysenter - -vdso32-images = $(vdso32.so-y:%=vdso32-%.so) - CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1 @@ -139,14 +128,12 @@ VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1 override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ targets += vdso32/vdso32.lds -targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o) +targets += vdso32/note.o vdso32/vclock_gettime.o vdso32/system_call.o targets += vdso32/vclock_gettime.o -$(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%) - KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -$(vdso32-images:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) -$(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32 +$(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) +$(obj)/vdso32.so.dbg: asflags-$(CONFIG_X86_64) += -m32 KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32)) @@ -157,13 +144,13 @@ KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) KBUILD_CFLAGS_32 += -fno-omit-frame-pointer KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING -$(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) +$(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) -$(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ - $(obj)/vdso32/vdso32.lds \ - $(obj)/vdso32/vclock_gettime.o \ - $(obj)/vdso32/note.o \ - $(obj)/vdso32/%.o +$(obj)/vdso32.so.dbg: FORCE \ + $(obj)/vdso32/vdso32.lds \ + $(obj)/vdso32/vclock_gettime.o \ + $(obj)/vdso32/note.o \ + $(obj)/vdso32/system_call.o $(call if_changed,vdso) # @@ -206,4 +193,4 @@ $(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE PHONY += vdso_install $(vdso_img_insttargets) vdso_install: $(vdso_img_insttargets) FORCE -clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* vdso-image-*.c vdsox32.so* +clean-files := vdso32.so vdso32.so.dbg vdso64* vdso-image-*.c vdsox32.so* diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 8627db24a7f6..2637eb1e3949 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -98,7 +98,6 @@ struct vdso_sym required_syms[] = { "VDSO_FAKE_SECTION_TABLE_END", false }, {"VDSO32_NOTE_MASK", true}, - {"VDSO32_SYSENTER_RETURN", true}, {"__kernel_vsyscall", true}, {"__kernel_sigreturn", true}, {"__kernel_rt_sigreturn", true}, diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index e904c270573b..08a317a9ae4b 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -48,35 +48,9 @@ __setup("vdso32=", vdso32_setup); __setup_param("vdso=", vdso_setup, vdso32_setup, 0); #endif -#ifdef CONFIG_X86_64 - -#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) -#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32)) - -#else /* CONFIG_X86_32 */ - -#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) -#define vdso32_syscall() (0) - -#endif /* CONFIG_X86_64 */ - -#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) -const struct vdso_image *selected_vdso32; -#endif - int __init sysenter_setup(void) { -#ifdef CONFIG_COMPAT - if (vdso32_syscall()) - selected_vdso32 = &vdso_image_32_syscall; - else -#endif - if (vdso32_sysenter()) - selected_vdso32 = 
&vdso_image_32_sysenter; - else - selected_vdso32 = &vdso_image_32_int80; - - init_vdso_image(selected_vdso32); + init_vdso_image(&vdso_image_32); return 0; } diff --git a/arch/x86/entry/vdso/vdso32/int80.S b/arch/x86/entry/vdso/vdso32/int80.S deleted file mode 100644 index b15b7c01aedb..000000000000 --- a/arch/x86/entry/vdso/vdso32/int80.S +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Code for the vDSO. This version uses the old int $0x80 method. - * - * First get the common code for the sigreturn entry points. - * This must come first. - */ -#include "sigreturn.S" - - .text - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function - ALIGN -__kernel_vsyscall: -.LSTART_vsyscall: - int $0x80 - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - .previous - - .section .eh_frame,"a",@progbits -.LSTARTFRAMEDLSI: - .long .LENDCIEDLSI-.LSTARTCIEDLSI -.LSTARTCIEDLSI: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIEDLSI: - .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ -.LSTARTFDEDLSI: - .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 - .align 4 -.LENDFDEDLSI: - .previous - - /* - * Pad out the segment to match the size of the sysenter.S version. - */ -VDSO32_vsyscall_eh_frame_size = 0x40 - .section .data,"aw",@progbits - .space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0 - .previous diff --git a/arch/x86/entry/vdso/vdso32/syscall.S b/arch/x86/entry/vdso/vdso32/syscall.S deleted file mode 100644 index 6b286bb5251c..000000000000 --- a/arch/x86/entry/vdso/vdso32/syscall.S +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Code for the vDSO. This version uses the syscall instruction. - * - * First get the common code for the sigreturn entry points. - * This must come first. - */ -#define SYSCALL_ENTER_KERNEL syscall -#include "sigreturn.S" - -#include - - .text - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function - ALIGN -__kernel_vsyscall: -.LSTART_vsyscall: - push %ebp -.Lpush_ebp: - movl %ecx, %ebp - syscall - movl %ebp, %ecx - popl %ebp -.Lpop_ebp: - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - - .section .eh_frame,"a",@progbits -.LSTARTFRAME: - .long .LENDCIE-.LSTARTCIE -.LSTARTCIE: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIE: - - .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */ -.LSTARTFDE1: - .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 /* Augmentation length */ - /* What follows are the instructions for the table generation. 
- We have to record all changes of the stack pointer. */ - .byte 0x40 + .Lpush_ebp-.LSTART_vsyscall /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .uleb128 8 - .byte 0x85, 0x02 /* DW_CFA_offset %ebp -8 */ - .byte 0x40 + .Lpop_ebp-.Lpush_ebp /* DW_CFA_advance_loc */ - .byte 0xc5 /* DW_CFA_restore %ebp */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .uleb128 4 - .align 4 -.LENDFDE1: - .previous - - /* - * Pad out the segment to match the size of the sysenter.S version. - */ -VDSO32_vsyscall_eh_frame_size = 0x40 - .section .data,"aw",@progbits - .space VDSO32_vsyscall_eh_frame_size-(.LENDFDE1-.LSTARTFRAME), 0 - .previous diff --git a/arch/x86/entry/vdso/vdso32/sysenter.S b/arch/x86/entry/vdso/vdso32/sysenter.S deleted file mode 100644 index e354bceee0e0..000000000000 --- a/arch/x86/entry/vdso/vdso32/sysenter.S +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Code for the vDSO. This version uses the sysenter instruction. - * - * First get the common code for the sigreturn entry points. - * This must come first. - */ -#include "sigreturn.S" - -/* - * The caller puts arg2 in %ecx, which gets pushed. The kernel will use - * %ecx itself for arg2. The pushing is because the sysexit instruction - * (found in entry.S) requires that we clobber %ecx with the desired %esp. - * User code might expect that %ecx is unclobbered though, as it would be - * for returning via the iret instruction, so we must push and pop. - * - * The caller puts arg3 in %edx, which the sysexit instruction requires - * for %eip. Thus, exactly as for arg2, we must push and pop. - * - * Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter - * instruction clobbers %esp, the user's %esp won't even survive entry - * into the kernel. We store %esp in %ebp. Code in entry.S must fetch - * arg6 from the stack. - * - * You can not use this vsyscall for the clone() syscall because the - * three words on the parent stack do not get copied to the child. - */ - .text - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function - ALIGN -__kernel_vsyscall: -.LSTART_vsyscall: - push %ecx -.Lpush_ecx: - push %edx -.Lpush_edx: - push %ebp -.Lenter_kernel: - movl %esp,%ebp - sysenter - - /* 7: align return point with nop's to make disassembly easier */ - .space 7,0x90 - - /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ - int $0x80 - /* 16: System call normal return point is here! */ -VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */ - pop %ebp -.Lpop_ebp: - pop %edx -.Lpop_edx: - pop %ecx -.Lpop_ecx: - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - .previous - - .section .eh_frame,"a",@progbits -.LSTARTFRAMEDLSI: - .long .LENDCIEDLSI-.LSTARTCIEDLSI -.LSTARTCIEDLSI: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIEDLSI: - .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ -.LSTARTFDEDLSI: - .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 - /* What follows are the instructions for the table generation. 
- We have to record all changes of the stack pointer. */ - .byte 0x40 + (.Lpush_ecx-.LSTART_vsyscall) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x08 /* RA at offset 8 now */ - .byte 0x40 + (.Lpush_edx-.Lpush_ecx) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x0c /* RA at offset 12 now */ - .byte 0x40 + (.Lenter_kernel-.Lpush_edx) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x10 /* RA at offset 16 now */ - .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */ - /* Finally the epilogue. */ - .byte 0x40 + (.Lpop_ebp-.Lenter_kernel) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x0c /* RA at offset 12 now */ - .byte 0xc5 /* DW_CFA_restore %ebp */ - .byte 0x40 + (.Lpop_edx-.Lpop_ebp) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x08 /* RA at offset 8 now */ - .byte 0x40 + (.Lpop_ecx-.Lpop_edx) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x04 /* RA at offset 4 now */ - .align 4 -.LENDFDEDLSI: - .previous - - /* - * Emit a symbol with the size of this .eh_frame data, - * to verify it matches the other versions. - */ -VDSO32_vsyscall_eh_frame_size = (.LENDFDEDLSI-.LSTARTFRAMEDLSI) diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S new file mode 100644 index 000000000000..b15b7c01aedb --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -0,0 +1,56 @@ +/* + * Code for the vDSO. This version uses the old int $0x80 method. + * + * First get the common code for the sigreturn entry points. + * This must come first. + */ +#include "sigreturn.S" + + .text + .globl __kernel_vsyscall + .type __kernel_vsyscall,@function + ALIGN +__kernel_vsyscall: +.LSTART_vsyscall: + int $0x80 + ret +.LEND_vsyscall: + .size __kernel_vsyscall,.-.LSTART_vsyscall + .previous + + .section .eh_frame,"a",@progbits +.LSTARTFRAMEDLSI: + .long .LENDCIEDLSI-.LSTARTCIEDLSI +.LSTARTCIEDLSI: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zR" /* NUL-terminated augmentation string */ + .uleb128 1 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 8 /* Return address register column */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ + .byte 0x0c /* DW_CFA_def_cfa */ + .uleb128 4 + .uleb128 4 + .byte 0x88 /* DW_CFA_offset, column 0x8 */ + .uleb128 1 + .align 4 +.LENDCIEDLSI: + .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ +.LSTARTFDEDLSI: + .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ + .long .LSTART_vsyscall-. /* PC-relative start address */ + .long .LEND_vsyscall-.LSTART_vsyscall + .uleb128 0 + .align 4 +.LENDFDEDLSI: + .previous + + /* + * Pad out the segment to match the size of the sysenter.S version. 
+ */ +VDSO32_vsyscall_eh_frame_size = 0x40 + .section .data,"aw",@progbits + .space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0 + .previous diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 434543145d78..64df47148160 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -180,21 +180,10 @@ up_fail: #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static int load_vdso32(void) { - int ret; - if (vdso32_enabled != 1) /* Other values all mean "disabled" */ return 0; - ret = map_vdso(selected_vdso32, false); - if (ret) - return ret; - - if (selected_vdso32->sym_VDSO32_SYSENTER_RETURN) - current_thread_info()->sysenter_return = - current->mm->context.vdso + - selected_vdso32->sym_VDSO32_SYSENTER_RETURN; - - return 0; + return map_vdso(&vdso_image_32, false); } #endif diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index a0a19b7ba22d..e6a5c275cd3f 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -289,7 +289,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, /* Return stub is in 32bit vsyscall page */ if (current->mm->context.vdso) restorer = current->mm->context.vdso + - selected_vdso32->sym___kernel_sigreturn; + vdso_image_32.sym___kernel_sigreturn; else restorer = &frame->retcode; } @@ -368,7 +368,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, restorer = ksig->ka.sa.sa_restorer; else restorer = current->mm->context.vdso + - selected_vdso32->sym___kernel_rt_sigreturn; + vdso_image_32.sym___kernel_rt_sigreturn; put_user_ex(ptr_to_compat(restorer), &frame->pretcode); /* diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 141c561f4664..2ee05c4f5f37 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -328,7 +328,7 @@ else \ #define VDSO_ENTRY \ ((unsigned long)current->mm->context.vdso + \ - selected_vdso32->sym___kernel_vsyscall) + vdso_image_32.sym___kernel_vsyscall) struct linux_binprm; diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 8021bd28c0f1..5bcb1de8296e 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -26,7 +26,6 @@ struct vdso_image { long sym___kernel_sigreturn; long sym___kernel_rt_sigreturn; long sym___kernel_vsyscall; - long sym_VDSO32_SYSENTER_RETURN; }; #ifdef CONFIG_X86_64 @@ -38,13 +37,7 @@ extern const struct vdso_image vdso_image_x32; #endif #if defined CONFIG_X86_32 || defined CONFIG_COMPAT -extern const struct vdso_image vdso_image_32_int80; -#ifdef CONFIG_COMPAT -extern const struct vdso_image vdso_image_32_syscall; -#endif -extern const struct vdso_image vdso_image_32_sysenter; - -extern const struct vdso_image *selected_vdso32; +extern const struct vdso_image vdso_image_32; #endif extern void __init init_vdso_image(const struct vdso_image *image); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index da52e6bb5c7f..d87ce92d3404 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -299,7 +299,7 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, if (current->mm->context.vdso) restorer = current->mm->context.vdso + - selected_vdso32->sym___kernel_sigreturn; + vdso_image_32.sym___kernel_sigreturn; else restorer = &frame->retcode; if (ksig->ka.sa.sa_flags & SA_RESTORER) @@ -363,7 +363,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, /* Set up to return from userspace. 
*/ restorer = current->mm->context.vdso + - selected_vdso32->sym___kernel_rt_sigreturn; + vdso_image_32.sym___kernel_rt_sigreturn; if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; put_user_ex(restorer, &frame->pretcode); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 1c30e4ab1022..63320b6d35bc 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -965,17 +965,8 @@ char * __init xen_auto_xlated_memory_setup(void) static void __init fiddle_vdso(void) { #ifdef CONFIG_X86_32 - /* - * This could be called before selected_vdso32 is initialized, so - * just fiddle with both possible images. vdso_image_32_syscall - * can't be selected, since it only exists on 64-bit systems. - */ - u32 *mask; - mask = vdso_image_32_int80.data + - vdso_image_32_int80.sym_VDSO32_NOTE_MASK; - *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; - mask = vdso_image_32_sysenter.data + - vdso_image_32_sysenter.sym_VDSO32_NOTE_MASK; + u32 *mask = vdso_image_32.data + + vdso_image_32.sym_VDSO32_NOTE_MASK; *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; #endif } -- cgit v1.2.3 From 9d99c7123c9af5f47a2037933a3ffaec3e37efad Mon Sep 17 00:00:00 2001 From: Christian Melki Date: Mon, 5 Oct 2015 17:31:33 +0200 Subject: swiotlb: Enable it under x86 PAE Most distributions end up enabling SWIOTLB already with 32-bit kernels due to the combination of CONFIG_HYPERVISOR_GUEST|CONFIG_XEN=y as those end up requiring the SWIOTLB. However, those that are not interested in virtualization and run 32-bit kernels will discover that: "32-bit PAE 4.2.0 kernel (no IOMMU code) would hang when writing to my USB disk. The kernel spews million(-ish messages per sec) to syslog, effectively "hanging" userspace with my kernel. Oct 2 14:33:06 voodoochild kernel: [ 223.287447] nommu_map_sg: overflow 25dcac000+1024 of device mask ffffffff Oct 2 14:33:06 voodoochild kernel: [ 223.287448] nommu_map_sg: overflow 25dcac000+1024 of device mask ffffffff Oct 2 14:33:06 voodoochild kernel: [ 223.287449] nommu_map_sg: overflow 25dcac000+1024 of device mask ffffffff ... etc ..." Enabling it makes the problem go away. N.B. With a6dfa128ce5c414ab46b1d690f7a1b8decb8526d "config: Enable NEED_DMA_MAP_STATE by default when SWIOTLB is selected" we also have the important part of the SG macros enabled to make this work properly - in case anybody wants to backport this patch. Reported-and-Tested-by: Christian Melki Signed-off-by: Christian Melki Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 328c8352480c..96d058a87100 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1308,6 +1308,7 @@ config HIGHMEM config X86_PAE bool "PAE (Physical Address Extension) Support" depends on X86_32 && !HIGHMEM4G + select SWIOTLB ---help--- PAE is required for NX support, and furthermore enables larger swapspace support for non-overcommit purposes. It -- cgit v1.2.3 From 92b279070dd6c94265db32748bbeb5b583588de9 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 6 Oct 2015 12:31:33 +0100 Subject: crypto: camellia_aesni_avx - Fix CPU feature checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need to explicitly check the AVX and AES CPU features, as we can't infer them from the related XSAVE feature flags. For example, the Core i3 2310M passes the XSAVE feature test but does not implement AES-NI. 
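To make the distinction concrete: the XSAVE/OSXSAVE bits only say that the OS saves extended register state, while the AES-NI and AVX bits report the actual instruction sets. The userspace sketch below reads the three relevant CPUID bits directly via GCC's <cpuid.h> helpers; it is illustrative only and not part of the patch.

    /* cpuflags.c - build with "gcc -O2 -o cpuflags cpuflags.c".
     * Illustrative example, not part of the patch. */
    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* CPUID leaf 1, ECX: bit 25 = AES-NI, bit 27 = OSXSAVE,
             * bit 28 = AVX. <cpuid.h> provides the bit_* masks. */
            if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
                    return 1;

            printf("AES-NI : %s\n", (ecx & bit_AES) ? "yes" : "no");
            printf("OSXSAVE: %s\n", (ecx & bit_OSXSAVE) ? "yes" : "no");
            printf("AVX    : %s\n", (ecx & bit_AVX) ? "yes" : "no");
            return 0;
    }

On a part like the Core i3 2310M mentioned above, the AES-NI line is the one that comes back "no" even though an XSAVE-based test passes, which is exactly the case the explicit checks in this patch guard against.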
Reported-and-tested-by: Stéphane Glondu References: https://bugs.debian.org/800934 Fixes: ce4f5f9b65ae ("x86/fpu, crypto x86/camellia_aesni_avx: Simplify...") Signed-off-by: Ben Hutchings Cc: stable # 4.2 Signed-off-by: Herbert Xu --- arch/x86/crypto/camellia_aesni_avx_glue.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 80a0e4389c9a..bacaa13acac5 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -554,6 +554,11 @@ static int __init camellia_aesni_init(void) { const char *feature_name; + if (!cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { + pr_info("AVX or AES-NI instructions are not detected.\n"); + return -ENODEV; + } + if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; -- cgit v1.2.3 From 7b956f035a9ef8bd3ef5490f49fc1bd834d8a70a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:47:57 -0700 Subject: x86/asm: Re-add parts of the manual CFI infrastructure Commit: 131484c8da97 ("x86/debug: Remove perpetually broken, unmaintainable dwarf annotations") removed all the manual DWARF annotations outside the vDSO. It also removed the macros we used for the manual annotations. Re-add these macros so that we can clean up the vDSO annotations. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/4c70bb98a8b773c8ccfaabf6745e569ff43e7f65.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Makefile | 10 ++- arch/x86/include/asm/dwarf2.h | 170 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 178 insertions(+), 2 deletions(-) create mode 100644 arch/x86/include/asm/dwarf2.h (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 747860c696e1..2dfaa72260b4 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -159,6 +159,12 @@ endif sp-$(CONFIG_X86_32) := esp sp-$(CONFIG_X86_64) := rsp +# do binutils support CFI? +cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1) +# is .cfi_signal_frame supported too? +cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1) +cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1) + # does binutils support specific instructions? 
asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1) @@ -166,8 +172,8 @@ asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1) avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1) -KBUILD_AFLAGS += $(asinstr) $(avx_instr) $(avx2_instr) -KBUILD_CFLAGS += $(asinstr) $(avx_instr) $(avx2_instr) +KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) +KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) LDFLAGS := -m elf_$(UTS_MACHINE) diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h new file mode 100644 index 000000000000..de1cdaf4d743 --- /dev/null +++ b/arch/x86/include/asm/dwarf2.h @@ -0,0 +1,170 @@ +#ifndef _ASM_X86_DWARF2_H +#define _ASM_X86_DWARF2_H + +#ifndef __ASSEMBLY__ +#warning "asm/dwarf2.h should be only included in pure assembly files" +#endif + +/* + * Macros for dwarf2 CFI unwind table entries. + * See "as.info" for details on these pseudo ops. Unfortunately + * they are only supported in very new binutils, so define them + * away for older versions. + */ + +#ifdef CONFIG_AS_CFI + +#define CFI_STARTPROC .cfi_startproc +#define CFI_ENDPROC .cfi_endproc +#define CFI_DEF_CFA .cfi_def_cfa +#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register +#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset +#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset +#define CFI_OFFSET .cfi_offset +#define CFI_REL_OFFSET .cfi_rel_offset +#define CFI_REGISTER .cfi_register +#define CFI_RESTORE .cfi_restore +#define CFI_REMEMBER_STATE .cfi_remember_state +#define CFI_RESTORE_STATE .cfi_restore_state +#define CFI_UNDEFINED .cfi_undefined +#define CFI_ESCAPE .cfi_escape + +#ifdef CONFIG_AS_CFI_SIGNAL_FRAME +#define CFI_SIGNAL_FRAME .cfi_signal_frame +#else +#define CFI_SIGNAL_FRAME +#endif + +#if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__) + /* + * Emit CFI data in .debug_frame sections, not .eh_frame sections. + * The latter we currently just discard since we don't do DWARF + * unwinding at runtime. So only the offline DWARF information is + * useful to anyone. Note we should not use this directive if this + * file is used in the vDSO assembly, or if vmlinux.lds.S gets + * changed so it doesn't discard .eh_frame. + */ + .cfi_sections .debug_frame +#endif + +#else + +/* + * Due to the structure of pre-existing code, don't use assembler line + * comment character # to ignore the arguments. Instead, use a dummy macro. + */ +.macro cfi_ignore a=0, b=0, c=0, d=0 +.endm + +#define CFI_STARTPROC cfi_ignore +#define CFI_ENDPROC cfi_ignore +#define CFI_DEF_CFA cfi_ignore +#define CFI_DEF_CFA_REGISTER cfi_ignore +#define CFI_DEF_CFA_OFFSET cfi_ignore +#define CFI_ADJUST_CFA_OFFSET cfi_ignore +#define CFI_OFFSET cfi_ignore +#define CFI_REL_OFFSET cfi_ignore +#define CFI_REGISTER cfi_ignore +#define CFI_RESTORE cfi_ignore +#define CFI_REMEMBER_STATE cfi_ignore +#define CFI_RESTORE_STATE cfi_ignore +#define CFI_UNDEFINED cfi_ignore +#define CFI_ESCAPE cfi_ignore +#define CFI_SIGNAL_FRAME cfi_ignore + +#endif + +/* + * An attempt to make CFI annotations more or less + * correct and shorter. It is implied that you know + * what you're doing if you use them. 
+ */ +#ifdef __ASSEMBLY__ +#ifdef CONFIG_X86_64 + .macro pushq_cfi reg + pushq \reg + CFI_ADJUST_CFA_OFFSET 8 + .endm + + .macro pushq_cfi_reg reg + pushq %\reg + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET \reg, 0 + .endm + + .macro popq_cfi reg + popq \reg + CFI_ADJUST_CFA_OFFSET -8 + .endm + + .macro popq_cfi_reg reg + popq %\reg + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE \reg + .endm + + .macro pushfq_cfi + pushfq + CFI_ADJUST_CFA_OFFSET 8 + .endm + + .macro popfq_cfi + popfq + CFI_ADJUST_CFA_OFFSET -8 + .endm + + .macro movq_cfi reg offset=0 + movq %\reg, \offset(%rsp) + CFI_REL_OFFSET \reg, \offset + .endm + + .macro movq_cfi_restore offset reg + movq \offset(%rsp), %\reg + CFI_RESTORE \reg + .endm +#else /*!CONFIG_X86_64*/ + .macro pushl_cfi reg + pushl \reg + CFI_ADJUST_CFA_OFFSET 4 + .endm + + .macro pushl_cfi_reg reg + pushl %\reg + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET \reg, 0 + .endm + + .macro popl_cfi reg + popl \reg + CFI_ADJUST_CFA_OFFSET -4 + .endm + + .macro popl_cfi_reg reg + popl %\reg + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE \reg + .endm + + .macro pushfl_cfi + pushfl + CFI_ADJUST_CFA_OFFSET 4 + .endm + + .macro popfl_cfi + popfl + CFI_ADJUST_CFA_OFFSET -4 + .endm + + .macro movl_cfi reg offset=0 + movl %\reg, \offset(%esp) + CFI_REL_OFFSET \reg, \offset + .endm + + .macro movl_cfi_restore offset reg + movl \offset(%esp), %\reg + CFI_RESTORE \reg + .endm +#endif /*!CONFIG_X86_64*/ +#endif /*__ASSEMBLY__*/ + +#endif /* _ASM_X86_DWARF2_H */ -- cgit v1.2.3 From f24f910884277aa6824bbc2dda4b5d0418d45c28 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:47:58 -0700 Subject: x86/vdso: Define BUILD_VDSO while building and emit .eh_frame in asm For the vDSO, user code wants runtime unwind info. Make sure that, if we use .cfi directives, we generate it. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/16e29ad8855e6508197000d8c41f56adb00d7580.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/Makefile | 4 ++-- arch/x86/include/asm/dwarf2.h | 13 ++++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 3bfb39e7b8b2..265c0ed68118 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -67,7 +67,7 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ -fno-omit-frame-pointer -foptimize-sibling-calls \ - -DDISABLE_BRANCH_PROFILING + -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO $(vobjs): KBUILD_CFLAGS += $(CFL) @@ -131,7 +131,7 @@ targets += vdso32/vdso32.lds targets += vdso32/note.o vdso32/vclock_gettime.o vdso32/system_call.o targets += vdso32/vclock_gettime.o -KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) +KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO $(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) $(obj)/vdso32.so.dbg: asflags-$(CONFIG_X86_64) += -m32 diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index de1cdaf4d743..09133ba032b3 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h @@ -36,15 +36,22 @@ #endif #if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__) +#ifndef BUILD_VDSO /* * Emit CFI data in .debug_frame sections, not .eh_frame sections. * The latter we currently just discard since we don't do DWARF * unwinding at runtime. So only the offline DWARF information is - * useful to anyone. Note we should not use this directive if this - * file is used in the vDSO assembly, or if vmlinux.lds.S gets - * changed so it doesn't discard .eh_frame. + * useful to anyone. Note we should not use this directive if + * vmlinux.lds.S gets changed so it doesn't discard .eh_frame. */ .cfi_sections .debug_frame +#else + /* + * For the vDSO, emit both runtime unwind information and debug + * symbols for the .dbg file. + */ + .cfi_sections .eh_frame, .debug_frame +#endif #endif #else -- cgit v1.2.3 From 29c0ce9508458ed31a9db2ed425f64c0d6d3ddfb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:47:59 -0700 Subject: x86/vdso: Replace hex int80 CFI annotations with GAS directives Maintaining the current CFI annotations written in R'lyehian is difficult for most of us. Translate them to something a little closer to English. This will remove the CFI data for kernels built with extremely old versions of binutils. I think this is a fair tradeoff for the ability for mortals to edit the asm. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/ae3ff4ff5278b4bfc1e1dab368823469866d4b71.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vdso32/system_call.S | 48 ++++++-------------------------- 1 file changed, 8 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index b15b7c01aedb..b52cbfbe119e 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -1,6 +1,10 @@ /* * Code for the vDSO. This version uses the old int $0x80 method. - * +*/ + +#include + +/* * First get the common code for the sigreturn entry points. * This must come first. */ @@ -11,46 +15,10 @@ .type __kernel_vsyscall,@function ALIGN __kernel_vsyscall: -.LSTART_vsyscall: + CFI_STARTPROC int $0x80 ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - .previous - - .section .eh_frame,"a",@progbits -.LSTARTFRAMEDLSI: - .long .LENDCIEDLSI-.LSTARTCIEDLSI -.LSTARTCIEDLSI: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIEDLSI: - .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ -.LSTARTFDEDLSI: - .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 - .align 4 -.LENDFDEDLSI: - .previous + CFI_ENDPROC - /* - * Pad out the segment to match the size of the sysenter.S version. - */ -VDSO32_vsyscall_eh_frame_size = 0x40 - .section .data,"aw",@progbits - .space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0 + .size __kernel_vsyscall,.-__kernel_vsyscall .previous -- cgit v1.2.3 From 7bcdea4d050cbe4912854a68b93494203eec8b24 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:00 -0700 Subject: x86/elf/64: Clear more registers in elf_common_init() Before we start calling execve in contexts that honor the full pt_regs, we need to teach it to initialize all registers. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/65a38a9edee61a1158cfd230800c61dbd963dac5.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/elf.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 2ee05c4f5f37..1514753fd435 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -171,11 +171,11 @@ do { \ static inline void elf_common_init(struct thread_struct *t, struct pt_regs *regs, const u16 ds) { - /* Commented-out registers are cleared in stub_execve */ - /*regs->ax = regs->bx =*/ regs->cx = regs->dx = 0; - regs->si = regs->di /*= regs->bp*/ = 0; + /* ax gets execve's return value. 
*/ + /*regs->ax = */ regs->bx = regs->cx = regs->dx = 0; + regs->si = regs->di = regs->bp = 0; regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0; - /*regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;*/ + regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; t->fs = t->gs = 0; t->fsindex = t->gsindex = 0; t->ds = t->es = ds; -- cgit v1.2.3 From 8242c6c84a644e5f0f721e4ae2bd542f640c89f9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:01 -0700 Subject: x86/vdso/32: Save extra registers in the INT80 vsyscall path The goal is to integrate the SYSENTER and SYSCALL32 entry paths with the INT80 path. SYSENTER clobbers ESP and EIP. SYSCALL32 clobbers ECX (and, invisibly, R11). SYSRETL (long mode to compat mode) clobbers ECX and, invisibly, R11. SYSEXIT (which we only need for native 32-bit) clobbers ECX and EDX. This means that we'll need to provide ESP to the kernel in a register (I chose ECX, since it's only needed for SYSENTER) and we need to provide the args that normally live in ECX and EDX in memory. The epilogue needs to restore ECX and EDX, since user code relies on regs being preserved. We don't need to do anything special about EIP, since the kernel already knows where we are. The kernel will eventually need to know where int $0x80 lands, so add a vdso_image entry for it. The only user-visible effect of this code is that ptrace-induced changes to ECX and EDX during fast syscalls will be lost. This is already the case for the SYSENTER path. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/b860925adbee2d2627a0671fbfe23a7fd04127f8.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vdso2c.c | 1 + arch/x86/entry/vdso/vdso32/system_call.S | 25 ++++++++++++++++++++++++- arch/x86/include/asm/vdso.h | 1 + 3 files changed, 26 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 2637eb1e3949..785d9922b106 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -101,6 +101,7 @@ struct vdso_sym required_syms[] = { {"__kernel_vsyscall", true}, {"__kernel_sigreturn", true}, {"__kernel_rt_sigreturn", true}, + {"int80_landing_pad", true}, }; __attribute__((format(printf, 1, 2))) __attribute__((noreturn)) diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index b52cbfbe119e..d591fe93e93a 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -16,7 +16,30 @@ ALIGN __kernel_vsyscall: CFI_STARTPROC - int $0x80 + /* + * Reshuffle regs so that all of any of the entry instructions + * will preserve enough state. + */ + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx, 0 + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 + movl %esp, %ecx + + /* Enter using int $0x80 */ + movl (%esp), %ecx + int $0x80 +GLOBAL(int80_landing_pad) + + /* Restore ECX and EDX in case they were clobbered. 
*/ + popl %ecx + CFI_RESTORE ecx + CFI_ADJUST_CFA_OFFSET -4 + popl %edx + CFI_RESTORE edx + CFI_ADJUST_CFA_OFFSET -4 ret CFI_ENDPROC diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 5bcb1de8296e..756de9190aec 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -26,6 +26,7 @@ struct vdso_image { long sym___kernel_sigreturn; long sym___kernel_rt_sigreturn; long sym___kernel_vsyscall; + long sym_int80_landing_pad; }; #ifdef CONFIG_X86_64 -- cgit v1.2.3 From e62a254a1f93fcc7299497a5c7231639400b8c3c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:02 -0700 Subject: x86/entry/64/compat: Disable SYSENTER and SYSCALL32 entries We've disabled the vDSO helpers to call them, so turn off the entries entirely (temporarily) in preparation for cleaning them up. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/8d6e84bf651519289dc532dcc230adfabbd2a3eb.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index bc678f0c3c91..06a8966415f9 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -103,6 +103,14 @@ ENTRY(entry_SYSENTER_compat) jnz sysenter_fix_flags sysenter_flags_fixed: + /* Temporary: SYSENTER is disabled. */ +#ifdef CONFIG_CONTEXT_TRACKING + call enter_from_user_mode +#endif + ENABLE_INTERRUPTS(CLBR_NONE) + movl $11, %edi + call do_exit + /* * Re-enable interrupts. IRQ tracing already thinks that IRQs are * on (since we treat user mode as having IRQs on), and the @@ -324,6 +332,11 @@ ENTRY(entry_SYSCALL_compat) * it is too small to ever cause noticeable irq latency. */ SWAPGS_UNSAFE_STACK + + /* Temporary: SYSCALL32 is disabled. */ + movl $-ENOSYS, %eax + USERGS_SYSRET32 + movl %esp, %r8d movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp ENABLE_INTERRUPTS(CLBR_NONE) -- cgit v1.2.3 From c5f638ac90d514202155c87aa58730e86d484d9e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:03 -0700 Subject: x86/entry/64/compat: Remove audit optimizations These audit optimizations are messy and hard to maintain. We'll get a similar effect from opportunistic sysret when fast compat system calls are re-implemented. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/0bcca79ac7ff835d0e5a38725298865b01347a82.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 98 +--------------------------------------- 1 file changed, 2 insertions(+), 96 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 06a8966415f9..75f5fcf15e11 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -16,16 +16,6 @@ #include #include -/* Avoid __ASSEMBLER__'ifying just for this. 
*/ -#include -#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_LE 0x40000000 - -#ifndef CONFIG_AUDITSYSCALL -# define sysexit_audit ia32_ret_from_sys_call_irqs_off -# define sysretl_audit ia32_ret_from_sys_call_irqs_off -#endif - .section .entry.text, "ax" #ifdef CONFIG_PARAVIRT @@ -148,7 +138,7 @@ sysenter_dispatch: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysexit_audit + jnz ia32_ret_from_sys_call_irqs_off sysexit_from_sys_call: /* * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an @@ -200,81 +190,12 @@ sysexit_from_sys_call: */ USERGS_SYSRET32 -#ifdef CONFIG_AUDITSYSCALL - .macro auditsys_entry_common - /* - * At this point, registers hold syscall args in the 32-bit syscall ABI: - * EAX is syscall number, the 6 args are in EBX,ECX,EDX,ESI,EDI,EBP. - * - * We want to pass them to __audit_syscall_entry(), which is a 64-bit - * C function with 5 parameters, so shuffle them to match what - * the function expects: RDI,RSI,RDX,RCX,R8. - */ - movl %esi, %r8d /* arg5 (R8 ) <= 4th syscall arg (ESI) */ - xchg %ecx, %edx /* arg4 (RCX) <= 3rd syscall arg (EDX) */ - /* arg3 (RDX) <= 2nd syscall arg (ECX) */ - movl %ebx, %esi /* arg2 (RSI) <= 1st syscall arg (EBX) */ - movl %eax, %edi /* arg1 (RDI) <= syscall number (EAX) */ - call __audit_syscall_entry - - /* - * We are going to jump back to the syscall dispatch code. - * Prepare syscall args as required by the 64-bit C ABI. - * Registers clobbered by __audit_syscall_entry() are - * loaded from pt_regs on stack: - */ - movl ORIG_RAX(%rsp), %eax /* syscall number */ - movl %ebx, %edi /* arg1 */ - movl RCX(%rsp), %esi /* arg2 */ - movl RDX(%rsp), %edx /* arg3 */ - movl RSI(%rsp), %ecx /* arg4 */ - movl RDI(%rsp), %r8d /* arg5 */ - .endm - - .macro auditsys_exit exit - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_ret_from_sys_call - movl %eax, %esi /* second arg, syscall return value */ - cmpl $-MAX_ERRNO, %eax /* is it an error ? 
*/ - jbe 1f - movslq %eax, %rsi /* if error sign extend to 64 bits */ -1: setbe %al /* 1 if error, 0 if not */ - movzbl %al, %edi /* zero-extend that into %edi */ - call __audit_syscall_exit - movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz \exit - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - jmp int_ret_from_sys_call_irqs_off - .endm - -sysenter_auditsys: - auditsys_entry_common - movl %ebp, %r9d /* reload 6th syscall arg */ - jmp sysenter_dispatch - -sysexit_audit: - auditsys_exit sysexit_from_sys_call -#endif - sysenter_fix_flags: pushq $X86_EFLAGS_FIXED popfq jmp sysenter_flags_fixed sysenter_tracesys: -#ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz sysenter_auditsys -#endif SAVE_EXTRA_REGS xorl %eax, %eax /* Do not leak kernel information */ movq %rax, R11(%rsp) @@ -389,7 +310,7 @@ cstar_dispatch: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysretl_audit + jnz ia32_ret_from_sys_call_irqs_off sysretl_from_sys_call: andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) @@ -420,22 +341,7 @@ sysretl_from_sys_call: */ USERGS_SYSRET32 -#ifdef CONFIG_AUDITSYSCALL -cstar_auditsys: - movl %r9d, R9(%rsp) /* register to be clobbered by call */ - auditsys_entry_common - movl R9(%rsp), %r9d /* reload 6th syscall arg */ - jmp cstar_dispatch - -sysretl_audit: - auditsys_exit sysretl_from_sys_call -#endif - cstar_tracesys: -#ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz cstar_auditsys -#endif xchgl %r9d, %ebp SAVE_EXTRA_REGS xorl %eax, %eax /* Do not leak kernel information */ -- cgit v1.2.3 From 2ec67971faccc21ff18878552ccfe4409088c4c8 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:04 -0700 Subject: x86/entry/64/compat: Remove most of the fast system call machinery We now have only one code path that calls through the compat syscall table. This will make it much more pleasant to change the pt_regs vs register calling convention, which we need to do to move the call into C. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/320cda5573cefdc601b955d23fbe8f36c085432d.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 246 +-------------------------------------- 1 file changed, 4 insertions(+), 242 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 75f5fcf15e11..3216e6072312 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -101,119 +101,13 @@ sysenter_flags_fixed: movl $11, %edi call do_exit - /* - * Re-enable interrupts. IRQ tracing already thinks that IRQs are - * on (since we treat user mode as having IRQs on), and the - * prologue above is too short for it to be worth adding a - * tracing round trip. 
- */ - ENABLE_INTERRUPTS(CLBR_NONE) - - /* - * No need to do an access_ok() check here because RBP has been - * 32-bit zero extended: - */ - ASM_STAC -1: movl (%rbp), %ebp - _ASM_EXTABLE(1b, ia32_badarg) - ASM_CLAC - - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysenter_tracesys - -sysenter_do_call: - /* 32-bit syscall -> 64-bit C ABI argument conversion */ - movl %edi, %r8d /* arg5 */ - movl %ebp, %r9d /* arg6 */ - xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ - movl %ebx, %edi /* arg1 */ - movl %edx, %edx /* arg3 (zero extension) */ -sysenter_dispatch: - cmpq $(IA32_NR_syscalls-1), %rax - ja 1f - call *ia32_sys_call_table(, %rax, 8) - movq %rax, RAX(%rsp) -1: - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_ret_from_sys_call_irqs_off -sysexit_from_sys_call: - /* - * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an - * NMI between STI and SYSEXIT has poorly specified behavior, - * and and NMI followed by an IRQ with usergs is fatal. So - * we just pretend we're using SYSEXIT but we really use - * SYSRETL instead. - * - * This code path is still called 'sysexit' because it pairs - * with 'sysenter' and it uses the SYSENTER calling convention. - */ - andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - movl RIP(%rsp), %ecx /* User %eip */ - movq RAX(%rsp), %rax - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - xorl %edx, %edx /* Do not leak kernel information */ - xorq %r8, %r8 - xorq %r9, %r9 - xorq %r10, %r10 - movl EFLAGS(%rsp), %r11d /* User eflags */ - TRACE_IRQS_ON - - /* - * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, - * since it avoids a dicey window with interrupts enabled. - */ - movl RSP(%rsp), %esp - - /* - * USERGS_SYSRET32 does: - * gsbase = user's gs base - * eip = ecx - * rflags = r11 - * cs = __USER32_CS - * ss = __USER_DS - * - * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does: - * - * pop %ebp - * pop %edx - * pop %ecx - * - * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to - * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's - * address (already known to user code), and R12-R15 are - * callee-saved and therefore don't contain any interesting - * kernel data. - */ - USERGS_SYSRET32 + /* Unreachable. */ + ud2 sysenter_fix_flags: pushq $X86_EFLAGS_FIXED popfq jmp sysenter_flags_fixed - -sysenter_tracesys: - SAVE_EXTRA_REGS - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - movq %rsp, %rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - - /* Reload arg registers from stack. 
(see sysenter_tracesys) */ - movl RCX(%rsp), %ecx - movl RDX(%rsp), %edx - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ - - RESTORE_EXTRA_REGS - jmp sysenter_do_call ENDPROC(entry_SYSENTER_compat) /* @@ -280,142 +174,10 @@ ENTRY(entry_SYSCALL_compat) pushq $-ENOSYS /* pt_regs->ax */ sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ - /* - * No need to do an access_ok check here because r8 has been - * 32-bit zero extended: - */ - ASM_STAC -1: movl (%r8), %r9d - _ASM_EXTABLE(1b, ia32_badarg) - ASM_CLAC - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz cstar_tracesys - -cstar_do_call: - /* 32-bit syscall -> 64-bit C ABI argument conversion */ - movl %edi, %r8d /* arg5 */ - /* r9 already loaded */ /* arg6 */ - xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ - movl %ebx, %edi /* arg1 */ - movl %edx, %edx /* arg3 (zero extension) */ - -cstar_dispatch: - cmpq $(IA32_NR_syscalls-1), %rax - ja 1f - - call *ia32_sys_call_table(, %rax, 8) - movq %rax, RAX(%rsp) -1: - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_ret_from_sys_call_irqs_off - -sysretl_from_sys_call: - andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - movl RDX(%rsp), %edx - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - movl RIP(%rsp), %ecx - movl EFLAGS(%rsp), %r11d - movq RAX(%rsp), %rax - xorq %r10, %r10 - xorq %r9, %r9 - xorq %r8, %r8 - TRACE_IRQS_ON - movl RSP(%rsp), %esp - /* - * 64-bit->32-bit SYSRET restores eip from ecx, - * eflags from r11 (but RF and VM bits are forced to 0), - * cs and ss are loaded from MSRs. - * (Note: 32-bit->32-bit SYSRET is different: since r11 - * does not exist, it merely sets eflags.IF=1). - * - * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss - * descriptor is not reinitialized. This means that we must - * avoid SYSRET with SS == NULL, which could happen if we schedule, - * exit the kernel, and re-enter using an interrupt vector. (All - * interrupt entries on x86_64 set SS to NULL.) We prevent that - * from happening by reloading SS in __switch_to. - */ - USERGS_SYSRET32 - -cstar_tracesys: - xchgl %r9d, %ebp - SAVE_EXTRA_REGS - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %r9, R9(%rsp) - movq %rax, R8(%rsp) - movq %rsp, %rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - movl R9(%rsp), %r9d - - /* Reload arg registers from stack. (see sysenter_tracesys) */ - movl RCX(%rsp), %ecx - movl RDX(%rsp), %edx - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ - - RESTORE_EXTRA_REGS - xchgl %ebp, %r9d - jmp cstar_do_call + /* Unreachable. */ + ud2 END(entry_SYSCALL_compat) -ia32_badarg: - /* - * So far, we've entered kernel mode, set AC, turned on IRQs, and - * saved C regs except r8-r11. We haven't done any of the other - * standard entry work, though. We want to bail, but we shouldn't - * treat this as a syscall entry since we don't even know what the - * args are. Instead, treat this as a non-syscall entry, finish - * the entry work, and immediately exit after setting AX = -EFAULT. - * - * We're really just being polite here. Killing the task outright - * would be a reasonable action, too. 
Given that the only valid - * way to have gotten here is through the vDSO, and we already know - * that the stack pointer is bad, the task isn't going to survive - * for long no matter what we do. - */ - - ASM_CLAC /* undo STAC */ - movq $-EFAULT, RAX(%rsp) /* return -EFAULT if possible */ - - /* Fill in the rest of pt_regs */ - xorl %eax, %eax - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - SAVE_EXTRA_REGS - - /* Turn IRQs back off. */ - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - - /* Now finish entering normal kernel mode. */ -#ifdef CONFIG_CONTEXT_TRACKING - call enter_from_user_mode -#endif - - /* And exit again. */ - jmp retint_user - -ia32_ret_from_sys_call_irqs_off: - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - -ia32_ret_from_sys_call: - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - jmp int_ret_from_sys_call - /* * Emulated IA32 system calls via int 0x80. * -- cgit v1.2.3 From 8169aff611956ed360e3313e8c718f530f58f6cb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:05 -0700 Subject: x86/entry/64/compat: Set up full pt_regs for all compat syscalls This is conceptually simpler. More importantly, it eliminates the PTREGSCALL and execve stubs, which were not compatible with the C ABI. This means that C code can call through the compat syscall table. The execve stubs are a bit subtle. They did two things: they cleared some registers and they forced slow-path return. Neither is necessary any more: elf_common_init clears the extra registers and start_thread calls force_iret(). Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/f95b7f7dfaacf88a8cae85bb06226cae53769287.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 6 +---- arch/x86/entry/entry_64_compat.S | 42 +++++++++++----------------------- arch/x86/entry/syscalls/syscall_32.tbl | 12 +++++----- 3 files changed, 20 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 7dc285036b6d..83ad95877570 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -391,20 +391,16 @@ GLOBAL(stub_execveat) jmp return_from_execve END(stub_execveat) -#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) +#if defined(CONFIG_X86_X32_ABI) .align 8 GLOBAL(stub_x32_execve) -GLOBAL(stub32_execve) call compat_sys_execve jmp return_from_execve -END(stub32_execve) END(stub_x32_execve) .align 8 GLOBAL(stub_x32_execveat) -GLOBAL(stub32_execveat) call compat_sys_execveat jmp return_from_execve -END(stub32_execveat) END(stub_x32_execveat) #endif diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 3216e6072312..2c2aac577b3c 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -219,12 +219,18 @@ ENTRY(entry_INT80_compat) pushq %rdx /* pt_regs->dx */ pushq %rcx /* pt_regs->cx */ pushq $-ENOSYS /* pt_regs->ax */ - pushq $0 /* pt_regs->r8 */ - pushq $0 /* pt_regs->r9 */ - pushq $0 /* pt_regs->r10 */ - pushq $0 /* pt_regs->r11 */ + xorq %r8,%r8 + pushq %r8 /* pt_regs->r8 = 0 */ + pushq %r8 /* pt_regs->r9 = 0 */ + pushq %r8 /* pt_regs->r10 = 0 */ + pushq %r8 /* pt_regs->r11 = 0 */ + pushq %rbx /* pt_regs->rbx */ + pushq %rbp /* pt_regs->rbp */ + pushq %r12 /* pt_regs->r12 */ + pushq %r13 /* pt_regs->r13 */ + pushq %r14 /* pt_regs->r14 */ + pushq %r15 /* pt_regs->r15 */ cld - sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) @@ -243,10 +249,10 @@ ia32_do_call: call *ia32_sys_call_table(, %rax, 8) movq %rax, RAX(%rsp) 1: + RESTORE_EXTRA_REGS jmp int_ret_from_sys_call ia32_tracesys: - SAVE_EXTRA_REGS movq %rsp, %rdi /* &pt_regs -> arg1 */ call syscall_trace_enter /* @@ -261,25 +267,11 @@ ia32_tracesys: movl RSI(%rsp), %esi movl RDI(%rsp), %edi movl %eax, %eax /* zero extension */ - RESTORE_EXTRA_REGS jmp ia32_do_call END(entry_INT80_compat) - .macro PTREGSCALL label, func - ALIGN -GLOBAL(\label) - leaq \func(%rip), %rax - jmp ia32_ptregs_common - .endm - - PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn - PTREGSCALL stub32_sigreturn, sys32_sigreturn - PTREGSCALL stub32_fork, sys_fork - PTREGSCALL stub32_vfork, sys_vfork - ALIGN GLOBAL(stub32_clone) - leaq sys_clone(%rip), %rax /* * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr). * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val). 
@@ -288,12 +280,4 @@ GLOBAL(stub32_clone) * so we need to swap arguments here before calling it: */ xchg %r8, %rcx - jmp ia32_ptregs_common - - ALIGN -ia32_ptregs_common: - SAVE_EXTRA_REGS 8 - call *%rax - RESTORE_EXTRA_REGS 8 - ret -END(ia32_ptregs_common) + jmp sys_clone diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 7663c455b9f6..caa2c712d1e7 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -8,7 +8,7 @@ # 0 i386 restart_syscall sys_restart_syscall 1 i386 exit sys_exit -2 i386 fork sys_fork stub32_fork +2 i386 fork sys_fork sys_fork 3 i386 read sys_read 4 i386 write sys_write 5 i386 open sys_open compat_sys_open @@ -17,7 +17,7 @@ 8 i386 creat sys_creat 9 i386 link sys_link 10 i386 unlink sys_unlink -11 i386 execve sys_execve stub32_execve +11 i386 execve sys_execve compat_sys_execve 12 i386 chdir sys_chdir 13 i386 time sys_time compat_sys_time 14 i386 mknod sys_mknod @@ -125,7 +125,7 @@ 116 i386 sysinfo sys_sysinfo compat_sys_sysinfo 117 i386 ipc sys_ipc compat_sys_ipc 118 i386 fsync sys_fsync -119 i386 sigreturn sys_sigreturn stub32_sigreturn +119 i386 sigreturn sys_sigreturn sys32_sigreturn 120 i386 clone sys_clone stub32_clone 121 i386 setdomainname sys_setdomainname 122 i386 uname sys_newuname @@ -179,7 +179,7 @@ 170 i386 setresgid sys_setresgid16 171 i386 getresgid sys_getresgid16 172 i386 prctl sys_prctl -173 i386 rt_sigreturn sys_rt_sigreturn stub32_rt_sigreturn +173 i386 rt_sigreturn sys_rt_sigreturn sys32_rt_sigreturn 174 i386 rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction 175 i386 rt_sigprocmask sys_rt_sigprocmask 176 i386 rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending @@ -196,7 +196,7 @@ 187 i386 sendfile sys_sendfile compat_sys_sendfile 188 i386 getpmsg 189 i386 putpmsg -190 i386 vfork sys_vfork stub32_vfork +190 i386 vfork sys_vfork sys_vfork 191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit 192 i386 mmap2 sys_mmap_pgoff 193 i386 truncate64 sys_truncate64 sys32_truncate64 @@ -364,7 +364,7 @@ 355 i386 getrandom sys_getrandom 356 i386 memfd_create sys_memfd_create 357 i386 bpf sys_bpf -358 i386 execveat sys_execveat stub32_execveat +358 i386 execveat sys_execveat compat_sys_execveat 359 i386 socket sys_socket 360 i386 socketpair sys_socketpair 361 i386 bind sys_bind -- cgit v1.2.3 From 034042cc1e2837a584cda0a5e4fc2b0a96b74543 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:06 -0700 Subject: x86/entry/syscalls: Move syscall table declarations into asm/syscalls.h The header was missing some compat declarations. Also make sys_call_ptr_t have a consistent type. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/3166aaff0fb43897998fcb6ef92991533f8c5c6c.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/syscall_32.c | 5 +---- arch/x86/include/asm/syscall.h | 12 +++++++++++- arch/x86/um/sys_call_table_32.c | 3 +-- arch/x86/um/sys_call_table_64.c | 3 +-- 4 files changed, 14 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 8ea34f94e973..429460d7721e 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -4,13 +4,12 @@ #include #include #include +#include #ifdef CONFIG_IA32_EMULATION #define SYM(sym, compat) compat #else #define SYM(sym, compat) sym -#define ia32_sys_call_table sys_call_table -#define __NR_syscall_compat_max __NR_syscall_max #endif #define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; @@ -19,8 +18,6 @@ #define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), -typedef asmlinkage void (*sys_call_ptr_t)(void); - extern asmlinkage void sys_ni_syscall(void); __visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = { diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index d6a756ae04c8..f3ff2ef36e19 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -20,9 +20,19 @@ #include /* for TS_COMPAT */ #include -typedef void (*sys_call_ptr_t)(void); +typedef asmlinkage void (*sys_call_ptr_t)(void); extern const sys_call_ptr_t sys_call_table[]; +#if defined(CONFIG_X86_32) +#define ia32_sys_call_table sys_call_table +#define __NR_syscall_compat_max __NR_syscall_max +#define IA32_NR_syscalls NR_syscalls +#endif + +#if defined(CONFIG_IA32_EMULATION) +extern const sys_call_ptr_t ia32_sys_call_table[]; +#endif + /* * Only the low 32 bits of orig_ax are meaningful, so we return int. * This importantly ignores the high bits on 64-bit, so comparisons diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index bd16d6c370ec..d738e9c96036 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -7,6 +7,7 @@ #include #include #include +#include #define __NO_STUBS @@ -30,8 +31,6 @@ #undef __SYSCALL_I386 #define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym, -typedef asmlinkage void (*sys_call_ptr_t)(void); - extern asmlinkage void sys_ni_syscall(void); const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = { diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index a75d8700472a..1ff9a21b2645 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -7,6 +7,7 @@ #include #include #include +#include #define __NO_STUBS @@ -43,8 +44,6 @@ #undef __SYSCALL_64 #define __SYSCALL_64(nr, sym, compat) [ nr ] = sym, -typedef void (*sys_call_ptr_t)(void); - extern void sys_ni_syscall(void); const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = { -- cgit v1.2.3 From eb974c62565072e10c1422eb3205f5b611dd99a1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:07 -0700 Subject: x86/syscalls: Give sys_call_ptr_t a useful type Syscalls are asmlinkage functions (on 32-bit kernels), take six args of type unsigned long, and return long. Note that uml could probably be slightly cleaned up on top of this patch. 
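As a rough userspace illustration (an editorial sketch, not part of the patch; demo_getpid, demo_table and the empty asmlinkage stand-in are invented names — in the kernel, asmlinkage forces regparm(0) on 32-bit), the value of the new type is that every table entry can be declared and dispatched through one uniform six-register-argument prototype, which is exactly how do_int80_syscall_32() earlier in this series calls through ia32_sys_call_table. A handler that needs fewer arguments simply ignores the rest:

#define asmlinkage /* regparm(0) in the 32-bit kernel; empty here */

#include <stdio.h>

typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long,
					  unsigned long, unsigned long,
					  unsigned long, unsigned long);

/* Invented handler: uses one argument, ignores the other five. */
static asmlinkage long demo_getpid(unsigned long a1, unsigned long a2,
				   unsigned long a3, unsigned long a4,
				   unsigned long a5, unsigned long a6)
{
	(void)a2; (void)a3; (void)a4; (void)a5; (void)a6;
	return (long)a1 + 41;
}

static const sys_call_ptr_t demo_table[] = { demo_getpid };

int main(void)
{
	/* Dispatch the way the kernel does: six zero-extended args. */
	long ret = demo_table[0](1, 0, 0, 0, 0, 0);

	printf("demo syscall returned %ld\n", ret);
	return ret != 42;
}

Calling through a pointer whose type matches how the table is actually invoked keeps the C dispatch well-defined, instead of casting through the old void (*)(void).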
Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/4d3ecc4a169388d47009175408b2961961744e6f.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/syscall_32.c | 4 ++-- arch/x86/entry/syscall_64.c | 4 ++-- arch/x86/include/asm/syscall.h | 4 +++- arch/x86/um/sys_call_table_32.c | 4 ++-- arch/x86/um/sys_call_table_64.c | 4 ++-- 5 files changed, 11 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 429460d7721e..9a6649857106 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -12,13 +12,13 @@ #define SYM(sym, compat) sym #endif -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long SYM(sym, compat)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_I386 #define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), -extern asmlinkage void sys_ni_syscall(void); +extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); __visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = { /* diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 4ac730b37f0b..41283d22be7a 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -14,13 +14,13 @@ # define __SYSCALL_X32(nr, sym, compat) /* nothing */ #endif -#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; +#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 #define __SYSCALL_64(nr, sym, compat) [nr] = sym, -extern void sys_ni_syscall(void); +extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index f3ff2ef36e19..999b7cd2e78c 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -20,7 +20,9 @@ #include /* for TS_COMPAT */ #include -typedef asmlinkage void (*sys_call_ptr_t)(void); +typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long, + unsigned long, unsigned long, + unsigned long, unsigned long); extern const sys_call_ptr_t sys_call_table[]; #if defined(CONFIG_X86_32) diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index d738e9c96036..439c0994b696 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -25,13 +25,13 @@ #define old_mmap sys_old_mmap -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_I386 #define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym, -extern asmlinkage void sys_ni_syscall(void); +extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); const sys_call_ptr_t 
sys_call_table[] ____cacheline_aligned = { /* diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 1ff9a21b2645..b74ea6c2c0e7 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -38,13 +38,13 @@ #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) #define __SYSCALL_X32(nr, sym, compat) /* Not supported */ -#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; +#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 #define __SYSCALL_64(nr, sym, compat) [ nr ] = sym, -extern void sys_ni_syscall(void); +extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = { /* -- cgit v1.2.3 From bd2d3a3ba67ac580f6e809aac36bf942f5447f91 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:08 -0700 Subject: x86/entry: Add do_syscall_32(), a C function to do 32-bit syscalls System calls are really quite simple. Add a helper to call a 32-bit system call. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/a77ed179834c27da436fb4a7fb23c8ee77abc11c.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index d94a60c16029..41d17508cf46 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -318,3 +318,46 @@ __visible void syscall_return_slowpath(struct pt_regs *regs) local_irq_disable(); prepare_exit_to_usermode(regs); } + +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) +/* + * Does a 32-bit syscall. Called with IRQs off and does all entry and + * exit work. + */ +__visible void do_int80_syscall_32(struct pt_regs *regs) +{ + struct thread_info *ti = pt_regs_to_thread_info(regs); + unsigned int nr = (unsigned int)regs->orig_ax; + +#ifdef CONFIG_IA32_EMULATION + ti->status |= TS_COMPAT; +#endif + + local_irq_enable(); + + if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) { + /* + * Subtlety here: if ptrace pokes something larger than + * 2^32-1 into orig_ax, this truncates it. This may or + * may not be necessary, but it matches the old asm + * behavior. + */ + nr = syscall_trace_enter(regs); + } + + if (nr < IA32_NR_syscalls) { + /* + * It's possible that a 32-bit syscall implementation + * takes a 64-bit parameter but nonetheless assumes that + * the high bits are zero. Make sure we zero-extend all + * of the args. + */ + regs->ax = ia32_sys_call_table[nr]( + (unsigned int)regs->bx, (unsigned int)regs->cx, + (unsigned int)regs->dx, (unsigned int)regs->si, + (unsigned int)regs->di, (unsigned int)regs->bp); + } + + syscall_return_slowpath(regs); +} +#endif -- cgit v1.2.3 From ee08c6bd315e70756ad2c47ee6ea708a4a882b55 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:09 -0700 Subject: x86/entry/64/compat: Migrate the body of the syscall entry to C Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/a2f0fce68feeba798a24339b5a7ec1ec2dd9eaf7.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 2 +- arch/x86/entry/entry_64_compat.S | 56 +++++++++++++--------------------------- 2 files changed, 19 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 83ad95877570..53616ca03244 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -582,7 +582,7 @@ retint_kernel: * At this label, code paths which return to kernel and to user, * which come from interrupts/exception and from syscalls, merge. */ -restore_regs_and_iret: +GLOBAL(restore_regs_and_iret) RESTORE_EXTRA_REGS restore_c_regs_and_iret: RESTORE_C_REGS diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 2c2aac577b3c..63ef9fa29002 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -202,14 +202,17 @@ END(entry_SYSCALL_compat) ENTRY(entry_INT80_compat) /* * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. */ PARAVIRT_ADJUST_EXCEPTION_FRAME SWAPGS - ENABLE_INTERRUPTS(CLBR_NONE) - /* Zero-extending 32-bit regs, do not remove */ + /* + * User tracing code (ptrace or signal handlers) might assume that + * the saved RAX contains a 32-bit number when we're invoking a 32-bit + * syscall. Just in case the high bits are nonzero, zero-extend + * the syscall number. (This could almost certainly be deleted + * with no ill effects.) + */ movl %eax, %eax /* Construct struct pt_regs on stack (iret frame is already on stack) */ @@ -232,42 +235,19 @@ ENTRY(entry_INT80_compat) pushq %r15 /* pt_regs->r15 */ cld - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_tracesys - -ia32_do_call: - /* 32-bit syscall -> 64-bit C ABI argument conversion */ - movl %edi, %r8d /* arg5 */ - movl %ebp, %r9d /* arg6 */ - xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ - movl %ebx, %edi /* arg1 */ - movl %edx, %edx /* arg3 (zero extension) */ - cmpq $(IA32_NR_syscalls-1), %rax - ja 1f - - call *ia32_sys_call_table(, %rax, 8) - movq %rax, RAX(%rsp) -1: - RESTORE_EXTRA_REGS - jmp int_ret_from_sys_call - -ia32_tracesys: - movq %rsp, %rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter /* - * Reload arg registers from stack in case ptrace changed them. - * Don't reload %eax because syscall_trace_enter() returned - * the %rax value we should see. But do truncate it to 32 bits. - * If it's -1 to make us punt the syscall, then (u32)-1 is still - * an appropriately invalid value. + * User mode is traced as though IRQs are on, and the interrupt + * gate turned them off. */ - movl RCX(%rsp), %ecx - movl RDX(%rsp), %edx - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ - jmp ia32_do_call + TRACE_IRQS_OFF + + movq %rsp, %rdi + call do_int80_syscall_32 + + /* Go back to user mode. */ + TRACE_IRQS_ON + SWAPGS + jmp restore_regs_and_iret END(entry_INT80_compat) ALIGN -- cgit v1.2.3 From 710246df58041106b7de645f4b45770f8a59a269 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:10 -0700 Subject: x86/entry: Add C code for fast system call entries This handles both SYSENTER and SYSCALL. 
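For orientation, a hedged sketch of the convention these fast paths are made to imitate (editorial, not from the patch): a 32-bit task traps with the syscall number in EAX and up to six arguments in EBX, ECX, EDX, ESI, EDI, EBP, and gets the result back in EAX. A minimal, self-contained userspace demonstration — build with gcc -m32 on x86 Linux; __NR_write is 4 in the 32-bit ABI:

int main(void)
{
	static const char msg[] = "hello from int $0x80\n";
	long ret;

	/* number in EAX; fd, buffer, count in EBX, ECX, EDX */
	__asm__ volatile ("int $0x80"
			  : "=a" (ret)
			  : "0" (4L),		/* __NR_write (32-bit ABI) */
			    "b" (1L),		/* fd = stdout             */
			    "c" (msg),		/* buffer                  */
			    "d" (sizeof(msg) - 1)
			  : "memory");

	return ret == (long)(sizeof(msg) - 1) ? 0 : 1;
}

SYSENTER and SYSCALL deliver that same register picture to the same C helper; each merely mangles a different subset of registers on the way in.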
The asm glue will take care of the differences. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/6041a58a9b8ef6d2522ab4350deb1a1945eb563f.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 41d17508cf46..1b2606edc621 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -24,6 +24,8 @@ #include #include +#include +#include #define CREATE_TRACE_POINTS #include @@ -360,4 +362,45 @@ __visible void do_int80_syscall_32(struct pt_regs *regs) syscall_return_slowpath(regs); } + +__visible void do_fast_syscall_32(struct pt_regs *regs) +{ + /* + * Called using the internal vDSO SYSENTER/SYSCALL32 calling + * convention. Adjust regs so it looks like we entered using int80. + */ + + unsigned long landing_pad = (unsigned long)current->mm->context.vdso + + vdso_image_32.sym_int80_landing_pad; + + /* + * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward + * so that 'regs->ip -= 2' lands back on an int $0x80 instruction. + * Fix it up. + */ + regs->ip = landing_pad; + + /* + * Fetch ECX from where the vDSO stashed it. + * + * WARNING: We are in CONTEXT_USER and RCU isn't paying attention! + */ + local_irq_enable(); + if (get_user(*(u32 *)&regs->cx, + (u32 __user __force *)(unsigned long)(u32)regs->sp)) { + /* User code screwed up. */ + local_irq_disable(); + regs->ax = -EFAULT; +#ifdef CONFIG_CONTEXT_TRACKING + enter_from_user_mode(); +#endif + prepare_exit_to_usermode(regs); + return; + } + local_irq_disable(); + + /* Now this is just like a normal syscall. */ + do_int80_syscall_32(regs); + return; +} #endif -- cgit v1.2.3 From a474e67c913d3ebaf02ba9d7835d5299d226c3ed Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:11 -0700 Subject: x86/vdso/compat: Wire up SYSENTER and SYSCALL for compat userspace What, you didn't realize that SYSENTER and SYSCALL were actually the same thing? :) Unlike the old code, this actually passes the ptrace_syscall_32 test on AMD systems. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/b74615af58d785aa02d917213ec64e2022a2c796.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 90 +++++++++++++++++++------------- arch/x86/entry/vdso/vdso32/system_call.S | 8 +++ 2 files changed, 62 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 63ef9fa29002..8f109de51d03 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -52,15 +52,18 @@ ENTRY(entry_SYSENTER_compat) SWAPGS_UNSAFE_STACK movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - /* Zero-extending 32-bit regs, do not remove */ - movl %ebp, %ebp + /* + * User tracing code (ptrace or signal handlers) might assume that + * the saved RAX contains a 32-bit number when we're invoking a 32-bit + * syscall. Just in case the high bits are nonzero, zero-extend + * the syscall number. 
(This could almost certainly be deleted + * with no ill effects.) + */ movl %eax, %eax - movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d - /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ - pushq %rbp /* pt_regs->sp */ + pushq %rcx /* pt_regs->sp */ /* * Push flags. This is nasty. First, interrupts are currently @@ -70,17 +73,28 @@ ENTRY(entry_SYSENTER_compat) */ pushfq /* pt_regs->flags (except IF = 0) */ orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ + ASM_CLAC /* Clear AC after saving FLAGS */ pushq $__USER32_CS /* pt_regs->cs */ - pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ + xorq %r8,%r8 + pushq %r8 /* pt_regs->ip = 0 (placeholder) */ pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx */ + pushq %rcx /* pt_regs->cx (will be overwritten) */ pushq $-ENOSYS /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 = 0 */ + pushq %r8 /* pt_regs->r9 = 0 */ + pushq %r8 /* pt_regs->r10 = 0 */ + pushq %r8 /* pt_regs->r11 = 0 */ + pushq %rbx /* pt_regs->rbx */ + pushq %rbp /* pt_regs->rbp */ + pushq %r8 /* pt_regs->r12 = 0 */ + pushq %r8 /* pt_regs->r13 = 0 */ + pushq %r8 /* pt_regs->r14 = 0 */ + pushq %r8 /* pt_regs->r15 = 0 */ cld - sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ /* * Sysenter doesn't filter flags, so we need to clear NT @@ -93,16 +107,15 @@ ENTRY(entry_SYSENTER_compat) jnz sysenter_fix_flags sysenter_flags_fixed: - /* Temporary: SYSENTER is disabled. */ -#ifdef CONFIG_CONTEXT_TRACKING - call enter_from_user_mode -#endif - ENABLE_INTERRUPTS(CLBR_NONE) - movl $11, %edi - call do_exit + /* + * User mode is traced as though IRQs are on, and SYSENTER + * turned them off. + */ + TRACE_IRQS_OFF - /* Unreachable. */ - ud2 + movq %rsp, %rdi + call do_fast_syscall_32 + jmp .Lsyscall_32_done sysenter_fix_flags: pushq $X86_EFLAGS_FIXED @@ -135,26 +148,14 @@ ENDPROC(entry_SYSENTER_compat) * edi arg5 * esp user stack * 0(%esp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. */ ENTRY(entry_SYSCALL_compat) - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ + /* Interrupts are off on entry. */ SWAPGS_UNSAFE_STACK - /* Temporary: SYSCALL32 is disabled. */ - movl $-ENOSYS, %eax - USERGS_SYSRET32 - + /* Stash user ESP and switch to the kernel stack. */ movl %esp, %r8d movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - ENABLE_INTERRUPTS(CLBR_NONE) /* Zero-extending 32-bit regs, do not remove */ movl %eax, %eax @@ -169,13 +170,29 @@ ENTRY(entry_SYSCALL_compat) pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ pushq %rdx /* pt_regs->dx */ - pushq %rbp /* pt_regs->cx */ - movl %ebp, %ecx + pushq %rcx /* pt_regs->cx (will be overwritten) */ pushq $-ENOSYS /* pt_regs->ax */ - sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ + xorq %r8,%r8 + pushq %r8 /* pt_regs->r8 = 0 */ + pushq %r8 /* pt_regs->r9 = 0 */ + pushq %r8 /* pt_regs->r10 = 0 */ + pushq %r8 /* pt_regs->r11 = 0 */ + pushq %rbx /* pt_regs->rbx */ + pushq %rbp /* pt_regs->rbp */ + pushq %r8 /* pt_regs->r12 = 0 */ + pushq %r8 /* pt_regs->r13 = 0 */ + pushq %r8 /* pt_regs->r14 = 0 */ + pushq %r8 /* pt_regs->r15 = 0 */ - /* Unreachable. 
*/ - ud2 + /* + * User mode is traced as though IRQs are on, and SYSENTER + * turned them off. + */ + TRACE_IRQS_OFF + + movq %rsp, %rdi + call do_fast_syscall_32 + jmp .Lsyscall_32_done END(entry_SYSCALL_compat) /* @@ -243,6 +260,7 @@ ENTRY(entry_INT80_compat) movq %rsp, %rdi call do_int80_syscall_32 +.Lsyscall_32_done: /* Go back to user mode. */ TRACE_IRQS_ON diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index d591fe93e93a..00157cae71e0 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -3,6 +3,8 @@ */ #include +#include +#include /* * First get the common code for the sigreturn entry points. @@ -28,6 +30,12 @@ __kernel_vsyscall: CFI_REL_OFFSET ecx, 0 movl %esp, %ecx +#ifdef CONFIG_X86_64 + /* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */ + ALTERNATIVE_2 "", "sysenter", X86_FEATURE_SYSENTER32, \ + "syscall", X86_FEATURE_SYSCALL32 +#endif + /* Enter using int $0x80 */ movl (%esp), %ecx int $0x80 -- cgit v1.2.3 From 7841b408717d4c3b1b334c8f1fef7f18c98cd2bd Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:12 -0700 Subject: x86/entry/compat: Implement opportunistic SYSRETL for compat syscalls If CS, SS and IP are as expected and FLAGS is compatible with SYSRETL, then return from fast compat syscalls (both SYSCALL and SYSENTER) using SYSRETL. Unlike native 64-bit opportunistic SYSRET, this is not invisible to user code: RCX and R8-R15 end up in a different state than shown saved in pt_regs. To compensate, we only do this when returning to the vDSO fast syscall return path. This won't interfere with syscall restart, as we won't use SYSRETL when returning to the INT80 restart instruction. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/aa15e49db33773eb10b73d73466b6d5466d7856a.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 23 +++++++++++++++++++--- arch/x86/entry/entry_64_compat.S | 42 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 60 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 1b2606edc621..88dc5ba14d47 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -363,7 +363,8 @@ __visible void do_int80_syscall_32(struct pt_regs *regs) syscall_return_slowpath(regs); } -__visible void do_fast_syscall_32(struct pt_regs *regs) +/* Returns 0 to return using IRET or 1 to return using SYSRETL. */ +__visible long do_fast_syscall_32(struct pt_regs *regs) { /* * Called using the internal vDSO SYSENTER/SYSCALL32 calling @@ -395,12 +396,28 @@ __visible void do_fast_syscall_32(struct pt_regs *regs) enter_from_user_mode(); #endif prepare_exit_to_usermode(regs); - return; + return 0; /* Keep it simple: use IRET. */ } local_irq_disable(); /* Now this is just like a normal syscall. */ do_int80_syscall_32(regs); - return; + +#ifdef CONFIG_X86_64 + /* + * Opportunistic SYSRETL: if possible, try to return using SYSRETL. + * SYSRETL is available on all 64-bit CPUs, so we don't need to + * bother with SYSEXIT. + * + * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP, + * because the ECX fixup above will ensure that this is essentially + * never the case. 
+ */ + return regs->cs == __USER32_CS && regs->ss == __USER_DS && + regs->ip == landing_pad && + (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0; +#else + return 0; +#endif } #endif diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 8f109de51d03..cf9641cd4796 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -115,7 +115,9 @@ sysenter_flags_fixed: movq %rsp, %rdi call do_fast_syscall_32 - jmp .Lsyscall_32_done + testl %eax, %eax + jz .Lsyscall_32_done + jmp sysret32_from_system_call sysenter_fix_flags: pushq $X86_EFLAGS_FIXED @@ -192,7 +194,43 @@ ENTRY(entry_SYSCALL_compat) movq %rsp, %rdi call do_fast_syscall_32 - jmp .Lsyscall_32_done + testl %eax, %eax + jz .Lsyscall_32_done + + /* Opportunistic SYSRET */ +sysret32_from_system_call: + TRACE_IRQS_ON /* User mode traces as IRQs on. */ + movq RBX(%rsp), %rbx /* pt_regs->rbx */ + movq RBP(%rsp), %rbp /* pt_regs->rbp */ + movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */ + movq RIP(%rsp), %rcx /* pt_regs->ip (in rcx) */ + addq $RAX, %rsp /* Skip r8-r15 */ + popq %rax /* pt_regs->rax */ + popq %rdx /* Skip pt_regs->cx */ + popq %rdx /* pt_regs->dx */ + popq %rsi /* pt_regs->si */ + popq %rdi /* pt_regs->di */ + + /* + * USERGS_SYSRET32 does: + * GSBASE = user's GS base + * EIP = ECX + * RFLAGS = R11 + * CS = __USER32_CS + * SS = __USER_DS + * + * ECX will not match pt_regs->cx, but we're returning to a vDSO + * trampoline that will fix up RCX, so this is okay. + * + * R12-R15 are callee-saved, so they contain whatever was in them + * when the system call started, which is already known to user + * code. We zero R8-R10 to avoid info leaks. + */ + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r10 + movq RSP-ORIG_RAX(%rsp), %rsp + USERGS_SYSRET32 END(entry_SYSCALL_compat) /* -- cgit v1.2.3 From 39e8701f33d65c7f51d749a5d12a1379065e0926 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:13 -0700 Subject: x86/entry/32: Open-code return tracking from fork and kthreads syscall_exit is going away, and return tracing is just a function call now, so open-code the two non-syscall 32-bit users. While we're at it, update the big register layout comment. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/a6b3c472fda7cda0e368c3ccd553dea7447dfdd2.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index a08ded481aba..36b6beb181f6 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -3,7 +3,7 @@ * * entry_32.S contains the system-call and low-level fault and trap handling routines. * - * Stack layout in 'syscall_exit': + * Stack layout while running C code: * ptrace needs to have all registers on the stack. * If the order here is changed, it needs to be * updated in fork.c:copy_process(), signal.c:do_signal(), @@ -211,7 +211,11 @@ ENTRY(ret_from_fork) popl %eax pushl $0x0202 # Reset kernel eflags popfl - jmp syscall_exit + + /* When we fork, we trace the syscall return in the child, too. 
*/ + movl %esp, %eax + call syscall_return_slowpath + jmp restore_all END(ret_from_fork) ENTRY(ret_from_kernel_thread) @@ -224,7 +228,15 @@ ENTRY(ret_from_kernel_thread) movl PT_EBP(%esp), %eax call *PT_EBX(%esp) movl $0, PT_EAX(%esp) - jmp syscall_exit + + /* + * Kernel threads return to userspace as if returning from a syscall. + * We should check whether anything actually uses this path and, if so, + * consider switching it over to ret_from_fork. + */ + movl %esp, %eax + call syscall_return_slowpath + jmp restore_all ENDPROC(ret_from_kernel_thread) /* -- cgit v1.2.3 From 150ac78d63afb96360dab448b7b4d33c98c8266c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:14 -0700 Subject: x86/entry/32: Switch INT80 to the new C syscall path Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/a7e8d8df96838eae3208dd0441023f3ce7a81831.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 36b6beb181f6..02881e528945 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -153,13 +153,13 @@ #endif /* CONFIG_X86_32_LAZY_GS */ -.macro SAVE_ALL +.macro SAVE_ALL pt_regs_ax=%eax cld PUSH_GS pushl %fs pushl %es pushl %ds - pushl %eax + pushl \pt_regs_ax pushl %ebp pushl %edi pushl %esi @@ -370,20 +370,17 @@ ENDPROC(entry_SYSENTER_32) # system call handler stub ENTRY(entry_INT80_32) ASM_CLAC - pushl %eax # save orig_eax - SAVE_ALL - GET_THREAD_INFO(%ebp) - # system call tracing in operation / emulation - testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp) - jnz syscall_trace_entry - cmpl $(NR_syscalls), %eax - jae syscall_badsys -syscall_call: - call *sys_call_table(, %eax, 4) -syscall_after_call: - movl %eax, PT_EAX(%esp) # store the return value -syscall_exit: - jmp syscall_exit_work + pushl %eax /* pt_regs->orig_ax */ + SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest, load -ENOSYS into ax */ + + /* + * User mode is traced as though IRQs are on, and the interrupt gate + * turned them off. + */ + TRACE_IRQS_OFF + + movl %esp, %eax + call do_int80_syscall_32 restore_all: TRACE_IRQS_IRET @@ -491,11 +488,6 @@ syscall_fault: jmp resume_userspace END(syscall_fault) -syscall_badsys: - movl $-ENOSYS, %eax - jmp syscall_after_call -END(syscall_badsys) - sysenter_badsys: movl $-ENOSYS, %eax jmp sysenter_after_call -- cgit v1.2.3 From 5f310f739b4cc343f3f087681e41bbc2f0ce902d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:15 -0700 Subject: x86/entry/32: Re-implement SYSENTER using the new C path Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/5b99659e8be70f3dd10cd8970a5c90293d9ad9a7.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 17 +++- arch/x86/entry/entry_32.S | 132 ++++++++----------------------- arch/x86/entry/vdso/vdso32/system_call.S | 2 + 3 files changed, 51 insertions(+), 100 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 88dc5ba14d47..0ed023d6a983 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -363,7 +363,7 @@ __visible void do_int80_syscall_32(struct pt_regs *regs) syscall_return_slowpath(regs); } -/* Returns 0 to return using IRET or 1 to return using SYSRETL. */ +/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ __visible long do_fast_syscall_32(struct pt_regs *regs) { /* @@ -417,7 +417,20 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) regs->ip == landing_pad && (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0; #else - return 0; + /* + * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT. + * + * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP, + * because the ECX fixup above will ensure that this is essentially + * never the case. + * + * We don't allow syscalls at all from VM86 mode, but we still + * need to check VM, because we might be returning from sys_vm86. + */ + return static_cpu_has(X86_FEATURE_SEP) && + regs->cs == __USER_CS && regs->ss == __USER_DS && + regs->ip == landing_pad && + (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0; #endif } #endif diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 02881e528945..c1c7c6364216 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -287,76 +287,47 @@ need_resched: END(resume_kernel) #endif -/* - * SYSENTER_RETURN points to after the SYSENTER instruction - * in the vsyscall page. See vsyscall-sysentry.S, which defines - * the symbol. - */ - # SYSENTER call handler stub ENTRY(entry_SYSENTER_32) movl TSS_sysenter_sp0(%esp), %esp sysenter_past_esp: + pushl $__USER_DS /* pt_regs->ss */ + pushl %ecx /* pt_regs->cx */ + pushfl /* pt_regs->flags (except IF = 0) */ + orl $X86_EFLAGS_IF, (%esp) /* Fix IF */ + pushl $__USER_CS /* pt_regs->cs */ + pushl $0 /* pt_regs->ip = 0 (placeholder) */ + pushl %eax /* pt_regs->orig_ax */ + SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ + /* - * Interrupts are disabled here, but we can't trace it until - * enough kernel state to call TRACE_IRQS_OFF can be called - but - * we immediately enable interrupts at that point anyway. - */ - pushl $__USER_DS - pushl %ebp - pushfl - orl $X86_EFLAGS_IF, (%esp) - pushl $__USER_CS - /* - * Push current_thread_info()->sysenter_return to the stack. - * A tiny bit of offset fixup is necessary: TI_sysenter_return - * is relative to thread_info, which is at the bottom of the - * kernel stack page. 4*4 means the 4 words pushed above; - * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; - * and THREAD_SIZE takes us to the bottom. + * User mode is traced as though IRQs are on, and SYSENTER + * turned them off. */ - pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) - - pushl %eax - SAVE_ALL - ENABLE_INTERRUPTS(CLBR_NONE) - -/* - * Load the potential sixth argument from user stack. - * Careful about security. 
- */ - cmpl $__PAGE_OFFSET-3, %ebp - jae syscall_fault - ASM_STAC -1: movl (%ebp), %ebp - ASM_CLAC - movl %ebp, PT_EBP(%esp) - _ASM_EXTABLE(1b, syscall_fault) - - GET_THREAD_INFO(%ebp) - - testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp) - jnz syscall_trace_entry -sysenter_do_call: - cmpl $(NR_syscalls), %eax - jae sysenter_badsys - call *sys_call_table(, %eax, 4) -sysenter_after_call: - movl %eax, PT_EAX(%esp) - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $_TIF_ALLWORK_MASK, %ecx - jnz syscall_exit_work_irqs_off -sysenter_exit: -/* if something modifies registers it must also disable sysexit */ - movl PT_EIP(%esp), %edx - movl PT_OLDESP(%esp), %ecx - xorl %ebp, %ebp - TRACE_IRQS_ON + + movl %esp, %eax + call do_fast_syscall_32 + testl %eax, %eax + jz .Lsyscall_32_done + +/* Opportunistic SYSEXIT */ + TRACE_IRQS_ON /* User mode traces as IRQs on. */ + movl PT_EIP(%esp), %edx /* pt_regs->ip */ + movl PT_OLDESP(%esp), %ecx /* pt_regs->sp */ + popl %ebx /* pt_regs->bx */ + addl $2*4, %esp /* skip pt_regs->cx and pt_regs->dx */ + popl %esi /* pt_regs->si */ + popl %edi /* pt_regs->di */ + popl %ebp /* pt_regs->bp */ + popl %eax /* pt_regs->ax */ 1: mov PT_FS(%esp), %fs PTGS_TO_GS + + /* + * Return back to the vDSO, which will pop ecx and edx. + * Don't bother with DS and ES (they already contain __USER_DS). + */ ENABLE_INTERRUPTS_SYSEXIT .pushsection .fixup, "ax" @@ -371,7 +342,7 @@ ENDPROC(entry_SYSENTER_32) ENTRY(entry_INT80_32) ASM_CLAC pushl %eax /* pt_regs->orig_ax */ - SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest, load -ENOSYS into ax */ + SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ /* * User mode is traced as though IRQs are on, and the interrupt gate @@ -381,6 +352,7 @@ ENTRY(entry_INT80_32) movl %esp, %eax call do_int80_syscall_32 +.Lsyscall_32_done: restore_all: TRACE_IRQS_IRET @@ -457,42 +429,6 @@ ldt_ss: #endif ENDPROC(entry_INT80_32) - # perform syscall exit tracing - ALIGN -syscall_trace_entry: - movl $-ENOSYS, PT_EAX(%esp) - movl %esp, %eax - call syscall_trace_enter - /* What it returned is what we'll actually use. */ - cmpl $(NR_syscalls), %eax - jnae syscall_call - jmp syscall_exit -END(syscall_trace_entry) - - # perform syscall exit tracing - ALIGN -syscall_exit_work_irqs_off: - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) - -syscall_exit_work: - movl %esp, %eax - call syscall_return_slowpath - jmp restore_all -END(syscall_exit_work) - -syscall_fault: - ASM_CLAC - GET_THREAD_INFO(%ebp) - movl $-EFAULT, PT_EAX(%esp) - jmp resume_userspace -END(syscall_fault) - -sysenter_badsys: - movl $-ENOSYS, %eax - jmp sysenter_after_call -END(sysenter_badsys) - .macro FIXUP_ESPFIX_STACK /* * Switch back for ESPFIX stack to the normal zerobased stack diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index 00157cae71e0..93bd8452383f 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -34,6 +34,8 @@ __kernel_vsyscall: /* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */ ALTERNATIVE_2 "", "sysenter", X86_FEATURE_SYSENTER32, \ "syscall", X86_FEATURE_SYSCALL32 +#else + ALTERNATIVE "", "sysenter", X86_FEATURE_SEP #endif /* Enter using int $0x80 */ -- cgit v1.2.3 From 487e3bf4f77699160aa81a414200060a78a67c3d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:16 -0700 Subject: x86/asm: Remove thread_info.sysenter_return It's no longer needed. 
We could reinstate something like it as an optimization, which would remove two cachelines from the fast syscall entry working set. I benchmarked it, and it makes no difference whatsoever to the performance of cache-hot compat syscalls on Sandy Bridge. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/f08cc0cff30201afe9bb565c47134c0a6c1a96a2.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/thread_info.h | 1 - arch/x86/kernel/asm-offsets.c | 3 --- 2 files changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 8afdc3e44247..a1ecd214d227 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -59,7 +59,6 @@ struct thread_info { __u32 cpu; /* current CPU */ int saved_preempt_count; mm_segment_t addr_limit; - void __user *sysenter_return; unsigned int sig_on_uaccess_error:1; unsigned int uaccess_err:1; /* uaccess failed */ }; diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 8e3d22a1af94..95a18e25d5bf 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -53,9 +53,6 @@ void common(void) { OFFSET(IA32_SIGCONTEXT_sp, sigcontext_ia32, sp); OFFSET(IA32_SIGCONTEXT_ip, sigcontext_ia32, ip); - BLANK(); - OFFSET(TI_sysenter_return, thread_info, sysenter_return); - BLANK(); OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); #endif -- cgit v1.2.3 From 8b13c2552ffc8e54e57598df36707183933e8e8c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:17 -0700 Subject: x86/entry: Remove unnecessary IRQ twiddling in fast 32-bit syscalls This is slightly messy, but it eliminates an unnecessary cli;sti pair. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/22f34b1096694a37326f36c53407b8dd90f37948.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 0ed023d6a983..0d1c842ef3ca 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -323,10 +323,10 @@ __visible void syscall_return_slowpath(struct pt_regs *regs) #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) /* - * Does a 32-bit syscall. Called with IRQs off and does all entry and - * exit work. + * Does a 32-bit syscall. Called with IRQs on and does all entry and + * exit work and returns with IRQs off. 
*/ -__visible void do_int80_syscall_32(struct pt_regs *regs) +static void do_syscall_32_irqs_on(struct pt_regs *regs) { struct thread_info *ti = pt_regs_to_thread_info(regs); unsigned int nr = (unsigned int)regs->orig_ax; @@ -335,8 +335,6 @@ __visible void do_int80_syscall_32(struct pt_regs *regs) ti->status |= TS_COMPAT; #endif - local_irq_enable(); - if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) { /* * Subtlety here: if ptrace pokes something larger than @@ -363,6 +361,13 @@ __visible void do_int80_syscall_32(struct pt_regs *regs) syscall_return_slowpath(regs); } +/* Handles int $0x80 */ +__visible void do_int80_syscall_32(struct pt_regs *regs) +{ + local_irq_enable(); + do_syscall_32_irqs_on(regs); +} + /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ __visible long do_fast_syscall_32(struct pt_regs *regs) { @@ -398,10 +403,9 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) prepare_exit_to_usermode(regs); return 0; /* Keep it simple: use IRET. */ } - local_irq_disable(); /* Now this is just like a normal syscall. */ - do_int80_syscall_32(regs); + do_syscall_32_irqs_on(regs); #ifdef CONFIG_X86_64 /* -- cgit v1.2.3 From 460d12453e1afe20416ce9536cfecb31d17a9abd Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:18 -0700 Subject: x86/entry: Make irqs_disabled checks in exit code depend on lockdep These checks are quite slow. Disable them in non-lockdep kernels to reduce the performance hit. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/eccff2a154ae6fb50f40228901003a6e9c24f3d0.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 0d1c842ef3ca..03aacd188458 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -219,7 +219,7 @@ static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs) /* Called with IRQs disabled. */ __visible void prepare_exit_to_usermode(struct pt_regs *regs) { - if (WARN_ON(!irqs_disabled())) + if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) local_irq_disable(); lockdep_sys_exit(); @@ -281,8 +281,8 @@ __visible void syscall_return_slowpath(struct pt_regs *regs) CT_WARN_ON(ct_state() != CONTEXT_KERNEL); - if (WARN(irqs_disabled(), "syscall %ld left IRQs disabled", - regs->orig_ax)) + if (IS_ENABLED(CONFIG_PROVE_LOCKING) && + WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax)) local_irq_enable(); /* -- cgit v1.2.3 From 33c52129f45e06d9ce23e1a3d50bf9fd6770748b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:19 -0700 Subject: x86/entry: Force inlining of 32-bit syscall code On systems that support fast syscalls, we only really care about the performance of the fast syscall path. Forcibly inline it and add a likely annotation. This saves 4-6 cycles. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/8472036ff1f4b426b4c4c3e3d0b3bf5264407c0c.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 03aacd188458..d5eee851071c 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -324,9 +324,11 @@ __visible void syscall_return_slowpath(struct pt_regs *regs) #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) /* * Does a 32-bit syscall. Called with IRQs on and does all entry and - * exit work and returns with IRQs off. + * exit work and returns with IRQs off. This function is extremely hot + * in workloads that use it, and it's usually called from + * do_fast_syscall_32, so forcibly inline it to improve performance. */ -static void do_syscall_32_irqs_on(struct pt_regs *regs) +static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) { struct thread_info *ti = pt_regs_to_thread_info(regs); unsigned int nr = (unsigned int)regs->orig_ax; @@ -345,7 +347,7 @@ static void do_syscall_32_irqs_on(struct pt_regs *regs) nr = syscall_trace_enter(regs); } - if (nr < IA32_NR_syscalls) { + if (likely(nr < IA32_NR_syscalls)) { /* * It's possible that a 32-bit syscall implementation * takes a 64-bit parameter but nonetheless assumes that -- cgit v1.2.3 From c68ca6787bdd6d2df37cf950135aa11e71af358a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:20 -0700 Subject: x86/entry: Micro-optimize compat fast syscall arg fetch We're following a 32-bit pointer, and the uaccess code isn't smart enough to figure out that the access_ok() check isn't needed. This saves about three cycles on a cache-hot fast syscall. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/bdff034e2f23c5eb974c760cf494cb5bddce8f29.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index d5eee851071c..08a945d7915e 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -394,8 +394,20 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) * WARNING: We are in CONTEXT_USER and RCU isn't paying attention! */ local_irq_enable(); - if (get_user(*(u32 *)®s->cx, - (u32 __user __force *)(unsigned long)(u32)regs->sp)) { + if ( +#ifdef CONFIG_X86_64 + /* + * Micro-optimization: the pointer we're following is explicitly + * 32 bits, so it can't be out of range. + */ + __get_user(*(u32 *)®s->cx, + (u32 __user __force *)(unsigned long)(u32)regs->sp) +#else + get_user(*(u32 *)®s->cx, + (u32 __user __force *)(unsigned long)(u32)regs->sp) +#endif + ) { + /* User code screwed up. */ local_irq_disable(); regs->ax = -EFAULT; -- cgit v1.2.3 From 4aabd140f9cbe0361401a1368bac74df1010abf5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:21 -0700 Subject: x86/entry: Hide two syscall entry assertions behind CONFIG_DEBUG_ENTRY This shaves a few cycles off the slow paths. 
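For reference, IS_ENABLED() expands to a compile-time constant 1 or 0, so with CONFIG_DEBUG_ENTRY off the compiler sees "if (0) BUG_ON(...)" and discards the whole assertion, test included. A minimal sketch of the pattern as it is applied in the hunks below:

	/* Debug-only sanity check: compiled out entirely when CONFIG_DEBUG_ENTRY=n */
	if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
		BUG_ON(regs != task_pt_regs(current));

The assertion keeps its documentation value while production builds no longer pay for the task_pt_regs() computation and compare on the tracing slow paths.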
Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/ce383fa9e129286ce6da6e00b53acd4c9fb5d06a.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 08a945d7915e..778ca70c22dd 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -71,7 +71,8 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) unsigned long ret = 0; u32 work; - BUG_ON(regs != task_pt_regs(current)); + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + BUG_ON(regs != task_pt_regs(current)); work = ACCESS_ONCE(current_thread_info()->flags) & _TIF_WORK_SYSCALL_ENTRY; @@ -160,7 +161,8 @@ long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, u32 work = ACCESS_ONCE(current_thread_info()->flags) & _TIF_WORK_SYSCALL_ENTRY; - BUG_ON(regs != task_pt_regs(current)); + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + BUG_ON(regs != task_pt_regs(current)); /* * If we stepped into a sysenter/syscall insn, it trapped in -- cgit v1.2.3 From dd636071c3d8044c802b7a365e9934724a929530 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:22 -0700 Subject: x86/entry: Use pt_regs_to_thread_info() in syscall entry tracing It generates simpler and faster code than current_thread_info(). Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/a3b6633e7dcb9f673c1b619afae602d29d27d2cf.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 778ca70c22dd..d0874210d5b5 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -30,6 +30,13 @@ #define CREATE_TRACE_POINTS #include +static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs) +{ + unsigned long top_of_stack = + (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING; + return (struct thread_info *)(top_of_stack - THREAD_SIZE); +} + #ifdef CONFIG_CONTEXT_TRACKING /* Called on entry from user mode with IRQs off. 
*/ __visible void enter_from_user_mode(void) @@ -68,14 +75,14 @@ static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) */ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) { + struct thread_info *ti = pt_regs_to_thread_info(regs); unsigned long ret = 0; u32 work; if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) BUG_ON(regs != task_pt_regs(current)); - work = ACCESS_ONCE(current_thread_info()->flags) & - _TIF_WORK_SYSCALL_ENTRY; + work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; #ifdef CONFIG_CONTEXT_TRACKING /* @@ -157,9 +164,9 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, unsigned long phase1_result) { + struct thread_info *ti = pt_regs_to_thread_info(regs); long ret = 0; - u32 work = ACCESS_ONCE(current_thread_info()->flags) & - _TIF_WORK_SYSCALL_ENTRY; + u32 work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) BUG_ON(regs != task_pt_regs(current)); @@ -211,13 +218,6 @@ long syscall_trace_enter(struct pt_regs *regs) return syscall_trace_enter_phase2(regs, arch, phase1_result); } -static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs) -{ - unsigned long top_of_stack = - (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING; - return (struct thread_info *)(top_of_stack - THREAD_SIZE); -} - /* Called with IRQs disabled. */ __visible void prepare_exit_to_usermode(struct pt_regs *regs) { -- cgit v1.2.3 From 39b48e575e92e31251b74b4b48cea2129cee90bd Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:23 -0700 Subject: x86/entry: Split and inline prepare_exit_to_usermode() GCC is unable to properly optimize functions that have a very short likely case and a longer and register-heavier cold part -- it fails to sink all of the register saving and stack frame setup code into the unlikely part. Help it out with prepare_exit_to_usermode() by splitting it into two parts and inline the hot part. Saves 6-8 cycles for compat syscalls. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/9fc53eda4a5b924070952f12fa4ae3e477640a07.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index d0874210d5b5..66ccbd664d4c 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -218,14 +218,12 @@ long syscall_trace_enter(struct pt_regs *regs) return syscall_trace_enter_phase2(regs, arch, phase1_result); } -/* Called with IRQs disabled. 
*/ -__visible void prepare_exit_to_usermode(struct pt_regs *regs) -{ - if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) - local_irq_disable(); - - lockdep_sys_exit(); +#define EXIT_TO_USERMODE_LOOP_FLAGS \ + (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ + _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY) +static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) +{ /* * In order to return to user mode, we need to have IRQs off with * none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY, @@ -235,14 +233,6 @@ __visible void prepare_exit_to_usermode(struct pt_regs *regs) * work to clear some of the flags can sleep. */ while (true) { - u32 cached_flags = - READ_ONCE(pt_regs_to_thread_info(regs)->flags); - - if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | - _TIF_UPROBE | _TIF_NEED_RESCHED | - _TIF_USER_RETURN_NOTIFY))) - break; - /* We have work to do. */ local_irq_enable(); @@ -266,7 +256,30 @@ __visible void prepare_exit_to_usermode(struct pt_regs *regs) /* Disable IRQs and retry */ local_irq_disable(); + + cached_flags = READ_ONCE(pt_regs_to_thread_info(regs)->flags); + + if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) + break; + } +} + +/* Called with IRQs disabled. */ +__visible inline void prepare_exit_to_usermode(struct pt_regs *regs) +{ + u32 cached_flags; + + if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) + local_irq_disable(); + + lockdep_sys_exit(); + + cached_flags = + READ_ONCE(pt_regs_to_thread_info(regs)->flags); + + if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) + exit_to_usermode_loop(regs, cached_flags); user_enter(); } -- cgit v1.2.3 From f5e6a9753ac2965564a14e6285a06f44043ed9c8 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:24 -0700 Subject: x86/entry: Split and inline syscall_return_slowpath() GCC is unable to properly optimize functions that have a very short likely case and a longer and register-heavier cold part -- it fails to sink all of the register saving and stack frame setup code into the unlikely part. Help it out with syscall_return_slowpath() by splitting it into two parts and inline the hot part. Saves 6 cycles for compat syscalls. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/0f773a894ab15c589ac794c2d34ca6ba9b5335c9.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 50 ++++++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 66ccbd664d4c..b53e04d301a3 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -284,15 +284,40 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) user_enter(); } +#define SYSCALL_EXIT_WORK_FLAGS \ + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) + +static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags) +{ + bool step; + + audit_syscall_exit(regs); + + if (cached_flags & _TIF_SYSCALL_TRACEPOINT) + trace_sys_exit(regs, regs->ax); + + /* + * If TIF_SYSCALL_EMU is set, we only get here because of + * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). + * We already reported this syscall instruction in + * syscall_trace_enter(). 
+ */ + step = unlikely( + (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)) + == _TIF_SINGLESTEP); + if (step || cached_flags & _TIF_SYSCALL_TRACE) + tracehook_report_syscall_exit(regs, step); +} + /* * Called with IRQs on and fully valid regs. Returns with IRQs off in a * state such that we can immediately switch to user mode. */ -__visible void syscall_return_slowpath(struct pt_regs *regs) +__visible inline void syscall_return_slowpath(struct pt_regs *regs) { struct thread_info *ti = pt_regs_to_thread_info(regs); u32 cached_flags = READ_ONCE(ti->flags); - bool step; CT_WARN_ON(ct_state() != CONTEXT_KERNEL); @@ -304,25 +329,8 @@ __visible void syscall_return_slowpath(struct pt_regs *regs) * First do one-time work. If these work items are enabled, we * want to run them exactly once per syscall exit with IRQs on. */ - if (cached_flags & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | - _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)) { - audit_syscall_exit(regs); - - if (cached_flags & _TIF_SYSCALL_TRACEPOINT) - trace_sys_exit(regs, regs->ax); - - /* - * If TIF_SYSCALL_EMU is set, we only get here because of - * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). - * We already reported this syscall instruction in - * syscall_trace_enter(). - */ - step = unlikely( - (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)) - == _TIF_SINGLESTEP); - if (step || cached_flags & _TIF_SYSCALL_TRACE) - tracehook_report_syscall_exit(regs, step); - } + if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS)) + syscall_slow_exit_work(regs, cached_flags); #ifdef CONFIG_COMPAT /* -- cgit v1.2.3 From c9cdaeb2027e535b956ff69f215522d79f6b54e3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 17 Sep 2015 16:27:57 -0400 Subject: x86, mm: quiet arch_add_memory() Switch to pr_debug() so that dynamic-debug can disable these messages by default. This gets noisy in the presence of devm_memremap_pages(). 
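Worth noting: printk(KERN_DEBUG ...) always emits at debug loglevel, while pr_debug() compiles to nothing unless DEBUG or CONFIG_DYNAMIC_DEBUG is set; with dynamic debug the messages default to off and can be re-enabled per source file at runtime. An illustrative session (assumes CONFIG_DYNAMIC_DEBUG=y and debugfs mounted at the usual path):

	# turn the quieted init/vmemmap messages back on
	echo 'file arch/x86/mm/init.c +p' > /sys/kernel/debug/dynamic_debug/control
	echo 'file arch/x86/mm/init_64.c +p' > /sys/kernel/debug/dynamic_debug/control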
Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams --- arch/x86/mm/init.c | 4 ++-- arch/x86/mm/init_64.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 1d8a83df153a..4b9ea3f27de4 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -354,7 +354,7 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, } for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n", + pr_debug(" [mem %#010lx-%#010lx] page %s\n", mr[i].start, mr[i].end - 1, page_size_string(&mr[i])); @@ -401,7 +401,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long ret = 0; int nr_range, i; - pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n", + pr_debug("init_memory_mapping: [mem %#010lx-%#010lx]\n", start, end - 1); memset(mr, 0, sizeof(mr)); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index df48430c279b..e5d42f1a2a71 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1268,7 +1268,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, /* check to see if we have contiguous blocks */ if (p_end != p || node_start != node) { if (p_start) - printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", + pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n", addr_start, addr_end-1, p_start, p_end-1, node_start); addr_start = addr; node_start = node; @@ -1366,7 +1366,7 @@ void register_page_bootmem_memmap(unsigned long section_nr, void __meminit vmemmap_populate_print_last(void) { if (p_start) { - printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", + pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n", addr_start, addr_end-1, p_start, p_end-1, node_start); p_start = NULL; p_end = NULL; -- cgit v1.2.3 From 68d0d9794864a6bfbc6f88bb0ec980400bc17922 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Fri, 9 Oct 2015 00:51:46 +0600 Subject: x86/PCI: Make pci_subsys_init() static pci_subsys_init() is a subsys_initcall and can be declared static. Signed-off-by: Alexander Kuleshov Signed-off-by: Bjorn Helgaas --- arch/x86/pci/legacy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index 5b662c0faf8c..ea6f3802c17b 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -54,7 +54,7 @@ void pcibios_scan_specific_bus(int busn) } EXPORT_SYMBOL_GPL(pcibios_scan_specific_bus); -int __init pci_subsys_init(void) +static int __init pci_subsys_init(void) { /* * The init function returns an non zero value when -- cgit v1.2.3 From 374a3a3916a70fc6236bc2b8f8ac02548a128a54 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 9 Oct 2015 19:08:59 +0200 Subject: x86/entry/64/compat: Document sysenter_fix_flags's reason for existence The code under the label could normally sit inline, without the jumping back and forth; keeping it out of line is an optimization. Document that. Signed-off-by: Borislav Petkov Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20151009170859.GA24266@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index cf9641cd4796..92b0b27b43c6 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -102,6 +102,12 @@ ENTRY(entry_SYSENTER_compat) * NT was set instead of doing an unconditional popfq. * This needs to happen before enabling interrupts so that * we don't get preempted with NT set. + * + * NB.: sysenter_fix_flags is a label with the code under it moved + * out-of-line as an optimization: NT is unlikely to be set in the + * majority of the cases and instead of polluting the I$ unnecessarily, + * we're keeping that code behind a branch which will predict as + * not-taken and therefore its instructions won't be fetched. */ testl $X86_EFLAGS_NT, EFLAGS(%rsp) jnz sysenter_fix_flags -- cgit v1.2.3 From d1f0f6c72c14af8a27a6549e0623f7cd61805e83 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 9 Oct 2015 17:25:41 +0300 Subject: x86/intel-mid: Make intel_mid_ops static The following warning is issued on unfixed code. arch/x86/platform/intel-mid/intel-mid.c:64:22: warning: symbol 'intel_mid_ops' was not declared. Should it be static? Signed-off-by: Andy Shevchenko Link: http://lkml.kernel.org/r/1444400741-98669-1-git-send-email-andriy.shevchenko@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/platform/intel-mid/intel-mid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 01d54ea766c1..1bbc21e2e4ae 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -61,7 +61,7 @@ enum intel_mid_timer_options intel_mid_timer_options; /* intel_mid_ops to store sub arch ops */ -struct intel_mid_ops *intel_mid_ops; +static struct intel_mid_ops *intel_mid_ops; /* getter function for sub arch ops*/ static void *(*get_intel_mid_ops[])(void) = INTEL_MID_OPS_INIT; enum intel_mid_cpu_type __intel_mid_cpu_chip; -- cgit v1.2.3 From 4faefda97bc1be6ca909ba0fd0927ea78f37f67e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 9 Oct 2015 17:24:45 +0300 Subject: x86/io_apic: Make eoi_ioapic_pin() static We have to define internally used function as static, otherwise the following warning will be generated: arch/x86/kernel/apic/io_apic.c:532:6: warning: no previous prototype for 'eoi_ioapic_pin' [-Wmissing-prototypes] Signed-off-by: Andy Shevchenko Reviewed-by: Jiang Liu Link: http://lkml.kernel.org/r/1444400685-98611-1-git-send-email-andriy.shevchenko@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5c60bb162622..b5a0e3c5e930 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -529,7 +529,7 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector) } } -void eoi_ioapic_pin(int vector, struct mp_chip_data *data) +static void eoi_ioapic_pin(int vector, struct mp_chip_data *data) { unsigned long flags; struct irq_pin_list *entry; -- cgit v1.2.3 From 7e0abcd6b7ec1452bf4a850fccbae44043c05806 Mon Sep 17 00:00:00 2001 From: Gabriel Laskar 
Date: Tue, 6 Oct 2015 16:27:35 +0200 Subject: x86/mce: Include linux/ioctl.h in uapi mce header asm/ioctls.h contains the ioctl definitions for termios, not just the _IO* macros that this header actually needs. The error was found with an in-development tool used to generate automated pretty-printing functions for ioctl decoding in strace. Signed-off-by: Gabriel Laskar Acked-by: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1444141657-14898-2-git-send-email-gabriel@lse.epita.fr Signed-off-by: Thomas Gleixner --- arch/x86/include/uapi/asm/mce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h index 76880ede9a35..03429da2fa80 100644 --- a/arch/x86/include/uapi/asm/mce.h +++ b/arch/x86/include/uapi/asm/mce.h @@ -2,7 +2,7 @@ #define _UAPI_ASM_X86_MCE_H #include <linux/types.h> -#include <asm/ioctls.h> +#include <linux/ioctl.h> /* Fields are zero when not available */ struct mce { -- cgit v1.2.3 From e9c40d257fdd58c5cc97d3fe3aa141dd23ee5e9d Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Tue, 6 Oct 2015 02:35:55 +0800 Subject: x86/kexec: Remove obsolete 'in_crash_kexec' flag Previously, UV NMI used the 'in_crash_kexec' flag to determine whether we are in a kdump kernel or not: 5edd19af18a36a4 ("x86, UV: Make kdump avoid stack dumps") But this flag's last user was removed in the following commit: 9c48f1c629ecfa1 ("x86, nmi: Wire up NMI handlers to new routines") Since it isn't used any more, remove it. Signed-off-by: Minfei Huang Acked-by: Don Zickus Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: cpw@sgi.com Cc: kexec@lists.infradead.org Cc: mhuang@redhat.com Link: http://lkml.kernel.org/r/1444070155-17934-1-git-send-email-mhuang@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kdebug.h | 6 ------ arch/x86/kernel/crash.c | 3 --- 2 files changed, 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index b130d59406fb..e5f5dc9787d5 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -29,11 +29,5 @@ extern void show_trace(struct task_struct *t, struct pt_regs *regs, extern void __show_regs(struct pt_regs *regs, int all); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); -#ifdef CONFIG_KEXEC_CORE -extern int in_crash_kexec; -#else -/* no crash dump is ever in progress if no crash kernel can be kexec'd */ -#define in_crash_kexec 0 -#endif #endif /* _ASM_X86_KDEBUG_H */ diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 74ca2fe7a0b3..2c1910f6717e 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -75,8 +75,6 @@ struct crash_memmap_data { unsigned int type; }; -int in_crash_kexec; - /* * This is used to VMCLEAR all VMCSs loaded on the * processor. And when loading kvm_intel module, the @@ -132,7 +130,6 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) static void kdump_nmi_shootdown_cpus(void) { - in_crash_kexec = 1; nmi_shootdown_cpus(kdump_nmi_callback); disable_local_APIC(); -- cgit v1.2.3 From 12dd00e83fb83fa41dcc965cdd1e1627494348cb Mon Sep 17 00:00:00 2001 From: Leif Lindholm Date: Wed, 26 Aug 2015 14:24:56 +0100 Subject: efi/x86: Move efi=debug option parsing to core fed6cefe3b6e ("x86/efi: Add a "debug" option to the efi= cmdline") adds the DBG flag, but does so for x86 only. Move this early param parsing to core code.
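The resulting core-side handler follows the standard early_param() pattern; schematically (a sketch matching the hunks below, with the other efi= options elided):

	static int __init parse_efi_cmdline(char *str)
	{
		if (parse_option_str(str, "debug"))
			set_bit(EFI_DBG, &efi.flags);
		/* ... "noruntime" etc. ... */
		return 0;
	}
	early_param("efi", parse_efi_cmdline);

Booting any EFI-enabled architecture with efi=debug now sets EFI_DBG; on x86, efi_init() then dumps the EFI memory map (see the efi_print_memmap() rename further down in this log).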
Signed-off-by: Leif Lindholm Acked-by: Ard Biesheuvel Tested-by: Ard Biesheuvel Cc: Mark Salter Cc: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 2 -- drivers/firmware/efi/efi.c | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 2f61fcddcddb..4d0ff0f27a99 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -961,8 +961,6 @@ static int __init arch_parse_efi_cmdline(char *str) if (parse_option_str(str, "old_map")) set_bit(EFI_OLD_MEMMAP, &efi.flags); - if (parse_option_str(str, "debug")) - set_bit(EFI_DBG, &efi.flags); return 0; } diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 16c4928e36af..f0372a022c86 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -63,6 +63,9 @@ static int __init parse_efi_cmdline(char *str) return -EINVAL; } + if (parse_option_str(str, "debug")) + set_bit(EFI_DBG, &efi.flags); + if (parse_option_str(str, "noruntime")) disable_runtime = true; -- cgit v1.2.3 From ae2ee627dc87a70910de91b791b3cd0e9c6facdd Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 25 Aug 2015 16:32:55 +0100 Subject: efifb: Add support for 64-bit frame buffer addresses The EFI Graphics Output Protocol uses 64-bit frame buffer addresses but these get truncated to 32 bits by the EFI boot stub when storing the address in the 'lfb_base' field of 'struct screen_info'. Add an 'ext_lfb_base' field for the upper 32 bits of the frame buffer address and set VIDEO_CAPABILITY_64BIT_BASE when the field is usable. It turns out that the reason no one has required this support so far is that there's actually code in tianocore to "downgrade" PCI resources that have option ROMs and 64-bit BARs from 64-bit to 32-bit to cope with legacy option ROMs that can't handle 64-bit addresses. The upshot is that basically all GOP devices in the wild use a 32-bit frame buffer address. Still, it is possible to build firmware that uses a full 64-bit GOP frame buffer address. Chad did, which led to him reporting this issue. Add support in anticipation of GOP devices using 64-bit addresses more widely, and so that efifb works out of the box when that happens.
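To make the address math concrete, here is a small stand-alone illustration (a sketch, not kernel code) of the split the boot stub performs and the reassembly efifb performs in the hunks below:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t fb_base = 0x4000000000ULL;	/* example GOP address above 4 GiB */

		/* Boot stub side: split across the two screen_info fields. */
		uint32_t lfb_base = (uint32_t)fb_base;	/* low 32 bits ('lfb_base') */
		uint32_t ext_lfb_base = fb_base >> 32;	/* high 32 bits ('ext_lfb_base') */

		/*
		 * efifb side: reassemble; ext_lfb_base may only be trusted
		 * when the VIDEO_CAPABILITY_64BIT_BASE capability bit is set.
		 */
		uint64_t smem_start = lfb_base | ((uint64_t)ext_lfb_base << 32);

		printf("split %#llx -> %#x/%#x -> rejoined %#llx\n",
		       (unsigned long long)fb_base, lfb_base, ext_lfb_base,
		       (unsigned long long)smem_start);
		return 0;
	}

Firmware whose frame buffer sits below 4 GiB leaves ext_lfb_base at zero, so the reassembled address equals the legacy 32-bit one and nothing changes for existing setups.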
Reported-by: Chad Page Cc: Pete Hawkins Acked-by: Peter Jones Cc: Matthew Garrett Cc: Geert Uytterhoeven Signed-off-by: Matt Fleming --- arch/x86/boot/compressed/eboot.c | 24 ++++++++++++++++++++---- drivers/video/fbdev/efifb.c | 24 +++++++++++++++++++++++- include/uapi/linux/screen_info.h | 5 +++-- 3 files changed, 46 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 7d69afd8b6fa..43473bb86ac0 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -624,7 +624,7 @@ setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line, static efi_status_t __gop_query32(struct efi_graphics_output_protocol_32 *gop32, struct efi_graphics_output_mode_info **info, - unsigned long *size, u32 *fb_base) + unsigned long *size, u64 *fb_base) { struct efi_graphics_output_protocol_mode_32 *mode; efi_status_t status; @@ -650,7 +650,8 @@ setup_gop32(struct screen_info *si, efi_guid_t *proto, unsigned long nr_gops; u16 width, height; u32 pixels_per_scan_line; - u32 fb_base; + u32 ext_lfb_base; + u64 fb_base; struct efi_pixel_bitmask pixel_info; int pixel_format; efi_status_t status; @@ -713,6 +714,13 @@ setup_gop32(struct screen_info *si, efi_guid_t *proto, si->lfb_width = width; si->lfb_height = height; si->lfb_base = fb_base; + + ext_lfb_base = (u64)(unsigned long)fb_base >> 32; + if (ext_lfb_base) { + si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; + si->ext_lfb_base = ext_lfb_base; + } + si->pages = 1; setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); @@ -727,7 +735,7 @@ out: static efi_status_t __gop_query64(struct efi_graphics_output_protocol_64 *gop64, struct efi_graphics_output_mode_info **info, - unsigned long *size, u32 *fb_base) + unsigned long *size, u64 *fb_base) { struct efi_graphics_output_protocol_mode_64 *mode; efi_status_t status; @@ -753,7 +761,8 @@ setup_gop64(struct screen_info *si, efi_guid_t *proto, unsigned long nr_gops; u16 width, height; u32 pixels_per_scan_line; - u32 fb_base; + u32 ext_lfb_base; + u64 fb_base; struct efi_pixel_bitmask pixel_info; int pixel_format; efi_status_t status; @@ -816,6 +825,13 @@ setup_gop64(struct screen_info *si, efi_guid_t *proto, si->lfb_width = width; si->lfb_height = height; si->lfb_base = fb_base; + + ext_lfb_base = (u64)(unsigned long)fb_base >> 32; + if (ext_lfb_base) { + si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; + si->ext_lfb_base = ext_lfb_base; + } + si->pages = 1; setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); diff --git a/drivers/video/fbdev/efifb.c b/drivers/video/fbdev/efifb.c index 4bfff349b1fb..95d293b7445a 100644 --- a/drivers/video/fbdev/efifb.c +++ b/drivers/video/fbdev/efifb.c @@ -114,6 +114,20 @@ static int efifb_setup(char *options) return 0; } +static inline bool fb_base_is_valid(void) +{ + if (screen_info.lfb_base) + return true; + + if (!(screen_info.capabilities & VIDEO_CAPABILITY_64BIT_BASE)) + return false; + + if (screen_info.ext_lfb_base) + return true; + + return false; +} + static int efifb_probe(struct platform_device *dev) { struct fb_info *info; @@ -141,7 +155,7 @@ static int efifb_probe(struct platform_device *dev) screen_info.lfb_depth = 32; if (!screen_info.pages) screen_info.pages = 1; - if (!screen_info.lfb_base) { + if (!fb_base_is_valid()) { printk(KERN_DEBUG "efifb: invalid framebuffer address\n"); return -ENODEV; } @@ -160,6 +174,14 @@ static int efifb_probe(struct platform_device *dev) } efifb_fix.smem_start = screen_info.lfb_base; + + if 
(screen_info.capabilities & VIDEO_CAPABILITY_64BIT_BASE) { + u64 ext_lfb_base; + + ext_lfb_base = (u64)(unsigned long)screen_info.ext_lfb_base << 32; + efifb_fix.smem_start |= ext_lfb_base; + } + efifb_defined.bits_per_pixel = screen_info.lfb_depth; efifb_defined.xres = screen_info.lfb_width; efifb_defined.yres = screen_info.lfb_height; diff --git a/include/uapi/linux/screen_info.h b/include/uapi/linux/screen_info.h index 7530e7447620..8b8d39dfb67f 100644 --- a/include/uapi/linux/screen_info.h +++ b/include/uapi/linux/screen_info.h @@ -43,7 +43,8 @@ struct screen_info { __u16 pages; /* 0x32 */ __u16 vesa_attributes; /* 0x34 */ __u32 capabilities; /* 0x36 */ - __u8 _reserved[6]; /* 0x3a */ + __u32 ext_lfb_base; /* 0x3a */ + __u8 _reserved[2]; /* 0x3e */ } __attribute__((packed)); #define VIDEO_TYPE_MDA 0x10 /* Monochrome Text Display */ @@ -69,6 +70,6 @@ struct screen_info { #define VIDEO_FLAGS_NOCURSOR (1 << 0) /* The video mode has no cursor set */ #define VIDEO_CAPABILITY_SKIP_QUIRKS (1 << 0) - +#define VIDEO_CAPABILITY_64BIT_BASE (1 << 1) /* Frame buffer base is 64-bit */ #endif /* _UAPI_SCREEN_INFO_H */ -- cgit v1.2.3 From 0bbea1ce98ed44779736ecc54ca9cdbca2b95544 Mon Sep 17 00:00:00 2001 From: Taku Izumi Date: Wed, 30 Sep 2015 19:20:00 +0900 Subject: x86/efi: Rename print_efi_memmap() to efi_print_memmap() This patch renames print_efi_memmap() to efi_print_memmap() and makes it a global function so that we can invoke it outside of arch/x86/platform/efi/efi.c Signed-off-by: Taku Izumi Cc: Tony Luck Cc: Xishi Qiu Cc: Kamezawa Hiroyuki Cc: Ard Biesheuvel Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 1 + arch/x86/platform/efi/efi.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 155162ea0e00..cfee9d4b02af 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -93,6 +93,7 @@ extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable); extern int __init efi_memblock_x86_reserve_range(void); extern pgd_t * __init efi_call_phys_prolog(void); extern void __init efi_call_phys_epilog(pgd_t *save_pgd); +extern void __init efi_print_memmap(void); extern void __init efi_unmap_memmap(void); extern void __init efi_memory_uc(u64 addr, unsigned long size); extern void __init efi_map_region(efi_memory_desc_t *md); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 4d0ff0f27a99..c69e58fb7f19 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -222,7 +222,7 @@ int __init efi_memblock_x86_reserve_range(void) return 0; } -static void __init print_efi_memmap(void) +void __init efi_print_memmap(void) { #ifdef EFI_DEBUG efi_memory_desc_t *md; @@ -524,7 +524,7 @@ void __init efi_init(void) return; if (efi_enabled(EFI_DBG)) - print_efi_memmap(); + efi_print_memmap(); efi_esrt_init(); } -- cgit v1.2.3 From 0f96a99dab366333439e110d6ad253bc7c557c09 Mon Sep 17 00:00:00 2001 From: Taku Izumi Date: Wed, 30 Sep 2015 23:01:56 +0900 Subject: efi: Add "efi_fake_mem" boot option This patch introduces a new boot option named "efi_fake_mem". By specifying this parameter, you can add an arbitrary attribute to a specific memory range. This is useful for debugging of the Address Range Mirroring feature.
For example, if "efi_fake_mem=2G@4G:0x10000,2G@0x10a0000000:0x10000" is specified, the original (firmware provided) EFI memmap will be updated so that the specified memory regions have EFI_MEMORY_MORE_RELIABLE attribute (0x10000): efi: mem36: [Conventional Memory| | | | | | |WB|WT|WC|UC] range=[0x0000000100000000-0x00000020a0000000) (129536MB) efi: mem36: [Conventional Memory| |MR| | | | |WB|WT|WC|UC] range=[0x0000000100000000-0x0000000180000000) (2048MB) efi: mem37: [Conventional Memory| | | | | | |WB|WT|WC|UC] range=[0x0000000180000000-0x00000010a0000000) (61952MB) efi: mem38: [Conventional Memory| |MR| | | | |WB|WT|WC|UC] range=[0x00000010a0000000-0x0000001120000000) (2048MB) efi: mem39: [Conventional Memory| | | | | | |WB|WT|WC|UC] range=[0x0000001120000000-0x00000020a0000000) (63488MB) And you will find that the following message is output: efi: Memory: 4096M/131455M mirrored memory Signed-off-by: Taku Izumi Cc: Tony Luck Cc: Xishi Qiu Cc: Kamezawa Hiroyuki Cc: Ard Biesheuvel Signed-off-by: Matt Fleming --- Documentation/kernel-parameters.txt | 15 +++ arch/x86/kernel/setup.c | 4 +- drivers/firmware/efi/Kconfig | 22 ++++ drivers/firmware/efi/Makefile | 1 + drivers/firmware/efi/fake_mem.c | 238 ++++++++++++++++++++++++++++++++++++ include/linux/efi.h | 6 + 6 files changed, 285 insertions(+), 1 deletion(-) create mode 100644 drivers/firmware/efi/fake_mem.c (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 1d6f0459cd7b..cd5312f24981 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1092,6 +1092,21 @@ bytes respectively. Such letter suffixes can also be entirely omitted. you are really sure that your UEFI does sane gc and fulfills the spec otherwise your board may brick. + efi_fake_mem= nn[KMG]@ss[KMG]:aa[,nn[KMG]@ss[KMG]:aa,..] [EFI; X86] + Add arbitrary attribute to specific memory range by + updating original EFI memory map. + Region of memory which aa attribute is added to is + from ss to ss+nn. + If efi_fake_mem=2G@4G:0x10000,2G@0x10a0000000:0x10000 + is specified, EFI_MEMORY_MORE_RELIABLE(0x10000) + attribute is added to range 0x100000000-0x180000000 and + 0x10a0000000-0x1120000000. + + Using this parameter you can do debugging of EFI memmap + related feature. For example, you can do debugging of + Address Range Mirroring feature even if your box + doesn't support it. + eisa_irq_edge= [PARISC,HW] See header of drivers/parisc/eisa.c. diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 80f874bf999e..e3ed628f7db4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1104,8 +1104,10 @@ void __init setup_arch(char **cmdline_p) memblock_set_current_limit(ISA_END_ADDRESS); memblock_x86_fill(); - if (efi_enabled(EFI_BOOT)) + if (efi_enabled(EFI_BOOT)) { + efi_fake_memmap(); efi_find_mirror(); + } /* * The EFI specification says that boot service code won't be called diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index 54071c148340..1de6f0ed5077 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -52,6 +52,28 @@ config EFI_RUNTIME_MAP See also Documentation/ABI/testing/sysfs-firmware-efi-runtime-map. +config EFI_FAKE_MEMMAP + bool "Enable EFI fake memory map" + depends on EFI && X86 + default n + help + Saying Y here will enable "efi_fake_mem" boot option. 
+ By specifying this parameter, you can add arbitrary attribute + to specific memory range by updating original (firmware provided) + EFI memmap. + This is useful for debugging of EFI memmap related feature. + e.g. Address Range Mirroring feature. + +config EFI_MAX_FAKE_MEM + int "maximum allowable number of ranges in efi_fake_mem boot option" + depends on EFI_FAKE_MEMMAP + range 1 128 + default 8 + help + Maximum allowable number of ranges in efi_fake_mem boot option. + Ranges can be set up to this value using comma-separated list. + The default value is 8. + config EFI_PARAMS_FROM_FDT bool help diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index 6fd3da938717..c24f00569acb 100644 --- a/drivers/firmware/efi/Makefile +++ b/drivers/firmware/efi/Makefile @@ -9,3 +9,4 @@ obj-$(CONFIG_UEFI_CPER) += cper.o obj-$(CONFIG_EFI_RUNTIME_MAP) += runtime-map.o obj-$(CONFIG_EFI_RUNTIME_WRAPPERS) += runtime-wrappers.o obj-$(CONFIG_EFI_STUB) += libstub/ +obj-$(CONFIG_EFI_FAKE_MEMMAP) += fake_mem.o diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c new file mode 100644 index 000000000000..32bcb14df2c8 --- /dev/null +++ b/drivers/firmware/efi/fake_mem.c @@ -0,0 +1,238 @@ +/* + * fake_mem.c + * + * Copyright (C) 2015 FUJITSU LIMITED + * Author: Taku Izumi + * + * This code introduces new boot option named "efi_fake_mem" + * By specifying this parameter, you can add arbitrary attribute to + * specific memory range by updating original (firmware provided) EFI + * memmap. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#define EFI_MAX_FAKEMEM CONFIG_EFI_MAX_FAKE_MEM + +struct fake_mem { + struct range range; + u64 attribute; +}; +static struct fake_mem fake_mems[EFI_MAX_FAKEMEM]; +static int nr_fake_mem; + +static int __init cmp_fake_mem(const void *x1, const void *x2) +{ + const struct fake_mem *m1 = x1; + const struct fake_mem *m2 = x2; + + if (m1->range.start < m2->range.start) + return -1; + if (m1->range.start > m2->range.start) + return 1; + return 0; +} + +void __init efi_fake_memmap(void) +{ + u64 start, end, m_start, m_end, m_attr; + int new_nr_map = memmap.nr_map; + efi_memory_desc_t *md; + u64 new_memmap_phy; + void *new_memmap; + void *old, *new; + int i; + + if (!nr_fake_mem || !efi_enabled(EFI_MEMMAP)) + return; + + /* count up the number of EFI memory descriptor */ + for (old = memmap.map; old < memmap.map_end; old += memmap.desc_size) { + md = old; + start = md->phys_addr; + end = start + (md->num_pages << EFI_PAGE_SHIFT) - 1; + + for (i = 0; i < nr_fake_mem; i++) { + /* modifying range */ + m_start = fake_mems[i].range.start; + m_end = fake_mems[i].range.end; + + if (m_start <= start) { + /* split into 2 parts */ + if (start < m_end && m_end < end) + new_nr_map++; + } + if (start < m_start && m_start < end) { + /* split into 3 parts */ + if (m_end < end) + new_nr_map += 2; + /* split into 2 parts */ + if (end <= m_end) + new_nr_map++; + } + } + } + + /* allocate memory for new EFI memmap */ + new_memmap_phy = memblock_alloc(memmap.desc_size * new_nr_map, + PAGE_SIZE); + if (!new_memmap_phy) + return; + + /* create new EFI memmap */ + new_memmap = early_memremap(new_memmap_phy, + memmap.desc_size * new_nr_map); + if (!new_memmap) { + memblock_free(new_memmap_phy, memmap.desc_size * new_nr_map); + return; + } + + for (old = memmap.map, new = new_memmap; + old < memmap.map_end; + old += memmap.desc_size, new += memmap.desc_size) { + + /* copy original EFI memory descriptor */ + memcpy(new, old, memmap.desc_size); + md = new; + start = md->phys_addr; + end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1; + + for (i = 0; i < nr_fake_mem; i++) { + /* modifying range */ + m_start = fake_mems[i].range.start; + m_end = fake_mems[i].range.end; + m_attr = fake_mems[i].attribute; + + if (m_start <= start && end <= m_end) + md->attribute |= m_attr; + + if (m_start <= start && + (start < m_end && m_end < end)) { + /* first part */ + md->attribute |= m_attr; + md->num_pages = (m_end - md->phys_addr + 1) >> + EFI_PAGE_SHIFT; + /* latter part */ + new += memmap.desc_size; + memcpy(new, old, memmap.desc_size); + md = new; + md->phys_addr = m_end + 1; + md->num_pages = (end - md->phys_addr + 1) >> + EFI_PAGE_SHIFT; + } + + if ((start < m_start && m_start < end) && m_end < end) { + /* first part */ + md->num_pages = (m_start - md->phys_addr) >> + EFI_PAGE_SHIFT; + /* middle part */ + new += memmap.desc_size; + memcpy(new, old, memmap.desc_size); + md = new; + md->attribute |= m_attr; + md->phys_addr = m_start; + md->num_pages = (m_end - m_start + 1) >> + EFI_PAGE_SHIFT; + /* last part */ + new += memmap.desc_size; + memcpy(new, old, memmap.desc_size); + md = new; + md->phys_addr = m_end + 1; + md->num_pages = (end - m_end) >> + EFI_PAGE_SHIFT; + } + + if ((start < m_start && m_start < end) && + (end <= m_end)) { + /* first part */ + md->num_pages = (m_start - md->phys_addr) >> + EFI_PAGE_SHIFT; + /* latter part */ + new += memmap.desc_size; + memcpy(new, old, memmap.desc_size); + md = new; + md->phys_addr = m_start; + 
md->num_pages = (end - md->phys_addr + 1) >> + EFI_PAGE_SHIFT; + md->attribute |= m_attr; + } + } + } + + /* swap into new EFI memmap */ + efi_unmap_memmap(); + memmap.map = new_memmap; + memmap.phys_map = (void *)new_memmap_phy; + memmap.nr_map = new_nr_map; + memmap.map_end = memmap.map + memmap.nr_map * memmap.desc_size; + set_bit(EFI_MEMMAP, &efi.flags); + + /* print new EFI memmap */ + efi_print_memmap(); +} + +static int __init setup_fake_mem(char *p) +{ + u64 start = 0, mem_size = 0, attribute = 0; + int i; + + if (!p) + return -EINVAL; + + while (*p != '\0') { + mem_size = memparse(p, &p); + if (*p == '@') + start = memparse(p+1, &p); + else + break; + + if (*p == ':') + attribute = simple_strtoull(p+1, &p, 0); + else + break; + + if (nr_fake_mem >= EFI_MAX_FAKEMEM) + break; + + fake_mems[nr_fake_mem].range.start = start; + fake_mems[nr_fake_mem].range.end = start + mem_size - 1; + fake_mems[nr_fake_mem].attribute = attribute; + nr_fake_mem++; + + if (*p == ',') + p++; + } + + sort(fake_mems, nr_fake_mem, sizeof(struct fake_mem), + cmp_fake_mem, NULL); + + for (i = 0; i < nr_fake_mem; i++) + pr_info("efi_fake_mem: add attr=0x%016llx to [mem 0x%016llx-0x%016llx]", + fake_mems[i].attribute, fake_mems[i].range.start, + fake_mems[i].range.end); + + return *p == '\0' ? 0 : -EINVAL; +} + +early_param("efi_fake_mem", setup_fake_mem); diff --git a/include/linux/efi.h b/include/linux/efi.h index fa5106c2f9f5..4d01c1033fce 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -922,6 +922,12 @@ extern struct kobject *efi_kobj; extern int efi_reboot_quirk_mode; extern bool efi_poweroff_required(void); +#ifdef CONFIG_EFI_FAKE_MEMMAP +extern void __init efi_fake_memmap(void); +#else +static inline void efi_fake_memmap(void) { } +#endif + /* Iterate through an efi_memory_map */ #define for_each_efi_memory_desc(m, md) \ for ((md) = (m)->map; \ -- cgit v1.2.3 From 85c9306d44f757d2fb3b0e3e399080a025315c7f Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 12 Oct 2015 11:22:38 +0200 Subject: x86/ras/mce_amd_inj: Return early on invalid input Invalid inputs such as these are currently reported in dmesg as failing:

  $> echo sweet > flags
  [ 122.079139] flags_write: Invalid flags value: et

even though the 'flags' attribute has been updated correctly:

  $> cat flags
  sw

This is because userspace keeps writing the remaining buffer until it encounters an error. However, the input as a whole is wrong and we should not be writing anything to the file. Therefore, correct flags_write() to return -EINVAL immediately on bad input strings. Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1443190851-2172-2-git-send-email-Aravind.Gopalakrishnan@amd.com Link: http://lkml.kernel.org/r/1444641762-9437-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/ras/mce_amd_inj.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c index 17e35b5bf779..4fd8bb9b90b9 100644 --- a/arch/x86/ras/mce_amd_inj.c +++ b/arch/x86/ras/mce_amd_inj.c @@ -129,12 +129,9 @@ static ssize_t flags_write(struct file *filp, const char __user *ubuf, { char buf[MAX_FLAG_OPT_SIZE], *__buf; int err; - size_t ret; if (cnt > MAX_FLAG_OPT_SIZE) - cnt = MAX_FLAG_OPT_SIZE; - - ret = cnt; + return -EINVAL; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; @@ -150,9 +147,9 @@ static ssize_t flags_write(struct file *filp, const char __user *ubuf, return err; } - *ppos += ret; + *ppos += cnt; - return ret; + return cnt; } static const struct file_operations flags_fops = { -- cgit v1.2.3 From a1300e50529795cd605da6a015d4944a18921db0 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 12 Oct 2015 11:22:39 +0200 Subject: x86/ras/mce_amd_inj: Trigger deferred and thresholding errors interrupts Add the capability to trigger deferred error interrupts and threshold interrupts in order to test the APIC interrupt handler functionality for these types of errors. Update README section about the same too. Reported-by: kbuild test robot Signed-off-by: Aravind Gopalakrishnan [ Cleanup comments. ] [ Include asm/irq_vectors.h directly so that misc randbuilds don't fail. ] Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1443190851-2172-3-git-send-email-Aravind.Gopalakrishnan@amd.com Link: http://lkml.kernel.org/r/1444641762-9437-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/ras/mce_amd_inj.c | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c index 4fd8bb9b90b9..4d3bafb540c2 100644 --- a/arch/x86/ras/mce_amd_inj.c +++ b/arch/x86/ras/mce_amd_inj.c @@ -17,7 +17,9 @@ #include #include #include + #include +#include #include "../kernel/cpu/mcheck/mce-internal.h" @@ -34,12 +36,16 @@ static u8 n_banks; enum injection_type { SW_INJ = 0, /* SW injection, simply decode the error */ HW_INJ, /* Trigger a #MC */ + DFR_INT_INJ, /* Trigger Deferred error interrupt */ + THR_INT_INJ, /* Trigger threshold interrupt */ N_INJ_TYPES, }; static const char * const flags_options[] = { [SW_INJ] = "sw", [HW_INJ] = "hw", + [DFR_INT_INJ] = "df", + [THR_INT_INJ] = "th", NULL }; @@ -182,6 +188,16 @@ static void trigger_mce(void *info) asm volatile("int $18"); } +static void trigger_dfr_int(void *info) +{ + asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR)); +} + +static void trigger_thr_int(void *info) +{ + asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR)); +} + static void do_inject(void) { u64 mcg_status = 0; @@ -202,6 +218,16 @@ static void do_inject(void) if (!(i_mce.status & MCI_STATUS_PCC)) mcg_status |= MCG_STATUS_RIPV; + /* + * Ensure necessary status bits for deferred errors: + * - MCx_STATUS[Deferred]: make sure it is a deferred error + * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC + */ + if (inj_type ==
DFR_INT_INJ) { + i_mce.status |= MCI_STATUS_DEFERRED; + i_mce.status &= ~MCI_STATUS_UC; + } + get_online_cpus(); if (!cpu_online(cpu)) goto err; @@ -222,7 +248,16 @@ static void do_inject(void) toggle_hw_mce_inject(cpu, false); - smp_call_function_single(cpu, trigger_mce, NULL, 0); + switch (inj_type) { + case DFR_INT_INJ: + smp_call_function_single(cpu, trigger_dfr_int, NULL, 0); + break; + case THR_INT_INJ: + smp_call_function_single(cpu, trigger_thr_int, NULL, 0); + break; + default: + smp_call_function_single(cpu, trigger_mce, NULL, 0); + } err: put_online_cpus(); @@ -287,6 +322,11 @@ static const char readme_msg[] = "\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n" "\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n" "\t before injecting.\n" +"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n" +"\t error APIC interrupt handler to handle the error if the feature \n" +"\t is present in hardware. \n" +"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n" +"\t APIC interrupt handler to handle the error. \n" "\n"; static ssize_t -- cgit v1.2.3 From fa20a2ed6fff717839ec03b6574ea0affcb58841 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 12 Oct 2015 11:22:40 +0200 Subject: x86/ras/mce_amd_inj: Inject bank 4 errors on the NBC Bank 4 MCEs are logged and reported only on the node base core (NBC) in a socket. Refer to the D18F3x44[NbMcaToMstCpuEn] field in Fam10h and later BKDGs. The node base core (NBC) is the lowest numbered core in the node. This patch ensures that we inject the error on the NBC for bank 4 errors. Otherwise, triggering #MC or APIC interrupts on a core which is not the NBC would not have any effect on the system, i.e. we would not see any relevant output in the kernel logs for the error we just injected. Signed-off-by: Aravind Gopalakrishnan [ Cleanup comments. ] [ Add a missing dependency on AMD_NB caught by Randy Dunlap. ] Signed-off-by: Borislav Petkov Acked-by: Randy Dunlap Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1443190851-2172-4-git-send-email-Aravind.Gopalakrishnan@amd.com Link: http://lkml.kernel.org/r/1444641762-9437-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/ras/Kconfig | 4 +--- arch/x86/ras/mce_amd_inj.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig index 10fea5fc821e..df280da34825 100644 --- a/arch/x86/ras/Kconfig +++ b/arch/x86/ras/Kconfig @@ -1,11 +1,9 @@ config AMD_MCE_INJ tristate "Simple MCE injection interface for AMD processors" - depends on RAS && EDAC_DECODE_MCE && DEBUG_FS + depends on RAS && EDAC_DECODE_MCE && DEBUG_FS && AMD_NB default n help This is a simple debugfs interface to inject MCEs and test different aspects of the MCE handling code. WARNING: Do not even assume this interface is staying stable!
- - diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c index 4d3bafb540c2..55d38cfa46c2 100644 --- a/arch/x86/ras/mce_amd_inj.c +++ b/arch/x86/ras/mce_amd_inj.c @@ -17,8 +17,10 @@ #include #include #include +#include #include +#include #include #include "../kernel/cpu/mcheck/mce-internal.h" @@ -32,6 +34,7 @@ static struct dentry *dfs_inj; static u8 n_banks; #define MAX_FLAG_OPT_SIZE 3 +#define NBCFG 0x44 enum injection_type { SW_INJ = 0, /* SW injection, simply decode the error */ @@ -198,6 +201,45 @@ static void trigger_thr_int(void *info) asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR)); } +static u32 get_nbc_for_node(int node_id) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + u32 cores_per_node; + + cores_per_node = c->x86_max_cores / amd_get_nodes_per_socket(); + + return cores_per_node * node_id; +} + +static void toggle_nb_mca_mst_cpu(u16 nid) +{ + struct pci_dev *F3 = node_to_amd_nb(nid)->misc; + u32 val; + int err; + + if (!F3) + return; + + err = pci_read_config_dword(F3, NBCFG, &val); + if (err) { + pr_err("%s: Error reading F%dx%03x.\n", + __func__, PCI_FUNC(F3->devfn), NBCFG); + return; + } + + if (val & BIT(27)) + return; + + pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n", + __func__); + + val |= BIT(27); + err = pci_write_config_dword(F3, NBCFG, val); + if (err) + pr_err("%s: Error writing F%dx%03x.\n", + __func__, PCI_FUNC(F3->devfn), NBCFG); +} + static void do_inject(void) { u64 mcg_status = 0; @@ -228,6 +270,16 @@ static void do_inject(void) i_mce.status |= (i_mce.status & ~MCI_STATUS_UC); } + /* + * For multi node CPUs, logging and reporting of bank 4 errors happens + * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for + * Fam10h and later BKDGs. + */ + if (static_cpu_has(X86_FEATURE_AMD_DCM) && b == 4) { + toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); + cpu = get_nbc_for_node(amd_get_nb_id(cpu)); + } + get_online_cpus(); if (!cpu_online(cpu)) goto err; -- cgit v1.2.3 From 2eff73c0a11f19ff082a566e3429fbaaca7b8e7b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 12 Oct 2015 11:22:41 +0200 Subject: x86/microcode/amd: Extract current patch level read to a function Pave the way for checking the current patch level of the microcode in a core. We want to be able to do stuff depending on the patch level - in this case decide whether to update or not. But that will be added in a later patch. Drop unused local var uci assignment, while at it. Integrate a fix for 32-bit and CONFIG_PARAVIRT from Takashi Iwai: Use native_rdmsr() in check_current_patch_level() because with CONFIG_PARAVIRT enabled and on 32-bit, where we run before paging has been enabled, we cannot deref pv_info yet. Or we could, but we'd need to access its physical address. This way of fixing it is simpler. See: https://bugzilla.suse.com/show_bug.cgi?id=943179 for the background. Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Takashi Iwai : Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1444641762-9437-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/microcode_amd.h | 1 + arch/x86/kernel/cpu/microcode/amd.c | 24 ++++++++++++++++++++++-- arch/x86/kernel/cpu/microcode/amd_early.c | 17 +++++++---------- 3 files changed, 30 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index ac6d328977a6..9b214e10d499 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -76,4 +76,5 @@ static inline int __init save_microcode_in_initrd_amd(void) { return -EINVAL; } void reload_ucode_amd(void) {} #endif +extern bool check_current_patch_level(u32 *rev); #endif /* _ASM_X86_MICROCODE_AMD_H */ diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 12829c3ced3c..2d630138bf3e 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -177,6 +177,25 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, return patch_size; } +/* + * Check the current patch level on this CPU. + * + * @rev: Use it to return the patch level. It is set to 0 in the case of + * error. + * + * Returns: + * - true: if update should stop + * - false: otherwise + */ +bool check_current_patch_level(u32 *rev) +{ + u32 dummy; + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, *rev, dummy); + + return false; +} + int __apply_microcode_amd(struct microcode_amd *mc_amd) { u32 rev, dummy; @@ -197,7 +216,7 @@ int apply_microcode_amd(int cpu) struct microcode_amd *mc_amd; struct ucode_cpu_info *uci; struct ucode_patch *p; - u32 rev, dummy; + u32 rev; BUG_ON(raw_smp_processor_id() != cpu); @@ -210,7 +229,8 @@ int apply_microcode_amd(int cpu) mc_amd = p->data; uci->mc = p->data; - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + if (check_current_patch_level(&rev)) + return -1; /* need to apply patch? 
*/ if (rev >= mc_amd->hdr.patch_id) { diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c index e8a215a9a345..abb90097582f 100644 --- a/arch/x86/kernel/cpu/microcode/amd_early.c +++ b/arch/x86/kernel/cpu/microcode/amd_early.c @@ -196,9 +196,8 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) return; } - /* find ucode and update if needed */ - - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); + if (check_current_patch_level(&rev)) + return; while (left > 0) { struct microcode_amd *mc; @@ -319,7 +318,6 @@ static void __init get_bsp_sig(void) void load_ucode_amd_ap(void) { unsigned int cpu = smp_processor_id(); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct equiv_cpu_entry *eq; struct microcode_amd *mc; u32 rev, eax; @@ -332,10 +330,8 @@ void load_ucode_amd_ap(void) if (!container) return; - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); - - uci->cpu_sig.rev = rev; - uci->cpu_sig.sig = eax; + if (check_current_patch_level(&rev)) + return; eax = cpuid_eax(0x00000001); eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ); @@ -424,9 +420,10 @@ int __init save_microcode_in_initrd_amd(void) void reload_ucode_amd(void) { struct microcode_amd *mc; - u32 rev, eax; + u32 rev; - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); + if (check_current_patch_level(&rev)) + return; mc = (struct microcode_amd *)amd_ucode_patch; -- cgit v1.2.3 From 0399f73299f1b7e04de329050f7111b362b7eeb5 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 12 Oct 2015 11:22:42 +0200 Subject: x86/microcode/amd: Do not overwrite final patch levels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A certain number of patch levels of applied microcode should not be overwritten by the microcode loader, otherwise bad things will happen. Check those and abort update if the current core has one of those final patch levels applied by the BIOS. 32-bit needs special handling, of course. See https://bugzilla.suse.com/show_bug.cgi?id=913996 for more info. Tested-by: Peter Kirchgeßner Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1444641762-9437-7-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/microcode_amd.h | 2 +- arch/x86/kernel/cpu/microcode/amd.c | 38 +++++++++++++++++++++++++++---- arch/x86/kernel/cpu/microcode/amd_early.c | 13 ++++++++--- 3 files changed, 44 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index 9b214e10d499..d3e86cfd08fe 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -76,5 +76,5 @@ static inline int __init save_microcode_in_initrd_amd(void) { return -EINVAL; } void reload_ucode_amd(void) {} #endif -extern bool check_current_patch_level(u32 *rev); +extern bool check_current_patch_level(u32 *rev, bool early); #endif /* _ASM_X86_MICROCODE_AMD_H */ diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 2d630138bf3e..da922d1e2f71 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -177,6 +177,16 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, return patch_size; } +/* + * Those patch levels cannot be updated to newer ones and thus should be final. 
+ */ +static u32 final_levels[] = { + 0x01000098, + 0x0100009f, + 0x010000af, + 0, /* T-101 terminator */ +}; + /* * Check the current patch level on this CPU. * @@ -187,13 +197,31 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, * - true: if update should stop * - false: otherwise */ -bool check_current_patch_level(u32 *rev) +bool check_current_patch_level(u32 *rev, bool early) { - u32 dummy; + u32 lvl, dummy, i; + bool ret = false; + u32 *levels; + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy); + + if (IS_ENABLED(CONFIG_X86_32) && early) + levels = (u32 *)__pa_nodebug(&final_levels); + else + levels = final_levels; + + for (i = 0; levels[i]; i++) { + if (lvl == levels[i]) { + lvl = 0; + ret = true; + break; + } + } - native_rdmsr(MSR_AMD64_PATCH_LEVEL, *rev, dummy); + if (rev) + *rev = lvl; - return false; + return ret; } int __apply_microcode_amd(struct microcode_amd *mc_amd) @@ -229,7 +257,7 @@ int apply_microcode_amd(int cpu) mc_amd = p->data; uci->mc = p->data; - if (check_current_patch_level(&rev)) + if (check_current_patch_level(&rev, false)) return -1; /* need to apply patch? */ diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c index abb90097582f..a54a47b9d8ea 100644 --- a/arch/x86/kernel/cpu/microcode/amd_early.c +++ b/arch/x86/kernel/cpu/microcode/amd_early.c @@ -196,7 +196,7 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) return; } - if (check_current_patch_level(&rev)) + if (check_current_patch_level(&rev, true)) return; while (left > 0) { @@ -330,7 +330,10 @@ void load_ucode_amd_ap(void) if (!container) return; - if (check_current_patch_level(&rev)) + /* + * 64-bit runs with paging enabled, thus early==false. + */ + if (check_current_patch_level(&rev, false)) return; eax = cpuid_eax(0x00000001); @@ -422,7 +425,11 @@ void reload_ucode_amd(void) struct microcode_amd *mc; u32 rev; - if (check_current_patch_level(&rev)) + /* + * early==false because this is a syscore ->resume path and by + * that time paging is long enabled. + */ + if (check_current_patch_level(&rev, false)) return; mc = (struct microcode_amd *)amd_ucode_patch; -- cgit v1.2.3 From 1d8007bdee074fdffcf3539492d8a151a1fb3436 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 12 Oct 2015 13:38:32 +0200 Subject: KVM: x86: build kvm_userspace_memory_region in x86_set_memory_region MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The next patch will make x86_set_memory_region fill the userspace_addr. Since the struct is not used untouched anymore, it makes sense to build it in x86_set_memory_region directly; it also simplifies the callers. 
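To make the simplification concrete, here is a sketch of one caller before and after the interface change; it is distilled from the vmx_set_tss_addr() hunk in the diff below, not additional patch content:

    /* Before: every caller built the struct by hand. */
    struct kvm_userspace_memory_region tss_mem = {
            .slot            = TSS_PRIVATE_MEMSLOT,
            .guest_phys_addr = addr,
            .memory_size     = PAGE_SIZE * 3,
            .flags           = 0,
    };
    ret = x86_set_memory_region(kvm, &tss_mem);

    /* After: the helper builds the struct internally. */
    ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, PAGE_SIZE * 3);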
Reported-by: Alexandre DERUMIER Cc: stable@vger.kernel.org Fixes: 9da0e4d5ac969909f6b435ce28ea28135a9cbd69 Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 ++---- arch/x86/kvm/vmx.c | 26 ++++++-------------------- arch/x86/kvm/x86.c | 31 +++++++++++++------------------ 3 files changed, 21 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2beee0382088..3a36ee704c30 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1226,10 +1226,8 @@ void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); int kvm_is_in_guest(void); -int __x86_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem); -int x86_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem); +int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); +int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 06ef4908ba61..6a8bc64566ab 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4105,17 +4105,13 @@ static void seg_setup(int seg) static int alloc_apic_access_page(struct kvm *kvm) { struct page *page; - struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; mutex_lock(&kvm->slots_lock); if (kvm->arch.apic_access_page_done) goto out; - kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; - kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE; - kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __x86_set_memory_region(kvm, &kvm_userspace_mem); + r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, + APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); if (r) goto out; @@ -4140,17 +4136,12 @@ static int alloc_identity_pagetable(struct kvm *kvm) { /* Called with kvm->slots_lock held. */ - struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; BUG_ON(kvm->arch.ept_identity_pagetable_done); - kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; - kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = - kvm->arch.ept_identity_map_addr; - kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __x86_set_memory_region(kvm, &kvm_userspace_mem); + r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, + kvm->arch.ept_identity_map_addr, PAGE_SIZE); return r; } @@ -4949,14 +4940,9 @@ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) { int ret; - struct kvm_userspace_memory_region tss_mem = { - .slot = TSS_PRIVATE_MEMSLOT, - .guest_phys_addr = addr, - .memory_size = PAGE_SIZE * 3, - .flags = 0, - }; - ret = x86_set_memory_region(kvm, &tss_mem); + ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, + PAGE_SIZE * 3); if (ret) return ret; kvm->arch.tss_addr = addr; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 92511d4b7236..7bf8096f013d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7474,18 +7474,21 @@ void kvm_arch_sync_events(struct kvm *kvm) kvm_free_pit(kvm); } -int __x86_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem) +int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) { int i, r; /* Called with kvm->slots_lock held. 
*/ - BUG_ON(mem->slot >= KVM_MEM_SLOTS_NUM); + if (WARN_ON(id >= KVM_MEM_SLOTS_NUM)) + return -EINVAL; for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - struct kvm_userspace_memory_region m = *mem; + struct kvm_userspace_memory_region m; - m.slot |= i << 16; + m.slot = id | (i << 16); + m.flags = 0; + m.guest_phys_addr = gpa; + m.memory_size = size; r = __kvm_set_memory_region(kvm, &m); if (r < 0) return r; @@ -7495,13 +7498,12 @@ int __x86_set_memory_region(struct kvm *kvm, } EXPORT_SYMBOL_GPL(__x86_set_memory_region); -int x86_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem) +int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) { int r; mutex_lock(&kvm->slots_lock); - r = __x86_set_memory_region(kvm, mem); + r = __x86_set_memory_region(kvm, id, gpa, size); mutex_unlock(&kvm->slots_lock); return r; @@ -7516,16 +7518,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm) * unless the the memory map has changed due to process exit * or fd copying. */ - struct kvm_userspace_memory_region mem; - memset(&mem, 0, sizeof(mem)); - mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; - x86_set_memory_region(kvm, &mem); - - mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; - x86_set_memory_region(kvm, &mem); - - mem.slot = TSS_PRIVATE_MEMSLOT; - x86_set_memory_region(kvm, &mem); + x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0); + x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0); + x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); } kvm_iommu_unmap_guest(kvm); kfree(kvm->arch.vpic); -- cgit v1.2.3 From f0d648bdf0a5bbc91da6099d5282f77996558ea4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 12 Oct 2015 13:56:27 +0200 Subject: KVM: x86: map/unmap private slots in __x86_set_memory_region MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Otherwise, two copies (one of them never populated and thus bogus) are allocated for the regular and SMM address spaces. This breaks SMM with EPT but without unrestricted guest support, because the SMM copy of the identity page map is all zeros. By moving the allocation to the caller we also remove the last vestiges of kernel-allocated memory regions (not accessible anymore in userspace since commit b74a07beed0e, "KVM: Remove kernel-allocated memory regions", 2010-06-21); that is a nice bonus. Reported-by: Alexandre DERUMIER Cc: stable@vger.kernel.org Fixes: 9da0e4d5ac969909f6b435ce28ea28135a9cbd69 Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 62 ++++++++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7bf8096f013d..3ac33f86c873 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7477,23 +7477,53 @@ void kvm_arch_sync_events(struct kvm *kvm) int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) { int i, r; + u64 hva; + struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memory_slot *slot, old; /* Called with kvm->slots_lock held. */ if (WARN_ON(id >= KVM_MEM_SLOTS_NUM)) return -EINVAL; + slot = id_to_memslot(slots, id); + if (size) { + if (WARN_ON(slot->npages)) + return -EEXIST; + + /* + * MAP_SHARED to prevent internal slot pages from being moved + * by fork()/COW. 
+ */ + hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, 0); + if (IS_ERR((void *)hva)) + return PTR_ERR((void *)hva); + } else { + if (!slot->npages) + return 0; + + hva = 0; + } + + old = *slot; for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { struct kvm_userspace_memory_region m; m.slot = id | (i << 16); m.flags = 0; m.guest_phys_addr = gpa; + m.userspace_addr = hva; m.memory_size = size; r = __kvm_set_memory_region(kvm, &m); if (r < 0) return r; } + if (!size) { + r = vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE); + WARN_ON(r < 0); + } + return 0; } EXPORT_SYMBOL_GPL(__x86_set_memory_region); @@ -7623,27 +7653,6 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { - /* - * Only private memory slots need to be mapped here since - * KVM_SET_MEMORY_REGION ioctl is no longer supported. - */ - if ((memslot->id >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_CREATE)) { - unsigned long userspace_addr; - - /* - * MAP_SHARED to prevent internal slot pages from being moved - * by fork()/COW. - */ - userspace_addr = vm_mmap(NULL, 0, memslot->npages * PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, 0); - - if (IS_ERR((void *)userspace_addr)) - return PTR_ERR((void *)userspace_addr); - - memslot->userspace_addr = userspace_addr; - } - return 0; } @@ -7705,17 +7714,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, { int nr_mmu_pages = 0; - if (change == KVM_MR_DELETE && old->id >= KVM_USER_MEM_SLOTS) { - int ret; - - ret = vm_munmap(old->userspace_addr, - old->npages * PAGE_SIZE); - if (ret < 0) - printk(KERN_WARNING - "kvm_vm_ioctl_set_memory_region: " - "failed to munmap memory\n"); - } - if (!kvm->arch.n_requested_mmu_pages) nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); -- cgit v1.2.3 From 5d9bc648b94dd719022343b8675e6c4381f0c45f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 13 Oct 2015 10:18:53 +0200 Subject: KVM: x86: clean up kvm_arch_vcpu_runnable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Split the huge conditional in two functions. 
Fixes: 64d6067057d9658acb8675afcfba549abdb7fc16 Cc: stable@vger.kernel.org Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3ac33f86c873..b69ef58e51ee 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6453,6 +6453,12 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) return 1; } +static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && + !vcpu->arch.apf.halted); +} + static int vcpu_run(struct kvm_vcpu *vcpu) { int r; @@ -6461,8 +6467,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); for (;;) { - if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && - !vcpu->arch.apf.halted) + if (kvm_vcpu_running(vcpu)) r = vcpu_enter_guest(vcpu); else r = vcpu_block(kvm, vcpu); @@ -7762,19 +7767,33 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, kvm_mmu_invalidate_zap_all_pages(kvm); } +static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) +{ + if (!list_empty_careful(&vcpu->async_pf.done)) + return true; + + if (kvm_apic_has_events(vcpu)) + return true; + + if (vcpu->arch.pv.pv_unhalted) + return true; + + if (atomic_read(&vcpu->arch.nmi_queued)) + return true; + + if (kvm_arch_interrupt_allowed(vcpu) && + kvm_cpu_has_interrupt(vcpu)) + return true; + + return false; +} + int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) kvm_x86_ops->check_nested_events(vcpu, false); - return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && - !vcpu->arch.apf.halted) - || !list_empty_careful(&vcpu->async_pf.done) - || kvm_apic_has_events(vcpu) - || vcpu->arch.pv.pv_unhalted - || atomic_read(&vcpu->arch.nmi_queued) || - (kvm_arch_interrupt_allowed(vcpu) && - kvm_cpu_has_interrupt(vcpu)); + return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); } int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 73917739334c6509833b0403b81d4a04a8784bdf Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 13 Oct 2015 10:19:35 +0200 Subject: KVM: x86: fix SMI to halted VCPU An SMI to a halted VCPU must wake it up, hence a VCPU with a pending SMI must be considered runnable. 
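For context, a sketch of how the two ends of this mechanism meet; the producer side shown here is existing KVM infrastructure, not part of this patch:

    /* Elsewhere in KVM, an SMI is queued for the vCPU: */
    kvm_make_request(KVM_REQ_SMI, vcpu);

    /* The new check in kvm_vcpu_has_events() below observes the pending
     * request, so kvm_arch_vcpu_runnable() now wakes a halted vCPU: */
    if (test_bit(KVM_REQ_SMI, &vcpu->requests))
            return true;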
Fixes: 64d6067057d9658acb8675afcfba549abdb7fc16 Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b69ef58e51ee..6e03546faf2e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7781,6 +7781,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (atomic_read(&vcpu->arch.nmi_queued)) return true; + if (test_bit(KVM_REQ_SMI, &vcpu->requests)) + return true; + if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)) return true; -- cgit v1.2.3 From 3435dd08092934ee9672fc28a3ee4c2017741bd6 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 12 Oct 2015 13:47:17 +0300 Subject: x86/early_printk: Set __iomem address space for IO There are the following warnings on unpatched code:

  arch/x86/kernel/early_printk.c:198:32: warning: incorrect type in initializer (different address spaces)
  arch/x86/kernel/early_printk.c:198:32:    expected void [noderef] *vaddr
  arch/x86/kernel/early_printk.c:198:32:    got unsigned int [usertype] *
  arch/x86/kernel/early_printk.c:205:32: warning: incorrect type in initializer (different address spaces)
  arch/x86/kernel/early_printk.c:205:32:    expected void [noderef] *vaddr
  arch/x86/kernel/early_printk.c:205:32:    got unsigned int [usertype] *

Annotate it properly. Signed-off-by: Andy Shevchenko Link: http://lkml.kernel.org/r/1444646837-42615-1-git-send-email-andriy.shevchenko@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/early_printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 076a4a739aa6..21bf92490a7b 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -195,14 +195,14 @@ static __init void early_serial_init(char *s) #ifdef CONFIG_PCI static void mem32_serial_out(unsigned long addr, int offset, int value) { - u32 *vaddr = (u32 *)addr; + u32 __iomem *vaddr = (u32 __iomem *)addr; /* shift implied by pointer type */ writel(value, vaddr + offset); } static unsigned int mem32_serial_in(unsigned long addr, int offset) { - u32 *vaddr = (u32 *)addr; + u32 __iomem *vaddr = (u32 __iomem *)addr; /* shift implied by pointer type */ return readl(vaddr + offset); } -- cgit v1.2.3 From a7b7617493179a0ff76cbc0cc2eb45ad07074765 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 22 Apr 2015 14:20:47 +0200 Subject: mm: add architecture primitives for software dirty bit clearing There are primitives to create and query the software dirty bits in a pte or pmd. But the clearing of the software dirty bits is done in common code with x86 specific page table functions. Add the missing architecture primitives to clear the software dirty bits to allow the feature to be used on non-x86 systems, e.g. the s390 architecture.
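As a purely illustrative sketch, an architecture that wants to use this would supply non-stub primitives in its own asm/pgtable.h; the real s390 version is not part of this patch, and the bit name below is hypothetical:

    /* Hypothetical <arch>/include/asm/pgtable.h; _PAGE_SOFT_DIRTY stands
     * for whatever software-defined PTE bit the architecture reserves. */
    static inline pte_t pte_clear_soft_dirty(pte_t pte)
    {
            pte_val(pte) &= ~_PAGE_SOFT_DIRTY;
            return pte;
    }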
Acked-by: Cyrill Gorcunov Signed-off-by: Martin Schwidefsky --- arch/x86/include/asm/pgtable.h | 10 ++++++++++ fs/proc/task_mmu.c | 4 ++-- include/asm-generic/pgtable.h | 10 ++++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 867da5bbb4a3..81e144dad405 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -318,6 +318,16 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } +static inline pte_t pte_clear_soft_dirty(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); +} + +static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY); +} + #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ /* diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e2d46adb54b4..b029d426c558 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -754,7 +754,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, if (pte_present(ptent)) { ptent = pte_wrprotect(ptent); - ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + ptent = pte_clear_soft_dirty(ptent); } else if (is_swap_pte(ptent)) { ptent = pte_swp_clear_soft_dirty(ptent); } @@ -768,7 +768,7 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, pmd_t pmd = *pmdp; pmd = pmd_wrprotect(pmd); - pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY); + pmd = pmd_clear_soft_dirty(pmd); if (vma->vm_flags & VM_SOFTDIRTY) vma->vm_flags &= ~VM_SOFTDIRTY; diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 29c57b2cb344..f167cdd80fd7 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -482,6 +482,16 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd; } +static inline pte_t pte_clear_soft_dirty(pte_t pte) +{ + return pte; +} + +static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) +{ + return pmd; +} + static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { return pte; -- cgit v1.2.3 From 8a53554e12e98d1759205afd7b8e9e2ea0936f48 Mon Sep 17 00:00:00 2001 From: "Kővágó, Zoltán" Date: Mon, 12 Oct 2015 15:13:56 +0100 Subject: x86/efi: Fix multiple GOP device support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When multiple GOP devices exist, but none of them implements ConOut, the code should just choose the first GOP (according to the comments). But currently 'fb_base' will refer to the last GOP, while the other parameters refer to the first GOP, which will likely result in a garbled display. I can reliably reproduce this bug using my ASRock Z87M Extreme4 motherboard with CSM and integrated GPU disabled, and two PCIe video cards (NVidia GT640 and GTX980), booting from efi-stub (booting from grub works fine). On the primary display the ASRock logo remains and on the secondary screen it is garbled up completely.
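The underlying bug is a generic loop pattern, sketched here with hypothetical helpers; query() and choose() stand in for __gop_query32()/__gop_query64() and the ConOut test in the diff below:

    /* Buggy: querying into the shared variable means the last GOP always
     * wins, regardless of which GOP the loop actually chose. */
    for (i = 0; i < nr_handles; i++) {
            query(gop[i], &fb_base);        /* clobbered on every pass */
            if (choose(gop[i]))
                    save_other_params(gop[i]);
    }

    /* Fixed: query into a local, publish it only when this GOP is chosen. */
    for (i = 0; i < nr_handles; i++) {
            u32 current_fb_base;
            query(gop[i], &current_fb_base);
            if (choose(gop[i])) {
                    save_other_params(gop[i]);
                    fb_base = current_fb_base;
            }
    }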
Signed-off-by: Kővágó, Zoltán Signed-off-by: Matt Fleming Cc: Cc: Linus Torvalds Cc: Matthew Garrett Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1444659236-24837-2-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/eboot.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index ee1b6d346b98..db51c1f27446 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -667,6 +667,7 @@ setup_gop32(struct screen_info *si, efi_guid_t *proto, bool conout_found = false; void *dummy = NULL; u32 h = handles[i]; + u32 current_fb_base; status = efi_call_early(handle_protocol, h, proto, (void **)&gop32); @@ -678,7 +679,7 @@ setup_gop32(struct screen_info *si, efi_guid_t *proto, if (status == EFI_SUCCESS) conout_found = true; - status = __gop_query32(gop32, &info, &size, &fb_base); + status = __gop_query32(gop32, &info, &size, ¤t_fb_base); if (status == EFI_SUCCESS && (!first_gop || conout_found)) { /* * Systems that use the UEFI Console Splitter may @@ -692,6 +693,7 @@ setup_gop32(struct screen_info *si, efi_guid_t *proto, pixel_format = info->pixel_format; pixel_info = info->pixel_information; pixels_per_scan_line = info->pixels_per_scan_line; + fb_base = current_fb_base; /* * Once we've found a GOP supporting ConOut, @@ -770,6 +772,7 @@ setup_gop64(struct screen_info *si, efi_guid_t *proto, bool conout_found = false; void *dummy = NULL; u64 h = handles[i]; + u32 current_fb_base; status = efi_call_early(handle_protocol, h, proto, (void **)&gop64); @@ -781,7 +784,7 @@ setup_gop64(struct screen_info *si, efi_guid_t *proto, if (status == EFI_SUCCESS) conout_found = true; - status = __gop_query64(gop64, &info, &size, &fb_base); + status = __gop_query64(gop64, &info, &size, ¤t_fb_base); if (status == EFI_SUCCESS && (!first_gop || conout_found)) { /* * Systems that use the UEFI Console Splitter may @@ -795,6 +798,7 @@ setup_gop64(struct screen_info *si, efi_guid_t *proto, pixel_format = info->pixel_format; pixel_info = info->pixel_information; pixels_per_scan_line = info->pixels_per_scan_line; + fb_base = current_fb_base; /* * Once we've found a GOP supporting ConOut, -- cgit v1.2.3 From 25188b9986cf6b0cadcf1bc1d1693a2e9c50ed47 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 14 Oct 2015 15:51:08 +0200 Subject: KVM: x86: fix previous commit for 32-bit Unfortunately I only noticed this after pushing. Fixes: f0d648bdf0a5bbc91da6099d5282f77996558ea4 Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6e03546faf2e..9a9a19830321 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7482,7 +7482,7 @@ void kvm_arch_sync_events(struct kvm *kvm) int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) { int i, r; - u64 hva; + unsigned long hva; struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *slot, old; -- cgit v1.2.3 From b10d92a54dac25a6152f1aa1ffc95c12908035ce Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 14 Oct 2015 15:25:52 +0200 Subject: KVM: x86: fix RSM into 64-bit protected mode In order to get into 64-bit protected mode, you need to enable paging while EFER.LMA=1. For this to work, CS.L must be 0. 
Currently, we load the segments before CR0 and CR4, which means that if RSM returns into 64-bit protected mode CS.L is already 1 and everything breaks. Luckily, CS.L=0 is always the case when executing RSM, because it is forbidden to execute RSM from 64-bit protected mode. Hence it is enough to load CR0 and CR4 first, and only then the segments. Fixes: 660a5d517aaab9187f93854425c4c63f4a09195c Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b372a7557c16..9da95b9daf8d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2418,7 +2418,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) u64 val, cr0, cr4; u32 base3; u16 selector; - int i; + int i, r; for (i = 0; i < 16; i++) *reg_write(ctxt, i) = GET_SMSTATE(u64, smbase, 0x7ff8 - i * 8); @@ -2460,13 +2460,17 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) dt.address = GET_SMSTATE(u64, smbase, 0x7e68); ctxt->ops->set_gdt(ctxt, &dt); + r = rsm_enter_protected_mode(ctxt, cr0, cr4); + if (r != X86EMUL_CONTINUE) + return r; + for (i = 0; i < 6; i++) { - int r = rsm_load_seg_64(ctxt, smbase, i); + r = rsm_load_seg_64(ctxt, smbase, i); if (r != X86EMUL_CONTINUE) return r; } - return rsm_enter_protected_mode(ctxt, cr0, cr4); + return X86EMUL_CONTINUE; } static int em_rsm(struct x86_emulate_ctxt *ctxt) -- cgit v1.2.3 From c77f3fab441c3e466b4c3601a475fc31ce156b06 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 8 Oct 2015 20:23:33 +0200 Subject: kvm: x86: set KVM_REQ_EVENT when updating IRR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After moving PIR to IRR, the interrupt needs to be delivered manually. Reported-by: Paolo Bonzini Cc: Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 944b38a56929..168b8759bd73 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -348,6 +348,8 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) struct kvm_lapic *apic = vcpu->arch.apic; __kvm_apic_update_irr(pir, apic->regs); + + kvm_make_request(KVM_REQ_EVENT, vcpu); } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); -- cgit v1.2.3 From db2bdcbbbd32e5500b822d5e74ef8b5bd777e687 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 8 Oct 2015 20:23:34 +0200 Subject: KVM: x86: fix edge EOI and IOAPIC reconfig race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM uses eoi_exit_bitmap to track vectors that need an action on EOI. The problem is that IOAPIC can be reconfigured while an interrupt with old configuration is pending and eoi_exit_bitmap only remembers the newest configuration; thus EOI from the pending interrupt is not recognized. (Reconfiguration is not a problem for level interrupts, because IOAPIC sends interrupt with the new configuration.) 
For an edge interrupt with ACK notifiers, like the i8254 timer, things can happen in this order:

 1) IOAPIC injects a vector from i8254
 2) guest reconfigures that vector's VCPU and therefore eoi_exit_bitmap on the original VCPU gets cleared
 3) guest's handler for the vector does EOI
 4) KVM's EOI handler doesn't pass that vector to IOAPIC because it is not in that VCPU's eoi_exit_bitmap
 5) i8254 stops working

A simple solution is to set the IOAPIC vector in eoi_exit_bitmap if the vector is in PIR/IRR/ISR. This creates an unwanted situation if the vector is reused by a non-IOAPIC source, but I think it is so rare that we don't want to make the solution more sophisticated. The simple solution also doesn't work if we are reconfiguring the vector. (Shouldn't happen in the wild and I'd rather fix users of ACK notifiers instead of working around that.) There are no races because ioapic injection and reconfig are locked. Fixes: b053b2aef25d ("KVM: x86: Add EOI exit bitmap inference") [Before b053b2aef25d, this bug happened only with APICv.] Fixes: c7c9c56ca26f ("x86, apicv: add virtual interrupt delivery support") Cc: Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 4 +++- arch/x86/kvm/x86.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 2dcda0f188ba..88d0a92d3f94 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -246,7 +246,9 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) || index == RTC_GSI) { if (kvm_apic_match_dest(vcpu, NULL, 0, - e->fields.dest_id, e->fields.dest_mode)) + e->fields.dest_id, e->fields.dest_mode) || + (e->fields.trig_mode == IOAPIC_EDGE_TRIG && + kvm_apic_pending_eoi(vcpu, e->fields.vector))) __set_bit(e->fields.vector, (unsigned long *)eoi_exit_bitmap); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e28954d2698a..e33aebbf189e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6201,8 +6201,10 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, vcpu->arch.eoi_exit_bitmap); - else + else { + kvm_x86_ops->sync_pir_to_irr(vcpu); kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap); + } kvm_x86_ops->load_eoi_exitmap(vcpu); } -- cgit v1.2.3 From 13db77347db175a68edd58a79963cdf5cb3a9607 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 8 Oct 2015 20:30:00 +0200 Subject: KVM: x86: don't notify userspace IOAPIC on edge EOI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On real hardware, edge-triggered interrupts don't set a bit in TMR, which means that IOAPIC isn't notified on EOI. Do the same here. Staying in guest/kernel mode after edge EOI is what we want for most devices. If some bugs could be nicely worked around with edge EOI notifications, we should invest in a better interface.
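As a worked example of the bit extraction in the irq_comm.c hunk below (the routing entry values here are made up for illustration):

    /* Hypothetical MSI routing entry: address_lo = 0xfee3d00c, data = 0x8031 */
    dest_id   = (0xfee3d00c >> 12) & 0xff;        /* = 0x3d                  */
    dest_mode = (0xfee3d00c >> 2) & 0x1;          /* = 1, logical addressing */
    level     = 0x8031 & MSI_DATA_TRIGGER_LEVEL;  /* bit 15 set, so level    */
    vector    = 0x8031 & 0xff;                    /* = 0x31                  */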
Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index c89228980230..8f4499c7ffc1 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -389,13 +389,15 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) for (i = 0; i < nr_ioapic_pins; ++i) { hlist_for_each_entry(entry, &table->map[i], link) { u32 dest_id, dest_mode; + bool level; if (entry->type != KVM_IRQ_ROUTING_MSI) continue; dest_id = (entry->msi.address_lo >> 12) & 0xff; dest_mode = (entry->msi.address_lo >> 2) & 0x1; - if (kvm_apic_match_dest(vcpu, NULL, 0, dest_id, - dest_mode)) { + level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL; + if (level && kvm_apic_match_dest(vcpu, NULL, 0, + dest_id, dest_mode)) { u32 vector = entry->msi.data & 0xff; __set_bit(vector, -- cgit v1.2.3 From 991e7a0eedf12721f5561d7a1a54d248dc290bc0 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 16 Sep 2015 17:30:05 +0800 Subject: KVM: VMX: adjust interface to allocate/free_vpid Adjust allocate/free_vpid so that they can be reused for the nested vpid. Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8eeba6ac5914..1452271e98ae 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4268,29 +4268,28 @@ static int alloc_identity_pagetable(struct kvm *kvm) return r; } -static void allocate_vpid(struct vcpu_vmx *vmx) +static int allocate_vpid(void) { int vpid; - vmx->vpid = 0; if (!enable_vpid) - return; + return 0; spin_lock(&vmx_vpid_lock); vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); - if (vpid < VMX_NR_VPIDS) { - vmx->vpid = vpid; + if (vpid < VMX_NR_VPIDS) __set_bit(vpid, vmx_vpid_bitmap); - } + else + vpid = 0; spin_unlock(&vmx_vpid_lock); + return vpid; } -static void free_vpid(struct vcpu_vmx *vmx) +static void free_vpid(int vpid) { - if (!enable_vpid) + if (!enable_vpid || vpid == 0) return; spin_lock(&vmx_vpid_lock); - if (vmx->vpid != 0) - __clear_bit(vmx->vpid, vmx_vpid_bitmap); + __clear_bit(vpid, vmx_vpid_bitmap); spin_unlock(&vmx_vpid_lock); } @@ -8629,7 +8628,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) if (enable_pml) vmx_disable_pml(vmx); - free_vpid(vmx); + free_vpid(vmx->vpid); leave_guest_mode(vcpu); vmx_load_vmcs01(vcpu); free_nested(vmx); @@ -8648,7 +8647,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx) return ERR_PTR(-ENOMEM); - allocate_vpid(vmx); + vmx->vpid = allocate_vpid(); err = kvm_vcpu_init(&vmx->vcpu, kvm, id); if (err) @@ -8724,7 +8723,7 @@ free_msrs: uninit_vcpu: kvm_vcpu_uninit(&vmx->vcpu); free_vcpu: - free_vpid(vmx); + free_vpid(vmx->vpid); kmem_cache_free(kvm_vcpu_cache, vmx); return ERR_PTR(err); } -- cgit v1.2.3 From dd5f5341a3a684c8850f8a8ccbaa70e196518e76 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 23 Sep 2015 18:26:57 +0800 Subject: KVM: VMX: introduce __vmx_flush_tlb to handle specific vpid Introduce __vmx_flush_tlb() to handle a specific vpid.
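For a sense of where the explicit-vpid interfaces are headed, a sketch of the intended nested use; the vpid02 field name is an assumption here, since it is introduced by later patches in this series rather than this one:

    /* Flush only the TLB entries tagged with the nested guest's vpid,
     * leaving the L1 guest's own vpid mappings intact: */
    __vmx_flush_tlb(vcpu, vmx->nested.vpid02);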
Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1452271e98ae..e1d94f81a28d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1392,13 +1392,13 @@ static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) __loaded_vmcs_clear, loaded_vmcs, 1); } -static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) +static inline void vpid_sync_vcpu_single(int vpid) { - if (vmx->vpid == 0) + if (vpid == 0) return; if (cpu_has_vmx_invvpid_single()) - __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); + __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0); } static inline void vpid_sync_vcpu_global(void) @@ -1407,10 +1407,10 @@ static inline void vpid_sync_vcpu_global(void) __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); } -static inline void vpid_sync_context(struct vcpu_vmx *vmx) +static inline void vpid_sync_context(int vpid) { if (cpu_has_vmx_invvpid_single()) - vpid_sync_vcpu_single(vmx); + vpid_sync_vcpu_single(vpid); else vpid_sync_vcpu_global(); } @@ -3563,9 +3563,9 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif -static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) { - vpid_sync_context(to_vmx(vcpu)); + vpid_sync_context(vpid); if (enable_ept) { if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; @@ -3573,6 +3573,11 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) } } +static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +{ + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid); +} + static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; @@ -4915,7 +4920,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_fpu_activate(vcpu); update_exception_bitmap(vcpu); - vpid_sync_context(vmx); + vpid_sync_context(vmx->vpid); } /* -- cgit v1.2.3 From af22aa7c766d50712b9afeca53e9e4208ce6284c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 12 Oct 2015 17:32:13 -0700 Subject: x86/asm: Remove the xyz_cfi macros from dwarf2.h They are currently unused, and I don't think that anyone was ever particularly happy with them. They had the unfortunate property that they made it easy to CFI-annotate things without thinking about them -- when pushing, do you want to just update the CFA offset, or do you also want to update the saved location of the register being pushed? Suggested-by: Ingo Molnar Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447bfbd10bb268b4593b32534ecefa1f4df287e.1444696194.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/dwarf2.h | 93 ------------------------------------------- 1 file changed, 93 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index 09133ba032b3..b7a1ab865d68 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h @@ -81,97 +81,4 @@ #endif -/* - * An attempt to make CFI annotations more or less - * correct and shorter. It is implied that you know - * what you're doing if you use them. 
- */ -#ifdef __ASSEMBLY__ -#ifdef CONFIG_X86_64 - .macro pushq_cfi reg - pushq \reg - CFI_ADJUST_CFA_OFFSET 8 - .endm - - .macro pushq_cfi_reg reg - pushq %\reg - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET \reg, 0 - .endm - - .macro popq_cfi reg - popq \reg - CFI_ADJUST_CFA_OFFSET -8 - .endm - - .macro popq_cfi_reg reg - popq %\reg - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE \reg - .endm - - .macro pushfq_cfi - pushfq - CFI_ADJUST_CFA_OFFSET 8 - .endm - - .macro popfq_cfi - popfq - CFI_ADJUST_CFA_OFFSET -8 - .endm - - .macro movq_cfi reg offset=0 - movq %\reg, \offset(%rsp) - CFI_REL_OFFSET \reg, \offset - .endm - - .macro movq_cfi_restore offset reg - movq \offset(%rsp), %\reg - CFI_RESTORE \reg - .endm -#else /*!CONFIG_X86_64*/ - .macro pushl_cfi reg - pushl \reg - CFI_ADJUST_CFA_OFFSET 4 - .endm - - .macro pushl_cfi_reg reg - pushl %\reg - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET \reg, 0 - .endm - - .macro popl_cfi reg - popl \reg - CFI_ADJUST_CFA_OFFSET -4 - .endm - - .macro popl_cfi_reg reg - popl %\reg - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE \reg - .endm - - .macro pushfl_cfi - pushfl - CFI_ADJUST_CFA_OFFSET 4 - .endm - - .macro popfl_cfi - popfl - CFI_ADJUST_CFA_OFFSET -4 - .endm - - .macro movl_cfi reg offset=0 - movl %\reg, \offset(%esp) - CFI_REL_OFFSET \reg, \offset - .endm - - .macro movl_cfi_restore offset reg - movl \offset(%esp), %\reg - CFI_RESTORE \reg - .endm -#endif /*!CONFIG_X86_64*/ -#endif /*__ASSEMBLY__*/ - #endif /* _ASM_X86_DWARF2_H */ -- cgit v1.2.3 From 612bece654ff6cd43160e201985be826e96b8bcb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 12 Oct 2015 17:32:14 -0700 Subject: um/x86: Fix build after x86 syscall changes I didn't realize that um didn't include x86's asm/syscall.h. Re-add a missing typedef. Reported-by: Richard Weinberger Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 034042cc1e28 ("x86/entry/syscalls: Move syscall table declarations into asm/syscalls.h") Link: http://lkml.kernel.org/r/8d15b9a88f4fd49e3342757e0a34624ee5ce9220.1444696194.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/um/asm/syscall.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/um/asm/syscall.h b/arch/x86/um/asm/syscall.h index 9fe77b7b5a0e..81d6562ce01d 100644 --- a/arch/x86/um/asm/syscall.h +++ b/arch/x86/um/asm/syscall.h @@ -3,6 +3,10 @@ #include +typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long, + unsigned long, unsigned long, + unsigned long, unsigned long); + static inline int syscall_get_arch(void) { #ifdef CONFIG_X86_32 -- cgit v1.2.3 From d81056b5278c520f1dede99bd59c14366ac8b1e1 Mon Sep 17 00:00:00 2001 From: Lukasz Anaczkowski Date: Wed, 9 Sep 2015 15:47:29 +0200 Subject: x86, ACPI: Handle apic/x2apic entries in MADT in correct order ACPI specifies the following rules when listing APIC IDs: (1) Boot processor is listed first (2) For multi-threaded processors, BIOS should list the first logical processor of each of the individual multi-threaded processors in MADT before listing any of the second logical processors. (3) APIC IDs < 0xFF should be listed in APIC subtable, APIC IDs >= 0xFF should be listed in X2APIC subtable Because of above, when there's more than 0xFF logical CPUs, BIOS interleaves APIC/X2APIC subtables. Assuming, there's 72 cores, 72 hyper-threads each, 288 CPUs total, listing is like this: APIC (0,4,8, .., 252) X2APIC (258,260,264, .. 
284) APIC (1,5,9,...,253) X2APIC (257,261,265,...,285) APIC (2,6,10,...,254) X2APIC (258,262,266,...,286) APIC (3,7,11,...,251) X2APIC (255,259,263,...,287) Now, before this patch, due to how ACPI MADT subtables were parsed (BSP then X2APIC then APIC), the kernel enumerated CPUs in reverse order (i.e. high APIC IDs were getting low logical IDs, and low APIC IDs were getting high logical IDs). This is wrong for the following reasons: () it's hard to predict how cores and threads are enumerated () when it's hard to predict, s/w threads cannot be properly affinitized, causing significant performance impact due to e.g. improper cache sharing () enumeration is inconsistent with how threads are enumerated on other Intel Xeon processors So, the order in which the MADT APIC/X2APIC handlers are passed is reversed, and both handlers are passed to be called during the same MADT table walk, to achieve correct CPU enumeration. In a scenario where someone boots the kernel with the options 'maxcpus=72 nox2apic', fewer cores may be booted as a result, since some of the CPUs the kernel will try to use will have APIC ID >= 0xFF. In such a case, one should not pass 'nox2apic'. Disclaimer: the code parsing MADT APIC/X2APIC has not been touched since 2009, when X2APIC support was initially added. I do not know why the MADT parsing code was added in the reversed order in the first place. I guess it didn't matter at that time since nobody cared about cores with APIC IDs >= 0xFF, right? This patch is based on work of "Yinghai Lu " previously published at https://lkml.org/lkml/2013/1/21/563 Here's the explanation of why the parsing interface needs to be changed and why a simpler approach will not work: https://lkml.org/lkml/2015/9/7/285 Signed-off-by: Lukasz Anaczkowski Acked-by: Thomas Gleixner (commit message) Signed-off-by: Rafael J.
Wysocki --- arch/x86/kernel/acpi/boot.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index ded848c20e05..e75907601a41 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -976,6 +976,8 @@ static int __init acpi_parse_madt_lapic_entries(void) { int count; int x2count = 0; + int ret; + struct acpi_subtable_proc madt_proc[2]; if (!cpu_has_apic) return -ENODEV; @@ -999,10 +1001,22 @@ static int __init acpi_parse_madt_lapic_entries(void) acpi_parse_sapic, MAX_LOCAL_APIC); if (!count) { - x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, - acpi_parse_x2apic, MAX_LOCAL_APIC); - count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, - acpi_parse_lapic, MAX_LOCAL_APIC); + memset(madt_proc, 0, sizeof(madt_proc)); + madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC; + madt_proc[0].handler = acpi_parse_lapic; + madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC; + madt_proc[1].handler = acpi_parse_x2apic; + ret = acpi_table_parse_entries_array(ACPI_SIG_MADT, + sizeof(struct acpi_table_madt), + madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC); + if (ret < 0) { + printk(KERN_ERR PREFIX + "Error parsing LAPIC/X2APIC entries\n"); + return ret; + } + + count = madt_proc[0].count; + x2count = madt_proc[1].count; } if (!count && !x2count) { printk(KERN_ERR PREFIX "No LAPIC entries present\n"); -- cgit v1.2.3 From 6a35fc2d6c22bafe45117cdc5d8cee332244edbb Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Wed, 14 Oct 2015 16:11:59 -0700 Subject: cpufreq: intel_pstate: get P1 from TAR when available After Ivybridge, the max non turbo ratio obtained from the platform info msr is not always guaranteed to be P1 on client platforms. The max non turbo activation ratio (TAR) determines the max for the current level of TDP. The ratio in platform info is the physical max. The TAR MSR can be locked, so updating this value is not possible on all platforms. This change gets this ratio from MSR TURBO_ACTIVATION_RATIO if available, but also does some sanity checking to make sure that this value is correct. The sanity check involves reading the TDP ratio for the current TDP control value when the platform has configurable TDP present, and matching TAR with this. Signed-off-by: Srinivas Pandruvada Acked-by: Kristen Carlson Accardi Signed-off-by: Rafael J.
Wysocki --- arch/x86/include/asm/msr-index.h | 7 +++++++ drivers/cpufreq/intel_pstate.c | 39 +++++++++++++++++++++++++++++++++++---- 2 files changed, 42 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b8c14bb7fc8f..9f3905697f12 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -206,6 +206,13 @@ #define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0 #define MSR_RING_PERF_LIMIT_REASONS 0x000006B1 +/* Config TDP MSRs */ +#define MSR_CONFIG_TDP_NOMINAL 0x00000648 +#define MSR_CONFIG_TDP_LEVEL1 0x00000649 +#define MSR_CONFIG_TDP_LEVEL2 0x0000064A +#define MSR_CONFIG_TDP_CONTROL 0x0000064B +#define MSR_TURBO_ACTIVATION_RATIO 0x0000064C + /* Hardware P state interface */ #define MSR_PPERF 0x0000064e #define MSR_PERF_LIMIT_REASONS 0x0000064f diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 3af9dd7332e6..da92d0201d52 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -43,7 +43,6 @@ #define int_tofp(X) ((int64_t)(X) << FRAC_BITS) #define fp_toint(X) ((X) >> FRAC_BITS) - static inline int32_t mul_fp(int32_t x, int32_t y) { return ((int64_t)x * (int64_t)y) >> FRAC_BITS; @@ -593,10 +592,42 @@ static int core_get_min_pstate(void) static int core_get_max_pstate(void) { - u64 value; + u64 tar; + u64 plat_info; + int max_pstate; + int err; + + rdmsrl(MSR_PLATFORM_INFO, plat_info); + max_pstate = (plat_info >> 8) & 0xFF; + + err = rdmsrl_safe(MSR_TURBO_ACTIVATION_RATIO, &tar); + if (!err) { + /* Do some sanity checking for safety */ + if (plat_info & 0x600000000) { + u64 tdp_ctrl; + u64 tdp_ratio; + int tdp_msr; + + err = rdmsrl_safe(MSR_CONFIG_TDP_CONTROL, &tdp_ctrl); + if (err) + goto skip_tar; + + tdp_msr = MSR_CONFIG_TDP_NOMINAL + tdp_ctrl; + err = rdmsrl_safe(tdp_msr, &tdp_ratio); + if (err) + goto skip_tar; + + if (tdp_ratio - 1 == tar) { + max_pstate = tar; + pr_debug("max_pstate=TAC %x\n", max_pstate); + } else { + goto skip_tar; + } + } + } - rdmsrl(MSR_PLATFORM_INFO, value); - return (value >> 8) & 0xFF; +skip_tar: + return max_pstate; } static int core_get_turbo_pstate(void) -- cgit v1.2.3 From 99b83ac893b84ed1a62ad6d1f2b6cc32026b9e85 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 13 Oct 2015 09:12:21 -0700 Subject: KVM: nVMX: emulate the INVVPID instruction Add the INVVPID instruction emulation. 
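The heart of the emulation is a capability check: the invalidation type requested by L1 must be one that L0 has advertised. A minimal standalone sketch of just that check (the helper and the main() harness below are invented for illustration; the layout mirrors the patch that follows, where the supported-type bits sit at bits 8-10 of the VPID capability word):

    #include <stdio.h>

    /* Bits 8..10 of the VPID capability word advertise which INVVPID
     * types (0 = individual address, 1 = single context, 2 = all
     * context) may be requested; any type outside the advertised set
     * must fail with an invalid-operand error.
     */
    static int invvpid_type_supported(unsigned int vpid_caps, unsigned long type)
    {
            unsigned long types = (vpid_caps >> 8) & 0x7;

            return (types & (1UL << type)) != 0;
    }

    int main(void)
    {
            unsigned int caps = 1u << 10;   /* only all-context advertised */

            printf("all-context ok: %d\n", invvpid_type_supported(caps, 2));
            printf("single-context ok: %d\n", invvpid_type_supported(caps, 1));
            return 0;
    }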
Reviewed-by: Wincy Van Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/vmx.c | 61 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index d25f32ac9c5c..aa336ff3e03e 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -416,6 +416,7 @@ enum vmcs_field { #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) +#define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */ #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e1d94f81a28d..9b6ab311d0bd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -442,6 +442,7 @@ struct nested_vmx { u32 nested_vmx_misc_low; u32 nested_vmx_misc_high; u32 nested_vmx_ept_caps; + u32 nested_vmx_vpid_caps; }; #define POSTED_INTR_ON 0 @@ -2612,6 +2613,8 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) } else vmx->nested.nested_vmx_ept_caps = 0; + vmx->nested.nested_vmx_vpid_caps = 0; + if (enable_unrestricted_guest) vmx->nested.nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_UNRESTRICTED_GUEST; @@ -7329,7 +7332,63 @@ static int handle_invept(struct kvm_vcpu *vcpu) static int handle_invvpid(struct kvm_vcpu *vcpu) { - kvm_queue_exception(vcpu, UD_VECTOR); + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 vmx_instruction_info; + unsigned long type, types; + gva_t gva; + struct x86_exception e; + int vpid; + + if (!(vmx->nested.nested_vmx_secondary_ctls_high & + SECONDARY_EXEC_ENABLE_VPID) || + !(vmx->nested.nested_vmx_vpid_caps & VMX_VPID_INVVPID_BIT)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + + types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7; + + if (!(types & (1UL << type))) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return 1; + } + + /* according to the intel vmx instruction reference, the memory + * operand is read even if it isn't needed (e.g., for type==global) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, false, &gva)) + return 1; + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid, + sizeof(u32), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + switch (type) { + case VMX_VPID_EXTENT_ALL_CONTEXT: + if (get_vmcs12(vcpu)->virtual_processor_id == 0) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return 1; + } + vmx_flush_tlb(vcpu); + nested_vmx_succeed(vcpu); + break; + default: + /* Trap single context invalidation invvpid calls */ + BUG_ON(1); + break; + } + + skip_emulated_instruction(vcpu); return 1; } -- cgit v1.2.3 From 5c614b3583e7b6dab0c86356fa36c2bcbb8322a0 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 13 Oct 2015 09:18:36 -0700 Subject: KVM: nVMX: nested VPID emulation VPID is used to tag an address space and avoid a TLB flush. Currently L0 uses the same VPID to run L1 and all its guests. KVM flushes the VPID when switching between L1 and L2.
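In outline, the scheme keeps one shadow VPID (vpid02) per vCPU and invalidates it only when L1 installs a different vpid12; a hypothetical C sketch of that rule (the field names mirror the patch, while the helper function and main() are invented stand-ins for the INVVPID instruction):

    /* Sketch of the shadow-VPID reuse rule; not the kernel code itself. */
    struct nested_state {
            unsigned short vpid02;    /* shadow VPID owned by L0 */
            unsigned short last_vpid; /* last vpid12 seen from L1 */
    };

    static void invvpid_single_context(unsigned short vpid)
    {
            (void)vpid; /* would execute INVVPID, single-context, here */
    }

    static void on_nested_vmentry(struct nested_state *n, unsigned short vpid12)
    {
            if (vpid12 != n->last_vpid) {
                    /* L1 switched guest address space: flush the shadow VPID */
                    n->last_vpid = vpid12;
                    invvpid_single_context(n->vpid02);
            }
            /* otherwise TLB entries tagged with vpid02 stay valid */
    }

    int main(void)
    {
            struct nested_state n = { .vpid02 = 1, .last_vpid = 0 };

            on_nested_vmentry(&n, 5); /* new vpid12: one flush */
            on_nested_vmentry(&n, 5); /* same vpid12: no flush */
            return 0;
    }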
This patch advertises VPID to the L1 hypervisor, so that the address spaces of L1 and L2 can be treated separately and a TLB flush can be avoided when switching between L1 and L2. For each nested vmentry, if vpid12 is changed, reuse the shadow vpid w/ an invvpid. Performance: run lmbench on L2 w/ 3.5 kernel. Context switching - times in microseconds - smaller is better ------------------------------------------------------------------------- Host OS 2p/0K 2p/16K 2p/64K 8p/16K 8p/64K 16p/16K 16p/64K ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw --------- ------------- ------ ------ ------ ------ ------ ------- ------- kernel Linux 3.5.0-1 1.2200 1.3700 1.4500 4.7800 2.3300 5.60000 2.88000 nested VPID kernel Linux 3.5.0-1 1.2600 1.4300 1.5600 12.7 12.9 3.49000 7.46000 vanilla Reviewed-by: Jan Kiszka Reviewed-by: Wincy Van Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 39 ++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9b6ab311d0bd..bea552204aa2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -426,6 +426,9 @@ struct nested_vmx { /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ u64 vmcs01_debugctl; + u16 vpid02; + u16 last_vpid; + u32 nested_vmx_procbased_ctls_low; u32 nested_vmx_procbased_ctls_high; u32 nested_vmx_true_procbased_ctls_low; @@ -1213,6 +1216,11 @@ static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); } +static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID); +} + static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); @@ -2590,6 +2598,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING | @@ -6818,6 +6827,7 @@ static void free_nested(struct vcpu_vmx *vmx) return; vmx->nested.vmxon = false; + free_vpid(vmx->nested.vpid02); nested_release_vmcs12(vmx); if (enable_shadow_vmcs) free_vmcs(vmx->nested.current_shadow_vmcs); @@ -7379,7 +7389,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); return 1; } - vmx_flush_tlb(vcpu); + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); nested_vmx_succeed(vcpu); break; default: @@ -8759,8 +8769,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vmcs; } - if (nested) + if (nested) { nested_vmx_setup_ctls_msrs(vmx); + vmx->nested.vpid02 = allocate_vpid(); + } vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; @@ -8781,6 +8793,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) return &vmx->vcpu; free_vmcs: + free_vpid(vmx->nested.vpid02); free_loaded_vmcs(vmx->loaded_vmcs); free_msrs: kfree(vmx->guest_msrs); @@ -9665,12 +9678,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (enable_vpid) { /* - * Trivially support vpid by letting L2s share their parent - * L1's vpid. TODO: move to a more elaborate solution, giving - * each L2 its own vpid and exposing the vpid feature to L1.
+ * There is no direct mapping between vpid02 and vpid12, the + * vpid02 is per-vCPU for L0 and reused while the value of + * vpid12 is changed w/ one invvpid during nested vmentry. + * The vpid12 is allocated by L1 for L2, so it will not + * influence global bitmap(for vpid01 and vpid02 allocation) + * even if spawn a lot of nested vCPUs. */ - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - vmx_flush_tlb(vcpu); + if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) { + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); + if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { + vmx->nested.last_vpid = vmcs12->virtual_processor_id; + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); + } + } else { + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + vmx_flush_tlb(vcpu); + } + } if (nested_cpu_has_ept(vmcs12)) { -- cgit v1.2.3 From 089d7b6ec5151ad06a2cd524bc0580d311b641ad Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 13 Oct 2015 09:18:37 -0700 Subject: KVM: nVMX: expose VPID capability to L1 Expose VPID capability to L1. For nested guests, we don't do anything specific for single context invalidation. Hence, only advertise support for global context invalidation. The major benefit of nested VPID comes from having separate vpids when switching between L1 and L2, and also when L2's vCPUs are not scheduled in/out on L1. Reviewed-by: Wincy Van Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bea552204aa2..15bff51d492a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2622,7 +2622,11 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) } else vmx->nested.nested_vmx_ept_caps = 0; - vmx->nested.nested_vmx_vpid_caps = 0; + if (enable_vpid) + vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | + VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; + else + vmx->nested.nested_vmx_vpid_caps = 0; if (enable_unrestricted_guest) vmx->nested.nested_vmx_secondary_ctls_high |= @@ -2739,7 +2743,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) break; case MSR_IA32_VMX_EPT_VPID_CAP: /* Currently, no nested vpid support */ - *pdata = vmx->nested.nested_vmx_ept_caps; + *pdata = vmx->nested.nested_vmx_ept_caps | + ((u64)vmx->nested.nested_vmx_vpid_caps << 32); break; default: return 1; -- cgit v1.2.3 From 951f9fd74f2d826fff1d84a8ec34b491517dc15d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 23 Sep 2015 10:34:26 +0200 Subject: KVM: x86: manually unroll bad_mt_xwr loop The loop is computing one of two constants; it can be simpler to write everything inline.
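The equivalence between the removed loop and the inlined constants is easy to check in user space; a throwaway sketch (with REPEAT_BYTE re-derived here from its linux/kernel.h definition) that compares both computations:

    #include <stdio.h>
    #include <stdint.h>

    #define REPEAT_BYTE(x) ((~0ull / 0xff) * (x))  /* as in linux/kernel.h */

    static uint64_t by_loop(int execonly)
    {
            uint64_t mask = 0;
            int pte;

            for (pte = 0; pte < 64; pte++) {
                    int rwx_bits = pte & 7;
                    int mt = pte >> 3;

                    if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||
                        rwx_bits == 0x2 || rwx_bits == 0x6 ||
                        (rwx_bits == 0x4 && !execonly))
                            mask |= 1ull << pte;
            }
            return mask;
    }

    static uint64_t by_constants(int execonly)
    {
            uint64_t mask = 0xFFull << (2 * 8);     /* bits 3..5 == 2 */
            mask |= 0xFFull << (3 * 8);             /* bits 3..5 == 3 */
            mask |= 0xFFull << (7 * 8);             /* bits 3..5 == 7 */
            mask |= REPEAT_BYTE(1ull << 2);         /* bits 0..2 == 010 */
            mask |= REPEAT_BYTE(1ull << 6);         /* bits 0..2 == 110 */
            if (!execonly)
                    mask |= REPEAT_BYTE(1ull << 4); /* bits 0..2 == 100 */
            return mask;
    }

    int main(void)
    {
            printf("execonly=0: %s\n", by_loop(0) == by_constants(0) ? "match" : "MISMATCH");
            printf("execonly=1: %s\n", by_loop(1) == by_constants(1) ? "match" : "MISMATCH");
            return 0;
    }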
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c3f39aa9b9cb..b8482c0b75b2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3706,7 +3706,7 @@ static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, int maxphyaddr, bool execonly) { - int pte; + u64 bad_mt_xwr; rsvd_check->rsvd_bits_mask[0][3] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); @@ -3724,14 +3724,16 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; - for (pte = 0; pte < 64; pte++) { - int rwx_bits = pte & 7; - int mt = pte >> 3; - if (mt == 0x2 || mt == 0x3 || mt == 0x7 || - rwx_bits == 0x2 || rwx_bits == 0x6 || - (rwx_bits == 0x4 && !execonly)) - rsvd_check->bad_mt_xwr |= (1ull << pte); + bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ + bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */ + bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */ + if (!execonly) { + /* bits 0..2 must not be 100 unless VMX capabilities allow it */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 4); } + rsvd_check->bad_mt_xwr = bad_mt_xwr; } static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From 6092d3d3e6db983048469d424a8f2221915a8dd3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 14 Oct 2015 15:10:54 +0200 Subject: kvm: svm: Only propagate next_rip when guest supports it Currently we always write the next_rip of the shadow vmcb to the guest's vmcb when we emulate a vmexit. This could confuse the guest when its cpuid indicates no support for the next_rip feature. Fix this by only propagating next_rip if the guest actually supports it. Cc: Bandan Das Cc: Dirk Mueller Tested-By: Dirk Mueller Signed-off-by: Joerg Roedel Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.h | 21 +++++++++++++++++++++ arch/x86/kvm/svm.c | 11 ++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index d434ee952b00..06332cb7e7d1 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -149,4 +149,25 @@ static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); return best && (best->edx & bit(X86_FEATURE_RDTSCP)); } + +/* + * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3 + */ +#define BIT_NRIPS 3 + +static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x8000000a, 0); + + /* + * NRIPS is a scattered cpuid feature, so we can't use + * X86_FEATURE_NRIPS here (X86_FEATURE_NRIPS would be bit + * position 8, not 3).
+ */ + return best && (best->edx & bit(BIT_NRIPS)); +} +#undef BIT_NRIPS + #endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 54a86183e5d3..cd8659cfc632 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -159,6 +159,9 @@ struct vcpu_svm { u32 apf_reason; u64 tsc_ratio; + + /* cached guest cpuid flags for faster access */ + bool nrips_enabled : 1; }; static DEFINE_PER_CPU(u64, current_tsc_ratio); @@ -2365,7 +2368,9 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; - nested_vmcb->control.next_rip = vmcb->control.next_rip; + + if (svm->nrips_enabled) + nested_vmcb->control.next_rip = vmcb->control.next_rip; /* * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have @@ -4085,6 +4090,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) static void svm_cpuid_update(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + + /* Update nrips enabled cache */ + svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu); } static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) -- cgit v1.2.3 From cd1872f028556dc0e8424e58413c0268c159383b Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 16 Oct 2015 17:04:13 +0900 Subject: KVM: x86: MMU: Make force_pt_level bool This will be passed to a function later. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 8 ++++---- arch/x86/kvm/paging_tmpl.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b8482c0b75b2..2262728863de 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2962,7 +2962,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, { int r; int level; - int force_pt_level; + bool force_pt_level; pfn_t pfn; unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; @@ -3476,7 +3476,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, pfn_t pfn; int r; int level; - int force_pt_level; + bool force_pt_level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; @@ -3497,9 +3497,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (mapping_level_dirty_bitmap(vcpu, gfn) || !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL)) - force_pt_level = 1; + force_pt_level = true; else - force_pt_level = 0; + force_pt_level = false; if (likely(!force_pt_level)) { level = mapping_level(vcpu, gfn); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 736e6ab8784d..07f1a4ede637 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -698,7 +698,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; - int force_pt_level; + bool force_pt_level; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; @@ -747,7 +747,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) || is_self_change_mapping; else - force_pt_level = 1; + force_pt_level = true; if (!force_pt_level) { level = min(walker.level, mapping_level(vcpu, walker.gfn)); walker.gfn = walker.gfn & 
~(KVM_PAGES_PER_HPAGE(level) - 1); -- cgit v1.2.3 From 5ed5c5c8fdbab889837c9223fc6f4bdaa830879c Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 16 Oct 2015 17:05:13 +0900 Subject: KVM: x86: MMU: Simplify force_pt_level calculation code in FNAME(page_fault)() As a bonus, an extra memory slot search can be eliminated when is_self_change_mapping is true. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/paging_tmpl.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 07f1a4ede637..8ebc3a5560ce 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -743,15 +743,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); - if (walker.level >= PT_DIRECTORY_LEVEL) - force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) - || is_self_change_mapping; - else + if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) { + force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); + if (!force_pt_level) { + level = min(walker.level, mapping_level(vcpu, walker.gfn)); + walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); + } + } else force_pt_level = true; - if (!force_pt_level) { - level = min(walker.level, mapping_level(vcpu, walker.gfn)); - walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); - } mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); -- cgit v1.2.3 From fd136902187838bcae3a572f41cb703553dd63b8 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 16 Oct 2015 17:06:02 +0900 Subject: KVM: x86: MMU: Move mapping_level_dirty_bitmap() call in mapping_level() This is necessary to eliminate an extra memory slot search later. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 29 ++++++++++++++--------------- arch/x86/kvm/paging_tmpl.h | 6 +++--- 2 files changed, 17 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2262728863de..890cd694c9a2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -870,10 +870,16 @@ static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true); } -static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) +static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, + bool *force_pt_level) { int host_level, level, max_level; + if (likely(!*force_pt_level)) + *force_pt_level = mapping_level_dirty_bitmap(vcpu, large_gfn); + if (unlikely(*force_pt_level)) + return PT_PAGE_TABLE_LEVEL; + host_level = host_mapping_level(vcpu->kvm, large_gfn); if (host_level == PT_PAGE_TABLE_LEVEL) @@ -2962,14 +2968,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, { int r; int level; - bool force_pt_level; + bool force_pt_level = false; pfn_t pfn; unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; - force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { - level = mapping_level(vcpu, gfn); /* * This path builds a PAE pagetable - so we can map * 2mb pages at maximum. 
Therefore check if the level @@ -2979,8 +2984,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, level = PT_DIRECTORY_LEVEL; gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } else - level = PT_PAGE_TABLE_LEVEL; + } if (fast_page_fault(vcpu, v, level, error_code)) return 0; @@ -3495,20 +3499,15 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - if (mapping_level_dirty_bitmap(vcpu, gfn) || - !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL)) - force_pt_level = true; - else - force_pt_level = false; - + force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, + PT_DIRECTORY_LEVEL); + level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { - level = mapping_level(vcpu, gfn); if (level > PT_DIRECTORY_LEVEL && !check_hugepage_cache_consistency(vcpu, gfn, level)) level = PT_DIRECTORY_LEVEL; gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } else - level = PT_PAGE_TABLE_LEVEL; + } if (fast_page_fault(vcpu, gpa, level, error_code)) return 0; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 8ebc3a5560ce..bf39d0f3efa9 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -744,9 +744,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) { - force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); - if (!force_pt_level) { - level = min(walker.level, mapping_level(vcpu, walker.gfn)); + level = mapping_level(vcpu, walker.gfn, &force_pt_level); + if (likely(!force_pt_level)) { + level = min(walker.level, level); walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); } } else -- cgit v1.2.3 From d8aacf5df86a961923a2c9c547d341d64a9d9f5d Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 16 Oct 2015 17:07:01 +0900 Subject: KVM: x86: MMU: Remove mapping_level_dirty_bitmap() Now that it has only one caller, and its name is not so helpful for readers, remove it. The new memslot_valid_for_gpte() function makes it possible to share the common code between gfn_to_memslot_dirty_bitmap() and mapping_level(). 
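In shape, the refactoring funnels both callers through a single predicate; a simplified sketch of that structure (all types and names below are illustrative stand-ins, the real helper is in the diff that follows):

    #include <stdbool.h>
    #include <stddef.h>

    struct slot { unsigned flags; void *dirty_bitmap; };
    #define SLOT_INVALID 0x1u

    /* usable for a guest PTE: present, valid, and (if requested)
     * not currently being dirty-logged */
    static bool slot_valid_for_gpte(const struct slot *slot, bool no_dirty_log)
    {
            if (!slot || (slot->flags & SLOT_INVALID))
                    return false;
            if (no_dirty_log && slot->dirty_bitmap)
                    return false;
            return true;
    }

    /* caller 1: hand back the slot only when it is usable */
    static const struct slot *lookup_for_gpte(const struct slot *slot, bool no_dirty_log)
    {
            return slot_valid_for_gpte(slot, no_dirty_log) ? slot : NULL;
    }

    /* caller 2: force 4K pages when large pages cannot be used */
    static bool must_force_pt_level(const struct slot *slot)
    {
            return !slot_valid_for_gpte(slot, true);
    }

    int main(void)
    {
            struct slot s = { .flags = 0, .dirty_bitmap = NULL };

            return lookup_for_gpte(&s, true) && !must_force_pt_level(&s) ? 0 : 1;
    }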
Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 890cd694c9a2..09833b04fc97 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -851,6 +851,17 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) return ret; } +static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot, + bool no_dirty_log) +{ + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + return false; + if (no_dirty_log && slot->dirty_bitmap) + return false; + + return true; +} + static struct kvm_memory_slot * gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) @@ -858,25 +869,22 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_memory_slot *slot; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - if (!slot || slot->flags & KVM_MEMSLOT_INVALID || - (no_dirty_log && slot->dirty_bitmap)) + if (!memslot_valid_for_gpte(slot, no_dirty_log)) slot = NULL; return slot; } -static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) -{ - return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true); -} - static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, bool *force_pt_level) { int host_level, level, max_level; + struct kvm_memory_slot *slot; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); if (likely(!*force_pt_level)) - *force_pt_level = mapping_level_dirty_bitmap(vcpu, large_gfn); + *force_pt_level = !memslot_valid_for_gpte(slot, true); if (unlikely(*force_pt_level)) return PT_PAGE_TABLE_LEVEL; -- cgit v1.2.3 From 5225fdf8c8bea4418f69875804584c89a27c170e Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 16 Oct 2015 17:08:03 +0900 Subject: KVM: x86: MMU: Eliminate an extra memory slot search in mapping_level() Calling kvm_vcpu_gfn_to_memslot() twice in mapping_level() should be avoided since getting a slot by binary search may not be negligible, especially for virtual machines with many memory slots. 
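The pattern is simply lookup-once, pass-down; a schematic sketch (a linear scan stands in for the real binary search, and every name here is invented):

    #include <stdio.h>

    struct memslot { unsigned long base, npages; };

    static struct memslot slots[4] = {
            { 0, 512 }, { 512, 512 }, { 1024, 512 }, { 1536, 512 }
    };

    /* stands in for kvm_vcpu_gfn_to_memslot(); the real one does a binary search */
    static struct memslot *gfn_to_slot(unsigned long gfn)
    {
            for (int i = 0; i < 4; i++)
                    if (gfn >= slots[i].base && gfn < slots[i].base + slots[i].npages)
                            return &slots[i];
            return NULL;
    }

    /* toy per-level check: the slot must span at least one huge page */
    static int usable_at_level(const struct memslot *slot, int level)
    {
            return slot && slot->npages >= (512ul << (9 * (level - 1)));
    }

    int main(void)
    {
            /* one lookup, result reused for every level probed */
            const struct memslot *slot = gfn_to_slot(600);

            for (int level = 1; level <= 3; level++)
                    printf("level %d usable: %d\n", level,
                           usable_at_level(slot, level));
            return 0;
    }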
Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 09833b04fc97..dd2a7c6ec2ca 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -818,14 +818,11 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm->arch.indirect_shadow_pages--; } -static int has_wrprotected_page(struct kvm_vcpu *vcpu, - gfn_t gfn, - int level) +static int __has_wrprotected_page(gfn_t gfn, int level, + struct kvm_memory_slot *slot) { - struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (slot) { linfo = lpage_info_slot(gfn, slot, level); return linfo->write_count; @@ -834,6 +831,14 @@ static int has_wrprotected_page(struct kvm_vcpu *vcpu, return 1; } +static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level) +{ + struct kvm_memory_slot *slot; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + return __has_wrprotected_page(gfn, level, slot); +} + static int host_mapping_level(struct kvm *kvm, gfn_t gfn) { unsigned long page_size; @@ -896,7 +901,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, max_level = min(kvm_x86_ops->get_lpage_level(), host_level); for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) - if (has_wrprotected_page(vcpu, large_gfn, level)) + if (__has_wrprotected_page(large_gfn, level, slot)) break; return level - 1; -- cgit v1.2.3 From 7cae2bedcbd4680b155999655e49c27b9cf020fa Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 14 Oct 2015 19:33:09 -0300 Subject: KVM: x86: move steal time initialization to vcpu entry time As reported at https://bugs.launchpad.net/qemu/+bug/1494350, it is possible to have vcpu->arch.st.last_steal initialized from a thread other than the vcpu thread, say the iothread, via KVM_SET_MSRS, which can cause an overflow later (when subtracting from the vcpu thread's sched_info.run_delay). To avoid that, move steal time accumulation to vcpu entry time, before copying steal time data to guest. Signed-off-by: Marcelo Tosatti Reviewed-by: David Matlack Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e33aebbf189e..9e9c226cb79d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1903,6 +1903,8 @@ static void accumulate_steal_time(struct kvm_vcpu *vcpu) static void record_steal_time(struct kvm_vcpu *vcpu) { + accumulate_steal_time(vcpu); + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -2053,12 +2055,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & KVM_MSR_ENABLED)) break; - vcpu->arch.st.last_steal = current->sched_info.run_delay; - - preempt_disable(); - accumulate_steal_time(vcpu); - preempt_enable(); - kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); break; @@ -2634,7 +2630,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vcpu->cpu = cpu; } - accumulate_steal_time(vcpu); kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); } -- cgit v1.2.3 From f5f3497cad8c8416a74b9aaceb127908755d020a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 14 Oct 2015 13:30:45 +0200 Subject: x86/setup: Extend low identity map to cover whole kernel range On 32-bit systems, the initial_page_table is reused by efi_call_phys_prolog as an identity map to call SetVirtualAddressMap.
efi_call_phys_prolog takes care of converting the current CPU's GDT to a physical address too. For PAE kernels the identity mapping is achieved by aliasing the first PDPE for the kernel memory mapping into the first PDPE of initial_page_table. This makes the EFI stub's trick "just work". However, for non-PAE kernels there is no guarantee that the identity mapping in the initial_page_table extends as far as the GDT; in this case, accesses to the GDT will cause a page fault (which quickly becomes a triple fault). Fix this by copying the kernel mappings from swapper_pg_dir to initial_page_table twice, both at PAGE_OFFSET and at identity mapping. For some reason, this is only reproducible with QEMU's dynamic translation mode, and not for example with KVM. However, even under KVM one can clearly see that the page table is bogus: $ qemu-system-i386 -pflash OVMF.fd -M q35 vmlinuz0 -s -S -daemonize $ gdb (gdb) target remote localhost:1234 (gdb) hb *0x02858f6f Hardware assisted breakpoint 1 at 0x2858f6f (gdb) c Continuing. Breakpoint 1, 0x02858f6f in ?? () (gdb) monitor info registers ... GDT= 0724e000 000000ff IDT= fffbb000 000007ff CR0=0005003b CR2=ff896000 CR3=032b7000 CR4=00000690 ... The page directory is sane: (gdb) x/4wx 0x32b7000 0x32b7000: 0x03398063 0x03399063 0x0339a063 0x0339b063 (gdb) x/4wx 0x3398000 0x3398000: 0x00000163 0x00001163 0x00002163 0x00003163 (gdb) x/4wx 0x3399000 0x3399000: 0x00400003 0x00401003 0x00402003 0x00403003 but our particular page directory entry is empty: (gdb) x/1wx 0x32b7000 + (0x724e000 >> 22) * 4 0x32b7070: 0x00000000 [ It appears that you can skate past this issue if you don't receive any interrupts while the bogus GDT pointer is loaded, or if you avoid reloading the segment registers in general. Andy Lutomirski provides some additional insight: "AFAICT it's entirely permissible for the GDTR and/or LDT descriptor to point to unmapped memory. Any attempt to use them (segment loads, interrupts, IRET, etc) will try to access that memory as if the access came from CPL 0 and, if the access fails, will generate a valid page fault with CR2 pointing into the GDT or LDT." Up until commit 23a0d4e8fa6d ("efi: Disable interrupts around EFI calls, not in the epilog/prolog calls") interrupts were disabled around the prolog and epilog calls, and the functional GDT was re-installed before interrupts were re-enabled. Which explains why no one has hit this issue until now. ] Signed-off-by: Paolo Bonzini Reported-by: Laszlo Ersek Cc: Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Signed-off-by: Matt Fleming [ Updated changelog. ] --- arch/x86/kernel/setup.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index fdb7f2a2d328..a3cccbfc5f77 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1173,6 +1173,14 @@ void __init setup_arch(char **cmdline_p) clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); + + /* + * sync back low identity map too. It is used for example + * in the 32-bit EFI stub. 
+ */ + clone_pgd_range(initial_page_table, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); #endif tboot_probe(); -- cgit v1.2.3 From c0ff971ef9acacd4d2caa508e444edad958dead9 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 15 Oct 2015 19:42:23 +0200 Subject: x86/ioapic: Disable interrupts when re-routing legacy IRQs A sporadic hang with a consequent crash is observed when booting Hyper-V Gen1 guests: Call Trace: [] ? trace_hardirqs_off+0xd/0x10 [] queue_work_on+0x46/0x90 [] ? add_interrupt_randomness+0x176/0x1d0 ... [] ? _raw_spin_unlock_irqrestore+0x3b/0x60 [] __irq_put_desc_unlock+0x1e/0x40 [] irq_modify_status+0xb5/0xd0 [] mp_register_handler+0x4b/0x70 [] mp_irqdomain_alloc+0x1ea/0x2a0 [] irq_domain_alloc_irqs_recursive+0x40/0xa0 [] __irq_domain_alloc_irqs+0x13c/0x2b0 [] alloc_isa_irq_from_domain.isra.1+0xc0/0xe0 [] mp_map_pin_to_irq+0x165/0x2d0 [] pin_2_irq+0x47/0x80 [] setup_IO_APIC+0xfe/0x802 ... [] ? rest_init+0x140/0x140 The issue is easily reproducible with a simple instrumentation: if mdelay(10) is put between the mp_setup_entry() and mp_register_handler() calls in mp_irqdomain_alloc(), the Hyper-V guest always fails to boot when re-routing IRQ0. The issue seems to be caused by the fact that we don't disable interrupts while doing IOAPIC programming for legacy IRQs and IRQ0 actually happens. Protect the setup sequence against concurrent interrupts. [ tglx: Make the protection unconditional and not only for legacy interrupts ] Signed-off-by: Vitaly Kuznetsov Cc: Jiang Liu Cc: Yinghai Lu Cc: K. Y. Srinivasan Link: http://lkml.kernel.org/r/1444930943-19336-1-git-send-email-vkuznets@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5c60bb162622..bb6bfc01cb82 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2907,6 +2907,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, struct irq_data *irq_data; struct mp_chip_data *data; struct irq_alloc_info *info = arg; + unsigned long flags; if (!info || nr_irqs > 1) return -EINVAL; @@ -2939,11 +2940,14 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, cfg = irqd_cfg(irq_data); add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin); + + local_irq_save(flags); if (info->ioapic_entry) mp_setup_entry(cfg, data, info->ioapic_entry); mp_register_handler(virq, data->trigger); if (virq < nr_legacy_irqs()) legacy_pic->mask(virq); + local_irq_restore(flags); apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n", -- cgit v1.2.3 From a3669868d99c8647d780895d83e74d9a921eba2b Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 14 Oct 2015 14:29:40 +0800 Subject: ACPI/PCI: Reset acpi_root_dev->domain to 0 when pci_ignore_seg is set Reset acpi_root_dev->domain to 0 when pci_ignore_seg is set to keep consistency between the ACPI PCI root device and the PCI host bridge device. Acked-by: Bjorn Helgaas Signed-off-by: Jiang Liu Signed-off-by: Rafael J.
Wysocki --- arch/x86/pci/acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index ff9911707160..5bc018559cc4 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -401,7 +401,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) int node; if (pci_ignore_seg) - domain = 0; + root->segment = domain = 0; if (domain && !pci_domains_supported) { printk(KERN_WARNING "pci_bus %04x:%02x: " -- cgit v1.2.3 From 4d6b4e69a245e9df4b84dba387596086cb66887d Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 14 Oct 2015 14:29:41 +0800 Subject: x86/PCI/ACPI: Use common interface to support PCI host bridge Use common interface to simplify ACPI PCI host bridge implementation. Signed-off-by: Jiang Liu Reviewed-by: Hanjun Guo Acked-by: Bjorn Helgaas Signed-off-by: Rafael J. Wysocki --- arch/x86/pci/acpi.c | 294 ++++++++++++++++------------------------------------ 1 file changed, 87 insertions(+), 207 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 5bc018559cc4..3cd69832d7f4 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -4,16 +4,15 @@ #include #include #include +#include #include #include struct pci_root_info { - struct acpi_device *bridge; - char name[16]; + struct acpi_pci_root_info common; struct pci_sysdata sd; #ifdef CONFIG_PCI_MMCONFIG bool mcfg_added; - u16 segment; u8 start_bus; u8 end_bus; #endif @@ -178,15 +177,18 @@ static int check_segment(u16 seg, struct device *dev, char *estr) return 0; } -static int setup_mcfg_map(struct pci_root_info *info, u16 seg, u8 start, - u8 end, phys_addr_t addr) +static int setup_mcfg_map(struct acpi_pci_root_info *ci) { - int result; - struct device *dev = &info->bridge->dev; + int result, seg; + struct pci_root_info *info; + struct acpi_pci_root *root = ci->root; + struct device *dev = &ci->bridge->dev; - info->start_bus = start; - info->end_bus = end; + info = container_of(ci, struct pci_root_info, common); + info->start_bus = (u8)root->secondary.start; + info->end_bus = (u8)root->secondary.end; info->mcfg_added = false; + seg = info->sd.domain; /* return success if MMCFG is not in use */ if (raw_pci_ext_ops && raw_pci_ext_ops != &pci_mmcfg) @@ -195,7 +197,8 @@ static int setup_mcfg_map(struct pci_root_info *info, u16 seg, u8 start, if (!(pci_probe & PCI_PROBE_MMCONF)) return check_segment(seg, dev, "MMCONFIG is disabled,"); - result = pci_mmconfig_insert(dev, seg, start, end, addr); + result = pci_mmconfig_insert(dev, seg, info->start_bus, info->end_bus, + root->mcfg_addr); if (result == 0) { /* enable MMCFG if it hasn't been enabled yet */ if (raw_pci_ext_ops == NULL) @@ -208,134 +211,55 @@ static int setup_mcfg_map(struct pci_root_info *info, u16 seg, u8 start, return 0; } -static void teardown_mcfg_map(struct pci_root_info *info) +static void teardown_mcfg_map(struct acpi_pci_root_info *ci) { + struct pci_root_info *info; + + info = container_of(ci, struct pci_root_info, common); if (info->mcfg_added) { - pci_mmconfig_delete(info->segment, info->start_bus, - info->end_bus); + pci_mmconfig_delete(info->sd.domain, + info->start_bus, info->end_bus); info->mcfg_added = false; } } #else -static int setup_mcfg_map(struct pci_root_info *info, - u16 seg, u8 start, u8 end, - phys_addr_t addr) +static int setup_mcfg_map(struct acpi_pci_root_info *ci) { return 0; } -static void teardown_mcfg_map(struct pci_root_info *info) + +static void teardown_mcfg_map(struct acpi_pci_root_info *ci) { } #endif 
-static void validate_resources(struct device *dev, struct list_head *crs_res, - unsigned long type) +static int pci_acpi_root_get_node(struct acpi_pci_root *root) { - LIST_HEAD(list); - struct resource *res1, *res2, *root = NULL; - struct resource_entry *tmp, *entry, *entry2; - - BUG_ON((type & (IORESOURCE_MEM | IORESOURCE_IO)) == 0); - root = (type & IORESOURCE_MEM) ? &iomem_resource : &ioport_resource; - - list_splice_init(crs_res, &list); - resource_list_for_each_entry_safe(entry, tmp, &list) { - bool free = false; - resource_size_t end; - - res1 = entry->res; - if (!(res1->flags & type)) - goto next; - - /* Exclude non-addressable range or non-addressable portion */ - end = min(res1->end, root->end); - if (end <= res1->start) { - dev_info(dev, "host bridge window %pR (ignored, not CPU addressable)\n", - res1); - free = true; - goto next; - } else if (res1->end != end) { - dev_info(dev, "host bridge window %pR ([%#llx-%#llx] ignored, not CPU addressable)\n", - res1, (unsigned long long)end + 1, - (unsigned long long)res1->end); - res1->end = end; - } - - resource_list_for_each_entry(entry2, crs_res) { - res2 = entry2->res; - if (!(res2->flags & type)) - continue; - - /* - * I don't like throwing away windows because then - * our resources no longer match the ACPI _CRS, but - * the kernel resource tree doesn't allow overlaps. - */ - if (resource_overlaps(res1, res2)) { - res2->start = min(res1->start, res2->start); - res2->end = max(res1->end, res2->end); - dev_info(dev, "host bridge window expanded to %pR; %pR ignored\n", - res2, res1); - free = true; - goto next; - } - } + int busnum = root->secondary.start; + struct acpi_device *device = root->device; + int node = acpi_get_node(device->handle); -next: - resource_list_del(entry); - if (free) - resource_list_free_entry(entry); - else - resource_list_add_tail(entry, crs_res); + if (node == NUMA_NO_NODE) { + node = x86_pci_root_bus_node(busnum); + if (node != 0 && node != NUMA_NO_NODE) + dev_info(&device->dev, FW_BUG "no _PXM; falling back to node %d from hardware (may be inconsistent with ACPI node numbers)\n", + node); } + if (node != NUMA_NO_NODE && !node_online(node)) + node = NUMA_NO_NODE; + + return node; } -static void add_resources(struct pci_root_info *info, - struct list_head *resources, - struct list_head *crs_res) +static int pci_acpi_root_init_info(struct acpi_pci_root_info *ci) { - struct resource_entry *entry, *tmp; - struct resource *res, *conflict, *root = NULL; - - validate_resources(&info->bridge->dev, crs_res, IORESOURCE_MEM); - validate_resources(&info->bridge->dev, crs_res, IORESOURCE_IO); - - resource_list_for_each_entry_safe(entry, tmp, crs_res) { - res = entry->res; - if (res->flags & IORESOURCE_MEM) - root = &iomem_resource; - else if (res->flags & IORESOURCE_IO) - root = &ioport_resource; - else - BUG_ON(res); - - conflict = insert_resource_conflict(root, res); - if (conflict) { - dev_info(&info->bridge->dev, - "ignoring host bridge window %pR (conflicts with %s %pR)\n", - res, conflict->name, conflict); - resource_list_destroy_entry(entry); - } - } - - list_splice_tail(crs_res, resources); + return setup_mcfg_map(ci); } -static void release_pci_root_info(struct pci_host_bridge *bridge) +static void pci_acpi_root_release_info(struct acpi_pci_root_info *ci) { - struct resource *res; - struct resource_entry *entry; - struct pci_root_info *info = bridge->release_data; - - resource_list_for_each_entry(entry, &bridge->windows) { - res = entry->res; - if (res->parent && - (res->flags & (IORESOURCE_MEM | 
IORESOURCE_IO))) - release_resource(res); - } - - teardown_mcfg_map(info); - kfree(info); + teardown_mcfg_map(ci); + kfree(container_of(ci, struct pci_root_info, common)); } /* @@ -358,47 +282,44 @@ static bool resource_is_pcicfg_ioport(struct resource *res) res->start == 0xCF8 && res->end == 0xCFF; } -static void probe_pci_root_info(struct pci_root_info *info, - struct acpi_device *device, - int busnum, int domain, - struct list_head *list) +static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci) { - int ret; + struct acpi_device *device = ci->bridge; + int busnum = ci->root->secondary.start; struct resource_entry *entry, *tmp; + int status; - sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum); - info->bridge = device; - ret = acpi_dev_get_resources(device, list, - acpi_dev_filter_resource_type_cb, - (void *)(IORESOURCE_IO | IORESOURCE_MEM)); - if (ret < 0) - dev_warn(&device->dev, - "failed to parse _CRS method, error code %d\n", ret); - else if (ret == 0) - dev_dbg(&device->dev, - "no IO and memory resources present in _CRS\n"); - else - resource_list_for_each_entry_safe(entry, tmp, list) { - if ((entry->res->flags & IORESOURCE_DISABLED) || - resource_is_pcicfg_ioport(entry->res)) + status = acpi_pci_probe_root_resources(ci); + if (pci_use_crs) { + resource_list_for_each_entry_safe(entry, tmp, &ci->resources) + if (resource_is_pcicfg_ioport(entry->res)) resource_list_destroy_entry(entry); - else - entry->res->name = info->name; - } + return status; + } + + resource_list_for_each_entry_safe(entry, tmp, &ci->resources) { + dev_printk(KERN_DEBUG, &device->dev, + "host bridge window %pR (ignored)\n", entry->res); + resource_list_destroy_entry(entry); + } + x86_pci_root_bus_resources(busnum, &ci->resources); + + return 0; } +static struct acpi_pci_root_ops acpi_pci_root_ops = { + .pci_ops = &pci_root_ops, + .init_info = pci_acpi_root_init_info, + .release_info = pci_acpi_root_release_info, + .prepare_resources = pci_acpi_root_prepare_resources, +}; + struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) { - struct acpi_device *device = root->device; - struct pci_root_info *info; int domain = root->segment; int busnum = root->secondary.start; - struct resource_entry *res_entry; - LIST_HEAD(crs_res); - LIST_HEAD(resources); + int node = pci_acpi_root_get_node(root); struct pci_bus *bus; - struct pci_sysdata *sd; - int node; if (pci_ignore_seg) root->segment = domain = 0; @@ -410,71 +331,33 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) return NULL; } - node = acpi_get_node(device->handle); - if (node == NUMA_NO_NODE) { - node = x86_pci_root_bus_node(busnum); - if (node != 0 && node != NUMA_NO_NODE) - dev_info(&device->dev, FW_BUG "no _PXM; falling back to node %d from hardware (may be inconsistent with ACPI node numbers)\n", - node); - } - - if (node != NUMA_NO_NODE && !node_online(node)) - node = NUMA_NO_NODE; - - info = kzalloc_node(sizeof(*info), GFP_KERNEL, node); - if (!info) { - printk(KERN_WARNING "pci_bus %04x:%02x: " - "ignored (out of memory)\n", domain, busnum); - return NULL; - } - - sd = &info->sd; - sd->domain = domain; - sd->node = node; - sd->companion = device; - bus = pci_find_bus(domain, busnum); if (bus) { /* * If the desired bus has been scanned already, replace * its bus->sysdata. 
*/ - memcpy(bus->sysdata, sd, sizeof(*sd)); - kfree(info); - } else { - /* insert busn res at first */ - pci_add_resource(&resources, &root->secondary); + struct pci_sysdata sd = { + .domain = domain, + .node = node, + .companion = root->device + }; - /* - * _CRS with no apertures is normal, so only fall back to - * defaults or native bridge info if we're ignoring _CRS. - */ - probe_pci_root_info(info, device, busnum, domain, &crs_res); - if (pci_use_crs) { - add_resources(info, &resources, &crs_res); - } else { - resource_list_for_each_entry(res_entry, &crs_res) - dev_printk(KERN_DEBUG, &device->dev, - "host bridge window %pR (ignored)\n", - res_entry->res); - resource_list_free(&crs_res); - x86_pci_root_bus_resources(busnum, &resources); - } - - if (!setup_mcfg_map(info, domain, (u8)root->secondary.start, - (u8)root->secondary.end, root->mcfg_addr)) - bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, - sd, &resources); - - if (bus) { - pci_scan_child_bus(bus); - pci_set_host_bridge_release( - to_pci_host_bridge(bus->bridge), - release_pci_root_info, info); - } else { - resource_list_free(&resources); - teardown_mcfg_map(info); - kfree(info); + memcpy(bus->sysdata, &sd, sizeof(sd)); + } else { + struct pci_root_info *info; + + info = kzalloc_node(sizeof(*info), GFP_KERNEL, node); + if (!info) + dev_err(&root->device->dev, + "pci_bus %04x:%02x: ignored (out of memory)\n", + domain, busnum); + else { + info->sd.domain = domain; + info->sd.node = node; + info->sd.companion = root->device; + bus = acpi_pci_root_create(root, &acpi_pci_root_ops, + &info->common, &info->sd); } } @@ -487,9 +370,6 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) pcie_bus_configure_settings(child); } - if (bus && node != NUMA_NO_NODE) - dev_printk(KERN_DEBUG, &bus->dev, "on NUMA node %d\n", node); - return bus; } -- cgit v1.2.3 From 657c1eea0019e80685a84cbb1919794243a187c9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 16 Oct 2015 15:42:54 -0700 Subject: x86/entry/32: Fix entry_INT80_32() to expect interrupts to be on When I rewrote entry_INT80_32, I thought that int80 was an interrupt gate. It's a trap gate. *facepalm* Thanks to Brian Gerst for pointing out that it's better to change the entry code than to change the gate type. Suggested-by: Brian Gerst Reported-and-tested-by: Borislav Petkov Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 150ac78d63af ("x86/entry/32: Switch INT80 to the new C syscall path") Link: http://lkml.kernel.org/r/dc09d9b574a5c1dcca996847875c73f8341ce0ad.1445035014.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 15 ++++++++++++--- arch/x86/entry/entry_32.S | 8 ++++---- arch/x86/entry/entry_64_compat.S | 2 +- 3 files changed, 17 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index b53e04d301a3..a89fdbc1f0be 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -351,7 +351,14 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs) * in workloads that use it, and it's usually called from * do_fast_syscall_32, so forcibly inline it to improve performance. */ -static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) +#ifdef CONFIG_X86_32 +/* 32-bit kernels use a trap gate for INT80, and the asm code calls here. */ +__visible +#else +/* 64-bit kernels use do_syscall_32_irqs_off() instead. 
*/ +static +#endif +__always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) { struct thread_info *ti = pt_regs_to_thread_info(regs); unsigned int nr = (unsigned int)regs->orig_ax; @@ -386,12 +393,14 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) syscall_return_slowpath(regs); } -/* Handles int $0x80 */ -__visible void do_int80_syscall_32(struct pt_regs *regs) +#ifdef CONFIG_X86_64 +/* Handles INT80 on 64-bit kernels */ +__visible void do_syscall_32_irqs_off(struct pt_regs *regs) { local_irq_enable(); do_syscall_32_irqs_on(regs); } +#endif /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ __visible long do_fast_syscall_32(struct pt_regs *regs) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index c1c7c6364216..4f97f49261d3 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -345,13 +345,13 @@ ENTRY(entry_INT80_32) SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ /* - * User mode is traced as though IRQs are on, and the interrupt gate - * turned them off. + * User mode is traced as though IRQs are on. Unlike the 64-bit + * case, INT80 is a trap gate on 32-bit kernels, so interrupts + * are already on (unless user code is messing around with iopl). */ - TRACE_IRQS_OFF movl %esp, %eax - call do_int80_syscall_32 + call do_syscall_32_irqs_on .Lsyscall_32_done: restore_all: diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 92b0b27b43c6..c3201830a85e 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -303,7 +303,7 @@ ENTRY(entry_INT80_compat) TRACE_IRQS_OFF movq %rsp, %rdi - call do_int80_syscall_32 + call do_syscall_32_irqs_off .Lsyscall_32_done: /* Go back to user mode. */ -- cgit v1.2.3 From 3bd29515d1cad26fa85a1a9b442de8816c1f5c54 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 16 Oct 2015 15:42:55 -0700 Subject: x86/entry/32: Fix FS and GS restore in opportunistic SYSEXIT We either need to restore them before popping and thus changing ESP, or we need to adjust the offsets. The former is simpler. Reported-and-tested-by: Borislav Petkov Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 5f310f739b4c ("x86/entry/32: Re-implement SYSENTER using the new C path") Link: http://lkml.kernel.org/r/461e5c7d8fa3821529893a4893ac9c4bc37f9e17.1445035014.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 4f97f49261d3..3eb572ed3d7a 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -315,14 +315,14 @@ sysenter_past_esp: TRACE_IRQS_ON /* User mode traces as IRQs on. */ movl PT_EIP(%esp), %edx /* pt_regs->ip */ movl PT_OLDESP(%esp), %ecx /* pt_regs->sp */ +1: mov PT_FS(%esp), %fs + PTGS_TO_GS popl %ebx /* pt_regs->bx */ addl $2*4, %esp /* skip pt_regs->cx and pt_regs->dx */ popl %esi /* pt_regs->si */ popl %edi /* pt_regs->di */ popl %ebp /* pt_regs->bp */ popl %eax /* pt_regs->ax */ -1: mov PT_FS(%esp), %fs - PTGS_TO_GS /* * Return back to the vDSO, which will pop ecx and edx.
-- cgit v1.2.3 From f1ccd249319efca4ee4faf1d904f5a362cac7c81 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 16 Oct 2015 00:14:28 -0400 Subject: x86/smpboot: Fix cpu_init_udelay=10000 corner case boot parameter misbehavior For legacy machines cpu_init_udelay defaults to 10,000. For modern machines it is set to 0. The user should be able to set cpu_init_udelay to any value on the cmdline, including 10,000. Before this patch, that was seen as "unchanged from default" and thus on a modern machine, the user request was ignored and the delay was set to 0. Signed-off-by: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dparsons@brightdsl.net Cc: shrybman@teksavvy.com Link: http://lkml.kernel.org/r/de363cdbbcfcca1d22569683f7eb9873e0177251.1444968087.git.len.brown@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index e0c198e5f920..32267ccac3d7 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -509,7 +509,7 @@ void __inquire_remote_apic(int apicid) */ #define UDELAY_10MS_DEFAULT 10000 -static unsigned int init_udelay = UDELAY_10MS_DEFAULT; +static unsigned int init_udelay = UINT_MAX; static int __init cpu_init_udelay(char *str) { @@ -522,13 +522,16 @@ early_param("cpu_init_udelay", cpu_init_udelay); static void __init smp_quirk_init_udelay(void) { /* if cmdline changed it from default, leave it alone */ - if (init_udelay != UDELAY_10MS_DEFAULT) + if (init_udelay != UINT_MAX) return; /* if modern processor, use no delay */ if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) || ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) init_udelay = 0; + + /* else, use legacy delay */ + init_udelay = UDELAY_10MS_DEFAULT; } /* -- cgit v1.2.3 From fcafddec4e78a7776db4b6685db6b2902d4300fc Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 16 Oct 2015 00:14:29 -0400 Subject: x86/smpboot: Fix CPU #1 boot timeout The following commit: a9bcaa02a5104ac ("x86/smpboot: Remove SIPI delays from cpu_up()") Caused some Intel Core2 processors to time-out when bringing up CPU #1, resulting in the missing of that CPU after bootup. That patch reduced the SIPI delays from udelay() 300, 200 to udelay() 0, 0 on modern processors. Several Intel(R) Core(TM)2 systems failed to bring up CPU #1 10/10 times after that change. Increasing either of the SIPI delays to udelay(1) results in success. So here we increase both to udelay(10). While this may be 20x slower than the absolute minimum, it is still 20x to 30x faster than the original code. Tested-by: Donald Parsons Tested-by: Shane Signed-off-by: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dparsons@brightdsl.net Cc: shrybman@teksavvy.com Link: http://lkml.kernel.org/r/6dd554ee8945984d85aafb2ad35793174d068af0.1444968087.git.len.brown@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 32267ccac3d7..892ee2e5ecbc 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -660,7 +660,9 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) /* * Give the other CPU some time to accept the IPI.
*/ - if (init_udelay) + if (init_udelay == 0) + udelay(10); + else udelay(300); pr_debug("Startup point 1\n"); @@ -671,7 +673,9 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) /* * Give the other CPU some time to accept the IPI. */ - if (init_udelay) + if (init_udelay == 0) + udelay(10); + else udelay(200); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ -- cgit v1.2.3 From a75ca545e8d57473da47ece828ad98a10727ec6f Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 16 Oct 2015 14:28:53 +0300 Subject: x86, kasan: Fix build failure on KASAN=y && KMEMCHECK=y kernels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Declaration of memcpy() is hidden under #ifndef CONFIG_KMEMCHECK. In asm/efi.h under #ifdef CONFIG_KASAN we #undef memcpy(), due to which the following happens: In file included from arch/x86/kernel/setup.c:96:0: ./arch/x86/include/asm/desc.h: In function ‘native_write_idt_entry’: ./arch/x86/include/asm/desc.h:122:2: error: implicit declaration of function ‘memcpy’ [-Werror=implicit-function-declaration] memcpy(&idt[entry], gate, sizeof(*gate)); ^ cc1: some warnings being treated as errors make[2]: *** [arch/x86/kernel/setup.o] Error 1 We will get rid of that #undef in asm/efi.h eventually. But in the meanwhile move memcpy() declaration out of #ifdefs to fix the build. Reported-by: Borislav Petkov Signed-off-by: Andrey Ryabinin Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1444994933-28328-1-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/string_64.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index e4661196994e..ff8b9a17dc4b 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -27,12 +27,11 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t function. */ #define __HAVE_ARCH_MEMCPY 1 +extern void *memcpy(void *to, const void *from, size_t len); extern void *__memcpy(void *to, const void *from, size_t len); #ifndef CONFIG_KMEMCHECK -#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4 -extern void *memcpy(void *to, const void *from, size_t len); -#else +#if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4 #define memcpy(dst, src, len) \ ({ \ size_t __len = (len); \ -- cgit v1.2.3 From 558a65bc31a0c7811b34dad32f51f47c55a40000 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert Date: Wed, 14 Oct 2015 14:31:19 -0400 Subject: sched/x86: Fix typo in __switch_to() comments Fix obvious mistake: FS/GS should be DS/ES. Signed-off-by: Chuck Ebbert Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20151014143119.78858eeb@r5 Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index d7f1d5c4387d..e835d263a33b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -332,7 +332,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch FS and GS. 
* - * These are even more complicated than FS and GS: they have + * These are even more complicated than DS and ES: they have * 64-bit bases are that controlled by arch_prctl. Those bases * only differ from the values in the GDT or LDT if the selector * is 0. -- cgit v1.2.3 From 5690891bcec5fcfda38da974ffa5488e36a59811 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 19 Oct 2015 11:30:19 +0200 Subject: kvm: x86: zero EFER on INIT Not zeroing EFER means that a 32-bit firmware cannot enter paging mode without clearing EFER.LME first (which it should not know about). Yang Zhang from Intel confirmed that the manual is wrong and EFER is cleared to zero on INIT. Fixes: d28bc9dd25ce023270d2e039e7c98d38ecbf7758 Cc: stable@vger.kernel.org Cc: Yang Z Zhang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 11 +++++------ arch/x86/kvm/vmx.c | 3 +-- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index cd8659cfc632..f2c8e4917688 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1089,7 +1089,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) return target_tsc - tsc; } -static void init_vmcb(struct vcpu_svm *svm, bool init_event) +static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; @@ -1160,8 +1160,7 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event) init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - if (!init_event) - svm_set_efer(&svm->vcpu, 0); + svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; kvm_set_rflags(&svm->vcpu, 2); save->rip = 0x0000fff0; @@ -1215,7 +1214,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (kvm_vcpu_is_reset_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; } - init_vmcb(svm, init_event); + init_vmcb(svm); kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); kvm_register_write(vcpu, VCPU_REGS_RDX, eax); @@ -1271,7 +1270,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) clear_page(svm->vmcb); svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; - init_vmcb(svm, false); + init_vmcb(svm); svm_init_osvw(&svm->vcpu); @@ -1893,7 +1892,7 @@ static int shutdown_interception(struct vcpu_svm *svm) * so reinitialize it. */ clear_page(svm->vmcb); - init_vmcb(svm, false); + init_vmcb(svm); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 15bff51d492a..4d0aa31a42ba 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4932,8 +4932,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_set_cr0(vcpu, cr0); /* enter rmode */ vmx->vcpu.arch.cr0 = cr0; vmx_set_cr4(vcpu, 0); - if (!init_event) - vmx_set_efer(vcpu, 0); + vmx_set_efer(vcpu, 0); vmx_fpu_activate(vcpu); update_exception_bitmap(vcpu); -- cgit v1.2.3 From 8c85ac1c0a1b41299370857765bc0950666ed5d9 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 19 Oct 2015 15:13:29 +0900 Subject: KVM: x86: MMU: Initialize force_pt_level before calling mapping_level() Commit fd1369021878 ("KVM: x86: MMU: Move mapping_level_dirty_bitmap() call in mapping_level()") forgot to initialize force_pt_level to false in FNAME(page_fault)() before calling mapping_level() like nonpaging_map() does. This can sometimes result in forcing page table level mapping unnecessarily. 
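To make the hazard concrete, here is a minimal user-space sketch of the in/out-flag pattern involved (illustrative names only, not the kernel code): mapping_level() reads *force_pt_level before it may write it, so a caller that leaves the flag uninitialized makes the callee branch on stale stack contents.

#include <stdbool.h>
#include <stdio.h>

/* in/out flag: read first, possibly set for the caller afterwards */
static int mapping_level_sketch(bool *force_pt_level)
{
	if (*force_pt_level)		/* read before any write */
		return 1;		/* forced to 4K pages */
	*force_pt_level = false;	/* later checks may flip this */
	return 2;			/* large pages allowed */
}

int main(void)
{
	bool force_pt_level = false;	/* the fix: initialize before calling */

	printf("level = %d\n", mapping_level_sketch(&force_pt_level));
	return 0;
}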
Fix this and move the first *force_pt_level check in mapping_level() before kvm_vcpu_gfn_to_memslot() call to make it a bit clearer that the variable must be initialized before mapping_level() gets called. This change can also avoid calling kvm_vcpu_gfn_to_memslot() when !check_hugepage_cache_consistency() check in tdp_page_fault() forces page table level mapping. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 7 ++++--- arch/x86/kvm/paging_tmpl.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dd2a7c6ec2ca..7d85bcae3332 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -886,10 +886,11 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, int host_level, level, max_level; struct kvm_memory_slot *slot; - slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); + if (unlikely(*force_pt_level)) + return PT_PAGE_TABLE_LEVEL; - if (likely(!*force_pt_level)) - *force_pt_level = !memslot_valid_for_gpte(slot, true); + slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); + *force_pt_level = !memslot_valid_for_gpte(slot, true); if (unlikely(*force_pt_level)) return PT_PAGE_TABLE_LEVEL; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index bf39d0f3efa9..b41faa91a6f9 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -698,7 +698,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; - bool force_pt_level; + bool force_pt_level = false; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; -- cgit v1.2.3 From 37e81a016cc847c03ea71570fea29f12ca390bee Mon Sep 17 00:00:00 2001 From: Hans-Werner Hilse Date: Mon, 29 Jun 2015 11:50:32 +0200 Subject: um: Do not rely on libc to provide modify_ldt() modify_ldt() was declared as an external symbol. Despite the man page for this syscall telling that there is no wrapper in glibc, since version 2.1 there actually is, so linking to the glibc works. Since modify_ldt() is not a POSIX interface, other libc implementations do not always provide a wrapper function. Even glibc headers do not provide a corresponding declaration. So go the recommended way to call this using syscall(). Signed-off-by: Hans-Werner Hilse Signed-off-by: Richard Weinberger --- arch/x86/um/ldt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c index 9701a4fd7bf2..836a1eb5df43 100644 --- a/arch/x86/um/ldt.c +++ b/arch/x86/um/ldt.c @@ -12,7 +12,10 @@ #include #include -extern int modify_ldt(int func, void *ptr, unsigned long bytecount); +static inline int modify_ldt (int func, void *ptr, unsigned long bytecount) +{ + return syscall(__NR_modify_ldt, func, ptr, bytecount); +} static long write_ldt_entry(struct mm_id *mm_idp, int func, struct user_desc *desc, void **addr, int done) -- cgit v1.2.3 From b9511cd761faafca7a1acc059e792c1399f9d7c6 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 16 Oct 2015 16:24:05 +0300 Subject: perf/x86: Fix time_shift in perf_event_mmap_page Commit: b20112edeadf ("perf/x86: Improve accuracy of perf/sched clock") allowed the time_shift value in perf_event_mmap_page to be as much as 32. Unfortunately the documented algorithms for using time_shift have it shifting an integer, whereas to work correctly with the value 32, the type must be u64. 
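To make the failure mode concrete, here is a self-contained sketch of the documented conversion using the corrected 64-bit mask; the calibration values below are made up purely for illustration:

#include <stdint.h>
#include <stdio.h>

/* The userpage algorithm as now documented: the mask is computed in
 * 64 bits, because with time_shift == 32 the old (1 << time_shift)
 * form shifts a 32-bit int, which is undefined behaviour in C. */
static uint64_t cyc_to_ns(uint64_t cyc, uint32_t time_mult,
			  uint16_t time_shift, uint64_t time_offset)
{
	uint64_t quot = cyc >> time_shift;
	uint64_t rem  = cyc & (((uint64_t)1 << time_shift) - 1);

	return time_offset + quot * time_mult +
	       ((rem * time_mult) >> time_shift);
}

int main(void)
{
	printf("%llu ns\n",
	       (unsigned long long)cyc_to_ns(1000000, 838860800, 31, 0));
	return 0;
}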
In the case of perf tools, Intel PT decodes correctly but the timestamps that are output (for example by perf script) have lost 32 bits of granularity so they look like they are not changing at all. Fix by limiting the shift to 31 and adjusting the multiplier accordingly. Also update the documentation of perf_event_mmap_page so that new code based on it will be more future-proof. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: b20112edeadf ("perf/x86: Improve accuracy of perf/sched clock") Link: http://lkml.kernel.org/r/1445001845-13688-2-git-send-email-adrian.hunter@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 11 +++++++++++ include/uapi/linux/perf_event.h | 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 69b84a26ea17..c7c4d9c51e99 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -259,6 +259,17 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, cpu_khz, NSEC_PER_MSEC, 0); + /* + * cyc2ns_shift is exported via arch_perf_update_userpage() where it is + * not expected to be greater than 31 due to the original published + * conversion algorithm shifting a 32-bit value (now specifies a 64-bit + * value) - refer perf_event_mmap_page documentation in perf_event.h. + */ + if (data->cyc2ns_shift == 32) { + data->cyc2ns_shift = 31; + data->cyc2ns_mul >>= 1; + } + data->cyc2ns_offset = ns_now - mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift); diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 2881145cda86..6c72e72e975c 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -476,7 +476,7 @@ struct perf_event_mmap_page { * u64 delta; * * quot = (cyc >> time_shift); - * rem = cyc & ((1 << time_shift) - 1); + * rem = cyc & (((u64)1 << time_shift) - 1); * delta = time_offset + quot * time_mult + * ((rem * time_mult) >> time_shift); * @@ -507,7 +507,7 @@ struct perf_event_mmap_page { * And vice versa: * * quot = cyc >> time_shift; - * rem = cyc & ((1 << time_shift) - 1); + * rem = cyc & (((u64)1 << time_shift) - 1); * timestamp = time_zero + quot * time_mult + * ((rem * time_mult) >> time_shift); */ -- cgit v1.2.3 From d892819faa6860d469aae71d70c336b391c25505 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 13 Oct 2015 09:09:09 +0200 Subject: perf/x86: Add support for PERF_SAMPLE_BRANCH_CALL This patch enables support for PERF_SAMPLE_BRANCH_CALL on Intel x86 processors. When the processor supports LBR filtering, the selection is done in hardware. Otherwise, the filter is applied by software. Note that we chose to include zero length calls because they also represent calls.
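As a usage illustration (an editorial sketch assuming a kernel with this patch applied; error handling trimmed and the sampled event chosen arbitrarily), a user-space program would request call-branch records like this:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
	/* only call branches; zero-length calls are included as well */
	attr.branch_sample_type = PERF_SAMPLE_BRANCH_USER |
				  PERF_SAMPLE_BRANCH_CALL;
	attr.exclude_kernel = 1;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	/* branch records would now arrive via the mmap'ed ring buffer */
	close(fd);
	return 0;
}

The perf tool exposes the same filter through its branch-filter option where supported (for example, perf record -j call,u).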
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: khandual@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1444720151-10275-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index ad0b8b0490a0..bfd0b717e944 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -555,6 +555,8 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP) mask |= X86_BR_IND_JMP; + if (br_type & PERF_SAMPLE_BRANCH_CALL) + mask |= X86_BR_CALL | X86_BR_ZERO_CALL; /* * stash actual user request into reg, it may * be used by fixup code for some CPU @@ -890,6 +892,7 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, }; static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { @@ -905,6 +908,7 @@ static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL | LBR_RETURN | LBR_CALL_STACK, [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, }; /* core */ -- cgit v1.2.3 From f7d27c35ddff7c100d7a98db499ac0040149ac05 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 19 Oct 2015 11:37:18 +0300 Subject: x86/mm, kasan: Silence KASAN warnings in get_wchan() get_wchan() is racy by design, it may access volatile stack of running task, thus it may access redzone in a stack frame and cause KASAN to warn about this. Use READ_ONCE_NOCHECK() to silence these warnings. Reported-by: Sasha Levin Signed-off-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrew Morton Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: Kostya Serebryany Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Wolfram Gloger Cc: kasan-dev Link: http://lkml.kernel.org/r/1445243838-17763-3-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 39e585a554b7..e28db181e4fc 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -550,14 +550,14 @@ unsigned long get_wchan(struct task_struct *p) if (sp < bottom || sp > top) return 0; - fp = READ_ONCE(*(unsigned long *)sp); + fp = READ_ONCE_NOCHECK(*(unsigned long *)sp); do { if (fp < bottom || fp > top) return 0; - ip = READ_ONCE(*(unsigned long *)(fp + sizeof(unsigned long))); + ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long))); if (!in_sched_functions(ip)) return ip; - fp = READ_ONCE(*(unsigned long *)fp); + fp = READ_ONCE_NOCHECK(*(unsigned long *)fp); } while (count++ < 16 && p->state != TASK_RUNNING); return 0; } -- cgit v1.2.3 From eb6db83d105914c246ac5875be76fd4b944833d5 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 19 Oct 2015 11:17:41 +0200 Subject: x86/setup: Do not reserve crashkernel high memory if low reservation failed People reported that when allocating crashkernel memory using the ",high" and ",low" syntax, there were cases where the reservation of the high portion succeeds but the reservation of the low portion fails. Then kexec can load the kdump kernel successfully, but booting the kdump kernel fails as there's no low memory. The low memory allocation for the kdump kernel can fail on large systems for a couple of reasons. For example, the manually specified crashkernel low memory can be too large and thus no adequate memblock region would be found. Therefore, we try to reserve low memory for the crash kernel *after* the high memory portion has been allocated. If that fails, we free crashkernel high memory too and return. The user can then take measures accordingly. Tested-by: Joerg Roedel Signed-off-by: Baoquan He [ Massage text. ] Signed-off-by: Borislav Petkov Reviewed-by: Joerg Roedel Cc: Andrew Morton Cc: Andy Lutomirski Cc: Dave Young Cc: H. Peter Anvin Cc: Jiri Kosina Cc: Juergen Gross Cc: Linus Torvalds Cc: Mark Salter Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: WANG Chao Cc: jerry_hoemann@hp.com Cc: yinghai@kernel.org Link: http://lkml.kernel.org/r/1445246268-26285-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index fdb7f2a2d328..1b36839e41eb 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -493,7 +493,7 @@ static void __init memblock_x86_reserve_range_setup_data(void) # define CRASH_KERNEL_ADDR_HIGH_MAX MAXMEM #endif -static void __init reserve_crashkernel_low(void) +static int __init reserve_crashkernel_low(void) { #ifdef CONFIG_X86_64 const unsigned long long alignment = 16<<20; /* 16M */ @@ -522,17 +522,16 @@ static void __init reserve_crashkernel_low(void) } else { /* passed with crashkernel=0,low ? 
*/ if (!low_size) - return; + return 0; } low_base = memblock_find_in_range(low_size, (1ULL<<32), low_size, alignment); if (!low_base) { - if (!auto_set) - pr_info("crashkernel low reservation failed - No suitable area found.\n"); - - return; + pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", + (unsigned long)(low_size >> 20)); + return -ENOMEM; } memblock_reserve(low_base, low_size); @@ -544,6 +543,7 @@ static void __init reserve_crashkernel_low(void) crashk_low_res.end = low_base + low_size - 1; insert_resource(&iomem_resource, &crashk_low_res); #endif + return 0; } static void __init reserve_crashkernel(void) @@ -595,6 +595,11 @@ static void __init reserve_crashkernel(void) } memblock_reserve(crash_base, crash_size); + if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) { + memblock_free(crash_base, crash_size); + return; + } + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " "for crashkernel (System RAM: %ldMB)\n", (unsigned long)(crash_size >> 20), @@ -604,9 +609,6 @@ static void __init reserve_crashkernel(void) crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; insert_resource(&iomem_resource, &crashk_res); - - if (crash_base >= (1ULL<<32)) - reserve_crashkernel_low(); } #else static void __init reserve_crashkernel(void) -- cgit v1.2.3 From 1a6775c1a2c2ed863699403cda517916c22aeb72 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 19 Oct 2015 11:17:42 +0200 Subject: x86/amd_nb, EDAC: Rename amd_get_node_id() This function doesn't give us the "Node ID" as the function name suggests. Rather, it receives a PCI device as argument, checks the available F3 PCI device IDs in the system and returns the index of the matching Bus/Device IDs. Rename it to amd_pci_dev_to_node_id(). No functional change is introduced. Suggested-by: Ingo Molnar Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Suravee Suthikulpanit Cc: Thomas Gleixner Cc: linux-edac Link: http://lkml.kernel.org/r/1445246268-26285-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/amd_nb.h | 2 +- drivers/edac/amd64_edac.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 1a5da2e63aee..3c56ef1ae068 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -81,7 +81,7 @@ static inline struct amd_northbridge *node_to_amd_nb(int node) return (node < amd_northbridges.num) ? 
&amd_northbridges.nb[node] : NULL; } -static inline u16 amd_get_node_id(struct pci_dev *pdev) +static inline u16 amd_pci_dev_to_node_id(struct pci_dev *pdev) { struct pci_dev *misc; int i; diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 73aea40a9c89..d5fcdbfd99b8 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2770,7 +2770,7 @@ static int init_one_instance(struct pci_dev *F2) struct mem_ctl_info *mci = NULL; struct edac_mc_layer layers[2]; int err = 0, ret; - u16 nid = amd_get_node_id(F2); + u16 nid = amd_pci_dev_to_node_id(F2); ret = -ENOMEM; pvt = kzalloc(sizeof(struct amd64_pvt), GFP_KERNEL); @@ -2860,7 +2860,7 @@ err_ret: static int probe_one_instance(struct pci_dev *pdev, const struct pci_device_id *mc_type) { - u16 nid = amd_get_node_id(pdev); + u16 nid = amd_pci_dev_to_node_id(pdev); struct pci_dev *F3 = node_to_amd_nb(nid)->misc; struct ecc_settings *s; int ret = 0; @@ -2910,7 +2910,7 @@ static void remove_one_instance(struct pci_dev *pdev) { struct mem_ctl_info *mci; struct amd64_pvt *pvt; - u16 nid = amd_get_node_id(pdev); + u16 nid = amd_pci_dev_to_node_id(pdev); struct pci_dev *F3 = node_to_amd_nb(nid)->misc; struct ecc_settings *s = ecc_stngs[nid]; -- cgit v1.2.3 From 97eac21babe47e1a8ed4cac4f8874c5746cf6e36 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 19 Oct 2015 11:17:43 +0200 Subject: x86/setup: Cleanup crashkernel reservation functions * Shorten variable names * Realign code, space out for better readability No code changed: # arch/x86/kernel/setup.o: text data bss dec hex filename 4543 3096 69904 77543 12ee7 setup.o.before 4543 3096 69904 77543 12ee7 setup.o.after md5: 8a1b7c6738a553ca207b56bd84a8f359 setup.o.before.asm 8a1b7c6738a553ca207b56bd84a8f359 setup.o.after.asm Signed-off-by: Borislav Petkov Reviewed-by: Dave Young Reviewed-by: Joerg Roedel Cc: Andrew Morton Cc: Andy Lutomirski Cc: H. Peter Anvin Cc: Jiri Kosina Cc: Juergen Gross Cc: Linus Torvalds Cc: Mark Salter Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: WANG Chao Cc: jerry_hoemann@hp.com Link: http://lkml.kernel.org/r/1445246268-26285-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1b36839e41eb..fd9e178aa890 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -486,11 +486,11 @@ static void __init memblock_x86_reserve_range_setup_data(void) * On 64bit, old kexec-tools need to under 896MiB. 
*/ #ifdef CONFIG_X86_32 -# define CRASH_KERNEL_ADDR_LOW_MAX (512 << 20) -# define CRASH_KERNEL_ADDR_HIGH_MAX (512 << 20) +# define CRASH_ADDR_LOW_MAX (512 << 20) +# define CRASH_ADDR_HIGH_MAX (512 << 20) #else -# define CRASH_KERNEL_ADDR_LOW_MAX (896UL<<20) -# define CRASH_KERNEL_ADDR_HIGH_MAX MAXMEM +# define CRASH_ADDR_LOW_MAX (896UL << 20) +# define CRASH_ADDR_HIGH_MAX MAXMEM #endif static int __init reserve_crashkernel_low(void) @@ -503,10 +503,10 @@ static int __init reserve_crashkernel_low(void) bool auto_set = false; int ret; - total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT)); + total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT)); + /* crashkernel=Y,low */ - ret = parse_crashkernel_low(boot_command_line, total_low_mem, - &low_size, &base); + ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); if (ret != 0) { /* * two parts from lib/swiotlb.c: @@ -517,7 +517,7 @@ static int __init reserve_crashkernel_low(void) * make sure we allocate enough extra low memory so that we * don't run out of DMA buffers for 32-bit devices. */ - low_size = max(swiotlb_size_or_default() + (8UL<<20), 256UL<<20); + low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20); auto_set = true; } else { /* passed with crashkernel=0,low ? */ @@ -525,9 +525,7 @@ static int __init reserve_crashkernel_low(void) return 0; } - low_base = memblock_find_in_range(low_size, (1ULL<<32), - low_size, alignment); - + low_base = memblock_find_in_range(low_size, 1ULL << 32, low_size, alignment); if (!low_base) { pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", (unsigned long)(low_size >> 20)); @@ -535,10 +533,12 @@ static int __init reserve_crashkernel_low(void) } memblock_reserve(low_base, low_size); + pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", - (unsigned long)(low_size >> 20), - (unsigned long)(low_base >> 20), - (unsigned long)(total_low_mem >> 20)); + (unsigned long)(low_size >> 20), + (unsigned long)(low_base >> 20), + (unsigned long)(total_low_mem >> 20)); + crashk_low_res.start = low_base; crashk_low_res.end = low_base + low_size - 1; insert_resource(&iomem_resource, &crashk_low_res); @@ -557,12 +557,11 @@ static void __init reserve_crashkernel(void) total_mem = memblock_phys_mem_size(); /* crashkernel=XM */ - ret = parse_crashkernel(boot_command_line, total_mem, - &crash_size, &crash_base); + ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); if (ret != 0 || crash_size <= 0) { /* crashkernel=X,high */ ret = parse_crashkernel_high(boot_command_line, total_mem, - &crash_size, &crash_base); + &crash_size, &crash_base); if (ret != 0 || crash_size <= 0) return; high = true; @@ -574,10 +573,9 @@ static void __init reserve_crashkernel(void) * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX */ crash_base = memblock_find_in_range(alignment, - high ? CRASH_KERNEL_ADDR_HIGH_MAX : - CRASH_KERNEL_ADDR_LOW_MAX, - crash_size, alignment); - + high ? 
CRASH_ADDR_HIGH_MAX + : CRASH_ADDR_LOW_MAX, + crash_size, alignment); if (!crash_base) { pr_info("crashkernel reservation failed - No suitable area found.\n"); return; @@ -587,7 +585,8 @@ static void __init reserve_crashkernel(void) unsigned long long start; start = memblock_find_in_range(crash_base, - crash_base + crash_size, crash_size, 1<<20); + crash_base + crash_size, + crash_size, 1 << 20); if (start != crash_base) { pr_info("crashkernel reservation failed - memory is in use.\n"); return; -- cgit v1.2.3 From 606134f77ce22997fd2800d5937698d85c6990d9 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 19 Oct 2015 11:17:44 +0200 Subject: x86/setup/crash: Remove alignment variable Use a macro instead. No functionality change. Signed-off-by: Borislav Petkov Reviewed-by: Dave Young Reviewed-by: Joerg Roedel Cc: Andrew Morton Cc: Andy Lutomirski Cc: H. Peter Anvin Cc: Jiri Kosina Cc: Juergen Gross Cc: Linus Torvalds Cc: Mark Salter Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: WANG Chao Cc: jerry_hoemann@hp.com Link: http://lkml.kernel.org/r/1445246268-26285-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index fd9e178aa890..ea086dd8e821 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -480,6 +480,9 @@ static void __init memblock_x86_reserve_range_setup_data(void) #ifdef CONFIG_KEXEC_CORE +/* 16M alignment for crash kernel regions */ +#define CRASH_ALIGN (16 << 20) + /* * Keep the crash kernel below this limit. On 32 bits earlier kernels * would limit the kernel to the low 512 MiB due to mapping restrictions. @@ -496,7 +499,6 @@ static void __init memblock_x86_reserve_range_setup_data(void) static int __init reserve_crashkernel_low(void) { #ifdef CONFIG_X86_64 - const unsigned long long alignment = 16<<20; /* 16M */ unsigned long long low_base = 0, low_size = 0; unsigned long total_low_mem; unsigned long long base; @@ -525,7 +527,7 @@ static int __init reserve_crashkernel_low(void) return 0; } - low_base = memblock_find_in_range(low_size, 1ULL << 32, low_size, alignment); + low_base = memblock_find_in_range(low_size, 1ULL << 32, low_size, CRASH_ALIGN); if (!low_base) { pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", (unsigned long)(low_size >> 20)); @@ -548,7 +550,6 @@ static int __init reserve_crashkernel_low(void) static void __init reserve_crashkernel(void) { - const unsigned long long alignment = 16<<20; /* 16M */ unsigned long long total_mem; unsigned long long crash_size, crash_base; bool high = false; @@ -572,10 +573,10 @@ static void __init reserve_crashkernel(void) /* * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX */ - crash_base = memblock_find_in_range(alignment, + crash_base = memblock_find_in_range(CRASH_ALIGN, high ? CRASH_ADDR_HIGH_MAX : CRASH_ADDR_LOW_MAX, - crash_size, alignment); + crash_size, CRASH_ALIGN); if (!crash_base) { pr_info("crashkernel reservation failed - No suitable area found.\n"); return; -- cgit v1.2.3 From f56d55781c1ff5663874775d0672ba954fe5634c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 19 Oct 2015 11:17:45 +0200 Subject: x86/setup/crash: Cleanup some more * Remove unused auto_set variable * Cleanup local function variable declarations * Reformat printk string and use pr_info() No functionality change. 
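As background for the crashkernel=X, crashkernel=X,high and crashkernel=Y,low options this series juggles: the sizes take the usual memory suffixes. Below is a user-space stand-in for the kernel's memparse() (hypothetical helper name, not the kernel's parser) together with an example invocation:

#include <stdio.h>
#include <stdlib.h>

/* roughly what the kernel's memparse() does for the common suffixes */
static unsigned long long parse_size(const char *s)
{
	char *end;
	unsigned long long v = strtoull(s, &end, 0);

	switch (*end) {
	case 'G': case 'g': v <<= 30; break;
	case 'M': case 'm': v <<= 20; break;
	case 'K': case 'k': v <<= 10; break;
	}
	return v;
}

int main(void)
{
	/* e.g. booting with: crashkernel=256M,high crashkernel=72M,low */
	printf("high=%llu low=%llu\n",
	       parse_size("256M"), parse_size("72M"));
	return 0;
}

As the earlier patch in this series arranges, if the low reservation subsequently fails, the high reservation is released again so the failure is clean.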
Signed-off-by: Borislav Petkov Reviewed-by: Dave Young Reviewed-by: Joerg Roedel Cc: Andrew Morton Cc: Andy Lutomirski Cc: H. Peter Anvin Cc: Jiri Kosina Cc: Juergen Gross Cc: Linus Torvalds Cc: Mark Salter Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: WANG Chao Cc: jerry_hoemann@hp.com Link: http://lkml.kernel.org/r/1445246268-26285-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ea086dd8e821..d4788719a1e2 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -499,17 +499,15 @@ static void __init memblock_x86_reserve_range_setup_data(void) static int __init reserve_crashkernel_low(void) { #ifdef CONFIG_X86_64 - unsigned long long low_base = 0, low_size = 0; + unsigned long long base, low_base = 0, low_size = 0; unsigned long total_low_mem; - unsigned long long base; - bool auto_set = false; int ret; total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT)); /* crashkernel=Y,low */ ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); - if (ret != 0) { + if (ret) { /* * two parts from lib/swiotlb.c: * -swiotlb size: user-specified with swiotlb= or default. @@ -520,7 +518,6 @@ static int __init reserve_crashkernel_low(void) * don't run out of DMA buffers for 32-bit devices. */ low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20); - auto_set = true; } else { /* passed with crashkernel=0,low ? */ if (!low_size) @@ -550,8 +547,7 @@ static int __init reserve_crashkernel_low(void) static void __init reserve_crashkernel(void) { - unsigned long long total_mem; - unsigned long long crash_size, crash_base; + unsigned long long crash_size, crash_base, total_mem; bool high = false; int ret; @@ -600,11 +596,10 @@ static void __init reserve_crashkernel(void) return; } - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " - "for crashkernel (System RAM: %ldMB)\n", - (unsigned long)(crash_size >> 20), - (unsigned long)(crash_base >> 20), - (unsigned long)(total_mem >> 20)); + pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(crash_size >> 20), + (unsigned long)(crash_base >> 20), + (unsigned long)(total_mem >> 20)); crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; -- cgit v1.2.3 From 6f3760570e26eefc214e641b6daeddb7106240bb Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 19 Oct 2015 11:17:46 +0200 Subject: x86/setup/crash: Check memblock_reserve() retval memblock_reserve() can fail but the crashkernel reservation code doesn't check that and this can lead the user into believing that the crashkernel region was actually reserved. Make sure we check that return value and we exit early with a failure message in the error case. Signed-off-by: Borislav Petkov Reviewed-by: Dave Young Reviewed-by: Joerg Roedel Cc: Andrew Morton Cc: Andy Lutomirski Cc: H. 
Peter Anvin Cc: Jiri Kosina Cc: Juergen Gross Cc: Linus Torvalds Cc: Mark Salter Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: WANG Chao Cc: jerry_hoemann@hp.com Link: http://lkml.kernel.org/r/1445246268-26285-7-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d4788719a1e2..3f75297d5fd0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -531,7 +531,11 @@ static int __init reserve_crashkernel_low(void) return -ENOMEM; } - memblock_reserve(low_base, low_size); + ret = memblock_reserve(low_base, low_size); + if (ret) { + pr_err("%s: Error reserving crashkernel low memblock.\n", __func__); + return ret; + } pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", (unsigned long)(low_size >> 20), @@ -589,7 +593,11 @@ static void __init reserve_crashkernel(void) return; } } - memblock_reserve(crash_base, crash_size); + ret = memblock_reserve(crash_base, crash_size); + if (ret) { + pr_err("%s: Error reserving crashkernel memblock.\n", __func__); + return; + } if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) { memblock_free(crash_base, crash_size); -- cgit v1.2.3 From 81ffdcdd97d94110627caa81c23d5d780083731d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 19 Oct 2015 11:17:48 +0200 Subject: x86/mce: Fix thermal throttling reporting after kexec The per CPU thermal vector init code checks if the thermal vector is already installed and complains and bails out if it is. This happens after kexec, as kernel shut down does not clear the thermal vector APIC register. This causes two problems: 1. So we always do not fully initialize thermal reports after kexec. The CPU is still likely initialized, as the previous kernel should have done it. But we don't set up the software pointer to the thermal vector, so reporting may end up with a unknown thermal interrupt message. 2. Also it complains for every logical CPU, even though the value is actually derived from BP only. The problem is that we end up with one message per CPU, so on larger systems it becomes very noisy and messes up the otherwise nicely formatted CPU bootup numbers in the kernel log. Just remove the check. I checked the code and there's no valid code paths where the thermal init code for a CPU could be called multiple times. Why the kernel does not clean up this value on shutdown: The thermal monitoring is controlled per logical CPU thread. Normal shutdown code is just running on one CPU. To disable it we would need a broadcast NMI to all CPUs on shut down. That's overkill for this. So we just ignore it after kexec. Signed-off-by: Andi Kleen Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1445246268-26285-9-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 1af51b1586d7..2c5aaf8c2e2f 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -503,14 +503,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c) return; } - /* Check whether a vector already exists */ - if (h & APIC_VECTOR_MASK) { - printk(KERN_DEBUG - "CPU%d: Thermal LVT vector (%#x) already installed\n", - cpu, (h & APIC_VECTOR_MASK)); - return; - } - /* early Pentium M models use different method for enabling TM2 */ if (cpu_has(c, X86_FEATURE_TM2)) { if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) { -- cgit v1.2.3 From 221836e92cd5664de6fc2f1d836f6343ae5f2e43 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 19 Oct 2015 10:41:17 +0200 Subject: x86/Kconfig/cpus: Fix/complete CPU type help texts Move the generic help text explaining each CPU type and what to select under the "Processor Family" prompt and not under the M486 option. Also, amend it with the missing options. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1445244077-25120-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 6983314c8b37..3ba5ff2f2d08 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -3,10 +3,6 @@ choice prompt "Processor family" default M686 if X86_32 default GENERIC_CPU if X86_64 - -config M486 - bool "486" - depends on X86_32 ---help--- This is the processor type of your CPU. This information is used for optimizing purposes. In order to compile a kernel @@ -23,9 +19,9 @@ config M486 Here are the settings recommended for greatest speed: - "486" for the AMD/Cyrix/IBM/Intel 486DX/DX2/DX4 or - SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S. + SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S. - "586" for generic Pentium CPUs lacking the TSC - (time stamp counter) register. + (time stamp counter) register. - "Pentium-Classic" for the Intel Pentium. - "Pentium-MMX" for the Intel Pentium MMX. - "Pentium-Pro" for the Intel Pentium Pro. @@ -34,17 +30,31 @@ config M486 - "Pentium-4" for the Intel Pentium 4 or P4-based Celeron. - "K6" for the AMD K6, K6-II and K6-III (aka K6-3D). - "Athlon" for the AMD K7 family (Athlon/Duron/Thunderbird). + - "Opteron/Athlon64/Hammer/K8" for all K8 and newer AMD CPUs. - "Crusoe" for the Transmeta Crusoe series. - "Efficeon" for the Transmeta Efficeon series. - "Winchip-C6" for original IDT Winchip. - "Winchip-2" for IDT Winchips with 3dNow! capabilities. + - "AMD Elan" for the 32-bit AMD Elan embedded CPU. - "GeodeGX1" for Geode GX1 (Cyrix MediaGX). - "Geode GX/LX" For AMD Geode GX and LX processors. - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. - "VIA C3-2" for VIA C3-2 "Nehemiah" (model 9 and above). - "VIA C7" for VIA C7. + - "Intel P4" for the Pentium 4/Netburst microarchitecture. + - "Core 2/newer Xeon" for all core2 and newer Intel CPUs. 
+ - "Intel Atom" for the Atom-microarchitecture CPUs. + - "Generic-x86-64" for a kernel which runs on any x86-64 CPU. + + See each option's help text for additional details. If you don't know + what to do, choose "486". - If you don't know what to do, choose "486". +config M486 + bool "486" + depends on X86_32 + ---help--- + Select this for an 486-class CPU such as AMD/Cyrix/IBM/Intel + 486DX/DX2/DX4 or SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S. config M586 bool "586/K5/5x86/6x86/6x86MX" -- cgit v1.2.3 From 3d45ac4b35cbdf942f2a45b2b927f2ef6a8bda48 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 19 Oct 2015 04:35:44 -0600 Subject: timers/x86/hpet: Type adjustments Standardize on bool instead of an inconsistent mixture of u8 and plain 'int'. Also use u32 or 'unsigned int' instead of 'unsigned long' when a 32-bit type suffices, generating slightly better code on x86-64. Signed-off-by: Jan Beulich Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5624E3A002000078000AC49A@prv-mh.provo.novell.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hpet.h | 6 +++--- arch/x86/kernel/early-quirks.c | 2 +- arch/x86/kernel/hpet.c | 29 ++++++++++++++--------------- arch/x86/kernel/quirks.c | 2 +- 4 files changed, 19 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 5fa9fb0f8809..cc285ec4b2c1 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -63,10 +63,10 @@ /* hpet memory map physical address */ extern unsigned long hpet_address; extern unsigned long force_hpet_address; -extern int boot_hpet_disable; +extern bool boot_hpet_disable; extern u8 hpet_blockid; -extern int hpet_force_user; -extern u8 hpet_msi_disable; +extern bool hpet_force_user; +extern bool hpet_msi_disable; extern int is_hpet_enabled(void); extern int hpet_enable(void); extern void hpet_disable(void); diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 9f9cc682e561..db9a675e751b 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -584,7 +584,7 @@ static void __init intel_graphics_stolen(int num, int slot, int func) static void __init force_disable_hpet(int num, int slot, int func) { #ifdef CONFIG_HPET_TIMER - boot_hpet_disable = 1; + boot_hpet_disable = true; pr_info("x86/hpet: Will disable the HPET for this platform because it's not reliable\n"); #endif } diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 88b4da373081..b8e6ff5cd5d0 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -37,10 +37,10 @@ */ unsigned long hpet_address; u8 hpet_blockid; /* OS timer block num */ -u8 hpet_msi_disable; +bool hpet_msi_disable; #ifdef CONFIG_PCI_MSI -static unsigned long hpet_num_timers; +static unsigned int hpet_num_timers; #endif static void __iomem *hpet_virt_address; @@ -86,9 +86,9 @@ static inline void hpet_clear_mapping(void) /* * HPET command line enable / disable */ -int boot_hpet_disable; -int hpet_force_user; -static int hpet_verbose; +bool boot_hpet_disable; +bool hpet_force_user; +static bool hpet_verbose; static int __init hpet_setup(char *str) { @@ -98,11 +98,11 @@ static int __init hpet_setup(char *str) if (next) *next++ = 0; if (!strncmp("disable", str, 7)) - boot_hpet_disable = 1; + boot_hpet_disable = true; if (!strncmp("force", str, 5)) - hpet_force_user = 1; + hpet_force_user = true; if 
(!strncmp("verbose", str, 7)) - hpet_verbose = 1; + hpet_verbose = true; str = next; } return 1; @@ -111,7 +111,7 @@ __setup("hpet=", hpet_setup); static int __init disable_hpet(char *str) { - boot_hpet_disable = 1; + boot_hpet_disable = true; return 1; } __setup("nohpet", disable_hpet); @@ -124,7 +124,7 @@ static inline int is_hpet_capable(void) /* * HPET timer interrupt enable / disable */ -static int hpet_legacy_int_enabled; +static bool hpet_legacy_int_enabled; /** * is_hpet_enabled - check whether the hpet timer interrupt is enabled @@ -230,7 +230,7 @@ static struct clock_event_device hpet_clockevent; static void hpet_stop_counter(void) { - unsigned long cfg = hpet_readl(HPET_CFG); + u32 cfg = hpet_readl(HPET_CFG); cfg &= ~HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); } @@ -272,7 +272,7 @@ static void hpet_enable_legacy_int(void) cfg |= HPET_CFG_LEGACY; hpet_writel(cfg, HPET_CFG); - hpet_legacy_int_enabled = 1; + hpet_legacy_int_enabled = true; } static void hpet_legacy_clockevent_register(void) @@ -983,7 +983,7 @@ void hpet_disable(void) cfg = *hpet_boot_cfg; else if (hpet_legacy_int_enabled) { cfg &= ~HPET_CFG_LEGACY; - hpet_legacy_int_enabled = 0; + hpet_legacy_int_enabled = false; } cfg &= ~HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); @@ -1121,8 +1121,7 @@ EXPORT_SYMBOL_GPL(hpet_rtc_timer_init); static void hpet_disable_rtc_channel(void) { - unsigned long cfg; - cfg = hpet_readl(HPET_T1_CFG); + u32 cfg = hpet_readl(HPET_T1_CFG); cfg &= ~HPET_TN_ENABLE; hpet_writel(cfg, HPET_T1_CFG); } diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 176a0f99d4da..cc457ff818ad 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -524,7 +524,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E6XX_CU, */ static void force_disable_hpet_msi(struct pci_dev *unused) { - hpet_msi_disable = 1; + hpet_msi_disable = true; } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, -- cgit v1.2.3 From 9a2bc335f100a0f6ee6392b9f97ac4188d84db1d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 20 Oct 2015 11:54:44 +0200 Subject: x86/microcode: Unmodularize the microcode driver Make CONFIG_MICROCODE a bool. It was practically a bool already anyway, since early loader was forcing it to =y. Regardless, there's no real reason to have something be a module which gets built-in on the majority of installations out there. And its not like there's noticeable change in functionality - we still can load late microcode - just the module glue disappears. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Jones Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1445334889-300-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 5 +++-- arch/x86/include/asm/microcode.h | 6 ++++++ arch/x86/kernel/cpu/microcode/amd.c | 2 +- arch/x86/kernel/cpu/microcode/core.c | 36 ++--------------------------------- arch/x86/kernel/cpu/microcode/intel.c | 2 +- arch/x86/kernel/setup.c | 3 +++ 6 files changed, 16 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 96d058a87100..fdf1f0cdf6b6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1123,7 +1123,8 @@ config X86_REBOOTFIXUPS Say N otherwise. 
config MICROCODE - tristate "CPU microcode loading support" + bool "CPU microcode loading support" + default y depends on CPU_SUP_AMD || CPU_SUP_INTEL select FW_LOADER ---help--- @@ -1174,7 +1175,7 @@ config MICROCODE_AMD_EARLY config MICROCODE_EARLY bool "Early load microcode" - depends on MICROCODE=y && BLK_DEV_INITRD + depends on MICROCODE && BLK_DEV_INITRD select MICROCODE_INTEL_EARLY if MICROCODE_INTEL select MICROCODE_AMD_EARLY if MICROCODE_AMD default y diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 9e6278c7140e..d1ff724f352b 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -55,6 +55,12 @@ struct ucode_cpu_info { }; extern struct ucode_cpu_info ucode_cpu_info[]; +#ifdef CONFIG_MICROCODE +int __init microcode_init(void); +#else +static inline int __init microcode_init(void) { return 0; }; +#endif + #ifdef CONFIG_MICROCODE_INTEL extern struct microcode_ops * __init init_intel_microcode(void); #else diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index da922d1e2f71..5dcce5dc39b9 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -523,7 +523,7 @@ static struct microcode_ops microcode_amd_ops = { struct microcode_ops * __init init_amd_microcode(void) { - struct cpuinfo_x86 *c = &cpu_data(0); + struct cpuinfo_x86 *c = &boot_cpu_data; if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { pr_warning("AMD CPU family 0x%x not supported\n", c->x86); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 9e3f3c7dd5d7..15491dd3131e 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -487,9 +487,9 @@ static struct attribute_group cpu_root_microcode_group = { .attrs = cpu_root_microcode_attrs, }; -static int __init microcode_init(void) +int __init microcode_init(void) { - struct cpuinfo_x86 *c = &cpu_data(0); + struct cpuinfo_x86 *c = &boot_cpu_data; int error; if (paravirt_enabled() || dis_ucode_ldr) @@ -560,35 +560,3 @@ static int __init microcode_init(void) return error; } -module_init(microcode_init); - -static void __exit microcode_exit(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - microcode_dev_exit(); - - unregister_hotcpu_notifier(&mc_cpu_notifier); - unregister_syscore_ops(&mc_syscore_ops); - - sysfs_remove_group(&cpu_subsys.dev_root->kobj, - &cpu_root_microcode_group); - - get_online_cpus(); - mutex_lock(µcode_mutex); - - subsys_interface_unregister(&mc_cpu_interface); - - mutex_unlock(µcode_mutex); - put_online_cpus(); - - platform_device_unregister(microcode_pdev); - - microcode_ops = NULL; - - if (c->x86_vendor == X86_VENDOR_AMD) - exit_amd_microcode(); - - pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); -} -module_exit(microcode_exit); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 969dc17eb1b4..bfd6fcd242ea 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -264,7 +264,7 @@ static struct microcode_ops microcode_intel_ops = { struct microcode_ops * __init init_intel_microcode(void) { - struct cpuinfo_x86 *c = &cpu_data(0); + struct cpuinfo_x86 *c = &boot_cpu_data; if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || cpu_has(c, X86_FEATURE_IA64)) { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3f75297d5fd0..3a896109e1df 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ 
-111,6 +111,7 @@ #include #include #include +#include /* * max_low_pfn_mapped: highest direct mapped pfn under 4GB @@ -1239,6 +1240,8 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled(EFI_BOOT)) efi_apply_memmap_quirks(); #endif + + microcode_init(); } #ifdef CONFIG_X86_32 -- cgit v1.2.3 From fe055896c040df571e4ff56fb196d6845130057b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 20 Oct 2015 11:54:45 +0200 Subject: x86/microcode: Merge the early microcode loader Merge the early loader functionality into the driver proper. The diff is huge but logically, it is simply moving code from the _early.c files into the main driver. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Jones Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1445334889-300-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 19 +- arch/x86/include/asm/microcode.h | 19 +- arch/x86/include/asm/microcode_amd.h | 2 +- arch/x86/include/asm/microcode_intel.h | 10 +- arch/x86/kernel/cpu/microcode/Makefile | 3 - arch/x86/kernel/cpu/microcode/amd.c | 446 ++++++++++++++- arch/x86/kernel/cpu/microcode/amd_early.c | 444 --------------- arch/x86/kernel/cpu/microcode/core.c | 160 +++++- arch/x86/kernel/cpu/microcode/core_early.c | 170 ------ arch/x86/kernel/cpu/microcode/intel.c | 788 ++++++++++++++++++++++++++- arch/x86/kernel/cpu/microcode/intel_early.c | 808 ---------------------------- arch/x86/kernel/head_32.S | 5 +- arch/x86/mm/init.c | 2 - 13 files changed, 1399 insertions(+), 1477 deletions(-) delete mode 100644 arch/x86/kernel/cpu/microcode/amd_early.c delete mode 100644 arch/x86/kernel/cpu/microcode/core_early.c delete mode 100644 arch/x86/kernel/cpu/microcode/intel_early.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fdf1f0cdf6b6..255ea22ccbec 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1126,6 +1126,7 @@ config MICROCODE bool "CPU microcode loading support" default y depends on CPU_SUP_AMD || CPU_SUP_INTEL + depends on BLK_DEV_INITRD select FW_LOADER ---help--- @@ -1167,24 +1168,6 @@ config MICROCODE_OLD_INTERFACE def_bool y depends on MICROCODE -config MICROCODE_INTEL_EARLY - bool - -config MICROCODE_AMD_EARLY - bool - -config MICROCODE_EARLY - bool "Early load microcode" - depends on MICROCODE && BLK_DEV_INITRD - select MICROCODE_INTEL_EARLY if MICROCODE_INTEL - select MICROCODE_AMD_EARLY if MICROCODE_AMD - default y - help - This option provides functionality to read additional microcode data - at the beginning of initrd image. The data tells kernel to load - microcode to CPU's as early as possible. No functional change if no - microcode data is glued to the initrd, therefore it's safe to say Y. 
- config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" ---help--- diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index d1ff724f352b..9f953f7851bb 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -81,7 +81,6 @@ static inline struct microcode_ops * __init init_amd_microcode(void) static inline void __exit exit_amd_microcode(void) {} #endif -#ifdef CONFIG_MICROCODE_EARLY #define MAX_UCODE_COUNT 128 #define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) @@ -156,22 +155,18 @@ static inline unsigned int x86_model(unsigned int sig) return model; } +#ifdef CONFIG_MICROCODE extern void __init load_ucode_bsp(void); extern void load_ucode_ap(void); extern int __init save_microcode_in_initrd(void); void reload_early_microcode(void); extern bool get_builtin_firmware(struct cpio_data *cd, const char *name); #else -static inline void __init load_ucode_bsp(void) {} -static inline void load_ucode_ap(void) {} -static inline int __init save_microcode_in_initrd(void) -{ - return 0; -} -static inline void reload_early_microcode(void) {} -static inline bool get_builtin_firmware(struct cpio_data *cd, const char *name) -{ - return false; -} +static inline void __init load_ucode_bsp(void) { } +static inline void load_ucode_ap(void) { } +static inline int __init save_microcode_in_initrd(void) { return 0; } +static inline void reload_early_microcode(void) { } +static inline bool +get_builtin_firmware(struct cpio_data *cd, const char *name) { return false; } #endif #endif /* _ASM_X86_MICROCODE_H */ diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index d3e86cfd08fe..adfc847a395e 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -64,7 +64,7 @@ extern enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, s #define PATCH_MAX_SIZE PAGE_SIZE extern u8 amd_ucode_patch[PATCH_MAX_SIZE]; -#ifdef CONFIG_MICROCODE_AMD_EARLY +#ifdef CONFIG_MICROCODE_AMD extern void __init load_ucode_amd_bsp(unsigned int family); extern void load_ucode_amd_ap(void); extern int __init save_microcode_in_initrd_amd(void); diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 7991c606125d..8559b0102ea1 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -57,7 +57,7 @@ extern int has_newer_microcode(void *mc, unsigned int csig, int cpf, int rev); extern int microcode_sanity_check(void *mc, int print_err); extern int find_matching_signature(void *mc, unsigned int csig, int cpf); -#ifdef CONFIG_MICROCODE_INTEL_EARLY +#ifdef CONFIG_MICROCODE_INTEL extern void __init load_ucode_intel_bsp(void); extern void load_ucode_intel_ap(void); extern void show_ucode_info_early(void); @@ -71,13 +71,9 @@ static inline int __init save_microcode_in_initrd_intel(void) { return -EINVAL; static inline void reload_ucode_intel(void) {} #endif -#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU) +#ifdef CONFIG_HOTPLUG_CPU extern int save_mc_for_early(u8 *mc); #else -static inline int save_mc_for_early(u8 *mc) -{ - return 0; -} +static inline int save_mc_for_early(u8 *mc) { return 0; } #endif - #endif /* _ASM_X86_MICROCODE_INTEL_H */ diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile index 285c85427c32..220b1a508513 100644 --- a/arch/x86/kernel/cpu/microcode/Makefile +++ b/arch/x86/kernel/cpu/microcode/Makefile 
@@ -2,6 +2,3 @@ microcode-y := core.o obj-$(CONFIG_MICROCODE) += microcode.o microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o microcode-$(CONFIG_MICROCODE_AMD) += amd.o -obj-$(CONFIG_MICROCODE_EARLY) += core_early.o -obj-$(CONFIG_MICROCODE_INTEL_EARLY) += intel_early.o -obj-$(CONFIG_MICROCODE_AMD_EARLY) += amd_early.o diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 5dcce5dc39b9..20297fbb7355 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -1,5 +1,9 @@ /* * AMD CPU Microcode Update Driver for Linux + * + * This driver allows to upgrade microcode on F10h AMD + * CPUs and later. + * * Copyright (C) 2008-2011 Advanced Micro Devices Inc. * * Author: Peter Oruba @@ -11,26 +15,32 @@ * Andreas Herrmann * Borislav Petkov * - * This driver allows to upgrade microcode on F10h AMD - * CPUs and later. + * early loader: + * Copyright (C) 2013 Advanced Micro Devices, Inc. + * + * Author: Jacob Shin + * Fixes: Borislav Petkov * * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. */ - #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include +#include #include #include #include +#include #include #include +#include +#include #include -#include MODULE_DESCRIPTION("AMD Microcode Update Driver"); MODULE_AUTHOR("Peter Oruba"); @@ -47,6 +57,432 @@ struct ucode_patch { static LIST_HEAD(pcache); +/* + * This points to the current valid container of microcode patches which we will + * save from the initrd before jettisoning its contents. + */ +static u8 *container; +static size_t container_size; + +static u32 ucode_new_rev; +u8 amd_ucode_patch[PATCH_MAX_SIZE]; +static u16 this_equiv_id; + +static struct cpio_data ucode_cpio; + +/* + * Microcode patch container file is prepended to the initrd in cpio format. + * See Documentation/x86/early-microcode.txt + */ +static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin"; + +static struct cpio_data __init find_ucode_in_initrd(void) +{ + long offset = 0; + char *path; + void *start; + size_t size; + +#ifdef CONFIG_X86_32 + struct boot_params *p; + + /* + * On 32-bit, early load occurs before paging is turned on so we need + * to use physical addresses. + */ + p = (struct boot_params *)__pa_nodebug(&boot_params); + path = (char *)__pa_nodebug(ucode_path); + start = (void *)p->hdr.ramdisk_image; + size = p->hdr.ramdisk_size; +#else + path = ucode_path; + start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); + size = boot_params.hdr.ramdisk_size; +#endif + + return find_cpio_data(path, start, size, &offset); +} + +static size_t compute_container_size(u8 *data, u32 total_size) +{ + size_t size = 0; + u32 *header = (u32 *)data; + + if (header[0] != UCODE_MAGIC || + header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ + header[2] == 0) /* size */ + return size; + + size = header[2] + CONTAINER_HDR_SZ; + total_size -= size; + data += size; + + while (total_size) { + u16 patch_size; + + header = (u32 *)data; + + if (header[0] != UCODE_UCODE_TYPE) + break; + + /* + * Sanity-check patch size. + */ + patch_size = header[1]; + if (patch_size > PATCH_MAX_SIZE) + break; + + size += patch_size + SECTION_HDR_SIZE; + data += patch_size + SECTION_HDR_SIZE; + total_size -= patch_size + SECTION_HDR_SIZE; + } + + return size; +} + +/* + * Early load occurs before we can vmalloc(). 
So we look for the microcode + * patch container file in initrd, traverse equivalent cpu table, look for a + * matching microcode patch, and update, all in initrd memory in place. + * When vmalloc() is available for use later -- on 64-bit during first AP load, + * and on 32-bit during save_microcode_in_initrd_amd() -- we can call + * load_microcode_amd() to save equivalent cpu table and microcode patches in + * kernel heap memory. + */ +static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) +{ + struct equiv_cpu_entry *eq; + size_t *cont_sz; + u32 *header; + u8 *data, **cont; + u8 (*patch)[PATCH_MAX_SIZE]; + u16 eq_id = 0; + int offset, left; + u32 rev, eax, ebx, ecx, edx; + u32 *new_rev; + +#ifdef CONFIG_X86_32 + new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); + cont_sz = (size_t *)__pa_nodebug(&container_size); + cont = (u8 **)__pa_nodebug(&container); + patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); +#else + new_rev = &ucode_new_rev; + cont_sz = &container_size; + cont = &container; + patch = &amd_ucode_patch; +#endif + + data = ucode; + left = size; + header = (u32 *)data; + + /* find equiv cpu table */ + if (header[0] != UCODE_MAGIC || + header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ + header[2] == 0) /* size */ + return; + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + + while (left > 0) { + eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); + + *cont = data; + + /* Advance past the container header */ + offset = header[2] + CONTAINER_HDR_SZ; + data += offset; + left -= offset; + + eq_id = find_equiv_id(eq, eax); + if (eq_id) { + this_equiv_id = eq_id; + *cont_sz = compute_container_size(*cont, left + offset); + + /* + * truncate how much we need to iterate over in the + * ucode update loop below + */ + left = *cont_sz - offset; + break; + } + + /* + * support multiple container files appended together. if this + * one does not have a matching equivalent cpu entry, we fast + * forward to the next container file. 
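+ * Each microcode section records its own size in its header, so we + * can hop section by section until the UCODE_MAGIC of the next + * container shows up.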
+ */ + while (left > 0) { + header = (u32 *)data; + if (header[0] == UCODE_MAGIC && + header[1] == UCODE_EQUIV_CPU_TABLE_TYPE) + break; + + offset = header[1] + SECTION_HDR_SIZE; + data += offset; + left -= offset; + } + + /* mark where the next microcode container file starts */ + offset = data - (u8 *)ucode; + ucode = data; + } + + if (!eq_id) { + *cont = NULL; + *cont_sz = 0; + return; + } + + if (check_current_patch_level(&rev, true)) + return; + + while (left > 0) { + struct microcode_amd *mc; + + header = (u32 *)data; + if (header[0] != UCODE_UCODE_TYPE || /* type */ + header[1] == 0) /* size */ + break; + + mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE); + + if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) { + + if (!__apply_microcode_amd(mc)) { + rev = mc->hdr.patch_id; + *new_rev = rev; + + if (save_patch) + memcpy(patch, mc, + min_t(u32, header[1], PATCH_MAX_SIZE)); + } + } + + offset = header[1] + SECTION_HDR_SIZE; + data += offset; + left -= offset; + } +} + +static bool __init load_builtin_amd_microcode(struct cpio_data *cp, + unsigned int family) +{ +#ifdef CONFIG_X86_64 + char fw_name[36] = "amd-ucode/microcode_amd.bin"; + + if (family >= 0x15) + snprintf(fw_name, sizeof(fw_name), + "amd-ucode/microcode_amd_fam%.2xh.bin", family); + + return get_builtin_firmware(cp, fw_name); +#else + return false; +#endif +} + +void __init load_ucode_amd_bsp(unsigned int family) +{ + struct cpio_data cp; + void **data; + size_t *size; + +#ifdef CONFIG_X86_32 + data = (void **)__pa_nodebug(&ucode_cpio.data); + size = (size_t *)__pa_nodebug(&ucode_cpio.size); +#else + data = &ucode_cpio.data; + size = &ucode_cpio.size; +#endif + + cp = find_ucode_in_initrd(); + if (!cp.data) { + if (!load_builtin_amd_microcode(&cp, family)) + return; + } + + *data = cp.data; + *size = cp.size; + + apply_ucode_in_initrd(cp.data, cp.size, true); +} + +#ifdef CONFIG_X86_32 +/* + * On 32-bit, since AP's early load occurs before paging is turned on, we + * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during + * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During + * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, + * which is used upon resume from suspend. + */ +void load_ucode_amd_ap(void) +{ + struct microcode_amd *mc; + size_t *usize; + void **ucode; + + mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); + if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { + __apply_microcode_amd(mc); + return; + } + + ucode = (void *)__pa_nodebug(&container); + usize = (size_t *)__pa_nodebug(&container_size); + + if (!*ucode || !*usize) + return; + + apply_ucode_in_initrd(*ucode, *usize, false); +} + +static void __init collect_cpu_sig_on_bsp(void *arg) +{ + unsigned int cpu = smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + uci->cpu_sig.sig = cpuid_eax(0x00000001); +} + +static void __init get_bsp_sig(void) +{ + unsigned int bsp = boot_cpu_data.cpu_index; + struct ucode_cpu_info *uci = ucode_cpu_info + bsp; + + if (!uci->cpu_sig.sig) + smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); +} +#else +void load_ucode_amd_ap(void) +{ + unsigned int cpu = smp_processor_id(); + struct equiv_cpu_entry *eq; + struct microcode_amd *mc; + u32 rev, eax; + u16 eq_id; + + /* Exit if called on the BSP. */ + if (!cpu) + return; + + if (!container) + return; + + /* + * 64-bit runs with paging enabled, thus early==false. 
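+ * check_current_patch_level() also returns the currently loaded patch + * revision in @rev, which is used below to skip patches that are not + * newer.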
+ */ + if (check_current_patch_level(&rev, false)) + return; + + eax = cpuid_eax(0x00000001); + eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ); + + eq_id = find_equiv_id(eq, eax); + if (!eq_id) + return; + + if (eq_id == this_equiv_id) { + mc = (struct microcode_amd *)amd_ucode_patch; + + if (mc && rev < mc->hdr.patch_id) { + if (!__apply_microcode_amd(mc)) + ucode_new_rev = mc->hdr.patch_id; + } + + } else { + if (!ucode_cpio.data) + return; + + /* + * AP has a different equivalence ID than BSP, looks like + * mixed-steppings silicon so go through the ucode blob anew. + */ + apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false); + } +} +#endif + +int __init save_microcode_in_initrd_amd(void) +{ + unsigned long cont; + int retval = 0; + enum ucode_state ret; + u8 *cont_va; + u32 eax; + + if (!container) + return -EINVAL; + +#ifdef CONFIG_X86_32 + get_bsp_sig(); + cont = (unsigned long)container; + cont_va = __va(container); +#else + /* + * We need the physical address of the container for both bitness since + * boot_params.hdr.ramdisk_image is a physical address. + */ + cont = __pa(container); + cont_va = container; +#endif + + /* + * Take into account the fact that the ramdisk might get relocated and + * therefore we need to recompute the container's position in virtual + * memory space. + */ + if (relocated_ramdisk) + container = (u8 *)(__va(relocated_ramdisk) + + (cont - boot_params.hdr.ramdisk_image)); + else + container = cont_va; + + if (ucode_new_rev) + pr_info("microcode: updated early to new patch_level=0x%08x\n", + ucode_new_rev); + + eax = cpuid_eax(0x00000001); + eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); + + ret = load_microcode_amd(smp_processor_id(), eax, container, container_size); + if (ret != UCODE_OK) + retval = -EINVAL; + + /* + * This will be freed any msec now, stash patches for the current + * family and switch to patch cache for cpu hotplug, etc later. + */ + container = NULL; + container_size = 0; + + return retval; +} + +void reload_ucode_amd(void) +{ + struct microcode_amd *mc; + u32 rev; + + /* + * early==false because this is a syscore ->resume path and by + * that time paging is long enabled. + */ + if (check_current_patch_level(&rev, false)) + return; + + mc = (struct microcode_amd *)amd_ucode_patch; + + if (mc && rev < mc->hdr.patch_id) { + if (!__apply_microcode_amd(mc)) { + ucode_new_rev = mc->hdr.patch_id; + pr_info("microcode: reload patch_level=0x%08x\n", + ucode_new_rev); + } + } +} static u16 __find_equiv_id(unsigned int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -435,7 +871,7 @@ enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t s if (ret != UCODE_OK) cleanup(); -#if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32) +#ifdef CONFIG_X86_32 /* save BSP's matching patch for early load */ if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) { struct ucode_patch *p = find_patch(cpu); diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c deleted file mode 100644 index a54a47b9d8ea..000000000000 --- a/arch/x86/kernel/cpu/microcode/amd_early.c +++ /dev/null @@ -1,444 +0,0 @@ -/* - * Copyright (C) 2013 Advanced Micro Devices, Inc. - * - * Author: Jacob Shin - * Fixes: Borislav Petkov - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include -#include - -#include -#include -#include - -/* - * This points to the current valid container of microcode patches which we will - * save from the initrd before jettisoning its contents. - */ -static u8 *container; -static size_t container_size; - -static u32 ucode_new_rev; -u8 amd_ucode_patch[PATCH_MAX_SIZE]; -static u16 this_equiv_id; - -static struct cpio_data ucode_cpio; - -/* - * Microcode patch container file is prepended to the initrd in cpio format. - * See Documentation/x86/early-microcode.txt - */ -static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin"; - -static struct cpio_data __init find_ucode_in_initrd(void) -{ - long offset = 0; - char *path; - void *start; - size_t size; - -#ifdef CONFIG_X86_32 - struct boot_params *p; - - /* - * On 32-bit, early load occurs before paging is turned on so we need - * to use physical addresses. - */ - p = (struct boot_params *)__pa_nodebug(&boot_params); - path = (char *)__pa_nodebug(ucode_path); - start = (void *)p->hdr.ramdisk_image; - size = p->hdr.ramdisk_size; -#else - path = ucode_path; - start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); - size = boot_params.hdr.ramdisk_size; -#endif - - return find_cpio_data(path, start, size, &offset); -} - -static size_t compute_container_size(u8 *data, u32 total_size) -{ - size_t size = 0; - u32 *header = (u32 *)data; - - if (header[0] != UCODE_MAGIC || - header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ - header[2] == 0) /* size */ - return size; - - size = header[2] + CONTAINER_HDR_SZ; - total_size -= size; - data += size; - - while (total_size) { - u16 patch_size; - - header = (u32 *)data; - - if (header[0] != UCODE_UCODE_TYPE) - break; - - /* - * Sanity-check patch size. - */ - patch_size = header[1]; - if (patch_size > PATCH_MAX_SIZE) - break; - - size += patch_size + SECTION_HDR_SIZE; - data += patch_size + SECTION_HDR_SIZE; - total_size -= patch_size + SECTION_HDR_SIZE; - } - - return size; -} - -/* - * Early load occurs before we can vmalloc(). So we look for the microcode - * patch container file in initrd, traverse equivalent cpu table, look for a - * matching microcode patch, and update, all in initrd memory in place. - * When vmalloc() is available for use later -- on 64-bit during first AP load, - * and on 32-bit during save_microcode_in_initrd_amd() -- we can call - * load_microcode_amd() to save equivalent cpu table and microcode patches in - * kernel heap memory. 
- */ -static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) -{ - struct equiv_cpu_entry *eq; - size_t *cont_sz; - u32 *header; - u8 *data, **cont; - u8 (*patch)[PATCH_MAX_SIZE]; - u16 eq_id = 0; - int offset, left; - u32 rev, eax, ebx, ecx, edx; - u32 *new_rev; - -#ifdef CONFIG_X86_32 - new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); - cont_sz = (size_t *)__pa_nodebug(&container_size); - cont = (u8 **)__pa_nodebug(&container); - patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); -#else - new_rev = &ucode_new_rev; - cont_sz = &container_size; - cont = &container; - patch = &amd_ucode_patch; -#endif - - data = ucode; - left = size; - header = (u32 *)data; - - /* find equiv cpu table */ - if (header[0] != UCODE_MAGIC || - header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ - header[2] == 0) /* size */ - return; - - eax = 0x00000001; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - - while (left > 0) { - eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); - - *cont = data; - - /* Advance past the container header */ - offset = header[2] + CONTAINER_HDR_SZ; - data += offset; - left -= offset; - - eq_id = find_equiv_id(eq, eax); - if (eq_id) { - this_equiv_id = eq_id; - *cont_sz = compute_container_size(*cont, left + offset); - - /* - * truncate how much we need to iterate over in the - * ucode update loop below - */ - left = *cont_sz - offset; - break; - } - - /* - * support multiple container files appended together. if this - * one does not have a matching equivalent cpu entry, we fast - * forward to the next container file. - */ - while (left > 0) { - header = (u32 *)data; - if (header[0] == UCODE_MAGIC && - header[1] == UCODE_EQUIV_CPU_TABLE_TYPE) - break; - - offset = header[1] + SECTION_HDR_SIZE; - data += offset; - left -= offset; - } - - /* mark where the next microcode container file starts */ - offset = data - (u8 *)ucode; - ucode = data; - } - - if (!eq_id) { - *cont = NULL; - *cont_sz = 0; - return; - } - - if (check_current_patch_level(&rev, true)) - return; - - while (left > 0) { - struct microcode_amd *mc; - - header = (u32 *)data; - if (header[0] != UCODE_UCODE_TYPE || /* type */ - header[1] == 0) /* size */ - break; - - mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE); - - if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) { - - if (!__apply_microcode_amd(mc)) { - rev = mc->hdr.patch_id; - *new_rev = rev; - - if (save_patch) - memcpy(patch, mc, - min_t(u32, header[1], PATCH_MAX_SIZE)); - } - } - - offset = header[1] + SECTION_HDR_SIZE; - data += offset; - left -= offset; - } -} - -static bool __init load_builtin_amd_microcode(struct cpio_data *cp, - unsigned int family) -{ -#ifdef CONFIG_X86_64 - char fw_name[36] = "amd-ucode/microcode_amd.bin"; - - if (family >= 0x15) - snprintf(fw_name, sizeof(fw_name), - "amd-ucode/microcode_amd_fam%.2xh.bin", family); - - return get_builtin_firmware(cp, fw_name); -#else - return false; -#endif -} - -void __init load_ucode_amd_bsp(unsigned int family) -{ - struct cpio_data cp; - void **data; - size_t *size; - -#ifdef CONFIG_X86_32 - data = (void **)__pa_nodebug(&ucode_cpio.data); - size = (size_t *)__pa_nodebug(&ucode_cpio.size); -#else - data = &ucode_cpio.data; - size = &ucode_cpio.size; -#endif - - cp = find_ucode_in_initrd(); - if (!cp.data) { - if (!load_builtin_amd_microcode(&cp, family)) - return; - } - - *data = cp.data; - *size = cp.size; - - apply_ucode_in_initrd(cp.data, cp.size, true); -} - -#ifdef CONFIG_X86_32 -/* - * On 32-bit, since AP's early load occurs 
before paging is turned on, we - * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during - * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During - * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, - * which is used upon resume from suspend. - */ -void load_ucode_amd_ap(void) -{ - struct microcode_amd *mc; - size_t *usize; - void **ucode; - - mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); - if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { - __apply_microcode_amd(mc); - return; - } - - ucode = (void *)__pa_nodebug(&container); - usize = (size_t *)__pa_nodebug(&container_size); - - if (!*ucode || !*usize) - return; - - apply_ucode_in_initrd(*ucode, *usize, false); -} - -static void __init collect_cpu_sig_on_bsp(void *arg) -{ - unsigned int cpu = smp_processor_id(); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - uci->cpu_sig.sig = cpuid_eax(0x00000001); -} - -static void __init get_bsp_sig(void) -{ - unsigned int bsp = boot_cpu_data.cpu_index; - struct ucode_cpu_info *uci = ucode_cpu_info + bsp; - - if (!uci->cpu_sig.sig) - smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); -} -#else -void load_ucode_amd_ap(void) -{ - unsigned int cpu = smp_processor_id(); - struct equiv_cpu_entry *eq; - struct microcode_amd *mc; - u32 rev, eax; - u16 eq_id; - - /* Exit if called on the BSP. */ - if (!cpu) - return; - - if (!container) - return; - - /* - * 64-bit runs with paging enabled, thus early==false. - */ - if (check_current_patch_level(&rev, false)) - return; - - eax = cpuid_eax(0x00000001); - eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ); - - eq_id = find_equiv_id(eq, eax); - if (!eq_id) - return; - - if (eq_id == this_equiv_id) { - mc = (struct microcode_amd *)amd_ucode_patch; - - if (mc && rev < mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc)) - ucode_new_rev = mc->hdr.patch_id; - } - - } else { - if (!ucode_cpio.data) - return; - - /* - * AP has a different equivalence ID than BSP, looks like - * mixed-steppings silicon so go through the ucode blob anew. - */ - apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false); - } -} -#endif - -int __init save_microcode_in_initrd_amd(void) -{ - unsigned long cont; - int retval = 0; - enum ucode_state ret; - u8 *cont_va; - u32 eax; - - if (!container) - return -EINVAL; - -#ifdef CONFIG_X86_32 - get_bsp_sig(); - cont = (unsigned long)container; - cont_va = __va(container); -#else - /* - * We need the physical address of the container for both bitness since - * boot_params.hdr.ramdisk_image is a physical address. - */ - cont = __pa(container); - cont_va = container; -#endif - - /* - * Take into account the fact that the ramdisk might get relocated and - * therefore we need to recompute the container's position in virtual - * memory space. - */ - if (relocated_ramdisk) - container = (u8 *)(__va(relocated_ramdisk) + - (cont - boot_params.hdr.ramdisk_image)); - else - container = cont_va; - - if (ucode_new_rev) - pr_info("microcode: updated early to new patch_level=0x%08x\n", - ucode_new_rev); - - eax = cpuid_eax(0x00000001); - eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); - - ret = load_microcode_amd(smp_processor_id(), eax, container, container_size); - if (ret != UCODE_OK) - retval = -EINVAL; - - /* - * This will be freed any msec now, stash patches for the current - * family and switch to patch cache for cpu hotplug, etc later. 
- */ - container = NULL; - container_size = 0; - - return retval; -} - -void reload_ucode_amd(void) -{ - struct microcode_amd *mc; - u32 rev; - - /* - * early==false because this is a syscore ->resume path and by - * that time paging is long enabled. - */ - if (check_current_patch_level(&rev, false)) - return; - - mc = (struct microcode_amd *)amd_ucode_patch; - - if (mc && rev < mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc)) { - ucode_new_rev = mc->hdr.patch_id; - pr_info("microcode: reload patch_level=0x%08x\n", - ucode_new_rev); - } - } -} diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 15491dd3131e..18848c7de058 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -5,6 +5,12 @@ * 2006 Shaohua Li * 2013-2015 Borislav Petkov * + * X86 CPU microcode early update for Linux: + * + * Copyright (C) 2012 Fenghua Yu + * H Peter Anvin" + * (C) 2015 Borislav Petkov + * * This driver allows to upgrade microcode on x86 processors. * * This program is free software; you can redistribute it and/or @@ -16,20 +22,24 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include +#include #include #include #include #include #include #include -#include -#include -#include +#include #include +#include #include +#include +#include +#include MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); @@ -68,6 +78,150 @@ struct cpu_info_ctx { int err; }; +static bool __init check_loader_disabled_bsp(void) +{ +#ifdef CONFIG_X86_32 + const char *cmdline = (const char *)__pa_nodebug(boot_command_line); + const char *opt = "dis_ucode_ldr"; + const char *option = (const char *)__pa_nodebug(opt); + bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); + +#else /* CONFIG_X86_64 */ + const char *cmdline = boot_command_line; + const char *option = "dis_ucode_ldr"; + bool *res = &dis_ucode_ldr; +#endif + + if (cmdline_find_option_bool(cmdline, option)) + *res = true; + + return *res; +} + +extern struct builtin_fw __start_builtin_fw[]; +extern struct builtin_fw __end_builtin_fw[]; + +bool get_builtin_firmware(struct cpio_data *cd, const char *name) +{ +#ifdef CONFIG_FW_LOADER + struct builtin_fw *b_fw; + + for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) { + if (!strcmp(name, b_fw->name)) { + cd->size = b_fw->size; + cd->data = b_fw->data; + return true; + } + } +#endif + return false; +} + +void __init load_ucode_bsp(void) +{ + int vendor; + unsigned int family; + + if (check_loader_disabled_bsp()) + return; + + if (!have_cpuid_p()) + return; + + vendor = x86_vendor(); + family = x86_family(); + + switch (vendor) { + case X86_VENDOR_INTEL: + if (family >= 6) + load_ucode_intel_bsp(); + break; + case X86_VENDOR_AMD: + if (family >= 0x10) + load_ucode_amd_bsp(family); + break; + default: + break; + } +} + +static bool check_loader_disabled_ap(void) +{ +#ifdef CONFIG_X86_32 + return *((bool *)__pa_nodebug(&dis_ucode_ldr)); +#else + return dis_ucode_ldr; +#endif +} + +void load_ucode_ap(void) +{ + int vendor, family; + + if (check_loader_disabled_ap()) + return; + + if (!have_cpuid_p()) + return; + + vendor = x86_vendor(); + family = x86_family(); + + switch (vendor) { + case X86_VENDOR_INTEL: + if (family >= 6) + load_ucode_intel_ap(); + break; + case X86_VENDOR_AMD: + if (family >= 0x10) + load_ucode_amd_ap(); + break; + default: + break; + } +} + +int __init save_microcode_in_initrd(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + switch (c->x86_vendor) { + case 
X86_VENDOR_INTEL: + if (c->x86 >= 6) + save_microcode_in_initrd_intel(); + break; + case X86_VENDOR_AMD: + if (c->x86 >= 0x10) + save_microcode_in_initrd_amd(); + break; + default: + break; + } + + return 0; +} + +void reload_early_microcode(void) +{ + int vendor, family; + + vendor = x86_vendor(); + family = x86_family(); + + switch (vendor) { + case X86_VENDOR_INTEL: + if (family >= 6) + reload_ucode_intel(); + break; + case X86_VENDOR_AMD: + if (family >= 0x10) + reload_ucode_amd(); + break; + default: + break; + } +} + static void collect_cpu_info_local(void *arg) { struct cpu_info_ctx *ctx = arg; diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c deleted file mode 100644 index 8ebc421d6299..000000000000 --- a/arch/x86/kernel/cpu/microcode/core_early.c +++ /dev/null @@ -1,170 +0,0 @@ -/* - * X86 CPU microcode early update for Linux - * - * Copyright (C) 2012 Fenghua Yu - * H Peter Anvin" - * (C) 2015 Borislav Petkov - * - * This driver allows to early upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. - * - * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture - * Software Developer's Manual. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include -#include -#include -#include -#include -#include -#include - -static bool __init check_loader_disabled_bsp(void) -{ -#ifdef CONFIG_X86_32 - const char *cmdline = (const char *)__pa_nodebug(boot_command_line); - const char *opt = "dis_ucode_ldr"; - const char *option = (const char *)__pa_nodebug(opt); - bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); - -#else /* CONFIG_X86_64 */ - const char *cmdline = boot_command_line; - const char *option = "dis_ucode_ldr"; - bool *res = &dis_ucode_ldr; -#endif - - if (cmdline_find_option_bool(cmdline, option)) - *res = true; - - return *res; -} - -extern struct builtin_fw __start_builtin_fw[]; -extern struct builtin_fw __end_builtin_fw[]; - -bool get_builtin_firmware(struct cpio_data *cd, const char *name) -{ -#ifdef CONFIG_FW_LOADER - struct builtin_fw *b_fw; - - for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) { - if (!strcmp(name, b_fw->name)) { - cd->size = b_fw->size; - cd->data = b_fw->data; - return true; - } - } -#endif - return false; -} - -void __init load_ucode_bsp(void) -{ - int vendor; - unsigned int family; - - if (check_loader_disabled_bsp()) - return; - - if (!have_cpuid_p()) - return; - - vendor = x86_vendor(); - family = x86_family(); - - switch (vendor) { - case X86_VENDOR_INTEL: - if (family >= 6) - load_ucode_intel_bsp(); - break; - case X86_VENDOR_AMD: - if (family >= 0x10) - load_ucode_amd_bsp(family); - break; - default: - break; - } -} - -static bool check_loader_disabled_ap(void) -{ -#ifdef CONFIG_X86_32 - return *((bool *)__pa_nodebug(&dis_ucode_ldr)); -#else - return dis_ucode_ldr; -#endif -} - -void load_ucode_ap(void) -{ - int vendor, family; - - if (check_loader_disabled_ap()) - return; - - if (!have_cpuid_p()) - return; - - vendor = x86_vendor(); - family = x86_family(); - - switch (vendor) { - case X86_VENDOR_INTEL: - if (family >= 6) - load_ucode_intel_ap(); - break; - case X86_VENDOR_AMD: - if (family >= 0x10) - load_ucode_amd_ap(); - break; - default: - break; - } -} - -int __init 
save_microcode_in_initrd(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - switch (c->x86_vendor) { - case X86_VENDOR_INTEL: - if (c->x86 >= 6) - save_microcode_in_initrd_intel(); - break; - case X86_VENDOR_AMD: - if (c->x86 >= 0x10) - save_microcode_in_initrd_amd(); - break; - default: - break; - } - - return 0; -} - -void reload_early_microcode(void) -{ - int vendor, family; - - vendor = x86_vendor(); - family = x86_family(); - - switch (vendor) { - case X86_VENDOR_INTEL: - if (family >= 6) - reload_ucode_intel(); - break; - case X86_VENDOR_AMD: - if (family >= 0x10) - reload_ucode_amd(); - break; - default: - break; - } -} diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index bfd6fcd242ea..2e09171e5338 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -4,28 +4,814 @@ * Copyright (C) 2000-2006 Tigran Aivazian * 2006 Shaohua Li * + * Intel CPU microcode early update for Linux + * + * Copyright (C) 2012 Fenghua Yu + * H Peter Anvin" + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ +/* + * This needs to be before all headers so that pr_debug in printk.h doesn't turn + * printk calls into no_printk(). + * + *#define DEBUG + */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include +#include +#include #include #include -#include +#include +#include +#include #include #include +#include +#include #include MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); MODULE_LICENSE("GPL"); +static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; +static struct mc_saved_data { + unsigned int mc_saved_count; + struct microcode_intel **mc_saved; +} mc_saved_data; + +static enum ucode_state +load_microcode_early(struct microcode_intel **saved, + unsigned int num_saved, struct ucode_cpu_info *uci) +{ + struct microcode_intel *ucode_ptr, *new_mc = NULL; + struct microcode_header_intel *mc_hdr; + int new_rev, ret, i; + + new_rev = uci->cpu_sig.rev; + + for (i = 0; i < num_saved; i++) { + ucode_ptr = saved[i]; + mc_hdr = (struct microcode_header_intel *)ucode_ptr; + + ret = has_newer_microcode(ucode_ptr, + uci->cpu_sig.sig, + uci->cpu_sig.pf, + new_rev); + if (!ret) + continue; + + new_rev = mc_hdr->rev; + new_mc = ucode_ptr; + } + + if (!new_mc) + return UCODE_NFOUND; + + uci->mc = (struct microcode_intel *)new_mc; + return UCODE_OK; +} + +static inline void +copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd, + unsigned long off, int num_saved) +{ + int i; + + for (i = 0; i < num_saved; i++) + mc_saved[i] = (struct microcode_intel *)(initrd[i] + off); +} + +#ifdef CONFIG_X86_32 +static void +microcode_phys(struct microcode_intel **mc_saved_tmp, + struct mc_saved_data *mc_saved_data) +{ + int i; + struct microcode_intel ***mc_saved; + + mc_saved = (struct microcode_intel ***) + __pa_nodebug(&mc_saved_data->mc_saved); + for (i = 0; i < mc_saved_data->mc_saved_count; i++) { + struct microcode_intel *p; + + p = *(struct microcode_intel **) + __pa_nodebug(mc_saved_data->mc_saved + i); + mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); + } +} +#endif + +static enum ucode_state +load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, + unsigned long initrd_start, struct ucode_cpu_info *uci) +{ + struct microcode_intel 
*mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int count = mc_saved_data->mc_saved_count; + + if (!mc_saved_data->mc_saved) { + copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count); + + return load_microcode_early(mc_saved_tmp, count, uci); + } else { +#ifdef CONFIG_X86_32 + microcode_phys(mc_saved_tmp, mc_saved_data); + return load_microcode_early(mc_saved_tmp, count, uci); +#else + return load_microcode_early(mc_saved_data->mc_saved, + count, uci); +#endif + } +} + +/* + * Given CPU signature and a microcode patch, this function finds if the + * microcode patch has matching family and model with the CPU. + */ +static enum ucode_state +matching_model_microcode(struct microcode_header_intel *mc_header, + unsigned long sig) +{ + unsigned int fam, model; + unsigned int fam_ucode, model_ucode; + struct extended_sigtable *ext_header; + unsigned long total_size = get_totalsize(mc_header); + unsigned long data_size = get_datasize(mc_header); + int ext_sigcount, i; + struct extended_signature *ext_sig; + + fam = __x86_family(sig); + model = x86_model(sig); + + fam_ucode = __x86_family(mc_header->sig); + model_ucode = x86_model(mc_header->sig); + + if (fam == fam_ucode && model == model_ucode) + return UCODE_OK; + + /* Look for ext. headers: */ + if (total_size <= data_size + MC_HEADER_SIZE) + return UCODE_NFOUND; + + ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + ext_sigcount = ext_header->count; + + for (i = 0; i < ext_sigcount; i++) { + fam_ucode = __x86_family(ext_sig->sig); + model_ucode = x86_model(ext_sig->sig); + + if (fam == fam_ucode && model == model_ucode) + return UCODE_OK; + + ext_sig++; + } + return UCODE_NFOUND; +} + +static int +save_microcode(struct mc_saved_data *mc_saved_data, + struct microcode_intel **mc_saved_src, + unsigned int mc_saved_count) +{ + int i, j; + struct microcode_intel **saved_ptr; + int ret; + + if (!mc_saved_count) + return -EINVAL; + + /* + * Copy new microcode data. + */ + saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL); + if (!saved_ptr) + return -ENOMEM; + + for (i = 0; i < mc_saved_count; i++) { + struct microcode_header_intel *mc_hdr; + struct microcode_intel *mc; + unsigned long size; + + if (!mc_saved_src[i]) { + ret = -EINVAL; + goto err; + } + + mc = mc_saved_src[i]; + mc_hdr = &mc->hdr; + size = get_totalsize(mc_hdr); + + saved_ptr[i] = kmalloc(size, GFP_KERNEL); + if (!saved_ptr[i]) { + ret = -ENOMEM; + goto err; + } + + memcpy(saved_ptr[i], mc, size); + } + + /* + * Point to newly saved microcode. + */ + mc_saved_data->mc_saved = saved_ptr; + mc_saved_data->mc_saved_count = mc_saved_count; + + return 0; + +err: + for (j = 0; j <= i; j++) + kfree(saved_ptr[j]); + kfree(saved_ptr); + + return ret; +} + +/* + * A microcode patch in ucode_ptr is saved into mc_saved + * - if it has matching signature and newer revision compared to an existing + * patch mc_saved. + * - or if it is a newly discovered microcode patch. + * + * The microcode patch should have matching model with CPU. + * + * Returns: The updated number @num_saved of saved microcode patches. 
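+ * + * Note that only the patch pointer is recorded here; the data itself is + * copied into kernel memory later, by save_microcode().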
+ */ +static unsigned int _save_mc(struct microcode_intel **mc_saved, + u8 *ucode_ptr, unsigned int num_saved) +{ + struct microcode_header_intel *mc_hdr, *mc_saved_hdr; + unsigned int sig, pf; + int found = 0, i; + + mc_hdr = (struct microcode_header_intel *)ucode_ptr; + + for (i = 0; i < num_saved; i++) { + mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i]; + sig = mc_saved_hdr->sig; + pf = mc_saved_hdr->pf; + + if (!find_matching_signature(ucode_ptr, sig, pf)) + continue; + + found = 1; + + if (mc_hdr->rev <= mc_saved_hdr->rev) + continue; + + /* + * Found an older ucode saved earlier. Replace it with + * this newer one. + */ + mc_saved[i] = (struct microcode_intel *)ucode_ptr; + break; + } + + /* Newly detected microcode, save it to memory. */ + if (i >= num_saved && !found) + mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr; + + return num_saved; +} + +/* + * Get microcode matching with BSP's model. Only CPUs with the same model as + * BSP can stay in the platform. + */ +static enum ucode_state __init +get_matching_model_microcode(int cpu, unsigned long start, + void *data, size_t size, + struct mc_saved_data *mc_saved_data, + unsigned long *mc_saved_in_initrd, + struct ucode_cpu_info *uci) +{ + u8 *ucode_ptr = data; + unsigned int leftover = size; + enum ucode_state state = UCODE_OK; + unsigned int mc_size; + struct microcode_header_intel *mc_header; + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int mc_saved_count = mc_saved_data->mc_saved_count; + int i; + + while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) { + + if (leftover < sizeof(mc_header)) + break; + + mc_header = (struct microcode_header_intel *)ucode_ptr; + + mc_size = get_totalsize(mc_header); + if (!mc_size || mc_size > leftover || + microcode_sanity_check(ucode_ptr, 0) < 0) + break; + + leftover -= mc_size; + + /* + * Since APs with same family and model as the BSP may boot in + * the platform, we need to find and save microcode patches + * with the same family and model as the BSP. 
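+ * Patches for other family/model combinations are simply skipped; + * they are of no use on this machine.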
+ */ + if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != + UCODE_OK) { + ucode_ptr += mc_size; + continue; + } + + mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count); + + ucode_ptr += mc_size; + } + + if (leftover) { + state = UCODE_ERROR; + goto out; + } + + if (mc_saved_count == 0) { + state = UCODE_NFOUND; + goto out; + } + + for (i = 0; i < mc_saved_count; i++) + mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; + + mc_saved_data->mc_saved_count = mc_saved_count; +out: + return state; +} + +static int collect_cpu_info_early(struct ucode_cpu_info *uci) +{ + unsigned int val[2]; + unsigned int family, model; + struct cpu_signature csig; + unsigned int eax, ebx, ecx, edx; + + csig.sig = 0; + csig.pf = 0; + csig.rev = 0; + + memset(uci, 0, sizeof(*uci)); + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + csig.sig = eax; + + family = __x86_family(csig.sig); + model = x86_model(csig.sig); + + if ((model >= 5) || (family > 6)) { + /* get processor flags from MSR 0x17 */ + native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig.pf = 1 << ((val[1] >> 18) & 7); + } + native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + + /* As documented in the SDM: Do a CPUID 1 here */ + sync_core(); + + /* get the current revision from MSR 0x8B */ + native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + + csig.rev = val[1]; + + uci->cpu_sig = csig; + uci->valid = 1; + + return 0; +} + +#ifdef DEBUG +static void show_saved_mc(void) +{ + int i, j; + unsigned int sig, pf, rev, total_size, data_size, date; + struct ucode_cpu_info uci; + + if (mc_saved_data.mc_saved_count == 0) { + pr_debug("no microcode data saved.\n"); + return; + } + pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count); + + collect_cpu_info_early(&uci); + + sig = uci.cpu_sig.sig; + pf = uci.cpu_sig.pf; + rev = uci.cpu_sig.rev; + pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev); + + for (i = 0; i < mc_saved_data.mc_saved_count; i++) { + struct microcode_header_intel *mc_saved_header; + struct extended_sigtable *ext_header; + int ext_sigcount; + struct extended_signature *ext_sig; + + mc_saved_header = (struct microcode_header_intel *) + mc_saved_data.mc_saved[i]; + sig = mc_saved_header->sig; + pf = mc_saved_header->pf; + rev = mc_saved_header->rev; + total_size = get_totalsize(mc_saved_header); + data_size = get_datasize(mc_saved_header); + date = mc_saved_header->date; + + pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, date = %04x-%02x-%02x\n", + i, sig, pf, rev, total_size, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); + + /* Look for ext. headers: */ + if (total_size <= data_size + MC_HEADER_SIZE) + continue; + + ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE; + ext_sigcount = ext_header->count; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + + for (j = 0; j < ext_sigcount; j++) { + sig = ext_sig->sig; + pf = ext_sig->pf; + + pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n", + j, sig, pf); + + ext_sig++; + } + + } +} +#else +static inline void show_saved_mc(void) +{ +} +#endif + +#ifdef CONFIG_HOTPLUG_CPU +static DEFINE_MUTEX(x86_cpu_microcode_mutex); +/* + * Save this mc into mc_saved_data. So it will be loaded early when a CPU is + * hot added or resumes. + * + * Please make sure this mc should be a valid microcode patch before calling + * this function. 
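+ * + * The saved patch array is replaced wholesale: old and new patches are + * merged into a temporary list, copied into freshly allocated memory by + * save_microcode(), and the stale copies are freed.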
+ */ +int save_mc_for_early(u8 *mc) +{ + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int mc_saved_count_init; + unsigned int mc_saved_count; + struct microcode_intel **mc_saved; + int ret = 0; + int i; + + /* + * Hold hotplug lock so mc_saved_data is not accessed by a CPU in + * hotplug. + */ + mutex_lock(&x86_cpu_microcode_mutex); + + mc_saved_count_init = mc_saved_data.mc_saved_count; + mc_saved_count = mc_saved_data.mc_saved_count; + mc_saved = mc_saved_data.mc_saved; + + if (mc_saved && mc_saved_count) + memcpy(mc_saved_tmp, mc_saved, + mc_saved_count * sizeof(struct microcode_intel *)); + /* + * Save the microcode patch mc in mc_save_tmp structure if it's a newer + * version. + */ + mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count); + + /* + * Save the mc_save_tmp in global mc_saved_data. + */ + ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count); + if (ret) { + pr_err("Cannot save microcode patch.\n"); + goto out; + } + + show_saved_mc(); + + /* + * Free old saved microcode data. + */ + if (mc_saved) { + for (i = 0; i < mc_saved_count_init; i++) + kfree(mc_saved[i]); + kfree(mc_saved); + } + +out: + mutex_unlock(&x86_cpu_microcode_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(save_mc_for_early); +#endif + +static bool __init load_builtin_intel_microcode(struct cpio_data *cp) +{ +#ifdef CONFIG_X86_64 + unsigned int eax = 0x00000001, ebx, ecx = 0, edx; + unsigned int family, model, stepping; + char name[30]; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + family = __x86_family(eax); + model = x86_model(eax); + stepping = eax & 0xf; + + sprintf(name, "intel-ucode/%02x-%02x-%02x", family, model, stepping); + + return get_builtin_firmware(cp, name); +#else + return false; +#endif +} + +static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; +static __init enum ucode_state +scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, + unsigned long start, unsigned long size, + struct ucode_cpu_info *uci) +{ + struct cpio_data cd; + long offset = 0; +#ifdef CONFIG_X86_32 + char *p = (char *)__pa_nodebug(ucode_name); +#else + char *p = ucode_name; +#endif + + cd.data = NULL; + cd.size = 0; + + cd = find_cpio_data(p, (void *)start, size, &offset); + if (!cd.data) { + if (!load_builtin_intel_microcode(&cd)) + return UCODE_ERROR; + } + + return get_matching_model_microcode(0, start, cd.data, cd.size, + mc_saved_data, initrd, uci); +} + +/* + * Print ucode update info. + */ +static void +print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) +{ + int cpu = smp_processor_id(); + + pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", + cpu, + uci->cpu_sig.rev, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); +} + +#ifdef CONFIG_X86_32 + +static int delay_ucode_info; +static int current_mc_date; + +/* + * Print early updated ucode info after printk works. This is delayed info dump. + */ +void show_ucode_info_early(void) +{ + struct ucode_cpu_info uci; + + if (delay_ucode_info) { + collect_cpu_info_early(&uci); + print_ucode_info(&uci, current_mc_date); + delay_ucode_info = 0; + } +} + +/* + * At this point, we can not call printk() yet. Keep microcode patch number in + * mc_saved_data.mc_saved and delay printing microcode info in + * show_ucode_info_early() until printk() works. 
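+ * + * This runs on 32-bit before paging is enabled, so the two flags below + * are written through their physical addresses.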
+ */ +static void print_ucode(struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_intel; + int *delay_ucode_info_p; + int *current_mc_date_p; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return; + + delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); + current_mc_date_p = (int *)__pa_nodebug(&current_mc_date); + + *delay_ucode_info_p = 1; + *current_mc_date_p = mc_intel->hdr.date; +} +#else + +/* + * Flush global tlb. We only do this in x86_64 where paging has been enabled + * already and PGE should be enabled as well. + */ +static inline void flush_tlb_early(void) +{ + __native_flush_tlb_global_irq_disabled(); +} + +static inline void print_ucode(struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_intel; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return; + + print_ucode_info(uci, mc_intel->hdr.date); +} +#endif + +static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) +{ + struct microcode_intel *mc_intel; + unsigned int val[2]; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return 0; + + /* write microcode via MSR 0x79 */ + native_wrmsr(MSR_IA32_UCODE_WRITE, + (unsigned long) mc_intel->bits, + (unsigned long) mc_intel->bits >> 16 >> 16); + native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + + /* As documented in the SDM: Do a CPUID 1 here */ + sync_core(); + + /* get the current revision from MSR 0x8B */ + native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + if (val[1] != mc_intel->hdr.rev) + return -1; + +#ifdef CONFIG_X86_64 + /* Flush global tlb. This is precaution. */ + flush_tlb_early(); +#endif + uci->cpu_sig.rev = val[1]; + + if (early) + print_ucode(uci); + else + print_ucode_info(uci, mc_intel->hdr.date); + + return 0; +} + +/* + * This function converts microcode patch offsets previously stored in + * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data.
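+ * + * The offsets are relative to the start of the initrd, so adding + * initrd_start turns them back into valid virtual addresses.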
+ */ +int __init save_microcode_in_initrd_intel(void) +{ + unsigned int count = mc_saved_data.mc_saved_count; + struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; + int ret = 0; + + if (count == 0) + return ret; + + copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count); + ret = save_microcode(&mc_saved_data, mc_saved, count); + if (ret) + pr_err("Cannot save microcode patches from initrd.\n"); + + show_saved_mc(); + + return ret; +} + +static void __init +_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, + unsigned long *initrd, + unsigned long start, unsigned long size) +{ + struct ucode_cpu_info uci; + enum ucode_state ret; + + collect_cpu_info_early(&uci); + + ret = scan_microcode(mc_saved_data, initrd, start, size, &uci); + if (ret != UCODE_OK) + return; + + ret = load_microcode(mc_saved_data, initrd, start, &uci); + if (ret != UCODE_OK) + return; + + apply_microcode_early(&uci, true); +} + +void __init load_ucode_intel_bsp(void) +{ + u64 start, size; +#ifdef CONFIG_X86_32 + struct boot_params *p; + + p = (struct boot_params *)__pa_nodebug(&boot_params); + start = p->hdr.ramdisk_image; + size = p->hdr.ramdisk_size; + + _load_ucode_intel_bsp( + (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), + (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), + start, size); +#else + start = boot_params.hdr.ramdisk_image + PAGE_OFFSET; + size = boot_params.hdr.ramdisk_size; + + _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size); +#endif +} + +void load_ucode_intel_ap(void) +{ + struct mc_saved_data *mc_saved_data_p; + struct ucode_cpu_info uci; + unsigned long *mc_saved_in_initrd_p; + unsigned long initrd_start_addr; + enum ucode_state ret; +#ifdef CONFIG_X86_32 + unsigned long *initrd_start_p; + + mc_saved_in_initrd_p = + (unsigned long *)__pa_nodebug(mc_saved_in_initrd); + mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); + initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); + initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p); +#else + mc_saved_data_p = &mc_saved_data; + mc_saved_in_initrd_p = mc_saved_in_initrd; + initrd_start_addr = initrd_start; +#endif + + /* + * If there is no valid ucode previously saved in memory, no need to + * update ucode on this AP. + */ + if (mc_saved_data_p->mc_saved_count == 0) + return; + + collect_cpu_info_early(&uci); + ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, + initrd_start_addr, &uci); + + if (ret != UCODE_OK) + return; + + apply_microcode_early(&uci, true); +} + +void reload_ucode_intel(void) +{ + struct ucode_cpu_info uci; + enum ucode_state ret; + + if (!mc_saved_data.mc_saved_count) + return; + + collect_cpu_info_early(&uci); + + ret = load_microcode_early(mc_saved_data.mc_saved, + mc_saved_data.mc_saved_count, &uci); + if (ret != UCODE_OK) + return; + + apply_microcode_early(&uci, false); +} + static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu_num); diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c deleted file mode 100644 index 37ea89c11520..000000000000 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ /dev/null @@ -1,808 +0,0 @@ -/* - * Intel CPU microcode early update for Linux - * - * Copyright (C) 2012 Fenghua Yu - * H Peter Anvin" - * - * This allows to early upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. 
- * - * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture - * Software Developer's Manual. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -/* - * This needs to be before all headers so that pr_debug in printk.h doesn't turn - * printk calls into no_printk(). - * - *#define DEBUG - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#undef pr_fmt -#define pr_fmt(fmt) "microcode: " fmt - -static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; -static struct mc_saved_data { - unsigned int mc_saved_count; - struct microcode_intel **mc_saved; -} mc_saved_data; - -static enum ucode_state -load_microcode_early(struct microcode_intel **saved, - unsigned int num_saved, struct ucode_cpu_info *uci) -{ - struct microcode_intel *ucode_ptr, *new_mc = NULL; - struct microcode_header_intel *mc_hdr; - int new_rev, ret, i; - - new_rev = uci->cpu_sig.rev; - - for (i = 0; i < num_saved; i++) { - ucode_ptr = saved[i]; - mc_hdr = (struct microcode_header_intel *)ucode_ptr; - - ret = has_newer_microcode(ucode_ptr, - uci->cpu_sig.sig, - uci->cpu_sig.pf, - new_rev); - if (!ret) - continue; - - new_rev = mc_hdr->rev; - new_mc = ucode_ptr; - } - - if (!new_mc) - return UCODE_NFOUND; - - uci->mc = (struct microcode_intel *)new_mc; - return UCODE_OK; -} - -static inline void -copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd, - unsigned long off, int num_saved) -{ - int i; - - for (i = 0; i < num_saved; i++) - mc_saved[i] = (struct microcode_intel *)(initrd[i] + off); -} - -#ifdef CONFIG_X86_32 -static void -microcode_phys(struct microcode_intel **mc_saved_tmp, - struct mc_saved_data *mc_saved_data) -{ - int i; - struct microcode_intel ***mc_saved; - - mc_saved = (struct microcode_intel ***) - __pa_nodebug(&mc_saved_data->mc_saved); - for (i = 0; i < mc_saved_data->mc_saved_count; i++) { - struct microcode_intel *p; - - p = *(struct microcode_intel **) - __pa_nodebug(mc_saved_data->mc_saved + i); - mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); - } -} -#endif - -static enum ucode_state -load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, - unsigned long initrd_start, struct ucode_cpu_info *uci) -{ - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int count = mc_saved_data->mc_saved_count; - - if (!mc_saved_data->mc_saved) { - copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count); - - return load_microcode_early(mc_saved_tmp, count, uci); - } else { -#ifdef CONFIG_X86_32 - microcode_phys(mc_saved_tmp, mc_saved_data); - return load_microcode_early(mc_saved_tmp, count, uci); -#else - return load_microcode_early(mc_saved_data->mc_saved, - count, uci); -#endif - } -} - -/* - * Given CPU signature and a microcode patch, this function finds if the - * microcode patch has matching family and model with the CPU. 
- */ -static enum ucode_state -matching_model_microcode(struct microcode_header_intel *mc_header, - unsigned long sig) -{ - unsigned int fam, model; - unsigned int fam_ucode, model_ucode; - struct extended_sigtable *ext_header; - unsigned long total_size = get_totalsize(mc_header); - unsigned long data_size = get_datasize(mc_header); - int ext_sigcount, i; - struct extended_signature *ext_sig; - - fam = __x86_family(sig); - model = x86_model(sig); - - fam_ucode = __x86_family(mc_header->sig); - model_ucode = x86_model(mc_header->sig); - - if (fam == fam_ucode && model == model_ucode) - return UCODE_OK; - - /* Look for ext. headers: */ - if (total_size <= data_size + MC_HEADER_SIZE) - return UCODE_NFOUND; - - ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; - ext_sigcount = ext_header->count; - - for (i = 0; i < ext_sigcount; i++) { - fam_ucode = __x86_family(ext_sig->sig); - model_ucode = x86_model(ext_sig->sig); - - if (fam == fam_ucode && model == model_ucode) - return UCODE_OK; - - ext_sig++; - } - return UCODE_NFOUND; -} - -static int -save_microcode(struct mc_saved_data *mc_saved_data, - struct microcode_intel **mc_saved_src, - unsigned int mc_saved_count) -{ - int i, j; - struct microcode_intel **saved_ptr; - int ret; - - if (!mc_saved_count) - return -EINVAL; - - /* - * Copy new microcode data. - */ - saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL); - if (!saved_ptr) - return -ENOMEM; - - for (i = 0; i < mc_saved_count; i++) { - struct microcode_header_intel *mc_hdr; - struct microcode_intel *mc; - unsigned long size; - - if (!mc_saved_src[i]) { - ret = -EINVAL; - goto err; - } - - mc = mc_saved_src[i]; - mc_hdr = &mc->hdr; - size = get_totalsize(mc_hdr); - - saved_ptr[i] = kmalloc(size, GFP_KERNEL); - if (!saved_ptr[i]) { - ret = -ENOMEM; - goto err; - } - - memcpy(saved_ptr[i], mc, size); - } - - /* - * Point to newly saved microcode. - */ - mc_saved_data->mc_saved = saved_ptr; - mc_saved_data->mc_saved_count = mc_saved_count; - - return 0; - -err: - for (j = 0; j <= i; j++) - kfree(saved_ptr[j]); - kfree(saved_ptr); - - return ret; -} - -/* - * A microcode patch in ucode_ptr is saved into mc_saved - * - if it has matching signature and newer revision compared to an existing - * patch mc_saved. - * - or if it is a newly discovered microcode patch. - * - * The microcode patch should have matching model with CPU. - * - * Returns: The updated number @num_saved of saved microcode patches. - */ -static unsigned int _save_mc(struct microcode_intel **mc_saved, - u8 *ucode_ptr, unsigned int num_saved) -{ - struct microcode_header_intel *mc_hdr, *mc_saved_hdr; - unsigned int sig, pf; - int found = 0, i; - - mc_hdr = (struct microcode_header_intel *)ucode_ptr; - - for (i = 0; i < num_saved; i++) { - mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i]; - sig = mc_saved_hdr->sig; - pf = mc_saved_hdr->pf; - - if (!find_matching_signature(ucode_ptr, sig, pf)) - continue; - - found = 1; - - if (mc_hdr->rev <= mc_saved_hdr->rev) - continue; - - /* - * Found an older ucode saved earlier. Replace it with - * this newer one. - */ - mc_saved[i] = (struct microcode_intel *)ucode_ptr; - break; - } - - /* Newly detected microcode, save it to memory. */ - if (i >= num_saved && !found) - mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr; - - return num_saved; -} - -/* - * Get microcode matching with BSP's model. Only CPUs with the same model as - * BSP can stay in the platform. 
- */ -static enum ucode_state __init -get_matching_model_microcode(int cpu, unsigned long start, - void *data, size_t size, - struct mc_saved_data *mc_saved_data, - unsigned long *mc_saved_in_initrd, - struct ucode_cpu_info *uci) -{ - u8 *ucode_ptr = data; - unsigned int leftover = size; - enum ucode_state state = UCODE_OK; - unsigned int mc_size; - struct microcode_header_intel *mc_header; - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int mc_saved_count = mc_saved_data->mc_saved_count; - int i; - - while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) { - - if (leftover < sizeof(mc_header)) - break; - - mc_header = (struct microcode_header_intel *)ucode_ptr; - - mc_size = get_totalsize(mc_header); - if (!mc_size || mc_size > leftover || - microcode_sanity_check(ucode_ptr, 0) < 0) - break; - - leftover -= mc_size; - - /* - * Since APs with same family and model as the BSP may boot in - * the platform, we need to find and save microcode patches - * with the same family and model as the BSP. - */ - if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != - UCODE_OK) { - ucode_ptr += mc_size; - continue; - } - - mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count); - - ucode_ptr += mc_size; - } - - if (leftover) { - state = UCODE_ERROR; - goto out; - } - - if (mc_saved_count == 0) { - state = UCODE_NFOUND; - goto out; - } - - for (i = 0; i < mc_saved_count; i++) - mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; - - mc_saved_data->mc_saved_count = mc_saved_count; -out: - return state; -} - -static int collect_cpu_info_early(struct ucode_cpu_info *uci) -{ - unsigned int val[2]; - unsigned int family, model; - struct cpu_signature csig; - unsigned int eax, ebx, ecx, edx; - - csig.sig = 0; - csig.pf = 0; - csig.rev = 0; - - memset(uci, 0, sizeof(*uci)); - - eax = 0x00000001; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - csig.sig = eax; - - family = __x86_family(csig.sig); - model = x86_model(csig.sig); - - if ((model >= 5) || (family > 6)) { - /* get processor flags from MSR 0x17 */ - native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - csig.pf = 1 << ((val[1] >> 18) & 7); - } - native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); - - /* As documented in the SDM: Do a CPUID 1 here */ - sync_core(); - - /* get the current revision from MSR 0x8B */ - native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - - csig.rev = val[1]; - - uci->cpu_sig = csig; - uci->valid = 1; - - return 0; -} - -#ifdef DEBUG -static void show_saved_mc(void) -{ - int i, j; - unsigned int sig, pf, rev, total_size, data_size, date; - struct ucode_cpu_info uci; - - if (mc_saved_data.mc_saved_count == 0) { - pr_debug("no microcode data saved.\n"); - return; - } - pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count); - - collect_cpu_info_early(&uci); - - sig = uci.cpu_sig.sig; - pf = uci.cpu_sig.pf; - rev = uci.cpu_sig.rev; - pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev); - - for (i = 0; i < mc_saved_data.mc_saved_count; i++) { - struct microcode_header_intel *mc_saved_header; - struct extended_sigtable *ext_header; - int ext_sigcount; - struct extended_signature *ext_sig; - - mc_saved_header = (struct microcode_header_intel *) - mc_saved_data.mc_saved[i]; - sig = mc_saved_header->sig; - pf = mc_saved_header->pf; - rev = mc_saved_header->rev; - total_size = get_totalsize(mc_saved_header); - data_size = get_datasize(mc_saved_header); - date = mc_saved_header->date; - - pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, date = 
%04x-%02x-%02x\n", - i, sig, pf, rev, total_size, - date & 0xffff, - date >> 24, - (date >> 16) & 0xff); - - /* Look for ext. headers: */ - if (total_size <= data_size + MC_HEADER_SIZE) - continue; - - ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE; - ext_sigcount = ext_header->count; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; - - for (j = 0; j < ext_sigcount; j++) { - sig = ext_sig->sig; - pf = ext_sig->pf; - - pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n", - j, sig, pf); - - ext_sig++; - } - - } -} -#else -static inline void show_saved_mc(void) -{ -} -#endif - -#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU) -static DEFINE_MUTEX(x86_cpu_microcode_mutex); -/* - * Save this mc into mc_saved_data. So it will be loaded early when a CPU is - * hot added or resumes. - * - * Please make sure this mc should be a valid microcode patch before calling - * this function. - */ -int save_mc_for_early(u8 *mc) -{ - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int mc_saved_count_init; - unsigned int mc_saved_count; - struct microcode_intel **mc_saved; - int ret = 0; - int i; - - /* - * Hold hotplug lock so mc_saved_data is not accessed by a CPU in - * hotplug. - */ - mutex_lock(&x86_cpu_microcode_mutex); - - mc_saved_count_init = mc_saved_data.mc_saved_count; - mc_saved_count = mc_saved_data.mc_saved_count; - mc_saved = mc_saved_data.mc_saved; - - if (mc_saved && mc_saved_count) - memcpy(mc_saved_tmp, mc_saved, - mc_saved_count * sizeof(struct microcode_intel *)); - /* - * Save the microcode patch mc in mc_save_tmp structure if it's a newer - * version. - */ - mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count); - - /* - * Save the mc_save_tmp in global mc_saved_data. - */ - ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count); - if (ret) { - pr_err("Cannot save microcode patch.\n"); - goto out; - } - - show_saved_mc(); - - /* - * Free old saved microcode data. - */ - if (mc_saved) { - for (i = 0; i < mc_saved_count_init; i++) - kfree(mc_saved[i]); - kfree(mc_saved); - } - -out: - mutex_unlock(&x86_cpu_microcode_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(save_mc_for_early); -#endif - -static bool __init load_builtin_intel_microcode(struct cpio_data *cp) -{ -#ifdef CONFIG_X86_64 - unsigned int eax = 0x00000001, ebx, ecx = 0, edx; - unsigned int family, model, stepping; - char name[30]; - - native_cpuid(&eax, &ebx, &ecx, &edx); - - family = __x86_family(eax); - model = x86_model(eax); - stepping = eax & 0xf; - - sprintf(name, "intel-ucode/%02x-%02x-%02x", family, model, stepping); - - return get_builtin_firmware(cp, name); -#else - return false; -#endif -} - -static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; -static __init enum ucode_state -scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, - unsigned long start, unsigned long size, - struct ucode_cpu_info *uci) -{ - struct cpio_data cd; - long offset = 0; -#ifdef CONFIG_X86_32 - char *p = (char *)__pa_nodebug(ucode_name); -#else - char *p = ucode_name; -#endif - - cd.data = NULL; - cd.size = 0; - - cd = find_cpio_data(p, (void *)start, size, &offset); - if (!cd.data) { - if (!load_builtin_intel_microcode(&cd)) - return UCODE_ERROR; - } - - return get_matching_model_microcode(0, start, cd.data, cd.size, - mc_saved_data, initrd, uci); -} - -/* - * Print ucode update info. 
- */ -static void -print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) -{ - int cpu = smp_processor_id(); - - pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", - cpu, - uci->cpu_sig.rev, - date & 0xffff, - date >> 24, - (date >> 16) & 0xff); -} - -#ifdef CONFIG_X86_32 - -static int delay_ucode_info; -static int current_mc_date; - -/* - * Print early updated ucode info after printk works. This is delayed info dump. - */ -void show_ucode_info_early(void) -{ - struct ucode_cpu_info uci; - - if (delay_ucode_info) { - collect_cpu_info_early(&uci); - print_ucode_info(&uci, current_mc_date); - delay_ucode_info = 0; - } -} - -/* - * At this point, we can not call printk() yet. Keep microcode patch number in - * mc_saved_data.mc_saved and delay printing microcode info in - * show_ucode_info_early() until printk() works. - */ -static void print_ucode(struct ucode_cpu_info *uci) -{ - struct microcode_intel *mc_intel; - int *delay_ucode_info_p; - int *current_mc_date_p; - - mc_intel = uci->mc; - if (mc_intel == NULL) - return; - - delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); - current_mc_date_p = (int *)__pa_nodebug(¤t_mc_date); - - *delay_ucode_info_p = 1; - *current_mc_date_p = mc_intel->hdr.date; -} -#else - -/* - * Flush global tlb. We only do this in x86_64 where paging has been enabled - * already and PGE should be enabled as well. - */ -static inline void flush_tlb_early(void) -{ - __native_flush_tlb_global_irq_disabled(); -} - -static inline void print_ucode(struct ucode_cpu_info *uci) -{ - struct microcode_intel *mc_intel; - - mc_intel = uci->mc; - if (mc_intel == NULL) - return; - - print_ucode_info(uci, mc_intel->hdr.date); -} -#endif - -static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) -{ - struct microcode_intel *mc_intel; - unsigned int val[2]; - - mc_intel = uci->mc; - if (mc_intel == NULL) - return 0; - - /* write microcode via MSR 0x79 */ - native_wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) mc_intel->bits, - (unsigned long) mc_intel->bits >> 16 >> 16); - native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); - - /* As documented in the SDM: Do a CPUID 1 here */ - sync_core(); - - /* get the current revision from MSR 0x8B */ - native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (val[1] != mc_intel->hdr.rev) - return -1; - -#ifdef CONFIG_X86_64 - /* Flush global tlb. This is precaution. */ - flush_tlb_early(); -#endif - uci->cpu_sig.rev = val[1]; - - if (early) - print_ucode(uci); - else - print_ucode_info(uci, mc_intel->hdr.date); - - return 0; -} - -/* - * This function converts microcode patch offsets previously stored in - * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data. 
- */ -int __init save_microcode_in_initrd_intel(void) -{ - unsigned int count = mc_saved_data.mc_saved_count; - struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; - int ret = 0; - - if (count == 0) - return ret; - - copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count); - ret = save_microcode(&mc_saved_data, mc_saved, count); - if (ret) - pr_err("Cannot save microcode patches from initrd.\n"); - - show_saved_mc(); - - return ret; -} - -static void __init -_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, - unsigned long *initrd, - unsigned long start, unsigned long size) -{ - struct ucode_cpu_info uci; - enum ucode_state ret; - - collect_cpu_info_early(&uci); - - ret = scan_microcode(mc_saved_data, initrd, start, size, &uci); - if (ret != UCODE_OK) - return; - - ret = load_microcode(mc_saved_data, initrd, start, &uci); - if (ret != UCODE_OK) - return; - - apply_microcode_early(&uci, true); -} - -void __init load_ucode_intel_bsp(void) -{ - u64 start, size; -#ifdef CONFIG_X86_32 - struct boot_params *p; - - p = (struct boot_params *)__pa_nodebug(&boot_params); - start = p->hdr.ramdisk_image; - size = p->hdr.ramdisk_size; - - _load_ucode_intel_bsp( - (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), - (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), - start, size); -#else - start = boot_params.hdr.ramdisk_image + PAGE_OFFSET; - size = boot_params.hdr.ramdisk_size; - - _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size); -#endif -} - -void load_ucode_intel_ap(void) -{ - struct mc_saved_data *mc_saved_data_p; - struct ucode_cpu_info uci; - unsigned long *mc_saved_in_initrd_p; - unsigned long initrd_start_addr; - enum ucode_state ret; -#ifdef CONFIG_X86_32 - unsigned long *initrd_start_p; - - mc_saved_in_initrd_p = - (unsigned long *)__pa_nodebug(mc_saved_in_initrd); - mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); - initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); - initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p); -#else - mc_saved_data_p = &mc_saved_data; - mc_saved_in_initrd_p = mc_saved_in_initrd; - initrd_start_addr = initrd_start; -#endif - - /* - * If there is no valid ucode previously saved in memory, no need to - * update ucode on this AP. - */ - if (mc_saved_data_p->mc_saved_count == 0) - return; - - collect_cpu_info_early(&uci); - ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, - initrd_start_addr, &uci); - - if (ret != UCODE_OK) - return; - - apply_microcode_early(&uci, true); -} - -void reload_ucode_intel(void) -{ - struct ucode_cpu_info uci; - enum ucode_state ret; - - if (!mc_saved_data.mc_saved_count) - return; - - collect_cpu_info_early(&uci); - - ret = load_microcode_early(mc_saved_data.mc_saved, - mc_saved_data.mc_saved_count, &uci); - if (ret != UCODE_OK) - return; - - apply_microcode_early(&uci, false); -} diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 0e2d96ffd158..6bc9ae24b6d2 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -152,7 +152,7 @@ ENTRY(startup_32) movl %eax, pa(olpc_ofw_pgd) #endif -#ifdef CONFIG_MICROCODE_EARLY +#ifdef CONFIG_MICROCODE /* Early load ucode on BSP. */ call load_ucode_bsp #endif @@ -311,12 +311,11 @@ ENTRY(startup_32_smp) movl %eax,%ss leal -__PAGE_OFFSET(%ecx),%esp -#ifdef CONFIG_MICROCODE_EARLY +#ifdef CONFIG_MICROCODE /* Early load ucode on AP. 
*/ call load_ucode_ap #endif - default_entry: #define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 1d8a83df153a..1f37cb2b56a9 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -693,14 +693,12 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { -#ifdef CONFIG_MICROCODE_EARLY /* * Remember, initrd memory may contain microcode or other useful things. * Before we lose initrd mem, we need to find a place to hold them * now that normal virtual memory is enabled. */ save_microcode_in_initrd(); -#endif /* * end could be not aligned, and We can not align that, -- cgit v1.2.3 From 6b26e1bf66bb4bf1b1b9b4f27d1f324875689cf0 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 20 Oct 2015 11:54:46 +0200 Subject: x86/microcode: Remove modularization leftovers Remove the remaining module functionality leftovers. Make "dis_ucode_ldr" an early_param and make it static again. Drop module aliases, autoloading table, description, etc. Bump version number, while at it. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Jones Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1445334889-300-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/microcode.h | 1 - arch/x86/kernel/cpu/microcode/amd.c | 7 +----- arch/x86/kernel/cpu/microcode/core.c | 36 +++++++++---------------------- arch/x86/kernel/cpu/microcode/intel.c | 7 +----- arch/x86/kernel/cpu/microcode/intel_lib.c | 1 - 5 files changed, 12 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 9f953f7851bb..34e62b1dcfce 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -27,7 +27,6 @@ struct cpu_signature { struct device; enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; -extern bool dis_ucode_ldr; struct microcode_ops { enum ucode_state (*request_microcode_user) (int cpu, diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 20297fbb7355..6eeda7b024c3 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -24,7 +24,7 @@ * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define pr_fmt(fmt) "microcode: " fmt #include #include @@ -32,7 +32,6 @@ #include #include #include -#include #include #include @@ -42,10 +41,6 @@ #include #include -MODULE_DESCRIPTION("AMD Microcode Update Driver"); -MODULE_AUTHOR("Peter Oruba"); -MODULE_LICENSE("GPL v2"); - static struct equiv_cpu_entry *equiv_cpu_table; struct ucode_patch { diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 18848c7de058..7fc27f1cca58 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -19,7 +19,7 @@ * 2 of the License, or (at your option) any later version. 
*/ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define pr_fmt(fmt) "microcode: " fmt #include #include @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -41,16 +40,18 @@ #include #include -MODULE_DESCRIPTION("Microcode Update Driver"); -MODULE_AUTHOR("Tigran Aivazian "); -MODULE_LICENSE("GPL"); - -#define MICROCODE_VERSION "2.00" +#define MICROCODE_VERSION "2.01" static struct microcode_ops *microcode_ops; -bool dis_ucode_ldr; -module_param(dis_ucode_ldr, bool, 0); +static bool dis_ucode_ldr; + +static int __init disable_loader(char *str) +{ + dis_ucode_ldr = true; + return 1; +} +__setup("dis_ucode_ldr", disable_loader); /* * Synchronization. @@ -364,9 +365,6 @@ static void __exit microcode_dev_exit(void) { misc_deregister(µcode_dev); } - -MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); -MODULE_ALIAS("devname:cpu/microcode"); #else #define microcode_dev_init() 0 #define microcode_dev_exit() do { } while (0) @@ -617,20 +615,6 @@ static struct notifier_block mc_cpu_notifier = { .notifier_call = mc_cpu_callback, }; -#ifdef MODULE -/* Autoload on Intel and AMD systems */ -static const struct x86_cpu_id __initconst microcode_id[] = { -#ifdef CONFIG_MICROCODE_INTEL - { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, }, -#endif -#ifdef CONFIG_MICROCODE_AMD - { X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, }, -#endif - {} -}; -MODULE_DEVICE_TABLE(x86cpu, microcode_id); -#endif - static struct attribute *cpu_root_microcode_attrs[] = { &dev_attr_reload.attr, NULL diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 2e09171e5338..3f325586965e 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -21,7 +21,7 @@ * *#define DEBUG */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define pr_fmt(fmt) "microcode: " fmt #include #include @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include @@ -40,10 +39,6 @@ #include #include -MODULE_DESCRIPTION("Microcode Update Driver"); -MODULE_AUTHOR("Tigran Aivazian "); -MODULE_LICENSE("GPL"); - static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; static struct mc_saved_data { unsigned int mc_saved_count; diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index 1883d252ff7d..b96896bcbdaf 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From 6f7fc44bf1eef6768f9dcb527c737ab24a3203ac Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 20 Oct 2015 11:54:47 +0200 Subject: x86/microcode/amd: Remove maintainers from comments We have the MAINTAINERS file for that. Also, Andreas doesn't have the time for this work anymore. Signed-off-by: Borislav Petkov Cc: Andreas Herrmann Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Jones Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J. 
Wysocki Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1445334889-300-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/amd.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 6eeda7b024c3..2233f8a76615 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -11,10 +11,6 @@ * Based on work by: * Tigran Aivazian * - * Maintainers: - * Andreas Herrmann - * Borislav Petkov - * * early loader: * Copyright (C) 2013 Advanced Micro Devices, Inc. * -- cgit v1.2.3 From c595ac2bac930ce79f336c7a7e45e1ea38abfe16 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 20 Oct 2015 11:54:48 +0200 Subject: x86/microcode/intel: Move #ifdef DEBUG inside the function ... and save us the stub. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Jones Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1445334889-300-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/intel.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 3f325586965e..ce47402eb2f9 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -389,9 +389,9 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) return 0; } -#ifdef DEBUG static void show_saved_mc(void) { +#ifdef DEBUG int i, j; unsigned int sig, pf, rev, total_size, data_size, date; struct ucode_cpu_info uci; @@ -449,12 +449,8 @@ static void show_saved_mc(void) } } -} -#else -static inline void show_saved_mc(void) -{ -} #endif +} #ifdef CONFIG_HOTPLUG_CPU static DEFINE_MUTEX(x86_cpu_microcode_mutex); -- cgit v1.2.3 From 8affb487d4a4e223d961d7034cb41cd31982b618 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 9 Oct 2015 12:23:34 +0200 Subject: x86/PCI: Don't alloc pcibios-irq when MSI is enabled The pcibios-irq and MSI both use dev->irq to store the IRQ number. While the MSI code checks for that and frees the pcibios-irq before overwriting dev->irq, the pcibios_alloc_irq() function does not. Usually this is not a problem, as the pcibios-irq is allocated before probe time of the device and the MSI IRQ is allocted from the driver's probe path. But there are PCI devices handled by the core kernel and not by a standard PCI driver, like the AMD IOMMU for example. For the AMD IOMMU a normal PCI device driver does not make sense, because a driver can be forcibly unbound from its device, which is not a good idea for an IOMMU. Nevertheless the PCI core code tries to match the PCI device implementing the AMD IOMMU against drivers, and allocates/frees a pcibios IRQ every time it tries out a new driver. This overwrites the dev->irq field set by pci_enable_msi() and sets it to 0 in the end (because the probe fails and the pcibios-irq is freed again). On suspend/resume this breaks the kernel, because the IRQ descriptor for IRQ 0 is NULL. Fix this by not allocating a pcibios-irq when MSI is already active. This also has the benefit, that a device claimed by the core kernel can not be probed by a PCI driver later. 
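For illustration, the clobbering sequence described above boils down to the
following ordering (a simplified sketch, not part of the patch; the
driver-matching loop in the PCI core is paraphrased):

    /* Sketch of the failure mode -- paraphrased, not kernel code. */
    pci_enable_msi(dev);     /* dev->irq now holds the MSI vector      */

    /* The PCI core later tries to bind a driver to the same device: */
    pcibios_alloc_irq(dev);  /* overwrites dev->irq with a pcibios IRQ */
    /* ... probe fails ... */
    pcibios_free_irq(dev);   /* dev->irq ends up 0                     */

    /* Suspend/resume then looks up the descriptor for IRQ 0: oops.   */

With the check added below, pcibios_alloc_irq() bails out with -EBUSY in
that sequence and dev->irq is left alone.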
Reported-by: Borislav Petkov Signed-off-by: Joerg Roedel Signed-off-by: Bjorn Helgaas Reviewed-by: Thomas Gleixner Cc: Jiang Liu --- arch/x86/pci/common.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 09d3afc0a181..b106c67f88c5 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -674,6 +674,14 @@ int pcibios_add_device(struct pci_dev *dev) int pcibios_alloc_irq(struct pci_dev *dev) { + /* + * If the PCI device was already claimed by core code and has + * MSI enabled, probing of the pcibios IRQ will overwrite + * dev->irq. So bail out if MSI is already enabled. + */ + if (pci_dev_msi_enabled(dev)) + return -EBUSY; + return pcibios_enable_irq(dev); } -- cgit v1.2.3 From 883a1e867e0fe7c2dc2e5844ef692f80177631d5 Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Thu, 17 Sep 2015 00:19:42 +0800 Subject: ftrace: Calculate the correct dyn_ftrace number to report to the userspace Now, ftrace only calculate the dyn_ftrace number in the adding breakpoint loop, not in adding update and finish update loop. Calculate the correct dyn_ftrace, once ftrace reports the failure message to the userspace. Link: http://lkml.kernel.org/r/1442420382-13130-1-git-send-email-mnfhuang@gmail.com Signed-off-by: Minfei Huang Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 8b7b0a51e742..311bcf338f07 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -556,6 +556,7 @@ void ftrace_replace_code(int enable) run_sync(); report = "updating code"; + count = 0; for_ftrace_rec_iter(iter) { rec = ftrace_rec_iter_record(iter); @@ -563,11 +564,13 @@ void ftrace_replace_code(int enable) ret = add_update(rec, enable); if (ret) goto remove_breakpoints; + count++; } run_sync(); report = "removing breakpoints"; + count = 0; for_ftrace_rec_iter(iter) { rec = ftrace_rec_iter_record(iter); @@ -575,6 +578,7 @@ void ftrace_replace_code(int enable) ret = finish_update(rec, enable); if (ret) goto remove_breakpoints; + count++; } run_sync(); -- cgit v1.2.3 From 3217f7c25bca66eed9b07f0b8bfd1937169b0736 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Thu, 27 Aug 2015 16:41:15 +0200 Subject: KVM: Add kvm_arch_vcpu_{un}blocking callbacks Some times it is useful for architecture implementations of KVM to know when the VCPU thread is about to block or when it comes back from blocking (arm/arm64 needs to know this to properly implement timers, for example). Therefore provide a generic architecture callback function in line with what we do elsewhere for KVM generic-arch interactions. 
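As an illustration of the intended use, an architecture that cares about
blocking would fill in the new hooks roughly like this (a hypothetical
sketch -- the helper names are invented, only the hook signatures come
from this patch):

    void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
    {
            /* VCPU thread is about to sleep: e.g. migrate the virtual
             * timer to a software timer so an expiry still wakes us. */
            arch_timer_schedule_bg(vcpu);   /* invented helper */
    }

    void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
    {
            /* VCPU is runnable again: restore hardware timer state. */
            arch_timer_cancel_bg(vcpu);     /* invented helper */
    }

All other architectures keep the empty inline stubs added below.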
Reviewed-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 3 +++ arch/arm64/include/asm/kvm_host.h | 3 +++ arch/mips/include/asm/kvm_host.h | 2 ++ arch/powerpc/include/asm/kvm_host.h | 2 ++ arch/s390/include/asm/kvm_host.h | 2 ++ arch/x86/include/asm/kvm_host.h | 3 +++ include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 3 +++ 8 files changed, 20 insertions(+) (limited to 'arch/x86') diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index c4072d9f32c7..84da97901f1f 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -234,4 +234,7 @@ static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index ed039688c221..e4f4d65f7d2b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -255,4 +255,7 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 5a1a882e0a75..6ded8d347af9 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -847,5 +847,7 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} #endif /* __MIPS_KVM_HOST_H__ */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 827a38d7a9db..c9f122d00920 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -718,5 +718,7 @@ static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslot static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_exit(void) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} #endif /* __POWERPC_KVM_HOST_H__ */ diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 8ced426091e1..72a614c68ed8 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -644,5 +644,7 @@ static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslot static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} #endif diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 
2beee0382088..b28f0f142ecb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1233,4 +1233,7 @@ int x86_set_memory_region(struct kvm *kvm, bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1bef9e21e725..4a86f5f072c0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -625,6 +625,8 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); void kvm_vcpu_block(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8db1d9361993..7873d6daccb1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2018,6 +2018,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) } while (single_task_running() && ktime_before(cur, stop)); } + kvm_arch_vcpu_blocking(vcpu); + for (;;) { prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); @@ -2031,6 +2033,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) finish_wait(&vcpu->wq, &wait); cur = ktime_get(); + kvm_arch_vcpu_unblocking(vcpu); out: block_ns = ktime_to_ns(cur) - ktime_to_ns(start); -- cgit v1.2.3 From f5775e0b6116b7e2425ccf535243b21768566d87 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 19 Jan 2015 11:08:05 +0000 Subject: x86/xen: discard RAM regions above the maximum reservation During setup, discard RAM regions that are above the maximum reservation (instead of marking them as E820_UNUSABLE). This allows hotplug memory to be placed at these addresses. Signed-off-by: David Vrabel Reviewed-by: Daniel Kiper --- arch/x86/xen/setup.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 1c30e4ab1022..387b60d9bd0e 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -829,6 +829,8 @@ char * __init xen_memory_setup(void) addr = xen_e820_map[0].addr; size = xen_e820_map[0].size; while (i < xen_e820_map_entries) { + bool discard = false; + chunk_size = size; type = xen_e820_map[i].type; @@ -843,10 +845,11 @@ char * __init xen_memory_setup(void) xen_add_extra_mem(pfn_s, n_pfns); xen_max_p2m_pfn = pfn_s + n_pfns; } else - type = E820_UNUSABLE; + discard = true; } - xen_align_and_add_e820_region(addr, chunk_size, type); + if (!discard) + xen_align_and_add_e820_region(addr, chunk_size, type); addr += chunk_size; size -= chunk_size; -- cgit v1.2.3 From 81b286e0f1fe520f2a96f736ffa7e508ac9139ba Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 25 Jun 2015 13:12:46 +0100 Subject: xen/balloon: make alloc_xenballoon_pages() always allocate low pages All users of alloc_xenballoon_pages() wanted low memory pages, so remove the option for high memory. 
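After this change a caller simply asks for pages and always gets lowmem;
a minimal usage sketch (hypothetical caller, error handling trimmed):

    struct page *pages[8];
    int rc;

    rc = alloc_xenballooned_pages(8, pages);   /* no highmem flag anymore */
    if (rc)
            return rc;
    /* ... map grants into the (always lowmem) pages ... */
    free_xenballooned_pages(8, pages);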
Signed-off-by: David Vrabel Reviewed-by: Daniel Kiper --- arch/x86/xen/grant-table.c | 2 +- drivers/xen/balloon.c | 21 ++++++++------------- drivers/xen/grant-table.c | 2 +- drivers/xen/privcmd.c | 2 +- drivers/xen/xenbus/xenbus_client.c | 3 +-- include/xen/balloon.h | 3 +-- 6 files changed, 13 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 1580e7a5a4cf..e079500b17f3 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -133,7 +133,7 @@ static int __init xlated_setup_gnttab_pages(void) kfree(pages); return -ENOMEM; } - rc = alloc_xenballooned_pages(nr_grant_frames, pages, 0 /* lowmem */); + rc = alloc_xenballooned_pages(nr_grant_frames, pages); if (rc) { pr_warn("%s Couldn't balloon alloc %ld pfns rc:%d\n", __func__, nr_grant_frames, rc); diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index ac6391bd8029..7ec933d505d2 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -136,17 +136,16 @@ static void balloon_append(struct page *page) } /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */ -static struct page *balloon_retrieve(bool prefer_highmem) +static struct page *balloon_retrieve(bool require_lowmem) { struct page *page; if (list_empty(&ballooned_pages)) return NULL; - if (prefer_highmem) - page = list_entry(ballooned_pages.prev, struct page, lru); - else - page = list_entry(ballooned_pages.next, struct page, lru); + page = list_entry(ballooned_pages.next, struct page, lru); + if (require_lowmem && PageHighMem(page)) + return NULL; list_del(&page->lru); if (PageHighMem(page)) @@ -521,24 +520,20 @@ EXPORT_SYMBOL_GPL(balloon_set_new_target); * alloc_xenballooned_pages - get pages that have been ballooned out * @nr_pages: Number of pages to get * @pages: pages returned - * @highmem: allow highmem pages * @return 0 on success, error otherwise */ -int alloc_xenballooned_pages(int nr_pages, struct page **pages, bool highmem) +int alloc_xenballooned_pages(int nr_pages, struct page **pages) { int pgno = 0; struct page *page; mutex_lock(&balloon_mutex); while (pgno < nr_pages) { - page = balloon_retrieve(highmem); - if (page && (highmem || !PageHighMem(page))) { + page = balloon_retrieve(true); + if (page) { pages[pgno++] = page; } else { enum bp_state st; - if (page) - balloon_append(page); - st = decrease_reservation(nr_pages - pgno, - highmem ? 
GFP_HIGHUSER : GFP_USER); + st = decrease_reservation(nr_pages - pgno, GFP_USER); if (st != BP_DONE) goto out_undo; } diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 62f591f8763c..a4b702c9ac68 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -687,7 +687,7 @@ int gnttab_alloc_pages(int nr_pages, struct page **pages) int i; int ret; - ret = alloc_xenballooned_pages(nr_pages, pages, false); + ret = alloc_xenballooned_pages(nr_pages, pages); if (ret < 0) return ret; diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 5e9adac928e6..b199ad3d4587 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -401,7 +401,7 @@ static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs) if (pages == NULL) return -ENOMEM; - rc = alloc_xenballooned_pages(numpgs, pages, 0); + rc = alloc_xenballooned_pages(numpgs, pages); if (rc != 0) { pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__, numpgs, rc); diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 2ba09c1195c8..aa304d05101b 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -614,8 +614,7 @@ static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev, if (!node) return -ENOMEM; - err = alloc_xenballooned_pages(nr_grefs, node->hvm.pages, - false /* lowmem */); + err = alloc_xenballooned_pages(nr_grefs, node->hvm.pages); if (err) goto out_err; diff --git a/include/xen/balloon.h b/include/xen/balloon.h index c8aee7a8b8d2..83efdeb243bf 100644 --- a/include/xen/balloon.h +++ b/include/xen/balloon.h @@ -22,8 +22,7 @@ extern struct balloon_stats balloon_stats; void balloon_set_new_target(unsigned long target); -int alloc_xenballooned_pages(int nr_pages, struct page **pages, - bool highmem); +int alloc_xenballooned_pages(int nr_pages, struct page **pages); void free_xenballooned_pages(int nr_pages, struct page **pages); struct device; -- cgit v1.2.3 From 8edfcf882eb91ec9028c7334f90f6ef3db5b0fcf Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Wed, 22 Jul 2015 14:48:09 +0100 Subject: x86/xen: export xen_alloc_p2m_entry() Rename alloc_p2m() to xen_alloc_p2m_entry() and export it. This is useful for ensuring that a p2m entry is allocated (i.e., not a shared missing or identity entry) so that subsequent set_phys_to_machine() calls will require no further allocations. Signed-off-by: David Vrabel Reviewed-by: Daniel Kiper --- v3: - Make xen_alloc_p2m_entry() a nop on auto-xlate guests. 
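A caller would use the newly exported function to pre-populate the p2m
for a PFN range, e.g. (hypothetical sketch, not part of the patch):

    unsigned long i;
    int ret;

    /* Ensure later set_phys_to_machine() calls need no allocation. */
    for (i = 0; i < nr_pfns; i++) {
            ret = xen_alloc_p2m_entry(pfn + i);
            if (ret < 0)
                    return ret;
    }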
--- arch/x86/include/asm/xen/page.h | 2 ++ arch/x86/xen/p2m.c | 19 +++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 0679e11d2cf7..b922fa4bb4a1 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -43,6 +43,8 @@ extern unsigned long *xen_p2m_addr; extern unsigned long xen_p2m_size; extern unsigned long xen_max_p2m_pfn; +extern int xen_alloc_p2m_entry(unsigned long pfn); + extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 660b3cfef234..cab9f766bb06 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -530,7 +530,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg) * the new pages are installed with cmpxchg; if we lose the race then * simply free the page we allocated and use the one that's there. */ -static bool alloc_p2m(unsigned long pfn) +int xen_alloc_p2m_entry(unsigned long pfn) { unsigned topidx; unsigned long *top_mfn_p, *mid_mfn; @@ -540,6 +540,9 @@ static bool alloc_p2m(unsigned long pfn) unsigned long addr = (unsigned long)(xen_p2m_addr + pfn); unsigned long p2m_pfn; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + ptep = lookup_address(addr, &level); BUG_ON(!ptep || level != PG_LEVEL_4K); pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1)); @@ -548,7 +551,7 @@ static bool alloc_p2m(unsigned long pfn) /* PMD level is missing, allocate a new one */ ptep = alloc_p2m_pmd(addr, pte_pg); if (!ptep) - return false; + return -ENOMEM; } if (p2m_top_mfn && pfn < MAX_P2M_PFN) { @@ -566,7 +569,7 @@ static bool alloc_p2m(unsigned long pfn) mid_mfn = alloc_p2m_page(); if (!mid_mfn) - return false; + return -ENOMEM; p2m_mid_mfn_init(mid_mfn, p2m_missing); @@ -592,7 +595,7 @@ static bool alloc_p2m(unsigned long pfn) p2m = alloc_p2m_page(); if (!p2m) - return false; + return -ENOMEM; if (p2m_pfn == PFN_DOWN(__pa(p2m_missing))) p2m_init(p2m); @@ -625,8 +628,9 @@ static bool alloc_p2m(unsigned long pfn) HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn; } - return true; + return 0; } +EXPORT_SYMBOL(xen_alloc_p2m_entry); unsigned long __init set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e) @@ -688,7 +692,10 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) { if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - if (!alloc_p2m(pfn)) + int ret; + + ret = xen_alloc_p2m_entry(pfn); + if (ret < 0) return false; return __set_phys_to_machine(pfn, mfn); -- cgit v1.2.3 From 008c320a96d218712043f8db0111d5472697785c Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Fri, 19 Jun 2015 17:49:03 +0100 Subject: xen/grant: Introduce helpers to split a page into grant Currently, a grant is always based on the Xen page granularity (i.e 4KB). When Linux is using a different page granularity, a single page will be split between multiple grants. The new helpers will be in charge of splitting the Linux page into grants and call a function given by the caller on each grant. Also provide an helper to count the number of grants within a given contiguous region. Note that the x86/include/asm/xen/page.h is now including xen/interface/grant_table.h rather than xen/grant_table.h. 
It's necessary because xen/grant_table.h depends on asm/xen/page.h and will break the compilation. Furthermore, only definition in interface/grant_table.h is required. Signed-off-by: Julien Grall Reviewed-by: David Vrabel Reviewed-by: Stefano Stabellini Signed-off-by: David Vrabel --- arch/x86/include/asm/xen/page.h | 2 +- drivers/xen/grant-table.c | 26 +++++++++++++++++++++++++ include/xen/grant_table.h | 42 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index b922fa4bb4a1..fe58e3a935de 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -12,7 +12,7 @@ #include #include -#include +#include #include /* Xen machine address */ diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index a4b702c9ac68..dbeaa67dec47 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -776,6 +776,32 @@ void gnttab_batch_copy(struct gnttab_copy *batch, unsigned count) } EXPORT_SYMBOL_GPL(gnttab_batch_copy); +void gnttab_foreach_grant_in_range(struct page *page, + unsigned int offset, + unsigned int len, + xen_grant_fn_t fn, + void *data) +{ + unsigned int goffset; + unsigned int glen; + unsigned long xen_pfn; + + len = min_t(unsigned int, PAGE_SIZE - offset, len); + goffset = xen_offset_in_page(offset); + + xen_pfn = page_to_xen_pfn(page) + XEN_PFN_DOWN(offset); + + while (len) { + glen = min_t(unsigned int, XEN_PAGE_SIZE - goffset, len); + fn(pfn_to_gfn(xen_pfn), goffset, glen, data); + + goffset = 0; + xen_pfn++; + len -= glen; + } +} +EXPORT_SYMBOL_GPL(gnttab_foreach_grant_in_range); + int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count) diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 4478f4b4aae2..05b5b08c2afc 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -45,8 +45,10 @@ #include #include +#include #include #include +#include #define GNTTAB_RESERVED_XENSTORE 1 @@ -224,4 +226,44 @@ static inline struct xen_page_foreign *xen_page_foreign(struct page *page) #endif } +/* Split Linux page in chunk of the size of the grant and call fn + * + * Parameters of fn: + * gfn: guest frame number + * offset: offset in the grant + * len: length of the data in the grant. 
+ * data: internal information + */ +typedef void (*xen_grant_fn_t)(unsigned long gfn, unsigned int offset, + unsigned int len, void *data); + +void gnttab_foreach_grant_in_range(struct page *page, + unsigned int offset, + unsigned int len, + xen_grant_fn_t fn, + void *data); + +/* Helper to get to call fn only on the first "grant chunk" */ +static inline void gnttab_for_one_grant(struct page *page, unsigned int offset, + unsigned len, xen_grant_fn_t fn, + void *data) +{ + /* The first request is limited to the size of one grant */ + len = min_t(unsigned int, XEN_PAGE_SIZE - (offset & ~XEN_PAGE_MASK), + len); + + gnttab_foreach_grant_in_range(page, offset, len, fn, data); +} + +/* Get the number of grant in a specified region + * + * start: Offset from the beginning of the first page + * len: total length of data (can cross multiple page) + */ +static inline unsigned int gnttab_count_grant(unsigned int start, + unsigned int len) +{ + return XEN_PFN_UP(xen_offset_in_page(start) + len); +} + #endif /* __ASM_GNTTAB_H__ */ -- cgit v1.2.3 From 291be10fd7511101d44cf98166d049bd31bc7600 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Wed, 9 Sep 2015 15:17:33 +0100 Subject: xen/swiotlb: Pass addresses rather than frame numbers to xen_arch_need_swiotlb With 64KB page granularity support, the frame number will be different. It will be easier to modify the behavior in a single place rather than in each caller. Signed-off-by: Julien Grall Reviewed-by: Stefano Stabellini Signed-off-by: David Vrabel --- arch/arm/include/asm/xen/page.h | 4 ++-- arch/arm/xen/mm.c | 7 +++++-- arch/x86/include/asm/xen/page.h | 4 ++-- drivers/xen/swiotlb-xen.c | 4 ++-- 4 files changed, 11 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/include/asm/xen/page.h b/arch/arm/include/asm/xen/page.h index e3d94cfa4d46..415dbc6e43fd 100644 --- a/arch/arm/include/asm/xen/page.h +++ b/arch/arm/include/asm/xen/page.h @@ -115,8 +115,8 @@ static inline bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) #define xen_unmap(cookie) iounmap((cookie)) bool xen_arch_need_swiotlb(struct device *dev, - unsigned long pfn, - unsigned long bfn); + phys_addr_t phys, + dma_addr_t dev_addr); unsigned long xen_get_swiotlb_free_pages(unsigned int order); #endif /* _ASM_ARM_XEN_PAGE_H */ diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index 6dd911d1f0ac..7b517e913762 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -138,9 +138,12 @@ void __xen_dma_sync_single_for_device(struct device *hwdev, } bool xen_arch_need_swiotlb(struct device *dev, - unsigned long pfn, - unsigned long bfn) + phys_addr_t phys, + dma_addr_t dev_addr) { + unsigned long pfn = PFN_DOWN(phys); + unsigned long bfn = PFN_DOWN(dev_addr); + return (!hypercall_cflush && (pfn != bfn) && !is_device_dma_coherent(dev)); } diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index fe58e3a935de..f5fb840b43e8 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -298,8 +298,8 @@ void make_lowmem_page_readwrite(void *vaddr); #define xen_unmap(cookie) iounmap((cookie)) static inline bool xen_arch_need_swiotlb(struct device *dev, - unsigned long pfn, - unsigned long bfn) + phys_addr_t phys, + dma_addr_t dev_addr) { return false; } diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 79bc4933b13e..0a5a0e949862 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -392,7 +392,7 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, */ if 
(dma_capable(dev, dev_addr, size) && !range_straddles_page_boundary(phys, size) && - !xen_arch_need_swiotlb(dev, PFN_DOWN(phys), PFN_DOWN(dev_addr)) && + !xen_arch_need_swiotlb(dev, phys, dev_addr) && !swiotlb_force) { /* we are not interested in the dma_addr returned by * xen_dma_map_page, only in the potential cache flushes executed @@ -551,7 +551,7 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, dma_addr_t dev_addr = xen_phys_to_bus(paddr); if (swiotlb_force || - xen_arch_need_swiotlb(hwdev, PFN_DOWN(paddr), PFN_DOWN(dev_addr)) || + xen_arch_need_swiotlb(hwdev, paddr, dev_addr) || !dma_capable(hwdev, dev_addr, sg->length) || range_straddles_page_boundary(paddr, sg->length)) { phys_addr_t map = swiotlb_tbl_map_single(hwdev, -- cgit v1.2.3 From a314e3eb845389b8f68130c79a63832229dea87b Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 22 Oct 2015 16:20:46 +0000 Subject: xen/arm: Enable cpu_hotplug.c Build cpu_hotplug for ARM and ARM64 guests. Rename arch_(un)register_cpu to xen_(un)register_cpu and provide an empty implementation on ARM and ARM64. On x86 just call arch_(un)register_cpu as we are already doing. Initialize cpu_hotplug on ARM. Signed-off-by: Stefano Stabellini Reviewed-by: Julien Grall Reviewed-by: Boris Ostrovsky --- arch/arm/include/asm/xen/hypervisor.h | 10 ++++++++++ arch/x86/include/asm/xen/hypervisor.h | 5 +++++ arch/x86/xen/enlighten.c | 15 +++++++++++++++ drivers/xen/Makefile | 2 -- drivers/xen/cpu_hotplug.c | 8 ++++++-- 5 files changed, 36 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/include/asm/xen/hypervisor.h b/arch/arm/include/asm/xen/hypervisor.h index 04ff8e7b37df..95251512e2c4 100644 --- a/arch/arm/include/asm/xen/hypervisor.h +++ b/arch/arm/include/asm/xen/hypervisor.h @@ -26,4 +26,14 @@ void __init xen_early_init(void); static inline void xen_early_init(void) { return; } #endif +#ifdef CONFIG_HOTPLUG_CPU +static inline void xen_arch_register_cpu(int num) +{ +} + +static inline void xen_arch_unregister_cpu(int num) +{ +} +#endif + #endif /* _ASM_ARM_XEN_HYPERVISOR_H */ diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index d866959e5685..8b2d4bea9962 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -57,4 +57,9 @@ static inline bool xen_x2apic_para_available(void) } #endif +#ifdef CONFIG_HOTPLUG_CPU +void xen_arch_register_cpu(int num); +void xen_arch_unregister_cpu(int num); +#endif + #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 993b7a71386d..5774800ff583 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -75,6 +75,7 @@ #include #include #include +#include #ifdef CONFIG_ACPI #include @@ -1899,3 +1900,17 @@ const struct hypervisor_x86 x86_hyper_xen = { .set_cpu_features = xen_set_cpu_features, }; EXPORT_SYMBOL(x86_hyper_xen); + +#ifdef CONFIG_HOTPLUG_CPU +void xen_arch_register_cpu(int num) +{ + arch_register_cpu(num); +} +EXPORT_SYMBOL(xen_arch_register_cpu); + +void xen_arch_unregister_cpu(int num) +{ + arch_unregister_cpu(num); +} +EXPORT_SYMBOL(xen_arch_unregister_cpu); +#endif diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index e293bc507cbc..aa8a7f71f310 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -1,6 +1,4 @@ -ifeq ($(filter y, $(CONFIG_ARM) $(CONFIG_ARM64)),) obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o -endif obj-$(CONFIG_X86) += fallback.o obj-y += grant-table.o features.o balloon.o 
manage.o preempt.o obj-y += events/ diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index cc6513a176b0..43de1f51b53f 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -11,7 +11,7 @@ static void enable_hotplug_cpu(int cpu) { if (!cpu_present(cpu)) - arch_register_cpu(cpu); + xen_arch_register_cpu(cpu); set_cpu_present(cpu, true); } @@ -19,7 +19,7 @@ static void enable_hotplug_cpu(int cpu) static void disable_hotplug_cpu(int cpu) { if (cpu_present(cpu)) - arch_unregister_cpu(cpu); + xen_arch_unregister_cpu(cpu); set_cpu_present(cpu, false); } @@ -102,7 +102,11 @@ static int __init setup_vcpu_hotplug_event(void) static struct notifier_block xsn_cpu = { .notifier_call = setup_cpu_watcher }; +#ifdef CONFIG_X86 if (!xen_pv_domain()) +#else + if (!xen_domain()) +#endif return -ENODEV; register_xenstore_notifier(&xsn_cpu); -- cgit v1.2.3 From 2c66e24d75d424919c42288b418d2e593fa818b1 Mon Sep 17 00:00:00 2001 From: Sai Praneeth Date: Fri, 16 Oct 2015 16:20:27 -0700 Subject: x86/efi: Fix kernel panic when CONFIG_DEBUG_VIRTUAL is enabled When CONFIG_DEBUG_VIRTUAL is enabled, all accesses to __pa(address) are monitored to see whether address falls in direct mapping or kernel text mapping (see Documentation/x86/x86_64/mm.txt for details), if it does not, the kernel panics. During 1:1 mapping of EFI runtime services we access virtual addresses which are == physical addresses, thus the 1:1 mapping and these addresses do not fall in either of the above two regions and hence when passed as arguments to __pa() kernel panics as reported by Dave Hansen here https://lkml.kernel.org/r/5462999A.7090706@intel.com. So, before calling __pa() virtual addresses should be validated which results in skipping call to split_page_count() and that should be fine because it is used to keep track of everything *but* 1:1 mappings. Signed-off-by: Sai Praneeth Prakhya Reported-by: Dave Hansen Reviewed-by: Borislav Petkov Cc: Ricardo Neri Cc: Glenn P Williamson Cc: Ravi Shankar Signed-off-by: Matt Fleming --- arch/x86/mm/pageattr.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 727158cb3b3c..9abe0c9b1098 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -648,9 +648,12 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot))); - if (pfn_range_is_mapped(PFN_DOWN(__pa(address)), - PFN_DOWN(__pa(address)) + 1)) - split_page_count(level); + if (virt_addr_valid(address)) { + unsigned long pfn = PFN_DOWN(__pa(address)); + + if (pfn_range_is_mapped(pfn, pfn + 1)) + split_page_count(level); + } /* * Install the new, split up pagetable. -- cgit v1.2.3 From 298a96c12b2d8fd845ae0c2c21c0a1c0b470f99e Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Sun, 25 Oct 2015 23:11:43 +0200 Subject: x86/dma-mapping: Fix arch_dma_alloc_attrs() oops with NULL dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 6894258eda2f broke drivers that pass NULL as the device pointer to dma_alloc. The reason is that arch_dma_alloc_attrs() now calls dma_alloc_coherent_gfp_flags() which in turn calls dma_alloc_coherent_mask(), where the device pointer is dereferenced unconditionally. Fix things by moving the ISA DMA fallback device assignment before the call to dma_alloc_coherent_gfp_flags(). 
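The breakage is triggered by legacy callers that pass a NULL device,
e.g. (hypothetical call site, shown only to illustrate the ordering
requirement):

    dma_addr_t handle;
    void *buf;

    /* dev == NULL: the fallback device must be substituted before
     * dma_alloc_coherent_gfp_flags() dereferences the pointer. */
    buf = dma_alloc_coherent(NULL, 4096, &handle, GFP_KERNEL);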
Fixes: 6894258eda2f ("dma-mapping: consolidate dma_{alloc,free}_{attrs,coherent}") Reported-and-tested-by: Meelis Roos Signed-off-by: Ville Syrjälä Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/1445807503-8920-1-git-send-email-ville.syrjala@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/pci-dma.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1b55de1267cf..cd99433b8ba1 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -131,11 +131,12 @@ void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr, bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp) { + if (!*dev) + *dev = &x86_dma_fallback_dev; + *gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); *gfp = dma_alloc_coherent_gfp_flags(*dev, *gfp); - if (!*dev) - *dev = &x86_dma_fallback_dev; if (!is_device_dma_capable(*dev)) return false; return true; -- cgit v1.2.3 From ababae44108b0e94b58eef6cb5bd830bd040a47f Mon Sep 17 00:00:00 2001 From: Werner Pawlitschko Date: Tue, 27 Oct 2015 09:08:04 +0900 Subject: x86/ioapic: Prevent NULL pointer dereference in setup_ioapic_dest() Commit 4857c91f0d19 changed the way how irq affinity is setup in setup_ioapic_dest() from using the core helper function to unconditionally calling the irq_set_affinity() callback of the underlying irq chip. That results in a NULL pointer dereference for the rare case where the underlying irq chip is lapic_chip which has no irq_set_affinity() callback. lapic_chip is occasionally used for the timer interrupt (irq 0). The fix is simple: Check the availability of the callback instead of calling it unconditionally. Fixes: 4857c91f0d19 "x86/ioapic: Force affinity setting in setup_ioapic_dest()" Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- arch/x86/kernel/apic/io_apic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index bb6bfc01cb82..4f2821527014 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2547,7 +2547,9 @@ void __init setup_ioapic_dest(void) mask = apic->target_cpus(); chip = irq_data_get_irq_chip(idata); - chip->irq_set_affinity(idata, mask, false); + /* Might be lapic_chip for irq 0 */ + if (chip->irq_set_affinity) + chip->irq_set_affinity(idata, mask, false); } } #endif -- cgit v1.2.3 From 44511fb9e55ada760822b0b0d7be9d150576f17f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 23 Oct 2015 11:48:16 +0200 Subject: efi: Use correct type for struct efi_memory_map::phys_map We have been getting away with using a void* for the physical address of the UEFI memory map, since, even on 32-bit platforms with 64-bit physical addresses, no truncation takes place if the memory map has been allocated by the firmware (which only uses 1:1 virtually addressable memory), which is usually the case. However, commit: 0f96a99dab36 ("efi: Add "efi_fake_mem" boot option") adds code that clones and modifies the UEFI memory map, and the clone may live above 4 GB on 32-bit platforms. This means our use of void* for struct efi_memory_map::phys_map has graduated from 'incorrect but working' to 'incorrect and broken', and we need to fix it. So redefine struct efi_memory_map::phys_map as phys_addr_t, and get rid of a bunch of casts that are now unneeded. 
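The truncation is easy to demonstrate on a 32-bit kernel where
sizeof(void *) == 4 but physical addresses are 64 bits wide
(illustration only, values invented):

    u64 pmap = 0x123456000ULL;                /* cloned memmap above 4 GB */
    void *bad = (void *)(unsigned long)pmap;  /* truncated: 0x23456000    */
    phys_addr_t good = pmap;                  /* full value preserved     */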
Signed-off-by: Ard Biesheuvel Reviewed-by: Matt Fleming Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: izumi.taku@jp.fujitsu.com Cc: kamezawa.hiroyu@jp.fujitsu.com Cc: linux-efi@vger.kernel.org Cc: matt.fleming@intel.com Link: http://lkml.kernel.org/r/1445593697-1342-1-git-send-email-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- arch/arm64/kernel/efi.c | 4 ++-- arch/x86/platform/efi/efi.c | 4 ++-- drivers/firmware/efi/efi.c | 8 ++++---- include/linux/efi.h | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 4b7df346e388..61eb1d17586a 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -208,7 +208,7 @@ void __init efi_init(void) memblock_reserve(params.mmap & PAGE_MASK, PAGE_ALIGN(params.mmap_size + (params.mmap & ~PAGE_MASK))); - memmap.phys_map = (void *)params.mmap; + memmap.phys_map = params.mmap; memmap.map = early_memremap(params.mmap, params.mmap_size); memmap.map_end = memmap.map + params.mmap_size; memmap.desc_size = params.desc_size; @@ -282,7 +282,7 @@ static int __init arm64_enable_runtime_services(void) pr_info("Remapping and enabling EFI services.\n"); mapsize = memmap.map_end - memmap.map; - memmap.map = (__force void *)ioremap_cache((phys_addr_t)memmap.phys_map, + memmap.map = (__force void *)ioremap_cache(memmap.phys_map, mapsize); if (!memmap.map) { pr_err("Failed to remap EFI memory map\n"); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 3e1d09e885c0..ad285404ea7f 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -194,7 +194,7 @@ static void __init do_add_efi_memmap(void) int __init efi_memblock_x86_reserve_range(void) { struct efi_info *e = &boot_params.efi_info; - unsigned long pmap; + phys_addr_t pmap; if (efi_enabled(EFI_PARAVIRT)) return 0; @@ -209,7 +209,7 @@ int __init efi_memblock_x86_reserve_range(void) #else pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32)); #endif - memmap.phys_map = (void *)pmap; + memmap.phys_map = pmap; memmap.nr_map = e->efi_memmap_size / e->efi_memdesc_size; memmap.desc_size = e->efi_memdesc_size; diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 31fc864eb037..027ca212179f 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -254,7 +254,7 @@ subsys_initcall(efisubsys_init); int __init efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md) { struct efi_memory_map *map = efi.memmap; - void *p, *e; + phys_addr_t p, e; if (!efi_enabled(EFI_MEMMAP)) { pr_err_once("EFI_MEMMAP is not enabled.\n"); @@ -286,10 +286,10 @@ int __init efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md) * So just always get our own virtual map on the CPU. 
* */ - md = early_memremap((phys_addr_t)p, sizeof (*md)); + md = early_memremap(p, sizeof (*md)); if (!md) { - pr_err_once("early_memremap(%p, %zu) failed.\n", - p, sizeof (*md)); + pr_err_once("early_memremap(%pa, %zu) failed.\n", + &p, sizeof (*md)); return -ENOMEM; } diff --git a/include/linux/efi.h b/include/linux/efi.h index 4d01c1033fce..569b5a866bb1 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -680,7 +680,7 @@ typedef struct { } efi_system_table_t; struct efi_memory_map { - void *phys_map; + phys_addr_t phys_map; void *map; void *map_end; int nr_map; -- cgit v1.2.3 From 914beb9fc26d6225295b8315ab54026f8f22755c Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Wed, 28 Oct 2015 13:39:05 +0000 Subject: x86/xen: add reschedule point when mapping foreign GFNs Mapping a large range of foreign GFNs can take a long time, add a reschedule point after each batch of 16 GFNs. Signed-off-by: David Vrabel Reviewed-by: Boris Ostrovsky --- arch/x86/xen/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 9c479fe40459..ac161db63388 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2888,6 +2888,7 @@ static int do_remap_gfn(struct vm_area_struct *vma, addr += range; if (err_ptr) err_ptr += batch; + cond_resched(); } out: -- cgit v1.2.3 From 2459ee8651dc5ab72790c2ffa99af288c7641b64 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 30 Oct 2015 22:42:46 -0700 Subject: x86/vm86: Set thread.vm86 to NULL on fork/clone thread.vm86 points to per-task information -- the pointer should not be copied on clone. Fixes: d4ce0f26c790 ("x86/vm86: Move fields from 'struct kernel_vm86_struct' to 'struct vm86'") Signed-off-by: Andy Lutomirski Cc: Brian Gerst Cc: Linus Torvalds Cc: Borislav Petkov Cc: Stas Sergeev Link: http://lkml.kernel.org/r/71c5d6985d70ec8197c8d72f003823c81b7dcf99.1446270067.git.luto@kernel.org Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e28db181e4fc..9f7c21c22477 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -84,6 +84,9 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister); int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { memcpy(dst, src, arch_task_struct_size); +#ifdef CONFIG_VM86 + dst->thread.vm86 = NULL; +#endif return fpu__copy(&dst->thread.fpu, &src->thread.fpu); } -- cgit v1.2.3 From c7f54d21fb02e90042e6233b46716dcb244e70e6 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Fri, 30 Oct 2015 13:11:37 +0100 Subject: x86/mce: Add a Scalable MCA vendor flags bit Scalable MCA (SMCA) is a new feature in AMD Fam17h processors which indicates presence of MCA extensions. MCA extensions expands existing register space for the MCE banks and also introduces a new MSR range to accommodate new banks. Add the detection bit. Signed-off-by: Aravind Gopalakrishnan [ Reformat mce_vendor_flags definitions and save indentation levels. Improve comments. ] Signed-off-by: Borislav Petkov Cc: Ashok Raj Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1446207099-24948-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 34 +++++++++++++++++++++------------- arch/x86/kernel/cpu/mcheck/mce.c | 2 ++ 2 files changed, 23 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 2dbc0bf2b9f3..2ea4527e462f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -123,19 +123,27 @@ struct mca_config { }; struct mce_vendor_flags { - /* - * overflow recovery cpuid bit indicates that overflow - * conditions are not fatal - */ - __u64 overflow_recov : 1, - - /* - * SUCCOR stands for S/W UnCorrectable error COntainment - * and Recovery. It indicates support for data poisoning - * in HW and deferred error interrupts. - */ - succor : 1, - __reserved_0 : 62; + /* + * Indicates that overflow conditions are not fatal, when set. + */ + __u64 overflow_recov : 1, + + /* + * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and + * Recovery. It indicates support for data poisoning in HW and deferred + * error interrupts. + */ + succor : 1, + + /* + * (AMD) SMCA: This bit indicates support for Scalable MCA which expands + * the register space for each MCA bank and also increases number of + * banks. Also, to accommodate the new banks and registers, the MCA + * register space is moved to a new MSR range. + */ + smca : 1, + + __reserved_0 : 61; }; extern struct mce_vendor_flags mce_flags; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 17b5ec6edcb6..3d631c4abf91 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1605,6 +1605,8 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) mce_amd_feature_init(c); mce_flags.overflow_recov = !!(ebx & BIT(0)); mce_flags.succor = !!(ebx & BIT(1)); + mce_flags.smca = !!(ebx & BIT(3)); + break; } -- cgit v1.2.3 From dc34bdd2367fd31744ee3ba1de1b1cc0fa2ce193 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 30 Oct 2015 13:11:38 +0100 Subject: x86/mce: Add a default case to the switch in __mcheck_cpu_ancient_init() Caught by building with W= which enable -Wswitch-default also. Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1446207099-24948-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3d631c4abf91..c5b0d562dbf5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1586,6 +1586,8 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) winchip_mcheck_init(c); return 1; break; + default: + return 0; } return 0; -- cgit v1.2.3 From 2167ceabf34163727ca4e283c0f030e3960932e5 Mon Sep 17 00:00:00 2001 From: Wan Zongshun Date: Fri, 30 Oct 2015 13:11:39 +0100 Subject: x86/cpu: Add CLZERO detection AMD Fam17h processors introduce support for the CLZERO instruction. It zeroes out the 64 byte cache line specified in RAX. Add the bit here to allow /proc/cpuinfo to list the feature. Boris: we're adding this as a separate ->x86_capability leaf because CPUID_80000008_EBX is going to contain more feature bits and it will fill out with time. 
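Once get_cpu_cap() fills in the new word, the bit is queried like any other feature flag; a hedged usage sketch:

    /* Hedged usage sketch, using the standard cpufeature helper: */
    if (boot_cpu_has(X86_FEATURE_CLZERO))
        pr_info("CLZERO instruction supported\n");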
Signed-off-by: Wan Zongshun Signed-off-by: Aravind Gopalakrishnan [ Wrap code in patch form, fix comments. ] Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Rui Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1446207099-24948-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 5 ++++- arch/x86/kernel/cpu/common.c | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 9727b3b48bd1..e4f8010f22e0 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -12,7 +12,7 @@ #include #endif -#define NCAPINTS 13 /* N 32-bit words worth of info */ +#define NCAPINTS 14 /* N 32-bit words worth of info */ #define NBUGINTS 1 /* N 32-bit bug flags */ /* @@ -255,6 +255,9 @@ /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ #define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ +/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ +#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ + /* * BUG word(s) */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index de22ea7ff82f..4ddd780aeac9 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -670,6 +670,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; + c->x86_capability[13] = cpuid_ebx(0x80000008); } #ifdef CONFIG_X86_32 else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) -- cgit v1.2.3 From abed7d0710e8f892c267932a9492ccf447674fb8 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Tue, 27 Oct 2015 15:19:52 -0400 Subject: xen: fix the check of e_pfn in xen_find_pfn_range On some NUMA system, after dom0 up, we see below warning even if there are enough pfn ranges that could be used for remapping: "Unable to find available pfn range, not remapping identity pages" Fix it to avoid getting a memory region of zero size in xen_find_pfn_range. 
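A hedged illustration with invented pfns, condensed from the function's logic:

    /*
     * An entry whose end pfn equals *min_pfn survived the old check
     * and then reported a zero-size region:
     */
    unsigned long min_pfn = 0x100;
    unsigned long e_pfn = 0x100;    /* entry ends exactly at min_pfn */

    /* old check:  (e_pfn <  min_pfn) is false, entry is used    */
    /* size found:  e_pfn - min_pfn == 0 pages                   */
    /* new check:  (e_pfn <= min_pfn) is true, entry is skipped  */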
Signed-off-by: Zhenzhong Duan Reviewed-by: Juergen Gross Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- arch/x86/xen/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 387b60d9bd0e..d1fac0e53d57 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -212,7 +212,7 @@ static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn) e_pfn = PFN_DOWN(entry->addr + entry->size); /* We only care about E820 after this */ - if (e_pfn < *min_pfn) + if (e_pfn <= *min_pfn) continue; s_pfn = PFN_UP(entry->addr); -- cgit v1.2.3 From 2da29bccc5045ea10c70cb3a69be777768fd0b66 Mon Sep 17 00:00:00 2001 From: Saurabh Sengar Date: Fri, 30 Oct 2015 12:56:11 +0530 Subject: KVM: x86: removing unused variable removing unused variables, found by coccinelle Signed-off-by: Saurabh Sengar Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9e9c226cb79d..57b5f79aad08 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3438,41 +3438,35 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) { - int r = 0; - mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); mutex_unlock(&kvm->arch.vpit->pit_state.lock); - return r; + return 0; } static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) { - int r = 0; - mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); mutex_unlock(&kvm->arch.vpit->pit_state.lock); - return r; + return 0; } static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) { - int r = 0; - mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, sizeof(ps->channels)); ps->flags = kvm->arch.vpit->pit_state.flags; mutex_unlock(&kvm->arch.vpit->pit_state.lock); memset(&ps->reserved, 0, sizeof(ps->reserved)); - return r; + return 0; } static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) { - int r = 0, start = 0; + int start = 0; u32 prev_legacy, cur_legacy; mutex_lock(&kvm->arch.vpit->pit_state.lock); prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; @@ -3484,7 +3478,7 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) kvm->arch.vpit->pit_state.flags = ps->flags; kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); mutex_unlock(&kvm->arch.vpit->pit_state.lock); - return r; + return 0; } static int kvm_vm_ioctl_reinject(struct kvm *kvm, -- cgit v1.2.3 From 7a036a6f670f63b32c5ee126425f9109271ca13f Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 30 Oct 2015 16:36:24 +0100 Subject: KVM: x86: add read_phys to x86_emulate_ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want to read the physical memory when emulating RSM. X86EMUL_IO_NEEDED is returned on all errors for consistency with other helpers. 
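The intended contrast between the two accessors, condensed from the patch (hedged sketch, not a complete caller):

    /*
     * read_std translates a guest-virtual address through the guest
     * page tables and can report a fault; read_phys reads
     * guest-physical memory directly, which is what SMRAM state at
     * smbase+offset requires:
     */
    ops->read_std(ctxt, gva, &val, sizeof(val), &fault); /* virtual  */
    ops->read_phys(ctxt, gpa, &val, sizeof(val));        /* physical */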
Signed-off-by: Radim Krčmář Tested-by: Laszlo Ersek Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 10 ++++++++++ arch/x86/kvm/x86.c | 10 ++++++++++ 2 files changed, 20 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index e16466ec473c..e9cd7befcb76 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -111,6 +111,16 @@ struct x86_emulate_ops { unsigned int bytes, struct x86_exception *fault); + /* + * read_phys: Read bytes of standard (non-emulated/special) memory. + * Used for descriptor reading. + * @addr: [IN ] Physical address from which to read. + * @val: [OUT] Value read from memory. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr, + void *val, unsigned int bytes); + /* * write_std: Write bytes of standard (non-emulated/special) memory. * Used for descriptor writing. diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 57b5f79aad08..a24bae0a83a2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4089,6 +4089,15 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); } +static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes); + + return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE; +} + int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, @@ -4824,6 +4833,7 @@ static const struct x86_emulate_ops emulate_ops = { .write_gpr = emulator_write_gpr, .read_std = kvm_read_guest_virt_system, .write_std = kvm_write_guest_virt_system, + .read_phys = kvm_read_guest_phys_system, .fetch = kvm_fetch_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, -- cgit v1.2.3 From f40606b147dd5b4678cedc877a71deb520ca507e Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 30 Oct 2015 16:36:25 +0100 Subject: KVM: x86: handle SMBASE as physical address in RSM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GET_SMSTATE depends on real mode to ensure that smbase+offset is treated as a physical address, which has already caused a bug after shuffling the code. Enforce physical addressing. Signed-off-by: Radim Krčmář Reported-by: Laszlo Ersek Tested-by: Laszlo Ersek Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9da95b9daf8d..b60fed56671b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2272,8 +2272,8 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) #define GET_SMSTATE(type, smbase, offset) \ ({ \ type __val; \ - int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val, \ - sizeof(__val), NULL); \ + int r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val, \ + sizeof(__val)); \ if (r != X86EMUL_CONTINUE) \ return X86EMUL_UNHANDLEABLE; \ __val; \ @@ -2484,8 +2484,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) /* * Get back to real mode, to prepare a safe state in which to load - * CR0/CR3/CR4/EFER. 
Also this will ensure that addresses passed - * to read_std/write_std are not virtual. + * CR0/CR3/CR4/EFER. * * CR4.PCIDE must be zero, because it is a 64-bit mode only feature. */ -- cgit v1.2.3 From c75efa974e013640496620f26f0b532cb5cb17f9 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Fri, 16 Oct 2015 10:07:50 +0300 Subject: drivers/hv: share Hyper-V SynIC constants with userspace Moved Hyper-V synic contants from guest Hyper-V drivers private header into x86 arch uapi Hyper-V header. Added Hyper-V synic msr's flags into x86 arch uapi Hyper-V header. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan Signed-off-by: Denis V. Lunev CC: Vitaly Kuznetsov CC: "K. Y. Srinivasan" CC: Gleb Natapov CC: Paolo Bonzini Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/hyperv.h | 12 ++++++++++++ drivers/hv/hyperv_vmbus.h | 5 ----- include/linux/hyperv.h | 1 + 3 files changed, 13 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 2677a0aac2cc..040d4083c24f 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -257,4 +257,16 @@ typedef struct _HV_REFERENCE_TSC_PAGE { __s64 tsc_offset; } HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE; +/* Define the number of synthetic interrupt sources. */ +#define HV_SYNIC_SINT_COUNT (16) +/* Define the expected SynIC version. */ +#define HV_SYNIC_VERSION_1 (0x1) + +#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0) +#define HV_SYNIC_SIMP_ENABLE (1ULL << 0) +#define HV_SYNIC_SIEFP_ENABLE (1ULL << 0) +#define HV_SYNIC_SINT_MASKED (1ULL << 16) +#define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17) +#define HV_SYNIC_SINT_VECTOR_MASK (0xFF) + #endif diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 3d70e36c918e..3782636562a1 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -63,9 +63,6 @@ enum hv_cpuid_function { /* Define version of the synthetic interrupt controller. */ #define HV_SYNIC_VERSION (1) -/* Define the expected SynIC version. */ -#define HV_SYNIC_VERSION_1 (0x1) - /* Define synthetic interrupt controller message constants. */ #define HV_MESSAGE_SIZE (256) #define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240) @@ -105,8 +102,6 @@ enum hv_message_type { HVMSG_X64_LEGACY_FP_ERROR = 0x80010005 }; -/* Define the number of synthetic interrupt sources. */ -#define HV_SYNIC_SINT_COUNT (16) #define HV_SYNIC_STIMER_COUNT (4) /* Define invalid partition identifier. */ diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 54733d5b503e..8fdc17b84739 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -26,6 +26,7 @@ #define _HYPERV_H #include +#include #include #include -- cgit v1.2.3 From 0669a51015c58b1f036030743a0c0781eb63867f Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 30 Oct 2015 15:48:20 +0100 Subject: KVM: x86: zero apic_arb_prio on reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BSP doesn't get INIT so its apic_arb_prio isn't zeroed after reboot. BSP won't get lowest priority interrupts until other VCPUs get enough interrupts to match their pre-reboot apic_arb_prio. That behavior doesn't fit into KVM's round-robin-like interpretation of lowest priority delivery ... userspace should KVM_SET_LAPIC on reset, so just zero apic_arb_prio there. 
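What "round-robin-like" means here, as a hedged sketch rather than the actual KVM code:

    /*
     * Hedged sketch, not KVM source: lowest-priority delivery picks
     * the vcpu with the smallest apic_arb_prio and bumps the winner,
     * so a vcpu that keeps a stale, high counter across reboot is
     * starved until the others catch up.
     */
    static struct kvm_vcpu *pick_lowest_prio(struct kvm *kvm)
    {
        struct kvm_vcpu *vcpu, *best = NULL;
        int i;

        kvm_for_each_vcpu(i, vcpu, kvm)
            if (!best || vcpu->arch.apic_arb_prio <
                         best->arch.apic_arb_prio)
                best = vcpu;
        best->arch.apic_arb_prio++;
        return best;
    }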
Reported-by: Yuki Shibuya Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 168b8759bd73..ecd4ea1d28a8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1918,6 +1918,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_EVENT, vcpu); if (ioapic_in_kernel(vcpu->kvm)) kvm_rtc_eoi_tracking_restore_one(vcpu); + + vcpu->arch.apic_arb_prio = 0; } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From b97e6de9c96cefaa02a6a7464731ea504b45e150 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 28 Oct 2015 19:16:47 +0100 Subject: KVM: x86: merge kvm_arch_set_irq with kvm_set_msi_inatomic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We do not want to do too much work in atomic context, in particular not walking all the VCPUs of the virtual machine. So we want to distinguish the architecture-specific injection function for irqfd from kvm_set_msi. Since it's still empty, reuse the newly added kvm_arch_set_irq and rename it to kvm_arch_set_irq_inatomic. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 14 ++++++++------ include/linux/kvm_host.h | 7 +++---- virt/kvm/eventfd.c | 11 ++++------- 3 files changed, 15 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 8f4499c7ffc1..75dc633c48dc 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -124,12 +124,16 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, } -static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm) +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) { struct kvm_lapic_irq irq; int r; + if (unlikely(e->type != KVM_IRQ_ROUTING_MSI)) + return -EWOULDBLOCK; + kvm_set_msi_irq(e, &irq); if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) @@ -165,10 +169,8 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) idx = srcu_read_lock(&kvm->irq_srcu); if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { e = &entries[0]; - if (likely(e->type == KVM_IRQ_ROUTING_MSI)) - ret = kvm_set_msi_inatomic(e, kvm); - else - ret = -EWOULDBLOCK; + ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id, + irq, level); } srcu_read_unlock(&kvm->irq_srcu, idx); return ret; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 87189a41d904..15c78f320678 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -830,10 +830,9 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level); int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, int irq_source_id, int level, bool line_status); - -int kvm_arch_set_irq(struct kvm_kernel_irq_routing_entry *irq, struct kvm *kvm, - int irq_source_id, int level, bool line_status); - +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, + int level, bool line_status); bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_notify_acked_gsi(struct kvm *kvm, int gsi); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); diff --git 
a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index e29fd2640709..46dbc0a7dfc1 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -171,7 +171,7 @@ irqfd_deactivate(struct kvm_kernel_irqfd *irqfd) queue_work(irqfd_cleanup_wq, &irqfd->shutdown); } -int __attribute__((weak)) kvm_arch_set_irq( +int __attribute__((weak)) kvm_arch_set_irq_inatomic( struct kvm_kernel_irq_routing_entry *irq, struct kvm *kvm, int irq_source_id, int level, @@ -201,12 +201,9 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) irq = irqfd->irq_entry; } while (read_seqcount_retry(&irqfd->irq_entry_sc, seq)); /* An event has been signaled, inject an interrupt */ - if (irq.type == KVM_IRQ_ROUTING_MSI) - kvm_set_msi(&irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, - false); - else if (kvm_arch_set_irq(&irq, kvm, - KVM_USERSPACE_IRQ_SOURCE_ID, 1, - false) == -EWOULDBLOCK) + if (kvm_arch_set_irq_inatomic(&irq, kvm, + KVM_USERSPACE_IRQ_SOURCE_ID, 1, + false) == -EWOULDBLOCK) schedule_work(&irqfd->inject); srcu_read_unlock(&kvm->irq_srcu, idx); } -- cgit v1.2.3 From 7695405698d011c05a95288f1f3c0a3dd252dccc Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 28 Oct 2015 19:22:20 +0100 Subject: KVM: device assignment: remove pointless #ifdefs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The symbols are always defined. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/assigned-dev.c | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c index d090ecf08809..1c17ee807ef7 100644 --- a/arch/x86/kvm/assigned-dev.c +++ b/arch/x86/kvm/assigned-dev.c @@ -131,7 +131,6 @@ static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) return IRQ_HANDLED; } -#ifdef __KVM_HAVE_MSI static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) { struct kvm_assigned_dev_kernel *assigned_dev = dev_id; @@ -150,9 +149,7 @@ static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) return IRQ_HANDLED; } -#endif -#ifdef __KVM_HAVE_MSIX static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) { struct kvm_assigned_dev_kernel *assigned_dev = dev_id; @@ -183,7 +180,6 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) return IRQ_HANDLED; } -#endif /* Ack the irq line for an assigned device */ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) @@ -386,7 +382,6 @@ static int assigned_device_enable_host_intx(struct kvm *kvm, return 0; } -#ifdef __KVM_HAVE_MSI static int assigned_device_enable_host_msi(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev) { @@ -408,9 +403,7 @@ static int assigned_device_enable_host_msi(struct kvm *kvm, return 0; } -#endif -#ifdef __KVM_HAVE_MSIX static int assigned_device_enable_host_msix(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev) { @@ -443,8 +436,6 @@ err: return r; } -#endif - static int assigned_device_enable_guest_intx(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev, struct kvm_assigned_irq *irq) @@ -454,7 +445,6 @@ static int assigned_device_enable_guest_intx(struct kvm *kvm, return 0; } -#ifdef __KVM_HAVE_MSI static int assigned_device_enable_guest_msi(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev, struct kvm_assigned_irq *irq) @@ -463,9 +453,7 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm, dev->ack_notifier.gsi = -1; return 0; } -#endif -#ifdef __KVM_HAVE_MSIX static int assigned_device_enable_guest_msix(struct kvm 
*kvm, struct kvm_assigned_dev_kernel *dev, struct kvm_assigned_irq *irq) @@ -474,7 +462,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm, dev->ack_notifier.gsi = -1; return 0; } -#endif static int assign_host_irq(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev, @@ -492,16 +479,12 @@ static int assign_host_irq(struct kvm *kvm, case KVM_DEV_IRQ_HOST_INTX: r = assigned_device_enable_host_intx(kvm, dev); break; -#ifdef __KVM_HAVE_MSI case KVM_DEV_IRQ_HOST_MSI: r = assigned_device_enable_host_msi(kvm, dev); break; -#endif -#ifdef __KVM_HAVE_MSIX case KVM_DEV_IRQ_HOST_MSIX: r = assigned_device_enable_host_msix(kvm, dev); break; -#endif default: r = -EINVAL; } @@ -534,16 +517,12 @@ static int assign_guest_irq(struct kvm *kvm, case KVM_DEV_IRQ_GUEST_INTX: r = assigned_device_enable_guest_intx(kvm, dev, irq); break; -#ifdef __KVM_HAVE_MSI case KVM_DEV_IRQ_GUEST_MSI: r = assigned_device_enable_guest_msi(kvm, dev, irq); break; -#endif -#ifdef __KVM_HAVE_MSIX case KVM_DEV_IRQ_GUEST_MSIX: r = assigned_device_enable_guest_msix(kvm, dev, irq); break; -#endif default: r = -EINVAL; } @@ -826,7 +805,6 @@ out: } -#ifdef __KVM_HAVE_MSIX static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, struct kvm_assigned_msix_nr *entry_nr) { @@ -906,7 +884,6 @@ msix_entry_out: return r; } -#endif static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, struct kvm_assigned_pci_dev *assigned_dev) @@ -1012,7 +989,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, goto out; break; } -#ifdef __KVM_HAVE_MSIX case KVM_ASSIGN_SET_MSIX_NR: { struct kvm_assigned_msix_nr entry_nr; r = -EFAULT; @@ -1033,7 +1009,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, goto out; break; } -#endif case KVM_ASSIGN_SET_INTX_MASK: { struct kvm_assigned_pci_dev assigned_dev; -- cgit v1.2.3 From 8a22f234a81ab4d1de5d948c3478608f08a9b844 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 28 Oct 2015 18:52:02 +0100 Subject: KVM: x86: move kvm_set_irq_inatomic to legacy device assignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function is not used outside device assignment, and kvm_arch_set_irq_inatomic has a different prototype. Move it here and make it static to avoid confusion. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/assigned-dev.c | 37 +++++++++++++++++++++++++++++++++++++ arch/x86/kvm/irq_comm.c | 34 ---------------------------------- include/linux/kvm_host.h | 1 - 3 files changed, 37 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c index 1c17ee807ef7..9dc091acd5fb 100644 --- a/arch/x86/kvm/assigned-dev.c +++ b/arch/x86/kvm/assigned-dev.c @@ -21,6 +21,7 @@ #include #include "irq.h" #include "assigned-dev.h" +#include "trace/events/kvm.h" struct kvm_assigned_dev_kernel { struct kvm_irq_ack_notifier ack_notifier; @@ -131,6 +132,42 @@ static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) return IRQ_HANDLED; } +/* + * Deliver an IRQ in an atomic context if we can, or return a failure, + * user can retry in a process context. + * Return value: + * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. + * Other values - No need to retry. 
+ */ +static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, + int level) +{ + struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; + struct kvm_kernel_irq_routing_entry *e; + int ret = -EINVAL; + int idx; + + trace_kvm_set_irq(irq, level, irq_source_id); + + /* + * Injection into either PIC or IOAPIC might need to scan all CPUs, + * which would need to be retried from thread context; when same GSI + * is connected to both PIC and IOAPIC, we'd have to report a + * partial failure here. + * Since there's no easy way to do this, we only support injecting MSI + * which is limited to 1:1 GSI mapping. + */ + idx = srcu_read_lock(&kvm->irq_srcu); + if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { + e = &entries[0]; + ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id, + irq, level); + } + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} + + static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) { struct kvm_assigned_dev_kernel *assigned_dev = dev_id; diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 75dc633c48dc..84b96d319909 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -142,40 +142,6 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, return -EWOULDBLOCK; } -/* - * Deliver an IRQ in an atomic context if we can, or return a failure, - * user can retry in a process context. - * Return value: - * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. - * Other values - No need to retry. - */ -int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) -{ - struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; - struct kvm_kernel_irq_routing_entry *e; - int ret = -EINVAL; - int idx; - - trace_kvm_set_irq(irq, level, irq_source_id); - - /* - * Injection into either PIC or IOAPIC might need to scan all CPUs, - * which would need to be retried from thread context; when same GSI - * is connected to both PIC and IOAPIC, we'd have to report a - * partial failure here. - * Since there's no easy way to do this, we only support injecting MSI - * which is limited to 1:1 GSI mapping. - */ - idx = srcu_read_lock(&kvm->irq_srcu); - if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { - e = &entries[0]; - ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id, - irq, level); - } - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - int kvm_request_irq_source_id(struct kvm *kvm) { unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 15c78f320678..242a6d2b53ff 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -827,7 +827,6 @@ int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin); int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status); -int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level); int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, int irq_source_id, int level, bool line_status); int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, -- cgit v1.2.3 From 656ec4a4928a3db7d16e5cb9bce351a478cfd3d5 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Mon, 2 Nov 2015 22:20:00 +0100 Subject: KVM: VMX: fix SMEP and SMAP without EPT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The comment in code had it mostly right, but we enable paging for emulated real mode regardless of EPT. 
Without EPT (which implies emulated real mode), secondary VCPUs won't start unless we disable SM[AE]P when the guest doesn't use paging. Signed-off-by: Radim Krčmář Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4d0aa31a42ba..2ac116417583 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3788,20 +3788,21 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!is_paging(vcpu)) { hw_cr4 &= ~X86_CR4_PAE; hw_cr4 |= X86_CR4_PSE; - /* - * SMEP/SMAP is disabled if CPU is in non-paging mode - * in hardware. However KVM always uses paging mode to - * emulate guest non-paging mode with TDP. - * To emulate this behavior, SMEP/SMAP needs to be - * manually disabled when guest switches to non-paging - * mode. - */ - hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); } else if (!(cr4 & X86_CR4_PAE)) { hw_cr4 &= ~X86_CR4_PAE; } } + if (!enable_unrestricted_guest && !is_paging(vcpu)) + /* + * SMEP/SMAP is disabled if CPU is in non-paging mode in + * hardware. However KVM always uses paging mode without + * unrestricted guest. + * To emulate this behavior, SMEP/SMAP needs to be manually + * disabled when guest switches to non-paging mode. + */ + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); + vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); return 0; -- cgit v1.2.3 From 89651a3decbe03754f304a0b248f27eeb9a37937 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 3 Nov 2015 13:43:05 +0100 Subject: KVM: x86: allow RSM from 64-bit mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SDM says that exiting system management mode from 64-bit mode is invalid, but that would be too good to be true. But actually, most of the code is already there to support exiting from compat mode (EFER.LME=1, EFER.LMA=0). Getting all the way from 64-bit mode to real mode only requires clearing CS.L and CR4.PCIDE. Cc: stable@vger.kernel.org Fixes: 660a5d517aaab9187f93854425c4c63f4a09195c Tested-by: Laszlo Ersek Cc: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b60fed56671b..1505587d06e9 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2484,16 +2484,36 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) /* * Get back to real mode, to prepare a safe state in which to load - * CR0/CR3/CR4/EFER. - * - * CR4.PCIDE must be zero, because it is a 64-bit mode only feature. + * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU + * supports long mode. */ + cr4 = ctxt->ops->get_cr(ctxt, 4); + if (emulator_has_longmode(ctxt)) { + struct desc_struct cs_desc; + + /* Zero CR4.PCIDE before CR0.PG. */ + if (cr4 & X86_CR4_PCIDE) { + ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); + cr4 &= ~X86_CR4_PCIDE; + } + + /* A 32-bit code segment is required to clear EFER.LMA. */ + memset(&cs_desc, 0, sizeof(cs_desc)); + cs_desc.type = 0xb; + cs_desc.s = cs_desc.g = cs_desc.p = 1; + ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS); + } + + /* For the 64-bit case, this will clear EFER.LMA. 
*/ cr0 = ctxt->ops->get_cr(ctxt, 0); if (cr0 & X86_CR0_PE) ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); - cr4 = ctxt->ops->get_cr(ctxt, 4); + + /* Now clear CR4.PAE (which must be done before clearing EFER.LME). */ if (cr4 & X86_CR4_PAE) ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); + + /* And finally go back to 32-bit mode. */ efer = 0; ctxt->ops->set_msr(ctxt, MSR_EFER, efer); @@ -4454,7 +4474,7 @@ static const struct opcode twobyte_table[256] = { F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N, /* 0xA8 - 0xAF */ I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), - II(No64 | EmulateOnUD | ImplicitOps, em_rsm, rsm), + II(EmulateOnUD | ImplicitOps, em_rsm, rsm), F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd), F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), -- cgit v1.2.3 From 879ae1880449c88db11c1ebdaedc2da79b2fe73f Mon Sep 17 00:00:00 2001 From: Laszlo Ersek Date: Wed, 4 Nov 2015 12:54:41 +0100 Subject: KVM: x86: obey KVM_X86_QUIRK_CD_NW_CLEARED in kvm_set_cr0() Commit b18d5431acc7 ("KVM: x86: fix CR0.CD virtualization") was technically correct, but it broke OVMF guests by slowing down various parts of the firmware. Commit fb279950ba02 ("KVM: vmx: obey KVM_QUIRK_CD_NW_CLEARED") quirked the first function modified by b18d5431acc7, vmx_get_mt_mask(), for OVMF's sake. This restored the speed of the OVMF code that runs before PlatformPei (including the memory intensive LZMA decompression in SEC). This patch extends the quirk to the second function modified by b18d5431acc7, kvm_set_cr0(). It eliminates the intrusive slowdown that hits the EFI_MP_SERVICES_PROTOCOL implementation of edk2's UefiCpuPkg/CpuDxe -- which is built into OVMF --, when CpuDxe starts up all APs at once for initialization, in order to count them. We also carry over the kvm_arch_has_noncoherent_dma() sub-condition from the other half of the original commit b18d5431acc7. Fixes: b18d5431acc7a2fd22767925f3a6f597aa4bd29e Cc: stable@vger.kernel.org Cc: Jordan Justen Cc: Alex Williamson Reviewed-by: Xiao Guangrong Tested-by: Janusz Mocek Signed-off-by: Laszlo Ersek # Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a24bae0a83a2..30723a4122cd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -625,7 +625,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if ((cr0 ^ old_cr0) & update_bits) kvm_mmu_reset_context(vcpu); - if ((cr0 ^ old_cr0) & X86_CR0_CD) + if (((cr0 ^ old_cr0) & X86_CR0_CD) && + kvm_arch_has_noncoherent_dma(vcpu->kvm) && + !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); return 0; -- cgit v1.2.3 From a3eaa8649e4c6a6afdafaa04b9114fb230617bb1 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 4 Nov 2015 13:46:05 +0800 Subject: KVM: VMX: Fix commit which broke PML I found PML was broken since below commit: commit feda805fe7c4ed9cf78158e73b1218752e3b4314 Author: Xiao Guangrong Date: Wed Sep 9 14:05:55 2015 +0800 KVM: VMX: unify SECONDARY_VM_EXEC_CONTROL update Unify the update in vmx_cpuid_update() Signed-off-by: Xiao Guangrong [Rewrite to use vmcs_set_secondary_exec_control. - Paolo] Signed-off-by: Paolo Bonzini The reason is in above commit vmx_cpuid_update calls vmx_secondary_exec_control, in which currently SECONDARY_EXEC_ENABLE_PML bit is cleared unconditionally (as PML is enabled in creating vcpu). 
Therefore if vcpu_cpuid_update is called after vcpu is created, PML will be disabled unexpectedly while log-dirty code still thinks PML is used. Fix this by clearing SECONDARY_EXEC_ENABLE_PML in vmx_secondary_exec_control only when PML is not supported or not enabled (!enable_pml). This is more reasonable as PML is currently either always enabled or disabled. With this explicit updating SECONDARY_EXEC_ENABLE_PML in vmx_enable{disable}_pml is not needed so also rename vmx_enable{disable}_pml to vmx_create{destroy}_pml_buffer. Fixes: feda805fe7c4ed9cf78158e73b1218752e3b4314 Signed-off-by: Kai Huang [While at it, change a wrong ASSERT to an "if". The condition can happen if creating the VCPU fails with ENOMEM. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2ac116417583..5eb56ed77c1f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4718,8 +4718,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) a current VMCS12 */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - /* PML is enabled/disabled in creating/destorying vcpu */ - exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + + if (!enable_pml) + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; /* Currently, we allow L1 guest to directly run pcommit instruction. */ exec_control &= ~SECONDARY_EXEC_PCOMMIT; @@ -7804,7 +7805,7 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = vmcs_read32(VM_EXIT_INTR_INFO); } -static int vmx_enable_pml(struct vcpu_vmx *vmx) +static int vmx_create_pml_buffer(struct vcpu_vmx *vmx) { struct page *pml_pg; @@ -7817,18 +7818,15 @@ static int vmx_enable_pml(struct vcpu_vmx *vmx) vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_ENABLE_PML); - return 0; } -static void vmx_disable_pml(struct vcpu_vmx *vmx) +static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) { - ASSERT(vmx->pml_pg); - __free_page(vmx->pml_pg); - vmx->pml_pg = NULL; - - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_ENABLE_PML); + if (vmx->pml_pg) { + __free_page(vmx->pml_pg); + vmx->pml_pg = NULL; + } } static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) @@ -8706,7 +8704,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); if (enable_pml) - vmx_disable_pml(vmx); + vmx_destroy_pml_buffer(vmx); free_vpid(vmx->vpid); leave_guest_mode(vcpu); vmx_load_vmcs01(vcpu); @@ -8790,7 +8788,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) * for the guest, etc. */ if (enable_pml) { - err = vmx_enable_pml(vmx); + err = vmx_create_pml_buffer(vmx); if (err) goto free_vmcs; } -- cgit v1.2.3 From 25add7ec708170e4eaef1f9793a07803b2fb5c71 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 5 Nov 2015 18:51:03 -0800 Subject: kasan: update log messages We decided to use KASAN as the short name of the tool and KernelAddressSanitizer as the full one. Update log messages according to that. 
Signed-off-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Konstantin Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/kasan_init_64.c | 2 +- mm/kasan/kasan.c | 2 +- mm/kasan/report.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 9ce5da27b136..d470cf219a2d 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -126,5 +126,5 @@ void __init kasan_init(void) __flush_tlb_all(); init_task.kasan_depth = 0; - pr_info("Kernel address sanitizer initialized\n"); + pr_info("KernelAddressSanitizer initialized\n"); } diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 1104cb0171e2..be9b78d7c2c4 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -518,7 +518,7 @@ static int kasan_mem_notifier(struct notifier_block *nb, static int __init kasan_memhotplug_init(void) { - pr_err("WARNING: KASan doesn't support memory hot-add\n"); + pr_err("WARNING: KASAN doesn't support memory hot-add\n"); pr_err("Memory hot-add will be disabled\n"); hotplug_memory_notifier(kasan_mem_notifier, 0); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index be53a8f9e5dd..ae6bd3697c91 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -91,7 +91,7 @@ static void print_error_description(struct kasan_access_info *info) break; } - pr_err("BUG: KASan: %s in %pS at addr %p\n", + pr_err("BUG: KASAN: %s in %pS at addr %p\n", bug_type, (void *)info->ip, info->access_addr); pr_err("%s of size %zu by task %s/%d\n", @@ -224,7 +224,7 @@ static void kasan_report_error(struct kasan_access_info *info) bug_type = "user-memory-access"; else bug_type = "wild-memory-access"; - pr_err("BUG: KASan: %s on address %p\n", + pr_err("BUG: KASAN: %s on address %p\n", bug_type, info->access_addr); pr_err("%s of size %zu by task %s/%d\n", info->is_write ? "Write" : "Read", -- cgit v1.2.3 From c63f06dd15797736c31dc86e6392811d0bac34a1 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 5 Nov 2015 18:51:09 -0800 Subject: kasan: move KASAN_SANITIZE in arch/x86/boot/Makefile Move KASAN_SANITIZE in arch/x86/boot/Makefile above the comment related to SVGA_MODE, since the comment refers to 'the next line'. Signed-off-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Konstantin Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/boot/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 0d553e54171b..2ee62dba0373 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -9,13 +9,13 @@ # Changed by many, many contributors over the years. # +KASAN_SANITIZE := n + # If you want to preset the SVGA mode, uncomment the next line and # set SVGA_MODE to whatever number you want. # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. # The number is the same as you would ordinarily press at bootup. -KASAN_SANITIZE := n - SVGA_MODE := -DSVGA_MODE=NORMAL_VGA targets := vmlinux.bin setup.bin setup.elf bzImage -- cgit v1.2.3 From a8ca5d0ecbdde5cc3d7accacbd69968b0c98764e Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 5 Nov 2015 18:51:33 -0800 Subject: mm: mlock: add new mlock system call With the refactored mlock code, introduce a new system call for mlock. The new call will allow the user to specify what lock states are being added. 
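A hedged userspace sketch of invoking it (glibc had no wrapper at this point, and the syscall number is the x86-64 entry added below):

    #include <sys/syscall.h>
    #include <unistd.h>

    #ifndef __NR_mlock2
    #define __NR_mlock2 325     /* x86-64, from this patch */
    #endif

    /* flags must be 0 for now; anything else gets -EINVAL from the
     * stub below. */
    static long mlock2_raw(void *addr, size_t len, int flags)
    {
        return syscall(__NR_mlock2, addr, len, flags);
    }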
mlock2 is trivial at the moment, but a follow on patch will add a new mlock state making it useful. Signed-off-by: Eric B Munson Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Heiko Carstens Cc: Geert Uytterhoeven Cc: Catalin Marinas Cc: Stephen Rothwell Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Michael Kerrisk Cc: Ralf Baechle Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 2 ++ include/uapi/asm-generic/unistd.h | 4 +++- kernel/sys_ni.c | 1 + mm/mlock.c | 8 ++++++++ 6 files changed, 16 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index caa2c712d1e7..f17705e1332c 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -382,3 +382,4 @@ 373 i386 shutdown sys_shutdown 374 i386 userfaultfd sys_userfaultfd 375 i386 membarrier sys_membarrier +376 i386 mlock2 sys_mlock2 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 278842fdf1f6..314a90bfc09c 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -331,6 +331,7 @@ 322 64 execveat stub_execveat 323 common userfaultfd sys_userfaultfd 324 common membarrier sys_membarrier +325 common mlock2 sys_mlock2 # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a460e2ef2843..a156b82dd14c 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -887,4 +887,6 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename, asmlinkage long sys_membarrier(int cmd, int flags); +asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags); + #endif diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index ee124009e12a..1324b0292ec2 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -713,9 +713,11 @@ __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat) __SYSCALL(__NR_userfaultfd, sys_userfaultfd) #define __NR_membarrier 283 __SYSCALL(__NR_membarrier, sys_membarrier) +#define __NR_mlock2 284 +__SYSCALL(__NR_mlock2, sys_mlock2) #undef __NR_syscalls -#define __NR_syscalls 284 +#define __NR_syscalls 285 /* * All syscalls below here should go away really, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index a02decf15583..0623787ec67a 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -194,6 +194,7 @@ cond_syscall(sys_mlock); cond_syscall(sys_munlock); cond_syscall(sys_mlockall); cond_syscall(sys_munlockall); +cond_syscall(sys_mlock2); cond_syscall(sys_mincore); cond_syscall(sys_madvise); cond_syscall(sys_mremap); diff --git a/mm/mlock.c b/mm/mlock.c index fbd8c03f7b37..35dcf8fa7195 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -644,6 +644,14 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) return do_mlock(start, len, VM_LOCKED); } +SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags) +{ + if (flags) + return -EINVAL; + + return do_mlock(start, len, VM_LOCKED); +} + SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; -- cgit v1.2.3 From e2391a2dcaca41c6de0fbc25329969fc7142f20d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 5 Nov 2015 15:18:03 -0600 Subject: livepatch: Fix crash with 
!CONFIG_DEBUG_SET_MODULE_RONX When loading a patch module on a kernel with !CONFIG_DEBUG_SET_MODULE_RONX, the following crash occurs: [ 205.988776] livepatch: enabling patch 'kpatch_meminfo_string' [ 205.989829] BUG: unable to handle kernel paging request at ffffffffa08d2fc0 [ 205.989863] IP: [] do_init_module+0x8c/0x1ba [ 205.989888] PGD 1a10067 PUD 1a11063 PMD 7bcde067 PTE 3740e161 [ 205.989915] Oops: 0003 [#1] SMP [ 205.990187] CPU: 2 PID: 14570 Comm: insmod Tainted: G O K 4.1.12 [ 205.990214] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.8.1-20150318_183358- 04/01/2014 [ 205.990249] task: ffff8800374aaa90 ti: ffff8800794b8000 task.ti: ffff8800794b8000 [ 205.990276] RIP: 0010:[] [] do_init_module+0x8c/0x1ba [ 205.990307] RSP: 0018:ffff8800794bbd58 EFLAGS: 00010246 [ 205.990327] RAX: 0000000000000000 RBX: ffffffffa08d2fc0 RCX: 0000000000000000 [ 205.990356] RDX: 01ffff8000000080 RSI: 0000000000000000 RDI: ffffffff81a54b40 [ 205.990382] RBP: ffff88007b4c4d80 R08: 0000000000000007 R09: 0000000000000000 [ 205.990408] R10: 0000000000000008 R11: ffffea0001f18840 R12: 0000000000000000 [ 205.990433] R13: 0000000000000001 R14: ffffffffa08d2fc0 R15: ffff88007bd0bc40 [ 205.990459] FS: 00007f1128fbc700(0000) GS:ffff88007fc80000(0000) knlGS:0000000000000000 [ 205.990488] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 205.990509] CR2: ffffffffa08d2fc0 CR3: 000000002606e000 CR4: 00000000001406e0 [ 205.990536] Stack: [ 205.990545] ffff8800794bbec8 0000000000000001 ffffffffa08d3010 ffffffff810ecea9 [ 205.990576] ffffffff810e8e40 000000000005f360 ffff88007bd0bc50 ffffffffa08d3240 [ 205.990608] ffffffffa08d52c0 ffffffffa08d3210 ffff8800794bbed8 ffff8800794bbf1c [ 205.990639] Call Trace: [ 205.990651] [] ? load_module+0x1e59/0x23a0 [ 205.990672] [] ? store_uevent+0x40/0x40 [ 205.990693] [] ? copy_module_from_fd.isra.49+0xb5/0x140 [ 205.990718] [] ? SyS_finit_module+0x7d/0xa0 [ 205.990741] [] ? system_call_fastpath+0x16/0x75 [ 205.990763] Code: f9 00 00 00 74 23 49 c7 c0 92 e1 60 81 48 8d 53 18 89 c1 4c 89 c6 48 c7 c7 f0 85 7d 81 31 c0 e8 71 fa ff ff e8 58 0e 00 00 31 f6 03 00 00 00 00 48 89 da 48 c7 c7 20 c7 a5 81 e8 d0 ec b3 ff [ 205.990916] RIP [] do_init_module+0x8c/0x1ba [ 205.990940] RSP [ 205.990953] CR2: ffffffffa08d2fc0 With !CONFIG_DEBUG_SET_MODULE_RONX, module text and rodata pages are writable, and the debug_align() macro allows the module struct to share a page with executable text. When klp_write_module_reloc() calls set_memory_ro() on the page, it effectively turns the module struct into a read-only structure, resulting in a page fault when load_module() does "mod->state = MODULE_STATE_LIVE". Reported-by: Cyril B. Tested-by: Cyril B. 
Signed-off-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- arch/x86/kernel/livepatch.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c index ff3c3101d003..d1d35ccffed3 100644 --- a/arch/x86/kernel/livepatch.c +++ b/arch/x86/kernel/livepatch.c @@ -42,7 +42,6 @@ int klp_write_module_reloc(struct module *mod, unsigned long type, bool readonly; unsigned long val; unsigned long core = (unsigned long)mod->module_core; - unsigned long core_ro_size = mod->core_ro_size; unsigned long core_size = mod->core_size; switch (type) { @@ -70,10 +69,12 @@ int klp_write_module_reloc(struct module *mod, unsigned long type, /* loc does not point to any symbol inside the module */ return -EINVAL; - if (loc < core + core_ro_size) + readonly = false; + +#ifdef CONFIG_DEBUG_SET_MODULE_RONX + if (loc < core + mod->core_ro_size) readonly = true; - else - readonly = false; +#endif /* determine if the relocation spans a page boundary */ numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 1 : 2; -- cgit v1.2.3 From 54727e6e950aacd14ec9cd4260e9fe498322828c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 6 Nov 2015 09:12:41 -0800 Subject: x86: don't make DEBUG_WX default to 'y' even with DEBUG_RODATA It turns out that we still have issues with the EFI memory map that ends up polluting our kernel page tables with writable executable pages. That will get sorted out, but in the meantime let's not make the scary complaint about them be on by default. The code is useful for developers, but not ready for end user testing yet. Acked-by: Borislav Petkov Acked-by: Ingo Molnar Signed-off-by: Linus Torvalds --- arch/x86/Kconfig.debug | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 3e0baf726eef..137dfa96aa14 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -113,7 +113,6 @@ config DEBUG_RODATA_TEST config DEBUG_WX bool "Warn on W+X mappings at boot" depends on DEBUG_RODATA - default y select X86_PTDUMP_CORE ---help--- Generate a warning if any W+X mappings are found at boot. -- cgit v1.2.3 From ec2c6c01ff241f8fad95c79de3364c76a474b3fa Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Sun, 25 Oct 2015 21:41:10 +0100 Subject: um: Remove dead code from x86_64 syscall stub syscall_stub is dead code as um is using only batch_syscall_stub. Signed-off-by: Richard Weinberger --- arch/x86/um/stub_64.S | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/stub_64.S b/arch/x86/um/stub_64.S index 7160b20172d0..a212445358d6 100644 --- a/arch/x86/um/stub_64.S +++ b/arch/x86/um/stub_64.S @@ -1,19 +1,6 @@ #include - .globl syscall_stub .section .__syscall_stub, "ax" -syscall_stub: - syscall - /* We don't have 64-bit constants, so this constructs the address - * we need. - */ - movq $(STUB_DATA >> 32), %rbx - salq $32, %rbx - movq $(STUB_DATA & 0xffffffff), %rcx - or %rcx, %rbx - movq %rax, (%rbx) - int3 - .globl batch_syscall_stub batch_syscall_stub: mov $(STUB_DATA >> 32), %rbx -- cgit v1.2.3 From 246d254f1ab62bc83ac84a8ebd263e5384412ce8 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Sun, 25 Oct 2015 21:41:10 +0100 Subject: um: Remove dead symbol from i386 syscall stub syscall_stub is nowhere used these days. 
Signed-off-by: Richard Weinberger --- arch/x86/um/stub_32.S | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/um/stub_32.S b/arch/x86/um/stub_32.S index b972649d3a18..98816804e131 100644 --- a/arch/x86/um/stub_32.S +++ b/arch/x86/um/stub_32.S @@ -1,6 +1,5 @@ #include - .globl syscall_stub .section .__syscall_stub, "ax" .globl batch_syscall_stub -- cgit v1.2.3 From 1b2411c283e8e178b1c57d07f7fe082442a0927b Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Mon, 26 Oct 2015 10:38:27 +0100 Subject: um: Simplify STUB_DATA loading As long as STUB_DATA fits into 32 bits we can use a plain mov. If it grows at some point in the future, we will switch to movabsq. In any case the code is smaller and easier to read than the current one. Signed-off-by: Richard Weinberger --- arch/x86/um/stub_64.S | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/stub_64.S b/arch/x86/um/stub_64.S index a212445358d6..ba914b3b8cc4 100644 --- a/arch/x86/um/stub_64.S +++ b/arch/x86/um/stub_64.S @@ -3,10 +3,7 @@ .section .__syscall_stub, "ax" .globl batch_syscall_stub batch_syscall_stub: - mov $(STUB_DATA >> 32), %rbx - sal $32, %rbx - mov $(STUB_DATA & 0xffffffff), %rax - or %rax, %rbx + mov $(STUB_DATA), %rbx /* load pointer to first operation */ mov %rbx, %rsp add $0x10, %rsp -- cgit v1.2.3 From d0164adc89f6bb374d304ffcc375c6d2652fe67d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 6 Nov 2015 16:28:21 -0800 Subject: mm, page_alloc: distinguish between being unable to sleep, unwilling to sleep and avoiding waking kswapd __GFP_WAIT has been used to identify atomic context in callers that hold spinlocks or are in interrupts. They are expected to be high priority and have access to one of two watermarks lower than "min" which can be referred to as the "atomic reserve". __GFP_HIGH users get access to the first lower watermark and can be called the "high priority reserve". Over time, callers had a requirement to not block when fallback options were available. Some have abused __GFP_WAIT leading to a situation where an optimistic allocation with a fallback option can access atomic reserves. This patch uses __GFP_ATOMIC to identify callers that are truly atomic, cannot sleep and have no alternative. High priority users continue to use __GFP_HIGH. __GFP_DIRECT_RECLAIM identifies callers that can sleep and are willing to enter direct reclaim. __GFP_KSWAPD_RECLAIM identifies callers that want to wake kswapd for background reclaim. __GFP_WAIT is redefined as a caller that is willing to enter direct reclaim and wake kswapd for background reclaim. This patch then converts a number of sites: o __GFP_ATOMIC is used by callers that are high priority and have memory pools for those requests. GFP_ATOMIC uses this flag. o Callers that have a limited mempool to guarantee forward progress clear __GFP_DIRECT_RECLAIM but keep __GFP_KSWAPD_RECLAIM. bio allocations fall into this category where kswapd will still be woken but atomic reserves are not used as there is a one-entry mempool to guarantee progress. o Callers that are checking if they are non-blocking should use the helper gfpflags_allow_blocking() where possible. This is because checking for __GFP_WAIT as was done historically now can trigger false positives. Some exceptions like dm-crypt.c exist where the code intent is clearer if __GFP_DIRECT_RECLAIM is used instead of the helper due to flag manipulations.
o Callers that built their own GFP flags instead of starting with GFP_KERNEL and friends now also need to specify __GFP_KSWAPD_RECLAIM. The first key hazard to watch out for is callers that removed __GFP_WAIT and were depending on access to atomic reserves for inconspicuous reasons. In some cases it may be appropriate for them to use __GFP_HIGH. The second key hazard is callers that assembled their own combination of GFP flags instead of starting with something like GFP_KERNEL. They may now wish to specify __GFP_KSWAPD_RECLAIM. It's almost certainly harmless if it's missed in most cases, as other activity will wake kswapd. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Christoph Lameter Cc: David Rientjes Cc: Vitaly Wool Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/balance | 14 ++++--- arch/arm/mm/dma-mapping.c | 6 +-- arch/arm/xen/mm.c | 2 +- arch/arm64/mm/dma-mapping.c | 4 +- arch/x86/kernel/pci-dma.c | 2 +- block/bio.c | 26 ++++++------ block/blk-core.c | 16 ++++---- block/blk-ioc.c | 2 +- block/blk-mq-tag.c | 2 +- block/blk-mq.c | 6 +-- drivers/block/drbd/drbd_receiver.c | 3 +- drivers/block/osdblk.c | 2 +- drivers/connector/connector.c | 3 +- drivers/firewire/core-cdev.c | 2 +- drivers/gpu/drm/i915/i915_gem.c | 2 +- drivers/infiniband/core/sa_query.c | 2 +- drivers/iommu/amd_iommu.c | 2 +- drivers/iommu/intel-iommu.c | 2 +- drivers/md/dm-crypt.c | 6 +-- drivers/md/dm-kcopyd.c | 2 +- drivers/media/pci/solo6x10/solo6x10-v4l2-enc.c | 2 +- drivers/media/pci/solo6x10/solo6x10-v4l2.c | 2 +- drivers/media/pci/tw68/tw68-video.c | 2 +- drivers/mtd/mtdcore.c | 3 +- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 2 +- drivers/staging/android/ion/ion_system_heap.c | 2 +- .../lustre/include/linux/libcfs/libcfs_private.h | 2 +- drivers/usb/host/u132-hcd.c | 2 +- drivers/video/fbdev/vermilion/vermilion.c | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 14 +++---- fs/btrfs/volumes.c | 4 +- fs/ext4/super.c | 2 +- fs/fscache/cookie.c | 2 +- fs/fscache/page.c | 6 +-- fs/jbd2/transaction.c | 4 +- fs/nfs/file.c | 6 +-- fs/xfs/xfs_qm.c | 2 +- include/linux/gfp.h | 46 ++++++++++++++++------ include/linux/skbuff.h | 6 +-- include/net/sock.h | 2 +- include/trace/events/gfpflags.h | 5 ++- kernel/audit.c | 6 +-- kernel/cgroup.c | 2 +- kernel/locking/lockdep.c | 2 +- kernel/power/snapshot.c | 2 +- kernel/smp.c | 2 +- lib/idr.c | 4 +- lib/radix-tree.c | 10 ++--- mm/backing-dev.c | 2 +- mm/dmapool.c | 2 +- mm/memcontrol.c | 6 +-- mm/mempool.c | 10 ++--- mm/migrate.c | 2 +- mm/page_alloc.c | 43 ++++++++++++-------- mm/slab.c | 18 ++++----- mm/slub.c | 10 ++--- mm/vmalloc.c | 2 +- mm/vmscan.c | 4 +- mm/zswap.c | 5 ++- net/core/skbuff.c | 8 ++-- net/core/sock.c | 6 ++- net/netlink/af_netlink.c | 2 +- net/rds/ib_recv.c | 4 +- net/rxrpc/ar-connection.c | 2 +- net/sctp/associola.c | 2 +- 66 files changed, 210 insertions(+), 172 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/vm/balance b/Documentation/vm/balance index c46e68cf9344..964595481af6 100644 --- a/Documentation/vm/balance +++ b/Documentation/vm/balance @@ -1,12 +1,14 @@ Started Jan 2000 by Kanoj Sarcar -Memory balancing is needed for non __GFP_WAIT as well as for non -__GFP_IO allocations. +Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as +well as for non __GFP_IO allocations.
-There are two reasons to be requesting non __GFP_WAIT allocations: -the caller can not sleep (typically intr context), or does not want -to incur cost overheads of page stealing and possible swap io for -whatever reasons. +The first reason why a caller may avoid reclaim is that the caller can not +sleep due to holding a spinlock or is in interrupt context. The second may +be that the caller is willing to fail the allocation without incurring the +overhead of page reclaim. This may happen for opportunistic high-order +allocation requests that have order-0 fallback options. In such cases, +the caller may also wish to avoid waking kswapd. __GFP_IO allocation requests are made to prevent file system deadlocks. diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index ad4eb2d26e16..e62400e5fb99 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -651,12 +651,12 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, if (nommu()) addr = __alloc_simple_buffer(dev, size, gfp, &page); - else if (dev_get_cma_area(dev) && (gfp & __GFP_WAIT)) + else if (dev_get_cma_area(dev) && (gfp & __GFP_DIRECT_RECLAIM)) addr = __alloc_from_contiguous(dev, size, prot, &page, caller, want_vaddr); else if (is_coherent) addr = __alloc_simple_buffer(dev, size, gfp, &page); - else if (!(gfp & __GFP_WAIT)) + else if (!gfpflags_allow_blocking(gfp)) addr = __alloc_from_pool(size, &page); else addr = __alloc_remap_buffer(dev, size, gfp, prot, &page, @@ -1363,7 +1363,7 @@ static void *arm_iommu_alloc_attrs(struct device *dev, size_t size, *handle = DMA_ERROR_CODE; size = PAGE_ALIGN(size); - if (!(gfp & __GFP_WAIT)) + if (!gfpflags_allow_blocking(gfp)) return __iommu_alloc_atomic(dev, size, handle); /* diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index 7c34f7126b04..c5f9a9e3d1f3 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -25,7 +25,7 @@ unsigned long xen_get_swiotlb_free_pages(unsigned int order) { struct memblock_region *reg; - gfp_t flags = __GFP_NOWARN; + gfp_t flags = __GFP_NOWARN|__GFP_KSWAPD_RECLAIM; for_each_memblock(memory, reg) { if (reg->base < (phys_addr_t)0xffffffff) { diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 6320361d8d4c..bb4bf6a06ad6 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -100,7 +100,7 @@ static void *__dma_alloc_coherent(struct device *dev, size_t size, if (IS_ENABLED(CONFIG_ZONE_DMA) && dev->coherent_dma_mask <= DMA_BIT_MASK(32)) flags |= GFP_DMA; - if (dev_get_cma_area(dev) && (flags & __GFP_WAIT)) { + if (dev_get_cma_area(dev) && gfpflags_allow_blocking(flags)) { struct page *page; void *addr; @@ -148,7 +148,7 @@ static void *__dma_alloc(struct device *dev, size_t size, size = PAGE_ALIGN(size); - if (!coherent && !(flags & __GFP_WAIT)) { + if (!coherent && !gfpflags_allow_blocking(flags)) { struct page *page = NULL; void *addr = __alloc_from_pool(size, &page, flags); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index cd99433b8ba1..6ba014c61d62 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -90,7 +90,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, again: page = NULL; /* CMA can be used only in the context which permits sleeping */ - if (flag & __GFP_WAIT) { + if (gfpflags_allow_blocking(flag)) { page = dma_alloc_from_contiguous(dev, count, get_order(size)); if (page && page_to_phys(page) + size > dma_mask) { dma_release_from_contiguous(dev, page, count); diff --git a/block/bio.c 
b/block/bio.c index ad3f276d74bc..4f184d938942 100644 --- a/block/bio.c +++ b/block/bio.c @@ -211,7 +211,7 @@ fallback: bvl = mempool_alloc(pool, gfp_mask); } else { struct biovec_slab *bvs = bvec_slabs + *idx; - gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); + gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); /* * Make this allocation restricted and don't dump info on @@ -221,11 +221,11 @@ fallback: __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; /* - * Try a slab allocation. If this fails and __GFP_WAIT + * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM * is set, retry with the 1-entry mempool */ bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); - if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) { + if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) { *idx = BIOVEC_MAX_IDX; goto fallback; } @@ -395,12 +395,12 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is * backed by the @bs's mempool. * - * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be - * able to allocate a bio. This is due to the mempool guarantees. To make this - * work, callers must never allocate more than 1 bio at a time from this pool. - * Callers that need to allocate more than 1 bio must always submit the - * previously allocated bio for IO before attempting to allocate a new one. - * Failure to do so can cause deadlocks under memory pressure. + * When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will + * always be able to allocate a bio. This is due to the mempool guarantees. + * To make this work, callers must never allocate more than 1 bio at a time + * from this pool. Callers that need to allocate more than 1 bio must always + * submit the previously allocated bio for IO before attempting to allocate + * a new one. Failure to do so can cause deadlocks under memory pressure. * * Note that when running under generic_make_request() (i.e. any block * driver), bios are not submitted until after you return - see the code in @@ -459,13 +459,13 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) * We solve this, and guarantee forward progress, with a rescuer * workqueue per bio_set. If we go to allocate and there are * bios on current->bio_list, we first try the allocation - * without __GFP_WAIT; if that fails, we punt those bios we - * would be blocking to the rescuer workqueue before we retry - * with the original gfp_flags. + * without __GFP_DIRECT_RECLAIM; if that fails, we punt those + * bios we would be blocking to the rescuer workqueue before + * we retry with the original gfp_flags. */ if (current->bio_list && !bio_list_empty(current->bio_list)) - gfp_mask &= ~__GFP_WAIT; + gfp_mask &= ~__GFP_DIRECT_RECLAIM; p = mempool_alloc(bs->bio_pool, gfp_mask); if (!p && gfp_mask != saved_gfp) { diff --git a/block/blk-core.c b/block/blk-core.c index 89eec7965870..9e32f0868e36 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1206,8 +1206,8 @@ rq_starved: * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask * - * Get a free request from @q. If %__GFP_WAIT is set in @gfp_mask, this - * function keeps retrying under memory pressure and fails iff @q is dead. + * Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask, + * this function keeps retrying under memory pressure and fails iff @q is dead. 
* * Must be called with @q->queue_lock held and, * Returns ERR_PTR on failure, with @q->queue_lock held. @@ -1227,7 +1227,7 @@ retry: if (!IS_ERR(rq)) return rq; - if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) { + if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) { blk_put_rl(rl); return rq; } @@ -1305,11 +1305,11 @@ EXPORT_SYMBOL(blk_get_request); * BUG. * * WARNING: When allocating/cloning a bio-chain, careful consideration should be - * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for - * anything but the first bio in the chain. Otherwise you risk waiting for IO - * completion of a bio that hasn't been submitted yet, thus resulting in a - * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead - * of bio_alloc(), as that avoids the mempool deadlock. + * given to how you allocate bios. In particular, you cannot use + * __GFP_DIRECT_RECLAIM for anything but the first bio in the chain. Otherwise + * you risk waiting for IO completion of a bio that hasn't been submitted yet, + * thus resulting in a deadlock. Alternatively bios should be allocated using + * bio_kmalloc() instead of bio_alloc(), as that avoids the mempool deadlock. * If possible a big IO should be split into smaller parts when allocation * fails. Partial allocation should not be an error, or you risk a live-lock. */ diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 1a27f45ec776..381cb50a673c 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -289,7 +289,7 @@ struct io_context *get_task_io_context(struct task_struct *task, { struct io_context *ioc; - might_sleep_if(gfp_flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp_flags)); do { task_lock(task); diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 60ac684c8b8c..a07ca3488d96 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -268,7 +268,7 @@ static int bt_get(struct blk_mq_alloc_data *data, if (tag != -1) return tag; - if (!(data->gfp & __GFP_WAIT)) + if (!gfpflags_allow_blocking(data->gfp)) return -1; bs = bt_wait_ptr(bt, hctx); diff --git a/block/blk-mq.c b/block/blk-mq.c index 1c27b3eaef64..68c0a3416b34 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -244,11 +244,11 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT, + blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_DIRECT_RECLAIM, reserved, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); - if (!rq && (gfp & __GFP_WAIT)) { + if (!rq && (gfp & __GFP_DIRECT_RECLAIM)) { __blk_mq_run_hw_queue(hctx); blk_mq_put_ctx(ctx); @@ -1186,7 +1186,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); blk_mq_set_alloc_data(&alloc_data, q, - __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx); + __GFP_WAIT|__GFP_HIGH, false, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); ctx = alloc_data.ctx; hctx = alloc_data.hctx; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c097909c589c..b4b5680ac6ad 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -357,7 +357,8 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto } if (has_payload && data_size) { - page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT)); + page = 
drbd_alloc_pages(peer_device, nr_pages, + gfpflags_allow_blocking(gfp_mask)); if (!page) goto fail; } diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c index e22942596207..1b709a4e3b5e 100644 --- a/drivers/block/osdblk.c +++ b/drivers/block/osdblk.c @@ -271,7 +271,7 @@ static struct bio *bio_chain_clone(struct bio *old_chain, gfp_t gfpmask) goto err_out; tmp->bi_bdev = NULL; - gfpmask &= ~__GFP_WAIT; + gfpmask &= ~__GFP_DIRECT_RECLAIM; tmp->bi_next = NULL; if (!new_chain) diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c index 30f522848c73..d7373ca69c99 100644 --- a/drivers/connector/connector.c +++ b/drivers/connector/connector.c @@ -124,7 +124,8 @@ int cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid, u32 __group, if (group) return netlink_broadcast(dev->nls, skb, portid, group, gfp_mask); - return netlink_unicast(dev->nls, skb, portid, !(gfp_mask&__GFP_WAIT)); + return netlink_unicast(dev->nls, skb, portid, + !gfpflags_allow_blocking(gfp_mask)); } EXPORT_SYMBOL_GPL(cn_netlink_send_mult); diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 2a3973a7c441..36a7c2d89a01 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -486,7 +486,7 @@ static int ioctl_get_info(struct client *client, union ioctl_arg *arg) static int add_client_resource(struct client *client, struct client_resource *resource, gfp_t gfp_mask) { - bool preload = !!(gfp_mask & __GFP_WAIT); + bool preload = gfpflags_allow_blocking(gfp_mask); unsigned long flags; int ret; diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 4d631a946481..d58cb9e034fe 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2215,7 +2215,7 @@ i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj) */ mapping = file_inode(obj->base.filp)->i_mapping; gfp = mapping_gfp_mask(mapping); - gfp |= __GFP_NORETRY | __GFP_NOWARN | __GFP_NO_KSWAPD; + gfp |= __GFP_NORETRY | __GFP_NOWARN; gfp &= ~(__GFP_IO | __GFP_WAIT); sg = st->sgl; st->nents = 0; diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 8c014b33d8e0..59ab264c99c4 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1083,7 +1083,7 @@ static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent) static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) { - bool preload = !!(gfp_mask & __GFP_WAIT); + bool preload = gfpflags_allow_blocking(gfp_mask); unsigned long flags; int ret, id; diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 0d533bba4ad1..8b2be1e7714f 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2668,7 +2668,7 @@ static void *alloc_coherent(struct device *dev, size_t size, page = alloc_pages(flag | __GFP_NOWARN, get_order(size)); if (!page) { - if (!(flag & __GFP_WAIT)) + if (!gfpflags_allow_blocking(flag)) return NULL; page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT, diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 7cf80c1a8a16..f1042daef9ad 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -3647,7 +3647,7 @@ static void *intel_alloc_coherent(struct device *dev, size_t size, flags |= GFP_DMA32; } - if (flags & __GFP_WAIT) { + if (gfpflags_allow_blocking(flags)) { unsigned int count = size >> PAGE_SHIFT; page = dma_alloc_from_contiguous(dev, count, order); diff --git a/drivers/md/dm-crypt.c 
b/drivers/md/dm-crypt.c index 3729b394432c..917d47e290ae 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -994,7 +994,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size) struct bio_vec *bvec; retry: - if (unlikely(gfp_mask & __GFP_WAIT)) + if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) mutex_lock(&cc->bio_alloc_lock); clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs); @@ -1010,7 +1010,7 @@ retry: if (!page) { crypt_free_buffer_pages(cc, clone); bio_put(clone); - gfp_mask |= __GFP_WAIT; + gfp_mask |= __GFP_DIRECT_RECLAIM; goto retry; } @@ -1027,7 +1027,7 @@ retry: } return_clone: - if (unlikely(gfp_mask & __GFP_WAIT)) + if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) mutex_unlock(&cc->bio_alloc_lock); return clone; diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 3a7cade5e27d..1452ed9aacb4 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -244,7 +244,7 @@ static int kcopyd_get_pages(struct dm_kcopyd_client *kc, *pages = NULL; do { - pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY); + pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY | __GFP_KSWAPD_RECLAIM); if (unlikely(!pl)) { /* Use reserved pages */ pl = kc->pages; diff --git a/drivers/media/pci/solo6x10/solo6x10-v4l2-enc.c b/drivers/media/pci/solo6x10/solo6x10-v4l2-enc.c index 1bd2fd47421f..4432fd69b7cb 100644 --- a/drivers/media/pci/solo6x10/solo6x10-v4l2-enc.c +++ b/drivers/media/pci/solo6x10/solo6x10-v4l2-enc.c @@ -1297,7 +1297,7 @@ static struct solo_enc_dev *solo_enc_alloc(struct solo_dev *solo_dev, solo_enc->vidq.ops = &solo_enc_video_qops; solo_enc->vidq.mem_ops = &vb2_dma_sg_memops; solo_enc->vidq.drv_priv = solo_enc; - solo_enc->vidq.gfp_flags = __GFP_DMA32; + solo_enc->vidq.gfp_flags = __GFP_DMA32 | __GFP_KSWAPD_RECLAIM; solo_enc->vidq.timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; solo_enc->vidq.buf_struct_size = sizeof(struct solo_vb2_buf); solo_enc->vidq.lock = &solo_enc->lock; diff --git a/drivers/media/pci/solo6x10/solo6x10-v4l2.c b/drivers/media/pci/solo6x10/solo6x10-v4l2.c index 26df903585d7..f7ce493b1fee 100644 --- a/drivers/media/pci/solo6x10/solo6x10-v4l2.c +++ b/drivers/media/pci/solo6x10/solo6x10-v4l2.c @@ -678,7 +678,7 @@ int solo_v4l2_init(struct solo_dev *solo_dev, unsigned nr) solo_dev->vidq.mem_ops = &vb2_dma_contig_memops; solo_dev->vidq.drv_priv = solo_dev; solo_dev->vidq.timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; - solo_dev->vidq.gfp_flags = __GFP_DMA32; + solo_dev->vidq.gfp_flags = __GFP_DMA32 | __GFP_KSWAPD_RECLAIM; solo_dev->vidq.buf_struct_size = sizeof(struct solo_vb2_buf); solo_dev->vidq.lock = &solo_dev->lock; ret = vb2_queue_init(&solo_dev->vidq); diff --git a/drivers/media/pci/tw68/tw68-video.c b/drivers/media/pci/tw68/tw68-video.c index 4c3293dcddbc..46642ef9151b 100644 --- a/drivers/media/pci/tw68/tw68-video.c +++ b/drivers/media/pci/tw68/tw68-video.c @@ -979,7 +979,7 @@ int tw68_video_init2(struct tw68_dev *dev, int video_nr) dev->vidq.ops = &tw68_video_qops; dev->vidq.mem_ops = &vb2_dma_sg_memops; dev->vidq.drv_priv = dev; - dev->vidq.gfp_flags = __GFP_DMA32; + dev->vidq.gfp_flags = __GFP_DMA32 | __GFP_KSWAPD_RECLAIM; dev->vidq.buf_struct_size = sizeof(struct tw68_buf); dev->vidq.lock = &dev->lock; dev->vidq.min_buffers_needed = 2; diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 8bbbb751bf45..2dfb291a47c6 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -1188,8 +1188,7 @@ EXPORT_SYMBOL_GPL(mtd_writev); */ void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size) { 
- gfp_t flags = __GFP_NOWARN | __GFP_WAIT | - __GFP_NORETRY | __GFP_NO_KSWAPD; + gfp_t flags = __GFP_NOWARN | __GFP_DIRECT_RECLAIM | __GFP_NORETRY; size_t min_alloc = max_t(size_t, mtd->writesize, PAGE_SIZE); void *kbuf; diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 44173be5cbf0..f8d7a2f06950 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -691,7 +691,7 @@ static void *bnx2x_frag_alloc(const struct bnx2x_fastpath *fp, gfp_t gfp_mask) { if (fp->rx_frag_size) { /* GFP_KERNEL allocations are used only during initialization */ - if (unlikely(gfp_mask & __GFP_WAIT)) + if (unlikely(gfpflags_allow_blocking(gfp_mask))) return (void *)__get_free_page(gfp_mask); return netdev_alloc_frag(fp->rx_frag_size); diff --git a/drivers/staging/android/ion/ion_system_heap.c b/drivers/staging/android/ion/ion_system_heap.c index ada724aab3d5..d4c3e5512dd5 100644 --- a/drivers/staging/android/ion/ion_system_heap.c +++ b/drivers/staging/android/ion/ion_system_heap.c @@ -27,7 +27,7 @@ #include "ion_priv.h" static gfp_t high_order_gfp_flags = (GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN | - __GFP_NORETRY) & ~__GFP_WAIT; + __GFP_NORETRY) & ~__GFP_DIRECT_RECLAIM; static gfp_t low_order_gfp_flags = (GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN); static const unsigned int orders[] = {8, 4, 0}; static const int num_orders = ARRAY_SIZE(orders); diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h index 6af733de69ca..f0b0423a716b 100644 --- a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h @@ -95,7 +95,7 @@ do { \ do { \ LASSERT(!in_interrupt() || \ ((size) <= LIBCFS_VMALLOC_SIZE && \ - ((mask) & __GFP_WAIT) == 0)); \ + !gfpflags_allow_blocking(mask))); \ } while (0) #define LIBCFS_ALLOC_POST(ptr, size) \ diff --git a/drivers/usb/host/u132-hcd.c b/drivers/usb/host/u132-hcd.c index 0a94895a358d..692ccc69345e 100644 --- a/drivers/usb/host/u132-hcd.c +++ b/drivers/usb/host/u132-hcd.c @@ -2244,7 +2244,7 @@ static int u132_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, { struct u132 *u132 = hcd_to_u132(hcd); if (irqs_disabled()) { - if (__GFP_WAIT & mem_flags) { + if (gfpflags_allow_blocking(mem_flags)) { printk(KERN_ERR "invalid context for function that might sleep\n"); return -EINVAL; } diff --git a/drivers/video/fbdev/vermilion/vermilion.c b/drivers/video/fbdev/vermilion/vermilion.c index 6b70d7f62b2f..1c1e95a0b8fa 100644 --- a/drivers/video/fbdev/vermilion/vermilion.c +++ b/drivers/video/fbdev/vermilion/vermilion.c @@ -99,7 +99,7 @@ static int vmlfb_alloc_vram_area(struct vram_area *va, unsigned max_order, * below the first 16MB. 
*/ - flags = __GFP_DMA | __GFP_HIGH; + flags = __GFP_DMA | __GFP_HIGH | __GFP_KSWAPD_RECLAIM; va->logical = __get_free_pages(flags, --max_order); } while (va->logical == 0 && max_order > min_order); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1e60d00d4ea7..c339d561e596 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2572,7 +2572,7 @@ int open_ctree(struct super_block *sb, fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ /* readahead state */ - INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); + INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); spin_lock_init(&fs_info->reada_lock); fs_info->thread_pool_size = min_t(unsigned long, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3915c9473e94..032abfbebe76 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -594,7 +594,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) clear = 1; again: - if (!prealloc && (mask & __GFP_WAIT)) { + if (!prealloc && gfpflags_allow_blocking(mask)) { /* * Don't care for allocation failure here because we might end * up not needing the pre-allocated extent state at all, which @@ -718,7 +718,7 @@ search_again: if (start > end) goto out; spin_unlock(&tree->lock); - if (mask & __GFP_WAIT) + if (gfpflags_allow_blocking(mask)) cond_resched(); goto again; } @@ -850,7 +850,7 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, bits |= EXTENT_FIRST_DELALLOC; again: - if (!prealloc && (mask & __GFP_WAIT)) { + if (!prealloc && gfpflags_allow_blocking(mask)) { prealloc = alloc_extent_state(mask); BUG_ON(!prealloc); } @@ -1028,7 +1028,7 @@ search_again: if (start > end) goto out; spin_unlock(&tree->lock); - if (mask & __GFP_WAIT) + if (gfpflags_allow_blocking(mask)) cond_resched(); goto again; } @@ -1076,7 +1076,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, btrfs_debug_check_extent_io_range(tree, start, end); again: - if (!prealloc && (mask & __GFP_WAIT)) { + if (!prealloc && gfpflags_allow_blocking(mask)) { /* * Best effort, don't worry if extent state allocation fails * here for the first iteration. 
We might have a cached state @@ -1253,7 +1253,7 @@ search_again: if (start > end) goto out; spin_unlock(&tree->lock); - if (mask & __GFP_WAIT) + if (gfpflags_allow_blocking(mask)) cond_resched(); first_iteration = false; goto again; @@ -4319,7 +4319,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; - if ((mask & __GFP_WAIT) && + if (gfpflags_allow_blocking(mask) && page->mapping->host->i_size > 16 * 1024 * 1024) { u64 len; while (start <= end) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6fc735869c18..e023919b4470 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -156,8 +156,8 @@ static struct btrfs_device *__alloc_device(void) spin_lock_init(&dev->reada_lock); atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); - INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT); - INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT); + INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); + INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); return dev; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a63c7b0a10cf..49f6c78ee3af 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1058,7 +1058,7 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page, return 0; if (journal) return jbd2_journal_try_to_free_buffers(journal, page, - wait & ~__GFP_WAIT); + wait & ~__GFP_DIRECT_RECLAIM); return try_to_free_buffers(page); } diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index d403c69bee08..4304072161aa 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -111,7 +111,7 @@ struct fscache_cookie *__fscache_acquire_cookie( /* radix tree insertion won't use the preallocation pool unless it's * told it may not wait */ - INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_WAIT); + INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); switch (cookie->def->type) { case FSCACHE_COOKIE_TYPE_INDEX: diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 483bbc613bf0..79483b3d8c6f 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -58,7 +58,7 @@ bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page) /* * decide whether a page can be released, possibly by cancelling a store to it - * - we're allowed to sleep if __GFP_WAIT is flagged + * - we're allowed to sleep if __GFP_DIRECT_RECLAIM is flagged */ bool __fscache_maybe_release_page(struct fscache_cookie *cookie, struct page *page, @@ -122,7 +122,7 @@ page_busy: * allocator as the work threads writing to the cache may all end up * sleeping on memory allocation, so we may need to impose a timeout * too. */ - if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) { + if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) { fscache_stat(&fscache_n_store_vmscan_busy); return false; } @@ -132,7 +132,7 @@ page_busy: _debug("fscache writeout timeout page: %p{%lx}", page, page->index); - gfp &= ~__GFP_WAIT; + gfp &= ~__GFP_DIRECT_RECLAIM; goto try_again; } EXPORT_SYMBOL(__fscache_maybe_release_page); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 6b8338ec2464..89463eee6791 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1937,8 +1937,8 @@ out: * @journal: journal for operation * @page: to try and free * @gfp_mask: we use the mask to detect how hard should we try to release - * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to - * release the buffers. 
+ * buffers. If __GFP_DIRECT_RECLAIM and __GFP_FS is set, we wait for commit + * code to release the buffers. * * * For all the buffers on this page, diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 37f639d50af5..93e236429c5d 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -473,8 +473,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp) dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); /* Always try to initiate a 'commit' if relevant, but only - * wait for it if __GFP_WAIT is set. Even then, only wait 1 - * second and only if the 'bdi' is not congested. + * wait for it if the caller allows blocking. Even then, + * only wait 1 second and only if the 'bdi' is not congested. * Waiting indefinitely can cause deadlocks when the NFS * server is on this machine, when a new TCP connection is * needed and in other rare cases. There is no particular @@ -484,7 +484,7 @@ static int nfs_release_page(struct page *page, gfp_t gfp) if (mapping) { struct nfs_server *nfss = NFS_SERVER(mapping->host); nfs_commit_inode(mapping->host, 0); - if ((gfp & __GFP_WAIT) && + if (gfpflags_allow_blocking(gfp) && !bdi_write_congested(&nfss->backing_dev_info)) { wait_on_page_bit_killable_timeout(page, PG_private, HZ); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index eac9549efd52..587174fd4f2c 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -525,7 +525,7 @@ xfs_qm_shrink_scan( unsigned long freed; int error; - if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) + if ((sc->gfp_mask & (__GFP_FS|__GFP_DIRECT_RECLAIM)) != (__GFP_FS|__GFP_DIRECT_RECLAIM)) return 0; INIT_LIST_HEAD(&isol.buffers); diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 440fca3e7e5d..b56e811b6f7c 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -29,12 +29,13 @@ struct vm_area_struct; #define ___GFP_NOMEMALLOC 0x10000u #define ___GFP_HARDWALL 0x20000u #define ___GFP_THISNODE 0x40000u -#define ___GFP_WAIT 0x80000u +#define ___GFP_ATOMIC 0x80000u #define ___GFP_NOACCOUNT 0x100000u #define ___GFP_NOTRACK 0x200000u -#define ___GFP_NO_KSWAPD 0x400000u +#define ___GFP_DIRECT_RECLAIM 0x400000u #define ___GFP_OTHER_NODE 0x800000u #define ___GFP_WRITE 0x1000000u +#define ___GFP_KSWAPD_RECLAIM 0x2000000u /* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* @@ -71,7 +72,7 @@ struct vm_area_struct; * __GFP_MOVABLE: Flag that this page will be movable by the page migration * mechanism or reclaimed */ -#define __GFP_WAIT ((__force gfp_t)___GFP_WAIT) /* Can wait and reschedule? */ +#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC) /* Caller cannot wait or reschedule */ #define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) /* Should access emergency pools? */ #define __GFP_IO ((__force gfp_t)___GFP_IO) /* Can start physical IO? */ #define __GFP_FS ((__force gfp_t)___GFP_FS) /* Can call down to low-level FS? */ @@ -94,23 +95,37 @@ struct vm_area_struct; #define __GFP_NOACCOUNT ((__force gfp_t)___GFP_NOACCOUNT) /* Don't account to kmemcg */ #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ -#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ +/* + * A caller that is willing to wait may enter direct reclaim and will + * wake kswapd to reclaim pages in the background until the high + * watermark is met. 
A caller may wish to clear __GFP_DIRECT_RECLAIM to + * avoid unnecessary delays when a fallback option is available but + * still allow kswapd to reclaim in the background. The kswapd flag + * can be cleared when the reclaiming of pages would cause unnecessary + * disruption. + */ +#define __GFP_WAIT ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM)) +#define __GFP_DIRECT_RECLAIM ((__force gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can reclaim */ +#define __GFP_KSWAPD_RECLAIM ((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */ + /* * This may seem redundant, but it's a way of annotating false positives vs. * allocations that simply cannot be supported (e.g. page tables). */ #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) -#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 26 /* Room for N __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) -/* This equals 0, but use constants in case they ever change */ -#define GFP_NOWAIT (GFP_ATOMIC & ~__GFP_HIGH) -/* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */ -#define GFP_ATOMIC (__GFP_HIGH) +/* + * GFP_ATOMIC callers can not sleep, need the allocation to succeed. + * A lower watermark is applied to allow access to "atomic reserves" + */ +#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) +#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) #define GFP_NOIO (__GFP_WAIT) #define GFP_NOFS (__GFP_WAIT | __GFP_IO) #define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS) @@ -119,10 +134,10 @@ struct vm_area_struct; #define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL) #define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM) #define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE) -#define GFP_IOFS (__GFP_IO | __GFP_FS) -#define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ - __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ - __GFP_NO_KSWAPD) +#define GFP_IOFS (__GFP_IO | __GFP_FS | __GFP_KSWAPD_RECLAIM) +#define GFP_TRANSHUGE ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & \ + ~__GFP_KSWAPD_RECLAIM) /* This mask makes up all the page movable related flags */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) @@ -164,6 +179,11 @@ static inline int gfpflags_to_migratetype(const gfp_t gfp_flags) return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT; } +static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) +{ + return gfp_flags & __GFP_DIRECT_RECLAIM; +} + #ifdef CONFIG_HIGHMEM #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM #else diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 24f4dfd94c51..4355129fff91 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1224,7 +1224,7 @@ static inline int skb_cloned(const struct sk_buff *skb) static inline int skb_unclone(struct sk_buff *skb, gfp_t pri) { - might_sleep_if(pri & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) return pskb_expand_head(skb, 0, 0, pri); @@ -1308,7 +1308,7 @@ static inline int skb_shared(const struct sk_buff *skb) */ static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri) { - might_sleep_if(pri & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, pri); @@ -1344,7 +1344,7 @@ static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri) static inline struct sk_buff *skb_unshare(struct sk_buff *skb, gfp_t pri) { - 
might_sleep_if(pri & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) { struct sk_buff *nskb = skb_copy(skb, pri); diff --git a/include/net/sock.h b/include/net/sock.h index f570e75e3da9..bbf7c2cf15b4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2041,7 +2041,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, */ static inline struct page_frag *sk_page_frag(struct sock *sk) { - if (sk->sk_allocation & __GFP_WAIT) + if (gfpflags_allow_blocking(sk->sk_allocation)) return &current->task_frag; return &sk->sk_frag; diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index d6fd8e5b14b7..dde6bf092c8a 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h @@ -20,7 +20,7 @@ {(unsigned long)GFP_ATOMIC, "GFP_ATOMIC"}, \ {(unsigned long)GFP_NOIO, "GFP_NOIO"}, \ {(unsigned long)__GFP_HIGH, "GFP_HIGH"}, \ - {(unsigned long)__GFP_WAIT, "GFP_WAIT"}, \ + {(unsigned long)__GFP_ATOMIC, "GFP_ATOMIC"}, \ {(unsigned long)__GFP_IO, "GFP_IO"}, \ {(unsigned long)__GFP_COLD, "GFP_COLD"}, \ {(unsigned long)__GFP_NOWARN, "GFP_NOWARN"}, \ @@ -36,7 +36,8 @@ {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ - {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ + {(unsigned long)__GFP_DIRECT_RECLAIM, "GFP_DIRECT_RECLAIM"}, \ + {(unsigned long)__GFP_KSWAPD_RECLAIM, "GFP_KSWAPD_RECLAIM"}, \ {(unsigned long)__GFP_OTHER_NODE, "GFP_OTHER_NODE"} \ ) : "GFP_NOWAIT" diff --git a/kernel/audit.c b/kernel/audit.c index 8a056a32ded7..5ffcbd354a52 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1371,16 +1371,16 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (unlikely(audit_filter_type(type))) return NULL; - if (gfp_mask & __GFP_WAIT) { + if (gfp_mask & __GFP_DIRECT_RECLAIM) { if (audit_pid && audit_pid == current->pid) - gfp_mask &= ~__GFP_WAIT; + gfp_mask &= ~__GFP_DIRECT_RECLAIM; else reserve = 0; } while (audit_backlog_limit && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { - if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) { + if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) { long sleep_time; sleep_time = timeout_start + audit_backlog_wait_time - jiffies; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b9d0cce3f9ce..f1603c153890 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -299,7 +299,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, idr_preload(gfp_mask); spin_lock_bh(&cgroup_idr_lock); - ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT); + ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM); spin_unlock_bh(&cgroup_idr_lock); idr_preload_end(); return ret; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4e49cc4c9952..deae3907ac1e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2738,7 +2738,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) return; /* no reclaim without waiting on it */ - if (!(gfp_mask & __GFP_WAIT)) + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) return; /* this guy won't enter reclaim */ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5235dd4e1e2f..3a970604308f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1779,7 +1779,7 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) while (to_alloc-- >
0) { struct page *page; - page = alloc_image_page(__GFP_HIGHMEM); + page = alloc_image_page(__GFP_HIGHMEM|__GFP_KSWAPD_RECLAIM); memory_bm_set_bit(bm, page_to_pfn(page)); } return nr_highmem; diff --git a/kernel/smp.c b/kernel/smp.c index 07854477c164..d903c02223af 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -669,7 +669,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), cpumask_var_t cpus; int cpu, ret; - might_sleep_if(gfp_flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp_flags)); if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { preempt_disable(); diff --git a/lib/idr.c b/lib/idr.c index 5335c43adf46..6098336df267 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -399,7 +399,7 @@ void idr_preload(gfp_t gfp_mask) * allocation guarantee. Disallow usage from those contexts. */ WARN_ON_ONCE(in_interrupt()); - might_sleep_if(gfp_mask & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp_mask)); preempt_disable(); @@ -453,7 +453,7 @@ int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) struct idr_layer *pa[MAX_IDR_LEVEL + 1]; int id; - might_sleep_if(gfp_mask & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp_mask)); /* sanity checks */ if (WARN_ON_ONCE(start < 0)) diff --git a/lib/radix-tree.c b/lib/radix-tree.c index f9ebe1c82060..fcf5d98574ce 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -188,7 +188,7 @@ radix_tree_node_alloc(struct radix_tree_root *root) * preloading in the interrupt anyway as all the allocations have to * be atomic. So just do normal allocation when in interrupt. */ - if (!(gfp_mask & __GFP_WAIT) && !in_interrupt()) { + if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) { struct radix_tree_preload *rtp; /* @@ -249,7 +249,7 @@ radix_tree_node_free(struct radix_tree_node *node) * with preemption not disabled. * * To make use of this facility, the radix tree must be initialised without - * __GFP_WAIT being passed to INIT_RADIX_TREE(). + * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ static int __radix_tree_preload(gfp_t gfp_mask) { @@ -286,12 +286,12 @@ out: * with preemption not disabled. * * To make use of this facility, the radix tree must be initialised without - * __GFP_WAIT being passed to INIT_RADIX_TREE(). + * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ int radix_tree_preload(gfp_t gfp_mask) { /* Warn on non-sensical use... 
*/ - WARN_ON_ONCE(!(gfp_mask & __GFP_WAIT)); + WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); return __radix_tree_preload(gfp_mask); } EXPORT_SYMBOL(radix_tree_preload); @@ -303,7 +303,7 @@ EXPORT_SYMBOL(radix_tree_preload); */ int radix_tree_maybe_preload(gfp_t gfp_mask) { - if (gfp_mask & __GFP_WAIT) + if (gfpflags_allow_blocking(gfp_mask)) return __radix_tree_preload(gfp_mask); /* Preloading doesn't help anything with this gfp mask, skip it */ preempt_disable(); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 619984fc07ec..8ed2ffd963c5 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -637,7 +637,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, { struct bdi_writeback *wb; - might_sleep_if(gfp & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp)); if (!memcg_css->parent) return &bdi->wb; diff --git a/mm/dmapool.c b/mm/dmapool.c index 312a716fa14c..57312b5d6e12 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -326,7 +326,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, size_t offset; void *retval; - might_sleep_if(mem_flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(mem_flags)); spin_lock_irqsave(&pool->lock, flags); list_for_each_entry(page, &pool->page_list, page_list) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bc502e590366..05374f09339c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2046,7 +2046,7 @@ retry: if (unlikely(task_in_memcg_oom(current))) goto nomem; - if (!(gfp_mask & __GFP_WAIT)) + if (!gfpflags_allow_blocking(gfp_mask)) goto nomem; mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); @@ -4364,8 +4364,8 @@ static int mem_cgroup_do_precharge(unsigned long count) { int ret; - /* Try a single bulk charge without reclaim first */ - ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); + /* Try a single bulk charge without reclaim first, kswapd may wake */ + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); if (!ret) { mc.precharge += count; return ret; diff --git a/mm/mempool.c b/mm/mempool.c index 4c533bc51d73..004d42b1dfaf 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -320,13 +320,13 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) gfp_t gfp_temp; VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); - might_sleep_if(gfp_mask & __GFP_WAIT); + might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ gfp_mask |= __GFP_NOWARN; /* failures are OK */ - gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO); + gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); repeat_alloc: @@ -349,7 +349,7 @@ repeat_alloc: } /* - * We use gfp mask w/o __GFP_WAIT or IO for the first round. If + * We use gfp mask w/o direct reclaim or IO for the first round. If * alloc failed with that and @pool was empty, retry immediately. 
*/ if (gfp_temp != gfp_mask) { @@ -358,8 +358,8 @@ repeat_alloc: goto repeat_alloc; } - /* We must not sleep if !__GFP_WAIT */ - if (!(gfp_mask & __GFP_WAIT)) { + /* We must not sleep if !__GFP_DIRECT_RECLAIM */ + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { spin_unlock_irqrestore(&pool->lock, flags); return NULL; } diff --git a/mm/migrate.c b/mm/migrate.c index 2834faba719a..e60379eb23f8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1578,7 +1578,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, (GFP_HIGHUSER_MOVABLE | __GFP_THISNODE | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & - ~GFP_IOFS, 0); + ~(__GFP_IO | __GFP_FS), 0); return newpage; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 67390988881a..70461f3e3378 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -169,12 +169,12 @@ void pm_restrict_gfp_mask(void) WARN_ON(!mutex_is_locked(&pm_mutex)); WARN_ON(saved_gfp_mask); saved_gfp_mask = gfp_allowed_mask; - gfp_allowed_mask &= ~GFP_IOFS; + gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); } bool pm_suspended_storage(void) { - if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) + if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) return false; return true; } @@ -2183,7 +2183,7 @@ static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) return false; if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) return false; - if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) + if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_DIRECT_RECLAIM)) return false; return should_fail(&fail_page_alloc.attr, 1 << order); @@ -2685,7 +2685,7 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) if (test_thread_flag(TIF_MEMDIE) || (current->flags & (PF_MEMALLOC | PF_EXITING))) filter &= ~SHOW_MEM_FILTER_NODES; - if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) + if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; if (fmt) { @@ -2945,7 +2945,6 @@ static inline int gfp_to_alloc_flags(gfp_t gfp_mask) { int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; - const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD)); /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); @@ -2954,11 +2953,11 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH). + * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). */ alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); - if (atomic) { + if (gfp_mask & __GFP_ATOMIC) { /* * Not worth trying to allocate harder for __GFP_NOMEMALLOC even * if it can't schedule. 
@@ -2995,11 +2994,16 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); } +static inline bool is_thp_gfp_mask(gfp_t gfp_mask) +{ + return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE; +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { - const gfp_t wait = gfp_mask & __GFP_WAIT; + bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; struct page *page = NULL; int alloc_flags; unsigned long pages_reclaimed = 0; @@ -3019,16 +3023,24 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, return NULL; } + /* + * We also sanity check to catch abuse of atomic reserves being used by + * callers that are not in atomic context. + */ + if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == + (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) + gfp_mask &= ~__GFP_ATOMIC; + /* * If this allocation cannot block and it is for a specific node, then * fail early. There's no need to wakeup kswapd or retry for a * speculative node-specific allocation. */ - if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait) + if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim) goto nopage; retry: - if (!(gfp_mask & __GFP_NO_KSWAPD)) + if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); /* @@ -3071,8 +3083,8 @@ retry: } } - /* Atomic allocations - we can't balance anything */ - if (!wait) { + /* Caller is not willing to reclaim, we can't balance anything */ + if (!can_direct_reclaim) { /* * All existing users of the deprecated __GFP_NOFAIL are * blockable, so warn of any new users that actually allow this @@ -3102,7 +3114,7 @@ retry: goto got_pg; /* Checks for THP-specific high-order allocations */ - if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) { + if (is_thp_gfp_mask(gfp_mask)) { /* * If compaction is deferred for high-order allocations, it is * because sync compaction recently failed. If this is the case @@ -3137,8 +3149,7 @@ retry: * fault, so use asynchronous memory compaction for THP unless it is * khugepaged trying to collapse. */ - if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE || - (current->flags & PF_KTHREAD)) + if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD)) migration_mode = MIGRATE_SYNC_LIGHT; /* Try direct reclaim and then allocating */ @@ -3209,7 +3220,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, lockdep_trace_alloc(gfp_mask); - might_sleep_if(gfp_mask & __GFP_WAIT); + might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); if (should_fail_alloc_page(gfp_mask, order)) return NULL; diff --git a/mm/slab.c b/mm/slab.c index 272e809404d5..a9ef77d19a9a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1031,12 +1031,12 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) } /* - * Construct gfp mask to allocate from a specific node but do not invoke reclaim - * or warn about failures. + * Construct gfp mask to allocate from a specific node but do not direct reclaim + * or warn about failures. kswapd may still wake to reclaim in the background. 
*/ static inline gfp_t gfp_exact_node(gfp_t flags) { - return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT; + return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM; } #endif @@ -2633,7 +2633,7 @@ static int cache_grow(struct kmem_cache *cachep, offset *= cachep->colour_off; - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_enable(); /* @@ -2663,7 +2663,7 @@ static int cache_grow(struct kmem_cache *cachep, cache_init_objs(cachep, page); - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); check_irq_off(); spin_lock(&n->list_lock); @@ -2677,7 +2677,7 @@ static int cache_grow(struct kmem_cache *cachep, opps1: kmem_freepages(cachep, page); failed: - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); return 0; } @@ -2869,7 +2869,7 @@ force_grow: static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) { - might_sleep_if(flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(flags)); #if DEBUG kmem_flagcheck(cachep, flags); #endif @@ -3057,11 +3057,11 @@ retry: */ struct page *page; - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_enable(); kmem_flagcheck(cache, flags); page = kmem_getpages(cache, local_flags, numa_mem_id()); - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); if (page) { /* diff --git a/mm/slub.c b/mm/slub.c index 75a5fa92ac2a..97695622a858 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1265,7 +1265,7 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, { flags &= gfp_allowed_mask; lockdep_trace_alloc(flags); - might_sleep_if(flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(flags)); if (should_failslab(s->object_size, flags, s->flags)) return NULL; @@ -1353,7 +1353,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) flags &= gfp_allowed_mask; - if (flags & __GFP_WAIT) + if (gfpflags_allow_blocking(flags)) local_irq_enable(); flags |= s->allocflags; @@ -1363,8 +1363,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * so we fall-back to the minimum order allocation. */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; - if ((alloc_gfp & __GFP_WAIT) && oo_order(oo) > oo_order(s->min)) - alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_WAIT; + if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) + alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM; page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { @@ -1424,7 +1424,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) page->frozen = 1; out: - if (flags & __GFP_WAIT) + if (gfpflags_allow_blocking(flags)) local_irq_disable(); if (!page) return NULL; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9db9ef5e8481..7ee94dc10000 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1617,7 +1617,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } area->pages[i] = page; - if (gfp_mask & __GFP_WAIT) + if (gfpflags_allow_blocking(gfp_mask)) cond_resched(); } diff --git a/mm/vmscan.c b/mm/vmscan.c index e0cd7eed4e38..2aec4241b42a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1476,7 +1476,7 @@ static int too_many_isolated(struct zone *zone, int file, * won't get blocked by normal direct-reclaimers, forming a circular * deadlock. 
*/ - if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) + if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) inactive >>= 3; return isolated > inactive; @@ -3791,7 +3791,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) /* * Do not scan if the allocation should not be delayed. */ - if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) + if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) return ZONE_RECLAIM_NOSCAN; /* diff --git a/mm/zswap.c b/mm/zswap.c index 4043df7c672f..e54166d3732e 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -571,7 +571,7 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { struct zswap_pool *pool; - gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; + gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) { @@ -1011,7 +1011,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* store */ len = dlen + sizeof(struct zswap_header); ret = zpool_malloc(entry->pool->zpool, len, - __GFP_NORETRY | __GFP_NOWARN, &handle); + __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM, + &handle); if (ret == -ENOSPC) { zswap_reject_compress_poor++; goto put_dstmem; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fab4599ba8b2..aa41e6dd6429 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -414,7 +414,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, len += NET_SKB_PAD; if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || - (gfp_mask & (__GFP_WAIT | GFP_DMA))) { + (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); if (!skb) goto skb_fail; @@ -481,7 +481,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, len += NET_SKB_PAD + NET_IP_ALIGN; if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || - (gfp_mask & (__GFP_WAIT | GFP_DMA))) { + (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); if (!skb) goto skb_fail; @@ -4452,7 +4452,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, return NULL; gfp_head = gfp_mask; - if (gfp_head & __GFP_WAIT) + if (gfp_head & __GFP_DIRECT_RECLAIM) gfp_head |= __GFP_REPEAT; *errcode = -ENOBUFS; @@ -4467,7 +4467,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, while (order) { if (npages >= 1 << order) { - page = alloc_pages((gfp_mask & ~__GFP_WAIT) | + page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY, diff --git a/net/core/sock.c b/net/core/sock.c index 7529eb9463be..1e4dd54bfb5a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1944,8 +1944,10 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) pfrag->offset = 0; if (SKB_FRAG_PAGE_ORDER) { - pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP | - __GFP_NOWARN | __GFP_NORETRY, + /* Avoid direct reclaim but allow kswapd to wake */ + pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | + __GFP_COMP | __GFP_NOWARN | + __GFP_NORETRY, SKB_FRAG_PAGE_ORDER); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index fafe33bdb619..59651af8cc27 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2116,7 +2116,7 @@ int netlink_broadcast_filtered(struct sock *ssk, struct 
sk_buff *skb, u32 portid consume_skb(info.skb2); if (info.delivered) { - if (info.congested && (allocation & __GFP_WAIT)) + if (info.congested && gfpflags_allow_blocking(allocation)) yield(); return 0; } diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 96744b75db93..977fb86065b7 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -305,7 +305,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, gfp_t slab_mask = GFP_NOWAIT; gfp_t page_mask = GFP_NOWAIT; - if (gfp & __GFP_WAIT) { + if (gfp & __GFP_DIRECT_RECLAIM) { slab_mask = GFP_KERNEL; page_mask = GFP_HIGHUSER; } @@ -379,7 +379,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) struct ib_recv_wr *failed_wr; unsigned int posted = 0; int ret = 0; - bool can_wait = !!(gfp & __GFP_WAIT); + bool can_wait = !!(gfp & __GFP_DIRECT_RECLAIM); u32 pos; /* the goal here is to just make sure that someone, somewhere diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c index 692b3e67fb54..6c71ed1caf16 100644 --- a/net/rxrpc/ar-connection.c +++ b/net/rxrpc/ar-connection.c @@ -500,7 +500,7 @@ int rxrpc_connect_call(struct rxrpc_sock *rx, if (bundle->num_conns >= 20) { _debug("too many conns"); - if (!(gfp & __GFP_WAIT)) { + if (!gfpflags_allow_blocking(gfp)) { _leave(" = -EAGAIN"); return -EAGAIN; } diff --git a/net/sctp/associola.c b/net/sctp/associola.c index b00f1f9611d6..559afd0ee7de 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1590,7 +1590,7 @@ int sctp_assoc_lookup_laddr(struct sctp_association *asoc, /* Set an association id for a given association */ int sctp_assoc_set_id(struct sctp_association *asoc, gfp_t gfp) { - bool preload = !!(gfp & __GFP_WAIT); + bool preload = gfpflags_allow_blocking(gfp); int ret; /* If the id is already assigned, keep it. */ -- cgit v1.2.3 From 78e3c7951021b4e1a554b3d619506b55b0619073 Mon Sep 17 00:00:00 2001 From: Martin Kepplinger Date: Fri, 6 Nov 2015 16:31:08 -0800 Subject: arch/x86/kernel/cpu/perf_event_msr.c: use sign_extend64() for sign extension Signed-off-by: Martin Kepplinger Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: George Spelvin Cc: Rasmus Villemoes Cc: Maxime Coquelin Cc: Denys Vlasenko Cc: Yury Norov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/perf_event_msr.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c index f32ac13934f2..ec863b9a9f78 100644 --- a/arch/x86/kernel/cpu/perf_event_msr.c +++ b/arch/x86/kernel/cpu/perf_event_msr.c @@ -163,10 +163,9 @@ again: goto again; delta = now - prev; - if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) { - delta <<= 32; - delta >>= 32; /* sign extend */ - } + if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) + delta = sign_extend64(delta, 31); + local64_add(now - prev, &event->count); } -- cgit v1.2.3 From 354dbaa7ff5b53a0ed1c0f7a9773d5953b3a1bb9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 8 Oct 2015 18:56:26 +0300 Subject: x86/cpu/intel: Enable X86_FEATURE_NONSTOP_TSC_S3 for Merrifield The Intel Merrifield SoC is a successor of the Intel MID line of SoCs. Let's set the necessary capability for that chip. See commit c54fdbb2823d (x86: Add cpu capability flag X86_FEATURE_NONSTOP_TSC_S3) for the details.
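For context, the consumer of this capability is the TSC clocksource setup: once X86_FEATURE_NONSTOP_TSC_S3 is set, the TSC clocksource can be trusted across S3 suspend. A minimal sketch of that consumer side, based on how arch/x86/kernel/tsc.c of this era used the flag (the wrapper function name here is hypothetical, shown only for illustration):

static void __init tsc_mark_suspend_nonstop(void)
{
	/* TSC keeps counting across S3, so no re-calibration on resume */
	if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
		clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
}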
Signed-off-by: Andy Shevchenko Link: http://lkml.kernel.org/r/1444319786-36125-1-git-send-email-andriy.shevchenko@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/intel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 98a13db5f4be..209ac1e7d1f0 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -97,6 +97,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) switch (c->x86_model) { case 0x27: /* Penwell */ case 0x35: /* Cloverview */ + case 0x4a: /* Merrifield */ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3); break; default: -- cgit v1.2.3 From 8c058b0b9c34d8c8d7912880956543769323e2d8 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 3 Nov 2015 10:40:14 +0100 Subject: x86/irq: Probe for PIC presence before allocating descs for legacy IRQs Commit d32932d02e18 ("x86/irq: Convert IOAPIC to use hierarchical irqdomain interfaces") brought a regression for Hyper-V Gen2 instances. These instances don't have i8259 legacy PIC but they use legacy IRQs for serial port, rtc, and acpi. With this commit included we end up with these IRQs not initialized. Earlier, there was a special workaround for legacy IRQs in mp_map_pin_to_irq() doing mp_irqdomain_map() without looking at nr_legacy_irqs() and now we fail in __irq_domain_alloc_irqs() when irq_domain_alloc_descs() returns -EEXIST. The essence of the issue seems to be that early_irq_init() calls arch_probe_nr_irqs() to figure out the number of legacy IRQs before we probe for i8259 and gets 16. Later when init_8259A() is called we switch to NULL legacy PIC and nr_legacy_irqs() starts to return 0 but we already have 16 descs allocated. Solve the issue by separating i8259 probe from init and calling it in arch_probe_nr_irqs() before we actually use nr_legacy_irqs() information. Fixes: d32932d02e18 ("x86/irq: Convert IOAPIC to use hierarchical irqdomain interfaces") Signed-off-by: Vitaly Kuznetsov Cc: Jiang Liu Cc: K. Y. Srinivasan Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1446543614-3621-1-git-send-email-vkuznets@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/i8259.h | 1 + arch/x86/kernel/apic/vector.c | 6 +++++- arch/x86/kernel/i8259.c | 29 +++++++++++++++++++++-------- 3 files changed, 27 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index ccffa53750a8..39bcefc20de7 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -60,6 +60,7 @@ struct legacy_pic { void (*mask_all)(void); void (*restore_mask)(void); void (*init)(int auto_eoi); + int (*probe)(void); int (*irq_pending)(unsigned int irq); void (*make_irq)(unsigned int irq); }; diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 836d11b92811..861bc59c8f25 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -361,7 +361,11 @@ int __init arch_probe_nr_irqs(void) if (nr < nr_irqs) nr_irqs = nr; - return nr_legacy_irqs(); + /* + * We don't know if PIC is present at this point so we need to do + * probe() to get the right number of legacy IRQs. 
+ */ + return legacy_pic->probe(); } #ifdef CONFIG_X86_IO_APIC diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 16cb827a5b27..be22f5a2192e 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -295,16 +295,11 @@ static void unmask_8259A(void) raw_spin_unlock_irqrestore(&i8259A_lock, flags); } -static void init_8259A(int auto_eoi) +static int probe_8259A(void) { unsigned long flags; unsigned char probe_val = ~(1 << PIC_CASCADE_IR); unsigned char new_val; - - i8259A_auto_eoi = auto_eoi; - - raw_spin_lock_irqsave(&i8259A_lock, flags); - /* * Check to see if we have a PIC. * Mask all except the cascade and read @@ -312,16 +307,28 @@ static void init_8259A(int auto_eoi) * have a PIC, we will read 0xff as opposed to the * value we wrote. */ + raw_spin_lock_irqsave(&i8259A_lock, flags); + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ outb(probe_val, PIC_MASTER_IMR); new_val = inb(PIC_MASTER_IMR); if (new_val != probe_val) { printk(KERN_INFO "Using NULL legacy PIC\n"); legacy_pic = &null_legacy_pic; - raw_spin_unlock_irqrestore(&i8259A_lock, flags); - return; } + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + return nr_legacy_irqs(); +} + +static void init_8259A(int auto_eoi) +{ + unsigned long flags; + + i8259A_auto_eoi = auto_eoi; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ /* @@ -379,6 +386,10 @@ static int legacy_pic_irq_pending_noop(unsigned int irq) { return 0; } +static int legacy_pic_probe(void) +{ + return 0; +} struct legacy_pic null_legacy_pic = { .nr_legacy_irqs = 0, @@ -388,6 +399,7 @@ struct legacy_pic null_legacy_pic = { .mask_all = legacy_pic_noop, .restore_mask = legacy_pic_noop, .init = legacy_pic_int_noop, + .probe = legacy_pic_probe, .irq_pending = legacy_pic_irq_pending_noop, .make_irq = legacy_pic_uint_noop, }; @@ -400,6 +412,7 @@ struct legacy_pic default_legacy_pic = { .mask_all = mask_8259A, .restore_mask = unmask_8259A, .init = init_8259A, + .probe = probe_8259A, .irq_pending = i8259A_irq_pending, .make_irq = make_8259A_irq, }; -- cgit v1.2.3 From 3849e91f571dcb48cf2c8143480c59137d44d6bc Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Wed, 4 Nov 2015 12:49:42 +0100 Subject: x86/AMD: Fix last level cache topology for AMD Fam17h systems On AMD Fam17h systems, the last level cache is not resident in the northbridge. Therefore, we cannot assign cpu_llc_id to the same value as Node ID as we have been doing until now. We should rather look at the ApicID bits of the core to provide us the last level cache ID info. Signed-off-by: Aravind Gopalakrishnan Cc: Andrew Morton Cc: Andy Lutomirski Cc: Frederic Weisbecker Cc: "H. 
Peter Anvin" Cc: Huang Rui Cc: Ingo Molnar Cc: Jacob Shin Link: http://lkml.kernel.org/r/1446582899-9378-1-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/amd.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 4a70fc6d400a..a8816b325162 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -352,6 +352,7 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) #ifdef CONFIG_SMP unsigned bits; int cpu = smp_processor_id(); + unsigned int socket_id, core_complex_id; bits = c->x86_coreid_bits; /* Low order bits define the core id (index of core in socket) */ @@ -361,6 +362,18 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) /* use socket ID also for last level cache */ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; amd_get_topology(c); + + /* + * Fix percpu cpu_llc_id here as LLC topology is different + * for Fam17h systems. + */ + if (c->x86 != 0x17 || !cpuid_edx(0x80000006)) + return; + + socket_id = (c->apicid >> bits) - 1; + core_complex_id = (c->apicid & ((1 << bits) - 1)) >> 3; + + per_cpu(cpu_llc_id, cpu) = (socket_id << 3) | core_complex_id; #endif } -- cgit v1.2.3 From f4e342c87776884f0309942a3880ca7e835239f9 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Thu, 5 Nov 2015 13:56:35 -0500 Subject: x86/mm: Skip the hypervisor range when walking PGD The range between 0xffff800000000000 and 0xffff87ffffffffff is reserved for hypervisor and therefore we should not try to follow PGD's indexes corresponding to those addresses. While this has always been a problem, with the new W+X warning mechanism ptdump_walk_pgd_level_core() can now be called during boot, causing a PV Xen guest to crash. [ tglx: Replaced the macro with a readable inline ] Fixes: e1a58320a38d "x86/mm: Warn on W^X mappings" Reported-by: Sander Eikelenboom Signed-off-by: Boris Ostrovsky Cc: xen-devel@lists.xen.org Link: http://lkml.kernel.org/r/1446749795-27764-1-git-send-email-boris.ostrovsky@oracle.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/dump_pagetables.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 1bf417e9cc13..a035c2aa7801 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -358,6 +358,21 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, #define pgd_none(a) pud_none(__pud(pgd_val(a))) #endif +#ifdef CONFIG_X86_64 +static inline bool is_hypervisor_range(int idx) +{ + /* + * ffff800000000000 - ffff87ffffffffff is reserved for + * the hypervisor. 
+ */ + return paravirt_enabled() && + (idx >= pgd_index(__PAGE_OFFSET) - 16) && + (idx < pgd_index(__PAGE_OFFSET)); +} +#else +static inline bool is_hypervisor_range(int idx) { return false; } +#endif + static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, bool checkwx) { @@ -381,7 +396,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, for (i = 0; i < PTRS_PER_PGD; i++) { st.current_address = normalize_addr(i * PGD_LEVEL_MULT); - if (!pgd_none(*start)) { + if (!pgd_none(*start) && !is_hypervisor_range(i)) { if (pgd_large(*start) || !pgd_present(*start)) { prot = pgd_flags(*start); note_page(m, &st, __pgprot(prot), 1); -- cgit v1.2.3 From 68accac392d859d24adcf1be3a90e41f978bd54c Mon Sep 17 00:00:00 2001 From: Krzysztof Mazur Date: Fri, 6 Nov 2015 14:18:36 +0100 Subject: x86/setup: Fix low identity map for >= 2GB kernel range The commit f5f3497cad8c extended the low identity mapping. However, if the kernel uses more than 2 GB (VMSPLIT_2G_OPT or VMSPLIT_1G memory split), the normal memory mapping is overwritten by the low identity mapping causing a crash. To avoid overwriting, limit the low identity map to cover only memory before kernel range (PAGE_OFFSET). Fixes: f5f3497cad8c "x86/setup: Extend low identity map to cover whole kernel range" Signed-off-by: Krzysztof Mazur Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Laszlo Ersek Cc: Matt Fleming Cc: Paolo Bonzini Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1446815916-22105-1-git-send-email-krzysiek@podlesie.net Signed-off-by: Thomas Gleixner --- arch/x86/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index a1e4da98c8f0..29db25f9a745 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1188,7 +1188,7 @@ void __init setup_arch(char **cmdline_p) */ clone_pgd_range(initial_page_table, swapper_pg_dir + KERNEL_PGD_BOUNDARY, - KERNEL_PGD_PTRS); + min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); #endif tboot_probe(); -- cgit v1.2.3 From 04633df0c43d710e5f696b06539c100898678235 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 5 Nov 2015 16:57:56 +0100 Subject: x86/cpu: Call verify_cpu() after having entered long mode too When we get loaded by a 64-bit bootloader, the kernel entry point is startup_64 in head_64.S. We don't trust any and all bootloaders because some will fiddle with CPU configuration so we go ahead and massage each CPU into sanity again. For example, some Dell BIOSes have this XD disable feature which sets IA32_MISC_ENABLE[34] and disables NX. This might be some dumb workaround for other OSes but Linux sure doesn't need it. A similar thing is present in the Surface 3 firmware - see https://bugzilla.kernel.org/show_bug.cgi?id=106051 - which sets this bit only on the BSP: # rdmsr -a 0x1a0 400850089 850089 850089 850089 I know, right?! There's not even an off switch in there. So fix all those cases by sanitizing the 64-bit entry point too. For that, make verify_cpu() callable in 64-bit mode also. Requested-and-debugged-by: "H. 
Peter Anvin" Reported-and-tested-by: Bastien Nocera Signed-off-by: Borislav Petkov Cc: Matt Fleming Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1446739076-21303-1-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/head_64.S | 8 ++++++++ arch/x86/kernel/verify_cpu.S | 12 +++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 1d40ca8a73f2..ffdc0e860390 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -65,6 +65,9 @@ startup_64: * tables and then reload them. */ + /* Sanitize CPU configuration */ + call verify_cpu + /* * Compute the delta between the address I am compiled to run at and the * address I am actually running at. @@ -174,6 +177,9 @@ ENTRY(secondary_startup_64) * after the boot processor executes this code. */ + /* Sanitize CPU configuration */ + call verify_cpu + movq $(init_level4_pgt - __START_KERNEL_map), %rax 1: @@ -288,6 +294,8 @@ ENTRY(secondary_startup_64) pushq %rax # target address in negative space lretq +#include "verify_cpu.S" + #ifdef CONFIG_HOTPLUG_CPU /* * Boot CPU0 entry point. It's called from play_dead(). Everything has been set diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index b9242bacbe59..4cf401f581e7 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -34,10 +34,11 @@ #include verify_cpu: - pushfl # Save caller passed flags - pushl $0 # Kill any dangerous flags - popfl + pushf # Save caller passed flags + push $0 # Kill any dangerous flags + popf +#ifndef __x86_64__ pushfl # standard way to check for cpuid popl %eax movl %eax,%ebx @@ -48,6 +49,7 @@ verify_cpu: popl %eax cmpl %eax,%ebx jz verify_cpu_no_longmode # cpu has no cpuid +#endif movl $0x0,%eax # See if cpuid 1 is implemented cpuid @@ -130,10 +132,10 @@ verify_cpu_sse_test: jmp verify_cpu_sse_test # try again verify_cpu_no_longmode: - popfl # Restore caller passed flags + popf # Restore caller passed flags movl $1,%eax ret verify_cpu_sse_ok: - popfl # Restore caller passed flags + popf # Restore caller passed flags xorl %eax, %eax ret -- cgit v1.2.3 From 77c5b5da02f0a30d61144a546c4ef3657e3b817d Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 9 Nov 2015 14:58:23 -0800 Subject: kmap_atomic_to_page() has no users, remove it Removal started in commit 5bbeed12bdc3 ("sparc32: drop unused kmap_atomic_to_page"). Let's do it across the whole tree. 
Signed-off-by: Nicolas Pitre Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/highmem.h | 1 - arch/arm/mm/highmem.c | 10 ---------- arch/frv/include/asm/highmem.h | 2 -- arch/frv/mm/highmem.c | 5 ----- arch/metag/include/asm/highmem.h | 1 - arch/metag/mm/highmem.c | 14 -------------- arch/microblaze/include/asm/highmem.h | 13 ------------- arch/mips/include/asm/highmem.h | 1 - arch/mips/mm/highmem.c | 13 ------------- arch/parisc/include/asm/cacheflush.h | 1 - arch/powerpc/include/asm/highmem.h | 13 ------------- arch/tile/include/asm/highmem.h | 1 - arch/tile/mm/highmem.c | 12 ------------ arch/x86/include/asm/highmem.h | 1 - arch/x86/mm/highmem_32.c | 14 -------------- include/linux/highmem.h | 1 - 16 files changed, 103 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index 535579511ed0..0a0e2d1784c0 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -68,7 +68,6 @@ extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); -extern struct page *kmap_atomic_to_page(const void *ptr); #endif #endif diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index 9df5f09585ca..d02f8187b1cc 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -147,13 +147,3 @@ void *kmap_atomic_pfn(unsigned long pfn) return (void *)vaddr; } - -struct page *kmap_atomic_to_page(const void *ptr) -{ - unsigned long vaddr = (unsigned long)ptr; - - if (vaddr < FIXADDR_START) - return virt_to_page(ptr); - - return pte_page(get_fixmap_pte(vaddr)); -} diff --git a/arch/frv/include/asm/highmem.h b/arch/frv/include/asm/highmem.h index b3adc93611f3..1f58938703ab 100644 --- a/arch/frv/include/asm/highmem.h +++ b/arch/frv/include/asm/highmem.h @@ -62,8 +62,6 @@ extern void kunmap_high(struct page *page); extern void *kmap(struct page *page); extern void kunmap(struct page *page); -extern struct page *kmap_atomic_to_page(void *ptr); - #endif /* !__ASSEMBLY__ */ /* diff --git a/arch/frv/mm/highmem.c b/arch/frv/mm/highmem.c index 785344bbdc07..45750fb65c49 100644 --- a/arch/frv/mm/highmem.c +++ b/arch/frv/mm/highmem.c @@ -32,11 +32,6 @@ void kunmap(struct page *page) EXPORT_SYMBOL(kunmap); -struct page *kmap_atomic_to_page(void *ptr) -{ - return virt_to_page(ptr); -} - void *kmap_atomic(struct page *page) { unsigned long paddr; diff --git a/arch/metag/include/asm/highmem.h b/arch/metag/include/asm/highmem.h index 6646a15c73dd..9b1d172cd884 100644 --- a/arch/metag/include/asm/highmem.h +++ b/arch/metag/include/asm/highmem.h @@ -56,7 +56,6 @@ extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); -extern struct page *kmap_atomic_to_page(void *ptr); #endif #endif diff --git a/arch/metag/mm/highmem.c b/arch/metag/mm/highmem.c index 807f1b1c4e65..f19a87f2c1ec 100644 --- a/arch/metag/mm/highmem.c +++ b/arch/metag/mm/highmem.c @@ -111,20 +111,6 @@ void *kmap_atomic_pfn(unsigned long pfn) return (void *)vaddr; } -struct page *kmap_atomic_to_page(void *ptr) -{ - unsigned long vaddr = (unsigned long)ptr; - int idx; - pte_t *pte; - - if (vaddr < FIXADDR_START) - return virt_to_page(ptr); - - idx = virt_to_fix(vaddr); - pte = kmap_pte - (idx - FIX_KMAP_BEGIN); - return pte_page(*pte); -} - void __init kmap_init(void) { unsigned long kmap_vstart; diff --git 
a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index d04638932438..67925ef18cfa 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -76,19 +76,6 @@ static inline void *kmap_atomic(struct page *page) return kmap_atomic_prot(page, kmap_prot); } -static inline struct page *kmap_atomic_to_page(void *ptr) -{ - unsigned long idx, vaddr = (unsigned long) ptr; - pte_t *pte; - - if (vaddr < FIXADDR_START) - return virt_to_page(ptr); - - idx = virt_to_fix(vaddr); - pte = kmap_pte - (idx - FIX_KMAP_BEGIN); - return pte_page(*pte); -} - #define flush_cache_kmaps() { flush_icache(); flush_dcache(); } #endif /* __KERNEL__ */ diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h index 572e63ec2a38..01880b34a209 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -49,7 +49,6 @@ extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); -extern struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() flush_cache_all() diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index 11661cbc11a8..d7258a103439 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -118,19 +118,6 @@ void *kmap_atomic_pfn(unsigned long pfn) return (void*) vaddr; } -struct page *kmap_atomic_to_page(void *ptr) -{ - unsigned long idx, vaddr = (unsigned long)ptr; - pte_t *pte; - - if (vaddr < FIXADDR_START) - return virt_to_page(ptr); - - idx = virt_to_fix(vaddr); - pte = kmap_pte - (idx - FIX_KMAP_BEGIN); - return pte_page(*pte); -} - void __init kmap_init(void) { unsigned long kmap_vstart; diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index ec2df4bab302..845272ce9cc5 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -156,7 +156,6 @@ static inline void __kunmap_atomic(void *addr) #define kmap_atomic_prot(page, prot) kmap_atomic(page) #define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) -#define kmap_atomic_to_page(ptr) virt_to_page(ptr) #endif /* _PARISC_CACHEFLUSH_H */ diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index caaf6e00630d..01c2c23b307e 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -84,19 +84,6 @@ static inline void *kmap_atomic(struct page *page) return kmap_atomic_prot(page, kmap_prot); } -static inline struct page *kmap_atomic_to_page(void *ptr) -{ - unsigned long idx, vaddr = (unsigned long) ptr; - pte_t *pte; - - if (vaddr < FIXADDR_START) - return virt_to_page(ptr); - - idx = virt_to_fix(vaddr); - pte = kmap_pte - (idx - FIX_KMAP_BEGIN); - return pte_page(*pte); -} - #define flush_cache_kmaps() flush_cache_all() diff --git a/arch/tile/include/asm/highmem.h b/arch/tile/include/asm/highmem.h index fc8429a31c85..979579b38e57 100644 --- a/arch/tile/include/asm/highmem.h +++ b/arch/tile/include/asm/highmem.h @@ -63,7 +63,6 @@ void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); void *kmap_atomic_pfn(unsigned long pfn); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); -struct page *kmap_atomic_to_page(void *ptr); void *kmap_atomic_prot(struct page *page, pgprot_t prot); void kmap_atomic_fix_kpte(struct page *page, int finished); diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c index fcd545014e79..eca28551b22d 100644 
--- a/arch/tile/mm/highmem.c +++ b/arch/tile/mm/highmem.c @@ -275,15 +275,3 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) { return kmap_atomic_prot(pfn_to_page(pfn), prot); } - -struct page *kmap_atomic_to_page(void *ptr) -{ - pte_t *pte; - unsigned long vaddr = (unsigned long)ptr; - - if (vaddr < FIXADDR_START) - return virt_to_page(ptr); - - pte = kmap_get_pte(vaddr); - return pte_page(*pte); -} diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 04e9d023168f..1c0b43724ce3 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -68,7 +68,6 @@ void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); void *kmap_atomic_pfn(unsigned long pfn); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); -struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() do { } while (0) diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index eecb207a2037..a6d739258137 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -104,20 +104,6 @@ void __kunmap_atomic(void *kvaddr) } EXPORT_SYMBOL(__kunmap_atomic); -struct page *kmap_atomic_to_page(void *ptr) -{ - unsigned long idx, vaddr = (unsigned long)ptr; - pte_t *pte; - - if (vaddr < FIXADDR_START) - return virt_to_page(ptr); - - idx = virt_to_fix(vaddr); - pte = kmap_pte - (idx - FIX_KMAP_BEGIN); - return pte_page(*pte); -} -EXPORT_SYMBOL(kmap_atomic_to_page); - void __init set_highmem_pages_init(void) { struct zone *zone; diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 6aefcd0031a6..bb3f3297062a 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -78,7 +78,6 @@ static inline void __kunmap_atomic(void *addr) } #define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) -#define kmap_atomic_to_page(ptr) virt_to_page(ptr) #define kmap_flush_unused() do {} while(0) #endif -- cgit v1.2.3 From 450869d6dbb72b370774fd4ee14c4f275bb08f98 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 4 Nov 2015 13:41:21 +0100 Subject: KVM: x86: merge handle_mmio_page_fault and handle_mmio_page_fault_common They are exactly the same, except that handle_mmio_page_fault has an unused argument and a call to WARN_ON. Remove the unused argument from the callers, and move the warning to (the former) handle_mmio_page_fault_common. 
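After the merge, every caller follows the same shape; a sketch mirroring the nonpaging_page_fault() hunk below:

	if (unlikely(error_code & PFERR_RSVD_MASK)) {
		/* error_code is no longer passed; the WARN_ON lives inside */
		r = handle_mmio_page_fault(vcpu, gva, true);
		if (likely(r != RET_MMIO_PF_INVALID))
			return r;
	}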
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 20 +++++--------------- arch/x86/kvm/mmu.h | 6 +++--- arch/x86/kvm/paging_tmpl.h | 3 +-- arch/x86/kvm/vmx.c | 2 +- 4 files changed, 10 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7d85bcae3332..e7c2c1428a69 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3359,7 +3359,7 @@ exit: return reserved; } -int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) +int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) { u64 spte; bool reserved; @@ -3368,7 +3368,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) return RET_MMIO_PF_EMULATE; reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); - if (unlikely(reserved)) + if (WARN_ON(reserved)) return RET_MMIO_PF_BUG; if (is_mmio_spte(spte)) { @@ -3392,17 +3392,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) */ return RET_MMIO_PF_RETRY; } -EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); - -static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, - u32 error_code, bool direct) -{ - int ret; - - ret = handle_mmio_page_fault_common(vcpu, addr, direct); - WARN_ON(ret == RET_MMIO_PF_BUG); - return ret; -} +EXPORT_SYMBOL_GPL(handle_mmio_page_fault); static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, bool prefault) @@ -3413,7 +3403,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, gva, error_code, true); + r = handle_mmio_page_fault(vcpu, gva, true); if (likely(r != RET_MMIO_PF_INVALID)) return r; @@ -3503,7 +3493,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, gpa, error_code, true); + r = handle_mmio_page_fault(vcpu, gpa, true); if (likely(r != RET_MMIO_PF_INVALID)) return r; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index e4202e41d535..55ffb7b0f95e 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -56,13 +56,13 @@ void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); /* - * Return values of handle_mmio_page_fault_common: + * Return values of handle_mmio_page_fault: * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction * directly. * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page * fault path update the mmio spte. * RET_MMIO_PF_RETRY: let CPU fault again on the address. - * RET_MMIO_PF_BUG: bug is detected. + * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed). 
*/ enum { RET_MMIO_PF_EMULATE = 1, @@ -71,7 +71,7 @@ enum { RET_MMIO_PF_BUG = -1 }; -int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); +int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index b41faa91a6f9..3058a22a658d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -705,8 +705,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, addr, error_code, - mmu_is_nested(vcpu)); + r = handle_mmio_page_fault(vcpu, addr, mmu_is_nested(vcpu)); if (likely(r != RET_MMIO_PF_INVALID)) return r; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5eb56ed77c1f..a26ed285931b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5908,7 +5908,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) return 1; } - ret = handle_mmio_page_fault_common(vcpu, gpa, true); + ret = handle_mmio_page_fault(vcpu, gpa, true); if (likely(ret == RET_MMIO_PF_EMULATE)) return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == EMULATE_DONE; -- cgit v1.2.3 From 893590c73426585dfd9f33358b19f18d9395fb2f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 6 Nov 2015 11:46:24 +0100 Subject: KVM: x86: declare a few variables as __read_mostly These include module parameters and variables that are set by kvm_x86_ops->hardware_setup. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 -- arch/x86/kvm/x86.c | 14 +++++++------- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9265196e877f..b0322d8df03b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -986,8 +986,6 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); /* control of guest tsc rate supported? 
*/ extern bool kvm_has_tsc_control; /* maximum supported tsc_khz for guests */ extern u32 kvm_max_guest_tsc_khz; enum emulation_result { EMULATE_DONE, /* no further processing */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 30723a4122cd..ec85e6f0364f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -93,10 +93,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); -struct kvm_x86_ops *kvm_x86_ops; +struct kvm_x86_ops *kvm_x86_ops __read_mostly; EXPORT_SYMBOL_GPL(kvm_x86_ops); -static bool ignore_msrs = 0; +static bool __read_mostly ignore_msrs = 0; module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); unsigned int min_timer_period_us = 500; @@ -105,20 +105,20 @@ module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); static bool __read_mostly kvmclock_periodic_sync = true; module_param(kvmclock_periodic_sync, bool, S_IRUGO); -bool kvm_has_tsc_control; +bool __read_mostly kvm_has_tsc_control; EXPORT_SYMBOL_GPL(kvm_has_tsc_control); -u32 kvm_max_guest_tsc_khz; +u32 __read_mostly kvm_max_guest_tsc_khz; EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ -static u32 tsc_tolerance_ppm = 250; +static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); /* lapic timer advance (tscdeadline mode only) in nanoseconds */ -unsigned int lapic_timer_advance_ns = 0; +unsigned int __read_mostly lapic_timer_advance_ns = 0; module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); -static bool backwards_tsc_observed = false; +static bool __read_mostly backwards_tsc_observed = false; #define KVM_NR_SHARED_MSRS 16 -- cgit v1.2.3 From bc9b961b357ea8129d75613b7af4fdf57ced9b9f Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Tue, 20 Oct 2015 15:39:01 +0800 Subject: KVM: x86: Collect information for setting TSC scaling ratio The number of bits of the fractional part of the 64-bit TSC scaling ratio in VMX and SVM is different. This patch makes the architecture code collect the number of fractional bits and other related information into variables that can be accessed in the common code.
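To make the fixed-point convention concrete (an illustration, not code from the patch):

	/*
	 * With N = kvm_tsc_scaling_ratio_frac_bits, a scaling factor f
	 * is stored as the 64-bit integer f * 2^N. SVM uses N = 32, so
	 * a ratio of exactly 1.0 is 1ULL << 32 (TSC_RATIO_DEFAULT) and
	 * a guest running at 1.5x the host TSC frequency stores
	 * 0x180000000.
	 */
	u64 ratio_one = 1ULL << kvm_tsc_scaling_ratio_frac_bits;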
Signed-off-by: Haozhong Zhang Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 ++++ arch/x86/kvm/svm.c | 3 +++ arch/x86/kvm/x86.c | 4 ++++ 3 files changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b0322d8df03b..5333767560c0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -988,6 +988,10 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); extern bool kvm_has_tsc_control; /* maximum supported tsc_khz for guests */ extern u32 kvm_max_guest_tsc_khz; +/* number of bits of the fractional part of the TSC scaling ratio */ +extern u8 kvm_tsc_scaling_ratio_frac_bits; +/* maximum allowed value of TSC scaling ratio */ +extern u64 kvm_max_tsc_scaling_ratio; enum emulation_result { EMULATE_DONE, /* no further processing */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f2c8e4917688..74712ead9787 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -908,6 +908,9 @@ static __init int svm_hardware_setup(void) max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX)); kvm_max_guest_tsc_khz = max; + + kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; + kvm_tsc_scaling_ratio_frac_bits = 32; } if (nested) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ec85e6f0364f..e728c539ecfb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -109,6 +109,10 @@ bool __read_mostly kvm_has_tsc_control; EXPORT_SYMBOL_GPL(kvm_has_tsc_control); u32 __read_mostly kvm_max_guest_tsc_khz; EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); +u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits; +EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits); +u64 __read_mostly kvm_max_tsc_scaling_ratio; +EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; -- cgit v1.2.3 From ad721883e9c5f46cc5fa9496bc12c097c6238b4a Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Tue, 20 Oct 2015 15:39:02 +0800 Subject: KVM: x86: Add a common TSC scaling ratio field in kvm_vcpu_arch This patch moves the field of TSC scaling ratio from the architecture struct vcpu_svm to the common struct kvm_vcpu_arch. 
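The payoff is that common x86 code can now inspect the ratio without a vendor callback; for instance, a helper along these lines becomes possible in x86.c (hypothetical name, sketch only):

static bool tsc_is_scaled(struct kvm_vcpu *vcpu)
{
	/* compare against the vendor-neutral 1.0 ratio */
	return vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio;
}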
Signed-off-by: Haozhong Zhang Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 26 +++++++++++--------------- arch/x86/kvm/x86.c | 9 ++++++++- 3 files changed, 20 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5333767560c0..f3354bd92364 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -505,6 +505,7 @@ struct kvm_vcpu_arch { u32 virtual_tsc_mult; u32 virtual_tsc_khz; s64 ia32_tsc_adjust_msr; + u64 tsc_scaling_ratio; atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ unsigned nmi_pending; /* NMI queued after currently running handler */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 74712ead9787..9c92e6f429d0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -158,8 +158,6 @@ struct vcpu_svm { unsigned long int3_rip; u32 apf_reason; - u64 tsc_ratio; - /* cached guest cpuid flags for faster access */ bool nrips_enabled : 1; }; @@ -991,24 +989,22 @@ static u64 __scale_tsc(u64 ratio, u64 tsc) static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) { - struct vcpu_svm *svm = to_svm(vcpu); u64 _tsc = tsc; - if (svm->tsc_ratio != TSC_RATIO_DEFAULT) - _tsc = __scale_tsc(svm->tsc_ratio, tsc); + if (vcpu->arch.tsc_scaling_ratio != TSC_RATIO_DEFAULT) + _tsc = __scale_tsc(vcpu->arch.tsc_scaling_ratio, tsc); return _tsc; } static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) { - struct vcpu_svm *svm = to_svm(vcpu); u64 ratio; u64 khz; /* Guest TSC same frequency as host TSC? */ if (!scale) { - svm->tsc_ratio = TSC_RATIO_DEFAULT; + vcpu->arch.tsc_scaling_ratio = TSC_RATIO_DEFAULT; return; } @@ -1033,7 +1029,7 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) user_tsc_khz); return; } - svm->tsc_ratio = ratio; + vcpu->arch.tsc_scaling_ratio = ratio; } static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu) @@ -1067,7 +1063,7 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho struct vcpu_svm *svm = to_svm(vcpu); if (host) { - if (svm->tsc_ratio != TSC_RATIO_DEFAULT) + if (vcpu->arch.tsc_scaling_ratio != TSC_RATIO_DEFAULT) WARN_ON(adjustment < 0); adjustment = svm_scale_tsc(vcpu, (u64)adjustment); } @@ -1238,8 +1234,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) goto out; } - svm->tsc_ratio = TSC_RATIO_DEFAULT; - err = kvm_vcpu_init(&svm->vcpu, kvm, id); if (err) goto free_svm; @@ -1325,10 +1319,12 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); - if (static_cpu_has(X86_FEATURE_TSCRATEMSR) && - svm->tsc_ratio != __this_cpu_read(current_tsc_ratio)) { - __this_cpu_write(current_tsc_ratio, svm->tsc_ratio); - wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio); + if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { + u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio; + if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) { + __this_cpu_write(current_tsc_ratio, tsc_ratio); + wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio); + } } } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e728c539ecfb..ef5b9d66cd71 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -113,6 +113,7 @@ u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits; EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits); u64 __read_mostly kvm_max_tsc_scaling_ratio; EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); +static u64 __read_mostly 
kvm_default_tsc_scaling_ratio; /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; @@ -1258,8 +1259,11 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) int use_scaling = 0; /* tsc_khz can be zero if TSC calibration fails */ - if (this_tsc_khz == 0) + if (this_tsc_khz == 0) { + /* set tsc_scaling_ratio to a safe value */ + vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; return; + } /* Compute a scale to convert nanoseconds in TSC cycles */ kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, @@ -7367,6 +7371,9 @@ int kvm_arch_hardware_setup(void) if (r != 0) return r; + if (kvm_has_tsc_control) + kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; + kvm_init_msr_list(); return 0; } -- cgit v1.2.3 From 35181e86df97e4223f4a28fb33e2bcf3b73de141 Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Tue, 20 Oct 2015 15:39:03 +0800 Subject: KVM: x86: Add a common TSC scaling function VMX and SVM calculate the TSC scaling ratio in a similar logic, so this patch generalizes it to a common TSC scaling function. Signed-off-by: Haozhong Zhang [Inline the multiplication and shift steps into mul_u64_u64_shr. Remove BUG_ON. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/svm.c | 48 ++++---------------------------------- arch/x86/kvm/x86.c | 40 +++++++++++++++++++++++++++++++- include/linux/kvm_host.h | 1 + include/linux/math64.h | 51 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 97 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f3354bd92364..52d1419968eb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1238,6 +1238,8 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); +u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); + unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9c92e6f429d0..65f4f1947a62 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -212,7 +212,6 @@ static int nested_svm_intercept(struct vcpu_svm *svm); static int nested_svm_vmexit(struct vcpu_svm *svm); static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); -static u64 __scale_tsc(u64 ratio, u64 tsc); enum { VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, @@ -892,21 +891,7 @@ static __init int svm_hardware_setup(void) kvm_enable_efer_bits(EFER_FFXSR); if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { - u64 max; - kvm_has_tsc_control = true; - - /* - * Make sure the user can only configure tsc_khz values that - * fit into a signed integer. - * A min value is not calculated needed because it will always - * be 1 on all machines and a value of 0 is used to disable - * tsc-scaling for the vcpu. 
- */ - max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX)); - - kvm_max_guest_tsc_khz = max; - kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; kvm_tsc_scaling_ratio_frac_bits = 32; } @@ -972,31 +957,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } -static u64 __scale_tsc(u64 ratio, u64 tsc) -{ - u64 mult, frac, _tsc; - - mult = ratio >> 32; - frac = ratio & ((1ULL << 32) - 1); - - _tsc = tsc; - _tsc *= mult; - _tsc += (tsc >> 32) * frac; - _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32; - - return _tsc; -} - -static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) -{ - u64 _tsc = tsc; - - if (vcpu->arch.tsc_scaling_ratio != TSC_RATIO_DEFAULT) - _tsc = __scale_tsc(vcpu->arch.tsc_scaling_ratio, tsc); - - return _tsc; -} - static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) { u64 ratio; @@ -1065,7 +1025,7 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho if (host) { if (vcpu->arch.tsc_scaling_ratio != TSC_RATIO_DEFAULT) WARN_ON(adjustment < 0); - adjustment = svm_scale_tsc(vcpu, (u64)adjustment); + adjustment = kvm_scale_tsc(vcpu, (u64)adjustment); } svm->vmcb->control.tsc_offset += adjustment; @@ -1083,7 +1043,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) { u64 tsc; - tsc = svm_scale_tsc(vcpu, rdtsc()); + tsc = kvm_scale_tsc(vcpu, rdtsc()); return target_tsc - tsc; } @@ -3075,7 +3035,7 @@ static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); return vmcb->control.tsc_offset + - svm_scale_tsc(vcpu, host_tsc); + kvm_scale_tsc(vcpu, host_tsc); } static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) @@ -3085,7 +3045,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr_info->index) { case MSR_IA32_TSC: { msr_info->data = svm->vmcb->control.tsc_offset + - svm_scale_tsc(vcpu, rdtsc()); + kvm_scale_tsc(vcpu, rdtsc()); break; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ef5b9d66cd71..1473e64cb744 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1329,6 +1329,33 @@ static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; } +/* + * Multiply tsc by a fixed point number represented by ratio. + * + * The most significant 64-N bits (mult) of ratio represent the + * integral part of the fixed point number; the remaining N bits + * (frac) represent the fractional part, ie. ratio represents a fixed + * point number (mult + frac * 2^(-N)). + * + * N equals to kvm_tsc_scaling_ratio_frac_bits. + */ +static inline u64 __scale_tsc(u64 ratio, u64 tsc) +{ + return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); +} + +u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) +{ + u64 _tsc = tsc; + u64 ratio = vcpu->arch.tsc_scaling_ratio; + + if (ratio != kvm_default_tsc_scaling_ratio) + _tsc = __scale_tsc(ratio, tsc); + + return _tsc; +} +EXPORT_SYMBOL_GPL(kvm_scale_tsc); + void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct kvm *kvm = vcpu->kvm; @@ -7371,8 +7398,19 @@ int kvm_arch_hardware_setup(void) if (r != 0) return r; - if (kvm_has_tsc_control) + if (kvm_has_tsc_control) { + /* + * Make sure the user can only configure tsc_khz values that + * fit into a signed integer. + * A min value is not calculated needed because it will always + * be 1 on all machines. 
+ */ + u64 max = min(0x7fffffffULL, + __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz)); + kvm_max_guest_tsc_khz = max; + kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; + } kvm_init_msr_list(); return 0; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 242a6d2b53ff..5706a2108f0a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1183,4 +1183,5 @@ void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *); int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */ + #endif diff --git a/include/linux/math64.h b/include/linux/math64.h index c45c089bfdac..44282ec7b682 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -142,6 +142,13 @@ static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) } #endif /* mul_u64_u32_shr */ +#ifndef mul_u64_u64_shr +static inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift) +{ + return (u64)(((unsigned __int128)a * mul) >> shift); +} +#endif /* mul_u64_u64_shr */ + #else #ifndef mul_u64_u32_shr @@ -161,6 +168,50 @@ static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) } #endif /* mul_u64_u32_shr */ +#ifndef mul_u64_u64_shr +static inline u64 mul_u64_u64_shr(u64 a, u64 b, unsigned int shift) +{ + union { + u64 ll; + struct { +#ifdef __BIG_ENDIAN + u32 high, low; +#else + u32 low, high; +#endif + } l; + } rl, rm, rn, rh, a0, b0; + u64 c; + + a0.ll = a; + b0.ll = b; + + rl.ll = (u64)a0.l.low * b0.l.low; + rm.ll = (u64)a0.l.low * b0.l.high; + rn.ll = (u64)a0.l.high * b0.l.low; + rh.ll = (u64)a0.l.high * b0.l.high; + + /* + * Each of these lines computes a 64-bit intermediate result into "c", + * starting at bits 32-95. The low 32-bits go into the result of the + * multiplication, the high 32-bits are carried into the next step. + */ + rl.l.high = c = (u64)rl.l.high + rm.l.low + rn.l.low; + rh.l.low = c = (c >> 32) + rm.l.high + rn.l.high + rh.l.low; + rh.l.high = (c >> 32) + rh.l.high; + + /* + * The 128-bit result of the multiplication is in rl.ll and rh.ll, + * shift it right and throw away the high part of the result. + */ + if (shift == 0) + return rl.ll; + if (shift < 64) + return (rl.ll >> shift) | (rh.ll << (64 - shift)); + return rh.ll >> (shift & 63); +} +#endif /* mul_u64_u64_shr */ + #endif #endif /* _LINUX_MATH64_H */ -- cgit v1.2.3 From 381d585c80e34988269bd7901ad910981e900be1 Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Tue, 20 Oct 2015 15:39:04 +0800 Subject: KVM: x86: Replace call-back set_tsc_khz() with a common function Both VMX and SVM propagate virtual_tsc_khz in the same way, so this patch removes the call-back set_tsc_khz() and replaces it with a common function. 
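A worked example of the ratio computation the common function performs (numbers are illustrative; in this series only SVM wires up hardware scaling, with 32 fractional bits):

	/*
	 * guest at 3,000,000 kHz on a host at 2,000,000 kHz:
	 * ratio = 2^32 * 3000000 / 2000000 = 1.5 * 2^32 = 0x180000000
	 */
	ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
				user_tsc_khz, tsc_khz);

mul_u64_u32_div() carries the full 96-bit intermediate product, so the computation cannot overflow the way a plain shift-then-divide would once the fractional-bit count grows large.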
Signed-off-by: Haozhong Zhang Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/svm.c | 36 -------------------------------- arch/x86/kvm/vmx.c | 17 --------------- arch/x86/kvm/x86.c | 46 ++++++++++++++++++++++++++++++++++++----- include/linux/math64.h | 29 ++++++++++++++++++++++++++ 5 files changed, 70 insertions(+), 59 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 52d1419968eb..c5a3f3d66e90 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -853,7 +853,6 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); - void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale); u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 65f4f1947a62..f6e49a6c9ab0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -957,41 +957,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } -static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) -{ - u64 ratio; - u64 khz; - - /* Guest TSC same frequency as host TSC? */ - if (!scale) { - vcpu->arch.tsc_scaling_ratio = TSC_RATIO_DEFAULT; - return; - } - - /* TSC scaling supported? */ - if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { - if (user_tsc_khz > tsc_khz) { - vcpu->arch.tsc_catchup = 1; - vcpu->arch.tsc_always_catchup = 1; - } else - WARN(1, "user requested TSC rate below hardware speed\n"); - return; - } - - khz = user_tsc_khz; - - /* TSC scaling required - calculate ratio */ - ratio = khz << 32; - do_div(ratio, tsc_khz); - - if (ratio == 0 || ratio & TSC_RATIO_RSVD) { - WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n", - user_tsc_khz); - return; - } - vcpu->arch.tsc_scaling_ratio = ratio; -} - static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4402,7 +4367,6 @@ static struct kvm_x86_ops svm_x86_ops = { .has_wbinvd_exit = svm_has_wbinvd_exit, - .set_tsc_khz = svm_set_tsc_khz, .read_tsc_offset = svm_read_tsc_offset, .write_tsc_offset = svm_write_tsc_offset, .adjust_tsc_offset = svm_adjust_tsc_offset, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a26ed285931b..baee46893899 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2382,22 +2382,6 @@ static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) return host_tsc + tsc_offset; } -/* - * Engage any workarounds for mis-matched TSC rates. Currently limited to - * software catchup for faster rates on slower CPUs. 
- */ -static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) -{ - if (!scale) - return; - - if (user_tsc_khz > tsc_khz) { - vcpu->arch.tsc_catchup = 1; - vcpu->arch.tsc_always_catchup = 1; - } else - WARN(1, "user requested TSC rate below hardware speed\n"); -} - static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu) { return vmcs_read64(TSC_OFFSET); @@ -10826,7 +10810,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - .set_tsc_khz = vmx_set_tsc_khz, .read_tsc_offset = vmx_read_tsc_offset, .write_tsc_offset = vmx_write_tsc_offset, .adjust_tsc_offset = vmx_adjust_tsc_offset, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1473e64cb744..c314e8d22a67 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1253,7 +1253,43 @@ static u32 adjust_tsc_khz(u32 khz, s32 ppm) return v; } -static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) +static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) +{ + u64 ratio; + + /* Guest TSC same frequency as host TSC? */ + if (!scale) { + vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; + return 0; + } + + /* TSC scaling supported? */ + if (!kvm_has_tsc_control) { + if (user_tsc_khz > tsc_khz) { + vcpu->arch.tsc_catchup = 1; + vcpu->arch.tsc_always_catchup = 1; + return 0; + } else { + WARN(1, "user requested TSC rate below hardware speed\n"); + return -1; + } + } + + /* TSC scaling required - calculate ratio */ + ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits, + user_tsc_khz, tsc_khz); + + if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) { + WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", + user_tsc_khz); + return -1; + } + + vcpu->arch.tsc_scaling_ratio = ratio; + return 0; +} + +static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) { u32 thresh_lo, thresh_hi; int use_scaling = 0; @@ -1262,7 +1298,7 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) if (this_tsc_khz == 0) { /* set tsc_scaling_ratio to a safe value */ vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; - return; + return -1; } /* Compute a scale to convert nanoseconds in TSC cycles */ @@ -1283,7 +1319,7 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi); use_scaling = 1; } - kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling); + return set_tsc_khz(vcpu, this_tsc_khz, use_scaling); } static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) @@ -3353,9 +3389,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (user_tsc_khz == 0) user_tsc_khz = tsc_khz; - kvm_set_tsc_khz(vcpu, user_tsc_khz); + if (!kvm_set_tsc_khz(vcpu, user_tsc_khz)) + r = 0; - r = 0; goto out; } case KVM_GET_TSC_KHZ: { diff --git a/include/linux/math64.h b/include/linux/math64.h index 44282ec7b682..6e8b5b270ffe 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -214,4 +214,33 @@ static inline u64 mul_u64_u64_shr(u64 a, u64 b, unsigned int shift) #endif +#ifndef mul_u64_u32_div +static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor) +{ + union { + u64 ll; + struct { +#ifdef __BIG_ENDIAN + u32 high, low; +#else + u32 low, high; +#endif + } l; + } u, rl, rh; + + u.ll = a; + rl.ll = (u64)u.l.low * mul; + rh.ll = (u64)u.l.high * mul + rl.l.high; + + /* Bits 32-63 of the result will be in rh.l.low. 
*/ + rl.l.high = do_div(rh.ll, divisor); + + /* Bits 0-31 of the result will be in rl.l.low. */ + do_div(rl.ll, divisor); + + rl.l.high = rh.l.low; + return rl.ll; +} +#endif /* mul_u64_u32_div */ + #endif /* _LINUX_MATH64_H */ -- cgit v1.2.3 From 07c1419a32bbba08cf1efb6d1ecaf24f174fa4c3 Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Tue, 20 Oct 2015 15:39:05 +0800 Subject: KVM: x86: Replace call-back compute_tsc_offset() with a common function Both VMX and SVM calculate the tsc-offset in the same way, so this patch removes the call-back compute_tsc_offset() and replaces it with a common function kvm_compute_tsc_offset(). Signed-off-by: Haozhong Zhang Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/svm.c | 10 ---------- arch/x86/kvm/vmx.c | 6 ------ arch/x86/kvm/x86.c | 15 ++++++++++++--- 4 files changed, 12 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c5a3f3d66e90..672f960e8144 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -856,7 +856,6 @@ struct kvm_x86_ops { u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); - u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc); void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f6e49a6c9ab0..d99b175ffbea 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1004,15 +1004,6 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) -{ - u64 tsc; - - tsc = kvm_scale_tsc(vcpu, rdtsc()); - - return target_tsc - tsc; -} - static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -4370,7 +4361,6 @@ static struct kvm_x86_ops svm_x86_ops = { .read_tsc_offset = svm_read_tsc_offset, .write_tsc_offset = svm_write_tsc_offset, .adjust_tsc_offset = svm_adjust_tsc_offset, - .compute_tsc_offset = svm_compute_tsc_offset, .read_l1_tsc = svm_read_l1_tsc, .set_tdp_cr3 = set_tdp_cr3, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index baee46893899..2d4782ce9a93 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2426,11 +2426,6 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho offset + adjustment); } -static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) -{ - return target_tsc - rdtsc(); -} - static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0); @@ -10813,7 +10808,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .read_tsc_offset = vmx_read_tsc_offset, .write_tsc_offset = vmx_write_tsc_offset, .adjust_tsc_offset = vmx_adjust_tsc_offset, - .compute_tsc_offset = vmx_compute_tsc_offset, .read_l1_tsc = vmx_read_l1_tsc, .set_tdp_cr3 = vmx_set_cr3, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c314e8d22a67..bb46066e125b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1392,6 +1392,15 @@ u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) } EXPORT_SYMBOL_GPL(kvm_scale_tsc); +static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) +{ + u64 tsc; + + tsc = kvm_scale_tsc(vcpu, rdtsc()); + + return target_tsc - tsc; +} + void kvm_write_tsc(struct 
kvm_vcpu *vcpu, struct msr_data *msr) { struct kvm *kvm = vcpu->kvm; @@ -1403,7 +1412,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) u64 data = msr->data; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); - offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); + offset = kvm_compute_tsc_offset(vcpu, data); ns = get_kernel_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; @@ -1460,7 +1469,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) } else { u64 delta = nsec_to_cycles(vcpu, elapsed); data += delta; - offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); + offset = kvm_compute_tsc_offset(vcpu, data); pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } matched = true; @@ -2687,7 +2696,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); if (check_tsc_unstable()) { - u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu, + u64 offset = kvm_compute_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); kvm_x86_ops->write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; -- cgit v1.2.3 From 58ea6767874e791a6c4f5c96c7d9155de4b1af28 Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Tue, 20 Oct 2015 15:39:06 +0800 Subject: KVM: x86: Move TSC scaling logic out of call-back adjust_tsc_offset() For both VMX and SVM, if the 2nd argument of the call-back adjust_tsc_offset() is the host TSC, then adjust_tsc_offset() will scale it first. This patch moves this common TSC scaling logic to its caller adjust_tsc_offset_host() and renames the call-back adjust_tsc_offset() to adjust_tsc_offset_guest(). Signed-off-by: Haozhong Zhang Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 13 +------------ arch/x86/kvm/svm.c | 10 ++-------- arch/x86/kvm/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 14 ++++++++++++++ 4 files changed, 19 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 672f960e8144..8465944fe8da 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -845,7 +845,7 @@ struct kvm_x86_ops { int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); bool (*invpcid_supported)(void); - void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host); + void (*adjust_tsc_offset_guest)(struct kvm_vcpu *vcpu, s64 adjustment); void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -922,17 +922,6 @@ struct kvm_arch_async_pf { extern struct kvm_x86_ops *kvm_x86_ops; -static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, - s64 adjustment) -{ - kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, false); -} - -static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) -{ - kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, true); -} - int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d99b175ffbea..b5824a3894bf 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -983,16 +983,10 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) +static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { struct vcpu_svm *svm = to_svm(vcpu); - if (host) { - if (vcpu->arch.tsc_scaling_ratio != TSC_RATIO_DEFAULT) - WARN_ON(adjustment < 0); - adjustment = kvm_scale_tsc(vcpu, (u64)adjustment); - } -
svm->vmcb->control.tsc_offset += adjustment; if (is_guest_mode(vcpu)) svm->nested.hsave->control.tsc_offset += adjustment; @@ -4360,7 +4354,7 @@ static struct kvm_x86_ops svm_x86_ops = { .read_tsc_offset = svm_read_tsc_offset, .write_tsc_offset = svm_write_tsc_offset, - .adjust_tsc_offset = svm_adjust_tsc_offset, + .adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest, .read_l1_tsc = svm_read_l1_tsc, .set_tdp_cr3 = set_tdp_cr3, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2d4782ce9a93..c0fb398ac50e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2413,7 +2413,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) } } -static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) +static void vmx_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { u64 offset = vmcs_read64(TSC_OFFSET); @@ -10807,7 +10807,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .read_tsc_offset = vmx_read_tsc_offset, .write_tsc_offset = vmx_write_tsc_offset, - .adjust_tsc_offset = vmx_adjust_tsc_offset, + .adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest, .read_l1_tsc = vmx_read_l1_tsc, .set_tdp_cr3 = vmx_set_cr3, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bb46066e125b..4073009fe578 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1526,6 +1526,20 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) EXPORT_SYMBOL_GPL(kvm_write_tsc); +static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, + s64 adjustment) +{ + kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment); +} + +static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) +{ + if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) + WARN_ON(adjustment < 0); + adjustment = kvm_scale_tsc(vcpu, (u64) adjustment); + kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment); +} + #ifdef CONFIG_X86_64 static cycle_t read_tsc(void) -- cgit v1.2.3 From 4ba76538dd52dd9b18b464e509cb8f3ed4ed993f Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Tue, 20 Oct 2015 15:39:07 +0800 Subject: KVM: x86: Move TSC scaling logic out of call-back read_l1_tsc() Both VMX and SVM scale the host TSC in the same way in call-back read_l1_tsc(), so this patch moves the scaling logic from call-back read_l1_tsc() to a common function kvm_read_l1_tsc().
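For illustration only: the ratio computation these patches rely on reduces to the mul_u64_u32_div() helper introduced earlier in the series. Below is a hypothetical user-space model of that helper (the function name, the __int128 shortcut, and the sample frequencies are this example's assumptions, not kernel code), checking the value set_tsc_khz() would derive for a guest running at half the host rate:

#include <assert.h>
#include <stdint.h>

/*
 * Model of mul_u64_u32_div(): a 64x32 multiply followed by a 32-bit
 * divide, done here with GCC's 128-bit integers instead of the kernel's
 * two-step do_div() sequence.
 */
static uint64_t mul_u64_u32_div_model(uint64_t a, uint32_t mul, uint32_t divisor)
{
	return (uint64_t)(((unsigned __int128)a * mul) / divisor);
}

int main(void)
{
	/*
	 * ratio = (1 << frac_bits) * user_tsc_khz / tsc_khz; SVM uses 32
	 * fractional bits, so half the host rate yields 1 << 31.
	 */
	assert(mul_u64_u32_div_model(1ULL << 32, 1300000, 2600000) == 1ULL << 31);
	return 0;
}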
Signed-off-by: Haozhong Zhang Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/lapic.c | 4 ++-- arch/x86/kvm/svm.c | 3 +-- arch/x86/kvm/x86.c | 11 ++++++++--- 4 files changed, 12 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8465944fe8da..456a3869a57e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1226,6 +1226,7 @@ void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); +u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ecd4ea1d28a8..4d30b865be30 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1250,7 +1250,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) tsc_deadline = apic->lapic_timer.expired_tscdeadline; apic->lapic_timer.expired_tscdeadline = 0; - guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc()); + guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); /* __delay is delay_tsc whenever the hardware has TSC, thus always. */ @@ -1318,7 +1318,7 @@ static void start_apic_timer(struct kvm_lapic *apic) local_irq_save(flags); now = apic->lapic_timer.timer.base->get_time(); - guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc()); + guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); if (likely(tscdeadline > guest_tsc)) { ns = (tscdeadline - guest_tsc) * 1000000ULL; do_div(ns, this_tsc_khz); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b5824a3894bf..f2ba91990b4e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2984,8 +2984,7 @@ static int cr8_write_interception(struct vcpu_svm *svm) static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); - return vmcb->control.tsc_offset + - kvm_scale_tsc(vcpu, host_tsc); + return vmcb->control.tsc_offset + host_tsc; } static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4073009fe578..3d008de9cb05 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1401,6 +1401,12 @@ static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) return target_tsc - tsc; } +u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) +{ + return kvm_x86_ops->read_l1_tsc(vcpu, kvm_scale_tsc(vcpu, host_tsc)); +} +EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); + void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct kvm *kvm = vcpu->kvm; @@ -1738,7 +1744,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) kernel_ns = get_kernel_ns(); } - tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc); + tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); /* * We may have to catch up the TSC to match elapsed wall clock @@ -6545,8 +6551,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); - vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, - rdtsc()); + vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); -- cgit v1.2.3 From 27cca94e032c1749825fdd9b6b379e4235cd52e1 Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Tue, 20 Oct 2015 15:39:08 +0800 Subject: KVM: 
x86: Use the correct vcpu's TSC rate to compute time scale This patch makes KVM use virtual_tsc_khz rather than the host TSC rate as the vcpu's TSC rate to compute the time scale if TSC scaling is enabled. Signed-off-by: Haozhong Zhang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3d008de9cb05..2cb074f5aaed 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1707,7 +1707,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) static int kvm_guest_time_update(struct kvm_vcpu *v) { - unsigned long flags, this_tsc_khz; + unsigned long flags, this_tsc_khz, tgt_tsc_khz; struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_arch *ka = &v->kvm->arch; s64 kernel_ns; @@ -1770,7 +1770,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) return 0; if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { - kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, + tgt_tsc_khz = kvm_has_tsc_control ? + vcpu->virtual_tsc_khz : this_tsc_khz; + kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz, &vcpu->hv_clock.tsc_shift, &vcpu->hv_clock.tsc_to_system_mul); vcpu->hw_tsc_khz = this_tsc_khz; -- cgit v1.2.3 From 64903d6195cbfb051ce339d30848cc64babdba12 Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Tue, 20 Oct 2015 15:39:09 +0800 Subject: KVM: VMX: Enable and initialize VMX TSC scaling This patch enhances the kvm-intel module to enable VMX TSC scaling and collects TSC scaling ratio information during initialization. Signed-off-by: Haozhong Zhang Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 3 +++ arch/x86/kvm/vmx.c | 17 ++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index aa336ff3e03e..14c63c7e8337 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -73,6 +73,7 @@ #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 #define SECONDARY_EXEC_PCOMMIT 0x00200000 +#define SECONDARY_EXEC_TSC_SCALING 0x02000000 #define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_NMI_EXITING 0x00000008 @@ -167,6 +168,8 @@ enum vmcs_field { VMWRITE_BITMAP = 0x00002028, XSS_EXIT_BITMAP = 0x0000202C, XSS_EXIT_BITMAP_HIGH = 0x0000202D, + TSC_MULTIPLIER = 0x00002032, + TSC_MULTIPLIER_HIGH = 0x00002033, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c0fb398ac50e..e45b03ed5c52 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -107,6 +107,8 @@ static u64 __read_mostly host_xss; static bool __read_mostly enable_pml = 1; module_param_named(pml, enable_pml, bool, S_IRUGO); +#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL + #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) #define KVM_VM_CR0_ALWAYS_ON \ @@ -1172,6 +1174,12 @@ static inline bool cpu_has_vmx_pml(void) return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; } +static inline bool cpu_has_vmx_tsc_scaling(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_TSC_SCALING; +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -3133,7 +3141,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_SHADOW_VMCS | SECONDARY_EXEC_XSAVES |
SECONDARY_EXEC_ENABLE_PML | - SECONDARY_EXEC_PCOMMIT; + SECONDARY_EXEC_PCOMMIT | + SECONDARY_EXEC_TSC_SCALING; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -6178,6 +6187,12 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_apicv()) enable_apicv = 0; + if (cpu_has_vmx_tsc_scaling()) { + kvm_has_tsc_control = true; + kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; + kvm_tsc_scaling_ratio_frac_bits = 48; + } + if (enable_apicv) kvm_x86_ops->update_cr8_intercept = NULL; else { -- cgit v1.2.3 From ff2c3a1803775cc72dc6f624b59554956396b0ee Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Tue, 20 Oct 2015 15:39:10 +0800 Subject: KVM: VMX: Setup TSC scaling ratio when a vcpu is loaded This patch makes the kvm-intel module load the TSC scaling ratio into the TSC multiplier field of the VMCS when a vcpu is loaded, so that the TSC scaling ratio can take effect if VMX TSC scaling is enabled. Signed-off-by: Haozhong Zhang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e45b03ed5c52..890295287afa 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2061,6 +2061,12 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ + + /* Setup TSC multiplier */ + if (cpu_has_vmx_tsc_scaling()) + vmcs_write64(TSC_MULTIPLIER, + vcpu->arch.tsc_scaling_ratio); + vmx->loaded_vmcs->cpu = cpu; } -- cgit v1.2.3 From be7b263ea925324e54e48c3558d4719be5374053 Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Tue, 20 Oct 2015 15:39:11 +0800 Subject: KVM: VMX: Use a scaled host TSC for guest readings of MSR_IA32_TSC This patch makes kvm-intel return a scaled host TSC plus the TSC offset when handling guest readings of MSR_IA32_TSC. Signed-off-by: Haozhong Zhang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 890295287afa..c6016e4a5af0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2371,15 +2371,16 @@ static void setup_msrs(struct vcpu_vmx *vmx) /* * reads and returns guest's timestamp counter "register" - * guest_tsc = host_tsc + tsc_offset -- 21.3 + * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset + * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3 */ -static u64 guest_read_tsc(void) +static u64 guest_read_tsc(struct kvm_vcpu *vcpu) { u64 host_tsc, tsc_offset; host_tsc = rdtsc(); tsc_offset = vmcs_read64(TSC_OFFSET); - return host_tsc + tsc_offset; + return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset; } /* @@ -2771,7 +2772,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_EFER: return kvm_get_msr_common(vcpu, msr_info); case MSR_IA32_TSC: - msr_info->data = guest_read_tsc(); + msr_info->data = guest_read_tsc(vcpu); break; case MSR_IA32_SYSENTER_CS: msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); -- cgit v1.2.3 From 8cfe9866960581303f244780945c5d12ecc4e5bc Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Tue, 20 Oct 2015 15:39:12 +0800 Subject: KVM: VMX: Dump TSC multiplier in dump_vmcs() This patch enhances dump_vmcs() to dump the value of the TSC multiplier field in the VMCS.
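Taken together, the VMX patches above implement the cited formula guest_tsc = (host_tsc * multiplier) >> 48 + tsc_offset. A hypothetical user-space sketch of that arithmetic follows (all names and frequencies are local to this example; the kernel itself goes through mul_u64_u64_shr() rather than __int128):

#include <stdint.h>
#include <stdio.h>

/*
 * guest_tsc = (host_tsc * multiplier) >> 48, with the multiplier in
 * 16.48 fixed point as initialized by hardware_setup() above.
 */
static uint64_t scale_tsc(uint64_t host_tsc, uint64_t multiplier)
{
	return (uint64_t)(((unsigned __int128)host_tsc * multiplier) >> 48);
}

int main(void)
{
	uint64_t host_khz = 2600000, virtual_khz = 1300000;
	uint64_t multiplier =
		(uint64_t)(((unsigned __int128)virtual_khz << 48) / host_khz);
	uint64_t tsc_offset = 0;

	/*
	 * A guest configured at half the host rate observes half the
	 * elapsed cycles: 26000000 host cycles read as 13000000.
	 */
	printf("guest tsc = %llu\n",
	       (unsigned long long)(scale_tsc(26000000, multiplier) + tsc_offset));
	return 0;
}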
Signed-off-by: Haozhong Zhang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c6016e4a5af0..b765b036a048 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8009,6 +8009,9 @@ static void dump_vmcs(void) vmcs_read32(IDT_VECTORING_INFO_FIELD), vmcs_read32(IDT_VECTORING_ERROR_CODE)); pr_err("TSC Offset = 0x%016lx\n", vmcs_readl(TSC_OFFSET)); + if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) + pr_err("TSC Multiplier = 0x%016lx\n", + vmcs_readl(TSC_MULTIPLIER)); if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) -- cgit v1.2.3 From 54a20552e1eae07aa240fa370a0293e006b5faed Mon Sep 17 00:00:00 2001 From: Eric Northup Date: Tue, 3 Nov 2015 18:03:53 +0100 Subject: KVM: x86: work around infinite loop in microcode when #AC is delivered It was found that a guest can DoS a host by triggering an infinite stream of "alignment check" (#AC) exceptions. This causes the microcode to enter an infinite loop where the core never receives another interrupt. The host kernel panics pretty quickly due to the effects (CVE-2015-5307). Signed-off-by: Eric Northup Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/svm.h | 1 + arch/x86/kvm/svm.c | 8 ++++++++ arch/x86/kvm/vmx.c | 5 ++++- 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index b5d7640abc5d..8a4add8e4639 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -100,6 +100,7 @@ { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \ { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \ { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \ + { SVM_EXIT_EXCP_BASE + AC_VECTOR, "AC excp" }, \ { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \ { SVM_EXIT_INTR, "interrupt" }, \ { SVM_EXIT_NMI, "nmi" }, \ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f2ba91990b4e..183926483c3a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1019,6 +1019,7 @@ static void init_vmcb(struct vcpu_svm *svm) set_exception_intercept(svm, PF_VECTOR); set_exception_intercept(svm, UD_VECTOR); set_exception_intercept(svm, MC_VECTOR); + set_exception_intercept(svm, AC_VECTOR); set_intercept(svm, INTERCEPT_INTR); set_intercept(svm, INTERCEPT_NMI); @@ -1707,6 +1708,12 @@ static int ud_interception(struct vcpu_svm *svm) return 1; } +static int ac_interception(struct vcpu_svm *svm) +{ + kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0); + return 1; +} + static void svm_fpu_activate(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3270,6 +3277,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, + [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception, [SVM_EXIT_INTR] = intr_interception, [SVM_EXIT_NMI] = nmi_interception, [SVM_EXIT_SMI] = nop_on_interception, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b765b036a048..89aaedd2a91d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1639,7 +1639,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) u32 eb; eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | - (1u << NM_VECTOR) | (1u << DB_VECTOR); + (1u 
<< NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); if ((vcpu->guest_debug & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) @@ -5261,6 +5261,9 @@ static int handle_exception(struct kvm_vcpu *vcpu) return handle_rmode_exception(vcpu, ex_no, error_code); switch (ex_no) { + case AC_VECTOR: + kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); + return 1; case DB_VECTOR: dr6 = vmcs_readl(EXIT_QUALIFICATION); if (!(vcpu->guest_debug & -- cgit v1.2.3 From cbdb967af3d54993f5814f1cee0ed311a055377d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 10 Nov 2015 09:14:39 +0100 Subject: KVM: svm: unconditionally intercept #DB This is needed to avoid the possibility that the guest triggers an infinite stream of #DB exceptions (CVE-2015-8104). VMX is not affected: because it does not save DR6 in the VMCS, it already intercepts #DB unconditionally. Reported-by: Jan Beulich Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 183926483c3a..1cc1ffca0d8c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1020,6 +1020,7 @@ static void init_vmcb(struct vcpu_svm *svm) set_exception_intercept(svm, UD_VECTOR); set_exception_intercept(svm, MC_VECTOR); set_exception_intercept(svm, AC_VECTOR); + set_exception_intercept(svm, DB_VECTOR); set_intercept(svm, INTERCEPT_INTR); set_intercept(svm, INTERCEPT_NMI); @@ -1554,20 +1555,13 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, mark_dirty(svm->vmcb, VMCB_SEG); } -static void update_db_bp_intercept(struct kvm_vcpu *vcpu) +static void update_bp_intercept(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - clr_exception_intercept(svm, DB_VECTOR); clr_exception_intercept(svm, BP_VECTOR); - if (svm->nmi_singlestep) - set_exception_intercept(svm, DB_VECTOR); - if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { - if (vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - set_exception_intercept(svm, DB_VECTOR); if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) set_exception_intercept(svm, BP_VECTOR); } else @@ -1673,7 +1667,6 @@ static int db_interception(struct vcpu_svm *svm) if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); - update_db_bp_intercept(&svm->vcpu); } if (svm->vcpu.guest_debug & @@ -3661,7 +3654,6 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) */ svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); - update_db_bp_intercept(vcpu); } static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) @@ -4287,7 +4279,7 @@ static struct kvm_x86_ops svm_x86_ops = { .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, - .update_db_bp_intercept = update_db_bp_intercept, + .update_db_bp_intercept = update_bp_intercept, .get_msr = svm_get_msr, .set_msr = svm_set_msr, .get_segment_base = svm_get_segment_base, -- cgit v1.2.3 From a96036b8ef7df9f10cd575c0d78359bd33188e8e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 10 Nov 2015 11:55:36 +0100 Subject: KVM: x86: rename update_db_bp_intercept to update_bp_intercept Because #DB is now intercepted unconditionally, this callback only operates on #BP for both VMX and SVM. 
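As a hypothetical sketch of the end state of these two CVE fixes (illustrative user-space code, not the kernel's update_exception_bitmap() or update_bp_intercept(), and limited to the three vectors the patches touch): #AC and #DB are now intercepted unconditionally, and only the #BP intercept still follows the guest_debug setting:

#include <stdint.h>
#include <stdio.h>

/* Architectural x86 exception vector numbers. */
#define DB_VECTOR  1
#define BP_VECTOR  3
#define AC_VECTOR 17

static uint32_t exception_bitmap(int guest_debug_sw_bp)
{
	/* #DB and #AC are always trapped after the fixes above. */
	uint32_t eb = (1u << DB_VECTOR) | (1u << AC_VECTOR);

	/* Only #BP remains conditional on KVM_GUESTDBG_USE_SW_BP. */
	if (guest_debug_sw_bp)
		eb |= 1u << BP_VECTOR;
	return eb;
}

int main(void)
{
	printf("eb without debugger: %#x\n", exception_bitmap(0));
	printf("eb with sw breakpoints: %#x\n", exception_bitmap(1));
	return 0;
}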
Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 456a3869a57e..30cfd64295a0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -778,7 +778,7 @@ struct kvm_x86_ops { void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); void (*vcpu_put)(struct kvm_vcpu *vcpu); - void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu); + void (*update_bp_intercept)(struct kvm_vcpu *vcpu); int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1cc1ffca0d8c..83a1c643f9a5 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4279,7 +4279,7 @@ static struct kvm_x86_ops svm_x86_ops = { .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, - .update_db_bp_intercept = update_bp_intercept, + .update_bp_intercept = update_bp_intercept, .get_msr = svm_get_msr, .set_msr = svm_set_msr, .get_segment_base = svm_get_segment_base, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 89aaedd2a91d..87acc5221740 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10759,7 +10759,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, - .update_db_bp_intercept = update_exception_bitmap, + .update_bp_intercept = update_exception_bitmap, .get_msr = vmx_get_msr, .set_msr = vmx_set_msr, .get_segment_base = vmx_get_segment_base, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2cb074f5aaed..aba7f95d7a64 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7115,7 +7115,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, */ kvm_set_rflags(vcpu, rflags); - kvm_x86_ops->update_db_bp_intercept(vcpu); + kvm_x86_ops->update_bp_intercept(vcpu); r = 0; -- cgit v1.2.3 From 46561c3959d6307d22139c24cd0bf196162e5681 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 11 Nov 2015 10:19:31 -0800 Subject: x86/mpx: Do proper get_user() when running 32-bit binaries on 64-bit kernels When you call get_user(foo, bar), you effectively do a copy_from_user(&foo, bar, sizeof(*bar)); Note that the sizeof() is implicit. When we reach out to userspace to try to zap an entire "bounds table" we need to go read a "bounds directory entry" in order to locate the table's address. The size of a "directory entry" depends on the binary being run and is always the size of a pointer. But, when we have a 64-bit kernel and a 32-bit application, the directory entry is still only 32 bits long, but we fetch it with a 64-bit pointer, which makes get_user() do a 64-bit fetch. Reading 4 extra bytes isn't harmful, unless we are at the end of the table and run off it. It might also cause the zero page to get faulted in unnecessarily even if you are not at the end. Fix it up by doing a special 32-bit get_user() via a cast when we have 32-bit userspace. Signed-off-by: Dave Hansen Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20151111181931.3ACF6822@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/mpx.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index b0ae85f90f10..6127c5e3a19b 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -585,6 +585,29 @@ static unsigned long mpx_bd_entry_to_bt_addr(struct mm_struct *mm, return bt_addr; } +/* + * We only want to do a 4-byte get_user() on 32-bit. Otherwise, + * we might run off the end of the bounds table if we are on + * a 64-bit kernel and try to get 8 bytes. + */ +int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret, + long __user *bd_entry_ptr) +{ + u32 bd_entry_32; + int ret; + + if (is_64bit_mm(mm)) + return get_user(*bd_entry_ret, bd_entry_ptr); + + /* + * Note that get_user() uses the type of the *pointer* to + * establish the size of the get, not the destination. + */ + ret = get_user(bd_entry_32, (u32 __user *)bd_entry_ptr); + *bd_entry_ret = bd_entry_32; + return ret; +} + /* * Get the base of bounds tables pointed by specific bounds * directory entry. @@ -605,7 +628,7 @@ static int get_bt_addr(struct mm_struct *mm, int need_write = 0; pagefault_disable(); - ret = get_user(bd_entry, bd_entry_ptr); + ret = get_user_bd_entry(mm, &bd_entry, bd_entry_ptr); pagefault_enable(); if (!ret) break; -- cgit v1.2.3 From f3119b830264d89d216bfb378ab65065dffa02d9 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 11 Nov 2015 10:19:34 -0800 Subject: x86/mpx: Fix 32-bit address space calculation I received a bug report that running 32-bit MPX binaries on 64-bit kernels was broken. I traced it down to this little code snippet. We were switching our "number of bounds directory entries" calculation correctly. But, we didn't switch the other side of the calculation: the virtual space size. This meant that we were calculating an absurd size for bd_entry_virt_space() on 32-bit because we used the 64-bit virt_space. This was _also_ broken for 32-bit kernels running on 64-bit hardware since boot_cpu_data.x86_virt_bits=48 even when running in 32-bit mode. Correct that and properly handle all 3 possible cases: 1. 32-bit binary on 64-bit kernel 2. 64-bit binary on 64-bit kernel 3. 32-bit binary on 32-bit kernel This manifested in having bounds tables not properly unmapped. It "leaked" memory but had no functional impact otherwise. Signed-off-by: Dave Hansen Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20151111181934.FA7FAC34@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/mpx.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 6127c5e3a19b..1202d5ca2fb5 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -723,11 +723,23 @@ static unsigned long mpx_get_bt_entry_offset_bytes(struct mm_struct *mm, */ static inline unsigned long bd_entry_virt_space(struct mm_struct *mm) { - unsigned long long virt_space = (1ULL << boot_cpu_data.x86_virt_bits); - if (is_64bit_mm(mm)) - return virt_space / MPX_BD_NR_ENTRIES_64; - else - return virt_space / MPX_BD_NR_ENTRIES_32; + unsigned long long virt_space; + unsigned long long GB = (1ULL << 30); + + /* + * This covers 32-bit emulation as well as 32-bit kernels + * running on 64-bit hardware. + */ + if (!is_64bit_mm(mm)) + return (4ULL * GB) / MPX_BD_NR_ENTRIES_32; + + /* + * 'x86_virt_bits' returns what the hardware is capable + * of, and returns the full >32-bit address space when + * running 32-bit kernels on 64-bit hardware. + */ + virt_space = (1ULL << boot_cpu_data.x86_virt_bits); + return virt_space / MPX_BD_NR_ENTRIES_64; } /* -- cgit v1.2.3 From ab6b52947545a5355154f64f449f97af9d05845f Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 10 Nov 2015 16:23:54 -0800 Subject: x86/fpu: Fix 32-bit signal frame handling (This should have gone to LKML originally. Sorry for the extra noise, folks on the cc.) Background: Signal frames on x86 have two formats: 1. For 32-bit executables (whether on a real 32-bit kernel or under 32-bit emulation on a 64-bit kernel) we have a 'fpregset_t' that includes the "FSAVE" registers. 2. For 64-bit executables (on 64-bit kernels obviously), the 'fpregset_t' is smaller and does not contain the "FSAVE" state. When creating the signal frame, we have to be aware of whether we are running a 32- or 64-bit executable so that we create the correct signal frame format. Problem: save_xstate_epilog() uses 'fx_sw_reserved_ia32' whenever it is called for a 32-bit executable. This is for real 32-bit and ia32 emulation. But, fpu__init_prepare_fx_sw_frame() only initializes 'fx_sw_reserved_ia32' when emulation is enabled, *NOT* for real 32-bit kernels. This leads to really weird situations where 32-bit programs lose their extended state when returning from a signal handler. The kernel copies the uninitialized (zero) 'fx_sw_reserved_ia32' out to userspace in save_xstate_epilog(). But when returning from the signal, the kernel errors out in check_for_xstate() when it does not see FP_XSTATE_MAGIC1 present (because it was zeroed). This leads to the FPU/XSAVE state being initialized. For MPX, this leads to the most permissive state and means we silently lose bounds violations. I think this would also mean that we could lose *ANY* FPU/SSE/AVX state. I'm not sure why no one has spotted this bug. I believe this was broken by: 72a671ced66d ("x86, fpu: Unify signal handling code paths for x86 and x86_64 kernels") way back in 2012. Signed-off-by: Dave Hansen Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@sr71.net Cc: fenghua.yu@intel.com Cc: yu-cheng.yu@intel.com Link: http://lkml.kernel.org/r/20151111002354.A0799571@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/signal.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index ef29b742cea7..31c6a60505e6 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -385,20 +385,19 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, */ void fpu__init_prepare_fx_sw_frame(void) { - int fsave_header_size = sizeof(struct fregs_state); int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; - if (config_enabled(CONFIG_X86_32)) - size += fsave_header_size; - fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = size; fx_sw_reserved.xfeatures = xfeatures_mask; fx_sw_reserved.xstate_size = xstate_size; - if (config_enabled(CONFIG_IA32_EMULATION)) { + if (config_enabled(CONFIG_IA32_EMULATION) || + config_enabled(CONFIG_X86_32)) { + int fsave_header_size = sizeof(struct fregs_state); + fx_sw_reserved_ia32 = fx_sw_reserved; - fx_sw_reserved_ia32.extended_size += fsave_header_size; + fx_sw_reserved_ia32.extended_size = size + fsave_header_size; } } -- cgit v1.2.3 From a05917b6ba9dc9a95fc42bdcbe3a875e8ad83935 Mon Sep 17 00:00:00 2001 From: Huaitong Han Date: Fri, 6 Nov 2015 17:00:23 +0800 Subject: x86/fpu: Fix get_xsave_addr() behavior under virtualization KVM uses the get_xsave_addr() function in a different fashion from the native kernel, in that the 'xsave' parameter belongs to the guest vcpu, not the currently running task. But 'xsave' is replaced with the current task's (host) xsave structure, so get_xsave_addr() will incorrectly return the wrong xsave address to KVM. Fix it so that the passed-in 'xsave' address is used - as intended originally. Signed-off-by: Huaitong Han Reviewed-by: Dave Hansen Cc: Cc: Andy Lutomirski Cc: Paolo Bonzini Cc: Borislav Petkov Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: dave.hansen@intel.com Link: http://lkml.kernel.org/r/1446800423-21622-1-git-send-email-huaitong.han@intel.com [ Tidied up the changelog. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 6454f2731b56..70fc312221fc 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -694,7 +694,6 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) if (!boot_cpu_has(X86_FEATURE_XSAVE)) return NULL; - xsave = &current->thread.fpu.state.xsave; /* * We should not ever be requesting features that we * have not enabled.
Remember that pcntxt_mask is -- cgit v1.2.3 From 41ac18ebfc429ce3f4d369ef07447d652999a0cd Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Wed, 4 Nov 2015 17:43:53 +0800 Subject: perf/x86/intel/rapl: Remove the unused RAPL_EVENT_DESC() macro Signed-off-by: Huang Rui Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Dasaratharaman Chandramouli Cc: Fengguang Wu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Tony Li Link: http://lkml.kernel.org/r/1446630233-3166-1-git-send-email-ray.huang@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_rapl.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index 81431c0f0614..ed446bdcbf31 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -107,12 +107,6 @@ static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ static struct kobj_attribute format_attr_##_var = \ __ATTR(_name, 0444, __rapl_##_var##_show, NULL) -#define RAPL_EVENT_DESC(_name, _config) \ -{ \ - .attr = __ATTR(_name, 0444, rapl_event_show, NULL), \ - .config = _config, \ -} - #define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ #define RAPL_EVENT_ATTR_STR(_name, v, str) \ -- cgit v1.2.3 From 5369a21e3f26ef9d2bf6ea1b322d6899a4ed08e0 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 12 Nov 2015 02:42:32 -0500 Subject: x86: remove unused definition of MSR_NHM_PLATFORM_INFO MSR_NHM_PLATFORM_INFO has been replaced by... MSR_PLATFORM_INFO Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/msr-index.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b8c14bb7fc8f..705c4082b7f1 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -35,7 +35,7 @@ #define MSR_IA32_PERFCTR0 0x000000c1 #define MSR_IA32_PERFCTR1 0x000000c2 #define MSR_FSB_FREQ 0x000000cd -#define MSR_NHM_PLATFORM_INFO 0x000000ce +#define MSR_PLATFORM_INFO 0x000000ce #define MSR_NHM_SNB_PKG_CST_CFG_CTL 0x000000e2 #define NHM_C3_AUTO_DEMOTE (1UL << 25) @@ -44,7 +44,6 @@ #define SNB_C1_AUTO_UNDEMOTE (1UL << 27) #define SNB_C3_AUTO_UNDEMOTE (1UL << 28) -#define MSR_PLATFORM_INFO 0x000000ce #define MSR_MTRRcap 0x000000fe #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e -- cgit v1.2.3 From 112677d683d31ebd6a8e8b02e0620ae512354b2d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 17 Nov 2015 09:43:24 +0900 Subject: x86/ftrace: Add comment on static function tracing There was confusion between update_ftrace_function() and the static function tracing trampoline regarding the 3rd parameter (ftrace_ops). Add a comment for clarification. Suggested-by: Steven Rostedt Signed-off-by: Namhyung Kim Cc: H.
Peter Anvin Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1447721004-2551-1-git-send-email-namhyung@kernel.org Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mcount_64.S | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S index 94ea120fa21f..87e1762e2bca 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/mcount_64.S @@ -278,6 +278,12 @@ trace: /* save_mcount_regs fills in first two parameters */ save_mcount_regs + /* + * When DYNAMIC_FTRACE is not defined, ARCH_SUPPORTS_FTRACE_OPS is not + * set (see include/asm/ftrace.h and include/linux/ftrace.h). Only the + * ip and parent ip are used and the list function is called when + * function tracing is enabled. + */ call *ftrace_trace_function restore_mcount_regs -- cgit v1.2.3 From 581b7f158fe0383b492acd1ce3fb4e99d4e57808 Mon Sep 17 00:00:00 2001 From: Andrew Cooper Date: Wed, 3 Jun 2015 10:31:14 +0100 Subject: x86/cpu: Fix SMAP check in PVOPS environments There appears to be no formal statement of what pv_irq_ops.save_fl() is supposed to return precisely. Native returns the full flags, while lguest and Xen only return the Interrupt Flag, and both have comments by the implementations stating that only the Interrupt Flag is looked at. This may have been true when initially implemented, but no longer is. To make matters worse, the Xen PVOP leaves the upper bits undefined, making the BUG_ON() undefined behaviour. Experimentally, this now trips for 32bit PV guests on Broadwell hardware. The BUG_ON() is consistent for an individual build, but not consistent for all builds. It has also been a sitting timebomb since SMAP support was introduced. Use native_save_fl() instead, which will obtain an accurate view of the AC flag. Signed-off-by: Andrew Cooper Reviewed-by: David Vrabel Tested-by: Rusty Russell Cc: Rusty Russell Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Cc: Cc: Xen-devel CC: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1433323874-6927-1-git-send-email-andrew.cooper3@citrix.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/common.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4ddd780aeac9..c2b7522cbf35 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -273,10 +273,9 @@ __setup("nosmap", setup_disable_smap); static __always_inline void setup_smap(struct cpuinfo_x86 *c) { - unsigned long eflags; + unsigned long eflags = native_save_fl(); /* This should have been cleared long ago */ - raw_local_save_flags(eflags); BUG_ON(eflags & X86_EFLAGS_AC); if (cpu_has(c, X86_FEATURE_SMAP)) { -- cgit v1.2.3
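To make the save_fl() pitfall above concrete, here is a hypothetical user-space illustration (not kernel code) of the pushf/pop technique that native_save_fl() uses: it reads the entire flags register, so the AC bit is always well defined, unlike a paravirt save_fl() hook that may only guarantee the Interrupt Flag:

#include <stdio.h>

#define X86_EFLAGS_AC (1UL << 18)

/* Read the full EFLAGS/RFLAGS word directly, as native_save_fl() does. */
static inline unsigned long read_flags(void)
{
	unsigned long flags;

	asm volatile("pushf\n\tpop %0" : "=rm" (flags) : : "memory");
	return flags;
}

int main(void)
{
	printf("EFLAGS.AC is %s\n",
	       (read_flags() & X86_EFLAGS_AC) ? "set" : "clear");
	return 0;
}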