diff options
Diffstat (limited to 'arch/x86')
97 files changed, 1112 insertions, 2307 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8d4f87e5bba3..cd18994a9555 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -160,6 +160,7 @@ config X86 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER && STACK_VALIDATION select HAVE_STACK_VALIDATION if X86_64 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 4430dd489620..5851411e60fb 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -179,7 +179,8 @@ ifdef CONFIG_JUMP_LABEL endif ifeq ($(ACCUMULATE_OUTGOING_ARGS), 1) - KBUILD_CFLAGS += -maccumulate-outgoing-args + # This compiler flag is not supported by Clang: + KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args,) endif # Stackpointer is addressed different for 32 bit and 64 bit x86 diff --git a/arch/x86/boot/compressed/error.h b/arch/x86/boot/compressed/error.h index 2e59dac07f9e..d732e608e3af 100644 --- a/arch/x86/boot/compressed/error.h +++ b/arch/x86/boot/compressed/error.h @@ -1,7 +1,9 @@ #ifndef BOOT_COMPRESSED_ERROR_H #define BOOT_COMPRESSED_ERROR_H +#include <linux/compiler.h> + void warn(char *m); -void error(char *m); +void error(char *m) __noreturn; #endif /* BOOT_COMPRESSED_ERROR_H */ diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index 56589d0a804b..1d78f1739087 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -70,7 +70,7 @@ static unsigned long level4p; * Due to relocation, pointers must be assigned at run time not build time. */ static struct x86_mapping_info mapping_info = { - .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, + .page_flag = __PAGE_KERNEL_LARGE_EXEC, }; /* Locates and clears a region for a new top level page table. 
*/ diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S index a916c4a61165..5f6a5af9c489 100644 --- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S +++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S @@ -65,7 +65,6 @@ #include <linux/linkage.h> #include <asm/inst.h> -#define CONCAT(a,b) a##b #define VMOVDQ vmovdqu #define xdata0 %xmm0 @@ -92,8 +91,6 @@ #define num_bytes %r8 #define tmp %r10 -#define DDQ(i) CONCAT(ddq_add_,i) -#define XMM(i) CONCAT(%xmm, i) #define DDQ_DATA 0 #define XDATA 1 #define KEY_128 1 @@ -131,12 +128,12 @@ ddq_add_8: /* generate a unique variable for ddq_add_x */ .macro setddq n - var_ddq_add = DDQ(\n) + var_ddq_add = ddq_add_\n .endm /* generate a unique variable for xmm register */ .macro setxdata n - var_xdata = XMM(\n) + var_xdata = %xmm\n .endm /* club the numeric 'id' to the symbol 'name' */ diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index aa76cad9d262..af4840ab2a3d 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -1522,7 +1522,7 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[2 * 4]; + le128 buf[2 * 4]; struct xts_crypt_req req = { .tbuf = buf, .tbuflen = sizeof(buf), @@ -1540,7 +1540,7 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[2 * 4]; + le128 buf[2 * 4]; struct xts_crypt_req req = { .tbuf = buf, .tbuflen = sizeof(buf), diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index 260a060d7275..24ac9fad832d 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -27,6 +27,7 @@ #include <linux/module.h> #include <crypto/b128ops.h> +#include <crypto/gf128mul.h> #include 
<crypto/internal/skcipher.h> #include <crypto/lrw.h> #include <crypto/xts.h> @@ -457,7 +458,7 @@ void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv, le128 ivblk = *iv; /* generate next IV */ - le128_gf128mul_x_ble(iv, &ivblk); + gf128mul_x_ble(iv, &ivblk); /* CC <- T xor C */ u128_xor(dst, src, (u128 *)&ivblk); diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 644f97ab8cac..ac0e831943f5 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -328,7 +328,7 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[SERPENT_PARALLEL_BLOCKS]; + le128 buf[SERPENT_PARALLEL_BLOCKS]; struct crypt_priv crypt_ctx = { .ctx = &ctx->crypt_ctx, .fpu_enabled = false, @@ -355,7 +355,7 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[SERPENT_PARALLEL_BLOCKS]; + le128 buf[SERPENT_PARALLEL_BLOCKS]; struct crypt_priv crypt_ctx = { .ctx = &ctx->crypt_ctx, .fpu_enabled = false, diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 2ebb5e9789f3..243e90a4b5d9 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -296,7 +296,7 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[3]; + le128 buf[3]; struct xts_crypt_req req = { .tbuf = buf, .tbuflen = sizeof(buf), @@ -314,7 +314,7 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { struct twofish_xts_ctx *ctx = 
crypto_blkcipher_ctx(desc->tfm); - be128 buf[3]; + le128 buf[3]; struct xts_crypt_req req = { .tbuf = buf, .tbuflen = sizeof(buf), diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 370c42c7f046..cdefcfdd9e63 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -22,6 +22,7 @@ #include <linux/context_tracking.h> #include <linux/user-return-notifier.h> #include <linux/uprobes.h> +#include <linux/livepatch.h> #include <asm/desc.h> #include <asm/traps.h> @@ -130,14 +131,13 @@ static long syscall_trace_enter(struct pt_regs *regs) #define EXIT_TO_USERMODE_LOOP_FLAGS \ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY) + _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING) static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) { /* * In order to return to user mode, we need to have IRQs off with - * none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY, - * _TIF_UPROBE, or _TIF_NEED_RESCHED set. Several of these flags + * none of EXIT_TO_USERMODE_LOOP_FLAGS set. Several of these flags * can be set at any time on preemptable kernels if we have IRQs on, * so we need to loop. Disabling preemption wouldn't help: doing the * work to clear some of the flags can sleep. 
@@ -164,6 +164,9 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) if (cached_flags & _TIF_USER_RETURN_NOTIFY) fire_user_return_notifiers(); + if (cached_flags & _TIF_PATCH_PENDING) + klp_update_patch_state(current); + /* Disable IRQs and retry */ local_irq_disable(); diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 0af59fa789ea..448ac2161112 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -226,7 +226,7 @@ 217 i386 pivot_root sys_pivot_root 218 i386 mincore sys_mincore 219 i386 madvise sys_madvise -220 i386 getdents64 sys_getdents64 compat_sys_getdents64 +220 i386 getdents64 sys_getdents64 221 i386 fcntl64 sys_fcntl64 compat_sys_fcntl64 # 222 is unused # 223 is unused diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 9d05c7e67f60..a45e2114a846 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -761,7 +761,7 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, hsx_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_rapl_init), diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 2b01421f7d0f..5b882cc0c0e9 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -25,7 +25,7 @@ #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/clockchips.h> - +#include <linux/hyperv.h> #ifdef CONFIG_HYPERV_TSCPAGE diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 7acb51c49fec..7a9df3beb89b 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -32,6 +32,7 @@ #define _ASM_ADD 
__ASM_SIZE(add) #define _ASM_SUB __ASM_SIZE(sub) #define _ASM_XADD __ASM_SIZE(xadd) +#define _ASM_MUL __ASM_SIZE(mul) #define _ASM_AX __ASM_REG(ax) #define _ASM_BX __ASM_REG(bx) diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index e7e1942edff7..8b4140f6724f 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -5,93 +5,8 @@ #include <asm-generic/cacheflush.h> #include <asm/special_insns.h> -/* - * The set_memory_* API can be used to change various attributes of a virtual - * address range. The attributes include: - * Cachability : UnCached, WriteCombining, WriteThrough, WriteBack - * Executability : eXeutable, NoteXecutable - * Read/Write : ReadOnly, ReadWrite - * Presence : NotPresent - * - * Within a category, the attributes are mutually exclusive. - * - * The implementation of this API will take care of various aspects that - * are associated with changing such attributes, such as: - * - Flushing TLBs - * - Flushing CPU caches - * - Making sure aliases of the memory behind the mapping don't violate - * coherency rules as defined by the CPU in the system. - * - * What this API does not do: - * - Provide exclusion between various callers - including callers that - * operation on other mappings of the same physical page - * - Restore default attributes when a page is freed - * - Guarantee that mappings other than the requested one are - * in any state, other than that these do not violate rules for - * the CPU you have. Do not depend on any effects on other mappings, - * CPUs other than the one you have may have more relaxed rules. - * The caller is required to take care of these. 
- */ - -int _set_memory_uc(unsigned long addr, int numpages); -int _set_memory_wc(unsigned long addr, int numpages); -int _set_memory_wt(unsigned long addr, int numpages); -int _set_memory_wb(unsigned long addr, int numpages); -int set_memory_uc(unsigned long addr, int numpages); -int set_memory_wc(unsigned long addr, int numpages); -int set_memory_wt(unsigned long addr, int numpages); -int set_memory_wb(unsigned long addr, int numpages); -int set_memory_x(unsigned long addr, int numpages); -int set_memory_nx(unsigned long addr, int numpages); -int set_memory_ro(unsigned long addr, int numpages); -int set_memory_rw(unsigned long addr, int numpages); -int set_memory_np(unsigned long addr, int numpages); -int set_memory_4k(unsigned long addr, int numpages); - -int set_memory_array_uc(unsigned long *addr, int addrinarray); -int set_memory_array_wc(unsigned long *addr, int addrinarray); -int set_memory_array_wt(unsigned long *addr, int addrinarray); -int set_memory_array_wb(unsigned long *addr, int addrinarray); - -int set_pages_array_uc(struct page **pages, int addrinarray); -int set_pages_array_wc(struct page **pages, int addrinarray); -int set_pages_array_wt(struct page **pages, int addrinarray); -int set_pages_array_wb(struct page **pages, int addrinarray); - -/* - * For legacy compatibility with the old APIs, a few functions - * are provided that work on a "struct page". - * These functions operate ONLY on the 1:1 kernel mapping of the - * memory that the struct page represents, and internally just - * call the set_memory_* function. See the description of the - * set_memory_* function for more details on conventions. - * - * These APIs should be considered *deprecated* and are likely going to - * be removed in the future. - * The reason for this is the implicit operation on the 1:1 mapping only, - * making this not a generally useful API. 
- * - * Specifically, many users of the old APIs had a virtual address, - * called virt_to_page() or vmalloc_to_page() on that address to - * get a struct page* that the old API required. - * To convert these cases, use set_memory_*() on the original - * virtual address, do not use these functions. - */ - -int set_pages_uc(struct page *page, int numpages); -int set_pages_wb(struct page *page, int numpages); -int set_pages_x(struct page *page, int numpages); -int set_pages_nx(struct page *page, int numpages); -int set_pages_ro(struct page *page, int numpages); -int set_pages_rw(struct page *page, int numpages); - - void clflush_cache_range(void *addr, unsigned int size); #define mmio_flush_range(addr, size) clflush_cache_range(addr, size) -extern int kernel_set_to_readonly; -void set_kernel_text_rw(void); -void set_kernel_text_ro(void); - #endif /* _ASM_X86_CACHEFLUSH_H */ diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h index 29e53ea7d764..ed8b66de541f 100644 --- a/arch/x86/include/asm/crypto/glue_helper.h +++ b/arch/x86/include/asm/crypto/glue_helper.h @@ -125,16 +125,6 @@ static inline void le128_inc(le128 *i) i->b = cpu_to_le64(b); } -static inline void le128_gf128mul_x_ble(le128 *dst, const le128 *src) -{ - u64 a = le64_to_cpu(src->a); - u64 b = le64_to_cpu(src->b); - u64 _tt = ((s64)a >> 63) & 0x87; - - dst->a = cpu_to_le64((a << 1) ^ (b >> 63)); - dst->b = cpu_to_le64((b << 1) ^ _tt); -} - extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, struct blkcipher_desc *desc, struct scatterlist *dst, diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 737da62bfeb0..474eb8c66fee 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -4,8 +4,9 @@ struct x86_mapping_info { void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ void *context; /* context for alloc_pgt_page */ - unsigned long pmd_flag; /* page flag for PMD entry */ + unsigned long 
page_flag; /* page flag for PMD or PUD entry */ unsigned long offset; /* ident mapping offset */ + bool direct_gbpages; /* PUD level 1GB page support */ }; int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, diff --git a/arch/x86/include/asm/intel_pmc_ipc.h b/arch/x86/include/asm/intel_pmc_ipc.h index 4291b6a5ddf7..fac89eb78a6b 100644 --- a/arch/x86/include/asm/intel_pmc_ipc.h +++ b/arch/x86/include/asm/intel_pmc_ipc.h @@ -23,6 +23,11 @@ #define IPC_ERR_EMSECURITY 6 #define IPC_ERR_UNSIGNEDKERNEL 7 +/* GCR reg offsets from gcr base*/ +#define PMC_GCR_PMC_CFG_REG 0x08 +#define PMC_GCR_TELEM_DEEP_S0IX_REG 0x78 +#define PMC_GCR_TELEM_SHLW_S0IX_REG 0x80 + #if IS_ENABLED(CONFIG_INTEL_PMC_IPC) int intel_pmc_ipc_simple_command(int cmd, int sub); @@ -31,6 +36,9 @@ int intel_pmc_ipc_raw_cmd(u32 cmd, u32 sub, u8 *in, u32 inlen, int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen, u32 *out, u32 outlen); int intel_pmc_s0ix_counter_read(u64 *data); +int intel_pmc_gcr_read(u32 offset, u32 *data); +int intel_pmc_gcr_write(u32 offset, u32 data); +int intel_pmc_gcr_update(u32 offset, u32 mask, u32 val); #else @@ -56,6 +64,21 @@ static inline int intel_pmc_s0ix_counter_read(u64 *data) return -EINVAL; } +static inline int intel_pmc_gcr_read(u32 offset, u32 *data) +{ + return -EINVAL; +} + +static inline int intel_pmc_gcr_write(u32 offset, u32 data) +{ + return -EINVAL; +} + +static inline int intel_pmc_gcr_update(u32 offset, u32 mask, u32 val) +{ + return -EINVAL; +} + #endif /*CONFIG_INTEL_PMC_IPC*/ #endif diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h index 4fb1d0abef95..81d3d8776fd9 100644 --- a/arch/x86/include/asm/intel_scu_ipc.h +++ b/arch/x86/include/asm/intel_scu_ipc.h @@ -3,6 +3,9 @@ #include <linux/notifier.h> +#define IPCMSG_INDIRECT_READ 0x02 +#define IPCMSG_INDIRECT_WRITE 0x05 + #define IPCMSG_COLD_OFF 0x80 /* Only for Tangier */ #define IPCMSG_WARM_RESET 0xF0 @@ -45,7 +48,10 @@ int 
intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask); /* Issue commands to the SCU with or without data */ int intel_scu_ipc_simple_command(int cmd, int sub); int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen, - u32 *out, int outlen); + u32 *out, int outlen); +int intel_scu_ipc_raw_command(int cmd, int sub, u8 *in, int inlen, + u32 *out, int outlen, u32 dptr, u32 sptr); + /* I2C control api */ int intel_scu_ipc_i2c_cntrl(u32 addr, u32 *data); diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h index b41ee164930a..c313cac36f56 100644 --- a/arch/x86/include/asm/iosf_mbi.h +++ b/arch/x86/include/asm/iosf_mbi.h @@ -5,6 +5,8 @@ #ifndef IOSF_MBI_SYMS_H #define IOSF_MBI_SYMS_H +#include <linux/notifier.h> + #define MBI_MCR_OFFSET 0xD0 #define MBI_MDR_OFFSET 0xD4 #define MBI_MCRX_OFFSET 0xD8 @@ -47,6 +49,10 @@ #define QRK_MBI_UNIT_MM 0x05 #define QRK_MBI_UNIT_SOC 0x31 +/* Action values for the pmic_bus_access_notifier functions */ +#define MBI_PMIC_BUS_ACCESS_BEGIN 1 +#define MBI_PMIC_BUS_ACCESS_END 2 + #if IS_ENABLED(CONFIG_IOSF_MBI) bool iosf_mbi_available(void); @@ -88,6 +94,65 @@ int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr); */ int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask); +/** + * iosf_mbi_punit_acquire() - Acquire access to the P-Unit + * + * On some systems the P-Unit accesses the PMIC to change various voltages + * through the same bus as other kernel drivers use for e.g. battery monitoring. + * + * If a driver sends requests to the P-Unit which require the P-Unit to access + * the PMIC bus while another driver is also accessing the PMIC bus various bad + * things happen. + * + * To avoid these problems this function must be called before accessing the + * P-Unit or the PMIC, be it through iosf_mbi* functions or through other means. 
+ * + * Note on these systems the i2c-bus driver will request a semaphore from the + * P-Unit for exclusive access to the PMIC bus when i2c drivers are accessing + * it, but this does not appear to be sufficient, we still need to avoid making + * certain P-Unit requests during the access window to avoid problems. + * + * This function locks a mutex, as such it may sleep. + */ +void iosf_mbi_punit_acquire(void); + +/** + * iosf_mbi_punit_release() - Release access to the P-Unit + */ +void iosf_mbi_punit_release(void); + +/** + * iosf_mbi_register_pmic_bus_access_notifier - Register PMIC bus notifier + * + * This function can be used by drivers which may need to acquire P-Unit + * managed resources from interrupt context, where iosf_mbi_punit_acquire() + * can not be used. + * + * This function allows a driver to register a notifier to get notified (in a + * process context) before other drivers start accessing the PMIC bus. + * + * This allows the driver to acquire any resources, which it may need during + * the window the other driver is accessing the PMIC, beforehand. 
+ * + * @nb: notifier_block to register + */ +int iosf_mbi_register_pmic_bus_access_notifier(struct notifier_block *nb); + +/** + * iosf_mbi_unregister_pmic_bus_access_notifier - Unregister PMIC bus notifier + * + * @nb: notifier_block to unregister + */ +int iosf_mbi_unregister_pmic_bus_access_notifier(struct notifier_block *nb); + +/** + * iosf_mbi_call_pmic_bus_access_notifier_chain - Call PMIC bus notifier chain + * + * @val: action to pass into listener's notifier_call function + * @v: data pointer to pass into listener's notifier_call function + */ +int iosf_mbi_call_pmic_bus_access_notifier_chain(unsigned long val, void *v); + #else /* CONFIG_IOSF_MBI is not enabled */ static inline bool iosf_mbi_available(void) @@ -115,6 +180,28 @@ int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask) WARN(1, "IOSF_MBI driver not available"); return -EPERM; } + +static inline void iosf_mbi_punit_acquire(void) {} +static inline void iosf_mbi_punit_release(void) {} + +static inline +int iosf_mbi_register_pmic_bus_access_notifier(struct notifier_block *nb) +{ + return 0; +} + +static inline +int iosf_mbi_unregister_pmic_bus_access_notifier(struct notifier_block *nb) +{ + return 0; +} + +static inline +int iosf_mbi_call_pmic_bus_access_notifier_chain(unsigned long val, void *v) +{ + return 0; +} + #endif /* CONFIG_IOSF_MBI */ #endif /* IOSF_MBI_SYMS_H */ diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 3e8c287090e4..055962615779 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -221,6 +221,9 @@ struct x86_emulate_ops { void (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); + + unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); + void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags); }; typedef u32 __attribute__((vector_size(16))) sse128_t; @@ -290,7 +293,6 @@ 
struct x86_emulate_ctxt { /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; - int emul_flags; bool perm_ok; /* do not check permissions if true */ bool ud; /* inject an #UD if host doesn't support insn */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 74ef58c8ff53..9c761fea0c98 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -43,8 +43,6 @@ #define KVM_PRIVATE_MEM_SLOTS 3 #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) -#define KVM_PIO_PAGE_OFFSET 1 -#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 #define KVM_HALT_POLL_NS_DEFAULT 400000 #define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS @@ -63,10 +61,10 @@ #define KVM_REQ_PMI 19 #define KVM_REQ_SMI 20 #define KVM_REQ_MASTERCLOCK_UPDATE 21 -#define KVM_REQ_MCLOCK_INPROGRESS 22 -#define KVM_REQ_SCAN_IOAPIC 23 +#define KVM_REQ_MCLOCK_INPROGRESS (22 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_SCAN_IOAPIC (23 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_GLOBAL_CLOCK_UPDATE 24 -#define KVM_REQ_APIC_PAGE_RELOAD 25 +#define KVM_REQ_APIC_PAGE_RELOAD (25 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HV_CRASH 26 #define KVM_REQ_IOAPIC_EOI_EXIT 27 #define KVM_REQ_HV_RESET 28 @@ -343,9 +341,10 @@ struct kvm_mmu { void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte); hpa_t root_hpa; - int root_level; - int shadow_root_level; union kvm_mmu_page_role base_role; + u8 root_level; + u8 shadow_root_level; + u8 ept_ad; bool direct_map; /* @@ -612,6 +611,8 @@ struct kvm_vcpu_arch { unsigned long dr7; unsigned long eff_db[KVM_NR_DB_REGS]; unsigned long guest_debug_dr7; + u64 msr_platform_info; + u64 msr_misc_features_enables; u64 mcg_cap; u64 mcg_status; @@ -1019,6 +1020,8 @@ struct kvm_x86_ops { void (*enable_log_dirty_pt_masked)(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t offset, unsigned long mask); + int 
(*write_log_dirty)(struct kvm_vcpu *vcpu); + /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 1411dbed5e5e..f513cc231151 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -7,6 +7,7 @@ #include <linux/string.h> #include <linux/scatterlist.h> #include <asm/io.h> +#include <asm/pat.h> #include <asm/x86_init.h> #ifdef __KERNEL__ @@ -102,10 +103,8 @@ int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); #define HAVE_PCI_MMAP -extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, - int write_combine); - +#define arch_can_pci_mmap_wc() pat_enabled() +#define ARCH_GENERIC_PCI_MMAP_RESOURCE #ifdef CONFIG_PCI extern void early_quirks(void); diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index 529bb4a6487a..0ff8fe71b255 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -44,11 +44,6 @@ static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n) BUG(); } -static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n) -{ - return memcpy_mcsafe(dst, src, n); -} - /** * arch_wb_cache_pmem - write back a cache range with CLWB * @vaddr: virtual start address @@ -103,7 +98,7 @@ static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes, if (bytes < 8) { if (!IS_ALIGNED(dest, 4) || (bytes != 4)) - arch_wb_cache_pmem(addr, 1); + arch_wb_cache_pmem(addr, bytes); } else { if (!IS_ALIGNED(dest, 8)) { dest = ALIGN(dest, boot_cpu_data.x86_clflush_size); diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h new file mode 100644 index 000000000000..eaec6c364e42 --- /dev/null +++ b/arch/x86/include/asm/set_memory.h @@ -0,0 +1,87 @@ +#ifndef _ASM_X86_SET_MEMORY_H +#define _ASM_X86_SET_MEMORY_H + +#include <asm/page.h> +#include <asm-generic/set_memory.h> + +/* + * The set_memory_* 
API can be used to change various attributes of a virtual + * address range. The attributes include: + * Cachability : UnCached, WriteCombining, WriteThrough, WriteBack + * Executability : eXecutable, NoteXecutable + * Read/Write : ReadOnly, ReadWrite + * Presence : NotPresent + * + * Within a category, the attributes are mutually exclusive. + * + * The implementation of this API will take care of various aspects that + * are associated with changing such attributes, such as: + * - Flushing TLBs + * - Flushing CPU caches + * - Making sure aliases of the memory behind the mapping don't violate + * coherency rules as defined by the CPU in the system. + * + * What this API does not do: + * - Provide exclusion between various callers - including callers that + * operate on other mappings of the same physical page + * - Restore default attributes when a page is freed + * - Guarantee that mappings other than the requested one are + * in any state, other than that these do not violate rules for + * the CPU you have. Do not depend on any effects on other mappings, + * CPUs other than the one you have may have more relaxed rules. + * The caller is required to take care of these. 
+ */ + +int _set_memory_uc(unsigned long addr, int numpages); +int _set_memory_wc(unsigned long addr, int numpages); +int _set_memory_wt(unsigned long addr, int numpages); +int _set_memory_wb(unsigned long addr, int numpages); +int set_memory_uc(unsigned long addr, int numpages); +int set_memory_wc(unsigned long addr, int numpages); +int set_memory_wt(unsigned long addr, int numpages); +int set_memory_wb(unsigned long addr, int numpages); +int set_memory_np(unsigned long addr, int numpages); +int set_memory_4k(unsigned long addr, int numpages); + +int set_memory_array_uc(unsigned long *addr, int addrinarray); +int set_memory_array_wc(unsigned long *addr, int addrinarray); +int set_memory_array_wt(unsigned long *addr, int addrinarray); +int set_memory_array_wb(unsigned long *addr, int addrinarray); + +int set_pages_array_uc(struct page **pages, int addrinarray); +int set_pages_array_wc(struct page **pages, int addrinarray); +int set_pages_array_wt(struct page **pages, int addrinarray); +int set_pages_array_wb(struct page **pages, int addrinarray); + +/* + * For legacy compatibility with the old APIs, a few functions + * are provided that work on a "struct page". + * These functions operate ONLY on the 1:1 kernel mapping of the + * memory that the struct page represents, and internally just + * call the set_memory_* function. See the description of the + * set_memory_* function for more details on conventions. + * + * These APIs should be considered *deprecated* and are likely going to + * be removed in the future. + * The reason for this is the implicit operation on the 1:1 mapping only, + * making this not a generally useful API. + * + * Specifically, many users of the old APIs had a virtual address, + * called virt_to_page() or vmalloc_to_page() on that address to + * get a struct page* that the old API required. + * To convert these cases, use set_memory_*() on the original + * virtual address, do not use these functions. 
+ */ + +int set_pages_uc(struct page *page, int numpages); +int set_pages_wb(struct page *page, int numpages); +int set_pages_x(struct page *page, int numpages); +int set_pages_nx(struct page *page, int numpages); +int set_pages_ro(struct page *page, int numpages); +int set_pages_rw(struct page *page, int numpages); + +extern int kernel_set_to_readonly; +void set_kernel_text_rw(void); +void set_kernel_text_ro(void); + +#endif /* _ASM_X86_SET_MEMORY_H */ diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index a164862d77e3..733bae07fb29 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -79,6 +79,7 @@ int strcmp(const char *cs, const char *ct); #define memset(s, c, n) __memset(s, c, n) #endif +#define __HAVE_ARCH_MEMCPY_MCSAFE 1 __must_check int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t cnt); DECLARE_STATIC_KEY_FALSE(mcsafe_key); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 9fc44b95f7cb..e00e1bd6e7b3 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -73,9 +73,6 @@ struct thread_info { * thread information flags * - these are process state flags that various assembly files * may need to access - * - pending work-to-be-done flags are in LSW - * - other flags in MSW - * Warning: layout of LSW is hardcoded in entry.S */ #define TIF_SYSCALL_TRACE 0 /* syscall trace active */ #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ @@ -87,6 +84,7 @@ struct thread_info { #define TIF_SECCOMP 8 /* secure computing */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */ +#define TIF_PATCH_PENDING 13 /* pending live patching update */ #define TIF_NOCPUID 15 /* CPUID is not accessible in userland */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* IA32 compatibility process */ @@ 
-104,13 +102,14 @@ struct thread_info { #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) -#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_UPROBE (1 << TIF_UPROBE) +#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) #define _TIF_NOCPUID (1 << TIF_NOCPUID) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) @@ -135,8 +134,10 @@ struct thread_info { /* work to do on any return to user space */ #define _TIF_ALLWORK_MASK \ - ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT | \ - _TIF_NOHZ) + (_TIF_SYSCALL_TRACE | _TIF_NOTIFY_RESUME | _TIF_SIGPENDING | \ + _TIF_NEED_RESCHED | _TIF_SINGLESTEP | _TIF_SYSCALL_EMU | \ + _TIF_SYSCALL_AUDIT | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE | \ + _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ @@ -170,9 +171,9 @@ static inline unsigned long current_stack_pointer(void) * entirely contained by a single stack frame. * * Returns: - * 1 if within a frame - * -1 if placed across a frame boundary (or outside stack) - * 0 unable to determine (no frame pointers, etc) + * GOOD_FRAME if within a frame + * BAD_STACK if placed across a frame boundary (or outside stack) + * NOT_STACK unable to determine (no frame pointers, etc) */ static inline int arch_within_stack_frames(const void * const stack, const void * const stackend, @@ -199,13 +200,14 @@ static inline int arch_within_stack_frames(const void * const stack, * the copy as invalid. */ if (obj + len <= frame) - return obj >= oldframe + 2 * sizeof(void *) ? 
1 : -1; + return obj >= oldframe + 2 * sizeof(void *) ? + GOOD_FRAME : BAD_STACK; oldframe = frame; frame = *(const void * const *)frame; } - return -1; + return BAD_STACK; #else - return 0; + return NOT_STACK; #endif } diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 32712a925f26..1ba1536f627e 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -23,7 +23,6 @@ # include <asm/unistd_64.h> # include <asm/unistd_64_x32.h> # define __ARCH_WANT_COMPAT_SYS_TIME -# define __ARCH_WANT_COMPAT_SYS_GETDENTS64 # define __ARCH_WANT_COMPAT_SYS_PREADV64 # define __ARCH_WANT_COMPAT_SYS_PWRITEV64 # define __ARCH_WANT_COMPAT_SYS_PREADV64V2 diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index 9b10dcd51716..e6676495b125 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -11,6 +11,7 @@ struct unwind_state { unsigned long stack_mask; struct task_struct *task; int graph_idx; + bool error; #ifdef CONFIG_FRAME_POINTER bool got_irq; unsigned long *bp, *orig_sp; @@ -42,6 +43,11 @@ void unwind_start(struct unwind_state *state, struct task_struct *task, __unwind_start(state, task, regs, first_frame); } +static inline bool unwind_error(struct unwind_state *state) +{ + return state->error; +} + #ifdef CONFIG_FRAME_POINTER static inline diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index cc54b7026567..35cd06f636ab 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -70,8 +70,10 @@ #define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100 #define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 +#define SECONDARY_EXEC_RDRAND 0x00000800 #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 +#define SECONDARY_EXEC_RDSEED 0x00010000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 #define 
SECONDARY_EXEC_TSC_SCALING 0x02000000 @@ -516,12 +518,14 @@ struct vmx_msr_entry { #define EPT_VIOLATION_READABLE_BIT 3 #define EPT_VIOLATION_WRITABLE_BIT 4 #define EPT_VIOLATION_EXECUTABLE_BIT 5 +#define EPT_VIOLATION_GVA_TRANSLATED_BIT 8 #define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT) #define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT) #define EPT_VIOLATION_ACC_INSTR (1 << EPT_VIOLATION_ACC_INSTR_BIT) #define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT) #define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT) #define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT) +#define EPT_VIOLATION_GVA_TRANSLATED (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT) /* * VM-instruction error numbers diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild index 3dec769cadf7..83b6e9a0dce4 100644 --- a/arch/x86/include/uapi/asm/Kbuild +++ b/arch/x86/include/uapi/asm/Kbuild @@ -4,62 +4,3 @@ include include/uapi/asm-generic/Kbuild.asm genhdr-y += unistd_32.h genhdr-y += unistd_64.h genhdr-y += unistd_x32.h -header-y += a.out.h -header-y += auxvec.h -header-y += bitsperlong.h -header-y += boot.h -header-y += bootparam.h -header-y += byteorder.h -header-y += debugreg.h -header-y += e820.h -header-y += errno.h -header-y += fcntl.h -header-y += hw_breakpoint.h -header-y += hyperv.h -header-y += ioctl.h -header-y += ioctls.h -header-y += ipcbuf.h -header-y += ist.h -header-y += kvm.h -header-y += kvm_para.h -header-y += kvm_perf.h -header-y += ldt.h -header-y += mce.h -header-y += mman.h -header-y += msgbuf.h -header-y += msr-index.h -header-y += msr.h -header-y += mtrr.h -header-y += param.h -header-y += perf_regs.h -header-y += poll.h -header-y += posix_types.h -header-y += posix_types_32.h -header-y += posix_types_64.h -header-y += posix_types_x32.h -header-y += prctl.h -header-y += processor-flags.h -header-y += ptrace-abi.h -header-y += ptrace.h -header-y += resource.h -header-y += sembuf.h -header-y += 
setup.h -header-y += shmbuf.h -header-y += sigcontext.h -header-y += sigcontext32.h -header-y += siginfo.h -header-y += signal.h -header-y += socket.h -header-y += sockios.h -header-y += stat.h -header-y += statfs.h -header-y += svm.h -header-y += swab.h -header-y += termbits.h -header-y += termios.h -header-y += types.h -header-y += ucontext.h -header-y += unistd.h -header-y += vm86.h -header-y += vmx.h -header-y += vsyscall.h diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 3a20ccf787b8..432df4b1baec 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -124,7 +124,7 @@ * Recommend using hypercall for address space switches rather * than MOV to CR3 instruction */ -#define HV_X64_MWAIT_RECOMMENDED (1 << 0) +#define HV_X64_AS_SWITCH_RECOMMENDED (1 << 0) /* Recommend using hypercall for local TLB flushes rather * than INVLPG or MOV to CR3 instructions */ #define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1) @@ -148,6 +148,11 @@ #define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5) /* + * Virtual APIC support + */ +#define HV_X64_DEPRECATING_AEOI_RECOMMENDED (1 << 9) + +/* * Crash notification flag. 
*/ #define HV_CRASH_CTL_CRASH_NOTIFY (1ULL << 63) diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 739c0c594022..c2824d02ba37 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -9,6 +9,9 @@ #include <linux/types.h> #include <linux/ioctl.h> +#define KVM_PIO_PAGE_OFFSET 1 +#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 + #define DE_VECTOR 0 #define DB_VECTOR 1 #define BP_VECTOR 3 diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 14458658e988..690a2dcf4078 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -76,7 +76,11 @@ #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 #define EXIT_REASON_APIC_WRITE 56 +#define EXIT_REASON_RDRAND 57 #define EXIT_REASON_INVPCID 58 +#define EXIT_REASON_VMFUNC 59 +#define EXIT_REASON_ENCLS 60 +#define EXIT_REASON_RDSEED 61 #define EXIT_REASON_PML_FULL 62 #define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XRSTORS 64 @@ -90,6 +94,7 @@ { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ { EXIT_REASON_CPUID, "CPUID" }, \ { EXIT_REASON_HLT, "HLT" }, \ + { EXIT_REASON_INVD, "INVD" }, \ { EXIT_REASON_INVLPG, "INVLPG" }, \ { EXIT_REASON_RDPMC, "RDPMC" }, \ { EXIT_REASON_RDTSC, "RDTSC" }, \ @@ -108,6 +113,8 @@ { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \ { EXIT_REASON_MSR_READ, "MSR_READ" }, \ { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \ + { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ + { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \ { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \ { EXIT_REASON_MONITOR_TRAP_FLAG, "MONITOR_TRAP_FLAG" }, \ { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \ @@ -115,20 +122,24 @@ { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ - { EXIT_REASON_GDTR_IDTR, "GDTR_IDTR" }, \ - { EXIT_REASON_LDTR_TR, "LDTR_TR" }, \ + { 
EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ + { EXIT_REASON_GDTR_IDTR, "GDTR_IDTR" }, \ + { EXIT_REASON_LDTR_TR, "LDTR_TR" }, \ { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ { EXIT_REASON_INVEPT, "INVEPT" }, \ + { EXIT_REASON_RDTSCP, "RDTSCP" }, \ { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }, \ + { EXIT_REASON_INVVPID, "INVVPID" }, \ { EXIT_REASON_WBINVD, "WBINVD" }, \ + { EXIT_REASON_XSETBV, "XSETBV" }, \ { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ - { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ - { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ - { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \ - { EXIT_REASON_INVD, "INVD" }, \ - { EXIT_REASON_INVVPID, "INVVPID" }, \ + { EXIT_REASON_RDRAND, "RDRAND" }, \ { EXIT_REASON_INVPCID, "INVPCID" }, \ + { EXIT_REASON_VMFUNC, "VMFUNC" }, \ + { EXIT_REASON_ENCLS, "ENCLS" }, \ + { EXIT_REASON_RDSEED, "RDSEED" }, \ + { EXIT_REASON_PML_FULL, "PML_FULL" }, \ { EXIT_REASON_XSAVES, "XSAVES" }, \ { EXIT_REASON_XRSTORS, "XRSTORS" } diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index df083efe6ee0..815dd63f49d0 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -36,7 +36,7 @@ #include <asm/proto.h> #include <asm/iommu.h> #include <asm/gart.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/swiotlb.h> #include <asm/dma.h> #include <asm/amd_nb.h> diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index b6da6e75e3a8..bb5abe8f5fd4 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -16,7 +16,7 @@ #ifdef CONFIG_X86_64 # include <asm/mmconfig.h> -# include <asm/cacheflush.h> +# include <asm/set_memory.h> #endif #include "cpu.h" diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index a44ef52184df..0af86d9242da 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -17,7 +17,7 @@ #include <asm/paravirt.h> #include 
<asm/alternative.h> #include <asm/pgtable.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> void __init check_bugs(void) { diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 7889ae492af0..45db4d2ebd01 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -10,7 +10,7 @@ * Author: Peter Oruba <peter.oruba@amd.com> * * Based on work by: - * Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * Tigran Aivazian <aivazian.tigran@gmail.com> * * early loader: * Copyright (C) 2013 Advanced Micro Devices, Inc. @@ -352,8 +352,6 @@ void reload_ucode_amd(void) u32 rev, dummy; mc = (struct microcode_amd *)amd_ucode_patch; - if (!mc) - return; rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index b4a4cd39b358..e53d3c909840 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -1,7 +1,7 @@ /* * CPU Microcode Update Driver for Linux * - * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * Copyright (C) 2000-2006 Tigran Aivazian <aivazian.tigran@gmail.com> * 2006 Shaohua Li <shaohua.li@intel.com> * 2013-2016 Borislav Petkov <bp@alien8.de> * diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 8325d8a09ab0..afdfd237b59f 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -1,7 +1,7 @@ /* * Intel CPU Microcode Update Driver for Linux * - * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * Copyright (C) 2000-2006 Tigran Aivazian <aivazian.tigran@gmail.com> * 2006 Shaohua Li <shaohua.li@intel.com> * * Intel CPU microcode early update for Linux diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index b5375b9497b3..04cb8d34ccb8 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -49,6 
+49,9 @@ void hyperv_vector_handler(struct pt_regs *regs) if (vmbus_handler) vmbus_handler(); + if (ms_hyperv.hints & HV_X64_DEPRECATING_AEOI_RECOMMENDED) + ack_APIC_irq(); + exiting_irq(); set_irq_regs(old_regs); } diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index ff7e4b3988ed..d907c3d8633f 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -526,6 +526,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_SKL_IDS(&gen9_early_ops), INTEL_BXT_IDS(&gen9_early_ops), INTEL_KBL_IDS(&gen9_early_ops), + INTEL_GLK_IDS(&gen9_early_ops), }; static void __init diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 5b7153540727..0651e974dcb3 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -24,7 +24,7 @@ #include <trace/syscall.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/kprobes.h> #include <asm/ftrace.h> #include <asm/nops.h> @@ -533,7 +533,13 @@ static void do_sync_core(void *data) static void run_sync(void) { - int enable_irqs = irqs_disabled(); + int enable_irqs; + + /* No need to sync if there's only one CPU */ + if (num_online_cpus() == 1) + return; + + enable_irqs = irqs_disabled(); /* We may be called with interrupts disabled (on bootup). 
*/ if (enable_irqs) diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index be22f5a2192e..4e3b8a587c88 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -418,6 +418,7 @@ struct legacy_pic default_legacy_pic = { }; struct legacy_pic *legacy_pic = &default_legacy_pic; +EXPORT_SYMBOL(legacy_pic); static int __init i8259A_init_ops(void) { diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 19e1f2a6d7b0..5b2bbfbb3712 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -61,6 +61,7 @@ #include <asm/alternative.h> #include <asm/insn.h> #include <asm/debugreg.h> +#include <asm/set_memory.h> #include "common.h" diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 9aadff3d0902..901c640d152f 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -37,6 +37,7 @@ #include <asm/alternative.h> #include <asm/insn.h> #include <asm/debugreg.h> +#include <asm/set_memory.h> #include "common.h" diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 14f65a5f938e..da5c09789984 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -396,9 +396,9 @@ static u64 kvm_steal_clock(int cpu) src = &per_cpu(steal_time, cpu); do { version = src->version; - rmb(); + virt_rmb(); steal = src->steal; - rmb(); + virt_rmb(); } while ((version & 1) || (version != src->version)); return steal; diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 5f43cec296c5..8c53c5d7a1bc 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -23,7 +23,7 @@ #include <asm/io_apic.h> #include <asm/cpufeature.h> #include <asm/desc.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/debugreg.h> static void set_idt(void *newidt, __u16 limit) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 085c3b300d32..6f5ca4ebe6e5 100644 
--- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -27,6 +27,7 @@ #include <asm/debugreg.h> #include <asm/kexec-bzimage64.h> #include <asm/setup.h> +#include <asm/set_memory.h> #ifdef CONFIG_KEXEC_FILE static struct kexec_file_ops *kexec_file_loaders[] = { @@ -113,7 +114,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) struct x86_mapping_info info = { .alloc_pgt_page = alloc_pgt_page, .context = image, - .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, + .page_flag = __PAGE_KERNEL_LARGE_EXEC, }; unsigned long mstart, mend; pgd_t *level4p; @@ -122,6 +123,10 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) level4p = (pgd_t *)__va(start_pgtable); clear_page(level4p); + + if (direct_gbpages) + info.direct_gbpages = true; + for (i = 0; i < nr_pfn_mapped; i++) { mstart = pfn_mapped[i].start << PAGE_SHIFT; mend = pfn_mapped[i].end << PAGE_SHIFT; diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 477ae806c2fa..f67bd3205df7 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -85,7 +85,7 @@ void *module_alloc(unsigned long size) p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR + get_module_load_offset(), - MODULES_END, GFP_KERNEL | __GFP_HIGHMEM, + MODULES_END, GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); if (p && (kasan_module_alloc(p, size) < 0)) { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 603a1669a2ec..0b4d3c686b1e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1225,6 +1225,21 @@ void __init setup_arch(char **cmdline_p) kasan_init(); +#ifdef CONFIG_X86_32 + /* sync back kernel address range */ + clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + + /* + * sync back low identity map too. It is used for example + * in the 32-bit EFI stub. 
+ */ + clone_pgd_range(initial_page_table, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); +#endif + tboot_probe(); map_vsyscall(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index bb1e8cc0bc84..10edd1e69a68 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -291,11 +291,11 @@ void __init setup_per_cpu_areas(void) #ifdef CONFIG_X86_32 /* - * Sync back kernel address range. We want to make sure that - * all kernel mappings, including percpu mappings, are available - * in the smpboot asm. We can't reliably pick up percpu - * mappings using vmalloc_fault(), because exception dispatch - * needs percpu data. + * Sync back kernel address range again. We already did this in + * setup_arch(), but percpu data also needs to be available in + * the smpboot asm. We can't reliably pick up percpu mappings + * using vmalloc_fault(), because exception dispatch needs + * percpu data. */ clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 8e2b79b88e51..8dabd7bf1673 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -76,6 +76,101 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); +#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE + +#define STACKTRACE_DUMP_ONCE(task) ({ \ + static bool __section(.data.unlikely) __dumped; \ + \ + if (!__dumped) { \ + __dumped = true; \ + WARN_ON(1); \ + show_stack(task, NULL); \ + } \ +}) + +static int __save_stack_trace_reliable(struct stack_trace *trace, + struct task_struct *task) +{ + struct unwind_state state; + struct pt_regs *regs; + unsigned long addr; + + for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + + regs = unwind_get_entry_regs(&state); + if (regs) { + /* + * Kernel mode 
registers on the stack indicate an + * in-kernel interrupt or exception (e.g., preemption + * or a page fault), which can make frame pointers + * unreliable. + */ + if (!user_mode(regs)) + return -EINVAL; + + /* + * The last frame contains the user mode syscall + * pt_regs. Skip it and finish the unwind. + */ + unwind_next_frame(&state); + if (!unwind_done(&state)) { + STACKTRACE_DUMP_ONCE(task); + return -EINVAL; + } + break; + } + + addr = unwind_get_return_address(&state); + + /* + * A NULL or invalid return address probably means there's some + * generated code which __kernel_text_address() doesn't know + * about. + */ + if (!addr) { + STACKTRACE_DUMP_ONCE(task); + return -EINVAL; + } + + if (save_stack_address(trace, addr, false)) + return -EINVAL; + } + + /* Check for stack corruption */ + if (unwind_error(&state)) { + STACKTRACE_DUMP_ONCE(task); + return -EINVAL; + } + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; + + return 0; +} + +/* + * This function returns an error if it detects any unreliable features of the + * stack. Otherwise it guarantees that the stack trace is reliable. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. 
+ */ +int save_stack_trace_tsk_reliable(struct task_struct *tsk, + struct stack_trace *trace) +{ + int ret; + + if (!try_get_task_stack(tsk)) + return -EINVAL; + + ret = __save_stack_trace_reliable(trace, tsk); + + put_task_stack(tsk); + + return ret; +} +#endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */ + /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ struct stack_frame_user { @@ -138,4 +233,3 @@ void save_stack_trace_user(struct stack_trace *trace) if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } - diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index d4c8011a2293..4b1724059909 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -514,6 +514,9 @@ int tboot_force_iommu(void) if (!tboot_enabled()) return 0; + if (!intel_iommu_tboot_noforce) + return 1; + if (no_iommu || swiotlb || dmar_disabled) pr_warning("Forcing Intel-IOMMU to enabled\n"); diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index fec70fe3b1ec..82c6d7f1fd73 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -283,6 +283,8 @@ bool unwind_next_frame(struct unwind_state *state) return true; bad_address: + state->error = true; + /* * When unwinding a non-current task, the task might actually be * running on another CPU, in which case it could be modifying its diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ab8e32f7b9a8..760433b2574a 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -86,18 +86,6 @@ config KVM_MMU_AUDIT This option adds a R/W kVM module parameter 'mmu_audit', which allows auditing of KVM MMU events at runtime. -config KVM_DEVICE_ASSIGNMENT - bool "KVM legacy PCI device assignment support (DEPRECATED)" - depends on KVM && PCI && IOMMU_API - default n - ---help--- - Provide support for legacy PCI device assignment through KVM. 
The - kernel now also supports a full featured userspace device driver - framework through VFIO, which supersedes this support and provides - better security. - - If unsure, say N. - # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/vhost/Kconfig diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 3bff20710471..09d4b17be022 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -15,8 +15,6 @@ kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ hyperv.o page_track.o debugfs.o -kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o - kvm-intel-y += vmx.o pmu_intel.o kvm-amd-y += svm.o pmu_amd.o diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c deleted file mode 100644 index 308b8597c691..000000000000 --- a/arch/x86/kvm/assigned-dev.c +++ /dev/null @@ -1,1058 +0,0 @@ -/* - * Kernel-based Virtual Machine - device assignment support - * - * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates. - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. 
- * - */ - -#include <linux/kvm_host.h> -#include <linux/kvm.h> -#include <linux/uaccess.h> -#include <linux/vmalloc.h> -#include <linux/errno.h> -#include <linux/spinlock.h> -#include <linux/pci.h> -#include <linux/interrupt.h> -#include <linux/slab.h> -#include <linux/namei.h> -#include <linux/fs.h> -#include "irq.h" -#include "assigned-dev.h" -#include "trace/events/kvm.h" - -struct kvm_assigned_dev_kernel { - struct kvm_irq_ack_notifier ack_notifier; - struct list_head list; - int assigned_dev_id; - int host_segnr; - int host_busnr; - int host_devfn; - unsigned int entries_nr; - int host_irq; - bool host_irq_disabled; - bool pci_2_3; - struct msix_entry *host_msix_entries; - int guest_irq; - struct msix_entry *guest_msix_entries; - unsigned long irq_requested_type; - int irq_source_id; - int flags; - struct pci_dev *dev; - struct kvm *kvm; - spinlock_t intx_lock; - spinlock_t intx_mask_lock; - char irq_name[32]; - struct pci_saved_state *pci_saved_state; -}; - -static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, - int assigned_dev_id) -{ - struct kvm_assigned_dev_kernel *match; - - list_for_each_entry(match, head, list) { - if (match->assigned_dev_id == assigned_dev_id) - return match; - } - return NULL; -} - -static int find_index_from_host_irq(struct kvm_assigned_dev_kernel - *assigned_dev, int irq) -{ - int i, index; - struct msix_entry *host_msix_entries; - - host_msix_entries = assigned_dev->host_msix_entries; - - index = -1; - for (i = 0; i < assigned_dev->entries_nr; i++) - if (irq == host_msix_entries[i].vector) { - index = i; - break; - } - if (index < 0) - printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); - - return index; -} - -static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int ret; - - spin_lock(&assigned_dev->intx_lock); - if (pci_check_and_mask_intx(assigned_dev->dev)) { - assigned_dev->host_irq_disabled = true; - ret = 
IRQ_WAKE_THREAD; - } else - ret = IRQ_NONE; - spin_unlock(&assigned_dev->intx_lock); - - return ret; -} - -static void -kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev, - int vector) -{ - if (unlikely(assigned_dev->irq_requested_type & - KVM_DEV_IRQ_GUEST_INTX)) { - spin_lock(&assigned_dev->intx_mask_lock); - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) - kvm_set_irq(assigned_dev->kvm, - assigned_dev->irq_source_id, vector, 1, - false); - spin_unlock(&assigned_dev->intx_mask_lock); - } else - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - vector, 1, false); -} - -static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - spin_lock_irq(&assigned_dev->intx_lock); - disable_irq_nosync(irq); - assigned_dev->host_irq_disabled = true; - spin_unlock_irq(&assigned_dev->intx_lock); - } - - kvm_assigned_dev_raise_guest_irq(assigned_dev, - assigned_dev->guest_irq); - - return IRQ_HANDLED; -} - -/* - * Deliver an IRQ in an atomic context if we can, or return a failure, - * user can retry in a process context. - * Return value: - * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. - * Other values - No need to retry. - */ -static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, - int level) -{ - struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; - struct kvm_kernel_irq_routing_entry *e; - int ret = -EINVAL; - int idx; - - trace_kvm_set_irq(irq, level, irq_source_id); - - /* - * Injection into either PIC or IOAPIC might need to scan all CPUs, - * which would need to be retried from thread context; when same GSI - * is connected to both PIC and IOAPIC, we'd have to report a - * partial failure here. - * Since there's no easy way to do this, we only support injecting MSI - * which is limited to 1:1 GSI mapping. 
- */ - idx = srcu_read_lock(&kvm->irq_srcu); - if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { - e = &entries[0]; - ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id, - irq, level); - } - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - - -static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int ret = kvm_set_irq_inatomic(assigned_dev->kvm, - assigned_dev->irq_source_id, - assigned_dev->guest_irq, 1); - return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - - kvm_assigned_dev_raise_guest_irq(assigned_dev, - assigned_dev->guest_irq); - - return IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int index = find_index_from_host_irq(assigned_dev, irq); - u32 vector; - int ret = 0; - - if (index >= 0) { - vector = assigned_dev->guest_msix_entries[index].vector; - ret = kvm_set_irq_inatomic(assigned_dev->kvm, - assigned_dev->irq_source_id, - vector, 1); - } - - return unlikely(ret == -EWOULDBLOCK) ? 
IRQ_WAKE_THREAD : IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int index = find_index_from_host_irq(assigned_dev, irq); - u32 vector; - - if (index >= 0) { - vector = assigned_dev->guest_msix_entries[index].vector; - kvm_assigned_dev_raise_guest_irq(assigned_dev, vector); - } - - return IRQ_HANDLED; -} - -/* Ack the irq line for an assigned device */ -static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) -{ - struct kvm_assigned_dev_kernel *dev = - container_of(kian, struct kvm_assigned_dev_kernel, - ack_notifier); - - kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false); - - spin_lock(&dev->intx_mask_lock); - - if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) { - bool reassert = false; - - spin_lock_irq(&dev->intx_lock); - /* - * The guest IRQ may be shared so this ack can come from an - * IRQ for another guest device. - */ - if (dev->host_irq_disabled) { - if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) - enable_irq(dev->host_irq); - else if (!pci_check_and_unmask_intx(dev->dev)) - reassert = true; - dev->host_irq_disabled = reassert; - } - spin_unlock_irq(&dev->intx_lock); - - if (reassert) - kvm_set_irq(dev->kvm, dev->irq_source_id, - dev->guest_irq, 1, false); - } - - spin_unlock(&dev->intx_mask_lock); -} - -static void deassign_guest_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - if (assigned_dev->ack_notifier.gsi != -1) - kvm_unregister_irq_ack_notifier(kvm, - &assigned_dev->ack_notifier); - - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - assigned_dev->guest_irq, 0, false); - - if (assigned_dev->irq_source_id != -1) - kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); - assigned_dev->irq_source_id = -1; - assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK); -} - -/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ -static void 
deassign_host_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - /* - * We disable irq here to prevent further events. - * - * Notice this maybe result in nested disable if the interrupt type is - * INTx, but it's OK for we are going to free it. - * - * If this function is a part of VM destroy, please ensure that till - * now, the kvm state is still legal for probably we also have to wait - * on a currently running IRQ handler. - */ - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { - int i; - for (i = 0; i < assigned_dev->entries_nr; i++) - disable_irq(assigned_dev->host_msix_entries[i].vector); - - for (i = 0; i < assigned_dev->entries_nr; i++) - free_irq(assigned_dev->host_msix_entries[i].vector, - assigned_dev); - - assigned_dev->entries_nr = 0; - kfree(assigned_dev->host_msix_entries); - kfree(assigned_dev->guest_msix_entries); - pci_disable_msix(assigned_dev->dev); - } else { - /* Deal with MSI and INTx */ - if ((assigned_dev->irq_requested_type & - KVM_DEV_IRQ_HOST_INTX) && - (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - spin_lock_irq(&assigned_dev->intx_lock); - pci_intx(assigned_dev->dev, false); - spin_unlock_irq(&assigned_dev->intx_lock); - synchronize_irq(assigned_dev->host_irq); - } else - disable_irq(assigned_dev->host_irq); - - free_irq(assigned_dev->host_irq, assigned_dev); - - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) - pci_disable_msi(assigned_dev->dev); - } - - assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK); -} - -static int kvm_deassign_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev, - unsigned long irq_requested_type) -{ - unsigned long guest_irq_type, host_irq_type; - - if (!irqchip_in_kernel(kvm)) - return -EINVAL; - /* no irq assignment to deassign */ - if (!assigned_dev->irq_requested_type) - return -ENXIO; - - host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK; - guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK; - - if 
(host_irq_type) - deassign_host_irq(kvm, assigned_dev); - if (guest_irq_type) - deassign_guest_irq(kvm, assigned_dev); - - return 0; -} - -static void kvm_free_assigned_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); -} - -static void kvm_free_assigned_device(struct kvm *kvm, - struct kvm_assigned_dev_kernel - *assigned_dev) -{ - kvm_free_assigned_irq(kvm, assigned_dev); - - pci_reset_function(assigned_dev->dev); - if (pci_load_and_free_saved_state(assigned_dev->dev, - &assigned_dev->pci_saved_state)) - printk(KERN_INFO "%s: Couldn't reload %s saved state\n", - __func__, dev_name(&assigned_dev->dev->dev)); - else - pci_restore_state(assigned_dev->dev); - - pci_clear_dev_assigned(assigned_dev->dev); - - pci_release_regions(assigned_dev->dev); - pci_disable_device(assigned_dev->dev); - pci_dev_put(assigned_dev->dev); - - list_del(&assigned_dev->list); - kfree(assigned_dev); -} - -void kvm_free_all_assigned_devices(struct kvm *kvm) -{ - struct kvm_assigned_dev_kernel *assigned_dev, *tmp; - - list_for_each_entry_safe(assigned_dev, tmp, - &kvm->arch.assigned_dev_head, list) { - kvm_free_assigned_device(kvm, assigned_dev); - } -} - -static int assigned_device_enable_host_intx(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - irq_handler_t irq_handler; - unsigned long flags; - - dev->host_irq = dev->dev->irq; - - /* - * We can only share the IRQ line with other host devices if we are - * able to disable the IRQ source at device-level - independently of - * the guest driver. Otherwise host devices may suffer from unbounded - * IRQ latencies when the guest keeps the line asserted. 
- */ - if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { - irq_handler = kvm_assigned_dev_intx; - flags = IRQF_SHARED; - } else { - irq_handler = NULL; - flags = IRQF_ONESHOT; - } - if (request_threaded_irq(dev->host_irq, irq_handler, - kvm_assigned_dev_thread_intx, flags, - dev->irq_name, dev)) - return -EIO; - - if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { - spin_lock_irq(&dev->intx_lock); - pci_intx(dev->dev, true); - spin_unlock_irq(&dev->intx_lock); - } - return 0; -} - -static int assigned_device_enable_host_msi(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - int r; - - if (!dev->dev->msi_enabled) { - r = pci_enable_msi(dev->dev); - if (r) - return r; - } - - dev->host_irq = dev->dev->irq; - if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi, - kvm_assigned_dev_thread_msi, 0, - dev->irq_name, dev)) { - pci_disable_msi(dev->dev); - return -EIO; - } - - return 0; -} - -static int assigned_device_enable_host_msix(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - int i, r = -EINVAL; - - /* host_msix_entries and guest_msix_entries should have been - * initialized */ - if (dev->entries_nr == 0) - return r; - - r = pci_enable_msix_exact(dev->dev, - dev->host_msix_entries, dev->entries_nr); - if (r) - return r; - - for (i = 0; i < dev->entries_nr; i++) { - r = request_threaded_irq(dev->host_msix_entries[i].vector, - kvm_assigned_dev_msix, - kvm_assigned_dev_thread_msix, - 0, dev->irq_name, dev); - if (r) - goto err; - } - - return 0; -err: - for (i -= 1; i >= 0; i--) - free_irq(dev->host_msix_entries[i].vector, dev); - pci_disable_msix(dev->dev); - return r; -} - -static int assigned_device_enable_guest_intx(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = irq->guest_irq; - return 0; -} - -static int assigned_device_enable_guest_msi(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = 
irq->guest_irq; - dev->ack_notifier.gsi = -1; - return 0; -} - -static int assigned_device_enable_guest_msix(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = -1; - return 0; -} - -static int assign_host_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - __u32 host_irq_type) -{ - int r = -EEXIST; - - if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) - return r; - - snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s", - pci_name(dev->dev)); - - switch (host_irq_type) { - case KVM_DEV_IRQ_HOST_INTX: - r = assigned_device_enable_host_intx(kvm, dev); - break; - case KVM_DEV_IRQ_HOST_MSI: - r = assigned_device_enable_host_msi(kvm, dev); - break; - case KVM_DEV_IRQ_HOST_MSIX: - r = assigned_device_enable_host_msix(kvm, dev); - break; - default: - r = -EINVAL; - } - dev->host_irq_disabled = false; - - if (!r) - dev->irq_requested_type |= host_irq_type; - - return r; -} - -static int assign_guest_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq, - unsigned long guest_irq_type) -{ - int id; - int r = -EEXIST; - - if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK) - return r; - - id = kvm_request_irq_source_id(kvm); - if (id < 0) - return id; - - dev->irq_source_id = id; - - switch (guest_irq_type) { - case KVM_DEV_IRQ_GUEST_INTX: - r = assigned_device_enable_guest_intx(kvm, dev, irq); - break; - case KVM_DEV_IRQ_GUEST_MSI: - r = assigned_device_enable_guest_msi(kvm, dev, irq); - break; - case KVM_DEV_IRQ_GUEST_MSIX: - r = assigned_device_enable_guest_msix(kvm, dev, irq); - break; - default: - r = -EINVAL; - } - - if (!r) { - dev->irq_requested_type |= guest_irq_type; - if (dev->ack_notifier.gsi != -1) - kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); - } else { - kvm_free_irq_source_id(kvm, dev->irq_source_id); - dev->irq_source_id = -1; - } - - return r; -} - -/* TODO Deal with 
KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ -static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, - struct kvm_assigned_irq *assigned_irq) -{ - int r = -EINVAL; - struct kvm_assigned_dev_kernel *match; - unsigned long host_irq_type, guest_irq_type; - - if (!irqchip_in_kernel(kvm)) - return r; - - mutex_lock(&kvm->lock); - r = -ENODEV; - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_irq->assigned_dev_id); - if (!match) - goto out; - - host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK); - guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK); - - r = -EINVAL; - /* can only assign one type at a time */ - if (hweight_long(host_irq_type) > 1) - goto out; - if (hweight_long(guest_irq_type) > 1) - goto out; - if (host_irq_type == 0 && guest_irq_type == 0) - goto out; - - r = 0; - if (host_irq_type) - r = assign_host_irq(kvm, match, host_irq_type); - if (r) - goto out; - - if (guest_irq_type) - r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type); -out: - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, - struct kvm_assigned_irq - *assigned_irq) -{ - int r = -ENODEV; - struct kvm_assigned_dev_kernel *match; - unsigned long irq_type; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_irq->assigned_dev_id); - if (!match) - goto out; - - irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK | - KVM_DEV_IRQ_GUEST_MASK); - r = kvm_deassign_irq(kvm, match, irq_type); -out: - mutex_unlock(&kvm->lock); - return r; -} - -/* - * We want to test whether the caller has been granted permissions to - * use this device. To be able to configure and control the device, - * the user needs access to PCI configuration space and BAR resources. - * These are accessed through PCI sysfs. PCI config space is often - * passed to the process calling this ioctl via file descriptor, so we - * can't rely on access to that file. 
We can check for permissions - * on each of the BAR resource files, which is a pretty clear - * indicator that the user has been granted access to the device. - */ -static int probe_sysfs_permissions(struct pci_dev *dev) -{ -#ifdef CONFIG_SYSFS - int i; - bool bar_found = false; - - for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) { - char *kpath, *syspath; - struct path path; - struct inode *inode; - int r; - - if (!pci_resource_len(dev, i)) - continue; - - kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL); - if (!kpath) - return -ENOMEM; - - /* Per sysfs-rules, sysfs is always at /sys */ - syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i); - kfree(kpath); - if (!syspath) - return -ENOMEM; - - r = kern_path(syspath, LOOKUP_FOLLOW, &path); - kfree(syspath); - if (r) - return r; - - inode = d_backing_inode(path.dentry); - - r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS); - path_put(&path); - if (r) - return r; - - bar_found = true; - } - - /* If no resources, probably something special */ - if (!bar_found) - return -EPERM; - - return 0; -#else - return -EINVAL; /* No way to control the device without sysfs */ -#endif -} - -static int kvm_vm_ioctl_assign_device(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0, idx; - struct kvm_assigned_dev_kernel *match; - struct pci_dev *dev; - - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)) - return -EINVAL; - - mutex_lock(&kvm->lock); - idx = srcu_read_lock(&kvm->srcu); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (match) { - /* device already assigned */ - r = -EEXIST; - goto out; - } - - match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); - if (match == NULL) { - printk(KERN_INFO "%s: Couldn't allocate memory\n", - __func__); - r = -ENOMEM; - goto out; - } - dev = pci_get_domain_bus_and_slot(assigned_dev->segnr, - assigned_dev->busnr, - assigned_dev->devfn); - if (!dev) { 
- printk(KERN_INFO "%s: host device not found\n", __func__); - r = -EINVAL; - goto out_free; - } - - /* Don't allow bridges to be assigned */ - if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) { - r = -EPERM; - goto out_put; - } - - r = probe_sysfs_permissions(dev); - if (r) - goto out_put; - - if (pci_enable_device(dev)) { - printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); - r = -EBUSY; - goto out_put; - } - r = pci_request_regions(dev, "kvm_assigned_device"); - if (r) { - printk(KERN_INFO "%s: Could not get access to device regions\n", - __func__); - goto out_disable; - } - - pci_reset_function(dev); - pci_save_state(dev); - match->pci_saved_state = pci_store_saved_state(dev); - if (!match->pci_saved_state) - printk(KERN_DEBUG "%s: Couldn't store %s saved state\n", - __func__, dev_name(&dev->dev)); - - if (!pci_intx_mask_supported(dev)) - assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3; - - match->assigned_dev_id = assigned_dev->assigned_dev_id; - match->host_segnr = assigned_dev->segnr; - match->host_busnr = assigned_dev->busnr; - match->host_devfn = assigned_dev->devfn; - match->flags = assigned_dev->flags; - match->dev = dev; - spin_lock_init(&match->intx_lock); - spin_lock_init(&match->intx_mask_lock); - match->irq_source_id = -1; - match->kvm = kvm; - match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; - - list_add(&match->list, &kvm->arch.assigned_dev_head); - - if (!kvm->arch.iommu_domain) { - r = kvm_iommu_map_guest(kvm); - if (r) - goto out_list_del; - } - r = kvm_assign_device(kvm, match->dev); - if (r) - goto out_list_del; - -out: - srcu_read_unlock(&kvm->srcu, idx); - mutex_unlock(&kvm->lock); - return r; -out_list_del: - if (pci_load_and_free_saved_state(dev, &match->pci_saved_state)) - printk(KERN_INFO "%s: Couldn't reload %s saved state\n", - __func__, dev_name(&dev->dev)); - list_del(&match->list); - pci_release_regions(dev); -out_disable: - pci_disable_device(dev); -out_put: - pci_dev_put(dev); -out_free: - kfree(match); - 
srcu_read_unlock(&kvm->srcu, idx); - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (!match) { - printk(KERN_INFO "%s: device hasn't been assigned before, " - "so cannot be deassigned\n", __func__); - r = -EINVAL; - goto out; - } - - kvm_deassign_device(kvm, match->dev); - - kvm_free_assigned_device(kvm, match); - -out: - mutex_unlock(&kvm->lock); - return r; -} - - -static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, - struct kvm_assigned_msix_nr *entry_nr) -{ - int r = 0; - struct kvm_assigned_dev_kernel *adev; - - mutex_lock(&kvm->lock); - - adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - entry_nr->assigned_dev_id); - if (!adev) { - r = -EINVAL; - goto msix_nr_out; - } - - if (adev->entries_nr == 0) { - adev->entries_nr = entry_nr->entry_nr; - if (adev->entries_nr == 0 || - adev->entries_nr > KVM_MAX_MSIX_PER_DEV) { - r = -EINVAL; - goto msix_nr_out; - } - - adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) * - entry_nr->entry_nr, - GFP_KERNEL); - if (!adev->host_msix_entries) { - r = -ENOMEM; - goto msix_nr_out; - } - adev->guest_msix_entries = - kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr, - GFP_KERNEL); - if (!adev->guest_msix_entries) { - kfree(adev->host_msix_entries); - r = -ENOMEM; - goto msix_nr_out; - } - } else /* Not allowed set MSI-X number twice */ - r = -EINVAL; -msix_nr_out: - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, - struct kvm_assigned_msix_entry *entry) -{ - int r = 0, i; - struct kvm_assigned_dev_kernel *adev; - - mutex_lock(&kvm->lock); - - adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - entry->assigned_dev_id); - - if (!adev) { - r = -EINVAL; - goto 
msix_entry_out; - } - - for (i = 0; i < adev->entries_nr; i++) - if (adev->guest_msix_entries[i].vector == 0 || - adev->guest_msix_entries[i].entry == entry->entry) { - adev->guest_msix_entries[i].entry = entry->entry; - adev->guest_msix_entries[i].vector = entry->gsi; - adev->host_msix_entries[i].entry = entry->entry; - break; - } - if (i == adev->entries_nr) { - r = -ENOSPC; - goto msix_entry_out; - } - -msix_entry_out: - mutex_unlock(&kvm->lock); - - return r; -} - -static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (!match) { - r = -ENODEV; - goto out; - } - - spin_lock(&match->intx_mask_lock); - - match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX; - match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX; - - if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { - if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) { - kvm_set_irq(match->kvm, match->irq_source_id, - match->guest_irq, 0, false); - /* - * Masking at hardware-level is performed on demand, - * i.e. when an IRQ actually arrives at the host. - */ - } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - /* - * Unmask the IRQ line if required. Unmasking at - * device level will be performed by user space. 
- */ - spin_lock_irq(&match->intx_lock); - if (match->host_irq_disabled) { - enable_irq(match->host_irq); - match->host_irq_disabled = false; - } - spin_unlock_irq(&match->intx_lock); - } - } - - spin_unlock(&match->intx_mask_lock); - -out: - mutex_unlock(&kvm->lock); - return r; -} - -long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg) -{ - void __user *argp = (void __user *)arg; - int r; - - switch (ioctl) { - case KVM_ASSIGN_PCI_DEVICE: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); - if (r) - goto out; - break; - } - case KVM_ASSIGN_IRQ: { - r = -EOPNOTSUPP; - break; - } - case KVM_ASSIGN_DEV_IRQ: { - struct kvm_assigned_irq assigned_irq; - - r = -EFAULT; - if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) - goto out; - r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); - if (r) - goto out; - break; - } - case KVM_DEASSIGN_DEV_IRQ: { - struct kvm_assigned_irq assigned_irq; - - r = -EFAULT; - if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) - goto out; - r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq); - if (r) - goto out; - break; - } - case KVM_DEASSIGN_PCI_DEVICE: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_MSIX_NR: { - struct kvm_assigned_msix_nr entry_nr; - r = -EFAULT; - if (copy_from_user(&entry_nr, argp, sizeof entry_nr)) - goto out; - r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_MSIX_ENTRY: { - struct kvm_assigned_msix_entry entry; - r = -EFAULT; - if (copy_from_user(&entry, argp, sizeof entry)) - goto out; - r = kvm_vm_ioctl_set_msix_entry(kvm, &entry); - if (r) - goto out; - break; - } 
- case KVM_ASSIGN_SET_INTX_MASK: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev); - break; - } - default: - r = -ENOTTY; - break; - } -out: - return r; -} diff --git a/arch/x86/kvm/assigned-dev.h b/arch/x86/kvm/assigned-dev.h deleted file mode 100644 index a428c1a211b2..000000000000 --- a/arch/x86/kvm/assigned-dev.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef ARCH_X86_KVM_ASSIGNED_DEV_H -#define ARCH_X86_KVM_ASSIGNED_DEV_H - -#include <linux/kvm_host.h> - -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT -int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev); -int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev); - -int kvm_iommu_map_guest(struct kvm *kvm); -int kvm_iommu_unmap_guest(struct kvm *kvm); - -long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg); - -void kvm_free_all_assigned_devices(struct kvm *kvm); -#else -static inline int kvm_iommu_unmap_guest(struct kvm *kvm) -{ - return 0; -} - -static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg) -{ - return -ENOTTY; -} - -static inline void kvm_free_all_assigned_devices(struct kvm *kvm) {} -#endif /* CONFIG_KVM_DEVICE_ASSIGNMENT */ - -#endif /* ARCH_X86_KVM_ASSIGNED_DEV_H */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index efde6cc50875..a181ae76c71c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -876,6 +876,9 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { u32 eax, ebx, ecx, edx; + if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0)) + return 1; + eax = kvm_register_read(vcpu, VCPU_REGS_RAX); ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 35058c2c0eea..a6fd40aade7c 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -205,4 +205,15 @@ 
static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) return x86_stepping(best->eax); } +static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT; +} + +static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.msr_misc_features_enables & + MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; +} + #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 45c7306c8780..c25cfaf584e7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2547,7 +2547,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) u64 smbase; int ret; - if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0) + if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0) return emulate_ud(ctxt); /* @@ -2596,11 +2596,11 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) return X86EMUL_UNHANDLEABLE; } - if ((ctxt->emul_flags & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) + if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) ctxt->ops->set_nmi_mask(ctxt, false); - ctxt->emul_flags &= ~X86EMUL_SMM_INSIDE_NMI_MASK; - ctxt->emul_flags &= ~X86EMUL_SMM_MASK; + ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) & + ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK)); return X86EMUL_CONTINUE; } @@ -3854,6 +3854,13 @@ static int em_sti(struct x86_emulate_ctxt *ctxt) static int em_cpuid(struct x86_emulate_ctxt *ctxt) { u32 eax, ebx, ecx, edx; + u64 msr = 0; + + ctxt->ops->get_msr(ctxt, MSR_MISC_FEATURES_ENABLES, &msr); + if (msr & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT && + ctxt->ops->cpl(ctxt)) { + return emulate_gp(ctxt, 0); + } eax = reg_read(ctxt, VCPU_REGS_RAX); ecx = reg_read(ctxt, VCPU_REGS_RCX); @@ -5316,6 +5323,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) const struct x86_emulate_ops *ops = ctxt->ops; int rc = X86EMUL_CONTINUE; int saved_dst_type = ctxt->dst.type; + unsigned emul_flags; ctxt->mem_read.pos = 0; @@ -5330,6 +5338,7 @@ int 
x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } + emul_flags = ctxt->ops->get_hflags(ctxt); if (unlikely(ctxt->d & (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) { if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || @@ -5363,7 +5372,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) fetch_possible_mmx_operand(ctxt, &ctxt->dst); } - if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { + if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_PRE_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5392,7 +5401,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { + if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5446,7 +5455,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: - if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { + if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 047b17a26269..bdcd4139eca9 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -49,7 +49,7 @@ static void pic_unlock(struct kvm_pic *s) __releases(&s->lock) { bool wakeup = s->wakeup_needed; - struct kvm_vcpu *vcpu, *found = NULL; + struct kvm_vcpu *vcpu; int i; s->wakeup_needed = false; @@ -59,16 +59,11 @@ static void pic_unlock(struct kvm_pic *s) if (wakeup) { kvm_for_each_vcpu(i, vcpu, s->kvm) { if (kvm_apic_accept_pic_intr(vcpu)) { - found = vcpu; - break; + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + return; } } - - if (!found) - return; - - 
kvm_make_request(KVM_REQ_EVENT, found); - kvm_vcpu_kick(found); } } @@ -239,7 +234,7 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq) int kvm_pic_read_irq(struct kvm *kvm) { int irq, irq2, intno; - struct kvm_pic *s = pic_irqchip(kvm); + struct kvm_pic *s = kvm->arch.vpic; s->output = 0; @@ -273,7 +268,7 @@ int kvm_pic_read_irq(struct kvm *kvm) return intno; } -void kvm_pic_reset(struct kvm_kpic_state *s) +static void kvm_pic_reset(struct kvm_kpic_state *s) { int irq, i; struct kvm_vcpu *vcpu; @@ -422,19 +417,16 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) return ret; } -static u32 pic_ioport_read(void *opaque, u32 addr1) +static u32 pic_ioport_read(void *opaque, u32 addr) { struct kvm_kpic_state *s = opaque; - unsigned int addr; int ret; - addr = addr1; - addr &= 1; if (s->poll) { - ret = pic_poll_read(s, addr1); + ret = pic_poll_read(s, addr); s->poll = 0; } else - if (addr == 0) + if ((addr & 1) == 0) if (s->read_reg_select) ret = s->isr; else @@ -456,76 +448,64 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1) return s->elcr; } -static int picdev_in_range(gpa_t addr) -{ - switch (addr) { - case 0x20: - case 0x21: - case 0xa0: - case 0xa1: - case 0x4d0: - case 0x4d1: - return 1; - default: - return 0; - } -} - static int picdev_write(struct kvm_pic *s, gpa_t addr, int len, const void *val) { unsigned char data = *(unsigned char *)val; - if (!picdev_in_range(addr)) - return -EOPNOTSUPP; if (len != 1) { pr_pic_unimpl("non byte write\n"); return 0; } - pic_lock(s); switch (addr) { case 0x20: case 0x21: case 0xa0: case 0xa1: + pic_lock(s); pic_ioport_write(&s->pics[addr >> 7], addr, data); + pic_unlock(s); break; case 0x4d0: case 0x4d1: + pic_lock(s); elcr_ioport_write(&s->pics[addr & 1], addr, data); + pic_unlock(s); break; + default: + return -EOPNOTSUPP; } - pic_unlock(s); return 0; } static int picdev_read(struct kvm_pic *s, gpa_t addr, int len, void *val) { - unsigned char data = 0; - if (!picdev_in_range(addr)) - 
return -EOPNOTSUPP; + unsigned char *data = (unsigned char *)val; if (len != 1) { memset(val, 0, len); pr_pic_unimpl("non byte read\n"); return 0; } - pic_lock(s); switch (addr) { case 0x20: case 0x21: case 0xa0: case 0xa1: - data = pic_ioport_read(&s->pics[addr >> 7], addr); + pic_lock(s); + *data = pic_ioport_read(&s->pics[addr >> 7], addr); + pic_unlock(s); break; case 0x4d0: case 0x4d1: - data = elcr_ioport_read(&s->pics[addr & 1], addr); + pic_lock(s); + *data = elcr_ioport_read(&s->pics[addr & 1], addr); + pic_unlock(s); break; + default: + return -EOPNOTSUPP; } - *(unsigned char *)val = data; - pic_unlock(s); return 0; } @@ -576,7 +556,7 @@ static int picdev_eclr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, */ static void pic_irq_request(struct kvm *kvm, int level) { - struct kvm_pic *s = pic_irqchip(kvm); + struct kvm_pic *s = kvm->arch.vpic; if (!s->output) s->wakeup_needed = true; @@ -660,9 +640,11 @@ void kvm_pic_destroy(struct kvm *kvm) if (!vpic) return; + mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); + mutex_unlock(&kvm->slots_lock); kvm->arch.vpic = NULL; kfree(vpic); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 289270a6aecb..bdff437acbcb 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -266,11 +266,9 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) spin_unlock(&ioapic->lock); } -void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) +void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) { - struct kvm_ioapic *ioapic = kvm->arch.vioapic; - - if (!ioapic) + if (!ioapic_in_kernel(kvm)) return; kvm_make_scan_ioapic_request(kvm); } @@ -315,7 +313,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG && ioapic->irr 
& (1 << index)) ioapic_service(ioapic, index, false); - kvm_vcpu_request_scan_ioapic(ioapic->kvm); + kvm_make_scan_ioapic_request(ioapic->kvm); break; } } @@ -624,10 +622,8 @@ int kvm_ioapic_init(struct kvm *kvm) if (ret < 0) { kvm->arch.vioapic = NULL; kfree(ioapic); - return ret; } - kvm_vcpu_request_scan_ioapic(kvm); return ret; } @@ -639,36 +635,32 @@ void kvm_ioapic_destroy(struct kvm *kvm) return; cancel_delayed_work_sync(&ioapic->eoi_inject); + mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); + mutex_unlock(&kvm->slots_lock); kvm->arch.vioapic = NULL; kfree(ioapic); } -int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) +void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) { - struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); - if (!ioapic) - return -EINVAL; + struct kvm_ioapic *ioapic = kvm->arch.vioapic; spin_lock(&ioapic->lock); memcpy(state, ioapic, sizeof(struct kvm_ioapic_state)); state->irr &= ~ioapic->irr_delivered; spin_unlock(&ioapic->lock); - return 0; } -int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) +void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) { - struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); - if (!ioapic) - return -EINVAL; + struct kvm_ioapic *ioapic = kvm->arch.vioapic; spin_lock(&ioapic->lock); memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); ioapic->irr = 0; ioapic->irr_delivered = 0; - kvm_vcpu_request_scan_ioapic(kvm); + kvm_make_scan_ioapic_request(kvm); kvm_ioapic_inject_all(ioapic, state->irr); spin_unlock(&ioapic->lock); - return 0; } diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 1cc6e54436db..29ce19732ccf 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -105,17 +105,13 @@ do { \ #define ASSERT(x) do { } while (0) #endif -static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) -{ - return kvm->arch.vioapic; -} - static inline int ioapic_in_kernel(struct kvm *kvm) { 
- int ret; + int mode = kvm->arch.irqchip_mode; - ret = (ioapic_irqchip(kvm) != NULL); - return ret; + /* Matches smp_wmb() when setting irqchip_mode */ + smp_rmb(); + return mode == KVM_IRQCHIP_KERNEL; } void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); @@ -132,8 +128,8 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, struct dest_map *dest_map); -int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); -int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); +void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); +void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors); void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c deleted file mode 100644 index b181426f67b4..000000000000 --- a/arch/x86/kvm/iommu.c +++ /dev/null @@ -1,356 +0,0 @@ -/* - * Copyright (c) 2006, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * - * Copyright (C) 2006-2008 Intel Corporation - * Copyright IBM Corporation, 2008 - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * Author: Allen M. 
Kay <allen.m.kay@intel.com> - * Author: Weidong Han <weidong.han@intel.com> - * Author: Ben-Ami Yassour <benami@il.ibm.com> - */ - -#include <linux/list.h> -#include <linux/kvm_host.h> -#include <linux/moduleparam.h> -#include <linux/pci.h> -#include <linux/stat.h> -#include <linux/iommu.h> -#include "assigned-dev.h" - -static bool allow_unsafe_assigned_interrupts; -module_param_named(allow_unsafe_assigned_interrupts, - allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(allow_unsafe_assigned_interrupts, - "Enable device assignment on platforms without interrupt remapping support."); - -static int kvm_iommu_unmap_memslots(struct kvm *kvm); -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages); - -static kvm_pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, - unsigned long npages) -{ - gfn_t end_gfn; - kvm_pfn_t pfn; - - pfn = gfn_to_pfn_memslot(slot, gfn); - end_gfn = gfn + npages; - gfn += 1; - - if (is_error_noslot_pfn(pfn)) - return pfn; - - while (gfn < end_gfn) - gfn_to_pfn_memslot(slot, gfn++); - - return pfn; -} - -static void kvm_unpin_pages(struct kvm *kvm, kvm_pfn_t pfn, - unsigned long npages) -{ - unsigned long i; - - for (i = 0; i < npages; ++i) - kvm_release_pfn_clean(pfn + i); -} - -int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - gfn_t gfn, end_gfn; - kvm_pfn_t pfn; - int r = 0; - struct iommu_domain *domain = kvm->arch.iommu_domain; - int flags; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - gfn = slot->base_gfn; - end_gfn = gfn + slot->npages; - - flags = IOMMU_READ; - if (!(slot->flags & KVM_MEM_READONLY)) - flags |= IOMMU_WRITE; - if (!kvm->arch.iommu_noncoherent) - flags |= IOMMU_CACHE; - - - while (gfn < end_gfn) { - unsigned long page_size; - - /* Check if already mapped */ - if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) { - gfn += 1; - continue; - } - - /* Get the page size we could use to map */ - page_size = 
kvm_host_page_size(kvm, gfn); - - /* Make sure the page_size does not exceed the memslot */ - while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn) - page_size >>= 1; - - /* Make sure gfn is aligned to the page size we want to map */ - while ((gfn << PAGE_SHIFT) & (page_size - 1)) - page_size >>= 1; - - /* Make sure hva is aligned to the page size we want to map */ - while (__gfn_to_hva_memslot(slot, gfn) & (page_size - 1)) - page_size >>= 1; - - /* - * Pin all pages we are about to map in memory. This is - * important because we unmap and unpin in 4kb steps later. - */ - pfn = kvm_pin_pages(slot, gfn, page_size >> PAGE_SHIFT); - if (is_error_noslot_pfn(pfn)) { - gfn += 1; - continue; - } - - /* Map into IO address space */ - r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn), - page_size, flags); - if (r) { - printk(KERN_ERR "kvm_iommu_map_address:" - "iommu failed to map pfn=%llx\n", pfn); - kvm_unpin_pages(kvm, pfn, page_size >> PAGE_SHIFT); - goto unmap_pages; - } - - gfn += page_size >> PAGE_SHIFT; - - cond_resched(); - } - - return 0; - -unmap_pages: - kvm_iommu_put_pages(kvm, slot->base_gfn, gfn - slot->base_gfn); - return r; -} - -static int kvm_iommu_map_memslots(struct kvm *kvm) -{ - int idx, r = 0; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - if (kvm->arch.iommu_noncoherent) - kvm_arch_register_noncoherent_dma(kvm); - - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) { - r = kvm_iommu_map_pages(kvm, memslot); - if (r) - break; - } - srcu_read_unlock(&kvm->srcu, idx); - - return r; -} - -int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - int r; - bool noncoherent; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - if (pdev == NULL) - return -ENODEV; - - r = iommu_attach_device(domain, &pdev->dev); - if (r) { - dev_err(&pdev->dev, "kvm assign device failed ret %d", r); - return r; - 
} - - noncoherent = !iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY); - - /* Check if need to update IOMMU page table for guest memory */ - if (noncoherent != kvm->arch.iommu_noncoherent) { - kvm_iommu_unmap_memslots(kvm); - kvm->arch.iommu_noncoherent = noncoherent; - r = kvm_iommu_map_memslots(kvm); - if (r) - goto out_unmap; - } - - kvm_arch_start_assignment(kvm); - pci_set_dev_assigned(pdev); - - dev_info(&pdev->dev, "kvm assign device\n"); - - return 0; -out_unmap: - kvm_iommu_unmap_memslots(kvm); - return r; -} - -int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - if (pdev == NULL) - return -ENODEV; - - iommu_detach_device(domain, &pdev->dev); - - pci_clear_dev_assigned(pdev); - kvm_arch_end_assignment(kvm); - - dev_info(&pdev->dev, "kvm deassign device\n"); - - return 0; -} - -int kvm_iommu_map_guest(struct kvm *kvm) -{ - int r; - - if (!iommu_present(&pci_bus_type)) { - printk(KERN_ERR "%s: iommu not found\n", __func__); - return -ENODEV; - } - - mutex_lock(&kvm->slots_lock); - - kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type); - if (!kvm->arch.iommu_domain) { - r = -ENOMEM; - goto out_unlock; - } - - if (!allow_unsafe_assigned_interrupts && - !iommu_capable(&pci_bus_type, IOMMU_CAP_INTR_REMAP)) { - printk(KERN_WARNING "%s: No interrupt remapping support," - " disallowing device assignment." 
- " Re-enable with \"allow_unsafe_assigned_interrupts=1\"" - " module option.\n", __func__); - iommu_domain_free(kvm->arch.iommu_domain); - kvm->arch.iommu_domain = NULL; - r = -EPERM; - goto out_unlock; - } - - r = kvm_iommu_map_memslots(kvm); - if (r) - kvm_iommu_unmap_memslots(kvm); - -out_unlock: - mutex_unlock(&kvm->slots_lock); - return r; -} - -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages) -{ - struct iommu_domain *domain; - gfn_t end_gfn, gfn; - kvm_pfn_t pfn; - u64 phys; - - domain = kvm->arch.iommu_domain; - end_gfn = base_gfn + npages; - gfn = base_gfn; - - /* check if iommu exists and in use */ - if (!domain) - return; - - while (gfn < end_gfn) { - unsigned long unmap_pages; - size_t size; - - /* Get physical address */ - phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); - - if (!phys) { - gfn++; - continue; - } - - pfn = phys >> PAGE_SHIFT; - - /* Unmap address from IO address space */ - size = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE); - unmap_pages = 1ULL << get_order(size); - - /* Unpin all pages we just unmapped to not leak any memory */ - kvm_unpin_pages(kvm, pfn, unmap_pages); - - gfn += unmap_pages; - - cond_resched(); - } -} - -void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages); -} - -static int kvm_iommu_unmap_memslots(struct kvm *kvm) -{ - int idx; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) - kvm_iommu_unmap_pages(kvm, memslot); - - srcu_read_unlock(&kvm->srcu, idx); - - if (kvm->arch.iommu_noncoherent) - kvm_arch_unregister_noncoherent_dma(kvm); - - return 0; -} - -int kvm_iommu_unmap_guest(struct kvm *kvm) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - mutex_lock(&kvm->slots_lock); - 
kvm_iommu_unmap_memslots(kvm); - kvm->arch.iommu_domain = NULL; - kvm->arch.iommu_noncoherent = false; - mutex_unlock(&kvm->slots_lock); - - iommu_domain_free(domain); - return 0; -} diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 60d91c9d160c..5c24811e8b0b 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -60,7 +60,7 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v) if (irqchip_split(v->kvm)) return pending_userspace_extint(v); else - return pic_irqchip(v->kvm)->output; + return v->kvm->arch.vpic->output; } else return 0; } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 40d5b2cf6061..d5005cc26521 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -78,40 +78,42 @@ void kvm_pic_destroy(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); -static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) -{ - return kvm->arch.vpic; -} - static inline int pic_in_kernel(struct kvm *kvm) { - int ret; + int mode = kvm->arch.irqchip_mode; - ret = (pic_irqchip(kvm) != NULL); - return ret; + /* Matches smp_wmb() when setting irqchip_mode */ + smp_rmb(); + return mode == KVM_IRQCHIP_KERNEL; } static inline int irqchip_split(struct kvm *kvm) { - return kvm->arch.irqchip_mode == KVM_IRQCHIP_SPLIT; + int mode = kvm->arch.irqchip_mode; + + /* Matches smp_wmb() when setting irqchip_mode */ + smp_rmb(); + return mode == KVM_IRQCHIP_SPLIT; } static inline int irqchip_kernel(struct kvm *kvm) { - return kvm->arch.irqchip_mode == KVM_IRQCHIP_KERNEL; + int mode = kvm->arch.irqchip_mode; + + /* Matches smp_wmb() when setting irqchip_mode */ + smp_rmb(); + return mode == KVM_IRQCHIP_KERNEL; } static inline int irqchip_in_kernel(struct kvm *kvm) { - bool ret = kvm->arch.irqchip_mode != KVM_IRQCHIP_NONE; + int mode = kvm->arch.irqchip_mode; - /* Matches with wmb after initializing kvm->irq_routing. 
*/ + /* Matches smp_wmb() when setting irqchip_mode */ smp_rmb(); - return ret; + return mode != KVM_IRQCHIP_NONE; } -void kvm_pic_reset(struct kvm_kpic_state *s); - void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 6825cd36d13b..3cc3b2d130a0 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -42,7 +42,7 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) { - struct kvm_pic *pic = pic_irqchip(kvm); + struct kvm_pic *pic = kvm->arch.vpic; return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); } @@ -232,11 +232,11 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) goto unlock; } clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); - if (!ioapic_in_kernel(kvm)) + if (!irqchip_kernel(kvm)) goto unlock; kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); - kvm_pic_clear_all(pic_irqchip(kvm), irq_source_id); + kvm_pic_clear_all(kvm->arch.vpic, irq_source_id); unlock: mutex_unlock(&kvm->irq_lock); } @@ -274,42 +274,42 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, srcu_read_unlock(&kvm->irq_srcu, idx); } +bool kvm_arch_can_set_irq_routing(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm); +} + int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { - int r = -EINVAL; - int delta; - unsigned max_pin; - + /* We can't check irqchip_in_kernel() here as some callers are + * currently initializing the irqchip. Other callers should therefore + * check kvm_arch_can_set_irq_routing() before calling this function. 
+ */ switch (ue->type) { case KVM_IRQ_ROUTING_IRQCHIP: - delta = 0; + if (irqchip_split(kvm)) + return -EINVAL; + e->irqchip.pin = ue->u.irqchip.pin; switch (ue->u.irqchip.irqchip) { case KVM_IRQCHIP_PIC_SLAVE: - delta = 8; + e->irqchip.pin += PIC_NUM_PINS / 2; /* fall through */ case KVM_IRQCHIP_PIC_MASTER: - if (!pic_in_kernel(kvm)) - goto out; - + if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2) + return -EINVAL; e->set = kvm_set_pic_irq; - max_pin = PIC_NUM_PINS; break; case KVM_IRQCHIP_IOAPIC: - if (!ioapic_in_kernel(kvm)) - goto out; - - max_pin = KVM_IOAPIC_NUM_PINS; + if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS) + return -EINVAL; e->set = kvm_set_ioapic_irq; break; default: - goto out; + return -EINVAL; } e->irqchip.irqchip = ue->u.irqchip.irqchip; - e->irqchip.pin = ue->u.irqchip.pin + delta; - if (e->irqchip.pin >= max_pin) - goto out; break; case KVM_IRQ_ROUTING_MSI: e->set = kvm_set_msi; @@ -318,7 +318,7 @@ int kvm_set_routing_entry(struct kvm *kvm, e->msi.data = ue->u.msi.data; if (kvm_msi_route_invalid(kvm, e)) - goto out; + return -EINVAL; break; case KVM_IRQ_ROUTING_HV_SINT: e->set = kvm_hv_set_sint; @@ -326,12 +326,10 @@ int kvm_set_routing_entry(struct kvm *kvm, e->hv_sint.sint = ue->u.hv_sint.sint; break; default: - goto out; + return -EINVAL; } - r = 0; -out: - return r; + return 0; } bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index bad6a25067bc..c329d2894905 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -177,8 +177,8 @@ static void recalculate_apic_map(struct kvm *kvm) if (kvm_apic_present(vcpu)) max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic)); - new = kvm_kvzalloc(sizeof(struct kvm_apic_map) + - sizeof(struct kvm_lapic *) * ((u64)max_id + 1)); + new = kvzalloc(sizeof(struct kvm_apic_map) + + sizeof(struct kvm_lapic *) * ((u64)max_id + 1), GFP_KERNEL); if (!new) goto out; @@ -529,14 +529,16 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, 
struct kvm_lapic_irq *irq, static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) { - return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, &val, - sizeof(val)); + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, + sizeof(val)); } static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) { - return kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, val, - sizeof(*val)); + + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, + sizeof(*val)); } static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) @@ -2285,8 +2287,8 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - if (kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32))) + if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32))) return; apic_set_tpr(vcpu->arch.apic, data & 0xff); @@ -2338,14 +2340,14 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) max_isr = 0; data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32)); + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32)); } int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) { if (vapic_addr) { - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apic->vapic_cache, vapic_addr, sizeof(u32))) return -EINVAL; @@ -2439,7 +2441,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) vcpu->arch.pv_eoi.msr_val = data; if (!pv_eoi_enabled(vcpu)) return 0; - return kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.pv_eoi.data, + return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, addr, sizeof(u8)); } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ac7810513d0e..5d3376f67794 100644 --- a/arch/x86/kvm/mmu.c +++ 
b/arch/x86/kvm/mmu.c @@ -1498,6 +1498,21 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } +/** + * kvm_arch_write_log_dirty - emulate dirty page logging + * @vcpu: Guest mode vcpu + * + * Emulate arch specific page modification logging for the + * nested hypervisor + */ +int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu) +{ + if (kvm_x86_ops->write_log_dirty) + return kvm_x86_ops->write_log_dirty(vcpu); + + return 0; +} + bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn) { @@ -4340,7 +4355,8 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); -void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, + bool accessed_dirty) { struct kvm_mmu *context = &vcpu->arch.mmu; @@ -4349,6 +4365,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->nx = true; + context->ept_ad = accessed_dirty; context->page_fault = ept_page_fault; context->gva_to_gpa = ept_gva_to_gpa; context->sync_page = ept_sync_page; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index ddc56e91f2e4..27975807cc64 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -74,7 +74,8 @@ enum { int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); -void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, + bool accessed_dirty); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { @@ -201,4 +202,5 @@ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot 
*slot, u64 gfn); +int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index 60168cdd0546..ea67dc876316 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -40,8 +40,8 @@ int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, int i; for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { - slot->arch.gfn_track[i] = kvm_kvzalloc(npages * - sizeof(*slot->arch.gfn_track[i])); + slot->arch.gfn_track[i] = kvzalloc(npages * + sizeof(*slot->arch.gfn_track[i]), GFP_KERNEL); if (!slot->arch.gfn_track[i]) goto track_free; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a01105485315..56241746abbd 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -23,13 +23,6 @@ * so the code in this file is compiled twice, once per pte size. */ -/* - * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro - * uses for EPT without A/D paging type. - */ -extern u64 __pure __using_nonexistent_pte_bit(void) - __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT"); - #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 @@ -39,10 +32,9 @@ extern u64 __pure __using_nonexistent_pte_bit(void) #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_BITS PT64_LEVEL_BITS - #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK - #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT + #define PT_HAVE_ACCESSED_DIRTY(mmu) true #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS 4 #define CMPXCHG cmpxchg @@ -60,10 +52,9 @@ extern u64 __pure __using_nonexistent_pte_bit(void) #define PT_INDEX(addr, level) PT32_INDEX(addr, level) #define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_MAX_FULL_LEVELS 2 - #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK - #define 
PT_GUEST_DIRTY_MASK PT_DIRTY_MASK #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT + #define PT_HAVE_ACCESSED_DIRTY(mmu) true #define CMPXCHG cmpxchg #elif PTTYPE == PTTYPE_EPT #define pt_element_t u64 @@ -74,16 +65,18 @@ extern u64 __pure __using_nonexistent_pte_bit(void) #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_BITS PT64_LEVEL_BITS - #define PT_GUEST_ACCESSED_MASK 0 - #define PT_GUEST_DIRTY_MASK 0 - #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit() - #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit() + #define PT_GUEST_DIRTY_SHIFT 9 + #define PT_GUEST_ACCESSED_SHIFT 8 + #define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad) #define CMPXCHG cmpxchg64 #define PT_MAX_FULL_LEVELS 4 #else #error Invalid PTTYPE value #endif +#define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT) +#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT) + #define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl) #define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL) @@ -111,12 +104,13 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; } -static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) +static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access, + unsigned gpte) { unsigned mask; /* dirty bit is not supported, so no need to track it */ - if (!PT_GUEST_DIRTY_MASK) + if (!PT_HAVE_ACCESSED_DIRTY(mmu)) return; BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); @@ -171,7 +165,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, goto no_present; /* if accessed bit is not supported prefetch non accessed gpte */ - if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK)) + if (PT_HAVE_ACCESSED_DIRTY(&vcpu->arch.mmu) && !(gpte & PT_GUEST_ACCESSED_MASK)) goto no_present; return false; @@ -217,7 +211,7 @@ 
static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, int ret; /* dirty/accessed bits are not supported, so no need to update them */ - if (!PT_GUEST_DIRTY_MASK) + if (!PT_HAVE_ACCESSED_DIRTY(mmu)) return 0; for (level = walker->max_level; level >= walker->level; --level) { @@ -232,6 +226,10 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, if (level == walker->level && write_fault && !(pte & PT_GUEST_DIRTY_MASK)) { trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); +#if PTTYPE == PTTYPE_EPT + if (kvm_arch_write_log_dirty(vcpu)) + return -EINVAL; +#endif pte |= PT_GUEST_DIRTY_MASK; } if (pte == orig_pte) @@ -286,7 +284,9 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey; + unsigned nested_access; gpa_t pte_gpa; + bool have_ad; int offset; const int write_fault = access & PFERR_WRITE_MASK; const int user_fault = access & PFERR_USER_MASK; @@ -299,6 +299,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, retry_walk: walker->level = mmu->root_level; pte = mmu->get_cr3(vcpu); + have_ad = PT_HAVE_ACCESSED_DIRTY(mmu); #if PTTYPE == 64 if (walker->level == PT32E_ROOT_LEVEL) { @@ -312,7 +313,15 @@ retry_walk: walker->max_level = walker->level; ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu))); - accessed_dirty = PT_GUEST_ACCESSED_MASK; + accessed_dirty = have_ad ? PT_GUEST_ACCESSED_MASK : 0; + + /* + * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging + * by the MOV to CR instruction are treated as reads and do not cause the + * processor to set the dirty flag in any EPT paging-structure entry. + */ + nested_access = (have_ad ? 
PFERR_WRITE_MASK : 0) | PFERR_USER_MASK; + pt_access = pte_access = ACC_ALL; ++walker->level; @@ -332,7 +341,7 @@ retry_walk: walker->pte_gpa[walker->level - 1] = pte_gpa; real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), - PFERR_USER_MASK|PFERR_WRITE_MASK, + nested_access, &walker->fault); /* @@ -394,7 +403,7 @@ retry_walk: walker->gfn = real_gpa >> PAGE_SHIFT; if (!write_fault) - FNAME(protect_clean_gpte)(&pte_access, pte); + FNAME(protect_clean_gpte)(mmu, &pte_access, pte); else /* * On a write fault, fold the dirty bit into accessed_dirty. @@ -485,7 +494,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gfn = gpte_to_gfn(gpte); pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); - FNAME(protect_clean_gpte)(&pte_access, gpte); + FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, no_dirty_log && (pte_access & ACC_WRITE_MASK)); if (is_error_pfn(pfn)) @@ -979,7 +988,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; pte_access &= FNAME(gpte_access)(vcpu, gpte); - FNAME(protect_clean_gpte)(&pte_access, gpte); + FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access, &nr_present)) @@ -1025,3 +1034,4 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef PT_GUEST_DIRTY_MASK #undef PT_GUEST_DIRTY_SHIFT #undef PT_GUEST_ACCESSED_SHIFT +#undef PT_HAVE_ACCESSED_DIRTY diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5f48f62b8dc2..c27ac6923a18 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1196,10 +1196,13 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_CLGI); set_intercept(svm, INTERCEPT_SKINIT); set_intercept(svm, INTERCEPT_WBINVD); - set_intercept(svm, INTERCEPT_MONITOR); - set_intercept(svm, INTERCEPT_MWAIT); set_intercept(svm, INTERCEPT_XSETBV); + 
if (!kvm_mwait_in_guest()) { + set_intercept(svm, INTERCEPT_MONITOR); + set_intercept(svm, INTERCEPT_MWAIT); + } + control->iopm_base_pa = iopm_base; control->msrpm_base_pa = __pa(svm->msrpm); control->int_ctl = V_INTR_MASKING_MASK; @@ -5254,6 +5257,12 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } +static void svm_setup_mce(struct kvm_vcpu *vcpu) +{ + /* [63:9] are reserved. */ + vcpu->arch.mcg_cap &= 0x1ff; +} + static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -5365,6 +5374,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .pmu_ops = &amd_pmu_ops, .deliver_posted_interrupt = svm_deliver_avic_intr, .update_pi_irte = svm_update_pi_irte, + .setup_mce = svm_setup_mce, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1a471e5f963f..c6f4ad44aa95 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -84,9 +84,6 @@ module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); static bool __read_mostly emulate_invalid_guest_state = true; module_param(emulate_invalid_guest_state, bool, S_IRUGO); -static bool __read_mostly vmm_exclusive = 1; -module_param(vmm_exclusive, bool, S_IRUGO); - static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); @@ -251,6 +248,7 @@ struct __packed vmcs12 { u64 xss_exit_bitmap; u64 guest_physical_address; u64 vmcs_link_pointer; + u64 pml_address; u64 guest_ia32_debugctl; u64 guest_ia32_pat; u64 guest_ia32_efer; @@ -372,6 +370,7 @@ struct __packed vmcs12 { u16 guest_ldtr_selector; u16 guest_tr_selector; u16 guest_intr_status; + u16 guest_pml_index; u16 host_es_selector; u16 host_cs_selector; u16 host_ss_selector; @@ -410,6 +409,7 @@ struct nested_vmx { /* Has the level1 guest done vmxon? 
*/ bool vmxon; gpa_t vmxon_ptr; + bool pml_full; /* The guest-physical address of the current VMCS L1 keeps for L2 */ gpa_t current_vmptr; @@ -615,10 +615,6 @@ struct vcpu_vmx { int vpid; bool emulation_required; - /* Support for vnmi-less CPUs */ - int soft_vnmi_blocked; - ktime_t entry_time; - s64 vnmi_blocked_time; u32 exit_reason; /* Posted interrupt descriptor */ @@ -749,6 +745,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), FIELD(GUEST_TR_SELECTOR, guest_tr_selector), FIELD(GUEST_INTR_STATUS, guest_intr_status), + FIELD(GUEST_PML_INDEX, guest_pml_index), FIELD(HOST_ES_SELECTOR, host_es_selector), FIELD(HOST_CS_SELECTOR, host_cs_selector), FIELD(HOST_SS_SELECTOR, host_ss_selector), @@ -774,6 +771,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), + FIELD64(PML_ADDRESS, pml_address), FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), FIELD64(GUEST_IA32_PAT, guest_ia32_pat), FIELD64(GUEST_IA32_EFER, guest_ia32_efer), @@ -914,8 +912,6 @@ static void nested_release_page_clean(struct page *page) static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(unsigned long root_hpa); -static void kvm_cpu_vmxon(u64 addr); -static void kvm_cpu_vmxoff(void); static bool vmx_xsaves_supported(void); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, @@ -1289,11 +1285,6 @@ static inline bool cpu_has_vmx_invpcid(void) SECONDARY_EXEC_ENABLE_INVPCID; } -static inline bool cpu_has_virtual_nmis(void) -{ - return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; -} - static inline bool cpu_has_vmx_wbinvd_exit(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -1328,6 +1319,11 @@ static inline bool report_flexpriority(void) return flexpriority_enabled; 
} +static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu) +{ + return vmx_misc_cr3_count(to_vmx(vcpu)->nested.nested_vmx_misc_low); +} + static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) { return vmcs12->cpu_based_vm_exec_control & bit; @@ -1362,6 +1358,11 @@ static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) vmx_xsaves_supported(); } +static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML); +} + static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); @@ -2238,15 +2239,10 @@ static void decache_tsc_multiplier(struct vcpu_vmx *vmx) static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); bool already_loaded = vmx->loaded_vmcs->cpu == cpu; - if (!vmm_exclusive) - kvm_cpu_vmxon(phys_addr); - else if (!already_loaded) - loaded_vmcs_clear(vmx->loaded_vmcs); - if (!already_loaded) { + loaded_vmcs_clear(vmx->loaded_vmcs); local_irq_disable(); crash_disable_local_vmclear(cpu); @@ -2324,11 +2320,6 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) vmx_vcpu_pi_put(vcpu); __vmx_load_host_state(to_vmx(vcpu)); - if (!vmm_exclusive) { - __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); - vcpu->cpu = -1; - kvm_cpu_vmxoff(); - } } static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); @@ -2752,6 +2743,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_secondary_ctls_high); vmx->nested.nested_vmx_secondary_ctls_low = 0; vmx->nested.nested_vmx_secondary_ctls_high &= + SECONDARY_EXEC_RDRAND | SECONDARY_EXEC_RDSEED | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_DESC | @@ -2766,14 +2758,19 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_secondary_ctls_high |= 
SECONDARY_EXEC_ENABLE_EPT; vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | - VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | - VMX_EPT_INVEPT_BIT; + VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; if (cpu_has_vmx_ept_execute_only()) vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXECUTE_ONLY_BIT; vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept; vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | - VMX_EPT_EXTENT_CONTEXT_BIT; + VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | + VMX_EPT_1GB_PAGE_BIT; + if (enable_ept_ad_bits) { + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_PML; + vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; + } } else vmx->nested.nested_vmx_ept_caps = 0; @@ -3420,6 +3417,7 @@ static __init int vmx_disabled_by_bios(void) static void kvm_cpu_vmxon(u64 addr) { + cr4_set_bits(X86_CR4_VMXE); intel_pt_handle_vmx(1); asm volatile (ASM_VMX_VMXON_RAX @@ -3462,12 +3460,8 @@ static int hardware_enable(void) /* enable and lock */ wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); } - cr4_set_bits(X86_CR4_VMXE); - - if (vmm_exclusive) { - kvm_cpu_vmxon(phys_addr); - ept_sync_global(); - } + kvm_cpu_vmxon(phys_addr); + ept_sync_global(); return 0; } @@ -3491,15 +3485,13 @@ static void kvm_cpu_vmxoff(void) asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); intel_pt_handle_vmx(0); + cr4_clear_bits(X86_CR4_VMXE); } static void hardware_disable(void) { - if (vmm_exclusive) { - vmclear_local_loaded_vmcss(); - kvm_cpu_vmxoff(); - } - cr4_clear_bits(X86_CR4_VMXE); + vmclear_local_loaded_vmcss(); + kvm_cpu_vmxoff(); } static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, @@ -3549,11 +3541,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING | - CPU_BASED_MWAIT_EXITING | - CPU_BASED_MONITOR_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_RDPMC_EXITING; + if (!kvm_mwait_in_guest()) + min |= CPU_BASED_MWAIT_EXITING | + 
CPU_BASED_MONITOR_EXITING; + opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; @@ -3619,9 +3613,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) &_vmexit_control) < 0) return -EIO; - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | - PIN_BASED_VMX_PREEMPTION_TIMER; + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | + PIN_BASED_VIRTUAL_NMIS; + opt = PIN_BASED_POSTED_INTR | PIN_BASED_VMX_PREEMPTION_TIMER; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, &_pin_based_exec_control) < 0) return -EIO; @@ -4013,11 +4007,12 @@ static void exit_lmode(struct kvm_vcpu *vcpu) static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) { - vpid_sync_context(vpid); if (enable_ept) { if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); + } else { + vpid_sync_context(vpid); } } @@ -5293,8 +5288,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx->rmode.vm86_active = 0; - vmx->soft_vnmi_blocked = 0; - vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(vcpu, 0); @@ -5414,8 +5407,7 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) static void enable_nmi_window(struct kvm_vcpu *vcpu) { - if (!cpu_has_virtual_nmis() || - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { + if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { enable_irq_window(vcpu); return; } @@ -5456,19 +5448,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); if (!is_guest_mode(vcpu)) { - if (!cpu_has_virtual_nmis()) { - /* - * Tracking the NMI-blocked state in software is built upon - * finding the next open IRQ window. This, in turn, depends on - * well-behaving guests: They have to keep IRQs disabled at - * least as long as the NMI handler runs. 
Otherwise we may - * cause NMI nesting, maybe breaking the guest. But as this is - * highly unlikely, we can live with the residual risk. - */ - vmx->soft_vnmi_blocked = 1; - vmx->vnmi_blocked_time = 0; - } - ++vcpu->stat.nmi_injections; vmx->nmi_known_unmasked = false; } @@ -5485,8 +5464,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) { - if (!cpu_has_virtual_nmis()) - return to_vmx(vcpu)->soft_vnmi_blocked; if (to_vmx(vcpu)->nmi_known_unmasked) return false; return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; @@ -5496,20 +5473,13 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!cpu_has_virtual_nmis()) { - if (vmx->soft_vnmi_blocked != masked) { - vmx->soft_vnmi_blocked = masked; - vmx->vnmi_blocked_time = 0; - } - } else { - vmx->nmi_known_unmasked = !masked; - if (masked) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - else - vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - } + vmx->nmi_known_unmasked = !masked; + if (masked) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); } static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) @@ -5517,9 +5487,6 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) if (to_vmx(vcpu)->nested.nested_run_pending) return 0; - if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) - return 0; - return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | GUEST_INTR_STATE_NMI)); @@ -6240,21 +6207,18 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) unsigned long exit_qualification; gpa_t gpa; u32 error_code; - int gla_validity; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - gla_validity = (exit_qualification >> 7) & 0x3; - if (gla_validity == 0x2) { - printk(KERN_ERR "EPT: Handling EPT 
violation failed!\n"); - printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", - (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), - vmcs_readl(GUEST_LINEAR_ADDRESS)); - printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", - (long unsigned int)exit_qualification); - vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; - vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; - return 0; + if (is_guest_mode(vcpu) + && !(exit_qualification & EPT_VIOLATION_GVA_TRANSLATED)) { + /* + * Fix up exit_qualification according to whether guest + * page table accesses are reads or writes. + */ + u64 eptp = nested_ept_get_cr3(vcpu); + if (!(eptp & VMX_EPT_AD_ENABLE_BIT)) + exit_qualification &= ~EPT_VIOLATION_ACC_WRITE; } /* @@ -6264,7 +6228,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) * AAK134, BY25. */ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && - cpu_has_virtual_nmis() && (exit_qualification & INTR_INFO_UNBLOCK_NMI)) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); @@ -6350,7 +6313,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (intr_window_requested && vmx_interrupt_allowed(vcpu)) return handle_interrupt_window(&vmx->vcpu); - if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) + if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1; err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); @@ -7103,34 +7066,24 @@ out_msr_bitmap: static int handle_vmon(struct kvm_vcpu *vcpu) { int ret; - struct kvm_segment cs; struct vcpu_vmx *vmx = to_vmx(vcpu); const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; - /* The Intel VMX Instruction Reference lists a bunch of bits that - * are prerequisite to running VMXON, most notably cr4.VMXE must be - * set to 1 (see vmx_set_cr4() for when we allow the guest to set this). - * Otherwise, we should fail with #UD. 
We test these now: + /* + * The Intel VMX Instruction Reference lists a bunch of bits that are + * prerequisite to running VMXON, most notably cr4.VMXE must be set to + * 1 (see vmx_set_cr4() for when we allow the guest to set this). + * Otherwise, we should fail with #UD. But most faulting conditions + * have already been checked by hardware, prior to the VM-exit for + * VMXON. We do test guest cr4.VMXE because processor CR4 always has + * that bit set to 1 in non-root mode. */ - if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) || - !kvm_read_cr0_bits(vcpu, X86_CR0_PE) || - (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { + if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); - if (is_long_mode(vcpu) && !cs.l) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - if (vmx_get_cpl(vcpu)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - if (vmx->nested.vmxon) { nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); return kvm_skip_emulated_instruction(vcpu); @@ -7157,29 +7110,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu) * Intel's VMX Instruction Reference specifies a common set of prerequisites * for running VMX instructions (except VMXON, whose prerequisites are * slightly different). It also specifies what exception to inject otherwise. + * Note that many of these exceptions have priority over VM exits, so they + * don't have to be checked again here. 
*/ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) { - struct kvm_segment cs; - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!vmx->nested.vmxon) { + if (!to_vmx(vcpu)->nested.vmxon) { kvm_queue_exception(vcpu, UD_VECTOR); return 0; } - - vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); - if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) || - (is_long_mode(vcpu) && !cs.l)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 0; - } - - if (vmx_get_cpl(vcpu)) { - kvm_inject_gp(vcpu, 0); - return 0; - } - return 1; } @@ -7523,7 +7462,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) if (get_vmx_mem_address(vcpu, exit_qualification, vmx_instruction_info, true, &gva)) return 1; - /* _system ok, as nested_vmx_check_permission verified cpl=0 */ + /* _system ok, as hardware has verified cpl=0 */ kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva, &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL); } @@ -7656,7 +7595,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) if (get_vmx_mem_address(vcpu, exit_qualification, vmx_instruction_info, true, &vmcs_gva)) return 1; - /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */ + /* ok to use *_system, as hardware has verified cpl=0 */ if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva, (void *)&to_vmx(vcpu)->nested.current_vmptr, sizeof(u64), &e)) { @@ -7689,11 +7628,6 @@ static int handle_invept(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); @@ -7815,7 +7749,6 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) * "blocked by NMI" bit has to be set before next VM entry. 
*/ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && - cpu_has_virtual_nmis() && (exit_qualification & INTR_INFO_UNBLOCK_NMI)) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); @@ -8117,6 +8050,10 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); case EXIT_REASON_RDPMC: return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); + case EXIT_REASON_RDRAND: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND); + case EXIT_REASON_RDSEED: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED); case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: @@ -8195,7 +8132,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_PREEMPTION_TIMER: return false; case EXIT_REASON_PML_FULL: - /* We don't expose PML support to L1. */ + /* We emulate PML support to L1. */ return false; default: return true; @@ -8490,26 +8427,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) return 0; } - if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && - !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( - get_vmcs12(vcpu))))) { - if (vmx_interrupt_allowed(vcpu)) { - vmx->soft_vnmi_blocked = 0; - } else if (vmx->vnmi_blocked_time > 1000000000LL && - vcpu->arch.nmi_pending) { - /* - * This CPU don't support us in finding the end of an - * NMI-blocked window if the guest runs with IRQs - * disabled. So we pull the trigger after 1 s of - * futile waiting, but inform the user about this. 
- */ - printk(KERN_WARNING "%s: Breaking out of NMI-blocked " - "state on VCPU %d after 1 s timeout\n", - __func__, vcpu->vcpu_id); - vmx->soft_vnmi_blocked = 0; - } - } - if (exit_reason < kvm_vmx_max_exit_handlers && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu); @@ -8785,37 +8702,33 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; - if (cpu_has_virtual_nmis()) { - if (vmx->nmi_known_unmasked) - return; - /* - * Can't use vmx->exit_intr_info since we're not sure what - * the exit reason is. - */ - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; - vector = exit_intr_info & INTR_INFO_VECTOR_MASK; - /* - * SDM 3: 27.7.1.2 (September 2008) - * Re-set bit "block by NMI" before VM entry if vmexit caused by - * a guest IRET fault. - * SDM 3: 23.2.2 (September 2008) - * Bit 12 is undefined in any of the following cases: - * If the VM exit sets the valid bit in the IDT-vectoring - * information field. - * If the VM exit is due to a double fault. - */ - if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && - vector != DF_VECTOR && !idtv_info_valid) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - else - vmx->nmi_known_unmasked = - !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) - & GUEST_INTR_STATE_NMI); - } else if (unlikely(vmx->soft_vnmi_blocked)) - vmx->vnmi_blocked_time += - ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); + if (vmx->nmi_known_unmasked) + return; + /* + * Can't use vmx->exit_intr_info since we're not sure what + * the exit reason is. + */ + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + /* + * SDM 3: 27.7.1.2 (September 2008) + * Re-set bit "block by NMI" before VM entry if vmexit caused by + * a guest IRET fault. 
+ * SDM 3: 23.2.2 (September 2008) + * Bit 12 is undefined in any of the following cases: + * If the VM exit sets the valid bit in the IDT-vectoring + * information field. + * If the VM exit is due to a double fault. + */ + if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && + vector != DF_VECTOR && !idtv_info_valid) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmx->nmi_known_unmasked = + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) + & GUEST_INTR_STATE_NMI); } static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, @@ -8932,10 +8845,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long debugctlmsr, cr4; - /* Record the guest's net vcpu time for enforced NMI injections. */ - if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) - vmx->entry_time = ktime_get(); - /* Don't enter VMX if guest state is invalid, let the exit handler start emulation until we arrive back to a valid state */ if (vmx->emulation_required) @@ -9143,16 +9052,16 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_complete_interrupts(vmx); } -static void vmx_load_vmcs01(struct kvm_vcpu *vcpu) +static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) { struct vcpu_vmx *vmx = to_vmx(vcpu); int cpu; - if (vmx->loaded_vmcs == &vmx->vmcs01) + if (vmx->loaded_vmcs == vmcs) return; cpu = get_cpu(); - vmx->loaded_vmcs = &vmx->vmcs01; + vmx->loaded_vmcs = vmcs; vmx_vcpu_put(vcpu); vmx_vcpu_load(vcpu, cpu); vcpu->cpu = cpu; @@ -9170,7 +9079,7 @@ static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu) r = vcpu_load(vcpu); BUG_ON(r); - vmx_load_vmcs01(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); free_nested(vmx); vcpu_put(vcpu); } @@ -9231,11 +9140,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->loaded_vmcs->shadow_vmcs = NULL; if (!vmx->loaded_vmcs->vmcs) goto free_msrs; - if (!vmm_exclusive) - 
kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id()))); loaded_vmcs_init(vmx->loaded_vmcs); - if (!vmm_exclusive) - kvm_cpu_vmxoff(); cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); @@ -9477,13 +9382,20 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exit_reason; + unsigned long exit_qualification = vcpu->arch.exit_qualification; - if (fault->error_code & PFERR_RSVD_MASK) + if (vmx->nested.pml_full) { + exit_reason = EXIT_REASON_PML_FULL; + vmx->nested.pml_full = false; + exit_qualification &= INTR_INFO_UNBLOCK_NMI; + } else if (fault->error_code & PFERR_RSVD_MASK) exit_reason = EXIT_REASON_EPT_MISCONFIG; else exit_reason = EXIT_REASON_EPT_VIOLATION; - nested_vmx_vmexit(vcpu, exit_reason, 0, vcpu->arch.exit_qualification); + + nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification); vmcs12->guest_physical_address = fault->address; } @@ -9495,17 +9407,26 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) return get_vmcs12(vcpu)->ept_pointer; } -static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) +static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { + u64 eptp; + WARN_ON(mmu_is_nested(vcpu)); + eptp = nested_ept_get_cr3(vcpu); + if ((eptp & VMX_EPT_AD_ENABLE_BIT) && !enable_ept_ad_bits) + return 1; + + kvm_mmu_unload(vcpu); kvm_init_shadow_ept_mmu(vcpu, to_vmx(vcpu)->nested.nested_vmx_ept_caps & - VMX_EPT_EXECUTE_ONLY_BIT); + VMX_EPT_EXECUTE_ONLY_BIT, + eptp & VMX_EPT_AD_ENABLE_BIT); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; + return 0; } static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) @@ -9817,6 +9738,22 @@ static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, return 0; } +static int 
nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + u64 address = vmcs12->pml_address; + int maxphyaddr = cpuid_maxphyaddr(vcpu); + + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) { + if (!nested_cpu_has_ept(vmcs12) || + !IS_ALIGNED(address, 4096) || + address >> maxphyaddr) + return -EINVAL; + } + + return 0; +} + static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { @@ -9990,7 +9927,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, bool from_vmentry, u32 *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exec_control; + u32 exec_control, vmcs12_exec_ctrl; vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); @@ -10121,8 +10058,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT); if (nested_cpu_has(vmcs12, - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) - exec_control |= vmcs12->secondary_vm_exec_control; + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { + vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & + ~SECONDARY_EXEC_ENABLE_PML; + exec_control |= vmcs12_exec_ctrl; + } if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { vmcs_write64(EOI_EXIT_BITMAP0, @@ -10279,8 +10219,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } if (nested_cpu_has_ept(vmcs12)) { - kvm_mmu_unload(vcpu); - nested_ept_init_mmu_context(vcpu); + if (nested_ept_init_mmu_context(vcpu)) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; + return 1; + } } else if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { vmx_flush_tlb_ept_only(vcpu); @@ -10350,12 +10292,16 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if 
(nested_vmx_check_pml_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, vmx->nested.nested_vmx_procbased_ctls_low, vmx->nested.nested_vmx_procbased_ctls_high) || - !vmx_control_verify(vmcs12->secondary_vm_exec_control, - vmx->nested.nested_vmx_secondary_ctls_low, - vmx->nested.nested_vmx_secondary_ctls_high) || + (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && + !vmx_control_verify(vmcs12->secondary_vm_exec_control, + vmx->nested.nested_vmx_secondary_ctls_low, + vmx->nested.nested_vmx_secondary_ctls_high)) || !vmx_control_verify(vmcs12->pin_based_vm_exec_control, vmx->nested.nested_vmx_pinbased_ctls_low, vmx->nested.nested_vmx_pinbased_ctls_high) || @@ -10367,6 +10313,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx->nested.nested_vmx_entry_ctls_high)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || !nested_cr3_valid(vcpu, vmcs12->host_cr3)) @@ -10434,7 +10383,6 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct loaded_vmcs *vmcs02; - int cpu; u32 msr_entry_idx; u32 exit_qual; @@ -10447,18 +10395,12 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); - cpu = get_cpu(); - vmx->loaded_vmcs = vmcs02; - vmx_vcpu_put(vcpu); - vmx_vcpu_load(vcpu, cpu); - vcpu->cpu = cpu; - put_cpu(); - + vmx_switch_vmcs(vcpu, vmcs02); vmx_segment_cache_clear(vmx); if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) { leave_guest_mode(vcpu); - vmx_load_vmcs01(vcpu); + 
vmx_switch_vmcs(vcpu, &vmx->vmcs01); nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, exit_qual); return 1; @@ -10471,7 +10413,7 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) vmcs12->vm_entry_msr_load_count); if (msr_entry_idx) { leave_guest_mode(vcpu); - vmx_load_vmcs01(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx); return 1; @@ -11039,7 +10981,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, if (unlikely(vmx->fail)) vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR); - vmx_load_vmcs01(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) && nested_exit_intr_ack_set(vcpu)) { @@ -11251,6 +11193,46 @@ static void vmx_flush_log_dirty(struct kvm *kvm) kvm_flush_pml_buffers(kvm); } +static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12; + struct vcpu_vmx *vmx = to_vmx(vcpu); + gpa_t gpa; + struct page *page = NULL; + u64 *pml_address; + + if (is_guest_mode(vcpu)) { + WARN_ON_ONCE(vmx->nested.pml_full); + + /* + * Check if PML is enabled for the nested guest. + * Whether eptp bit 6 is set is already checked + * as part of A/D emulation. 
+ */ + vmcs12 = get_vmcs12(vcpu); + if (!nested_cpu_has_pml(vmcs12)) + return 0; + + if (vmcs12->guest_pml_index > PML_ENTITY_NUM) { + vmx->nested.pml_full = true; + return 1; + } + + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull; + + page = nested_get_page(vcpu, vmcs12->pml_address); + if (!page) + return 0; + + pml_address = kmap(page); + pml_address[vmcs12->guest_pml_index--] = gpa; + kunmap(page); + nested_release_page_clean(page); + } + + return 0; +} + static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t offset, unsigned long mask) @@ -11610,6 +11592,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .slot_disable_log_dirty = vmx_slot_disable_log_dirty, .flush_log_dirty = vmx_flush_log_dirty, .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, + .write_log_dirty = vmx_write_pml_buffer, .pre_block = vmx_pre_block, .post_block = vmx_post_block, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ccbd45ecd41a..464da936c53d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -27,7 +27,6 @@ #include "kvm_cache_regs.h" #include "x86.h" #include "cpuid.h" -#include "assigned-dev.h" #include "pmu.h" #include "hyperv.h" @@ -1008,6 +1007,8 @@ static u32 emulated_msrs[] = { MSR_IA32_MCG_CTL, MSR_IA32_MCG_EXT_CTL, MSR_IA32_SMBASE, + MSR_PLATFORM_INFO, + MSR_MISC_FEATURES_ENABLES, }; static unsigned num_emulated_msrs; @@ -1444,10 +1445,10 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; - s64 usdiff; bool matched; bool already_matched; u64 data = msr->data; + bool synchronizing = false; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_compute_tsc_offset(vcpu, data); @@ -1455,51 +1456,34 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { - int faulted = 0; - - /* n.b - signed 
multiplication and division required */ - usdiff = data - kvm->arch.last_tsc_write; -#ifdef CONFIG_X86_64 - usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; -#else - /* do_div() only does unsigned */ - asm("1: idivl %[divisor]\n" - "2: xor %%edx, %%edx\n" - " movl $0, %[faulted]\n" - "3:\n" - ".section .fixup,\"ax\"\n" - "4: movl $1, %[faulted]\n" - " jmp 3b\n" - ".previous\n" - - _ASM_EXTABLE(1b, 4b) - - : "=A"(usdiff), [faulted] "=r" (faulted) - : "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz)); - -#endif - do_div(elapsed, 1000); - usdiff -= elapsed; - if (usdiff < 0) - usdiff = -usdiff; - - /* idivl overflow => difference is larger than USEC_PER_SEC */ - if (faulted) - usdiff = USEC_PER_SEC; - } else - usdiff = USEC_PER_SEC; /* disable TSC match window below */ + if (data == 0 && msr->host_initiated) { + /* + * detection of vcpu initialization -- need to sync + * with other vCPUs. This particularly helps to keep + * kvm_clock stable after CPU hotplug + */ + synchronizing = true; + } else { + u64 tsc_exp = kvm->arch.last_tsc_write + + nsec_to_cycles(vcpu, elapsed); + u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL; + /* + * Special case: TSC write with a small delta (1 second) + * of virtual cycle time against real time is + * interpreted as an attempt to synchronize the CPU. + */ + synchronizing = data < tsc_exp + tsc_hz && + data + tsc_hz > tsc_exp; + } + } /* - * Special case: TSC write with a small delta (1 second) of virtual - * cycle time against real time is interpreted as an attempt to - * synchronize the CPU. - * * For a reliable TSC, we can match TSC offsets, and for an unstable * TSC, we add elapsed time in this computation. We could let the * compensation code attempt to catch up if we fall behind, but * it's better to try to match offsets from the beginning. 
*/ - if (usdiff < USEC_PER_SEC && + if (synchronizing && vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { if (!check_tsc_unstable()) { offset = kvm->arch.cur_tsc_offset; @@ -1769,13 +1753,13 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) /* guest entries allowed */ kvm_for_each_vcpu(i, vcpu, kvm) - clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); + kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); spin_unlock(&ka->pvclock_gtod_sync_lock); #endif } -static u64 __get_kvmclock_ns(struct kvm *kvm) +u64 get_kvmclock_ns(struct kvm *kvm) { struct kvm_arch *ka = &kvm->arch; struct pvclock_vcpu_time_info hv_clock; @@ -1796,24 +1780,12 @@ static u64 __get_kvmclock_ns(struct kvm *kvm) return __pvclock_read_cycles(&hv_clock, rdtsc()); } -u64 get_kvmclock_ns(struct kvm *kvm) -{ - unsigned long flags; - s64 ns; - - local_irq_save(flags); - ns = __get_kvmclock_ns(kvm); - local_irq_restore(flags); - - return ns; -} - static void kvm_setup_pvclock_page(struct kvm_vcpu *v) { struct kvm_vcpu_arch *vcpu = &v->arch; struct pvclock_vcpu_time_info guest_hv_clock; - if (unlikely(kvm_vcpu_read_guest_cached(v, &vcpu->pv_time, + if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, &guest_hv_clock, sizeof(guest_hv_clock)))) return; @@ -1834,9 +1806,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); vcpu->hv_clock.version = guest_hv_clock.version + 1; - kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); smp_wmb(); @@ -1850,16 +1822,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); - kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); 
smp_wmb(); vcpu->hv_clock.version++; - kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); } static int kvm_guest_time_update(struct kvm_vcpu *v) @@ -2092,7 +2064,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 0; } - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.apf.data, gpa, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, sizeof(u32))) return 1; @@ -2111,7 +2083,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - if (unlikely(kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.st.stime, + if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) return; @@ -2122,7 +2094,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) vcpu->arch.st.steal.version += 1; - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); smp_wmb(); @@ -2131,14 +2103,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu) vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); smp_wmb(); vcpu->arch.st.steal.version += 1; - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); } @@ -2155,6 +2127,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_VM_HSAVE_PA: case MSR_AMD64_PATCH_LOADER: case MSR_AMD64_BU_CFG2: + case MSR_AMD64_DC_CFG: break; case MSR_EFER: @@ -2230,8 +2203,7 @@ int 
kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) bool tmp = (msr == MSR_KVM_SYSTEM_TIME); if (ka->boot_vcpu_runs_old_kvmclock != tmp) - set_bit(KVM_REQ_MASTERCLOCK_UPDATE, - &vcpu->requests); + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); ka->boot_vcpu_runs_old_kvmclock = tmp; } @@ -2243,7 +2215,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & 1)) break; - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_time, data & ~1ULL, sizeof(struct pvclock_vcpu_time_info))) vcpu->arch.pv_time_enabled = false; @@ -2264,7 +2236,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & KVM_STEAL_RESERVED_MASK) return 1; - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.st.stime, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, data & KVM_STEAL_VALID_BITS, sizeof(struct kvm_steal_time))) return 1; @@ -2331,6 +2303,21 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.osvw.status = data; break; + case MSR_PLATFORM_INFO: + if (!msr_info->host_initiated || + data & ~MSR_PLATFORM_INFO_CPUID_FAULT || + (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) && + cpuid_fault_enabled(vcpu))) + return 1; + vcpu->arch.msr_platform_info = data; + break; + case MSR_MISC_FEATURES_ENABLES: + if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT || + (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT && + !supports_cpuid_fault(vcpu))) + return 1; + vcpu->arch.msr_misc_features_enables = data; + break; default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -2417,6 +2404,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_FAM10H_MMIO_CONF_BASE: case MSR_AMD64_BU_CFG2: case MSR_IA32_PERF_CTL: + case MSR_AMD64_DC_CFG: msr_info->data = 0; break; case MSR_K7_EVNTSEL0 ... 
MSR_K7_EVNTSEL3: @@ -2545,6 +2533,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vcpu->arch.osvw.status; break; + case MSR_PLATFORM_INFO: + msr_info->data = vcpu->arch.msr_platform_info; + break; + case MSR_MISC_FEATURES_ENABLES: + msr_info->data = vcpu->arch.msr_misc_features_enables; + break; default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); @@ -2675,15 +2669,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SET_BOOT_CPU_ID: case KVM_CAP_SPLIT_IRQCHIP: case KVM_CAP_IMMEDIATE_EXIT: -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT - case KVM_CAP_ASSIGN_DEV_IRQ: - case KVM_CAP_PCI_2_3: -#endif r = 1; break; case KVM_CAP_ADJUST_CLOCK: r = KVM_CLOCK_TSC_STABLE; break; + case KVM_CAP_X86_GUEST_MWAIT: + r = kvm_mwait_in_guest(); + break; case KVM_CAP_X86_SMM: /* SMBASE is usually relocated above 1M on modern chipsets, * and SMM handlers might indeed rely on 4G segment limits, @@ -2695,9 +2688,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) */ r = kvm_x86_ops->cpu_has_high_real_mode_segbase(); break; - case KVM_CAP_COALESCED_MMIO: - r = KVM_COALESCED_MMIO_PAGE_OFFSET; - break; case KVM_CAP_VAPIC: r = !kvm_x86_ops->cpu_has_accelerated_tpr(); break; @@ -2713,11 +2703,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PV_MMU: /* obsolete */ r = 0; break; -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT - case KVM_CAP_IOMMU: - r = iommu_present(&pci_bus_type); - break; -#endif case KVM_CAP_MCE: r = KVM_MAX_MCE_BANKS; break; @@ -2816,11 +2801,6 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) return kvm_arch_has_noncoherent_dma(vcpu->kvm); } -static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu) -{ - set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); -} - void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { /* Address WBINVD may be executed by guest */ @@ -2864,7 +2844,7 @@ void 
kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); if (vcpu->cpu != cpu) - kvm_migrate_timers(vcpu); + kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu); vcpu->cpu = cpu; } @@ -2878,7 +2858,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) vcpu->arch.st.steal.preempted = 1; - kvm_vcpu_write_guest_offset_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal.preempted, offsetof(struct kvm_steal_time, preempted), sizeof(vcpu->arch.st.steal.preempted)); @@ -3124,7 +3104,14 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, return -EINVAL; if (events->exception.injected && - (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR)) + (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR || + is_guest_mode(vcpu))) + return -EINVAL; + + /* INITs are latched while in SMM */ + if (events->flags & KVM_VCPUEVENT_VALID_SMM && + (events->smi.smm || events->smi.pending) && + vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) return -EINVAL; process_nmi(vcpu); @@ -3721,22 +3708,21 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { + struct kvm_pic *pic = kvm->arch.vpic; int r; r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - memcpy(&chip->chip.pic, - &pic_irqchip(kvm)->pics[0], + memcpy(&chip->chip.pic, &pic->pics[0], sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_PIC_SLAVE: - memcpy(&chip->chip.pic, - &pic_irqchip(kvm)->pics[1], + memcpy(&chip->chip.pic, &pic->pics[1], sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_IOAPIC: - r = kvm_get_ioapic(kvm, &chip->chip.ioapic); + kvm_get_ioapic(kvm, &chip->chip.ioapic); break; default: r = -EINVAL; @@ -3747,32 +3733,31 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct 
kvm_irqchip *chip) static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { + struct kvm_pic *pic = kvm->arch.vpic; int r; r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - spin_lock(&pic_irqchip(kvm)->lock); - memcpy(&pic_irqchip(kvm)->pics[0], - &chip->chip.pic, + spin_lock(&pic->lock); + memcpy(&pic->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + spin_unlock(&pic->lock); break; case KVM_IRQCHIP_PIC_SLAVE: - spin_lock(&pic_irqchip(kvm)->lock); - memcpy(&pic_irqchip(kvm)->pics[1], - &chip->chip.pic, + spin_lock(&pic->lock); + memcpy(&pic->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + spin_unlock(&pic->lock); break; case KVM_IRQCHIP_IOAPIC: - r = kvm_set_ioapic(kvm, &chip->chip.ioapic); + kvm_set_ioapic(kvm, &chip->chip.ioapic); break; default: r = -EINVAL; break; } - kvm_pic_update_irq(pic_irqchip(kvm)); + kvm_pic_update_irq(pic); return r; } @@ -4018,20 +4003,14 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_ioapic_init(kvm); if (r) { - mutex_lock(&kvm->slots_lock); kvm_pic_destroy(kvm); - mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; } r = kvm_setup_default_irq_routing(kvm); if (r) { - mutex_lock(&kvm->slots_lock); - mutex_lock(&kvm->irq_lock); kvm_ioapic_destroy(kvm); kvm_pic_destroy(kvm); - mutex_unlock(&kvm->irq_lock); - mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; } /* Write kvm->irq_routing before enabling irqchip_in_kernel. 
*/ @@ -4196,10 +4175,8 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = 0; - local_irq_disable(); - now_ns = __get_kvmclock_ns(kvm); + now_ns = get_kvmclock_ns(kvm); kvm->arch.kvmclock_offset += user_ns.clock - now_ns; - local_irq_enable(); kvm_gen_update_masterclock(kvm); break; } @@ -4207,11 +4184,9 @@ long kvm_arch_vm_ioctl(struct file *filp, struct kvm_clock_data user_ns; u64 now_ns; - local_irq_disable(); - now_ns = __get_kvmclock_ns(kvm); + now_ns = get_kvmclock_ns(kvm); user_ns.clock = now_ns; user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0; - local_irq_enable(); memset(&user_ns.pad, 0, sizeof(user_ns.pad)); r = -EFAULT; @@ -4230,7 +4205,7 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } default: - r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); + r = -ENOTTY; } out: return r; @@ -5223,6 +5198,16 @@ static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked); } +static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) +{ + return emul_to_vcpu(ctxt)->arch.hflags; +} + +static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags) +{ + kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags); +} + static const struct x86_emulate_ops emulate_ops = { .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, @@ -5262,6 +5247,8 @@ static const struct x86_emulate_ops emulate_ops = { .intercept = emulator_intercept, .get_cpuid = emulator_get_cpuid, .set_nmi_mask = emulator_set_nmi_mask, + .get_hflags = emulator_get_hflags, + .set_hflags = emulator_set_hflags, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) @@ -5314,7 +5301,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK); BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); - ctxt->emul_flags = vcpu->arch.hflags; 
init_decode_cache(ctxt); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; @@ -5718,8 +5704,6 @@ restart: unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; - if (vcpu->arch.hflags != ctxt->emul_flags) - kvm_set_hflags(vcpu, ctxt->emul_flags); kvm_rip_write(vcpu, ctxt->eip); if (r == EMULATE_DONE) kvm_vcpu_check_singlestep(vcpu, rflags, &r); @@ -6869,7 +6853,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* * 1) We should set ->mode before checking ->requests. Please see - * the comment in kvm_make_all_cpus_request. + * the comment in kvm_vcpu_exiting_guest_mode(). * * 2) For APICv, we should set ->mode before checking PIR.ON. This * pairs with the memory barrier implicit in pi_test_and_set_on @@ -7051,7 +7035,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) if (r <= 0) break; - clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu); if (kvm_cpu_has_pending_timer(vcpu)) kvm_inject_pending_timer_irqs(vcpu); @@ -7179,7 +7163,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { kvm_vcpu_block(vcpu); kvm_apic_accept_events(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); r = -EAGAIN; goto out; } @@ -7355,6 +7339,12 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, mp_state->mp_state != KVM_MP_STATE_RUNNABLE) return -EINVAL; + /* INITs are latched while in SMM */ + if ((is_smm(vcpu) || vcpu->arch.smi_pending) && + (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED || + mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED)) + return -EINVAL; + if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); @@ -7724,6 +7714,9 @@ void kvm_vcpu_reset(struct kvm_vcpu 
*vcpu, bool init_event) if (!init_event) { kvm_pmu_reset(vcpu); vcpu->arch.smbase = 0x30000; + + vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; + vcpu->arch.msr_misc_features_enables = 0; } memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); @@ -8068,7 +8061,6 @@ void kvm_arch_sync_events(struct kvm *kvm) { cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); - kvm_free_all_assigned_devices(kvm); kvm_free_pit(kvm); } @@ -8152,7 +8144,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm) } if (kvm_x86_ops->vm_destroy) kvm_x86_ops->vm_destroy(kvm); - kvm_iommu_unmap_guest(kvm); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); @@ -8199,13 +8190,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, slot->base_gfn, level) + 1; slot->arch.rmap[i] = - kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i])); + kvzalloc(lpages * sizeof(*slot->arch.rmap[i]), GFP_KERNEL); if (!slot->arch.rmap[i]) goto out_free; if (i == 0) continue; - linfo = kvm_kvzalloc(lpages * sizeof(*linfo)); + linfo = kvzalloc(lpages * sizeof(*linfo), GFP_KERNEL); if (!linfo) goto out_free; @@ -8385,7 +8376,7 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (atomic_read(&vcpu->arch.nmi_queued)) return true; - if (test_bit(KVM_REQ_SMI, &vcpu->requests)) + if (kvm_test_request(KVM_REQ_SMI, vcpu)) return true; if (kvm_arch_interrupt_allowed(vcpu) && @@ -8536,8 +8527,9 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) { - return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apf.data, &val, - sizeof(val)); + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, + sizeof(val)); } void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index e8ff3e4ce38a..612067074905 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -1,6 
+1,8 @@ #ifndef ARCH_X86_KVM_X86_H #define ARCH_X86_KVM_X86_H +#include <asm/processor.h> +#include <asm/mwait.h> #include <linux/kvm_host.h> #include <asm/pvclock.h> #include "kvm_cache_regs.h" @@ -212,4 +214,38 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) __rem; \ }) +static inline bool kvm_mwait_in_guest(void) +{ + unsigned int eax, ebx, ecx, edx; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_MWAIT)) + return false; + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + /* All AMD CPUs have a working MWAIT implementation */ + return true; + case X86_VENDOR_INTEL: + /* Handle Intel below */ + break; + default: + return false; + } + + /* + * Intel CPUs without CPUID5_ECX_INTERRUPT_BREAK are problematic as + * they would allow guest to stop the CPU completely by disabling + * interrupts then invoking MWAIT. + */ + if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return false; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + + if (!(ecx & CPUID5_ECX_INTERRUPT_BREAK)) + return false; + + return true; +} + #endif diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S index 7e48807b2fa1..45a53dfe1859 100644 --- a/arch/x86/lib/csum-copy_64.S +++ b/arch/x86/lib/csum-copy_64.S @@ -55,7 +55,7 @@ ENTRY(csum_partial_copy_generic) movq %r12, 3*8(%rsp) movq %r14, 4*8(%rsp) movq %r13, 5*8(%rsp) - movq %rbp, 6*8(%rsp) + movq %r15, 6*8(%rsp) movq %r8, (%rsp) movq %r9, 1*8(%rsp) @@ -74,7 +74,7 @@ ENTRY(csum_partial_copy_generic) /* main loop. 
clear in 64 byte blocks */ /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */ /* r11: temp3, rdx: temp4, r12 loopcnt */ - /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */ + /* r10: temp5, r15: temp6, r14 temp7, r13 temp8 */ .p2align 4 .Lloop: source @@ -89,7 +89,7 @@ ENTRY(csum_partial_copy_generic) source movq 32(%rdi), %r10 source - movq 40(%rdi), %rbp + movq 40(%rdi), %r15 source movq 48(%rdi), %r14 source @@ -103,7 +103,7 @@ ENTRY(csum_partial_copy_generic) adcq %r11, %rax adcq %rdx, %rax adcq %r10, %rax - adcq %rbp, %rax + adcq %r15, %rax adcq %r14, %rax adcq %r13, %rax @@ -121,7 +121,7 @@ ENTRY(csum_partial_copy_generic) dest movq %r10, 32(%rsi) dest - movq %rbp, 40(%rsi) + movq %r15, 40(%rsi) dest movq %r14, 48(%rsi) dest @@ -203,7 +203,7 @@ ENTRY(csum_partial_copy_generic) movq 3*8(%rsp), %r12 movq 4*8(%rsp), %r14 movq 5*8(%rsp), %r13 - movq 6*8(%rsp), %rbp + movq 6*8(%rsp), %r15 addq $7*8, %rsp ret diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c index 5761a4f19455..ab2d1d73e9e7 100644 --- a/arch/x86/lib/kaslr.c +++ b/arch/x86/lib/kaslr.c @@ -5,6 +5,7 @@ * kernel starts. This file is included in the compressed kernel and * normally linked in the regular. 
*/ +#include <asm/asm.h> #include <asm/kaslr.h> #include <asm/msr.h> #include <asm/archrandom.h> @@ -79,7 +80,7 @@ unsigned long kaslr_get_random_long(const char *purpose) } /* Circular multiply for better bit diffusion */ - asm("mul %3" + asm(_ASM_MUL "%3" : "=a" (random), "=d" (raw) : "a" (random), "rm" (mix_const)); random += raw; diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index 04210a29dd60..adab1595f4bd 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -13,7 +13,7 @@ static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page, if (pmd_present(*pmd)) continue; - set_pmd(pmd, __pmd((addr - info->offset) | info->pmd_flag)); + set_pmd(pmd, __pmd((addr - info->offset) | info->page_flag)); } } @@ -30,6 +30,18 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, if (next > end) next = end; + if (info->direct_gbpages) { + pud_t pudval; + + if (pud_present(*pud)) + continue; + + addr &= PUD_MASK; + pudval = __pud((addr - info->offset) | info->page_flag); + set_pud(pud, pudval); + continue; + } + if (pud_present(*pud)) { pmd = pmd_offset(pud, 0); ident_pmd_init(info, pmd, addr, next); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 138bad2fb6bc..cbc87ea98751 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -5,7 +5,7 @@ #include <linux/memblock.h> #include <linux/bootmem.h> /* for max_low_pfn */ -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/e820/api.h> #include <asm/init.h> #include <asm/page.h> diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index f34d275ee201..99fb83819a5f 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -48,7 +48,7 @@ #include <asm/sections.h> #include <asm/paravirt.h> #include <asm/setup.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/page_types.h> #include <asm/init.h> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 745e5e183169..95651dc58e09 100644 --- 
a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -50,7 +50,7 @@ #include <asm/sections.h> #include <asm/kdebug.h> #include <asm/numa.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/init.h> #include <asm/uv/uv.h> #include <asm/setup.h> @@ -94,10 +94,10 @@ __setup("noexec32=", nonx32_setup); */ void sync_global_pgds(unsigned long start, unsigned long end) { - unsigned long address; + unsigned long addr; - for (address = start; address <= end; address += PGDIR_SIZE) { - pgd_t *pgd_ref = pgd_offset_k(address); + for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) { + pgd_t *pgd_ref = pgd_offset_k(addr); const p4d_t *p4d_ref; struct page *page; @@ -106,7 +106,7 @@ void sync_global_pgds(unsigned long start, unsigned long end) * handle synchonization on p4d level. */ BUILD_BUG_ON(pgd_none(*pgd_ref)); - p4d_ref = p4d_offset(pgd_ref, address); + p4d_ref = p4d_offset(pgd_ref, addr); if (p4d_none(*p4d_ref)) continue; @@ -117,8 +117,8 @@ void sync_global_pgds(unsigned long start, unsigned long end) p4d_t *p4d; spinlock_t *pgt_lock; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - p4d = p4d_offset(pgd, address); + pgd = (pgd_t *)page_address(page) + pgd_index(addr); + p4d = p4d_offset(pgd, addr); /* the pgt_lock only for Xen */ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index e4f7b25df18e..bbc558b88a88 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -14,7 +14,7 @@ #include <linux/vmalloc.h> #include <linux/mmiotrace.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/e820/api.h> #include <asm/fixmap.h> #include <asm/pgtable.h> diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 6b7ce6279133..aca6295350f3 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -100,5 +100,6 @@ void __init initmem_init(void) printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", 
(ulong) pfn_to_kaddr(highstart_pfn)); + __vmalloc_start_set = true; setup_bootmem_allocator(); } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 56b22fa504df..1dcd2be4cce4 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -24,6 +24,7 @@ #include <asm/pgalloc.h> #include <asm/proto.h> #include <asm/pat.h> +#include <asm/set_memory.h> /* * The current flushing context - we pass it instead of 5 arguments: diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index 38868adf07ea..f6ae6830b341 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -9,7 +9,7 @@ #include <linux/mmiotrace.h> static unsigned long mmio_address; -module_param(mmio_address, ulong, 0); +module_param_hw(mmio_address, ulong, iomem, 0); MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB " "(or 8 MB if read_far is non-zero)."); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 32322ce9b405..f58939393eef 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -12,6 +12,7 @@ #include <linux/filter.h> #include <linux/if_vlan.h> #include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <linux/bpf.h> int bpf_jit_enable __read_mostly; @@ -490,13 +491,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, break; case BPF_LD | BPF_IMM | BPF_DW: - if (insn[1].code != 0 || insn[1].src_reg != 0 || - insn[1].dst_reg != 0 || insn[1].off != 0) { - /* verifier must catch invalid insns */ - pr_err("invalid BPF_LD_IMM64 insn\n"); - return -EINVAL; - } - /* optimization: if imm64 is zero, use 'xor <dst>,<dst>' * to save 7 bytes. 
*/ diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 6fa84d531f4f..7b4307163eac 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -406,50 +406,3 @@ void __init pcibios_resource_survey(void) */ ioapic_insert_resources(); } - -static const struct vm_operations_struct pci_mmap_ops = { - .access = generic_access_phys, -}; - -int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, int write_combine) -{ - unsigned long prot; - - /* I/O space cannot be accessed via normal processor loads and - * stores on this platform. - */ - if (mmap_state == pci_mmap_io) - return -EINVAL; - - prot = pgprot_val(vma->vm_page_prot); - - /* - * Return error if pat is not enabled and write_combine is requested. - * Caller can followup with UC MINUS request and add a WC mtrr if there - * is a free mtrr slot. - */ - if (!pat_enabled() && write_combine) - return -EINVAL; - - if (pat_enabled() && write_combine) - prot |= cachemode2protval(_PAGE_CACHE_MODE_WC); - else if (pat_enabled() || boot_cpu_data.x86 > 3) - /* - * ioremap() and ioremap_nocache() defaults to UC MINUS for now. - * To avoid attribute conflicts, request UC MINUS here - * as well. 
- */ - prot |= cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS); - - vma->vm_page_prot = __pgprot(prot); - - if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, - vma->vm_end - vma->vm_start, - vma->vm_page_prot)) - return -EAGAIN; - - vma->vm_ops = &pci_mmap_ops; - - return 0; -} diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index 29e9ba6ace9d..c1bdb9edcae7 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -11,7 +11,7 @@ #include <asm/pci_x86.h> #include <asm/e820/types.h> #include <asm/pci-functions.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> /* BIOS32 signature: "_32_" */ #define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24)) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index a15cf815ac4e..7e76a4d8304b 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -49,7 +49,7 @@ #include <asm/efi.h> #include <asm/e820/api.h> #include <asm/time.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/tlbflush.h> #include <asm/x86_init.h> #include <asm/uv/uv.h> diff --git a/arch/x86/platform/intel/iosf_mbi.c b/arch/x86/platform/intel/iosf_mbi.c index edf2c54bf131..a952ac199741 100644 --- a/arch/x86/platform/intel/iosf_mbi.c +++ b/arch/x86/platform/intel/iosf_mbi.c @@ -34,6 +34,8 @@ static struct pci_dev *mbi_pdev; static DEFINE_SPINLOCK(iosf_mbi_lock); +static DEFINE_MUTEX(iosf_mbi_punit_mutex); +static BLOCKING_NOTIFIER_HEAD(iosf_mbi_pmic_bus_access_notifier); static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset) { @@ -190,6 +192,53 @@ bool iosf_mbi_available(void) } EXPORT_SYMBOL(iosf_mbi_available); +void iosf_mbi_punit_acquire(void) +{ + mutex_lock(&iosf_mbi_punit_mutex); +} +EXPORT_SYMBOL(iosf_mbi_punit_acquire); + +void iosf_mbi_punit_release(void) +{ + mutex_unlock(&iosf_mbi_punit_mutex); +} +EXPORT_SYMBOL(iosf_mbi_punit_release); + +int iosf_mbi_register_pmic_bus_access_notifier(struct notifier_block *nb) +{ + int ret; 
+ + /* Wait for the bus to go inactive before registering */ + mutex_lock(&iosf_mbi_punit_mutex); + ret = blocking_notifier_chain_register( + &iosf_mbi_pmic_bus_access_notifier, nb); + mutex_unlock(&iosf_mbi_punit_mutex); + + return ret; +} +EXPORT_SYMBOL(iosf_mbi_register_pmic_bus_access_notifier); + +int iosf_mbi_unregister_pmic_bus_access_notifier(struct notifier_block *nb) +{ + int ret; + + /* Wait for the bus to go inactive before unregistering */ + mutex_lock(&iosf_mbi_punit_mutex); + ret = blocking_notifier_chain_unregister( + &iosf_mbi_pmic_bus_access_notifier, nb); + mutex_unlock(&iosf_mbi_punit_mutex); + + return ret; +} +EXPORT_SYMBOL(iosf_mbi_unregister_pmic_bus_access_notifier); + +int iosf_mbi_call_pmic_bus_access_notifier_chain(unsigned long val, void *v) +{ + return blocking_notifier_call_chain( + &iosf_mbi_pmic_bus_access_notifier, val, v); +} +EXPORT_SYMBOL(iosf_mbi_call_pmic_bus_access_notifier_chain); + #ifdef CONFIG_IOSF_MBI_DEBUG static u32 dbg_mdr; static u32 dbg_mcr; diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 6a61194ffd58..a6e21fee22ea 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -104,7 +104,7 @@ static int set_up_temporary_mappings(void) { struct x86_mapping_info info = { .alloc_pgt_page = alloc_pgt_page, - .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, + .page_flag = __PAGE_KERNEL_LARGE_EXEC, .offset = __PAGE_OFFSET, }; unsigned long mstart, mend; diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 5db706f14111..a163a90af4aa 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -2,7 +2,7 @@ #include <linux/slab.h> #include <linux/memblock.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/pgtable.h> #include <asm/realmode.h> #include <asm/tlbflush.h> diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index a5c9910d234f..09a085bde0d4 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ 
-125,7 +125,7 @@ int poke_user(struct task_struct *child, long addr, long data) else if ((addr >= offsetof(struct user, u_debugreg[0])) && (addr <= offsetof(struct user, u_debugreg[7]))) { addr -= offsetof(struct user, u_debugreg[0]); - addr = addr >> 2; + addr = addr >> 3; if ((addr == 4) || (addr == 5)) return -EIO; child->thread.arch.debugregs[addr] = data; diff --git a/arch/x86/um/shared/sysdep/kernel-offsets.h b/arch/x86/um/shared/sysdep/kernel-offsets.h index 46a9df99f3c5..7e1d35b6ad5c 100644 --- a/arch/x86/um/shared/sysdep/kernel-offsets.h +++ b/arch/x86/um/shared/sysdep/kernel-offsets.h @@ -2,16 +2,9 @@ #include <linux/sched.h> #include <linux/elf.h> #include <linux/crypto.h> +#include <linux/kbuild.h> #include <asm/mman.h> -#define DEFINE(sym, val) \ - asm volatile("\n->" #sym " %0 " #val : : "i" (val)) - -#define BLANK() asm volatile("\n->" : : ) - -#define OFFSET(sym, str, mem) \ - DEFINE(sym, offsetof(struct str, mem)); - void foo(void) { #include <common-offsets.h> |