summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig2
-rw-r--r--arch/x86/Makefile2
-rw-r--r--arch/x86/boot/compressed/Makefile2
-rw-r--r--arch/x86/entry/entry_32.S30
-rw-r--r--arch/x86/entry/entry_64.S11
-rw-r--r--arch/x86/include/asm/kvm_host.h2
-rw-r--r--arch/x86/include/asm/mce.h1
-rw-r--r--arch/x86/include/asm/uaccess.h11
-rw-r--r--arch/x86/kernel/alternative.c9
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c13
-rw-r--r--arch/x86/kernel/fpu/init.c1
-rw-r--r--arch/x86/kernel/ftrace.c20
-rw-r--r--arch/x86/kernel/kprobes/core.c9
-rw-r--r--arch/x86/kernel/setup.c4
-rw-r--r--arch/x86/kernel/unwind_frame.c49
-rw-r--r--arch/x86/kvm/emulate.c2
-rw-r--r--arch/x86/kvm/paging_tmpl.h35
-rw-r--r--arch/x86/kvm/pmu_intel.c2
-rw-r--r--arch/x86/kvm/svm.c3
-rw-r--r--arch/x86/kvm/vmx.c4
-rw-r--r--arch/x86/kvm/x86.c45
-rw-r--r--arch/x86/mm/pageattr.c2
-rw-r--r--arch/x86/mm/pat.c9
-rw-r--r--arch/x86/xen/enlighten_pv.c15
-rw-r--r--arch/x86/xen/mmu.c2
-rw-r--r--arch/x86/xen/mmu_pv.c102
26 files changed, 223 insertions, 164 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cd18994a9555..4ccfacc7232a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -360,7 +360,7 @@ config SMP
Management" code will be disabled if you say Y here.
See also <file:Documentation/x86/i386/IO-APIC.txt>,
- <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at
+ <file:Documentation/lockup-watchdogs.txt> and the SMP-HOWTO available at
<http://www.tldp.org/docs.html#howto>.
If you don't know what to do here, say N.
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 5851411e60fb..bf240b920473 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -159,7 +159,7 @@ ifdef CONFIG_FUNCTION_GRAPH_TRACER
# If '-Os' is enabled, disable it and print a warning.
ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
undefine CONFIG_CC_OPTIMIZE_FOR_SIZE
- $(warning Disabling CONFIG_CC_OPTIMIZE_FOR_SIZE. Your compiler does not have -mfentry so you cannot optimize for size with CONFIG_FUNCTION_GRAPH_TRACER.)
+ $(warning Disabling CONFIG_CC_OPTIMIZE_FOR_SIZE. Your compiler does not have -mfentry so you cannot optimize for size with CONFIG_FUNCTION_GRAPH_TRACER.)
endif
endif
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 44163e8c3868..2c860ad4fe06 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -94,7 +94,7 @@ vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o
quiet_cmd_check_data_rel = DATAREL $@
define cmd_check_data_rel
for obj in $(filter %.o,$^); do \
- readelf -S $$obj | grep -qF .rel.local && { \
+ ${CROSS_COMPILE}readelf -S $$obj | grep -qF .rel.local && { \
echo "error: $$obj has data relocations!" >&2; \
exit 1; \
} || true; \
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 50bc26949e9e..48ef7bb32c42 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -252,6 +252,23 @@ ENTRY(__switch_to_asm)
END(__switch_to_asm)
/*
+ * The unwinder expects the last frame on the stack to always be at the same
+ * offset from the end of the page, which allows it to validate the stack.
+ * Calling schedule_tail() directly would break that convention because its an
+ * asmlinkage function so its argument has to be pushed on the stack. This
+ * wrapper creates a proper "end of stack" frame header before the call.
+ */
+ENTRY(schedule_tail_wrapper)
+ FRAME_BEGIN
+
+ pushl %eax
+ call schedule_tail
+ popl %eax
+
+ FRAME_END
+ ret
+ENDPROC(schedule_tail_wrapper)
+/*
* A newly forked process directly context switches into this address.
*
* eax: prev task we switched from
@@ -259,24 +276,15 @@ END(__switch_to_asm)
* edi: kernel thread arg
*/
ENTRY(ret_from_fork)
- FRAME_BEGIN /* help unwinder find end of stack */
-
- /*
- * schedule_tail() is asmlinkage so we have to put its 'prev' argument
- * on the stack.
- */
- pushl %eax
- call schedule_tail
- popl %eax
+ call schedule_tail_wrapper
testl %ebx, %ebx
jnz 1f /* kernel threads are uncommon */
2:
/* When we fork, we trace the syscall return in the child, too. */
- leal FRAME_OFFSET(%esp), %eax
+ movl %esp, %eax
call syscall_return_slowpath
- FRAME_END
jmp restore_all
/* kernel thread */
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 607d72c4a485..4a4c0834f965 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -36,7 +36,6 @@
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <asm/export.h>
-#include <asm/frame.h>
#include <linux/err.h>
.code64
@@ -406,19 +405,17 @@ END(__switch_to_asm)
* r12: kernel thread arg
*/
ENTRY(ret_from_fork)
- FRAME_BEGIN /* help unwinder find end of stack */
movq %rax, %rdi
- call schedule_tail /* rdi: 'prev' task parameter */
+ call schedule_tail /* rdi: 'prev' task parameter */
- testq %rbx, %rbx /* from kernel_thread? */
- jnz 1f /* kernel threads are uncommon */
+ testq %rbx, %rbx /* from kernel_thread? */
+ jnz 1f /* kernel threads are uncommon */
2:
- leaq FRAME_OFFSET(%rsp),%rdi /* pt_regs pointer */
+ movq %rsp, %rdi
call syscall_return_slowpath /* returns with IRQs disabled */
TRACE_IRQS_ON /* user mode is traced as IRQS on */
SWAPGS
- FRAME_END
jmp restore_regs_and_iret
1:
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9c761fea0c98..695605eb1dfb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -43,7 +43,7 @@
#define KVM_PRIVATE_MEM_SLOTS 3
#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
-#define KVM_HALT_POLL_NS_DEFAULT 400000
+#define KVM_HALT_POLL_NS_DEFAULT 200000
#define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 4fd5195deed0..3f9a3d2a5209 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -266,6 +266,7 @@ static inline int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *s
#endif
int mce_available(struct cpuinfo_x86 *c);
+bool mce_is_memory_error(struct mce *m);
DECLARE_PER_CPU(unsigned, mce_exception_count);
DECLARE_PER_CPU(unsigned, mce_poll_count);
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 68766b276d9e..a059aac9e937 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -319,10 +319,10 @@ do { \
#define __get_user_asm_u64(x, ptr, retval, errret) \
({ \
__typeof__(ptr) __ptr = (ptr); \
- asm volatile(ASM_STAC "\n" \
+ asm volatile("\n" \
"1: movl %2,%%eax\n" \
"2: movl %3,%%edx\n" \
- "3: " ASM_CLAC "\n" \
+ "3:\n" \
".section .fixup,\"ax\"\n" \
"4: mov %4,%0\n" \
" xorl %%eax,%%eax\n" \
@@ -331,7 +331,7 @@ do { \
".previous\n" \
_ASM_EXTABLE(1b, 4b) \
_ASM_EXTABLE(2b, 4b) \
- : "=r" (retval), "=A"(x) \
+ : "=r" (retval), "=&A"(x) \
: "m" (__m(__ptr)), "m" __m(((u32 *)(__ptr)) + 1), \
"i" (errret), "0" (retval)); \
})
@@ -703,14 +703,15 @@ extern struct movsl_mask {
#define unsafe_put_user(x, ptr, err_label) \
do { \
int __pu_err; \
- __put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \
+ __typeof__(*(ptr)) __pu_val = (x); \
+ __put_user_size(__pu_val, (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \
if (unlikely(__pu_err)) goto err_label; \
} while (0)
#define unsafe_get_user(x, ptr, err_label) \
do { \
int __gu_err; \
- unsigned long __gu_val; \
+ __inttype(*(ptr)) __gu_val; \
__get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
if (unlikely(__gu_err)) goto err_label; \
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index c5b8f760473c..32e14d137416 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -409,8 +409,13 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
memcpy(insnbuf, replacement, a->replacementlen);
insnbuf_sz = a->replacementlen;
- /* 0xe8 is a relative jump; fix the offset. */
- if (*insnbuf == 0xe8 && a->replacementlen == 5) {
+ /*
+ * 0xe8 is a relative jump; fix the offset.
+ *
+ * Instruction length is checked before the opcode to avoid
+ * accessing uninitialized bytes for zero-length replacements.
+ */
+ if (a->replacementlen == 5 && *insnbuf == 0xe8) {
*(s32 *)(insnbuf + 1) += replacement - instr;
DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
*(s32 *)(insnbuf + 1),
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5abd4bf73d6e..5cfbaeb6529a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -499,16 +499,14 @@ static int mce_usable_address(struct mce *m)
return 1;
}
-static bool memory_error(struct mce *m)
+bool mce_is_memory_error(struct mce *m)
{
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
- if (c->x86_vendor == X86_VENDOR_AMD) {
+ if (m->cpuvendor == X86_VENDOR_AMD) {
/* ErrCodeExt[20:16] */
u8 xec = (m->status >> 16) & 0x1f;
return (xec == 0x0 || xec == 0x8);
- } else if (c->x86_vendor == X86_VENDOR_INTEL) {
+ } else if (m->cpuvendor == X86_VENDOR_INTEL) {
/*
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
*
@@ -529,6 +527,7 @@ static bool memory_error(struct mce *m)
return false;
}
+EXPORT_SYMBOL_GPL(mce_is_memory_error);
static bool cec_add_mce(struct mce *m)
{
@@ -536,7 +535,7 @@ static bool cec_add_mce(struct mce *m)
return false;
/* We eat only correctable DRAM errors with usable addresses. */
- if (memory_error(m) &&
+ if (mce_is_memory_error(m) &&
!(m->status & MCI_STATUS_UC) &&
mce_usable_address(m))
if (!cec_add_elem(m->addr >> PAGE_SHIFT))
@@ -713,7 +712,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
- if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m))
+ if (severity == MCE_DEFERRED_SEVERITY && mce_is_memory_error(&m))
if (m.status & MCI_STATUS_ADDRV)
m.severity = severity;
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index c2f8dde3255c..d5d44c452624 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -90,6 +90,7 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
* Boot time FPU feature detection code:
*/
unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
+EXPORT_SYMBOL_GPL(mxcsr_feature_mask);
static void __init fpu__init_system_mxcsr(void)
{
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 0651e974dcb3..9bef1bbeba63 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -689,8 +689,12 @@ static inline void *alloc_tramp(unsigned long size)
{
return module_alloc(size);
}
-static inline void tramp_free(void *tramp)
+static inline void tramp_free(void *tramp, int size)
{
+ int npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ set_memory_nx((unsigned long)tramp, npages);
+ set_memory_rw((unsigned long)tramp, npages);
module_memfree(tramp);
}
#else
@@ -699,7 +703,7 @@ static inline void *alloc_tramp(unsigned long size)
{
return NULL;
}
-static inline void tramp_free(void *tramp) { }
+static inline void tramp_free(void *tramp, int size) { }
#endif
/* Defined as markers to the end of the ftrace default trampolines */
@@ -771,7 +775,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
/* Copy ftrace_caller onto the trampoline memory */
ret = probe_kernel_read(trampoline, (void *)start_offset, size);
if (WARN_ON(ret < 0)) {
- tramp_free(trampoline);
+ tramp_free(trampoline, *tramp_size);
return 0;
}
@@ -797,7 +801,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
/* Are we pointing to the reference? */
if (WARN_ON(memcmp(op_ptr.op, op_ref, 3) != 0)) {
- tramp_free(trampoline);
+ tramp_free(trampoline, *tramp_size);
return 0;
}
@@ -839,7 +843,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
unsigned long offset;
unsigned long ip;
unsigned int size;
- int ret;
+ int ret, npages;
if (ops->trampoline) {
/*
@@ -848,11 +852,14 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
*/
if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
return;
+ npages = PAGE_ALIGN(ops->trampoline_size) >> PAGE_SHIFT;
+ set_memory_rw(ops->trampoline, npages);
} else {
ops->trampoline = create_trampoline(ops, &size);
if (!ops->trampoline)
return;
ops->trampoline_size = size;
+ npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
}
offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
@@ -863,6 +870,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
/* Do a safe modify in case the trampoline is executing */
new = ftrace_call_replace(ip, (unsigned long)func);
ret = update_ftrace_func(ip, new);
+ set_memory_ro(ops->trampoline, npages);
/* The update should never fail */
WARN_ON(ret);
@@ -939,7 +947,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
return;
- tramp_free((void *)ops->trampoline);
+ tramp_free((void *)ops->trampoline, ops->trampoline_size);
ops->trampoline = 0;
}
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 5b2bbfbb3712..6b877807598b 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -52,6 +52,7 @@
#include <linux/ftrace.h>
#include <linux/frame.h>
#include <linux/kasan.h>
+#include <linux/moduleloader.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
@@ -417,6 +418,14 @@ static void prepare_boost(struct kprobe *p, struct insn *insn)
}
}
+/* Recover page to RW mode before releasing it */
+void free_insn_page(void *page)
+{
+ set_memory_nx((unsigned long)page & PAGE_MASK, 1);
+ set_memory_rw((unsigned long)page & PAGE_MASK, 1);
+ module_memfree(page);
+}
+
static int arch_copy_kprobe(struct kprobe *p)
{
struct insn insn;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0b4d3c686b1e..f81823695014 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -980,8 +980,6 @@ void __init setup_arch(char **cmdline_p)
*/
x86_configure_nx();
- simple_udelay_calibration();
-
parse_early_param();
#ifdef CONFIG_MEMORY_HOTPLUG
@@ -1041,6 +1039,8 @@ void __init setup_arch(char **cmdline_p)
*/
init_hypervisor_platform();
+ simple_udelay_calibration();
+
x86_init.resources.probe_roms();
/* after parse_early_param, so could debug it */
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 82c6d7f1fd73..b9389d72b2f7 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -104,6 +104,11 @@ static inline unsigned long *last_frame(struct unwind_state *state)
return (unsigned long *)task_pt_regs(state->task) - 2;
}
+static bool is_last_frame(struct unwind_state *state)
+{
+ return state->bp == last_frame(state);
+}
+
#ifdef CONFIG_X86_32
#define GCC_REALIGN_WORDS 3
#else
@@ -115,16 +120,15 @@ static inline unsigned long *last_aligned_frame(struct unwind_state *state)
return last_frame(state) - GCC_REALIGN_WORDS;
}
-static bool is_last_task_frame(struct unwind_state *state)
+static bool is_last_aligned_frame(struct unwind_state *state)
{
unsigned long *last_bp = last_frame(state);
unsigned long *aligned_bp = last_aligned_frame(state);
/*
- * We have to check for the last task frame at two different locations
- * because gcc can occasionally decide to realign the stack pointer and
- * change the offset of the stack frame in the prologue of a function
- * called by head/entry code. Examples:
+ * GCC can occasionally decide to realign the stack pointer and change
+ * the offset of the stack frame in the prologue of a function called
+ * by head/entry code. Examples:
*
* <start_secondary>:
* push %edi
@@ -141,11 +145,38 @@ static bool is_last_task_frame(struct unwind_state *state)
* push %rbp
* mov %rsp,%rbp
*
- * Note that after aligning the stack, it pushes a duplicate copy of
- * the return address before pushing the frame pointer.
+ * After aligning the stack, it pushes a duplicate copy of the return
+ * address before pushing the frame pointer.
+ */
+ return (state->bp == aligned_bp && *(aligned_bp + 1) == *(last_bp + 1));
+}
+
+static bool is_last_ftrace_frame(struct unwind_state *state)
+{
+ unsigned long *last_bp = last_frame(state);
+ unsigned long *last_ftrace_bp = last_bp - 3;
+
+ /*
+ * When unwinding from an ftrace handler of a function called by entry
+ * code, the stack layout of the last frame is:
+ *
+ * bp
+ * parent ret addr
+ * bp
+ * function ret addr
+ * parent ret addr
+ * pt_regs
+ * -----------------
*/
- return (state->bp == last_bp ||
- (state->bp == aligned_bp && *(aligned_bp+1) == *(last_bp+1)));
+ return (state->bp == last_ftrace_bp &&
+ *state->bp == *(state->bp + 2) &&
+ *(state->bp + 1) == *(state->bp + 4));
+}
+
+static bool is_last_task_frame(struct unwind_state *state)
+{
+ return is_last_frame(state) || is_last_aligned_frame(state) ||
+ is_last_ftrace_frame(state);
}
/*
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c25cfaf584e7..0816ab2e8adc 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -4173,7 +4173,7 @@ static int check_dr_write(struct x86_emulate_ctxt *ctxt)
static int check_svme(struct x86_emulate_ctxt *ctxt)
{
- u64 efer;
+ u64 efer = 0;
ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 56241746abbd..b0454c7e4cff 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -283,11 +283,13 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
pt_element_t pte;
pt_element_t __user *uninitialized_var(ptep_user);
gfn_t table_gfn;
- unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey;
+ u64 pt_access, pte_access;
+ unsigned index, accessed_dirty, pte_pkey;
unsigned nested_access;
gpa_t pte_gpa;
bool have_ad;
int offset;
+ u64 walk_nx_mask = 0;
const int write_fault = access & PFERR_WRITE_MASK;
const int user_fault = access & PFERR_USER_MASK;
const int fetch_fault = access & PFERR_FETCH_MASK;
@@ -302,6 +304,7 @@ retry_walk:
have_ad = PT_HAVE_ACCESSED_DIRTY(mmu);
#if PTTYPE == 64
+ walk_nx_mask = 1ULL << PT64_NX_SHIFT;
if (walker->level == PT32E_ROOT_LEVEL) {
pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
trace_kvm_mmu_paging_element(pte, walker->level);
@@ -313,8 +316,6 @@ retry_walk:
walker->max_level = walker->level;
ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu)));
- accessed_dirty = have_ad ? PT_GUEST_ACCESSED_MASK : 0;
-
/*
* FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
* by the MOV to CR instruction are treated as reads and do not cause the
@@ -322,14 +323,14 @@ retry_walk:
*/
nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;
- pt_access = pte_access = ACC_ALL;
+ pte_access = ~0;
++walker->level;
do {
gfn_t real_gfn;
unsigned long host_addr;
- pt_access &= pte_access;
+ pt_access = pte_access;
--walker->level;
index = PT_INDEX(addr, walker->level);
@@ -371,6 +372,12 @@ retry_walk:
trace_kvm_mmu_paging_element(pte, walker->level);
+ /*
+ * Inverting the NX it lets us AND it like other
+ * permission bits.
+ */
+ pte_access = pt_access & (pte ^ walk_nx_mask);
+
if (unlikely(!FNAME(is_present_gpte)(pte)))
goto error;
@@ -379,14 +386,16 @@ retry_walk:
goto error;
}
- accessed_dirty &= pte;
- pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
-
walker->ptes[walker->level - 1] = pte;
} while (!is_last_gpte(mmu, walker->level, pte));
pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
- errcode = permission_fault(vcpu, mmu, pte_access, pte_pkey, access);
+ accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
+
+ /* Convert to ACC_*_MASK flags for struct guest_walker. */
+ walker->pt_access = FNAME(gpte_access)(vcpu, pt_access ^ walk_nx_mask);
+ walker->pte_access = FNAME(gpte_access)(vcpu, pte_access ^ walk_nx_mask);
+ errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
if (unlikely(errcode))
goto error;
@@ -403,7 +412,7 @@ retry_walk:
walker->gfn = real_gpa >> PAGE_SHIFT;
if (!write_fault)
- FNAME(protect_clean_gpte)(mmu, &pte_access, pte);
+ FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte);
else
/*
* On a write fault, fold the dirty bit into accessed_dirty.
@@ -421,10 +430,8 @@ retry_walk:
goto retry_walk;
}
- walker->pt_access = pt_access;
- walker->pte_access = pte_access;
pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
- __func__, (u64)pte, pte_access, pt_access);
+ __func__, (u64)pte, walker->pte_access, walker->pt_access);
return 1;
error:
@@ -452,7 +459,7 @@ error:
*/
if (!(errcode & PFERR_RSVD_MASK)) {
vcpu->arch.exit_qualification &= 0x187;
- vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3;
+ vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3;
}
#endif
walker->fault.address = addr;
diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c
index 9d4a8504a95a..5ab4a364348e 100644
--- a/arch/x86/kvm/pmu_intel.c
+++ b/arch/x86/kvm/pmu_intel.c
@@ -294,7 +294,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
((u64)1 << edx.split.bit_width_fixed) - 1;
}
- pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
+ pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) |
(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
pmu->global_ctrl_mask = ~pmu->global_ctrl;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c27ac6923a18..183ddb235fb4 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1272,7 +1272,8 @@ static void init_vmcb(struct vcpu_svm *svm)
}
-static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, int index)
+static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
+ unsigned int index)
{
u64 *avic_physical_id_table;
struct kvm_arch *vm_data = &vcpu->kvm->arch;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c6f4ad44aa95..72f78396bc09 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6504,7 +6504,7 @@ static __init int hardware_setup(void)
enable_ept_ad_bits = 0;
}
- if (!cpu_has_vmx_ept_ad_bits())
+ if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
enable_ept_ad_bits = 0;
if (!cpu_has_vmx_unrestricted_guest())
@@ -11213,7 +11213,7 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
if (!nested_cpu_has_pml(vmcs12))
return 0;
- if (vmcs12->guest_pml_index > PML_ENTITY_NUM) {
+ if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
vmx->nested.pml_full = true;
return 1;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 464da936c53d..02363e37d4a6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1763,6 +1763,7 @@ u64 get_kvmclock_ns(struct kvm *kvm)
{
struct kvm_arch *ka = &kvm->arch;
struct pvclock_vcpu_time_info hv_clock;
+ u64 ret;
spin_lock(&ka->pvclock_gtod_sync_lock);
if (!ka->use_master_clock) {
@@ -1774,10 +1775,17 @@ u64 get_kvmclock_ns(struct kvm *kvm)
hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
spin_unlock(&ka->pvclock_gtod_sync_lock);
+ /* both __this_cpu_read() and rdtsc() should be on the same cpu */
+ get_cpu();
+
kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
&hv_clock.tsc_shift,
&hv_clock.tsc_to_system_mul);
- return __pvclock_read_cycles(&hv_clock, rdtsc());
+ ret = __pvclock_read_cycles(&hv_clock, rdtsc());
+
+ put_cpu();
+
+ return ret;
}
static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
@@ -3288,11 +3296,14 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
}
}
+#define XSAVE_MXCSR_OFFSET 24
+
static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
u64 xstate_bv =
*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
+ u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
/*
@@ -3300,11 +3311,13 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
* CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
* with old userspace.
*/
- if (xstate_bv & ~kvm_supported_xcr0())
+ if (xstate_bv & ~kvm_supported_xcr0() ||
+ mxcsr & ~mxcsr_feature_mask)
return -EINVAL;
load_xsave(vcpu, (u8 *)guest_xsave->region);
} else {
- if (xstate_bv & ~XFEATURE_MASK_FPSSE)
+ if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
+ mxcsr & ~mxcsr_feature_mask)
return -EINVAL;
memcpy(&vcpu->arch.guest_fpu.state.fxsave,
guest_xsave->region, sizeof(struct fxregs_state));
@@ -4818,16 +4831,20 @@ emul_write:
static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
{
- /* TODO: String I/O for in kernel device */
- int r;
+ int r = 0, i;
- if (vcpu->arch.pio.in)
- r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
- vcpu->arch.pio.size, pd);
- else
- r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
- vcpu->arch.pio.port, vcpu->arch.pio.size,
- pd);
+ for (i = 0; i < vcpu->arch.pio.count; i++) {
+ if (vcpu->arch.pio.in)
+ r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
+ vcpu->arch.pio.size, pd);
+ else
+ r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
+ vcpu->arch.pio.port, vcpu->arch.pio.size,
+ pd);
+ if (r)
+ break;
+ pd += vcpu->arch.pio.size;
+ }
return r;
}
@@ -4865,6 +4882,8 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
if (vcpu->arch.pio.count)
goto data_avail;
+ memset(vcpu->arch.pio_data, 0, size * count);
+
ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
if (ret) {
data_avail:
@@ -5048,6 +5067,8 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
if (var.unusable) {
memset(desc, 0, sizeof(*desc));
+ if (base3)
+ *base3 = 0;
return false;
}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 1dcd2be4cce4..c8520b2c62d2 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -186,7 +186,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
unsigned int i, level;
unsigned long addr;
- BUG_ON(irqs_disabled());
+ BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
WARN_ON(PAGE_ALIGN(start) != start);
on_each_cpu(__cpa_flush_range, NULL, 1);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 9b78685b66e6..83a59a67757a 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -65,9 +65,11 @@ static int __init nopat(char *str)
}
early_param("nopat", nopat);
+static bool __read_mostly __pat_initialized = false;
+
bool pat_enabled(void)
{
- return !!__pat_enabled;
+ return __pat_initialized;
}
EXPORT_SYMBOL_GPL(pat_enabled);
@@ -225,13 +227,14 @@ static void pat_bsp_init(u64 pat)
}
wrmsrl(MSR_IA32_CR_PAT, pat);
+ __pat_initialized = true;
__init_cache_modes(pat);
}
static void pat_ap_init(u64 pat)
{
- if (!boot_cpu_has(X86_FEATURE_PAT)) {
+ if (!this_cpu_has(X86_FEATURE_PAT)) {
/*
* If this happens we are on a secondary CPU, but switched to
* PAT on the boot CPU. We have no way to undo PAT.
@@ -306,7 +309,7 @@ void pat_init(void)
u64 pat;
struct cpuinfo_x86 *c = &boot_cpu_data;
- if (!pat_enabled()) {
+ if (!__pat_enabled) {
init_cache_modes();
return;
}
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 7cd442690f9d..f33eef4ebd12 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -142,9 +142,7 @@ static void __init xen_banner(void)
struct xen_extraversion extra;
HYPERVISOR_xen_version(XENVER_extraversion, &extra);
- pr_info("Booting paravirtualized kernel %son %s\n",
- xen_feature(XENFEAT_auto_translated_physmap) ?
- "with PVH extensions " : "", pv_info.name);
+ pr_info("Booting paravirtualized kernel on %s\n", pv_info.name);
printk(KERN_INFO "Xen version: %d.%d%s%s\n",
version >> 16, version & 0xffff, extra.extraversion,
xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
@@ -957,15 +955,10 @@ static void xen_write_msr(unsigned int msr, unsigned low, unsigned high)
void xen_setup_shared_info(void)
{
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
- set_fixmap(FIX_PARAVIRT_BOOTMAP,
- xen_start_info->shared_info);
+ set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info);
- HYPERVISOR_shared_info =
- (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
- } else
- HYPERVISOR_shared_info =
- (struct shared_info *)__va(xen_start_info->shared_info);
+ HYPERVISOR_shared_info =
+ (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
#ifndef CONFIG_SMP
/* In UP this is as good a place as any to set up shared info */
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 5e375a5e815f..3be06f3caf3c 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -42,7 +42,7 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr)
}
EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
-void xen_flush_tlb_all(void)
+static void xen_flush_tlb_all(void)
{
struct mmuext_op *op;
struct multicall_space mcs;
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 7397d8b8459d..1f386d7fdf70 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -355,10 +355,8 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
pteval_t flags = val & PTE_FLAGS_MASK;
unsigned long mfn;
- if (!xen_feature(XENFEAT_auto_translated_physmap))
- mfn = __pfn_to_mfn(pfn);
- else
- mfn = pfn;
+ mfn = __pfn_to_mfn(pfn);
+
/*
* If there's no mfn for the pfn, then just create an
* empty non-present pte. Unfortunately this loses
@@ -647,9 +645,6 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
limit--;
BUG_ON(limit >= FIXADDR_TOP);
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return 0;
-
/*
* 64-bit has a great big hole in the middle of the address
* space, which contains the Xen mappings. On 32-bit these
@@ -1289,9 +1284,6 @@ static void __init xen_pagetable_cleanhighmap(void)
static void __init xen_pagetable_p2m_setup(void)
{
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return;
-
xen_vmalloc_p2m_tree();
#ifdef CONFIG_X86_64
@@ -1314,8 +1306,7 @@ static void __init xen_pagetable_init(void)
xen_build_mfn_list_list();
/* Remap memory freed due to conflicts with E820 map */
- if (!xen_feature(XENFEAT_auto_translated_physmap))
- xen_remap_memory();
+ xen_remap_memory();
xen_setup_shared_info();
}
@@ -1925,21 +1916,20 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
/* Zap identity mapping */
init_level4_pgt[0] = __pgd(0);
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
- /* Pre-constructed entries are in pfn, so convert to mfn */
- /* L4[272] -> level3_ident_pgt
- * L4[511] -> level3_kernel_pgt */
- convert_pfn_mfn(init_level4_pgt);
+ /* Pre-constructed entries are in pfn, so convert to mfn */
+ /* L4[272] -> level3_ident_pgt */
+ /* L4[511] -> level3_kernel_pgt */
+ convert_pfn_mfn(init_level4_pgt);
- /* L3_i[0] -> level2_ident_pgt */
- convert_pfn_mfn(level3_ident_pgt);
- /* L3_k[510] -> level2_kernel_pgt
- * L3_k[511] -> level2_fixmap_pgt */
- convert_pfn_mfn(level3_kernel_pgt);
+ /* L3_i[0] -> level2_ident_pgt */
+ convert_pfn_mfn(level3_ident_pgt);
+ /* L3_k[510] -> level2_kernel_pgt */
+ /* L3_k[511] -> level2_fixmap_pgt */
+ convert_pfn_mfn(level3_kernel_pgt);
+
+ /* L3_k[511][506] -> level1_fixmap_pgt */
+ convert_pfn_mfn(level2_fixmap_pgt);
- /* L3_k[511][506] -> level1_fixmap_pgt */
- convert_pfn_mfn(level2_fixmap_pgt);
- }
/* We get [511][511] and have Xen's version of level2_kernel_pgt */
l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
@@ -1962,34 +1952,30 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
if (i && i < pgd_index(__START_KERNEL_map))
init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
- /* Make pagetable pieces RO */
- set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
- set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
- set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
- set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
- set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
- set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
- set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
- set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO);
-
- /* Pin down new L4 */
- pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
- PFN_DOWN(__pa_symbol(init_level4_pgt)));
-
- /* Unpin Xen-provided one */
- pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+ /* Make pagetable pieces RO */
+ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
+ set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO);
+
+ /* Pin down new L4 */
+ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
+ PFN_DOWN(__pa_symbol(init_level4_pgt)));
+
+ /* Unpin Xen-provided one */
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
- /*
- * At this stage there can be no user pgd, and no page
- * structure to attach it to, so make sure we just set kernel
- * pgd.
- */
- xen_mc_batch();
- __xen_write_cr3(true, __pa(init_level4_pgt));
- xen_mc_issue(PARAVIRT_LAZY_CPU);
- } else
- native_write_cr3(__pa(init_level4_pgt));
+ /*
+ * At this stage there can be no user pgd, and no page structure to
+ * attach it to, so make sure we just set kernel pgd.
+ */
+ xen_mc_batch();
+ __xen_write_cr3(true, __pa(init_level4_pgt));
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
/* We can't that easily rip out L3 and L2, as the Xen pagetables are
* set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for
@@ -2403,9 +2389,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
static void __init xen_post_allocator_init(void)
{
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return;
-
pv_mmu_ops.set_pte = xen_set_pte;
pv_mmu_ops.set_pmd = xen_set_pmd;
pv_mmu_ops.set_pud = xen_set_pud;
@@ -2511,9 +2494,6 @@ void __init xen_init_mmu_ops(void)
{
x86_init.paging.pagetable_init = xen_pagetable_init;
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return;
-
pv_mmu_ops = xen_mmu_ops;
memset(dummy_mapping, 0xff, PAGE_SIZE);
@@ -2650,9 +2630,6 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
* this function are redundant and can be ignored.
*/
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return 0;
-
if (unlikely(order > MAX_CONTIG_ORDER))
return -ENOMEM;
@@ -2689,9 +2666,6 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
int success;
unsigned long vstart;
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return;
-
if (unlikely(order > MAX_CONTIG_ORDER))
return;