summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-05-06 16:13:31 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2019-05-06 16:13:31 -0700
commit0bc40e549aeea2de20fc571749de9bbfc099fb34 (patch)
treed18f3339bd383a17431fca23b6c5f3e54c93cf2f /arch/x86/kernel
parente913c4a4c21cd83317fafe63bfdc9d34d2910114 (diff)
parentcaa841360134f863987f2d4f77b8dc2fbb7596f8 (diff)
downloadlinux-0bc40e549aeea2de20fc571749de9bbfc099fb34.tar.bz2
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar: "The changes in here are: - text_poke() fixes and an extensive set of executability lockdowns, to (hopefully) eliminate the last residual circumstances under which we are using W|X mappings even temporarily on x86 kernels. This required a broad range of surgery in text patching facilities, module loading, trampoline handling and other bits. - tweak page fault messages to be more informative and more structured. - remove DISCONTIGMEM support on x86-32 and make SPARSEMEM the default. - reduce KASLR granularity on 5-level paging kernels from 512 GB to 1 GB. - misc other changes and updates" * 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits) x86/mm: Initialize PGD cache during mm initialization x86/alternatives: Add comment about module removal races x86/kprobes: Use vmalloc special flag x86/ftrace: Use vmalloc special flag bpf: Use vmalloc special flag modules: Use vmalloc special flag mm/vmalloc: Add flag for freeing of special permsissions mm/hibernation: Make hibernation handle unmapped pages x86/mm/cpa: Add set_direct_map_*() functions x86/alternatives: Remove the return value of text_poke_*() x86/jump-label: Remove support for custom text poker x86/modules: Avoid breaking W^X while loading modules x86/kprobes: Set instruction page as executable x86/ftrace: Set trampoline pages as executable x86/kgdb: Avoid redundant comparison of patched code x86/alternatives: Use temporary mm for text poking x86/alternatives: Initialize temporary mm for patching fork: Provide a function for copying init_mm uprobes: Initialize uprobes earlier x86/mm: Save debug registers when loading a temporary mm ...
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/alternative.c201
-rw-r--r--arch/x86/kernel/ftrace.c22
-rw-r--r--arch/x86/kernel/jump_label.c21
-rw-r--r--arch/x86/kernel/kgdb.c25
-rw-r--r--arch/x86/kernel/kprobes/core.c19
-rw-r--r--arch/x86/kernel/module.c2
-rw-r--r--arch/x86/kernel/vmlinux.lds.S6
7 files changed, 204 insertions, 92 deletions
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 9a79c7808f9c..7b9b49dfc05a 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -12,6 +12,7 @@
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <linux/kprobes.h>
+#include <linux/mmu_context.h>
#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/sections.h>
@@ -264,7 +265,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
-void *text_poke_early(void *addr, const void *opcode, size_t len);
+void text_poke_early(void *addr, const void *opcode, size_t len);
/*
* Are we looking at a near JMP with a 1 or 4-byte displacement.
@@ -666,16 +667,136 @@ void __init alternative_instructions(void)
* instructions. And on the local CPU you need to be protected again NMI or MCE
* handlers seeing an inconsistent instruction while you patch.
*/
-void *__init_or_module text_poke_early(void *addr, const void *opcode,
- size_t len)
+void __init_or_module text_poke_early(void *addr, const void *opcode,
+ size_t len)
{
unsigned long flags;
+
+ if (boot_cpu_has(X86_FEATURE_NX) &&
+ is_module_text_address((unsigned long)addr)) {
+ /*
+ * Modules text is marked initially as non-executable, so the
+ * code cannot be running and speculative code-fetches are
+ * prevented. Just change the code.
+ */
+ memcpy(addr, opcode, len);
+ } else {
+ local_irq_save(flags);
+ memcpy(addr, opcode, len);
+ local_irq_restore(flags);
+ sync_core();
+
+ /*
+ * Could also do a CLFLUSH here to speed up CPU recovery; but
+ * that causes hangs on some VIA CPUs.
+ */
+ }
+}
+
+__ro_after_init struct mm_struct *poking_mm;
+__ro_after_init unsigned long poking_addr;
+
+static void *__text_poke(void *addr, const void *opcode, size_t len)
+{
+ bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
+ struct page *pages[2] = {NULL};
+ temp_mm_state_t prev;
+ unsigned long flags;
+ pte_t pte, *ptep;
+ spinlock_t *ptl;
+ pgprot_t pgprot;
+
+ /*
+ * While boot memory allocator is running we cannot use struct pages as
+ * they are not yet initialized. There is no way to recover.
+ */
+ BUG_ON(!after_bootmem);
+
+ if (!core_kernel_text((unsigned long)addr)) {
+ pages[0] = vmalloc_to_page(addr);
+ if (cross_page_boundary)
+ pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
+ } else {
+ pages[0] = virt_to_page(addr);
+ WARN_ON(!PageReserved(pages[0]));
+ if (cross_page_boundary)
+ pages[1] = virt_to_page(addr + PAGE_SIZE);
+ }
+ /*
+ * If something went wrong, crash and burn since recovery paths are not
+ * implemented.
+ */
+ BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
+
local_irq_save(flags);
- memcpy(addr, opcode, len);
+
+ /*
+ * Map the page without the global bit, as TLB flushing is done with
+ * flush_tlb_mm_range(), which is intended for non-global PTEs.
+ */
+ pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
+
+ /*
+ * The lock is not really needed, but this allows to avoid open-coding.
+ */
+ ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
+
+ /*
+ * This must not fail; preallocated in poking_init().
+ */
+ VM_BUG_ON(!ptep);
+
+ pte = mk_pte(pages[0], pgprot);
+ set_pte_at(poking_mm, poking_addr, ptep, pte);
+
+ if (cross_page_boundary) {
+ pte = mk_pte(pages[1], pgprot);
+ set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
+ }
+
+ /*
+ * Loading the temporary mm behaves as a compiler barrier, which
+ * guarantees that the PTE will be set at the time memcpy() is done.
+ */
+ prev = use_temporary_mm(poking_mm);
+
+ kasan_disable_current();
+ memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
+ kasan_enable_current();
+
+ /*
+ * Ensure that the PTE is only cleared after the instructions of memcpy
+ * were issued by using a compiler barrier.
+ */
+ barrier();
+
+ pte_clear(poking_mm, poking_addr, ptep);
+ if (cross_page_boundary)
+ pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
+
+ /*
+ * Loading the previous page-table hierarchy requires a serializing
+ * instruction that already allows the core to see the updated version.
+ * Xen-PV is assumed to serialize execution in a similar manner.
+ */
+ unuse_temporary_mm(prev);
+
+ /*
+ * Flushing the TLB might involve IPIs, which would require enabled
+ * IRQs, but not if the mm is not used, as it is in this point.
+ */
+ flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
+ (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
+ PAGE_SHIFT, false);
+
+ /*
+ * If the text does not match what we just wrote then something is
+ * fundamentally screwy; there's nothing we can really do about that.
+ */
+ BUG_ON(memcmp(addr, opcode, len));
+
+ pte_unmap_unlock(ptep, ptl);
local_irq_restore(flags);
- sync_core();
- /* Could also do a CLFLUSH here to speed up CPU recovery; but
- that causes hangs on some VIA CPUs. */
return addr;
}
@@ -689,48 +810,36 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
* It means the size must be writable atomically and the address must be aligned
* in a way that permits an atomic write. It also makes sure we fit on a single
* page.
+ *
+ * Note that the caller must ensure that if the modified code is part of a
+ * module, the module would not be removed during poking. This can be achieved
+ * by registering a module notifier, and ordering module removal and patching
+ * trough a mutex.
*/
void *text_poke(void *addr, const void *opcode, size_t len)
{
- unsigned long flags;
- char *vaddr;
- struct page *pages[2];
- int i;
-
- /*
- * While boot memory allocator is runnig we cannot use struct
- * pages as they are not yet initialized.
- */
- BUG_ON(!after_bootmem);
-
lockdep_assert_held(&text_mutex);
- if (!core_kernel_text((unsigned long)addr)) {
- pages[0] = vmalloc_to_page(addr);
- pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
- } else {
- pages[0] = virt_to_page(addr);
- WARN_ON(!PageReserved(pages[0]));
- pages[1] = virt_to_page(addr + PAGE_SIZE);
- }
- BUG_ON(!pages[0]);
- local_irq_save(flags);
- set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
- if (pages[1])
- set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
- vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
- memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
- clear_fixmap(FIX_TEXT_POKE0);
- if (pages[1])
- clear_fixmap(FIX_TEXT_POKE1);
- local_flush_tlb();
- sync_core();
- /* Could also do a CLFLUSH here to speed up CPU recovery; but
- that causes hangs on some VIA CPUs. */
- for (i = 0; i < len; i++)
- BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
- local_irq_restore(flags);
- return addr;
+ return __text_poke(addr, opcode, len);
+}
+
+/**
+ * text_poke_kgdb - Update instructions on a live kernel by kgdb
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * Only atomic text poke/set should be allowed when not doing early patching.
+ * It means the size must be writable atomically and the address must be aligned
+ * in a way that permits an atomic write. It also makes sure we fit on a single
+ * page.
+ *
+ * Context: should only be used by kgdb, which ensures no other core is running,
+ * despite the fact it does not hold the text_mutex.
+ */
+void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
+{
+ return __text_poke(addr, opcode, len);
}
static void do_sync_core(void *info)
@@ -788,7 +897,7 @@ NOKPROBE_SYMBOL(poke_int3_handler);
* replacing opcode
* - sync cores
*/
-void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
+void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
{
unsigned char int3 = 0xcc;
@@ -830,7 +939,5 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
* the writing of the new instruction.
*/
bp_patching_in_progress = false;
-
- return addr;
}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index ef49517f6bb2..0caf8122d680 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -678,12 +678,8 @@ static inline void *alloc_tramp(unsigned long size)
{
return module_alloc(size);
}
-static inline void tramp_free(void *tramp, int size)
+static inline void tramp_free(void *tramp)
{
- int npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
-
- set_memory_nx((unsigned long)tramp, npages);
- set_memory_rw((unsigned long)tramp, npages);
module_memfree(tramp);
}
#else
@@ -692,7 +688,7 @@ static inline void *alloc_tramp(unsigned long size)
{
return NULL;
}
-static inline void tramp_free(void *tramp, int size) { }
+static inline void tramp_free(void *tramp) { }
#endif
/* Defined as markers to the end of the ftrace default trampolines */
@@ -730,6 +726,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
unsigned long end_offset;
unsigned long op_offset;
unsigned long offset;
+ unsigned long npages;
unsigned long size;
unsigned long retq;
unsigned long *ptr;
@@ -762,6 +759,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
return 0;
*tramp_size = size + RET_SIZE + sizeof(void *);
+ npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE);
/* Copy ftrace_caller onto the trampoline memory */
ret = probe_kernel_read(trampoline, (void *)start_offset, size);
@@ -806,9 +804,17 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
/* ALLOC_TRAMP flags lets us know we created it */
ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
+ set_vm_flush_reset_perms(trampoline);
+
+ /*
+ * Module allocation needs to be completed by making the page
+ * executable. The page is still writable, which is a security hazard,
+ * but anyhow ftrace breaks W^X completely.
+ */
+ set_memory_x((unsigned long)trampoline, npages);
return (unsigned long)trampoline;
fail:
- tramp_free(trampoline, *tramp_size);
+ tramp_free(trampoline);
return 0;
}
@@ -939,7 +945,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
return;
- tramp_free((void *)ops->trampoline, ops->trampoline_size);
+ tramp_free((void *)ops->trampoline);
ops->trampoline = 0;
}
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index f99bd26bd3f1..e631c358f7f4 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -37,7 +37,6 @@ static void bug_at(unsigned char *ip, int line)
static void __ref __jump_label_transform(struct jump_entry *entry,
enum jump_label_type type,
- void *(*poker)(void *, const void *, size_t),
int init)
{
union jump_code_union jmp;
@@ -50,9 +49,6 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
jmp.offset = jump_entry_target(entry) -
(jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
- if (early_boot_irqs_disabled)
- poker = text_poke_early;
-
if (type == JUMP_LABEL_JMP) {
if (init) {
expect = default_nop; line = __LINE__;
@@ -75,16 +71,19 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
bug_at((void *)jump_entry_code(entry), line);
/*
- * Make text_poke_bp() a default fallback poker.
+ * As long as only a single processor is running and the code is still
+ * not marked as RO, text_poke_early() can be used; Checking that
+ * system_state is SYSTEM_BOOTING guarantees it. It will be set to
+ * SYSTEM_SCHEDULING before other cores are awaken and before the
+ * code is write-protected.
*
* At the time the change is being done, just ignore whether we
* are doing nop -> jump or jump -> nop transition, and assume
* always nop being the 'currently valid' instruction
- *
*/
- if (poker) {
- (*poker)((void *)jump_entry_code(entry), code,
- JUMP_LABEL_NOP_SIZE);
+ if (init || system_state == SYSTEM_BOOTING) {
+ text_poke_early((void *)jump_entry_code(entry), code,
+ JUMP_LABEL_NOP_SIZE);
return;
}
@@ -96,7 +95,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
enum jump_label_type type)
{
mutex_lock(&text_mutex);
- __jump_label_transform(entry, type, NULL, 0);
+ __jump_label_transform(entry, type, 0);
mutex_unlock(&text_mutex);
}
@@ -126,5 +125,5 @@ __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
jlstate = JL_STATE_NO_UPDATE;
}
if (jlstate == JL_STATE_UPDATE)
- __jump_label_transform(entry, type, text_poke_early, 1);
+ __jump_label_transform(entry, type, 1);
}
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 4ff6b4cdb941..13b13311b792 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -747,7 +747,6 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
{
int err;
- char opc[BREAK_INSTR_SIZE];
bpt->type = BP_BREAKPOINT;
err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
@@ -759,18 +758,13 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
if (!err)
return err;
/*
- * It is safe to call text_poke() because normal kernel execution
+ * It is safe to call text_poke_kgdb() because normal kernel execution
* is stopped on all cores, so long as the text_mutex is not locked.
*/
if (mutex_is_locked(&text_mutex))
return -EBUSY;
- text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
- BREAK_INSTR_SIZE);
- err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
- if (err)
- return err;
- if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE))
- return -EINVAL;
+ text_poke_kgdb((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
+ BREAK_INSTR_SIZE);
bpt->type = BP_POKE_BREAKPOINT;
return err;
@@ -778,22 +772,17 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
{
- int err;
- char opc[BREAK_INSTR_SIZE];
-
if (bpt->type != BP_POKE_BREAKPOINT)
goto knl_write;
/*
- * It is safe to call text_poke() because normal kernel execution
+ * It is safe to call text_poke_kgdb() because normal kernel execution
* is stopped on all cores, so long as the text_mutex is not locked.
*/
if (mutex_is_locked(&text_mutex))
goto knl_write;
- text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE);
- err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
- if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
- goto knl_write;
- return err;
+ text_poke_kgdb((void *)bpt->bpt_addr, bpt->saved_instr,
+ BREAK_INSTR_SIZE);
+ return 0;
knl_write:
return probe_kernel_write((char *)bpt->bpt_addr,
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 122548ad5c2e..cf52ee0d8711 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -431,8 +431,21 @@ void *alloc_insn_page(void)
void *page;
page = module_alloc(PAGE_SIZE);
- if (page)
- set_memory_ro((unsigned long)page & PAGE_MASK, 1);
+ if (!page)
+ return NULL;
+
+ set_vm_flush_reset_perms(page);
+ /*
+ * First make the page read-only, and only then make it executable to
+ * prevent it from being W+X in between.
+ */
+ set_memory_ro((unsigned long)page, 1);
+
+ /*
+ * TODO: Once additional kernel code protection mechanisms are set, ensure
+ * that the page was not maliciously altered and it is still zeroed.
+ */
+ set_memory_x((unsigned long)page, 1);
return page;
}
@@ -440,8 +453,6 @@ void *alloc_insn_page(void)
/* Recover page to RW mode before releasing it */
void free_insn_page(void *page)
{
- set_memory_nx((unsigned long)page & PAGE_MASK, 1);
- set_memory_rw((unsigned long)page & PAGE_MASK, 1);
module_memfree(page);
}
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index b052e883dd8c..cfa3106faee4 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -87,7 +87,7 @@ void *module_alloc(unsigned long size)
p = __vmalloc_node_range(size, MODULE_ALIGN,
MODULES_VADDR + get_module_load_offset(),
MODULES_END, GFP_KERNEL,
- PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+ PAGE_KERNEL, 0, NUMA_NO_NODE,
__builtin_return_address(0));
if (p && (kasan_module_alloc(p, size) < 0)) {
vfree(p);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 4d1517022a14..0850b5149345 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -141,11 +141,11 @@ SECTIONS
*(.text.__x86.indirect_thunk)
__indirect_thunk_end = .;
#endif
-
- /* End of text section */
- _etext = .;
} :text = 0x9090
+ /* End of text section */
+ _etext = .;
+
NOTES :text :note
EXCEPTION_TABLE(16) :text = 0x9090