diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-10-06 07:59:37 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-10-06 07:59:37 -0700 |
commit | 82fa407da081d05323171577d86f62d77df17465 (patch) | |
tree | 4fd548ec817dc41a69170bb64761ace9abc071f3 /arch/arm/kernel/module-plts.c | |
parent | c7f5d36a3cc26e5068f4444aa22c4579e5eac85f (diff) | |
parent | 81a63001862f92d47c8c40a7fad870d5fbd8680b (diff) | |
download | linux-82fa407da081d05323171577d86f62d77df17465.tar.bz2 |
Merge branch 'for-linus' of git://git.armlinux.org.uk/~rmk/linux-arm
Pull ARM updates from Russell King:
- Correct ARMs dma-mapping to use the correct printk format strings.
- Avoid defining OBJCOPYFLAGS globally which upsets lkdtm rodata
testing.
- Cleanups to ARMs asm/memory.h include.
- L2 cache cleanups.
- Allow flat nommu binaries to be executed on ARM MMU systems.
- Kernel hardening - add more read-only after init annotations,
including making some kernel vdso variables const.
- Ensure AMBA primecell clocks are appropriately defaulted.
- ARM breakpoint cleanup.
- Various StrongARM 11x0 and companion chip (SA1111) updates to bring
this legacy platform to use more modern APIs for (eg) GPIOs and
interrupts, which will allow us in the future to reduce some of the
board-level driver clutter and elimate function callbacks into board
code via platform data. There still appears to be interest in these
platforms!
- Remove the now redundant secure_flush_area() API.
- Module PLT relocation optimisations. Ard says: This series of 4
patches optimizes the ARM PLT generation code that is invoked at
module load time, to get rid of the O(n^2) algorithm that results in
pathological load times of 10 seconds or more for large modules on
certain STB platforms.
- ARMv7M cache maintanence support.
- L2 cache PMU support
* 'for-linus' of git://git.armlinux.org.uk/~rmk/linux-arm: (35 commits)
ARM: sa1111: provide to_sa1111_device() macro
ARM: sa1111: add sa1111_get_irq()
ARM: sa1111: clean up duplication in IRQ chip implementation
ARM: sa1111: implement a gpio_chip for SA1111 GPIOs
ARM: sa1111: move irq cleanup to separate function
ARM: sa1111: use devm_clk_get()
ARM: sa1111: use devm_kzalloc()
ARM: sa1111: ensure we only touch RAB bus type devices when removing
ARM: 8611/1: l2x0: add PMU support
ARM: 8610/1: V7M: Add dsb before jumping in handler mode
ARM: 8609/1: V7M: Add support for the Cortex-M7 processor
ARM: 8608/1: V7M: Indirect proc_info construction for V7M CPUs
ARM: 8607/1: V7M: Wire up caches for V7M processors with cache support.
ARM: 8606/1: V7M: introduce cache operations
ARM: 8605/1: V7M: fix notrace variant of save_and_disable_irqs
ARM: 8604/1: V7M: Add support for reading the CTR with read_cpuid_cachetype()
ARM: 8603/1: V7M: Add addresses for mem-mapped V7M cache operations
ARM: 8602/1: factor out CSSELR/CCSIDR operations that use cp15 directly
ARM: kernel: avoid brute force search on PLT generation
ARM: kernel: sort relocation sections before allocating PLTs
...
Diffstat (limited to 'arch/arm/kernel/module-plts.c')
-rw-r--r-- | arch/arm/kernel/module-plts.c | 243 |
1 files changed, 144 insertions, 99 deletions
diff --git a/arch/arm/kernel/module-plts.c b/arch/arm/kernel/module-plts.c index 0c7efc3446c0..3a5cba90c971 100644 --- a/arch/arm/kernel/module-plts.c +++ b/arch/arm/kernel/module-plts.c @@ -9,6 +9,7 @@ #include <linux/elf.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/sort.h> #include <asm/cache.h> #include <asm/opcodes.h> @@ -30,154 +31,198 @@ struct plt_entries { u32 lit[PLT_ENT_COUNT]; }; -static bool in_init(const struct module *mod, u32 addr) +u32 get_module_plt(struct module *mod, unsigned long loc, Elf32_Addr val) { - return addr - (u32)mod->init_layout.base < mod->init_layout.size; + struct plt_entries *plt = (struct plt_entries *)mod->arch.plt->sh_addr; + int idx = 0; + + /* + * Look for an existing entry pointing to 'val'. Given that the + * relocations are sorted, this will be the last entry we allocated. + * (if one exists). + */ + if (mod->arch.plt_count > 0) { + plt += (mod->arch.plt_count - 1) / PLT_ENT_COUNT; + idx = (mod->arch.plt_count - 1) % PLT_ENT_COUNT; + + if (plt->lit[idx] == val) + return (u32)&plt->ldr[idx]; + + idx = (idx + 1) % PLT_ENT_COUNT; + if (!idx) + plt++; + } + + mod->arch.plt_count++; + BUG_ON(mod->arch.plt_count * PLT_ENT_SIZE > mod->arch.plt->sh_size); + + if (!idx) + /* Populate a new set of entries */ + *plt = (struct plt_entries){ + { [0 ... PLT_ENT_COUNT - 1] = PLT_ENT_LDR, }, + { val, } + }; + else + plt->lit[idx] = val; + + return (u32)&plt->ldr[idx]; } -u32 get_module_plt(struct module *mod, unsigned long loc, Elf32_Addr val) +#define cmp_3way(a,b) ((a) < (b) ? -1 : (a) > (b)) + +static int cmp_rel(const void *a, const void *b) { - struct plt_entries *plt, *plt_end; - int c, *count; - - if (in_init(mod, loc)) { - plt = (void *)mod->arch.init_plt->sh_addr; - plt_end = (void *)plt + mod->arch.init_plt->sh_size; - count = &mod->arch.init_plt_count; - } else { - plt = (void *)mod->arch.core_plt->sh_addr; - plt_end = (void *)plt + mod->arch.core_plt->sh_size; - count = &mod->arch.core_plt_count; - } + const Elf32_Rel *x = a, *y = b; + int i; - /* Look for an existing entry pointing to 'val' */ - for (c = *count; plt < plt_end; c -= PLT_ENT_COUNT, plt++) { - int i; - - if (!c) { - /* Populate a new set of entries */ - *plt = (struct plt_entries){ - { [0 ... PLT_ENT_COUNT - 1] = PLT_ENT_LDR, }, - { val, } - }; - ++*count; - return (u32)plt->ldr; - } - for (i = 0; i < PLT_ENT_COUNT; i++) { - if (!plt->lit[i]) { - plt->lit[i] = val; - ++*count; - } - if (plt->lit[i] == val) - return (u32)&plt->ldr[i]; - } + /* sort by type and symbol index */ + i = cmp_3way(ELF32_R_TYPE(x->r_info), ELF32_R_TYPE(y->r_info)); + if (i == 0) + i = cmp_3way(ELF32_R_SYM(x->r_info), ELF32_R_SYM(y->r_info)); + return i; +} + +static bool is_zero_addend_relocation(Elf32_Addr base, const Elf32_Rel *rel) +{ + u32 *tval = (u32 *)(base + rel->r_offset); + + /* + * Do a bitwise compare on the raw addend rather than fully decoding + * the offset and doing an arithmetic comparison. + * Note that a zero-addend jump/call relocation is encoded taking the + * PC bias into account, i.e., -8 for ARM and -4 for Thumb2. + */ + switch (ELF32_R_TYPE(rel->r_info)) { + u16 upper, lower; + + case R_ARM_THM_CALL: + case R_ARM_THM_JUMP24: + upper = __mem_to_opcode_thumb16(((u16 *)tval)[0]); + lower = __mem_to_opcode_thumb16(((u16 *)tval)[1]); + + return (upper & 0x7ff) == 0x7ff && (lower & 0x2fff) == 0x2ffe; + + case R_ARM_CALL: + case R_ARM_PC24: + case R_ARM_JUMP24: + return (__mem_to_opcode_arm(*tval) & 0xffffff) == 0xfffffe; } BUG(); } -static int duplicate_rel(Elf32_Addr base, const Elf32_Rel *rel, int num, - u32 mask) +static bool duplicate_rel(Elf32_Addr base, const Elf32_Rel *rel, int num) { - u32 *loc1, *loc2; - int i; + const Elf32_Rel *prev; - for (i = 0; i < num; i++) { - if (rel[i].r_info != rel[num].r_info) - continue; + /* + * Entries are sorted by type and symbol index. That means that, + * if a duplicate entry exists, it must be in the preceding + * slot. + */ + if (!num) + return false; - /* - * Identical relocation types against identical symbols can - * still result in different PLT entries if the addend in the - * place is different. So resolve the target of the relocation - * to compare the values. - */ - loc1 = (u32 *)(base + rel[i].r_offset); - loc2 = (u32 *)(base + rel[num].r_offset); - if (((*loc1 ^ *loc2) & mask) == 0) - return 1; - } - return 0; + prev = rel + num - 1; + return cmp_rel(rel + num, prev) == 0 && + is_zero_addend_relocation(base, prev); } /* Count how many PLT entries we may need */ -static unsigned int count_plts(Elf32_Addr base, const Elf32_Rel *rel, int num) +static unsigned int count_plts(const Elf32_Sym *syms, Elf32_Addr base, + const Elf32_Rel *rel, int num) { unsigned int ret = 0; + const Elf32_Sym *s; int i; - /* - * Sure, this is order(n^2), but it's usually short, and not - * time critical - */ - for (i = 0; i < num; i++) + for (i = 0; i < num; i++) { switch (ELF32_R_TYPE(rel[i].r_info)) { case R_ARM_CALL: case R_ARM_PC24: case R_ARM_JUMP24: - if (!duplicate_rel(base, rel, i, - __opcode_to_mem_arm(0x00ffffff))) - ret++; - break; -#ifdef CONFIG_THUMB2_KERNEL case R_ARM_THM_CALL: case R_ARM_THM_JUMP24: - if (!duplicate_rel(base, rel, i, - __opcode_to_mem_thumb32(0x07ff2fff))) + /* + * We only have to consider branch targets that resolve + * to undefined symbols. This is not simply a heuristic, + * it is a fundamental limitation, since the PLT itself + * is part of the module, and needs to be within range + * as well, so modules can never grow beyond that limit. + */ + s = syms + ELF32_R_SYM(rel[i].r_info); + if (s->st_shndx != SHN_UNDEF) + break; + + /* + * Jump relocations with non-zero addends against + * undefined symbols are supported by the ELF spec, but + * do not occur in practice (e.g., 'jump n bytes past + * the entry point of undefined function symbol f'). + * So we need to support them, but there is no need to + * take them into consideration when trying to optimize + * this code. So let's only check for duplicates when + * the addend is zero. + */ + if (!is_zero_addend_relocation(base, rel + i) || + !duplicate_rel(base, rel, i)) ret++; -#endif } + } return ret; } int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, struct module *mod) { - unsigned long core_plts = 0, init_plts = 0; + unsigned long plts = 0; Elf32_Shdr *s, *sechdrs_end = sechdrs + ehdr->e_shnum; + Elf32_Sym *syms = NULL; /* * To store the PLTs, we expand the .text section for core module code - * and the .init.text section for initialization code. + * and for initialization code. */ - for (s = sechdrs; s < sechdrs_end; ++s) - if (strcmp(".core.plt", secstrings + s->sh_name) == 0) - mod->arch.core_plt = s; - else if (strcmp(".init.plt", secstrings + s->sh_name) == 0) - mod->arch.init_plt = s; - - if (!mod->arch.core_plt || !mod->arch.init_plt) { - pr_err("%s: sections missing\n", mod->name); + for (s = sechdrs; s < sechdrs_end; ++s) { + if (strcmp(".plt", secstrings + s->sh_name) == 0) + mod->arch.plt = s; + else if (s->sh_type == SHT_SYMTAB) + syms = (Elf32_Sym *)s->sh_addr; + } + + if (!mod->arch.plt) { + pr_err("%s: module PLT section missing\n", mod->name); + return -ENOEXEC; + } + if (!syms) { + pr_err("%s: module symtab section missing\n", mod->name); return -ENOEXEC; } for (s = sechdrs + 1; s < sechdrs_end; ++s) { - const Elf32_Rel *rels = (void *)ehdr + s->sh_offset; + Elf32_Rel *rels = (void *)ehdr + s->sh_offset; int numrels = s->sh_size / sizeof(Elf32_Rel); Elf32_Shdr *dstsec = sechdrs + s->sh_info; if (s->sh_type != SHT_REL) continue; - if (strstr(secstrings + s->sh_name, ".init")) - init_plts += count_plts(dstsec->sh_addr, rels, numrels); - else - core_plts += count_plts(dstsec->sh_addr, rels, numrels); + /* ignore relocations that operate on non-exec sections */ + if (!(dstsec->sh_flags & SHF_EXECINSTR)) + continue; + + /* sort by type and symbol index */ + sort(rels, numrels, sizeof(Elf32_Rel), cmp_rel, NULL); + + plts += count_plts(syms, dstsec->sh_addr, rels, numrels); } - mod->arch.core_plt->sh_type = SHT_NOBITS; - mod->arch.core_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; - mod->arch.core_plt->sh_addralign = L1_CACHE_BYTES; - mod->arch.core_plt->sh_size = round_up(core_plts * PLT_ENT_SIZE, - sizeof(struct plt_entries)); - mod->arch.core_plt_count = 0; - - mod->arch.init_plt->sh_type = SHT_NOBITS; - mod->arch.init_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; - mod->arch.init_plt->sh_addralign = L1_CACHE_BYTES; - mod->arch.init_plt->sh_size = round_up(init_plts * PLT_ENT_SIZE, - sizeof(struct plt_entries)); - mod->arch.init_plt_count = 0; - pr_debug("%s: core.plt=%x, init.plt=%x\n", __func__, - mod->arch.core_plt->sh_size, mod->arch.init_plt->sh_size); + mod->arch.plt->sh_type = SHT_NOBITS; + mod->arch.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; + mod->arch.plt->sh_addralign = L1_CACHE_BYTES; + mod->arch.plt->sh_size = round_up(plts * PLT_ENT_SIZE, + sizeof(struct plt_entries)); + mod->arch.plt_count = 0; + + pr_debug("%s: plt=%x\n", __func__, mod->arch.plt->sh_size); return 0; } |