Merge branch 'for-linus' of git://git.armlinux.org.uk/~rmk/linux-arm

Pull ARM updates from Russell King: - Correct ARMs dma-mapping to use the correct printk format strings. - Avoid defining OBJCOPYFLAGS globally which upsets lkdtm rodata testing. - Cleanups to ARMs asm/memory.h include. - L2 cache cleanups. - Allow flat nommu binaries to be executed on ARM MMU systems. - Kernel hardening - add more read-only after init annotations, including making some kernel vdso variables const. - Ensure AMBA primecell clocks are appropriately defaulted. - ARM breakpoint cleanup. - Various StrongARM 11x0 and companion chip (SA1111) updates to bring this legacy platform to use more modern APIs for (eg) GPIOs and interrupts, which will allow us in the future to reduce some of the board-level driver clutter and elimate function callbacks into board code via platform data. There still appears to be interest in these platforms! - Remove the now redundant secure_flush_area() API. - Module PLT relocation optimisations. Ard says: This series of 4 patches optimizes the ARM PLT generation code that is invoked at module load time, to get rid of the O(n^2) algorithm that results in pathological load times of 10 seconds or more for large modules on certain STB platforms. - ARMv7M cache maintanence support. - L2 cache PMU support * 'for-linus' of git://git.armlinux.org.uk/~rmk/linux-arm: (35 commits) ARM: sa1111: provide to_sa1111_device() macro ARM: sa1111: add sa1111_get_irq() ARM: sa1111: clean up duplication in IRQ chip implementation ARM: sa1111: implement a gpio_chip for SA1111 GPIOs ARM: sa1111: move irq cleanup to separate function ARM: sa1111: use devm_clk_get() ARM: sa1111: use devm_kzalloc() ARM: sa1111: ensure we only touch RAB bus type devices when removing ARM: 8611/1: l2x0: add PMU support ARM: 8610/1: V7M: Add dsb before jumping in handler mode ARM: 8609/1: V7M: Add support for the Cortex-M7 processor ARM: 8608/1: V7M: Indirect proc_info construction for V7M CPUs ARM: 8607/1: V7M: Wire up caches for V7M processors with cache support. ARM: 8606/1: V7M: introduce cache operations ARM: 8605/1: V7M: fix notrace variant of save_and_disable_irqs ARM: 8604/1: V7M: Add support for reading the CTR with read_cpuid_cachetype() ARM: 8603/1: V7M: Add addresses for mem-mapped V7M cache operations ARM: 8602/1: factor out CSSELR/CCSIDR operations that use cp15 directly ARM: kernel: avoid brute force search on PLT generation ARM: kernel: sort relocation sections before allocating PLTs ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2016-10-06 07:59:37 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2016-10-06 07:59:37 -0700
commit: 82fa407da081d05323171577d86f62d77df17465 (patch)
tree: 4fd548ec817dc41a69170bb64761ace9abc071f3 /arch/arm/kernel/module-plts.c
parent: c7f5d36a3cc26e5068f4444aa22c4579e5eac85f (diff)
parent: 81a63001862f92d47c8c40a7fad870d5fbd8680b (diff)
download: linux-82fa407da081d05323171577d86f62d77df17465.tar.bz2
1 files changed, 144 insertions, 99 deletions
diff --git a/arch/arm/kernel/module-plts.c b/arch/arm/kernel/module-plts.c
index 0c7efc3446c0..3a5cba90c971 100644
--- a/arch/arm/kernel/module-plts.c
+++ b/arch/arm/kernel/module-plts.c
@@ -9,6 +9,7 @@
 #include <linux/elf.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/sort.h>
 
 #include <asm/cache.h>
 #include <asm/opcodes.h>
@@ -30,154 +31,198 @@ struct plt_entries {
 	u32	lit[PLT_ENT_COUNT];
 };
 
-static bool in_init(const struct module *mod, u32 addr)
+u32 get_module_plt(struct module *mod, unsigned long loc, Elf32_Addr val)
 {
-	return addr - (u32)mod->init_layout.base < mod->init_layout.size;
+	struct plt_entries *plt = (struct plt_entries *)mod->arch.plt->sh_addr;
+	int idx = 0;
+
+	/*
+	 * Look for an existing entry pointing to 'val'. Given that the
+	 * relocations are sorted, this will be the last entry we allocated.
+	 * (if one exists).
+	 */
+	if (mod->arch.plt_count > 0) {
+		plt += (mod->arch.plt_count - 1) / PLT_ENT_COUNT;
+		idx = (mod->arch.plt_count - 1) % PLT_ENT_COUNT;
+
+		if (plt->lit[idx] == val)
+			return (u32)&plt->ldr[idx];
+
+		idx = (idx + 1) % PLT_ENT_COUNT;
+		if (!idx)
+			plt++;
+	}
+
+	mod->arch.plt_count++;
+	BUG_ON(mod->arch.plt_count * PLT_ENT_SIZE > mod->arch.plt->sh_size);
+
+	if (!idx)
+		/* Populate a new set of entries */
+		*plt = (struct plt_entries){
+			{ [0 ... PLT_ENT_COUNT - 1] = PLT_ENT_LDR, },
+			{ val, }
+		};
+	else
+		plt->lit[idx] = val;
+
+	return (u32)&plt->ldr[idx];
 }
 
-u32 get_module_plt(struct module *mod, unsigned long loc, Elf32_Addr val)
+#define cmp_3way(a,b)	((a) < (b) ? -1 : (a) > (b))
+
+static int cmp_rel(const void *a, const void *b)
 {
-	struct plt_entries *plt, *plt_end;
-	int c, *count;
-
-	if (in_init(mod, loc)) {
-		plt = (void *)mod->arch.init_plt->sh_addr;
-		plt_end = (void *)plt + mod->arch.init_plt->sh_size;
-		count = &mod->arch.init_plt_count;
-	} else {
-		plt = (void *)mod->arch.core_plt->sh_addr;
-		plt_end = (void *)plt + mod->arch.core_plt->sh_size;
-		count = &mod->arch.core_plt_count;
-	}
+	const Elf32_Rel *x = a, *y = b;
+	int i;
 
-	/* Look for an existing entry pointing to 'val' */
-	for (c = *count; plt < plt_end; c -= PLT_ENT_COUNT, plt++) {
-		int i;
-
-		if (!c) {
-			/* Populate a new set of entries */
-			*plt = (struct plt_entries){
-				{ [0 ... PLT_ENT_COUNT - 1] = PLT_ENT_LDR, },
-				{ val, }
-			};
-			++*count;
-			return (u32)plt->ldr;
-		}
-		for (i = 0; i < PLT_ENT_COUNT; i++) {
-			if (!plt->lit[i]) {
-				plt->lit[i] = val;
-				++*count;
-			}
-			if (plt->lit[i] == val)
-				return (u32)&plt->ldr[i];
-		}
+	/* sort by type and symbol index */
+	i = cmp_3way(ELF32_R_TYPE(x->r_info), ELF32_R_TYPE(y->r_info));
+	if (i == 0)
+		i = cmp_3way(ELF32_R_SYM(x->r_info), ELF32_R_SYM(y->r_info));
+	return i;
+}
+
+static bool is_zero_addend_relocation(Elf32_Addr base, const Elf32_Rel *rel)
+{
+	u32 *tval = (u32 *)(base + rel->r_offset);
+
+	/*
+	 * Do a bitwise compare on the raw addend rather than fully decoding
+	 * the offset and doing an arithmetic comparison.
+	 * Note that a zero-addend jump/call relocation is encoded taking the
+	 * PC bias into account, i.e., -8 for ARM and -4 for Thumb2.
+	 */
+	switch (ELF32_R_TYPE(rel->r_info)) {
+		u16 upper, lower;
+
+	case R_ARM_THM_CALL:
+	case R_ARM_THM_JUMP24:
+		upper = __mem_to_opcode_thumb16(((u16 *)tval)[0]);
+		lower = __mem_to_opcode_thumb16(((u16 *)tval)[1]);
+
+		return (upper & 0x7ff) == 0x7ff && (lower & 0x2fff) == 0x2ffe;
+
+	case R_ARM_CALL:
+	case R_ARM_PC24:
+	case R_ARM_JUMP24:
+		return (__mem_to_opcode_arm(*tval) & 0xffffff) == 0xfffffe;
 	}
 	BUG();
 }
 
-static int duplicate_rel(Elf32_Addr base, const Elf32_Rel *rel, int num,
-			   u32 mask)
+static bool duplicate_rel(Elf32_Addr base, const Elf32_Rel *rel, int num)
 {
-	u32 *loc1, *loc2;
-	int i;
+	const Elf32_Rel *prev;
 
-	for (i = 0; i < num; i++) {
-		if (rel[i].r_info != rel[num].r_info)
-			continue;
+	/*
+	 * Entries are sorted by type and symbol index. That means that,
+	 * if a duplicate entry exists, it must be in the preceding
+	 * slot.
+	 */
+	if (!num)
+		return false;
 
-		/*
-		 * Identical relocation types against identical symbols can
-		 * still result in different PLT entries if the addend in the
-		 * place is different. So resolve the target of the relocation
-		 * to compare the values.
-		 */
-		loc1 = (u32 *)(base + rel[i].r_offset);
-		loc2 = (u32 *)(base + rel[num].r_offset);
-		if (((*loc1 ^ *loc2) & mask) == 0)
-			return 1;
-	}
-	return 0;
+	prev = rel + num - 1;
+	return cmp_rel(rel + num, prev) == 0 &&
+	       is_zero_addend_relocation(base, prev);
 }
 
 /* Count how many PLT entries we may need */
-static unsigned int count_plts(Elf32_Addr base, const Elf32_Rel *rel, int num)
+static unsigned int count_plts(const Elf32_Sym *syms, Elf32_Addr base,
+			       const Elf32_Rel *rel, int num)
 {
 	unsigned int ret = 0;
+	const Elf32_Sym *s;
 	int i;
 
-	/*
-	 * Sure, this is order(n^2), but it's usually short, and not
-	 * time critical
-	 */
-	for (i = 0; i < num; i++)
+	for (i = 0; i < num; i++) {
 		switch (ELF32_R_TYPE(rel[i].r_info)) {
 		case R_ARM_CALL:
 		case R_ARM_PC24:
 		case R_ARM_JUMP24:
-			if (!duplicate_rel(base, rel, i,
-					   __opcode_to_mem_arm(0x00ffffff)))
-				ret++;
-			break;
-#ifdef CONFIG_THUMB2_KERNEL
 		case R_ARM_THM_CALL:
 		case R_ARM_THM_JUMP24:
-			if (!duplicate_rel(base, rel, i,
-					   __opcode_to_mem_thumb32(0x07ff2fff)))
+			/*
+			 * We only have to consider branch targets that resolve
+			 * to undefined symbols. This is not simply a heuristic,
+			 * it is a fundamental limitation, since the PLT itself
+			 * is part of the module, and needs to be within range
+			 * as well, so modules can never grow beyond that limit.
+			 */
+			s = syms + ELF32_R_SYM(rel[i].r_info);
+			if (s->st_shndx != SHN_UNDEF)
+				break;
+
+			/*
+			 * Jump relocations with non-zero addends against
+			 * undefined symbols are supported by the ELF spec, but
+			 * do not occur in practice (e.g., 'jump n bytes past
+			 * the entry point of undefined function symbol f').
+			 * So we need to support them, but there is no need to
+			 * take them into consideration when trying to optimize
+			 * this code. So let's only check for duplicates when
+			 * the addend is zero.
+			 */
+			if (!is_zero_addend_relocation(base, rel + i) ||
+			    !duplicate_rel(base, rel, i))
 				ret++;
-#endif
 		}
+	}
 	return ret;
 }
 
 int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 			      char *secstrings, struct module *mod)
 {
-	unsigned long core_plts = 0, init_plts = 0;
+	unsigned long plts = 0;
 	Elf32_Shdr *s, *sechdrs_end = sechdrs + ehdr->e_shnum;
+	Elf32_Sym *syms = NULL;
 
 	/*
 	 * To store the PLTs, we expand the .text section for core module code
-	 * and the .init.text section for initialization code.
+	 * and for initialization code.
 	 */
-	for (s = sechdrs; s < sechdrs_end; ++s)
-		if (strcmp(".core.plt", secstrings + s->sh_name) == 0)
-			mod->arch.core_plt = s;
-		else if (strcmp(".init.plt", secstrings + s->sh_name) == 0)
-			mod->arch.init_plt = s;
-
-	if (!mod->arch.core_plt || !mod->arch.init_plt) {
-		pr_err("%s: sections missing\n", mod->name);
+	for (s = sechdrs; s < sechdrs_end; ++s) {
+		if (strcmp(".plt", secstrings + s->sh_name) == 0)
+			mod->arch.plt = s;
+		else if (s->sh_type == SHT_SYMTAB)
+			syms = (Elf32_Sym *)s->sh_addr;
+	}
+
+	if (!mod->arch.plt) {
+		pr_err("%s: module PLT section missing\n", mod->name);
+		return -ENOEXEC;
+	}
+	if (!syms) {
+		pr_err("%s: module symtab section missing\n", mod->name);
 		return -ENOEXEC;
 	}
 
 	for (s = sechdrs + 1; s < sechdrs_end; ++s) {
-		const Elf32_Rel *rels = (void *)ehdr + s->sh_offset;
+		Elf32_Rel *rels = (void *)ehdr + s->sh_offset;
 		int numrels = s->sh_size / sizeof(Elf32_Rel);
 		Elf32_Shdr *dstsec = sechdrs + s->sh_info;
 
 		if (s->sh_type != SHT_REL)
 			continue;
 
-		if (strstr(secstrings + s->sh_name, ".init"))
-			init_plts += count_plts(dstsec->sh_addr, rels, numrels);
-		else
-			core_plts += count_plts(dstsec->sh_addr, rels, numrels);
+		/* ignore relocations that operate on non-exec sections */
+		if (!(dstsec->sh_flags & SHF_EXECINSTR))
+			continue;
+
+		/* sort by type and symbol index */
+		sort(rels, numrels, sizeof(Elf32_Rel), cmp_rel, NULL);
+
+		plts += count_plts(syms, dstsec->sh_addr, rels, numrels);
 	}
 
-	mod->arch.core_plt->sh_type = SHT_NOBITS;
-	mod->arch.core_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
-	mod->arch.core_plt->sh_addralign = L1_CACHE_BYTES;
-	mod->arch.core_plt->sh_size = round_up(core_plts * PLT_ENT_SIZE,
-					       sizeof(struct plt_entries));
-	mod->arch.core_plt_count = 0;
-
-	mod->arch.init_plt->sh_type = SHT_NOBITS;
-	mod->arch.init_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
-	mod->arch.init_plt->sh_addralign = L1_CACHE_BYTES;
-	mod->arch.init_plt->sh_size = round_up(init_plts * PLT_ENT_SIZE,
-					       sizeof(struct plt_entries));
-	mod->arch.init_plt_count = 0;
-	pr_debug("%s: core.plt=%x, init.plt=%x\n", __func__,
-		 mod->arch.core_plt->sh_size, mod->arch.init_plt->sh_size);
+	mod->arch.plt->sh_type = SHT_NOBITS;
+	mod->arch.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
+	mod->arch.plt->sh_addralign = L1_CACHE_BYTES;
+	mod->arch.plt->sh_size = round_up(plts * PLT_ENT_SIZE,
+					  sizeof(struct plt_entries));
+	mod->arch.plt_count = 0;
+
+	pr_debug("%s: plt=%x\n", __func__, mod->arch.plt->sh_size);
 	return 0;
 }
author	Linus Torvalds <torvalds@linux-foundation.org>	2016-10-06 07:59:37 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-10-06 07:59:37 -0700
commit	82fa407da081d05323171577d86f62d77df17465 (patch)
tree	4fd548ec817dc41a69170bb64761ace9abc071f3 /arch/arm/kernel/module-plts.c
parent	c7f5d36a3cc26e5068f4444aa22c4579e5eac85f (diff)
parent	81a63001862f92d47c8c40a7fad870d5fbd8680b (diff)
download	linux-82fa407da081d05323171577d86f62d77df17465.tar.bz2