From c7225494be79f8629c9166b106d6b1febf2a882f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 18 Feb 2020 12:34:05 +0100 Subject: efi/arm: Work around missing cache maintenance in decompressor handover The EFI stub executes within the context of the zImage as it was loaded by the firmware, which means it is treated as an ordinary PE/COFF executable, which is loaded into memory, and cleaned to the PoU to ensure that it can be executed safely while the MMU and caches are on. When the EFI stub hands over to the decompressor, we clean the caches by set/way and disable the MMU and D-cache, to comply with the Linux boot protocol for ARM. However, cache maintenance by set/way is not sufficient to ensure that subsequent instruction fetches and data accesses done with the MMU off see the correct data. This means that proceeding as we do currently is not safe, especially since we also perform data accesses with the MMU off, from a literal pool as well as the stack. So let's kick this can down the road a bit, and jump into the relocated zImage before disabling the caches. This removes the requirement to perform any by-VA cache maintenance on the original PE/COFF executable, but it does require that the relocated zImage is cleaned to the PoC, which is currently not the case. This will be addressed in a subsequent patch. Signed-off-by: Ard Biesheuvel --- arch/arm/boot/compressed/head.S | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'arch/arm') diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S index 088b0a060876..39f7071d47c7 100644 --- a/arch/arm/boot/compressed/head.S +++ b/arch/arm/boot/compressed/head.S @@ -1461,6 +1461,17 @@ ENTRY(efi_stub_entry) @ Preserve return value of efi_entry() in r4 mov r4, r0 bl cache_clean_flush + + @ The PE/COFF loader might not have cleaned the code we are + @ running beyond the PoU, and so calling cache_off below from + @ inside the PE/COFF loader allocated region is unsafe. Let's + @ assume our own zImage relocation code did a better job, and + @ jump into its version of this routine before proceeding. + ldr r0, [sp] @ relocated zImage + ldr r1, .Ljmp + sub r1, r0, r1 + mov pc, r1 @ no mode switch +0: bl cache_off @ Set parameters for booting zImage according to boot protocol @@ -1469,18 +1480,15 @@ ENTRY(efi_stub_entry) mov r0, #0 mov r1, #0xFFFFFFFF mov r2, r4 - - @ Branch to (possibly) relocated zImage that is in [sp] - ldr lr, [sp] - ldr ip, =start_offset - add lr, lr, ip - mov pc, lr @ no mode switch + b __efi_start efi_load_fail: @ Return EFI_LOAD_ERROR to EFI firmware on error. ldr r0, =0x80000001 ldmfd sp!, {ip, pc} ENDPROC(efi_stub_entry) + .align 2 +.Ljmp: .long start - 0b #endif .align -- cgit v1.2.3 From e951a1f427f2312e17b4e0f485e60068ca1423bb Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 19 Feb 2020 00:09:48 +0100 Subject: efi/arm: Pass start and end addresses to cache_clean_flush() In preparation for turning the decompressor's cache clean/flush operations into proper by-VA maintenance for v7 cores, pass the start and end addresses of the regions that need cache maintenance into cache_clean_flush in registers r0 and r1. Currently, all implementations of cache_clean_flush ignore these values, so no functional change is expected as a result of this patch. Signed-off-by: Ard Biesheuvel --- arch/arm/boot/compressed/head.S | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/arm') diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S index 39f7071d47c7..8487221bedb0 100644 --- a/arch/arm/boot/compressed/head.S +++ b/arch/arm/boot/compressed/head.S @@ -1460,6 +1460,12 @@ ENTRY(efi_stub_entry) @ Preserve return value of efi_entry() in r4 mov r4, r0 + add r1, r4, #SZ_2M @ DT end + bl cache_clean_flush + + ldr r0, [sp] @ relocated zImage + ldr r1, =_edata @ size of zImage + add r1, r1, r0 @ end of zImage bl cache_clean_flush @ The PE/COFF loader might not have cleaned the code we are -- cgit v1.2.3 From 184bf653a7a452c18b29136e6ef59972af288c7e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 24 Feb 2020 11:15:27 +0100 Subject: ARM: decompressor: factor out routine to obtain the inflated image size Before adding another reference to the inflated image size, factor out the slightly complicated way of loading the unaligned little-endian constant from the end of the compressed data. Tested-by: Tony Lindgren Tested-by: Linus Walleij Signed-off-by: Ard Biesheuvel --- arch/arm/boot/compressed/head.S | 43 +++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 17 deletions(-) (limited to 'arch/arm') diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S index 8487221bedb0..d45952aae2b5 100644 --- a/arch/arm/boot/compressed/head.S +++ b/arch/arm/boot/compressed/head.S @@ -151,6 +151,25 @@ .L_\@: .endm + /* + * The kernel build system appends the size of the + * decompressed kernel at the end of the compressed data + * in little-endian form. + */ + .macro get_inflated_image_size, res:req, tmp1:req, tmp2:req + adr \res, .Linflated_image_size_offset + ldr \tmp1, [\res] + add \tmp1, \tmp1, \res @ address of inflated image size + + ldrb \res, [\tmp1] @ get_unaligned_le32 + ldrb \tmp2, [\tmp1, #1] + orr \res, \res, \tmp2, lsl #8 + ldrb \tmp2, [\tmp1, #2] + ldrb \tmp1, [\tmp1, #3] + orr \res, \res, \tmp2, lsl #16 + orr \res, \res, \tmp1, lsl #24 + .endm + .section ".start", "ax" /* * sort out different calling conventions @@ -268,15 +287,15 @@ not_angel: */ mov r0, pc cmp r0, r4 - ldrcc r0, LC0+32 + ldrcc r0, LC0+28 addcc r0, r0, pc cmpcc r4, r0 orrcc r4, r4, #1 @ remember we skipped cache_on blcs cache_on restart: adr r0, LC0 - ldmia r0, {r1, r2, r3, r6, r10, r11, r12} - ldr sp, [r0, #28] + ldmia r0, {r1, r2, r3, r6, r11, r12} + ldr sp, [r0, #24] /* * We might be running at a different address. We need @@ -284,20 +303,8 @@ restart: adr r0, LC0 */ sub r0, r0, r1 @ calculate the delta offset add r6, r6, r0 @ _edata - add r10, r10, r0 @ inflated kernel size location - /* - * The kernel build system appends the size of the - * decompressed kernel at the end of the compressed data - * in little-endian form. - */ - ldrb r9, [r10, #0] - ldrb lr, [r10, #1] - orr r9, r9, lr, lsl #8 - ldrb lr, [r10, #2] - ldrb r10, [r10, #3] - orr r9, r9, lr, lsl #16 - orr r9, r9, r10, lsl #24 + get_inflated_image_size r9, r10, lr #ifndef CONFIG_ZBOOT_ROM /* malloc space is above the relocated stack (64k max) */ @@ -652,13 +659,15 @@ LC0: .word LC0 @ r1 .word __bss_start @ r2 .word _end @ r3 .word _edata @ r6 - .word input_data_end - 4 @ r10 (inflated size location) .word _got_start @ r11 .word _got_end @ ip .word .L_user_stack_end @ sp .word _end - restart + 16384 + 1024*1024 .size LC0, . - LC0 +.Linflated_image_size_offset: + .long (input_data_end - 4) - . + #ifdef CONFIG_ARCH_RPC .globl params params: ldr r0, =0x10000100 @ params_phys for RPC -- cgit v1.2.3 From e114412f616446708b3d1e559ff4af9db7ade46e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 18 Feb 2020 16:50:54 +0100 Subject: ARM: decompressor: prepare cache_clean_flush for doing by-VA maintenance In preparation for turning the decompressor's cache clean/flush operations into proper by-VA maintenance for v7 cores, pass the start and end addresses of the regions that need cache maintenance into cache_clean_flush in registers r0 and r1. Currently, all implementations of cache_clean_flush ignore these values, so no functional change is expected as a result of this patch. Tested-by: Tony Lindgren Tested-by: Linus Walleij Signed-off-by: Ard Biesheuvel --- arch/arm/boot/compressed/head.S | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/arm') diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S index d45952aae2b5..f90034151aef 100644 --- a/arch/arm/boot/compressed/head.S +++ b/arch/arm/boot/compressed/head.S @@ -528,6 +528,8 @@ dtb_check_done: /* Preserve offset to relocated code. */ sub r6, r9, r6 + mov r0, r9 @ start of relocated zImage + add r1, sp, r6 @ end of relocated zImage #ifndef CONFIG_ZBOOT_ROM /* cache_clean_flush may use the stack, so relocate it */ add sp, sp, r6 @@ -629,6 +631,11 @@ not_relocated: mov r0, #0 add r2, sp, #0x10000 @ 64k max mov r3, r7 bl decompress_kernel + + get_inflated_image_size r1, r2, r3 + + mov r0, r4 @ start of inflated image + add r1, r1, r0 @ end of inflated image bl cache_clean_flush bl cache_off @@ -1182,6 +1189,9 @@ __armv7_mmu_cache_off: /* * Clean and flush the cache to maintain consistency. * + * On entry, + * r0 = start address + * r1 = end address (exclusive) * On exit, * r1, r2, r3, r9, r10, r11, r12 corrupted * This routine must preserve: -- cgit v1.2.3 From 401b368caaecdce1cf8f05bab448172752230cb0 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 18 Feb 2020 17:06:14 +0100 Subject: ARM: decompressor: switch to by-VA cache maintenance for v7 cores Update the v7 cache_clean_flush routine to take into account the memory range passed in r0/r1, and perform cache maintenance by virtual address on this range instead of set/way maintenance, which is inappropriate for the purpose of maintaining the cached state of memory contents. Since this removes any use of the stack in the implementation of cache_clean_flush(), we can also drop some code that manages the value of the stack pointer before calling it. Tested-by: Tony Lindgren Tested-by: Linus Walleij Signed-off-by: Ard Biesheuvel --- arch/arm/boot/compressed/head.S | 83 +++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 53 deletions(-) (limited to 'arch/arm') diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S index f90034151aef..4f7c6145e31f 100644 --- a/arch/arm/boot/compressed/head.S +++ b/arch/arm/boot/compressed/head.S @@ -530,11 +530,6 @@ dtb_check_done: mov r0, r9 @ start of relocated zImage add r1, sp, r6 @ end of relocated zImage -#ifndef CONFIG_ZBOOT_ROM - /* cache_clean_flush may use the stack, so relocate it */ - add sp, sp, r6 -#endif - bl cache_clean_flush badr r0, restart @@ -683,6 +678,24 @@ params: ldr r0, =0x10000100 @ params_phys for RPC .align #endif +/* + * dcache_line_size - get the minimum D-cache line size from the CTR register + * on ARMv7. + */ + .macro dcache_line_size, reg, tmp +#ifdef CONFIG_CPU_V7M + movw \tmp, #:lower16:BASEADDR_V7M_SCB + V7M_SCB_CTR + movt \tmp, #:upper16:BASEADDR_V7M_SCB + V7M_SCB_CTR + ldr \tmp, [\tmp] +#else + mrc p15, 0, \tmp, c0, c0, 1 @ read ctr +#endif + lsr \tmp, \tmp, #16 + and \tmp, \tmp, #0xf @ cache line size encoding + mov \reg, #4 @ bytes per word + mov \reg, \reg, lsl \tmp @ actual cache line size + .endm + /* * Turn on the cache. We need to setup some page tables so that we * can have both the I and D caches on. @@ -1175,8 +1188,6 @@ __armv7_mmu_cache_off: bic r0, r0, #0x000c #endif mcr p15, 0, r0, c1, c0 @ turn MMU and cache off - mov r12, lr - bl __armv7_mmu_cache_flush mov r0, #0 #ifdef CONFIG_MMU mcr p15, 0, r0, c8, c7, 0 @ invalidate whole TLB @@ -1184,7 +1195,7 @@ __armv7_mmu_cache_off: mcr p15, 0, r0, c7, c5, 6 @ invalidate BTC mcr p15, 0, r0, c7, c10, 4 @ DSB mcr p15, 0, r0, c7, c5, 4 @ ISB - mov pc, r12 + mov pc, lr /* * Clean and flush the cache to maintain consistency. @@ -1200,6 +1211,7 @@ __armv7_mmu_cache_off: .align 5 cache_clean_flush: mov r3, #16 + mov r11, r1 b call_cache_fn __armv4_mpu_cache_flush: @@ -1250,51 +1262,16 @@ __armv7_mmu_cache_flush: mcr p15, 0, r10, c7, c14, 0 @ clean+invalidate D b iflush hierarchical: - mcr p15, 0, r10, c7, c10, 5 @ DMB - stmfd sp!, {r0-r7, r9-r11} - mrc p15, 1, r0, c0, c0, 1 @ read clidr - ands r3, r0, #0x7000000 @ extract loc from clidr - mov r3, r3, lsr #23 @ left align loc bit field - beq finished @ if loc is 0, then no need to clean - mov r10, #0 @ start clean at cache level 0 -loop1: - add r2, r10, r10, lsr #1 @ work out 3x current cache level - mov r1, r0, lsr r2 @ extract cache type bits from clidr - and r1, r1, #7 @ mask of the bits for current cache only - cmp r1, #2 @ see what cache we have at this level - blt skip @ skip if no cache, or just i-cache - mcr p15, 2, r10, c0, c0, 0 @ select current cache level in cssr - mcr p15, 0, r10, c7, c5, 4 @ isb to sych the new cssr&csidr - mrc p15, 1, r1, c0, c0, 0 @ read the new csidr - and r2, r1, #7 @ extract the length of the cache lines - add r2, r2, #4 @ add 4 (line length offset) - ldr r4, =0x3ff - ands r4, r4, r1, lsr #3 @ find maximum number on the way size - clz r5, r4 @ find bit position of way size increment - ldr r7, =0x7fff - ands r7, r7, r1, lsr #13 @ extract max number of the index size -loop2: - mov r9, r4 @ create working copy of max way size -loop3: - ARM( orr r11, r10, r9, lsl r5 ) @ factor way and cache number into r11 - ARM( orr r11, r11, r7, lsl r2 ) @ factor index number into r11 - THUMB( lsl r6, r9, r5 ) - THUMB( orr r11, r10, r6 ) @ factor way and cache number into r11 - THUMB( lsl r6, r7, r2 ) - THUMB( orr r11, r11, r6 ) @ factor index number into r11 - mcr p15, 0, r11, c7, c14, 2 @ clean & invalidate by set/way - subs r9, r9, #1 @ decrement the way - bge loop3 - subs r7, r7, #1 @ decrement the index - bge loop2 -skip: - add r10, r10, #2 @ increment cache number - cmp r3, r10 - bgt loop1 -finished: - ldmfd sp!, {r0-r7, r9-r11} - mov r10, #0 @ switch back to cache level 0 - mcr p15, 2, r10, c0, c0, 0 @ select current cache level in cssr + dcache_line_size r1, r2 @ r1 := dcache min line size + sub r2, r1, #1 @ r2 := line size mask + bic r0, r0, r2 @ round down start to line size + sub r11, r11, #1 @ end address is exclusive + bic r11, r11, r2 @ round down end to line size +0: cmp r0, r11 @ finished? + bgt iflush + mcr p15, 0, r0, c7, c14, 1 @ Dcache clean/invalidate by VA + add r0, r0, r1 + b 0b iflush: mcr p15, 0, r10, c7, c10, 4 @ DSB mcr p15, 0, r10, c7, c5, 0 @ invalidate I+BTB -- cgit v1.2.3