From 61365e132ef987f7719af5d2e434db4465957637 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 7 Dec 2009 12:51:42 +0100 Subject: [S390] Improve address space check. A data access in access-register mode is always a user mode access, so the code that inspects the access registers can be removed. The second change is to use a different test to check for a no-execute fault. The third change is to pass the translation exception identification as a parameter; in theory, the trans_exc_code in the lowcore could have been overwritten by the time do_no_context calls check_space. Signed-off-by: Martin Schwidefsky --- arch/s390/mm/fault.c | 99 ++++++++++++++++++++++++---------------------------- 1 file changed, 45 insertions(+), 54 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 6d507462967a..3df5b918cfe2 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -100,39 +100,28 @@ void bust_spinlocks(int yes) /* * Returns the address space associated with the fault. - * Returns 0 for kernel space, 1 for user space and - * 2 for code execution in user space with noexec=on. + * Returns 0 for kernel space and 1 for user space. */ -static inline int check_space(struct task_struct *tsk) +static inline int user_space_fault(unsigned long trans_exc_code) { /* - * The lowest two bits of S390_lowcore.trans_exc_code - * indicate which paging table was used. + * The lowest two bits of the translation exception + * identification indicate which paging table was used. */ - int desc = S390_lowcore.trans_exc_code & 3; - - if (desc == 3) /* Home Segment Table Descriptor */ - return switch_amode == 0; - if (desc == 2) /* Secondary Segment Table Descriptor */ - return tsk->thread.mm_segment.ar4; -#ifdef CONFIG_S390_SWITCH_AMODE - if (unlikely(desc == 1)) { /* STD determined via access register */ - /* %a0 always indicates primary space. */ - if (S390_lowcore.exc_access_id != 0) { - save_access_regs(tsk->thread.acrs); - /* - * An alet of 0 indicates primary space. - * An alet of 1 indicates secondary space. - * Any other alet values generate an - * alen-translation exception. - */ - if (tsk->thread.acrs[S390_lowcore.exc_access_id]) - return tsk->thread.mm_segment.ar4; - } - } -#endif - /* Primary Segment Table Descriptor */ - return switch_amode << s390_noexec; + trans_exc_code &= 3; + if (trans_exc_code == 2) + /* Access via secondary space, set_fs setting decides */ + return current->thread.mm_segment.ar4; + if (!switch_amode) + /* User space if the access has been done via home space. */ + return trans_exc_code == 3; + /* + * If the user space is not the home space the kernel runs in home + * space. Access via secondary space has already been covered, + * access via primary space or access register is from user space + * and access via home space is from the kernel. + */ + return trans_exc_code != 3; } /* @@ -162,9 +151,10 @@ static void do_sigsegv(struct pt_regs *regs, unsigned long error_code, } static void do_no_context(struct pt_regs *regs, unsigned long error_code, - unsigned long address) + unsigned long trans_exc_code) { const struct exception_table_entry *fixup; + unsigned long address; /* Are we prepared to handle this kernel fault? */ fixup = search_exception_tables(regs->psw.addr & __FIXUP_MASK); if (fixup) { regs->psw.addr = fixup->fixup | PSW_ADDR_AMODE; return; @@ -177,7 +167,8 @@ static void do_no_context(struct pt_regs *regs, unsigned long error_code, * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. 
*/ - if (check_space(current) == 0) + address = trans_exc_code & __FAIL_ADDR_MASK; + if (user_space_fault(trans_exc_code) == 0) printk(KERN_ALERT "Unable to handle kernel pointer dereference" " at virtual kernel address %p\n", (void *)address); else @@ -188,7 +179,8 @@ static void do_no_context(struct pt_regs *regs, unsigned long error_code, do_exit(SIGKILL); } -static void do_low_address(struct pt_regs *regs, unsigned long error_code) +static void do_low_address(struct pt_regs *regs, unsigned long error_code, + unsigned long trans_exc_code) { /* Low-address protection hit in kernel mode means NULL pointer write access in kernel mode. */ @@ -198,11 +190,11 @@ static void do_low_address(struct pt_regs *regs, unsigned long error_code) do_exit(SIGKILL); } - do_no_context(regs, error_code, 0); + do_no_context(regs, error_code, trans_exc_code); } static void do_sigbus(struct pt_regs *regs, unsigned long error_code, - unsigned long address) + unsigned long trans_exc_code) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; @@ -212,13 +204,13 @@ static void do_sigbus(struct pt_regs *regs, unsigned long error_code, * Send a sigbus, regardless of whether we were in kernel * or user mode. */ - tsk->thread.prot_addr = address; + tsk->thread.prot_addr = trans_exc_code & __FAIL_ADDR_MASK; tsk->thread.trap_no = error_code; force_sig(SIGBUS, tsk); /* Kernel mode? Handle exceptions or die */ if (!(regs->psw.mask & PSW_MASK_PSTATE)) - do_no_context(regs, error_code, address); + do_no_context(regs, error_code, trans_exc_code); } #ifdef CONFIG_S390_EXEC_PROTECT @@ -272,13 +264,13 @@ static int signal_return(struct mm_struct *mm, struct pt_regs *regs, * 3b Region third trans. -> Not present (nullification) */ static inline void -do_exception(struct pt_regs *regs, unsigned long error_code, int write) +do_exception(struct pt_regs *regs, unsigned long error_code, int write, + unsigned long trans_exc_code) { struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct *vma; unsigned long address; - int space; int si_code; int fault; @@ -288,18 +280,15 @@ do_exception(struct pt_regs *regs, unsigned long error_code, int write) tsk = current; mm = tsk->mm; - /* get the failing address and the affected space */ - address = S390_lowcore.trans_exc_code & __FAIL_ADDR_MASK; - space = check_space(tsk); - /* * Verify that the fault happened in user space, that * we are not in an interrupt and that there is a * user context. */ - if (unlikely(space == 0 || in_atomic() || !mm)) + if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) goto no_context; + address = trans_exc_code & __FAIL_ADDR_MASK; /* * When we get here, the fault happened in the current * task's user address space, so we can switch on the @@ -315,7 +304,8 @@ do_exception(struct pt_regs *regs, unsigned long error_code, int write) goto bad_area; #ifdef CONFIG_S390_EXEC_PROTECT - if (unlikely((space == 2) && !(vma->vm_flags & VM_EXEC))) + if (unlikely((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY && + (trans_exc_code & 3) == 0 && !(vma->vm_flags & VM_EXEC))) if (!signal_return(mm, regs, address, error_code)) /* * signal_return() has done an up_read(&mm->mmap_sem) @@ -397,12 +387,14 @@ bad_area: } no_context: - do_no_context(regs, error_code, address); + do_no_context(regs, error_code, trans_exc_code); } void __kprobes do_protection_exception(struct pt_regs *regs, long error_code) { + unsigned long trans_exc_code = S390_lowcore.trans_exc_code; + /* Protection exception is supressing, decrement psw address. 
*/ regs->psw.addr -= (error_code >> 16); /* @@ -410,31 +402,30 @@ void __kprobes do_protection_exception(struct pt_regs *regs, * as a special case because the translation exception code * field is not guaranteed to contain valid data in this case. */ - if (unlikely(!(S390_lowcore.trans_exc_code & 4))) { - do_low_address(regs, error_code); + if (unlikely(!(trans_exc_code & 4))) { + do_low_address(regs, error_code, trans_exc_code); return; } - do_exception(regs, 4, 1); + do_exception(regs, 4, 1, trans_exc_code); } void __kprobes do_dat_exception(struct pt_regs *regs, long error_code) { - do_exception(regs, error_code & 0xff, 0); + do_exception(regs, error_code & 0xff, 0, S390_lowcore.trans_exc_code); } #ifdef CONFIG_64BIT void __kprobes do_asce_exception(struct pt_regs *regs, unsigned long error_code) { + unsigned long trans_exc_code = S390_lowcore.trans_exc_code; struct mm_struct *mm; struct vm_area_struct *vma; unsigned long address; - int space; mm = current->mm; - address = S390_lowcore.trans_exc_code & __FAIL_ADDR_MASK; - space = check_space(current); + address = trans_exc_code & __FAIL_ADDR_MASK; - if (unlikely(space == 0 || in_atomic() || !mm)) + if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) goto no_context; local_irq_enable(); @@ -457,7 +448,7 @@ void __kprobes do_asce_exception(struct pt_regs *regs, unsigned long error_code) } no_context: - do_no_context(regs, error_code, address); + do_no_context(regs, error_code, trans_exc_code); } #endif -- cgit v1.2.3 From b11b53342773361f3353b285eb6a3fd6074e7997 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 7 Dec 2009 12:51:43 +0100 Subject: [S390] Improve address space mode selection. Introduce user_mode to replace the two variables switch_amode and s390_noexec. There are three valid combinations of the old values: 1) switch_amode == 0 && s390_noexec == 0 2) switch_amode == 1 && s390_noexec == 0 3) switch_amode == 1 && s390_noexec == 1 They get replaced by 1) user_mode == HOME_SPACE_MODE 2) user_mode == PRIMARY_SPACE_MODE 3) user_mode == SECONDARY_SPACE_MODE The new kernel parameter user_mode=[primary,secondary,home] lets you choose the address space mode the user space processes should use. In addition the CONFIG_S390_SWITCH_AMODE config option is removed. Signed-off-by: Martin Schwidefsky --- arch/s390/Kconfig | 15 --------------- arch/s390/defconfig | 1 - arch/s390/include/asm/mmu_context.h | 4 ++-- arch/s390/include/asm/pgalloc.h | 3 ++- arch/s390/include/asm/setup.h | 17 ++++++----------- arch/s390/kernel/setup.c | 36 ++++++++++++++++++++---------------- arch/s390/kernel/vdso.c | 9 +++++---- arch/s390/kvm/Kconfig | 1 - arch/s390/lib/uaccess_mvcos.c | 4 ---- arch/s390/mm/fault.c | 4 ++-- arch/s390/mm/pgtable.c | 2 +- 11 files changed, 38 insertions(+), 58 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 16c673096a22..c80235206c01 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -220,23 +220,8 @@ config AUDIT_ARCH bool default y -config S390_SWITCH_AMODE - bool "Switch kernel/user addressing modes" - help - This option allows to switch the addressing modes of kernel and user - space. The kernel parameter switch_amode=on will enable this feature, - default is disabled. Enabling this (via kernel parameter) on machines - earlier than IBM System z9-109 EC/BC will reduce system performance. - - Note that this option will also be selected by selecting the execute - protection option below. 
Enabling the execute protection via the - noexec kernel parameter will also switch the addressing modes, - independent of the switch_amode kernel parameter. - - config S390_EXEC_PROTECT bool "Data execute protection" - select S390_SWITCH_AMODE help This option allows to enable a buffer overflow protection for user space programs and it also selects the addressing mode option above. diff --git a/arch/s390/defconfig b/arch/s390/defconfig index ab4464486b7a..f4e53c6708dc 100644 --- a/arch/s390/defconfig +++ b/arch/s390/defconfig @@ -185,7 +185,6 @@ CONFIG_HOTPLUG_CPU=y CONFIG_COMPAT=y CONFIG_SYSVIPC_COMPAT=y CONFIG_AUDIT_ARCH=y -CONFIG_S390_SWITCH_AMODE=y CONFIG_S390_EXEC_PROTECT=y # diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index fc7edd6f41b6..976e273988c2 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -36,7 +36,7 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.has_pgste = 1; mm->context.alloc_pgste = 1; } else { - mm->context.noexec = s390_noexec; + mm->context.noexec = (user_mode == SECONDARY_SPACE_MODE); mm->context.has_pgste = 0; mm->context.alloc_pgste = 0; } @@ -58,7 +58,7 @@ static inline void update_mm(struct mm_struct *mm, struct task_struct *tsk) pgd_t *pgd = mm->pgd; S390_lowcore.user_asce = mm->context.asce_bits | __pa(pgd); - if (switch_amode) { + if (user_mode != HOME_SPACE_MODE) { /* Load primary space page table origin. */ pgd = mm->context.noexec ? get_shadow_table(pgd) : pgd; S390_lowcore.user_exec_asce = mm->context.asce_bits | __pa(pgd); diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index ddad5903341c..68940d0bad91 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -143,7 +143,8 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) spin_lock_init(&mm->context.list_lock); INIT_LIST_HEAD(&mm->context.crst_list); INIT_LIST_HEAD(&mm->context.pgtable_list); - return (pgd_t *) crst_table_alloc(mm, s390_noexec); + return (pgd_t *) + crst_table_alloc(mm, user_mode == SECONDARY_SPACE_MODE); } #define pgd_free(mm, pgd) crst_table_free(mm, (unsigned long *) pgd) diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index e37478e87286..52a779c337e8 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -49,17 +49,12 @@ extern unsigned long memory_end; void detect_memory_layout(struct mem_chunk chunk[]); -#ifdef CONFIG_S390_SWITCH_AMODE -extern unsigned int switch_amode; -#else -#define switch_amode (0) -#endif - -#ifdef CONFIG_S390_EXEC_PROTECT -extern unsigned int s390_noexec; -#else -#define s390_noexec (0) -#endif +#define PRIMARY_SPACE_MODE 0 +#define ACCESS_REGISTER_MODE 1 +#define SECONDARY_SPACE_MODE 2 +#define HOME_SPACE_MODE 3 + +extern unsigned int user_mode; /* * Machine features detected in head.S diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 061479ff029f..0663287fa1b3 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -305,9 +305,8 @@ static int __init early_parse_mem(char *p) } early_param("mem", early_parse_mem); -#ifdef CONFIG_S390_SWITCH_AMODE -unsigned int switch_amode = 0; -EXPORT_SYMBOL_GPL(switch_amode); +unsigned int user_mode = HOME_SPACE_MODE; +EXPORT_SYMBOL_GPL(user_mode); static int set_amode_and_uaccess(unsigned long user_amode, unsigned long user32_amode) @@ -340,23 +339,29 @@ static int set_amode_and_uaccess(unsigned long user_amode, */ static int __init 
early_parse_switch_amode(char *p) { - switch_amode = 1; + if (user_mode != SECONDARY_SPACE_MODE) + user_mode = PRIMARY_SPACE_MODE; return 0; } early_param("switch_amode", early_parse_switch_amode); -#else /* CONFIG_S390_SWITCH_AMODE */ -static inline int set_amode_and_uaccess(unsigned long user_amode, - unsigned long user32_amode) +static int __init early_parse_user_mode(char *p) { + if (p && strcmp(p, "primary") == 0) + user_mode = PRIMARY_SPACE_MODE; +#ifdef CONFIG_S390_EXEC_PROTECT + else if (p && strcmp(p, "secondary") == 0) + user_mode = SECONDARY_SPACE_MODE; +#endif + else if (!p || strcmp(p, "home") == 0) + user_mode = HOME_SPACE_MODE; + else + return 1; return 0; } -#endif /* CONFIG_S390_SWITCH_AMODE */ +early_param("user_mode", early_parse_user_mode); #ifdef CONFIG_S390_EXEC_PROTECT -unsigned int s390_noexec = 0; -EXPORT_SYMBOL_GPL(s390_noexec); - /* * Enable execute protection? */ @@ -364,8 +369,7 @@ static int __init early_parse_noexec(char *p) { if (!strncmp(p, "off", 3)) return 0; - switch_amode = 1; - s390_noexec = 1; + user_mode = SECONDARY_SPACE_MODE; return 0; } early_param("noexec", early_parse_noexec); @@ -373,7 +377,7 @@ early_param("noexec", early_parse_noexec); static void setup_addressing_mode(void) { - if (s390_noexec) { + if (user_mode == SECONDARY_SPACE_MODE) { if (set_amode_and_uaccess(PSW_ASC_SECONDARY, PSW32_ASC_SECONDARY)) pr_info("Execute protection active, " @@ -381,7 +385,7 @@ static void setup_addressing_mode(void) else pr_info("Execute protection active, " "mvcos not available\n"); - } else if (switch_amode) { + } else if (user_mode == PRIMARY_SPACE_MODE) { if (set_amode_and_uaccess(PSW_ASC_PRIMARY, PSW32_ASC_PRIMARY)) pr_info("Address spaces switched, " "mvcos available\n"); @@ -411,7 +415,7 @@ setup_lowcore(void) lc->restart_psw.mask = PSW_BASE_BITS | PSW_DEFAULT_KEY; lc->restart_psw.addr = PSW_ADDR_AMODE | (unsigned long) restart_int_handler; - if (switch_amode) + if (user_mode != HOME_SPACE_MODE) lc->restart_psw.mask |= PSW_ASC_HOME; lc->external_new_psw.mask = psw_kernel_bits; lc->external_new_psw.addr = diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index adfb32aa6d59..5f99e66c51c3 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -86,7 +86,8 @@ static void vdso_init_data(struct vdso_data *vd) unsigned int facility_list; facility_list = stfl(); - vd->ectg_available = switch_amode && (facility_list & 1); + vd->ectg_available = + user_mode != HOME_SPACE_MODE && (facility_list & 1); } #ifdef CONFIG_64BIT @@ -114,7 +115,7 @@ int vdso_alloc_per_cpu(int cpu, struct _lowcore *lowcore) lowcore->vdso_per_cpu_data = __LC_PASTE; - if (!switch_amode || !vdso_enabled) + if (user_mode == HOME_SPACE_MODE || !vdso_enabled) return 0; segment_table = __get_free_pages(GFP_KERNEL, SEGMENT_ORDER); @@ -160,7 +161,7 @@ void vdso_free_per_cpu(int cpu, struct _lowcore *lowcore) unsigned long segment_table, page_table, page_frame; u32 *psal, *aste; - if (!switch_amode || !vdso_enabled) + if (user_mode == HOME_SPACE_MODE || !vdso_enabled) return; psal = (u32 *)(addr_t) lowcore->paste[4]; @@ -184,7 +185,7 @@ static void __vdso_init_cr5(void *dummy) static void vdso_init_cr5(void) { - if (switch_amode && vdso_enabled) + if (user_mode != HOME_SPACE_MODE && vdso_enabled) on_each_cpu(__vdso_init_cr5, NULL, 1); } #endif /* CONFIG_64BIT */ diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index bf164fc21864..6ee55ae84ce2 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -20,7 +20,6 @@ config KVM depends on HAVE_KVM && 
EXPERIMENTAL select PREEMPT_NOTIFIERS select ANON_INODES - select S390_SWITCH_AMODE ---help--- Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. This should work diff --git a/arch/s390/lib/uaccess_mvcos.c b/arch/s390/lib/uaccess_mvcos.c index 58da3f461214..60455f104ea3 100644 --- a/arch/s390/lib/uaccess_mvcos.c +++ b/arch/s390/lib/uaccess_mvcos.c @@ -162,7 +162,6 @@ static size_t clear_user_mvcos(size_t size, void __user *to) return size; } -#ifdef CONFIG_S390_SWITCH_AMODE static size_t strnlen_user_mvcos(size_t count, const char __user *src) { char buf[256]; @@ -200,7 +199,6 @@ static size_t strncpy_from_user_mvcos(size_t count, const char __user *src, } while ((len_str == len) && (done < count)); return done; } -#endif /* CONFIG_S390_SWITCH_AMODE */ struct uaccess_ops uaccess_mvcos = { .copy_from_user = copy_from_user_mvcos_check, @@ -215,7 +213,6 @@ struct uaccess_ops uaccess_mvcos = { .futex_atomic_cmpxchg = futex_atomic_cmpxchg_std, }; -#ifdef CONFIG_S390_SWITCH_AMODE struct uaccess_ops uaccess_mvcos_switch = { .copy_from_user = copy_from_user_mvcos, .copy_from_user_small = copy_from_user_mvcos, @@ -228,4 +225,3 @@ struct uaccess_ops uaccess_mvcos_switch = { .futex_atomic_op = futex_atomic_op_pt, .futex_atomic_cmpxchg = futex_atomic_cmpxchg_pt, }; -#endif diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 3df5b918cfe2..77108e34fc1a 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -112,7 +112,7 @@ static inline int user_space_fault(unsigned long trans_exc_code) if (trans_exc_code == 2) /* Access via secondary space, set_fs setting decides */ return current->thread.mm_segment.ar4; - if (!switch_amode) + if (user_mode == HOME_SPACE_MODE) /* User space if the access has been done via home space. */ return trans_exc_code == 3; /* @@ -168,7 +168,7 @@ static void do_no_context(struct pt_regs *regs, unsigned long error_code, * terminate things with extreme prejudice. */ address = trans_exc_code & __FAIL_ADDR_MASK; - if (user_space_fault(trans_exc_code) == 0) + if (!user_space_fault(trans_exc_code)) printk(KERN_ALERT "Unable to handle kernel pointer dereference" " at virtual kernel address %p\n", (void *)address); else diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 2757c5616a07..ad621e06ada3 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -269,7 +269,7 @@ int s390_enable_sie(void) struct mm_struct *mm, *old_mm; /* Do we have switched amode? If no, we cannot do sie */ - if (!switch_amode) + if (user_mode == HOME_SPACE_MODE) return -EINVAL; /* Do we have pgstes? if yes, we are done */ -- cgit v1.2.3 From 7ecb344ae80bc03397ded3b004e06ecfe32becf9 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 7 Dec 2009 12:51:44 +0100 Subject: [S390] Improve notify_page_fault implementation. notify_page_fault does a preempt_disable/preempt_enable pair for each fault generated by a kernel access to user space. If kprobes is not active, that is unnecessary, since interrupts have not been re-enabled yet. To play it safe, repeat the kprobe_running check after preempt_disable().
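The shape of that optimization is easier to see in isolation. Below is a minimal, stand-alone C sketch of the double-check pattern described above; the preempt primitives and the kprobe predicate are mock stand-ins for kernel internals (assumptions for illustration, not the real API):

#include <stdbool.h>
#include <stdio.h>

/* Mock stand-ins for kernel primitives (illustration only). */
static int preempt_count;
static void preempt_disable(void) { preempt_count++; }
static void preempt_enable(void) { preempt_count--; }

static volatile bool kprobe_active;	/* pretend per-CPU kprobe state */
static bool kprobe_running(void) { return kprobe_active; }
static bool kprobe_fault_handler(void) { return true; }

/*
 * Cheap, possibly racy check first so the common no-kprobe case pays
 * nothing; repeat the check after preempt_disable(), where the result
 * is stable, before acting on it.
 */
static int notify_page_fault(void)
{
	int ret = 0;

	if (!kprobe_running())
		return 0;
	preempt_disable();
	if (kprobe_running() && kprobe_fault_handler())
		ret = 1;
	preempt_enable();
	return ret;
}

int main(void)
{
	printf("no kprobe: handled=%d\n", notify_page_fault());
	kprobe_active = true;
	printf("kprobe active: handled=%d\n", notify_page_fault());
	return 0;
}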
Signed-off-by: Martin Schwidefsky --- arch/s390/mm/fault.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 77108e34fc1a..fd72c269cdb4 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -52,11 +52,11 @@ extern int sysctl_userprocess_debug; #endif -#ifdef CONFIG_KPROBES -static inline int notify_page_fault(struct pt_regs *regs, long err) +static inline int notify_page_fault(struct pt_regs *regs) { int ret = 0; +#ifdef CONFIG_KPROBES /* kprobe_running() needs smp_processor_id() */ if (!user_mode(regs)) { preempt_disable(); @@ -64,15 +64,9 @@ static inline int notify_page_fault(struct pt_regs *regs, long err) ret = 1; preempt_enable(); } - +#endif return ret; } -#else -static inline int notify_page_fault(struct pt_regs *regs, long err) -{ - return 0; -} -#endif /* @@ -274,7 +268,7 @@ do_exception(struct pt_regs *regs, unsigned long error_code, int write, int si_code; int fault; - if (notify_page_fault(regs, error_code)) + if (notify_page_fault(regs)) return; tsk = current; -- cgit v1.2.3 From 50d7280d430484a890ddcadc7f738b5b6dd28bf1 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 7 Dec 2009 12:51:45 +0100 Subject: [S390] fault handler performance optimization. Slim down the do_exception function to handle only the fast path of a fault and move the exceptional cases into a new function. That slightly increases the performance of the fault handling. Build fix for !CONFIG_COMPAT by Kamalesh Babulal Signed-off-by: Martin Schwidefsky --- arch/s390/mm/fault.c | 258 +++++++++++++++++++++++++-------------------------- 1 file changed, 129 insertions(+), 129 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index fd72c269cdb4..0dcfcfb5b5be 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -34,16 +34,15 @@ #include #include #include +#include #include "../kernel/entry.h" #ifndef CONFIG_64BIT #define __FAIL_ADDR_MASK 0x7ffff000 -#define __FIXUP_MASK 0x7fffffff #define __SUBCODE_MASK 0x0200 #define __PF_RES_FIELD 0ULL #else /* CONFIG_64BIT */ #define __FAIL_ADDR_MASK -4096L -#define __FIXUP_MASK ~0L #define __SUBCODE_MASK 0x0600 #define __PF_RES_FIELD 0x8000000000000000ULL #endif /* CONFIG_64BIT */ @@ -52,6 +51,10 @@ extern int sysctl_userprocess_debug; #endif +#define VM_FAULT_BADCONTEXT 0x010000 +#define VM_FAULT_BADMAP 0x020000 +#define VM_FAULT_BADACCESS 0x040000 + static inline int notify_page_fault(struct pt_regs *regs) { int ret = 0; @@ -122,18 +125,22 @@ static inline int user_space_fault(unsigned long trans_exc_code) * Send SIGSEGV to task. This is an external routine * to keep the stack usage of do_page_fault small. 
*/ -static void do_sigsegv(struct pt_regs *regs, unsigned long error_code, - int si_code, unsigned long address) +static noinline void do_sigsegv(struct pt_regs *regs, long int_code, + int si_code, unsigned long trans_exc_code) { struct siginfo si; + unsigned long address; + address = trans_exc_code & __FAIL_ADDR_MASK; + current->thread.prot_addr = address; + current->thread.trap_no = int_code; #if defined(CONFIG_SYSCTL) || defined(CONFIG_PROCESS_DEBUG) #if defined(CONFIG_SYSCTL) if (sysctl_userprocess_debug) #endif { printk("User process fault: interruption code 0x%lX\n", - error_code); + int_code); printk("failing address: %lX\n", address); show_regs(regs); } @@ -144,14 +151,14 @@ static void do_sigsegv(struct pt_regs *regs, unsigned long error_code, force_sig_info(SIGSEGV, &si, current); } -static void do_no_context(struct pt_regs *regs, unsigned long error_code, - unsigned long trans_exc_code) +static noinline void do_no_context(struct pt_regs *regs, long int_code, + unsigned long trans_exc_code) { const struct exception_table_entry *fixup; unsigned long address; /* Are we prepared to handle this kernel fault? */ - fixup = search_exception_tables(regs->psw.addr & __FIXUP_MASK); + fixup = search_exception_tables(regs->psw.addr & PSW_ADDR_INSN); if (fixup) { regs->psw.addr = fixup->fixup | PSW_ADDR_AMODE; return; @@ -169,107 +176,127 @@ static void do_no_context(struct pt_regs *regs, unsigned long error_code, printk(KERN_ALERT "Unable to handle kernel paging request" " at virtual user address %p\n", (void *)address); - die("Oops", regs, error_code); + die("Oops", regs, int_code); do_exit(SIGKILL); } -static void do_low_address(struct pt_regs *regs, unsigned long error_code, - unsigned long trans_exc_code) +static noinline void do_low_address(struct pt_regs *regs, long int_code, + unsigned long trans_exc_code) { /* Low-address protection hit in kernel mode means NULL pointer write access in kernel mode. */ if (regs->psw.mask & PSW_MASK_PSTATE) { /* Low-address protection hit in user mode 'cannot happen'. */ - die ("Low-address protection", regs, error_code); + die ("Low-address protection", regs, int_code); do_exit(SIGKILL); } - do_no_context(regs, error_code, trans_exc_code); + do_no_context(regs, int_code, trans_exc_code); } -static void do_sigbus(struct pt_regs *regs, unsigned long error_code, - unsigned long trans_exc_code) +static noinline void do_sigbus(struct pt_regs *regs, long int_code, + unsigned long trans_exc_code) { struct task_struct *tsk = current; - struct mm_struct *mm = tsk->mm; - up_read(&mm->mmap_sem); /* * Send a sigbus, regardless of whether we were in kernel * or user mode. */ tsk->thread.prot_addr = trans_exc_code & __FAIL_ADDR_MASK; - tsk->thread.trap_no = error_code; + tsk->thread.trap_no = int_code; force_sig(SIGBUS, tsk); - - /* Kernel mode? 
Handle exceptions or die */ - if (!(regs->psw.mask & PSW_MASK_PSTATE)) - do_no_context(regs, error_code, trans_exc_code); } #ifdef CONFIG_S390_EXEC_PROTECT -static int signal_return(struct mm_struct *mm, struct pt_regs *regs, - unsigned long address, unsigned long error_code) +static noinline int signal_return(struct pt_regs *regs, long int_code, + unsigned long trans_exc_code) { u16 instruction; int rc; -#ifdef CONFIG_COMPAT - int compat; -#endif - pagefault_disable(); rc = __get_user(instruction, (u16 __user *) regs->psw.addr); - pagefault_enable(); - if (rc) - return -EFAULT; - up_read(&mm->mmap_sem); - clear_tsk_thread_flag(current, TIF_SINGLE_STEP); -#ifdef CONFIG_COMPAT - compat = is_compat_task(); - if (compat && instruction == 0x0a77) - sys32_sigreturn(); - else if (compat && instruction == 0x0aad) - sys32_rt_sigreturn(); - else -#endif - if (instruction == 0x0a77) - sys_sigreturn(); - else if (instruction == 0x0aad) - sys_rt_sigreturn(); - else { - current->thread.prot_addr = address; - current->thread.trap_no = error_code; - do_sigsegv(regs, error_code, SEGV_MAPERR, address); - } + if (!rc && instruction == 0x0a77) { + clear_tsk_thread_flag(current, TIF_SINGLE_STEP); + if (is_compat_task()) + sys32_sigreturn(); + else + sys_sigreturn(); + } else if (!rc && instruction == 0x0aad) { + clear_tsk_thread_flag(current, TIF_SINGLE_STEP); + if (is_compat_task()) + sys32_rt_sigreturn(); + else + sys_rt_sigreturn(); + } else + do_sigsegv(regs, int_code, SEGV_MAPERR, trans_exc_code); return 0; } #endif /* CONFIG_S390_EXEC_PROTECT */ +static noinline void do_fault_error(struct pt_regs *regs, long int_code, + unsigned long trans_exc_code, int fault) +{ + int si_code; + + switch (fault) { + case VM_FAULT_BADACCESS: +#ifdef CONFIG_S390_EXEC_PROTECT + if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY && + (trans_exc_code & 3) == 0) { + signal_return(regs, int_code, trans_exc_code); + break; + } +#endif /* CONFIG_S390_EXEC_PROTECT */ + case VM_FAULT_BADMAP: + /* Bad memory access. Check if it is kernel or user space. */ + if (regs->psw.mask & PSW_MASK_PSTATE) { + /* User mode accesses just cause a SIGSEGV */ + si_code = (fault == VM_FAULT_BADMAP) ? + SEGV_MAPERR : SEGV_ACCERR; + do_sigsegv(regs, int_code, si_code, trans_exc_code); + return; + } + case VM_FAULT_BADCONTEXT: + do_no_context(regs, int_code, trans_exc_code); + break; + default: /* fault & VM_FAULT_ERROR */ + if (fault & VM_FAULT_OOM) + pagefault_out_of_memory(); + else if (fault & VM_FAULT_SIGBUS) { + do_sigbus(regs, int_code, trans_exc_code); + /* Kernel mode? Handle exceptions or die */ + if (!(regs->psw.mask & PSW_MASK_PSTATE)) + do_no_context(regs, int_code, trans_exc_code); + } else + BUG(); + break; + } +} + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. * - * error_code: + * interruption code (int_code): * 04 Protection -> Write-Protection (suprression) * 10 Segment translation -> Not present (nullification) * 11 Page translation -> Not present (nullification) * 3b Region third trans. 
-> Not present (nullification) */ -static inline void -do_exception(struct pt_regs *regs, unsigned long error_code, int write, - unsigned long trans_exc_code) +static inline int do_exception(struct pt_regs *regs, int write, + unsigned long trans_exc_code) { struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct *vma; unsigned long address; - int si_code; int fault; if (notify_page_fault(regs)) - return; + return 0; tsk = current; mm = tsk->mm; @@ -279,8 +306,9 @@ do_exception(struct pt_regs *regs, unsigned long error_code, int write, * we are not in an interrupt and that there is a * user context. */ + fault = VM_FAULT_BADCONTEXT; if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) - goto no_context; + goto out; address = trans_exc_code & __FAIL_ADDR_MASK; /* @@ -292,41 +320,35 @@ do_exception(struct pt_regs *regs, unsigned long error_code, int write, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); down_read(&mm->mmap_sem); - si_code = SEGV_MAPERR; + fault = VM_FAULT_BADMAP; vma = find_vma(mm, address); if (!vma) - goto bad_area; + goto out_up; + if (unlikely(vma->vm_start > address)) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out_up; + if (expand_stack(vma, address)) + goto out_up; + } + + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ + fault = VM_FAULT_BADACCESS; #ifdef CONFIG_S390_EXEC_PROTECT if (unlikely((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY && (trans_exc_code & 3) == 0 && !(vma->vm_flags & VM_EXEC))) - if (!signal_return(mm, regs, address, error_code)) - /* - * signal_return() has done an up_read(&mm->mmap_sem) - * if it returns 0. - */ - return; + goto out_up; #endif - - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (expand_stack(vma, address)) - goto bad_area; -/* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ -good_area: - si_code = SEGV_ACCERR; if (!write) { /* page not present, check vm flags */ if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto bad_area; + goto out_up; } else { if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; + goto out_up; } if (is_vm_hugetlb_page(vma)) @@ -337,17 +359,9 @@ good_area: * the fault. */ fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); - if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) { - up_read(&mm->mmap_sem); - pagefault_out_of_memory(); - return; - } else if (fault & VM_FAULT_SIGBUS) { - do_sigbus(regs, error_code, address); - return; - } - BUG(); - } + if (unlikely(fault & VM_FAULT_ERROR)) + goto out_up; + if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, @@ -357,67 +371,55 @@ good_area: perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } - up_read(&mm->mmap_sem); /* * The instruction that caused the program check will * be repeated. Don't signal single step via SIGTRAP. */ clear_tsk_thread_flag(tsk, TIF_SINGLE_STEP); - return; - -/* - * Something tried to access memory that isn't in our memory map.. - * Fix it, but check if it's kernel or user first.. 
- */ -bad_area: + fault = 0; +out_up: up_read(&mm->mmap_sem); - - /* User mode accesses just cause a SIGSEGV */ - if (regs->psw.mask & PSW_MASK_PSTATE) { - tsk->thread.prot_addr = address; - tsk->thread.trap_no = error_code; - do_sigsegv(regs, error_code, si_code, address); - return; - } - -no_context: - do_no_context(regs, error_code, trans_exc_code); +out: + return fault; } -void __kprobes do_protection_exception(struct pt_regs *regs, - long error_code) +void __kprobes do_protection_exception(struct pt_regs *regs, long int_code) { unsigned long trans_exc_code = S390_lowcore.trans_exc_code; + int fault; /* Protection exception is supressing, decrement psw address. */ - regs->psw.addr -= (error_code >> 16); + regs->psw.addr -= (int_code >> 16); /* * Check for low-address protection. This needs to be treated * as a special case because the translation exception code * field is not guaranteed to contain valid data in this case. */ if (unlikely(!(trans_exc_code & 4))) { - do_low_address(regs, error_code, trans_exc_code); + do_low_address(regs, int_code, trans_exc_code); return; } - do_exception(regs, 4, 1, trans_exc_code); + fault = do_exception(regs, 1, trans_exc_code); + if (unlikely(fault)) + do_fault_error(regs, 4, trans_exc_code, fault); } -void __kprobes do_dat_exception(struct pt_regs *regs, long error_code) +void __kprobes do_dat_exception(struct pt_regs *regs, long int_code) { - do_exception(regs, error_code & 0xff, 0, S390_lowcore.trans_exc_code); + unsigned long trans_exc_code = S390_lowcore.trans_exc_code; + int fault; + + fault = do_exception(regs, 0, trans_exc_code); + if (unlikely(fault)) + do_fault_error(regs, int_code & 255, trans_exc_code, fault); } #ifdef CONFIG_64BIT -void __kprobes do_asce_exception(struct pt_regs *regs, unsigned long error_code) +void __kprobes do_asce_exception(struct pt_regs *regs, long int_code) { unsigned long trans_exc_code = S390_lowcore.trans_exc_code; - struct mm_struct *mm; + struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long address; - - mm = current->mm; - address = trans_exc_code & __FAIL_ADDR_MASK; if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) goto no_context; @@ -425,7 +427,7 @@ void __kprobes do_asce_exception(struct pt_regs *regs, unsigned long error_code) local_irq_enable(); down_read(&mm->mmap_sem); - vma = find_vma(mm, address); + vma = find_vma(mm, trans_exc_code & __FAIL_ADDR_MASK); up_read(&mm->mmap_sem); if (vma) { @@ -435,14 +437,12 @@ void __kprobes do_asce_exception(struct pt_regs *regs, unsigned long error_code) /* User mode accesses just cause a SIGSEGV */ if (regs->psw.mask & PSW_MASK_PSTATE) { - current->thread.prot_addr = address; - current->thread.trap_no = error_code; - do_sigsegv(regs, error_code, SEGV_MAPERR, address); + do_sigsegv(regs, int_code, SEGV_MAPERR, trans_exc_code); return; } no_context: - do_no_context(regs, error_code, trans_exc_code); + do_no_context(regs, int_code, trans_exc_code); } #endif @@ -507,7 +507,7 @@ void pfault_fini(void) : : "a" (&refbk), "m" (refbk) : "cc"); } -static void pfault_interrupt(__u16 error_code) +static void pfault_interrupt(__u16 int_code) { struct task_struct *tsk; __u16 subcode; -- cgit v1.2.3 From 1ab947de293f43812276b60cf9fa21127e7a5bb2 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 7 Dec 2009 12:51:46 +0100 Subject: [S390] fault handler access flags check. Simplify the check of the vma->flags in do_exception for the different fault types. 
Signed-off-by: Martin Schwidefsky --- arch/s390/mm/fault.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 0dcfcfb5b5be..5a9e9a77dc16 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -286,7 +286,7 @@ static noinline void do_fault_error(struct pt_regs *regs, long int_code, * 11 Page translation -> Not present (nullification) * 3b Region third trans. -> Not present (nullification) */ -static inline int do_exception(struct pt_regs *regs, int write, +static inline int do_exception(struct pt_regs *regs, int access, unsigned long trans_exc_code) { struct task_struct *tsk; @@ -337,19 +337,8 @@ static inline int do_exception(struct pt_regs *regs, int write, * we can handle it.. */ fault = VM_FAULT_BADACCESS; -#ifdef CONFIG_S390_EXEC_PROTECT - if (unlikely((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY && - (trans_exc_code & 3) == 0 && !(vma->vm_flags & VM_EXEC))) + if (unlikely(!(vma->vm_flags & access))) goto out_up; -#endif - if (!write) { - /* page not present, check vm flags */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto out_up; - } else { - if (!(vma->vm_flags & VM_WRITE)) - goto out_up; - } if (is_vm_hugetlb_page(vma)) address &= HPAGE_MASK; @@ -358,7 +347,8 @@ static inline int do_exception(struct pt_regs *regs, int write, * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); + fault = handle_mm_fault(mm, vma, address, + (access == VM_WRITE) ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) goto out_up; @@ -399,7 +389,7 @@ void __kprobes do_protection_exception(struct pt_regs *regs, long int_code) do_low_address(regs, int_code, trans_exc_code); return; } - fault = do_exception(regs, 1, trans_exc_code); + fault = do_exception(regs, VM_WRITE, trans_exc_code); if (unlikely(fault)) do_fault_error(regs, 4, trans_exc_code, fault); } @@ -407,9 +397,15 @@ void __kprobes do_protection_exception(struct pt_regs *regs, long int_code) void __kprobes do_dat_exception(struct pt_regs *regs, long int_code) { unsigned long trans_exc_code = S390_lowcore.trans_exc_code; - int fault; + int access, fault; - fault = do_exception(regs, 0, trans_exc_code); + access = VM_READ | VM_EXEC | VM_WRITE; +#ifdef CONFIG_S390_EXEC_PROTECT + if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY && + (trans_exc_code & 3) == 0) + access = VM_EXEC; +#endif + fault = do_exception(regs, access, trans_exc_code); if (unlikely(fault)) do_fault_error(regs, int_code & 255, trans_exc_code, fault); } -- cgit v1.2.3 From 6c1e3e79430615d0472dbf9f8fed89c571e66423 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 7 Dec 2009 12:51:47 +0100 Subject: [S390] Use do_exception() in pagetable walk usercopy functions. The pagetable walk usercopy functions used a modified copy of the do_exception() function for fault handling. This led to inconsistencies with recent changes to do_exception(), e.g. performance counters. This patch changes the pagetable walk usercopy code to call do_exception() directly, eliminating the redundancy. A new parameter is added to do_exception() to specify the fault address. 
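One detail worth illustrating is the error convention this patch introduces in follow_table(): instead of returning NULL, the walk returns the translation-exception code as a tiny fake pointer, which works because no valid page table entry can sit below address 0x1000. A stand-alone sketch of the idea follows; the mock walk and its names are assumptions for illustration, while the code values 0x3a, 0x10 and 0x11 mirror the patch:

#include <stdint.h>
#include <stdio.h>

/* Exception codes reused as in-band pointer returns, as in the patch:
 * 0x3a for a missing region table, 0x10 for a missing segment table,
 * 0x11 for a page that is not present. */
#define ERR_REGION  0x3a
#define ERR_SEGMENT 0x10
#define ERR_PAGE    0x11

static int pte_is_error(const void *pte)
{
	return (uintptr_t) pte < 0x1000;	/* low values carry a code */
}

/* Mock table walk: fail at the requested level, else return a pte. */
static uint64_t *follow_table_mock(int fail_level)
{
	static uint64_t pte = 0x12345000;

	switch (fail_level) {
	case 2: return (uint64_t *) (uintptr_t) ERR_REGION;
	case 1: return (uint64_t *) (uintptr_t) ERR_SEGMENT;
	case 0: return (uint64_t *) (uintptr_t) ERR_PAGE;
	default: return &pte;	/* walk succeeded */
	}
}

int main(void)
{
	uint64_t *pte = follow_table_mock(1);

	if (pte_is_error(pte))
		printf("walk failed, exception code 0x%02x\n",
		       (unsigned int) (uintptr_t) pte);
	pte = follow_table_mock(-1);
	if (!pte_is_error(pte))
		printf("walk ok, pte=0x%llx\n", (unsigned long long) *pte);
	return 0;
}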
Signed-off-by: Gerald Schaefer Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/uaccess.h | 2 + arch/s390/lib/uaccess_pt.c | 147 ++++++++++++++-------------------------- arch/s390/mm/fault.c | 23 +++++++ 3 files changed, 76 insertions(+), 96 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 8377e91533d2..cbf0a8745bf4 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -93,6 +93,8 @@ extern struct uaccess_ops uaccess_mvcos; extern struct uaccess_ops uaccess_mvcos_switch; extern struct uaccess_ops uaccess_pt; +extern int __handle_fault(unsigned long, unsigned long, int); + static inline int __put_user_fn(size_t size, void __user *ptr, void *x) { size = uaccess.copy_to_user_small(size, ptr, x); diff --git a/arch/s390/lib/uaccess_pt.c b/arch/s390/lib/uaccess_pt.c index cb5d59eab0ee..404f2de296dc 100644 --- a/arch/s390/lib/uaccess_pt.c +++ b/arch/s390/lib/uaccess_pt.c @@ -23,86 +23,21 @@ static inline pte_t *follow_table(struct mm_struct *mm, unsigned long addr) pgd = pgd_offset(mm, addr); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - return NULL; + return (pte_t *) 0x3a; pud = pud_offset(pgd, addr); if (pud_none(*pud) || unlikely(pud_bad(*pud))) - return NULL; + return (pte_t *) 0x3b; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - return NULL; + return (pte_t *) 0x10; return pte_offset_map(pmd, addr); } -static int __handle_fault(struct mm_struct *mm, unsigned long address, - int write_access) -{ - struct vm_area_struct *vma; - int ret = -EFAULT; - int fault; - - if (in_atomic()) - return ret; - down_read(&mm->mmap_sem); - vma = find_vma(mm, address); - if (unlikely(!vma)) - goto out; - if (unlikely(vma->vm_start > address)) { - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto out; - if (expand_stack(vma, address)) - goto out; - } - - if (!write_access) { - /* page not present, check vm flags */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto out; - } else { - if (!(vma->vm_flags & VM_WRITE)) - goto out; - } - -survive: - fault = handle_mm_fault(mm, vma, address, write_access ? 
FAULT_FLAG_WRITE : 0); - if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) - goto out_of_memory; - else if (fault & VM_FAULT_SIGBUS) - goto out_sigbus; - BUG(); - } - if (fault & VM_FAULT_MAJOR) - current->maj_flt++; - else - current->min_flt++; - ret = 0; -out: - up_read(&mm->mmap_sem); - return ret; - -out_of_memory: - up_read(&mm->mmap_sem); - if (is_global_init(current)) { - yield(); - down_read(&mm->mmap_sem); - goto survive; - } - printk("VM: killing process %s\n", current->comm); - return ret; - -out_sigbus: - up_read(&mm->mmap_sem); - current->thread.prot_addr = address; - current->thread.trap_no = 0x11; - force_sig(SIGBUS, current); - return ret; -} - -static size_t __user_copy_pt(unsigned long uaddr, void *kptr, - size_t n, int write_user) +static __always_inline size_t __user_copy_pt(unsigned long uaddr, void *kptr, + size_t n, int write_user) { struct mm_struct *mm = current->mm; unsigned long offset, pfn, done, size; @@ -114,12 +49,17 @@ retry: spin_lock(&mm->page_table_lock); do { pte = follow_table(mm, uaddr); - if (!pte || !pte_present(*pte) || - (write_user && !pte_write(*pte))) + if ((unsigned long) pte < 0x1000) goto fault; + if (!pte_present(*pte)) { + pte = (pte_t *) 0x11; + goto fault; + } else if (write_user && !pte_write(*pte)) { + pte = (pte_t *) 0x04; + goto fault; + } pfn = pte_pfn(*pte); - offset = uaddr & (PAGE_SIZE - 1); size = min(n - done, PAGE_SIZE - offset); if (write_user) { @@ -137,7 +77,7 @@ retry: return n - done; fault: spin_unlock(&mm->page_table_lock); - if (__handle_fault(mm, uaddr, write_user)) + if (__handle_fault(uaddr, (unsigned long) pte, write_user)) return n - done; goto retry; } @@ -146,30 +86,31 @@ fault: * Do DAT for user address by page table walk, return kernel address. * This function needs to be called with current->mm->page_table_lock held. 
*/ -static unsigned long __dat_user_addr(unsigned long uaddr) +static __always_inline unsigned long __dat_user_addr(unsigned long uaddr) { struct mm_struct *mm = current->mm; - unsigned long pfn, ret; + unsigned long pfn; pte_t *pte; int rc; - ret = 0; retry: pte = follow_table(mm, uaddr); - if (!pte || !pte_present(*pte)) + if ((unsigned long) pte < 0x1000) goto fault; + if (!pte_present(*pte)) { + pte = (pte_t *) 0x11; + goto fault; + } pfn = pte_pfn(*pte); - ret = (pfn << PAGE_SHIFT) + (uaddr & (PAGE_SIZE - 1)); -out: - return ret; + return (pfn << PAGE_SHIFT) + (uaddr & (PAGE_SIZE - 1)); fault: spin_unlock(&mm->page_table_lock); - rc = __handle_fault(mm, uaddr, 0); + rc = __handle_fault(uaddr, (unsigned long) pte, 0); spin_lock(&mm->page_table_lock); - if (rc) - goto out; - goto retry; + if (!rc) + goto retry; + return 0; } size_t copy_from_user_pt(size_t n, const void __user *from, void *to) @@ -234,8 +175,12 @@ retry: spin_lock(&mm->page_table_lock); do { pte = follow_table(mm, uaddr); - if (!pte || !pte_present(*pte)) + if ((unsigned long) pte < 0x1000) + goto fault; + if (!pte_present(*pte)) { + pte = (pte_t *) 0x11; goto fault; + } pfn = pte_pfn(*pte); offset = uaddr & (PAGE_SIZE-1); @@ -249,9 +194,8 @@ retry: return done + 1; fault: spin_unlock(&mm->page_table_lock); - if (__handle_fault(mm, uaddr, 0)) { + if (__handle_fault(uaddr, (unsigned long) pte, 0)) return 0; - } goto retry; } @@ -284,7 +228,7 @@ static size_t copy_in_user_pt(size_t n, void __user *to, { struct mm_struct *mm = current->mm; unsigned long offset_from, offset_to, offset_max, pfn_from, pfn_to, - uaddr, done, size; + uaddr, done, size, error_code; unsigned long uaddr_from = (unsigned long) from; unsigned long uaddr_to = (unsigned long) to; pte_t *pte_from, *pte_to; @@ -298,17 +242,28 @@ static size_t copy_in_user_pt(size_t n, void __user *to, retry: spin_lock(&mm->page_table_lock); do { + write_user = 0; + uaddr = uaddr_from; pte_from = follow_table(mm, uaddr_from); - if (!pte_from || !pte_present(*pte_from)) { - uaddr = uaddr_from; - write_user = 0; + error_code = (unsigned long) pte_from; + if (error_code < 0x1000) + goto fault; + if (!pte_present(*pte_from)) { + error_code = 0x11; goto fault; } + write_user = 1; + uaddr = uaddr_to; pte_to = follow_table(mm, uaddr_to); - if (!pte_to || !pte_present(*pte_to) || !pte_write(*pte_to)) { - uaddr = uaddr_to; - write_user = 1; + error_code = (unsigned long) pte_to; + if (error_code < 0x1000) + goto fault; + if (!pte_present(*pte_to)) { + error_code = 0x11; + goto fault; + } else if (!pte_write(*pte_to)) { + error_code = 0x04; goto fault; } @@ -329,7 +284,7 @@ retry: return n - done; fault: spin_unlock(&mm->page_table_lock); - if (__handle_fault(mm, uaddr, write_user)) + if (__handle_fault(uaddr, error_code, write_user)) return n - done; goto retry; } diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 5a9e9a77dc16..fc102e70d9c2 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -442,6 +442,29 @@ no_context: } #endif +int __handle_fault(unsigned long uaddr, unsigned long int_code, int write_user) +{ + struct pt_regs regs; + int access, fault; + + regs.psw.mask = psw_kernel_bits; + if (!irqs_disabled()) + regs.psw.mask |= PSW_MASK_IO | PSW_MASK_EXT; + regs.psw.addr = (unsigned long) __builtin_return_address(0); + regs.psw.addr |= PSW_ADDR_AMODE; + uaddr &= PAGE_MASK; + access = write_user ? 
VM_WRITE : VM_READ; + fault = do_exception(&regs, access, uaddr | 2); + if (unlikely(fault)) { + if (fault & VM_FAULT_OOM) { + pagefault_out_of_memory(); + fault = 0; + } else if (fault & VM_FAULT_SIGBUS) + do_sigbus(&regs, int_code, uaddr); + } + return fault ? -EFAULT : 0; +} + #ifdef CONFIG_PFAULT /* * 'pfault' pseudo page faults routines. -- cgit v1.2.3 From 52b169c864ea8622c4755172844fd24168c81195 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 7 Dec 2009 12:52:06 +0100 Subject: [S390] cmm: free pages on hibernate. The pages allocated by the cmm memory balloon should be freed before the hibernation image is created. Otherwise, the memory reserved by the balloon gets written to the swap device even though there is no content in these pages that needs to be preserved. Signed-off-by: Martin Schwidefsky --- arch/s390/mm/cmm.c | 61 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 11 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index b201135cc18c..5e5f3849660f 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -44,6 +45,7 @@ static volatile long cmm_pages_target; static volatile long cmm_timed_pages_target; static long cmm_timeout_pages; static long cmm_timeout_seconds; +static int cmm_suspended; static struct cmm_page_array *cmm_page_list; static struct cmm_page_array *cmm_timed_page_list; @@ -147,9 +149,9 @@ cmm_thread(void *dummy) while (1) { rc = wait_event_interruptible(cmm_thread_wait, - (cmm_pages != cmm_pages_target || - cmm_timed_pages != cmm_timed_pages_target || - kthread_should_stop())); + (!cmm_suspended && (cmm_pages != cmm_pages_target || + cmm_timed_pages != cmm_timed_pages_target)) || + kthread_should_stop()); if (kthread_should_stop() || rc == -ERESTARTSYS) { cmm_pages_target = cmm_pages; cmm_timed_pages_target = cmm_timed_pages; @@ -411,6 +413,38 @@ cmm_smsg_target(char *from, char *msg) static struct ctl_table_header *cmm_sysctl_header; +static int cmm_suspend(void) +{ + cmm_suspended = 1; + cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list); + cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list); + return 0; +} + +static int cmm_resume(void) +{ + cmm_suspended = 0; + cmm_kick_thread(); + return 0; +} + +static int cmm_power_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + switch (event) { + case PM_POST_HIBERNATION: + return cmm_resume(); + case PM_HIBERNATION_PREPARE: + return cmm_suspend(); + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block cmm_power_notifier = { + .notifier_call = cmm_power_event, +}; + static int cmm_init (void) { @@ -419,7 +453,7 @@ cmm_init (void) #ifdef CONFIG_CMM_PROC cmm_sysctl_header = register_sysctl_table(cmm_dir_table); if (!cmm_sysctl_header) - goto out; + goto out_sysctl; #endif #ifdef CONFIG_CMM_IUCV rc = smsg_register_callback(SMSG_PREFIX, cmm_smsg_target); @@ -429,17 +463,21 @@ cmm_init (void) rc = register_oom_notifier(&cmm_oom_nb); if (rc < 0) goto out_oom_notify; + rc = register_pm_notifier(&cmm_power_notifier); + if (rc) + goto out_pm; init_waitqueue_head(&cmm_thread_wait); init_timer(&cmm_timer); cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread"); rc = IS_ERR(cmm_thread_ptr) ? PTR_ERR(cmm_thread_ptr) : 0; - if (!rc) - goto out; - /* - * kthread_create failed. undo all the stuff from above again. 
- */ - unregister_oom_notifier(&cmm_oom_nb); + if (rc) + goto out_kthread; + return 0; +out_kthread: + unregister_pm_notifier(&cmm_power_notifier); +out_pm: + unregister_oom_notifier(&cmm_oom_nb); out_oom_notify: #ifdef CONFIG_CMM_IUCV smsg_unregister_callback(SMSG_PREFIX, cmm_smsg_target); @@ -447,8 +485,8 @@ out_smsg: #endif #ifdef CONFIG_CMM_PROC unregister_sysctl_table(cmm_sysctl_header); +out_sysctl: #endif -out: return rc; } @@ -456,6 +494,7 @@ static void cmm_exit(void) { kthread_stop(cmm_thread_ptr); + unregister_pm_notifier(&cmm_power_notifier); unregister_oom_notifier(&cmm_oom_nb); cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list); cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list); -- cgit v1.2.3 From 6a985c6194017de2c062916ad1cd00dee0302c40 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 7 Dec 2009 12:52:11 +0100 Subject: [S390] s390: use change recording override for kernel mapping We dont need the dirty bit if a write access is done via the kernel mapping. In that case SetPageDirty and friends are used anyway, no need to do that a second time. We can use the change-recording overide function for the kernel mapping, if available. Signed-off-by: Christian Borntraeger Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pgtable.h | 4 +++- arch/s390/mm/vmem.c | 11 ++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 60a7b1a1702f..e2fa79cf0614 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -169,12 +169,13 @@ extern unsigned long VMALLOC_START; * STL Segment-Table-Length: Segment-table length (STL+1*16 entries -> up to 2048) * * A 64 bit pagetable entry of S390 has following format: - * | PFRA |0IP0| OS | + * | PFRA |0IPC| OS | * 0000000000111111111122222222223333333333444444444455555555556666 * 0123456789012345678901234567890123456789012345678901234567890123 * * I Page-Invalid Bit: Page is not available for address-translation * P Page-Protection Bit: Store access not possible for page + * C Change-bit override: HW is not required to set change bit * * A 64 bit segmenttable entry of S390 has following format: * | P-table origin | TT @@ -218,6 +219,7 @@ extern unsigned long VMALLOC_START; */ /* Hardware bits in the page table entry */ +#define _PAGE_CO 0x100 /* HW Change-bit override */ #define _PAGE_RO 0x200 /* HW read-only bit */ #define _PAGE_INVALID 0x400 /* HW invalid bit */ diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 5f91a38d7592..300ab012b0fd 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -70,8 +70,12 @@ static pte_t __ref *vmem_pte_alloc(void) pte = alloc_bootmem(PTRS_PER_PTE * sizeof(pte_t)); if (!pte) return NULL; - clear_table((unsigned long *) pte, _PAGE_TYPE_EMPTY, - PTRS_PER_PTE * sizeof(pte_t)); + if (MACHINE_HAS_HPAGE) + clear_table((unsigned long *) pte, _PAGE_TYPE_EMPTY | _PAGE_CO, + PTRS_PER_PTE * sizeof(pte_t)); + else + clear_table((unsigned long *) pte, _PAGE_TYPE_EMPTY, + PTRS_PER_PTE * sizeof(pte_t)); return pte; } @@ -112,7 +116,8 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro) if (MACHINE_HAS_HPAGE && !(address & ~HPAGE_MASK) && (address + HPAGE_SIZE <= start + size) && (address >= HPAGE_SIZE)) { - pte_val(pte) |= _SEGMENT_ENTRY_LARGE; + pte_val(pte) |= _SEGMENT_ENTRY_LARGE | + _SEGMENT_ENTRY_CO; pmd_val(*pm_dir) = pte_val(pte); address += HPAGE_SIZE - PAGE_SIZE; continue; -- cgit v1.2.3