From a0458b07c17a10ea316e6ae65ab15b78bf5f44ee Mon Sep 17 00:00:00 2001 From: Giuseppe CAVALLARO Date: Tue, 7 Jul 2009 16:25:10 +0200 Subject: sh: add sleazy FPU optimization sh port of the sLeAZY-fpu feature currently implemented for some architectures such us i386. Right now the SH kernel has a 100% lazy fpu behaviour. This is of course great for applications that have very sporadic or no FPU use. However for very frequent FPU users... you take an extra trap every context switch. The patch below adds a simple heuristic to this code: after 5 consecutive context switches of FPU use, the lazy behavior is disabled and the context gets restored every context switch. After 256 switches, this is reset and the 100% lazy behavior is returned. Tests with LMbench showed no regression. I saw a little improvement due to the prefetching (~2%). The tests below also show that, with this sLeazy patch, indeed, the number of FPU exceptions is reduced. To test this. I hacked the lat_ctx LMBench to use the FPU a little more. sLeasy implementation =========================================== switch_to calls | 79326 sleasy calls | 42577 do_fpu_state_restore calls| 59232 restore_fpu calls | 59032 Exceptions: 0x800 (FPU disabled ): 16604 100% Leazy (default implementation) =========================================== switch_to calls | 79690 do_fpu_state_restore calls | 53299 restore_fpu calls | 53101 Exceptions: 0x800 (FPU disabled ): 53273 Signed-off-by: Giuseppe Cavallaro Signed-off-by: Stuart Menefy Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4/fpu.c | 16 ++++++++++++---- arch/sh/kernel/process_32.c | 16 ++++++++++++++++ 2 files changed, 28 insertions(+), 4 deletions(-) (limited to 'arch/sh/kernel') diff --git a/arch/sh/kernel/cpu/sh4/fpu.c b/arch/sh/kernel/cpu/sh4/fpu.c index e3ea5411da6d..d79226fa59d1 100644 --- a/arch/sh/kernel/cpu/sh4/fpu.c +++ b/arch/sh/kernel/cpu/sh4/fpu.c @@ -483,18 +483,18 @@ BUILD_TRAP_HANDLER(fpu_error) force_sig(SIGFPE, tsk); } -BUILD_TRAP_HANDLER(fpu_state_restore) +void fpu_state_restore(struct pt_regs *regs) { struct task_struct *tsk = current; - TRAP_HANDLER_DECL; grab_fpu(regs); - if (!user_mode(regs)) { + if (unlikely(!user_mode(regs))) { printk(KERN_ERR "BUG: FPU is used in kernel mode.\n"); + BUG(); return; } - if (used_math()) { + if (likely(used_math())) { /* Using the FPU again. */ restore_fpu(tsk); } else { @@ -503,4 +503,12 @@ BUILD_TRAP_HANDLER(fpu_state_restore) set_used_math(); } set_tsk_thread_flag(tsk, TIF_USEDFPU); + tsk->fpu_counter++; +} + +BUILD_TRAP_HANDLER(fpu_state_restore) +{ + TRAP_HANDLER_DECL; + + fpu_state_restore(regs); } diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index 0673c4746be3..aff5fe02e393 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -288,8 +288,14 @@ static void ubc_set_tracing(int asid, unsigned long pc) __notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev, struct task_struct *next) { + struct thread_struct *next_t = &next->thread; + #if defined(CONFIG_SH_FPU) unlazy_fpu(prev, task_pt_regs(prev)); + + /* we're going to use this soon, after a few expensive things */ + if (next->fpu_counter > 5) + prefetch(&next_t->fpu.hard); #endif #ifdef CONFIG_MMU @@ -321,6 +327,16 @@ __switch_to(struct task_struct *prev, struct task_struct *next) #endif } +#if defined(CONFIG_SH_FPU) + /* If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + if (next->fpu_counter > 5) { + fpu_state_restore(task_pt_regs(next)); + } +#endif + return prev; } -- cgit v1.2.3