summaryrefslogtreecommitdiffstats
path: root/arch/powerpc/include
diff options
context:
space:
mode:
authorNicholas Piggin <npiggin@gmail.com>2018-09-15 01:30:56 +1000
committerMichael Ellerman <mpe@ellerman.id.au>2018-10-14 18:04:09 +1100
commit5434ae74629af58ad0fc27143a9ea435f7734410 (patch)
tree5af6105fc36007c4228cfeeda75405eddf19a8c1 /arch/powerpc/include
parent425d33146260a4a2e8a1ba64003d6c8ff3bdfcc4 (diff)
downloadlinux-5434ae74629af58ad0fc27143a9ea435f7734410.tar.bz2
powerpc/64s/hash: Add a SLB preload cache
When switching processes, currently all user SLBEs are cleared, and a few (exec_base, pc, and stack) are preloaded. In trivial testing with small apps, this tends to miss the heap and low 256MB segments, and it will also miss commonly accessed segments on large memory workloads. Add a simple round-robin preload cache that just inserts the last SLB miss into the head of the cache and preloads those at context switch time. Every 256 context switches, the oldest entry is removed from the cache to shrink the cache and require fewer slbmte if they are unused. Much more could go into this, including into the SLB entry reclaim side to track some LRU information etc, which would require a study of large memory workloads. But this is a simple thing we can do now that is an obvious win for common workloads. With the full series, process switching speed on the context_switch benchmark on POWER9/hash (with kernel speculation security masures disabled) increases from 140K/s to 178K/s (27%). POWER8 does not change much (within 1%), it's unclear why it does not see a big gain like POWER9. Booting to busybox init with 256MB segments has SLB misses go down from 945 to 69, and with 1T segments 900 to 21. These could almost all be eliminated by preloading a bit more carefully with ELF binary loading. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Diffstat (limited to 'arch/powerpc/include')
-rw-r--r--arch/powerpc/include/asm/processor.h1
-rw-r--r--arch/powerpc/include/asm/thread_info.h5
2 files changed, 6 insertions, 0 deletions
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index 3fefb8a65b17..7d04d60a39c9 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -273,6 +273,7 @@ struct thread_struct {
#endif /* CONFIG_HAVE_HW_BREAKPOINT */
struct arch_hw_breakpoint hw_brk; /* info on the hardware breakpoint */
unsigned long trap_nr; /* last trap # on this thread */
+ u8 load_slb; /* Ages out SLB preload cache entries */
u8 load_fp;
#ifdef CONFIG_ALTIVEC
u8 load_vec;
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index 916a3d67b592..544cac0474cb 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -29,6 +29,7 @@
#include <asm/page.h>
#include <asm/accounting.h>
+#define SLB_PRELOAD_NR 16U
/*
* low level task data.
*/
@@ -44,6 +45,10 @@ struct thread_info {
#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC32)
struct cpu_accounting_data accounting;
#endif
+ unsigned char slb_preload_nr;
+ unsigned char slb_preload_tail;
+ u32 slb_preload_esid[SLB_PRELOAD_NR];
+
/* low level flags - has atomic operations done on it */
unsigned long flags ____cacheline_aligned_in_smp;
};