diff options
-rw-r--r-- | arch/x86/Kconfig | 1 | ||||
-rw-r--r-- | arch/x86/include/asm/pgtable-3level.h | 46 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/intel.c | 8 | ||||
-rw-r--r-- | arch/x86/mm/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/mm/pat.c | 8 | ||||
-rw-r--r-- | arch/x86/mm/pat_internal.h | 20 | ||||
-rw-r--r-- | arch/x86/mm/pat_interval.c | 185 | ||||
-rw-r--r-- | arch/x86/mm/pat_rbtree.c | 268 |
8 files changed, 229 insertions, 309 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2e6a53881b7b..b89eb1f0c0d2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1462,6 +1462,7 @@ config X86_PAE config X86_5LEVEL bool "Enable 5-level page tables support" + default y select DYNAMIC_MEMORY_LAYOUT select SPARSEMEM_VMEMMAP depends on X86_64 diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index e3633795fb22..5afb5e0fe903 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -36,39 +36,41 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte) #define pmd_read_atomic pmd_read_atomic /* - * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with - * a "*pmdp" dereference done by gcc. Problem is, in certain places - * where pte_offset_map_lock is called, concurrent page faults are + * pte_offset_map_lock() on 32-bit PAE kernels was reading the pmd_t with + * a "*pmdp" dereference done by GCC. Problem is, in certain places + * where pte_offset_map_lock() is called, concurrent page faults are * allowed, if the mmap_sem is hold for reading. An example is mincore * vs page faults vs MADV_DONTNEED. On the page fault side - * pmd_populate rightfully does a set_64bit, but if we're reading the + * pmd_populate() rightfully does a set_64bit(), but if we're reading the * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen - * because gcc will not read the 64bit of the pmd atomically. To fix - * this all places running pmd_offset_map_lock() while holding the + * because GCC will not read the 64-bit value of the pmd atomically. + * + * To fix this all places running pte_offset_map_lock() while holding the * mmap_sem in read mode, shall read the pmdp pointer using this - * function to know if the pmd is null nor not, and in turn to know if - * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd + * function to know if the pmd is null or not, and in turn to know if + * they can run pte_offset_map_lock() or pmd_trans_huge() or other pmd * operations. * - * Without THP if the mmap_sem is hold for reading, the pmd can only - * transition from null to not null while pmd_read_atomic runs. So + * Without THP if the mmap_sem is held for reading, the pmd can only + * transition from null to not null while pmd_read_atomic() runs. So * we can always return atomic pmd values with this function. * - * With THP if the mmap_sem is hold for reading, the pmd can become + * With THP if the mmap_sem is held for reading, the pmd can become * trans_huge or none or point to a pte (and in turn become "stable") - * at any time under pmd_read_atomic. We could read it really - * atomically here with a atomic64_read for the THP enabled case (and + * at any time under pmd_read_atomic(). We could read it truly + * atomically here with an atomic64_read() for the THP enabled case (and * it would be a whole lot simpler), but to avoid using cmpxchg8b we * only return an atomic pmdval if the low part of the pmdval is later - * found stable (i.e. pointing to a pte). And we're returning a none - * pmdval if the low part of the pmd is none. In some cases the high - * and low part of the pmdval returned may not be consistent if THP is - * enabled (the low part may point to previously mapped hugepage, - * while the high part may point to a more recently mapped hugepage), - * but pmd_none_or_trans_huge_or_clear_bad() only needs the low part - * of the pmd to be read atomically to decide if the pmd is unstable - * or not, with the only exception of when the low part of the pmd is - * zero in which case we return a none pmd. + * found to be stable (i.e. pointing to a pte). We are also returning a + * 'none' (zero) pmdval if the low part of the pmd is zero. + * + * In some cases the high and low part of the pmdval returned may not be + * consistent if THP is enabled (the low part may point to previously + * mapped hugepage, while the high part may point to a more recently + * mapped hugepage), but pmd_none_or_trans_huge_or_clear_bad() only + * needs the low part of the pmd to be read atomically to decide if the + * pmd is unstable or not, with the only exception when the low part + * of the pmd is zero, in which case we return a 'none' pmd. */ static inline pmd_t pmd_read_atomic(pmd_t *pmdp) { diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 11d5c5950e2d..4a900804a023 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -819,7 +819,7 @@ static const struct _tlb_table intel_tlb_table[] = { { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" }, { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" }, { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" }, - { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" }, + { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" }, { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, @@ -847,7 +847,7 @@ static const struct _tlb_table intel_tlb_table[] = { { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" }, { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" }, - { 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" }, + { 0xc2, TLB_DATA_2M_4M, 16, " TLB_DATA 2 MByte/4MByte pages, 4-way associative" }, { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" }, { 0x00, 0, 0 } }; @@ -859,8 +859,8 @@ static void intel_tlb_lookup(const unsigned char desc) return; /* look up this descriptor in the table */ - for (k = 0; intel_tlb_table[k].descriptor != desc && \ - intel_tlb_table[k].descriptor != 0; k++) + for (k = 0; intel_tlb_table[k].descriptor != desc && + intel_tlb_table[k].descriptor != 0; k++) ; if (intel_tlb_table[k].tlb_type == 0) diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index bbc68a54795e..3b89c201ac26 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -23,7 +23,7 @@ CFLAGS_mem_encrypt_identity.o := $(nostackp) CFLAGS_fault.o := -I $(srctree)/$(src)/../include/asm/trace -obj-$(CONFIG_X86_PAT) += pat_rbtree.o +obj-$(CONFIG_X86_PAT) += pat_interval.o obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index d9fbd4f69920..2d758e19ef22 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -603,7 +603,7 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, spin_lock(&memtype_lock); - err = rbt_memtype_check_insert(new, new_type); + err = memtype_check_insert(new, new_type); if (err) { pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n", start, end - 1, @@ -650,7 +650,7 @@ int free_memtype(u64 start, u64 end) } spin_lock(&memtype_lock); - entry = rbt_memtype_erase(start, end); + entry = memtype_erase(start, end); spin_unlock(&memtype_lock); if (IS_ERR(entry)) { @@ -693,7 +693,7 @@ static enum page_cache_mode lookup_memtype(u64 paddr) spin_lock(&memtype_lock); - entry = rbt_memtype_lookup(paddr); + entry = memtype_lookup(paddr); if (entry != NULL) rettype = entry->type; else @@ -1109,7 +1109,7 @@ static struct memtype *memtype_get_idx(loff_t pos) return NULL; spin_lock(&memtype_lock); - ret = rbt_memtype_copy_nth_element(print_entry, pos); + ret = memtype_copy_nth_element(print_entry, pos); spin_unlock(&memtype_lock); if (!ret) { diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h index eeb5caeb089b..79a06684349e 100644 --- a/arch/x86/mm/pat_internal.h +++ b/arch/x86/mm/pat_internal.h @@ -29,20 +29,20 @@ static inline char *cattr_name(enum page_cache_mode pcm) } #ifdef CONFIG_X86_PAT -extern int rbt_memtype_check_insert(struct memtype *new, - enum page_cache_mode *new_type); -extern struct memtype *rbt_memtype_erase(u64 start, u64 end); -extern struct memtype *rbt_memtype_lookup(u64 addr); -extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos); +extern int memtype_check_insert(struct memtype *new, + enum page_cache_mode *new_type); +extern struct memtype *memtype_erase(u64 start, u64 end); +extern struct memtype *memtype_lookup(u64 addr); +extern int memtype_copy_nth_element(struct memtype *out, loff_t pos); #else -static inline int rbt_memtype_check_insert(struct memtype *new, - enum page_cache_mode *new_type) +static inline int memtype_check_insert(struct memtype *new, + enum page_cache_mode *new_type) { return 0; } -static inline struct memtype *rbt_memtype_erase(u64 start, u64 end) +static inline struct memtype *memtype_erase(u64 start, u64 end) { return NULL; } -static inline struct memtype *rbt_memtype_lookup(u64 addr) +static inline struct memtype *memtype_lookup(u64 addr) { return NULL; } -static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos) +static inline int memtype_copy_nth_element(struct memtype *out, loff_t pos) { return 0; } #endif diff --git a/arch/x86/mm/pat_interval.c b/arch/x86/mm/pat_interval.c new file mode 100644 index 000000000000..47a1bf30748f --- /dev/null +++ b/arch/x86/mm/pat_interval.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Handle caching attributes in page tables (PAT) + * + * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * Suresh B Siddha <suresh.b.siddha@intel.com> + * + * Interval tree used to store the PAT memory type reservations. + */ + +#include <linux/seq_file.h> +#include <linux/debugfs.h> +#include <linux/kernel.h> +#include <linux/interval_tree_generic.h> +#include <linux/sched.h> +#include <linux/gfp.h> + +#include <asm/pgtable.h> +#include <asm/pat.h> + +#include "pat_internal.h" + +/* + * The memtype tree keeps track of memory type for specific + * physical memory areas. Without proper tracking, conflicting memory + * types in different mappings can cause CPU cache corruption. + * + * The tree is an interval tree (augmented rbtree) with tree ordered + * on starting address. Tree can contain multiple entries for + * different regions which overlap. All the aliases have the same + * cache attributes of course. + * + * memtype_lock protects the rbtree. + */ +static inline u64 memtype_interval_start(struct memtype *memtype) +{ + return memtype->start; +} + +static inline u64 memtype_interval_end(struct memtype *memtype) +{ + return memtype->end - 1; +} +INTERVAL_TREE_DEFINE(struct memtype, rb, u64, subtree_max_end, + memtype_interval_start, memtype_interval_end, + static, memtype_interval) + +static struct rb_root_cached memtype_rbroot = RB_ROOT_CACHED; + +enum { + MEMTYPE_EXACT_MATCH = 0, + MEMTYPE_END_MATCH = 1 +}; + +static struct memtype *memtype_match(u64 start, u64 end, int match_type) +{ + struct memtype *match; + + match = memtype_interval_iter_first(&memtype_rbroot, start, end); + while (match != NULL && match->start < end) { + if ((match_type == MEMTYPE_EXACT_MATCH) && + (match->start == start) && (match->end == end)) + return match; + + if ((match_type == MEMTYPE_END_MATCH) && + (match->start < start) && (match->end == end)) + return match; + + match = memtype_interval_iter_next(match, start, end); + } + + return NULL; /* Returns NULL if there is no match */ +} + +static int memtype_check_conflict(u64 start, u64 end, + enum page_cache_mode reqtype, + enum page_cache_mode *newtype) +{ + struct memtype *match; + enum page_cache_mode found_type = reqtype; + + match = memtype_interval_iter_first(&memtype_rbroot, start, end); + if (match == NULL) + goto success; + + if (match->type != found_type && newtype == NULL) + goto failure; + + dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end); + found_type = match->type; + + match = memtype_interval_iter_next(match, start, end); + while (match) { + if (match->type != found_type) + goto failure; + + match = memtype_interval_iter_next(match, start, end); + } +success: + if (newtype) + *newtype = found_type; + + return 0; + +failure: + pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, start, end, + cattr_name(found_type), cattr_name(match->type)); + return -EBUSY; +} + +int memtype_check_insert(struct memtype *new, + enum page_cache_mode *ret_type) +{ + int err = 0; + + err = memtype_check_conflict(new->start, new->end, new->type, ret_type); + if (err) + return err; + + if (ret_type) + new->type = *ret_type; + + memtype_interval_insert(new, &memtype_rbroot); + return 0; +} + +struct memtype *memtype_erase(u64 start, u64 end) +{ + struct memtype *data; + + /* + * Since the memtype_rbroot tree allows overlapping ranges, + * memtype_erase() checks with EXACT_MATCH first, i.e. free + * a whole node for the munmap case. If no such entry is found, + * it then checks with END_MATCH, i.e. shrink the size of a node + * from the end for the mremap case. + */ + data = memtype_match(start, end, MEMTYPE_EXACT_MATCH); + if (!data) { + data = memtype_match(start, end, MEMTYPE_END_MATCH); + if (!data) + return ERR_PTR(-EINVAL); + } + + if (data->start == start) { + /* munmap: erase this node */ + memtype_interval_remove(data, &memtype_rbroot); + } else { + /* mremap: update the end value of this node */ + memtype_interval_remove(data, &memtype_rbroot); + data->end = start; + memtype_interval_insert(data, &memtype_rbroot); + return NULL; + } + + return data; +} + +struct memtype *memtype_lookup(u64 addr) +{ + return memtype_interval_iter_first(&memtype_rbroot, addr, + addr + PAGE_SIZE); +} + +#if defined(CONFIG_DEBUG_FS) +int memtype_copy_nth_element(struct memtype *out, loff_t pos) +{ + struct memtype *match; + int i = 1; + + match = memtype_interval_iter_first(&memtype_rbroot, 0, ULONG_MAX); + while (match && pos != i) { + match = memtype_interval_iter_next(match, 0, ULONG_MAX); + i++; + } + + if (match) { /* pos == i */ + *out = *match; + return 0; + } else { + return 1; + } +} +#endif diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c deleted file mode 100644 index 65ebe4b88f7c..000000000000 --- a/arch/x86/mm/pat_rbtree.c +++ /dev/null @@ -1,268 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Handle caching attributes in page tables (PAT) - * - * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> - * Suresh B Siddha <suresh.b.siddha@intel.com> - * - * Interval tree (augmented rbtree) used to store the PAT memory type - * reservations. - */ - -#include <linux/seq_file.h> -#include <linux/debugfs.h> -#include <linux/kernel.h> -#include <linux/rbtree_augmented.h> -#include <linux/sched.h> -#include <linux/gfp.h> - -#include <asm/pgtable.h> -#include <asm/pat.h> - -#include "pat_internal.h" - -/* - * The memtype tree keeps track of memory type for specific - * physical memory areas. Without proper tracking, conflicting memory - * types in different mappings can cause CPU cache corruption. - * - * The tree is an interval tree (augmented rbtree) with tree ordered - * on starting address. Tree can contain multiple entries for - * different regions which overlap. All the aliases have the same - * cache attributes of course. - * - * memtype_lock protects the rbtree. - */ - -static struct rb_root memtype_rbroot = RB_ROOT; - -static int is_node_overlap(struct memtype *node, u64 start, u64 end) -{ - if (node->start >= end || node->end <= start) - return 0; - - return 1; -} - -static u64 get_subtree_max_end(struct rb_node *node) -{ - u64 ret = 0; - if (node) { - struct memtype *data = rb_entry(node, struct memtype, rb); - ret = data->subtree_max_end; - } - return ret; -} - -#define NODE_END(node) ((node)->end) - -RB_DECLARE_CALLBACKS_MAX(static, memtype_rb_augment_cb, - struct memtype, rb, u64, subtree_max_end, NODE_END) - -/* Find the first (lowest start addr) overlapping range from rb tree */ -static struct memtype *memtype_rb_lowest_match(struct rb_root *root, - u64 start, u64 end) -{ - struct rb_node *node = root->rb_node; - struct memtype *last_lower = NULL; - - while (node) { - struct memtype *data = rb_entry(node, struct memtype, rb); - - if (get_subtree_max_end(node->rb_left) > start) { - /* Lowest overlap if any must be on left side */ - node = node->rb_left; - } else if (is_node_overlap(data, start, end)) { - last_lower = data; - break; - } else if (start >= data->start) { - /* Lowest overlap if any must be on right side */ - node = node->rb_right; - } else { - break; - } - } - return last_lower; /* Returns NULL if there is no overlap */ -} - -enum { - MEMTYPE_EXACT_MATCH = 0, - MEMTYPE_END_MATCH = 1 -}; - -static struct memtype *memtype_rb_match(struct rb_root *root, - u64 start, u64 end, int match_type) -{ - struct memtype *match; - - match = memtype_rb_lowest_match(root, start, end); - while (match != NULL && match->start < end) { - struct rb_node *node; - - if ((match_type == MEMTYPE_EXACT_MATCH) && - (match->start == start) && (match->end == end)) - return match; - - if ((match_type == MEMTYPE_END_MATCH) && - (match->start < start) && (match->end == end)) - return match; - - node = rb_next(&match->rb); - if (node) - match = rb_entry(node, struct memtype, rb); - else - match = NULL; - } - - return NULL; /* Returns NULL if there is no match */ -} - -static int memtype_rb_check_conflict(struct rb_root *root, - u64 start, u64 end, - enum page_cache_mode reqtype, - enum page_cache_mode *newtype) -{ - struct rb_node *node; - struct memtype *match; - enum page_cache_mode found_type = reqtype; - - match = memtype_rb_lowest_match(&memtype_rbroot, start, end); - if (match == NULL) - goto success; - - if (match->type != found_type && newtype == NULL) - goto failure; - - dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end); - found_type = match->type; - - node = rb_next(&match->rb); - while (node) { - match = rb_entry(node, struct memtype, rb); - - if (match->start >= end) /* Checked all possible matches */ - goto success; - - if (is_node_overlap(match, start, end) && - match->type != found_type) { - goto failure; - } - - node = rb_next(&match->rb); - } -success: - if (newtype) - *newtype = found_type; - - return 0; - -failure: - pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n", - current->comm, current->pid, start, end, - cattr_name(found_type), cattr_name(match->type)); - return -EBUSY; -} - -static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata) -{ - struct rb_node **node = &(root->rb_node); - struct rb_node *parent = NULL; - - while (*node) { - struct memtype *data = rb_entry(*node, struct memtype, rb); - - parent = *node; - if (data->subtree_max_end < newdata->end) - data->subtree_max_end = newdata->end; - if (newdata->start <= data->start) - node = &((*node)->rb_left); - else if (newdata->start > data->start) - node = &((*node)->rb_right); - } - - newdata->subtree_max_end = newdata->end; - rb_link_node(&newdata->rb, parent, node); - rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb); -} - -int rbt_memtype_check_insert(struct memtype *new, - enum page_cache_mode *ret_type) -{ - int err = 0; - - err = memtype_rb_check_conflict(&memtype_rbroot, new->start, new->end, - new->type, ret_type); - - if (!err) { - if (ret_type) - new->type = *ret_type; - - new->subtree_max_end = new->end; - memtype_rb_insert(&memtype_rbroot, new); - } - return err; -} - -struct memtype *rbt_memtype_erase(u64 start, u64 end) -{ - struct memtype *data; - - /* - * Since the memtype_rbroot tree allows overlapping ranges, - * rbt_memtype_erase() checks with EXACT_MATCH first, i.e. free - * a whole node for the munmap case. If no such entry is found, - * it then checks with END_MATCH, i.e. shrink the size of a node - * from the end for the mremap case. - */ - data = memtype_rb_match(&memtype_rbroot, start, end, - MEMTYPE_EXACT_MATCH); - if (!data) { - data = memtype_rb_match(&memtype_rbroot, start, end, - MEMTYPE_END_MATCH); - if (!data) - return ERR_PTR(-EINVAL); - } - - if (data->start == start) { - /* munmap: erase this node */ - rb_erase_augmented(&data->rb, &memtype_rbroot, - &memtype_rb_augment_cb); - } else { - /* mremap: update the end value of this node */ - rb_erase_augmented(&data->rb, &memtype_rbroot, - &memtype_rb_augment_cb); - data->end = start; - data->subtree_max_end = data->end; - memtype_rb_insert(&memtype_rbroot, data); - return NULL; - } - - return data; -} - -struct memtype *rbt_memtype_lookup(u64 addr) -{ - return memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE); -} - -#if defined(CONFIG_DEBUG_FS) -int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos) -{ - struct rb_node *node; - int i = 1; - - node = rb_first(&memtype_rbroot); - while (node && pos != i) { - node = rb_next(node); - i++; - } - - if (node) { /* pos == i */ - struct memtype *this = rb_entry(node, struct memtype, rb); - *out = *this; - return 0; - } else { - return 1; - } -} -#endif |