diff options
-rw-r--r-- | Documentation/virt/kvm/api.rst | 5 | ||||
-rw-r--r-- | arch/arm64/Kconfig | 1 | ||||
-rw-r--r-- | arch/arm64/include/asm/mte.h | 65 | ||||
-rw-r--r-- | arch/arm64/include/asm/pgtable.h | 4 | ||||
-rw-r--r-- | arch/arm64/kernel/cpufeature.c | 4 | ||||
-rw-r--r-- | arch/arm64/kernel/elfcore.c | 2 | ||||
-rw-r--r-- | arch/arm64/kernel/hibernate.c | 2 | ||||
-rw-r--r-- | arch/arm64/kernel/mte.c | 21 | ||||
-rw-r--r-- | arch/arm64/kvm/guest.c | 18 | ||||
-rw-r--r-- | arch/arm64/kvm/mmu.c | 55 | ||||
-rw-r--r-- | arch/arm64/mm/copypage.c | 7 | ||||
-rw-r--r-- | arch/arm64/mm/fault.c | 4 | ||||
-rw-r--r-- | arch/arm64/mm/mteswap.c | 16 | ||||
-rw-r--r-- | fs/proc/page.c | 3 | ||||
-rw-r--r-- | include/linux/kernel-page-flags.h | 1 | ||||
-rw-r--r-- | include/linux/page-flags.h | 3 | ||||
-rw-r--r-- | include/trace/events/mmflags.h | 9 | ||||
-rw-r--r-- | mm/Kconfig | 8 | ||||
-rw-r--r-- | mm/huge_memory.c | 3 |
19 files changed, 152 insertions, 79 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 773e4b202f47..226b40baffb8 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7385,8 +7385,9 @@ hibernation of the host; however the VMM needs to manually save/restore the tags as appropriate if the VM is migrated. When this capability is enabled all memory in memslots must be mapped as -not-shareable (no MAP_SHARED), attempts to create a memslot with a -MAP_SHARED mmap will result in an -EINVAL return. +``MAP_ANONYMOUS`` or with a RAM-based file mapping (``tmpfs``, ``memfd``), +attempts to create a memslot with an invalid mmap will result in an +-EINVAL return. When enabled the VMM may make use of the ``KVM_ARM_MTE_COPY_TAGS`` ioctl to perform a bulk copy of tags to/from the guest. diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 505c8a1ccbe0..cd93d0738425 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1965,6 +1965,7 @@ config ARM64_MTE depends on ARM64_PAN select ARCH_HAS_SUBPAGE_FAULTS select ARCH_USES_HIGH_VMA_FLAGS + select ARCH_USES_PG_ARCH_X help Memory Tagging (part of the ARMv8.5 Extensions) provides architectural support for run-time, always-on detection of diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index 760c62f8e22f..20dd06d70af5 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -25,7 +25,7 @@ unsigned long mte_copy_tags_to_user(void __user *to, void *from, unsigned long n); int mte_save_tags(struct page *page); void mte_save_page_tags(const void *page_addr, void *tag_storage); -bool mte_restore_tags(swp_entry_t entry, struct page *page); +void mte_restore_tags(swp_entry_t entry, struct page *page); void mte_restore_page_tags(void *page_addr, const void *tag_storage); void mte_invalidate_tags(int type, pgoff_t offset); void mte_invalidate_tags_area(int type); @@ -36,6 +36,58 @@ void mte_free_tag_storage(char *storage); /* track which pages have valid allocation tags */ #define PG_mte_tagged PG_arch_2 +/* simple lock to avoid multiple threads tagging the same page */ +#define PG_mte_lock PG_arch_3 + +static inline void set_page_mte_tagged(struct page *page) +{ + /* + * Ensure that the tags written prior to this function are visible + * before the page flags update. + */ + smp_wmb(); + set_bit(PG_mte_tagged, &page->flags); +} + +static inline bool page_mte_tagged(struct page *page) +{ + bool ret = test_bit(PG_mte_tagged, &page->flags); + + /* + * If the page is tagged, ensure ordering with a likely subsequent + * read of the tags. + */ + if (ret) + smp_rmb(); + return ret; +} + +/* + * Lock the page for tagging and return 'true' if the page can be tagged, + * 'false' if already tagged. PG_mte_tagged is never cleared and therefore the + * locking only happens once for page initialisation. + * + * The page MTE lock state: + * + * Locked: PG_mte_lock && !PG_mte_tagged + * Unlocked: !PG_mte_lock || PG_mte_tagged + * + * Acquire semantics only if the page is tagged (returning 'false'). + */ +static inline bool try_page_mte_tagging(struct page *page) +{ + if (!test_and_set_bit(PG_mte_lock, &page->flags)) + return true; + + /* + * The tags are either being initialised or may have been initialised + * already. Check if the PG_mte_tagged flag has been set or wait + * otherwise. + */ + smp_cond_load_acquire(&page->flags, VAL & (1UL << PG_mte_tagged)); + + return false; +} void mte_zero_clear_page_tags(void *addr); void mte_sync_tags(pte_t old_pte, pte_t pte); @@ -56,6 +108,17 @@ size_t mte_probe_user_range(const char __user *uaddr, size_t size); /* unused if !CONFIG_ARM64_MTE, silence the compiler */ #define PG_mte_tagged 0 +static inline void set_page_mte_tagged(struct page *page) +{ +} +static inline bool page_mte_tagged(struct page *page) +{ + return false; +} +static inline bool try_page_mte_tagging(struct page *page) +{ + return false; +} static inline void mte_zero_clear_page_tags(void *addr) { } diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 71a1af42f0e8..8735ac1a1e32 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1049,8 +1049,8 @@ static inline void arch_swap_invalidate_area(int type) #define __HAVE_ARCH_SWAP_RESTORE static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) { - if (system_supports_mte() && mte_restore_tags(entry, &folio->page)) - set_bit(PG_mte_tagged, &folio->flags); + if (system_supports_mte()) + mte_restore_tags(entry, &folio->page); } #endif /* CONFIG_ARM64_MTE */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index b3f37e2209ad..79d153d34206 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2074,8 +2074,10 @@ static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap) * Clear the tags in the zero page. This needs to be done via the * linear map which has the Tagged attribute. */ - if (!test_and_set_bit(PG_mte_tagged, &ZERO_PAGE(0)->flags)) + if (try_page_mte_tagging(ZERO_PAGE(0))) { mte_clear_page_tags(lm_alias(empty_zero_page)); + set_page_mte_tagged(ZERO_PAGE(0)); + } kasan_init_hw_tags_cpu(); } diff --git a/arch/arm64/kernel/elfcore.c b/arch/arm64/kernel/elfcore.c index 27ef7ad3ffd2..353009d7f307 100644 --- a/arch/arm64/kernel/elfcore.c +++ b/arch/arm64/kernel/elfcore.c @@ -47,7 +47,7 @@ static int mte_dump_tag_range(struct coredump_params *cprm, * Pages mapped in user space as !pte_access_permitted() (e.g. * PROT_EXEC only) may not have the PG_mte_tagged flag set. */ - if (!test_bit(PG_mte_tagged, &page->flags)) { + if (!page_mte_tagged(page)) { put_page(page); dump_skip(cprm, MTE_PAGE_TAG_STORAGE); continue; diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index af5df48ba915..788597a6b6a2 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -271,7 +271,7 @@ static int swsusp_mte_save_tags(void) if (!page) continue; - if (!test_bit(PG_mte_tagged, &page->flags)) + if (!page_mte_tagged(page)) continue; ret = save_tags(page, pfn); diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 7467217c1eaf..f5bcb0dc6267 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -41,19 +41,17 @@ static void mte_sync_page_tags(struct page *page, pte_t old_pte, if (check_swap && is_swap_pte(old_pte)) { swp_entry_t entry = pte_to_swp_entry(old_pte); - if (!non_swap_entry(entry) && mte_restore_tags(entry, page)) - return; + if (!non_swap_entry(entry)) + mte_restore_tags(entry, page); } if (!pte_is_tagged) return; - /* - * Test PG_mte_tagged again in case it was racing with another - * set_pte_at(). - */ - if (!test_and_set_bit(PG_mte_tagged, &page->flags)) + if (try_page_mte_tagging(page)) { mte_clear_page_tags(page_address(page)); + set_page_mte_tagged(page); + } } void mte_sync_tags(pte_t old_pte, pte_t pte) @@ -69,9 +67,11 @@ void mte_sync_tags(pte_t old_pte, pte_t pte) /* if PG_mte_tagged is set, tags have already been initialised */ for (i = 0; i < nr_pages; i++, page++) { - if (!test_bit(PG_mte_tagged, &page->flags)) + if (!page_mte_tagged(page)) { mte_sync_page_tags(page, old_pte, check_swap, pte_is_tagged); + set_page_mte_tagged(page); + } } /* ensure the tags are visible before the PTE is set */ @@ -96,8 +96,7 @@ int memcmp_pages(struct page *page1, struct page *page2) * pages is tagged, set_pte_at() may zero or change the tags of the * other page via mte_sync_tags(). */ - if (test_bit(PG_mte_tagged, &page1->flags) || - test_bit(PG_mte_tagged, &page2->flags)) + if (page_mte_tagged(page1) || page_mte_tagged(page2)) return addr1 != addr2; return ret; @@ -454,7 +453,7 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, put_page(page); break; } - WARN_ON_ONCE(!test_bit(PG_mte_tagged, &page->flags)); + WARN_ON_ONCE(!page_mte_tagged(page)); /* limit access to the end of the page */ offset = offset_in_page(addr); diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 2ff13a3f8479..5626ddb540ce 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -1059,7 +1059,7 @@ long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, maddr = page_address(page); if (!write) { - if (test_bit(PG_mte_tagged, &page->flags)) + if (page_mte_tagged(page)) num_tags = mte_copy_tags_to_user(tags, maddr, MTE_GRANULES_PER_PAGE); else @@ -1068,15 +1068,19 @@ long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, clear_user(tags, MTE_GRANULES_PER_PAGE); kvm_release_pfn_clean(pfn); } else { + /* + * Only locking to serialise with a concurrent + * set_pte_at() in the VMM but still overriding the + * tags, hence ignoring the return value. + */ + try_page_mte_tagging(page); num_tags = mte_copy_tags_from_user(maddr, tags, MTE_GRANULES_PER_PAGE); - /* - * Set the flag after checking the write - * completed fully - */ - if (num_tags == MTE_GRANULES_PER_PAGE) - set_bit(PG_mte_tagged, &page->flags); + /* uaccess failed, don't leave stale tags */ + if (num_tags != MTE_GRANULES_PER_PAGE) + mte_clear_page_tags(page); + set_page_mte_tagged(page); kvm_release_pfn_dirty(pfn); } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index a1b05e60aebe..39d9a334efb5 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1164,32 +1164,26 @@ static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva) * - mmap_lock protects between a VM faulting a page in and the VMM performing * an mprotect() to add VM_MTE */ -static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, - unsigned long size) +static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, + unsigned long size) { unsigned long i, nr_pages = size >> PAGE_SHIFT; - struct page *page; + struct page *page = pfn_to_page(pfn); if (!kvm_has_mte(kvm)) - return 0; - - /* - * pfn_to_online_page() is used to reject ZONE_DEVICE pages - * that may not support tags. - */ - page = pfn_to_online_page(pfn); - - if (!page) - return -EFAULT; + return; for (i = 0; i < nr_pages; i++, page++) { - if (!test_bit(PG_mte_tagged, &page->flags)) { + if (try_page_mte_tagging(page)) { mte_clear_page_tags(page_address(page)); - set_bit(PG_mte_tagged, &page->flags); + set_page_mte_tagged(page); } } +} - return 0; +static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_MTE_ALLOWED; } static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, @@ -1200,7 +1194,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, bool write_fault, writable, force_pte = false; bool exec_fault; bool device = false; - bool shared; unsigned long mmu_seq; struct kvm *kvm = vcpu->kvm; struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; @@ -1247,8 +1240,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, vma_shift = get_vma_page_shift(vma, hva); } - shared = (vma->vm_flags & VM_SHARED); - switch (vma_shift) { #ifndef __PAGETABLE_PMD_FOLDED case PUD_SHIFT: @@ -1360,13 +1351,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) { - /* Check the VMM hasn't introduced a new VM_SHARED VMA */ - if (!shared) - ret = sanitise_mte_tags(kvm, pfn, vma_pagesize); - else + /* Check the VMM hasn't introduced a new disallowed VMA */ + if (kvm_vma_mte_allowed(vma)) { + sanitise_mte_tags(kvm, pfn, vma_pagesize); + } else { ret = -EFAULT; - if (ret) goto out_unlock; + } } if (writable) @@ -1582,15 +1573,18 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { kvm_pfn_t pfn = pte_pfn(range->pte); - int ret; if (!kvm->arch.mmu.pgt) return false; WARN_ON(range->end - range->start != 1); - ret = sanitise_mte_tags(kvm, pfn, PAGE_SIZE); - if (ret) + /* + * If the page isn't tagged, defer to user_mem_abort() for sanitising + * the MTE tags. The S2 pte should have been unmapped by + * mmu_notifier_invalidate_range_end(). + */ + if (kvm_has_mte(kvm) && !page_mte_tagged(pfn_to_page(pfn))) return false; /* @@ -1822,12 +1816,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, if (!vma) break; - /* - * VM_SHARED mappings are not allowed with MTE to avoid races - * when updating the PG_mte_tagged page flag, see - * sanitise_mte_tags for more details. - */ - if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED) { + if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) { ret = -EINVAL; break; } diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c index 24913271e898..8dd5a8fe64b4 100644 --- a/arch/arm64/mm/copypage.c +++ b/arch/arm64/mm/copypage.c @@ -21,9 +21,12 @@ void copy_highpage(struct page *to, struct page *from) copy_page(kto, kfrom); - if (system_supports_mte() && test_bit(PG_mte_tagged, &from->flags)) { - set_bit(PG_mte_tagged, &to->flags); + if (system_supports_mte() && page_mte_tagged(from)) { + page_kasan_tag_reset(to); + /* It's a new page, shouldn't have been tagged yet */ + WARN_ON_ONCE(!try_page_mte_tagging(to)); mte_copy_page_tags(kto, kfrom); + set_page_mte_tagged(to); } } EXPORT_SYMBOL(copy_highpage); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 3e9cf9826417..0b1c102b89c9 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -937,6 +937,8 @@ struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, void tag_clear_highpage(struct page *page) { + /* Newly allocated page, shouldn't have been tagged yet */ + WARN_ON_ONCE(!try_page_mte_tagging(page)); mte_zero_clear_page_tags(page_address(page)); - set_bit(PG_mte_tagged, &page->flags); + set_page_mte_tagged(page); } diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c index bed803d8e158..cd508ba80ab1 100644 --- a/arch/arm64/mm/mteswap.c +++ b/arch/arm64/mm/mteswap.c @@ -24,7 +24,7 @@ int mte_save_tags(struct page *page) { void *tag_storage, *ret; - if (!test_bit(PG_mte_tagged, &page->flags)) + if (!page_mte_tagged(page)) return 0; tag_storage = mte_allocate_tag_storage(); @@ -46,21 +46,17 @@ int mte_save_tags(struct page *page) return 0; } -bool mte_restore_tags(swp_entry_t entry, struct page *page) +void mte_restore_tags(swp_entry_t entry, struct page *page) { void *tags = xa_load(&mte_pages, entry.val); if (!tags) - return false; + return; - /* - * Test PG_mte_tagged again in case it was racing with another - * set_pte_at(). - */ - if (!test_and_set_bit(PG_mte_tagged, &page->flags)) + if (try_page_mte_tagging(page)) { mte_restore_page_tags(page_address(page), tags); - - return true; + set_page_mte_tagged(page); + } } void mte_invalidate_tags(int type, pgoff_t offset) diff --git a/fs/proc/page.c b/fs/proc/page.c index f2273b164535..6249c347809a 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -219,8 +219,9 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); -#ifdef CONFIG_64BIT +#ifdef CONFIG_ARCH_USES_PG_ARCH_X u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2); + u |= kpf_copy_bit(k, KPF_ARCH_3, PG_arch_3); #endif return u; diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h index eee1877a354e..859f4b0c1b2b 100644 --- a/include/linux/kernel-page-flags.h +++ b/include/linux/kernel-page-flags.h @@ -18,5 +18,6 @@ #define KPF_UNCACHED 39 #define KPF_SOFTDIRTY 40 #define KPF_ARCH_2 41 +#define KPF_ARCH_3 42 #endif /* LINUX_KERNEL_PAGE_FLAGS_H */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0b0ae5084e60..c50ce2812f17 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -132,8 +132,9 @@ enum pageflags { PG_young, PG_idle, #endif -#ifdef CONFIG_64BIT +#ifdef CONFIG_ARCH_USES_PG_ARCH_X PG_arch_2, + PG_arch_3, #endif #ifdef CONFIG_KASAN_HW_TAGS PG_skip_kasan_poison, diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index e87cb2b80ed3..412b5a46374c 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -91,10 +91,10 @@ #define IF_HAVE_PG_IDLE(flag,string) #endif -#ifdef CONFIG_64BIT -#define IF_HAVE_PG_ARCH_2(flag,string) ,{1UL << flag, string} +#ifdef CONFIG_ARCH_USES_PG_ARCH_X +#define IF_HAVE_PG_ARCH_X(flag,string) ,{1UL << flag, string} #else -#define IF_HAVE_PG_ARCH_2(flag,string) +#define IF_HAVE_PG_ARCH_X(flag,string) #endif #ifdef CONFIG_KASAN_HW_TAGS @@ -130,7 +130,8 @@ IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \ IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \ IF_HAVE_PG_IDLE(PG_young, "young" ) \ IF_HAVE_PG_IDLE(PG_idle, "idle" ) \ -IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" ) \ +IF_HAVE_PG_ARCH_X(PG_arch_2, "arch_2" ) \ +IF_HAVE_PG_ARCH_X(PG_arch_3, "arch_3" ) \ IF_HAVE_PG_SKIP_KASAN_POISON(PG_skip_kasan_poison, "skip_kasan_poison") #define show_page_flags(flags) \ diff --git a/mm/Kconfig b/mm/Kconfig index 57e1d8c5b505..807bd7192f51 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1005,6 +1005,14 @@ config ARCH_USES_HIGH_VMA_FLAGS config ARCH_HAS_PKEYS bool +config ARCH_USES_PG_ARCH_X + bool + help + Enable the definition of PG_arch_x page flags with x > 1. Only + suitable for 64-bit architectures with CONFIG_FLATMEM or + CONFIG_SPARSEMEM_VMEMMAP enabled, otherwise there may not be + enough room for additional bits in page->flags. + config VM_EVENT_COUNTERS default y bool "Enable VM event counters for /proc/vmstat" if EXPERT diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 561a42567477..dfe72ea23c5f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2444,8 +2444,9 @@ static void __split_huge_page_tail(struct page *head, int tail, (1L << PG_workingset) | (1L << PG_locked) | (1L << PG_unevictable) | -#ifdef CONFIG_64BIT +#ifdef CONFIG_ARCH_USES_PG_ARCH_X (1L << PG_arch_2) | + (1L << PG_arch_3) | #endif (1L << PG_dirty) | LRU_GEN_MASK | LRU_REFS_MASK)); |