Diffstat (limited to 'arch/x86/mm')
 arch/x86/mm/fault.c                |   4
 arch/x86/mm/init.c                 |   8
 arch/x86/mm/init_32.c              |   2
 arch/x86/mm/init_64.c              | 210
 arch/x86/mm/ioremap.c              |  19
 arch/x86/mm/kaslr.c                |   2
 arch/x86/mm/kmmio.c                |   2
 arch/x86/mm/mem_encrypt.c          |   6
 arch/x86/mm/mem_encrypt_boot.S     |   2
 arch/x86/mm/mem_encrypt_identity.c |  35
 arch/x86/mm/pat/memtype.c          |   4
 arch/x86/mm/pat/set_memory.c       |   2
 arch/x86/mm/pgtable.c              |  13
 arch/x86/mm/pkeys.c                |   2
 arch/x86/mm/pti.c                  |  11
 arch/x86/mm/tlb.c                  | 182
 16 files changed, 283 insertions(+), 221 deletions(-)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a73347e2cdfc..1c548ad00752 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1497,7 +1497,7 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
* userspace task is trying to access some valid (from guest's point of
* view) memory which is not currently mapped by the host (e.g. the
* memory is swapped out). Note, the corresponding "page ready" event
- * which is injected when the memory becomes available, is delived via
+ * which is injected when the memory becomes available, is delivered via
* an interrupt mechanism and not a #PF exception
* (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()).
*
@@ -1523,7 +1523,7 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
*
* In case the fault hit a RCU idle region the conditional entry
* code reenabled RCU to avoid subsequent wreckage which helps
- * debugability.
+ * debuggability.
*/
state = irqentry_enter(regs);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index dd694fb93916..75ef19aa8903 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -29,7 +29,7 @@
/*
* We need to define the tracepoints somewhere, and tlb.c
- * is only compied when SMP=y.
+ * is only compiled when SMP=y.
*/
#define CREATE_TRACE_POINTS
#include <trace/events/tlb.h>
@@ -756,7 +756,7 @@ void __init init_mem_mapping(void)
#ifdef CONFIG_X86_64
if (max_pfn > max_low_pfn) {
- /* can we preseve max_low_pfn ?*/
+ /* can we preserve max_low_pfn ?*/
max_low_pfn = max_pfn;
}
#else
@@ -939,7 +939,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
{
/*
* end could be not aligned, and We can not align that,
- * decompresser could be confused by aligned initrd_end
+ * decompressor could be confused by aligned initrd_end
* We already reserve the end partial page before in
* - i386_start_kernel()
* - x86_64_start_kernel()
@@ -1017,7 +1017,7 @@ void __init zone_sizes_init(void)
free_area_init(max_zone_pfns);
}
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+__visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = {
.loaded_mm = &init_mm,
.next_asid = 1,
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index da31c2635ee4..21ffb03f6c72 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -755,8 +755,6 @@ void __init mem_init(void)
after_bootmem = 1;
x86_init.hyper.init_after_bootmem();
- mem_init_print_info(NULL);
-
/*
* Check boundaries twice: Some fundamental inconsistencies can
* be detected at build time already.
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index b5a3fa4033d3..e527d829e1ed 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -172,7 +172,7 @@ static void sync_global_pgds_l4(unsigned long start, unsigned long end)
/*
* With folded p4d, pgd_none() is always false, we need to
- * handle synchonization on p4d level.
+ * handle synchronization on p4d level.
*/
MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref));
p4d_ref = p4d_offset(pgd_ref, addr);
@@ -826,6 +826,106 @@ void __init paging_init(void)
zone_sizes_init();
}
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+#define PAGE_UNUSED 0xFD
+
+/*
+ * The unused vmemmap range, which was not yet memset(PAGE_UNUSED), ranges
+ * from unused_pmd_start to the next PMD_SIZE boundary.
+ */
+static unsigned long unused_pmd_start __meminitdata;
+
+static void __meminit vmemmap_flush_unused_pmd(void)
+{
+ if (!unused_pmd_start)
+ return;
+ /*
+ * Clears (unused_pmd_start, PMD_END]
+ */
+ memset((void *)unused_pmd_start, PAGE_UNUSED,
+ ALIGN(unused_pmd_start, PMD_SIZE) - unused_pmd_start);
+ unused_pmd_start = 0;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/* Returns true if the PMD is completely unused and thus it can be freed */
+static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
+{
+ unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
+
+ /*
+ * Flush the unused range cache to ensure that memchr_inv() will work
+ * for the whole range.
+ */
+ vmemmap_flush_unused_pmd();
+ memset((void *)addr, PAGE_UNUSED, end - addr);
+
+ return !memchr_inv((void *)start, PAGE_UNUSED, PMD_SIZE);
+}
+#endif
+
+static void __meminit __vmemmap_use_sub_pmd(unsigned long start)
+{
+ /*
+ * As we expect to add in the same granularity as we remove, it's
+ * sufficient to mark only some piece used to block the memmap page from
+ * getting removed when removing some other adjacent memmap (just in
+ * case the first memmap never gets initialized e.g., because the memory
+ * block never gets onlined).
+ */
+ memset((void *)start, 0, sizeof(struct page));
+}
+
+static void __meminit vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
+{
+ /*
+ * We only optimize if the new used range directly follows the
+ * previously unused range (esp., when populating consecutive sections).
+ */
+ if (unused_pmd_start == start) {
+ if (likely(IS_ALIGNED(end, PMD_SIZE)))
+ unused_pmd_start = 0;
+ else
+ unused_pmd_start = end;
+ return;
+ }
+
+ /*
+ * If the range does not contiguously follow the previous one, make sure
+ * to mark the unused range of the previous one so it can be removed.
+ */
+ vmemmap_flush_unused_pmd();
+ __vmemmap_use_sub_pmd(start);
+}
+
+
+static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
+{
+ vmemmap_flush_unused_pmd();
+
+ /*
+ * Could be our memmap page is filled with PAGE_UNUSED already from a
+ * previous remove. Make sure to reset it.
+ */
+ __vmemmap_use_sub_pmd(start);
+
+ /*
+ * Mark with PAGE_UNUSED the unused parts of the new memmap range
+ */
+ if (!IS_ALIGNED(start, PMD_SIZE))
+ memset((void *)start, PAGE_UNUSED,
+ start - ALIGN_DOWN(start, PMD_SIZE));
+
+ /*
+ * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
+ * consecutive sections. Remember for the last added PMD where the
+ * unused range begins.
+ */
+ if (!IS_ALIGNED(end, PMD_SIZE))
+ unused_pmd_start = end;
+}
+#endif
+
/*
* Memory hotplug specific functions
*/
@@ -871,8 +971,6 @@ int arch_add_memory(int nid, u64 start, u64 size,
return add_pages(nid, start_pfn, nr_pages, params);
}
-#define PAGE_INUSE 0xFD
-
static void __meminit free_pagetable(struct page *page, int order)
{
unsigned long magic;
@@ -962,7 +1060,6 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
{
unsigned long next, pages = 0;
pte_t *pte;
- void *page_addr;
phys_addr_t phys_addr;
pte = pte_start + pte_index(addr);
@@ -983,42 +1080,15 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
if (phys_addr < (phys_addr_t)0x40000000)
return;
- if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
- /*
- * Do not free direct mapping pages since they were
- * freed when offlining, or simplely not in use.
- */
- if (!direct)
- free_pagetable(pte_page(*pte), 0);
-
- spin_lock(&init_mm.page_table_lock);
- pte_clear(&init_mm, addr, pte);
- spin_unlock(&init_mm.page_table_lock);
-
- /* For non-direct mapping, pages means nothing. */
- pages++;
- } else {
- /*
- * If we are here, we are freeing vmemmap pages since
- * direct mapped memory ranges to be freed are aligned.
- *
- * If we are not removing the whole page, it means
- * other page structs in this page are being used and
- * we canot remove them. So fill the unused page_structs
- * with 0xFD, and remove the page when it is wholly
- * filled with 0xFD.
- */
- memset((void *)addr, PAGE_INUSE, next - addr);
+ if (!direct)
+ free_pagetable(pte_page(*pte), 0);
- page_addr = page_address(pte_page(*pte));
- if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
- free_pagetable(pte_page(*pte), 0);
+ spin_lock(&init_mm.page_table_lock);
+ pte_clear(&init_mm, addr, pte);
+ spin_unlock(&init_mm.page_table_lock);
- spin_lock(&init_mm.page_table_lock);
- pte_clear(&init_mm, addr, pte);
- spin_unlock(&init_mm.page_table_lock);
- }
- }
+ /* For non-direct mapping, pages means nothing. */
+ pages++;
}
/* Call free_pte_table() in remove_pmd_table(). */
@@ -1034,7 +1104,6 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
unsigned long next, pages = 0;
pte_t *pte_base;
pmd_t *pmd;
- void *page_addr;
pmd = pmd_start + pmd_index(addr);
for (; addr < end; addr = next, pmd++) {
@@ -1054,22 +1123,16 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
pmd_clear(pmd);
spin_unlock(&init_mm.page_table_lock);
pages++;
- } else {
- /* If here, we are freeing vmemmap pages. */
- memset((void *)addr, PAGE_INUSE, next - addr);
-
- page_addr = page_address(pmd_page(*pmd));
- if (!memchr_inv(page_addr, PAGE_INUSE,
- PMD_SIZE)) {
+ }
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ else if (vmemmap_pmd_is_unused(addr, next)) {
free_hugepage_table(pmd_page(*pmd),
altmap);
-
spin_lock(&init_mm.page_table_lock);
pmd_clear(pmd);
spin_unlock(&init_mm.page_table_lock);
- }
}
-
+#endif
continue;
}
@@ -1090,7 +1153,6 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
unsigned long next, pages = 0;
pmd_t *pmd_base;
pud_t *pud;
- void *page_addr;
pud = pud_start + pud_index(addr);
for (; addr < end; addr = next, pud++) {
@@ -1099,33 +1161,13 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
if (!pud_present(*pud))
continue;
- if (pud_large(*pud)) {
- if (IS_ALIGNED(addr, PUD_SIZE) &&
- IS_ALIGNED(next, PUD_SIZE)) {
- if (!direct)
- free_pagetable(pud_page(*pud),
- get_order(PUD_SIZE));
-
- spin_lock(&init_mm.page_table_lock);
- pud_clear(pud);
- spin_unlock(&init_mm.page_table_lock);
- pages++;
- } else {
- /* If here, we are freeing vmemmap pages. */
- memset((void *)addr, PAGE_INUSE, next - addr);
-
- page_addr = page_address(pud_page(*pud));
- if (!memchr_inv(page_addr, PAGE_INUSE,
- PUD_SIZE)) {
- free_pagetable(pud_page(*pud),
- get_order(PUD_SIZE));
-
- spin_lock(&init_mm.page_table_lock);
- pud_clear(pud);
- spin_unlock(&init_mm.page_table_lock);
- }
- }
-
+ if (pud_large(*pud) &&
+ IS_ALIGNED(addr, PUD_SIZE) &&
+ IS_ALIGNED(next, PUD_SIZE)) {
+ spin_lock(&init_mm.page_table_lock);
+ pud_clear(pud);
+ spin_unlock(&init_mm.page_table_lock);
+ pages++;
continue;
}
@@ -1197,6 +1239,9 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct,
void __ref vmemmap_free(unsigned long start, unsigned long end,
struct vmem_altmap *altmap)
{
+ VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
+ VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));
+
remove_pagetable(start, end, false, altmap);
}
@@ -1306,8 +1351,6 @@ void __init mem_init(void)
kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER);
preallocate_vmalloc_pages();
-
- mem_init_print_info(NULL);
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -1538,11 +1581,17 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
addr_end = addr + PMD_SIZE;
p_end = p + PMD_SIZE;
+
+ if (!IS_ALIGNED(addr, PMD_SIZE) ||
+ !IS_ALIGNED(next, PMD_SIZE))
+ vmemmap_use_new_sub_pmd(addr, next);
+
continue;
} else if (altmap)
return -ENOMEM; /* no fallback */
} else if (pmd_large(*pmd)) {
vmemmap_verify((pte_t *)pmd, node, addr, next);
+ vmemmap_use_sub_pmd(addr, next);
continue;
}
if (vmemmap_populate_basepages(addr, next, node, NULL))
@@ -1556,6 +1605,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
{
int err;
+ VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
+ VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));
+
if (end - start < PAGES_PER_SECTION * sizeof(struct page))
err = vmemmap_populate_basepages(start, end, node, NULL);
else if (boot_cpu_has(X86_FEATURE_PSE))
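
Aside on the vmemmap changes above: the core trick is to fill the parts of a PMD-mapped memmap chunk that are not (or no longer) in use with the PAGE_UNUSED sentinel, and to free the backing huge page only once memchr_inv() finds nothing but the sentinel. Below is a minimal user-space sketch of that bookkeeping for a single chunk; CHUNK_SIZE, mark_unused() and chunk_is_unused() are invented names, not kernel API.

/*
 * User-space illustration only: sentinel-fill unused sub-ranges, free the
 * chunk when no live byte remains.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_UNUSED 0xFD
#define CHUNK_SIZE  4096		/* stands in for PMD_SIZE */

/* memchr_inv()-like helper: first byte differing from c, or NULL if none */
static const void *first_mismatch(const void *p, int c, size_t len)
{
	const unsigned char *s = p;

	for (size_t i = 0; i < len; i++)
		if (s[i] != (unsigned char)c)
			return s + i;
	return NULL;
}

/* Mark [start, end) of the chunk as unused by filling it with the sentinel. */
static void mark_unused(unsigned char *chunk, size_t start, size_t end)
{
	memset(chunk + start, PAGE_UNUSED, end - start);
}

/* The chunk may be freed only once every byte carries the sentinel. */
static bool chunk_is_unused(const unsigned char *chunk)
{
	return !first_mismatch(chunk, PAGE_UNUSED, CHUNK_SIZE);
}

int main(void)
{
	unsigned char *chunk = calloc(1, CHUNK_SIZE);

	mark_unused(chunk, 0, CHUNK_SIZE / 2);
	printf("half marked:  freeable=%d\n", chunk_is_unused(chunk));	/* 0 */

	mark_unused(chunk, CHUNK_SIZE / 2, CHUNK_SIZE);
	printf("fully marked: freeable=%d\n", chunk_is_unused(chunk));	/* 1 */

	free(chunk);
	return 0;
}

vmemmap_use_new_sub_pmd() and vmemmap_pmd_is_unused() above apply the same idea per PMD-sized range, with unused_pmd_start caching the start of the not-yet-memset tail so consecutive section hot-adds can skip the fill.
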
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 9e5ccc56f8e0..12c686c65ea9 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -481,25 +481,6 @@ void iounmap(volatile void __iomem *addr)
}
EXPORT_SYMBOL(iounmap);
-int __init arch_ioremap_p4d_supported(void)
-{
- return 0;
-}
-
-int __init arch_ioremap_pud_supported(void)
-{
-#ifdef CONFIG_X86_64
- return boot_cpu_has(X86_FEATURE_GBPAGES);
-#else
- return 0;
-#endif
-}
-
-int __init arch_ioremap_pmd_supported(void)
-{
- return boot_cpu_has(X86_FEATURE_PSE);
-}
-
/*
* Convert a physical pointer to a virtual kernel pointer for /dev/mem
* access
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 6e6b39710e5f..557f0fe25dff 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -96,7 +96,7 @@ void __init kernel_randomize_memory(void)
memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
- /* Adapt phyiscal memory region size based on available memory */
+ /* Adapt physical memory region size based on available memory */
if (memory_tb < kaslr_regions[0].size_tb)
kaslr_regions[0].size_tb = memory_tb;
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index be020a7bc414..d3efbc5b3449 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/* Support for MMIO probes.
- * Benfit many code from kprobes
+ * Benefits from much of the kprobes code
* (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
* 2007 Alexander Eichner
* 2008 Pekka Paalanen <pq@iki.fi>
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index ae78cef79980..f633f9e23b8f 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -19,6 +19,7 @@
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/dma-mapping.h>
+#include <linux/virtio_config.h>
#include <asm/tlbflush.h>
#include <asm/fixmap.h>
@@ -484,3 +485,8 @@ void __init mem_encrypt_init(void)
print_mem_encrypt_feature_info();
}
+int arch_has_restricted_virtio_memory_access(void)
+{
+ return sev_active();
+}
+EXPORT_SYMBOL_GPL(arch_has_restricted_virtio_memory_access);
diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
index 7a84fc8bc5c3..17d292b7072f 100644
--- a/arch/x86/mm/mem_encrypt_boot.S
+++ b/arch/x86/mm/mem_encrypt_boot.S
@@ -27,7 +27,7 @@ SYM_FUNC_START(sme_encrypt_execute)
* - stack page (PAGE_SIZE)
* - encryption routine page (PAGE_SIZE)
* - intermediate copy buffer (PMD_PAGE_SIZE)
- * R8 - physcial address of the pagetables to use for encryption
+ * R8 - physical address of the pagetables to use for encryption
*/
push %rbp
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index 6c5eb6f3f14f..a19374d26101 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -503,14 +503,10 @@ void __init sme_enable(struct boot_params *bp)
#define AMD_SME_BIT BIT(0)
#define AMD_SEV_BIT BIT(1)
- /*
- * Set the feature mask (SME or SEV) based on whether we are
- * running under a hypervisor.
- */
- eax = 1;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT;
+
+ /* Check the SEV MSR to see whether SEV or SME is enabled */
+ sev_status = __rdmsr(MSR_AMD64_SEV);
+ feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
/*
* Check for the SME/SEV feature:
@@ -530,19 +526,26 @@ void __init sme_enable(struct boot_params *bp)
/* Check if memory encryption is enabled */
if (feature_mask == AMD_SME_BIT) {
+ /*
+ * No SME if Hypervisor bit is set. This check is here to
+ * prevent a guest from trying to enable SME. For running as a
+ * KVM guest the MSR_K8_SYSCFG will be sufficient, but there
+ * might be other hypervisors which emulate that MSR as non-zero
+ * or even pass it through to the guest.
+ * A malicious hypervisor can still trick a guest into this
+ * path, but there is no way to protect against that.
+ */
+ eax = 1;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (ecx & BIT(31))
+ return;
+
/* For SME, check the SYSCFG MSR */
msr = __rdmsr(MSR_K8_SYSCFG);
if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
return;
} else {
- /* For SEV, check the SEV MSR */
- msr = __rdmsr(MSR_AMD64_SEV);
- if (!(msr & MSR_AMD64_SEV_ENABLED))
- return;
-
- /* Save SEV_STATUS to avoid reading MSR again */
- sev_status = msr;
-
/* SEV state cannot be controlled by a command line option */
sme_me_mask = me_mask;
sev_enabled = true;
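
For clarity, here is a user-space sketch of the reordered detection flow in sme_enable() above: the SEV status MSR now selects which feature to probe, and the CPUID hypervisor bit only disqualifies the bare-metal SME path. The accessors and the bit positions are stand-ins for the real MSR/CPUID interface, not kernel code.

#include <stdbool.h>
#include <stdio.h>

#define SEV_ENABLED_BIT		0x1ULL		/* like MSR_AMD64_SEV_ENABLED */
#define SYSCFG_MEM_ENCRYPT_BIT	(1ULL << 23)	/* like MSR_K8_SYSCFG_MEM_ENCRYPT */

/* Fake "hardware" state so the flow can be exercised without real MSRs. */
static unsigned long long fake_sev_status;
static bool fake_hypervisor_bit;
static unsigned long long fake_syscfg;

static unsigned long long read_sev_status(void)	{ return fake_sev_status; }
static bool cpuid_hypervisor_bit(void)		{ return fake_hypervisor_bit; }
static unsigned long long read_syscfg(void)	{ return fake_syscfg; }

enum mem_enc { MEM_ENC_NONE, MEM_ENC_SME, MEM_ENC_SEV };

static enum mem_enc detect_mem_encryption(void)
{
	/* A guest reports SEV directly through the SEV status MSR. */
	if (read_sev_status() & SEV_ENABLED_BIT)
		return MEM_ENC_SEV;

	/* SME candidate: never trust SYSCFG when a hypervisor is present. */
	if (cpuid_hypervisor_bit())
		return MEM_ENC_NONE;

	return (read_syscfg() & SYSCFG_MEM_ENCRYPT_BIT) ? MEM_ENC_SME
							: MEM_ENC_NONE;
}

int main(void)
{
	fake_syscfg = SYSCFG_MEM_ENCRYPT_BIT;
	printf("bare metal, SYSCFG set -> %d (expect 1 = SME)\n", detect_mem_encryption());

	fake_hypervisor_bit = true;
	printf("guest, SYSCFG emulated -> %d (expect 0 = none)\n", detect_mem_encryption());

	fake_sev_status = SEV_ENABLED_BIT;
	printf("SEV guest              -> %d (expect 2 = SEV)\n", detect_mem_encryption());
	return 0;
}
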
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index ca311aaa67b8..3112ca7786ed 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -695,7 +695,7 @@ int memtype_free(u64 start, u64 end)
/**
- * lookup_memtype - Looksup the memory type for a physical address
+ * lookup_memtype - Looks up the memory type for a physical address
* @paddr: physical address of which memory type needs to be looked up
*
* Only to be called when PAT is enabled
@@ -800,6 +800,7 @@ void memtype_free_io(resource_size_t start, resource_size_t end)
memtype_free(start, end);
}
+#ifdef CONFIG_X86_PAT
int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size)
{
enum page_cache_mode type = _PAGE_CACHE_MODE_WC;
@@ -813,6 +814,7 @@ void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
memtype_free_io(start, start + size);
}
EXPORT_SYMBOL(arch_io_free_memtype_wc);
+#endif
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t vma_prot)
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 16f878c26667..427980617557 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -680,7 +680,7 @@ pmd_t *lookup_pmd_address(unsigned long address)
* end up in this kind of memory, for instance.
*
* This could be optimized, but it is only intended to be
- * used at inititalization time, and keeping it
+ * used at initialization time, and keeping it
* unoptimized should increase the testing coverage for
* the more obscure platforms.
*/
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index f6a9e2e36642..d27cf69e811d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -780,14 +780,6 @@ int pmd_clear_huge(pmd_t *pmd)
return 0;
}
-/*
- * Until we support 512GB pages, skip them in the vmap area.
- */
-int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
-{
- return 0;
-}
-
#ifdef CONFIG_X86_64
/**
* pud_free_pmd_page - Clear pud entry and free pmd page.
@@ -861,11 +853,6 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
#else /* !CONFIG_X86_64 */
-int pud_free_pmd_page(pud_t *pud, unsigned long addr)
-{
- return pud_none(*pud);
-}
-
/*
* Disable free page handling on x86-PAE. This assures that ioremap()
* does not update sync'd pmd entries. See vmalloc_sync_one().
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index 8873ed1438a9..a2332eef66e9 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -128,7 +128,7 @@ u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) |
/*
* Called from the FPU code when creating a fresh set of FPU
* registers. This is called from a very specific context where
- * we know the FPU regstiers are safe for use and we can use PKRU
+ * we know the FPU registers are safe for use and we can use PKRU
* directly.
*/
void copy_init_pkru_to_fpregs(void)
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 1aab92930569..5d5c7bb50ce9 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -361,7 +361,7 @@ pti_clone_pgtable(unsigned long start, unsigned long end,
* global, so set it as global in both copies. Note:
* the X86_FEATURE_PGE check is not _required_ because
* the CPU ignores _PAGE_GLOBAL when PGE is not
- * supported. The check keeps consistentency with
+ * supported. The check keeps consistency with
* code that only set this bit when supported.
*/
if (boot_cpu_has(X86_FEATURE_PGE))
@@ -440,10 +440,9 @@ static void __init pti_clone_user_shared(void)
for_each_possible_cpu(cpu) {
/*
- * The SYSCALL64 entry code needs to be able to find the
- * thread stack and needs one word of scratch space in which
- * to spill a register. All of this lives in the TSS, in
- * the sp1 and sp2 slots.
+ * The SYSCALL64 entry code needs one word of scratch space
+ * in which to spill a register. It lives in the sp2 slot
+ * of the CPU's TSS.
*
* This is done for all possible CPUs during boot to ensure
* that it's propagated to all mms.
@@ -512,7 +511,7 @@ static void pti_clone_entry_text(void)
static inline bool pti_kernel_image_global_ok(void)
{
/*
- * Systems with PCIDs get litlle benefit from global
+ * Systems with PCIDs get little benefit from global
* kernel text and are not worth the downsides.
*/
if (cpu_feature_enabled(X86_FEATURE_PCID))
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 569ac1d57f55..78804680e923 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -24,7 +24,7 @@
# define __flush_tlb_local native_flush_tlb_local
# define __flush_tlb_global native_flush_tlb_global
# define __flush_tlb_one_user(addr) native_flush_tlb_one_user(addr)
-# define __flush_tlb_others(msk, info) native_flush_tlb_others(msk, info)
+# define __flush_tlb_multi(msk, info) native_flush_tlb_multi(msk, info)
#endif
/*
@@ -106,7 +106,7 @@ static inline u16 kern_pcid(u16 asid)
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
- * Make sure that the dynamic ASID space does not confict with the
+ * Make sure that the dynamic ASID space does not conflict with the
* bit we are using to switch between user and kernel ASIDs.
*/
BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));
@@ -300,7 +300,7 @@ void leave_mm(int cpu)
return;
/* Warn if we're not lazy. */
- WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
+ WARN_ON(!this_cpu_read(cpu_tlbstate_shared.is_lazy));
switch_mm(NULL, &init_mm, NULL);
}
@@ -316,7 +316,7 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
local_irq_restore(flags);
}
-static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
+static unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
{
unsigned long next_tif = task_thread_info(next)->flags;
unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
@@ -424,7 +424,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
- bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
+ bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
bool need_flush;
@@ -439,7 +439,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* NB: leave_mm() calls us with prev == NULL and tsk == NULL.
*/
- /* We don't want flush_tlb_func_* to run concurrently with us. */
+ /* We don't want flush_tlb_func() to run concurrently with us. */
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
WARN_ON_ONCE(!irqs_disabled());
@@ -469,7 +469,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
__flush_tlb_all();
}
#endif
- this_cpu_write(cpu_tlbstate.is_lazy, false);
+ if (was_lazy)
+ this_cpu_write(cpu_tlbstate_shared.is_lazy, false);
/*
* The membarrier system call requires a full memory barrier and
@@ -490,7 +491,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
/*
* Even in lazy TLB mode, the CPU should stay set in the
* mm_cpumask. The TLB shootdown code can figure out from
- * from cpu_tlbstate.is_lazy whether or not to send an IPI.
+ * cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
*/
if (WARN_ON_ONCE(real_prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
@@ -598,7 +599,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
- this_cpu_write(cpu_tlbstate.is_lazy, true);
+ this_cpu_write(cpu_tlbstate_shared.is_lazy, true);
}
/*
@@ -647,14 +648,13 @@ void initialize_tlbstate_and_flush(void)
}
/*
- * flush_tlb_func_common()'s memory ordering requirement is that any
+ * flush_tlb_func()'s memory ordering requirement is that any
* TLB fills that happen after we flush the TLB are ordered after we
* read active_mm's tlb_gen. We don't need any explicit barriers
* because all x86 flush operations are serializing and the
* atomic64_read operation won't be reordered by the compiler.
*/
-static void flush_tlb_func_common(const struct flush_tlb_info *f,
- bool local, enum tlb_flush_reason reason)
+static void flush_tlb_func(void *info)
{
/*
* We have three different tlb_gen values in here. They are:
@@ -665,28 +665,40 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* - f->new_tlb_gen: the generation that the requester of the flush
* wants us to catch up to.
*/
+ const struct flush_tlb_info *f = info;
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+ bool local = smp_processor_id() == f->initiating_cpu;
+ unsigned long nr_invalidate = 0;
/* This code cannot presently handle being reentered. */
VM_WARN_ON(!irqs_disabled());
+ if (!local) {
+ inc_irq_stat(irq_tlb_count);
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+
+ /* Can only happen on remote CPUs */
+ if (f->mm && f->mm != loaded_mm)
+ return;
+ }
+
if (unlikely(loaded_mm == &init_mm))
return;
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
loaded_mm->context.ctx_id);
- if (this_cpu_read(cpu_tlbstate.is_lazy)) {
+ if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) {
/*
* We're in lazy mode. We need to at least flush our
* paging-structure cache to avoid speculatively reading
* garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm.
*
- * This should be rare, with native_flush_tlb_others skipping
+ * This should be rare, with native_flush_tlb_multi() skipping
* IPIs to lazy TLB mode CPUs.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL);
@@ -700,8 +712,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* be handled can catch us all the way up, leaving no work for
* the second flush.
*/
- trace_tlb_flush(reason, 0);
- return;
+ goto done;
}
WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
@@ -736,7 +747,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* 3, we'd be break the invariant: we'd update local_tlb_gen above
* 1 without the full flush that's needed for tlb_gen 2.
*
- * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation.
+ * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization.
* Partial TLB flushes are not all that much cheaper than full TLB
* flushes, so it seems unlikely that it would be a performance win
* to do a partial flush if that won't bring our TLB fully up to
@@ -748,56 +759,54 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
f->new_tlb_gen == local_tlb_gen + 1 &&
f->new_tlb_gen == mm_tlb_gen) {
/* Partial flush */
- unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
unsigned long addr = f->start;
+ nr_invalidate = (f->end - f->start) >> f->stride_shift;
+
while (addr < f->end) {
flush_tlb_one_user(addr);
addr += 1UL << f->stride_shift;
}
if (local)
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
- trace_tlb_flush(reason, nr_invalidate);
} else {
/* Full flush. */
+ nr_invalidate = TLB_FLUSH_ALL;
+
flush_tlb_local();
if (local)
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- trace_tlb_flush(reason, TLB_FLUSH_ALL);
}
/* Both paths above update our state to mm_tlb_gen. */
this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
-}
-
-static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason)
-{
- const struct flush_tlb_info *f = info;
- flush_tlb_func_common(f, true, reason);
+ /* Tracing is done in a unified manner to reduce the code size */
+done:
+ trace_tlb_flush(!local ? TLB_REMOTE_SHOOTDOWN :
+ (f->mm == NULL) ? TLB_LOCAL_SHOOTDOWN :
+ TLB_LOCAL_MM_SHOOTDOWN,
+ nr_invalidate);
}
-static void flush_tlb_func_remote(void *info)
+static bool tlb_is_not_lazy(int cpu)
{
- const struct flush_tlb_info *f = info;
-
- inc_irq_stat(irq_tlb_count);
-
- if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
- return;
-
- count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
- flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
+ return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu);
}
-static bool tlb_is_not_lazy(int cpu, void *data)
-{
- return !per_cpu(cpu_tlbstate.is_lazy, cpu);
-}
+static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask);
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
+EXPORT_PER_CPU_SYMBOL(cpu_tlbstate_shared);
-STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
+STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
+ /*
+ * Do accounting and tracing. Note that there are (and have always been)
+ * cases in which a remote TLB flush will be traced, but ends up
+ * not happening.
+ */
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
if (info->end == TLB_FLUSH_ALL)
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
@@ -815,18 +824,42 @@ STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
* up on the new contents of what used to be page tables, while
* doing a speculative memory access.
*/
- if (info->freed_tables)
- smp_call_function_many(cpumask, flush_tlb_func_remote,
- (void *)info, 1);
- else
- on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
- (void *)info, 1, cpumask);
+ if (info->freed_tables) {
+ on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
+ } else {
+ /*
+ * Although we could have used on_each_cpu_cond_mask(),
+ * open-coding it has performance advantages, as it eliminates
+ * the need for indirect calls or retpolines. In addition, it
+ * allows to use a designated cpumask for evaluating the
+ * condition, instead of allocating one.
+ *
+ * This code works under the assumption that there are no nested
+ * TLB flushes, an assumption that is already made in
+ * flush_tlb_mm_range().
+ *
+ * cond_cpumask is logically a stack-local variable, but it is
+ * more efficient to have it off the stack and not to allocate
+ * it on demand. Preemption is disabled and this code is
+ * non-reentrant.
+ */
+ struct cpumask *cond_cpumask = this_cpu_ptr(&flush_tlb_mask);
+ int cpu;
+
+ cpumask_clear(cond_cpumask);
+
+ for_each_cpu(cpu, cpumask) {
+ if (tlb_is_not_lazy(cpu))
+ __cpumask_set_cpu(cpu, cond_cpumask);
+ }
+ on_each_cpu_mask(cond_cpumask, flush_tlb_func, (void *)info, true);
+ }
}
-void flush_tlb_others(const struct cpumask *cpumask,
+void flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
- __flush_tlb_others(cpumask, info);
+ __flush_tlb_multi(cpumask, info);
}
/*
@@ -847,7 +880,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
#endif
-static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
+static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
unsigned long start, unsigned long end,
unsigned int stride_shift, bool freed_tables,
u64 new_tlb_gen)
@@ -869,14 +902,15 @@ static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
info->stride_shift = stride_shift;
info->freed_tables = freed_tables;
info->new_tlb_gen = new_tlb_gen;
+ info->initiating_cpu = smp_processor_id();
return info;
}
-static inline void put_flush_tlb_info(void)
+static void put_flush_tlb_info(void)
{
#ifdef CONFIG_DEBUG_VM
- /* Complete reentrency prevention checks */
+ /* Complete reentrancy prevention checks */
barrier();
this_cpu_dec(flush_tlb_info_idx);
#endif
@@ -905,16 +939,20 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
new_tlb_gen);
- if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
+ /*
+ * flush_tlb_multi() is not optimized for the common case in which only
+ * a local TLB flush is needed. Optimize this use-case by calling
+ * flush_tlb_func() directly in this case.
+ */
+ if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+ flush_tlb_multi(mm_cpumask(mm), info);
+ } else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
lockdep_assert_irqs_enabled();
local_irq_disable();
- flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN);
+ flush_tlb_func(info);
local_irq_enable();
}
- if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), info);
-
put_flush_tlb_info();
put_cpu();
}
@@ -1119,34 +1157,30 @@ void __flush_tlb_all(void)
}
EXPORT_SYMBOL_GPL(__flush_tlb_all);
-/*
- * arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm.
- * This means that the 'struct flush_tlb_info' that describes which mappings to
- * flush is actually fixed. We therefore set a single fixed struct and use it in
- * arch_tlbbatch_flush().
- */
-static const struct flush_tlb_info full_flush_tlb_info = {
- .mm = NULL,
- .start = 0,
- .end = TLB_FLUSH_ALL,
-};
-
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
+ struct flush_tlb_info *info;
+
int cpu = get_cpu();
- if (cpumask_test_cpu(cpu, &batch->cpumask)) {
+ info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, 0);
+ /*
+ * flush_tlb_multi() is not optimized for the common case in which only
+ * a local TLB flush is needed. Optimize this use-case by calling
+ * flush_tlb_func() directly in this case.
+ */
+ if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+ flush_tlb_multi(&batch->cpumask, info);
+ } else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
lockdep_assert_irqs_enabled();
local_irq_disable();
- flush_tlb_func_local(&full_flush_tlb_info, TLB_LOCAL_SHOOTDOWN);
+ flush_tlb_func(info);
local_irq_enable();
}
- if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
- flush_tlb_others(&batch->cpumask, &full_flush_tlb_info);
-
cpumask_clear(&batch->cpumask);
+ put_flush_tlb_info();
put_cpu();
}
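
To round off the tlb.c changes, here is a user-space sketch of the cpumask pre-filtering that native_flush_tlb_multi() does above: rather than a conditional cross-call that evaluates a predicate per target CPU through an indirect call, the caller builds a scratch mask of non-lazy CPUs first and sends one unconditional multicast. is_lazy[], send_ipi() and NR_CPUS_DEMO are invented for the example.

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS_DEMO 8

static bool is_lazy[NR_CPUS_DEMO] = { false, true, false, true };

static void send_ipi(unsigned long mask)
{
	for (int cpu = 0; cpu < NR_CPUS_DEMO; cpu++)
		if (mask & (1UL << cpu))
			printf("flush IPI -> cpu %d\n", cpu);
}

static void flush_others(unsigned long target_mask, bool freed_tables)
{
	unsigned long cond_mask = 0;

	if (freed_tables) {
		/* Page tables were freed: even lazy CPUs must be flushed. */
		send_ipi(target_mask);
		return;
	}

	/* Pre-filter: keep only the CPUs that are not in lazy TLB mode. */
	for (int cpu = 0; cpu < NR_CPUS_DEMO; cpu++)
		if ((target_mask & (1UL << cpu)) && !is_lazy[cpu])
			cond_mask |= 1UL << cpu;

	send_ipi(cond_mask);
}

int main(void)
{
	flush_others(0x0f, false);	/* CPUs 1 and 3 are lazy -> skipped */
	flush_others(0x0f, true);	/* freed tables -> every target flushed */
	return 0;
}

In the kernel the scratch mask is the per-CPU flush_tlb_mask, which is safe to reuse because preemption is disabled and the flush path is not reentrant, as the comment in the patch notes.
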