From dc4bd2a2ddafeffcdc33bb5c67a058d00885bb99 Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Thu, 30 Sep 2021 14:34:56 +0200
Subject: xen/x86: streamline set_pte_mfn()

In preparation for restoring xen_set_pte_init()'s original behavior of
avoiding hypercalls, make set_pte_mfn() no longer use the standard
set_pte() code path. That one is more complicated than the alternative
of simply using an available hypercall directly. This way we can avoid
introducing a fair number (2k on my test system) of cases where the
hypervisor would trap-and-emulate page table updates.

Signed-off-by: Jan Beulich
Reviewed-by: Boris Ostrovsky
Link: https://lore.kernel.org/r/b39c08e8-4a53-8bca-e6e7-3684a6cab8d0@suse.com
Signed-off-by: Boris Ostrovsky
---
 arch/x86/xen/mmu_pv.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)
(limited to 'arch/x86/xen')

diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 1ce436eeda15..d2d6980c66b3 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -241,9 +241,11 @@ static void xen_set_pmd(pmd_t *ptr, pmd_t val)
  * Associate a virtual page frame with a given physical page frame
  * and protection flags for that frame.
  */
-void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
+void __init set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 {
-	set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
+	if (HYPERVISOR_update_va_mapping(vaddr, mfn_pte(mfn, flags),
+					 UVMF_INVLPG))
+		BUG();
 }
 
 static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
--
cgit v1.2.3


From cae739518314fcd75642236f6813537fe9d23546 Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Thu, 30 Sep 2021 14:35:22 +0200
Subject: xen/x86: restore (fix) xen_set_pte_init() behavior

Commit f7c90c2aa400 ("x86/xen: don't write ptes directly in 32-bit
PV guests") needlessly (and heavily) penalized 64-bit guests here: The
majority of the early page table updates are to writable pages (which
get converted to r/o only after all the writes are done), in
particular those involved in building the direct map (which consists
of all 4k mappings in PV). On my test system this accounts for almost
16 million hypercalls when each could simply have been a plain memory
write.

Switch back to using native_set_pte(), except for updates of early
ioremap tables (where a suitable accessor exists to recognize them).
With 32-bit PV support gone, this doesn't need to be further
conditionalized (albeit backports thereof may need adjustment).

To avoid a fair number (almost 256k on my test system) of
trap-and-emulate cases appearing as a result, switch the hook in
xen_pagetable_init().

Finally, commit d6b186c1e2d8 ("x86/xen: avoid m2p lookup when setting
early page table entries") inserted a function ahead of
xen_set_pte_init(), separating it from its comment (which may have
been part of the reason why the performance regression wasn't
anticipated / recognized while coding / reviewing the change mentioned
further up). Move the function up and adjust that comment to describe
the new behavior.
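For reference, the hypercall-based path (__xen_set_pte(), installed as
the set_pte hook below) boils down to one mmu_update operation per
PTE. A simplified sketch, with batching and error handling elided (an
illustration, not the literal implementation):

	/* Illustrative sketch only: updating a PTE in an
	 * already-announced page table via one explicit hypercall,
	 * instead of a memory write that Xen must trap and emulate. */
	static void sketch_hypercall_set_pte(pte_t *ptep, pte_t pteval)
	{
		struct mmu_update u;

		u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
		u.val = pte_val_ma(pteval);
		if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
			BUG();
	}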
Signed-off-by: Jan Beulich
Reviewed-by: Boris Ostrovsky
Link: https://lore.kernel.org/r/57ce1289-0297-e96e-79e1-cedafb5d9bf6@suse.com
Signed-off-by: Boris Ostrovsky
---
 arch/x86/xen/mmu_pv.c | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)
(limited to 'arch/x86/xen')

diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index d2d6980c66b3..7b366e252e03 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1194,6 +1194,13 @@ static void __init xen_pagetable_p2m_setup(void)
 
 static void __init xen_pagetable_init(void)
 {
+	/*
+	 * The majority of further PTE writes is to pagetables already
+	 * announced as such to Xen. Hence it is more efficient to use
+	 * hypercalls for these updates.
+	 */
+	pv_ops.mmu.set_pte = __xen_set_pte;
+
 	paging_init();
 	xen_post_allocator_init();
 
@@ -1423,10 +1430,18 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
  *
  * Many of these PTE updates are done on unpinned and writable pages
  * and doing a hypercall for these is unnecessary and expensive. At
- * this point it is not possible to tell if a page is pinned or not,
- * so always write the PTE directly and rely on Xen trapping and
+ * this point it is rarely possible to tell if a page is pinned, so
+ * mostly write the PTE directly and rely on Xen trapping and
  * emulating any updates as necessary.
  */
+static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
+{
+	if (unlikely(is_early_ioremap_ptep(ptep)))
+		__xen_set_pte(ptep, pte);
+	else
+		native_set_pte(ptep, pte);
+}
+
 __visible pte_t xen_make_pte_init(pteval_t pte)
 {
 	unsigned long pfn;
@@ -1448,11 +1463,6 @@ __visible pte_t xen_make_pte_init(pteval_t pte)
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init);
 
-static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
-{
-	__xen_set_pte(ptep, pte);
-}
-
 /* Early in boot, while setting up the initial pagetable, assume
    everything is pinned. */
 static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
--
cgit v1.2.3


From 4c360db6ccdb12d82c5c255c90c249970f5d1956 Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Thu, 30 Sep 2021 14:35:52 +0200
Subject: xen/x86: adjust xen_set_fixmap()

Using __native_set_fixmap() here means guaranteed trap-and-emulate
instances the hypervisor has to deal with. Since the virtual address
covered by the to-be-adjusted page table entry is easy to determine
(and actually already gets obtained in a special case), simply use an
available, easy-to-invoke hypercall instead.

Signed-off-by: Jan Beulich
Reviewed-by: Boris Ostrovsky
Link: https://lore.kernel.org/r/11fcaea2-ec17-3edd-ecdf-4cdd2d472bd0@suse.com
Signed-off-by: Boris Ostrovsky
---
 arch/x86/xen/mmu_pv.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)
(limited to 'arch/x86/xen')

diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 7b366e252e03..ad0fbc7e4872 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2011,6 +2011,7 @@ static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
 static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
 {
 	pte_t pte;
+	unsigned long vaddr;
 
 	phys >>= PAGE_SHIFT;
 
@@ -2051,15 +2052,15 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
 		break;
 	}
 
-	__native_set_fixmap(idx, pte);
+	vaddr = __fix_to_virt(idx);
+	if (HYPERVISOR_update_va_mapping(vaddr, pte, UVMF_INVLPG))
+		BUG();
 
 #ifdef CONFIG_X86_VSYSCALL_EMULATION
 	/* Replicate changes to map the vsyscall page into the user
 	   pagetable vsyscall mapping. */
-	if (idx == VSYSCALL_PAGE) {
-		unsigned long vaddr = __fix_to_virt(idx);
+	if (idx == VSYSCALL_PAGE)
 		set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
-	}
 #endif
 }
 
--
cgit v1.2.3


From d2a3ef44c2a2e49fc08270a9d33f0e4918ee2cb9 Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Thu, 30 Sep 2021 14:36:16 +0200
Subject: xen/x86: adjust handling of the L3 user vsyscall special page table

Marking the page table as pinned without ever actually pinning it was
probably an oversight in the first place. The main reason for the
change is more subtle, though: The writes of the one present entry,
each here and in the subsequently allocated L2 table, engage a code
path in the hypervisor which exists only for thought-to-be-broken
guests: an mmu-update operation to a page which is neither a page
table nor marked writable. The hypervisor merely assumes (or should I
say "hopes") that the fact that a writable reference to the page can
be obtained means it is okay to actually write to that page in
response to such a hypercall.

While there, make all involved code and data dependent upon
X86_VSYSCALL_EMULATION (some code already was).

Signed-off-by: Jan Beulich
Reviewed-by: Boris Ostrovsky
Link: https://lore.kernel.org/r/1048f5b8-b726-dcc1-1216-9d5ac328ce82@suse.com
Signed-off-by: Boris Ostrovsky
---
 arch/x86/xen/mmu_pv.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)
(limited to 'arch/x86/xen')

diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index ad0fbc7e4872..325a67910855 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -86,8 +86,10 @@
 #include "mmu.h"
 #include "debugfs.h"
 
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
 /* l3 pud for userspace vsyscall mapping */
 static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
+#endif
 
 /*
  * Protects atomic reservation decrease/increase against concurrent increases.
@@ -791,7 +793,9 @@ static void __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
 static void __init xen_after_bootmem(void)
 {
 	static_branch_enable(&xen_struct_pages_ready);
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
 	SetPagePinned(virt_to_page(level3_user_vsyscall));
+#endif
 	xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
 }
 
@@ -1762,7 +1766,6 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 	set_page_prot(init_top_pgt, PAGE_KERNEL_RO);
 	set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
 	set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
-	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
 	set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
 	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
 	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
@@ -1779,6 +1782,13 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 	/* Unpin Xen-provided one */
 	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
+	/* Pin user vsyscall L3 */
+	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
+	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
+			  PFN_DOWN(__pa_symbol(level3_user_vsyscall)));
+#endif
+
 	/*
 	 * At this stage there can be no user pgd, and no page structure to
 	 * attach it to, so make sure we just set kernel pgd.
--
cgit v1.2.3


From 344485a21ddb6a21b0911a06fafce18170381d47 Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Thu, 30 Sep 2021 14:36:56 +0200
Subject: xen/x86: there's no highmem anymore in PV mode

Considerations for it are a leftover from when 32-bit was still
supported.
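Concretely: a 64-bit PV guest maps all of RAM through the direct map,
so a highmem guard like the one removed below (quoted here purely for
illustration) can never legitimately trigger anymore:

	/* With no highmem, __va(pfn << PAGE_SHIFT) is valid for every
	 * RAM pfn, so this early return (removed below) is dead code. */
	if (pfn >= PFN_UP(__pa(high_memory - 1)))
		return;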
Signed-off-by: Jan Beulich
Reviewed-by: Boris Ostrovsky
Link: https://lore.kernel.org/r/ba6e0779-18f4-ae64-b216-73205b4eec3c@suse.com
Signed-off-by: Boris Ostrovsky
---
 arch/x86/xen/setup.c | 4 ----
 1 file changed, 4 deletions(-)
(limited to 'arch/x86/xen')

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 8bfc10330107..c1eba7e1d242 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -306,10 +306,6 @@ static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn)
 		BUG();
 	}
 
-	/* Update kernel mapping, but not for highmem. */
-	if (pfn >= PFN_UP(__pa(high_memory - 1)))
-		return;
-
 	if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
 					 mfn_pte(mfn, PAGE_KERNEL), 0)) {
 		WARN(1, "Failed to update kernel mapping for mfn=%ld pfn=%ld\n",
--
cgit v1.2.3


From 9a58b352e9e8a8c897acf6aae5cacd7d481965f9 Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Thu, 30 Sep 2021 14:37:19 +0200
Subject: xen/x86: restrict PV Dom0 identity mapping

When moving RAM pages away, the fact that they used to be mapped is
not a proper indication that MMIO should be mapped there instead. At
this point in time this effectively covers the low megabyte only.
Mapping of that is, however, the job of init_mem_mapping(). Comparing
the two, one can also spot that we've been wrongly (or at least
inconsistently) using PAGE_KERNEL_IO here.

Simply zap any such mappings instead.

Signed-off-by: Jan Beulich
Reviewed-by: Boris Ostrovsky
Link: https://lore.kernel.org/r/038b8c02-3621-d66a-63ae-982ccf67ae88@suse.com
Signed-off-by: Boris Ostrovsky
---
 arch/x86/xen/setup.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
(limited to 'arch/x86/xen')

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index c1eba7e1d242..b5a6c2031f6e 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -425,13 +425,13 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
 	}
 
 	/*
-	 * If the PFNs are currently mapped, the VA mapping also needs
-	 * to be updated to be 1:1.
+	 * If the PFNs are currently mapped, their VA mappings need to be
+	 * zapped.
 	 */
 	for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
 		(void)HYPERVISOR_update_va_mapping(
 			(unsigned long)__va(pfn << PAGE_SHIFT),
-			mfn_pte(pfn, PAGE_KERNEL_IO), 0);
+			native_make_pte(0), 0);
 
 	return remap_pfn;
 }
--
cgit v1.2.3


From dce69259aebbc43b248d3593e46bdc7d4d33d94c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 12 Oct 2021 15:33:09 +0200
Subject: x86/xen: Remove redundant irq_enter/exit() invocations

All these handlers are regular device interrupt handlers, so they
already went through the proper entry code which handles this
correctly.
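For illustration, a simplified sketch (not the actual kernel entry
code; names approximated) of why the bracketing in the handlers is
redundant:

	/* Sketch only: every device interrupt is funneled through
	 * common entry code that already brackets the registered
	 * irqaction handler with irq_enter()/irq_exit(), so doing the
	 * same inside the handler repeats that accounting. */
	static void sketch_handle_device_interrupt(int irq, void *dev_id)
	{
		irq_enter();				/* done once here */
		xen_call_function_interrupt(irq, dev_id); /* the handler */
		irq_exit();				/* softirq processing etc. */
	}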
Signed-off-by: Thomas Gleixner
Cc: Juergen Gross
Cc: Stefano Stabellini
Cc: Boris Ostrovsky
Cc: x86@kernel.org
Cc: xen-devel@lists.xenproject.org
Reviewed-by: Juergen Gross
Link: https://lore.kernel.org/r/877deicqqy.ffs@tglx
Signed-off-by: Boris Ostrovsky
---
 arch/x86/xen/smp.c    | 4 ----
 arch/x86/xen/smp_pv.c | 2 --
 2 files changed, 6 deletions(-)
(limited to 'arch/x86/xen')

diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index c1b2f764b29a..8f6d3dfaffa0 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -268,20 +268,16 @@ void xen_send_IPI_allbutself(int vector)
 
 static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
 {
-	irq_enter();
 	generic_smp_call_function_interrupt();
 	inc_irq_stat(irq_call_count);
-	irq_exit();
 
 	return IRQ_HANDLED;
 }
 
 static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
 {
-	irq_enter();
 	generic_smp_call_function_single_interrupt();
 	inc_irq_stat(irq_call_count);
-	irq_exit();
 
 	return IRQ_HANDLED;
 }
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 7ed56c6075b0..9e55bcbfcd33 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -458,10 +458,8 @@ static void xen_pv_stop_other_cpus(int wait)
 
 static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id)
 {
-	irq_enter();
 	irq_work_run();
 	inc_irq_stat(apic_irq_work_irqs);
-	irq_exit();
 
 	return IRQ_HANDLED;
 }
--
cgit v1.2.3


From 12ad6cfc09a5bbe59ba9bae3a103bd3c7130c7c2 Mon Sep 17 00:00:00 2001
From: Juergen Gross
Date: Thu, 28 Oct 2021 09:27:47 +0200
Subject: x86/xen: remove xen_have_vcpu_info_placement flag

The flag xen_have_vcpu_info_placement was needed to support Xen
hypervisors older than version 3.4, which didn't support the
VCPUOP_register_vcpu_info hypercall. Today the Linux kernel requires
at least Xen 4.0 to be able to run, so xen_have_vcpu_info_placement
can be dropped (in theory the flag was used to ensure a working kernel
even in case of the VCPUOP_register_vcpu_info hypercall failing for
reasons other than the hypercall not being supported, but the only
cases covered by the flag would be parameter errors, which ought not
to be made anyway).

This allows some functions to return void now, as they can never fail.

Signed-off-by: Juergen Gross
Acked-by: Peter Zijlstra (Intel)
Reviewed-by: Boris Ostrovsky
Link: https://lore.kernel.org/r/20211028072748.29862-2-jgross@suse.com
Signed-off-by: Boris Ostrovsky
---
 arch/x86/xen/enlighten.c     | 97 +++++++++++---------------------------------
 arch/x86/xen/enlighten_hvm.c |  6 +--
 arch/x86/xen/enlighten_pv.c  | 28 +++----------
 arch/x86/xen/smp.c           | 24 -----------
 arch/x86/xen/xen-ops.h       |  4 +-
 5 files changed, 33 insertions(+), 126 deletions(-)
(limited to 'arch/x86/xen')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 95d970359e17..006b4a814fac 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -84,21 +84,6 @@ EXPORT_SYMBOL(xen_start_flags);
  */
 struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
 
-/*
- * Flag to determine whether vcpu info placement is available on all
- * VCPUs. We assume it is to start with, and then set it to zero on
- * the first failure. This is because it can succeed on some VCPUs
- * and not others, since it can involve hypervisor memory allocation,
- * or because the guest failed to guarantee all the appropriate
- * constraints on all VCPUs (ie buffer can't cross a page boundary).
- *
- * Note that any particular CPU may be using a placed vcpu structure,
- * but we can only optimise if the all are.
- *
- * 0: not available, 1: available
- */
-int xen_have_vcpu_info_placement = 1;
-
 static int xen_cpu_up_online(unsigned int cpu)
 {
 	xen_init_lock_cpu(cpu);
@@ -124,10 +109,8 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int),
 	return rc >= 0 ? 0 : rc;
 }
 
-static int xen_vcpu_setup_restore(int cpu)
+static void xen_vcpu_setup_restore(int cpu)
 {
-	int rc = 0;
-
 	/* Any per_cpu(xen_vcpu) is stale, so reset it */
 	xen_vcpu_info_reset(cpu);
 
@@ -136,11 +119,8 @@ static int xen_vcpu_setup_restore(int cpu)
 	 * be handled by hotplug.
 	 */
 	if (xen_pv_domain() ||
-	    (xen_hvm_domain() && cpu_online(cpu))) {
-		rc = xen_vcpu_setup(cpu);
-	}
-
-	return rc;
+	    (xen_hvm_domain() && cpu_online(cpu)))
+		xen_vcpu_setup(cpu);
 }
 
 /*
@@ -150,7 +130,7 @@ static int xen_vcpu_setup_restore(int cpu)
  */
 void xen_vcpu_restore(void)
 {
-	int cpu, rc;
+	int cpu;
 
 	for_each_possible_cpu(cpu) {
 		bool other_cpu = (cpu != smp_processor_id());
@@ -170,20 +150,9 @@ void xen_vcpu_restore(void)
 		if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock))
 			xen_setup_runstate_info(cpu);
 
-		rc = xen_vcpu_setup_restore(cpu);
-		if (rc)
-			pr_emerg_once("vcpu restore failed for cpu=%d err=%d. "
-				      "System will hang.\n", cpu, rc);
-		/*
-		 * In case xen_vcpu_setup_restore() fails, do not bring up the
-		 * VCPU. This helps us avoid the resulting OOPS when the VCPU
-		 * accesses pvclock_vcpu_time via xen_vcpu (which is NULL.)
-		 * Note that this does not improve the situation much -- now the
-		 * VM hangs instead of OOPSing -- with the VCPUs that did not
-		 * fail, spinning in stop_machine(), waiting for the failed
-		 * VCPUs to come up.
-		 */
-		if (other_cpu && is_up && (rc == 0) &&
+		xen_vcpu_setup_restore(cpu);
+
+		if (other_cpu && is_up &&
 		    HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL))
 			BUG();
 	}
@@ -200,7 +169,7 @@ void xen_vcpu_info_reset(int cpu)
 	}
 }
 
-int xen_vcpu_setup(int cpu)
+void xen_vcpu_setup(int cpu)
 {
 	struct vcpu_register_vcpu_info info;
 	int err;
@@ -221,44 +190,26 @@ int xen_vcpu_setup(int cpu)
 	 */
 	if (xen_hvm_domain()) {
 		if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu))
-			return 0;
+			return;
 	}
 
-	if (xen_have_vcpu_info_placement) {
-		vcpup = &per_cpu(xen_vcpu_info, cpu);
-		info.mfn = arbitrary_virt_to_mfn(vcpup);
-		info.offset = offset_in_page(vcpup);
+	vcpup = &per_cpu(xen_vcpu_info, cpu);
+	info.mfn = arbitrary_virt_to_mfn(vcpup);
+	info.offset = offset_in_page(vcpup);
 
-		/*
-		 * Check to see if the hypervisor will put the vcpu_info
-		 * structure where we want it, which allows direct access via
-		 * a percpu-variable.
-		 * N.B. This hypercall can _only_ be called once per CPU.
-		 * Subsequent calls will error out with -EINVAL. This is due to
-		 * the fact that hypervisor has no unregister variant and this
-		 * hypercall does not allow to over-write info.mfn and
-		 * info.offset.
-		 */
-		err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info,
-					 xen_vcpu_nr(cpu), &info);
-
-		if (err) {
-			pr_warn_once("register_vcpu_info failed: cpu=%d err=%d\n",
-				     cpu, err);
-			xen_have_vcpu_info_placement = 0;
-		} else {
-			/*
-			 * This cpu is using the registered vcpu info, even if
-			 * later ones fail to.
-			 */
-			per_cpu(xen_vcpu, cpu) = vcpup;
-		}
-	}
-
-	if (!xen_have_vcpu_info_placement)
-		xen_vcpu_info_reset(cpu);
+	/*
+	 * N.B. This hypercall can _only_ be called once per CPU.
+	 * Subsequent calls will error out with -EINVAL. This is due to
+	 * the fact that hypervisor has no unregister variant and this
+	 * hypercall does not allow to over-write info.mfn and
+	 * info.offset.
+	 */
+	err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, xen_vcpu_nr(cpu),
+				 &info);
+	if (err)
+		panic("register_vcpu_info failed: cpu=%d err=%d\n", cpu, err);
 
-	return ((per_cpu(xen_vcpu, cpu) == NULL) ? -ENODEV : 0);
+	per_cpu(xen_vcpu, cpu) = vcpup;
 }
 
 void __init xen_banner(void)
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index e68ea5f4ad1c..42300941ec29 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -163,9 +163,9 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu)
 		per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu);
 	else
 		per_cpu(xen_vcpu_id, cpu) = cpu;
-	rc = xen_vcpu_setup(cpu);
-	if (rc || !xen_have_vector_callback)
-		return rc;
+	xen_vcpu_setup(cpu);
+	if (!xen_have_vector_callback)
+		return 0;
 
 	if (xen_feature(XENFEAT_hvm_safe_pvclock))
 		xen_setup_timer(cpu);
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 4f63117f09bb..312349673215 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -993,31 +993,13 @@ void __init xen_setup_vcpu_info_placement(void)
 	for_each_possible_cpu(cpu) {
 		/* Set up direct vCPU id mapping for PV guests. */
 		per_cpu(xen_vcpu_id, cpu) = cpu;
-
-		/*
-		 * xen_vcpu_setup(cpu) can fail -- in which case it
-		 * falls back to the shared_info version for cpus
-		 * where xen_vcpu_nr(cpu) < MAX_VIRT_CPUS.
-		 *
-		 * xen_cpu_up_prepare_pv() handles the rest by failing
-		 * them in hotplug.
-		 */
-		(void) xen_vcpu_setup(cpu);
+		xen_vcpu_setup(cpu);
 	}
 
-	/*
-	 * xen_vcpu_setup managed to place the vcpu_info within the
-	 * percpu area for all cpus, so make use of it.
-	 */
-	if (xen_have_vcpu_info_placement) {
-		pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
-		pv_ops.irq.irq_disable =
-			__PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
-		pv_ops.irq.irq_enable =
-			__PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
-		pv_ops.mmu.read_cr2 =
-			__PV_IS_CALLEE_SAVE(xen_read_cr2_direct);
-	}
+	pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
+	pv_ops.irq.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
+	pv_ops.irq.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
+	pv_ops.mmu.read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2_direct);
 }
 
 static const struct pv_info xen_info __initconst = {
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 8f6d3dfaffa0..c3e1f9a7d43a 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -121,34 +121,10 @@ int xen_smp_intr_init(unsigned int cpu)
 
 void __init xen_smp_cpus_done(unsigned int max_cpus)
 {
-	int cpu, rc, count = 0;
-
 	if (xen_hvm_domain())
 		native_smp_cpus_done(max_cpus);
 	else
 		calculate_max_logical_packages();
-
-	if (xen_have_vcpu_info_placement)
-		return;
-
-	for_each_online_cpu(cpu) {
-		if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS)
-			continue;
-
-		rc = remove_cpu(cpu);
-
-		if (rc == 0) {
-			/*
-			 * Reset vcpu_info so this cpu cannot be onlined again.
-			 */
-			xen_vcpu_info_reset(cpu);
-			count++;
-		} else {
-			pr_warn("%s: failed to bring CPU %d down, error %d\n",
-				__func__, cpu, rc);
-		}
-	}
-	WARN(count, "%s: brought %d CPUs offline\n", __func__, count);
 }
 
 void xen_smp_send_reschedule(int cpu)
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 8bc8b72a205d..fd0fec6e92f4 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -76,9 +76,7 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
 
 bool xen_vcpu_stolen(int vcpu);
 
-extern int xen_have_vcpu_info_placement;
-
-int xen_vcpu_setup(int cpu);
+void xen_vcpu_setup(int cpu);
 void xen_vcpu_info_reset(int cpu);
 void xen_setup_vcpu_info_placement(void);
 
--
cgit v1.2.3


From e453f872b72fb93e54ab0f3a07eeeacc2215fd15 Mon Sep 17 00:00:00 2001
From: Juergen Gross
Date: Thu, 28 Oct 2021 09:27:48 +0200
Subject: x86/xen: switch initial pvops IRQ functions to dummy ones

The initial pvops functions handling irq flags will only ever be
called before interrupts are first enabled. So switch them to be dummy
functions:

- xen_save_fl() can always return 0
- xen_irq_disable() is a nop
- xen_irq_enable() can BUG()

Add some generic paravirt functions for that purpose.

Signed-off-by: Juergen Gross
Acked-by: Peter Zijlstra (Intel)
Reviewed-by: Boris Ostrovsky
Link: https://lore.kernel.org/r/20211028072748.29862-3-jgross@suse.com
Signed-off-by: Boris Ostrovsky
---
 arch/x86/include/asm/paravirt_types.h |  2 ++
 arch/x86/kernel/paravirt.c            | 13 +++++++-
 arch/x86/xen/enlighten.c              | 19 ++---------
 arch/x86/xen/irq.c                    | 62 +++--------------------------------
 4 files changed, 20 insertions(+), 76 deletions(-)
(limited to 'arch/x86/xen')

diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index d9d6b0203ec4..fc1151e77569 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -577,7 +577,9 @@ void paravirt_leave_lazy_mmu(void);
 void paravirt_flush_lazy_mmu(void);
 
 void _paravirt_nop(void);
+void paravirt_BUG(void);
 u64 _paravirt_ident_64(u64);
+unsigned long paravirt_ret0(void);
 
 #define paravirt_nop	((void *)_paravirt_nop)
 
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index ebc45360ffd4..7157c2df3bc2 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -46,6 +46,17 @@ asm (".pushsection .entry.text, \"ax\"\n"
      ".type _paravirt_nop, @function\n\t"
      ".popsection");
 
+/* stub always returning 0. */
+asm (".pushsection .entry.text, \"ax\"\n"
+     ".global paravirt_ret0\n"
+     "paravirt_ret0:\n\t"
+     "xor %" _ASM_AX ", %" _ASM_AX ";\n\t"
+     "ret\n\t"
+     ".size paravirt_ret0, . - paravirt_ret0\n\t"
+     ".type paravirt_ret0, @function\n\t"
+     ".popsection");
+
+
 void __init default_banner(void)
 {
 	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
@@ -53,7 +64,7 @@ void __init default_banner(void)
 }
 
 /* Undefined instruction for dealing with missing ops pointers. */
-static void paravirt_BUG(void)
+noinstr void paravirt_BUG(void)
 {
 	BUG();
 }
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 006b4a814fac..30c6e986a6cd 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -31,25 +31,10 @@ EXPORT_SYMBOL_GPL(hypercall_page);
  * Pointer to the xen_vcpu_info structure or
  * &HYPERVISOR_shared_info->vcpu_info[cpu]. See xen_hvm_init_shared_info
  * and xen_vcpu_setup for details. By default it points to share_info->vcpu_info
- * but if the hypervisor supports VCPUOP_register_vcpu_info then it can point
- * to xen_vcpu_info. The pointer is used in __xen_evtchn_do_upcall to
- * acknowledge pending events.
- * Also more subtly it is used by the patched version of irq enable/disable
- * e.g. xen_irq_enable_direct and xen_iret in PV mode.
- *
- * The desire to be able to do those mask/unmask operations as a single
- * instruction by using the per-cpu offset held in %gs is the real reason
- * vcpu info is in a per-cpu pointer and the original reason for this
- * hypercall.
- *
+ * but during boot it is switched to point to xen_vcpu_info.
+ * The pointer is used in __xen_evtchn_do_upcall to acknowledge pending events.
  */
 DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
-
-/*
- * Per CPU pages used if hypervisor supports VCPUOP_register_vcpu_info
- * hypercall. This can be used both in PV and PVHVM mode. The structure
- * overrides the default per_cpu(xen_vcpu, cpu) value.
- */
 DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
 
 /* Linux <-> Xen vCPU id mapping */
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 4fe387e520af..06c3c2fb4b06 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -24,60 +24,6 @@ noinstr void xen_force_evtchn_callback(void)
 	(void)HYPERVISOR_xen_version(0, NULL);
 }
 
-asmlinkage __visible noinstr unsigned long xen_save_fl(void)
-{
-	struct vcpu_info *vcpu;
-	unsigned long flags;
-
-	vcpu = this_cpu_read(xen_vcpu);
-
-	/* flag has opposite sense of mask */
-	flags = !vcpu->evtchn_upcall_mask;
-
-	/* convert to IF type flag
-	   -0 -> 0x00000000
-	   -1 -> 0xffffffff
-	*/
-	return (-flags) & X86_EFLAGS_IF;
-}
-__PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl, ".noinstr.text");
-
-asmlinkage __visible noinstr void xen_irq_disable(void)
-{
-	/* There's a one instruction preempt window here. We need to
-	   make sure we're don't switch CPUs between getting the vcpu
-	   pointer and updating the mask. */
-	preempt_disable();
-	this_cpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
-	preempt_enable_no_resched();
-}
-__PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable, ".noinstr.text");
-
-asmlinkage __visible noinstr void xen_irq_enable(void)
-{
-	struct vcpu_info *vcpu;
-
-	/*
-	 * We may be preempted as soon as vcpu->evtchn_upcall_mask is
-	 * cleared, so disable preemption to ensure we check for
-	 * events on the VCPU we are still running on.
-	 */
-	preempt_disable();
-
-	vcpu = this_cpu_read(xen_vcpu);
-	vcpu->evtchn_upcall_mask = 0;
-
-	/* Doesn't matter if we get preempted here, because any
-	   pending event will get dealt with anyway. */
-
-	barrier(); /* unmask then check (avoid races) */
-	if (unlikely(vcpu->evtchn_upcall_pending))
-		xen_force_evtchn_callback();
-
-	preempt_enable();
-}
-__PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable, ".noinstr.text");
-
 static void xen_safe_halt(void)
 {
 	/* Blocking includes an implicit local_irq_enable(). */
@@ -96,10 +42,10 @@ static void xen_halt(void)
 
 static const typeof(pv_ops) xen_irq_ops __initconst = {
 	.irq = {
-
-		.save_fl = PV_CALLEE_SAVE(xen_save_fl),
-		.irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
-		.irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
+		/* Initial interrupt flag handling only called while interrupts off. */
+		.save_fl = __PV_IS_CALLEE_SAVE(paravirt_ret0),
+		.irq_disable = __PV_IS_CALLEE_SAVE(paravirt_nop),
+		.irq_enable = __PV_IS_CALLEE_SAVE(paravirt_BUG),
 
 		.safe_halt = xen_safe_halt,
 		.halt = xen_halt,
--
cgit v1.2.3


From 3ac876e8b5fc6561c15031b5a56ff1f2e4fa0e5f Mon Sep 17 00:00:00 2001
From: Juergen Gross
Date: Thu, 28 Oct 2021 10:12:20 +0200
Subject: xen: remove highmem remnants

There are some references to highmem left in Xen pv specific code
which can be removed.
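One example is the masking removed below: on 64-bit, CONFIG_HIGHPTE
(and with it __GFP_HIGHMEM in __userpte_alloc_gfp) cannot be set, so
clearing the flag was already a no-op:

	/* Removed below: __userpte_alloc_gfp only ever contains
	 * __GFP_HIGHMEM when CONFIG_HIGHPTE is enabled, which is a
	 * 32-bit-only option, so this masking had no effect on 64-bit. */
	__userpte_alloc_gfp &= ~__GFP_HIGHMEM;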
Signed-off-by: Juergen Gross
Link: https://lore.kernel.org/r/20211028081221.2475-4-jgross@suse.com
Reviewed-by: Boris Ostrovsky
Signed-off-by: Boris Ostrovsky
---
 arch/x86/xen/enlighten_pv.c | 7 -------
 arch/x86/xen/mmu_pv.c       | 1 -
 2 files changed, 8 deletions(-)
(limited to 'arch/x86/xen')

diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 312349673215..5004feb16783 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -27,7 +27,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -1229,12 +1228,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
 	__supported_pte_mask &= ~_PAGE_GLOBAL;
 	__default_kernel_pte_mask &= ~_PAGE_GLOBAL;
 
-	/*
-	 * Prevent page tables from being allocated in highmem, even
-	 * if CONFIG_HIGHPTE is enabled.
-	 */
-	__userpte_alloc_gfp &= ~__GFP_HIGHMEM;
-
 	/* Get mfn list */
 	xen_build_dynamic_phys_to_machine();
 
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 325a67910855..dee8da28bf9b 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -41,7 +41,6 @@
  * Jeremy Fitzhardinge , XenSource Inc, 2007
  */
 #include 
-#include 
 #include 
 #include 
 #include 
--
cgit v1.2.3


From eae446b7654f25b2ca42ed0de54d7005e126fdac Mon Sep 17 00:00:00 2001
From: Juergen Gross
Date: Thu, 28 Oct 2021 10:12:21 +0200
Subject: x86/xen: remove 32-bit awareness from startup_xen

startup_xen is still 32-bit aware, even though this is no longer
needed. Replace the register macros with the 64-bit register names to
make the code more readable.

Signed-off-by: Juergen Gross
Link: https://lore.kernel.org/r/20211028081221.2475-5-jgross@suse.com
Reviewed-by: Boris Ostrovsky
Signed-off-by: Boris Ostrovsky
---
 arch/x86/xen/xen-head.S | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)
(limited to 'arch/x86/xen')

diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 9e27b86a0c31..6a64496edefb 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -45,13 +45,13 @@ SYM_CODE_START(startup_xen)
 
 	/* Clear .bss */
 	xor %eax,%eax
-	mov $__bss_start, %_ASM_DI
-	mov $__bss_stop, %_ASM_CX
-	sub %_ASM_DI, %_ASM_CX
-	shr $__ASM_SEL(2, 3), %_ASM_CX
-	rep __ASM_SIZE(stos)
+	mov $__bss_start, %rdi
+	mov $__bss_stop, %rcx
+	sub %rdi, %rcx
+	shr $3, %rcx
+	rep stosq
 
-	mov %_ASM_SI, xen_start_info
+	mov %rsi, xen_start_info
 	mov initial_stack(%rip), %rsp
 
 	/* Set up %gs.
--
cgit v1.2.3
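For readers less familiar with the x86 string instructions, the
.bss-clearing sequence above is equivalent to this C sketch
(illustrative only; __bss_start/__bss_stop are the linker-provided
section bounds):

	/* Equivalent of the "rep stosq" loop above: %rcx holds the
	 * byte count divided by 8, %rax the zero fill pattern. */
	extern char __bss_start[], __bss_stop[];

	static void sketch_clear_bss(void)
	{
		memset(__bss_start, 0, __bss_stop - __bss_start);
	}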