diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-05-04 11:37:09 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-05-04 11:37:09 -0700 |
commit | a96480723c287c502b02659f4b347aecaa651ea1 (patch) | |
tree | 7abee4eb6d0d0b43103dc4f48554736d2b642bc2 | |
parent | a1be8edda4fe1f0a75007f26000a51436800869d (diff) | |
parent | 3dbd8204af48d0da442f11ad39aa778a5fd462bf (diff) | |
download | linux-a96480723c287c502b02659f4b347aecaa651ea1.tar.bz2 |
Merge tag 'for-linus-4.12b-rc0b-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip
Pull xen updates from Juergen Gross:
"Xen fixes and featrues for 4.12. The main changes are:
- enable building the kernel with Xen support but without enabling
paravirtualized mode (Vitaly Kuznetsov)
- add a new 9pfs xen frontend driver (Stefano Stabellini)
- simplify Xen's cpuid handling by making use of cpu capabilities
(Juergen Gross)
- add/modify some headers for new Xen paravirtualized devices
(Oleksandr Andrushchenko)
- EFI reset_system support under Xen (Julien Grall)
- and the usual cleanups and corrections"
* tag 'for-linus-4.12b-rc0b-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip: (57 commits)
xen: Move xen_have_vector_callback definition to enlighten.c
xen: Implement EFI reset_system callback
arm/xen: Consolidate calls to shutdown hypercall in a single helper
xen: Export xen_reboot
xen/x86: Call xen_smp_intr_init_pv() on BSP
xen: Revert commits da72ff5bfcb0 and 72a9b186292d
xen/pvh: Do not fill kernel's e820 map in init_pvh_bootparams()
xen/scsifront: use offset_in_page() macro
xen/arm,arm64: rename __generic_dma_ops to xen_get_dma_ops
xen/arm,arm64: fix xen_dma_ops after 815dd18 "Consolidate get_dma_ops..."
xen/9pfs: select CONFIG_XEN_XENBUS_FRONTEND
x86/cpu: remove hypervisor specific set_cpu_features
vmware: set cpu capabilities during platform initialization
x86/xen: use capabilities instead of fake cpuid values for xsave
x86/xen: use capabilities instead of fake cpuid values for x2apic
x86/xen: use capabilities instead of fake cpuid values for mwait
x86/xen: use capabilities instead of fake cpuid values for acpi
x86/xen: use capabilities instead of fake cpuid values for acc
x86/xen: use capabilities instead of fake cpuid values for mtrr
x86/xen: use capabilities instead of fake cpuid values for aperf
...
53 files changed, 8410 insertions, 5301 deletions
diff --git a/arch/arm/include/asm/device.h b/arch/arm/include/asm/device.h index 220ba207be91..36ec9c8f6e16 100644 --- a/arch/arm/include/asm/device.h +++ b/arch/arm/include/asm/device.h @@ -16,6 +16,9 @@ struct dev_archdata { #ifdef CONFIG_ARM_DMA_USE_IOMMU struct dma_iommu_mapping *mapping; #endif +#ifdef CONFIG_XEN + const struct dma_map_ops *dev_dma_ops; +#endif bool dma_coherent; }; diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h index 716656925975..680d3f3889e7 100644 --- a/arch/arm/include/asm/dma-mapping.h +++ b/arch/arm/include/asm/dma-mapping.h @@ -16,19 +16,9 @@ extern const struct dma_map_ops arm_dma_ops; extern const struct dma_map_ops arm_coherent_dma_ops; -static inline const struct dma_map_ops *__generic_dma_ops(struct device *dev) -{ - if (dev && dev->dma_ops) - return dev->dma_ops; - return &arm_dma_ops; -} - static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) { - if (xen_initial_domain()) - return xen_dma_ops; - else - return __generic_dma_ops(NULL); + return &arm_dma_ops; } #define HAVE_ARCH_DMA_SUPPORTED 1 diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 475811f5383a..0268584f1fa0 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -2414,6 +2414,13 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, dma_ops = arm_get_dma_map_ops(coherent); set_dma_ops(dev, dma_ops); + +#ifdef CONFIG_XEN + if (xen_initial_domain()) { + dev->archdata.dev_dma_ops = dev->dma_ops; + dev->dma_ops = xen_dma_ops; + } +#endif } void arch_teardown_dma_ops(struct device *dev) diff --git a/arch/arm/xen/efi.c b/arch/arm/xen/efi.c index 16db419f9e90..b4d78959cadf 100644 --- a/arch/arm/xen/efi.c +++ b/arch/arm/xen/efi.c @@ -35,6 +35,6 @@ void __init xen_efi_runtime_setup(void) efi.update_capsule = xen_efi_update_capsule; efi.query_capsule_caps = xen_efi_query_capsule_caps; efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count; - efi.reset_system = NULL; /* Functionality provided by Xen. */ + efi.reset_system = xen_efi_reset_system; } EXPORT_SYMBOL_GPL(xen_efi_runtime_setup); diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index 81e3217b12d3..ba7f4c8f5c3e 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -191,20 +191,24 @@ static int xen_dying_cpu(unsigned int cpu) return 0; } -static void xen_restart(enum reboot_mode reboot_mode, const char *cmd) +void xen_reboot(int reason) { - struct sched_shutdown r = { .reason = SHUTDOWN_reboot }; + struct sched_shutdown r = { .reason = reason }; int rc; + rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r); BUG_ON(rc); } +static void xen_restart(enum reboot_mode reboot_mode, const char *cmd) +{ + xen_reboot(SHUTDOWN_reboot); +} + + static void xen_power_off(void) { - struct sched_shutdown r = { .reason = SHUTDOWN_poweroff }; - int rc; - rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r); - BUG_ON(rc); + xen_reboot(SHUTDOWN_poweroff); } static irqreturn_t xen_arm_callback(int irq, void *arg) diff --git a/arch/arm64/include/asm/device.h b/arch/arm64/include/asm/device.h index 73d5bab015eb..5a5fa47a6b18 100644 --- a/arch/arm64/include/asm/device.h +++ b/arch/arm64/include/asm/device.h @@ -20,6 +20,9 @@ struct dev_archdata { #ifdef CONFIG_IOMMU_API void *iommu; /* private IOMMU data */ #endif +#ifdef CONFIG_XEN + const struct dma_map_ops *dev_dma_ops; +#endif bool dma_coherent; }; diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h index 505756cdc67a..5392dbeffa45 100644 --- a/arch/arm64/include/asm/dma-mapping.h +++ b/arch/arm64/include/asm/dma-mapping.h @@ -27,11 +27,8 @@ #define DMA_ERROR_CODE (~(dma_addr_t)0) extern const struct dma_map_ops dummy_dma_ops; -static inline const struct dma_map_ops *__generic_dma_ops(struct device *dev) +static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) { - if (dev && dev->dma_ops) - return dev->dma_ops; - /* * We expect no ISA devices, and all other DMA masters are expected to * have someone call arch_setup_dma_ops at device creation time. @@ -39,14 +36,6 @@ static inline const struct dma_map_ops *__generic_dma_ops(struct device *dev) return &dummy_dma_ops; } -static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) -{ - if (xen_initial_domain()) - return xen_dma_ops; - else - return __generic_dma_ops(NULL); -} - void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, const struct iommu_ops *iommu, bool coherent); #define arch_setup_dma_ops arch_setup_dma_ops diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 81cdb2e844ed..7f8b37e85a2b 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -977,4 +977,11 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, dev->archdata.dma_coherent = coherent; __iommu_setup_dma_ops(dev, dma_base, size, iommu); + +#ifdef CONFIG_XEN + if (xen_initial_domain()) { + dev->archdata.dev_dma_ops = dev->dma_ops; + dev->dma_ops = xen_dma_ops; + } +#endif } diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 67942b6ad4b7..21126155a739 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -35,9 +35,6 @@ struct hypervisor_x86 { /* Detection routine */ uint32_t (*detect)(void); - /* Adjust CPU feature bits (run once per CPU) */ - void (*set_cpu_features)(struct cpuinfo_x86 *); - /* Platform setup (run once per boot) */ void (*init_platform)(void); @@ -53,15 +50,14 @@ extern const struct hypervisor_x86 *x86_hyper; /* Recognized hypervisors */ extern const struct hypervisor_x86 x86_hyper_vmware; extern const struct hypervisor_x86 x86_hyper_ms_hyperv; -extern const struct hypervisor_x86 x86_hyper_xen; +extern const struct hypervisor_x86 x86_hyper_xen_pv; +extern const struct hypervisor_x86 x86_hyper_xen_hvm; extern const struct hypervisor_x86 x86_hyper_kvm; -extern void init_hypervisor(struct cpuinfo_x86 *c); extern void init_hypervisor_platform(void); extern bool hypervisor_x2apic_available(void); extern void hypervisor_pin_vcpu(int cpu); #else -static inline void init_hypervisor(struct cpuinfo_x86 *c) { } static inline void init_hypervisor_platform(void) { } static inline bool hypervisor_x2apic_available(void) { return false; } #endif /* CONFIG_HYPERVISOR_GUEST */ diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h index 608a79d5a466..e6911caf5bbf 100644 --- a/arch/x86/include/asm/xen/events.h +++ b/arch/x86/include/asm/xen/events.h @@ -20,4 +20,15 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) /* No need for a barrier -- XCHG is a barrier on x86. */ #define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) +extern int xen_have_vector_callback; + +/* + * Events delivered via platform PCI interrupts are always + * routed to vcpu 0 and hence cannot be rebound. + */ +static inline bool xen_support_evtchn_rebind(void) +{ + return (!xen_hvm_domain() || xen_have_vector_callback); +} + #endif /* _ASM_X86_XEN_EVENTS_H */ diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 8a5a02b1dfba..8417ef7c3885 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -52,12 +52,30 @@ extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern unsigned long __init set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e); +#ifdef CONFIG_XEN_PV extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count); extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_unmap_grant_ref *kunmap_ops, struct page **pages, unsigned int count); +#else +static inline int +set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, + struct gnttab_map_grant_ref *kmap_ops, + struct page **pages, unsigned int count) +{ + return 0; +} + +static inline int +clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, + struct gnttab_unmap_grant_ref *kunmap_ops, + struct page **pages, unsigned int count) +{ + return 0; +} +#endif /* * Helper functions to write or read unsigned long values to/from @@ -73,6 +91,7 @@ static inline int xen_safe_read_ulong(unsigned long *addr, unsigned long *val) return __get_user(*val, (unsigned long __user *)addr); } +#ifdef CONFIG_XEN_PV /* * When to use pfn_to_mfn(), __pfn_to_mfn() or get_phys_to_machine(): * - pfn_to_mfn() returns either INVALID_P2M_ENTRY or the mfn. No indicator @@ -99,6 +118,12 @@ static inline unsigned long __pfn_to_mfn(unsigned long pfn) return mfn; } +#else +static inline unsigned long __pfn_to_mfn(unsigned long pfn) +{ + return pfn; +} +#endif static inline unsigned long pfn_to_mfn(unsigned long pfn) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8ee32119144d..c8b39870f33e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1149,7 +1149,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) detect_ht(c); #endif - init_hypervisor(c); x86_init_rdrand(c); x86_init_cache_qos(c); setup_pku(c); diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 35691a6b0d32..4fa90006ac68 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -28,8 +28,11 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = { -#ifdef CONFIG_XEN - &x86_hyper_xen, +#ifdef CONFIG_XEN_PV + &x86_hyper_xen_pv, +#endif +#ifdef CONFIG_XEN_PVHVM + &x86_hyper_xen_hvm, #endif &x86_hyper_vmware, &x86_hyper_ms_hyperv, @@ -60,12 +63,6 @@ detect_hypervisor_vendor(void) pr_info("Hypervisor detected: %s\n", x86_hyper->name); } -void init_hypervisor(struct cpuinfo_x86 *c) -{ - if (x86_hyper && x86_hyper->set_cpu_features) - x86_hyper->set_cpu_features(c); -} - void __init init_hypervisor_platform(void) { @@ -74,8 +71,6 @@ void __init init_hypervisor_platform(void) if (!x86_hyper) return; - init_hypervisor(&boot_cpu_data); - if (x86_hyper->init_platform) x86_hyper->init_platform(); } diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 22403a28caf5..40ed26852ebd 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -113,6 +113,24 @@ static void __init vmware_paravirt_ops_setup(void) #define vmware_paravirt_ops_setup() do {} while (0) #endif +/* + * VMware hypervisor takes care of exporting a reliable TSC to the guest. + * Still, due to timing difference when running on virtual cpus, the TSC can + * be marked as unstable in some cases. For example, the TSC sync check at + * bootup can fail due to a marginal offset between vcpus' TSCs (though the + * TSCs do not drift from each other). Also, the ACPI PM timer clocksource + * is not suitable as a watchdog when running on a hypervisor because the + * kernel may miss a wrap of the counter if the vcpu is descheduled for a + * long time. To skip these checks at runtime we set these capability bits, + * so that the kernel could just trust the hypervisor with providing a + * reliable virtual TSC that is suitable for timekeeping. + */ +static void __init vmware_set_capabilities(void) +{ + setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC); + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); +} + static void __init vmware_platform_setup(void) { uint32_t eax, ebx, ecx, edx; @@ -152,6 +170,8 @@ static void __init vmware_platform_setup(void) #ifdef CONFIG_X86_IO_APIC no_timer_check = 1; #endif + + vmware_set_capabilities(); } /* @@ -176,24 +196,6 @@ static uint32_t __init vmware_platform(void) return 0; } -/* - * VMware hypervisor takes care of exporting a reliable TSC to the guest. - * Still, due to timing difference when running on virtual cpus, the TSC can - * be marked as unstable in some cases. For example, the TSC sync check at - * bootup can fail due to a marginal offset between vcpus' TSCs (though the - * TSCs do not drift from each other). Also, the ACPI PM timer clocksource - * is not suitable as a watchdog when running on a hypervisor because the - * kernel may miss a wrap of the counter if the vcpu is descheduled for a - * long time. To skip these checks at runtime we set these capability bits, - * so that the kernel could just trust the hypervisor with providing a - * reliable virtual TSC that is suitable for timekeeping. - */ -static void vmware_set_cpu_features(struct cpuinfo_x86 *c) -{ - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); -} - /* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */ static bool __init vmware_legacy_x2apic_available(void) { @@ -206,7 +208,6 @@ static bool __init vmware_legacy_x2apic_available(void) const __refconst struct hypervisor_x86 x86_hyper_vmware = { .name = "VMware", .detect = vmware_platform, - .set_cpu_features = vmware_set_cpu_features, .init_platform = vmware_platform_setup, .x2apic_available = vmware_legacy_x2apic_available, }; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 825a1e47cf3e..b6840bf3940b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -446,7 +446,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); -#ifdef CONFIG_XEN +#ifdef CONFIG_XEN_PV /* * On Xen PV, IOPL bits in pt_regs->flags have no effect, and * current_pt_regs()->flags may not match the current task's diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 292ab0364a89..c4b3646bd04c 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -447,7 +447,7 @@ void __init xen_msi_init(void) int __init pci_xen_hvm_init(void) { - if (!xen_feature(XENFEAT_hvm_pirqs)) + if (!xen_have_vector_callback || !xen_feature(XENFEAT_hvm_pirqs)) return 0; #ifdef CONFIG_ACPI diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 76b6dbd627df..027987638e98 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -6,8 +6,6 @@ config XEN bool "Xen guest support" depends on PARAVIRT select PARAVIRT_CLOCK - select XEN_HAVE_PVMMU - select XEN_HAVE_VPMU depends on X86_64 || (X86_32 && X86_PAE) depends on X86_LOCAL_APIC && X86_TSC help @@ -15,18 +13,41 @@ config XEN kernel to boot in a paravirtualized environment under the Xen hypervisor. -config XEN_DOM0 +config XEN_PV + bool "Xen PV guest support" + default y + depends on XEN + select XEN_HAVE_PVMMU + select XEN_HAVE_VPMU + help + Support running as a Xen PV guest. + +config XEN_PV_SMP def_bool y - depends on XEN && PCI_XEN && SWIOTLB_XEN + depends on XEN_PV && SMP + +config XEN_DOM0 + bool "Xen PV Dom0 support" + default y + depends on XEN_PV && PCI_XEN && SWIOTLB_XEN depends on X86_IO_APIC && ACPI && PCI + help + Support running as a Xen PV Dom0 guest. config XEN_PVHVM - def_bool y + bool "Xen PVHVM guest support" + default y depends on XEN && PCI && X86_LOCAL_APIC + help + Support running as a Xen PVHVM guest. + +config XEN_PVHVM_SMP + def_bool y + depends on XEN_PVHVM && SMP config XEN_512GB bool "Limit Xen pv-domain memory to 512GB" - depends on XEN && X86_64 + depends on XEN_PV && X86_64 default y help Limit paravirtualized user domains to 512GB of RAM. diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index cb0164aee156..fffb0a16f9e3 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -7,17 +7,23 @@ endif # Make sure early boot has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) -CFLAGS_enlighten.o := $(nostackp) -CFLAGS_mmu.o := $(nostackp) +CFLAGS_enlighten_pv.o := $(nostackp) +CFLAGS_mmu_pv.o := $(nostackp) -obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ +obj-y := enlighten.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ - grant-table.o suspend.o platform-pci-unplug.o \ - p2m.o apic.o pmu.o + grant-table.o suspend.o platform-pci-unplug.o + +obj-$(CONFIG_XEN_PVHVM) += enlighten_hvm.o mmu_hvm.o suspend_hvm.o +obj-$(CONFIG_XEN_PV) += setup.o apic.o pmu.o suspend_pv.o \ + p2m.o enlighten_pv.o mmu_pv.o +obj-$(CONFIG_XEN_PVH) += enlighten_pvh.o obj-$(CONFIG_EVENT_TRACING) += trace.o obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_XEN_PV_SMP) += smp_pv.o +obj-$(CONFIG_XEN_PVHVM_SMP) += smp_hvm.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o obj-$(CONFIG_XEN_DOM0) += vga.o diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index 3be012115853..30bb2e80cfe7 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -81,7 +81,7 @@ static const struct efi efi_xen __initconst = { .update_capsule = xen_efi_update_capsule, .query_capsule_caps = xen_efi_query_capsule_caps, .get_next_high_mono_count = xen_efi_get_next_high_mono_count, - .reset_system = NULL, /* Functionality provided by Xen. */ + .reset_system = xen_efi_reset_system, .set_virtual_address_map = NULL, /* Not used under Xen. */ .flags = 0 /* Initialized later. */ }; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 30822e8e64ac..a5ffcbb20cc0 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1,95 +1,16 @@ -/* - * Core of Xen paravirt_ops implementation. - * - * This file contains the xen_paravirt_ops structure itself, and the - * implementations for: - * - privileged instructions - * - interrupt flags - * - segment operations - * - booting and setup - * - * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 - */ - #include <linux/cpu.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/preempt.h> -#include <linux/hardirq.h> -#include <linux/percpu.h> -#include <linux/delay.h> -#include <linux/start_kernel.h> -#include <linux/sched.h> -#include <linux/kprobes.h> -#include <linux/bootmem.h> -#include <linux/export.h> -#include <linux/mm.h> -#include <linux/page-flags.h> -#include <linux/highmem.h> -#include <linux/console.h> -#include <linux/pci.h> -#include <linux/gfp.h> -#include <linux/memblock.h> -#include <linux/edd.h> -#include <linux/frame.h> - #include <linux/kexec.h> -#include <xen/xen.h> -#include <xen/events.h> -#include <xen/interface/xen.h> -#include <xen/interface/version.h> -#include <xen/interface/physdev.h> -#include <xen/interface/vcpu.h> -#include <xen/interface/memory.h> -#include <xen/interface/nmi.h> -#include <xen/interface/xen-mca.h> -#include <xen/interface/hvm/start_info.h> #include <xen/features.h> #include <xen/page.h> -#include <xen/hvm.h> -#include <xen/hvc-console.h> -#include <xen/acpi.h> -#include <asm/paravirt.h> -#include <asm/apic.h> -#include <asm/page.h> -#include <asm/xen/pci.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> -#include <asm/xen/cpuid.h> -#include <asm/fixmap.h> -#include <asm/processor.h> -#include <asm/proto.h> -#include <asm/msr-index.h> -#include <asm/traps.h> -#include <asm/setup.h> -#include <asm/desc.h> -#include <asm/pgalloc.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <asm/reboot.h> -#include <asm/stackprotector.h> -#include <asm/hypervisor.h> -#include <asm/mach_traps.h> -#include <asm/mwait.h> -#include <asm/pci_x86.h> #include <asm/cpu.h> #include <asm/e820/api.h> -#ifdef CONFIG_ACPI -#include <linux/acpi.h> -#include <asm/acpi.h> -#include <acpi/pdc_intel.h> -#include <acpi/processor.h> -#include <xen/interface/platform.h> -#endif - #include "xen-ops.h" -#include "mmu.h" #include "smp.h" -#include "multicalls.h" #include "pmu.h" EXPORT_SYMBOL_GPL(hypercall_page); @@ -136,13 +57,8 @@ EXPORT_SYMBOL_GPL(xen_start_info); struct shared_info xen_dummy_shared_info; -void *xen_initial_gdt; - -RESERVE_BRK(shared_info_page_brk, PAGE_SIZE); - -static int xen_cpu_up_prepare(unsigned int cpu); -static int xen_cpu_up_online(unsigned int cpu); -static int xen_cpu_dead(unsigned int cpu); +__read_mostly int xen_have_vector_callback; +EXPORT_SYMBOL_GPL(xen_have_vector_callback); /* * Point at some empty memory to start with. We map the real shared_info @@ -163,34 +79,32 @@ struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info; * * 0: not available, 1: available */ -static int have_vcpu_info_placement = 1; +int xen_have_vcpu_info_placement = 1; -struct tls_descs { - struct desc_struct desc[3]; -}; +static int xen_cpu_up_online(unsigned int cpu) +{ + xen_init_lock_cpu(cpu); + return 0; +} -/* - * Updating the 3 TLS descriptors in the GDT on every task switch is - * surprisingly expensive so we avoid updating them if they haven't - * changed. Since Xen writes different descriptors than the one - * passed in the update_descriptor hypercall we keep shadow copies to - * compare against. - */ -static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); +int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int), + int (*cpu_dead_cb)(unsigned int)) +{ + int rc; -#ifdef CONFIG_XEN_PVH -/* - * PVH variables. - * - * xen_pvh and pvh_bootparams need to live in data segment since they - * are used after startup_{32|64}, which clear .bss, are invoked. - */ -bool xen_pvh __attribute__((section(".data"))) = 0; -struct boot_params pvh_bootparams __attribute__((section(".data"))); + rc = cpuhp_setup_state_nocalls(CPUHP_XEN_PREPARE, + "x86/xen/hvm_guest:prepare", + cpu_up_prepare_cb, cpu_dead_cb); + if (rc >= 0) { + rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "x86/xen/hvm_guest:online", + xen_cpu_up_online, NULL); + if (rc < 0) + cpuhp_remove_state_nocalls(CPUHP_XEN_PREPARE); + } -struct hvm_start_info pvh_start_info; -unsigned int pvh_start_info_sz = sizeof(pvh_start_info); -#endif + return rc >= 0 ? 0 : rc; +} static void clamp_max_cpus(void) { @@ -227,7 +141,7 @@ void xen_vcpu_setup(int cpu) per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; - if (!have_vcpu_info_placement) { + if (!xen_have_vcpu_info_placement) { if (cpu >= MAX_VIRT_CPUS) clamp_max_cpus(); return; @@ -250,7 +164,7 @@ void xen_vcpu_setup(int cpu) if (err) { printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); - have_vcpu_info_placement = 0; + xen_have_vcpu_info_placement = 0; clamp_max_cpus(); } else { /* This cpu is using the registered vcpu info, even if @@ -259,1043 +173,7 @@ void xen_vcpu_setup(int cpu) } } -/* - * On restore, set the vcpu placement up again. - * If it fails, then we're in a bad state, since - * we can't back out from using it... - */ -void xen_vcpu_restore(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - bool other_cpu = (cpu != smp_processor_id()); - bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu), - NULL); - - if (other_cpu && is_up && - HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL)) - BUG(); - - xen_setup_runstate_info(cpu); - - if (have_vcpu_info_placement) - xen_vcpu_setup(cpu); - - if (other_cpu && is_up && - HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL)) - BUG(); - } -} - -static void __init xen_banner(void) -{ - unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL); - struct xen_extraversion extra; - HYPERVISOR_xen_version(XENVER_extraversion, &extra); - - pr_info("Booting paravirtualized kernel %son %s\n", - xen_feature(XENFEAT_auto_translated_physmap) ? - "with PVH extensions " : "", pv_info.name); - printk(KERN_INFO "Xen version: %d.%d%s%s\n", - version >> 16, version & 0xffff, extra.extraversion, - xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); -} -/* Check if running on Xen version (major, minor) or later */ -bool -xen_running_on_version_or_later(unsigned int major, unsigned int minor) -{ - unsigned int version; - - if (!xen_domain()) - return false; - - version = HYPERVISOR_xen_version(XENVER_version, NULL); - if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) || - ((version >> 16) > major)) - return true; - return false; -} - -#define CPUID_THERM_POWER_LEAF 6 -#define APERFMPERF_PRESENT 0 - -static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0; -static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0; - -static __read_mostly unsigned int cpuid_leaf1_ecx_set_mask; -static __read_mostly unsigned int cpuid_leaf5_ecx_val; -static __read_mostly unsigned int cpuid_leaf5_edx_val; - -static void xen_cpuid(unsigned int *ax, unsigned int *bx, - unsigned int *cx, unsigned int *dx) -{ - unsigned maskebx = ~0; - unsigned maskecx = ~0; - unsigned maskedx = ~0; - unsigned setecx = 0; - /* - * Mask out inconvenient features, to try and disable as many - * unsupported kernel subsystems as possible. - */ - switch (*ax) { - case 1: - maskecx = cpuid_leaf1_ecx_mask; - setecx = cpuid_leaf1_ecx_set_mask; - maskedx = cpuid_leaf1_edx_mask; - break; - - case CPUID_MWAIT_LEAF: - /* Synthesize the values.. */ - *ax = 0; - *bx = 0; - *cx = cpuid_leaf5_ecx_val; - *dx = cpuid_leaf5_edx_val; - return; - - case CPUID_THERM_POWER_LEAF: - /* Disabling APERFMPERF for kernel usage */ - maskecx = ~(1 << APERFMPERF_PRESENT); - break; - - case 0xb: - /* Suppress extended topology stuff */ - maskebx = 0; - break; - } - - asm(XEN_EMULATE_PREFIX "cpuid" - : "=a" (*ax), - "=b" (*bx), - "=c" (*cx), - "=d" (*dx) - : "0" (*ax), "2" (*cx)); - - *bx &= maskebx; - *cx &= maskecx; - *cx |= setecx; - *dx &= maskedx; -} -STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */ - -static bool __init xen_check_mwait(void) -{ -#ifdef CONFIG_ACPI - struct xen_platform_op op = { - .cmd = XENPF_set_processor_pminfo, - .u.set_pminfo.id = -1, - .u.set_pminfo.type = XEN_PM_PDC, - }; - uint32_t buf[3]; - unsigned int ax, bx, cx, dx; - unsigned int mwait_mask; - - /* We need to determine whether it is OK to expose the MWAIT - * capability to the kernel to harvest deeper than C3 states from ACPI - * _CST using the processor_harvest_xen.c module. For this to work, we - * need to gather the MWAIT_LEAF values (which the cstate.c code - * checks against). The hypervisor won't expose the MWAIT flag because - * it would break backwards compatibility; so we will find out directly - * from the hardware and hypercall. - */ - if (!xen_initial_domain()) - return false; - - /* - * When running under platform earlier than Xen4.2, do not expose - * mwait, to avoid the risk of loading native acpi pad driver - */ - if (!xen_running_on_version_or_later(4, 2)) - return false; - - ax = 1; - cx = 0; - - native_cpuid(&ax, &bx, &cx, &dx); - - mwait_mask = (1 << (X86_FEATURE_EST % 32)) | - (1 << (X86_FEATURE_MWAIT % 32)); - - if ((cx & mwait_mask) != mwait_mask) - return false; - - /* We need to emulate the MWAIT_LEAF and for that we need both - * ecx and edx. The hypercall provides only partial information. - */ - - ax = CPUID_MWAIT_LEAF; - bx = 0; - cx = 0; - dx = 0; - - native_cpuid(&ax, &bx, &cx, &dx); - - /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so, - * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3. - */ - buf[0] = ACPI_PDC_REVISION_ID; - buf[1] = 1; - buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP); - - set_xen_guest_handle(op.u.set_pminfo.pdc, buf); - - if ((HYPERVISOR_platform_op(&op) == 0) && - (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) { - cpuid_leaf5_ecx_val = cx; - cpuid_leaf5_edx_val = dx; - } - return true; -#else - return false; -#endif -} -static void __init xen_init_cpuid_mask(void) -{ - unsigned int ax, bx, cx, dx; - unsigned int xsave_mask; - - cpuid_leaf1_edx_mask = - ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */ - (1 << X86_FEATURE_ACC)); /* thermal monitoring */ - - if (!xen_initial_domain()) - cpuid_leaf1_edx_mask &= - ~((1 << X86_FEATURE_ACPI)); /* disable ACPI */ - - cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32)); - - ax = 1; - cx = 0; - cpuid(1, &ax, &bx, &cx, &dx); - - xsave_mask = - (1 << (X86_FEATURE_XSAVE % 32)) | - (1 << (X86_FEATURE_OSXSAVE % 32)); - - /* Xen will set CR4.OSXSAVE if supported and not disabled by force */ - if ((cx & xsave_mask) != xsave_mask) - cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */ - if (xen_check_mwait()) - cpuid_leaf1_ecx_set_mask = (1 << (X86_FEATURE_MWAIT % 32)); -} - -static void xen_set_debugreg(int reg, unsigned long val) -{ - HYPERVISOR_set_debugreg(reg, val); -} - -static unsigned long xen_get_debugreg(int reg) -{ - return HYPERVISOR_get_debugreg(reg); -} - -static void xen_end_context_switch(struct task_struct *next) -{ - xen_mc_flush(); - paravirt_end_context_switch(next); -} - -static unsigned long xen_store_tr(void) -{ - return 0; -} - -/* - * Set the page permissions for a particular virtual address. If the - * address is a vmalloc mapping (or other non-linear mapping), then - * find the linear mapping of the page and also set its protections to - * match. - */ -static void set_aliased_prot(void *v, pgprot_t prot) -{ - int level; - pte_t *ptep; - pte_t pte; - unsigned long pfn; - struct page *page; - unsigned char dummy; - - ptep = lookup_address((unsigned long)v, &level); - BUG_ON(ptep == NULL); - - pfn = pte_pfn(*ptep); - page = pfn_to_page(pfn); - - pte = pfn_pte(pfn, prot); - - /* - * Careful: update_va_mapping() will fail if the virtual address - * we're poking isn't populated in the page tables. We don't - * need to worry about the direct map (that's always in the page - * tables), but we need to be careful about vmap space. In - * particular, the top level page table can lazily propagate - * entries between processes, so if we've switched mms since we - * vmapped the target in the first place, we might not have the - * top-level page table entry populated. - * - * We disable preemption because we want the same mm active when - * we probe the target and when we issue the hypercall. We'll - * have the same nominal mm, but if we're a kernel thread, lazy - * mm dropping could change our pgd. - * - * Out of an abundance of caution, this uses __get_user() to fault - * in the target address just in case there's some obscure case - * in which the target address isn't readable. - */ - - preempt_disable(); - - probe_kernel_read(&dummy, v, 1); - - if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0)) - BUG(); - - if (!PageHighMem(page)) { - void *av = __va(PFN_PHYS(pfn)); - - if (av != v) - if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0)) - BUG(); - } else - kmap_flush_unused(); - - preempt_enable(); -} - -static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries) -{ - const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; - int i; - - /* - * We need to mark the all aliases of the LDT pages RO. We - * don't need to call vm_flush_aliases(), though, since that's - * only responsible for flushing aliases out the TLBs, not the - * page tables, and Xen will flush the TLB for us if needed. - * - * To avoid confusing future readers: none of this is necessary - * to load the LDT. The hypervisor only checks this when the - * LDT is faulted in due to subsequent descriptor access. - */ - - for(i = 0; i < entries; i += entries_per_page) - set_aliased_prot(ldt + i, PAGE_KERNEL_RO); -} - -static void xen_free_ldt(struct desc_struct *ldt, unsigned entries) -{ - const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; - int i; - - for(i = 0; i < entries; i += entries_per_page) - set_aliased_prot(ldt + i, PAGE_KERNEL); -} - -static void xen_set_ldt(const void *addr, unsigned entries) -{ - struct mmuext_op *op; - struct multicall_space mcs = xen_mc_entry(sizeof(*op)); - - trace_xen_cpu_set_ldt(addr, entries); - - op = mcs.args; - op->cmd = MMUEXT_SET_LDT; - op->arg1.linear_addr = (unsigned long)addr; - op->arg2.nr_ents = entries; - - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_CPU); -} - -static void xen_load_gdt(const struct desc_ptr *dtr) -{ - unsigned long va = dtr->address; - unsigned int size = dtr->size + 1; - unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE); - unsigned long frames[pages]; - int f; - - /* - * A GDT can be up to 64k in size, which corresponds to 8192 - * 8-byte entries, or 16 4k pages.. - */ - - BUG_ON(size > 65536); - BUG_ON(va & ~PAGE_MASK); - - for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { - int level; - pte_t *ptep; - unsigned long pfn, mfn; - void *virt; - - /* - * The GDT is per-cpu and is in the percpu data area. - * That can be virtually mapped, so we need to do a - * page-walk to get the underlying MFN for the - * hypercall. The page can also be in the kernel's - * linear range, so we need to RO that mapping too. - */ - ptep = lookup_address(va, &level); - BUG_ON(ptep == NULL); - - pfn = pte_pfn(*ptep); - mfn = pfn_to_mfn(pfn); - virt = __va(PFN_PHYS(pfn)); - - frames[f] = mfn; - - make_lowmem_page_readonly((void *)va); - make_lowmem_page_readonly(virt); - } - - if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct))) - BUG(); -} - -/* - * load_gdt for early boot, when the gdt is only mapped once - */ -static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) -{ - unsigned long va = dtr->address; - unsigned int size = dtr->size + 1; - unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE); - unsigned long frames[pages]; - int f; - - /* - * A GDT can be up to 64k in size, which corresponds to 8192 - * 8-byte entries, or 16 4k pages.. - */ - - BUG_ON(size > 65536); - BUG_ON(va & ~PAGE_MASK); - - for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { - pte_t pte; - unsigned long pfn, mfn; - - pfn = virt_to_pfn(va); - mfn = pfn_to_mfn(pfn); - - pte = pfn_pte(pfn, PAGE_KERNEL_RO); - - if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0)) - BUG(); - - frames[f] = mfn; - } - - if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct))) - BUG(); -} - -static inline bool desc_equal(const struct desc_struct *d1, - const struct desc_struct *d2) -{ - return d1->a == d2->a && d1->b == d2->b; -} - -static void load_TLS_descriptor(struct thread_struct *t, - unsigned int cpu, unsigned int i) -{ - struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i]; - struct desc_struct *gdt; - xmaddr_t maddr; - struct multicall_space mc; - - if (desc_equal(shadow, &t->tls_array[i])) - return; - - *shadow = t->tls_array[i]; - - gdt = get_cpu_gdt_rw(cpu); - maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); - mc = __xen_mc_entry(0); - - MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); -} - -static void xen_load_tls(struct thread_struct *t, unsigned int cpu) -{ - /* - * XXX sleazy hack: If we're being called in a lazy-cpu zone - * and lazy gs handling is enabled, it means we're in a - * context switch, and %gs has just been saved. This means we - * can zero it out to prevent faults on exit from the - * hypervisor if the next process has no %gs. Either way, it - * has been saved, and the new value will get loaded properly. - * This will go away as soon as Xen has been modified to not - * save/restore %gs for normal hypercalls. - * - * On x86_64, this hack is not used for %gs, because gs points - * to KERNEL_GS_BASE (and uses it for PDA references), so we - * must not zero %gs on x86_64 - * - * For x86_64, we need to zero %fs, otherwise we may get an - * exception between the new %fs descriptor being loaded and - * %fs being effectively cleared at __switch_to(). - */ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { -#ifdef CONFIG_X86_32 - lazy_load_gs(0); -#else - loadsegment(fs, 0); -#endif - } - - xen_mc_batch(); - - load_TLS_descriptor(t, cpu, 0); - load_TLS_descriptor(t, cpu, 1); - load_TLS_descriptor(t, cpu, 2); - - xen_mc_issue(PARAVIRT_LAZY_CPU); -} - -#ifdef CONFIG_X86_64 -static void xen_load_gs_index(unsigned int idx) -{ - if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx)) - BUG(); -} -#endif - -static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, - const void *ptr) -{ - xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]); - u64 entry = *(u64 *)ptr; - - trace_xen_cpu_write_ldt_entry(dt, entrynum, entry); - - preempt_disable(); - - xen_mc_flush(); - if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry)) - BUG(); - - preempt_enable(); -} - -static int cvt_gate_to_trap(int vector, const gate_desc *val, - struct trap_info *info) -{ - unsigned long addr; - - if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) - return 0; - - info->vector = vector; - - addr = gate_offset(*val); -#ifdef CONFIG_X86_64 - /* - * Look for known traps using IST, and substitute them - * appropriately. The debugger ones are the only ones we care - * about. Xen will handle faults like double_fault, - * so we should never see them. Warn if - * there's an unexpected IST-using fault handler. - */ - if (addr == (unsigned long)debug) - addr = (unsigned long)xen_debug; - else if (addr == (unsigned long)int3) - addr = (unsigned long)xen_int3; - else if (addr == (unsigned long)stack_segment) - addr = (unsigned long)xen_stack_segment; - else if (addr == (unsigned long)double_fault) { - /* Don't need to handle these */ - return 0; -#ifdef CONFIG_X86_MCE - } else if (addr == (unsigned long)machine_check) { - /* - * when xen hypervisor inject vMCE to guest, - * use native mce handler to handle it - */ - ; -#endif - } else if (addr == (unsigned long)nmi) - /* - * Use the native version as well. - */ - ; - else { - /* Some other trap using IST? */ - if (WARN_ON(val->ist != 0)) - return 0; - } -#endif /* CONFIG_X86_64 */ - info->address = addr; - - info->cs = gate_segment(*val); - info->flags = val->dpl; - /* interrupt gates clear IF */ - if (val->type == GATE_INTERRUPT) - info->flags |= 1 << 2; - - return 1; -} - -/* Locations of each CPU's IDT */ -static DEFINE_PER_CPU(struct desc_ptr, idt_desc); - -/* Set an IDT entry. If the entry is part of the current IDT, then - also update Xen. */ -static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) -{ - unsigned long p = (unsigned long)&dt[entrynum]; - unsigned long start, end; - - trace_xen_cpu_write_idt_entry(dt, entrynum, g); - - preempt_disable(); - - start = __this_cpu_read(idt_desc.address); - end = start + __this_cpu_read(idt_desc.size) + 1; - - xen_mc_flush(); - - native_write_idt_entry(dt, entrynum, g); - - if (p >= start && (p + 8) <= end) { - struct trap_info info[2]; - - info[1].address = 0; - - if (cvt_gate_to_trap(entrynum, g, &info[0])) - if (HYPERVISOR_set_trap_table(info)) - BUG(); - } - - preempt_enable(); -} - -static void xen_convert_trap_info(const struct desc_ptr *desc, - struct trap_info *traps) -{ - unsigned in, out, count; - - count = (desc->size+1) / sizeof(gate_desc); - BUG_ON(count > 256); - - for (in = out = 0; in < count; in++) { - gate_desc *entry = (gate_desc*)(desc->address) + in; - - if (cvt_gate_to_trap(in, entry, &traps[out])) - out++; - } - traps[out].address = 0; -} - -void xen_copy_trap_info(struct trap_info *traps) -{ - const struct desc_ptr *desc = this_cpu_ptr(&idt_desc); - - xen_convert_trap_info(desc, traps); -} - -/* Load a new IDT into Xen. In principle this can be per-CPU, so we - hold a spinlock to protect the static traps[] array (static because - it avoids allocation, and saves stack space). */ -static void xen_load_idt(const struct desc_ptr *desc) -{ - static DEFINE_SPINLOCK(lock); - static struct trap_info traps[257]; - - trace_xen_cpu_load_idt(desc); - - spin_lock(&lock); - - memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc)); - - xen_convert_trap_info(desc, traps); - - xen_mc_flush(); - if (HYPERVISOR_set_trap_table(traps)) - BUG(); - - spin_unlock(&lock); -} - -/* Write a GDT descriptor entry. Ignore LDT descriptors, since - they're handled differently. */ -static void xen_write_gdt_entry(struct desc_struct *dt, int entry, - const void *desc, int type) -{ - trace_xen_cpu_write_gdt_entry(dt, entry, desc, type); - - preempt_disable(); - - switch (type) { - case DESC_LDT: - case DESC_TSS: - /* ignore */ - break; - - default: { - xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]); - - xen_mc_flush(); - if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) - BUG(); - } - - } - - preempt_enable(); -} - -/* - * Version of write_gdt_entry for use at early boot-time needed to - * update an entry as simply as possible. - */ -static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, - const void *desc, int type) -{ - trace_xen_cpu_write_gdt_entry(dt, entry, desc, type); - - switch (type) { - case DESC_LDT: - case DESC_TSS: - /* ignore */ - break; - - default: { - xmaddr_t maddr = virt_to_machine(&dt[entry]); - - if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) - dt[entry] = *(struct desc_struct *)desc; - } - - } -} - -static void xen_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) -{ - struct multicall_space mcs; - - mcs = xen_mc_entry(0); - MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); - xen_mc_issue(PARAVIRT_LAZY_CPU); - tss->x86_tss.sp0 = thread->sp0; -} - -void xen_set_iopl_mask(unsigned mask) -{ - struct physdev_set_iopl set_iopl; - - /* Force the change at ring 0. */ - set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3; - HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); -} - -static void xen_io_delay(void) -{ -} - -static DEFINE_PER_CPU(unsigned long, xen_cr0_value); - -static unsigned long xen_read_cr0(void) -{ - unsigned long cr0 = this_cpu_read(xen_cr0_value); - - if (unlikely(cr0 == 0)) { - cr0 = native_read_cr0(); - this_cpu_write(xen_cr0_value, cr0); - } - - return cr0; -} - -static void xen_write_cr0(unsigned long cr0) -{ - struct multicall_space mcs; - - this_cpu_write(xen_cr0_value, cr0); - - /* Only pay attention to cr0.TS; everything else is - ignored. */ - mcs = xen_mc_entry(0); - - MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0); - - xen_mc_issue(PARAVIRT_LAZY_CPU); -} - -static void xen_write_cr4(unsigned long cr4) -{ - cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE); - - native_write_cr4(cr4); -} -#ifdef CONFIG_X86_64 -static inline unsigned long xen_read_cr8(void) -{ - return 0; -} -static inline void xen_write_cr8(unsigned long val) -{ - BUG_ON(val); -} -#endif - -static u64 xen_read_msr_safe(unsigned int msr, int *err) -{ - u64 val; - - if (pmu_msr_read(msr, &val, err)) - return val; - - val = native_read_msr_safe(msr, err); - switch (msr) { - case MSR_IA32_APICBASE: -#ifdef CONFIG_X86_X2APIC - if (!(cpuid_ecx(1) & (1 << (X86_FEATURE_X2APIC & 31)))) -#endif - val &= ~X2APIC_ENABLE; - break; - } - return val; -} - -static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) -{ - int ret; - - ret = 0; - - switch (msr) { -#ifdef CONFIG_X86_64 - unsigned which; - u64 base; - - case MSR_FS_BASE: which = SEGBASE_FS; goto set; - case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set; - case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set; - - set: - base = ((u64)high << 32) | low; - if (HYPERVISOR_set_segment_base(which, base) != 0) - ret = -EIO; - break; -#endif - - case MSR_STAR: - case MSR_CSTAR: - case MSR_LSTAR: - case MSR_SYSCALL_MASK: - case MSR_IA32_SYSENTER_CS: - case MSR_IA32_SYSENTER_ESP: - case MSR_IA32_SYSENTER_EIP: - /* Fast syscall setup is all done in hypercalls, so - these are all ignored. Stub them out here to stop - Xen console noise. */ - break; - - default: - if (!pmu_msr_write(msr, low, high, &ret)) - ret = native_write_msr_safe(msr, low, high); - } - - return ret; -} - -static u64 xen_read_msr(unsigned int msr) -{ - /* - * This will silently swallow a #GP from RDMSR. It may be worth - * changing that. - */ - int err; - - return xen_read_msr_safe(msr, &err); -} - -static void xen_write_msr(unsigned int msr, unsigned low, unsigned high) -{ - /* - * This will silently swallow a #GP from WRMSR. It may be worth - * changing that. - */ - xen_write_msr_safe(msr, low, high); -} - -void xen_setup_shared_info(void) -{ - if (!xen_feature(XENFEAT_auto_translated_physmap)) { - set_fixmap(FIX_PARAVIRT_BOOTMAP, - xen_start_info->shared_info); - - HYPERVISOR_shared_info = - (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); - } else - HYPERVISOR_shared_info = - (struct shared_info *)__va(xen_start_info->shared_info); - -#ifndef CONFIG_SMP - /* In UP this is as good a place as any to set up shared info */ - xen_setup_vcpu_info_placement(); -#endif - - xen_setup_mfn_list_list(); -} - -/* This is called once we have the cpu_possible_mask */ -void xen_setup_vcpu_info_placement(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - /* Set up direct vCPU id mapping for PV guests. */ - per_cpu(xen_vcpu_id, cpu) = cpu; - xen_vcpu_setup(cpu); - } - - /* - * xen_vcpu_setup managed to place the vcpu_info within the - * percpu area for all cpus, so make use of it. - */ - if (have_vcpu_info_placement) { - pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); - pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); - pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); - pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct); - pv_mmu_ops.read_cr2 = xen_read_cr2_direct; - } -} - -static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, - unsigned long addr, unsigned len) -{ - char *start, *end, *reloc; - unsigned ret; - - start = end = reloc = NULL; - -#define SITE(op, x) \ - case PARAVIRT_PATCH(op.x): \ - if (have_vcpu_info_placement) { \ - start = (char *)xen_##x##_direct; \ - end = xen_##x##_direct_end; \ - reloc = xen_##x##_direct_reloc; \ - } \ - goto patch_site - - switch (type) { - SITE(pv_irq_ops, irq_enable); - SITE(pv_irq_ops, irq_disable); - SITE(pv_irq_ops, save_fl); - SITE(pv_irq_ops, restore_fl); -#undef SITE - - patch_site: - if (start == NULL || (end-start) > len) - goto default_patch; - - ret = paravirt_patch_insns(insnbuf, len, start, end); - - /* Note: because reloc is assigned from something that - appears to be an array, gcc assumes it's non-null, - but doesn't know its relationship with start and - end. */ - if (reloc > start && reloc < end) { - int reloc_off = reloc - start; - long *relocp = (long *)(insnbuf + reloc_off); - long delta = start - (char *)addr; - - *relocp += delta; - } - break; - - default_patch: - default: - ret = paravirt_patch_default(type, clobbers, insnbuf, - addr, len); - break; - } - - return ret; -} - -static const struct pv_info xen_info __initconst = { - .shared_kernel_pmd = 0, - -#ifdef CONFIG_X86_64 - .extra_user_64bit_cs = FLAT_USER_CS64, -#endif - .name = "Xen", -}; - -static const struct pv_init_ops xen_init_ops __initconst = { - .patch = xen_patch, -}; - -static const struct pv_cpu_ops xen_cpu_ops __initconst = { - .cpuid = xen_cpuid, - - .set_debugreg = xen_set_debugreg, - .get_debugreg = xen_get_debugreg, - - .read_cr0 = xen_read_cr0, - .write_cr0 = xen_write_cr0, - - .read_cr4 = native_read_cr4, - .write_cr4 = xen_write_cr4, - -#ifdef CONFIG_X86_64 - .read_cr8 = xen_read_cr8, - .write_cr8 = xen_write_cr8, -#endif - - .wbinvd = native_wbinvd, - - .read_msr = xen_read_msr, - .write_msr = xen_write_msr, - - .read_msr_safe = xen_read_msr_safe, - .write_msr_safe = xen_write_msr_safe, - - .read_pmc = xen_read_pmc, - - .iret = xen_iret, -#ifdef CONFIG_X86_64 - .usergs_sysret64 = xen_sysret64, -#endif - - .load_tr_desc = paravirt_nop, - .set_ldt = xen_set_ldt, - .load_gdt = xen_load_gdt, - .load_idt = xen_load_idt, - .load_tls = xen_load_tls, -#ifdef CONFIG_X86_64 - .load_gs_index = xen_load_gs_index, -#endif - - .alloc_ldt = xen_alloc_ldt, - .free_ldt = xen_free_ldt, - - .store_idt = native_store_idt, - .store_tr = xen_store_tr, - - .write_ldt_entry = xen_write_ldt_entry, - .write_gdt_entry = xen_write_gdt_entry, - .write_idt_entry = xen_write_idt_entry, - .load_sp0 = xen_load_sp0, - - .set_iopl_mask = xen_set_iopl_mask, - .io_delay = xen_io_delay, - - /* Xen takes care of %gs when switching to usermode for us */ - .swapgs = paravirt_nop, - - .start_context_switch = paravirt_start_context_switch, - .end_context_switch = xen_end_context_switch, -}; - -static void xen_reboot(int reason) +void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; int cpu; @@ -1307,33 +185,11 @@ static void xen_reboot(int reason) BUG(); } -static void xen_restart(char *msg) +void xen_emergency_restart(void) { xen_reboot(SHUTDOWN_reboot); } -static void xen_emergency_restart(void) -{ - xen_reboot(SHUTDOWN_reboot); -} - -static void xen_machine_halt(void) -{ - xen_reboot(SHUTDOWN_poweroff); -} - -static void xen_machine_power_off(void) -{ - if (pm_power_off) - pm_power_off(); - xen_reboot(SHUTDOWN_poweroff); -} - -static void xen_crash_shutdown(struct pt_regs *regs) -{ - xen_reboot(SHUTDOWN_crash); -} - static int xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -1343,7 +199,7 @@ xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) } static struct notifier_block xen_panic_block = { - .notifier_call= xen_panic_event, + .notifier_call = xen_panic_event, .priority = INT_MIN }; @@ -1353,624 +209,7 @@ int xen_panic_handler_init(void) return 0; } -static const struct machine_ops xen_machine_ops __initconst = { - .restart = xen_restart, - .halt = xen_machine_halt, - .power_off = xen_machine_power_off, - .shutdown = xen_machine_halt, - .crash_shutdown = xen_crash_shutdown, - .emergency_restart = xen_emergency_restart, -}; - -static unsigned char xen_get_nmi_reason(void) -{ - unsigned char reason = 0; - - /* Construct a value which looks like it came from port 0x61. */ - if (test_bit(_XEN_NMIREASON_io_error, - &HYPERVISOR_shared_info->arch.nmi_reason)) - reason |= NMI_REASON_IOCHK; - if (test_bit(_XEN_NMIREASON_pci_serr, - &HYPERVISOR_shared_info->arch.nmi_reason)) - reason |= NMI_REASON_SERR; - - return reason; -} - -static void __init xen_boot_params_init_edd(void) -{ -#if IS_ENABLED(CONFIG_EDD) - struct xen_platform_op op; - struct edd_info *edd_info; - u32 *mbr_signature; - unsigned nr; - int ret; - - edd_info = boot_params.eddbuf; - mbr_signature = boot_params.edd_mbr_sig_buffer; - - op.cmd = XENPF_firmware_info; - - op.u.firmware_info.type = XEN_FW_DISK_INFO; - for (nr = 0; nr < EDDMAXNR; nr++) { - struct edd_info *info = edd_info + nr; - - op.u.firmware_info.index = nr; - info->params.length = sizeof(info->params); - set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params, - &info->params); - ret = HYPERVISOR_platform_op(&op); - if (ret) - break; - -#define C(x) info->x = op.u.firmware_info.u.disk_info.x - C(device); - C(version); - C(interface_support); - C(legacy_max_cylinder); - C(legacy_max_head); - C(legacy_sectors_per_track); -#undef C - } - boot_params.eddbuf_entries = nr; - - op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE; - for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) { - op.u.firmware_info.index = nr; - ret = HYPERVISOR_platform_op(&op); - if (ret) - break; - mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature; - } - boot_params.edd_mbr_sig_buf_entries = nr; -#endif -} - -/* - * Set up the GDT and segment registers for -fstack-protector. Until - * we do this, we have to be careful not to call any stack-protected - * function, which is most of the kernel. - */ -static void xen_setup_gdt(int cpu) -{ - pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot; - pv_cpu_ops.load_gdt = xen_load_gdt_boot; - - setup_stack_canary_segment(0); - switch_to_new_gdt(0); - - pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry; - pv_cpu_ops.load_gdt = xen_load_gdt; -} - -static void __init xen_dom0_set_legacy_features(void) -{ - x86_platform.legacy.rtc = 1; -} - -static int xen_cpuhp_setup(void) -{ - int rc; - - rc = cpuhp_setup_state_nocalls(CPUHP_XEN_PREPARE, - "x86/xen/hvm_guest:prepare", - xen_cpu_up_prepare, xen_cpu_dead); - if (rc >= 0) { - rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "x86/xen/hvm_guest:online", - xen_cpu_up_online, NULL); - if (rc < 0) - cpuhp_remove_state_nocalls(CPUHP_XEN_PREPARE); - } - - return rc >= 0 ? 0 : rc; -} - -/* First C function to be called on Xen boot */ -asmlinkage __visible void __init xen_start_kernel(void) -{ - struct physdev_set_iopl set_iopl; - unsigned long initrd_start = 0; - int rc; - - if (!xen_start_info) - return; - - xen_domain_type = XEN_PV_DOMAIN; - - xen_setup_features(); - - xen_setup_machphys_mapping(); - - /* Install Xen paravirt ops */ - pv_info = xen_info; - pv_init_ops = xen_init_ops; - pv_cpu_ops = xen_cpu_ops; - - x86_platform.get_nmi_reason = xen_get_nmi_reason; - - x86_init.resources.memory_setup = xen_memory_setup; - x86_init.oem.arch_setup = xen_arch_setup; - x86_init.oem.banner = xen_banner; - - xen_init_time_ops(); - - /* - * Set up some pagetable state before starting to set any ptes. - */ - - xen_init_mmu_ops(); - - /* Prevent unwanted bits from being set in PTEs. */ - __supported_pte_mask &= ~_PAGE_GLOBAL; - - /* - * Prevent page tables from being allocated in highmem, even - * if CONFIG_HIGHPTE is enabled. - */ - __userpte_alloc_gfp &= ~__GFP_HIGHMEM; - - /* Work out if we support NX */ - x86_configure_nx(); - - /* Get mfn list */ - xen_build_dynamic_phys_to_machine(); - - /* - * Set up kernel GDT and segment registers, mainly so that - * -fstack-protector code can be executed. - */ - xen_setup_gdt(0); - - xen_init_irq_ops(); - xen_init_cpuid_mask(); - -#ifdef CONFIG_X86_LOCAL_APIC - /* - * set up the basic apic ops. - */ - xen_init_apic(); -#endif - - if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { - pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; - pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; - } - - machine_ops = xen_machine_ops; - - /* - * The only reliable way to retain the initial address of the - * percpu gdt_page is to remember it here, so we can go and - * mark it RW later, when the initial percpu area is freed. - */ - xen_initial_gdt = &per_cpu(gdt_page, 0); - - xen_smp_init(); - -#ifdef CONFIG_ACPI_NUMA - /* - * The pages we from Xen are not related to machine pages, so - * any NUMA information the kernel tries to get from ACPI will - * be meaningless. Prevent it from trying. - */ - acpi_numa = -1; -#endif - /* Don't do the full vcpu_info placement stuff until we have a - possible map and a non-dummy shared_info. */ - per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; - - WARN_ON(xen_cpuhp_setup()); - - local_irq_disable(); - early_boot_irqs_disabled = true; - - xen_raw_console_write("mapping kernel into physical memory\n"); - xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, - xen_start_info->nr_pages); - xen_reserve_special_pages(); - - /* keep using Xen gdt for now; no urgent need to change it */ - -#ifdef CONFIG_X86_32 - pv_info.kernel_rpl = 1; - if (xen_feature(XENFEAT_supervisor_mode_kernel)) - pv_info.kernel_rpl = 0; -#else - pv_info.kernel_rpl = 0; -#endif - /* set the limit of our address space */ - xen_reserve_top(); - - /* - * We used to do this in xen_arch_setup, but that is too late - * on AMD were early_cpu_init (run before ->arch_setup()) calls - * early_amd_init which pokes 0xcf8 port. - */ - set_iopl.iopl = 1; - rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); - if (rc != 0) - xen_raw_printk("physdev_op failed %d\n", rc); - -#ifdef CONFIG_X86_32 - /* set up basic CPUID stuff */ - cpu_detect(&new_cpu_data); - set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); - new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1); -#endif - - if (xen_start_info->mod_start) { - if (xen_start_info->flags & SIF_MOD_START_PFN) - initrd_start = PFN_PHYS(xen_start_info->mod_start); - else - initrd_start = __pa(xen_start_info->mod_start); - } - - /* Poke various useful things into boot_params */ - boot_params.hdr.type_of_loader = (9 << 4) | 0; - boot_params.hdr.ramdisk_image = initrd_start; - boot_params.hdr.ramdisk_size = xen_start_info->mod_len; - boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); - boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN; - - if (!xen_initial_domain()) { - add_preferred_console("xenboot", 0, NULL); - add_preferred_console("tty", 0, NULL); - add_preferred_console("hvc", 0, NULL); - if (pci_xen) - x86_init.pci.arch_init = pci_xen_init; - } else { - const struct dom0_vga_console_info *info = - (void *)((char *)xen_start_info + - xen_start_info->console.dom0.info_off); - struct xen_platform_op op = { - .cmd = XENPF_firmware_info, - .interface_version = XENPF_INTERFACE_VERSION, - .u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS, - }; - - x86_platform.set_legacy_features = - xen_dom0_set_legacy_features; - xen_init_vga(info, xen_start_info->console.dom0.info_size); - xen_start_info->console.domU.mfn = 0; - xen_start_info->console.domU.evtchn = 0; - - if (HYPERVISOR_platform_op(&op) == 0) - boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags; - - /* Make sure ACS will be enabled */ - pci_request_acs(); - - xen_acpi_sleep_register(); - - /* Avoid searching for BIOS MP tables */ - x86_init.mpparse.find_smp_config = x86_init_noop; - x86_init.mpparse.get_smp_config = x86_init_uint_noop; - - xen_boot_params_init_edd(); - } -#ifdef CONFIG_PCI - /* PCI BIOS service won't work from a PV guest. */ - pci_probe &= ~PCI_PROBE_BIOS; -#endif - xen_raw_console_write("about to get started...\n"); - - /* Let's presume PV guests always boot on vCPU with id 0. */ - per_cpu(xen_vcpu_id, 0) = 0; - - xen_setup_runstate_info(0); - - xen_efi_init(); - - /* Start the world */ -#ifdef CONFIG_X86_32 - i386_start_kernel(); -#else - cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */ - x86_64_start_reservations((char *)__pa_symbol(&boot_params)); -#endif -} - -#ifdef CONFIG_XEN_PVH - -static void xen_pvh_arch_setup(void) -{ -#ifdef CONFIG_ACPI - /* Make sure we don't fall back to (default) ACPI_IRQ_MODEL_PIC. */ - if (nr_ioapics == 0) - acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM; -#endif -} - -static void __init init_pvh_bootparams(void) -{ - struct xen_memory_map memmap; - unsigned int i; - int rc; - - memset(&pvh_bootparams, 0, sizeof(pvh_bootparams)); - - memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table); - set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table); - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); - if (rc) { - xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); - BUG(); - } - - if (memmap.nr_entries < E820_MAX_ENTRIES_ZEROPAGE - 1) { - pvh_bootparams.e820_table[memmap.nr_entries].addr = - ISA_START_ADDRESS; - pvh_bootparams.e820_table[memmap.nr_entries].size = - ISA_END_ADDRESS - ISA_START_ADDRESS; - pvh_bootparams.e820_table[memmap.nr_entries].type = - E820_TYPE_RESERVED; - memmap.nr_entries++; - } else - xen_raw_printk("Warning: Can fit ISA range into e820\n"); - - pvh_bootparams.e820_entries = memmap.nr_entries; - for (i = 0; i < pvh_bootparams.e820_entries; i++) - e820__range_add(pvh_bootparams.e820_table[i].addr, - pvh_bootparams.e820_table[i].size, - pvh_bootparams.e820_table[i].type); - - e820__update_table(e820_table); - - pvh_bootparams.hdr.cmd_line_ptr = - pvh_start_info.cmdline_paddr; - - /* The first module is always ramdisk. */ - if (pvh_start_info.nr_modules) { - struct hvm_modlist_entry *modaddr = - __va(pvh_start_info.modlist_paddr); - pvh_bootparams.hdr.ramdisk_image = modaddr->paddr; - pvh_bootparams.hdr.ramdisk_size = modaddr->size; - } - - /* - * See Documentation/x86/boot.txt. - * - * Version 2.12 supports Xen entry point but we will use default x86/PC - * environment (i.e. hardware_subarch 0). - */ - pvh_bootparams.hdr.version = 0x212; - pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */ -} - -/* - * This routine (and those that it might call) should not use - * anything that lives in .bss since that segment will be cleared later. - */ -void __init xen_prepare_pvh(void) -{ - u32 msr; - u64 pfn; - - if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) { - xen_raw_printk("Error: Unexpected magic value (0x%08x)\n", - pvh_start_info.magic); - BUG(); - } - - xen_pvh = 1; - - msr = cpuid_ebx(xen_cpuid_base() + 2); - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); - - init_pvh_bootparams(); - - x86_init.oem.arch_setup = xen_pvh_arch_setup; -} -#endif - -void __ref xen_hvm_init_shared_info(void) -{ - int cpu; - struct xen_add_to_physmap xatp; - static struct shared_info *shared_info_page = 0; - - if (!shared_info_page) - shared_info_page = (struct shared_info *) - extend_brk(PAGE_SIZE, PAGE_SIZE); - xatp.domid = DOMID_SELF; - xatp.idx = 0; - xatp.space = XENMAPSPACE_shared_info; - xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; - if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) - BUG(); - - HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; - - /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info - * page, we use it in the event channel upcall and in some pvclock - * related functions. We don't need the vcpu_info placement - * optimizations because we don't use any pv_mmu or pv_irq op on - * HVM. - * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is - * online but xen_hvm_init_shared_info is run at resume time too and - * in that case multiple vcpus might be online. */ - for_each_online_cpu(cpu) { - /* Leave it to be NULL. */ - if (xen_vcpu_nr(cpu) >= MAX_VIRT_CPUS) - continue; - per_cpu(xen_vcpu, cpu) = - &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; - } -} - -#ifdef CONFIG_XEN_PVHVM -static void __init init_hvm_pv_info(void) -{ - int major, minor; - uint32_t eax, ebx, ecx, edx, base; - - base = xen_cpuid_base(); - eax = cpuid_eax(base + 1); - - major = eax >> 16; - minor = eax & 0xffff; - printk(KERN_INFO "Xen version %d.%d.\n", major, minor); - - xen_domain_type = XEN_HVM_DOMAIN; - - /* PVH set up hypercall page in xen_prepare_pvh(). */ - if (xen_pvh_domain()) - pv_info.name = "Xen PVH"; - else { - u64 pfn; - uint32_t msr; - - pv_info.name = "Xen HVM"; - msr = cpuid_ebx(base + 2); - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); - } - - xen_setup_features(); - - cpuid(base + 4, &eax, &ebx, &ecx, &edx); - if (eax & XEN_HVM_CPUID_VCPU_ID_PRESENT) - this_cpu_write(xen_vcpu_id, ebx); - else - this_cpu_write(xen_vcpu_id, smp_processor_id()); -} -#endif - -static int xen_cpu_up_prepare(unsigned int cpu) -{ - int rc; - - if (xen_hvm_domain()) { - /* - * This can happen if CPU was offlined earlier and - * offlining timed out in common_cpu_die(). - */ - if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) { - xen_smp_intr_free(cpu); - xen_uninit_lock_cpu(cpu); - } - - if (cpu_acpi_id(cpu) != U32_MAX) - per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu); - else - per_cpu(xen_vcpu_id, cpu) = cpu; - xen_vcpu_setup(cpu); - } - - if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock)) - xen_setup_timer(cpu); - - rc = xen_smp_intr_init(cpu); - if (rc) { - WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n", - cpu, rc); - return rc; - } - return 0; -} - -static int xen_cpu_dead(unsigned int cpu) -{ - xen_smp_intr_free(cpu); - - if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock)) - xen_teardown_timer(cpu); - - return 0; -} - -static int xen_cpu_up_online(unsigned int cpu) -{ - xen_init_lock_cpu(cpu); - return 0; -} - -#ifdef CONFIG_XEN_PVHVM -#ifdef CONFIG_KEXEC_CORE -static void xen_hvm_shutdown(void) -{ - native_machine_shutdown(); - if (kexec_in_progress) - xen_reboot(SHUTDOWN_soft_reset); -} - -static void xen_hvm_crash_shutdown(struct pt_regs *regs) -{ - native_machine_crash_shutdown(regs); - xen_reboot(SHUTDOWN_soft_reset); -} -#endif - -static void __init xen_hvm_guest_init(void) -{ - if (xen_pv_domain()) - return; - - init_hvm_pv_info(); - - xen_hvm_init_shared_info(); - - xen_panic_handler_init(); - - BUG_ON(!xen_feature(XENFEAT_hvm_callback_vector)); - - xen_hvm_smp_init(); - WARN_ON(xen_cpuhp_setup()); - xen_unplug_emulated_devices(); - x86_init.irqs.intr_init = xen_init_IRQ; - xen_hvm_init_time_ops(); - xen_hvm_init_mmu_ops(); - - if (xen_pvh_domain()) - machine_ops.emergency_restart = xen_emergency_restart; -#ifdef CONFIG_KEXEC_CORE - machine_ops.shutdown = xen_hvm_shutdown; - machine_ops.crash_shutdown = xen_hvm_crash_shutdown; -#endif -} -#endif - -static bool xen_nopv = false; -static __init int xen_parse_nopv(char *arg) -{ - xen_nopv = true; - return 0; -} -early_param("xen_nopv", xen_parse_nopv); - -static uint32_t __init xen_platform(void) -{ - if (xen_nopv) - return 0; - - return xen_cpuid_base(); -} - -bool xen_hvm_need_lapic(void) -{ - if (xen_nopv) - return false; - if (xen_pv_domain()) - return false; - if (!xen_hvm_domain()) - return false; - if (xen_feature(XENFEAT_hvm_pirqs)) - return false; - return true; -} -EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); - -static void xen_set_cpu_features(struct cpuinfo_x86 *c) -{ - if (xen_pv_domain()) { - clear_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); - set_cpu_cap(c, X86_FEATURE_XENPV); - } -} - -static void xen_pin_vcpu(int cpu) +void xen_pin_vcpu(int cpu) { static bool disable_pinning; struct sched_pin_override pin_override; @@ -2009,18 +248,6 @@ static void xen_pin_vcpu(int cpu) } } -const struct hypervisor_x86 x86_hyper_xen = { - .name = "Xen", - .detect = xen_platform, -#ifdef CONFIG_XEN_PVHVM - .init_platform = xen_hvm_guest_init, -#endif - .x2apic_available = xen_x2apic_para_available, - .set_cpu_features = xen_set_cpu_features, - .pin_vcpu = xen_pin_vcpu, -}; -EXPORT_SYMBOL(x86_hyper_xen); - #ifdef CONFIG_HOTPLUG_CPU void xen_arch_register_cpu(int num) { diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c new file mode 100644 index 000000000000..a6d014f47e52 --- /dev/null +++ b/arch/x86/xen/enlighten_hvm.c @@ -0,0 +1,214 @@ +#include <linux/cpu.h> +#include <linux/kexec.h> + +#include <xen/features.h> +#include <xen/events.h> +#include <xen/interface/memory.h> + +#include <asm/cpu.h> +#include <asm/smp.h> +#include <asm/reboot.h> +#include <asm/setup.h> +#include <asm/hypervisor.h> + +#include <asm/xen/cpuid.h> +#include <asm/xen/hypervisor.h> + +#include "xen-ops.h" +#include "mmu.h" +#include "smp.h" + +void __ref xen_hvm_init_shared_info(void) +{ + int cpu; + struct xen_add_to_physmap xatp; + static struct shared_info *shared_info_page; + + if (!shared_info_page) + shared_info_page = (struct shared_info *) + extend_brk(PAGE_SIZE, PAGE_SIZE); + xatp.domid = DOMID_SELF; + xatp.idx = 0; + xatp.space = XENMAPSPACE_shared_info; + xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; + if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) + BUG(); + + HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; + + /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info + * page, we use it in the event channel upcall and in some pvclock + * related functions. We don't need the vcpu_info placement + * optimizations because we don't use any pv_mmu or pv_irq op on + * HVM. + * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is + * online but xen_hvm_init_shared_info is run at resume time too and + * in that case multiple vcpus might be online. */ + for_each_online_cpu(cpu) { + /* Leave it to be NULL. */ + if (xen_vcpu_nr(cpu) >= MAX_VIRT_CPUS) + continue; + per_cpu(xen_vcpu, cpu) = + &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; + } +} + +static void __init init_hvm_pv_info(void) +{ + int major, minor; + uint32_t eax, ebx, ecx, edx, base; + + base = xen_cpuid_base(); + eax = cpuid_eax(base + 1); + + major = eax >> 16; + minor = eax & 0xffff; + printk(KERN_INFO "Xen version %d.%d.\n", major, minor); + + xen_domain_type = XEN_HVM_DOMAIN; + + /* PVH set up hypercall page in xen_prepare_pvh(). */ + if (xen_pvh_domain()) + pv_info.name = "Xen PVH"; + else { + u64 pfn; + uint32_t msr; + + pv_info.name = "Xen HVM"; + msr = cpuid_ebx(base + 2); + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + } + + xen_setup_features(); + + cpuid(base + 4, &eax, &ebx, &ecx, &edx); + if (eax & XEN_HVM_CPUID_VCPU_ID_PRESENT) + this_cpu_write(xen_vcpu_id, ebx); + else + this_cpu_write(xen_vcpu_id, smp_processor_id()); +} + +#ifdef CONFIG_KEXEC_CORE +static void xen_hvm_shutdown(void) +{ + native_machine_shutdown(); + if (kexec_in_progress) + xen_reboot(SHUTDOWN_soft_reset); +} + +static void xen_hvm_crash_shutdown(struct pt_regs *regs) +{ + native_machine_crash_shutdown(regs); + xen_reboot(SHUTDOWN_soft_reset); +} +#endif + +static int xen_cpu_up_prepare_hvm(unsigned int cpu) +{ + int rc; + + /* + * This can happen if CPU was offlined earlier and + * offlining timed out in common_cpu_die(). + */ + if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) { + xen_smp_intr_free(cpu); + xen_uninit_lock_cpu(cpu); + } + + if (cpu_acpi_id(cpu) != U32_MAX) + per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu); + else + per_cpu(xen_vcpu_id, cpu) = cpu; + xen_vcpu_setup(cpu); + + if (xen_have_vector_callback && xen_feature(XENFEAT_hvm_safe_pvclock)) + xen_setup_timer(cpu); + + rc = xen_smp_intr_init(cpu); + if (rc) { + WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n", + cpu, rc); + return rc; + } + return 0; +} + +static int xen_cpu_dead_hvm(unsigned int cpu) +{ + xen_smp_intr_free(cpu); + + if (xen_have_vector_callback && xen_feature(XENFEAT_hvm_safe_pvclock)) + xen_teardown_timer(cpu); + + return 0; +} + +static void __init xen_hvm_guest_init(void) +{ + if (xen_pv_domain()) + return; + + init_hvm_pv_info(); + + xen_hvm_init_shared_info(); + + xen_panic_handler_init(); + + if (xen_feature(XENFEAT_hvm_callback_vector)) + xen_have_vector_callback = 1; + + xen_hvm_smp_init(); + WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_hvm, xen_cpu_dead_hvm)); + xen_unplug_emulated_devices(); + x86_init.irqs.intr_init = xen_init_IRQ; + xen_hvm_init_time_ops(); + xen_hvm_init_mmu_ops(); + + if (xen_pvh_domain()) + machine_ops.emergency_restart = xen_emergency_restart; +#ifdef CONFIG_KEXEC_CORE + machine_ops.shutdown = xen_hvm_shutdown; + machine_ops.crash_shutdown = xen_hvm_crash_shutdown; +#endif +} + +static bool xen_nopv; +static __init int xen_parse_nopv(char *arg) +{ + xen_nopv = true; + return 0; +} +early_param("xen_nopv", xen_parse_nopv); + +bool xen_hvm_need_lapic(void) +{ + if (xen_nopv) + return false; + if (xen_pv_domain()) + return false; + if (!xen_hvm_domain()) + return false; + if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback) + return false; + return true; +} +EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); + +static uint32_t __init xen_platform_hvm(void) +{ + if (xen_pv_domain() || xen_nopv) + return 0; + + return xen_cpuid_base(); +} + +const struct hypervisor_x86 x86_hyper_xen_hvm = { + .name = "Xen HVM", + .detect = xen_platform_hvm, + .init_platform = xen_hvm_guest_init, + .pin_vcpu = xen_pin_vcpu, + .x2apic_available = xen_x2apic_para_available, +}; +EXPORT_SYMBOL(x86_hyper_xen_hvm); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c new file mode 100644 index 000000000000..a732bc2b9dfc --- /dev/null +++ b/arch/x86/xen/enlighten_pv.c @@ -0,0 +1,1513 @@ +/* + * Core of Xen paravirt_ops implementation. + * + * This file contains the xen_paravirt_ops structure itself, and the + * implementations for: + * - privileged instructions + * - interrupt flags + * - segment operations + * - booting and setup + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#include <linux/cpu.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/preempt.h> +#include <linux/hardirq.h> +#include <linux/percpu.h> +#include <linux/delay.h> +#include <linux/start_kernel.h> +#include <linux/sched.h> +#include <linux/kprobes.h> +#include <linux/bootmem.h> +#include <linux/export.h> +#include <linux/mm.h> +#include <linux/page-flags.h> +#include <linux/highmem.h> +#include <linux/console.h> +#include <linux/pci.h> +#include <linux/gfp.h> +#include <linux/memblock.h> +#include <linux/edd.h> +#include <linux/frame.h> + +#include <xen/xen.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/version.h> +#include <xen/interface/physdev.h> +#include <xen/interface/vcpu.h> +#include <xen/interface/memory.h> +#include <xen/interface/nmi.h> +#include <xen/interface/xen-mca.h> +#include <xen/features.h> +#include <xen/page.h> +#include <xen/hvc-console.h> +#include <xen/acpi.h> + +#include <asm/paravirt.h> +#include <asm/apic.h> +#include <asm/page.h> +#include <asm/xen/pci.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/cpuid.h> +#include <asm/fixmap.h> +#include <asm/processor.h> +#include <asm/proto.h> +#include <asm/msr-index.h> +#include <asm/traps.h> +#include <asm/setup.h> +#include <asm/desc.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/reboot.h> +#include <asm/stackprotector.h> +#include <asm/hypervisor.h> +#include <asm/mach_traps.h> +#include <asm/mwait.h> +#include <asm/pci_x86.h> +#include <asm/cpu.h> + +#ifdef CONFIG_ACPI +#include <linux/acpi.h> +#include <asm/acpi.h> +#include <acpi/pdc_intel.h> +#include <acpi/processor.h> +#include <xen/interface/platform.h> +#endif + +#include "xen-ops.h" +#include "mmu.h" +#include "smp.h" +#include "multicalls.h" +#include "pmu.h" + +void *xen_initial_gdt; + +RESERVE_BRK(shared_info_page_brk, PAGE_SIZE); + +static int xen_cpu_up_prepare_pv(unsigned int cpu); +static int xen_cpu_dead_pv(unsigned int cpu); + +struct tls_descs { + struct desc_struct desc[3]; +}; + +/* + * Updating the 3 TLS descriptors in the GDT on every task switch is + * surprisingly expensive so we avoid updating them if they haven't + * changed. Since Xen writes different descriptors than the one + * passed in the update_descriptor hypercall we keep shadow copies to + * compare against. + */ +static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); + +/* + * On restore, set the vcpu placement up again. + * If it fails, then we're in a bad state, since + * we can't back out from using it... + */ +void xen_vcpu_restore(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + bool other_cpu = (cpu != smp_processor_id()); + bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu), + NULL); + + if (other_cpu && is_up && + HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL)) + BUG(); + + xen_setup_runstate_info(cpu); + + if (xen_have_vcpu_info_placement) + xen_vcpu_setup(cpu); + + if (other_cpu && is_up && + HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL)) + BUG(); + } +} + +static void __init xen_banner(void) +{ + unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL); + struct xen_extraversion extra; + HYPERVISOR_xen_version(XENVER_extraversion, &extra); + + pr_info("Booting paravirtualized kernel %son %s\n", + xen_feature(XENFEAT_auto_translated_physmap) ? + "with PVH extensions " : "", pv_info.name); + printk(KERN_INFO "Xen version: %d.%d%s%s\n", + version >> 16, version & 0xffff, extra.extraversion, + xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); +} +/* Check if running on Xen version (major, minor) or later */ +bool +xen_running_on_version_or_later(unsigned int major, unsigned int minor) +{ + unsigned int version; + + if (!xen_domain()) + return false; + + version = HYPERVISOR_xen_version(XENVER_version, NULL); + if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) || + ((version >> 16) > major)) + return true; + return false; +} + +static __read_mostly unsigned int cpuid_leaf5_ecx_val; +static __read_mostly unsigned int cpuid_leaf5_edx_val; + +static void xen_cpuid(unsigned int *ax, unsigned int *bx, + unsigned int *cx, unsigned int *dx) +{ + unsigned maskebx = ~0; + + /* + * Mask out inconvenient features, to try and disable as many + * unsupported kernel subsystems as possible. + */ + switch (*ax) { + case CPUID_MWAIT_LEAF: + /* Synthesize the values.. */ + *ax = 0; + *bx = 0; + *cx = cpuid_leaf5_ecx_val; + *dx = cpuid_leaf5_edx_val; + return; + + case 0xb: + /* Suppress extended topology stuff */ + maskebx = 0; + break; + } + + asm(XEN_EMULATE_PREFIX "cpuid" + : "=a" (*ax), + "=b" (*bx), + "=c" (*cx), + "=d" (*dx) + : "0" (*ax), "2" (*cx)); + + *bx &= maskebx; +} +STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */ + +static bool __init xen_check_mwait(void) +{ +#ifdef CONFIG_ACPI + struct xen_platform_op op = { + .cmd = XENPF_set_processor_pminfo, + .u.set_pminfo.id = -1, + .u.set_pminfo.type = XEN_PM_PDC, + }; + uint32_t buf[3]; + unsigned int ax, bx, cx, dx; + unsigned int mwait_mask; + + /* We need to determine whether it is OK to expose the MWAIT + * capability to the kernel to harvest deeper than C3 states from ACPI + * _CST using the processor_harvest_xen.c module. For this to work, we + * need to gather the MWAIT_LEAF values (which the cstate.c code + * checks against). The hypervisor won't expose the MWAIT flag because + * it would break backwards compatibility; so we will find out directly + * from the hardware and hypercall. + */ + if (!xen_initial_domain()) + return false; + + /* + * When running under platform earlier than Xen4.2, do not expose + * mwait, to avoid the risk of loading native acpi pad driver + */ + if (!xen_running_on_version_or_later(4, 2)) + return false; + + ax = 1; + cx = 0; + + native_cpuid(&ax, &bx, &cx, &dx); + + mwait_mask = (1 << (X86_FEATURE_EST % 32)) | + (1 << (X86_FEATURE_MWAIT % 32)); + + if ((cx & mwait_mask) != mwait_mask) + return false; + + /* We need to emulate the MWAIT_LEAF and for that we need both + * ecx and edx. The hypercall provides only partial information. + */ + + ax = CPUID_MWAIT_LEAF; + bx = 0; + cx = 0; + dx = 0; + + native_cpuid(&ax, &bx, &cx, &dx); + + /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so, + * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3. + */ + buf[0] = ACPI_PDC_REVISION_ID; + buf[1] = 1; + buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP); + + set_xen_guest_handle(op.u.set_pminfo.pdc, buf); + + if ((HYPERVISOR_platform_op(&op) == 0) && + (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) { + cpuid_leaf5_ecx_val = cx; + cpuid_leaf5_edx_val = dx; + } + return true; +#else + return false; +#endif +} + +static bool __init xen_check_xsave(void) +{ + unsigned int err, eax, edx; + + /* + * Xen 4.0 and older accidentally leaked the host XSAVE flag into guest + * view, despite not being able to support guests using the + * functionality. Probe for the actual availability of XSAVE by seeing + * whether xgetbv executes successfully or raises #UD. + */ + asm volatile("1: .byte 0x0f,0x01,0xd0\n\t" /* xgetbv */ + "xor %[err], %[err]\n" + "2:\n\t" + ".pushsection .fixup,\"ax\"\n\t" + "3: movl $1,%[err]\n\t" + "jmp 2b\n\t" + ".popsection\n\t" + _ASM_EXTABLE(1b, 3b) + : [err] "=r" (err), "=a" (eax), "=d" (edx) + : "c" (0)); + + return err == 0; +} + +static void __init xen_init_capabilities(void) +{ + setup_clear_cpu_cap(X86_BUG_SYSRET_SS_ATTRS); + setup_force_cpu_cap(X86_FEATURE_XENPV); + setup_clear_cpu_cap(X86_FEATURE_DCA); + setup_clear_cpu_cap(X86_FEATURE_APERFMPERF); + setup_clear_cpu_cap(X86_FEATURE_MTRR); + setup_clear_cpu_cap(X86_FEATURE_ACC); + setup_clear_cpu_cap(X86_FEATURE_X2APIC); + + if (!xen_initial_domain()) + setup_clear_cpu_cap(X86_FEATURE_ACPI); + + if (xen_check_mwait()) + setup_force_cpu_cap(X86_FEATURE_MWAIT); + else + setup_clear_cpu_cap(X86_FEATURE_MWAIT); + + if (xen_check_xsave()) { + setup_force_cpu_cap(X86_FEATURE_XSAVE); + setup_force_cpu_cap(X86_FEATURE_OSXSAVE); + } else { + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_OSXSAVE); + } +} + +static void xen_set_debugreg(int reg, unsigned long val) +{ + HYPERVISOR_set_debugreg(reg, val); +} + +static unsigned long xen_get_debugreg(int reg) +{ + return HYPERVISOR_get_debugreg(reg); +} + +static void xen_end_context_switch(struct task_struct *next) +{ + xen_mc_flush(); + paravirt_end_context_switch(next); +} + +static unsigned long xen_store_tr(void) +{ + return 0; +} + +/* + * Set the page permissions for a particular virtual address. If the + * address is a vmalloc mapping (or other non-linear mapping), then + * find the linear mapping of the page and also set its protections to + * match. + */ +static void set_aliased_prot(void *v, pgprot_t prot) +{ + int level; + pte_t *ptep; + pte_t pte; + unsigned long pfn; + struct page *page; + unsigned char dummy; + + ptep = lookup_address((unsigned long)v, &level); + BUG_ON(ptep == NULL); + + pfn = pte_pfn(*ptep); + page = pfn_to_page(pfn); + + pte = pfn_pte(pfn, prot); + + /* + * Careful: update_va_mapping() will fail if the virtual address + * we're poking isn't populated in the page tables. We don't + * need to worry about the direct map (that's always in the page + * tables), but we need to be careful about vmap space. In + * particular, the top level page table can lazily propagate + * entries between processes, so if we've switched mms since we + * vmapped the target in the first place, we might not have the + * top-level page table entry populated. + * + * We disable preemption because we want the same mm active when + * we probe the target and when we issue the hypercall. We'll + * have the same nominal mm, but if we're a kernel thread, lazy + * mm dropping could change our pgd. + * + * Out of an abundance of caution, this uses __get_user() to fault + * in the target address just in case there's some obscure case + * in which the target address isn't readable. + */ + + preempt_disable(); + + probe_kernel_read(&dummy, v, 1); + + if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0)) + BUG(); + + if (!PageHighMem(page)) { + void *av = __va(PFN_PHYS(pfn)); + + if (av != v) + if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0)) + BUG(); + } else + kmap_flush_unused(); + + preempt_enable(); +} + +static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries) +{ + const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; + int i; + + /* + * We need to mark the all aliases of the LDT pages RO. We + * don't need to call vm_flush_aliases(), though, since that's + * only responsible for flushing aliases out the TLBs, not the + * page tables, and Xen will flush the TLB for us if needed. + * + * To avoid confusing future readers: none of this is necessary + * to load the LDT. The hypervisor only checks this when the + * LDT is faulted in due to subsequent descriptor access. + */ + + for (i = 0; i < entries; i += entries_per_page) + set_aliased_prot(ldt + i, PAGE_KERNEL_RO); +} + +static void xen_free_ldt(struct desc_struct *ldt, unsigned entries) +{ + const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; + int i; + + for (i = 0; i < entries; i += entries_per_page) + set_aliased_prot(ldt + i, PAGE_KERNEL); +} + +static void xen_set_ldt(const void *addr, unsigned entries) +{ + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + + trace_xen_cpu_set_ldt(addr, entries); + + op = mcs.args; + op->cmd = MMUEXT_SET_LDT; + op->arg1.linear_addr = (unsigned long)addr; + op->arg2.nr_ents = entries; + + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + +static void xen_load_gdt(const struct desc_ptr *dtr) +{ + unsigned long va = dtr->address; + unsigned int size = dtr->size + 1; + unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE); + unsigned long frames[pages]; + int f; + + /* + * A GDT can be up to 64k in size, which corresponds to 8192 + * 8-byte entries, or 16 4k pages.. + */ + + BUG_ON(size > 65536); + BUG_ON(va & ~PAGE_MASK); + + for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { + int level; + pte_t *ptep; + unsigned long pfn, mfn; + void *virt; + + /* + * The GDT is per-cpu and is in the percpu data area. + * That can be virtually mapped, so we need to do a + * page-walk to get the underlying MFN for the + * hypercall. The page can also be in the kernel's + * linear range, so we need to RO that mapping too. + */ + ptep = lookup_address(va, &level); + BUG_ON(ptep == NULL); + + pfn = pte_pfn(*ptep); + mfn = pfn_to_mfn(pfn); + virt = __va(PFN_PHYS(pfn)); + + frames[f] = mfn; + + make_lowmem_page_readonly((void *)va); + make_lowmem_page_readonly(virt); + } + + if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct))) + BUG(); +} + +/* + * load_gdt for early boot, when the gdt is only mapped once + */ +static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) +{ + unsigned long va = dtr->address; + unsigned int size = dtr->size + 1; + unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE); + unsigned long frames[pages]; + int f; + + /* + * A GDT can be up to 64k in size, which corresponds to 8192 + * 8-byte entries, or 16 4k pages.. + */ + + BUG_ON(size > 65536); + BUG_ON(va & ~PAGE_MASK); + + for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { + pte_t pte; + unsigned long pfn, mfn; + + pfn = virt_to_pfn(va); + mfn = pfn_to_mfn(pfn); + + pte = pfn_pte(pfn, PAGE_KERNEL_RO); + + if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0)) + BUG(); + + frames[f] = mfn; + } + + if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct))) + BUG(); +} + +static inline bool desc_equal(const struct desc_struct *d1, + const struct desc_struct *d2) +{ + return d1->a == d2->a && d1->b == d2->b; +} + +static void load_TLS_descriptor(struct thread_struct *t, + unsigned int cpu, unsigned int i) +{ + struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i]; + struct desc_struct *gdt; + xmaddr_t maddr; + struct multicall_space mc; + + if (desc_equal(shadow, &t->tls_array[i])) + return; + + *shadow = t->tls_array[i]; + + gdt = get_cpu_gdt_rw(cpu); + maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); + mc = __xen_mc_entry(0); + + MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); +} + +static void xen_load_tls(struct thread_struct *t, unsigned int cpu) +{ + /* + * XXX sleazy hack: If we're being called in a lazy-cpu zone + * and lazy gs handling is enabled, it means we're in a + * context switch, and %gs has just been saved. This means we + * can zero it out to prevent faults on exit from the + * hypervisor if the next process has no %gs. Either way, it + * has been saved, and the new value will get loaded properly. + * This will go away as soon as Xen has been modified to not + * save/restore %gs for normal hypercalls. + * + * On x86_64, this hack is not used for %gs, because gs points + * to KERNEL_GS_BASE (and uses it for PDA references), so we + * must not zero %gs on x86_64 + * + * For x86_64, we need to zero %fs, otherwise we may get an + * exception between the new %fs descriptor being loaded and + * %fs being effectively cleared at __switch_to(). + */ + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { +#ifdef CONFIG_X86_32 + lazy_load_gs(0); +#else + loadsegment(fs, 0); +#endif + } + + xen_mc_batch(); + + load_TLS_descriptor(t, cpu, 0); + load_TLS_descriptor(t, cpu, 1); + load_TLS_descriptor(t, cpu, 2); + + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + +#ifdef CONFIG_X86_64 +static void xen_load_gs_index(unsigned int idx) +{ + if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx)) + BUG(); +} +#endif + +static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, + const void *ptr) +{ + xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]); + u64 entry = *(u64 *)ptr; + + trace_xen_cpu_write_ldt_entry(dt, entrynum, entry); + + preempt_disable(); + + xen_mc_flush(); + if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry)) + BUG(); + + preempt_enable(); +} + +static int cvt_gate_to_trap(int vector, const gate_desc *val, + struct trap_info *info) +{ + unsigned long addr; + + if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) + return 0; + + info->vector = vector; + + addr = gate_offset(*val); +#ifdef CONFIG_X86_64 + /* + * Look for known traps using IST, and substitute them + * appropriately. The debugger ones are the only ones we care + * about. Xen will handle faults like double_fault, + * so we should never see them. Warn if + * there's an unexpected IST-using fault handler. + */ + if (addr == (unsigned long)debug) + addr = (unsigned long)xen_debug; + else if (addr == (unsigned long)int3) + addr = (unsigned long)xen_int3; + else if (addr == (unsigned long)stack_segment) + addr = (unsigned long)xen_stack_segment; + else if (addr == (unsigned long)double_fault) { + /* Don't need to handle these */ + return 0; +#ifdef CONFIG_X86_MCE + } else if (addr == (unsigned long)machine_check) { + /* + * when xen hypervisor inject vMCE to guest, + * use native mce handler to handle it + */ + ; +#endif + } else if (addr == (unsigned long)nmi) + /* + * Use the native version as well. + */ + ; + else { + /* Some other trap using IST? */ + if (WARN_ON(val->ist != 0)) + return 0; + } +#endif /* CONFIG_X86_64 */ + info->address = addr; + + info->cs = gate_segment(*val); + info->flags = val->dpl; + /* interrupt gates clear IF */ + if (val->type == GATE_INTERRUPT) + info->flags |= 1 << 2; + + return 1; +} + +/* Locations of each CPU's IDT */ +static DEFINE_PER_CPU(struct desc_ptr, idt_desc); + +/* Set an IDT entry. If the entry is part of the current IDT, then + also update Xen. */ +static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) +{ + unsigned long p = (unsigned long)&dt[entrynum]; + unsigned long start, end; + + trace_xen_cpu_write_idt_entry(dt, entrynum, g); + + preempt_disable(); + + start = __this_cpu_read(idt_desc.address); + end = start + __this_cpu_read(idt_desc.size) + 1; + + xen_mc_flush(); + + native_write_idt_entry(dt, entrynum, g); + + if (p >= start && (p + 8) <= end) { + struct trap_info info[2]; + + info[1].address = 0; + + if (cvt_gate_to_trap(entrynum, g, &info[0])) + if (HYPERVISOR_set_trap_table(info)) + BUG(); + } + + preempt_enable(); +} + +static void xen_convert_trap_info(const struct desc_ptr *desc, + struct trap_info *traps) +{ + unsigned in, out, count; + + count = (desc->size+1) / sizeof(gate_desc); + BUG_ON(count > 256); + + for (in = out = 0; in < count; in++) { + gate_desc *entry = (gate_desc *)(desc->address) + in; + + if (cvt_gate_to_trap(in, entry, &traps[out])) + out++; + } + traps[out].address = 0; +} + +void xen_copy_trap_info(struct trap_info *traps) +{ + const struct desc_ptr *desc = this_cpu_ptr(&idt_desc); + + xen_convert_trap_info(desc, traps); +} + +/* Load a new IDT into Xen. In principle this can be per-CPU, so we + hold a spinlock to protect the static traps[] array (static because + it avoids allocation, and saves stack space). */ +static void xen_load_idt(const struct desc_ptr *desc) +{ + static DEFINE_SPINLOCK(lock); + static struct trap_info traps[257]; + + trace_xen_cpu_load_idt(desc); + + spin_lock(&lock); + + memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc)); + + xen_convert_trap_info(desc, traps); + + xen_mc_flush(); + if (HYPERVISOR_set_trap_table(traps)) + BUG(); + + spin_unlock(&lock); +} + +/* Write a GDT descriptor entry. Ignore LDT descriptors, since + they're handled differently. */ +static void xen_write_gdt_entry(struct desc_struct *dt, int entry, + const void *desc, int type) +{ + trace_xen_cpu_write_gdt_entry(dt, entry, desc, type); + + preempt_disable(); + + switch (type) { + case DESC_LDT: + case DESC_TSS: + /* ignore */ + break; + + default: { + xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]); + + xen_mc_flush(); + if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) + BUG(); + } + + } + + preempt_enable(); +} + +/* + * Version of write_gdt_entry for use at early boot-time needed to + * update an entry as simply as possible. + */ +static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, + const void *desc, int type) +{ + trace_xen_cpu_write_gdt_entry(dt, entry, desc, type); + + switch (type) { + case DESC_LDT: + case DESC_TSS: + /* ignore */ + break; + + default: { + xmaddr_t maddr = virt_to_machine(&dt[entry]); + + if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) + dt[entry] = *(struct desc_struct *)desc; + } + + } +} + +static void xen_load_sp0(struct tss_struct *tss, + struct thread_struct *thread) +{ + struct multicall_space mcs; + + mcs = xen_mc_entry(0); + MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); + xen_mc_issue(PARAVIRT_LAZY_CPU); + tss->x86_tss.sp0 = thread->sp0; +} + +void xen_set_iopl_mask(unsigned mask) +{ + struct physdev_set_iopl set_iopl; + + /* Force the change at ring 0. */ + set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3; + HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); +} + +static void xen_io_delay(void) +{ +} + +static DEFINE_PER_CPU(unsigned long, xen_cr0_value); + +static unsigned long xen_read_cr0(void) +{ + unsigned long cr0 = this_cpu_read(xen_cr0_value); + + if (unlikely(cr0 == 0)) { + cr0 = native_read_cr0(); + this_cpu_write(xen_cr0_value, cr0); + } + + return cr0; +} + +static void xen_write_cr0(unsigned long cr0) +{ + struct multicall_space mcs; + + this_cpu_write(xen_cr0_value, cr0); + + /* Only pay attention to cr0.TS; everything else is + ignored. */ + mcs = xen_mc_entry(0); + + MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0); + + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + +static void xen_write_cr4(unsigned long cr4) +{ + cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE); + + native_write_cr4(cr4); +} +#ifdef CONFIG_X86_64 +static inline unsigned long xen_read_cr8(void) +{ + return 0; +} +static inline void xen_write_cr8(unsigned long val) +{ + BUG_ON(val); +} +#endif + +static u64 xen_read_msr_safe(unsigned int msr, int *err) +{ + u64 val; + + if (pmu_msr_read(msr, &val, err)) + return val; + + val = native_read_msr_safe(msr, err); + switch (msr) { + case MSR_IA32_APICBASE: +#ifdef CONFIG_X86_X2APIC + if (!(cpuid_ecx(1) & (1 << (X86_FEATURE_X2APIC & 31)))) +#endif + val &= ~X2APIC_ENABLE; + break; + } + return val; +} + +static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) +{ + int ret; + + ret = 0; + + switch (msr) { +#ifdef CONFIG_X86_64 + unsigned which; + u64 base; + + case MSR_FS_BASE: which = SEGBASE_FS; goto set; + case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set; + case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set; + + set: + base = ((u64)high << 32) | low; + if (HYPERVISOR_set_segment_base(which, base) != 0) + ret = -EIO; + break; +#endif + + case MSR_STAR: + case MSR_CSTAR: + case MSR_LSTAR: + case MSR_SYSCALL_MASK: + case MSR_IA32_SYSENTER_CS: + case MSR_IA32_SYSENTER_ESP: + case MSR_IA32_SYSENTER_EIP: + /* Fast syscall setup is all done in hypercalls, so + these are all ignored. Stub them out here to stop + Xen console noise. */ + break; + + default: + if (!pmu_msr_write(msr, low, high, &ret)) + ret = native_write_msr_safe(msr, low, high); + } + + return ret; +} + +static u64 xen_read_msr(unsigned int msr) +{ + /* + * This will silently swallow a #GP from RDMSR. It may be worth + * changing that. + */ + int err; + + return xen_read_msr_safe(msr, &err); +} + +static void xen_write_msr(unsigned int msr, unsigned low, unsigned high) +{ + /* + * This will silently swallow a #GP from WRMSR. It may be worth + * changing that. + */ + xen_write_msr_safe(msr, low, high); +} + +void xen_setup_shared_info(void) +{ + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + set_fixmap(FIX_PARAVIRT_BOOTMAP, + xen_start_info->shared_info); + + HYPERVISOR_shared_info = + (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); + } else + HYPERVISOR_shared_info = + (struct shared_info *)__va(xen_start_info->shared_info); + +#ifndef CONFIG_SMP + /* In UP this is as good a place as any to set up shared info */ + xen_setup_vcpu_info_placement(); +#endif + + xen_setup_mfn_list_list(); +} + +/* This is called once we have the cpu_possible_mask */ +void xen_setup_vcpu_info_placement(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + /* Set up direct vCPU id mapping for PV guests. */ + per_cpu(xen_vcpu_id, cpu) = cpu; + xen_vcpu_setup(cpu); + } + + /* + * xen_vcpu_setup managed to place the vcpu_info within the + * percpu area for all cpus, so make use of it. + */ + if (xen_have_vcpu_info_placement) { + pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); + pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); + pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); + pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct); + pv_mmu_ops.read_cr2 = xen_read_cr2_direct; + } +} + +static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, + unsigned long addr, unsigned len) +{ + char *start, *end, *reloc; + unsigned ret; + + start = end = reloc = NULL; + +#define SITE(op, x) \ + case PARAVIRT_PATCH(op.x): \ + if (xen_have_vcpu_info_placement) { \ + start = (char *)xen_##x##_direct; \ + end = xen_##x##_direct_end; \ + reloc = xen_##x##_direct_reloc; \ + } \ + goto patch_site + + switch (type) { + SITE(pv_irq_ops, irq_enable); + SITE(pv_irq_ops, irq_disable); + SITE(pv_irq_ops, save_fl); + SITE(pv_irq_ops, restore_fl); +#undef SITE + + patch_site: + if (start == NULL || (end-start) > len) + goto default_patch; + + ret = paravirt_patch_insns(insnbuf, len, start, end); + + /* Note: because reloc is assigned from something that + appears to be an array, gcc assumes it's non-null, + but doesn't know its relationship with start and + end. */ + if (reloc > start && reloc < end) { + int reloc_off = reloc - start; + long *relocp = (long *)(insnbuf + reloc_off); + long delta = start - (char *)addr; + + *relocp += delta; + } + break; + + default_patch: + default: + ret = paravirt_patch_default(type, clobbers, insnbuf, + addr, len); + break; + } + + return ret; +} + +static const struct pv_info xen_info __initconst = { + .shared_kernel_pmd = 0, + +#ifdef CONFIG_X86_64 + .extra_user_64bit_cs = FLAT_USER_CS64, +#endif + .name = "Xen", +}; + +static const struct pv_init_ops xen_init_ops __initconst = { + .patch = xen_patch, +}; + +static const struct pv_cpu_ops xen_cpu_ops __initconst = { + .cpuid = xen_cpuid, + + .set_debugreg = xen_set_debugreg, + .get_debugreg = xen_get_debugreg, + + .read_cr0 = xen_read_cr0, + .write_cr0 = xen_write_cr0, + + .read_cr4 = native_read_cr4, + .write_cr4 = xen_write_cr4, + +#ifdef CONFIG_X86_64 + .read_cr8 = xen_read_cr8, + .write_cr8 = xen_write_cr8, +#endif + + .wbinvd = native_wbinvd, + + .read_msr = xen_read_msr, + .write_msr = xen_write_msr, + + .read_msr_safe = xen_read_msr_safe, + .write_msr_safe = xen_write_msr_safe, + + .read_pmc = xen_read_pmc, + + .iret = xen_iret, +#ifdef CONFIG_X86_64 + .usergs_sysret64 = xen_sysret64, +#endif + + .load_tr_desc = paravirt_nop, + .set_ldt = xen_set_ldt, + .load_gdt = xen_load_gdt, + .load_idt = xen_load_idt, + .load_tls = xen_load_tls, +#ifdef CONFIG_X86_64 + .load_gs_index = xen_load_gs_index, +#endif + + .alloc_ldt = xen_alloc_ldt, + .free_ldt = xen_free_ldt, + + .store_idt = native_store_idt, + .store_tr = xen_store_tr, + + .write_ldt_entry = xen_write_ldt_entry, + .write_gdt_entry = xen_write_gdt_entry, + .write_idt_entry = xen_write_idt_entry, + .load_sp0 = xen_load_sp0, + + .set_iopl_mask = xen_set_iopl_mask, + .io_delay = xen_io_delay, + + /* Xen takes care of %gs when switching to usermode for us */ + .swapgs = paravirt_nop, + + .start_context_switch = paravirt_start_context_switch, + .end_context_switch = xen_end_context_switch, +}; + +static void xen_restart(char *msg) +{ + xen_reboot(SHUTDOWN_reboot); +} + +static void xen_machine_halt(void) +{ + xen_reboot(SHUTDOWN_poweroff); +} + +static void xen_machine_power_off(void) +{ + if (pm_power_off) + pm_power_off(); + xen_reboot(SHUTDOWN_poweroff); +} + +static void xen_crash_shutdown(struct pt_regs *regs) +{ + xen_reboot(SHUTDOWN_crash); +} + +static const struct machine_ops xen_machine_ops __initconst = { + .restart = xen_restart, + .halt = xen_machine_halt, + .power_off = xen_machine_power_off, + .shutdown = xen_machine_halt, + .crash_shutdown = xen_crash_shutdown, + .emergency_restart = xen_emergency_restart, +}; + +static unsigned char xen_get_nmi_reason(void) +{ + unsigned char reason = 0; + + /* Construct a value which looks like it came from port 0x61. */ + if (test_bit(_XEN_NMIREASON_io_error, + &HYPERVISOR_shared_info->arch.nmi_reason)) + reason |= NMI_REASON_IOCHK; + if (test_bit(_XEN_NMIREASON_pci_serr, + &HYPERVISOR_shared_info->arch.nmi_reason)) + reason |= NMI_REASON_SERR; + + return reason; +} + +static void __init xen_boot_params_init_edd(void) +{ +#if IS_ENABLED(CONFIG_EDD) + struct xen_platform_op op; + struct edd_info *edd_info; + u32 *mbr_signature; + unsigned nr; + int ret; + + edd_info = boot_params.eddbuf; + mbr_signature = boot_params.edd_mbr_sig_buffer; + + op.cmd = XENPF_firmware_info; + + op.u.firmware_info.type = XEN_FW_DISK_INFO; + for (nr = 0; nr < EDDMAXNR; nr++) { + struct edd_info *info = edd_info + nr; + + op.u.firmware_info.index = nr; + info->params.length = sizeof(info->params); + set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params, + &info->params); + ret = HYPERVISOR_platform_op(&op); + if (ret) + break; + +#define C(x) info->x = op.u.firmware_info.u.disk_info.x + C(device); + C(version); + C(interface_support); + C(legacy_max_cylinder); + C(legacy_max_head); + C(legacy_sectors_per_track); +#undef C + } + boot_params.eddbuf_entries = nr; + + op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE; + for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) { + op.u.firmware_info.index = nr; + ret = HYPERVISOR_platform_op(&op); + if (ret) + break; + mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature; + } + boot_params.edd_mbr_sig_buf_entries = nr; +#endif +} + +/* + * Set up the GDT and segment registers for -fstack-protector. Until + * we do this, we have to be careful not to call any stack-protected + * function, which is most of the kernel. + */ +static void xen_setup_gdt(int cpu) +{ + pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot; + pv_cpu_ops.load_gdt = xen_load_gdt_boot; + + setup_stack_canary_segment(0); + switch_to_new_gdt(0); + + pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry; + pv_cpu_ops.load_gdt = xen_load_gdt; +} + +static void __init xen_dom0_set_legacy_features(void) +{ + x86_platform.legacy.rtc = 1; +} + +/* First C function to be called on Xen boot */ +asmlinkage __visible void __init xen_start_kernel(void) +{ + struct physdev_set_iopl set_iopl; + unsigned long initrd_start = 0; + int rc; + + if (!xen_start_info) + return; + + xen_domain_type = XEN_PV_DOMAIN; + + xen_setup_features(); + + xen_setup_machphys_mapping(); + + /* Install Xen paravirt ops */ + pv_info = xen_info; + pv_init_ops = xen_init_ops; + pv_cpu_ops = xen_cpu_ops; + + x86_platform.get_nmi_reason = xen_get_nmi_reason; + + x86_init.resources.memory_setup = xen_memory_setup; + x86_init.oem.arch_setup = xen_arch_setup; + x86_init.oem.banner = xen_banner; + + xen_init_time_ops(); + + /* + * Set up some pagetable state before starting to set any ptes. + */ + + xen_init_mmu_ops(); + + /* Prevent unwanted bits from being set in PTEs. */ + __supported_pte_mask &= ~_PAGE_GLOBAL; + + /* + * Prevent page tables from being allocated in highmem, even + * if CONFIG_HIGHPTE is enabled. + */ + __userpte_alloc_gfp &= ~__GFP_HIGHMEM; + + /* Work out if we support NX */ + x86_configure_nx(); + + /* Get mfn list */ + xen_build_dynamic_phys_to_machine(); + + /* + * Set up kernel GDT and segment registers, mainly so that + * -fstack-protector code can be executed. + */ + xen_setup_gdt(0); + + xen_init_irq_ops(); + xen_init_capabilities(); + +#ifdef CONFIG_X86_LOCAL_APIC + /* + * set up the basic apic ops. + */ + xen_init_apic(); +#endif + + if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { + pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; + pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; + } + + machine_ops = xen_machine_ops; + + /* + * The only reliable way to retain the initial address of the + * percpu gdt_page is to remember it here, so we can go and + * mark it RW later, when the initial percpu area is freed. + */ + xen_initial_gdt = &per_cpu(gdt_page, 0); + + xen_smp_init(); + +#ifdef CONFIG_ACPI_NUMA + /* + * The pages we from Xen are not related to machine pages, so + * any NUMA information the kernel tries to get from ACPI will + * be meaningless. Prevent it from trying. + */ + acpi_numa = -1; +#endif + /* Don't do the full vcpu_info placement stuff until we have a + possible map and a non-dummy shared_info. */ + per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; + + WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv)); + + local_irq_disable(); + early_boot_irqs_disabled = true; + + xen_raw_console_write("mapping kernel into physical memory\n"); + xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, + xen_start_info->nr_pages); + xen_reserve_special_pages(); + + /* keep using Xen gdt for now; no urgent need to change it */ + +#ifdef CONFIG_X86_32 + pv_info.kernel_rpl = 1; + if (xen_feature(XENFEAT_supervisor_mode_kernel)) + pv_info.kernel_rpl = 0; +#else + pv_info.kernel_rpl = 0; +#endif + /* set the limit of our address space */ + xen_reserve_top(); + + /* + * We used to do this in xen_arch_setup, but that is too late + * on AMD were early_cpu_init (run before ->arch_setup()) calls + * early_amd_init which pokes 0xcf8 port. + */ + set_iopl.iopl = 1; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); + if (rc != 0) + xen_raw_printk("physdev_op failed %d\n", rc); + +#ifdef CONFIG_X86_32 + /* set up basic CPUID stuff */ + cpu_detect(&new_cpu_data); + set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); + new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1); +#endif + + if (xen_start_info->mod_start) { + if (xen_start_info->flags & SIF_MOD_START_PFN) + initrd_start = PFN_PHYS(xen_start_info->mod_start); + else + initrd_start = __pa(xen_start_info->mod_start); + } + + /* Poke various useful things into boot_params */ + boot_params.hdr.type_of_loader = (9 << 4) | 0; + boot_params.hdr.ramdisk_image = initrd_start; + boot_params.hdr.ramdisk_size = xen_start_info->mod_len; + boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); + boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN; + + if (!xen_initial_domain()) { + add_preferred_console("xenboot", 0, NULL); + add_preferred_console("tty", 0, NULL); + add_preferred_console("hvc", 0, NULL); + if (pci_xen) + x86_init.pci.arch_init = pci_xen_init; + } else { + const struct dom0_vga_console_info *info = + (void *)((char *)xen_start_info + + xen_start_info->console.dom0.info_off); + struct xen_platform_op op = { + .cmd = XENPF_firmware_info, + .interface_version = XENPF_INTERFACE_VERSION, + .u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS, + }; + + x86_platform.set_legacy_features = + xen_dom0_set_legacy_features; + xen_init_vga(info, xen_start_info->console.dom0.info_size); + xen_start_info->console.domU.mfn = 0; + xen_start_info->console.domU.evtchn = 0; + + if (HYPERVISOR_platform_op(&op) == 0) + boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags; + + /* Make sure ACS will be enabled */ + pci_request_acs(); + + xen_acpi_sleep_register(); + + /* Avoid searching for BIOS MP tables */ + x86_init.mpparse.find_smp_config = x86_init_noop; + x86_init.mpparse.get_smp_config = x86_init_uint_noop; + + xen_boot_params_init_edd(); + } +#ifdef CONFIG_PCI + /* PCI BIOS service won't work from a PV guest. */ + pci_probe &= ~PCI_PROBE_BIOS; +#endif + xen_raw_console_write("about to get started...\n"); + + /* Let's presume PV guests always boot on vCPU with id 0. */ + per_cpu(xen_vcpu_id, 0) = 0; + + xen_setup_runstate_info(0); + + xen_efi_init(); + + /* Start the world */ +#ifdef CONFIG_X86_32 + i386_start_kernel(); +#else + cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */ + x86_64_start_reservations((char *)__pa_symbol(&boot_params)); +#endif +} + +static int xen_cpu_up_prepare_pv(unsigned int cpu) +{ + int rc; + + xen_setup_timer(cpu); + + rc = xen_smp_intr_init(cpu); + if (rc) { + WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n", + cpu, rc); + return rc; + } + + rc = xen_smp_intr_init_pv(cpu); + if (rc) { + WARN(1, "xen_smp_intr_init_pv() for CPU %d failed: %d\n", + cpu, rc); + return rc; + } + + return 0; +} + +static int xen_cpu_dead_pv(unsigned int cpu) +{ + xen_smp_intr_free(cpu); + xen_smp_intr_free_pv(cpu); + + xen_teardown_timer(cpu); + + return 0; +} + +static uint32_t __init xen_platform_pv(void) +{ + if (xen_pv_domain()) + return xen_cpuid_base(); + + return 0; +} + +const struct hypervisor_x86 x86_hyper_xen_pv = { + .name = "Xen PV", + .detect = xen_platform_pv, + .pin_vcpu = xen_pin_vcpu, +}; +EXPORT_SYMBOL(x86_hyper_xen_pv); diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c new file mode 100644 index 000000000000..98ab17673454 --- /dev/null +++ b/arch/x86/xen/enlighten_pvh.c @@ -0,0 +1,106 @@ +#include <linux/acpi.h> + +#include <xen/hvc-console.h> + +#include <asm/io_apic.h> +#include <asm/hypervisor.h> +#include <asm/e820/api.h> + +#include <asm/xen/interface.h> +#include <asm/xen/hypercall.h> + +#include <xen/interface/memory.h> +#include <xen/interface/hvm/start_info.h> + +/* + * PVH variables. + * + * xen_pvh and pvh_bootparams need to live in data segment since they + * are used after startup_{32|64}, which clear .bss, are invoked. + */ +bool xen_pvh __attribute__((section(".data"))) = 0; +struct boot_params pvh_bootparams __attribute__((section(".data"))); + +struct hvm_start_info pvh_start_info; +unsigned int pvh_start_info_sz = sizeof(pvh_start_info); + +static void xen_pvh_arch_setup(void) +{ + /* Make sure we don't fall back to (default) ACPI_IRQ_MODEL_PIC. */ + if (nr_ioapics == 0) + acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM; +} + +static void __init init_pvh_bootparams(void) +{ + struct xen_memory_map memmap; + int rc; + + memset(&pvh_bootparams, 0, sizeof(pvh_bootparams)); + + memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table); + set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table); + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); + if (rc) { + xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); + BUG(); + } + pvh_bootparams.e820_entries = memmap.nr_entries; + + if (pvh_bootparams.e820_entries < E820_MAX_ENTRIES_ZEROPAGE - 1) { + pvh_bootparams.e820_table[pvh_bootparams.e820_entries].addr = + ISA_START_ADDRESS; + pvh_bootparams.e820_table[pvh_bootparams.e820_entries].size = + ISA_END_ADDRESS - ISA_START_ADDRESS; + pvh_bootparams.e820_table[pvh_bootparams.e820_entries].type = + E820_TYPE_RESERVED; + pvh_bootparams.e820_entries++; + } else + xen_raw_printk("Warning: Can fit ISA range into e820\n"); + + pvh_bootparams.hdr.cmd_line_ptr = + pvh_start_info.cmdline_paddr; + + /* The first module is always ramdisk. */ + if (pvh_start_info.nr_modules) { + struct hvm_modlist_entry *modaddr = + __va(pvh_start_info.modlist_paddr); + pvh_bootparams.hdr.ramdisk_image = modaddr->paddr; + pvh_bootparams.hdr.ramdisk_size = modaddr->size; + } + + /* + * See Documentation/x86/boot.txt. + * + * Version 2.12 supports Xen entry point but we will use default x86/PC + * environment (i.e. hardware_subarch 0). + */ + pvh_bootparams.hdr.version = 0x212; + pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */ +} + +/* + * This routine (and those that it might call) should not use + * anything that lives in .bss since that segment will be cleared later. + */ +void __init xen_prepare_pvh(void) +{ + u32 msr; + u64 pfn; + + if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) { + xen_raw_printk("Error: Unexpected magic value (0x%08x)\n", + pvh_start_info.magic); + BUG(); + } + + xen_pvh = 1; + + msr = cpuid_ebx(xen_cpuid_base() + 2); + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + + init_pvh_bootparams(); + + x86_init.oem.arch_setup = xen_pvh_arch_setup; +} diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index f226038a39ca..5e375a5e815f 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1,84 +1,10 @@ -/* - * Xen mmu operations - * - * This file contains the various mmu fetch and update operations. - * The most important job they must perform is the mapping between the - * domain's pfn and the overall machine mfns. - * - * Xen allows guests to directly update the pagetable, in a controlled - * fashion. In other words, the guest modifies the same pagetable - * that the CPU actually uses, which eliminates the overhead of having - * a separate shadow pagetable. - * - * In order to allow this, it falls on the guest domain to map its - * notion of a "physical" pfn - which is just a domain-local linear - * address - into a real "machine address" which the CPU's MMU can - * use. - * - * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be - * inserted directly into the pagetable. When creating a new - * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely, - * when reading the content back with __(pgd|pmd|pte)_val, it converts - * the mfn back into a pfn. - * - * The other constraint is that all pages which make up a pagetable - * must be mapped read-only in the guest. This prevents uncontrolled - * guest updates to the pagetable. Xen strictly enforces this, and - * will disallow any pagetable update which will end up mapping a - * pagetable page RW, and will disallow using any writable page as a - * pagetable. - * - * Naively, when loading %cr3 with the base of a new pagetable, Xen - * would need to validate the whole pagetable before going on. - * Naturally, this is quite slow. The solution is to "pin" a - * pagetable, which enforces all the constraints on the pagetable even - * when it is not actively in use. This menas that Xen can be assured - * that it is still valid when you do load it into %cr3, and doesn't - * need to revalidate it. - * - * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 - */ -#include <linux/sched/mm.h> -#include <linux/highmem.h> -#include <linux/debugfs.h> -#include <linux/bug.h> -#include <linux/vmalloc.h> -#include <linux/export.h> -#include <linux/init.h> -#include <linux/gfp.h> -#include <linux/memblock.h> -#include <linux/seq_file.h> -#include <linux/crash_dump.h> - -#include <trace/events/xen.h> - -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <asm/fixmap.h> -#include <asm/mmu_context.h> -#include <asm/setup.h> -#include <asm/paravirt.h> -#include <asm/e820/api.h> -#include <asm/linkage.h> -#include <asm/page.h> -#include <asm/init.h> -#include <asm/pat.h> -#include <asm/smp.h> - +#include <linux/pfn.h> +#include <asm/xen/page.h> #include <asm/xen/hypercall.h> -#include <asm/xen/hypervisor.h> - -#include <xen/xen.h> -#include <xen/page.h> -#include <xen/interface/xen.h> -#include <xen/interface/hvm/hvm_op.h> -#include <xen/interface/version.h> #include <xen/interface/memory.h> -#include <xen/hvc-console.h> #include "multicalls.h" #include "mmu.h" -#include "debugfs.h" /* * Protects atomic reservation decrease/increase against concurrent increases. @@ -86,45 +12,6 @@ */ DEFINE_SPINLOCK(xen_reservation_lock); -#ifdef CONFIG_X86_32 -/* - * Identity map, in addition to plain kernel map. This needs to be - * large enough to allocate page table pages to allocate the rest. - * Each page can map 2MB. - */ -#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) -static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); -#endif -#ifdef CONFIG_X86_64 -/* l3 pud for userspace vsyscall mapping */ -static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; -#endif /* CONFIG_X86_64 */ - -/* - * Note about cr3 (pagetable base) values: - * - * xen_cr3 contains the current logical cr3 value; it contains the - * last set cr3. This may not be the current effective cr3, because - * its update may be being lazily deferred. However, a vcpu looking - * at its own cr3 can use this value knowing that it everything will - * be self-consistent. - * - * xen_current_cr3 contains the actual vcpu cr3; it is set once the - * hypercall to set the vcpu cr3 is complete (so it may be a little - * out of date, but it will never be set early). If one vcpu is - * looking at another vcpu's cr3 value, it should use this variable. - */ -DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ -DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ - -static phys_addr_t xen_pt_base, xen_pt_size __initdata; - -/* - * Just beyond the highest usermode address. STACK_TOP_MAX has a - * redzone above it, so round it up to a PGD boundary. - */ -#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) - unsigned long arbitrary_virt_to_mfn(void *vaddr) { xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); @@ -155,1218 +42,6 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr) } EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine); -void make_lowmem_page_readonly(void *vaddr) -{ - pte_t *pte, ptev; - unsigned long address = (unsigned long)vaddr; - unsigned int level; - - pte = lookup_address(address, &level); - if (pte == NULL) - return; /* vaddr missing */ - - ptev = pte_wrprotect(*pte); - - if (HYPERVISOR_update_va_mapping(address, ptev, 0)) - BUG(); -} - -void make_lowmem_page_readwrite(void *vaddr) -{ - pte_t *pte, ptev; - unsigned long address = (unsigned long)vaddr; - unsigned int level; - - pte = lookup_address(address, &level); - if (pte == NULL) - return; /* vaddr missing */ - - ptev = pte_mkwrite(*pte); - - if (HYPERVISOR_update_va_mapping(address, ptev, 0)) - BUG(); -} - - -static bool xen_page_pinned(void *ptr) -{ - struct page *page = virt_to_page(ptr); - - return PagePinned(page); -} - -void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) -{ - struct multicall_space mcs; - struct mmu_update *u; - - trace_xen_mmu_set_domain_pte(ptep, pteval, domid); - - mcs = xen_mc_entry(sizeof(*u)); - u = mcs.args; - - /* ptep might be kmapped when using 32-bit HIGHPTE */ - u->ptr = virt_to_machine(ptep).maddr; - u->val = pte_val_ma(pteval); - - MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); - - xen_mc_issue(PARAVIRT_LAZY_MMU); -} -EXPORT_SYMBOL_GPL(xen_set_domain_pte); - -static void xen_extend_mmu_update(const struct mmu_update *update) -{ - struct multicall_space mcs; - struct mmu_update *u; - - mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); - - if (mcs.mc != NULL) { - mcs.mc->args[1]++; - } else { - mcs = __xen_mc_entry(sizeof(*u)); - MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); - } - - u = mcs.args; - *u = *update; -} - -static void xen_extend_mmuext_op(const struct mmuext_op *op) -{ - struct multicall_space mcs; - struct mmuext_op *u; - - mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u)); - - if (mcs.mc != NULL) { - mcs.mc->args[1]++; - } else { - mcs = __xen_mc_entry(sizeof(*u)); - MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); - } - - u = mcs.args; - *u = *op; -} - -static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) -{ - struct mmu_update u; - - preempt_disable(); - - xen_mc_batch(); - - /* ptr may be ioremapped for 64-bit pagetable setup */ - u.ptr = arbitrary_virt_to_machine(ptr).maddr; - u.val = pmd_val_ma(val); - xen_extend_mmu_update(&u); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - - preempt_enable(); -} - -static void xen_set_pmd(pmd_t *ptr, pmd_t val) -{ - trace_xen_mmu_set_pmd(ptr, val); - - /* If page is not pinned, we can just update the entry - directly */ - if (!xen_page_pinned(ptr)) { - *ptr = val; - return; - } - - xen_set_pmd_hyper(ptr, val); -} - -/* - * Associate a virtual page frame with a given physical page frame - * and protection flags for that frame. - */ -void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) -{ - set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); -} - -static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) -{ - struct mmu_update u; - - if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) - return false; - - xen_mc_batch(); - - u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; - u.val = pte_val_ma(pteval); - xen_extend_mmu_update(&u); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - - return true; -} - -static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) -{ - if (!xen_batched_set_pte(ptep, pteval)) { - /* - * Could call native_set_pte() here and trap and - * emulate the PTE write but with 32-bit guests this - * needs two traps (one for each of the two 32-bit - * words in the PTE) so do one hypercall directly - * instead. - */ - struct mmu_update u; - - u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; - u.val = pte_val_ma(pteval); - HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF); - } -} - -static void xen_set_pte(pte_t *ptep, pte_t pteval) -{ - trace_xen_mmu_set_pte(ptep, pteval); - __xen_set_pte(ptep, pteval); -} - -static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) -{ - trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval); - __xen_set_pte(ptep, pteval); -} - -pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - /* Just return the pte as-is. We preserve the bits on commit */ - trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep); - return *ptep; -} - -void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - struct mmu_update u; - - trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte); - xen_mc_batch(); - - u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; - u.val = pte_val_ma(pte); - xen_extend_mmu_update(&u); - - xen_mc_issue(PARAVIRT_LAZY_MMU); -} - -/* Assume pteval_t is equivalent to all the other *val_t types. */ -static pteval_t pte_mfn_to_pfn(pteval_t val) -{ - if (val & _PAGE_PRESENT) { - unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; - unsigned long pfn = mfn_to_pfn(mfn); - - pteval_t flags = val & PTE_FLAGS_MASK; - if (unlikely(pfn == ~0)) - val = flags & ~_PAGE_PRESENT; - else - val = ((pteval_t)pfn << PAGE_SHIFT) | flags; - } - - return val; -} - -static pteval_t pte_pfn_to_mfn(pteval_t val) -{ - if (val & _PAGE_PRESENT) { - unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; - pteval_t flags = val & PTE_FLAGS_MASK; - unsigned long mfn; - - if (!xen_feature(XENFEAT_auto_translated_physmap)) - mfn = __pfn_to_mfn(pfn); - else - mfn = pfn; - /* - * If there's no mfn for the pfn, then just create an - * empty non-present pte. Unfortunately this loses - * information about the original pfn, so - * pte_mfn_to_pfn is asymmetric. - */ - if (unlikely(mfn == INVALID_P2M_ENTRY)) { - mfn = 0; - flags = 0; - } else - mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); - val = ((pteval_t)mfn << PAGE_SHIFT) | flags; - } - - return val; -} - -__visible pteval_t xen_pte_val(pte_t pte) -{ - pteval_t pteval = pte.pte; - - return pte_mfn_to_pfn(pteval); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); - -__visible pgdval_t xen_pgd_val(pgd_t pgd) -{ - return pte_mfn_to_pfn(pgd.pgd); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); - -__visible pte_t xen_make_pte(pteval_t pte) -{ - pte = pte_pfn_to_mfn(pte); - - return native_make_pte(pte); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); - -__visible pgd_t xen_make_pgd(pgdval_t pgd) -{ - pgd = pte_pfn_to_mfn(pgd); - return native_make_pgd(pgd); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); - -__visible pmdval_t xen_pmd_val(pmd_t pmd) -{ - return pte_mfn_to_pfn(pmd.pmd); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); - -static void xen_set_pud_hyper(pud_t *ptr, pud_t val) -{ - struct mmu_update u; - - preempt_disable(); - - xen_mc_batch(); - - /* ptr may be ioremapped for 64-bit pagetable setup */ - u.ptr = arbitrary_virt_to_machine(ptr).maddr; - u.val = pud_val_ma(val); - xen_extend_mmu_update(&u); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - - preempt_enable(); -} - -static void xen_set_pud(pud_t *ptr, pud_t val) -{ - trace_xen_mmu_set_pud(ptr, val); - - /* If page is not pinned, we can just update the entry - directly */ - if (!xen_page_pinned(ptr)) { - *ptr = val; - return; - } - - xen_set_pud_hyper(ptr, val); -} - -#ifdef CONFIG_X86_PAE -static void xen_set_pte_atomic(pte_t *ptep, pte_t pte) -{ - trace_xen_mmu_set_pte_atomic(ptep, pte); - set_64bit((u64 *)ptep, native_pte_val(pte)); -} - -static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - trace_xen_mmu_pte_clear(mm, addr, ptep); - if (!xen_batched_set_pte(ptep, native_make_pte(0))) - native_pte_clear(mm, addr, ptep); -} - -static void xen_pmd_clear(pmd_t *pmdp) -{ - trace_xen_mmu_pmd_clear(pmdp); - set_pmd(pmdp, __pmd(0)); -} -#endif /* CONFIG_X86_PAE */ - -__visible pmd_t xen_make_pmd(pmdval_t pmd) -{ - pmd = pte_pfn_to_mfn(pmd); - return native_make_pmd(pmd); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); - -#if CONFIG_PGTABLE_LEVELS == 4 -__visible pudval_t xen_pud_val(pud_t pud) -{ - return pte_mfn_to_pfn(pud.pud); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); - -__visible pud_t xen_make_pud(pudval_t pud) -{ - pud = pte_pfn_to_mfn(pud); - - return native_make_pud(pud); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); - -static pgd_t *xen_get_user_pgd(pgd_t *pgd) -{ - pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK); - unsigned offset = pgd - pgd_page; - pgd_t *user_ptr = NULL; - - if (offset < pgd_index(USER_LIMIT)) { - struct page *page = virt_to_page(pgd_page); - user_ptr = (pgd_t *)page->private; - if (user_ptr) - user_ptr += offset; - } - - return user_ptr; -} - -static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val) -{ - struct mmu_update u; - - u.ptr = virt_to_machine(ptr).maddr; - u.val = p4d_val_ma(val); - xen_extend_mmu_update(&u); -} - -/* - * Raw hypercall-based set_p4d, intended for in early boot before - * there's a page structure. This implies: - * 1. The only existing pagetable is the kernel's - * 2. It is always pinned - * 3. It has no user pagetable attached to it - */ -static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val) -{ - preempt_disable(); - - xen_mc_batch(); - - __xen_set_p4d_hyper(ptr, val); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - - preempt_enable(); -} - -static void xen_set_p4d(p4d_t *ptr, p4d_t val) -{ - pgd_t *user_ptr = xen_get_user_pgd((pgd_t *)ptr); - pgd_t pgd_val; - - trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val); - - /* If page is not pinned, we can just update the entry - directly */ - if (!xen_page_pinned(ptr)) { - *ptr = val; - if (user_ptr) { - WARN_ON(xen_page_pinned(user_ptr)); - pgd_val.pgd = p4d_val_ma(val); - *user_ptr = pgd_val; - } - return; - } - - /* If it's pinned, then we can at least batch the kernel and - user updates together. */ - xen_mc_batch(); - - __xen_set_p4d_hyper(ptr, val); - if (user_ptr) - __xen_set_p4d_hyper((p4d_t *)user_ptr, val); - - xen_mc_issue(PARAVIRT_LAZY_MMU); -} -#endif /* CONFIG_PGTABLE_LEVELS == 4 */ - -static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd, - int (*func)(struct mm_struct *mm, struct page *, enum pt_level), - bool last, unsigned long limit) -{ - int i, nr, flush = 0; - - nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD; - for (i = 0; i < nr; i++) { - if (!pmd_none(pmd[i])) - flush |= (*func)(mm, pmd_page(pmd[i]), PT_PTE); - } - return flush; -} - -static int xen_pud_walk(struct mm_struct *mm, pud_t *pud, - int (*func)(struct mm_struct *mm, struct page *, enum pt_level), - bool last, unsigned long limit) -{ - int i, nr, flush = 0; - - nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD; - for (i = 0; i < nr; i++) { - pmd_t *pmd; - - if (pud_none(pud[i])) - continue; - - pmd = pmd_offset(&pud[i], 0); - if (PTRS_PER_PMD > 1) - flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); - flush |= xen_pmd_walk(mm, pmd, func, - last && i == nr - 1, limit); - } - return flush; -} - -static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d, - int (*func)(struct mm_struct *mm, struct page *, enum pt_level), - bool last, unsigned long limit) -{ - int i, nr, flush = 0; - - nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D; - for (i = 0; i < nr; i++) { - pud_t *pud; - - if (p4d_none(p4d[i])) - continue; - - pud = pud_offset(&p4d[i], 0); - if (PTRS_PER_PUD > 1) - flush |= (*func)(mm, virt_to_page(pud), PT_PUD); - flush |= xen_pud_walk(mm, pud, func, - last && i == nr - 1, limit); - } - return flush; -} - -/* - * (Yet another) pagetable walker. This one is intended for pinning a - * pagetable. This means that it walks a pagetable and calls the - * callback function on each page it finds making up the page table, - * at every level. It walks the entire pagetable, but it only bothers - * pinning pte pages which are below limit. In the normal case this - * will be STACK_TOP_MAX, but at boot we need to pin up to - * FIXADDR_TOP. - * - * For 32-bit the important bit is that we don't pin beyond there, - * because then we start getting into Xen's ptes. - * - * For 64-bit, we must skip the Xen hole in the middle of the address - * space, just after the big x86-64 virtual hole. - */ -static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, - int (*func)(struct mm_struct *mm, struct page *, - enum pt_level), - unsigned long limit) -{ - int i, nr, flush = 0; - unsigned hole_low, hole_high; - - /* The limit is the last byte to be touched */ - limit--; - BUG_ON(limit >= FIXADDR_TOP); - - if (xen_feature(XENFEAT_auto_translated_physmap)) - return 0; - - /* - * 64-bit has a great big hole in the middle of the address - * space, which contains the Xen mappings. On 32-bit these - * will end up making a zero-sized hole and so is a no-op. - */ - hole_low = pgd_index(USER_LIMIT); - hole_high = pgd_index(PAGE_OFFSET); - - nr = pgd_index(limit) + 1; - for (i = 0; i < nr; i++) { - p4d_t *p4d; - - if (i >= hole_low && i < hole_high) - continue; - - if (pgd_none(pgd[i])) - continue; - - p4d = p4d_offset(&pgd[i], 0); - if (PTRS_PER_P4D > 1) - flush |= (*func)(mm, virt_to_page(p4d), PT_P4D); - flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit); - } - - /* Do the top level last, so that the callbacks can use it as - a cue to do final things like tlb flushes. */ - flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); - - return flush; -} - -static int xen_pgd_walk(struct mm_struct *mm, - int (*func)(struct mm_struct *mm, struct page *, - enum pt_level), - unsigned long limit) -{ - return __xen_pgd_walk(mm, mm->pgd, func, limit); -} - -/* If we're using split pte locks, then take the page's lock and - return a pointer to it. Otherwise return NULL. */ -static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) -{ - spinlock_t *ptl = NULL; - -#if USE_SPLIT_PTE_PTLOCKS - ptl = ptlock_ptr(page); - spin_lock_nest_lock(ptl, &mm->page_table_lock); -#endif - - return ptl; -} - -static void xen_pte_unlock(void *v) -{ - spinlock_t *ptl = v; - spin_unlock(ptl); -} - -static void xen_do_pin(unsigned level, unsigned long pfn) -{ - struct mmuext_op op; - - op.cmd = level; - op.arg1.mfn = pfn_to_mfn(pfn); - - xen_extend_mmuext_op(&op); -} - -static int xen_pin_page(struct mm_struct *mm, struct page *page, - enum pt_level level) -{ - unsigned pgfl = TestSetPagePinned(page); - int flush; - - if (pgfl) - flush = 0; /* already pinned */ - else if (PageHighMem(page)) - /* kmaps need flushing if we found an unpinned - highpage */ - flush = 1; - else { - void *pt = lowmem_page_address(page); - unsigned long pfn = page_to_pfn(page); - struct multicall_space mcs = __xen_mc_entry(0); - spinlock_t *ptl; - - flush = 0; - - /* - * We need to hold the pagetable lock between the time - * we make the pagetable RO and when we actually pin - * it. If we don't, then other users may come in and - * attempt to update the pagetable by writing it, - * which will fail because the memory is RO but not - * pinned, so Xen won't do the trap'n'emulate. - * - * If we're using split pte locks, we can't hold the - * entire pagetable's worth of locks during the - * traverse, because we may wrap the preempt count (8 - * bits). The solution is to mark RO and pin each PTE - * page while holding the lock. This means the number - * of locks we end up holding is never more than a - * batch size (~32 entries, at present). - * - * If we're not using split pte locks, we needn't pin - * the PTE pages independently, because we're - * protected by the overall pagetable lock. - */ - ptl = NULL; - if (level == PT_PTE) - ptl = xen_pte_lock(page, mm); - - MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, - pfn_pte(pfn, PAGE_KERNEL_RO), - level == PT_PGD ? UVMF_TLB_FLUSH : 0); - - if (ptl) { - xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); - - /* Queue a deferred unlock for when this batch - is completed. */ - xen_mc_callback(xen_pte_unlock, ptl); - } - } - - return flush; -} - -/* This is called just after a mm has been created, but it has not - been used yet. We need to make sure that its pagetable is all - read-only, and can be pinned. */ -static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) -{ - trace_xen_mmu_pgd_pin(mm, pgd); - - xen_mc_batch(); - - if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { - /* re-enable interrupts for flushing */ - xen_mc_issue(0); - - kmap_flush_unused(); - - xen_mc_batch(); - } - -#ifdef CONFIG_X86_64 - { - pgd_t *user_pgd = xen_get_user_pgd(pgd); - - xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); - - if (user_pgd) { - xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); - xen_do_pin(MMUEXT_PIN_L4_TABLE, - PFN_DOWN(__pa(user_pgd))); - } - } -#else /* CONFIG_X86_32 */ -#ifdef CONFIG_X86_PAE - /* Need to make sure unshared kernel PMD is pinnable */ - xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), - PT_PMD); -#endif - xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); -#endif /* CONFIG_X86_64 */ - xen_mc_issue(0); -} - -static void xen_pgd_pin(struct mm_struct *mm) -{ - __xen_pgd_pin(mm, mm->pgd); -} - -/* - * On save, we need to pin all pagetables to make sure they get their - * mfns turned into pfns. Search the list for any unpinned pgds and pin - * them (unpinned pgds are not currently in use, probably because the - * process is under construction or destruction). - * - * Expected to be called in stop_machine() ("equivalent to taking - * every spinlock in the system"), so the locking doesn't really - * matter all that much. - */ -void xen_mm_pin_all(void) -{ - struct page *page; - - spin_lock(&pgd_lock); - - list_for_each_entry(page, &pgd_list, lru) { - if (!PagePinned(page)) { - __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page)); - SetPageSavePinned(page); - } - } - - spin_unlock(&pgd_lock); -} - -/* - * The init_mm pagetable is really pinned as soon as its created, but - * that's before we have page structures to store the bits. So do all - * the book-keeping now. - */ -static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page, - enum pt_level level) -{ - SetPagePinned(page); - return 0; -} - -static void __init xen_mark_init_mm_pinned(void) -{ - xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); -} - -static int xen_unpin_page(struct mm_struct *mm, struct page *page, - enum pt_level level) -{ - unsigned pgfl = TestClearPagePinned(page); - - if (pgfl && !PageHighMem(page)) { - void *pt = lowmem_page_address(page); - unsigned long pfn = page_to_pfn(page); - spinlock_t *ptl = NULL; - struct multicall_space mcs; - - /* - * Do the converse to pin_page. If we're using split - * pte locks, we must be holding the lock for while - * the pte page is unpinned but still RO to prevent - * concurrent updates from seeing it in this - * partially-pinned state. - */ - if (level == PT_PTE) { - ptl = xen_pte_lock(page, mm); - - if (ptl) - xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); - } - - mcs = __xen_mc_entry(0); - - MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, - pfn_pte(pfn, PAGE_KERNEL), - level == PT_PGD ? UVMF_TLB_FLUSH : 0); - - if (ptl) { - /* unlock when batch completed */ - xen_mc_callback(xen_pte_unlock, ptl); - } - } - - return 0; /* never need to flush on unpin */ -} - -/* Release a pagetables pages back as normal RW */ -static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) -{ - trace_xen_mmu_pgd_unpin(mm, pgd); - - xen_mc_batch(); - - xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - -#ifdef CONFIG_X86_64 - { - pgd_t *user_pgd = xen_get_user_pgd(pgd); - - if (user_pgd) { - xen_do_pin(MMUEXT_UNPIN_TABLE, - PFN_DOWN(__pa(user_pgd))); - xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); - } - } -#endif - -#ifdef CONFIG_X86_PAE - /* Need to make sure unshared kernel PMD is unpinned */ - xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), - PT_PMD); -#endif - - __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT); - - xen_mc_issue(0); -} - -static void xen_pgd_unpin(struct mm_struct *mm) -{ - __xen_pgd_unpin(mm, mm->pgd); -} - -/* - * On resume, undo any pinning done at save, so that the rest of the - * kernel doesn't see any unexpected pinned pagetables. - */ -void xen_mm_unpin_all(void) -{ - struct page *page; - - spin_lock(&pgd_lock); - - list_for_each_entry(page, &pgd_list, lru) { - if (PageSavePinned(page)) { - BUG_ON(!PagePinned(page)); - __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page)); - ClearPageSavePinned(page); - } - } - - spin_unlock(&pgd_lock); -} - -static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) -{ - spin_lock(&next->page_table_lock); - xen_pgd_pin(next); - spin_unlock(&next->page_table_lock); -} - -static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) -{ - spin_lock(&mm->page_table_lock); - xen_pgd_pin(mm); - spin_unlock(&mm->page_table_lock); -} - - -#ifdef CONFIG_SMP -/* Another cpu may still have their %cr3 pointing at the pagetable, so - we need to repoint it somewhere else before we can unpin it. */ -static void drop_other_mm_ref(void *info) -{ - struct mm_struct *mm = info; - struct mm_struct *active_mm; - - active_mm = this_cpu_read(cpu_tlbstate.active_mm); - - if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) - leave_mm(smp_processor_id()); - - /* If this cpu still has a stale cr3 reference, then make sure - it has been flushed. */ - if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd)) - load_cr3(swapper_pg_dir); -} - -static void xen_drop_mm_ref(struct mm_struct *mm) -{ - cpumask_var_t mask; - unsigned cpu; - - if (current->active_mm == mm) { - if (current->mm == mm) - load_cr3(swapper_pg_dir); - else - leave_mm(smp_processor_id()); - } - - /* Get the "official" set of cpus referring to our pagetable. */ - if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { - for_each_online_cpu(cpu) { - if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) - && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) - continue; - smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); - } - return; - } - cpumask_copy(mask, mm_cpumask(mm)); - - /* It's possible that a vcpu may have a stale reference to our - cr3, because its in lazy mode, and it hasn't yet flushed - its set of pending hypercalls yet. In this case, we can - look at its actual current cr3 value, and force it to flush - if needed. */ - for_each_online_cpu(cpu) { - if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) - cpumask_set_cpu(cpu, mask); - } - - if (!cpumask_empty(mask)) - smp_call_function_many(mask, drop_other_mm_ref, mm, 1); - free_cpumask_var(mask); -} -#else -static void xen_drop_mm_ref(struct mm_struct *mm) -{ - if (current->active_mm == mm) - load_cr3(swapper_pg_dir); -} -#endif - -/* - * While a process runs, Xen pins its pagetables, which means that the - * hypervisor forces it to be read-only, and it controls all updates - * to it. This means that all pagetable updates have to go via the - * hypervisor, which is moderately expensive. - * - * Since we're pulling the pagetable down, we switch to use init_mm, - * unpin old process pagetable and mark it all read-write, which - * allows further operations on it to be simple memory accesses. - * - * The only subtle point is that another CPU may be still using the - * pagetable because of lazy tlb flushing. This means we need need to - * switch all CPUs off this pagetable before we can unpin it. - */ -static void xen_exit_mmap(struct mm_struct *mm) -{ - get_cpu(); /* make sure we don't move around */ - xen_drop_mm_ref(mm); - put_cpu(); - - spin_lock(&mm->page_table_lock); - - /* pgd may not be pinned in the error exit path of execve */ - if (xen_page_pinned(mm->pgd)) - xen_pgd_unpin(mm); - - spin_unlock(&mm->page_table_lock); -} - -static void xen_post_allocator_init(void); - -static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn) -{ - struct mmuext_op op; - - op.cmd = cmd; - op.arg1.mfn = pfn_to_mfn(pfn); - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) - BUG(); -} - -#ifdef CONFIG_X86_64 -static void __init xen_cleanhighmap(unsigned long vaddr, - unsigned long vaddr_end) -{ - unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; - pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr); - - /* NOTE: The loop is more greedy than the cleanup_highmap variant. - * We include the PMD passed in on _both_ boundaries. */ - for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PTRS_PER_PMD)); - pmd++, vaddr += PMD_SIZE) { - if (pmd_none(*pmd)) - continue; - if (vaddr < (unsigned long) _text || vaddr > kernel_end) - set_pmd(pmd, __pmd(0)); - } - /* In case we did something silly, we should crash in this function - * instead of somewhere later and be confusing. */ - xen_mc_flush(); -} - -/* - * Make a page range writeable and free it. - */ -static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size) -{ - void *vaddr = __va(paddr); - void *vaddr_end = vaddr + size; - - for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) - make_lowmem_page_readwrite(vaddr); - - memblock_free(paddr, size); -} - -static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin) -{ - unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK; - - if (unpin) - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa)); - ClearPagePinned(virt_to_page(__va(pa))); - xen_free_ro_pages(pa, PAGE_SIZE); -} - -static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin) -{ - unsigned long pa; - pte_t *pte_tbl; - int i; - - if (pmd_large(*pmd)) { - pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; - xen_free_ro_pages(pa, PMD_SIZE); - return; - } - - pte_tbl = pte_offset_kernel(pmd, 0); - for (i = 0; i < PTRS_PER_PTE; i++) { - if (pte_none(pte_tbl[i])) - continue; - pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT; - xen_free_ro_pages(pa, PAGE_SIZE); - } - set_pmd(pmd, __pmd(0)); - xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin); -} - -static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin) -{ - unsigned long pa; - pmd_t *pmd_tbl; - int i; - - if (pud_large(*pud)) { - pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; - xen_free_ro_pages(pa, PUD_SIZE); - return; - } - - pmd_tbl = pmd_offset(pud, 0); - for (i = 0; i < PTRS_PER_PMD; i++) { - if (pmd_none(pmd_tbl[i])) - continue; - xen_cleanmfnmap_pmd(pmd_tbl + i, unpin); - } - set_pud(pud, __pud(0)); - xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin); -} - -static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin) -{ - unsigned long pa; - pud_t *pud_tbl; - int i; - - if (p4d_large(*p4d)) { - pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK; - xen_free_ro_pages(pa, P4D_SIZE); - return; - } - - pud_tbl = pud_offset(p4d, 0); - for (i = 0; i < PTRS_PER_PUD; i++) { - if (pud_none(pud_tbl[i])) - continue; - xen_cleanmfnmap_pud(pud_tbl + i, unpin); - } - set_p4d(p4d, __p4d(0)); - xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin); -} - -/* - * Since it is well isolated we can (and since it is perhaps large we should) - * also free the page tables mapping the initial P->M table. - */ -static void __init xen_cleanmfnmap(unsigned long vaddr) -{ - pgd_t *pgd; - p4d_t *p4d; - unsigned int i; - bool unpin; - - unpin = (vaddr == 2 * PGDIR_SIZE); - vaddr &= PMD_MASK; - pgd = pgd_offset_k(vaddr); - p4d = p4d_offset(pgd, 0); - for (i = 0; i < PTRS_PER_P4D; i++) { - if (p4d_none(p4d[i])) - continue; - xen_cleanmfnmap_p4d(p4d + i, unpin); - } - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - set_pgd(pgd, __pgd(0)); - xen_cleanmfnmap_free_pgtbl(p4d, unpin); - } -} - -static void __init xen_pagetable_p2m_free(void) -{ - unsigned long size; - unsigned long addr; - - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - - /* No memory or already called. */ - if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list) - return; - - /* using __ka address and sticking INVALID_P2M_ENTRY! */ - memset((void *)xen_start_info->mfn_list, 0xff, size); - - addr = xen_start_info->mfn_list; - /* - * We could be in __ka space. - * We roundup to the PMD, which means that if anybody at this stage is - * using the __ka address of xen_start_info or - * xen_start_info->shared_info they are in going to crash. Fortunatly - * we have already revectored in xen_setup_kernel_pagetable and in - * xen_setup_shared_info. - */ - size = roundup(size, PMD_SIZE); - - if (addr >= __START_KERNEL_map) { - xen_cleanhighmap(addr, addr + size); - size = PAGE_ALIGN(xen_start_info->nr_pages * - sizeof(unsigned long)); - memblock_free(__pa(addr), size); - } else { - xen_cleanmfnmap(addr); - } -} - -static void __init xen_pagetable_cleanhighmap(void) -{ - unsigned long size; - unsigned long addr; - - /* At this stage, cleanup_highmap has already cleaned __ka space - * from _brk_limit way up to the max_pfn_mapped (which is the end of - * the ramdisk). We continue on, erasing PMD entries that point to page - * tables - do note that they are accessible at this stage via __va. - * For good measure we also round up to the PMD - which means that if - * anybody is using __ka address to the initial boot-stack - and try - * to use it - they are going to crash. The xen_start_info has been - * taken care of already in xen_setup_kernel_pagetable. */ - addr = xen_start_info->pt_base; - size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE); - - xen_cleanhighmap(addr, addr + size); - xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); -#ifdef DEBUG - /* This is superfluous and is not necessary, but you know what - * lets do it. The MODULES_VADDR -> MODULES_END should be clear of - * anything at this stage. */ - xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); -#endif -} -#endif - -static void __init xen_pagetable_p2m_setup(void) -{ - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - - xen_vmalloc_p2m_tree(); - -#ifdef CONFIG_X86_64 - xen_pagetable_p2m_free(); - - xen_pagetable_cleanhighmap(); -#endif - /* And revector! Bye bye old array */ - xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; -} - -static void __init xen_pagetable_init(void) -{ - paging_init(); - xen_post_allocator_init(); - - xen_pagetable_p2m_setup(); - - /* Allocate and initialize top and mid mfn levels for p2m structure */ - xen_build_mfn_list_list(); - - /* Remap memory freed due to conflicts with E820 map */ - if (!xen_feature(XENFEAT_auto_translated_physmap)) - xen_remap_memory(); - - xen_setup_shared_info(); -} -static void xen_write_cr2(unsigned long cr2) -{ - this_cpu_read(xen_vcpu)->arch.cr2 = cr2; -} - -static unsigned long xen_read_cr2(void) -{ - return this_cpu_read(xen_vcpu)->arch.cr2; -} - -unsigned long xen_read_cr2_direct(void) -{ - return this_cpu_read(xen_vcpu_info.arch.cr2); -} - void xen_flush_tlb_all(void) { struct mmuext_op *op; @@ -1386,1464 +61,6 @@ void xen_flush_tlb_all(void) preempt_enable(); } -static void xen_flush_tlb(void) -{ - struct mmuext_op *op; - struct multicall_space mcs; - - trace_xen_mmu_flush_tlb(0); - - preempt_disable(); - - mcs = xen_mc_entry(sizeof(*op)); - - op = mcs.args; - op->cmd = MMUEXT_TLB_FLUSH_LOCAL; - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - - preempt_enable(); -} - -static void xen_flush_tlb_single(unsigned long addr) -{ - struct mmuext_op *op; - struct multicall_space mcs; - - trace_xen_mmu_flush_tlb_single(addr); - - preempt_disable(); - - mcs = xen_mc_entry(sizeof(*op)); - op = mcs.args; - op->cmd = MMUEXT_INVLPG_LOCAL; - op->arg1.linear_addr = addr & PAGE_MASK; - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - - preempt_enable(); -} - -static void xen_flush_tlb_others(const struct cpumask *cpus, - struct mm_struct *mm, unsigned long start, - unsigned long end) -{ - struct { - struct mmuext_op op; -#ifdef CONFIG_SMP - DECLARE_BITMAP(mask, num_processors); -#else - DECLARE_BITMAP(mask, NR_CPUS); -#endif - } *args; - struct multicall_space mcs; - - trace_xen_mmu_flush_tlb_others(cpus, mm, start, end); - - if (cpumask_empty(cpus)) - return; /* nothing to do */ - - mcs = xen_mc_entry(sizeof(*args)); - args = mcs.args; - args->op.arg2.vcpumask = to_cpumask(args->mask); - - /* Remove us, and any offline CPUS. */ - cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); - - args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; - if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { - args->op.cmd = MMUEXT_INVLPG_MULTI; - args->op.arg1.linear_addr = start; - } - - MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_MMU); -} - -static unsigned long xen_read_cr3(void) -{ - return this_cpu_read(xen_cr3); -} - -static void set_current_cr3(void *v) -{ - this_cpu_write(xen_current_cr3, (unsigned long)v); -} - -static void __xen_write_cr3(bool kernel, unsigned long cr3) -{ - struct mmuext_op op; - unsigned long mfn; - - trace_xen_mmu_write_cr3(kernel, cr3); - - if (cr3) - mfn = pfn_to_mfn(PFN_DOWN(cr3)); - else - mfn = 0; - - WARN_ON(mfn == 0 && kernel); - - op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; - op.arg1.mfn = mfn; - - xen_extend_mmuext_op(&op); - - if (kernel) { - this_cpu_write(xen_cr3, cr3); - - /* Update xen_current_cr3 once the batch has actually - been submitted. */ - xen_mc_callback(set_current_cr3, (void *)cr3); - } -} -static void xen_write_cr3(unsigned long cr3) -{ - BUG_ON(preemptible()); - - xen_mc_batch(); /* disables interrupts */ - - /* Update while interrupts are disabled, so its atomic with - respect to ipis */ - this_cpu_write(xen_cr3, cr3); - - __xen_write_cr3(true, cr3); - -#ifdef CONFIG_X86_64 - { - pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); - if (user_pgd) - __xen_write_cr3(false, __pa(user_pgd)); - else - __xen_write_cr3(false, 0); - } -#endif - - xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ -} - -#ifdef CONFIG_X86_64 -/* - * At the start of the day - when Xen launches a guest, it has already - * built pagetables for the guest. We diligently look over them - * in xen_setup_kernel_pagetable and graft as appropriate them in the - * init_level4_pgt and its friends. Then when we are happy we load - * the new init_level4_pgt - and continue on. - * - * The generic code starts (start_kernel) and 'init_mem_mapping' sets - * up the rest of the pagetables. When it has completed it loads the cr3. - * N.B. that baremetal would start at 'start_kernel' (and the early - * #PF handler would create bootstrap pagetables) - so we are running - * with the same assumptions as what to do when write_cr3 is executed - * at this point. - * - * Since there are no user-page tables at all, we have two variants - * of xen_write_cr3 - the early bootup (this one), and the late one - * (xen_write_cr3). The reason we have to do that is that in 64-bit - * the Linux kernel and user-space are both in ring 3 while the - * hypervisor is in ring 0. - */ -static void __init xen_write_cr3_init(unsigned long cr3) -{ - BUG_ON(preemptible()); - - xen_mc_batch(); /* disables interrupts */ - - /* Update while interrupts are disabled, so its atomic with - respect to ipis */ - this_cpu_write(xen_cr3, cr3); - - __xen_write_cr3(true, cr3); - - xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ -} -#endif - -static int xen_pgd_alloc(struct mm_struct *mm) -{ - pgd_t *pgd = mm->pgd; - int ret = 0; - - BUG_ON(PagePinned(virt_to_page(pgd))); - -#ifdef CONFIG_X86_64 - { - struct page *page = virt_to_page(pgd); - pgd_t *user_pgd; - - BUG_ON(page->private != 0); - - ret = -ENOMEM; - - user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - page->private = (unsigned long)user_pgd; - - if (user_pgd != NULL) { -#ifdef CONFIG_X86_VSYSCALL_EMULATION - user_pgd[pgd_index(VSYSCALL_ADDR)] = - __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); -#endif - ret = 0; - } - - BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); - } -#endif - return ret; -} - -static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ -#ifdef CONFIG_X86_64 - pgd_t *user_pgd = xen_get_user_pgd(pgd); - - if (user_pgd) - free_page((unsigned long)user_pgd); -#endif -} - -/* - * Init-time set_pte while constructing initial pagetables, which - * doesn't allow RO page table pages to be remapped RW. - * - * If there is no MFN for this PFN then this page is initially - * ballooned out so clear the PTE (as in decrease_reservation() in - * drivers/xen/balloon.c). - * - * Many of these PTE updates are done on unpinned and writable pages - * and doing a hypercall for these is unnecessary and expensive. At - * this point it is not possible to tell if a page is pinned or not, - * so always write the PTE directly and rely on Xen trapping and - * emulating any updates as necessary. - */ -__visible pte_t xen_make_pte_init(pteval_t pte) -{ -#ifdef CONFIG_X86_64 - unsigned long pfn; - - /* - * Pages belonging to the initial p2m list mapped outside the default - * address range must be mapped read-only. This region contains the - * page tables for mapping the p2m list, too, and page tables MUST be - * mapped read-only. - */ - pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT; - if (xen_start_info->mfn_list < __START_KERNEL_map && - pfn >= xen_start_info->first_p2m_pfn && - pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames) - pte &= ~_PAGE_RW; -#endif - pte = pte_pfn_to_mfn(pte); - return native_make_pte(pte); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init); - -static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) -{ -#ifdef CONFIG_X86_32 - /* If there's an existing pte, then don't allow _PAGE_RW to be set */ - if (pte_mfn(pte) != INVALID_P2M_ENTRY - && pte_val_ma(*ptep) & _PAGE_PRESENT) - pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & - pte_val_ma(pte)); -#endif - native_set_pte(ptep, pte); -} - -/* Early in boot, while setting up the initial pagetable, assume - everything is pinned. */ -static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) -{ -#ifdef CONFIG_FLATMEM - BUG_ON(mem_map); /* should only be used early */ -#endif - make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); - pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); -} - -/* Used for pmd and pud */ -static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) -{ -#ifdef CONFIG_FLATMEM - BUG_ON(mem_map); /* should only be used early */ -#endif - make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); -} - -/* Early release_pte assumes that all pts are pinned, since there's - only init_mm and anything attached to that is pinned. */ -static void __init xen_release_pte_init(unsigned long pfn) -{ - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); - make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); -} - -static void __init xen_release_pmd_init(unsigned long pfn) -{ - make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); -} - -static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn) -{ - struct multicall_space mcs; - struct mmuext_op *op; - - mcs = __xen_mc_entry(sizeof(*op)); - op = mcs.args; - op->cmd = cmd; - op->arg1.mfn = pfn_to_mfn(pfn); - - MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); -} - -static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot) -{ - struct multicall_space mcs; - unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT); - - mcs = __xen_mc_entry(0); - MULTI_update_va_mapping(mcs.mc, (unsigned long)addr, - pfn_pte(pfn, prot), 0); -} - -/* This needs to make sure the new pte page is pinned iff its being - attached to a pinned pagetable. */ -static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, - unsigned level) -{ - bool pinned = PagePinned(virt_to_page(mm->pgd)); - - trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned); - - if (pinned) { - struct page *page = pfn_to_page(pfn); - - SetPagePinned(page); - - if (!PageHighMem(page)) { - xen_mc_batch(); - - __set_pfn_prot(pfn, PAGE_KERNEL_RO); - - if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) - __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - } else { - /* make sure there are no stray mappings of - this page */ - kmap_flush_unused(); - } - } -} - -static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) -{ - xen_alloc_ptpage(mm, pfn, PT_PTE); -} - -static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) -{ - xen_alloc_ptpage(mm, pfn, PT_PMD); -} - -/* This should never happen until we're OK to use struct page */ -static inline void xen_release_ptpage(unsigned long pfn, unsigned level) -{ - struct page *page = pfn_to_page(pfn); - bool pinned = PagePinned(page); - - trace_xen_mmu_release_ptpage(pfn, level, pinned); - - if (pinned) { - if (!PageHighMem(page)) { - xen_mc_batch(); - - if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) - __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); - - __set_pfn_prot(pfn, PAGE_KERNEL); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - } - ClearPagePinned(page); - } -} - -static void xen_release_pte(unsigned long pfn) -{ - xen_release_ptpage(pfn, PT_PTE); -} - -static void xen_release_pmd(unsigned long pfn) -{ - xen_release_ptpage(pfn, PT_PMD); -} - -#if CONFIG_PGTABLE_LEVELS >= 4 -static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) -{ - xen_alloc_ptpage(mm, pfn, PT_PUD); -} - -static void xen_release_pud(unsigned long pfn) -{ - xen_release_ptpage(pfn, PT_PUD); -} -#endif - -void __init xen_reserve_top(void) -{ -#ifdef CONFIG_X86_32 - unsigned long top = HYPERVISOR_VIRT_START; - struct xen_platform_parameters pp; - - if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) - top = pp.virt_start; - - reserve_top_address(-top); -#endif /* CONFIG_X86_32 */ -} - -/* - * Like __va(), but returns address in the kernel mapping (which is - * all we have until the physical memory mapping has been set up. - */ -static void * __init __ka(phys_addr_t paddr) -{ -#ifdef CONFIG_X86_64 - return (void *)(paddr + __START_KERNEL_map); -#else - return __va(paddr); -#endif -} - -/* Convert a machine address to physical address */ -static unsigned long __init m2p(phys_addr_t maddr) -{ - phys_addr_t paddr; - - maddr &= PTE_PFN_MASK; - paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; - - return paddr; -} - -/* Convert a machine address to kernel virtual */ -static void * __init m2v(phys_addr_t maddr) -{ - return __ka(m2p(maddr)); -} - -/* Set the page permissions on an identity-mapped pages */ -static void __init set_page_prot_flags(void *addr, pgprot_t prot, - unsigned long flags) -{ - unsigned long pfn = __pa(addr) >> PAGE_SHIFT; - pte_t pte = pfn_pte(pfn, prot); - - if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) - BUG(); -} -static void __init set_page_prot(void *addr, pgprot_t prot) -{ - return set_page_prot_flags(addr, prot, UVMF_NONE); -} -#ifdef CONFIG_X86_32 -static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) -{ - unsigned pmdidx, pteidx; - unsigned ident_pte; - unsigned long pfn; - - level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES, - PAGE_SIZE); - - ident_pte = 0; - pfn = 0; - for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { - pte_t *pte_page; - - /* Reuse or allocate a page of ptes */ - if (pmd_present(pmd[pmdidx])) - pte_page = m2v(pmd[pmdidx].pmd); - else { - /* Check for free pte pages */ - if (ident_pte == LEVEL1_IDENT_ENTRIES) - break; - - pte_page = &level1_ident_pgt[ident_pte]; - ident_pte += PTRS_PER_PTE; - - pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); - } - - /* Install mappings */ - for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { - pte_t pte; - - if (pfn > max_pfn_mapped) - max_pfn_mapped = pfn; - - if (!pte_none(pte_page[pteidx])) - continue; - - pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); - pte_page[pteidx] = pte; - } - } - - for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) - set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); - - set_page_prot(pmd, PAGE_KERNEL_RO); -} -#endif -void __init xen_setup_machphys_mapping(void) -{ - struct xen_machphys_mapping mapping; - - if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { - machine_to_phys_mapping = (unsigned long *)mapping.v_start; - machine_to_phys_nr = mapping.max_mfn + 1; - } else { - machine_to_phys_nr = MACH2PHYS_NR_ENTRIES; - } -#ifdef CONFIG_X86_32 - WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1)) - < machine_to_phys_mapping); -#endif -} - -#ifdef CONFIG_X86_64 -static void __init convert_pfn_mfn(void *v) -{ - pte_t *pte = v; - int i; - - /* All levels are converted the same way, so just treat them - as ptes. */ - for (i = 0; i < PTRS_PER_PTE; i++) - pte[i] = xen_make_pte(pte[i].pte); -} -static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, - unsigned long addr) -{ - if (*pt_base == PFN_DOWN(__pa(addr))) { - set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); - clear_page((void *)addr); - (*pt_base)++; - } - if (*pt_end == PFN_DOWN(__pa(addr))) { - set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); - clear_page((void *)addr); - (*pt_end)--; - } -} -/* - * Set up the initial kernel pagetable. - * - * We can construct this by grafting the Xen provided pagetable into - * head_64.S's preconstructed pagetables. We copy the Xen L2's into - * level2_ident_pgt, and level2_kernel_pgt. This means that only the - * kernel has a physical mapping to start with - but that's enough to - * get __va working. We need to fill in the rest of the physical - * mapping once some sort of allocator has been set up. - */ -void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) -{ - pud_t *l3; - pmd_t *l2; - unsigned long addr[3]; - unsigned long pt_base, pt_end; - unsigned i; - - /* max_pfn_mapped is the last pfn mapped in the initial memory - * mappings. Considering that on Xen after the kernel mappings we - * have the mappings of some pages that don't exist in pfn space, we - * set max_pfn_mapped to the last real pfn mapped. */ - if (xen_start_info->mfn_list < __START_KERNEL_map) - max_pfn_mapped = xen_start_info->first_p2m_pfn; - else - max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); - - pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); - pt_end = pt_base + xen_start_info->nr_pt_frames; - - /* Zap identity mapping */ - init_level4_pgt[0] = __pgd(0); - - if (!xen_feature(XENFEAT_auto_translated_physmap)) { - /* Pre-constructed entries are in pfn, so convert to mfn */ - /* L4[272] -> level3_ident_pgt - * L4[511] -> level3_kernel_pgt */ - convert_pfn_mfn(init_level4_pgt); - - /* L3_i[0] -> level2_ident_pgt */ - convert_pfn_mfn(level3_ident_pgt); - /* L3_k[510] -> level2_kernel_pgt - * L3_k[511] -> level2_fixmap_pgt */ - convert_pfn_mfn(level3_kernel_pgt); - - /* L3_k[511][506] -> level1_fixmap_pgt */ - convert_pfn_mfn(level2_fixmap_pgt); - } - /* We get [511][511] and have Xen's version of level2_kernel_pgt */ - l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); - l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); - - addr[0] = (unsigned long)pgd; - addr[1] = (unsigned long)l3; - addr[2] = (unsigned long)l2; - /* Graft it onto L4[272][0]. Note that we creating an aliasing problem: - * Both L4[272][0] and L4[511][510] have entries that point to the same - * L2 (PMD) tables. Meaning that if you modify it in __va space - * it will be also modified in the __ka space! (But if you just - * modify the PMD table to point to other PTE's or none, then you - * are OK - which is what cleanup_highmap does) */ - copy_page(level2_ident_pgt, l2); - /* Graft it onto L4[511][510] */ - copy_page(level2_kernel_pgt, l2); - - /* Copy the initial P->M table mappings if necessary. */ - i = pgd_index(xen_start_info->mfn_list); - if (i && i < pgd_index(__START_KERNEL_map)) - init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i]; - - if (!xen_feature(XENFEAT_auto_translated_physmap)) { - /* Make pagetable pieces RO */ - set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); - set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); - set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); - set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO); - - /* Pin down new L4 */ - pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, - PFN_DOWN(__pa_symbol(init_level4_pgt))); - - /* Unpin Xen-provided one */ - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - - /* - * At this stage there can be no user pgd, and no page - * structure to attach it to, so make sure we just set kernel - * pgd. - */ - xen_mc_batch(); - __xen_write_cr3(true, __pa(init_level4_pgt)); - xen_mc_issue(PARAVIRT_LAZY_CPU); - } else - native_write_cr3(__pa(init_level4_pgt)); - - /* We can't that easily rip out L3 and L2, as the Xen pagetables are - * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for - * the initial domain. For guests using the toolstack, they are in: - * [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only - * rip out the [L4] (pgd), but for guests we shave off three pages. - */ - for (i = 0; i < ARRAY_SIZE(addr); i++) - check_pt_base(&pt_base, &pt_end, addr[i]); - - /* Our (by three pages) smaller Xen pagetable that we are using */ - xen_pt_base = PFN_PHYS(pt_base); - xen_pt_size = (pt_end - pt_base) * PAGE_SIZE; - memblock_reserve(xen_pt_base, xen_pt_size); - - /* Revector the xen_start_info */ - xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); -} - -/* - * Read a value from a physical address. - */ -static unsigned long __init xen_read_phys_ulong(phys_addr_t addr) -{ - unsigned long *vaddr; - unsigned long val; - - vaddr = early_memremap_ro(addr, sizeof(val)); - val = *vaddr; - early_memunmap(vaddr, sizeof(val)); - return val; -} - -/* - * Translate a virtual address to a physical one without relying on mapped - * page tables. - */ -static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) -{ - phys_addr_t pa; - pgd_t pgd; - pud_t pud; - pmd_t pmd; - pte_t pte; - - pa = read_cr3(); - pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) * - sizeof(pgd))); - if (!pgd_present(pgd)) - return 0; - - pa = pgd_val(pgd) & PTE_PFN_MASK; - pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) * - sizeof(pud))); - if (!pud_present(pud)) - return 0; - pa = pud_pfn(pud) << PAGE_SHIFT; - if (pud_large(pud)) - return pa + (vaddr & ~PUD_MASK); - - pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) * - sizeof(pmd))); - if (!pmd_present(pmd)) - return 0; - pa = pmd_pfn(pmd) << PAGE_SHIFT; - if (pmd_large(pmd)) - return pa + (vaddr & ~PMD_MASK); - - pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) * - sizeof(pte))); - if (!pte_present(pte)) - return 0; - pa = pte_pfn(pte) << PAGE_SHIFT; - - return pa | (vaddr & ~PAGE_MASK); -} - -/* - * Find a new area for the hypervisor supplied p2m list and relocate the p2m to - * this area. - */ -void __init xen_relocate_p2m(void) -{ - phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys; - unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end; - int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d; - pte_t *pt; - pmd_t *pmd; - pud_t *pud; - p4d_t *p4d = NULL; - pgd_t *pgd; - unsigned long *new_p2m; - int save_pud; - - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT; - n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT; - n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT; - n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT; - if (PTRS_PER_P4D > 1) - n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; - else - n_p4d = 0; - n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d; - - new_area = xen_find_free_area(PFN_PHYS(n_frames)); - if (!new_area) { - xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n"); - BUG(); - } - - /* - * Setup the page tables for addressing the new p2m list. - * We have asked the hypervisor to map the p2m list at the user address - * PUD_SIZE. It may have done so, or it may have used a kernel space - * address depending on the Xen version. - * To avoid any possible virtual address collision, just use - * 2 * PUD_SIZE for the new area. - */ - p4d_phys = new_area; - pud_phys = p4d_phys + PFN_PHYS(n_p4d); - pmd_phys = pud_phys + PFN_PHYS(n_pud); - pt_phys = pmd_phys + PFN_PHYS(n_pmd); - p2m_pfn = PFN_DOWN(pt_phys) + n_pt; - - pgd = __va(read_cr3()); - new_p2m = (unsigned long *)(2 * PGDIR_SIZE); - idx_p4d = 0; - save_pud = n_pud; - do { - if (n_p4d > 0) { - p4d = early_memremap(p4d_phys, PAGE_SIZE); - clear_page(p4d); - n_pud = min(save_pud, PTRS_PER_P4D); - } - for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { - pud = early_memremap(pud_phys, PAGE_SIZE); - clear_page(pud); - for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); - idx_pmd++) { - pmd = early_memremap(pmd_phys, PAGE_SIZE); - clear_page(pmd); - for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); - idx_pt++) { - pt = early_memremap(pt_phys, PAGE_SIZE); - clear_page(pt); - for (idx_pte = 0; - idx_pte < min(n_pte, PTRS_PER_PTE); - idx_pte++) { - set_pte(pt + idx_pte, - pfn_pte(p2m_pfn, PAGE_KERNEL)); - p2m_pfn++; - } - n_pte -= PTRS_PER_PTE; - early_memunmap(pt, PAGE_SIZE); - make_lowmem_page_readonly(__va(pt_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, - PFN_DOWN(pt_phys)); - set_pmd(pmd + idx_pt, - __pmd(_PAGE_TABLE | pt_phys)); - pt_phys += PAGE_SIZE; - } - n_pt -= PTRS_PER_PMD; - early_memunmap(pmd, PAGE_SIZE); - make_lowmem_page_readonly(__va(pmd_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, - PFN_DOWN(pmd_phys)); - set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); - pmd_phys += PAGE_SIZE; - } - n_pmd -= PTRS_PER_PUD; - early_memunmap(pud, PAGE_SIZE); - make_lowmem_page_readonly(__va(pud_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); - if (n_p4d > 0) - set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys)); - else - set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); - pud_phys += PAGE_SIZE; - } - if (n_p4d > 0) { - save_pud -= PTRS_PER_P4D; - early_memunmap(p4d, PAGE_SIZE); - make_lowmem_page_readonly(__va(p4d_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys)); - set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys)); - p4d_phys += PAGE_SIZE; - } - } while (++idx_p4d < n_p4d); - - /* Now copy the old p2m info to the new area. */ - memcpy(new_p2m, xen_p2m_addr, size); - xen_p2m_addr = new_p2m; - - /* Release the old p2m list and set new list info. */ - p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list)); - BUG_ON(!p2m_pfn); - p2m_pfn_end = p2m_pfn + PFN_DOWN(size); - - if (xen_start_info->mfn_list < __START_KERNEL_map) { - pfn = xen_start_info->first_p2m_pfn; - pfn_end = xen_start_info->first_p2m_pfn + - xen_start_info->nr_p2m_frames; - set_pgd(pgd + 1, __pgd(0)); - } else { - pfn = p2m_pfn; - pfn_end = p2m_pfn_end; - } - - memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn)); - while (pfn < pfn_end) { - if (pfn == p2m_pfn) { - pfn = p2m_pfn_end; - continue; - } - make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); - pfn++; - } - - xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; - xen_start_info->first_p2m_pfn = PFN_DOWN(new_area); - xen_start_info->nr_p2m_frames = n_frames; -} - -#else /* !CONFIG_X86_64 */ -static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); -static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); - -static void __init xen_write_cr3_init(unsigned long cr3) -{ - unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); - - BUG_ON(read_cr3() != __pa(initial_page_table)); - BUG_ON(cr3 != __pa(swapper_pg_dir)); - - /* - * We are switching to swapper_pg_dir for the first time (from - * initial_page_table) and therefore need to mark that page - * read-only and then pin it. - * - * Xen disallows sharing of kernel PMDs for PAE - * guests. Therefore we must copy the kernel PMD from - * initial_page_table into a new kernel PMD to be used in - * swapper_pg_dir. - */ - swapper_kernel_pmd = - extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); - copy_page(swapper_kernel_pmd, initial_kernel_pmd); - swapper_pg_dir[KERNEL_PGD_BOUNDARY] = - __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT); - set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO); - - set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); - xen_write_cr3(cr3); - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn); - - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, - PFN_DOWN(__pa(initial_page_table))); - set_page_prot(initial_page_table, PAGE_KERNEL); - set_page_prot(initial_kernel_pmd, PAGE_KERNEL); - - pv_mmu_ops.write_cr3 = &xen_write_cr3; -} - -/* - * For 32 bit domains xen_start_info->pt_base is the pgd address which might be - * not the first page table in the page table pool. - * Iterate through the initial page tables to find the real page table base. - */ -static phys_addr_t xen_find_pt_base(pmd_t *pmd) -{ - phys_addr_t pt_base, paddr; - unsigned pmdidx; - - pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd)); - - for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) - if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) { - paddr = m2p(pmd[pmdidx].pmd); - pt_base = min(pt_base, paddr); - } - - return pt_base; -} - -void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) -{ - pmd_t *kernel_pmd; - - kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); - - xen_pt_base = xen_find_pt_base(kernel_pmd); - xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE; - - initial_kernel_pmd = - extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); - - max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024); - - copy_page(initial_kernel_pmd, kernel_pmd); - - xen_map_identity_early(initial_kernel_pmd, max_pfn); - - copy_page(initial_page_table, pgd); - initial_page_table[KERNEL_PGD_BOUNDARY] = - __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT); - - set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO); - set_page_prot(initial_page_table, PAGE_KERNEL_RO); - set_page_prot(empty_zero_page, PAGE_KERNEL_RO); - - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, - PFN_DOWN(__pa(initial_page_table))); - xen_write_cr3(__pa(initial_page_table)); - - memblock_reserve(xen_pt_base, xen_pt_size); -} -#endif /* CONFIG_X86_64 */ - -void __init xen_reserve_special_pages(void) -{ - phys_addr_t paddr; - - memblock_reserve(__pa(xen_start_info), PAGE_SIZE); - if (xen_start_info->store_mfn) { - paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn)); - memblock_reserve(paddr, PAGE_SIZE); - } - if (!xen_initial_domain()) { - paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn)); - memblock_reserve(paddr, PAGE_SIZE); - } -} - -void __init xen_pt_check_e820(void) -{ - if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) { - xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n"); - BUG(); - } -} - -static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; - -static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) -{ - pte_t pte; - - phys >>= PAGE_SHIFT; - - switch (idx) { - case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: - case FIX_RO_IDT: -#ifdef CONFIG_X86_32 - case FIX_WP_TEST: -# ifdef CONFIG_HIGHMEM - case FIX_KMAP_BEGIN ... FIX_KMAP_END: -# endif -#elif defined(CONFIG_X86_VSYSCALL_EMULATION) - case VSYSCALL_PAGE: -#endif - case FIX_TEXT_POKE0: - case FIX_TEXT_POKE1: - case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END: - /* All local page mappings */ - pte = pfn_pte(phys, prot); - break; - -#ifdef CONFIG_X86_LOCAL_APIC - case FIX_APIC_BASE: /* maps dummy local APIC */ - pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); - break; -#endif - -#ifdef CONFIG_X86_IO_APIC - case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END: - /* - * We just don't map the IO APIC - all access is via - * hypercalls. Keep the address in the pte for reference. - */ - pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); - break; -#endif - - case FIX_PARAVIRT_BOOTMAP: - /* This is an MFN, but it isn't an IO mapping from the - IO domain */ - pte = mfn_pte(phys, prot); - break; - - default: - /* By default, set_fixmap is used for hardware mappings */ - pte = mfn_pte(phys, prot); - break; - } - - __native_set_fixmap(idx, pte); - -#ifdef CONFIG_X86_VSYSCALL_EMULATION - /* Replicate changes to map the vsyscall page into the user - pagetable vsyscall mapping. */ - if (idx == VSYSCALL_PAGE) { - unsigned long vaddr = __fix_to_virt(idx); - set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); - } -#endif -} - -static void __init xen_post_allocator_init(void) -{ - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - - pv_mmu_ops.set_pte = xen_set_pte; - pv_mmu_ops.set_pmd = xen_set_pmd; - pv_mmu_ops.set_pud = xen_set_pud; -#if CONFIG_PGTABLE_LEVELS >= 4 - pv_mmu_ops.set_p4d = xen_set_p4d; -#endif - - /* This will work as long as patching hasn't happened yet - (which it hasn't) */ - pv_mmu_ops.alloc_pte = xen_alloc_pte; - pv_mmu_ops.alloc_pmd = xen_alloc_pmd; - pv_mmu_ops.release_pte = xen_release_pte; - pv_mmu_ops.release_pmd = xen_release_pmd; -#if CONFIG_PGTABLE_LEVELS >= 4 - pv_mmu_ops.alloc_pud = xen_alloc_pud; - pv_mmu_ops.release_pud = xen_release_pud; -#endif - pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte); - -#ifdef CONFIG_X86_64 - pv_mmu_ops.write_cr3 = &xen_write_cr3; - SetPagePinned(virt_to_page(level3_user_vsyscall)); -#endif - xen_mark_init_mm_pinned(); -} - -static void xen_leave_lazy_mmu(void) -{ - preempt_disable(); - xen_mc_flush(); - paravirt_leave_lazy_mmu(); - preempt_enable(); -} - -static const struct pv_mmu_ops xen_mmu_ops __initconst = { - .read_cr2 = xen_read_cr2, - .write_cr2 = xen_write_cr2, - - .read_cr3 = xen_read_cr3, - .write_cr3 = xen_write_cr3_init, - - .flush_tlb_user = xen_flush_tlb, - .flush_tlb_kernel = xen_flush_tlb, - .flush_tlb_single = xen_flush_tlb_single, - .flush_tlb_others = xen_flush_tlb_others, - - .pte_update = paravirt_nop, - - .pgd_alloc = xen_pgd_alloc, - .pgd_free = xen_pgd_free, - - .alloc_pte = xen_alloc_pte_init, - .release_pte = xen_release_pte_init, - .alloc_pmd = xen_alloc_pmd_init, - .release_pmd = xen_release_pmd_init, - - .set_pte = xen_set_pte_init, - .set_pte_at = xen_set_pte_at, - .set_pmd = xen_set_pmd_hyper, - - .ptep_modify_prot_start = __ptep_modify_prot_start, - .ptep_modify_prot_commit = __ptep_modify_prot_commit, - - .pte_val = PV_CALLEE_SAVE(xen_pte_val), - .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), - - .make_pte = PV_CALLEE_SAVE(xen_make_pte_init), - .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), - -#ifdef CONFIG_X86_PAE - .set_pte_atomic = xen_set_pte_atomic, - .pte_clear = xen_pte_clear, - .pmd_clear = xen_pmd_clear, -#endif /* CONFIG_X86_PAE */ - .set_pud = xen_set_pud_hyper, - - .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), - .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), - -#if CONFIG_PGTABLE_LEVELS >= 4 - .pud_val = PV_CALLEE_SAVE(xen_pud_val), - .make_pud = PV_CALLEE_SAVE(xen_make_pud), - .set_p4d = xen_set_p4d_hyper, - - .alloc_pud = xen_alloc_pmd_init, - .release_pud = xen_release_pmd_init, -#endif /* CONFIG_PGTABLE_LEVELS == 4 */ - - .activate_mm = xen_activate_mm, - .dup_mmap = xen_dup_mmap, - .exit_mmap = xen_exit_mmap, - - .lazy_mode = { - .enter = paravirt_enter_lazy_mmu, - .leave = xen_leave_lazy_mmu, - .flush = paravirt_flush_lazy_mmu, - }, - - .set_fixmap = xen_set_fixmap, -}; - -void __init xen_init_mmu_ops(void) -{ - x86_init.paging.pagetable_init = xen_pagetable_init; - - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - - pv_mmu_ops = xen_mmu_ops; - - memset(dummy_mapping, 0xff, PAGE_SIZE); -} - -/* Protected by xen_reservation_lock. */ -#define MAX_CONTIG_ORDER 9 /* 2MB */ -static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER]; - -#define VOID_PTE (mfn_pte(0, __pgprot(0))) -static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, - unsigned long *in_frames, - unsigned long *out_frames) -{ - int i; - struct multicall_space mcs; - - xen_mc_batch(); - for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) { - mcs = __xen_mc_entry(0); - - if (in_frames) - in_frames[i] = virt_to_mfn(vaddr); - - MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); - __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); - - if (out_frames) - out_frames[i] = virt_to_pfn(vaddr); - } - xen_mc_issue(0); -} - -/* - * Update the pfn-to-mfn mappings for a virtual address range, either to - * point to an array of mfns, or contiguously from a single starting - * mfn. - */ -static void xen_remap_exchanged_ptes(unsigned long vaddr, int order, - unsigned long *mfns, - unsigned long first_mfn) -{ - unsigned i, limit; - unsigned long mfn; - - xen_mc_batch(); - - limit = 1u << order; - for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) { - struct multicall_space mcs; - unsigned flags; - - mcs = __xen_mc_entry(0); - if (mfns) - mfn = mfns[i]; - else - mfn = first_mfn + i; - - if (i < (limit - 1)) - flags = 0; - else { - if (order == 0) - flags = UVMF_INVLPG | UVMF_ALL; - else - flags = UVMF_TLB_FLUSH | UVMF_ALL; - } - - MULTI_update_va_mapping(mcs.mc, vaddr, - mfn_pte(mfn, PAGE_KERNEL), flags); - - set_phys_to_machine(virt_to_pfn(vaddr), mfn); - } - - xen_mc_issue(0); -} - -/* - * Perform the hypercall to exchange a region of our pfns to point to - * memory with the required contiguous alignment. Takes the pfns as - * input, and populates mfns as output. - * - * Returns a success code indicating whether the hypervisor was able to - * satisfy the request or not. - */ -static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in, - unsigned long *pfns_in, - unsigned long extents_out, - unsigned int order_out, - unsigned long *mfns_out, - unsigned int address_bits) -{ - long rc; - int success; - - struct xen_memory_exchange exchange = { - .in = { - .nr_extents = extents_in, - .extent_order = order_in, - .extent_start = pfns_in, - .domid = DOMID_SELF - }, - .out = { - .nr_extents = extents_out, - .extent_order = order_out, - .extent_start = mfns_out, - .address_bits = address_bits, - .domid = DOMID_SELF - } - }; - - BUG_ON(extents_in << order_in != extents_out << order_out); - - rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); - success = (exchange.nr_exchanged == extents_in); - - BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); - BUG_ON(success && (rc != 0)); - - return success; -} - -int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, - unsigned int address_bits, - dma_addr_t *dma_handle) -{ - unsigned long *in_frames = discontig_frames, out_frame; - unsigned long flags; - int success; - unsigned long vstart = (unsigned long)phys_to_virt(pstart); - - /* - * Currently an auto-translated guest will not perform I/O, nor will - * it require PAE page directories below 4GB. Therefore any calls to - * this function are redundant and can be ignored. - */ - - if (xen_feature(XENFEAT_auto_translated_physmap)) - return 0; - - if (unlikely(order > MAX_CONTIG_ORDER)) - return -ENOMEM; - - memset((void *) vstart, 0, PAGE_SIZE << order); - - spin_lock_irqsave(&xen_reservation_lock, flags); - - /* 1. Zap current PTEs, remembering MFNs. */ - xen_zap_pfn_range(vstart, order, in_frames, NULL); - - /* 2. Get a new contiguous memory extent. */ - out_frame = virt_to_pfn(vstart); - success = xen_exchange_memory(1UL << order, 0, in_frames, - 1, order, &out_frame, - address_bits); - - /* 3. Map the new extent in place of old pages. */ - if (success) - xen_remap_exchanged_ptes(vstart, order, NULL, out_frame); - else - xen_remap_exchanged_ptes(vstart, order, in_frames, 0); - - spin_unlock_irqrestore(&xen_reservation_lock, flags); - - *dma_handle = virt_to_machine(vstart).maddr; - return success ? 0 : -ENOMEM; -} -EXPORT_SYMBOL_GPL(xen_create_contiguous_region); - -void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) -{ - unsigned long *out_frames = discontig_frames, in_frame; - unsigned long flags; - int success; - unsigned long vstart; - - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - - if (unlikely(order > MAX_CONTIG_ORDER)) - return; - - vstart = (unsigned long)phys_to_virt(pstart); - memset((void *) vstart, 0, PAGE_SIZE << order); - - spin_lock_irqsave(&xen_reservation_lock, flags); - - /* 1. Find start MFN of contiguous extent. */ - in_frame = virt_to_mfn(vstart); - - /* 2. Zap current PTEs. */ - xen_zap_pfn_range(vstart, order, NULL, out_frames); - - /* 3. Do the exchange for non-contiguous MFNs. */ - success = xen_exchange_memory(1, order, &in_frame, 1UL << order, - 0, out_frames, 0); - - /* 4. Map new pages in place of old pages. */ - if (success) - xen_remap_exchanged_ptes(vstart, order, out_frames, 0); - else - xen_remap_exchanged_ptes(vstart, order, NULL, in_frame); - - spin_unlock_irqrestore(&xen_reservation_lock, flags); -} -EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); - -#ifdef CONFIG_XEN_PVHVM -#ifdef CONFIG_PROC_VMCORE -/* - * This function is used in two contexts: - * - the kdump kernel has to check whether a pfn of the crashed kernel - * was a ballooned page. vmcore is using this function to decide - * whether to access a pfn of the crashed kernel. - * - the kexec kernel has to check whether a pfn was ballooned by the - * previous kernel. If the pfn is ballooned, handle it properly. - * Returns 0 if the pfn is not backed by a RAM page, the caller may - * handle the pfn special in this case. - */ -static int xen_oldmem_pfn_is_ram(unsigned long pfn) -{ - struct xen_hvm_get_mem_type a = { - .domid = DOMID_SELF, - .pfn = pfn, - }; - int ram; - - if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a)) - return -ENXIO; - - switch (a.mem_type) { - case HVMMEM_mmio_dm: - ram = 0; - break; - case HVMMEM_ram_rw: - case HVMMEM_ram_ro: - default: - ram = 1; - break; - } - - return ram; -} -#endif - -static void xen_hvm_exit_mmap(struct mm_struct *mm) -{ - struct xen_hvm_pagetable_dying a; - int rc; - - a.domid = DOMID_SELF; - a.gpa = __pa(mm->pgd); - rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); - WARN_ON_ONCE(rc < 0); -} - -static int is_pagetable_dying_supported(void) -{ - struct xen_hvm_pagetable_dying a; - int rc = 0; - - a.domid = DOMID_SELF; - a.gpa = 0x00; - rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); - if (rc < 0) { - printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n"); - return 0; - } - return 1; -} - -void __init xen_hvm_init_mmu_ops(void) -{ - if (is_pagetable_dying_supported()) - pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; -#ifdef CONFIG_PROC_VMCORE - register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram); -#endif -} -#endif #define REMAP_BATCH_SIZE 16 @@ -2974,7 +191,6 @@ int xen_remap_domain_gfn_array(struct vm_area_struct *vma, } EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array); - /* Returns: 0 success */ int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, int numpgs, struct page **pages) diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c new file mode 100644 index 000000000000..1c57f1cd545c --- /dev/null +++ b/arch/x86/xen/mmu_hvm.c @@ -0,0 +1,79 @@ +#include <linux/types.h> +#include <linux/crash_dump.h> + +#include <xen/interface/xen.h> +#include <xen/hvm.h> + +#include "mmu.h" + +#ifdef CONFIG_PROC_VMCORE +/* + * This function is used in two contexts: + * - the kdump kernel has to check whether a pfn of the crashed kernel + * was a ballooned page. vmcore is using this function to decide + * whether to access a pfn of the crashed kernel. + * - the kexec kernel has to check whether a pfn was ballooned by the + * previous kernel. If the pfn is ballooned, handle it properly. + * Returns 0 if the pfn is not backed by a RAM page, the caller may + * handle the pfn special in this case. + */ +static int xen_oldmem_pfn_is_ram(unsigned long pfn) +{ + struct xen_hvm_get_mem_type a = { + .domid = DOMID_SELF, + .pfn = pfn, + }; + int ram; + + if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a)) + return -ENXIO; + + switch (a.mem_type) { + case HVMMEM_mmio_dm: + ram = 0; + break; + case HVMMEM_ram_rw: + case HVMMEM_ram_ro: + default: + ram = 1; + break; + } + + return ram; +} +#endif + +static void xen_hvm_exit_mmap(struct mm_struct *mm) +{ + struct xen_hvm_pagetable_dying a; + int rc; + + a.domid = DOMID_SELF; + a.gpa = __pa(mm->pgd); + rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); + WARN_ON_ONCE(rc < 0); +} + +static int is_pagetable_dying_supported(void) +{ + struct xen_hvm_pagetable_dying a; + int rc = 0; + + a.domid = DOMID_SELF; + a.gpa = 0x00; + rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); + if (rc < 0) { + printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n"); + return 0; + } + return 1; +} + +void __init xen_hvm_init_mmu_ops(void) +{ + if (is_pagetable_dying_supported()) + pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; +#ifdef CONFIG_PROC_VMCORE + register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram); +#endif +} diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c new file mode 100644 index 000000000000..9d9ae6650aa1 --- /dev/null +++ b/arch/x86/xen/mmu_pv.c @@ -0,0 +1,2730 @@ +/* + * Xen mmu operations + * + * This file contains the various mmu fetch and update operations. + * The most important job they must perform is the mapping between the + * domain's pfn and the overall machine mfns. + * + * Xen allows guests to directly update the pagetable, in a controlled + * fashion. In other words, the guest modifies the same pagetable + * that the CPU actually uses, which eliminates the overhead of having + * a separate shadow pagetable. + * + * In order to allow this, it falls on the guest domain to map its + * notion of a "physical" pfn - which is just a domain-local linear + * address - into a real "machine address" which the CPU's MMU can + * use. + * + * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be + * inserted directly into the pagetable. When creating a new + * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely, + * when reading the content back with __(pgd|pmd|pte)_val, it converts + * the mfn back into a pfn. + * + * The other constraint is that all pages which make up a pagetable + * must be mapped read-only in the guest. This prevents uncontrolled + * guest updates to the pagetable. Xen strictly enforces this, and + * will disallow any pagetable update which will end up mapping a + * pagetable page RW, and will disallow using any writable page as a + * pagetable. + * + * Naively, when loading %cr3 with the base of a new pagetable, Xen + * would need to validate the whole pagetable before going on. + * Naturally, this is quite slow. The solution is to "pin" a + * pagetable, which enforces all the constraints on the pagetable even + * when it is not actively in use. This menas that Xen can be assured + * that it is still valid when you do load it into %cr3, and doesn't + * need to revalidate it. + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ +#include <linux/sched/mm.h> +#include <linux/highmem.h> +#include <linux/debugfs.h> +#include <linux/bug.h> +#include <linux/vmalloc.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/gfp.h> +#include <linux/memblock.h> +#include <linux/seq_file.h> +#include <linux/crash_dump.h> +#ifdef CONFIG_KEXEC_CORE +#include <linux/kexec.h> +#endif + +#include <trace/events/xen.h> + +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/fixmap.h> +#include <asm/mmu_context.h> +#include <asm/setup.h> +#include <asm/paravirt.h> +#include <asm/e820/api.h> +#include <asm/linkage.h> +#include <asm/page.h> +#include <asm/init.h> +#include <asm/pat.h> +#include <asm/smp.h> + +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include <xen/xen.h> +#include <xen/page.h> +#include <xen/interface/xen.h> +#include <xen/interface/hvm/hvm_op.h> +#include <xen/interface/version.h> +#include <xen/interface/memory.h> +#include <xen/hvc-console.h> + +#include "multicalls.h" +#include "mmu.h" +#include "debugfs.h" + +#ifdef CONFIG_X86_32 +/* + * Identity map, in addition to plain kernel map. This needs to be + * large enough to allocate page table pages to allocate the rest. + * Each page can map 2MB. + */ +#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) +static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); +#endif +#ifdef CONFIG_X86_64 +/* l3 pud for userspace vsyscall mapping */ +static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; +#endif /* CONFIG_X86_64 */ + +/* + * Note about cr3 (pagetable base) values: + * + * xen_cr3 contains the current logical cr3 value; it contains the + * last set cr3. This may not be the current effective cr3, because + * its update may be being lazily deferred. However, a vcpu looking + * at its own cr3 can use this value knowing that it everything will + * be self-consistent. + * + * xen_current_cr3 contains the actual vcpu cr3; it is set once the + * hypercall to set the vcpu cr3 is complete (so it may be a little + * out of date, but it will never be set early). If one vcpu is + * looking at another vcpu's cr3 value, it should use this variable. + */ +DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ +DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ + +static phys_addr_t xen_pt_base, xen_pt_size __initdata; + +/* + * Just beyond the highest usermode address. STACK_TOP_MAX has a + * redzone above it, so round it up to a PGD boundary. + */ +#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) + +void make_lowmem_page_readonly(void *vaddr) +{ + pte_t *pte, ptev; + unsigned long address = (unsigned long)vaddr; + unsigned int level; + + pte = lookup_address(address, &level); + if (pte == NULL) + return; /* vaddr missing */ + + ptev = pte_wrprotect(*pte); + + if (HYPERVISOR_update_va_mapping(address, ptev, 0)) + BUG(); +} + +void make_lowmem_page_readwrite(void *vaddr) +{ + pte_t *pte, ptev; + unsigned long address = (unsigned long)vaddr; + unsigned int level; + + pte = lookup_address(address, &level); + if (pte == NULL) + return; /* vaddr missing */ + + ptev = pte_mkwrite(*pte); + + if (HYPERVISOR_update_va_mapping(address, ptev, 0)) + BUG(); +} + + +static bool xen_page_pinned(void *ptr) +{ + struct page *page = virt_to_page(ptr); + + return PagePinned(page); +} + +void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) +{ + struct multicall_space mcs; + struct mmu_update *u; + + trace_xen_mmu_set_domain_pte(ptep, pteval, domid); + + mcs = xen_mc_entry(sizeof(*u)); + u = mcs.args; + + /* ptep might be kmapped when using 32-bit HIGHPTE */ + u->ptr = virt_to_machine(ptep).maddr; + u->val = pte_val_ma(pteval); + + MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} +EXPORT_SYMBOL_GPL(xen_set_domain_pte); + +static void xen_extend_mmu_update(const struct mmu_update *update) +{ + struct multicall_space mcs; + struct mmu_update *u; + + mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); + + if (mcs.mc != NULL) { + mcs.mc->args[1]++; + } else { + mcs = __xen_mc_entry(sizeof(*u)); + MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); + } + + u = mcs.args; + *u = *update; +} + +static void xen_extend_mmuext_op(const struct mmuext_op *op) +{ + struct multicall_space mcs; + struct mmuext_op *u; + + mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u)); + + if (mcs.mc != NULL) { + mcs.mc->args[1]++; + } else { + mcs = __xen_mc_entry(sizeof(*u)); + MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); + } + + u = mcs.args; + *u = *op; +} + +static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) +{ + struct mmu_update u; + + preempt_disable(); + + xen_mc_batch(); + + /* ptr may be ioremapped for 64-bit pagetable setup */ + u.ptr = arbitrary_virt_to_machine(ptr).maddr; + u.val = pmd_val_ma(val); + xen_extend_mmu_update(&u); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +static void xen_set_pmd(pmd_t *ptr, pmd_t val) +{ + trace_xen_mmu_set_pmd(ptr, val); + + /* If page is not pinned, we can just update the entry + directly */ + if (!xen_page_pinned(ptr)) { + *ptr = val; + return; + } + + xen_set_pmd_hyper(ptr, val); +} + +/* + * Associate a virtual page frame with a given physical page frame + * and protection flags for that frame. + */ +void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) +{ + set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); +} + +static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) +{ + struct mmu_update u; + + if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) + return false; + + xen_mc_batch(); + + u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; + u.val = pte_val_ma(pteval); + xen_extend_mmu_update(&u); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + return true; +} + +static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) +{ + if (!xen_batched_set_pte(ptep, pteval)) { + /* + * Could call native_set_pte() here and trap and + * emulate the PTE write but with 32-bit guests this + * needs two traps (one for each of the two 32-bit + * words in the PTE) so do one hypercall directly + * instead. + */ + struct mmu_update u; + + u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; + u.val = pte_val_ma(pteval); + HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF); + } +} + +static void xen_set_pte(pte_t *ptep, pte_t pteval) +{ + trace_xen_mmu_set_pte(ptep, pteval); + __xen_set_pte(ptep, pteval); +} + +static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval) +{ + trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval); + __xen_set_pte(ptep, pteval); +} + +pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + /* Just return the pte as-is. We preserve the bits on commit */ + trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep); + return *ptep; +} + +void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + struct mmu_update u; + + trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte); + xen_mc_batch(); + + u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; + u.val = pte_val_ma(pte); + xen_extend_mmu_update(&u); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} + +/* Assume pteval_t is equivalent to all the other *val_t types. */ +static pteval_t pte_mfn_to_pfn(pteval_t val) +{ + if (val & _PAGE_PRESENT) { + unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; + unsigned long pfn = mfn_to_pfn(mfn); + + pteval_t flags = val & PTE_FLAGS_MASK; + if (unlikely(pfn == ~0)) + val = flags & ~_PAGE_PRESENT; + else + val = ((pteval_t)pfn << PAGE_SHIFT) | flags; + } + + return val; +} + +static pteval_t pte_pfn_to_mfn(pteval_t val) +{ + if (val & _PAGE_PRESENT) { + unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; + pteval_t flags = val & PTE_FLAGS_MASK; + unsigned long mfn; + + if (!xen_feature(XENFEAT_auto_translated_physmap)) + mfn = __pfn_to_mfn(pfn); + else + mfn = pfn; + /* + * If there's no mfn for the pfn, then just create an + * empty non-present pte. Unfortunately this loses + * information about the original pfn, so + * pte_mfn_to_pfn is asymmetric. + */ + if (unlikely(mfn == INVALID_P2M_ENTRY)) { + mfn = 0; + flags = 0; + } else + mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); + val = ((pteval_t)mfn << PAGE_SHIFT) | flags; + } + + return val; +} + +__visible pteval_t xen_pte_val(pte_t pte) +{ + pteval_t pteval = pte.pte; + + return pte_mfn_to_pfn(pteval); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); + +__visible pgdval_t xen_pgd_val(pgd_t pgd) +{ + return pte_mfn_to_pfn(pgd.pgd); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); + +__visible pte_t xen_make_pte(pteval_t pte) +{ + pte = pte_pfn_to_mfn(pte); + + return native_make_pte(pte); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); + +__visible pgd_t xen_make_pgd(pgdval_t pgd) +{ + pgd = pte_pfn_to_mfn(pgd); + return native_make_pgd(pgd); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); + +__visible pmdval_t xen_pmd_val(pmd_t pmd) +{ + return pte_mfn_to_pfn(pmd.pmd); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); + +static void xen_set_pud_hyper(pud_t *ptr, pud_t val) +{ + struct mmu_update u; + + preempt_disable(); + + xen_mc_batch(); + + /* ptr may be ioremapped for 64-bit pagetable setup */ + u.ptr = arbitrary_virt_to_machine(ptr).maddr; + u.val = pud_val_ma(val); + xen_extend_mmu_update(&u); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +static void xen_set_pud(pud_t *ptr, pud_t val) +{ + trace_xen_mmu_set_pud(ptr, val); + + /* If page is not pinned, we can just update the entry + directly */ + if (!xen_page_pinned(ptr)) { + *ptr = val; + return; + } + + xen_set_pud_hyper(ptr, val); +} + +#ifdef CONFIG_X86_PAE +static void xen_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + trace_xen_mmu_set_pte_atomic(ptep, pte); + set_64bit((u64 *)ptep, native_pte_val(pte)); +} + +static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + trace_xen_mmu_pte_clear(mm, addr, ptep); + if (!xen_batched_set_pte(ptep, native_make_pte(0))) + native_pte_clear(mm, addr, ptep); +} + +static void xen_pmd_clear(pmd_t *pmdp) +{ + trace_xen_mmu_pmd_clear(pmdp); + set_pmd(pmdp, __pmd(0)); +} +#endif /* CONFIG_X86_PAE */ + +__visible pmd_t xen_make_pmd(pmdval_t pmd) +{ + pmd = pte_pfn_to_mfn(pmd); + return native_make_pmd(pmd); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); + +#if CONFIG_PGTABLE_LEVELS == 4 +__visible pudval_t xen_pud_val(pud_t pud) +{ + return pte_mfn_to_pfn(pud.pud); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); + +__visible pud_t xen_make_pud(pudval_t pud) +{ + pud = pte_pfn_to_mfn(pud); + + return native_make_pud(pud); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); + +static pgd_t *xen_get_user_pgd(pgd_t *pgd) +{ + pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK); + unsigned offset = pgd - pgd_page; + pgd_t *user_ptr = NULL; + + if (offset < pgd_index(USER_LIMIT)) { + struct page *page = virt_to_page(pgd_page); + user_ptr = (pgd_t *)page->private; + if (user_ptr) + user_ptr += offset; + } + + return user_ptr; +} + +static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val) +{ + struct mmu_update u; + + u.ptr = virt_to_machine(ptr).maddr; + u.val = p4d_val_ma(val); + xen_extend_mmu_update(&u); +} + +/* + * Raw hypercall-based set_p4d, intended for in early boot before + * there's a page structure. This implies: + * 1. The only existing pagetable is the kernel's + * 2. It is always pinned + * 3. It has no user pagetable attached to it + */ +static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val) +{ + preempt_disable(); + + xen_mc_batch(); + + __xen_set_p4d_hyper(ptr, val); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +static void xen_set_p4d(p4d_t *ptr, p4d_t val) +{ + pgd_t *user_ptr = xen_get_user_pgd((pgd_t *)ptr); + pgd_t pgd_val; + + trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val); + + /* If page is not pinned, we can just update the entry + directly */ + if (!xen_page_pinned(ptr)) { + *ptr = val; + if (user_ptr) { + WARN_ON(xen_page_pinned(user_ptr)); + pgd_val.pgd = p4d_val_ma(val); + *user_ptr = pgd_val; + } + return; + } + + /* If it's pinned, then we can at least batch the kernel and + user updates together. */ + xen_mc_batch(); + + __xen_set_p4d_hyper(ptr, val); + if (user_ptr) + __xen_set_p4d_hyper((p4d_t *)user_ptr, val); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} +#endif /* CONFIG_PGTABLE_LEVELS == 4 */ + +static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd, + int (*func)(struct mm_struct *mm, struct page *, enum pt_level), + bool last, unsigned long limit) +{ + int i, nr, flush = 0; + + nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD; + for (i = 0; i < nr; i++) { + if (!pmd_none(pmd[i])) + flush |= (*func)(mm, pmd_page(pmd[i]), PT_PTE); + } + return flush; +} + +static int xen_pud_walk(struct mm_struct *mm, pud_t *pud, + int (*func)(struct mm_struct *mm, struct page *, enum pt_level), + bool last, unsigned long limit) +{ + int i, nr, flush = 0; + + nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD; + for (i = 0; i < nr; i++) { + pmd_t *pmd; + + if (pud_none(pud[i])) + continue; + + pmd = pmd_offset(&pud[i], 0); + if (PTRS_PER_PMD > 1) + flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); + flush |= xen_pmd_walk(mm, pmd, func, + last && i == nr - 1, limit); + } + return flush; +} + +static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d, + int (*func)(struct mm_struct *mm, struct page *, enum pt_level), + bool last, unsigned long limit) +{ + int i, nr, flush = 0; + + nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D; + for (i = 0; i < nr; i++) { + pud_t *pud; + + if (p4d_none(p4d[i])) + continue; + + pud = pud_offset(&p4d[i], 0); + if (PTRS_PER_PUD > 1) + flush |= (*func)(mm, virt_to_page(pud), PT_PUD); + flush |= xen_pud_walk(mm, pud, func, + last && i == nr - 1, limit); + } + return flush; +} + +/* + * (Yet another) pagetable walker. This one is intended for pinning a + * pagetable. This means that it walks a pagetable and calls the + * callback function on each page it finds making up the page table, + * at every level. It walks the entire pagetable, but it only bothers + * pinning pte pages which are below limit. In the normal case this + * will be STACK_TOP_MAX, but at boot we need to pin up to + * FIXADDR_TOP. + * + * For 32-bit the important bit is that we don't pin beyond there, + * because then we start getting into Xen's ptes. + * + * For 64-bit, we must skip the Xen hole in the middle of the address + * space, just after the big x86-64 virtual hole. + */ +static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, + int (*func)(struct mm_struct *mm, struct page *, + enum pt_level), + unsigned long limit) +{ + int i, nr, flush = 0; + unsigned hole_low, hole_high; + + /* The limit is the last byte to be touched */ + limit--; + BUG_ON(limit >= FIXADDR_TOP); + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + /* + * 64-bit has a great big hole in the middle of the address + * space, which contains the Xen mappings. On 32-bit these + * will end up making a zero-sized hole and so is a no-op. + */ + hole_low = pgd_index(USER_LIMIT); + hole_high = pgd_index(PAGE_OFFSET); + + nr = pgd_index(limit) + 1; + for (i = 0; i < nr; i++) { + p4d_t *p4d; + + if (i >= hole_low && i < hole_high) + continue; + + if (pgd_none(pgd[i])) + continue; + + p4d = p4d_offset(&pgd[i], 0); + if (PTRS_PER_P4D > 1) + flush |= (*func)(mm, virt_to_page(p4d), PT_P4D); + flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit); + } + + /* Do the top level last, so that the callbacks can use it as + a cue to do final things like tlb flushes. */ + flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); + + return flush; +} + +static int xen_pgd_walk(struct mm_struct *mm, + int (*func)(struct mm_struct *mm, struct page *, + enum pt_level), + unsigned long limit) +{ + return __xen_pgd_walk(mm, mm->pgd, func, limit); +} + +/* If we're using split pte locks, then take the page's lock and + return a pointer to it. Otherwise return NULL. */ +static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) +{ + spinlock_t *ptl = NULL; + +#if USE_SPLIT_PTE_PTLOCKS + ptl = ptlock_ptr(page); + spin_lock_nest_lock(ptl, &mm->page_table_lock); +#endif + + return ptl; +} + +static void xen_pte_unlock(void *v) +{ + spinlock_t *ptl = v; + spin_unlock(ptl); +} + +static void xen_do_pin(unsigned level, unsigned long pfn) +{ + struct mmuext_op op; + + op.cmd = level; + op.arg1.mfn = pfn_to_mfn(pfn); + + xen_extend_mmuext_op(&op); +} + +static int xen_pin_page(struct mm_struct *mm, struct page *page, + enum pt_level level) +{ + unsigned pgfl = TestSetPagePinned(page); + int flush; + + if (pgfl) + flush = 0; /* already pinned */ + else if (PageHighMem(page)) + /* kmaps need flushing if we found an unpinned + highpage */ + flush = 1; + else { + void *pt = lowmem_page_address(page); + unsigned long pfn = page_to_pfn(page); + struct multicall_space mcs = __xen_mc_entry(0); + spinlock_t *ptl; + + flush = 0; + + /* + * We need to hold the pagetable lock between the time + * we make the pagetable RO and when we actually pin + * it. If we don't, then other users may come in and + * attempt to update the pagetable by writing it, + * which will fail because the memory is RO but not + * pinned, so Xen won't do the trap'n'emulate. + * + * If we're using split pte locks, we can't hold the + * entire pagetable's worth of locks during the + * traverse, because we may wrap the preempt count (8 + * bits). The solution is to mark RO and pin each PTE + * page while holding the lock. This means the number + * of locks we end up holding is never more than a + * batch size (~32 entries, at present). + * + * If we're not using split pte locks, we needn't pin + * the PTE pages independently, because we're + * protected by the overall pagetable lock. + */ + ptl = NULL; + if (level == PT_PTE) + ptl = xen_pte_lock(page, mm); + + MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, + pfn_pte(pfn, PAGE_KERNEL_RO), + level == PT_PGD ? UVMF_TLB_FLUSH : 0); + + if (ptl) { + xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); + + /* Queue a deferred unlock for when this batch + is completed. */ + xen_mc_callback(xen_pte_unlock, ptl); + } + } + + return flush; +} + +/* This is called just after a mm has been created, but it has not + been used yet. We need to make sure that its pagetable is all + read-only, and can be pinned. */ +static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) +{ + trace_xen_mmu_pgd_pin(mm, pgd); + + xen_mc_batch(); + + if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { + /* re-enable interrupts for flushing */ + xen_mc_issue(0); + + kmap_flush_unused(); + + xen_mc_batch(); + } + +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); + + if (user_pgd) { + xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); + xen_do_pin(MMUEXT_PIN_L4_TABLE, + PFN_DOWN(__pa(user_pgd))); + } + } +#else /* CONFIG_X86_32 */ +#ifdef CONFIG_X86_PAE + /* Need to make sure unshared kernel PMD is pinnable */ + xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), + PT_PMD); +#endif + xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); +#endif /* CONFIG_X86_64 */ + xen_mc_issue(0); +} + +static void xen_pgd_pin(struct mm_struct *mm) +{ + __xen_pgd_pin(mm, mm->pgd); +} + +/* + * On save, we need to pin all pagetables to make sure they get their + * mfns turned into pfns. Search the list for any unpinned pgds and pin + * them (unpinned pgds are not currently in use, probably because the + * process is under construction or destruction). + * + * Expected to be called in stop_machine() ("equivalent to taking + * every spinlock in the system"), so the locking doesn't really + * matter all that much. + */ +void xen_mm_pin_all(void) +{ + struct page *page; + + spin_lock(&pgd_lock); + + list_for_each_entry(page, &pgd_list, lru) { + if (!PagePinned(page)) { + __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page)); + SetPageSavePinned(page); + } + } + + spin_unlock(&pgd_lock); +} + +/* + * The init_mm pagetable is really pinned as soon as its created, but + * that's before we have page structures to store the bits. So do all + * the book-keeping now. + */ +static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page, + enum pt_level level) +{ + SetPagePinned(page); + return 0; +} + +static void __init xen_mark_init_mm_pinned(void) +{ + xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); +} + +static int xen_unpin_page(struct mm_struct *mm, struct page *page, + enum pt_level level) +{ + unsigned pgfl = TestClearPagePinned(page); + + if (pgfl && !PageHighMem(page)) { + void *pt = lowmem_page_address(page); + unsigned long pfn = page_to_pfn(page); + spinlock_t *ptl = NULL; + struct multicall_space mcs; + + /* + * Do the converse to pin_page. If we're using split + * pte locks, we must be holding the lock for while + * the pte page is unpinned but still RO to prevent + * concurrent updates from seeing it in this + * partially-pinned state. + */ + if (level == PT_PTE) { + ptl = xen_pte_lock(page, mm); + + if (ptl) + xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); + } + + mcs = __xen_mc_entry(0); + + MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, + pfn_pte(pfn, PAGE_KERNEL), + level == PT_PGD ? UVMF_TLB_FLUSH : 0); + + if (ptl) { + /* unlock when batch completed */ + xen_mc_callback(xen_pte_unlock, ptl); + } + } + + return 0; /* never need to flush on unpin */ +} + +/* Release a pagetables pages back as normal RW */ +static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) +{ + trace_xen_mmu_pgd_unpin(mm, pgd); + + xen_mc_batch(); + + xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); + +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + if (user_pgd) { + xen_do_pin(MMUEXT_UNPIN_TABLE, + PFN_DOWN(__pa(user_pgd))); + xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); + } + } +#endif + +#ifdef CONFIG_X86_PAE + /* Need to make sure unshared kernel PMD is unpinned */ + xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), + PT_PMD); +#endif + + __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT); + + xen_mc_issue(0); +} + +static void xen_pgd_unpin(struct mm_struct *mm) +{ + __xen_pgd_unpin(mm, mm->pgd); +} + +/* + * On resume, undo any pinning done at save, so that the rest of the + * kernel doesn't see any unexpected pinned pagetables. + */ +void xen_mm_unpin_all(void) +{ + struct page *page; + + spin_lock(&pgd_lock); + + list_for_each_entry(page, &pgd_list, lru) { + if (PageSavePinned(page)) { + BUG_ON(!PagePinned(page)); + __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page)); + ClearPageSavePinned(page); + } + } + + spin_unlock(&pgd_lock); +} + +static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) +{ + spin_lock(&next->page_table_lock); + xen_pgd_pin(next); + spin_unlock(&next->page_table_lock); +} + +static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +{ + spin_lock(&mm->page_table_lock); + xen_pgd_pin(mm); + spin_unlock(&mm->page_table_lock); +} + + +#ifdef CONFIG_SMP +/* Another cpu may still have their %cr3 pointing at the pagetable, so + we need to repoint it somewhere else before we can unpin it. */ +static void drop_other_mm_ref(void *info) +{ + struct mm_struct *mm = info; + struct mm_struct *active_mm; + + active_mm = this_cpu_read(cpu_tlbstate.active_mm); + + if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) + leave_mm(smp_processor_id()); + + /* If this cpu still has a stale cr3 reference, then make sure + it has been flushed. */ + if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd)) + load_cr3(swapper_pg_dir); +} + +static void xen_drop_mm_ref(struct mm_struct *mm) +{ + cpumask_var_t mask; + unsigned cpu; + + if (current->active_mm == mm) { + if (current->mm == mm) + load_cr3(swapper_pg_dir); + else + leave_mm(smp_processor_id()); + } + + /* Get the "official" set of cpus referring to our pagetable. */ + if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { + for_each_online_cpu(cpu) { + if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) + && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) + continue; + smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); + } + return; + } + cpumask_copy(mask, mm_cpumask(mm)); + + /* It's possible that a vcpu may have a stale reference to our + cr3, because its in lazy mode, and it hasn't yet flushed + its set of pending hypercalls yet. In this case, we can + look at its actual current cr3 value, and force it to flush + if needed. */ + for_each_online_cpu(cpu) { + if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) + cpumask_set_cpu(cpu, mask); + } + + if (!cpumask_empty(mask)) + smp_call_function_many(mask, drop_other_mm_ref, mm, 1); + free_cpumask_var(mask); +} +#else +static void xen_drop_mm_ref(struct mm_struct *mm) +{ + if (current->active_mm == mm) + load_cr3(swapper_pg_dir); +} +#endif + +/* + * While a process runs, Xen pins its pagetables, which means that the + * hypervisor forces it to be read-only, and it controls all updates + * to it. This means that all pagetable updates have to go via the + * hypervisor, which is moderately expensive. + * + * Since we're pulling the pagetable down, we switch to use init_mm, + * unpin old process pagetable and mark it all read-write, which + * allows further operations on it to be simple memory accesses. + * + * The only subtle point is that another CPU may be still using the + * pagetable because of lazy tlb flushing. This means we need need to + * switch all CPUs off this pagetable before we can unpin it. + */ +static void xen_exit_mmap(struct mm_struct *mm) +{ + get_cpu(); /* make sure we don't move around */ + xen_drop_mm_ref(mm); + put_cpu(); + + spin_lock(&mm->page_table_lock); + + /* pgd may not be pinned in the error exit path of execve */ + if (xen_page_pinned(mm->pgd)) + xen_pgd_unpin(mm); + + spin_unlock(&mm->page_table_lock); +} + +static void xen_post_allocator_init(void); + +static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +{ + struct mmuext_op op; + + op.cmd = cmd; + op.arg1.mfn = pfn_to_mfn(pfn); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); +} + +#ifdef CONFIG_X86_64 +static void __init xen_cleanhighmap(unsigned long vaddr, + unsigned long vaddr_end) +{ + unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; + pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr); + + /* NOTE: The loop is more greedy than the cleanup_highmap variant. + * We include the PMD passed in on _both_ boundaries. */ + for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PTRS_PER_PMD)); + pmd++, vaddr += PMD_SIZE) { + if (pmd_none(*pmd)) + continue; + if (vaddr < (unsigned long) _text || vaddr > kernel_end) + set_pmd(pmd, __pmd(0)); + } + /* In case we did something silly, we should crash in this function + * instead of somewhere later and be confusing. */ + xen_mc_flush(); +} + +/* + * Make a page range writeable and free it. + */ +static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size) +{ + void *vaddr = __va(paddr); + void *vaddr_end = vaddr + size; + + for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) + make_lowmem_page_readwrite(vaddr); + + memblock_free(paddr, size); +} + +static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin) +{ + unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK; + + if (unpin) + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa)); + ClearPagePinned(virt_to_page(__va(pa))); + xen_free_ro_pages(pa, PAGE_SIZE); +} + +static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin) +{ + unsigned long pa; + pte_t *pte_tbl; + int i; + + if (pmd_large(*pmd)) { + pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, PMD_SIZE); + return; + } + + pte_tbl = pte_offset_kernel(pmd, 0); + for (i = 0; i < PTRS_PER_PTE; i++) { + if (pte_none(pte_tbl[i])) + continue; + pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT; + xen_free_ro_pages(pa, PAGE_SIZE); + } + set_pmd(pmd, __pmd(0)); + xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin); +} + +static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin) +{ + unsigned long pa; + pmd_t *pmd_tbl; + int i; + + if (pud_large(*pud)) { + pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, PUD_SIZE); + return; + } + + pmd_tbl = pmd_offset(pud, 0); + for (i = 0; i < PTRS_PER_PMD; i++) { + if (pmd_none(pmd_tbl[i])) + continue; + xen_cleanmfnmap_pmd(pmd_tbl + i, unpin); + } + set_pud(pud, __pud(0)); + xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin); +} + +static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin) +{ + unsigned long pa; + pud_t *pud_tbl; + int i; + + if (p4d_large(*p4d)) { + pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, P4D_SIZE); + return; + } + + pud_tbl = pud_offset(p4d, 0); + for (i = 0; i < PTRS_PER_PUD; i++) { + if (pud_none(pud_tbl[i])) + continue; + xen_cleanmfnmap_pud(pud_tbl + i, unpin); + } + set_p4d(p4d, __p4d(0)); + xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin); +} + +/* + * Since it is well isolated we can (and since it is perhaps large we should) + * also free the page tables mapping the initial P->M table. + */ +static void __init xen_cleanmfnmap(unsigned long vaddr) +{ + pgd_t *pgd; + p4d_t *p4d; + unsigned int i; + bool unpin; + + unpin = (vaddr == 2 * PGDIR_SIZE); + vaddr &= PMD_MASK; + pgd = pgd_offset_k(vaddr); + p4d = p4d_offset(pgd, 0); + for (i = 0; i < PTRS_PER_P4D; i++) { + if (p4d_none(p4d[i])) + continue; + xen_cleanmfnmap_p4d(p4d + i, unpin); + } + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + set_pgd(pgd, __pgd(0)); + xen_cleanmfnmap_free_pgtbl(p4d, unpin); + } +} + +static void __init xen_pagetable_p2m_free(void) +{ + unsigned long size; + unsigned long addr; + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + + /* No memory or already called. */ + if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list) + return; + + /* using __ka address and sticking INVALID_P2M_ENTRY! */ + memset((void *)xen_start_info->mfn_list, 0xff, size); + + addr = xen_start_info->mfn_list; + /* + * We could be in __ka space. + * We roundup to the PMD, which means that if anybody at this stage is + * using the __ka address of xen_start_info or + * xen_start_info->shared_info they are in going to crash. Fortunatly + * we have already revectored in xen_setup_kernel_pagetable and in + * xen_setup_shared_info. + */ + size = roundup(size, PMD_SIZE); + + if (addr >= __START_KERNEL_map) { + xen_cleanhighmap(addr, addr + size); + size = PAGE_ALIGN(xen_start_info->nr_pages * + sizeof(unsigned long)); + memblock_free(__pa(addr), size); + } else { + xen_cleanmfnmap(addr); + } +} + +static void __init xen_pagetable_cleanhighmap(void) +{ + unsigned long size; + unsigned long addr; + + /* At this stage, cleanup_highmap has already cleaned __ka space + * from _brk_limit way up to the max_pfn_mapped (which is the end of + * the ramdisk). We continue on, erasing PMD entries that point to page + * tables - do note that they are accessible at this stage via __va. + * For good measure we also round up to the PMD - which means that if + * anybody is using __ka address to the initial boot-stack - and try + * to use it - they are going to crash. The xen_start_info has been + * taken care of already in xen_setup_kernel_pagetable. */ + addr = xen_start_info->pt_base; + size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE); + + xen_cleanhighmap(addr, addr + size); + xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); +#ifdef DEBUG + /* This is superfluous and is not necessary, but you know what + * lets do it. The MODULES_VADDR -> MODULES_END should be clear of + * anything at this stage. */ + xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); +#endif +} +#endif + +static void __init xen_pagetable_p2m_setup(void) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + xen_vmalloc_p2m_tree(); + +#ifdef CONFIG_X86_64 + xen_pagetable_p2m_free(); + + xen_pagetable_cleanhighmap(); +#endif + /* And revector! Bye bye old array */ + xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; +} + +static void __init xen_pagetable_init(void) +{ + paging_init(); + xen_post_allocator_init(); + + xen_pagetable_p2m_setup(); + + /* Allocate and initialize top and mid mfn levels for p2m structure */ + xen_build_mfn_list_list(); + + /* Remap memory freed due to conflicts with E820 map */ + if (!xen_feature(XENFEAT_auto_translated_physmap)) + xen_remap_memory(); + + xen_setup_shared_info(); +} +static void xen_write_cr2(unsigned long cr2) +{ + this_cpu_read(xen_vcpu)->arch.cr2 = cr2; +} + +static unsigned long xen_read_cr2(void) +{ + return this_cpu_read(xen_vcpu)->arch.cr2; +} + +unsigned long xen_read_cr2_direct(void) +{ + return this_cpu_read(xen_vcpu_info.arch.cr2); +} + +static void xen_flush_tlb(void) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + trace_xen_mmu_flush_tlb(0); + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_TLB_FLUSH_LOCAL; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +static void xen_flush_tlb_single(unsigned long addr) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + trace_xen_mmu_flush_tlb_single(addr); + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*op)); + op = mcs.args; + op->cmd = MMUEXT_INVLPG_LOCAL; + op->arg1.linear_addr = addr & PAGE_MASK; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +static void xen_flush_tlb_others(const struct cpumask *cpus, + struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + struct { + struct mmuext_op op; +#ifdef CONFIG_SMP + DECLARE_BITMAP(mask, num_processors); +#else + DECLARE_BITMAP(mask, NR_CPUS); +#endif + } *args; + struct multicall_space mcs; + + trace_xen_mmu_flush_tlb_others(cpus, mm, start, end); + + if (cpumask_empty(cpus)) + return; /* nothing to do */ + + mcs = xen_mc_entry(sizeof(*args)); + args = mcs.args; + args->op.arg2.vcpumask = to_cpumask(args->mask); + + /* Remove us, and any offline CPUS. */ + cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); + + args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; + if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { + args->op.cmd = MMUEXT_INVLPG_MULTI; + args->op.arg1.linear_addr = start; + } + + MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} + +static unsigned long xen_read_cr3(void) +{ + return this_cpu_read(xen_cr3); +} + +static void set_current_cr3(void *v) +{ + this_cpu_write(xen_current_cr3, (unsigned long)v); +} + +static void __xen_write_cr3(bool kernel, unsigned long cr3) +{ + struct mmuext_op op; + unsigned long mfn; + + trace_xen_mmu_write_cr3(kernel, cr3); + + if (cr3) + mfn = pfn_to_mfn(PFN_DOWN(cr3)); + else + mfn = 0; + + WARN_ON(mfn == 0 && kernel); + + op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; + op.arg1.mfn = mfn; + + xen_extend_mmuext_op(&op); + + if (kernel) { + this_cpu_write(xen_cr3, cr3); + + /* Update xen_current_cr3 once the batch has actually + been submitted. */ + xen_mc_callback(set_current_cr3, (void *)cr3); + } +} +static void xen_write_cr3(unsigned long cr3) +{ + BUG_ON(preemptible()); + + xen_mc_batch(); /* disables interrupts */ + + /* Update while interrupts are disabled, so its atomic with + respect to ipis */ + this_cpu_write(xen_cr3, cr3); + + __xen_write_cr3(true, cr3); + +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); + if (user_pgd) + __xen_write_cr3(false, __pa(user_pgd)); + else + __xen_write_cr3(false, 0); + } +#endif + + xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ +} + +#ifdef CONFIG_X86_64 +/* + * At the start of the day - when Xen launches a guest, it has already + * built pagetables for the guest. We diligently look over them + * in xen_setup_kernel_pagetable and graft as appropriate them in the + * init_level4_pgt and its friends. Then when we are happy we load + * the new init_level4_pgt - and continue on. + * + * The generic code starts (start_kernel) and 'init_mem_mapping' sets + * up the rest of the pagetables. When it has completed it loads the cr3. + * N.B. that baremetal would start at 'start_kernel' (and the early + * #PF handler would create bootstrap pagetables) - so we are running + * with the same assumptions as what to do when write_cr3 is executed + * at this point. + * + * Since there are no user-page tables at all, we have two variants + * of xen_write_cr3 - the early bootup (this one), and the late one + * (xen_write_cr3). The reason we have to do that is that in 64-bit + * the Linux kernel and user-space are both in ring 3 while the + * hypervisor is in ring 0. + */ +static void __init xen_write_cr3_init(unsigned long cr3) +{ + BUG_ON(preemptible()); + + xen_mc_batch(); /* disables interrupts */ + + /* Update while interrupts are disabled, so its atomic with + respect to ipis */ + this_cpu_write(xen_cr3, cr3); + + __xen_write_cr3(true, cr3); + + xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ +} +#endif + +static int xen_pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd = mm->pgd; + int ret = 0; + + BUG_ON(PagePinned(virt_to_page(pgd))); + +#ifdef CONFIG_X86_64 + { + struct page *page = virt_to_page(pgd); + pgd_t *user_pgd; + + BUG_ON(page->private != 0); + + ret = -ENOMEM; + + user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + page->private = (unsigned long)user_pgd; + + if (user_pgd != NULL) { +#ifdef CONFIG_X86_VSYSCALL_EMULATION + user_pgd[pgd_index(VSYSCALL_ADDR)] = + __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); +#endif + ret = 0; + } + + BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); + } +#endif + return ret; +} + +static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ +#ifdef CONFIG_X86_64 + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + if (user_pgd) + free_page((unsigned long)user_pgd); +#endif +} + +/* + * Init-time set_pte while constructing initial pagetables, which + * doesn't allow RO page table pages to be remapped RW. + * + * If there is no MFN for this PFN then this page is initially + * ballooned out so clear the PTE (as in decrease_reservation() in + * drivers/xen/balloon.c). + * + * Many of these PTE updates are done on unpinned and writable pages + * and doing a hypercall for these is unnecessary and expensive. At + * this point it is not possible to tell if a page is pinned or not, + * so always write the PTE directly and rely on Xen trapping and + * emulating any updates as necessary. + */ +__visible pte_t xen_make_pte_init(pteval_t pte) +{ +#ifdef CONFIG_X86_64 + unsigned long pfn; + + /* + * Pages belonging to the initial p2m list mapped outside the default + * address range must be mapped read-only. This region contains the + * page tables for mapping the p2m list, too, and page tables MUST be + * mapped read-only. + */ + pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT; + if (xen_start_info->mfn_list < __START_KERNEL_map && + pfn >= xen_start_info->first_p2m_pfn && + pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames) + pte &= ~_PAGE_RW; +#endif + pte = pte_pfn_to_mfn(pte); + return native_make_pte(pte); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init); + +static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) +{ +#ifdef CONFIG_X86_32 + /* If there's an existing pte, then don't allow _PAGE_RW to be set */ + if (pte_mfn(pte) != INVALID_P2M_ENTRY + && pte_val_ma(*ptep) & _PAGE_PRESENT) + pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & + pte_val_ma(pte)); +#endif + native_set_pte(ptep, pte); +} + +/* Early in boot, while setting up the initial pagetable, assume + everything is pinned. */ +static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) +{ +#ifdef CONFIG_FLATMEM + BUG_ON(mem_map); /* should only be used early */ +#endif + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); +} + +/* Used for pmd and pud */ +static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) +{ +#ifdef CONFIG_FLATMEM + BUG_ON(mem_map); /* should only be used early */ +#endif + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); +} + +/* Early release_pte assumes that all pts are pinned, since there's + only init_mm and anything attached to that is pinned. */ +static void __init xen_release_pte_init(unsigned long pfn) +{ + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); +} + +static void __init xen_release_pmd_init(unsigned long pfn) +{ + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); +} + +static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +{ + struct multicall_space mcs; + struct mmuext_op *op; + + mcs = __xen_mc_entry(sizeof(*op)); + op = mcs.args; + op->cmd = cmd; + op->arg1.mfn = pfn_to_mfn(pfn); + + MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); +} + +static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot) +{ + struct multicall_space mcs; + unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT); + + mcs = __xen_mc_entry(0); + MULTI_update_va_mapping(mcs.mc, (unsigned long)addr, + pfn_pte(pfn, prot), 0); +} + +/* This needs to make sure the new pte page is pinned iff its being + attached to a pinned pagetable. */ +static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, + unsigned level) +{ + bool pinned = PagePinned(virt_to_page(mm->pgd)); + + trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned); + + if (pinned) { + struct page *page = pfn_to_page(pfn); + + SetPagePinned(page); + + if (!PageHighMem(page)) { + xen_mc_batch(); + + __set_pfn_prot(pfn, PAGE_KERNEL_RO); + + if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) + __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + } else { + /* make sure there are no stray mappings of + this page */ + kmap_flush_unused(); + } + } +} + +static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) +{ + xen_alloc_ptpage(mm, pfn, PT_PTE); +} + +static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) +{ + xen_alloc_ptpage(mm, pfn, PT_PMD); +} + +/* This should never happen until we're OK to use struct page */ +static inline void xen_release_ptpage(unsigned long pfn, unsigned level) +{ + struct page *page = pfn_to_page(pfn); + bool pinned = PagePinned(page); + + trace_xen_mmu_release_ptpage(pfn, level, pinned); + + if (pinned) { + if (!PageHighMem(page)) { + xen_mc_batch(); + + if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) + __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); + + __set_pfn_prot(pfn, PAGE_KERNEL); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + } + ClearPagePinned(page); + } +} + +static void xen_release_pte(unsigned long pfn) +{ + xen_release_ptpage(pfn, PT_PTE); +} + +static void xen_release_pmd(unsigned long pfn) +{ + xen_release_ptpage(pfn, PT_PMD); +} + +#if CONFIG_PGTABLE_LEVELS >= 4 +static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) +{ + xen_alloc_ptpage(mm, pfn, PT_PUD); +} + +static void xen_release_pud(unsigned long pfn) +{ + xen_release_ptpage(pfn, PT_PUD); +} +#endif + +void __init xen_reserve_top(void) +{ +#ifdef CONFIG_X86_32 + unsigned long top = HYPERVISOR_VIRT_START; + struct xen_platform_parameters pp; + + if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) + top = pp.virt_start; + + reserve_top_address(-top); +#endif /* CONFIG_X86_32 */ +} + +/* + * Like __va(), but returns address in the kernel mapping (which is + * all we have until the physical memory mapping has been set up. + */ +static void * __init __ka(phys_addr_t paddr) +{ +#ifdef CONFIG_X86_64 + return (void *)(paddr + __START_KERNEL_map); +#else + return __va(paddr); +#endif +} + +/* Convert a machine address to physical address */ +static unsigned long __init m2p(phys_addr_t maddr) +{ + phys_addr_t paddr; + + maddr &= PTE_PFN_MASK; + paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; + + return paddr; +} + +/* Convert a machine address to kernel virtual */ +static void * __init m2v(phys_addr_t maddr) +{ + return __ka(m2p(maddr)); +} + +/* Set the page permissions on an identity-mapped pages */ +static void __init set_page_prot_flags(void *addr, pgprot_t prot, + unsigned long flags) +{ + unsigned long pfn = __pa(addr) >> PAGE_SHIFT; + pte_t pte = pfn_pte(pfn, prot); + + if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) + BUG(); +} +static void __init set_page_prot(void *addr, pgprot_t prot) +{ + return set_page_prot_flags(addr, prot, UVMF_NONE); +} +#ifdef CONFIG_X86_32 +static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) +{ + unsigned pmdidx, pteidx; + unsigned ident_pte; + unsigned long pfn; + + level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES, + PAGE_SIZE); + + ident_pte = 0; + pfn = 0; + for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { + pte_t *pte_page; + + /* Reuse or allocate a page of ptes */ + if (pmd_present(pmd[pmdidx])) + pte_page = m2v(pmd[pmdidx].pmd); + else { + /* Check for free pte pages */ + if (ident_pte == LEVEL1_IDENT_ENTRIES) + break; + + pte_page = &level1_ident_pgt[ident_pte]; + ident_pte += PTRS_PER_PTE; + + pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); + } + + /* Install mappings */ + for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { + pte_t pte; + + if (pfn > max_pfn_mapped) + max_pfn_mapped = pfn; + + if (!pte_none(pte_page[pteidx])) + continue; + + pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); + pte_page[pteidx] = pte; + } + } + + for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) + set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); + + set_page_prot(pmd, PAGE_KERNEL_RO); +} +#endif +void __init xen_setup_machphys_mapping(void) +{ + struct xen_machphys_mapping mapping; + + if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { + machine_to_phys_mapping = (unsigned long *)mapping.v_start; + machine_to_phys_nr = mapping.max_mfn + 1; + } else { + machine_to_phys_nr = MACH2PHYS_NR_ENTRIES; + } +#ifdef CONFIG_X86_32 + WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1)) + < machine_to_phys_mapping); +#endif +} + +#ifdef CONFIG_X86_64 +static void __init convert_pfn_mfn(void *v) +{ + pte_t *pte = v; + int i; + + /* All levels are converted the same way, so just treat them + as ptes. */ + for (i = 0; i < PTRS_PER_PTE; i++) + pte[i] = xen_make_pte(pte[i].pte); +} +static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, + unsigned long addr) +{ + if (*pt_base == PFN_DOWN(__pa(addr))) { + set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); + clear_page((void *)addr); + (*pt_base)++; + } + if (*pt_end == PFN_DOWN(__pa(addr))) { + set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); + clear_page((void *)addr); + (*pt_end)--; + } +} +/* + * Set up the initial kernel pagetable. + * + * We can construct this by grafting the Xen provided pagetable into + * head_64.S's preconstructed pagetables. We copy the Xen L2's into + * level2_ident_pgt, and level2_kernel_pgt. This means that only the + * kernel has a physical mapping to start with - but that's enough to + * get __va working. We need to fill in the rest of the physical + * mapping once some sort of allocator has been set up. + */ +void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) +{ + pud_t *l3; + pmd_t *l2; + unsigned long addr[3]; + unsigned long pt_base, pt_end; + unsigned i; + + /* max_pfn_mapped is the last pfn mapped in the initial memory + * mappings. Considering that on Xen after the kernel mappings we + * have the mappings of some pages that don't exist in pfn space, we + * set max_pfn_mapped to the last real pfn mapped. */ + if (xen_start_info->mfn_list < __START_KERNEL_map) + max_pfn_mapped = xen_start_info->first_p2m_pfn; + else + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); + + pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); + pt_end = pt_base + xen_start_info->nr_pt_frames; + + /* Zap identity mapping */ + init_level4_pgt[0] = __pgd(0); + + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* Pre-constructed entries are in pfn, so convert to mfn */ + /* L4[272] -> level3_ident_pgt + * L4[511] -> level3_kernel_pgt */ + convert_pfn_mfn(init_level4_pgt); + + /* L3_i[0] -> level2_ident_pgt */ + convert_pfn_mfn(level3_ident_pgt); + /* L3_k[510] -> level2_kernel_pgt + * L3_k[511] -> level2_fixmap_pgt */ + convert_pfn_mfn(level3_kernel_pgt); + + /* L3_k[511][506] -> level1_fixmap_pgt */ + convert_pfn_mfn(level2_fixmap_pgt); + } + /* We get [511][511] and have Xen's version of level2_kernel_pgt */ + l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); + l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); + + addr[0] = (unsigned long)pgd; + addr[1] = (unsigned long)l3; + addr[2] = (unsigned long)l2; + /* Graft it onto L4[272][0]. Note that we creating an aliasing problem: + * Both L4[272][0] and L4[511][510] have entries that point to the same + * L2 (PMD) tables. Meaning that if you modify it in __va space + * it will be also modified in the __ka space! (But if you just + * modify the PMD table to point to other PTE's or none, then you + * are OK - which is what cleanup_highmap does) */ + copy_page(level2_ident_pgt, l2); + /* Graft it onto L4[511][510] */ + copy_page(level2_kernel_pgt, l2); + + /* Copy the initial P->M table mappings if necessary. */ + i = pgd_index(xen_start_info->mfn_list); + if (i && i < pgd_index(__START_KERNEL_map)) + init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i]; + + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* Make pagetable pieces RO */ + set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); + set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); + set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO); + + /* Pin down new L4 */ + pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, + PFN_DOWN(__pa_symbol(init_level4_pgt))); + + /* Unpin Xen-provided one */ + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); + + /* + * At this stage there can be no user pgd, and no page + * structure to attach it to, so make sure we just set kernel + * pgd. + */ + xen_mc_batch(); + __xen_write_cr3(true, __pa(init_level4_pgt)); + xen_mc_issue(PARAVIRT_LAZY_CPU); + } else + native_write_cr3(__pa(init_level4_pgt)); + + /* We can't that easily rip out L3 and L2, as the Xen pagetables are + * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for + * the initial domain. For guests using the toolstack, they are in: + * [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only + * rip out the [L4] (pgd), but for guests we shave off three pages. + */ + for (i = 0; i < ARRAY_SIZE(addr); i++) + check_pt_base(&pt_base, &pt_end, addr[i]); + + /* Our (by three pages) smaller Xen pagetable that we are using */ + xen_pt_base = PFN_PHYS(pt_base); + xen_pt_size = (pt_end - pt_base) * PAGE_SIZE; + memblock_reserve(xen_pt_base, xen_pt_size); + + /* Revector the xen_start_info */ + xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); +} + +/* + * Read a value from a physical address. + */ +static unsigned long __init xen_read_phys_ulong(phys_addr_t addr) +{ + unsigned long *vaddr; + unsigned long val; + + vaddr = early_memremap_ro(addr, sizeof(val)); + val = *vaddr; + early_memunmap(vaddr, sizeof(val)); + return val; +} + +/* + * Translate a virtual address to a physical one without relying on mapped + * page tables. + */ +static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) +{ + phys_addr_t pa; + pgd_t pgd; + pud_t pud; + pmd_t pmd; + pte_t pte; + + pa = read_cr3(); + pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) * + sizeof(pgd))); + if (!pgd_present(pgd)) + return 0; + + pa = pgd_val(pgd) & PTE_PFN_MASK; + pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) * + sizeof(pud))); + if (!pud_present(pud)) + return 0; + pa = pud_pfn(pud) << PAGE_SHIFT; + if (pud_large(pud)) + return pa + (vaddr & ~PUD_MASK); + + pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) * + sizeof(pmd))); + if (!pmd_present(pmd)) + return 0; + pa = pmd_pfn(pmd) << PAGE_SHIFT; + if (pmd_large(pmd)) + return pa + (vaddr & ~PMD_MASK); + + pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) * + sizeof(pte))); + if (!pte_present(pte)) + return 0; + pa = pte_pfn(pte) << PAGE_SHIFT; + + return pa | (vaddr & ~PAGE_MASK); +} + +/* + * Find a new area for the hypervisor supplied p2m list and relocate the p2m to + * this area. + */ +void __init xen_relocate_p2m(void) +{ + phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys; + unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end; + int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d; + pte_t *pt; + pmd_t *pmd; + pud_t *pud; + p4d_t *p4d = NULL; + pgd_t *pgd; + unsigned long *new_p2m; + int save_pud; + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT; + n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT; + n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT; + n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT; + if (PTRS_PER_P4D > 1) + n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; + else + n_p4d = 0; + n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d; + + new_area = xen_find_free_area(PFN_PHYS(n_frames)); + if (!new_area) { + xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n"); + BUG(); + } + + /* + * Setup the page tables for addressing the new p2m list. + * We have asked the hypervisor to map the p2m list at the user address + * PUD_SIZE. It may have done so, or it may have used a kernel space + * address depending on the Xen version. + * To avoid any possible virtual address collision, just use + * 2 * PUD_SIZE for the new area. + */ + p4d_phys = new_area; + pud_phys = p4d_phys + PFN_PHYS(n_p4d); + pmd_phys = pud_phys + PFN_PHYS(n_pud); + pt_phys = pmd_phys + PFN_PHYS(n_pmd); + p2m_pfn = PFN_DOWN(pt_phys) + n_pt; + + pgd = __va(read_cr3()); + new_p2m = (unsigned long *)(2 * PGDIR_SIZE); + idx_p4d = 0; + save_pud = n_pud; + do { + if (n_p4d > 0) { + p4d = early_memremap(p4d_phys, PAGE_SIZE); + clear_page(p4d); + n_pud = min(save_pud, PTRS_PER_P4D); + } + for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { + pud = early_memremap(pud_phys, PAGE_SIZE); + clear_page(pud); + for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); + idx_pmd++) { + pmd = early_memremap(pmd_phys, PAGE_SIZE); + clear_page(pmd); + for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); + idx_pt++) { + pt = early_memremap(pt_phys, PAGE_SIZE); + clear_page(pt); + for (idx_pte = 0; + idx_pte < min(n_pte, PTRS_PER_PTE); + idx_pte++) { + set_pte(pt + idx_pte, + pfn_pte(p2m_pfn, PAGE_KERNEL)); + p2m_pfn++; + } + n_pte -= PTRS_PER_PTE; + early_memunmap(pt, PAGE_SIZE); + make_lowmem_page_readonly(__va(pt_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, + PFN_DOWN(pt_phys)); + set_pmd(pmd + idx_pt, + __pmd(_PAGE_TABLE | pt_phys)); + pt_phys += PAGE_SIZE; + } + n_pt -= PTRS_PER_PMD; + early_memunmap(pmd, PAGE_SIZE); + make_lowmem_page_readonly(__va(pmd_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, + PFN_DOWN(pmd_phys)); + set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); + pmd_phys += PAGE_SIZE; + } + n_pmd -= PTRS_PER_PUD; + early_memunmap(pud, PAGE_SIZE); + make_lowmem_page_readonly(__va(pud_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); + if (n_p4d > 0) + set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys)); + else + set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); + pud_phys += PAGE_SIZE; + } + if (n_p4d > 0) { + save_pud -= PTRS_PER_P4D; + early_memunmap(p4d, PAGE_SIZE); + make_lowmem_page_readonly(__va(p4d_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys)); + set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys)); + p4d_phys += PAGE_SIZE; + } + } while (++idx_p4d < n_p4d); + + /* Now copy the old p2m info to the new area. */ + memcpy(new_p2m, xen_p2m_addr, size); + xen_p2m_addr = new_p2m; + + /* Release the old p2m list and set new list info. */ + p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list)); + BUG_ON(!p2m_pfn); + p2m_pfn_end = p2m_pfn + PFN_DOWN(size); + + if (xen_start_info->mfn_list < __START_KERNEL_map) { + pfn = xen_start_info->first_p2m_pfn; + pfn_end = xen_start_info->first_p2m_pfn + + xen_start_info->nr_p2m_frames; + set_pgd(pgd + 1, __pgd(0)); + } else { + pfn = p2m_pfn; + pfn_end = p2m_pfn_end; + } + + memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn)); + while (pfn < pfn_end) { + if (pfn == p2m_pfn) { + pfn = p2m_pfn_end; + continue; + } + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + pfn++; + } + + xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; + xen_start_info->first_p2m_pfn = PFN_DOWN(new_area); + xen_start_info->nr_p2m_frames = n_frames; +} + +#else /* !CONFIG_X86_64 */ +static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); +static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); + +static void __init xen_write_cr3_init(unsigned long cr3) +{ + unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); + + BUG_ON(read_cr3() != __pa(initial_page_table)); + BUG_ON(cr3 != __pa(swapper_pg_dir)); + + /* + * We are switching to swapper_pg_dir for the first time (from + * initial_page_table) and therefore need to mark that page + * read-only and then pin it. + * + * Xen disallows sharing of kernel PMDs for PAE + * guests. Therefore we must copy the kernel PMD from + * initial_page_table into a new kernel PMD to be used in + * swapper_pg_dir. + */ + swapper_kernel_pmd = + extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); + copy_page(swapper_kernel_pmd, initial_kernel_pmd); + swapper_pg_dir[KERNEL_PGD_BOUNDARY] = + __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT); + set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO); + + set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); + xen_write_cr3(cr3); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn); + + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, + PFN_DOWN(__pa(initial_page_table))); + set_page_prot(initial_page_table, PAGE_KERNEL); + set_page_prot(initial_kernel_pmd, PAGE_KERNEL); + + pv_mmu_ops.write_cr3 = &xen_write_cr3; +} + +/* + * For 32 bit domains xen_start_info->pt_base is the pgd address which might be + * not the first page table in the page table pool. + * Iterate through the initial page tables to find the real page table base. + */ +static phys_addr_t xen_find_pt_base(pmd_t *pmd) +{ + phys_addr_t pt_base, paddr; + unsigned pmdidx; + + pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd)); + + for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) + if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) { + paddr = m2p(pmd[pmdidx].pmd); + pt_base = min(pt_base, paddr); + } + + return pt_base; +} + +void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) +{ + pmd_t *kernel_pmd; + + kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); + + xen_pt_base = xen_find_pt_base(kernel_pmd); + xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE; + + initial_kernel_pmd = + extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); + + max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024); + + copy_page(initial_kernel_pmd, kernel_pmd); + + xen_map_identity_early(initial_kernel_pmd, max_pfn); + + copy_page(initial_page_table, pgd); + initial_page_table[KERNEL_PGD_BOUNDARY] = + __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT); + + set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO); + set_page_prot(initial_page_table, PAGE_KERNEL_RO); + set_page_prot(empty_zero_page, PAGE_KERNEL_RO); + + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); + + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, + PFN_DOWN(__pa(initial_page_table))); + xen_write_cr3(__pa(initial_page_table)); + + memblock_reserve(xen_pt_base, xen_pt_size); +} +#endif /* CONFIG_X86_64 */ + +void __init xen_reserve_special_pages(void) +{ + phys_addr_t paddr; + + memblock_reserve(__pa(xen_start_info), PAGE_SIZE); + if (xen_start_info->store_mfn) { + paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn)); + memblock_reserve(paddr, PAGE_SIZE); + } + if (!xen_initial_domain()) { + paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn)); + memblock_reserve(paddr, PAGE_SIZE); + } +} + +void __init xen_pt_check_e820(void) +{ + if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) { + xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n"); + BUG(); + } +} + +static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; + +static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) +{ + pte_t pte; + + phys >>= PAGE_SHIFT; + + switch (idx) { + case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: + case FIX_RO_IDT: +#ifdef CONFIG_X86_32 + case FIX_WP_TEST: +# ifdef CONFIG_HIGHMEM + case FIX_KMAP_BEGIN ... FIX_KMAP_END: +# endif +#elif defined(CONFIG_X86_VSYSCALL_EMULATION) + case VSYSCALL_PAGE: +#endif + case FIX_TEXT_POKE0: + case FIX_TEXT_POKE1: + case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END: + /* All local page mappings */ + pte = pfn_pte(phys, prot); + break; + +#ifdef CONFIG_X86_LOCAL_APIC + case FIX_APIC_BASE: /* maps dummy local APIC */ + pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); + break; +#endif + +#ifdef CONFIG_X86_IO_APIC + case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END: + /* + * We just don't map the IO APIC - all access is via + * hypercalls. Keep the address in the pte for reference. + */ + pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); + break; +#endif + + case FIX_PARAVIRT_BOOTMAP: + /* This is an MFN, but it isn't an IO mapping from the + IO domain */ + pte = mfn_pte(phys, prot); + break; + + default: + /* By default, set_fixmap is used for hardware mappings */ + pte = mfn_pte(phys, prot); + break; + } + + __native_set_fixmap(idx, pte); + +#ifdef CONFIG_X86_VSYSCALL_EMULATION + /* Replicate changes to map the vsyscall page into the user + pagetable vsyscall mapping. */ + if (idx == VSYSCALL_PAGE) { + unsigned long vaddr = __fix_to_virt(idx); + set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); + } +#endif +} + +static void __init xen_post_allocator_init(void) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + pv_mmu_ops.set_pte = xen_set_pte; + pv_mmu_ops.set_pmd = xen_set_pmd; + pv_mmu_ops.set_pud = xen_set_pud; +#if CONFIG_PGTABLE_LEVELS >= 4 + pv_mmu_ops.set_p4d = xen_set_p4d; +#endif + + /* This will work as long as patching hasn't happened yet + (which it hasn't) */ + pv_mmu_ops.alloc_pte = xen_alloc_pte; + pv_mmu_ops.alloc_pmd = xen_alloc_pmd; + pv_mmu_ops.release_pte = xen_release_pte; + pv_mmu_ops.release_pmd = xen_release_pmd; +#if CONFIG_PGTABLE_LEVELS >= 4 + pv_mmu_ops.alloc_pud = xen_alloc_pud; + pv_mmu_ops.release_pud = xen_release_pud; +#endif + pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte); + +#ifdef CONFIG_X86_64 + pv_mmu_ops.write_cr3 = &xen_write_cr3; + SetPagePinned(virt_to_page(level3_user_vsyscall)); +#endif + xen_mark_init_mm_pinned(); +} + +static void xen_leave_lazy_mmu(void) +{ + preempt_disable(); + xen_mc_flush(); + paravirt_leave_lazy_mmu(); + preempt_enable(); +} + +static const struct pv_mmu_ops xen_mmu_ops __initconst = { + .read_cr2 = xen_read_cr2, + .write_cr2 = xen_write_cr2, + + .read_cr3 = xen_read_cr3, + .write_cr3 = xen_write_cr3_init, + + .flush_tlb_user = xen_flush_tlb, + .flush_tlb_kernel = xen_flush_tlb, + .flush_tlb_single = xen_flush_tlb_single, + .flush_tlb_others = xen_flush_tlb_others, + + .pte_update = paravirt_nop, + + .pgd_alloc = xen_pgd_alloc, + .pgd_free = xen_pgd_free, + + .alloc_pte = xen_alloc_pte_init, + .release_pte = xen_release_pte_init, + .alloc_pmd = xen_alloc_pmd_init, + .release_pmd = xen_release_pmd_init, + + .set_pte = xen_set_pte_init, + .set_pte_at = xen_set_pte_at, + .set_pmd = xen_set_pmd_hyper, + + .ptep_modify_prot_start = __ptep_modify_prot_start, + .ptep_modify_prot_commit = __ptep_modify_prot_commit, + + .pte_val = PV_CALLEE_SAVE(xen_pte_val), + .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), + + .make_pte = PV_CALLEE_SAVE(xen_make_pte_init), + .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), + +#ifdef CONFIG_X86_PAE + .set_pte_atomic = xen_set_pte_atomic, + .pte_clear = xen_pte_clear, + .pmd_clear = xen_pmd_clear, +#endif /* CONFIG_X86_PAE */ + .set_pud = xen_set_pud_hyper, + + .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), + .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), + +#if CONFIG_PGTABLE_LEVELS >= 4 + .pud_val = PV_CALLEE_SAVE(xen_pud_val), + .make_pud = PV_CALLEE_SAVE(xen_make_pud), + .set_p4d = xen_set_p4d_hyper, + + .alloc_pud = xen_alloc_pmd_init, + .release_pud = xen_release_pmd_init, +#endif /* CONFIG_PGTABLE_LEVELS == 4 */ + + .activate_mm = xen_activate_mm, + .dup_mmap = xen_dup_mmap, + .exit_mmap = xen_exit_mmap, + + .lazy_mode = { + .enter = paravirt_enter_lazy_mmu, + .leave = xen_leave_lazy_mmu, + .flush = paravirt_flush_lazy_mmu, + }, + + .set_fixmap = xen_set_fixmap, +}; + +void __init xen_init_mmu_ops(void) +{ + x86_init.paging.pagetable_init = xen_pagetable_init; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + pv_mmu_ops = xen_mmu_ops; + + memset(dummy_mapping, 0xff, PAGE_SIZE); +} + +/* Protected by xen_reservation_lock. */ +#define MAX_CONTIG_ORDER 9 /* 2MB */ +static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER]; + +#define VOID_PTE (mfn_pte(0, __pgprot(0))) +static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, + unsigned long *in_frames, + unsigned long *out_frames) +{ + int i; + struct multicall_space mcs; + + xen_mc_batch(); + for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) { + mcs = __xen_mc_entry(0); + + if (in_frames) + in_frames[i] = virt_to_mfn(vaddr); + + MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); + __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); + + if (out_frames) + out_frames[i] = virt_to_pfn(vaddr); + } + xen_mc_issue(0); +} + +/* + * Update the pfn-to-mfn mappings for a virtual address range, either to + * point to an array of mfns, or contiguously from a single starting + * mfn. + */ +static void xen_remap_exchanged_ptes(unsigned long vaddr, int order, + unsigned long *mfns, + unsigned long first_mfn) +{ + unsigned i, limit; + unsigned long mfn; + + xen_mc_batch(); + + limit = 1u << order; + for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) { + struct multicall_space mcs; + unsigned flags; + + mcs = __xen_mc_entry(0); + if (mfns) + mfn = mfns[i]; + else + mfn = first_mfn + i; + + if (i < (limit - 1)) + flags = 0; + else { + if (order == 0) + flags = UVMF_INVLPG | UVMF_ALL; + else + flags = UVMF_TLB_FLUSH | UVMF_ALL; + } + + MULTI_update_va_mapping(mcs.mc, vaddr, + mfn_pte(mfn, PAGE_KERNEL), flags); + + set_phys_to_machine(virt_to_pfn(vaddr), mfn); + } + + xen_mc_issue(0); +} + +/* + * Perform the hypercall to exchange a region of our pfns to point to + * memory with the required contiguous alignment. Takes the pfns as + * input, and populates mfns as output. + * + * Returns a success code indicating whether the hypervisor was able to + * satisfy the request or not. + */ +static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in, + unsigned long *pfns_in, + unsigned long extents_out, + unsigned int order_out, + unsigned long *mfns_out, + unsigned int address_bits) +{ + long rc; + int success; + + struct xen_memory_exchange exchange = { + .in = { + .nr_extents = extents_in, + .extent_order = order_in, + .extent_start = pfns_in, + .domid = DOMID_SELF + }, + .out = { + .nr_extents = extents_out, + .extent_order = order_out, + .extent_start = mfns_out, + .address_bits = address_bits, + .domid = DOMID_SELF + } + }; + + BUG_ON(extents_in << order_in != extents_out << order_out); + + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); + success = (exchange.nr_exchanged == extents_in); + + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); + BUG_ON(success && (rc != 0)); + + return success; +} + +int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, + unsigned int address_bits, + dma_addr_t *dma_handle) +{ + unsigned long *in_frames = discontig_frames, out_frame; + unsigned long flags; + int success; + unsigned long vstart = (unsigned long)phys_to_virt(pstart); + + /* + * Currently an auto-translated guest will not perform I/O, nor will + * it require PAE page directories below 4GB. Therefore any calls to + * this function are redundant and can be ignored. + */ + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + if (unlikely(order > MAX_CONTIG_ORDER)) + return -ENOMEM; + + memset((void *) vstart, 0, PAGE_SIZE << order); + + spin_lock_irqsave(&xen_reservation_lock, flags); + + /* 1. Zap current PTEs, remembering MFNs. */ + xen_zap_pfn_range(vstart, order, in_frames, NULL); + + /* 2. Get a new contiguous memory extent. */ + out_frame = virt_to_pfn(vstart); + success = xen_exchange_memory(1UL << order, 0, in_frames, + 1, order, &out_frame, + address_bits); + + /* 3. Map the new extent in place of old pages. */ + if (success) + xen_remap_exchanged_ptes(vstart, order, NULL, out_frame); + else + xen_remap_exchanged_ptes(vstart, order, in_frames, 0); + + spin_unlock_irqrestore(&xen_reservation_lock, flags); + + *dma_handle = virt_to_machine(vstart).maddr; + return success ? 0 : -ENOMEM; +} +EXPORT_SYMBOL_GPL(xen_create_contiguous_region); + +void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) +{ + unsigned long *out_frames = discontig_frames, in_frame; + unsigned long flags; + int success; + unsigned long vstart; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + if (unlikely(order > MAX_CONTIG_ORDER)) + return; + + vstart = (unsigned long)phys_to_virt(pstart); + memset((void *) vstart, 0, PAGE_SIZE << order); + + spin_lock_irqsave(&xen_reservation_lock, flags); + + /* 1. Find start MFN of contiguous extent. */ + in_frame = virt_to_mfn(vstart); + + /* 2. Zap current PTEs. */ + xen_zap_pfn_range(vstart, order, NULL, out_frames); + + /* 3. Do the exchange for non-contiguous MFNs. */ + success = xen_exchange_memory(1, order, &in_frame, 1UL << order, + 0, out_frames, 0); + + /* 4. Map new pages in place of old pages. */ + if (success) + xen_remap_exchanged_ptes(vstart, order, out_frames, 0); + else + xen_remap_exchanged_ptes(vstart, order, NULL, in_frame); + + spin_unlock_irqrestore(&xen_reservation_lock, flags); +} +EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); + +#ifdef CONFIG_KEXEC_CORE +phys_addr_t paddr_vmcoreinfo_note(void) +{ + if (xen_pv_domain()) + return virt_to_machine(&vmcoreinfo_note).maddr; + else + return __pa_symbol(&vmcoreinfo_note); +} +#endif /* CONFIG_KEXEC_CORE */ diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h index af5f0ad94078..4be5355b56f7 100644 --- a/arch/x86/xen/pmu.h +++ b/arch/x86/xen/pmu.h @@ -4,8 +4,13 @@ #include <xen/interface/xenpmu.h> irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id); +#ifdef CONFIG_XEN_HAVE_VPMU void xen_pmu_init(int cpu); void xen_pmu_finish(int cpu); +#else +static inline void xen_pmu_init(int cpu) {} +static inline void xen_pmu_finish(int cpu) {} +#endif bool is_xen_pmu(int cpu); bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err); bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index eaa36162ed4a..82ac611f2fc1 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -1,63 +1,21 @@ -/* - * Xen SMP support - * - * This file implements the Xen versions of smp_ops. SMP under Xen is - * very straightforward. Bringing a CPU up is simply a matter of - * loading its initial context and setting it running. - * - * IPIs are handled through the Xen event mechanism. - * - * Because virtual CPUs can be scheduled onto any real CPU, there's no - * useful topology information for the kernel to make use of. As a - * result, all CPUs are treated as if they're single-core and - * single-threaded. - */ -#include <linux/sched.h> -#include <linux/err.h> -#include <linux/slab.h> #include <linux/smp.h> -#include <linux/irq_work.h> -#include <linux/tick.h> -#include <linux/nmi.h> - -#include <asm/paravirt.h> -#include <asm/desc.h> -#include <asm/pgtable.h> -#include <asm/cpu.h> - -#include <xen/interface/xen.h> -#include <xen/interface/vcpu.h> -#include <xen/interface/xenpmu.h> - -#include <asm/xen/interface.h> -#include <asm/xen/hypercall.h> +#include <linux/slab.h> +#include <linux/cpumask.h> +#include <linux/percpu.h> -#include <xen/xen.h> -#include <xen/page.h> #include <xen/events.h> #include <xen/hvc-console.h> #include "xen-ops.h" -#include "mmu.h" #include "smp.h" -#include "pmu.h" - -cpumask_var_t xen_cpu_initialized_map; -struct xen_common_irq { - int irq; - char *name; -}; static DEFINE_PER_CPU(struct xen_common_irq, xen_resched_irq) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 }; -static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 }; -static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 }; static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); -static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id); /* * Reschedule call back. @@ -70,42 +28,6 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static void cpu_bringup(void) -{ - int cpu; - - cpu_init(); - touch_softlockup_watchdog(); - preempt_disable(); - - /* PVH runs in ring 0 and allows us to do native syscalls. Yay! */ - if (!xen_feature(XENFEAT_supervisor_mode_kernel)) { - xen_enable_sysenter(); - xen_enable_syscall(); - } - cpu = smp_processor_id(); - smp_store_cpu_info(cpu); - cpu_data(cpu).x86_max_cores = 1; - set_cpu_sibling_map(cpu); - - xen_setup_cpu_clockevents(); - - notify_cpu_starting(cpu); - - set_cpu_online(cpu, true); - - cpu_set_state_online(cpu); /* Implies full memory barrier. */ - - /* We can take interrupts now: we're officially "up". */ - local_irq_enable(); -} - -asmlinkage __visible void cpu_bringup_and_idle(void) -{ - cpu_bringup(); - cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); -} - void xen_smp_intr_free(unsigned int cpu) { if (per_cpu(xen_resched_irq, cpu).irq >= 0) { @@ -133,27 +55,12 @@ void xen_smp_intr_free(unsigned int cpu) kfree(per_cpu(xen_callfuncsingle_irq, cpu).name); per_cpu(xen_callfuncsingle_irq, cpu).name = NULL; } - if (xen_hvm_domain()) - return; - - if (per_cpu(xen_irq_work, cpu).irq >= 0) { - unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL); - per_cpu(xen_irq_work, cpu).irq = -1; - kfree(per_cpu(xen_irq_work, cpu).name); - per_cpu(xen_irq_work, cpu).name = NULL; - } +} - if (per_cpu(xen_pmu_irq, cpu).irq >= 0) { - unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL); - per_cpu(xen_pmu_irq, cpu).irq = -1; - kfree(per_cpu(xen_pmu_irq, cpu).name); - per_cpu(xen_pmu_irq, cpu).name = NULL; - } -}; int xen_smp_intr_init(unsigned int cpu) { int rc; - char *resched_name, *callfunc_name, *debug_name, *pmu_name; + char *resched_name, *callfunc_name, *debug_name; resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, @@ -200,37 +107,6 @@ int xen_smp_intr_init(unsigned int cpu) per_cpu(xen_callfuncsingle_irq, cpu).irq = rc; per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name; - /* - * The IRQ worker on PVHVM goes through the native path and uses the - * IPI mechanism. - */ - if (xen_hvm_domain()) - return 0; - - callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu); - rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR, - cpu, - xen_irq_work_interrupt, - IRQF_PERCPU|IRQF_NOBALANCING, - callfunc_name, - NULL); - if (rc < 0) - goto fail; - per_cpu(xen_irq_work, cpu).irq = rc; - per_cpu(xen_irq_work, cpu).name = callfunc_name; - - if (is_xen_pmu(cpu)) { - pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu); - rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu, - xen_pmu_irq_handler, - IRQF_PERCPU|IRQF_NOBALANCING, - pmu_name, NULL); - if (rc < 0) - goto fail; - per_cpu(xen_pmu_irq, cpu).irq = rc; - per_cpu(xen_pmu_irq, cpu).name = pmu_name; - } - return 0; fail: @@ -238,333 +114,7 @@ int xen_smp_intr_init(unsigned int cpu) return rc; } -static void __init xen_fill_possible_map(void) -{ - int i, rc; - - if (xen_initial_domain()) - return; - - for (i = 0; i < nr_cpu_ids; i++) { - rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); - if (rc >= 0) { - num_processors++; - set_cpu_possible(i, true); - } - } -} - -static void __init xen_filter_cpu_maps(void) -{ - int i, rc; - unsigned int subtract = 0; - - if (!xen_initial_domain()) - return; - - num_processors = 0; - disabled_cpus = 0; - for (i = 0; i < nr_cpu_ids; i++) { - rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); - if (rc >= 0) { - num_processors++; - set_cpu_possible(i, true); - } else { - set_cpu_possible(i, false); - set_cpu_present(i, false); - subtract++; - } - } -#ifdef CONFIG_HOTPLUG_CPU - /* This is akin to using 'nr_cpus' on the Linux command line. - * Which is OK as when we use 'dom0_max_vcpus=X' we can only - * have up to X, while nr_cpu_ids is greater than X. This - * normally is not a problem, except when CPU hotplugging - * is involved and then there might be more than X CPUs - * in the guest - which will not work as there is no - * hypercall to expand the max number of VCPUs an already - * running guest has. So cap it up to X. */ - if (subtract) - nr_cpu_ids = nr_cpu_ids - subtract; -#endif - -} - -static void __init xen_smp_prepare_boot_cpu(void) -{ - BUG_ON(smp_processor_id() != 0); - native_smp_prepare_boot_cpu(); - - if (xen_pv_domain()) { - if (!xen_feature(XENFEAT_writable_page_tables)) - /* We've switched to the "real" per-cpu gdt, so make - * sure the old memory can be recycled. */ - make_lowmem_page_readwrite(xen_initial_gdt); - -#ifdef CONFIG_X86_32 - /* - * Xen starts us with XEN_FLAT_RING1_DS, but linux code - * expects __USER_DS - */ - loadsegment(ds, __USER_DS); - loadsegment(es, __USER_DS); -#endif - - xen_filter_cpu_maps(); - xen_setup_vcpu_info_placement(); - } - - /* - * Setup vcpu_info for boot CPU. - */ - if (xen_hvm_domain()) - xen_vcpu_setup(0); - - /* - * The alternative logic (which patches the unlock/lock) runs before - * the smp bootup up code is activated. Hence we need to set this up - * the core kernel is being patched. Otherwise we will have only - * modules patched but not core code. - */ - xen_init_spinlocks(); -} - -static void __init xen_smp_prepare_cpus(unsigned int max_cpus) -{ - unsigned cpu; - unsigned int i; - - if (skip_ioapic_setup) { - char *m = (max_cpus == 0) ? - "The nosmp parameter is incompatible with Xen; " \ - "use Xen dom0_max_vcpus=1 parameter" : - "The noapic parameter is incompatible with Xen"; - - xen_raw_printk(m); - panic(m); - } - xen_init_lock_cpu(0); - - smp_store_boot_cpu_info(); - cpu_data(0).x86_max_cores = 1; - - for_each_possible_cpu(i) { - zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); - } - set_cpu_sibling_map(0); - - xen_pmu_init(0); - - if (xen_smp_intr_init(0)) - BUG(); - - if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL)) - panic("could not allocate xen_cpu_initialized_map\n"); - - cpumask_copy(xen_cpu_initialized_map, cpumask_of(0)); - - /* Restrict the possible_map according to max_cpus. */ - while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { - for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--) - continue; - set_cpu_possible(cpu, false); - } - - for_each_possible_cpu(cpu) - set_cpu_present(cpu, true); -} - -static int -cpu_initialize_context(unsigned int cpu, struct task_struct *idle) -{ - struct vcpu_guest_context *ctxt; - struct desc_struct *gdt; - unsigned long gdt_mfn; - - /* used to tell cpu_init() that it can proceed with initialization */ - cpumask_set_cpu(cpu, cpu_callout_mask); - if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) - return 0; - - ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); - if (ctxt == NULL) - return -ENOMEM; - - gdt = get_cpu_gdt_rw(cpu); - -#ifdef CONFIG_X86_32 - ctxt->user_regs.fs = __KERNEL_PERCPU; - ctxt->user_regs.gs = __KERNEL_STACK_CANARY; -#endif - memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); - - ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; - ctxt->flags = VGCF_IN_KERNEL; - ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ - ctxt->user_regs.ds = __USER_DS; - ctxt->user_regs.es = __USER_DS; - ctxt->user_regs.ss = __KERNEL_DS; - - xen_copy_trap_info(ctxt->trap_ctxt); - - ctxt->ldt_ents = 0; - - BUG_ON((unsigned long)gdt & ~PAGE_MASK); - - gdt_mfn = arbitrary_virt_to_mfn(gdt); - make_lowmem_page_readonly(gdt); - make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); - - ctxt->gdt_frames[0] = gdt_mfn; - ctxt->gdt_ents = GDT_ENTRIES; - - ctxt->kernel_ss = __KERNEL_DS; - ctxt->kernel_sp = idle->thread.sp0; - -#ifdef CONFIG_X86_32 - ctxt->event_callback_cs = __KERNEL_CS; - ctxt->failsafe_callback_cs = __KERNEL_CS; -#else - ctxt->gs_base_kernel = per_cpu_offset(cpu); -#endif - ctxt->event_callback_eip = - (unsigned long)xen_hypervisor_callback; - ctxt->failsafe_callback_eip = - (unsigned long)xen_failsafe_callback; - ctxt->user_regs.cs = __KERNEL_CS; - per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); - - ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); - ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir)); - if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt)) - BUG(); - - kfree(ctxt); - return 0; -} - -static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) -{ - int rc; - - common_cpu_up(cpu, idle); - - xen_setup_runstate_info(cpu); - - /* - * PV VCPUs are always successfully taken down (see 'while' loop - * in xen_cpu_die()), so -EBUSY is an error. - */ - rc = cpu_check_up_prepare(cpu); - if (rc) - return rc; - - /* make sure interrupts start blocked */ - per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; - - rc = cpu_initialize_context(cpu, idle); - if (rc) - return rc; - - xen_pmu_init(cpu); - - rc = HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL); - BUG_ON(rc); - - while (cpu_report_state(cpu) != CPU_ONLINE) - HYPERVISOR_sched_op(SCHEDOP_yield, NULL); - - return 0; -} - -static void xen_smp_cpus_done(unsigned int max_cpus) -{ -} - -#ifdef CONFIG_HOTPLUG_CPU -static int xen_cpu_disable(void) -{ - unsigned int cpu = smp_processor_id(); - if (cpu == 0) - return -EBUSY; - - cpu_disable_common(); - - load_cr3(swapper_pg_dir); - return 0; -} - -static void xen_cpu_die(unsigned int cpu) -{ - while (xen_pv_domain() && HYPERVISOR_vcpu_op(VCPUOP_is_up, - xen_vcpu_nr(cpu), NULL)) { - __set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ/10); - } - - if (common_cpu_die(cpu) == 0) { - xen_smp_intr_free(cpu); - xen_uninit_lock_cpu(cpu); - xen_teardown_timer(cpu); - xen_pmu_finish(cpu); - } -} - -static void xen_play_dead(void) /* used only with HOTPLUG_CPU */ -{ - play_dead_common(); - HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(smp_processor_id()), NULL); - cpu_bringup(); - /* - * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down) - * clears certain data that the cpu_idle loop (which called us - * and that we return from) expects. The only way to get that - * data back is to call: - */ - tick_nohz_idle_enter(); - - cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); -} - -#else /* !CONFIG_HOTPLUG_CPU */ -static int xen_cpu_disable(void) -{ - return -ENOSYS; -} - -static void xen_cpu_die(unsigned int cpu) -{ - BUG(); -} - -static void xen_play_dead(void) -{ - BUG(); -} - -#endif -static void stop_self(void *v) -{ - int cpu = smp_processor_id(); - - /* make sure we're not pinning something down */ - load_cr3(swapper_pg_dir); - /* should set up a minimal gdt */ - - set_cpu_online(cpu, false); - - HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL); - BUG(); -} - -static void xen_stop_other_cpus(int wait) -{ - smp_call_function(stop_self, NULL, wait); -} - -static void xen_smp_send_reschedule(int cpu) +void xen_smp_send_reschedule(int cpu) { xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); } @@ -578,7 +128,7 @@ static void __xen_send_IPI_mask(const struct cpumask *mask, xen_send_IPI_one(cpu, vector); } -static void xen_smp_send_call_function_ipi(const struct cpumask *mask) +void xen_smp_send_call_function_ipi(const struct cpumask *mask) { int cpu; @@ -593,7 +143,7 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask) } } -static void xen_smp_send_call_function_single_ipi(int cpu) +void xen_smp_send_call_function_single_ipi(int cpu) { __xen_send_IPI_mask(cpumask_of(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); @@ -698,54 +248,3 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } - -static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id) -{ - irq_enter(); - irq_work_run(); - inc_irq_stat(apic_irq_work_irqs); - irq_exit(); - - return IRQ_HANDLED; -} - -static const struct smp_ops xen_smp_ops __initconst = { - .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, - .smp_prepare_cpus = xen_smp_prepare_cpus, - .smp_cpus_done = xen_smp_cpus_done, - - .cpu_up = xen_cpu_up, - .cpu_die = xen_cpu_die, - .cpu_disable = xen_cpu_disable, - .play_dead = xen_play_dead, - - .stop_other_cpus = xen_stop_other_cpus, - .smp_send_reschedule = xen_smp_send_reschedule, - - .send_call_func_ipi = xen_smp_send_call_function_ipi, - .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi, -}; - -void __init xen_smp_init(void) -{ - smp_ops = xen_smp_ops; - xen_fill_possible_map(); -} - -static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) -{ - native_smp_prepare_cpus(max_cpus); - WARN_ON(xen_smp_intr_init(0)); - - xen_init_lock_cpu(0); -} - -void __init xen_hvm_smp_init(void) -{ - smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; - smp_ops.smp_send_reschedule = xen_smp_send_reschedule; - smp_ops.cpu_die = xen_cpu_die; - smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; - smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; - smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu; -} diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h index 9beef333584a..8ebb6acca64a 100644 --- a/arch/x86/xen/smp.h +++ b/arch/x86/xen/smp.h @@ -11,7 +11,17 @@ extern void xen_send_IPI_self(int vector); extern int xen_smp_intr_init(unsigned int cpu); extern void xen_smp_intr_free(unsigned int cpu); +int xen_smp_intr_init_pv(unsigned int cpu); +void xen_smp_intr_free_pv(unsigned int cpu); +void xen_smp_send_reschedule(int cpu); +void xen_smp_send_call_function_ipi(const struct cpumask *mask); +void xen_smp_send_call_function_single_ipi(int cpu); + +struct xen_common_irq { + int irq; + char *name; +}; #else /* CONFIG_SMP */ static inline int xen_smp_intr_init(unsigned int cpu) @@ -19,6 +29,12 @@ static inline int xen_smp_intr_init(unsigned int cpu) return 0; } static inline void xen_smp_intr_free(unsigned int cpu) {} + +static inline int xen_smp_intr_init_pv(unsigned int cpu) +{ + return 0; +} +static inline void xen_smp_intr_free_pv(unsigned int cpu) {} #endif /* CONFIG_SMP */ #endif diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c new file mode 100644 index 000000000000..f18561bbf5c9 --- /dev/null +++ b/arch/x86/xen/smp_hvm.c @@ -0,0 +1,63 @@ +#include <asm/smp.h> + +#include <xen/events.h> + +#include "xen-ops.h" +#include "smp.h" + + +static void __init xen_hvm_smp_prepare_boot_cpu(void) +{ + BUG_ON(smp_processor_id() != 0); + native_smp_prepare_boot_cpu(); + + /* + * Setup vcpu_info for boot CPU. + */ + xen_vcpu_setup(0); + + /* + * The alternative logic (which patches the unlock/lock) runs before + * the smp bootup up code is activated. Hence we need to set this up + * the core kernel is being patched. Otherwise we will have only + * modules patched but not core code. + */ + xen_init_spinlocks(); +} + +static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) +{ + native_smp_prepare_cpus(max_cpus); + WARN_ON(xen_smp_intr_init(0)); + + xen_init_lock_cpu(0); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void xen_hvm_cpu_die(unsigned int cpu) +{ + if (common_cpu_die(cpu) == 0) { + xen_smp_intr_free(cpu); + xen_uninit_lock_cpu(cpu); + xen_teardown_timer(cpu); + } +} +#else +static void xen_hvm_cpu_die(unsigned int cpu) +{ + BUG(); +} +#endif + +void __init xen_hvm_smp_init(void) +{ + if (!xen_have_vector_callback) + return; + + smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; + smp_ops.smp_send_reschedule = xen_smp_send_reschedule; + smp_ops.cpu_die = xen_hvm_cpu_die; + smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; + smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; + smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu; +} diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c new file mode 100644 index 000000000000..aae32535f4ec --- /dev/null +++ b/arch/x86/xen/smp_pv.c @@ -0,0 +1,490 @@ +/* + * Xen SMP support + * + * This file implements the Xen versions of smp_ops. SMP under Xen is + * very straightforward. Bringing a CPU up is simply a matter of + * loading its initial context and setting it running. + * + * IPIs are handled through the Xen event mechanism. + * + * Because virtual CPUs can be scheduled onto any real CPU, there's no + * useful topology information for the kernel to make use of. As a + * result, all CPUs are treated as if they're single-core and + * single-threaded. + */ +#include <linux/sched.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <linux/irq_work.h> +#include <linux/tick.h> +#include <linux/nmi.h> + +#include <asm/paravirt.h> +#include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/cpu.h> + +#include <xen/interface/xen.h> +#include <xen/interface/vcpu.h> +#include <xen/interface/xenpmu.h> + +#include <asm/xen/interface.h> +#include <asm/xen/hypercall.h> + +#include <xen/xen.h> +#include <xen/page.h> +#include <xen/events.h> + +#include <xen/hvc-console.h> +#include "xen-ops.h" +#include "mmu.h" +#include "smp.h" +#include "pmu.h" + +cpumask_var_t xen_cpu_initialized_map; + +static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 }; +static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 }; + +static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id); + +static void cpu_bringup(void) +{ + int cpu; + + cpu_init(); + touch_softlockup_watchdog(); + preempt_disable(); + + /* PVH runs in ring 0 and allows us to do native syscalls. Yay! */ + if (!xen_feature(XENFEAT_supervisor_mode_kernel)) { + xen_enable_sysenter(); + xen_enable_syscall(); + } + cpu = smp_processor_id(); + smp_store_cpu_info(cpu); + cpu_data(cpu).x86_max_cores = 1; + set_cpu_sibling_map(cpu); + + xen_setup_cpu_clockevents(); + + notify_cpu_starting(cpu); + + set_cpu_online(cpu, true); + + cpu_set_state_online(cpu); /* Implies full memory barrier. */ + + /* We can take interrupts now: we're officially "up". */ + local_irq_enable(); +} + +asmlinkage __visible void cpu_bringup_and_idle(void) +{ + cpu_bringup(); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); +} + +void xen_smp_intr_free_pv(unsigned int cpu) +{ + if (per_cpu(xen_irq_work, cpu).irq >= 0) { + unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL); + per_cpu(xen_irq_work, cpu).irq = -1; + kfree(per_cpu(xen_irq_work, cpu).name); + per_cpu(xen_irq_work, cpu).name = NULL; + } + + if (per_cpu(xen_pmu_irq, cpu).irq >= 0) { + unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL); + per_cpu(xen_pmu_irq, cpu).irq = -1; + kfree(per_cpu(xen_pmu_irq, cpu).name); + per_cpu(xen_pmu_irq, cpu).name = NULL; + } +} + +int xen_smp_intr_init_pv(unsigned int cpu) +{ + int rc; + char *callfunc_name, *pmu_name; + + callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu); + rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR, + cpu, + xen_irq_work_interrupt, + IRQF_PERCPU|IRQF_NOBALANCING, + callfunc_name, + NULL); + if (rc < 0) + goto fail; + per_cpu(xen_irq_work, cpu).irq = rc; + per_cpu(xen_irq_work, cpu).name = callfunc_name; + + if (is_xen_pmu(cpu)) { + pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu); + rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu, + xen_pmu_irq_handler, + IRQF_PERCPU|IRQF_NOBALANCING, + pmu_name, NULL); + if (rc < 0) + goto fail; + per_cpu(xen_pmu_irq, cpu).irq = rc; + per_cpu(xen_pmu_irq, cpu).name = pmu_name; + } + + return 0; + + fail: + xen_smp_intr_free_pv(cpu); + return rc; +} + +static void __init xen_fill_possible_map(void) +{ + int i, rc; + + if (xen_initial_domain()) + return; + + for (i = 0; i < nr_cpu_ids; i++) { + rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); + if (rc >= 0) { + num_processors++; + set_cpu_possible(i, true); + } + } +} + +static void __init xen_filter_cpu_maps(void) +{ + int i, rc; + unsigned int subtract = 0; + + if (!xen_initial_domain()) + return; + + num_processors = 0; + disabled_cpus = 0; + for (i = 0; i < nr_cpu_ids; i++) { + rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); + if (rc >= 0) { + num_processors++; + set_cpu_possible(i, true); + } else { + set_cpu_possible(i, false); + set_cpu_present(i, false); + subtract++; + } + } +#ifdef CONFIG_HOTPLUG_CPU + /* This is akin to using 'nr_cpus' on the Linux command line. + * Which is OK as when we use 'dom0_max_vcpus=X' we can only + * have up to X, while nr_cpu_ids is greater than X. This + * normally is not a problem, except when CPU hotplugging + * is involved and then there might be more than X CPUs + * in the guest - which will not work as there is no + * hypercall to expand the max number of VCPUs an already + * running guest has. So cap it up to X. */ + if (subtract) + nr_cpu_ids = nr_cpu_ids - subtract; +#endif + +} + +static void __init xen_pv_smp_prepare_boot_cpu(void) +{ + BUG_ON(smp_processor_id() != 0); + native_smp_prepare_boot_cpu(); + + if (!xen_feature(XENFEAT_writable_page_tables)) + /* We've switched to the "real" per-cpu gdt, so make + * sure the old memory can be recycled. */ + make_lowmem_page_readwrite(xen_initial_gdt); + +#ifdef CONFIG_X86_32 + /* + * Xen starts us with XEN_FLAT_RING1_DS, but linux code + * expects __USER_DS + */ + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); +#endif + + xen_filter_cpu_maps(); + xen_setup_vcpu_info_placement(); + + /* + * The alternative logic (which patches the unlock/lock) runs before + * the smp bootup up code is activated. Hence we need to set this up + * the core kernel is being patched. Otherwise we will have only + * modules patched but not core code. + */ + xen_init_spinlocks(); +} + +static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus) +{ + unsigned cpu; + unsigned int i; + + if (skip_ioapic_setup) { + char *m = (max_cpus == 0) ? + "The nosmp parameter is incompatible with Xen; " \ + "use Xen dom0_max_vcpus=1 parameter" : + "The noapic parameter is incompatible with Xen"; + + xen_raw_printk(m); + panic(m); + } + xen_init_lock_cpu(0); + + smp_store_boot_cpu_info(); + cpu_data(0).x86_max_cores = 1; + + for_each_possible_cpu(i) { + zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); + } + set_cpu_sibling_map(0); + + xen_pmu_init(0); + + if (xen_smp_intr_init(0) || xen_smp_intr_init_pv(0)) + BUG(); + + if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL)) + panic("could not allocate xen_cpu_initialized_map\n"); + + cpumask_copy(xen_cpu_initialized_map, cpumask_of(0)); + + /* Restrict the possible_map according to max_cpus. */ + while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { + for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--) + continue; + set_cpu_possible(cpu, false); + } + + for_each_possible_cpu(cpu) + set_cpu_present(cpu, true); +} + +static int +cpu_initialize_context(unsigned int cpu, struct task_struct *idle) +{ + struct vcpu_guest_context *ctxt; + struct desc_struct *gdt; + unsigned long gdt_mfn; + + /* used to tell cpu_init() that it can proceed with initialization */ + cpumask_set_cpu(cpu, cpu_callout_mask); + if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) + return 0; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (ctxt == NULL) + return -ENOMEM; + + gdt = get_cpu_gdt_rw(cpu); + +#ifdef CONFIG_X86_32 + ctxt->user_regs.fs = __KERNEL_PERCPU; + ctxt->user_regs.gs = __KERNEL_STACK_CANARY; +#endif + memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); + + ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; + ctxt->flags = VGCF_IN_KERNEL; + ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ + ctxt->user_regs.ds = __USER_DS; + ctxt->user_regs.es = __USER_DS; + ctxt->user_regs.ss = __KERNEL_DS; + + xen_copy_trap_info(ctxt->trap_ctxt); + + ctxt->ldt_ents = 0; + + BUG_ON((unsigned long)gdt & ~PAGE_MASK); + + gdt_mfn = arbitrary_virt_to_mfn(gdt); + make_lowmem_page_readonly(gdt); + make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); + + ctxt->gdt_frames[0] = gdt_mfn; + ctxt->gdt_ents = GDT_ENTRIES; + + ctxt->kernel_ss = __KERNEL_DS; + ctxt->kernel_sp = idle->thread.sp0; + +#ifdef CONFIG_X86_32 + ctxt->event_callback_cs = __KERNEL_CS; + ctxt->failsafe_callback_cs = __KERNEL_CS; +#else + ctxt->gs_base_kernel = per_cpu_offset(cpu); +#endif + ctxt->event_callback_eip = + (unsigned long)xen_hypervisor_callback; + ctxt->failsafe_callback_eip = + (unsigned long)xen_failsafe_callback; + ctxt->user_regs.cs = __KERNEL_CS; + per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); + + ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); + ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir)); + if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt)) + BUG(); + + kfree(ctxt); + return 0; +} + +static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle) +{ + int rc; + + common_cpu_up(cpu, idle); + + xen_setup_runstate_info(cpu); + + /* + * PV VCPUs are always successfully taken down (see 'while' loop + * in xen_cpu_die()), so -EBUSY is an error. + */ + rc = cpu_check_up_prepare(cpu); + if (rc) + return rc; + + /* make sure interrupts start blocked */ + per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; + + rc = cpu_initialize_context(cpu, idle); + if (rc) + return rc; + + xen_pmu_init(cpu); + + rc = HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL); + BUG_ON(rc); + + while (cpu_report_state(cpu) != CPU_ONLINE) + HYPERVISOR_sched_op(SCHEDOP_yield, NULL); + + return 0; +} + +static void xen_pv_smp_cpus_done(unsigned int max_cpus) +{ +} + +#ifdef CONFIG_HOTPLUG_CPU +static int xen_pv_cpu_disable(void) +{ + unsigned int cpu = smp_processor_id(); + if (cpu == 0) + return -EBUSY; + + cpu_disable_common(); + + load_cr3(swapper_pg_dir); + return 0; +} + +static void xen_pv_cpu_die(unsigned int cpu) +{ + while (HYPERVISOR_vcpu_op(VCPUOP_is_up, + xen_vcpu_nr(cpu), NULL)) { + __set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ/10); + } + + if (common_cpu_die(cpu) == 0) { + xen_smp_intr_free(cpu); + xen_uninit_lock_cpu(cpu); + xen_teardown_timer(cpu); + xen_pmu_finish(cpu); + } +} + +static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */ +{ + play_dead_common(); + HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(smp_processor_id()), NULL); + cpu_bringup(); + /* + * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down) + * clears certain data that the cpu_idle loop (which called us + * and that we return from) expects. The only way to get that + * data back is to call: + */ + tick_nohz_idle_enter(); + + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); +} + +#else /* !CONFIG_HOTPLUG_CPU */ +static int xen_pv_cpu_disable(void) +{ + return -ENOSYS; +} + +static void xen_pv_cpu_die(unsigned int cpu) +{ + BUG(); +} + +static void xen_pv_play_dead(void) +{ + BUG(); +} + +#endif +static void stop_self(void *v) +{ + int cpu = smp_processor_id(); + + /* make sure we're not pinning something down */ + load_cr3(swapper_pg_dir); + /* should set up a minimal gdt */ + + set_cpu_online(cpu, false); + + HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL); + BUG(); +} + +static void xen_pv_stop_other_cpus(int wait) +{ + smp_call_function(stop_self, NULL, wait); +} + +static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id) +{ + irq_enter(); + irq_work_run(); + inc_irq_stat(apic_irq_work_irqs); + irq_exit(); + + return IRQ_HANDLED; +} + +static const struct smp_ops xen_smp_ops __initconst = { + .smp_prepare_boot_cpu = xen_pv_smp_prepare_boot_cpu, + .smp_prepare_cpus = xen_pv_smp_prepare_cpus, + .smp_cpus_done = xen_pv_smp_cpus_done, + + .cpu_up = xen_pv_cpu_up, + .cpu_die = xen_pv_cpu_die, + .cpu_disable = xen_pv_cpu_disable, + .play_dead = xen_pv_play_dead, + + .stop_other_cpus = xen_pv_stop_other_cpus, + .smp_send_reschedule = xen_smp_send_reschedule, + + .send_call_func_ipi = xen_smp_send_call_function_ipi, + .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi, +}; + +void __init xen_smp_init(void) +{ + smp_ops = xen_smp_ops; + xen_fill_possible_map(); +} diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 7f664c416faf..d6b1680693a9 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -14,60 +14,6 @@ #include "mmu.h" #include "pmu.h" -static void xen_pv_pre_suspend(void) -{ - xen_mm_pin_all(); - - xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); - xen_start_info->console.domU.mfn = - mfn_to_pfn(xen_start_info->console.domU.mfn); - - BUG_ON(!irqs_disabled()); - - HYPERVISOR_shared_info = &xen_dummy_shared_info; - if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_PARAVIRT_BOOTMAP), - __pte_ma(0), 0)) - BUG(); -} - -static void xen_hvm_post_suspend(int suspend_cancelled) -{ -#ifdef CONFIG_XEN_PVHVM - int cpu; - if (!suspend_cancelled) - xen_hvm_init_shared_info(); - xen_callback_vector(); - xen_unplug_emulated_devices(); - if (xen_feature(XENFEAT_hvm_safe_pvclock)) { - for_each_online_cpu(cpu) { - xen_setup_runstate_info(cpu); - } - } -#endif -} - -static void xen_pv_post_suspend(int suspend_cancelled) -{ - xen_build_mfn_list_list(); - - xen_setup_shared_info(); - - if (suspend_cancelled) { - xen_start_info->store_mfn = - pfn_to_mfn(xen_start_info->store_mfn); - xen_start_info->console.domU.mfn = - pfn_to_mfn(xen_start_info->console.domU.mfn); - } else { -#ifdef CONFIG_SMP - BUG_ON(xen_cpu_initialized_map == NULL); - cpumask_copy(xen_cpu_initialized_map, cpu_online_mask); -#endif - xen_vcpu_restore(); - } - - xen_mm_unpin_all(); -} - void xen_arch_pre_suspend(void) { if (xen_pv_domain()) diff --git a/arch/x86/xen/suspend_hvm.c b/arch/x86/xen/suspend_hvm.c new file mode 100644 index 000000000000..01afcadde50a --- /dev/null +++ b/arch/x86/xen/suspend_hvm.c @@ -0,0 +1,22 @@ +#include <linux/types.h> + +#include <xen/xen.h> +#include <xen/features.h> +#include <xen/interface/features.h> + +#include "xen-ops.h" + +void xen_hvm_post_suspend(int suspend_cancelled) +{ + int cpu; + + if (!suspend_cancelled) + xen_hvm_init_shared_info(); + xen_callback_vector(); + xen_unplug_emulated_devices(); + if (xen_feature(XENFEAT_hvm_safe_pvclock)) { + for_each_online_cpu(cpu) { + xen_setup_runstate_info(cpu); + } + } +} diff --git a/arch/x86/xen/suspend_pv.c b/arch/x86/xen/suspend_pv.c new file mode 100644 index 000000000000..3abe4f07f34a --- /dev/null +++ b/arch/x86/xen/suspend_pv.c @@ -0,0 +1,46 @@ +#include <linux/types.h> + +#include <asm/fixmap.h> + +#include <asm/xen/hypercall.h> +#include <asm/xen/page.h> + +#include "xen-ops.h" + +void xen_pv_pre_suspend(void) +{ + xen_mm_pin_all(); + + xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); + xen_start_info->console.domU.mfn = + mfn_to_pfn(xen_start_info->console.domU.mfn); + + BUG_ON(!irqs_disabled()); + + HYPERVISOR_shared_info = &xen_dummy_shared_info; + if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_PARAVIRT_BOOTMAP), + __pte_ma(0), 0)) + BUG(); +} + +void xen_pv_post_suspend(int suspend_cancelled) +{ + xen_build_mfn_list_list(); + + xen_setup_shared_info(); + + if (suspend_cancelled) { + xen_start_info->store_mfn = + pfn_to_mfn(xen_start_info->store_mfn); + xen_start_info->console.domU.mfn = + pfn_to_mfn(xen_start_info->console.domU.mfn); + } else { +#ifdef CONFIG_SMP + BUG_ON(xen_cpu_initialized_map == NULL); + cpumask_copy(xen_cpu_initialized_map, cpu_online_mask); +#endif + xen_vcpu_restore(); + } + + xen_mm_unpin_all(); +} diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 7a3089285c59..090c7eb4dca9 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -436,6 +436,14 @@ static void xen_hvm_setup_cpu_clockevents(void) void __init xen_hvm_init_time_ops(void) { + /* + * vector callback is needed otherwise we cannot receive interrupts + * on cpu > 0 and at this point we don't know how many cpus are + * available. + */ + if (!xen_have_vector_callback) + return; + if (!xen_feature(XENFEAT_hvm_safe_pvclock)) { printk(KERN_INFO "Xen doesn't support pvclock on HVM," "disable pv timer\n"); diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 37794e42b67d..72a8e6adebe6 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -16,6 +16,7 @@ #include <xen/interface/xen-mca.h> #include <asm/xen/interface.h> +#ifdef CONFIG_XEN_PV __INIT ENTRY(startup_xen) cld @@ -34,6 +35,7 @@ ENTRY(startup_xen) jmp xen_start_kernel __FINIT +#endif .pushsection .text .balign PAGE_SIZE @@ -58,7 +60,9 @@ ENTRY(hypercall_page) /* Map the p2m table to a 512GB-aligned user address. */ ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad PGDIR_SIZE) #endif +#ifdef CONFIG_XEN_PV ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) +#endif ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables|pae_pgdir_above_4gb") diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index f6a41c41ebc7..9a440a42c618 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -76,6 +76,8 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id); bool xen_vcpu_stolen(int vcpu); +extern int xen_have_vcpu_info_placement; + void xen_vcpu_setup(int cpu); void xen_setup_vcpu_info_placement(void); @@ -146,4 +148,24 @@ __visible void xen_adjust_exception_frame(void); extern int xen_panic_handler_init(void); +int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int), + int (*cpu_dead_cb)(unsigned int)); + +void xen_pin_vcpu(int cpu); + +void xen_emergency_restart(void); +#ifdef CONFIG_XEN_PV +void xen_pv_pre_suspend(void); +void xen_pv_post_suspend(int suspend_cancelled); +#else +static inline void xen_pv_pre_suspend(void) {} +static inline void xen_pv_post_suspend(int suspend_cancelled) {} +#endif + +#ifdef CONFIG_XEN_PVHVM +void xen_hvm_post_suspend(int suspend_cancelled); +#else +static inline void xen_hvm_post_suspend(int suspend_cancelled) {} +#endif + #endif /* XEN_OPS_H */ diff --git a/drivers/scsi/xen-scsifront.c b/drivers/scsi/xen-scsifront.c index 9aa1fe1fc939..a6a8b60d4902 100644 --- a/drivers/scsi/xen-scsifront.c +++ b/drivers/scsi/xen-scsifront.c @@ -434,7 +434,7 @@ static int map_data_for_request(struct vscsifrnt_info *info, if (seg_grants) { page = virt_to_page(seg); - off = (unsigned long)seg & ~PAGE_MASK; + off = offset_in_page(seg); len = sizeof(struct scsiif_request_segment) * data_grants; while (len > 0) { bytes = min_t(unsigned int, len, PAGE_SIZE - off); diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index a6d4378eb8d9..50dcb68d8070 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -709,6 +709,7 @@ void free_xenballooned_pages(int nr_pages, struct page **pages) } EXPORT_SYMBOL(free_xenballooned_pages); +#ifdef CONFIG_XEN_PV static void __init balloon_add_region(unsigned long start_pfn, unsigned long pages) { @@ -732,19 +733,22 @@ static void __init balloon_add_region(unsigned long start_pfn, balloon_stats.total_pages += extra_pfn_end - start_pfn; } +#endif static int __init balloon_init(void) { - int i; - if (!xen_domain()) return -ENODEV; pr_info("Initialising balloon driver\n"); +#ifdef CONFIG_XEN_PV balloon_stats.current_pages = xen_pv_domain() ? min(xen_start_info->nr_pages - xen_released_pages, max_pfn) : get_num_physpages(); +#else + balloon_stats.current_pages = get_num_physpages(); +#endif balloon_stats.target_pages = balloon_stats.current_pages; balloon_stats.balloon_low = 0; balloon_stats.balloon_high = 0; @@ -761,14 +765,20 @@ static int __init balloon_init(void) register_sysctl_table(xen_root); #endif - /* - * Initialize the balloon with pages from the extra memory - * regions (see arch/x86/xen/setup.c). - */ - for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) - if (xen_extra_mem[i].n_pfns) - balloon_add_region(xen_extra_mem[i].start_pfn, - xen_extra_mem[i].n_pfns); +#ifdef CONFIG_XEN_PV + { + int i; + + /* + * Initialize the balloon with pages from the extra memory + * regions (see arch/x86/xen/setup.c). + */ + for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) + if (xen_extra_mem[i].n_pfns) + balloon_add_region(xen_extra_mem[i].start_pfn, + xen_extra_mem[i].n_pfns); + } +#endif return 0; } diff --git a/drivers/xen/efi.c b/drivers/xen/efi.c index 22f71ffd3406..9243a9051078 100644 --- a/drivers/xen/efi.c +++ b/drivers/xen/efi.c @@ -26,6 +26,7 @@ #include <xen/interface/xen.h> #include <xen/interface/platform.h> #include <xen/xen.h> +#include <xen/xen-ops.h> #include <asm/page.h> @@ -263,3 +264,20 @@ efi_status_t xen_efi_query_capsule_caps(efi_capsule_header_t **capsules, return efi_data(op).status; } EXPORT_SYMBOL_GPL(xen_efi_query_capsule_caps); + +void xen_efi_reset_system(int reset_type, efi_status_t status, + unsigned long data_size, efi_char16_t *data) +{ + switch (reset_type) { + case EFI_RESET_COLD: + case EFI_RESET_WARM: + xen_reboot(SHUTDOWN_reboot); + break; + case EFI_RESET_SHUTDOWN: + xen_reboot(SHUTDOWN_poweroff); + break; + default: + BUG(); + } +} +EXPORT_SYMBOL_GPL(xen_efi_reset_system); diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 6a53577772c9..b52852f38cff 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1312,6 +1312,9 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) if (!VALID_EVTCHN(evtchn)) return -1; + if (!xen_support_evtchn_rebind()) + return -1; + /* Send future instances of this interrupt to other vcpu. */ bind_vcpu.port = evtchn; bind_vcpu.vcpu = xen_vcpu_nr(tcpu); @@ -1646,14 +1649,20 @@ void xen_callback_vector(void) int rc; uint64_t callback_via; - callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR); - rc = xen_set_callback_via(callback_via); - BUG_ON(rc); - pr_info("Xen HVM callback vector for event delivery is enabled\n"); - /* in the restore case the vector has already been allocated */ - if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, - xen_hvm_callback_vector); + if (xen_have_vector_callback) { + callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR); + rc = xen_set_callback_via(callback_via); + if (rc) { + pr_err("Request for Xen HVM callback vector failed\n"); + xen_have_vector_callback = 0; + return; + } + pr_info("Xen HVM callback vector for event delivery is enabled\n"); + /* in the restore case the vector has already been allocated */ + if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, + xen_hvm_callback_vector); + } } #else void xen_callback_vector(void) {} diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index 2a165cc8a43c..1275df83070f 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -90,8 +90,10 @@ static int xen_allocate_irq(struct pci_dev *pdev) static int platform_pci_resume(struct pci_dev *pdev) { int err; - if (!xen_pv_domain()) + + if (xen_have_vector_callback) return 0; + err = xen_set_callback_via(callback_via); if (err) { dev_err(&pdev->dev, "platform_pci_resume failure!\n"); @@ -137,15 +139,7 @@ static int platform_pci_probe(struct pci_dev *pdev, platform_mmio = mmio_addr; platform_mmiolen = mmio_len; - - /* - * Xen HVM guests always use the vector callback mechanism. - * L1 Dom0 in a nested Xen environment is a PV guest inside in an - * HVM environment. It needs the platform-pci driver to get - * notifications from L0 Xen, but it cannot use the vector callback - * as it is not exported by L1 Xen. - */ - if (xen_pv_domain()) { + if (!xen_have_vector_callback) { ret = xen_allocate_irq(pdev); if (ret) { dev_warn(&pdev->dev, "request_irq failed err=%d\n", ret); diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index e8cef1ad0fe3..8dab0d3dc172 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -693,8 +693,8 @@ xen_swiotlb_dma_mmap(struct device *dev, struct vm_area_struct *vma, unsigned long attrs) { #if defined(CONFIG_ARM) || defined(CONFIG_ARM64) - if (__generic_dma_ops(dev)->mmap) - return __generic_dma_ops(dev)->mmap(dev, vma, cpu_addr, + if (xen_get_dma_ops(dev)->mmap) + return xen_get_dma_ops(dev)->mmap(dev, vma, cpu_addr, dma_addr, size, attrs); #endif return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size); @@ -711,7 +711,7 @@ xen_swiotlb_get_sgtable(struct device *dev, struct sg_table *sgt, unsigned long attrs) { #if defined(CONFIG_ARM) || defined(CONFIG_ARM64) - if (__generic_dma_ops(dev)->get_sgtable) { + if (xen_get_dma_ops(dev)->get_sgtable) { #if 0 /* * This check verifies that the page belongs to the current domain and @@ -721,7 +721,7 @@ xen_swiotlb_get_sgtable(struct device *dev, struct sg_table *sgt, unsigned long bfn = PHYS_PFN(dma_to_phys(dev, handle)); BUG_ON (!page_is_ram(bfn)); #endif - return __generic_dma_ops(dev)->get_sgtable(dev, sgt, cpu_addr, + return xen_get_dma_ops(dev)->get_sgtable(dev, sgt, cpu_addr, handle, size, attrs); } #endif diff --git a/include/xen/arm/page-coherent.h b/include/xen/arm/page-coherent.h index 95ce6ac3a971..b1b4ecdf329a 100644 --- a/include/xen/arm/page-coherent.h +++ b/include/xen/arm/page-coherent.h @@ -2,8 +2,16 @@ #define _ASM_ARM_XEN_PAGE_COHERENT_H #include <asm/page.h> +#include <asm/dma-mapping.h> #include <linux/dma-mapping.h> +static inline const struct dma_map_ops *xen_get_dma_ops(struct device *dev) +{ + if (dev && dev->archdata.dev_dma_ops) + return dev->archdata.dev_dma_ops; + return get_arch_dma_ops(NULL); +} + void __xen_dma_map_page(struct device *hwdev, struct page *page, dma_addr_t dev_addr, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs); @@ -19,13 +27,13 @@ void __xen_dma_sync_single_for_device(struct device *hwdev, static inline void *xen_alloc_coherent_pages(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags, unsigned long attrs) { - return __generic_dma_ops(hwdev)->alloc(hwdev, size, dma_handle, flags, attrs); + return xen_get_dma_ops(hwdev)->alloc(hwdev, size, dma_handle, flags, attrs); } static inline void xen_free_coherent_pages(struct device *hwdev, size_t size, void *cpu_addr, dma_addr_t dma_handle, unsigned long attrs) { - __generic_dma_ops(hwdev)->free(hwdev, size, cpu_addr, dma_handle, attrs); + xen_get_dma_ops(hwdev)->free(hwdev, size, cpu_addr, dma_handle, attrs); } static inline void xen_dma_map_page(struct device *hwdev, struct page *page, @@ -49,7 +57,7 @@ static inline void xen_dma_map_page(struct device *hwdev, struct page *page, * specific function. */ if (local) - __generic_dma_ops(hwdev)->map_page(hwdev, page, offset, size, dir, attrs); + xen_get_dma_ops(hwdev)->map_page(hwdev, page, offset, size, dir, attrs); else __xen_dma_map_page(hwdev, page, dev_addr, offset, size, dir, attrs); } @@ -67,8 +75,8 @@ static inline void xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle, * specific function. */ if (pfn_valid(pfn)) { - if (__generic_dma_ops(hwdev)->unmap_page) - __generic_dma_ops(hwdev)->unmap_page(hwdev, handle, size, dir, attrs); + if (xen_get_dma_ops(hwdev)->unmap_page) + xen_get_dma_ops(hwdev)->unmap_page(hwdev, handle, size, dir, attrs); } else __xen_dma_unmap_page(hwdev, handle, size, dir, attrs); } @@ -78,8 +86,8 @@ static inline void xen_dma_sync_single_for_cpu(struct device *hwdev, { unsigned long pfn = PFN_DOWN(handle); if (pfn_valid(pfn)) { - if (__generic_dma_ops(hwdev)->sync_single_for_cpu) - __generic_dma_ops(hwdev)->sync_single_for_cpu(hwdev, handle, size, dir); + if (xen_get_dma_ops(hwdev)->sync_single_for_cpu) + xen_get_dma_ops(hwdev)->sync_single_for_cpu(hwdev, handle, size, dir); } else __xen_dma_sync_single_for_cpu(hwdev, handle, size, dir); } @@ -89,8 +97,8 @@ static inline void xen_dma_sync_single_for_device(struct device *hwdev, { unsigned long pfn = PFN_DOWN(handle); if (pfn_valid(pfn)) { - if (__generic_dma_ops(hwdev)->sync_single_for_device) - __generic_dma_ops(hwdev)->sync_single_for_device(hwdev, handle, size, dir); + if (xen_get_dma_ops(hwdev)->sync_single_for_device) + xen_get_dma_ops(hwdev)->sync_single_for_device(hwdev, handle, size, dir); } else __xen_dma_sync_single_for_device(hwdev, handle, size, dir); } diff --git a/include/xen/interface/io/9pfs.h b/include/xen/interface/io/9pfs.h new file mode 100644 index 000000000000..5b6c19dae5e2 --- /dev/null +++ b/include/xen/interface/io/9pfs.h @@ -0,0 +1,36 @@ +/* + * 9pfs.h -- Xen 9PFS transport + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (C) 2017 Stefano Stabellini <stefano@aporeto.com> + */ + +#ifndef __XEN_PUBLIC_IO_9PFS_H__ +#define __XEN_PUBLIC_IO_9PFS_H__ + +#include "xen/interface/io/ring.h" + +/* + * See docs/misc/9pfs.markdown in xen.git for the full specification: + * https://xenbits.xen.org/docs/unstable/misc/9pfs.html + */ +DEFINE_XEN_FLEX_RING_AND_INTF(xen_9pfs); + +#endif diff --git a/include/xen/interface/io/displif.h b/include/xen/interface/io/displif.h new file mode 100644 index 000000000000..596578d9be3e --- /dev/null +++ b/include/xen/interface/io/displif.h @@ -0,0 +1,854 @@ +/****************************************************************************** + * displif.h + * + * Unified display device I/O interface for Xen guest OSes. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (C) 2016-2017 EPAM Systems Inc. + * + * Authors: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com> + * Oleksandr Grytsov <oleksandr_grytsov@epam.com> + */ + +#ifndef __XEN_PUBLIC_IO_DISPLIF_H__ +#define __XEN_PUBLIC_IO_DISPLIF_H__ + +#include "ring.h" +#include "../grant_table.h" + +/* + ****************************************************************************** + * Protocol version + ****************************************************************************** + */ +#define XENDISPL_PROTOCOL_VERSION "1" + +/* + ****************************************************************************** + * Main features provided by the protocol + ****************************************************************************** + * This protocol aims to provide a unified protocol which fits more + * sophisticated use-cases than a framebuffer device can handle. At the + * moment basic functionality is supported with the intention to be extended: + * o multiple dynamically allocated/destroyed framebuffers + * o buffers of arbitrary sizes + * o buffer allocation at either back or front end + * o better configuration options including multiple display support + * + * Note: existing fbif can be used together with displif running at the + * same time, e.g. on Linux one provides framebuffer and another DRM/KMS + * + * Note: display resolution (XenStore's "resolution" property) defines + * visible area of the virtual display. At the same time resolution of + * the display and frame buffers may differ: buffers can be smaller, equal + * or bigger than the visible area. This is to enable use-cases, where backend + * may do some post-processing of the display and frame buffers supplied, + * e.g. those buffers can be just a part of the final composition. + * + ****************************************************************************** + * Direction of improvements + ****************************************************************************** + * Future extensions to the existing protocol may include: + * o display/connector cloning + * o allocation of objects other than display buffers + * o plane/overlay support + * o scaling support + * o rotation support + * + ****************************************************************************** + * Feature and Parameter Negotiation + ****************************************************************************** + * + * Front->back notifications: when enqueuing a new request, sending a + * notification can be made conditional on xendispl_req (i.e., the generic + * hold-off mechanism provided by the ring macros). Backends must set + * xendispl_req appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()). + * + * Back->front notifications: when enqueuing a new response, sending a + * notification can be made conditional on xendispl_resp (i.e., the generic + * hold-off mechanism provided by the ring macros). Frontends must set + * xendispl_resp appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()). + * + * The two halves of a para-virtual display driver utilize nodes within + * XenStore to communicate capabilities and to negotiate operating parameters. + * This section enumerates these nodes which reside in the respective front and + * backend portions of XenStore, following the XenBus convention. + * + * All data in XenStore is stored as strings. Nodes specifying numeric + * values are encoded in decimal. Integer value ranges listed below are + * expressed as fixed sized integer types capable of storing the conversion + * of a properly formated node string, without loss of information. + * + ****************************************************************************** + * Example configuration + ****************************************************************************** + * + * Note: depending on the use-case backend can expose more display connectors + * than the underlying HW physically has by employing SW graphics compositors + * + * This is an example of backend and frontend configuration: + * + *--------------------------------- Backend ----------------------------------- + * + * /local/domain/0/backend/vdispl/1/0/frontend-id = "1" + * /local/domain/0/backend/vdispl/1/0/frontend = "/local/domain/1/device/vdispl/0" + * /local/domain/0/backend/vdispl/1/0/state = "4" + * /local/domain/0/backend/vdispl/1/0/versions = "1,2" + * + *--------------------------------- Frontend ---------------------------------- + * + * /local/domain/1/device/vdispl/0/backend-id = "0" + * /local/domain/1/device/vdispl/0/backend = "/local/domain/0/backend/vdispl/1/0" + * /local/domain/1/device/vdispl/0/state = "4" + * /local/domain/1/device/vdispl/0/version = "1" + * /local/domain/1/device/vdispl/0/be-alloc = "1" + * + *-------------------------- Connector 0 configuration ------------------------ + * + * /local/domain/1/device/vdispl/0/0/resolution = "1920x1080" + * /local/domain/1/device/vdispl/0/0/req-ring-ref = "2832" + * /local/domain/1/device/vdispl/0/0/req-event-channel = "15" + * /local/domain/1/device/vdispl/0/0/evt-ring-ref = "387" + * /local/domain/1/device/vdispl/0/0/evt-event-channel = "16" + * + *-------------------------- Connector 1 configuration ------------------------ + * + * /local/domain/1/device/vdispl/0/1/resolution = "800x600" + * /local/domain/1/device/vdispl/0/1/req-ring-ref = "2833" + * /local/domain/1/device/vdispl/0/1/req-event-channel = "17" + * /local/domain/1/device/vdispl/0/1/evt-ring-ref = "388" + * /local/domain/1/device/vdispl/0/1/evt-event-channel = "18" + * + ****************************************************************************** + * Backend XenBus Nodes + ****************************************************************************** + * + *----------------------------- Protocol version ------------------------------ + * + * versions + * Values: <string> + * + * List of XENDISPL_LIST_SEPARATOR separated protocol versions supported + * by the backend. For example "1,2,3". + * + ****************************************************************************** + * Frontend XenBus Nodes + ****************************************************************************** + * + *-------------------------------- Addressing --------------------------------- + * + * dom-id + * Values: <uint16_t> + * + * Domain identifier. + * + * dev-id + * Values: <uint16_t> + * + * Device identifier. + * + * conn-idx + * Values: <uint8_t> + * + * Zero based contigous index of the connector. + * /local/domain/<dom-id>/device/vdispl/<dev-id>/<conn-idx>/... + * + *----------------------------- Protocol version ------------------------------ + * + * version + * Values: <string> + * + * Protocol version, chosen among the ones supported by the backend. + * + *------------------------- Backend buffer allocation ------------------------- + * + * be-alloc + * Values: "0", "1" + * + * If value is set to "1", then backend can be a buffer provider/allocator + * for this domain during XENDISPL_OP_DBUF_CREATE operation (see below + * for negotiation). + * If value is not "1" or omitted frontend must allocate buffers itself. + * + *----------------------------- Connector settings ---------------------------- + * + * resolution + * Values: <width, uint32_t>x<height, uint32_t> + * + * Width and height of the connector in pixels separated by + * XENDISPL_RESOLUTION_SEPARATOR. This defines visible area of the + * display. + * + *------------------ Connector Request Transport Parameters ------------------- + * + * This communication path is used to deliver requests from frontend to backend + * and get the corresponding responses from backend to frontend, + * set up per connector. + * + * req-event-channel + * Values: <uint32_t> + * + * The identifier of the Xen connector's control event channel + * used to signal activity in the ring buffer. + * + * req-ring-ref + * Values: <uint32_t> + * + * The Xen grant reference granting permission for the backend to map + * a sole page of connector's control ring buffer. + * + *------------------- Connector Event Transport Parameters -------------------- + * + * This communication path is used to deliver asynchronous events from backend + * to frontend, set up per connector. + * + * evt-event-channel + * Values: <uint32_t> + * + * The identifier of the Xen connector's event channel + * used to signal activity in the ring buffer. + * + * evt-ring-ref + * Values: <uint32_t> + * + * The Xen grant reference granting permission for the backend to map + * a sole page of connector's event ring buffer. + */ + +/* + ****************************************************************************** + * STATE DIAGRAMS + ****************************************************************************** + * + * Tool stack creates front and back state nodes with initial state + * XenbusStateInitialising. + * Tool stack creates and sets up frontend display configuration + * nodes per domain. + * + *-------------------------------- Normal flow -------------------------------- + * + * Front Back + * ================================= ===================================== + * XenbusStateInitialising XenbusStateInitialising + * o Query backend device identification + * data. + * o Open and validate backend device. + * | + * | + * V + * XenbusStateInitWait + * + * o Query frontend configuration + * o Allocate and initialize + * event channels per configured + * connector. + * o Publish transport parameters + * that will be in effect during + * this connection. + * | + * | + * V + * XenbusStateInitialised + * + * o Query frontend transport parameters. + * o Connect to the event channels. + * | + * | + * V + * XenbusStateConnected + * + * o Create and initialize OS + * virtual display connectors + * as per configuration. + * | + * | + * V + * XenbusStateConnected + * + * XenbusStateUnknown + * XenbusStateClosed + * XenbusStateClosing + * o Remove virtual display device + * o Remove event channels + * | + * | + * V + * XenbusStateClosed + * + *------------------------------- Recovery flow ------------------------------- + * + * In case of frontend unrecoverable errors backend handles that as + * if frontend goes into the XenbusStateClosed state. + * + * In case of backend unrecoverable errors frontend tries removing + * the virtualized device. If this is possible at the moment of error, + * then frontend goes into the XenbusStateInitialising state and is ready for + * new connection with backend. If the virtualized device is still in use and + * cannot be removed, then frontend goes into the XenbusStateReconfiguring state + * until either the virtualized device is removed or backend initiates a new + * connection. On the virtualized device removal frontend goes into the + * XenbusStateInitialising state. + * + * Note on XenbusStateReconfiguring state of the frontend: if backend has + * unrecoverable errors then frontend cannot send requests to the backend + * and thus cannot provide functionality of the virtualized device anymore. + * After backend is back to normal the virtualized device may still hold some + * state: configuration in use, allocated buffers, client application state etc. + * In most cases, this will require frontend to implement complex recovery + * reconnect logic. Instead, by going into XenbusStateReconfiguring state, + * frontend will make sure no new clients of the virtualized device are + * accepted, allow existing client(s) to exit gracefully by signaling error + * state etc. + * Once all the clients are gone frontend can reinitialize the virtualized + * device and get into XenbusStateInitialising state again signaling the + * backend that a new connection can be made. + * + * There are multiple conditions possible under which frontend will go from + * XenbusStateReconfiguring into XenbusStateInitialising, some of them are OS + * specific. For example: + * 1. The underlying OS framework may provide callbacks to signal that the last + * client of the virtualized device has gone and the device can be removed + * 2. Frontend can schedule a deferred work (timer/tasklet/workqueue) + * to periodically check if this is the right time to re-try removal of + * the virtualized device. + * 3. By any other means. + * + ****************************************************************************** + * REQUEST CODES + ****************************************************************************** + * Request codes [0; 15] are reserved and must not be used + */ + +#define XENDISPL_OP_DBUF_CREATE 0x10 +#define XENDISPL_OP_DBUF_DESTROY 0x11 +#define XENDISPL_OP_FB_ATTACH 0x12 +#define XENDISPL_OP_FB_DETACH 0x13 +#define XENDISPL_OP_SET_CONFIG 0x14 +#define XENDISPL_OP_PG_FLIP 0x15 + +/* + ****************************************************************************** + * EVENT CODES + ****************************************************************************** + */ +#define XENDISPL_EVT_PG_FLIP 0x00 + +/* + ****************************************************************************** + * XENSTORE FIELD AND PATH NAME STRINGS, HELPERS + ****************************************************************************** + */ +#define XENDISPL_DRIVER_NAME "vdispl" + +#define XENDISPL_LIST_SEPARATOR "," +#define XENDISPL_RESOLUTION_SEPARATOR "x" + +#define XENDISPL_FIELD_BE_VERSIONS "versions" +#define XENDISPL_FIELD_FE_VERSION "version" +#define XENDISPL_FIELD_REQ_RING_REF "req-ring-ref" +#define XENDISPL_FIELD_REQ_CHANNEL "req-event-channel" +#define XENDISPL_FIELD_EVT_RING_REF "evt-ring-ref" +#define XENDISPL_FIELD_EVT_CHANNEL "evt-event-channel" +#define XENDISPL_FIELD_RESOLUTION "resolution" +#define XENDISPL_FIELD_BE_ALLOC "be-alloc" + +/* + ****************************************************************************** + * STATUS RETURN CODES + ****************************************************************************** + * + * Status return code is zero on success and -XEN_EXX on failure. + * + ****************************************************************************** + * Assumptions + ****************************************************************************** + * o usage of grant reference 0 as invalid grant reference: + * grant reference 0 is valid, but never exposed to a PV driver, + * because of the fact it is already in use/reserved by the PV console. + * o all references in this document to page sizes must be treated + * as pages of size XEN_PAGE_SIZE unless otherwise noted. + * + ****************************************************************************** + * Description of the protocol between frontend and backend driver + ****************************************************************************** + * + * The two halves of a Para-virtual display driver communicate with + * each other using shared pages and event channels. + * Shared page contains a ring with request/response packets. + * + * All reserved fields in the structures below must be 0. + * Display buffers's cookie of value 0 is treated as invalid. + * Framebuffer's cookie of value 0 is treated as invalid. + * + * For all request/response/event packets that use cookies: + * dbuf_cookie - uint64_t, unique to guest domain value used by the backend + * to map remote display buffer to its local one + * fb_cookie - uint64_t, unique to guest domain value used by the backend + * to map remote framebuffer to its local one + * + *---------------------------------- Requests --------------------------------- + * + * All requests/responses, which are not connector specific, must be sent over + * control ring of the connector which has the index value of 0: + * /local/domain/<dom-id>/device/vdispl/<dev-id>/0/req-ring-ref + * + * All request packets have the same length (64 octets) + * All request packets have common header: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | operation | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * id - uint16_t, private guest value, echoed in response + * operation - uint8_t, operation code, XENDISPL_OP_??? + * + * Request dbuf creation - request creation of a display buffer. + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id |_OP_DBUF_CREATE | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | dbuf_cookie low 32-bit | 12 + * +----------------+----------------+----------------+----------------+ + * | dbuf_cookie high 32-bit | 16 + * +----------------+----------------+----------------+----------------+ + * | width | 20 + * +----------------+----------------+----------------+----------------+ + * | height | 24 + * +----------------+----------------+----------------+----------------+ + * | bpp | 28 + * +----------------+----------------+----------------+----------------+ + * | buffer_sz | 32 + * +----------------+----------------+----------------+----------------+ + * | flags | 36 + * +----------------+----------------+----------------+----------------+ + * | gref_directory | 40 + * +----------------+----------------+----------------+----------------+ + * | reserved | 44 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 64 + * +----------------+----------------+----------------+----------------+ + * + * Must be sent over control ring of the connector which has the index + * value of 0: + * /local/domain/<dom-id>/device/vdispl/<dev-id>/0/req-ring-ref + * All unused bits in flags field must be set to 0. + * + * An attempt to create multiple display buffers with the same dbuf_cookie is + * an error. dbuf_cookie can be re-used after destroying the corresponding + * display buffer. + * + * Width and height of the display buffers can be smaller, equal or bigger + * than the connector's resolution. Depth/pixel format of the individual + * buffers can differ as well. + * + * width - uint32_t, width in pixels + * height - uint32_t, height in pixels + * bpp - uint32_t, bits per pixel + * buffer_sz - uint32_t, buffer size to be allocated, octets + * flags - uint32_t, flags of the operation + * o XENDISPL_DBUF_FLG_REQ_ALLOC - if set, then backend is requested + * to allocate the buffer with the parameters provided in this request. + * Page directory is handled as follows: + * Frontend on request: + * o allocates pages for the directory (gref_directory, + * gref_dir_next_page(s) + * o grants permissions for the pages of the directory to the backend + * o sets gref_dir_next_page fields + * Backend on response: + * o grants permissions for the pages of the buffer allocated to + * the frontend + * o fills in page directory with grant references + * (gref[] in struct xendispl_page_directory) + * gref_directory - grant_ref_t, a reference to the first shared page + * describing shared buffer references. At least one page exists. If shared + * buffer size (buffer_sz) exceeds what can be addressed by this single page, + * then reference to the next page must be supplied (see gref_dir_next_page + * below) + */ + +#define XENDISPL_DBUF_FLG_REQ_ALLOC (1 << 0) + +struct xendispl_dbuf_create_req { + uint64_t dbuf_cookie; + uint32_t width; + uint32_t height; + uint32_t bpp; + uint32_t buffer_sz; + uint32_t flags; + grant_ref_t gref_directory; +}; + +/* + * Shared page for XENDISPL_OP_DBUF_CREATE buffer descriptor (gref_directory in + * the request) employs a list of pages, describing all pages of the shared + * data buffer: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | gref_dir_next_page | 4 + * +----------------+----------------+----------------+----------------+ + * | gref[0] | 8 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | gref[i] | i*4+8 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | gref[N - 1] | N*4+8 + * +----------------+----------------+----------------+----------------+ + * + * gref_dir_next_page - grant_ref_t, reference to the next page describing + * page directory. Must be 0 if there are no more pages in the list. + * gref[i] - grant_ref_t, reference to a shared page of the buffer + * allocated at XENDISPL_OP_DBUF_CREATE + * + * Number of grant_ref_t entries in the whole page directory is not + * passed, but instead can be calculated as: + * num_grefs_total = (XENDISPL_OP_DBUF_CREATE.buffer_sz + XEN_PAGE_SIZE - 1) / + * XEN_PAGE_SIZE + */ + +struct xendispl_page_directory { + grant_ref_t gref_dir_next_page; + grant_ref_t gref[1]; /* Variable length */ +}; + +/* + * Request dbuf destruction - destroy a previously allocated display buffer: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id |_OP_DBUF_DESTROY| reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | dbuf_cookie low 32-bit | 12 + * +----------------+----------------+----------------+----------------+ + * | dbuf_cookie high 32-bit | 16 + * +----------------+----------------+----------------+----------------+ + * | reserved | 20 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 64 + * +----------------+----------------+----------------+----------------+ + * + * Must be sent over control ring of the connector which has the index + * value of 0: + * /local/domain/<dom-id>/device/vdispl/<dev-id>/0/req-ring-ref + */ + +struct xendispl_dbuf_destroy_req { + uint64_t dbuf_cookie; +}; + +/* + * Request framebuffer attachment - request attachment of a framebuffer to + * previously created display buffer. + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | _OP_FB_ATTACH | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | dbuf_cookie low 32-bit | 12 + * +----------------+----------------+----------------+----------------+ + * | dbuf_cookie high 32-bit | 16 + * +----------------+----------------+----------------+----------------+ + * | fb_cookie low 32-bit | 20 + * +----------------+----------------+----------------+----------------+ + * | fb_cookie high 32-bit | 24 + * +----------------+----------------+----------------+----------------+ + * | width | 28 + * +----------------+----------------+----------------+----------------+ + * | height | 32 + * +----------------+----------------+----------------+----------------+ + * | pixel_format | 36 + * +----------------+----------------+----------------+----------------+ + * | reserved | 40 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 64 + * +----------------+----------------+----------------+----------------+ + * + * Must be sent over control ring of the connector which has the index + * value of 0: + * /local/domain/<dom-id>/device/vdispl/<dev-id>/0/req-ring-ref + * Width and height can be smaller, equal or bigger than the connector's + * resolution. + * + * An attempt to create multiple frame buffers with the same fb_cookie is + * an error. fb_cookie can be re-used after destroying the corresponding + * frame buffer. + * + * width - uint32_t, width in pixels + * height - uint32_t, height in pixels + * pixel_format - uint32_t, pixel format of the framebuffer, FOURCC code + */ + +struct xendispl_fb_attach_req { + uint64_t dbuf_cookie; + uint64_t fb_cookie; + uint32_t width; + uint32_t height; + uint32_t pixel_format; +}; + +/* + * Request framebuffer detach - detach a previously + * attached framebuffer from the display buffer in request: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | _OP_FB_DETACH | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | fb_cookie low 32-bit | 12 + * +----------------+----------------+----------------+----------------+ + * | fb_cookie high 32-bit | 16 + * +----------------+----------------+----------------+----------------+ + * | reserved | 20 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 64 + * +----------------+----------------+----------------+----------------+ + * + * Must be sent over control ring of the connector which has the index + * value of 0: + * /local/domain/<dom-id>/device/vdispl/<dev-id>/0/req-ring-ref + */ + +struct xendispl_fb_detach_req { + uint64_t fb_cookie; +}; + +/* + * Request configuration set/reset - request to set or reset + * the configuration/mode of the display: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | _OP_SET_CONFIG | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | fb_cookie low 32-bit | 12 + * +----------------+----------------+----------------+----------------+ + * | fb_cookie high 32-bit | 16 + * +----------------+----------------+----------------+----------------+ + * | x | 20 + * +----------------+----------------+----------------+----------------+ + * | y | 24 + * +----------------+----------------+----------------+----------------+ + * | width | 28 + * +----------------+----------------+----------------+----------------+ + * | height | 32 + * +----------------+----------------+----------------+----------------+ + * | bpp | 40 + * +----------------+----------------+----------------+----------------+ + * | reserved | 44 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 64 + * +----------------+----------------+----------------+----------------+ + * + * Pass all zeros to reset, otherwise command is treated as + * configuration set. + * Framebuffer's cookie defines which framebuffer/dbuf must be + * displayed while enabling display (applying configuration). + * x, y, width and height are bound by the connector's resolution and must not + * exceed it. + * + * x - uint32_t, starting position in pixels by X axis + * y - uint32_t, starting position in pixels by Y axis + * width - uint32_t, width in pixels + * height - uint32_t, height in pixels + * bpp - uint32_t, bits per pixel + */ + +struct xendispl_set_config_req { + uint64_t fb_cookie; + uint32_t x; + uint32_t y; + uint32_t width; + uint32_t height; + uint32_t bpp; +}; + +/* + * Request page flip - request to flip a page identified by the framebuffer + * cookie: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | _OP_PG_FLIP | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | fb_cookie low 32-bit | 12 + * +----------------+----------------+----------------+----------------+ + * | fb_cookie high 32-bit | 16 + * +----------------+----------------+----------------+----------------+ + * | reserved | 20 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 64 + * +----------------+----------------+----------------+----------------+ + */ + +struct xendispl_page_flip_req { + uint64_t fb_cookie; +}; + +/* + *---------------------------------- Responses -------------------------------- + * + * All response packets have the same length (64 octets) + * + * All response packets have common header: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | status | 8 + * +----------------+----------------+----------------+----------------+ + * | reserved | 12 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 64 + * +----------------+----------------+----------------+----------------+ + * + * id - uint16_t, private guest value, echoed from request + * status - int32_t, response status, zero on success and -XEN_EXX on failure + * + *----------------------------------- Events ---------------------------------- + * + * Events are sent via a shared page allocated by the front and propagated by + * evt-event-channel/evt-ring-ref XenStore entries + * All event packets have the same length (64 octets) + * All event packets have common header: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | type | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * + * id - uint16_t, event id, may be used by front + * type - uint8_t, type of the event + * + * + * Page flip complete event - event from back to front on page flip completed: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | _EVT_PG_FLIP | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | fb_cookie low 32-bit | 12 + * +----------------+----------------+----------------+----------------+ + * | fb_cookie high 32-bit | 16 + * +----------------+----------------+----------------+----------------+ + * | reserved | 20 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 64 + * +----------------+----------------+----------------+----------------+ + */ + +struct xendispl_pg_flip_evt { + uint64_t fb_cookie; +}; + +struct xendispl_req { + uint16_t id; + uint8_t operation; + uint8_t reserved[5]; + union { + struct xendispl_dbuf_create_req dbuf_create; + struct xendispl_dbuf_destroy_req dbuf_destroy; + struct xendispl_fb_attach_req fb_attach; + struct xendispl_fb_detach_req fb_detach; + struct xendispl_set_config_req set_config; + struct xendispl_page_flip_req pg_flip; + uint8_t reserved[56]; + } op; +}; + +struct xendispl_resp { + uint16_t id; + uint8_t operation; + uint8_t reserved; + int32_t status; + uint8_t reserved1[56]; +}; + +struct xendispl_evt { + uint16_t id; + uint8_t type; + uint8_t reserved[5]; + union { + struct xendispl_pg_flip_evt pg_flip; + uint8_t reserved[56]; + } op; +}; + +DEFINE_RING_TYPES(xen_displif, struct xendispl_req, struct xendispl_resp); + +/* + ****************************************************************************** + * Back to front events delivery + ****************************************************************************** + * In order to deliver asynchronous events from back to front a shared page is + * allocated by front and its granted reference propagated to back via + * XenStore entries (evt-ring-ref/evt-event-channel). + * This page has a common header used by both front and back to synchronize + * access and control event's ring buffer, while back being a producer of the + * events and front being a consumer. The rest of the page after the header + * is used for event packets. + * + * Upon reception of an event(s) front may confirm its reception + * for either each event, group of events or none. + */ + +struct xendispl_event_page { + uint32_t in_cons; + uint32_t in_prod; + uint8_t reserved[56]; +}; + +#define XENDISPL_EVENT_PAGE_SIZE XEN_PAGE_SIZE +#define XENDISPL_IN_RING_OFFS (sizeof(struct xendispl_event_page)) +#define XENDISPL_IN_RING_SIZE (XENDISPL_EVENT_PAGE_SIZE - XENDISPL_IN_RING_OFFS) +#define XENDISPL_IN_RING_LEN (XENDISPL_IN_RING_SIZE / sizeof(struct xendispl_evt)) +#define XENDISPL_IN_RING(page) \ + ((struct xendispl_evt *)((char *)(page) + XENDISPL_IN_RING_OFFS)) +#define XENDISPL_IN_RING_REF(page, idx) \ + (XENDISPL_IN_RING((page))[(idx) % XENDISPL_IN_RING_LEN]) + +#endif /* __XEN_PUBLIC_IO_DISPLIF_H__ */ diff --git a/include/xen/interface/io/kbdif.h b/include/xen/interface/io/kbdif.h index 8066c7849fbe..2a9510ade701 100644 --- a/include/xen/interface/io/kbdif.h +++ b/include/xen/interface/io/kbdif.h @@ -26,43 +26,432 @@ #ifndef __XEN_PUBLIC_IO_KBDIF_H__ #define __XEN_PUBLIC_IO_KBDIF_H__ -/* In events (backend -> frontend) */ +/* + ***************************************************************************** + * Feature and Parameter Negotiation + ***************************************************************************** + * + * The two halves of a para-virtual driver utilize nodes within + * XenStore to communicate capabilities and to negotiate operating parameters. + * This section enumerates these nodes which reside in the respective front and + * backend portions of XenStore, following XenBus convention. + * + * All data in XenStore is stored as strings. Nodes specifying numeric + * values are encoded in decimal. Integer value ranges listed below are + * expressed as fixed sized integer types capable of storing the conversion + * of a properly formated node string, without loss of information. + * + ***************************************************************************** + * Backend XenBus Nodes + ***************************************************************************** + * + *---------------------------- Features supported ---------------------------- + * + * Capable backend advertises supported features by publishing + * corresponding entries in XenStore and puts 1 as the value of the entry. + * If a feature is not supported then 0 must be set or feature entry omitted. + * + * feature-abs-pointer + * Values: <uint> + * + * Backends, which support reporting of absolute coordinates for pointer + * device should set this to 1. + * + * feature-multi-touch + * Values: <uint> + * + * Backends, which support reporting of multi-touch events + * should set this to 1. + * + *------------------------- Pointer Device Parameters ------------------------ + * + * width + * Values: <uint> + * + * Maximum X coordinate (width) to be used by the frontend + * while reporting input events, pixels, [0; UINT32_MAX]. + * + * height + * Values: <uint> + * + * Maximum Y coordinate (height) to be used by the frontend + * while reporting input events, pixels, [0; UINT32_MAX]. + * + ***************************************************************************** + * Frontend XenBus Nodes + ***************************************************************************** + * + *------------------------------ Feature request ----------------------------- + * + * Capable frontend requests features from backend via setting corresponding + * entries to 1 in XenStore. Requests for features not advertised as supported + * by the backend have no effect. + * + * request-abs-pointer + * Values: <uint> + * + * Request backend to report absolute pointer coordinates + * (XENKBD_TYPE_POS) instead of relative ones (XENKBD_TYPE_MOTION). + * + * request-multi-touch + * Values: <uint> + * + * Request backend to report multi-touch events. + * + *----------------------- Request Transport Parameters ----------------------- + * + * event-channel + * Values: <uint> + * + * The identifier of the Xen event channel used to signal activity + * in the ring buffer. + * + * page-gref + * Values: <uint> + * + * The Xen grant reference granting permission for the backend to map + * a sole page in a single page sized event ring buffer. + * + * page-ref + * Values: <uint> + * + * OBSOLETE, not recommended for use. + * PFN of the shared page. + * + *----------------------- Multi-touch Device Parameters ----------------------- + * + * multi-touch-num-contacts + * Values: <uint> + * + * Number of simultaneous touches reported. + * + * multi-touch-width + * Values: <uint> + * + * Width of the touch area to be used by the frontend + * while reporting input events, pixels, [0; UINT32_MAX]. + * + * multi-touch-height + * Values: <uint> + * + * Height of the touch area to be used by the frontend + * while reporting input events, pixels, [0; UINT32_MAX]. + */ /* - * Frontends should ignore unknown in events. + * EVENT CODES. */ -/* Pointer movement event */ -#define XENKBD_TYPE_MOTION 1 -/* Event type 2 currently not used */ -/* Key event (includes pointer buttons) */ -#define XENKBD_TYPE_KEY 3 +#define XENKBD_TYPE_MOTION 1 +#define XENKBD_TYPE_RESERVED 2 +#define XENKBD_TYPE_KEY 3 +#define XENKBD_TYPE_POS 4 +#define XENKBD_TYPE_MTOUCH 5 + +/* Multi-touch event sub-codes */ + +#define XENKBD_MT_EV_DOWN 0 +#define XENKBD_MT_EV_UP 1 +#define XENKBD_MT_EV_MOTION 2 +#define XENKBD_MT_EV_SYN 3 +#define XENKBD_MT_EV_SHAPE 4 +#define XENKBD_MT_EV_ORIENT 5 + +/* + * CONSTANTS, XENSTORE FIELD AND PATH NAME STRINGS, HELPERS. + */ + +#define XENKBD_DRIVER_NAME "vkbd" + +#define XENKBD_FIELD_FEAT_ABS_POINTER "feature-abs-pointer" +#define XENKBD_FIELD_FEAT_MTOUCH "feature-multi-touch" +#define XENKBD_FIELD_REQ_ABS_POINTER "request-abs-pointer" +#define XENKBD_FIELD_REQ_MTOUCH "request-multi-touch" +#define XENKBD_FIELD_RING_GREF "page-gref" +#define XENKBD_FIELD_EVT_CHANNEL "event-channel" +#define XENKBD_FIELD_WIDTH "width" +#define XENKBD_FIELD_HEIGHT "height" +#define XENKBD_FIELD_MT_WIDTH "multi-touch-width" +#define XENKBD_FIELD_MT_HEIGHT "multi-touch-height" +#define XENKBD_FIELD_MT_NUM_CONTACTS "multi-touch-num-contacts" + +/* OBSOLETE, not recommended for use */ +#define XENKBD_FIELD_RING_REF "page-ref" + /* - * Pointer position event - * Capable backend sets feature-abs-pointer in xenstore. - * Frontend requests ot instead of XENKBD_TYPE_MOTION by setting - * request-abs-update in xenstore. + ***************************************************************************** + * Description of the protocol between frontend and backend driver. + ***************************************************************************** + * + * The two halves of a Para-virtual driver communicate with + * each other using a shared page and an event channel. + * Shared page contains a ring with event structures. + * + * All reserved fields in the structures below must be 0. + * + ***************************************************************************** + * Backend to frontend events + ***************************************************************************** + * + * Frontends should ignore unknown in events. + * All event packets have the same length (40 octets) + * All event packets have common header: + * + * 0 octet + * +-----------------+ + * | type | + * +-----------------+ + * type - uint8_t, event code, XENKBD_TYPE_??? + * + * + * Pointer relative movement event + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | _TYPE_MOTION | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | rel_x | 8 + * +----------------+----------------+----------------+----------------+ + * | rel_y | 12 + * +----------------+----------------+----------------+----------------+ + * | rel_z | 16 + * +----------------+----------------+----------------+----------------+ + * | reserved | 20 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 40 + * +----------------+----------------+----------------+----------------+ + * + * rel_x - int32_t, relative X motion + * rel_y - int32_t, relative Y motion + * rel_z - int32_t, relative Z motion (wheel) */ -#define XENKBD_TYPE_POS 4 struct xenkbd_motion { - uint8_t type; /* XENKBD_TYPE_MOTION */ - int32_t rel_x; /* relative X motion */ - int32_t rel_y; /* relative Y motion */ - int32_t rel_z; /* relative Z motion (wheel) */ + uint8_t type; + int32_t rel_x; + int32_t rel_y; + int32_t rel_z; }; +/* + * Key event (includes pointer buttons) + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | _TYPE_KEY | pressed | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | keycode | 8 + * +----------------+----------------+----------------+----------------+ + * | reserved | 12 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 40 + * +----------------+----------------+----------------+----------------+ + * + * pressed - uint8_t, 1 if pressed; 0 otherwise + * keycode - uint32_t, KEY_* from linux/input.h + */ + struct xenkbd_key { - uint8_t type; /* XENKBD_TYPE_KEY */ - uint8_t pressed; /* 1 if pressed; 0 otherwise */ - uint32_t keycode; /* KEY_* from linux/input.h */ + uint8_t type; + uint8_t pressed; + uint32_t keycode; }; +/* + * Pointer absolute position event + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | _TYPE_POS | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | abs_x | 8 + * +----------------+----------------+----------------+----------------+ + * | abs_y | 12 + * +----------------+----------------+----------------+----------------+ + * | rel_z | 16 + * +----------------+----------------+----------------+----------------+ + * | reserved | 20 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 40 + * +----------------+----------------+----------------+----------------+ + * + * abs_x - int32_t, absolute X position (in FB pixels) + * abs_y - int32_t, absolute Y position (in FB pixels) + * rel_z - int32_t, relative Z motion (wheel) + */ + struct xenkbd_position { - uint8_t type; /* XENKBD_TYPE_POS */ - int32_t abs_x; /* absolute X position (in FB pixels) */ - int32_t abs_y; /* absolute Y position (in FB pixels) */ - int32_t rel_z; /* relative Z motion (wheel) */ + uint8_t type; + int32_t abs_x; + int32_t abs_y; + int32_t rel_z; +}; + +/* + * Multi-touch event and its sub-types + * + * All multi-touch event packets have common header: + * + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | _TYPE_MTOUCH | event_type | contact_id | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * + * event_type - unt8_t, multi-touch event sub-type, XENKBD_MT_EV_??? + * contact_id - unt8_t, ID of the contact + * + * Touch interactions can consist of one or more contacts. + * For each contact, a series of events is generated, starting + * with a down event, followed by zero or more motion events, + * and ending with an up event. Events relating to the same + * contact point can be identified by the ID of the sequence: contact ID. + * Contact ID may be reused after XENKBD_MT_EV_UP event and + * is in the [0; XENKBD_FIELD_NUM_CONTACTS - 1] range. + * + * For further information please refer to documentation on Wayland [1], + * Linux [2] and Windows [3] multi-touch support. + * + * [1] https://cgit.freedesktop.org/wayland/wayland/tree/protocol/wayland.xml + * [2] https://www.kernel.org/doc/Documentation/input/multi-touch-protocol.txt + * [3] https://msdn.microsoft.com/en-us/library/jj151564(v=vs.85).aspx + * + * + * Multi-touch down event - sent when a new touch is made: touch is assigned + * a unique contact ID, sent with this and consequent events related + * to this touch. + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | _TYPE_MTOUCH | _MT_EV_DOWN | contact_id | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | abs_x | 12 + * +----------------+----------------+----------------+----------------+ + * | abs_y | 16 + * +----------------+----------------+----------------+----------------+ + * | reserved | 20 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 40 + * +----------------+----------------+----------------+----------------+ + * + * abs_x - int32_t, absolute X position, in pixels + * abs_y - int32_t, absolute Y position, in pixels + * + * Multi-touch contact release event + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | _TYPE_MTOUCH | _MT_EV_UP | contact_id | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 40 + * +----------------+----------------+----------------+----------------+ + * + * Multi-touch motion event + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | _TYPE_MTOUCH | _MT_EV_MOTION | contact_id | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | abs_x | 12 + * +----------------+----------------+----------------+----------------+ + * | abs_y | 16 + * +----------------+----------------+----------------+----------------+ + * | reserved | 20 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 40 + * +----------------+----------------+----------------+----------------+ + * + * abs_x - int32_t, absolute X position, in pixels, + * abs_y - int32_t, absolute Y position, in pixels, + * + * Multi-touch input synchronization event - shows end of a set of events + * which logically belong together. + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | _TYPE_MTOUCH | _MT_EV_SYN | contact_id | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 40 + * +----------------+----------------+----------------+----------------+ + * + * Multi-touch shape event - touch point's shape has changed its shape. + * Shape is approximated by an ellipse through the major and minor axis + * lengths: major is the longer diameter of the ellipse and minor is the + * shorter one. Center of the ellipse is reported via + * XENKBD_MT_EV_DOWN/XENKBD_MT_EV_MOTION events. + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | _TYPE_MTOUCH | _MT_EV_SHAPE | contact_id | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | major | 12 + * +----------------+----------------+----------------+----------------+ + * | minor | 16 + * +----------------+----------------+----------------+----------------+ + * | reserved | 20 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 40 + * +----------------+----------------+----------------+----------------+ + * + * major - unt32_t, length of the major axis, pixels + * minor - unt32_t, length of the minor axis, pixels + * + * Multi-touch orientation event - touch point's shape has changed + * its orientation: calculated as a clockwise angle between the major axis + * of the ellipse and positive Y axis in degrees, [-180; +180]. + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | _TYPE_MTOUCH | _MT_EV_ORIENT | contact_id | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | orientation | reserved | 12 + * +----------------+----------------+----------------+----------------+ + * | reserved | 16 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 40 + * +----------------+----------------+----------------+----------------+ + * + * orientation - int16_t, clockwise angle of the major axis + */ + +struct xenkbd_mtouch { + uint8_t type; /* XENKBD_TYPE_MTOUCH */ + uint8_t event_type; /* XENKBD_MT_EV_??? */ + uint8_t contact_id; + uint8_t reserved[5]; /* reserved for the future use */ + union { + struct { + int32_t abs_x; /* absolute X position, pixels */ + int32_t abs_y; /* absolute Y position, pixels */ + } pos; + struct { + uint32_t major; /* length of the major axis, pixels */ + uint32_t minor; /* length of the minor axis, pixels */ + } shape; + int16_t orientation; /* clockwise angle of the major axis */ + } u; }; #define XENKBD_IN_EVENT_SIZE 40 @@ -72,15 +461,26 @@ union xenkbd_in_event { struct xenkbd_motion motion; struct xenkbd_key key; struct xenkbd_position pos; + struct xenkbd_mtouch mtouch; char pad[XENKBD_IN_EVENT_SIZE]; }; -/* Out events (frontend -> backend) */ - /* + ***************************************************************************** + * Frontend to backend events + ***************************************************************************** + * * Out events may be sent only when requested by backend, and receipt * of an unknown out event is an error. * No out events currently defined. + + * All event packets have the same length (40 octets) + * All event packets have common header: + * 0 octet + * +-----------------+ + * | type | + * +-----------------+ + * type - uint8_t, event code */ #define XENKBD_OUT_EVENT_SIZE 40 @@ -90,7 +490,11 @@ union xenkbd_out_event { char pad[XENKBD_OUT_EVENT_SIZE]; }; -/* shared page */ +/* + ***************************************************************************** + * Shared page + ***************************************************************************** + */ #define XENKBD_IN_RING_SIZE 2048 #define XENKBD_IN_RING_LEN (XENKBD_IN_RING_SIZE / XENKBD_IN_EVENT_SIZE) @@ -113,4 +517,4 @@ struct xenkbd_page { uint32_t out_cons, out_prod; }; -#endif +#endif /* __XEN_PUBLIC_IO_KBDIF_H__ */ diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h index 21f4fbd55e48..c79456855539 100644 --- a/include/xen/interface/io/ring.h +++ b/include/xen/interface/io/ring.h @@ -283,4 +283,147 @@ struct __name##_back_ring { \ (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \ } while (0) + +/* + * DEFINE_XEN_FLEX_RING_AND_INTF defines two monodirectional rings and + * functions to check if there is data on the ring, and to read and + * write to them. + * + * DEFINE_XEN_FLEX_RING is similar to DEFINE_XEN_FLEX_RING_AND_INTF, but + * does not define the indexes page. As different protocols can have + * extensions to the basic format, this macro allow them to define their + * own struct. + * + * XEN_FLEX_RING_SIZE + * Convenience macro to calculate the size of one of the two rings + * from the overall order. + * + * $NAME_mask + * Function to apply the size mask to an index, to reduce the index + * within the range [0-size]. + * + * $NAME_read_packet + * Function to read data from the ring. The amount of data to read is + * specified by the "size" argument. + * + * $NAME_write_packet + * Function to write data to the ring. The amount of data to write is + * specified by the "size" argument. + * + * $NAME_get_ring_ptr + * Convenience function that returns a pointer to read/write to the + * ring at the right location. + * + * $NAME_data_intf + * Indexes page, shared between frontend and backend. It also + * contains the array of grant refs. + * + * $NAME_queued + * Function to calculate how many bytes are currently on the ring, + * ready to be read. It can also be used to calculate how much free + * space is currently on the ring (XEN_FLEX_RING_SIZE() - + * $NAME_queued()). + */ + +#ifndef XEN_PAGE_SHIFT +/* The PAGE_SIZE for ring protocols and hypercall interfaces is always + * 4K, regardless of the architecture, and page granularity chosen by + * operating systems. + */ +#define XEN_PAGE_SHIFT 12 +#endif +#define XEN_FLEX_RING_SIZE(order) \ + (1UL << ((order) + XEN_PAGE_SHIFT - 1)) + +#define DEFINE_XEN_FLEX_RING(name) \ +static inline RING_IDX name##_mask(RING_IDX idx, RING_IDX ring_size) \ +{ \ + return idx & (ring_size - 1); \ +} \ + \ +static inline unsigned char *name##_get_ring_ptr(unsigned char *buf, \ + RING_IDX idx, \ + RING_IDX ring_size) \ +{ \ + return buf + name##_mask(idx, ring_size); \ +} \ + \ +static inline void name##_read_packet(void *opaque, \ + const unsigned char *buf, \ + size_t size, \ + RING_IDX masked_prod, \ + RING_IDX *masked_cons, \ + RING_IDX ring_size) \ +{ \ + if (*masked_cons < masked_prod || \ + size <= ring_size - *masked_cons) { \ + memcpy(opaque, buf + *masked_cons, size); \ + } else { \ + memcpy(opaque, buf + *masked_cons, ring_size - *masked_cons); \ + memcpy((unsigned char *)opaque + ring_size - *masked_cons, buf, \ + size - (ring_size - *masked_cons)); \ + } \ + *masked_cons = name##_mask(*masked_cons + size, ring_size); \ +} \ + \ +static inline void name##_write_packet(unsigned char *buf, \ + const void *opaque, \ + size_t size, \ + RING_IDX *masked_prod, \ + RING_IDX masked_cons, \ + RING_IDX ring_size) \ +{ \ + if (*masked_prod < masked_cons || \ + size <= ring_size - *masked_prod) { \ + memcpy(buf + *masked_prod, opaque, size); \ + } else { \ + memcpy(buf + *masked_prod, opaque, ring_size - *masked_prod); \ + memcpy(buf, (unsigned char *)opaque + (ring_size - *masked_prod), \ + size - (ring_size - *masked_prod)); \ + } \ + *masked_prod = name##_mask(*masked_prod + size, ring_size); \ +} \ + \ +static inline RING_IDX name##_queued(RING_IDX prod, \ + RING_IDX cons, \ + RING_IDX ring_size) \ +{ \ + RING_IDX size; \ + \ + if (prod == cons) \ + return 0; \ + \ + prod = name##_mask(prod, ring_size); \ + cons = name##_mask(cons, ring_size); \ + \ + if (prod == cons) \ + return ring_size; \ + \ + if (prod > cons) \ + size = prod - cons; \ + else \ + size = ring_size - (cons - prod); \ + return size; \ +} \ + \ +struct name##_data { \ + unsigned char *in; /* half of the allocation */ \ + unsigned char *out; /* half of the allocation */ \ +} + +#define DEFINE_XEN_FLEX_RING_AND_INTF(name) \ +struct name##_data_intf { \ + RING_IDX in_cons, in_prod; \ + \ + uint8_t pad1[56]; \ + \ + RING_IDX out_cons, out_prod; \ + \ + uint8_t pad2[56]; \ + \ + RING_IDX ring_order; \ + grant_ref_t ref[]; \ +}; \ +DEFINE_XEN_FLEX_RING(name) + #endif /* __XEN_PUBLIC_IO_RING_H__ */ diff --git a/include/xen/interface/io/sndif.h b/include/xen/interface/io/sndif.h new file mode 100644 index 000000000000..5c918276835e --- /dev/null +++ b/include/xen/interface/io/sndif.h @@ -0,0 +1,793 @@ +/****************************************************************************** + * sndif.h + * + * Unified sound-device I/O interface for Xen guest OSes. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (C) 2013-2015 GlobalLogic Inc. + * Copyright (C) 2016-2017 EPAM Systems Inc. + * + * Authors: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com> + * Oleksandr Grytsov <oleksandr_grytsov@epam.com> + * Oleksandr Dmytryshyn <oleksandr.dmytryshyn@globallogic.com> + * Iurii Konovalenko <iurii.konovalenko@globallogic.com> + */ + +#ifndef __XEN_PUBLIC_IO_SNDIF_H__ +#define __XEN_PUBLIC_IO_SNDIF_H__ + +#include "ring.h" +#include "../grant_table.h" + +/* + ****************************************************************************** + * Feature and Parameter Negotiation + ****************************************************************************** + * + * Front->back notifications: when enqueuing a new request, sending a + * notification can be made conditional on xensnd_req (i.e., the generic + * hold-off mechanism provided by the ring macros). Backends must set + * xensnd_req appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()). + * + * Back->front notifications: when enqueuing a new response, sending a + * notification can be made conditional on xensnd_resp (i.e., the generic + * hold-off mechanism provided by the ring macros). Frontends must set + * xensnd_resp appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()). + * + * The two halves of a para-virtual sound card driver utilize nodes within + * XenStore to communicate capabilities and to negotiate operating parameters. + * This section enumerates these nodes which reside in the respective front and + * backend portions of XenStore, following the XenBus convention. + * + * All data in XenStore is stored as strings. Nodes specifying numeric + * values are encoded in decimal. Integer value ranges listed below are + * expressed as fixed sized integer types capable of storing the conversion + * of a properly formated node string, without loss of information. + * + ****************************************************************************** + * Example configuration + ****************************************************************************** + * + * Note: depending on the use-case backend can expose more sound cards and + * PCM devices/streams than the underlying HW physically has by employing + * SW mixers, configuring virtual sound streams, channels etc. + * + * This is an example of backend and frontend configuration: + * + *--------------------------------- Backend ----------------------------------- + * + * /local/domain/0/backend/vsnd/1/0/frontend-id = "1" + * /local/domain/0/backend/vsnd/1/0/frontend = "/local/domain/1/device/vsnd/0" + * /local/domain/0/backend/vsnd/1/0/state = "4" + * /local/domain/0/backend/vsnd/1/0/versions = "1,2" + * + *--------------------------------- Frontend ---------------------------------- + * + * /local/domain/1/device/vsnd/0/backend-id = "0" + * /local/domain/1/device/vsnd/0/backend = "/local/domain/0/backend/vsnd/1/0" + * /local/domain/1/device/vsnd/0/state = "4" + * /local/domain/1/device/vsnd/0/version = "1" + * + *----------------------------- Card configuration ---------------------------- + * + * /local/domain/1/device/vsnd/0/short-name = "Card short name" + * /local/domain/1/device/vsnd/0/long-name = "Card long name" + * /local/domain/1/device/vsnd/0/sample-rates = "8000,32000,44100,48000,96000" + * /local/domain/1/device/vsnd/0/sample-formats = "s8,u8,s16_le,s16_be" + * /local/domain/1/device/vsnd/0/buffer-size = "262144" + * + *------------------------------- PCM device 0 -------------------------------- + * + * /local/domain/1/device/vsnd/0/0/name = "General analog" + * /local/domain/1/device/vsnd/0/0/channels-max = "5" + * + *----------------------------- Stream 0, playback ---------------------------- + * + * /local/domain/1/device/vsnd/0/0/0/type = "p" + * /local/domain/1/device/vsnd/0/0/0/sample-formats = "s8,u8" + * /local/domain/1/device/vsnd/0/0/0/unique-id = "0" + * + * /local/domain/1/device/vsnd/0/0/0/ring-ref = "386" + * /local/domain/1/device/vsnd/0/0/0/event-channel = "15" + * + *------------------------------ Stream 1, capture ---------------------------- + * + * /local/domain/1/device/vsnd/0/0/1/type = "c" + * /local/domain/1/device/vsnd/0/0/1/channels-max = "2" + * /local/domain/1/device/vsnd/0/0/1/unique-id = "1" + * + * /local/domain/1/device/vsnd/0/0/1/ring-ref = "384" + * /local/domain/1/device/vsnd/0/0/1/event-channel = "13" + * + *------------------------------- PCM device 1 -------------------------------- + * + * /local/domain/1/device/vsnd/0/1/name = "HDMI-0" + * /local/domain/1/device/vsnd/0/1/sample-rates = "8000,32000,44100" + * + *------------------------------ Stream 0, capture ---------------------------- + * + * /local/domain/1/device/vsnd/0/1/0/type = "c" + * /local/domain/1/device/vsnd/0/1/0/unique-id = "2" + * + * /local/domain/1/device/vsnd/0/1/0/ring-ref = "387" + * /local/domain/1/device/vsnd/0/1/0/event-channel = "151" + * + *------------------------------- PCM device 2 -------------------------------- + * + * /local/domain/1/device/vsnd/0/2/name = "SPDIF" + * + *----------------------------- Stream 0, playback ---------------------------- + * + * /local/domain/1/device/vsnd/0/2/0/type = "p" + * /local/domain/1/device/vsnd/0/2/0/unique-id = "3" + * + * /local/domain/1/device/vsnd/0/2/0/ring-ref = "389" + * /local/domain/1/device/vsnd/0/2/0/event-channel = "152" + * + ****************************************************************************** + * Backend XenBus Nodes + ****************************************************************************** + * + *----------------------------- Protocol version ------------------------------ + * + * versions + * Values: <string> + * + * List of XENSND_LIST_SEPARATOR separated protocol versions supported + * by the backend. For example "1,2,3". + * + ****************************************************************************** + * Frontend XenBus Nodes + ****************************************************************************** + * + *-------------------------------- Addressing --------------------------------- + * + * dom-id + * Values: <uint16_t> + * + * Domain identifier. + * + * dev-id + * Values: <uint16_t> + * + * Device identifier. + * + * pcm-dev-idx + * Values: <uint8_t> + * + * Zero based contigous index of the PCM device. + * + * stream-idx + * Values: <uint8_t> + * + * Zero based contigous index of the stream of the PCM device. + * + * The following pattern is used for addressing: + * /local/domain/<dom-id>/device/vsnd/<dev-id>/<pcm-dev-idx>/<stream-idx>/... + * + *----------------------------- Protocol version ------------------------------ + * + * version + * Values: <string> + * + * Protocol version, chosen among the ones supported by the backend. + * + *------------------------------- PCM settings -------------------------------- + * + * Every virtualized sound frontend has a set of PCM devices and streams, each + * could be individually configured. Part of the PCM configuration can be + * defined at higher level of the hierarchy and be fully or partially re-used + * by the underlying layers. These configuration values are: + * o number of channels (min/max) + * o supported sample rates + * o supported sample formats. + * E.g. one can define these values for the whole card, device or stream. + * Every underlying layer in turn can re-define some or all of them to better + * fit its needs. For example, card may define number of channels to be + * in [1; 8] range, and some particular stream may be limited to [1; 2] only. + * The rule is that the underlying layer must be a subset of the upper layer + * range. + * + * channels-min + * Values: <uint8_t> + * + * The minimum amount of channels that is supported, [1; channels-max]. + * Optional, if not set or omitted a value of 1 is used. + * + * channels-max + * Values: <uint8_t> + * + * The maximum amount of channels that is supported. + * Must be at least <channels-min>. + * + * sample-rates + * Values: <list of uint32_t> + * + * List of supported sample rates separated by XENSND_LIST_SEPARATOR. + * Sample rates are expressed as a list of decimal values w/o any + * ordering requirement. + * + * sample-formats + * Values: <list of XENSND_PCM_FORMAT_XXX_STR> + * + * List of supported sample formats separated by XENSND_LIST_SEPARATOR. + * Items must not exceed XENSND_SAMPLE_FORMAT_MAX_LEN length. + * + * buffer-size + * Values: <uint32_t> + * + * The maximum size in octets of the buffer to allocate per stream. + * + *----------------------- Virtual sound card settings ------------------------- + * short-name + * Values: <char[32]> + * + * Short name of the virtual sound card. Optional. + * + * long-name + * Values: <char[80]> + * + * Long name of the virtual sound card. Optional. + * + *----------------------------- Device settings ------------------------------- + * name + * Values: <char[80]> + * + * Name of the sound device within the virtual sound card. Optional. + * + *----------------------------- Stream settings ------------------------------- + * + * type + * Values: "p", "c" + * + * Stream type: "p" - playback stream, "c" - capture stream + * + * If both capture and playback are needed then two streams need to be + * defined under the same device. + * + * unique-id + * Values: <uint32_t> + * + * After stream initialization it is assigned a unique ID (within the front + * driver), so every stream of the frontend can be identified by the + * backend by this ID. This is not equal to stream-idx as the later is + * zero based within the device, but this index is contigous within the + * driver. + * + *-------------------- Stream Request Transport Parameters -------------------- + * + * event-channel + * Values: <uint32_t> + * + * The identifier of the Xen event channel used to signal activity + * in the ring buffer. + * + * ring-ref + * Values: <uint32_t> + * + * The Xen grant reference granting permission for the backend to map + * a sole page in a single page sized ring buffer. + * + ****************************************************************************** + * STATE DIAGRAMS + ****************************************************************************** + * + * Tool stack creates front and back state nodes with initial state + * XenbusStateInitialising. + * Tool stack creates and sets up frontend sound configuration nodes per domain. + * + * Front Back + * ================================= ===================================== + * XenbusStateInitialising XenbusStateInitialising + * o Query backend device identification + * data. + * o Open and validate backend device. + * | + * | + * V + * XenbusStateInitWait + * + * o Query frontend configuration + * o Allocate and initialize + * event channels per configured + * playback/capture stream. + * o Publish transport parameters + * that will be in effect during + * this connection. + * | + * | + * V + * XenbusStateInitialised + * + * o Query frontend transport parameters. + * o Connect to the event channels. + * | + * | + * V + * XenbusStateConnected + * + * o Create and initialize OS + * virtual sound device instances + * as per configuration. + * | + * | + * V + * XenbusStateConnected + * + * XenbusStateUnknown + * XenbusStateClosed + * XenbusStateClosing + * o Remove virtual sound device + * o Remove event channels + * | + * | + * V + * XenbusStateClosed + * + *------------------------------- Recovery flow ------------------------------- + * + * In case of frontend unrecoverable errors backend handles that as + * if frontend goes into the XenbusStateClosed state. + * + * In case of backend unrecoverable errors frontend tries removing + * the virtualized device. If this is possible at the moment of error, + * then frontend goes into the XenbusStateInitialising state and is ready for + * new connection with backend. If the virtualized device is still in use and + * cannot be removed, then frontend goes into the XenbusStateReconfiguring state + * until either the virtualized device removed or backend initiates a new + * connection. On the virtualized device removal frontend goes into the + * XenbusStateInitialising state. + * + * Note on XenbusStateReconfiguring state of the frontend: if backend has + * unrecoverable errors then frontend cannot send requests to the backend + * and thus cannot provide functionality of the virtualized device anymore. + * After backend is back to normal the virtualized device may still hold some + * state: configuration in use, allocated buffers, client application state etc. + * So, in most cases, this will require frontend to implement complex recovery + * reconnect logic. Instead, by going into XenbusStateReconfiguring state, + * frontend will make sure no new clients of the virtualized device are + * accepted, allow existing client(s) to exit gracefully by signaling error + * state etc. + * Once all the clients are gone frontend can reinitialize the virtualized + * device and get into XenbusStateInitialising state again signaling the + * backend that a new connection can be made. + * + * There are multiple conditions possible under which frontend will go from + * XenbusStateReconfiguring into XenbusStateInitialising, some of them are OS + * specific. For example: + * 1. The underlying OS framework may provide callbacks to signal that the last + * client of the virtualized device has gone and the device can be removed + * 2. Frontend can schedule a deferred work (timer/tasklet/workqueue) + * to periodically check if this is the right time to re-try removal of + * the virtualized device. + * 3. By any other means. + * + ****************************************************************************** + * PCM FORMATS + ****************************************************************************** + * + * XENSND_PCM_FORMAT_<format>[_<endian>] + * + * format: <S/U/F><bits> or <name> + * S - signed, U - unsigned, F - float + * bits - 8, 16, 24, 32 + * name - MU_LAW, GSM, etc. + * + * endian: <LE/BE>, may be absent + * LE - Little endian, BE - Big endian + */ +#define XENSND_PCM_FORMAT_S8 0 +#define XENSND_PCM_FORMAT_U8 1 +#define XENSND_PCM_FORMAT_S16_LE 2 +#define XENSND_PCM_FORMAT_S16_BE 3 +#define XENSND_PCM_FORMAT_U16_LE 4 +#define XENSND_PCM_FORMAT_U16_BE 5 +#define XENSND_PCM_FORMAT_S24_LE 6 +#define XENSND_PCM_FORMAT_S24_BE 7 +#define XENSND_PCM_FORMAT_U24_LE 8 +#define XENSND_PCM_FORMAT_U24_BE 9 +#define XENSND_PCM_FORMAT_S32_LE 10 +#define XENSND_PCM_FORMAT_S32_BE 11 +#define XENSND_PCM_FORMAT_U32_LE 12 +#define XENSND_PCM_FORMAT_U32_BE 13 +#define XENSND_PCM_FORMAT_F32_LE 14 /* 4-byte float, IEEE-754 32-bit, */ +#define XENSND_PCM_FORMAT_F32_BE 15 /* range -1.0 to 1.0 */ +#define XENSND_PCM_FORMAT_F64_LE 16 /* 8-byte float, IEEE-754 64-bit, */ +#define XENSND_PCM_FORMAT_F64_BE 17 /* range -1.0 to 1.0 */ +#define XENSND_PCM_FORMAT_IEC958_SUBFRAME_LE 18 +#define XENSND_PCM_FORMAT_IEC958_SUBFRAME_BE 19 +#define XENSND_PCM_FORMAT_MU_LAW 20 +#define XENSND_PCM_FORMAT_A_LAW 21 +#define XENSND_PCM_FORMAT_IMA_ADPCM 22 +#define XENSND_PCM_FORMAT_MPEG 23 +#define XENSND_PCM_FORMAT_GSM 24 + +/* + ****************************************************************************** + * REQUEST CODES + ****************************************************************************** + */ +#define XENSND_OP_OPEN 0 +#define XENSND_OP_CLOSE 1 +#define XENSND_OP_READ 2 +#define XENSND_OP_WRITE 3 +#define XENSND_OP_SET_VOLUME 4 +#define XENSND_OP_GET_VOLUME 5 +#define XENSND_OP_MUTE 6 +#define XENSND_OP_UNMUTE 7 + +/* + ****************************************************************************** + * XENSTORE FIELD AND PATH NAME STRINGS, HELPERS + ****************************************************************************** + */ +#define XENSND_DRIVER_NAME "vsnd" + +#define XENSND_LIST_SEPARATOR "," +/* Field names */ +#define XENSND_FIELD_BE_VERSIONS "versions" +#define XENSND_FIELD_FE_VERSION "version" +#define XENSND_FIELD_VCARD_SHORT_NAME "short-name" +#define XENSND_FIELD_VCARD_LONG_NAME "long-name" +#define XENSND_FIELD_RING_REF "ring-ref" +#define XENSND_FIELD_EVT_CHNL "event-channel" +#define XENSND_FIELD_DEVICE_NAME "name" +#define XENSND_FIELD_TYPE "type" +#define XENSND_FIELD_STREAM_UNIQUE_ID "unique-id" +#define XENSND_FIELD_CHANNELS_MIN "channels-min" +#define XENSND_FIELD_CHANNELS_MAX "channels-max" +#define XENSND_FIELD_SAMPLE_RATES "sample-rates" +#define XENSND_FIELD_SAMPLE_FORMATS "sample-formats" +#define XENSND_FIELD_BUFFER_SIZE "buffer-size" + +/* Stream type field values. */ +#define XENSND_STREAM_TYPE_PLAYBACK "p" +#define XENSND_STREAM_TYPE_CAPTURE "c" +/* Sample rate max string length */ +#define XENSND_SAMPLE_RATE_MAX_LEN 11 +/* Sample format field values */ +#define XENSND_SAMPLE_FORMAT_MAX_LEN 24 + +#define XENSND_PCM_FORMAT_S8_STR "s8" +#define XENSND_PCM_FORMAT_U8_STR "u8" +#define XENSND_PCM_FORMAT_S16_LE_STR "s16_le" +#define XENSND_PCM_FORMAT_S16_BE_STR "s16_be" +#define XENSND_PCM_FORMAT_U16_LE_STR "u16_le" +#define XENSND_PCM_FORMAT_U16_BE_STR "u16_be" +#define XENSND_PCM_FORMAT_S24_LE_STR "s24_le" +#define XENSND_PCM_FORMAT_S24_BE_STR "s24_be" +#define XENSND_PCM_FORMAT_U24_LE_STR "u24_le" +#define XENSND_PCM_FORMAT_U24_BE_STR "u24_be" +#define XENSND_PCM_FORMAT_S32_LE_STR "s32_le" +#define XENSND_PCM_FORMAT_S32_BE_STR "s32_be" +#define XENSND_PCM_FORMAT_U32_LE_STR "u32_le" +#define XENSND_PCM_FORMAT_U32_BE_STR "u32_be" +#define XENSND_PCM_FORMAT_F32_LE_STR "float_le" +#define XENSND_PCM_FORMAT_F32_BE_STR "float_be" +#define XENSND_PCM_FORMAT_F64_LE_STR "float64_le" +#define XENSND_PCM_FORMAT_F64_BE_STR "float64_be" +#define XENSND_PCM_FORMAT_IEC958_SUBFRAME_LE_STR "iec958_subframe_le" +#define XENSND_PCM_FORMAT_IEC958_SUBFRAME_BE_STR "iec958_subframe_be" +#define XENSND_PCM_FORMAT_MU_LAW_STR "mu_law" +#define XENSND_PCM_FORMAT_A_LAW_STR "a_law" +#define XENSND_PCM_FORMAT_IMA_ADPCM_STR "ima_adpcm" +#define XENSND_PCM_FORMAT_MPEG_STR "mpeg" +#define XENSND_PCM_FORMAT_GSM_STR "gsm" + + +/* + ****************************************************************************** + * STATUS RETURN CODES + ****************************************************************************** + * + * Status return code is zero on success and -XEN_EXX on failure. + * + ****************************************************************************** + * Assumptions + ****************************************************************************** + * o usage of grant reference 0 as invalid grant reference: + * grant reference 0 is valid, but never exposed to a PV driver, + * because of the fact it is already in use/reserved by the PV console. + * o all references in this document to page sizes must be treated + * as pages of size XEN_PAGE_SIZE unless otherwise noted. + * + ****************************************************************************** + * Description of the protocol between frontend and backend driver + ****************************************************************************** + * + * The two halves of a Para-virtual sound driver communicate with + * each other using shared pages and event channels. + * Shared page contains a ring with request/response packets. + * + * Packets, used for input/output operations, e.g. read/write, set/get volume, + * etc., provide offset/length fields in order to allow asynchronous protocol + * operation with buffer space sharing: part of the buffer allocated at + * XENSND_OP_OPEN can be used for audio samples and part, for example, + * for volume control. + * + * All reserved fields in the structures below must be 0. + * + *---------------------------------- Requests --------------------------------- + * + * All request packets have the same length (32 octets) + * All request packets have common header: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | operation | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * id - uint16_t, private guest value, echoed in response + * operation - uint8_t, operation code, XENSND_OP_??? + * + * For all packets which use offset and length: + * offset - uint32_t, read or write data offset within the shared buffer, + * passed with XENSND_OP_OPEN request, octets, + * [0; XENSND_OP_OPEN.buffer_sz - 1]. + * length - uint32_t, read or write data length, octets + * + * Request open - open a PCM stream for playback or capture: + * + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | XENSND_OP_OPEN | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | pcm_rate | 12 + * +----------------+----------------+----------------+----------------+ + * | pcm_format | pcm_channels | reserved | 16 + * +----------------+----------------+----------------+----------------+ + * | buffer_sz | 20 + * +----------------+----------------+----------------+----------------+ + * | gref_directory | 24 + * +----------------+----------------+----------------+----------------+ + * | reserved | 28 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 32 + * +----------------+----------------+----------------+----------------+ + * + * pcm_rate - uint32_t, stream data rate, Hz + * pcm_format - uint8_t, XENSND_PCM_FORMAT_XXX value + * pcm_channels - uint8_t, number of channels of this stream, + * [channels-min; channels-max] + * buffer_sz - uint32_t, buffer size to be allocated, octets + * gref_directory - grant_ref_t, a reference to the first shared page + * describing shared buffer references. At least one page exists. If shared + * buffer size (buffer_sz) exceeds what can be addressed by this single page, + * then reference to the next page must be supplied (see gref_dir_next_page + * below) + */ + +struct xensnd_open_req { + uint32_t pcm_rate; + uint8_t pcm_format; + uint8_t pcm_channels; + uint16_t reserved; + uint32_t buffer_sz; + grant_ref_t gref_directory; +}; + +/* + * Shared page for XENSND_OP_OPEN buffer descriptor (gref_directory in the + * request) employs a list of pages, describing all pages of the shared data + * buffer: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | gref_dir_next_page | 4 + * +----------------+----------------+----------------+----------------+ + * | gref[0] | 8 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | gref[i] | i*4+8 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | gref[N - 1] | N*4+8 + * +----------------+----------------+----------------+----------------+ + * + * gref_dir_next_page - grant_ref_t, reference to the next page describing + * page directory. Must be 0 if there are no more pages in the list. + * gref[i] - grant_ref_t, reference to a shared page of the buffer + * allocated at XENSND_OP_OPEN + * + * Number of grant_ref_t entries in the whole page directory is not + * passed, but instead can be calculated as: + * num_grefs_total = (XENSND_OP_OPEN.buffer_sz + XEN_PAGE_SIZE - 1) / + * XEN_PAGE_SIZE + */ + +struct xensnd_page_directory { + grant_ref_t gref_dir_next_page; + grant_ref_t gref[1]; /* Variable length */ +}; + +/* + * Request close - close an opened pcm stream: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | XENSND_OP_CLOSE| reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 32 + * +----------------+----------------+----------------+----------------+ + * + * Request read/write - used for read (for capture) or write (for playback): + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | operation | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | offset | 12 + * +----------------+----------------+----------------+----------------+ + * | length | 16 + * +----------------+----------------+----------------+----------------+ + * | reserved | 20 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 32 + * +----------------+----------------+----------------+----------------+ + * + * operation - XENSND_OP_READ for read or XENSND_OP_WRITE for write + */ + +struct xensnd_rw_req { + uint32_t offset; + uint32_t length; +}; + +/* + * Request set/get volume - set/get channels' volume of the stream given: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | operation | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | offset | 12 + * +----------------+----------------+----------------+----------------+ + * | length | 16 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 32 + * +----------------+----------------+----------------+----------------+ + * + * operation - XENSND_OP_SET_VOLUME for volume set + * or XENSND_OP_GET_VOLUME for volume get + * Buffer passed with XENSND_OP_OPEN is used to exchange volume + * values: + * + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | channel[0] | 4 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | channel[i] | i*4 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | channel[N - 1] | (N-1)*4 + * +----------------+----------------+----------------+----------------+ + * + * N = XENSND_OP_OPEN.pcm_channels + * i - uint8_t, index of a channel + * channel[i] - sint32_t, volume of i-th channel + * Volume is expressed as a signed value in steps of 0.001 dB, + * while 0 being 0 dB. + * + * Request mute/unmute - mute/unmute stream: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | operation | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | offset | 12 + * +----------------+----------------+----------------+----------------+ + * | length | 16 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 32 + * +----------------+----------------+----------------+----------------+ + * + * operation - XENSND_OP_MUTE for mute or XENSND_OP_UNMUTE for unmute + * Buffer passed with XENSND_OP_OPEN is used to exchange mute/unmute + * values: + * + * 0 octet + * +----------------+----------------+----------------+----------------+ + * | channel[0] | 4 + * +----------------+----------------+----------------+----------------+ + * +/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | channel[i] | i*4 + * +----------------+----------------+----------------+----------------+ + * +/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | channel[N - 1] | (N-1)*4 + * +----------------+----------------+----------------+----------------+ + * + * N = XENSND_OP_OPEN.pcm_channels + * i - uint8_t, index of a channel + * channel[i] - uint8_t, non-zero if i-th channel needs to be muted/unmuted + * + *------------------------------------ N.B. ----------------------------------- + * + * The 'struct xensnd_rw_req' is also used for XENSND_OP_SET_VOLUME, + * XENSND_OP_GET_VOLUME, XENSND_OP_MUTE, XENSND_OP_UNMUTE. + */ + +/* + *---------------------------------- Responses -------------------------------- + * + * All response packets have the same length (32 octets) + * + * Response for all requests: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | operation | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | status | 8 + * +----------------+----------------+----------------+----------------+ + * | reserved | 12 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 32 + * +----------------+----------------+----------------+----------------+ + * + * id - uint16_t, copied from the request + * operation - uint8_t, XENSND_OP_* - copied from request + * status - int32_t, response status, zero on success and -XEN_EXX on failure + */ + +struct xensnd_req { + uint16_t id; + uint8_t operation; + uint8_t reserved[5]; + union { + struct xensnd_open_req open; + struct xensnd_rw_req rw; + uint8_t reserved[24]; + } op; +}; + +struct xensnd_resp { + uint16_t id; + uint8_t operation; + uint8_t reserved; + int32_t status; + uint8_t reserved1[24]; +}; + +DEFINE_RING_TYPES(xen_sndif, struct xensnd_req, struct xensnd_resp); + +#endif /* __XEN_PUBLIC_IO_SNDIF_H__ */ diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index b5486e648607..c44a2ee8c8f8 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -22,6 +22,8 @@ void xen_timer_resume(void); void xen_arch_resume(void); void xen_arch_suspend(void); +void xen_reboot(int reason); + void xen_resume_notifier_register(struct notifier_block *nb); void xen_resume_notifier_unregister(struct notifier_block *nb); @@ -34,11 +36,25 @@ u64 xen_steal_clock(int cpu); int xen_setup_shutdown_event(void); extern unsigned long *xen_contiguous_bitmap; + +#ifdef CONFIG_XEN_PV int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, unsigned int address_bits, dma_addr_t *dma_handle); void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order); +#else +static inline int xen_create_contiguous_region(phys_addr_t pstart, + unsigned int order, + unsigned int address_bits, + dma_addr_t *dma_handle) +{ + return 0; +} + +static inline void xen_destroy_contiguous_region(phys_addr_t pstart, + unsigned int order) { } +#endif struct vm_area_struct; @@ -120,6 +136,9 @@ efi_status_t xen_efi_update_capsule(efi_capsule_header_t **capsules, efi_status_t xen_efi_query_capsule_caps(efi_capsule_header_t **capsules, unsigned long count, u64 *max_size, int *reset_type); +void xen_efi_reset_system(int reset_type, efi_status_t status, + unsigned long data_size, efi_char16_t *data); + #ifdef CONFIG_PREEMPT diff --git a/net/9p/Kconfig b/net/9p/Kconfig index a75174a33723..e6014e0e51f7 100644 --- a/net/9p/Kconfig +++ b/net/9p/Kconfig @@ -22,6 +22,15 @@ config NET_9P_VIRTIO This builds support for a transports between guest partitions and a host partition. +config NET_9P_XEN + depends on XEN + select XEN_XENBUS_FRONTEND + tristate "9P Xen Transport" + help + This builds support for a transport for 9pfs between + two Xen domains. + + config NET_9P_RDMA depends on INET && INFINIBAND && INFINIBAND_ADDR_TRANS tristate "9P RDMA Transport (Experimental)" diff --git a/net/9p/Makefile b/net/9p/Makefile index a0874cc1f718..697ea7caf466 100644 --- a/net/9p/Makefile +++ b/net/9p/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_NET_9P) := 9pnet.o +obj-$(CONFIG_NET_9P_XEN) += 9pnet_xen.o obj-$(CONFIG_NET_9P_VIRTIO) += 9pnet_virtio.o obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o @@ -14,5 +15,8 @@ obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o 9pnet_virtio-objs := \ trans_virtio.o \ +9pnet_xen-objs := \ + trans_xen.o \ + 9pnet_rdma-objs := \ trans_rdma.o \ diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c new file mode 100644 index 000000000000..71e85643b3f9 --- /dev/null +++ b/net/9p/trans_xen.c @@ -0,0 +1,545 @@ +/* + * linux/fs/9p/trans_xen + * + * Xen transport layer. + * + * Copyright (C) 2017 by Stefano Stabellini <stefano@aporeto.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <xen/events.h> +#include <xen/grant_table.h> +#include <xen/xen.h> +#include <xen/xenbus.h> +#include <xen/interface/io/9pfs.h> + +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/rwlock.h> +#include <net/9p/9p.h> +#include <net/9p/client.h> +#include <net/9p/transport.h> + +#define XEN_9PFS_NUM_RINGS 2 +#define XEN_9PFS_RING_ORDER 6 +#define XEN_9PFS_RING_SIZE XEN_FLEX_RING_SIZE(XEN_9PFS_RING_ORDER) + +struct xen_9pfs_header { + uint32_t size; + uint8_t id; + uint16_t tag; + + /* uint8_t sdata[]; */ +} __attribute__((packed)); + +/* One per ring, more than one per 9pfs share */ +struct xen_9pfs_dataring { + struct xen_9pfs_front_priv *priv; + + struct xen_9pfs_data_intf *intf; + grant_ref_t ref; + int evtchn; + int irq; + /* protect a ring from concurrent accesses */ + spinlock_t lock; + + struct xen_9pfs_data data; + wait_queue_head_t wq; + struct work_struct work; +}; + +/* One per 9pfs share */ +struct xen_9pfs_front_priv { + struct list_head list; + struct xenbus_device *dev; + char *tag; + struct p9_client *client; + + int num_rings; + struct xen_9pfs_dataring *rings; +}; + +static LIST_HEAD(xen_9pfs_devs); +static DEFINE_RWLOCK(xen_9pfs_lock); + +/* We don't currently allow canceling of requests */ +static int p9_xen_cancel(struct p9_client *client, struct p9_req_t *req) +{ + return 1; +} + +static int p9_xen_create(struct p9_client *client, const char *addr, char *args) +{ + struct xen_9pfs_front_priv *priv; + + read_lock(&xen_9pfs_lock); + list_for_each_entry(priv, &xen_9pfs_devs, list) { + if (!strcmp(priv->tag, addr)) { + priv->client = client; + read_unlock(&xen_9pfs_lock); + return 0; + } + } + read_unlock(&xen_9pfs_lock); + return -EINVAL; +} + +static void p9_xen_close(struct p9_client *client) +{ + struct xen_9pfs_front_priv *priv; + + read_lock(&xen_9pfs_lock); + list_for_each_entry(priv, &xen_9pfs_devs, list) { + if (priv->client == client) { + priv->client = NULL; + read_unlock(&xen_9pfs_lock); + return; + } + } + read_unlock(&xen_9pfs_lock); +} + +static bool p9_xen_write_todo(struct xen_9pfs_dataring *ring, RING_IDX size) +{ + RING_IDX cons, prod; + + cons = ring->intf->out_cons; + prod = ring->intf->out_prod; + virt_mb(); + + return XEN_9PFS_RING_SIZE - + xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE) >= size; +} + +static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) +{ + struct xen_9pfs_front_priv *priv = NULL; + RING_IDX cons, prod, masked_cons, masked_prod; + unsigned long flags; + u32 size = p9_req->tc->size; + struct xen_9pfs_dataring *ring; + int num; + + read_lock(&xen_9pfs_lock); + list_for_each_entry(priv, &xen_9pfs_devs, list) { + if (priv->client == client) + break; + } + read_unlock(&xen_9pfs_lock); + if (!priv || priv->client != client) + return -EINVAL; + + num = p9_req->tc->tag % priv->num_rings; + ring = &priv->rings[num]; + +again: + while (wait_event_interruptible(ring->wq, + p9_xen_write_todo(ring, size)) != 0) + ; + + spin_lock_irqsave(&ring->lock, flags); + cons = ring->intf->out_cons; + prod = ring->intf->out_prod; + virt_mb(); + + if (XEN_9PFS_RING_SIZE - xen_9pfs_queued(prod, cons, + XEN_9PFS_RING_SIZE) < size) { + spin_unlock_irqrestore(&ring->lock, flags); + goto again; + } + + masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE); + masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE); + + xen_9pfs_write_packet(ring->data.out, p9_req->tc->sdata, size, + &masked_prod, masked_cons, XEN_9PFS_RING_SIZE); + + p9_req->status = REQ_STATUS_SENT; + virt_wmb(); /* write ring before updating pointer */ + prod += size; + ring->intf->out_prod = prod; + spin_unlock_irqrestore(&ring->lock, flags); + notify_remote_via_irq(ring->irq); + + return 0; +} + +static void p9_xen_response(struct work_struct *work) +{ + struct xen_9pfs_front_priv *priv; + struct xen_9pfs_dataring *ring; + RING_IDX cons, prod, masked_cons, masked_prod; + struct xen_9pfs_header h; + struct p9_req_t *req; + int status; + + ring = container_of(work, struct xen_9pfs_dataring, work); + priv = ring->priv; + + while (1) { + cons = ring->intf->in_cons; + prod = ring->intf->in_prod; + virt_rmb(); + + if (xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE) < + sizeof(h)) { + notify_remote_via_irq(ring->irq); + return; + } + + masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE); + masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE); + + /* First, read just the header */ + xen_9pfs_read_packet(&h, ring->data.in, sizeof(h), + masked_prod, &masked_cons, + XEN_9PFS_RING_SIZE); + + req = p9_tag_lookup(priv->client, h.tag); + if (!req || req->status != REQ_STATUS_SENT) { + dev_warn(&priv->dev->dev, "Wrong req tag=%x\n", h.tag); + cons += h.size; + virt_mb(); + ring->intf->in_cons = cons; + continue; + } + + memcpy(req->rc, &h, sizeof(h)); + req->rc->offset = 0; + + masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE); + /* Then, read the whole packet (including the header) */ + xen_9pfs_read_packet(req->rc->sdata, ring->data.in, h.size, + masked_prod, &masked_cons, + XEN_9PFS_RING_SIZE); + + virt_mb(); + cons += h.size; + ring->intf->in_cons = cons; + + status = (req->status != REQ_STATUS_ERROR) ? + REQ_STATUS_RCVD : REQ_STATUS_ERROR; + + p9_client_cb(priv->client, req, status); + } +} + +static irqreturn_t xen_9pfs_front_event_handler(int irq, void *r) +{ + struct xen_9pfs_dataring *ring = r; + + if (!ring || !ring->priv->client) { + /* ignore spurious interrupt */ + return IRQ_HANDLED; + } + + wake_up_interruptible(&ring->wq); + schedule_work(&ring->work); + + return IRQ_HANDLED; +} + +static struct p9_trans_module p9_xen_trans = { + .name = "xen", + .maxsize = 1 << (XEN_9PFS_RING_ORDER + XEN_PAGE_SHIFT), + .def = 1, + .create = p9_xen_create, + .close = p9_xen_close, + .request = p9_xen_request, + .cancel = p9_xen_cancel, + .owner = THIS_MODULE, +}; + +static const struct xenbus_device_id xen_9pfs_front_ids[] = { + { "9pfs" }, + { "" } +}; + +static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv) +{ + int i, j; + + write_lock(&xen_9pfs_lock); + list_del(&priv->list); + write_unlock(&xen_9pfs_lock); + + for (i = 0; i < priv->num_rings; i++) { + if (!priv->rings[i].intf) + break; + if (priv->rings[i].irq > 0) + unbind_from_irqhandler(priv->rings[i].irq, priv->dev); + if (priv->rings[i].data.in) { + for (j = 0; j < (1 << XEN_9PFS_RING_ORDER); j++) { + grant_ref_t ref; + + ref = priv->rings[i].intf->ref[j]; + gnttab_end_foreign_access(ref, 0, 0); + } + free_pages((unsigned long)priv->rings[i].data.in, + XEN_9PFS_RING_ORDER - + (PAGE_SHIFT - XEN_PAGE_SHIFT)); + } + gnttab_end_foreign_access(priv->rings[i].ref, 0, 0); + free_page((unsigned long)priv->rings[i].intf); + } + kfree(priv->rings); + kfree(priv->tag); + kfree(priv); +} + +static int xen_9pfs_front_remove(struct xenbus_device *dev) +{ + struct xen_9pfs_front_priv *priv = dev_get_drvdata(&dev->dev); + + dev_set_drvdata(&dev->dev, NULL); + xen_9pfs_front_free(priv); + return 0; +} + +static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev, + struct xen_9pfs_dataring *ring) +{ + int i = 0; + int ret = -ENOMEM; + void *bytes = NULL; + + init_waitqueue_head(&ring->wq); + spin_lock_init(&ring->lock); + INIT_WORK(&ring->work, p9_xen_response); + + ring->intf = (struct xen_9pfs_data_intf *)get_zeroed_page(GFP_KERNEL); + if (!ring->intf) + return ret; + ret = gnttab_grant_foreign_access(dev->otherend_id, + virt_to_gfn(ring->intf), 0); + if (ret < 0) + goto out; + ring->ref = ret; + bytes = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + XEN_9PFS_RING_ORDER - (PAGE_SHIFT - XEN_PAGE_SHIFT)); + if (!bytes) { + ret = -ENOMEM; + goto out; + } + for (; i < (1 << XEN_9PFS_RING_ORDER); i++) { + ret = gnttab_grant_foreign_access( + dev->otherend_id, virt_to_gfn(bytes) + i, 0); + if (ret < 0) + goto out; + ring->intf->ref[i] = ret; + } + ring->intf->ring_order = XEN_9PFS_RING_ORDER; + ring->data.in = bytes; + ring->data.out = bytes + XEN_9PFS_RING_SIZE; + + ret = xenbus_alloc_evtchn(dev, &ring->evtchn); + if (ret) + goto out; + ring->irq = bind_evtchn_to_irqhandler(ring->evtchn, + xen_9pfs_front_event_handler, + 0, "xen_9pfs-frontend", ring); + if (ring->irq >= 0) + return 0; + + xenbus_free_evtchn(dev, ring->evtchn); + ret = ring->irq; +out: + if (bytes) { + for (i--; i >= 0; i--) + gnttab_end_foreign_access(ring->intf->ref[i], 0, 0); + free_pages((unsigned long)bytes, + XEN_9PFS_RING_ORDER - + (PAGE_SHIFT - XEN_PAGE_SHIFT)); + } + gnttab_end_foreign_access(ring->ref, 0, 0); + free_page((unsigned long)ring->intf); + return ret; +} + +static int xen_9pfs_front_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int ret, i; + struct xenbus_transaction xbt; + struct xen_9pfs_front_priv *priv = NULL; + char *versions; + unsigned int max_rings, max_ring_order, len = 0; + + versions = xenbus_read(XBT_NIL, dev->otherend, "versions", &len); + if (!len) + return -EINVAL; + if (strcmp(versions, "1")) { + kfree(versions); + return -EINVAL; + } + kfree(versions); + max_rings = xenbus_read_unsigned(dev->otherend, "max-rings", 0); + if (max_rings < XEN_9PFS_NUM_RINGS) + return -EINVAL; + max_ring_order = xenbus_read_unsigned(dev->otherend, + "max-ring-page-order", 0); + if (max_ring_order < XEN_9PFS_RING_ORDER) + return -EINVAL; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->dev = dev; + priv->num_rings = XEN_9PFS_NUM_RINGS; + priv->rings = kcalloc(priv->num_rings, sizeof(*priv->rings), + GFP_KERNEL); + if (!priv->rings) { + kfree(priv); + return -ENOMEM; + } + + for (i = 0; i < priv->num_rings; i++) { + priv->rings[i].priv = priv; + ret = xen_9pfs_front_alloc_dataring(dev, &priv->rings[i]); + if (ret < 0) + goto error; + } + + again: + ret = xenbus_transaction_start(&xbt); + if (ret) { + xenbus_dev_fatal(dev, ret, "starting transaction"); + goto error; + } + ret = xenbus_printf(xbt, dev->nodename, "version", "%u", 1); + if (ret) + goto error_xenbus; + ret = xenbus_printf(xbt, dev->nodename, "num-rings", "%u", + priv->num_rings); + if (ret) + goto error_xenbus; + for (i = 0; i < priv->num_rings; i++) { + char str[16]; + + BUILD_BUG_ON(XEN_9PFS_NUM_RINGS > 9); + sprintf(str, "ring-ref%u", i); + ret = xenbus_printf(xbt, dev->nodename, str, "%d", + priv->rings[i].ref); + if (ret) + goto error_xenbus; + + sprintf(str, "event-channel-%u", i); + ret = xenbus_printf(xbt, dev->nodename, str, "%u", + priv->rings[i].evtchn); + if (ret) + goto error_xenbus; + } + priv->tag = xenbus_read(xbt, dev->nodename, "tag", NULL); + if (!priv->tag) { + ret = -EINVAL; + goto error_xenbus; + } + ret = xenbus_transaction_end(xbt, 0); + if (ret) { + if (ret == -EAGAIN) + goto again; + xenbus_dev_fatal(dev, ret, "completing transaction"); + goto error; + } + + write_lock(&xen_9pfs_lock); + list_add_tail(&priv->list, &xen_9pfs_devs); + write_unlock(&xen_9pfs_lock); + dev_set_drvdata(&dev->dev, priv); + xenbus_switch_state(dev, XenbusStateInitialised); + + return 0; + + error_xenbus: + xenbus_transaction_end(xbt, 1); + xenbus_dev_fatal(dev, ret, "writing xenstore"); + error: + dev_set_drvdata(&dev->dev, NULL); + xen_9pfs_front_free(priv); + return ret; +} + +static int xen_9pfs_front_resume(struct xenbus_device *dev) +{ + dev_warn(&dev->dev, "suspsend/resume unsupported\n"); + return 0; +} + +static void xen_9pfs_front_changed(struct xenbus_device *dev, + enum xenbus_state backend_state) +{ + switch (backend_state) { + case XenbusStateReconfiguring: + case XenbusStateReconfigured: + case XenbusStateInitialising: + case XenbusStateInitialised: + case XenbusStateUnknown: + break; + + case XenbusStateInitWait: + break; + + case XenbusStateConnected: + xenbus_switch_state(dev, XenbusStateConnected); + break; + + case XenbusStateClosed: + if (dev->state == XenbusStateClosed) + break; + /* Missed the backend's CLOSING state -- fallthrough */ + case XenbusStateClosing: + xenbus_frontend_closed(dev); + break; + } +} + +static struct xenbus_driver xen_9pfs_front_driver = { + .ids = xen_9pfs_front_ids, + .probe = xen_9pfs_front_probe, + .remove = xen_9pfs_front_remove, + .resume = xen_9pfs_front_resume, + .otherend_changed = xen_9pfs_front_changed, +}; + +int p9_trans_xen_init(void) +{ + if (!xen_domain()) + return -ENODEV; + + pr_info("Initialising Xen transport for 9pfs\n"); + + v9fs_register_trans(&p9_xen_trans); + return xenbus_register_frontend(&xen_9pfs_front_driver); +} +module_init(p9_trans_xen_init); + +void p9_trans_xen_exit(void) +{ + v9fs_unregister_trans(&p9_xen_trans); + return xenbus_unregister_driver(&xen_9pfs_front_driver); +} +module_exit(p9_trans_xen_exit); |