author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-02-21 13:53:41 -0800
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-02-21 13:53:41 -0800
commit | 252b95c0edead46fb188042584d3dcd6d6ede062 (patch)
tree | e43330f4065eae8468f5a549396696608c191ad0
parent | b8989bccd6a0ad49db4795afca56a733e1c19099 (diff)
parent | 4610d240d691768203fdd210a5da0a2e02eddb76 (diff)
download | linux-252b95c0edead46fb188042584d3dcd6d6ede062.tar.bz2
Merge tag 'for-linus-4.11-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip
Pull xen updates from Juergen Gross:
"Xen features and fixes:
- a series from Boris Ostrovsky adding support for booting Linux as
Xen PVH guest
- a series from Juergen Gross streamlining the xenbus driver
- a series from Paul Durrant adding support for the new device model
hypercall
- several small corrections"
* tag 'for-linus-4.11-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip:
xen/privcmd: add IOCTL_PRIVCMD_RESTRICT
xen/privcmd: Add IOCTL_PRIVCMD_DM_OP
xen/privcmd: return -ENOTTY for unimplemented IOCTLs
xen: optimize xenbus driver for multiple concurrent xenstore accesses
xen: modify xenstore watch event interface
xen: clean up xenbus internal headers
xenbus: Neaten xenbus_va_dev_error
xen/pvh: Use Xen's emergency_restart op for PVH guests
xen/pvh: Enable CPU hotplug
xen/pvh: PVH guests always have PV devices
xen/pvh: Initialize grant table for PVH guests
xen/pvh: Make sure we don't use ACPI_IRQ_MODEL_PIC for SCI
xen/pvh: Bootstrap PVH guest
xen/pvh: Import PVH-related Xen public interfaces
xen/x86: Remove PVH support
x86/boot/32: Convert the 32-bit pgtable setup code from assembly to C
xen/manage: correct return value check on xenbus_scanf()
x86/xen: Fix APIC id mismatch warning on Intel
xen/netback: set default upper limit of tx/rx queues to 8
xen/netfront: set default upper limit of tx/rx queues to 8
49 files changed, 1726 insertions, 967 deletions
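For context before the diff itself: the two privcmd ioctls added by this series are meant to be driven from a device-model process such as QEMU. IOCTL_PRIVCMD_RESTRICT locks a /dev/xen/privcmd file descriptor to a single domain, and IOCTL_PRIVCMD_DM_OP forwards an opaque device-model hypercall payload. A minimal userspace sketch follows; the target domid and the payload contents are hypothetical, while the types (struct privcmd_dm_op, struct privcmd_dm_op_buf) are the uapi definitions this series adds to include/uapi/xen/privcmd.h:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <xen/privcmd.h>	/* IOCTL_PRIVCMD_RESTRICT, IOCTL_PRIVCMD_DM_OP */

int main(void)
{
	int fd = open("/dev/xen/privcmd", O_RDWR);
	if (fd < 0) {
		perror("open /dev/xen/privcmd");
		return 1;
	}

	domid_t domid = 1;	/* hypothetical target domain */

	/* Restrict the fd: later operations must target this domid, and
	 * arbitrary hypercalls via IOCTL_PRIVCMD_HYPERCALL return -EPERM. */
	if (ioctl(fd, IOCTL_PRIVCMD_RESTRICT, &domid) < 0)
		perror("IOCTL_PRIVCMD_RESTRICT");

	/* One dm_op buffer; its layout is defined by the hypervisor's
	 * public dm_op interface and is opaque to the kernel (zeroed here). */
	unsigned char payload[64];
	memset(payload, 0, sizeof(payload));

	struct privcmd_dm_op_buf buf = { .uptr = payload, .size = sizeof(payload) };
	struct privcmd_dm_op op = { .dom = domid, .num = 1, .ubufs = &buf };

	if (ioctl(fd, IOCTL_PRIVCMD_DM_OP, &op) < 0)
		perror("IOCTL_PRIVCMD_DM_OP");

	close(fd);
	return 0;
}

The restriction is sticky for the lifetime of the file descriptor (the per-fd privcmd_data is freed in privcmd_release()), so a device model can drop the right to touch other domains before it starts servicing guest requests.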
diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c
index 11d9f2898b16..81e3217b12d3 100644
--- a/arch/arm/xen/enlighten.c
+++ b/arch/arm/xen/enlighten.c
@@ -457,4 +457,5 @@ EXPORT_SYMBOL_GPL(HYPERVISOR_tmem_op);
 EXPORT_SYMBOL_GPL(HYPERVISOR_platform_op);
 EXPORT_SYMBOL_GPL(HYPERVISOR_multicall);
 EXPORT_SYMBOL_GPL(HYPERVISOR_vm_assist);
+EXPORT_SYMBOL_GPL(HYPERVISOR_dm_op);
 EXPORT_SYMBOL_GPL(privcmd_call);
diff --git a/arch/arm/xen/hypercall.S b/arch/arm/xen/hypercall.S
index a648dfc3be30..b0b80c0f09f3 100644
--- a/arch/arm/xen/hypercall.S
+++ b/arch/arm/xen/hypercall.S
@@ -92,6 +92,7 @@ HYPERCALL1(tmem_op);
 HYPERCALL1(platform_op_raw);
 HYPERCALL2(multicall);
 HYPERCALL2(vm_assist);
+HYPERCALL3(dm_op);
 
 ENTRY(privcmd_call)
 	stmdb sp!, {r4}
diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S
index 947830a459d2..401ceb71540c 100644
--- a/arch/arm64/xen/hypercall.S
+++ b/arch/arm64/xen/hypercall.S
@@ -84,6 +84,7 @@ HYPERCALL1(tmem_op);
 HYPERCALL1(platform_op_raw);
 HYPERCALL2(multicall);
 HYPERCALL2(vm_assist);
+HYPERCALL3(dm_op);
 
 ENTRY(privcmd_call)
 	mov x16, x0
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index a12a047184ee..f6d20f6cca12 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -472,6 +472,13 @@ HYPERVISOR_xenpmu_op(unsigned int op, void *arg)
 	return _hypercall2(int, xenpmu_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_dm_op(
+	domid_t dom, unsigned int nr_bufs, void *bufs)
+{
+	return _hypercall3(int, dm_op, dom, nr_bufs, bufs);
+}
+
 static inline void
 MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
 {
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index c7b15f3e2cf3..76b6dbd627df 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -53,5 +53,5 @@ config XEN_DEBUG_FS
 
 config XEN_PVH
 	bool "Support for running as a PVH guest"
-	depends on X86_64 && XEN && XEN_PVHVM
+	depends on XEN && XEN_PVHVM && ACPI
 	def_bool n
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index e47e52787d32..cb0164aee156 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -23,3 +23,4 @@ obj-$(CONFIG_XEN_DEBUG_FS)	+= debugfs.o
 obj-$(CONFIG_XEN_DOM0)		+= vga.o
 obj-$(CONFIG_SWIOTLB_XEN)	+= pci-swiotlb-xen.o
 obj-$(CONFIG_XEN_EFI)		+= efi.o
+obj-$(CONFIG_XEN_PVH)		+= xen-pvh.o
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 44c88ad1841a..bcea81f36fc5 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -145,7 +145,7 @@ static void xen_silent_inquire(int apicid)
 static int xen_cpu_present_to_apicid(int cpu)
 {
 	if (cpu_present(cpu))
-		return xen_get_apic_id(xen_apic_read(APIC_ID));
+		return cpu_data(cpu).apicid;
 	else
 		return BAD_APICID;
 }
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 51ef95232725..ec1d5c46e58f 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -45,6 +45,7 @@
 #include <xen/interface/memory.h>
 #include <xen/interface/nmi.h>
 #include <xen/interface/xen-mca.h>
+#include <xen/interface/hvm/start_info.h>
 #include <xen/features.h>
 #include <xen/page.h>
 #include <xen/hvm.h>
@@ -176,6 +177,20 @@ struct tls_descs {
  */
 static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
 
+#ifdef CONFIG_XEN_PVH
+/*
+ * PVH variables.
+ *
+ * xen_pvh and pvh_bootparams need to live in data segment since they
+ * are used after startup_{32|64}, which clear .bss, are invoked.
+ */
+bool xen_pvh __attribute__((section(".data"))) = 0;
+struct boot_params pvh_bootparams __attribute__((section(".data")));
+
+struct hvm_start_info pvh_start_info;
+unsigned int pvh_start_info_sz = sizeof(pvh_start_info);
+#endif
+
 static void clamp_max_cpus(void)
 {
 #ifdef CONFIG_SMP
@@ -1138,10 +1153,11 @@ void xen_setup_vcpu_info_placement(void)
 		xen_vcpu_setup(cpu);
 	}
 
-	/* xen_vcpu_setup managed to place the vcpu_info within the
-	 * percpu area for all cpus, so make use of it. Note that for
-	 * PVH we want to use native IRQ mechanism. */
-	if (have_vcpu_info_placement && !xen_pvh_domain()) {
+	/*
+	 * xen_vcpu_setup managed to place the vcpu_info within the
+	 * percpu area for all cpus, so make use of it.
+	 */
+	if (have_vcpu_info_placement) {
 		pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
 		pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
 		pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
@@ -1413,49 +1429,9 @@ static void __init xen_boot_params_init_edd(void)
  * Set up the GDT and segment registers for -fstack-protector. Until
  * we do this, we have to be careful not to call any stack-protected
  * function, which is most of the kernel.
- *
- * Note, that it is __ref because the only caller of this after init
- * is PVH which is not going to use xen_load_gdt_boot or other
- * __init functions.
  */
-static void __ref xen_setup_gdt(int cpu)
+static void xen_setup_gdt(int cpu)
 {
-	if (xen_feature(XENFEAT_auto_translated_physmap)) {
-#ifdef CONFIG_X86_64
-		unsigned long dummy;
-
-		load_percpu_segment(cpu); /* We need to access per-cpu area */
-		switch_to_new_gdt(cpu); /* GDT and GS set */
-
-		/* We are switching of the Xen provided GDT to our HVM mode
-		 * GDT. The new GDT has __KERNEL_CS with CS.L = 1
-		 * and we are jumping to reload it.
-		 */
-		asm volatile ("pushq %0\n"
-			      "leaq 1f(%%rip),%0\n"
-			      "pushq %0\n"
-			      "lretq\n"
-			      "1:\n"
-			      : "=&r" (dummy) : "0" (__KERNEL_CS));
-
-		/*
-		 * While not needed, we also set the %es, %ds, and %fs
-		 * to zero. We don't care about %ss as it is NULL.
-		 * Strictly speaking this is not needed as Xen zeros those
-		 * out (and also MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE)
-		 *
-		 * Linux zeros them in cpu_init() and in secondary_startup_64
-		 * (for BSP).
-		 */
-		loadsegment(es, 0);
-		loadsegment(ds, 0);
-		loadsegment(fs, 0);
-#else
-		/* PVH: TODO Implement. */
-		BUG();
-#endif
-		return; /* PVH does not need any PV GDT ops. */
-	}
 	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
 	pv_cpu_ops.load_gdt = xen_load_gdt_boot;
 
@@ -1466,59 +1442,6 @@ static void __ref xen_setup_gdt(int cpu)
 	pv_cpu_ops.load_gdt = xen_load_gdt;
 }
 
-#ifdef CONFIG_XEN_PVH
-/*
- * A PV guest starts with default flags that are not set for PVH, set them
- * here asap.
- */
-static void xen_pvh_set_cr_flags(int cpu)
-{
-
-	/* Some of these are setup in 'secondary_startup_64'. The others:
-	 * X86_CR0_TS, X86_CR0_PE, X86_CR0_ET are set by Xen for HVM guests
-	 * (which PVH shared codepaths), while X86_CR0_PG is for PVH. */
-	write_cr0(read_cr0() | X86_CR0_MP | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM);
-
-	if (!cpu)
-		return;
-	/*
-	 * For BSP, PSE PGE are set in probe_page_size_mask(), for APs
-	 * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu__init_cpu().
-	 */
-	if (boot_cpu_has(X86_FEATURE_PSE))
-		cr4_set_bits_and_update_boot(X86_CR4_PSE);
-
-	if (boot_cpu_has(X86_FEATURE_PGE))
-		cr4_set_bits_and_update_boot(X86_CR4_PGE);
-}
-
-/*
- * Note, that it is ref - because the only caller of this after init
- * is PVH which is not going to use xen_load_gdt_boot or other
- * __init functions.
- */
-void __ref xen_pvh_secondary_vcpu_init(int cpu)
-{
-	xen_setup_gdt(cpu);
-	xen_pvh_set_cr_flags(cpu);
-}
-
-static void __init xen_pvh_early_guest_init(void)
-{
-	if (!xen_feature(XENFEAT_auto_translated_physmap))
-		return;
-
-	BUG_ON(!xen_feature(XENFEAT_hvm_callback_vector));
-
-	xen_pvh_early_cpu_init(0, false);
-	xen_pvh_set_cr_flags(0);
-
-#ifdef CONFIG_X86_32
-	BUG(); /* PVH: Implement proper support. */
-#endif
-}
-#endif /* CONFIG_XEN_PVH */
-
 static void __init xen_dom0_set_legacy_features(void)
 {
 	x86_platform.legacy.rtc = 1;
@@ -1555,24 +1478,17 @@ asmlinkage __visible void __init xen_start_kernel(void)
 	xen_domain_type = XEN_PV_DOMAIN;
 
 	xen_setup_features();
-#ifdef CONFIG_XEN_PVH
-	xen_pvh_early_guest_init();
-#endif
+
 	xen_setup_machphys_mapping();
 
 	/* Install Xen paravirt ops */
 	pv_info = xen_info;
 	pv_init_ops = xen_init_ops;
-	if (!xen_pvh_domain()) {
-		pv_cpu_ops = xen_cpu_ops;
+	pv_cpu_ops = xen_cpu_ops;
 
-		x86_platform.get_nmi_reason = xen_get_nmi_reason;
-	}
+	x86_platform.get_nmi_reason = xen_get_nmi_reason;
 
-	if (xen_feature(XENFEAT_auto_translated_physmap))
-		x86_init.resources.memory_setup =
-			xen_auto_xlated_memory_setup;
-	else
-		x86_init.resources.memory_setup = xen_memory_setup;
+	x86_init.resources.memory_setup = xen_memory_setup;
 
 	x86_init.oem.arch_setup = xen_arch_setup;
 	x86_init.oem.banner = xen_banner;
@@ -1665,18 +1581,15 @@ asmlinkage __visible void __init xen_start_kernel(void)
 	/* set the limit of our address space */
 	xen_reserve_top();
 
-	/* PVH: runs at default kernel iopl of 0 */
-	if (!xen_pvh_domain()) {
-		/*
-		 * We used to do this in xen_arch_setup, but that is too late
-		 * on AMD were early_cpu_init (run before ->arch_setup()) calls
-		 * early_amd_init which pokes 0xcf8 port.
-		 */
-		set_iopl.iopl = 1;
-		rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
-		if (rc != 0)
-			xen_raw_printk("physdev_op failed %d\n", rc);
-	}
+	/*
+	 * We used to do this in xen_arch_setup, but that is too late
+	 * on AMD where early_cpu_init (run before ->arch_setup()) calls
+	 * early_amd_init which pokes 0xcf8 port.
+	 */
+	set_iopl.iopl = 1;
+	rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+	if (rc != 0)
+		xen_raw_printk("physdev_op failed %d\n", rc);
 
 #ifdef CONFIG_X86_32
 	/* set up basic CPUID stuff */
@@ -1758,6 +1671,102 @@ asmlinkage __visible void __init xen_start_kernel(void)
 #endif
 }
 
+#ifdef CONFIG_XEN_PVH
+
+static void xen_pvh_arch_setup(void)
+{
+#ifdef CONFIG_ACPI
+	/* Make sure we don't fall back to (default) ACPI_IRQ_MODEL_PIC. */
+	if (nr_ioapics == 0)
+		acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM;
+#endif
+}
+
+static void __init init_pvh_bootparams(void)
+{
+	struct xen_memory_map memmap;
+	unsigned int i;
+	int rc;
+
+	memset(&pvh_bootparams, 0, sizeof(pvh_bootparams));
+
+	memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_map);
+	set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_map);
+	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
+	if (rc) {
+		xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
+		BUG();
+	}
+
+	if (memmap.nr_entries < E820MAX - 1) {
+		pvh_bootparams.e820_map[memmap.nr_entries].addr =
+			ISA_START_ADDRESS;
+		pvh_bootparams.e820_map[memmap.nr_entries].size =
+			ISA_END_ADDRESS - ISA_START_ADDRESS;
+		pvh_bootparams.e820_map[memmap.nr_entries].type =
+			E820_RESERVED;
+		memmap.nr_entries++;
+	} else
+		xen_raw_printk("Warning: Can't fit ISA range into e820\n");
+
+	sanitize_e820_map(pvh_bootparams.e820_map,
+			  ARRAY_SIZE(pvh_bootparams.e820_map),
+			  &memmap.nr_entries);
+
+	pvh_bootparams.e820_entries = memmap.nr_entries;
+	for (i = 0; i < pvh_bootparams.e820_entries; i++)
+		e820_add_region(pvh_bootparams.e820_map[i].addr,
+				pvh_bootparams.e820_map[i].size,
+				pvh_bootparams.e820_map[i].type);
+
+	pvh_bootparams.hdr.cmd_line_ptr =
+		pvh_start_info.cmdline_paddr;
+
+	/* The first module is always ramdisk. */
+	if (pvh_start_info.nr_modules) {
+		struct hvm_modlist_entry *modaddr =
+			__va(pvh_start_info.modlist_paddr);
+		pvh_bootparams.hdr.ramdisk_image = modaddr->paddr;
+		pvh_bootparams.hdr.ramdisk_size = modaddr->size;
+	}
+
+	/*
+	 * See Documentation/x86/boot.txt.
+	 *
+	 * Version 2.12 supports Xen entry point but we will use default x86/PC
+	 * environment (i.e. hardware_subarch 0).
+	 */
+	pvh_bootparams.hdr.version = 0x212;
+	pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */
+}
+
+/*
+ * This routine (and those that it might call) should not use
+ * anything that lives in .bss since that segment will be cleared later.
+ */
+void __init xen_prepare_pvh(void)
+{
+	u32 msr;
+	u64 pfn;
+
+	if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) {
+		xen_raw_printk("Error: Unexpected magic value (0x%08x)\n",
+				pvh_start_info.magic);
+		BUG();
+	}
+
+	xen_pvh = 1;
+
+	msr = cpuid_ebx(xen_cpuid_base() + 2);
+	pfn = __pa(hypercall_page);
+	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+
+	init_pvh_bootparams();
+
+	x86_init.oem.arch_setup = xen_pvh_arch_setup;
+}
+#endif
+
 void __ref xen_hvm_init_shared_info(void)
 {
 	int cpu;
@@ -1797,20 +1806,29 @@ void __ref xen_hvm_init_shared_info(void)
 
 static void __init init_hvm_pv_info(void)
 {
 	int major, minor;
-	uint32_t eax, ebx, ecx, edx, pages, msr, base;
-	u64 pfn;
+	uint32_t eax, ebx, ecx, edx, base;
 
 	base = xen_cpuid_base();
-	cpuid(base + 1, &eax, &ebx, &ecx, &edx);
+	eax = cpuid_eax(base + 1);
 
 	major = eax >> 16;
 	minor = eax & 0xffff;
 	printk(KERN_INFO "Xen version %d.%d.\n", major, minor);
 
-	cpuid(base + 2, &pages, &msr, &ecx, &edx);
+	xen_domain_type = XEN_HVM_DOMAIN;
 
-	pfn = __pa(hypercall_page);
-	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+	/* PVH set up hypercall page in xen_prepare_pvh(). */
+	if (xen_pvh_domain())
+		pv_info.name = "Xen PVH";
+	else {
+		u64 pfn;
+		uint32_t msr;
+
+		pv_info.name = "Xen HVM";
+		msr = cpuid_ebx(base + 2);
+		pfn = __pa(hypercall_page);
+		wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+	}
 
 	xen_setup_features();
 
@@ -1819,10 +1837,6 @@ static void __init init_hvm_pv_info(void)
 		this_cpu_write(xen_vcpu_id, ebx);
 	else
 		this_cpu_write(xen_vcpu_id, smp_processor_id());
-
-	pv_info.name = "Xen HVM";
-
-	xen_domain_type = XEN_HVM_DOMAIN;
 }
 #endif
 
@@ -1910,6 +1924,9 @@ static void __init xen_hvm_guest_init(void)
 	x86_init.irqs.intr_init = xen_init_IRQ;
 	xen_hvm_init_time_ops();
 	xen_hvm_init_mmu_ops();
+
+	if (xen_pvh_domain())
+		machine_ops.emergency_restart = xen_emergency_restart;
 #ifdef CONFIG_KEXEC_CORE
 	machine_ops.shutdown = xen_hvm_shutdown;
 	machine_ops.crash_shutdown = xen_hvm_crash_shutdown;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 7d5afdb417cc..f6740b5b1738 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1792,10 +1792,6 @@ static void __init set_page_prot_flags(void *addr, pgprot_t prot,
 	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
 	pte_t pte = pfn_pte(pfn, prot);
 
-	/* For PVH no need to set R/O or R/W to pin them or unpin them. */
-	if (xen_feature(XENFEAT_auto_translated_physmap))
-		return;
-
 	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
 		BUG();
 }
@@ -1902,8 +1898,7 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
 * level2_ident_pgt, and level2_kernel_pgt. This means that only the
 * kernel has a physical mapping to start with - but that's enough to
 * get __va working. We need to fill in the rest of the physical
- * mapping once some sort of allocator has been set up. NOTE: for
- * PVH, the page tables are native.
+ * mapping once some sort of allocator has been set up.
 */
 void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 {
@@ -2812,16 +2807,6 @@ static int do_remap_gfn(struct vm_area_struct *vma,
 	BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
 
-	if (xen_feature(XENFEAT_auto_translated_physmap)) {
-#ifdef CONFIG_XEN_PVH
-		/* We need to update the local page tables and the xen HAP */
-		return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr,
-						 prot, domid, pages);
-#else
-		return -EINVAL;
-#endif
-	}
-
 	rmd.mfn = gfn;
 	rmd.prot = prot;
 	/* We use the err_ptr to indicate if there we are doing a contiguous
@@ -2915,10 +2900,6 @@ int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
 	if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
 		return 0;
 
-#ifdef CONFIG_XEN_PVH
-	return xen_xlate_unmap_gfn_range(vma, numpgs, pages);
-#else
 	return -EINVAL;
-#endif
 }
 EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range);
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index 90d1b83cf35f..33a783c77d96 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -73,8 +73,8 @@ bool xen_has_pv_devices(void)
 	if (!xen_domain())
 		return false;
 
-	/* PV domains always have them. */
-	if (xen_pv_domain())
+	/* PV and PVH domains always have them. */
+	if (xen_pv_domain() || xen_pvh_domain())
 		return true;
 
 	/* And user has xen_platform_pci=0 set in guest config as
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index f3f7b41116f7..a8c306cf8868 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -915,39 +915,6 @@ char * __init xen_memory_setup(void)
 }
 
 /*
- * Machine specific memory setup for auto-translated guests.
- */
-char * __init xen_auto_xlated_memory_setup(void)
-{
-	struct xen_memory_map memmap;
-	int i;
-	int rc;
-
-	memmap.nr_entries = ARRAY_SIZE(xen_e820_map);
-	set_xen_guest_handle(memmap.buffer, xen_e820_map);
-
-	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
-	if (rc < 0)
-		panic("No memory map (%d)\n", rc);
-
-	xen_e820_map_entries = memmap.nr_entries;
-
-	sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map),
-			  &xen_e820_map_entries);
-
-	for (i = 0; i < xen_e820_map_entries; i++)
-		e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size,
-				xen_e820_map[i].type);
-
-	/* Remove p2m info, it is not needed. */
-	xen_start_info->mfn_list = 0;
-	xen_start_info->first_p2m_pfn = 0;
-	xen_start_info->nr_p2m_frames = 0;
-
-	return "Xen";
-}
-
-/*
 * Set the bit indicating "nosegneg" library variants should be used.
 * We only need to bother in pure 32-bit mode; compat 32-bit processes
 * can have un-truncated segments, so wrapping around is allowed.
@@ -1032,8 +999,8 @@ void __init xen_pvmmu_arch_setup(void)
 void __init xen_arch_setup(void)
 {
 	xen_panic_handler_init();
-	if (!xen_feature(XENFEAT_auto_translated_physmap))
-		xen_pvmmu_arch_setup();
+
+	xen_pvmmu_arch_setup();
 
 #ifdef CONFIG_ACPI
 	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 311acad7dad2..0dee6f59ea82 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -99,18 +99,8 @@ static void cpu_bringup(void)
 	local_irq_enable();
 }
 
-/*
- * Note: cpu parameter is only relevant for PVH. The reason for passing it
- * is we can't do smp_processor_id until the percpu segments are loaded, for
- * which we need the cpu number! So we pass it in rdi as first parameter.
- */
-asmlinkage __visible void cpu_bringup_and_idle(int cpu)
+asmlinkage __visible void cpu_bringup_and_idle(void)
 {
-#ifdef CONFIG_XEN_PVH
-	if (xen_feature(XENFEAT_auto_translated_physmap) &&
-	    xen_feature(XENFEAT_supervisor_mode_kernel))
-		xen_pvh_secondary_vcpu_init(cpu);
-#endif
 	cpu_bringup();
 	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
@@ -404,61 +394,47 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	gdt = get_cpu_gdt_table(cpu);
 
 #ifdef CONFIG_X86_32
-	/* Note: PVH is not yet supported on x86_32. */
 	ctxt->user_regs.fs = __KERNEL_PERCPU;
 	ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
 #endif
 	memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
 
-	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
-		ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
-		ctxt->flags = VGCF_IN_KERNEL;
-		ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
-		ctxt->user_regs.ds = __USER_DS;
-		ctxt->user_regs.es = __USER_DS;
-		ctxt->user_regs.ss = __KERNEL_DS;
+	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
+	ctxt->flags = VGCF_IN_KERNEL;
+	ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
+	ctxt->user_regs.ds = __USER_DS;
+	ctxt->user_regs.es = __USER_DS;
+	ctxt->user_regs.ss = __KERNEL_DS;
 
-		xen_copy_trap_info(ctxt->trap_ctxt);
+	xen_copy_trap_info(ctxt->trap_ctxt);
 
-		ctxt->ldt_ents = 0;
+	ctxt->ldt_ents = 0;
 
-		BUG_ON((unsigned long)gdt & ~PAGE_MASK);
+	BUG_ON((unsigned long)gdt & ~PAGE_MASK);
 
-		gdt_mfn = arbitrary_virt_to_mfn(gdt);
-		make_lowmem_page_readonly(gdt);
-		make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
+	gdt_mfn = arbitrary_virt_to_mfn(gdt);
+	make_lowmem_page_readonly(gdt);
+	make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
 
-		ctxt->gdt_frames[0] = gdt_mfn;
-		ctxt->gdt_ents      = GDT_ENTRIES;
+	ctxt->gdt_frames[0] = gdt_mfn;
+	ctxt->gdt_ents      = GDT_ENTRIES;
 
-		ctxt->kernel_ss = __KERNEL_DS;
-		ctxt->kernel_sp = idle->thread.sp0;
+	ctxt->kernel_ss = __KERNEL_DS;
+	ctxt->kernel_sp = idle->thread.sp0;
 
 #ifdef CONFIG_X86_32
-		ctxt->event_callback_cs     = __KERNEL_CS;
-		ctxt->failsafe_callback_cs  = __KERNEL_CS;
+	ctxt->event_callback_cs     = __KERNEL_CS;
+	ctxt->failsafe_callback_cs  = __KERNEL_CS;
 #else
-		ctxt->gs_base_kernel = per_cpu_offset(cpu);
-#endif
-		ctxt->event_callback_eip    =
-			(unsigned long)xen_hypervisor_callback;
-		ctxt->failsafe_callback_eip =
-			(unsigned long)xen_failsafe_callback;
-		ctxt->user_regs.cs = __KERNEL_CS;
-		per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
-	}
-#ifdef CONFIG_XEN_PVH
-	else {
-		/*
-		 * The vcpu comes on kernel page tables which have the NX pte
-		 * bit set. This means before DS/SS is touched, NX in
-		 * EFER must be set. Hence the following assembly glue code.
-		 */
-		ctxt->user_regs.eip = (unsigned long)xen_pvh_early_cpu_init;
-		ctxt->user_regs.rdi = cpu;
-		ctxt->user_regs.rsi = true;  /* entry == true */
-	}
+	ctxt->gs_base_kernel = per_cpu_offset(cpu);
 #endif
+	ctxt->event_callback_eip    =
+		(unsigned long)xen_hypervisor_callback;
+	ctxt->failsafe_callback_eip =
+		(unsigned long)xen_failsafe_callback;
+	ctxt->user_regs.cs = __KERNEL_CS;
+	per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
+
 	ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
 	ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir));
 	if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt))
diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
index c5c16dc4f694..9beef333584a 100644
--- a/arch/x86/xen/smp.h
+++ b/arch/x86/xen/smp.h
@@ -21,12 +21,4 @@ static inline int xen_smp_intr_init(unsigned int cpu)
 static inline void xen_smp_intr_free(unsigned int cpu) {}
 #endif /* CONFIG_SMP */
 
-#ifdef CONFIG_XEN_PVH
-extern void xen_pvh_early_cpu_init(int cpu, bool entry);
-#else
-static inline void xen_pvh_early_cpu_init(int cpu, bool entry)
-{
-}
-#endif
-
 #endif
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 7f8d8abf4c1a..37794e42b67d 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -16,25 +16,6 @@
 #include <xen/interface/xen-mca.h>
 #include <asm/xen/interface.h>
 
-#ifdef CONFIG_XEN_PVH
-#define PVH_FEATURES_STR "|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel"
-/* Note the lack of 'hvm_callback_vector'. Older hypervisor will
- * balk at this being part of XEN_ELFNOTE_FEATURES, so we put it in
- * XEN_ELFNOTE_SUPPORTED_FEATURES which older hypervisors will ignore.
- */
-#define PVH_FEATURES ((1 << XENFEAT_writable_page_tables) | \
-		      (1 << XENFEAT_auto_translated_physmap) | \
-		      (1 << XENFEAT_supervisor_mode_kernel) | \
-		      (1 << XENFEAT_hvm_callback_vector))
-/* The XENFEAT_writable_page_tables is not stricly necessary as we set that
- * up regardless whether this CONFIG option is enabled or not, but it
- * clarifies what the right flags need to be.
- */
-#else
-#define PVH_FEATURES_STR ""
-#define PVH_FEATURES (0)
-#endif
-
 	__INIT
 ENTRY(startup_xen)
 	cld
@@ -54,41 +35,6 @@ ENTRY(startup_xen)
 
 	__FINIT
 
-#ifdef CONFIG_XEN_PVH
-/*
- * xen_pvh_early_cpu_init() - early PVH VCPU initialization
- * @cpu:   this cpu number (%rdi)
- * @entry: true if this is a secondary vcpu coming up on this entry
- *         point, false if this is the boot CPU being initialized for
- *         the first time (%rsi)
- *
- * Note: This is called as a function on the boot CPU, and is the entry point
- * on the secondary CPU.
- */
-ENTRY(xen_pvh_early_cpu_init)
-	mov     %rsi, %r11
-
-	/* Gather features to see if NX implemented. */
-	mov     $0x80000001, %eax
-	cpuid
-	mov     %edx, %esi
-
-	mov     $MSR_EFER, %ecx
-	rdmsr
-	bts     $_EFER_SCE, %eax
-
-	bt      $20, %esi
-	jnc     1f      /* No NX, skip setting it */
-	bts     $_EFER_NX, %eax
-1:	wrmsr
-#ifdef CONFIG_SMP
-	cmp     $0, %r11b
-	jne     cpu_bringup_and_idle
-#endif
-	ret
-
-#endif /* CONFIG_XEN_PVH */
-
 	.pushsection .text
 	.balign PAGE_SIZE
 ENTRY(hypercall_page)
@@ -114,10 +60,10 @@ ENTRY(hypercall_page)
 #endif
 	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          _ASM_PTR startup_xen)
 	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
-	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .ascii "!writable_page_tables|pae_pgdir_above_4gb"; .asciz PVH_FEATURES_STR)
-	ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long (PVH_FEATURES) |
-		(1 << XENFEAT_writable_page_tables) |
-		(1 << XENFEAT_dom0))
+	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,
+		.ascii "!writable_page_tables|pae_pgdir_above_4gb")
+	ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES,
+		.long (1 << XENFEAT_writable_page_tables) | (1 << XENFEAT_dom0))
 	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "yes")
 	ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz "generic")
 	ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index ac0a2b0f9e62..f6a41c41ebc7 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -146,5 +146,4 @@ __visible void xen_adjust_exception_frame(void);
 
 extern int xen_panic_handler_init(void);
 
-void xen_pvh_secondary_vcpu_init(int cpu);
 #endif /* XEN_OPS_H */
diff --git a/arch/x86/xen/xen-pvh.S b/arch/x86/xen/xen-pvh.S
new file mode 100644
index 000000000000..5e246716d58f
--- /dev/null
+++ b/arch/x86/xen/xen-pvh.S
@@ -0,0 +1,161 @@
+/*
+ * Copyright C 2016, Oracle and/or its affiliates. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+	.code32
+	.text
+#define _pa(x)          ((x) - __START_KERNEL_map)
+
+#include <linux/elfnote.h>
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/asm.h>
+#include <asm/boot.h>
+#include <asm/processor-flags.h>
+#include <asm/msr.h>
+#include <xen/interface/elfnote.h>
+
+	__HEAD
+
+/*
+ * Entry point for PVH guests.
+ *
+ * Xen ABI specifies the following register state when we come here:
+ *
+ * - `ebx`: contains the physical memory address where the loader has placed
+ *          the boot start info structure.
+ * - `cr0`: bit 0 (PE) must be set. All the other writeable bits are cleared.
+ * - `cr4`: all bits are cleared.
+ * - `cs `: must be a 32-bit read/execute code segment with a base of '0'
+ *          and a limit of '0xFFFFFFFF'. The selector value is unspecified.
+ * - `ds`, `es`: must be a 32-bit read/write data segment with a base of
+ *               '0' and a limit of '0xFFFFFFFF'. The selector values are all
+ *               unspecified.
+ * - `tr`: must be a 32-bit TSS (active) with a base of '0' and a limit
+ *         of '0x67'.
+ * - `eflags`: bit 17 (VM) must be cleared. Bit 9 (IF) must be cleared.
+ *             Bit 8 (TF) must be cleared.
+ *             Other bits are all unspecified.
+ *
+ * All other processor registers and flag bits are unspecified. The OS is in
+ * charge of setting up its own stack, GDT and IDT.
+ */
+
+ENTRY(pvh_start_xen)
+	cld
+
+	lgdt (_pa(gdt))
+
+	mov $(__BOOT_DS),%eax
+	mov %eax,%ds
+	mov %eax,%es
+	mov %eax,%ss
+
+	/* Stash hvm_start_info. */
+	mov $_pa(pvh_start_info), %edi
+	mov %ebx, %esi
+	mov _pa(pvh_start_info_sz), %ecx
+	shr $2,%ecx
+	rep
+	movsl
+
+	mov $_pa(early_stack_end), %esp
+
+	/* Enable PAE mode. */
+	mov %cr4, %eax
+	orl $X86_CR4_PAE, %eax
+	mov %eax, %cr4
+
+#ifdef CONFIG_X86_64
+	/* Enable Long mode. */
+	mov $MSR_EFER, %ecx
+	rdmsr
+	btsl $_EFER_LME, %eax
+	wrmsr
+
+	/* Enable pre-constructed page tables. */
+	mov $_pa(init_level4_pgt), %eax
+	mov %eax, %cr3
+	mov $(X86_CR0_PG | X86_CR0_PE), %eax
+	mov %eax, %cr0
+
+	/* Jump to 64-bit mode. */
+	ljmp $__KERNEL_CS, $_pa(1f)
+
+	/* 64-bit entry point. */
+	.code64
+1:
+	call xen_prepare_pvh
+
+	/* startup_64 expects boot_params in %rsi. */
+	mov $_pa(pvh_bootparams), %rsi
+	mov $_pa(startup_64), %rax
+	jmp *%rax
+
+#else /* CONFIG_X86_64 */
+
+	call mk_early_pgtbl_32
+
+	mov $_pa(initial_page_table), %eax
+	mov %eax, %cr3
+
+	mov %cr0, %eax
+	or $(X86_CR0_PG | X86_CR0_PE), %eax
+	mov %eax, %cr0
+
+	ljmp $__BOOT_CS, $1f
+1:
+	call xen_prepare_pvh
+	mov $_pa(pvh_bootparams), %esi
+
+	/* startup_32 doesn't expect paging and PAE to be on. */
+	ljmp $__BOOT_CS, $_pa(2f)
+2:
+	mov %cr0, %eax
+	and $~X86_CR0_PG, %eax
+	mov %eax, %cr0
+	mov %cr4, %eax
+	and $~X86_CR4_PAE, %eax
+	mov %eax, %cr4
+
+	ljmp $__BOOT_CS, $_pa(startup_32)
+#endif
+END(pvh_start_xen)
+
+	.section ".init.data","aw"
+	.balign 8
+gdt:
+	.word gdt_end - gdt_start
+	.long _pa(gdt_start)
+	.word 0
+gdt_start:
+	.quad 0x0000000000000000            /* NULL descriptor */
+	.quad 0x0000000000000000            /* reserved */
+#ifdef CONFIG_X86_64
+	.quad GDT_ENTRY(0xa09a, 0, 0xfffff) /* __KERNEL_CS */
+#else
+	.quad GDT_ENTRY(0xc09a, 0, 0xfffff) /* __KERNEL_CS */
+#endif
+	.quad GDT_ENTRY(0xc092, 0, 0xfffff) /* __KERNEL_DS */
gdt_end:
+
+	.balign 4
+early_stack:
+	.fill 256, 1, 0
+early_stack_end:
+
+	ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY,
+	        _ASM_PTR (pvh_start_xen - __START_KERNEL_map))
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 415e79b69d34..8fe61b5dc5a6 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -38,8 +38,8 @@ struct backend_info {
 static struct kmem_cache *xen_blkif_cachep;
 static void connect(struct backend_info *);
 static int connect_ring(struct backend_info *);
-static void backend_changed(struct xenbus_watch *, const char **,
-			    unsigned int);
+static void backend_changed(struct xenbus_watch *, const char *,
+			    const char *);
 static void xen_blkif_free(struct xen_blkif *blkif);
 static void xen_vbd_free(struct xen_vbd *vbd);
 
@@ -661,7 +661,7 @@ fail:
 * ready, connect.
 */
 static void backend_changed(struct xenbus_watch *watch,
-			    const char **vec, unsigned int len)
+			    const char *path, const char *token)
 {
 	int err;
 	unsigned major;
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 47b481095d77..f9bcf4a665bc 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -67,6 +67,7 @@ module_param(rx_drain_timeout_msecs, uint, 0444);
 unsigned int rx_stall_timeout_msecs = 60000;
 module_param(rx_stall_timeout_msecs, uint, 0444);
 
+#define MAX_QUEUES_DEFAULT 8
 unsigned int xenvif_max_queues;
 module_param_named(max_queues, xenvif_max_queues, uint, 0644);
 MODULE_PARM_DESC(max_queues,
@@ -1622,11 +1623,12 @@ static int __init netback_init(void)
 	if (!xen_domain())
 		return -ENODEV;
 
-	/* Allow as many queues as there are CPUs if user has not
+	/* Allow as many queues as there are CPUs but max. 8 if user has not
 	 * specified a value.
 	 */
 	if (xenvif_max_queues == 0)
-		xenvif_max_queues = num_online_cpus();
+		xenvif_max_queues = min_t(unsigned int, MAX_QUEUES_DEFAULT,
+					  num_online_cpus());
 
 	if (fatal_skb_slots < XEN_NETBK_LEGACY_SLOTS_MAX) {
 		pr_info("fatal_skb_slots too small (%d), bump it to XEN_NETBK_LEGACY_SLOTS_MAX (%d)\n",
diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c
index 85b742e1c42f..bb854f92f5a5 100644
--- a/drivers/net/xen-netback/xenbus.c
+++ b/drivers/net/xen-netback/xenbus.c
@@ -734,7 +734,7 @@ static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
 }
 
 static void xen_net_rate_changed(struct xenbus_watch *watch,
-				 const char **vec, unsigned int len)
+				 const char *path, const char *token)
 {
 	struct xenvif *vif = container_of(watch, struct xenvif, credit_watch);
 	struct xenbus_device *dev = xenvif_to_xenbus_device(vif);
@@ -791,7 +791,7 @@ static void xen_unregister_credit_watch(struct xenvif *vif)
 }
 
 static void xen_mcast_ctrl_changed(struct xenbus_watch *watch,
-				   const char **vec, unsigned int len)
+				   const char *path, const char *token)
 {
 	struct xenvif *vif = container_of(watch, struct xenvif,
 					  mcast_ctrl_watch);
@@ -866,8 +866,8 @@ static void unregister_hotplug_status_watch(struct backend_info *be)
 }
 
 static void hotplug_status_changed(struct xenbus_watch *watch,
-				   const char **vec,
-				   unsigned int vec_size)
+				   const char *path,
+				   const char *token)
 {
 	struct backend_info *be = container_of(watch,
 					       struct backend_info,
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 1e4125a98291..9c72842e3a58 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -57,6 +57,7 @@
 #include <xen/interface/grant_table.h>
 
 /* Module parameters */
+#define MAX_QUEUES_DEFAULT 8
 static unsigned int xennet_max_queues;
 module_param_named(max_queues, xennet_max_queues, uint, 0644);
 MODULE_PARM_DESC(max_queues,
@@ -2166,11 +2167,12 @@ static int __init netif_init(void)
 
 	pr_info("Initialising Xen virtual ethernet driver\n");
 
-	/* Allow as many queues as there are CPUs if user has not
+	/* Allow as many queues as there are CPUs but max. 8 if user has not
 	 * specified a value.
 	 */
 	if (xennet_max_queues == 0)
-		xennet_max_queues = num_online_cpus();
+		xennet_max_queues = min_t(unsigned int, MAX_QUEUES_DEFAULT,
+					  num_online_cpus());
 
 	return xenbus_register_frontend(&netfront_driver);
 }
diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c
index 5676aefdf2bc..0003912a8111 100644
--- a/drivers/xen/cpu_hotplug.c
+++ b/drivers/xen/cpu_hotplug.c
@@ -68,13 +68,12 @@ static void vcpu_hotplug(unsigned int cpu)
 }
 
 static void handle_vcpu_hotplug_event(struct xenbus_watch *watch,
-					const char **vec, unsigned int len)
+				      const char *path, const char *token)
 {
 	unsigned int cpu;
 	char *cpustr;
-	const char *node = vec[XS_WATCH_PATH];
 
-	cpustr = strstr(node, "cpu/");
+	cpustr = strstr(path, "cpu/");
 	if (cpustr != NULL) {
 		sscanf(cpustr, "cpu/%u", &cpu);
 		vcpu_hotplug(cpu);
@@ -107,7 +106,7 @@ static int __init setup_vcpu_hotplug_event(void)
 		.notifier_call = setup_cpu_watcher };
 
 #ifdef CONFIG_X86
-	if (!xen_pv_domain())
+	if (!xen_pv_domain() && !xen_pvh_domain())
 #else
 	if (!xen_domain())
 #endif
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index fd8e872d2943..6a53577772c9 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -1704,7 +1704,6 @@ void __init xen_init_IRQ(void)
 		pirq_eoi_map = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO);
 		eoi_gmfn.gmfn = virt_to_gfn(pirq_eoi_map);
 		rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn);
-		/* TODO: No PVH support for PIRQ EOI */
 		if (rc != 0) {
 			free_page((unsigned long) pirq_eoi_map);
 			pirq_eoi_map = NULL;
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index bb36b1e1dbcc..d6786b87e13b 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -1146,13 +1146,13 @@ EXPORT_SYMBOL_GPL(gnttab_init);
 
 static int __gnttab_init(void)
 {
+	if (!xen_domain())
+		return -ENODEV;
+
 	/* Delay grant-table initialization in the PV on HVM case */
-	if (xen_hvm_domain())
+	if (xen_hvm_domain() && !xen_pvh_domain())
 		return 0;
 
-	if (!xen_pv_domain())
-		return -ENODEV;
-
 	return gnttab_init();
 }
 /* Starts after core_initcall so that xen_pvh_gnttab_setup can be called
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 26e5e8507f03..c1ec8ee80924 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -218,7 +218,7 @@ static struct shutdown_handler shutdown_handlers[] = {
 };
 
 static void shutdown_handler(struct xenbus_watch *watch,
-			     const char **vec, unsigned int len)
+			     const char *path, const char *token)
 {
 	char *str;
 	struct xenbus_transaction xbt;
@@ -266,8 +266,8 @@ static void shutdown_handler(struct xenbus_watch *watch,
 }
 
 #ifdef CONFIG_MAGIC_SYSRQ
-static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
-			  unsigned int len)
+static void sysrq_handler(struct xenbus_watch *watch, const char *path,
+			  const char *token)
 {
 	char sysrq_key = '\0';
 	struct xenbus_transaction xbt;
@@ -277,7 +277,7 @@ static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
 	err = xenbus_transaction_start(&xbt);
 	if (err)
 		return;
-	if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
+	if (xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key) < 0) {
 		pr_err("Unable to read sysrq code in control/sysrq\n");
 		xenbus_transaction_end(xbt, 1);
 		return;
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 6e3306f4a525..2077a3ac7c0c 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -22,6 +22,7 @@
 #include <linux/pagemap.h>
 #include <linux/seq_file.h>
 #include <linux/miscdevice.h>
+#include <linux/moduleparam.h>
 
 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
@@ -32,6 +33,7 @@
 #include <xen/xen.h>
 #include <xen/privcmd.h>
 #include <xen/interface/xen.h>
+#include <xen/interface/hvm/dm_op.h>
 #include <xen/features.h>
 #include <xen/page.h>
 #include <xen/xen-ops.h>
@@ -43,16 +45,36 @@ MODULE_LICENSE("GPL");
 
 #define PRIV_VMA_LOCKED ((void *)1)
 
+static unsigned int privcmd_dm_op_max_num = 16;
+module_param_named(dm_op_max_nr_bufs, privcmd_dm_op_max_num, uint, 0644);
+MODULE_PARM_DESC(dm_op_max_nr_bufs,
+		 "Maximum number of buffers per dm_op hypercall");
+
+static unsigned int privcmd_dm_op_buf_max_size = 4096;
+module_param_named(dm_op_buf_max_size, privcmd_dm_op_buf_max_size, uint,
+		   0644);
+MODULE_PARM_DESC(dm_op_buf_max_size,
+		 "Maximum size of a dm_op hypercall buffer");
+
+struct privcmd_data {
+	domid_t domid;
+};
+
 static int privcmd_vma_range_is_mapped(
 	       struct vm_area_struct *vma,
 	       unsigned long addr,
	       unsigned long nr_pages);
 
-static long privcmd_ioctl_hypercall(void __user *udata)
+static long privcmd_ioctl_hypercall(struct file *file, void __user *udata)
 {
+	struct privcmd_data *data = file->private_data;
 	struct privcmd_hypercall hypercall;
 	long ret;
 
+	/* Disallow arbitrary hypercalls if restricted */
+	if (data->domid != DOMID_INVALID)
+		return -EPERM;
+
 	if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
 		return -EFAULT;
 
@@ -229,8 +251,9 @@ static int mmap_gfn_range(void *data, void *state)
 	return 0;
 }
 
-static long privcmd_ioctl_mmap(void __user *udata)
+static long privcmd_ioctl_mmap(struct file *file, void __user *udata)
 {
+	struct privcmd_data *data = file->private_data;
 	struct privcmd_mmap mmapcmd;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
@@ -245,6 +268,10 @@ static long privcmd_ioctl_mmap(void __user *udata)
 	if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
 		return -EFAULT;
 
+	/* If restriction is in place, check the domid matches */
+	if (data->domid != DOMID_INVALID && data->domid != mmapcmd.dom)
+		return -EPERM;
+
 	rc = gather_array(&pagelist,
 			  mmapcmd.num, sizeof(struct privcmd_mmap_entry),
 			  mmapcmd.entry);
@@ -416,8 +443,10 @@ static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs)
 
 static const struct vm_operations_struct privcmd_vm_ops;
 
-static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
+static long privcmd_ioctl_mmap_batch(
+	struct file *file, void __user *udata, int version)
 {
+	struct privcmd_data *data = file->private_data;
 	int ret;
 	struct privcmd_mmapbatch_v2 m;
 	struct mm_struct *mm = current->mm;
@@ -446,6 +475,10 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
 		return -EINVAL;
 	}
 
+	/* If restriction is in place, check the domid matches */
+	if (data->domid != DOMID_INVALID && data->domid != m.dom)
+		return -EPERM;
+
 	nr_pages = DIV_ROUND_UP(m.num, XEN_PFN_PER_PAGE);
 	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
 		return -EINVAL;
@@ -548,37 +581,210 @@ out_unlock:
 	goto out;
 }
 
+static int lock_pages(
+	struct privcmd_dm_op_buf kbufs[], unsigned int num,
+	struct page *pages[], unsigned int nr_pages)
+{
+	unsigned int i;
+
+	for (i = 0; i < num; i++) {
+		unsigned int requested;
+		int pinned;
+
+		requested = DIV_ROUND_UP(
+			offset_in_page(kbufs[i].uptr) + kbufs[i].size,
+			PAGE_SIZE);
+		if (requested > nr_pages)
+			return -ENOSPC;
+
+		pinned = get_user_pages_fast(
+			(unsigned long) kbufs[i].uptr,
+			requested, FOLL_WRITE, pages);
+		if (pinned < 0)
+			return pinned;
+
+		nr_pages -= pinned;
+		pages += pinned;
+	}
+
+	return 0;
+}
+
+static void unlock_pages(struct page *pages[], unsigned int nr_pages)
+{
+	unsigned int i;
+
+	if (!pages)
+		return;
+
+	for (i = 0; i < nr_pages; i++) {
+		if (pages[i])
+			put_page(pages[i]);
+	}
+}
+
+static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
+{
+	struct privcmd_data *data = file->private_data;
+	struct privcmd_dm_op kdata;
+	struct privcmd_dm_op_buf *kbufs;
+	unsigned int nr_pages = 0;
+	struct page **pages = NULL;
+	struct xen_dm_op_buf *xbufs = NULL;
+	unsigned int i;
+	long rc;
+
+	if (copy_from_user(&kdata, udata, sizeof(kdata)))
+		return -EFAULT;
+
+	/* If restriction is in place, check the domid matches */
+	if (data->domid != DOMID_INVALID && data->domid != kdata.dom)
+		return -EPERM;
+
+	if (kdata.num == 0)
+		return 0;
+
+	if (kdata.num > privcmd_dm_op_max_num)
+		return -E2BIG;
+
+	kbufs = kcalloc(kdata.num, sizeof(*kbufs), GFP_KERNEL);
+	if (!kbufs)
+		return -ENOMEM;
+
+	if (copy_from_user(kbufs, kdata.ubufs,
+			   sizeof(*kbufs) * kdata.num)) {
+		rc = -EFAULT;
+		goto out;
+	}
+
+	for (i = 0; i < kdata.num; i++) {
+		if (kbufs[i].size > privcmd_dm_op_buf_max_size) {
+			rc = -E2BIG;
+			goto out;
+		}
+
+		if (!access_ok(VERIFY_WRITE, kbufs[i].uptr,
+			       kbufs[i].size)) {
+			rc = -EFAULT;
+			goto out;
+		}
+
+		nr_pages += DIV_ROUND_UP(
+			offset_in_page(kbufs[i].uptr) + kbufs[i].size,
+			PAGE_SIZE);
+	}
+
+	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
+	if (!pages) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	xbufs = kcalloc(kdata.num, sizeof(*xbufs), GFP_KERNEL);
+	if (!xbufs) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	rc = lock_pages(kbufs, kdata.num, pages, nr_pages);
+	if (rc)
+		goto out;
+
+	for (i = 0; i < kdata.num; i++) {
+		set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr);
+		xbufs[i].size = kbufs[i].size;
+	}
+
+	xen_preemptible_hcall_begin();
+	rc = HYPERVISOR_dm_op(kdata.dom, kdata.num, xbufs);
+	xen_preemptible_hcall_end();
+
+out:
+	unlock_pages(pages, nr_pages);
+	kfree(xbufs);
+	kfree(pages);
+	kfree(kbufs);
+
+	return rc;
+}
+
+static long privcmd_ioctl_restrict(struct file *file, void __user *udata)
+{
+	struct privcmd_data *data = file->private_data;
+	domid_t dom;
+
+	if (copy_from_user(&dom, udata, sizeof(dom)))
+		return -EFAULT;
+
+	/* Set restriction to the specified domain, or check it matches */
+	if (data->domid == DOMID_INVALID)
+		data->domid = dom;
+	else if (data->domid != dom)
+		return -EINVAL;
+
+	return 0;
+}
+
 static long privcmd_ioctl(struct file *file,
 			  unsigned int cmd, unsigned long data)
 {
-	int ret = -ENOSYS;
+	int ret = -ENOTTY;
 	void __user *udata = (void __user *) data;
 
 	switch (cmd) {
 	case IOCTL_PRIVCMD_HYPERCALL:
-		ret = privcmd_ioctl_hypercall(udata);
+		ret = privcmd_ioctl_hypercall(file, udata);
 		break;
 
 	case IOCTL_PRIVCMD_MMAP:
-		ret = privcmd_ioctl_mmap(udata);
+		ret = privcmd_ioctl_mmap(file, udata);
 		break;
 
 	case IOCTL_PRIVCMD_MMAPBATCH:
-		ret = privcmd_ioctl_mmap_batch(udata, 1);
+		ret = privcmd_ioctl_mmap_batch(file, udata, 1);
 		break;
 
 	case IOCTL_PRIVCMD_MMAPBATCH_V2:
-		ret = privcmd_ioctl_mmap_batch(udata, 2);
+		ret = privcmd_ioctl_mmap_batch(file, udata, 2);
+		break;
+
+	case IOCTL_PRIVCMD_DM_OP:
+		ret = privcmd_ioctl_dm_op(file, udata);
+		break;
+
+	case IOCTL_PRIVCMD_RESTRICT:
+		ret = privcmd_ioctl_restrict(file, udata);
 		break;
 
 	default:
-		ret = -EINVAL;
 		break;
 	}
 
 	return ret;
 }
 
+static int privcmd_open(struct inode *ino, struct file *file)
+{
+	struct privcmd_data *data = kzalloc(sizeof(*data), GFP_KERNEL);
+
+	if (!data)
+		return -ENOMEM;
+
+	/* DOMID_INVALID implies no restriction */
+	data->domid = DOMID_INVALID;
+
+	file->private_data = data;
+	return 0;
+}
+
+static int privcmd_release(struct inode *ino, struct file *file)
+{
+	struct privcmd_data *data = file->private_data;
+
+	kfree(data);
+	return 0;
+}
+
 static void privcmd_close(struct vm_area_struct *vma)
 {
 	struct page **pages = vma->vm_private_data;
@@ -647,6 +853,8 @@ static int privcmd_vma_range_is_mapped(
 
 const struct file_operations xen_privcmd_fops = {
 	.owner = THIS_MODULE,
 	.unlocked_ioctl = privcmd_ioctl,
+	.open = privcmd_open,
+	.release = privcmd_release,
 	.mmap = privcmd_mmap,
 };
 EXPORT_SYMBOL_GPL(xen_privcmd_fops);
diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c
index 79865b8901ba..e7715cb62eef 100644
--- a/drivers/xen/xen-balloon.c
+++ b/drivers/xen/xen-balloon.c
@@ -55,7 +55,7 @@ static int register_balloon(struct device *dev);
 
 /* React to a change in the target key */
 static void watch_target(struct xenbus_watch *watch,
-			 const char **vec, unsigned int len)
+			 const char *path, const char *token)
 {
 	unsigned long long new_target;
 	int err;
diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c
index 3f0aee0a068b..3814b44bf1f7 100644
--- a/drivers/xen/xen-pciback/xenbus.c
+++ b/drivers/xen/xen-pciback/xenbus.c
@@ -652,7 +652,7 @@ out:
 }
 
 static void xen_pcibk_be_watch(struct xenbus_watch *watch,
-			       const char **vec, unsigned int len)
+			       const char *path, const char *token)
 {
 	struct xen_pcibk_device *pdev =
 		container_of(watch, struct xen_pcibk_device, be_watch);
diff --git a/drivers/xen/xenbus/xenbus.h b/drivers/xen/xenbus/xenbus.h
new file mode 100644
index 000000000000..149c5e7efc89
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus.h
@@ -0,0 +1,135 @@
+/*
+ * Private include for xenbus communications.
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2005 XenSource Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */ + +#ifndef _XENBUS_XENBUS_H +#define _XENBUS_XENBUS_H + +#include <linux/mutex.h> +#include <linux/uio.h> +#include <xen/xenbus.h> + +#define XEN_BUS_ID_SIZE 20 + +struct xen_bus_type { + char *root; + unsigned int levels; + int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename); + int (*probe)(struct xen_bus_type *bus, const char *type, + const char *dir); + void (*otherend_changed)(struct xenbus_watch *watch, const char *path, + const char *token); + struct bus_type bus; +}; + +enum xenstore_init { + XS_UNKNOWN, + XS_PV, + XS_HVM, + XS_LOCAL, +}; + +struct xs_watch_event { + struct list_head list; + unsigned int len; + struct xenbus_watch *handle; + const char *path; + const char *token; + char body[]; +}; + +enum xb_req_state { + xb_req_state_queued, + xb_req_state_wait_reply, + xb_req_state_got_reply, + xb_req_state_aborted +}; + +struct xb_req_data { + struct list_head list; + wait_queue_head_t wq; + struct xsd_sockmsg msg; + enum xsd_sockmsg_type type; + char *body; + const struct kvec *vec; + int num_vecs; + int err; + enum xb_req_state state; + void (*cb)(struct xb_req_data *); + void *par; +}; + +extern enum xenstore_init xen_store_domain_type; +extern const struct attribute_group *xenbus_dev_groups[]; +extern struct mutex xs_response_mutex; +extern struct list_head xs_reply_list; +extern struct list_head xb_write_list; +extern wait_queue_head_t xb_waitq; +extern struct mutex xb_write_mutex; + +int xs_init(void); +int xb_init_comms(void); +void xb_deinit_comms(void); +int xs_watch_msg(struct xs_watch_event *event); +void xs_request_exit(struct xb_req_data *req); + +int xenbus_match(struct device *_dev, struct device_driver *_drv); +int xenbus_dev_probe(struct device *_dev); +int xenbus_dev_remove(struct device *_dev); +int xenbus_register_driver_common(struct xenbus_driver *drv, + struct xen_bus_type *bus, + struct module *owner, + const char *mod_name); +int xenbus_probe_node(struct xen_bus_type *bus, + const char *type, + const char *nodename); +int xenbus_probe_devices(struct xen_bus_type *bus); + +void xenbus_dev_changed(const char *node, struct xen_bus_type *bus); + +void xenbus_dev_shutdown(struct device *_dev); + +int xenbus_dev_suspend(struct device *dev); +int xenbus_dev_resume(struct device *dev); +int xenbus_dev_cancel(struct device *dev); + +void xenbus_otherend_changed(struct xenbus_watch *watch, + const char *path, const char *token, + int ignore_on_shutdown); + +int xenbus_read_otherend_details(struct xenbus_device *xendev, + char *id_node, char *path_node); + +void xenbus_ring_ops_init(void); + +int xenbus_dev_request_and_reply(struct xsd_sockmsg *msg, void *par); +void xenbus_dev_queue_reply(struct xb_req_data *req); + +#endif diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 056da6ee1a35..82a8866758ee 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -47,7 +47,7 @@ #include <xen/xen.h> #include <xen/features.h> -#include "xenbus_probe.h" +#include "xenbus.h" #define XENBUS_PAGES(_grants) (DIV_ROUND_UP(_grants, XEN_PFN_PER_PAGE)) @@ -115,7 +115,7 @@ EXPORT_SYMBOL_GPL(xenbus_strstate); int xenbus_watch_path(struct xenbus_device *dev, const char *path, struct xenbus_watch *watch, void (*callback)(struct xenbus_watch *, - const char **, unsigned int)) + const char *, const char *)) { int err; @@ -153,7 +153,7 @@ EXPORT_SYMBOL_GPL(xenbus_watch_path); int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch, void (*callback)(struct xenbus_watch 
-					  const char **, unsigned int),
+					  const char *, const char *),
 			 const char *pathfmt, ...)
 {
 	int err;
@@ -259,53 +259,34 @@ int xenbus_frontend_closed(struct xenbus_device *dev)
 }
 EXPORT_SYMBOL_GPL(xenbus_frontend_closed);
 
-/**
- * Return the path to the error node for the given device, or NULL on failure.
- * If the value returned is non-NULL, then it is the caller's to kfree.
- */
-static char *error_path(struct xenbus_device *dev)
-{
-	return kasprintf(GFP_KERNEL, "error/%s", dev->nodename);
-}
-
-
 static void xenbus_va_dev_error(struct xenbus_device *dev, int err,
 				const char *fmt, va_list ap)
 {
 	unsigned int len;
-	char *printf_buffer = NULL;
-	char *path_buffer = NULL;
+	char *printf_buffer;
+	char *path_buffer;
 
 #define PRINTF_BUFFER_SIZE 4096
+
 	printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
-	if (printf_buffer == NULL)
-		goto fail;
+	if (!printf_buffer)
+		return;
 
 	len = sprintf(printf_buffer, "%i ", -err);
-	vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap);
+	vsnprintf(printf_buffer + len, PRINTF_BUFFER_SIZE - len, fmt, ap);
 
 	dev_err(&dev->dev, "%s\n", printf_buffer);
 
-	path_buffer = error_path(dev);
-
-	if (path_buffer == NULL) {
+	path_buffer = kasprintf(GFP_KERNEL, "error/%s", dev->nodename);
+	if (!path_buffer ||
+	    xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer))
 		dev_err(&dev->dev, "failed to write error node for %s (%s)\n",
-			dev->nodename, printf_buffer);
-		goto fail;
-	}
+			dev->nodename, printf_buffer);
 
-	if (xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer) != 0) {
-		dev_err(&dev->dev, "failed to write error node for %s (%s)\n",
-			dev->nodename, printf_buffer);
-		goto fail;
-	}
-
-fail:
 	kfree(printf_buffer);
 	kfree(path_buffer);
 }
 
-
 /**
 * xenbus_dev_error
 * @dev: xenbus device
diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c
index ecdecce80a6c..856ada5d39c9 100644
--- a/drivers/xen/xenbus/xenbus_comms.c
+++ b/drivers/xen/xenbus/xenbus_comms.c
@@ -34,19 +34,31 @@
 
 #include <linux/wait.h>
 #include <linux/interrupt.h>
+#include <linux/kthread.h>
 #include <linux/sched.h>
 #include <linux/err.h>
 #include <xen/xenbus.h>
 #include <asm/xen/hypervisor.h>
 #include <xen/events.h>
 #include <xen/page.h>
-#include "xenbus_comms.h"
+#include "xenbus.h"
+
+/* A list of replies. Currently only one will ever be outstanding. */
+LIST_HEAD(xs_reply_list);
+
+/* A list of write requests. */
+LIST_HEAD(xb_write_list);
+DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
+DEFINE_MUTEX(xb_write_mutex);
+
+/* Protect xenbus reader thread against save/restore. */
+DEFINE_MUTEX(xs_response_mutex);
 
 static int xenbus_irq;
+static struct task_struct *xenbus_task;
 
 static DECLARE_WORK(probe_work, xenbus_probe);
 
-static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
 
 static irqreturn_t wake_waiting(int irq, void *unused)
 {
@@ -84,30 +96,31 @@ static const void *get_input_chunk(XENSTORE_RING_IDX cons,
 	return buf + MASK_XENSTORE_IDX(cons);
 }
 
+static int xb_data_to_write(void)
+{
+	struct xenstore_domain_interface *intf = xen_store_interface;
+
+	return (intf->req_prod - intf->req_cons) != XENSTORE_RING_SIZE &&
+		!list_empty(&xb_write_list);
+}
+
 /**
 * xb_write - low level write
 * @data: buffer to send
 * @len: length of buffer
 *
- * Returns 0 on success, error otherwise.
+ * Returns number of bytes written or -err.
 */
-int xb_write(const void *data, unsigned len)
+static int xb_write(const void *data, unsigned int len)
 {
 	struct xenstore_domain_interface *intf = xen_store_interface;
 	XENSTORE_RING_IDX cons, prod;
-	int rc;
+	unsigned int bytes = 0;
 
 	while (len != 0) {
 		void *dst;
 		unsigned int avail;
 
-		rc = wait_event_interruptible(
-			xb_waitq,
-			(intf->req_prod - intf->req_cons) !=
-			XENSTORE_RING_SIZE);
-		if (rc < 0)
-			return rc;
-
 		/* Read indexes, then verify. */
 		cons = intf->req_cons;
 		prod = intf->req_prod;
@@ -115,6 +128,11 @@ int xb_write(const void *data, unsigned len)
 			intf->req_cons = intf->req_prod = 0;
 			return -EIO;
 		}
+		if (!xb_data_to_write())
+			return bytes;
+
+		/* Must write data /after/ reading the consumer index. */
+		virt_mb();
 
 		dst = get_output_chunk(cons, prod, intf->req, &avail);
 		if (avail == 0)
@@ -122,52 +140,45 @@ int xb_write(const void *data, unsigned len)
 		if (avail > len)
 			avail = len;
 
-		/* Must write data /after/ reading the consumer index. */
-		virt_mb();
-
 		memcpy(dst, data, avail);
 		data += avail;
 		len -= avail;
+		bytes += avail;
 
 		/* Other side must not see new producer until data is there. */
 		virt_wmb();
 		intf->req_prod += avail;
 
 		/* Implies mb(): other side will see the updated producer. */
-		notify_remote_via_evtchn(xen_store_evtchn);
+		if (prod <= intf->req_cons)
+			notify_remote_via_evtchn(xen_store_evtchn);
 	}
 
-	return 0;
+	return bytes;
 }
 
-int xb_data_to_read(void)
+static int xb_data_to_read(void)
 {
 	struct xenstore_domain_interface *intf = xen_store_interface;
 	return (intf->rsp_cons != intf->rsp_prod);
 }
 
-int xb_wait_for_data_to_read(void)
-{
-	return wait_event_interruptible(xb_waitq, xb_data_to_read());
-}
-
-int xb_read(void *data, unsigned len)
+static int xb_read(void *data, unsigned int len)
 {
 	struct xenstore_domain_interface *intf = xen_store_interface;
 	XENSTORE_RING_IDX cons, prod;
-	int rc;
+	unsigned int bytes = 0;
 
 	while (len != 0) {
 		unsigned int avail;
 		const char *src;
 
-		rc = xb_wait_for_data_to_read();
-		if (rc < 0)
-			return rc;
-
 		/* Read indexes, then verify. */
 		cons = intf->rsp_cons;
 		prod = intf->rsp_prod;
+		if (cons == prod)
+			return bytes;
+
 		if (!check_indexes(cons, prod)) {
 			intf->rsp_cons = intf->rsp_prod = 0;
 			return -EIO;
@@ -185,17 +196,243 @@ int xb_read(void *data, unsigned len)
 		memcpy(data, src, avail);
 		data += avail;
 		len -= avail;
+		bytes += avail;
 
 		/* Other side must not see free space until we've copied out */
 		virt_mb();
 		intf->rsp_cons += avail;
 
-		pr_debug("Finished read of %i bytes (%i to go)\n", avail, len);
-
 		/* Implies mb(): other side will see the updated consumer. */
-		notify_remote_via_evtchn(xen_store_evtchn);
+		if (intf->rsp_prod - cons >= XENSTORE_RING_SIZE)
+			notify_remote_via_evtchn(xen_store_evtchn);
+	}
+
+	return bytes;
+}
+
+static int process_msg(void)
+{
+	static struct {
+		struct xsd_sockmsg msg;
+		char *body;
+		union {
+			void *alloc;
+			struct xs_watch_event *watch;
+		};
+		bool in_msg;
+		bool in_hdr;
+		unsigned int read;
+	} state;
+	struct xb_req_data *req;
+	int err;
+	unsigned int len;
+
+	if (!state.in_msg) {
+		state.in_msg = true;
+		state.in_hdr = true;
+		state.read = 0;
+
+		/*
+		 * We must disallow save/restore while reading a message.
+		 * A partial read across s/r leaves us out of sync with
+		 * xenstored.
+		 * xs_response_mutex is locked as long as we are processing one
+		 * message. state.in_msg will be true as long as we are holding
+		 * the lock here.
+		 */
+		mutex_lock(&xs_response_mutex);
+
+		if (!xb_data_to_read()) {
+			/* We raced with save/restore: pending data 'gone'. */
*/ + mutex_unlock(&xs_response_mutex); + state.in_msg = false; + return 0; + } + } + + if (state.in_hdr) { + if (state.read != sizeof(state.msg)) { + err = xb_read((void *)&state.msg + state.read, + sizeof(state.msg) - state.read); + if (err < 0) + goto out; + state.read += err; + if (state.read != sizeof(state.msg)) + return 0; + if (state.msg.len > XENSTORE_PAYLOAD_MAX) { + err = -EINVAL; + goto out; + } + } + + len = state.msg.len + 1; + if (state.msg.type == XS_WATCH_EVENT) + len += sizeof(*state.watch); + + state.alloc = kmalloc(len, GFP_NOIO | __GFP_HIGH); + if (!state.alloc) + return -ENOMEM; + + if (state.msg.type == XS_WATCH_EVENT) + state.body = state.watch->body; + else + state.body = state.alloc; + state.in_hdr = false; + state.read = 0; + } + + err = xb_read(state.body + state.read, state.msg.len - state.read); + if (err < 0) + goto out; + + state.read += err; + if (state.read != state.msg.len) + return 0; + + state.body[state.msg.len] = '\0'; + + if (state.msg.type == XS_WATCH_EVENT) { + state.watch->len = state.msg.len; + err = xs_watch_msg(state.watch); + } else { + err = -ENOENT; + mutex_lock(&xb_write_mutex); + list_for_each_entry(req, &xs_reply_list, list) { + if (req->msg.req_id == state.msg.req_id) { + if (req->state == xb_req_state_wait_reply) { + req->msg.type = state.msg.type; + req->msg.len = state.msg.len; + req->body = state.body; + req->state = xb_req_state_got_reply; + list_del(&req->list); + req->cb(req); + } else { + list_del(&req->list); + kfree(req); + } + err = 0; + break; + } + } + mutex_unlock(&xb_write_mutex); + if (err) + goto out; } + mutex_unlock(&xs_response_mutex); + + state.in_msg = false; + state.alloc = NULL; + return err; + + out: + mutex_unlock(&xs_response_mutex); + state.in_msg = false; + kfree(state.alloc); + state.alloc = NULL; + return err; +} + +static int process_writes(void) +{ + static struct { + struct xb_req_data *req; + int idx; + unsigned int written; + } state; + void *base; + unsigned int len; + int err = 0; + + if (!xb_data_to_write()) + return 0; + + mutex_lock(&xb_write_mutex); + + if (!state.req) { + state.req = list_first_entry(&xb_write_list, + struct xb_req_data, list); + state.idx = -1; + state.written = 0; + } + + if (state.req->state == xb_req_state_aborted) + goto out_err; + + while (state.idx < state.req->num_vecs) { + if (state.idx < 0) { + base = &state.req->msg; + len = sizeof(state.req->msg); + } else { + base = state.req->vec[state.idx].iov_base; + len = state.req->vec[state.idx].iov_len; + } + err = xb_write(base + state.written, len - state.written); + if (err < 0) + goto out_err; + state.written += err; + if (state.written != len) + goto out; + + state.idx++; + state.written = 0; + } + + list_del(&state.req->list); + state.req->state = xb_req_state_wait_reply; + list_add_tail(&state.req->list, &xs_reply_list); + state.req = NULL; + + out: + mutex_unlock(&xb_write_mutex); + + return 0; + + out_err: + state.req->msg.type = XS_ERROR; + state.req->err = err; + list_del(&state.req->list); + if (state.req->state == xb_req_state_aborted) + kfree(state.req); + else { + state.req->state = xb_req_state_got_reply; + wake_up(&state.req->wq); + } + + mutex_unlock(&xb_write_mutex); + + state.req = NULL; + + return err; +} + +static int xb_thread_work(void) +{ + return xb_data_to_read() || xb_data_to_write(); +} + +static int xenbus_thread(void *unused) +{ + int err; + + while (!kthread_should_stop()) { + if (wait_event_interruptible(xb_waitq, xb_thread_work())) + continue; + + err = process_msg(); + if (err == -ENOMEM) + 
schedule(); + else if (err) + pr_warn_ratelimited("error %d while reading message\n", + err); + + err = process_writes(); + if (err) + pr_warn_ratelimited("error %d while writing message\n", + err); + } + + xenbus_task = NULL; return 0; } @@ -223,6 +460,7 @@ int xb_init_comms(void) rebind_evtchn_irq(xen_store_evtchn, xenbus_irq); } else { int err; + err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting, 0, "xenbus", &xb_waitq); if (err < 0) { @@ -231,6 +469,13 @@ int xb_init_comms(void) } xenbus_irq = err; + + if (!xenbus_task) { + xenbus_task = kthread_run(xenbus_thread, NULL, + "xenbus"); + if (IS_ERR(xenbus_task)) + return PTR_ERR(xenbus_task); + } } return 0; diff --git a/drivers/xen/xenbus/xenbus_comms.h b/drivers/xen/xenbus/xenbus_comms.h deleted file mode 100644 index 867a2e425208..000000000000 --- a/drivers/xen/xenbus/xenbus_comms.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Private include for xenbus communications. - * - * Copyright (C) 2005 Rusty Russell, IBM Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation; or, when distributed - * separately from the Linux kernel or incorporated into other - * software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#ifndef _XENBUS_COMMS_H -#define _XENBUS_COMMS_H - -#include <linux/fs.h> - -int xs_init(void); -int xb_init_comms(void); -void xb_deinit_comms(void); - -/* Low level routines. 
*/ -int xb_write(const void *data, unsigned len); -int xb_read(void *data, unsigned len); -int xb_data_to_read(void); -int xb_wait_for_data_to_read(void); -extern struct xenstore_domain_interface *xen_store_interface; -extern int xen_store_evtchn; -extern enum xenstore_init xen_store_domain_type; - -extern const struct file_operations xen_xenbus_fops; - -#endif /* _XENBUS_COMMS_H */ diff --git a/drivers/xen/xenbus/xenbus_dev_backend.c b/drivers/xen/xenbus/xenbus_dev_backend.c index 4a41ac9af966..1126701e212e 100644 --- a/drivers/xen/xenbus/xenbus_dev_backend.c +++ b/drivers/xen/xenbus/xenbus_dev_backend.c @@ -16,7 +16,7 @@ #include <xen/events.h> #include <asm/xen/hypervisor.h> -#include "xenbus_comms.h" +#include "xenbus.h" static int xenbus_backend_open(struct inode *inode, struct file *filp) { diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c index 79130b310247..4d343eed08f5 100644 --- a/drivers/xen/xenbus/xenbus_dev_frontend.c +++ b/drivers/xen/xenbus/xenbus_dev_frontend.c @@ -57,12 +57,12 @@ #include <linux/miscdevice.h> #include <linux/init.h> -#include "xenbus_comms.h" - #include <xen/xenbus.h> #include <xen/xen.h> #include <asm/xen/hypervisor.h> +#include "xenbus.h" + /* * An element of a list of outstanding transactions, for which we're * still waiting a reply. @@ -113,6 +113,7 @@ struct xenbus_file_priv { struct list_head read_buffers; wait_queue_head_t read_waitq; + struct kref kref; }; /* Read out any raw xenbus messages queued up. */ @@ -258,26 +259,23 @@ out_fail: } static void watch_fired(struct xenbus_watch *watch, - const char **vec, - unsigned int len) + const char *path, + const char *token) { struct watch_adapter *adap; struct xsd_sockmsg hdr; - const char *path, *token; - int path_len, tok_len, body_len, data_len = 0; + const char *token_caller; + int path_len, tok_len, body_len; int ret; LIST_HEAD(staging_q); adap = container_of(watch, struct watch_adapter, watch); - path = vec[XS_WATCH_PATH]; - token = adap->token; + token_caller = adap->token; path_len = strlen(path) + 1; - tok_len = strlen(token) + 1; - if (len > 2) - data_len = vec[len] - vec[2] + 1; - body_len = path_len + tok_len + data_len; + tok_len = strlen(token_caller) + 1; + body_len = path_len + tok_len; hdr.type = XS_WATCH_EVENT; hdr.len = body_len; @@ -288,9 +286,7 @@ static void watch_fired(struct xenbus_watch *watch, if (!ret) ret = queue_reply(&staging_q, path, path_len); if (!ret) - ret = queue_reply(&staging_q, token, tok_len); - if (!ret && len > 2) - ret = queue_reply(&staging_q, vec[2], data_len); + ret = queue_reply(&staging_q, token_caller, tok_len); if (!ret) { /* success: pass reply list onto watcher */ @@ -302,6 +298,107 @@ static void watch_fired(struct xenbus_watch *watch, mutex_unlock(&adap->dev_data->reply_mutex); } +static void xenbus_file_free(struct kref *kref) +{ + struct xenbus_file_priv *u; + struct xenbus_transaction_holder *trans, *tmp; + struct watch_adapter *watch, *tmp_watch; + struct read_buffer *rb, *tmp_rb; + + u = container_of(kref, struct xenbus_file_priv, kref); + + /* + * No need for locking here because there are no other users, + * by definition. 
+ */ + + list_for_each_entry_safe(trans, tmp, &u->transactions, list) { + xenbus_transaction_end(trans->handle, 1); + list_del(&trans->list); + kfree(trans); + } + + list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) { + unregister_xenbus_watch(&watch->watch); + list_del(&watch->list); + free_watch_adapter(watch); + } + + list_for_each_entry_safe(rb, tmp_rb, &u->read_buffers, list) { + list_del(&rb->list); + kfree(rb); + } + kfree(u); +} + +static struct xenbus_transaction_holder *xenbus_get_transaction( + struct xenbus_file_priv *u, uint32_t tx_id) +{ + struct xenbus_transaction_holder *trans; + + list_for_each_entry(trans, &u->transactions, list) + if (trans->handle.id == tx_id) + return trans; + + return NULL; +} + +void xenbus_dev_queue_reply(struct xb_req_data *req) +{ + struct xenbus_file_priv *u = req->par; + struct xenbus_transaction_holder *trans = NULL; + int rc; + LIST_HEAD(staging_q); + + xs_request_exit(req); + + mutex_lock(&u->msgbuffer_mutex); + + if (req->type == XS_TRANSACTION_START) { + trans = xenbus_get_transaction(u, 0); + if (WARN_ON(!trans)) + goto out; + if (req->msg.type == XS_ERROR) { + list_del(&trans->list); + kfree(trans); + } else { + rc = kstrtou32(req->body, 10, &trans->handle.id); + if (WARN_ON(rc)) + goto out; + } + } else if (req->msg.type == XS_TRANSACTION_END) { + trans = xenbus_get_transaction(u, req->msg.tx_id); + if (WARN_ON(!trans)) + goto out; + list_del(&trans->list); + kfree(trans); + } + + mutex_unlock(&u->msgbuffer_mutex); + + mutex_lock(&u->reply_mutex); + rc = queue_reply(&staging_q, &req->msg, sizeof(req->msg)); + if (!rc) + rc = queue_reply(&staging_q, req->body, req->msg.len); + if (!rc) { + list_splice_tail(&staging_q, &u->read_buffers); + wake_up(&u->read_waitq); + } else { + queue_cleanup(&staging_q); + } + mutex_unlock(&u->reply_mutex); + + kfree(req->body); + kfree(req); + + kref_put(&u->kref, xenbus_file_free); + + return; + + out: + mutex_unlock(&u->msgbuffer_mutex); +} + static int xenbus_command_reply(struct xenbus_file_priv *u, unsigned int msg_type, const char *reply) { @@ -322,6 +419,9 @@ static int xenbus_command_reply(struct xenbus_file_priv *u, wake_up(&u->read_waitq); mutex_unlock(&u->reply_mutex); + if (!rc) + kref_put(&u->kref, xenbus_file_free); + return rc; } @@ -329,57 +429,22 @@ static int xenbus_write_transaction(unsigned msg_type, struct xenbus_file_priv *u) { int rc; - void *reply; struct xenbus_transaction_holder *trans = NULL; - LIST_HEAD(staging_q); if (msg_type == XS_TRANSACTION_START) { - trans = kmalloc(sizeof(*trans), GFP_KERNEL); + trans = kzalloc(sizeof(*trans), GFP_KERNEL); if (!trans) { rc = -ENOMEM; goto out; } - } else if (u->u.msg.tx_id != 0) { - list_for_each_entry(trans, &u->transactions, list) - if (trans->handle.id == u->u.msg.tx_id) - break; - if (&trans->list == &u->transactions) - return xenbus_command_reply(u, XS_ERROR, "ENOENT"); - } - - reply = xenbus_dev_request_and_reply(&u->u.msg); - if (IS_ERR(reply)) { - if (msg_type == XS_TRANSACTION_START) - kfree(trans); - rc = PTR_ERR(reply); - goto out; - } + list_add(&trans->list, &u->transactions); + } else if (u->u.msg.tx_id != 0 && + !xenbus_get_transaction(u, u->u.msg.tx_id)) + return xenbus_command_reply(u, XS_ERROR, "ENOENT"); - if (msg_type == XS_TRANSACTION_START) { - if (u->u.msg.type == XS_ERROR) - kfree(trans); - else { - trans->handle.id = simple_strtoul(reply, NULL, 0); - list_add(&trans->list, &u->transactions); - } - } else if (u->u.msg.type == XS_TRANSACTION_END) { - list_del(&trans->list); + rc = 
xenbus_dev_request_and_reply(&u->u.msg, u); + if (rc) kfree(trans); - } - - mutex_lock(&u->reply_mutex); - rc = queue_reply(&staging_q, &u->u.msg, sizeof(u->u.msg)); - if (!rc) - rc = queue_reply(&staging_q, reply, u->u.msg.len); - if (!rc) { - list_splice_tail(&staging_q, &u->read_buffers); - wake_up(&u->read_waitq); - } else { - queue_cleanup(&staging_q); - } - mutex_unlock(&u->reply_mutex); - - kfree(reply); out: return rc; @@ -511,6 +576,8 @@ static ssize_t xenbus_file_write(struct file *filp, * OK, now we have a complete message. Do something with it. */ + kref_get(&u->kref); + msg_type = u->u.msg.type; switch (msg_type) { @@ -525,8 +592,10 @@ static ssize_t xenbus_file_write(struct file *filp, ret = xenbus_write_transaction(msg_type, u); break; } - if (ret != 0) + if (ret != 0) { rc = ret; + kref_put(&u->kref, xenbus_file_free); + } /* Buffered message consumed */ u->len = 0; @@ -551,6 +620,8 @@ static int xenbus_file_open(struct inode *inode, struct file *filp) if (u == NULL) return -ENOMEM; + kref_init(&u->kref); + INIT_LIST_HEAD(&u->transactions); INIT_LIST_HEAD(&u->watches); INIT_LIST_HEAD(&u->read_buffers); @@ -567,32 +638,8 @@ static int xenbus_file_open(struct inode *inode, struct file *filp) static int xenbus_file_release(struct inode *inode, struct file *filp) { struct xenbus_file_priv *u = filp->private_data; - struct xenbus_transaction_holder *trans, *tmp; - struct watch_adapter *watch, *tmp_watch; - struct read_buffer *rb, *tmp_rb; - - /* - * No need for locking here because there are no other users, - * by definition. - */ - - list_for_each_entry_safe(trans, tmp, &u->transactions, list) { - xenbus_transaction_end(trans->handle, 1); - list_del(&trans->list); - kfree(trans); - } - - list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) { - unregister_xenbus_watch(&watch->watch); - list_del(&watch->list); - free_watch_adapter(watch); - } - list_for_each_entry_safe(rb, tmp_rb, &u->read_buffers, list) { - list_del(&rb->list); - kfree(rb); - } - kfree(u); + kref_put(&u->kref, xenbus_file_free); return 0; } diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 4bdf654041e9..74888cacd0b0 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -62,8 +62,7 @@ #include <xen/hvm.h> -#include "xenbus_comms.h" -#include "xenbus_probe.h" +#include "xenbus.h" int xen_store_evtchn; @@ -170,7 +169,7 @@ int xenbus_read_otherend_details(struct xenbus_device *xendev, EXPORT_SYMBOL_GPL(xenbus_read_otherend_details); void xenbus_otherend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len, + const char *path, const char *token, int ignore_on_shutdown) { struct xenbus_device *dev = @@ -181,18 +180,15 @@ void xenbus_otherend_changed(struct xenbus_watch *watch, /* Protect us against watches firing on old details when the otherend details change, say immediately after a resume. */ if (!dev->otherend || - strncmp(dev->otherend, vec[XS_WATCH_PATH], - strlen(dev->otherend))) { - dev_dbg(&dev->dev, "Ignoring watch at %s\n", - vec[XS_WATCH_PATH]); + strncmp(dev->otherend, path, strlen(dev->otherend))) { + dev_dbg(&dev->dev, "Ignoring watch at %s\n", path); return; } state = xenbus_read_driver_state(dev->otherend); dev_dbg(&dev->dev, "state is %d, (%s), %s, %s\n", - state, xenbus_strstate(state), dev->otherend_watch.node, - vec[XS_WATCH_PATH]); + state, xenbus_strstate(state), dev->otherend_watch.node, path); /* * Ignore xenbus transitions during shutdown. 
This prevents us doing diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h deleted file mode 100644 index c9ec7ca1f7ab..000000000000 --- a/drivers/xen/xenbus/xenbus_probe.h +++ /dev/null @@ -1,88 +0,0 @@ -/****************************************************************************** - * xenbus_probe.h - * - * Talks to Xen Store to figure out what devices we have. - * - * Copyright (C) 2005 Rusty Russell, IBM Corporation - * Copyright (C) 2005 XenSource Ltd. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation; or, when distributed - * separately from the Linux kernel or incorporated into other - * software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#ifndef _XENBUS_PROBE_H -#define _XENBUS_PROBE_H - -#define XEN_BUS_ID_SIZE 20 - -struct xen_bus_type { - char *root; - unsigned int levels; - int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename); - int (*probe)(struct xen_bus_type *bus, const char *type, - const char *dir); - void (*otherend_changed)(struct xenbus_watch *watch, const char **vec, - unsigned int len); - struct bus_type bus; -}; - -enum xenstore_init { - XS_UNKNOWN, - XS_PV, - XS_HVM, - XS_LOCAL, -}; - -extern const struct attribute_group *xenbus_dev_groups[]; - -extern int xenbus_match(struct device *_dev, struct device_driver *_drv); -extern int xenbus_dev_probe(struct device *_dev); -extern int xenbus_dev_remove(struct device *_dev); -extern int xenbus_register_driver_common(struct xenbus_driver *drv, - struct xen_bus_type *bus, - struct module *owner, - const char *mod_name); -extern int xenbus_probe_node(struct xen_bus_type *bus, - const char *type, - const char *nodename); -extern int xenbus_probe_devices(struct xen_bus_type *bus); - -extern void xenbus_dev_changed(const char *node, struct xen_bus_type *bus); - -extern void xenbus_dev_shutdown(struct device *_dev); - -extern int xenbus_dev_suspend(struct device *dev); -extern int xenbus_dev_resume(struct device *dev); -extern int xenbus_dev_cancel(struct device *dev); - -extern void xenbus_otherend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len, - int ignore_on_shutdown); - -extern int xenbus_read_otherend_details(struct xenbus_device *xendev, - char *id_node, char *path_node); - -void xenbus_ring_ops_init(void); - -#endif diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c index 37929df829a3..b0bed4faf44c 100644 --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -53,8 +53,7 @@ #include <xen/xenbus.h> #include <xen/features.h> -#include "xenbus_comms.h" -#include "xenbus_probe.h" +#include "xenbus.h" /* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */ static int backend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) @@ -182,9 +181,9 @@ static int xenbus_probe_backend(struct xen_bus_type *bus, const char *type, } static void frontend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) + const char *path, const char *token) { - xenbus_otherend_changed(watch, vec, len, 0); + xenbus_otherend_changed(watch, path, token, 0); } static struct xen_bus_type xenbus_backend = { @@ -205,11 +204,11 @@ static struct xen_bus_type xenbus_backend = { }; static void backend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) + const char *path, const char *token) { DPRINTK(""); - xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_backend); + xenbus_dev_changed(path, &xenbus_backend); } static struct xenbus_watch be_watch = { diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c index 6d40a972ffb2..19e45ce21f89 100644 --- a/drivers/xen/xenbus/xenbus_probe_frontend.c +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -27,8 +27,7 @@ #include <xen/platform_pci.h> -#include "xenbus_comms.h" -#include "xenbus_probe.h" +#include "xenbus.h" @@ -87,9 +86,9 @@ static int xenbus_uevent_frontend(struct device *_dev, static void backend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) + const char *path, const char *token) { - xenbus_otherend_changed(watch, vec, len, 1); + xenbus_otherend_changed(watch, path, token, 1); } 
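As the backend_changed()/frontend_changed() conversions above illustrate, watch callbacks now receive the fired path and the watch token directly rather than the old string vector plus length. A hedged sketch of a driver-side watch against the new prototype; the node name and the printout are illustrative only:

#include <linux/printk.h>
#include <xen/xenbus.h>

static void demo_watch_cb(struct xenbus_watch *watch,
                          const char *path, const char *token)
{
    /* path is the node that fired; token identifies the watch. */
    pr_info("xenbus watch fired: %s (token %s)\n", path, token);
}

static struct xenbus_watch demo_watch = {
    .node = "device/demo",                  /* illustrative node */
    .callback = demo_watch_cb,
};

/* register_xenbus_watch(&demo_watch) arms it; unregister_xenbus_watch()
 * additionally drops any events still queued for xenwatch_thread(). */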
static void xenbus_frontend_delayed_resume(struct work_struct *w) @@ -154,11 +153,11 @@ static struct xen_bus_type xenbus_frontend = { }; static void frontend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) + const char *path, const char *token) { DPRINTK(""); - xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend); + xenbus_dev_changed(path, &xenbus_frontend); } @@ -333,13 +332,13 @@ static DECLARE_WAIT_QUEUE_HEAD(backend_state_wq); static int backend_state; static void xenbus_reset_backend_state_changed(struct xenbus_watch *w, - const char **v, unsigned int l) + const char *path, const char *token) { - if (xenbus_scanf(XBT_NIL, v[XS_WATCH_PATH], "", "%i", + if (xenbus_scanf(XBT_NIL, path, "", "%i", &backend_state) != 1) backend_state = XenbusStateUnknown; printk(KERN_DEBUG "XENBUS: backend %s %s\n", - v[XS_WATCH_PATH], xenbus_strstate(backend_state)); + path, xenbus_strstate(backend_state)); wake_up(&backend_state_wq); } diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 6afb993c5809..e46080214955 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -43,69 +43,36 @@ #include <linux/slab.h> #include <linux/fcntl.h> #include <linux/kthread.h> +#include <linux/reboot.h> #include <linux/rwsem.h> #include <linux/mutex.h> #include <asm/xen/hypervisor.h> #include <xen/xenbus.h> #include <xen/xen.h> -#include "xenbus_comms.h" -#include "xenbus_probe.h" - -struct xs_stored_msg { - struct list_head list; - - struct xsd_sockmsg hdr; - - union { - /* Queued replies. */ - struct { - char *body; - } reply; - - /* Queued watch events. */ - struct { - struct xenbus_watch *handle; - char **vec; - unsigned int vec_size; - } watch; - } u; -}; +#include "xenbus.h" -struct xs_handle { - /* A list of replies. Currently only one will ever be outstanding. */ - struct list_head reply_list; - spinlock_t reply_lock; - wait_queue_head_t reply_waitq; - - /* - * Mutex ordering: transaction_mutex -> watch_mutex -> request_mutex. - * response_mutex is never taken simultaneously with the other three. - * - * transaction_mutex must be held before incrementing - * transaction_count. The mutex is held when a suspend is in - * progress to prevent new transactions starting. - * - * When decrementing transaction_count to zero the wait queue - * should be woken up, the suspend code waits for count to - * reach zero. - */ - - /* One request at a time. */ - struct mutex request_mutex; - - /* Protect xenbus reader thread against save/restore. */ - struct mutex response_mutex; - - /* Protect transactions against save/restore. */ - struct mutex transaction_mutex; - atomic_t transaction_count; - wait_queue_head_t transaction_wq; - - /* Protect watch (de)register against save/restore. */ - struct rw_semaphore watch_mutex; -}; +/* + * Framework to protect suspend/resume handling against normal Xenstore + * message handling: + * During suspend/resume there must be no open transaction and no pending + * Xenstore request. + * New watch events happening in this time can be ignored by firing all watches + * after resume. + */ + +/* Lock protecting enter/exit critical region. */ +static DEFINE_SPINLOCK(xs_state_lock); +/* Number of users in critical region (protected by xs_state_lock). */ +static unsigned int xs_state_users; +/* Suspend handler waiting or already active (protected by xs_state_lock)? */ +static int xs_suspend_active; +/* Unique Xenstore request id (protected by xs_state_lock). 
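The comment block above describes the replacement for the old request/response/transaction mutexes: a single counting gate. Requests raise xs_state_users while in flight (an open transaction holds an extra count), and suspend raises xs_suspend_active and then waits for the user count to drain. A stand-alone model of the same scheme, with pthread primitives standing in for the kernel spinlock and wait queues; all names here are invented:

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t enter_cv = PTHREAD_COND_INITIALIZER;  /* xs_state_enter_wq */
static pthread_cond_t exit_cv  = PTHREAD_COND_INITIALIZER;  /* xs_state_exit_wq */
static unsigned users;          /* xs_state_users */
static int suspend_active;      /* xs_suspend_active */

static void request_enter(void)
{
    pthread_mutex_lock(&lock);
    while (!users && suspend_active)    /* new work waits while suspending */
        pthread_cond_wait(&enter_cv, &lock);
    users++;
    pthread_mutex_unlock(&lock);
}

static void request_exit(void)
{
    pthread_mutex_lock(&lock);
    if (--users == 0 && suspend_active)
        pthread_cond_signal(&exit_cv);  /* let the suspender proceed */
    pthread_mutex_unlock(&lock);
}

static void suspend_enter(void)
{
    pthread_mutex_lock(&lock);
    suspend_active++;
    while (users)                       /* drain in-flight requests */
        pthread_cond_wait(&exit_cv, &lock);
    pthread_mutex_unlock(&lock);
}

static void suspend_exit(void)
{
    pthread_mutex_lock(&lock);
    suspend_active--;
    pthread_cond_broadcast(&enter_cv);
    pthread_mutex_unlock(&lock);
}

Note the entry condition: work is only blocked while the region is already empty, so a request that is part of an in-flight sequence (ending an open transaction, say) can still get through while a suspend is pending.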
*/ +static uint32_t xs_request_id; -static struct xs_handle xs_state; +/* Wait queue for all callers waiting for critical region to become usable. */ +static DECLARE_WAIT_QUEUE_HEAD(xs_state_enter_wq); +/* Wait queue for suspend handling waiting for critical region being empty. */ +static DECLARE_WAIT_QUEUE_HEAD(xs_state_exit_wq); /* List of registered watches, and a lock to protect it. */ static LIST_HEAD(watches); @@ -115,6 +82,9 @@ static DEFINE_SPINLOCK(watches_lock); static LIST_HEAD(watch_events); static DEFINE_SPINLOCK(watch_events_lock); +/* Protect watch (de)register against save/restore. */ +static DECLARE_RWSEM(xs_watch_rwsem); + /* * Details of the xenwatch callback kernel thread. The thread waits on the * watch_events_waitq for work to do (queued on watch_events list). When it @@ -125,6 +95,59 @@ static pid_t xenwatch_pid; static DEFINE_MUTEX(xenwatch_mutex); static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq); +static void xs_suspend_enter(void) +{ + spin_lock(&xs_state_lock); + xs_suspend_active++; + spin_unlock(&xs_state_lock); + wait_event(xs_state_exit_wq, xs_state_users == 0); +} + +static void xs_suspend_exit(void) +{ + spin_lock(&xs_state_lock); + xs_suspend_active--; + spin_unlock(&xs_state_lock); + wake_up_all(&xs_state_enter_wq); +} + +static uint32_t xs_request_enter(struct xb_req_data *req) +{ + uint32_t rq_id; + + req->type = req->msg.type; + + spin_lock(&xs_state_lock); + + while (!xs_state_users && xs_suspend_active) { + spin_unlock(&xs_state_lock); + wait_event(xs_state_enter_wq, xs_suspend_active == 0); + spin_lock(&xs_state_lock); + } + + if (req->type == XS_TRANSACTION_START) + xs_state_users++; + xs_state_users++; + rq_id = xs_request_id++; + + spin_unlock(&xs_state_lock); + + return rq_id; +} + +void xs_request_exit(struct xb_req_data *req) +{ + spin_lock(&xs_state_lock); + xs_state_users--; + if ((req->type == XS_TRANSACTION_START && req->msg.type == XS_ERROR) || + req->type == XS_TRANSACTION_END) + xs_state_users--; + spin_unlock(&xs_state_lock); + + if (xs_suspend_active && !xs_state_users) + wake_up(&xs_state_exit_wq); +} + static int get_error(const char *errorstring) { unsigned int i; @@ -162,21 +185,24 @@ static bool xenbus_ok(void) } return false; } -static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len) + +static bool test_reply(struct xb_req_data *req) { - struct xs_stored_msg *msg; - char *body; + if (req->state == xb_req_state_got_reply || !xenbus_ok()) + return true; + + /* Make sure to reread req->state each time. */ + barrier(); - spin_lock(&xs_state.reply_lock); + return false; +} + +static void *read_reply(struct xb_req_data *req) +{ + while (req->state != xb_req_state_got_reply) { + wait_event(req->wq, test_reply(req)); - while (list_empty(&xs_state.reply_list)) { - spin_unlock(&xs_state.reply_lock); - if (xenbus_ok()) - /* XXX FIXME: Avoid synchronous wait for response here. */ - wait_event_timeout(xs_state.reply_waitq, - !list_empty(&xs_state.reply_list), - msecs_to_jiffies(500)); - else { + if (!xenbus_ok()) /* * If we are in the process of being shut-down there is * no point of trying to contact XenBus - it is either @@ -184,76 +210,82 @@ static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len) * has been killed or is unreachable. 
*/ return ERR_PTR(-EIO); - } - spin_lock(&xs_state.reply_lock); + if (req->err) + return ERR_PTR(req->err); + } - msg = list_entry(xs_state.reply_list.next, - struct xs_stored_msg, list); - list_del(&msg->list); + return req->body; +} - spin_unlock(&xs_state.reply_lock); +static void xs_send(struct xb_req_data *req, struct xsd_sockmsg *msg) +{ + bool notify; - *type = msg->hdr.type; - if (len) - *len = msg->hdr.len; - body = msg->u.reply.body; + req->msg = *msg; + req->err = 0; + req->state = xb_req_state_queued; + init_waitqueue_head(&req->wq); - kfree(msg); + req->msg.req_id = xs_request_enter(req); - return body; -} + mutex_lock(&xb_write_mutex); + list_add_tail(&req->list, &xb_write_list); + notify = list_is_singular(&xb_write_list); + mutex_unlock(&xb_write_mutex); -static void transaction_start(void) -{ - mutex_lock(&xs_state.transaction_mutex); - atomic_inc(&xs_state.transaction_count); - mutex_unlock(&xs_state.transaction_mutex); + if (notify) + wake_up(&xb_waitq); } -static void transaction_end(void) +static void *xs_wait_for_reply(struct xb_req_data *req, struct xsd_sockmsg *msg) { - if (atomic_dec_and_test(&xs_state.transaction_count)) - wake_up(&xs_state.transaction_wq); -} + void *ret; -static void transaction_suspend(void) -{ - mutex_lock(&xs_state.transaction_mutex); - wait_event(xs_state.transaction_wq, - atomic_read(&xs_state.transaction_count) == 0); + ret = read_reply(req); + + xs_request_exit(req); + + msg->type = req->msg.type; + msg->len = req->msg.len; + + mutex_lock(&xb_write_mutex); + if (req->state == xb_req_state_queued || + req->state == xb_req_state_wait_reply) + req->state = xb_req_state_aborted; + else + kfree(req); + mutex_unlock(&xb_write_mutex); + + return ret; } -static void transaction_resume(void) +static void xs_wake_up(struct xb_req_data *req) { - mutex_unlock(&xs_state.transaction_mutex); + wake_up(&req->wq); } -void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) +int xenbus_dev_request_and_reply(struct xsd_sockmsg *msg, void *par) { - void *ret; - enum xsd_sockmsg_type type = msg->type; - int err; - - if (type == XS_TRANSACTION_START) - transaction_start(); + struct xb_req_data *req; + struct kvec *vec; - mutex_lock(&xs_state.request_mutex); + req = kmalloc(sizeof(*req) + sizeof(*vec), GFP_KERNEL); + if (!req) + return -ENOMEM; - err = xb_write(msg, sizeof(*msg) + msg->len); - if (err) { - msg->type = XS_ERROR; - ret = ERR_PTR(err); - } else - ret = read_reply(&msg->type, &msg->len); + vec = (struct kvec *)(req + 1); + vec->iov_len = msg->len; + vec->iov_base = msg + 1; - mutex_unlock(&xs_state.request_mutex); + req->vec = vec; + req->num_vecs = 1; + req->cb = xenbus_dev_queue_reply; + req->par = par; - if ((msg->type == XS_TRANSACTION_END) || - ((type == XS_TRANSACTION_START) && (msg->type == XS_ERROR))) - transaction_end(); + xs_send(req, msg); - return ret; + return 0; } EXPORT_SYMBOL(xenbus_dev_request_and_reply); @@ -264,37 +296,31 @@ static void *xs_talkv(struct xenbus_transaction t, unsigned int num_vecs, unsigned int *len) { + struct xb_req_data *req; struct xsd_sockmsg msg; void *ret = NULL; unsigned int i; int err; + req = kmalloc(sizeof(*req), GFP_NOIO | __GFP_HIGH); + if (!req) + return ERR_PTR(-ENOMEM); + + req->vec = iovec; + req->num_vecs = num_vecs; + req->cb = xs_wake_up; + msg.tx_id = t.id; - msg.req_id = 0; msg.type = type; msg.len = 0; for (i = 0; i < num_vecs; i++) msg.len += iovec[i].iov_len; - mutex_lock(&xs_state.request_mutex); - - err = xb_write(&msg, sizeof(msg)); - if (err) { - 
mutex_unlock(&xs_state.request_mutex); - return ERR_PTR(err); - } - - for (i = 0; i < num_vecs; i++) { - err = xb_write(iovec[i].iov_base, iovec[i].iov_len); - if (err) { - mutex_unlock(&xs_state.request_mutex); - return ERR_PTR(err); - } - } - - ret = read_reply(&msg.type, len); + xs_send(req, &msg); - mutex_unlock(&xs_state.request_mutex); + ret = xs_wait_for_reply(req, &msg); + if (len) + *len = msg.len; if (IS_ERR(ret)) return ret; @@ -501,13 +527,9 @@ int xenbus_transaction_start(struct xenbus_transaction *t) { char *id_str; - transaction_start(); - id_str = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL); - if (IS_ERR(id_str)) { - transaction_end(); + if (IS_ERR(id_str)) return PTR_ERR(id_str); - } t->id = simple_strtoul(id_str, NULL, 0); kfree(id_str); @@ -521,18 +543,13 @@ EXPORT_SYMBOL_GPL(xenbus_transaction_start); int xenbus_transaction_end(struct xenbus_transaction t, int abort) { char abortstr[2]; - int err; if (abort) strcpy(abortstr, "F"); else strcpy(abortstr, "T"); - err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL)); - - transaction_end(); - - return err; + return xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL)); } EXPORT_SYMBOL_GPL(xenbus_transaction_end); @@ -665,6 +682,30 @@ static struct xenbus_watch *find_watch(const char *token) return NULL; } + +int xs_watch_msg(struct xs_watch_event *event) +{ + if (count_strings(event->body, event->len) != 2) { + kfree(event); + return -EINVAL; + } + event->path = (const char *)event->body; + event->token = (const char *)strchr(event->body, '\0') + 1; + + spin_lock(&watches_lock); + event->handle = find_watch(event->token); + if (event->handle != NULL) { + spin_lock(&watch_events_lock); + list_add_tail(&event->list, &watch_events); + wake_up(&watch_events_waitq); + spin_unlock(&watch_events_lock); + } else + kfree(event); + spin_unlock(&watches_lock); + + return 0; +} + /* * Certain older XenBus toolstack cannot handle reading values that are * not populated. Some Xen 3.4 installation are incapable of doing this @@ -713,7 +754,7 @@ int register_xenbus_watch(struct xenbus_watch *watch) sprintf(token, "%lX", (long)watch); - down_read(&xs_state.watch_mutex); + down_read(&xs_watch_rwsem); spin_lock(&watches_lock); BUG_ON(find_watch(token)); @@ -728,7 +769,7 @@ int register_xenbus_watch(struct xenbus_watch *watch) spin_unlock(&watches_lock); } - up_read(&xs_state.watch_mutex); + up_read(&xs_watch_rwsem); return err; } @@ -736,13 +777,13 @@ EXPORT_SYMBOL_GPL(register_xenbus_watch); void unregister_xenbus_watch(struct xenbus_watch *watch) { - struct xs_stored_msg *msg, *tmp; + struct xs_watch_event *event, *tmp; char token[sizeof(watch) * 2 + 1]; int err; sprintf(token, "%lX", (long)watch); - down_read(&xs_state.watch_mutex); + down_read(&xs_watch_rwsem); spin_lock(&watches_lock); BUG_ON(!find_watch(token)); @@ -753,7 +794,7 @@ void unregister_xenbus_watch(struct xenbus_watch *watch) if (err) pr_warn("Failed to release watch %s: %i\n", watch->node, err); - up_read(&xs_state.watch_mutex); + up_read(&xs_watch_rwsem); /* Make sure there are no callbacks running currently (unless its us) */ @@ -762,12 +803,11 @@ void unregister_xenbus_watch(struct xenbus_watch *watch) /* Cancel pending watch events. 
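xs_watch_msg() above depends on an XS_WATCH_EVENT payload being exactly two NUL-terminated strings packed back to back, "path\0token\0"; that is what the count_strings() check enforces. The same split as a small stand-alone helper (helper and struct names invented):

#include <stddef.h>
#include <string.h>

struct watch_event_view { const char *path, *token; };

/* Returns 0 on success, -1 if body does not hold exactly two strings. */
static int parse_watch_body(const char *body, size_t len,
                            struct watch_event_view *ev)
{
    size_t strings = 0;

    for (size_t i = 0; i < len; i++)
        if (body[i] == '\0')
            strings++;
    if (strings != 2)
        return -1;

    ev->path = body;                        /* first string */
    ev->token = strchr(body, '\0') + 1;     /* follows the first NUL */
    return 0;
}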
*/ spin_lock(&watch_events_lock); - list_for_each_entry_safe(msg, tmp, &watch_events, list) { - if (msg->u.watch.handle != watch) + list_for_each_entry_safe(event, tmp, &watch_events, list) { + if (event->handle != watch) continue; - list_del(&msg->list); - kfree(msg->u.watch.vec); - kfree(msg); + list_del(&event->list); + kfree(event); } spin_unlock(&watch_events_lock); @@ -778,10 +818,10 @@ EXPORT_SYMBOL_GPL(unregister_xenbus_watch); void xs_suspend(void) { - transaction_suspend(); - down_write(&xs_state.watch_mutex); - mutex_lock(&xs_state.request_mutex); - mutex_lock(&xs_state.response_mutex); + xs_suspend_enter(); + + down_write(&xs_watch_rwsem); + mutex_lock(&xs_response_mutex); } void xs_resume(void) @@ -791,31 +831,31 @@ void xs_resume(void) xb_init_comms(); - mutex_unlock(&xs_state.response_mutex); - mutex_unlock(&xs_state.request_mutex); - transaction_resume(); + mutex_unlock(&xs_response_mutex); + + xs_suspend_exit(); - /* No need for watches_lock: the watch_mutex is sufficient. */ + /* No need for watches_lock: the xs_watch_rwsem is sufficient. */ list_for_each_entry(watch, &watches, list) { sprintf(token, "%lX", (long)watch); xs_watch(watch->node, token); } - up_write(&xs_state.watch_mutex); + up_write(&xs_watch_rwsem); } void xs_suspend_cancel(void) { - mutex_unlock(&xs_state.response_mutex); - mutex_unlock(&xs_state.request_mutex); - up_write(&xs_state.watch_mutex); - mutex_unlock(&xs_state.transaction_mutex); + mutex_unlock(&xs_response_mutex); + up_write(&xs_watch_rwsem); + + xs_suspend_exit(); } static int xenwatch_thread(void *unused) { struct list_head *ent; - struct xs_stored_msg *msg; + struct xs_watch_event *event; for (;;) { wait_event_interruptible(watch_events_waitq, @@ -833,13 +873,10 @@ static int xenwatch_thread(void *unused) spin_unlock(&watch_events_lock); if (ent != &watch_events) { - msg = list_entry(ent, struct xs_stored_msg, list); - msg->u.watch.handle->callback( - msg->u.watch.handle, - (const char **)msg->u.watch.vec, - msg->u.watch.vec_size); - kfree(msg->u.watch.vec); - kfree(msg); + event = list_entry(ent, struct xs_watch_event, list); + event->handle->callback(event->handle, event->path, + event->token); + kfree(event); } mutex_unlock(&xenwatch_mutex); @@ -848,126 +885,37 @@ static int xenwatch_thread(void *unused) return 0; } -static int process_msg(void) +/* + * Wake up all threads waiting for a xenstore reply. In case of shutdown all + * pending replies will be marked as "aborted" in order to let the waiters + * return in spite of xenstore possibly no longer being able to reply. This + * will avoid blocking shutdown by a thread waiting for xenstore but being + * necessary for shutdown processing to proceed. + */ +static int xs_reboot_notify(struct notifier_block *nb, + unsigned long code, void *unused) { - struct xs_stored_msg *msg; - char *body; - int err; - - /* - * We must disallow save/restore while reading a xenstore message. - * A partial read across s/r leaves us out of sync with xenstored. - */ - for (;;) { - err = xb_wait_for_data_to_read(); - if (err) - return err; - mutex_lock(&xs_state.response_mutex); - if (xb_data_to_read()) - break; - /* We raced with save/restore: pending data 'disappeared'. 
*/ - mutex_unlock(&xs_state.response_mutex); - } - - - msg = kmalloc(sizeof(*msg), GFP_NOIO | __GFP_HIGH); - if (msg == NULL) { - err = -ENOMEM; - goto out; - } - - err = xb_read(&msg->hdr, sizeof(msg->hdr)); - if (err) { - kfree(msg); - goto out; - } - - if (msg->hdr.len > XENSTORE_PAYLOAD_MAX) { - kfree(msg); - err = -EINVAL; - goto out; - } - - body = kmalloc(msg->hdr.len + 1, GFP_NOIO | __GFP_HIGH); - if (body == NULL) { - kfree(msg); - err = -ENOMEM; - goto out; - } - - err = xb_read(body, msg->hdr.len); - if (err) { - kfree(body); - kfree(msg); - goto out; - } - body[msg->hdr.len] = '\0'; - - if (msg->hdr.type == XS_WATCH_EVENT) { - msg->u.watch.vec = split(body, msg->hdr.len, - &msg->u.watch.vec_size); - if (IS_ERR(msg->u.watch.vec)) { - err = PTR_ERR(msg->u.watch.vec); - kfree(msg); - goto out; - } - - spin_lock(&watches_lock); - msg->u.watch.handle = find_watch( - msg->u.watch.vec[XS_WATCH_TOKEN]); - if (msg->u.watch.handle != NULL) { - spin_lock(&watch_events_lock); - list_add_tail(&msg->list, &watch_events); - wake_up(&watch_events_waitq); - spin_unlock(&watch_events_lock); - } else { - kfree(msg->u.watch.vec); - kfree(msg); - } - spin_unlock(&watches_lock); - } else { - msg->u.reply.body = body; - spin_lock(&xs_state.reply_lock); - list_add_tail(&msg->list, &xs_state.reply_list); - spin_unlock(&xs_state.reply_lock); - wake_up(&xs_state.reply_waitq); - } + struct xb_req_data *req; - out: - mutex_unlock(&xs_state.response_mutex); - return err; + mutex_lock(&xb_write_mutex); + list_for_each_entry(req, &xs_reply_list, list) + wake_up(&req->wq); + list_for_each_entry(req, &xb_write_list, list) + wake_up(&req->wq); + mutex_unlock(&xb_write_mutex); + return NOTIFY_DONE; } -static int xenbus_thread(void *unused) -{ - int err; - - for (;;) { - err = process_msg(); - if (err) - pr_warn("error %d while reading message\n", err); - if (kthread_should_stop()) - break; - } - - return 0; -} +static struct notifier_block xs_reboot_nb = { + .notifier_call = xs_reboot_notify, +}; int xs_init(void) { int err; struct task_struct *task; - INIT_LIST_HEAD(&xs_state.reply_list); - spin_lock_init(&xs_state.reply_lock); - init_waitqueue_head(&xs_state.reply_waitq); - - mutex_init(&xs_state.request_mutex); - mutex_init(&xs_state.response_mutex); - mutex_init(&xs_state.transaction_mutex); - init_rwsem(&xs_state.watch_mutex); - atomic_set(&xs_state.transaction_count, 0); - init_waitqueue_head(&xs_state.transaction_wq); + register_reboot_notifier(&xs_reboot_nb); /* Initialize the shared memory rings to talk to xenstored */ err = xb_init_comms(); @@ -979,10 +927,6 @@ int xs_init(void) return PTR_ERR(task); xenwatch_pid = task->pid; - task = kthread_run(xenbus_thread, NULL, "xenbus"); - if (IS_ERR(task)) - return PTR_ERR(task); - /* shutdown watches for kexec boot */ xs_reset_watches(); diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index 8559a71f36b1..328c3987b112 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -16,10 +16,10 @@ #include <linux/magic.h> #include <xen/xen.h> +#include <xen/xenbus.h> #include "xenfs.h" #include "../privcmd.h" -#include "../xenbus/xenbus_comms.h" #include <asm/xen/hypervisor.h> diff --git a/drivers/xen/xenfs/xenstored.c b/drivers/xen/xenfs/xenstored.c index fef20dbc6a5c..82fd2a396d96 100644 --- a/drivers/xen/xenfs/xenstored.c +++ b/drivers/xen/xenfs/xenstored.c @@ -4,9 +4,9 @@ #include <linux/fs.h> #include <xen/page.h> +#include <xen/xenbus.h> #include "xenfs.h" -#include "../xenbus/xenbus_comms.h" static ssize_t xsd_read(struct file 
*file, char __user *buf, size_t size, loff_t *off) diff --git a/include/uapi/xen/privcmd.h b/include/uapi/xen/privcmd.h index 7ddeeda93809..63ee95c9dabb 100644 --- a/include/uapi/xen/privcmd.h +++ b/include/uapi/xen/privcmd.h @@ -77,6 +77,17 @@ struct privcmd_mmapbatch_v2 { int __user *err; /* array of error codes */ }; +struct privcmd_dm_op_buf { + void __user *uptr; + size_t size; +}; + +struct privcmd_dm_op { + domid_t dom; + __u16 num; + const struct privcmd_dm_op_buf __user *ubufs; +}; + /* * @cmd: IOCTL_PRIVCMD_HYPERCALL * @arg: &privcmd_hypercall_t @@ -98,5 +109,9 @@ struct privcmd_mmapbatch_v2 { _IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch)) #define IOCTL_PRIVCMD_MMAPBATCH_V2 \ _IOC(_IOC_NONE, 'P', 4, sizeof(struct privcmd_mmapbatch_v2)) +#define IOCTL_PRIVCMD_DM_OP \ + _IOC(_IOC_NONE, 'P', 5, sizeof(struct privcmd_dm_op)) +#define IOCTL_PRIVCMD_RESTRICT \ + _IOC(_IOC_NONE, 'P', 6, sizeof(domid_t)) #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ diff --git a/include/xen/arm/hypercall.h b/include/xen/arm/hypercall.h index 9d874db13c0e..73db4b2eeb89 100644 --- a/include/xen/arm/hypercall.h +++ b/include/xen/arm/hypercall.h @@ -53,6 +53,7 @@ int HYPERVISOR_physdev_op(int cmd, void *arg); int HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args); int HYPERVISOR_tmem_op(void *arg); int HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type); +int HYPERVISOR_dm_op(domid_t domid, unsigned int nr_bufs, void *bufs); int HYPERVISOR_platform_op_raw(void *arg); static inline int HYPERVISOR_platform_op(struct xen_platform_op *op) { diff --git a/include/xen/interface/elfnote.h b/include/xen/interface/elfnote.h index f90b03454659..9e9f9bf7c66d 100644 --- a/include/xen/interface/elfnote.h +++ b/include/xen/interface/elfnote.h @@ -193,9 +193,19 @@ #define XEN_ELFNOTE_SUPPORTED_FEATURES 17 /* + * Physical entry point into the kernel. + * + * 32bit entry point into the kernel. When requested to launch the + * guest kernel in a HVM container, Xen will use this entry point to + * launch the guest in 32bit protected mode with paging disabled. + * Ignored otherwise. + */ +#define XEN_ELFNOTE_PHYS32_ENTRY 18 + +/* * The number of the highest elfnote defined. */ -#define XEN_ELFNOTE_MAX XEN_ELFNOTE_SUPPORTED_FEATURES +#define XEN_ELFNOTE_MAX XEN_ELFNOTE_PHYS32_ENTRY #endif /* __XEN_PUBLIC_ELFNOTE_H__ */ diff --git a/include/xen/interface/hvm/dm_op.h b/include/xen/interface/hvm/dm_op.h new file mode 100644 index 000000000000..ee9e480bc559 --- /dev/null +++ b/include/xen/interface/hvm/dm_op.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2016, Citrix Systems Inc + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef __XEN_PUBLIC_HVM_DM_OP_H__ +#define __XEN_PUBLIC_HVM_DM_OP_H__ + +struct xen_dm_op_buf { + GUEST_HANDLE(void) h; + xen_ulong_t size; +}; +DEFINE_GUEST_HANDLE_STRUCT(xen_dm_op_buf); + +#endif /* __XEN_PUBLIC_HVM_DM_OP_H__ */ diff --git a/include/xen/interface/hvm/hvm_vcpu.h b/include/xen/interface/hvm/hvm_vcpu.h new file mode 100644 index 000000000000..32ca83edd44d --- /dev/null +++ b/include/xen/interface/hvm/hvm_vcpu.h @@ -0,0 +1,143 @@ +/* + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2015, Roger Pau Monne <roger.pau@citrix.com> + */ + +#ifndef __XEN_PUBLIC_HVM_HVM_VCPU_H__ +#define __XEN_PUBLIC_HVM_HVM_VCPU_H__ + +#include "../xen.h" + +struct vcpu_hvm_x86_32 { + uint32_t eax; + uint32_t ecx; + uint32_t edx; + uint32_t ebx; + uint32_t esp; + uint32_t ebp; + uint32_t esi; + uint32_t edi; + uint32_t eip; + uint32_t eflags; + + uint32_t cr0; + uint32_t cr3; + uint32_t cr4; + + uint32_t pad1; + + /* + * EFER should only be used to set the NXE bit (if required) + * when starting a vCPU in 32bit mode with paging enabled or + * to set the LME/LMA bits in order to start the vCPU in + * compatibility mode. + */ + uint64_t efer; + + uint32_t cs_base; + uint32_t ds_base; + uint32_t ss_base; + uint32_t es_base; + uint32_t tr_base; + uint32_t cs_limit; + uint32_t ds_limit; + uint32_t ss_limit; + uint32_t es_limit; + uint32_t tr_limit; + uint16_t cs_ar; + uint16_t ds_ar; + uint16_t ss_ar; + uint16_t es_ar; + uint16_t tr_ar; + + uint16_t pad2[3]; +}; + +/* + * The layout of the _ar fields of the segment registers is the + * following: + * + * Bits [0,3]: type (bits 40-43). + * Bit 4: s (descriptor type, bit 44). + * Bit [5,6]: dpl (descriptor privilege level, bits 45-46). + * Bit 7: p (segment-present, bit 47). + * Bit 8: avl (available for system software, bit 52). + * Bit 9: l (64-bit code segment, bit 53). + * Bit 10: db (meaning depends on the segment, bit 54). + * Bit 11: g (granularity, bit 55) + * Bits [12,15]: unused, must be blank. + * + * A more complete description of the meaning of this fields can be + * obtained from the Intel SDM, Volume 3, section 3.4.5. 
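The _ar description above packs a descriptor's hidden access-rights bits into 16 bits. A hedged helper, not part of the header, showing how the fields compose for a flat ring-0 32-bit code segment:

#include <stdint.h>

#define SEG_AR_TYPE(x)  ((uint16_t)((x) & 0xf))    /* descriptor bits 40-43 */
#define SEG_AR_S        (1u << 4)                  /* descriptor type, bit 44 */
#define SEG_AR_DPL(x)   ((uint16_t)(((x) & 3) << 5))
#define SEG_AR_P        (1u << 7)                  /* present, bit 47 */
#define SEG_AR_AVL      (1u << 8)
#define SEG_AR_L        (1u << 9)                  /* 64-bit code, bit 53 */
#define SEG_AR_DB       (1u << 10)
#define SEG_AR_G        (1u << 11)                 /* granularity, bit 55 */

/* Flat ring-0 32-bit code segment: type 0xb (code, read/execute,
 * accessed), S=1, DPL=0, P=1, DB=1, G=1 -> 0xc9b. */
static const uint16_t cs_ar = SEG_AR_TYPE(0xb) | SEG_AR_S | SEG_AR_P |
                              SEG_AR_DB | SEG_AR_G;

The resulting 0xc9b is the conventional access-rights value for a flat protected-mode code segment.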
+ */ + +struct vcpu_hvm_x86_64 { + uint64_t rax; + uint64_t rcx; + uint64_t rdx; + uint64_t rbx; + uint64_t rsp; + uint64_t rbp; + uint64_t rsi; + uint64_t rdi; + uint64_t rip; + uint64_t rflags; + + uint64_t cr0; + uint64_t cr3; + uint64_t cr4; + uint64_t efer; + + /* + * Using VCPU_HVM_MODE_64B implies that the vCPU is launched + * directly in long mode, so the cached parts of the segment + * registers get set to match that environment. + * + * If the user wants to launch the vCPU in compatibility mode + * the 32-bit structure should be used instead. + */ +}; + +struct vcpu_hvm_context { +#define VCPU_HVM_MODE_32B 0 /* 32bit fields of the structure will be used. */ +#define VCPU_HVM_MODE_64B 1 /* 64bit fields of the structure will be used. */ + uint32_t mode; + + uint32_t pad; + + /* CPU registers. */ + union { + struct vcpu_hvm_x86_32 x86_32; + struct vcpu_hvm_x86_64 x86_64; + } cpu_regs; +}; +typedef struct vcpu_hvm_context vcpu_hvm_context_t; + +#endif /* __XEN_PUBLIC_HVM_HVM_VCPU_H__ */ + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/include/xen/interface/hvm/start_info.h b/include/xen/interface/hvm/start_info.h new file mode 100644 index 000000000000..648415976ead --- /dev/null +++ b/include/xen/interface/hvm/start_info.h @@ -0,0 +1,98 @@ +/* + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2016, Citrix Systems, Inc. + */ + +#ifndef __XEN_PUBLIC_ARCH_X86_HVM_START_INFO_H__ +#define __XEN_PUBLIC_ARCH_X86_HVM_START_INFO_H__ + +/* + * Start of day structure passed to PVH guests and to HVM guests in %ebx. + * + * NOTE: nothing will be loaded at physical address 0, so a 0 value in any + * of the address fields should be treated as not present. + * + * 0 +----------------+ + * | magic | Contains the magic value XEN_HVM_START_MAGIC_VALUE + * | | ("xEn3" with the 0x80 bit of the "E" set). + * 4 +----------------+ + * | version | Version of this structure. Current version is 0. New + * | | versions are guaranteed to be backwards-compatible. + * 8 +----------------+ + * | flags | SIF_xxx flags. + * 12 +----------------+ + * | nr_modules | Number of modules passed to the kernel. + * 16 +----------------+ + * | modlist_paddr | Physical address of an array of modules + * | | (layout of the structure below). + * 24 +----------------+ + * | cmdline_paddr | Physical address of the command line, + * | | a zero-terminated ASCII string. 
+ * 32 +----------------+ + * | rsdp_paddr | Physical address of the RSDP ACPI data structure. + * 40 +----------------+ + * + * The layout of each entry in the module structure is the following: + * + * 0 +----------------+ + * | paddr | Physical address of the module. + * 8 +----------------+ + * | size | Size of the module in bytes. + * 16 +----------------+ + * | cmdline_paddr | Physical address of the command line, + * | | a zero-terminated ASCII string. + * 24 +----------------+ + * | reserved | + * 32 +----------------+ + * + * The address and sizes are always a 64bit little endian unsigned integer. + * + * NB: Xen on x86 will always try to place all the data below the 4GiB + * boundary. + */ +#define XEN_HVM_START_MAGIC_VALUE 0x336ec578 + +/* + * C representation of the x86/HVM start info layout. + * + * The canonical definition of this layout is above, this is just a way to + * represent the layout described there using C types. + */ +struct hvm_start_info { + uint32_t magic; /* Contains the magic value 0x336ec578 */ + /* ("xEn3" with the 0x80 bit of the "E" set).*/ + uint32_t version; /* Version of this structure. */ + uint32_t flags; /* SIF_xxx flags. */ + uint32_t nr_modules; /* Number of modules passed to the kernel. */ + uint64_t modlist_paddr; /* Physical address of an array of */ + /* hvm_modlist_entry. */ + uint64_t cmdline_paddr; /* Physical address of the command line. */ + uint64_t rsdp_paddr; /* Physical address of the RSDP ACPI data */ + /* structure. */ +}; + +struct hvm_modlist_entry { + uint64_t paddr; /* Physical address of the module. */ + uint64_t size; /* Size of the module in bytes. */ + uint64_t cmdline_paddr; /* Physical address of the command line. */ + uint64_t reserved; +}; + +#endif /* __XEN_PUBLIC_ARCH_X86_HVM_START_INFO_H__ */ diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 1b0d189cd3d3..4f4830ef8f93 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -81,6 +81,7 @@ #define __HYPERVISOR_tmem_op 38 #define __HYPERVISOR_xc_reserved_op 39 /* reserved for XenClient */ #define __HYPERVISOR_xenpmu_op 40 +#define __HYPERVISOR_dm_op 41 /* Architecture-specific hypercall definitions. */ #define __HYPERVISOR_arch_0 48 diff --git a/include/xen/xen.h b/include/xen/xen.h index f0f0252cff9a..6e8b7fc79801 100644 --- a/include/xen/xen.h +++ b/include/xen/xen.h @@ -30,16 +30,10 @@ extern enum xen_domain_type xen_domain_type; #endif /* CONFIG_XEN_DOM0 */ #ifdef CONFIG_XEN_PVH -/* This functionality exists only for x86. The XEN_PVHVM support exists - * only in x86 world - hence on ARM it will be always disabled. - * N.B. ARM guests are neither PV nor HVM nor PVHVM. - * It's a bit like PVH but is different also (it's further towards the H - * end of the spectrum than even PVH). - */ -#include <xen/features.h> -#define xen_pvh_domain() (xen_pv_domain() && \ - xen_feature(XENFEAT_auto_translated_physmap)) +extern bool xen_pvh; +#define xen_pvh_domain() (xen_hvm_domain() && xen_pvh) #else #define xen_pvh_domain() (0) #endif + #endif /* _XEN_XEN_H */ diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index 271ba62503c7..869c816d5f8c 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -38,6 +38,7 @@ #include <linux/notifier.h> #include <linux/mutex.h> #include <linux/export.h> +#include <linux/fs.h> #include <linux/completion.h> #include <linux/init.h> #include <linux/slab.h> @@ -60,7 +61,7 @@ struct xenbus_watch /* Callback (executed in a process context with no locks held). 
*/ void (*callback)(struct xenbus_watch *, - const char **vec, unsigned int len); + const char *path, const char *token); }; @@ -175,16 +176,9 @@ void xs_suspend(void); void xs_resume(void); void xs_suspend_cancel(void); -/* Used by xenbus_dev to borrow kernel's store connection. */ -void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg); - struct work_struct; -/* Prepare for domain suspend: then resume or cancel the suspend. */ -void xenbus_suspend(void); -void xenbus_resume(void); void xenbus_probe(struct work_struct *); -void xenbus_suspend_cancel(void); #define XENBUS_IS_ERR_READ(str) ({ \ if (!IS_ERR(str) && strlen(str) == 0) { \ @@ -199,11 +193,11 @@ void xenbus_suspend_cancel(void); int xenbus_watch_path(struct xenbus_device *dev, const char *path, struct xenbus_watch *watch, void (*callback)(struct xenbus_watch *, - const char **, unsigned int)); + const char *, const char *)); __printf(4, 5) int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch, void (*callback)(struct xenbus_watch *, - const char **, unsigned int), + const char *, const char *), const char *pathfmt, ...); int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state new_state); @@ -235,4 +229,8 @@ const char *xenbus_strstate(enum xenbus_state state); int xenbus_dev_is_online(struct xenbus_device *dev); int xenbus_frontend_closed(struct xenbus_device *dev); +extern const struct file_operations xen_xenbus_fops; +extern struct xenstore_domain_interface *xen_store_interface; +extern int xen_store_evtchn; + #endif /* _XEN_XENBUS_H */
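Closing the loop on the privcmd additions: IOCTL_PRIVCMD_DM_OP hands a domain id plus an array of user buffers to the new dm_op hypercall. A hedged userspace sketch, assuming the uapi header is installed as <xen/privcmd.h> and the device node is /dev/xen/privcmd (typical, but distribution-dependent):

#include <stdint.h>
#include <stddef.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xen/privcmd.h>

/* Hand one opaque dm_op buffer to Xen for the given domain (domid_t is
 * a 16-bit id).  Returns the ioctl result, or -1 if privcmd cannot be
 * opened. */
static int issue_dm_op(uint16_t dom, void *op, size_t op_size)
{
    struct privcmd_dm_op_buf buf = { .uptr = op, .size = op_size };
    struct privcmd_dm_op dm_op = { .dom = dom, .num = 1, .ubufs = &buf };
    int fd, rc;

    fd = open("/dev/xen/privcmd", O_RDWR | O_CLOEXEC);
    if (fd < 0)
        return -1;

    rc = ioctl(fd, IOCTL_PRIVCMD_DM_OP, &dm_op);
    close(fd);
    return rc;
}

The companion IOCTL_PRIVCMD_RESTRICT, also added above, lets a device model limit the file descriptor to a single domain before issuing dm_ops through it.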