diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-11-04 17:32:42 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-11-04 17:32:42 -0800 |
commit | 41ecf1404b34d9975eb97f5005d9e4274eaeb76a (patch) | |
tree | 8582dec3a644cfbe178132acded5ebb458e0e32e | |
parent | 2dc10ad81fc017837037e60439662e1b16bdffb9 (diff) | |
parent | abed7d0710e8f892c267932a9492ccf447674fb8 (diff) | |
download | linux-41ecf1404b34d9975eb97f5005d9e4274eaeb76a.tar.bz2 |
Merge tag 'for-linus-4.4-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip
Pull xen updates from David Vrabel:
- Improve balloon driver memory hotplug placement.
- Use unpopulated hotplugged memory for foreign pages (if
supported/enabled).
- Support 64 KiB guest pages on arm64.
- CPU hotplug support on arm/arm64.
* tag 'for-linus-4.4-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip: (44 commits)
xen: fix the check of e_pfn in xen_find_pfn_range
x86/xen: add reschedule point when mapping foreign GFNs
xen/arm: don't try to re-register vcpu_info on cpu_hotplug.
xen, cpu_hotplug: call device_offline instead of cpu_down
xen/arm: Enable cpu_hotplug.c
xenbus: Support multiple grants ring with 64KB
xen/grant-table: Add an helper to iterate over a specific number of grants
xen/xenbus: Rename *RING_PAGE* to *RING_GRANT*
xen/arm: correct comment in enlighten.c
xen/gntdev: use types from linux/types.h in userspace headers
xen/gntalloc: use types from linux/types.h in userspace headers
xen/balloon: Use the correct sizeof when declaring frame_list
xen/swiotlb: Add support for 64KB page granularity
xen/swiotlb: Pass addresses rather than frame numbers to xen_arch_need_swiotlb
arm/xen: Add support for 64KB page granularity
xen/privcmd: Add support for Linux 64KB page granularity
net/xen-netback: Make it running on 64KB page granularity
net/xen-netfront: Make it running on 64KB page granularity
block/xen-blkback: Make it running on 64KB page granularity
block/xen-blkfront: Make it running on 64KB page granularity
...
41 files changed, 1365 insertions, 647 deletions
diff --git a/arch/arm/include/asm/xen/hypervisor.h b/arch/arm/include/asm/xen/hypervisor.h index 04ff8e7b37df..95251512e2c4 100644 --- a/arch/arm/include/asm/xen/hypervisor.h +++ b/arch/arm/include/asm/xen/hypervisor.h @@ -26,4 +26,14 @@ void __init xen_early_init(void); static inline void xen_early_init(void) { return; } #endif +#ifdef CONFIG_HOTPLUG_CPU +static inline void xen_arch_register_cpu(int num) +{ +} + +static inline void xen_arch_unregister_cpu(int num) +{ +} +#endif + #endif /* _ASM_ARM_XEN_HYPERVISOR_H */ diff --git a/arch/arm/include/asm/xen/page-coherent.h b/arch/arm/include/asm/xen/page-coherent.h index efd562412850..0375c8caa061 100644 --- a/arch/arm/include/asm/xen/page-coherent.h +++ b/arch/arm/include/asm/xen/page-coherent.h @@ -35,11 +35,15 @@ static inline void xen_dma_map_page(struct device *hwdev, struct page *page, dma_addr_t dev_addr, unsigned long offset, size_t size, enum dma_data_direction dir, struct dma_attrs *attrs) { - bool local = PFN_DOWN(dev_addr) == page_to_pfn(page); - /* Dom0 is mapped 1:1, so if pfn == mfn the page is local otherwise - * is a foreign page grant-mapped in dom0. If the page is local we - * can safely call the native dma_ops function, otherwise we call - * the xen specific function. */ + bool local = XEN_PFN_DOWN(dev_addr) == page_to_xen_pfn(page); + /* + * Dom0 is mapped 1:1, while the Linux page can be spanned accross + * multiple Xen page, it's not possible to have a mix of local and + * foreign Xen page. So if the first xen_pfn == mfn the page is local + * otherwise it's a foreign page grant-mapped in dom0. If the page is + * local we can safely call the native dma_ops function, otherwise we + * call the xen specific function. + */ if (local) __generic_dma_ops(hwdev)->map_page(hwdev, page, offset, size, dir, attrs); else @@ -51,10 +55,14 @@ static inline void xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle, struct dma_attrs *attrs) { unsigned long pfn = PFN_DOWN(handle); - /* Dom0 is mapped 1:1, so calling pfn_valid on a foreign mfn will - * always return false. If the page is local we can safely call the - * native dma_ops function, otherwise we call the xen specific - * function. */ + /* + * Dom0 is mapped 1:1, while the Linux page can be spanned accross + * multiple Xen page, it's not possible to have a mix of local and + * foreign Xen page. Dom0 is mapped 1:1, so calling pfn_valid on a + * foreign mfn will always return false. If the page is local we can + * safely call the native dma_ops function, otherwise we call the xen + * specific function. + */ if (pfn_valid(pfn)) { if (__generic_dma_ops(hwdev)->unmap_page) __generic_dma_ops(hwdev)->unmap_page(hwdev, handle, size, dir, attrs); diff --git a/arch/arm/include/asm/xen/page.h b/arch/arm/include/asm/xen/page.h index 127956353b00..415dbc6e43fd 100644 --- a/arch/arm/include/asm/xen/page.h +++ b/arch/arm/include/asm/xen/page.h @@ -13,9 +13,6 @@ #define phys_to_machine_mapping_valid(pfn) (1) -#define pte_mfn pte_pfn -#define mfn_pte pfn_pte - /* Xen machine address */ typedef struct xmaddr { phys_addr_t maddr; @@ -31,6 +28,17 @@ typedef struct xpaddr { #define INVALID_P2M_ENTRY (~0UL) +/* + * The pseudo-physical frame (pfn) used in all the helpers is always based + * on Xen page granularity (i.e 4KB). + * + * A Linux page may be split across multiple non-contiguous Xen page so we + * have to keep track with frame based on 4KB page granularity. + * + * PV drivers should never make a direct usage of those helpers (particularly + * pfn_to_gfn and gfn_to_pfn). + */ + unsigned long __pfn_to_mfn(unsigned long pfn); extern struct rb_root phys_to_mach; @@ -67,8 +75,8 @@ static inline unsigned long bfn_to_pfn(unsigned long bfn) #define bfn_to_local_pfn(bfn) bfn_to_pfn(bfn) /* VIRT <-> GUEST conversion */ -#define virt_to_gfn(v) (pfn_to_gfn(virt_to_pfn(v))) -#define gfn_to_virt(m) (__va(gfn_to_pfn(m) << PAGE_SHIFT)) +#define virt_to_gfn(v) (pfn_to_gfn(virt_to_phys(v) >> XEN_PAGE_SHIFT)) +#define gfn_to_virt(m) (__va(gfn_to_pfn(m) << XEN_PAGE_SHIFT)) /* Only used in PV code. But ARM guests are always HVM. */ static inline xmaddr_t arbitrary_virt_to_machine(void *vaddr) @@ -107,8 +115,8 @@ static inline bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) #define xen_unmap(cookie) iounmap((cookie)) bool xen_arch_need_swiotlb(struct device *dev, - unsigned long pfn, - unsigned long bfn); + phys_addr_t phys, + dma_addr_t dev_addr); unsigned long xen_get_swiotlb_free_pages(unsigned int order); #endif /* _ASM_ARM_XEN_PAGE_H */ diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index eeeab074e154..fc7ea529f462 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -86,16 +86,25 @@ static void xen_percpu_init(void) int err; int cpu = get_cpu(); + /* + * VCPUOP_register_vcpu_info cannot be called twice for the same + * vcpu, so if vcpu_info is already registered, just get out. This + * can happen with cpu-hotplug. + */ + if (per_cpu(xen_vcpu, cpu) != NULL) + goto after_register_vcpu_info; + pr_info("Xen: initializing cpu%d\n", cpu); vcpup = per_cpu_ptr(xen_vcpu_info, cpu); - info.mfn = __pa(vcpup) >> PAGE_SHIFT; - info.offset = offset_in_page(vcpup); + info.mfn = virt_to_gfn(vcpup); + info.offset = xen_offset_in_page(vcpup); err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info); BUG_ON(err); per_cpu(xen_vcpu, cpu) = vcpup; +after_register_vcpu_info: enable_percpu_irq(xen_events_irq, 0); put_cpu(); } @@ -124,6 +133,9 @@ static int xen_cpu_notification(struct notifier_block *self, case CPU_STARTING: xen_percpu_init(); break; + case CPU_DYING: + disable_percpu_irq(xen_events_irq); + break; default: break; } @@ -213,7 +225,7 @@ static int __init xen_guest_init(void) xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; - xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; + xatp.gpfn = virt_to_gfn(shared_info_page); if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) BUG(); @@ -284,7 +296,7 @@ void xen_arch_resume(void) { } void xen_arch_suspend(void) { } -/* In the hypervisor.S file. */ +/* In the hypercall.S file. */ EXPORT_SYMBOL_GPL(HYPERVISOR_event_channel_op); EXPORT_SYMBOL_GPL(HYPERVISOR_grant_table_op); EXPORT_SYMBOL_GPL(HYPERVISOR_xen_version); diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index 6dd911d1f0ac..7c34f7126b04 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -48,22 +48,22 @@ static void dma_cache_maint(dma_addr_t handle, unsigned long offset, size_t size, enum dma_data_direction dir, enum dma_cache_op op) { struct gnttab_cache_flush cflush; - unsigned long pfn; + unsigned long xen_pfn; size_t left = size; - pfn = (handle >> PAGE_SHIFT) + offset / PAGE_SIZE; - offset %= PAGE_SIZE; + xen_pfn = (handle >> XEN_PAGE_SHIFT) + offset / XEN_PAGE_SIZE; + offset %= XEN_PAGE_SIZE; do { size_t len = left; /* buffers in highmem or foreign pages cannot cross page * boundaries */ - if (len + offset > PAGE_SIZE) - len = PAGE_SIZE - offset; + if (len + offset > XEN_PAGE_SIZE) + len = XEN_PAGE_SIZE - offset; cflush.op = 0; - cflush.a.dev_bus_addr = pfn << PAGE_SHIFT; + cflush.a.dev_bus_addr = xen_pfn << XEN_PAGE_SHIFT; cflush.offset = offset; cflush.length = len; @@ -79,7 +79,7 @@ static void dma_cache_maint(dma_addr_t handle, unsigned long offset, HYPERVISOR_grant_table_op(GNTTABOP_cache_flush, &cflush, 1); offset = 0; - pfn++; + xen_pfn++; left -= len; } while (left); } @@ -138,10 +138,29 @@ void __xen_dma_sync_single_for_device(struct device *hwdev, } bool xen_arch_need_swiotlb(struct device *dev, - unsigned long pfn, - unsigned long bfn) + phys_addr_t phys, + dma_addr_t dev_addr) { - return (!hypercall_cflush && (pfn != bfn) && !is_device_dma_coherent(dev)); + unsigned int xen_pfn = XEN_PFN_DOWN(phys); + unsigned int bfn = XEN_PFN_DOWN(dev_addr); + + /* + * The swiotlb buffer should be used if + * - Xen doesn't have the cache flush hypercall + * - The Linux page refers to foreign memory + * - The device doesn't support coherent DMA request + * + * The Linux page may be spanned acrros multiple Xen page, although + * it's not possible to have a mix of local and foreign Xen page. + * Furthermore, range_straddles_page_boundary is already checking + * if buffer is physically contiguous in the host RAM. + * + * Therefore we only need to check the first Xen page to know if we + * require a bounce buffer because the device doesn't support coherent + * memory and we are not able to flush the cache. + */ + return (!hypercall_cflush && (xen_pfn != bfn) && + !is_device_dma_coherent(dev)); } int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, diff --git a/arch/arm/xen/p2m.c b/arch/arm/xen/p2m.c index 887596c67b12..0ed01f2d5ee4 100644 --- a/arch/arm/xen/p2m.c +++ b/arch/arm/xen/p2m.c @@ -93,8 +93,8 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, for (i = 0; i < count; i++) { if (map_ops[i].status) continue; - set_phys_to_machine(map_ops[i].host_addr >> PAGE_SHIFT, - map_ops[i].dev_bus_addr >> PAGE_SHIFT); + set_phys_to_machine(map_ops[i].host_addr >> XEN_PAGE_SHIFT, + map_ops[i].dev_bus_addr >> XEN_PAGE_SHIFT); } return 0; @@ -108,7 +108,7 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, int i; for (i = 0; i < count; i++) { - set_phys_to_machine(unmap_ops[i].host_addr >> PAGE_SHIFT, + set_phys_to_machine(unmap_ops[i].host_addr >> XEN_PAGE_SHIFT, INVALID_P2M_ENTRY); } diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index d866959e5685..8b2d4bea9962 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -57,4 +57,9 @@ static inline bool xen_x2apic_para_available(void) } #endif +#ifdef CONFIG_HOTPLUG_CPU +void xen_arch_register_cpu(int num); +void xen_arch_unregister_cpu(int num); +#endif + #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 0679e11d2cf7..f5fb840b43e8 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -12,7 +12,7 @@ #include <asm/pgtable.h> #include <xen/interface/xen.h> -#include <xen/grant_table.h> +#include <xen/interface/grant_table.h> #include <xen/features.h> /* Xen machine address */ @@ -43,6 +43,8 @@ extern unsigned long *xen_p2m_addr; extern unsigned long xen_p2m_size; extern unsigned long xen_max_p2m_pfn; +extern int xen_alloc_p2m_entry(unsigned long pfn); + extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); @@ -296,8 +298,8 @@ void make_lowmem_page_readwrite(void *vaddr); #define xen_unmap(cookie) iounmap((cookie)) static inline bool xen_arch_need_swiotlb(struct device *dev, - unsigned long pfn, - unsigned long bfn) + phys_addr_t phys, + dma_addr_t dev_addr) { return false; } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 993b7a71386d..5774800ff583 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -75,6 +75,7 @@ #include <asm/mwait.h> #include <asm/pci_x86.h> #include <asm/pat.h> +#include <asm/cpu.h> #ifdef CONFIG_ACPI #include <linux/acpi.h> @@ -1899,3 +1900,17 @@ const struct hypervisor_x86 x86_hyper_xen = { .set_cpu_features = xen_set_cpu_features, }; EXPORT_SYMBOL(x86_hyper_xen); + +#ifdef CONFIG_HOTPLUG_CPU +void xen_arch_register_cpu(int num) +{ + arch_register_cpu(num); +} +EXPORT_SYMBOL(xen_arch_register_cpu); + +void xen_arch_unregister_cpu(int num) +{ + arch_unregister_cpu(num); +} +EXPORT_SYMBOL(xen_arch_unregister_cpu); +#endif diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 1580e7a5a4cf..e079500b17f3 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -133,7 +133,7 @@ static int __init xlated_setup_gnttab_pages(void) kfree(pages); return -ENOMEM; } - rc = alloc_xenballooned_pages(nr_grant_frames, pages, 0 /* lowmem */); + rc = alloc_xenballooned_pages(nr_grant_frames, pages); if (rc) { pr_warn("%s Couldn't balloon alloc %ld pfns rc:%d\n", __func__, nr_grant_frames, rc); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 9c479fe40459..ac161db63388 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2888,6 +2888,7 @@ static int do_remap_gfn(struct vm_area_struct *vma, addr += range; if (err_ptr) err_ptr += batch; + cond_resched(); } out: diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 660b3cfef234..cab9f766bb06 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -530,7 +530,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg) * the new pages are installed with cmpxchg; if we lose the race then * simply free the page we allocated and use the one that's there. */ -static bool alloc_p2m(unsigned long pfn) +int xen_alloc_p2m_entry(unsigned long pfn) { unsigned topidx; unsigned long *top_mfn_p, *mid_mfn; @@ -540,6 +540,9 @@ static bool alloc_p2m(unsigned long pfn) unsigned long addr = (unsigned long)(xen_p2m_addr + pfn); unsigned long p2m_pfn; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + ptep = lookup_address(addr, &level); BUG_ON(!ptep || level != PG_LEVEL_4K); pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1)); @@ -548,7 +551,7 @@ static bool alloc_p2m(unsigned long pfn) /* PMD level is missing, allocate a new one */ ptep = alloc_p2m_pmd(addr, pte_pg); if (!ptep) - return false; + return -ENOMEM; } if (p2m_top_mfn && pfn < MAX_P2M_PFN) { @@ -566,7 +569,7 @@ static bool alloc_p2m(unsigned long pfn) mid_mfn = alloc_p2m_page(); if (!mid_mfn) - return false; + return -ENOMEM; p2m_mid_mfn_init(mid_mfn, p2m_missing); @@ -592,7 +595,7 @@ static bool alloc_p2m(unsigned long pfn) p2m = alloc_p2m_page(); if (!p2m) - return false; + return -ENOMEM; if (p2m_pfn == PFN_DOWN(__pa(p2m_missing))) p2m_init(p2m); @@ -625,8 +628,9 @@ static bool alloc_p2m(unsigned long pfn) HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn; } - return true; + return 0; } +EXPORT_SYMBOL(xen_alloc_p2m_entry); unsigned long __init set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e) @@ -688,7 +692,10 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) { if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - if (!alloc_p2m(pfn)) + int ret; + + ret = xen_alloc_p2m_entry(pfn); + if (ret < 0) return false; return __set_phys_to_machine(pfn, mfn); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 63320b6d35bc..7ab29518a3b9 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -212,7 +212,7 @@ static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn) e_pfn = PFN_DOWN(entry->addr + entry->size); /* We only care about E820 after this */ - if (e_pfn < *min_pfn) + if (e_pfn <= *min_pfn) continue; s_pfn = PFN_UP(entry->addr); @@ -829,6 +829,8 @@ char * __init xen_memory_setup(void) addr = xen_e820_map[0].addr; size = xen_e820_map[0].size; while (i < xen_e820_map_entries) { + bool discard = false; + chunk_size = size; type = xen_e820_map[i].type; @@ -843,10 +845,11 @@ char * __init xen_memory_setup(void) xen_add_extra_mem(pfn_s, n_pfns); xen_max_p2m_pfn = pfn_s + n_pfns; } else - type = E820_UNUSABLE; + discard = true; } - xen_align_and_add_e820_region(addr, chunk_size, type); + if (!discard) + xen_align_and_add_e820_region(addr, chunk_size, type); addr += chunk_size; size -= chunk_size; diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 6a685aec6994..f9099940c272 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -87,7 +87,7 @@ MODULE_PARM_DESC(max_persistent_grants, * Maximum order of pages to be used for the shared ring between front and * backend, 4KB page granularity is used. */ -unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_PAGE_ORDER; +unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO); MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring"); /* @@ -961,7 +961,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, seg[n].nsec = segments[i].last_sect - segments[i].first_sect + 1; seg[n].offset = (segments[i].first_sect << 9); - if ((segments[i].last_sect >= (PAGE_SIZE >> 9)) || + if ((segments[i].last_sect >= (XEN_PAGE_SIZE >> 9)) || (segments[i].last_sect < segments[i].first_sect)) { rc = -EINVAL; goto unmap; @@ -1210,6 +1210,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, req_operation = req->operation == BLKIF_OP_INDIRECT ? req->u.indirect.indirect_op : req->operation; + if ((req->operation == BLKIF_OP_INDIRECT) && (req_operation != BLKIF_OP_READ) && (req_operation != BLKIF_OP_WRITE)) { @@ -1268,7 +1269,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, seg[i].nsec = req->u.rw.seg[i].last_sect - req->u.rw.seg[i].first_sect + 1; seg[i].offset = (req->u.rw.seg[i].first_sect << 9); - if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) || + if ((req->u.rw.seg[i].last_sect >= (XEN_PAGE_SIZE >> 9)) || (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect)) goto fail_response; @@ -1445,10 +1446,10 @@ static int __init xen_blkif_init(void) if (!xen_domain()) return -ENODEV; - if (xen_blkif_max_ring_order > XENBUS_MAX_RING_PAGE_ORDER) { + if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) { pr_info("Invalid max_ring_order (%d), will use default max: %d.\n", - xen_blkif_max_ring_order, XENBUS_MAX_RING_PAGE_ORDER); - xen_blkif_max_ring_order = XENBUS_MAX_RING_PAGE_ORDER; + xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER); + xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; } rc = xen_blkif_interface_init(); diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 45a044a53d1e..68e87a037b99 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -39,6 +39,7 @@ #include <asm/pgalloc.h> #include <asm/hypervisor.h> #include <xen/grant_table.h> +#include <xen/page.h> #include <xen/xenbus.h> #include <xen/interface/io/ring.h> #include <xen/interface/io/blkif.h> @@ -51,12 +52,20 @@ extern unsigned int xen_blkif_max_ring_order; */ #define MAX_INDIRECT_SEGMENTS 256 -#define SEGS_PER_INDIRECT_FRAME \ - (PAGE_SIZE/sizeof(struct blkif_request_segment)) +/* + * Xen use 4K pages. The guest may use different page size (4K or 64K) + * Number of Xen pages per segment + */ +#define XEN_PAGES_PER_SEGMENT (PAGE_SIZE / XEN_PAGE_SIZE) + +#define XEN_PAGES_PER_INDIRECT_FRAME \ + (XEN_PAGE_SIZE/sizeof(struct blkif_request_segment)) +#define SEGS_PER_INDIRECT_FRAME \ + (XEN_PAGES_PER_INDIRECT_FRAME / XEN_PAGES_PER_SEGMENT) + #define MAX_INDIRECT_PAGES \ ((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) -#define INDIRECT_PAGES(_segs) \ - ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) +#define INDIRECT_PAGES(_segs) DIV_ROUND_UP(_segs, XEN_PAGES_PER_INDIRECT_FRAME) /* Not a real protocol. Used to generate ring structs which contain * the elements common to all protocols only. This way we get a diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 767657565de6..f53cff42f8da 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -176,21 +176,24 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, { struct blkif_sring *sring; sring = (struct blkif_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE * nr_grefs); + BACK_RING_INIT(&blkif->blk_rings.native, sring, + XEN_PAGE_SIZE * nr_grefs); break; } case BLKIF_PROTOCOL_X86_32: { struct blkif_x86_32_sring *sring_x86_32; sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE * nr_grefs); + BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, + XEN_PAGE_SIZE * nr_grefs); break; } case BLKIF_PROTOCOL_X86_64: { struct blkif_x86_64_sring *sring_x86_64; sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE * nr_grefs); + BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, + XEN_PAGE_SIZE * nr_grefs); break; } default: @@ -826,7 +829,7 @@ again: static int connect_ring(struct backend_info *be) { struct xenbus_device *dev = be->dev; - unsigned int ring_ref[XENBUS_MAX_RING_PAGES]; + unsigned int ring_ref[XENBUS_MAX_RING_GRANTS]; unsigned int evtchn, nr_grefs, ring_page_order; unsigned int pers_grants; char protocol[64] = ""; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index a69c02dadec0..2fee2eef988d 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -68,7 +68,7 @@ enum blkif_state { struct grant { grant_ref_t gref; - unsigned long pfn; + struct page *page; struct list_head node; }; @@ -78,6 +78,7 @@ struct blk_shadow { struct grant **grants_used; struct grant **indirect_grants; struct scatterlist *sg; + unsigned int num_sg; }; struct split_bio { @@ -106,8 +107,12 @@ static unsigned int xen_blkif_max_ring_order; module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO); MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring"); -#define BLK_RING_SIZE(info) __CONST_RING_SIZE(blkif, PAGE_SIZE * (info)->nr_ring_pages) -#define BLK_MAX_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * XENBUS_MAX_RING_PAGES) +#define BLK_RING_SIZE(info) \ + __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * (info)->nr_ring_pages) + +#define BLK_MAX_RING_SIZE \ + __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS) + /* * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 * characters are enough. Define to 20 to keep consist with backend. @@ -128,7 +133,7 @@ struct blkfront_info int vdevice; blkif_vdev_t handle; enum blkif_state connected; - int ring_ref[XENBUS_MAX_RING_PAGES]; + int ring_ref[XENBUS_MAX_RING_GRANTS]; unsigned int nr_ring_pages; struct blkif_front_ring ring; unsigned int evtchn, irq; @@ -146,6 +151,7 @@ struct blkfront_info unsigned int discard_granularity; unsigned int discard_alignment; unsigned int feature_persistent:1; + /* Number of 4KB segments handled */ unsigned int max_indirect_segments; int is_ready; struct blk_mq_tag_set tag_set; @@ -174,10 +180,23 @@ static DEFINE_SPINLOCK(minor_lock); #define DEV_NAME "xvd" /* name in /dev */ -#define SEGS_PER_INDIRECT_FRAME \ - (PAGE_SIZE/sizeof(struct blkif_request_segment)) -#define INDIRECT_GREFS(_segs) \ - ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) +/* + * Grants are always the same size as a Xen page (i.e 4KB). + * A physical segment is always the same size as a Linux page. + * Number of grants per physical segment + */ +#define GRANTS_PER_PSEG (PAGE_SIZE / XEN_PAGE_SIZE) + +#define GRANTS_PER_INDIRECT_FRAME \ + (XEN_PAGE_SIZE / sizeof(struct blkif_request_segment)) + +#define PSEGS_PER_INDIRECT_FRAME \ + (GRANTS_INDIRECT_FRAME / GRANTS_PSEGS) + +#define INDIRECT_GREFS(_grants) \ + DIV_ROUND_UP(_grants, GRANTS_PER_INDIRECT_FRAME) + +#define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG) static int blkfront_setup_indirect(struct blkfront_info *info); static int blkfront_gather_backend_features(struct blkfront_info *info); @@ -221,7 +240,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num) kfree(gnt_list_entry); goto out_of_memory; } - gnt_list_entry->pfn = page_to_pfn(granted_page); + gnt_list_entry->page = granted_page; } gnt_list_entry->gref = GRANT_INVALID_REF; @@ -236,7 +255,7 @@ out_of_memory: &info->grants, node) { list_del(&gnt_list_entry->node); if (info->feature_persistent) - __free_page(pfn_to_page(gnt_list_entry->pfn)); + __free_page(gnt_list_entry->page); kfree(gnt_list_entry); i--; } @@ -244,34 +263,77 @@ out_of_memory: return -ENOMEM; } -static struct grant *get_grant(grant_ref_t *gref_head, - unsigned long pfn, - struct blkfront_info *info) +static struct grant *get_free_grant(struct blkfront_info *info) { struct grant *gnt_list_entry; - unsigned long buffer_gfn; BUG_ON(list_empty(&info->grants)); gnt_list_entry = list_first_entry(&info->grants, struct grant, - node); + node); list_del(&gnt_list_entry->node); - if (gnt_list_entry->gref != GRANT_INVALID_REF) { + if (gnt_list_entry->gref != GRANT_INVALID_REF) info->persistent_gnts_c--; + + return gnt_list_entry; +} + +static inline void grant_foreign_access(const struct grant *gnt_list_entry, + const struct blkfront_info *info) +{ + gnttab_page_grant_foreign_access_ref_one(gnt_list_entry->gref, + info->xbdev->otherend_id, + gnt_list_entry->page, + 0); +} + +static struct grant *get_grant(grant_ref_t *gref_head, + unsigned long gfn, + struct blkfront_info *info) +{ + struct grant *gnt_list_entry = get_free_grant(info); + + if (gnt_list_entry->gref != GRANT_INVALID_REF) return gnt_list_entry; + + /* Assign a gref to this page */ + gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); + BUG_ON(gnt_list_entry->gref == -ENOSPC); + if (info->feature_persistent) + grant_foreign_access(gnt_list_entry, info); + else { + /* Grant access to the GFN passed by the caller */ + gnttab_grant_foreign_access_ref(gnt_list_entry->gref, + info->xbdev->otherend_id, + gfn, 0); } + return gnt_list_entry; +} + +static struct grant *get_indirect_grant(grant_ref_t *gref_head, + struct blkfront_info *info) +{ + struct grant *gnt_list_entry = get_free_grant(info); + + if (gnt_list_entry->gref != GRANT_INVALID_REF) + return gnt_list_entry; + /* Assign a gref to this page */ gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); BUG_ON(gnt_list_entry->gref == -ENOSPC); if (!info->feature_persistent) { - BUG_ON(!pfn); - gnt_list_entry->pfn = pfn; + struct page *indirect_page; + + /* Fetch a pre-allocated page to use for indirect grefs */ + BUG_ON(list_empty(&info->indirect_pages)); + indirect_page = list_first_entry(&info->indirect_pages, + struct page, lru); + list_del(&indirect_page->lru); + gnt_list_entry->page = indirect_page; } - buffer_gfn = pfn_to_gfn(gnt_list_entry->pfn); - gnttab_grant_foreign_access_ref(gnt_list_entry->gref, - info->xbdev->otherend_id, - buffer_gfn, 0); + grant_foreign_access(gnt_list_entry, info); + return gnt_list_entry; } @@ -394,20 +456,128 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode, return 0; } -/* - * Generate a Xen blkfront IO request from a blk layer request. Reads - * and writes are handled as expected. - * - * @req: a request struct - */ -static int blkif_queue_request(struct request *req) +static int blkif_queue_discard_req(struct request *req) { struct blkfront_info *info = req->rq_disk->private_data; struct blkif_request *ring_req; unsigned long id; + + /* Fill out a communications ring structure. */ + ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); + id = get_id_from_freelist(info); + info->shadow[id].request = req; + + ring_req->operation = BLKIF_OP_DISCARD; + ring_req->u.discard.nr_sectors = blk_rq_sectors(req); + ring_req->u.discard.id = id; + ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req); + if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) + ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; + else + ring_req->u.discard.flag = 0; + + info->ring.req_prod_pvt++; + + /* Keep a private copy so we can reissue requests when recovering. */ + info->shadow[id].req = *ring_req; + + return 0; +} + +struct setup_rw_req { + unsigned int grant_idx; + struct blkif_request_segment *segments; + struct blkfront_info *info; + struct blkif_request *ring_req; + grant_ref_t gref_head; + unsigned int id; + /* Only used when persistent grant is used and it's a read request */ + bool need_copy; + unsigned int bvec_off; + char *bvec_data; +}; + +static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, + unsigned int len, void *data) +{ + struct setup_rw_req *setup = data; + int n, ref; + struct grant *gnt_list_entry; unsigned int fsect, lsect; - int i, ref, n; - struct blkif_request_segment *segments = NULL; + /* Convenient aliases */ + unsigned int grant_idx = setup->grant_idx; + struct blkif_request *ring_req = setup->ring_req; + struct blkfront_info *info = setup->info; + struct blk_shadow *shadow = &info->shadow[setup->id]; + + if ((ring_req->operation == BLKIF_OP_INDIRECT) && + (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) { + if (setup->segments) + kunmap_atomic(setup->segments); + + n = grant_idx / GRANTS_PER_INDIRECT_FRAME; + gnt_list_entry = get_indirect_grant(&setup->gref_head, info); + shadow->indirect_grants[n] = gnt_list_entry; + setup->segments = kmap_atomic(gnt_list_entry->page); + ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; + } + + gnt_list_entry = get_grant(&setup->gref_head, gfn, info); + ref = gnt_list_entry->gref; + shadow->grants_used[grant_idx] = gnt_list_entry; + + if (setup->need_copy) { + void *shared_data; + + shared_data = kmap_atomic(gnt_list_entry->page); + /* + * this does not wipe data stored outside the + * range sg->offset..sg->offset+sg->length. + * Therefore, blkback *could* see data from + * previous requests. This is OK as long as + * persistent grants are shared with just one + * domain. It may need refactoring if this + * changes + */ + memcpy(shared_data + offset, + setup->bvec_data + setup->bvec_off, + len); + + kunmap_atomic(shared_data); + setup->bvec_off += len; + } + + fsect = offset >> 9; + lsect = fsect + (len >> 9) - 1; + if (ring_req->operation != BLKIF_OP_INDIRECT) { + ring_req->u.rw.seg[grant_idx] = + (struct blkif_request_segment) { + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; + } else { + setup->segments[grant_idx % GRANTS_PER_INDIRECT_FRAME] = + (struct blkif_request_segment) { + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; + } + + (setup->grant_idx)++; +} + +static int blkif_queue_rw_req(struct request *req) +{ + struct blkfront_info *info = req->rq_disk->private_data; + struct blkif_request *ring_req; + unsigned long id; + int i; + struct setup_rw_req setup = { + .grant_idx = 0, + .segments = NULL, + .info = info, + .need_copy = rq_data_dir(req) && info->feature_persistent, + }; /* * Used to store if we are able to queue the request by just using @@ -415,28 +585,23 @@ static int blkif_queue_request(struct request *req) * as there are not sufficiently many free. */ bool new_persistent_gnts; - grant_ref_t gref_head; - struct grant *gnt_list_entry = NULL; struct scatterlist *sg; - int nseg, max_grefs; + int num_sg, max_grefs, num_grant; - if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) - return 1; - - max_grefs = req->nr_phys_segments; + max_grefs = req->nr_phys_segments * GRANTS_PER_PSEG; if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST) /* * If we are using indirect segments we need to account * for the indirect grefs used in the request. */ - max_grefs += INDIRECT_GREFS(req->nr_phys_segments); + max_grefs += INDIRECT_GREFS(max_grefs); /* Check if we have enough grants to allocate a requests */ if (info->persistent_gnts_c < max_grefs) { new_persistent_gnts = 1; if (gnttab_alloc_grant_references( max_grefs - info->persistent_gnts_c, - &gref_head) < 0) { + &setup.gref_head) < 0) { gnttab_request_free_callback( &info->callback, blkif_restart_queue_callback, @@ -452,139 +617,82 @@ static int blkif_queue_request(struct request *req) id = get_id_from_freelist(info); info->shadow[id].request = req; - if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) { - ring_req->operation = BLKIF_OP_DISCARD; - ring_req->u.discard.nr_sectors = blk_rq_sectors(req); - ring_req->u.discard.id = id; - ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req); - if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) - ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; - else - ring_req->u.discard.flag = 0; + BUG_ON(info->max_indirect_segments == 0 && + GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST); + BUG_ON(info->max_indirect_segments && + GREFS(req->nr_phys_segments) > info->max_indirect_segments); + + num_sg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); + num_grant = 0; + /* Calculate the number of grant used */ + for_each_sg(info->shadow[id].sg, sg, num_sg, i) + num_grant += gnttab_count_grant(sg->offset, sg->length); + + ring_req->u.rw.id = id; + info->shadow[id].num_sg = num_sg; + if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) { + /* + * The indirect operation can only be a BLKIF_OP_READ or + * BLKIF_OP_WRITE + */ + BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA)); + ring_req->operation = BLKIF_OP_INDIRECT; + ring_req->u.indirect.indirect_op = rq_data_dir(req) ? + BLKIF_OP_WRITE : BLKIF_OP_READ; + ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req); + ring_req->u.indirect.handle = info->handle; + ring_req->u.indirect.nr_segments = num_grant; } else { - BUG_ON(info->max_indirect_segments == 0 && - req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); - BUG_ON(info->max_indirect_segments && - req->nr_phys_segments > info->max_indirect_segments); - nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); - ring_req->u.rw.id = id; - if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) { + ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); + ring_req->u.rw.handle = info->handle; + ring_req->operation = rq_data_dir(req) ? + BLKIF_OP_WRITE : BLKIF_OP_READ; + if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { /* - * The indirect operation can only be a BLKIF_OP_READ or - * BLKIF_OP_WRITE + * Ideally we can do an unordered flush-to-disk. + * In case the backend onlysupports barriers, use that. + * A barrier request a superset of FUA, so we can + * implement it the same way. (It's also a FLUSH+FUA, + * since it is guaranteed ordered WRT previous writes.) */ - BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA)); - ring_req->operation = BLKIF_OP_INDIRECT; - ring_req->u.indirect.indirect_op = rq_data_dir(req) ? - BLKIF_OP_WRITE : BLKIF_OP_READ; - ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req); - ring_req->u.indirect.handle = info->handle; - ring_req->u.indirect.nr_segments = nseg; - } else { - ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); - ring_req->u.rw.handle = info->handle; - ring_req->operation = rq_data_dir(req) ? - BLKIF_OP_WRITE : BLKIF_OP_READ; - if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { - /* - * Ideally we can do an unordered flush-to-disk. In case the - * backend onlysupports barriers, use that. A barrier request - * a superset of FUA, so we can implement it the same - * way. (It's also a FLUSH+FUA, since it is - * guaranteed ordered WRT previous writes.) - */ - switch (info->feature_flush & - ((REQ_FLUSH|REQ_FUA))) { - case REQ_FLUSH|REQ_FUA: - ring_req->operation = - BLKIF_OP_WRITE_BARRIER; - break; - case REQ_FLUSH: - ring_req->operation = - BLKIF_OP_FLUSH_DISKCACHE; - break; - default: - ring_req->operation = 0; - } + switch (info->feature_flush & + ((REQ_FLUSH|REQ_FUA))) { + case REQ_FLUSH|REQ_FUA: + ring_req->operation = + BLKIF_OP_WRITE_BARRIER; + break; + case REQ_FLUSH: + ring_req->operation = + BLKIF_OP_FLUSH_DISKCACHE; + break; + default: + ring_req->operation = 0; } - ring_req->u.rw.nr_segments = nseg; } - for_each_sg(info->shadow[id].sg, sg, nseg, i) { - fsect = sg->offset >> 9; - lsect = fsect + (sg->length >> 9) - 1; - - if ((ring_req->operation == BLKIF_OP_INDIRECT) && - (i % SEGS_PER_INDIRECT_FRAME == 0)) { - unsigned long uninitialized_var(pfn); - - if (segments) - kunmap_atomic(segments); - - n = i / SEGS_PER_INDIRECT_FRAME; - if (!info->feature_persistent) { - struct page *indirect_page; - - /* Fetch a pre-allocated page to use for indirect grefs */ - BUG_ON(list_empty(&info->indirect_pages)); - indirect_page = list_first_entry(&info->indirect_pages, - struct page, lru); - list_del(&indirect_page->lru); - pfn = page_to_pfn(indirect_page); - } - gnt_list_entry = get_grant(&gref_head, pfn, info); - info->shadow[id].indirect_grants[n] = gnt_list_entry; - segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); - ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; - } - - gnt_list_entry = get_grant(&gref_head, page_to_pfn(sg_page(sg)), info); - ref = gnt_list_entry->gref; - - info->shadow[id].grants_used[i] = gnt_list_entry; - - if (rq_data_dir(req) && info->feature_persistent) { - char *bvec_data; - void *shared_data; + ring_req->u.rw.nr_segments = num_grant; + } - BUG_ON(sg->offset + sg->length > PAGE_SIZE); + setup.ring_req = ring_req; + setup.id = id; + for_each_sg(info->shadow[id].sg, sg, num_sg, i) { + BUG_ON(sg->offset + sg->length > PAGE_SIZE); - shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); - bvec_data = kmap_atomic(sg_page(sg)); + if (setup.need_copy) { + setup.bvec_off = sg->offset; + setup.bvec_data = kmap_atomic(sg_page(sg)); + } - /* - * this does not wipe data stored outside the - * range sg->offset..sg->offset+sg->length. - * Therefore, blkback *could* see data from - * previous requests. This is OK as long as - * persistent grants are shared with just one - * domain. It may need refactoring if this - * changes - */ - memcpy(shared_data + sg->offset, - bvec_data + sg->offset, - sg->length); + gnttab_foreach_grant_in_range(sg_page(sg), + sg->offset, + sg->length, + blkif_setup_rw_req_grant, + &setup); - kunmap_atomic(bvec_data); - kunmap_atomic(shared_data); - } - if (ring_req->operation != BLKIF_OP_INDIRECT) { - ring_req->u.rw.seg[i] = - (struct blkif_request_segment) { - .gref = ref, - .first_sect = fsect, - .last_sect = lsect }; - } else { - n = i % SEGS_PER_INDIRECT_FRAME; - segments[n] = - (struct blkif_request_segment) { - .gref = ref, - .first_sect = fsect, - .last_sect = lsect }; - } - } - if (segments) - kunmap_atomic(segments); + if (setup.need_copy) + kunmap_atomic(setup.bvec_data); } + if (setup.segments) + kunmap_atomic(setup.segments); info->ring.req_prod_pvt++; @@ -592,11 +700,29 @@ static int blkif_queue_request(struct request *req) info->shadow[id].req = *ring_req; if (new_persistent_gnts) - gnttab_free_grant_references(gref_head); + gnttab_free_grant_references(setup.gref_head); return 0; } +/* + * Generate a Xen blkfront IO request from a blk layer request. Reads + * and writes are handled as expected. + * + * @req: a request struct + */ +static int blkif_queue_request(struct request *req) +{ + struct blkfront_info *info = req->rq_disk->private_data; + + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) + return 1; + + if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) + return blkif_queue_discard_req(req); + else + return blkif_queue_rw_req(req); +} static inline void flush_requests(struct blkfront_info *info) { @@ -691,14 +817,14 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, /* Hard sector size and max sectors impersonate the equiv. hardware. */ blk_queue_logical_block_size(rq, sector_size); blk_queue_physical_block_size(rq, physical_sector_size); - blk_queue_max_hw_sectors(rq, (segments * PAGE_SIZE) / 512); + blk_queue_max_hw_sectors(rq, (segments * XEN_PAGE_SIZE) / 512); /* Each segment in a request is up to an aligned page in size. */ blk_queue_segment_boundary(rq, PAGE_SIZE - 1); blk_queue_max_segment_size(rq, PAGE_SIZE); /* Ensure a merged request will fit in a single I/O ring slot. */ - blk_queue_max_segments(rq, segments); + blk_queue_max_segments(rq, segments / GRANTS_PER_PSEG); /* Make sure buffer addresses are sector-aligned. */ blk_queue_dma_alignment(rq, 511); @@ -972,7 +1098,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) info->persistent_gnts_c--; } if (info->feature_persistent) - __free_page(pfn_to_page(persistent_gnt->pfn)); + __free_page(persistent_gnt->page); kfree(persistent_gnt); } } @@ -1007,7 +1133,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) persistent_gnt = info->shadow[i].grants_used[j]; gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); if (info->feature_persistent) - __free_page(pfn_to_page(persistent_gnt->pfn)); + __free_page(persistent_gnt->page); kfree(persistent_gnt); } @@ -1021,7 +1147,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) for (j = 0; j < INDIRECT_GREFS(segs); j++) { persistent_gnt = info->shadow[i].indirect_grants[j]; gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); - __free_page(pfn_to_page(persistent_gnt->pfn)); + __free_page(persistent_gnt->page); kfree(persistent_gnt); } @@ -1057,33 +1183,65 @@ free_shadow: } +struct copy_from_grant { + const struct blk_shadow *s; + unsigned int grant_idx; + unsigned int bvec_offset; + char *bvec_data; +}; + +static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset, + unsigned int len, void *data) +{ + struct copy_from_grant *info = data; + char *shared_data; + /* Convenient aliases */ + const struct blk_shadow *s = info->s; + + shared_data = kmap_atomic(s->grants_used[info->grant_idx]->page); + + memcpy(info->bvec_data + info->bvec_offset, + shared_data + offset, len); + + info->bvec_offset += len; + info->grant_idx++; + + kunmap_atomic(shared_data); +} + static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, struct blkif_response *bret) { int i = 0; struct scatterlist *sg; - char *bvec_data; - void *shared_data; - int nseg; + int num_sg, num_grant; + struct copy_from_grant data = { + .s = s, + .grant_idx = 0, + }; - nseg = s->req.operation == BLKIF_OP_INDIRECT ? + num_grant = s->req.operation == BLKIF_OP_INDIRECT ? s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; + num_sg = s->num_sg; if (bret->operation == BLKIF_OP_READ && info->feature_persistent) { - for_each_sg(s->sg, sg, nseg, i) { + for_each_sg(s->sg, sg, num_sg, i) { BUG_ON(sg->offset + sg->length > PAGE_SIZE); - shared_data = kmap_atomic( - pfn_to_page(s->grants_used[i]->pfn)); - bvec_data = kmap_atomic(sg_page(sg)); - memcpy(bvec_data + sg->offset, - shared_data + sg->offset, - sg->length); - kunmap_atomic(bvec_data); - kunmap_atomic(shared_data); + + data.bvec_offset = sg->offset; + data.bvec_data = kmap_atomic(sg_page(sg)); + + gnttab_foreach_grant_in_range(sg_page(sg), + sg->offset, + sg->length, + blkif_copy_from_grant, + &data); + + kunmap_atomic(data.bvec_data); } } /* Add the persistent grant into the list of free grants */ - for (i = 0; i < nseg; i++) { + for (i = 0; i < num_grant; i++) { if (gnttab_query_foreign_access(s->grants_used[i]->gref)) { /* * If the grant is still mapped by the backend (the @@ -1109,7 +1267,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, } } if (s->req.operation == BLKIF_OP_INDIRECT) { - for (i = 0; i < INDIRECT_GREFS(nseg); i++) { + for (i = 0; i < INDIRECT_GREFS(num_grant); i++) { if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) { if (!info->feature_persistent) pr_alert_ratelimited("backed has not unmapped grant: %u\n", @@ -1125,7 +1283,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, * available pages for indirect grefs. */ if (!info->feature_persistent) { - indirect_page = pfn_to_page(s->indirect_grants[i]->pfn); + indirect_page = s->indirect_grants[i]->page; list_add(&indirect_page->lru, &info->indirect_pages); } s->indirect_grants[i]->gref = GRANT_INVALID_REF; @@ -1254,8 +1412,8 @@ static int setup_blkring(struct xenbus_device *dev, { struct blkif_sring *sring; int err, i; - unsigned long ring_size = info->nr_ring_pages * PAGE_SIZE; - grant_ref_t gref[XENBUS_MAX_RING_PAGES]; + unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE; + grant_ref_t gref[XENBUS_MAX_RING_GRANTS]; for (i = 0; i < info->nr_ring_pages; i++) info->ring_ref[i] = GRANT_INVALID_REF; @@ -1583,8 +1741,8 @@ static int blkif_recover(struct blkfront_info *info) atomic_set(&split_bio->pending, pending); split_bio->bio = bio; for (i = 0; i < pending; i++) { - offset = (i * segs * PAGE_SIZE) >> 9; - size = min((unsigned int)(segs * PAGE_SIZE) >> 9, + offset = (i * segs * XEN_PAGE_SIZE) >> 9; + size = min((unsigned int)(segs * XEN_PAGE_SIZE) >> 9, (unsigned int)bio_sectors(bio) - offset); cloned_bio = bio_clone(bio, GFP_NOIO); BUG_ON(cloned_bio == NULL); @@ -1695,15 +1853,17 @@ static void blkfront_setup_discard(struct blkfront_info *info) static int blkfront_setup_indirect(struct blkfront_info *info) { - unsigned int segs; + unsigned int psegs, grants; int err, i; if (info->max_indirect_segments == 0) - segs = BLKIF_MAX_SEGMENTS_PER_REQUEST; + grants = BLKIF_MAX_SEGMENTS_PER_REQUEST; else - segs = info->max_indirect_segments; + grants = info->max_indirect_segments; + psegs = grants / GRANTS_PER_PSEG; - err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE(info)); + err = fill_grant_buffer(info, + (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info)); if (err) goto out_of_memory; @@ -1713,7 +1873,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info) * grants, we need to allocate a set of pages that can be * used for mapping indirect grefs */ - int num = INDIRECT_GREFS(segs) * BLK_RING_SIZE(info); + int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info); BUG_ON(!list_empty(&info->indirect_pages)); for (i = 0; i < num; i++) { @@ -1726,20 +1886,20 @@ static int blkfront_setup_indirect(struct blkfront_info *info) for (i = 0; i < BLK_RING_SIZE(info); i++) { info->shadow[i].grants_used = kzalloc( - sizeof(info->shadow[i].grants_used[0]) * segs, + sizeof(info->shadow[i].grants_used[0]) * grants, GFP_NOIO); - info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * segs, GFP_NOIO); + info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * psegs, GFP_NOIO); if (info->max_indirect_segments) info->shadow[i].indirect_grants = kzalloc( sizeof(info->shadow[i].indirect_grants[0]) * - INDIRECT_GREFS(segs), + INDIRECT_GREFS(grants), GFP_NOIO); if ((info->shadow[i].grants_used == NULL) || (info->shadow[i].sg == NULL) || (info->max_indirect_segments && (info->shadow[i].indirect_grants == NULL))) goto out_of_memory; - sg_init_table(info->shadow[i].sg, segs); + sg_init_table(info->shadow[i].sg, psegs); } @@ -2125,9 +2285,9 @@ static int __init xlblk_init(void) if (!xen_domain()) return -ENODEV; - if (xen_blkif_max_ring_order > XENBUS_MAX_RING_PAGE_ORDER) { + if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) { pr_info("Invalid max_ring_order (%d), will use default max: %d.\n", - xen_blkif_max_ring_order, XENBUS_MAX_RING_PAGE_ORDER); + xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER); xen_blkif_max_ring_order = 0; } diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index a7bf74727116..0333ab0fd926 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -44,6 +44,7 @@ #include <xen/interface/grant_table.h> #include <xen/grant_table.h> #include <xen/xenbus.h> +#include <xen/page.h> #include <linux/debugfs.h> typedef unsigned int pending_ring_idx_t; @@ -64,8 +65,8 @@ struct pending_tx_info { struct ubuf_info callback_struct; }; -#define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE) -#define XEN_NETIF_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE) +#define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, XEN_PAGE_SIZE) +#define XEN_NETIF_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, XEN_PAGE_SIZE) struct xenvif_rx_meta { int id; @@ -80,16 +81,21 @@ struct xenvif_rx_meta { /* Discriminate from any valid pending_idx value. */ #define INVALID_PENDING_IDX 0xFFFF -#define MAX_BUFFER_OFFSET PAGE_SIZE +#define MAX_BUFFER_OFFSET XEN_PAGE_SIZE #define MAX_PENDING_REQS XEN_NETIF_TX_RING_SIZE +/* The maximum number of frags is derived from the size of a grant (same + * as a Xen page size for now). + */ +#define MAX_XEN_SKB_FRAGS (65536 / XEN_PAGE_SIZE + 1) + /* It's possible for an skb to have a maximal number of frags * but still be less than MAX_BUFFER_OFFSET in size. Thus the - * worst-case number of copy operations is MAX_SKB_FRAGS per + * worst-case number of copy operations is MAX_XEN_SKB_FRAGS per * ring slot. */ -#define MAX_GRANT_COPY_OPS (MAX_SKB_FRAGS * XEN_NETIF_RX_RING_SIZE) +#define MAX_GRANT_COPY_OPS (MAX_XEN_SKB_FRAGS * XEN_NETIF_RX_RING_SIZE) #define NETBACK_INVALID_HANDLE -1 diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index ec98d43916a8..e481f3710bd3 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -152,9 +152,9 @@ static inline pending_ring_idx_t pending_index(unsigned i) static int xenvif_rx_ring_slots_needed(struct xenvif *vif) { if (vif->gso_mask) - return DIV_ROUND_UP(vif->dev->gso_max_size, PAGE_SIZE) + 1; + return DIV_ROUND_UP(vif->dev->gso_max_size, XEN_PAGE_SIZE) + 1; else - return DIV_ROUND_UP(vif->dev->mtu, PAGE_SIZE); + return DIV_ROUND_UP(vif->dev->mtu, XEN_PAGE_SIZE); } static bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue) @@ -274,6 +274,80 @@ static struct xenvif_rx_meta *get_next_rx_buffer(struct xenvif_queue *queue, return meta; } +struct gop_frag_copy { + struct xenvif_queue *queue; + struct netrx_pending_operations *npo; + struct xenvif_rx_meta *meta; + int head; + int gso_type; + + struct page *page; +}; + +static void xenvif_setup_copy_gop(unsigned long gfn, + unsigned int offset, + unsigned int *len, + struct gop_frag_copy *info) +{ + struct gnttab_copy *copy_gop; + struct xen_page_foreign *foreign; + /* Convenient aliases */ + struct xenvif_queue *queue = info->queue; + struct netrx_pending_operations *npo = info->npo; + struct page *page = info->page; + + BUG_ON(npo->copy_off > MAX_BUFFER_OFFSET); + + if (npo->copy_off == MAX_BUFFER_OFFSET) + info->meta = get_next_rx_buffer(queue, npo); + + if (npo->copy_off + *len > MAX_BUFFER_OFFSET) + *len = MAX_BUFFER_OFFSET - npo->copy_off; + + copy_gop = npo->copy + npo->copy_prod++; + copy_gop->flags = GNTCOPY_dest_gref; + copy_gop->len = *len; + + foreign = xen_page_foreign(page); + if (foreign) { + copy_gop->source.domid = foreign->domid; + copy_gop->source.u.ref = foreign->gref; + copy_gop->flags |= GNTCOPY_source_gref; + } else { + copy_gop->source.domid = DOMID_SELF; + copy_gop->source.u.gmfn = gfn; + } + copy_gop->source.offset = offset; + + copy_gop->dest.domid = queue->vif->domid; + copy_gop->dest.offset = npo->copy_off; + copy_gop->dest.u.ref = npo->copy_gref; + + npo->copy_off += *len; + info->meta->size += *len; + + /* Leave a gap for the GSO descriptor. */ + if (info->head && ((1 << info->gso_type) & queue->vif->gso_mask)) + queue->rx.req_cons++; + + info->head = 0; /* There must be something in this buffer now */ +} + +static void xenvif_gop_frag_copy_grant(unsigned long gfn, + unsigned offset, + unsigned int len, + void *data) +{ + unsigned int bytes; + + while (len) { + bytes = len; + xenvif_setup_copy_gop(gfn, offset, &bytes, data); + offset += bytes; + len -= bytes; + } +} + /* * Set up the grant operations for this fragment. If it's a flipping * interface, we also set up the unmap request from here. @@ -283,83 +357,52 @@ static void xenvif_gop_frag_copy(struct xenvif_queue *queue, struct sk_buff *skb struct page *page, unsigned long size, unsigned long offset, int *head) { - struct gnttab_copy *copy_gop; - struct xenvif_rx_meta *meta; + struct gop_frag_copy info = { + .queue = queue, + .npo = npo, + .head = *head, + .gso_type = XEN_NETIF_GSO_TYPE_NONE, + }; unsigned long bytes; - int gso_type = XEN_NETIF_GSO_TYPE_NONE; + + if (skb_is_gso(skb)) { + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) + info.gso_type = XEN_NETIF_GSO_TYPE_TCPV4; + else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) + info.gso_type = XEN_NETIF_GSO_TYPE_TCPV6; + } /* Data must not cross a page boundary. */ BUG_ON(size + offset > PAGE_SIZE<<compound_order(page)); - meta = npo->meta + npo->meta_prod - 1; + info.meta = npo->meta + npo->meta_prod - 1; /* Skip unused frames from start of page */ page += offset >> PAGE_SHIFT; offset &= ~PAGE_MASK; while (size > 0) { - struct xen_page_foreign *foreign; - BUG_ON(offset >= PAGE_SIZE); - BUG_ON(npo->copy_off > MAX_BUFFER_OFFSET); - - if (npo->copy_off == MAX_BUFFER_OFFSET) - meta = get_next_rx_buffer(queue, npo); bytes = PAGE_SIZE - offset; if (bytes > size) bytes = size; - if (npo->copy_off + bytes > MAX_BUFFER_OFFSET) - bytes = MAX_BUFFER_OFFSET - npo->copy_off; - - copy_gop = npo->copy + npo->copy_prod++; - copy_gop->flags = GNTCOPY_dest_gref; - copy_gop->len = bytes; - - foreign = xen_page_foreign(page); - if (foreign) { - copy_gop->source.domid = foreign->domid; - copy_gop->source.u.ref = foreign->gref; - copy_gop->flags |= GNTCOPY_source_gref; - } else { - copy_gop->source.domid = DOMID_SELF; - copy_gop->source.u.gmfn = - virt_to_gfn(page_address(page)); - } - copy_gop->source.offset = offset; - - copy_gop->dest.domid = queue->vif->domid; - copy_gop->dest.offset = npo->copy_off; - copy_gop->dest.u.ref = npo->copy_gref; - - npo->copy_off += bytes; - meta->size += bytes; - - offset += bytes; + info.page = page; + gnttab_foreach_grant_in_range(page, offset, bytes, + xenvif_gop_frag_copy_grant, + &info); size -= bytes; + offset = 0; - /* Next frame */ - if (offset == PAGE_SIZE && size) { + /* Next page */ + if (size) { BUG_ON(!PageCompound(page)); page++; - offset = 0; } - - /* Leave a gap for the GSO descriptor. */ - if (skb_is_gso(skb)) { - if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) - gso_type = XEN_NETIF_GSO_TYPE_TCPV4; - else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) - gso_type = XEN_NETIF_GSO_TYPE_TCPV6; - } - - if (*head && ((1 << gso_type) & queue->vif->gso_mask)) - queue->rx.req_cons++; - - *head = 0; /* There must be something in this buffer now. */ - } + + *head = info.head; } /* @@ -758,7 +801,7 @@ static int xenvif_count_requests(struct xenvif_queue *queue, first->size -= txp->size; slots++; - if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) { + if (unlikely((txp->offset + txp->size) > XEN_PAGE_SIZE)) { netdev_err(queue->vif->dev, "Cross page boundary, txp->offset: %u, size: %u\n", txp->offset, txp->size); xenvif_fatal_tx_err(queue->vif); @@ -1339,11 +1382,11 @@ static void xenvif_tx_build_gops(struct xenvif_queue *queue, } /* No crossing a page as the payload mustn't fragment. */ - if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) { + if (unlikely((txreq.offset + txreq.size) > XEN_PAGE_SIZE)) { netdev_err(queue->vif->dev, "txreq.offset: %u, size: %u, end: %lu\n", txreq.offset, txreq.size, - (unsigned long)(txreq.offset&~PAGE_MASK) + txreq.size); + (unsigned long)(txreq.offset&~XEN_PAGE_MASK) + txreq.size); xenvif_fatal_tx_err(queue->vif); break; } @@ -1409,7 +1452,7 @@ static void xenvif_tx_build_gops(struct xenvif_queue *queue, virt_to_gfn(skb->data); queue->tx_copy_ops[*copy_ops].dest.domid = DOMID_SELF; queue->tx_copy_ops[*copy_ops].dest.offset = - offset_in_page(skb->data); + offset_in_page(skb->data) & ~XEN_PAGE_MASK; queue->tx_copy_ops[*copy_ops].len = data_len; queue->tx_copy_ops[*copy_ops].flags = GNTCOPY_source_gref; @@ -1894,7 +1937,7 @@ int xenvif_map_frontend_rings(struct xenvif_queue *queue, goto err; txs = (struct xen_netif_tx_sring *)addr; - BACK_RING_INIT(&queue->tx, txs, PAGE_SIZE); + BACK_RING_INIT(&queue->tx, txs, XEN_PAGE_SIZE); err = xenbus_map_ring_valloc(xenvif_to_xenbus_device(queue->vif), &rx_ring_ref, 1, &addr); @@ -1902,7 +1945,7 @@ int xenvif_map_frontend_rings(struct xenvif_queue *queue, goto err; rxs = (struct xen_netif_rx_sring *)addr; - BACK_RING_INIT(&queue->rx, rxs, PAGE_SIZE); + BACK_RING_INIT(&queue->rx, rxs, XEN_PAGE_SIZE); return 0; diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 441b158d04f7..d6abf191122a 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -74,8 +74,8 @@ struct netfront_cb { #define GRANT_INVALID_REF 0 -#define NET_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE) -#define NET_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE) +#define NET_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, XEN_PAGE_SIZE) +#define NET_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, XEN_PAGE_SIZE) /* Minimum number of Rx slots (includes slot for GSO metadata). */ #define NET_RX_SLOTS_MIN (XEN_NETIF_NR_SLOTS_MIN + 1) @@ -291,7 +291,7 @@ static void xennet_alloc_rx_buffers(struct netfront_queue *queue) struct sk_buff *skb; unsigned short id; grant_ref_t ref; - unsigned long gfn; + struct page *page; struct xen_netif_rx_request *req; skb = xennet_alloc_one_rx_buffer(queue); @@ -307,14 +307,13 @@ static void xennet_alloc_rx_buffers(struct netfront_queue *queue) BUG_ON((signed short)ref < 0); queue->grant_rx_ref[id] = ref; - gfn = xen_page_to_gfn(skb_frag_page(&skb_shinfo(skb)->frags[0])); + page = skb_frag_page(&skb_shinfo(skb)->frags[0]); req = RING_GET_REQUEST(&queue->rx, req_prod); - gnttab_grant_foreign_access_ref(ref, - queue->info->xbdev->otherend_id, - gfn, - 0); - + gnttab_page_grant_foreign_access_ref_one(ref, + queue->info->xbdev->otherend_id, + page, + 0); req->id = id; req->gref = ref; } @@ -415,25 +414,33 @@ static void xennet_tx_buf_gc(struct netfront_queue *queue) xennet_maybe_wake_tx(queue); } -static struct xen_netif_tx_request *xennet_make_one_txreq( - struct netfront_queue *queue, struct sk_buff *skb, - struct page *page, unsigned int offset, unsigned int len) +struct xennet_gnttab_make_txreq { + struct netfront_queue *queue; + struct sk_buff *skb; + struct page *page; + struct xen_netif_tx_request *tx; /* Last request */ + unsigned int size; +}; + +static void xennet_tx_setup_grant(unsigned long gfn, unsigned int offset, + unsigned int len, void *data) { + struct xennet_gnttab_make_txreq *info = data; unsigned int id; struct xen_netif_tx_request *tx; grant_ref_t ref; - - len = min_t(unsigned int, PAGE_SIZE - offset, len); + /* convenient aliases */ + struct page *page = info->page; + struct netfront_queue *queue = info->queue; + struct sk_buff *skb = info->skb; id = get_id_from_freelist(&queue->tx_skb_freelist, queue->tx_skbs); tx = RING_GET_REQUEST(&queue->tx, queue->tx.req_prod_pvt++); ref = gnttab_claim_grant_reference(&queue->gref_tx_head); BUG_ON((signed short)ref < 0); - gnttab_grant_foreign_access_ref(ref, - queue->info->xbdev->otherend_id, - xen_page_to_gfn(page), - GNTMAP_readonly); + gnttab_grant_foreign_access_ref(ref, queue->info->xbdev->otherend_id, + gfn, GNTMAP_readonly); queue->tx_skbs[id].skb = skb; queue->grant_tx_page[id] = page; @@ -445,7 +452,34 @@ static struct xen_netif_tx_request *xennet_make_one_txreq( tx->size = len; tx->flags = 0; - return tx; + info->tx = tx; + info->size += tx->size; +} + +static struct xen_netif_tx_request *xennet_make_first_txreq( + struct netfront_queue *queue, struct sk_buff *skb, + struct page *page, unsigned int offset, unsigned int len) +{ + struct xennet_gnttab_make_txreq info = { + .queue = queue, + .skb = skb, + .page = page, + .size = 0, + }; + + gnttab_for_one_grant(page, offset, len, xennet_tx_setup_grant, &info); + + return info.tx; +} + +static void xennet_make_one_txreq(unsigned long gfn, unsigned int offset, + unsigned int len, void *data) +{ + struct xennet_gnttab_make_txreq *info = data; + + info->tx->flags |= XEN_NETTXF_more_data; + skb_get(info->skb); + xennet_tx_setup_grant(gfn, offset, len, data); } static struct xen_netif_tx_request *xennet_make_txreqs( @@ -453,20 +487,30 @@ static struct xen_netif_tx_request *xennet_make_txreqs( struct sk_buff *skb, struct page *page, unsigned int offset, unsigned int len) { + struct xennet_gnttab_make_txreq info = { + .queue = queue, + .skb = skb, + .tx = tx, + }; + /* Skip unused frames from start of page */ page += offset >> PAGE_SHIFT; offset &= ~PAGE_MASK; while (len) { - tx->flags |= XEN_NETTXF_more_data; - tx = xennet_make_one_txreq(queue, skb_get(skb), - page, offset, len); + info.page = page; + info.size = 0; + + gnttab_foreach_grant_in_range(page, offset, len, + xennet_make_one_txreq, + &info); + page++; offset = 0; - len -= tx->size; + len -= info.size; } - return tx; + return info.tx; } /* @@ -476,9 +520,10 @@ static struct xen_netif_tx_request *xennet_make_txreqs( static int xennet_count_skb_slots(struct sk_buff *skb) { int i, frags = skb_shinfo(skb)->nr_frags; - int pages; + int slots; - pages = PFN_UP(offset_in_page(skb->data) + skb_headlen(skb)); + slots = gnttab_count_grant(offset_in_page(skb->data), + skb_headlen(skb)); for (i = 0; i < frags; i++) { skb_frag_t *frag = skb_shinfo(skb)->frags + i; @@ -488,10 +533,10 @@ static int xennet_count_skb_slots(struct sk_buff *skb) /* Skip unused frames from start of page */ offset &= ~PAGE_MASK; - pages += PFN_UP(offset + size); + slots += gnttab_count_grant(offset, size); } - return pages; + return slots; } static u16 xennet_select_queue(struct net_device *dev, struct sk_buff *skb, @@ -512,6 +557,8 @@ static u16 xennet_select_queue(struct net_device *dev, struct sk_buff *skb, return queue_idx; } +#define MAX_XEN_SKB_FRAGS (65536 / XEN_PAGE_SIZE + 1) + static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct netfront_info *np = netdev_priv(dev); @@ -546,7 +593,7 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev) } slots = xennet_count_skb_slots(skb); - if (unlikely(slots > MAX_SKB_FRAGS + 1)) { + if (unlikely(slots > MAX_XEN_SKB_FRAGS + 1)) { net_dbg_ratelimited("xennet: skb rides the rocket: %d slots, %d bytes\n", slots, skb->len); if (skb_linearize(skb)) @@ -567,10 +614,13 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev) } /* First request for the linear area. */ - first_tx = tx = xennet_make_one_txreq(queue, skb, - page, offset, len); - page++; - offset = 0; + first_tx = tx = xennet_make_first_txreq(queue, skb, + page, offset, len); + offset += tx->size; + if (offset == PAGE_SIZE) { + page++; + offset = 0; + } len -= tx->size; if (skb->ip_summed == CHECKSUM_PARTIAL) @@ -732,7 +782,7 @@ static int xennet_get_responses(struct netfront_queue *queue, for (;;) { if (unlikely(rx->status < 0 || - rx->offset + rx->status > PAGE_SIZE)) { + rx->offset + rx->status > XEN_PAGE_SIZE)) { if (net_ratelimit()) dev_warn(dev, "rx->offset: %u, size: %d\n", rx->offset, rx->status); @@ -1496,7 +1546,7 @@ static int setup_netfront(struct xenbus_device *dev, goto fail; } SHARED_RING_INIT(txs); - FRONT_RING_INIT(&queue->tx, txs, PAGE_SIZE); + FRONT_RING_INIT(&queue->tx, txs, XEN_PAGE_SIZE); err = xenbus_grant_ring(dev, txs, 1, &gref); if (err < 0) @@ -1510,7 +1560,7 @@ static int setup_netfront(struct xenbus_device *dev, goto alloc_rx_ring_fail; } SHARED_RING_INIT(rxs); - FRONT_RING_INIT(&queue->rx, rxs, PAGE_SIZE); + FRONT_RING_INIT(&queue->rx, rxs, XEN_PAGE_SIZE); err = xenbus_grant_ring(dev, rxs, 1, &gref); if (err < 0) diff --git a/drivers/tty/hvc/hvc_xen.c b/drivers/tty/hvc/hvc_xen.c index 10beb1589d83..fa816b7193b6 100644 --- a/drivers/tty/hvc/hvc_xen.c +++ b/drivers/tty/hvc/hvc_xen.c @@ -230,7 +230,7 @@ static int xen_hvm_console_init(void) if (r < 0 || v == 0) goto err; gfn = v; - info->intf = xen_remap(gfn << PAGE_SHIFT, PAGE_SIZE); + info->intf = xen_remap(gfn << XEN_PAGE_SHIFT, XEN_PAGE_SIZE); if (info->intf == NULL) goto err; info->vtermno = HVC_COOKIE; @@ -472,7 +472,7 @@ static int xencons_resume(struct xenbus_device *dev) struct xencons_info *info = dev_get_drvdata(&dev->dev); xencons_disconnect_backend(info); - memset(info->intf, 0, PAGE_SIZE); + memset(info->intf, 0, XEN_PAGE_SIZE); return xencons_connect_backend(dev, info); } diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index e293bc507cbc..aa8a7f71f310 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -1,6 +1,4 @@ -ifeq ($(filter y, $(CONFIG_ARM) $(CONFIG_ARM64)),) obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o -endif obj-$(CONFIG_X86) += fallback.o obj-y += grant-table.o features.o balloon.o manage.o preempt.o obj-y += events/ diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index c79329fcfa78..12eab503efd1 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -54,6 +54,8 @@ #include <linux/memory.h> #include <linux/memory_hotplug.h> #include <linux/percpu-defs.h> +#include <linux/slab.h> +#include <linux/sysctl.h> #include <asm/page.h> #include <asm/pgalloc.h> @@ -70,16 +72,64 @@ #include <xen/features.h> #include <xen/page.h> +static int xen_hotplug_unpopulated; + +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + +static int zero; +static int one = 1; + +static struct ctl_table balloon_table[] = { + { + .procname = "hotplug_unpopulated", + .data = &xen_hotplug_unpopulated, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { } +}; + +static struct ctl_table balloon_root[] = { + { + .procname = "balloon", + .mode = 0555, + .child = balloon_table, + }, + { } +}; + +static struct ctl_table xen_root[] = { + { + .procname = "xen", + .mode = 0555, + .child = balloon_root, + }, + { } +}; + +#endif + +/* + * Use one extent per PAGE_SIZE to avoid to break down the page into + * multiple frame. + */ +#define EXTENT_ORDER (fls(XEN_PFN_PER_PAGE) - 1) + /* * balloon_process() state: * * BP_DONE: done or nothing to do, + * BP_WAIT: wait to be rescheduled, * BP_EAGAIN: error, go to sleep, * BP_ECANCELED: error, balloon operation canceled. */ enum bp_state { BP_DONE, + BP_WAIT, BP_EAGAIN, BP_ECANCELED }; @@ -91,11 +141,12 @@ struct balloon_stats balloon_stats; EXPORT_SYMBOL_GPL(balloon_stats); /* We increase/decrease in batches which fit in a page */ -static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)]; +static xen_pfn_t frame_list[PAGE_SIZE / sizeof(xen_pfn_t)]; /* List of ballooned pages, threaded through the mem_map array. */ static LIST_HEAD(ballooned_pages); +static DECLARE_WAIT_QUEUE_HEAD(balloon_wq); /* Main work function, always executed in process context. */ static void balloon_process(struct work_struct *work); @@ -124,6 +175,7 @@ static void __balloon_append(struct page *page) list_add(&page->lru, &ballooned_pages); balloon_stats.balloon_low++; } + wake_up(&balloon_wq); } static void balloon_append(struct page *page) @@ -133,17 +185,16 @@ static void balloon_append(struct page *page) } /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */ -static struct page *balloon_retrieve(bool prefer_highmem) +static struct page *balloon_retrieve(bool require_lowmem) { struct page *page; if (list_empty(&ballooned_pages)) return NULL; - if (prefer_highmem) - page = list_entry(ballooned_pages.prev, struct page, lru); - else - page = list_entry(ballooned_pages.next, struct page, lru); + page = list_entry(ballooned_pages.next, struct page, lru); + if (require_lowmem && PageHighMem(page)) + return NULL; list_del(&page->lru); if (PageHighMem(page)) @@ -166,6 +217,9 @@ static struct page *balloon_next_page(struct page *page) static enum bp_state update_schedule(enum bp_state state) { + if (state == BP_WAIT) + return BP_WAIT; + if (state == BP_ECANCELED) return BP_ECANCELED; @@ -193,43 +247,75 @@ static enum bp_state update_schedule(enum bp_state state) } #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG -static long current_credit(void) +static struct resource *additional_memory_resource(phys_addr_t size) { - return balloon_stats.target_pages - balloon_stats.current_pages - - balloon_stats.hotplug_pages; + struct resource *res; + int ret; + + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) + return NULL; + + res->name = "System RAM"; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + + ret = allocate_resource(&iomem_resource, res, + size, 0, -1, + PAGES_PER_SECTION * PAGE_SIZE, NULL, NULL); + if (ret < 0) { + pr_err("Cannot allocate new System RAM resource\n"); + kfree(res); + return NULL; + } + + return res; } -static bool balloon_is_inflated(void) +static void release_memory_resource(struct resource *resource) { - if (balloon_stats.balloon_low || balloon_stats.balloon_high || - balloon_stats.balloon_hotplug) - return true; - else - return false; -} + if (!resource) + return; -/* - * reserve_additional_memory() adds memory region of size >= credit above - * max_pfn. New region is section aligned and size is modified to be multiple - * of section size. Those features allow optimal use of address space and - * establish proper alignment when this function is called first time after - * boot (last section not fully populated at boot time contains unused memory - * pages with PG_reserved bit not set; online_pages_range() does not allow page - * onlining in whole range if first onlined page does not have PG_reserved - * bit set). Real size of added memory is established at page onlining stage. - */ + /* + * No need to reset region to identity mapped since we now + * know that no I/O can be in this region + */ + release_resource(resource); + kfree(resource); +} -static enum bp_state reserve_additional_memory(long credit) +static enum bp_state reserve_additional_memory(void) { + long credit; + struct resource *resource; int nid, rc; - u64 hotplug_start_paddr; - unsigned long balloon_hotplug = credit; + unsigned long balloon_hotplug; + + credit = balloon_stats.target_pages + balloon_stats.target_unpopulated + - balloon_stats.total_pages; + + /* + * Already hotplugged enough pages? Wait for them to be + * onlined. + */ + if (credit <= 0) + return BP_WAIT; - hotplug_start_paddr = PFN_PHYS(SECTION_ALIGN_UP(max_pfn)); - balloon_hotplug = round_up(balloon_hotplug, PAGES_PER_SECTION); - nid = memory_add_physaddr_to_nid(hotplug_start_paddr); + balloon_hotplug = round_up(credit, PAGES_PER_SECTION); + + resource = additional_memory_resource(balloon_hotplug * PAGE_SIZE); + if (!resource) + goto err; + + nid = memory_add_physaddr_to_nid(resource->start); #ifdef CONFIG_XEN_HAVE_PVMMU + /* + * We don't support PV MMU when Linux and Xen is using + * different page granularity. + */ + BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE); + /* * add_memory() will build page tables for the new memory so * the p2m must contain invalid entries so the correct @@ -242,29 +328,28 @@ static enum bp_state reserve_additional_memory(long credit) if (!xen_feature(XENFEAT_auto_translated_physmap)) { unsigned long pfn, i; - pfn = PFN_DOWN(hotplug_start_paddr); + pfn = PFN_DOWN(resource->start); for (i = 0; i < balloon_hotplug; i++) { if (!set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY)) { pr_warn("set_phys_to_machine() failed, no memory added\n"); - return BP_ECANCELED; + goto err; } } } #endif - rc = add_memory(nid, hotplug_start_paddr, balloon_hotplug << PAGE_SHIFT); - + rc = add_memory_resource(nid, resource); if (rc) { pr_warn("Cannot add additional memory (%i)\n", rc); - return BP_ECANCELED; + goto err; } - balloon_hotplug -= credit; + balloon_stats.total_pages += balloon_hotplug; - balloon_stats.hotplug_pages += credit; - balloon_stats.balloon_hotplug = balloon_hotplug; - - return BP_DONE; + return BP_WAIT; + err: + release_memory_resource(resource); + return BP_ECANCELED; } static void xen_online_page(struct page *page) @@ -275,11 +360,6 @@ static void xen_online_page(struct page *page) __balloon_append(page); - if (balloon_stats.hotplug_pages) - --balloon_stats.hotplug_pages; - else - --balloon_stats.balloon_hotplug; - mutex_unlock(&balloon_mutex); } @@ -296,53 +376,34 @@ static struct notifier_block xen_memory_nb = { .priority = 0 }; #else -static long current_credit(void) +static enum bp_state reserve_additional_memory(void) { - unsigned long target = balloon_stats.target_pages; - - target = min(target, - balloon_stats.current_pages + - balloon_stats.balloon_low + - balloon_stats.balloon_high); - - return target - balloon_stats.current_pages; + balloon_stats.target_pages = balloon_stats.current_pages; + return BP_ECANCELED; } +#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */ -static bool balloon_is_inflated(void) +static long current_credit(void) { - if (balloon_stats.balloon_low || balloon_stats.balloon_high) - return true; - else - return false; + return balloon_stats.target_pages - balloon_stats.current_pages; } -static enum bp_state reserve_additional_memory(long credit) +static bool balloon_is_inflated(void) { - balloon_stats.target_pages = balloon_stats.current_pages; - return BP_DONE; + return balloon_stats.balloon_low || balloon_stats.balloon_high; } -#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */ static enum bp_state increase_reservation(unsigned long nr_pages) { int rc; - unsigned long pfn, i; + unsigned long i; struct page *page; struct xen_memory_reservation reservation = { .address_bits = 0, - .extent_order = 0, + .extent_order = EXTENT_ORDER, .domid = DOMID_SELF }; -#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG - if (!balloon_stats.balloon_low && !balloon_stats.balloon_high) { - nr_pages = min(nr_pages, balloon_stats.balloon_hotplug); - balloon_stats.hotplug_pages += nr_pages; - balloon_stats.balloon_hotplug -= nr_pages; - return BP_DONE; - } -#endif - if (nr_pages > ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); @@ -352,7 +413,11 @@ static enum bp_state increase_reservation(unsigned long nr_pages) nr_pages = i; break; } - frame_list[i] = page_to_pfn(page); + + /* XENMEM_populate_physmap requires a PFN based on Xen + * granularity. + */ + frame_list[i] = page_to_xen_pfn(page); page = balloon_next_page(page); } @@ -366,10 +431,16 @@ static enum bp_state increase_reservation(unsigned long nr_pages) page = balloon_retrieve(false); BUG_ON(page == NULL); - pfn = page_to_pfn(page); - #ifdef CONFIG_XEN_HAVE_PVMMU + /* + * We don't support PV MMU when Linux and Xen is using + * different page granularity. + */ + BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE); + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + unsigned long pfn = page_to_pfn(page); + set_phys_to_machine(pfn, frame_list[i]); /* Link back into the page tables if not highmem. */ @@ -396,23 +467,15 @@ static enum bp_state increase_reservation(unsigned long nr_pages) static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) { enum bp_state state = BP_DONE; - unsigned long pfn, i; - struct page *page; + unsigned long i; + struct page *page, *tmp; int ret; struct xen_memory_reservation reservation = { .address_bits = 0, - .extent_order = 0, + .extent_order = EXTENT_ORDER, .domid = DOMID_SELF }; - -#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG - if (balloon_stats.hotplug_pages) { - nr_pages = min(nr_pages, balloon_stats.hotplug_pages); - balloon_stats.hotplug_pages -= nr_pages; - balloon_stats.balloon_hotplug += nr_pages; - return BP_DONE; - } -#endif + LIST_HEAD(pages); if (nr_pages > ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); @@ -425,8 +488,7 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) break; } scrub_page(page); - - frame_list[i] = page_to_pfn(page); + list_add(&page->lru, &pages); } /* @@ -438,14 +500,25 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) */ kmap_flush_unused(); - /* Update direct mapping, invalidate P2M, and add to balloon. */ - for (i = 0; i < nr_pages; i++) { - pfn = frame_list[i]; - frame_list[i] = pfn_to_gfn(pfn); - page = pfn_to_page(pfn); + /* + * Setup the frame, update direct mapping, invalidate P2M, + * and add to balloon. + */ + i = 0; + list_for_each_entry_safe(page, tmp, &pages, lru) { + /* XENMEM_decrease_reservation requires a GFN */ + frame_list[i++] = xen_page_to_gfn(page); #ifdef CONFIG_XEN_HAVE_PVMMU + /* + * We don't support PV MMU when Linux and Xen is using + * different page granularity. + */ + BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE); + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + unsigned long pfn = page_to_pfn(page); + if (!PageHighMem(page)) { ret = HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), @@ -455,6 +528,7 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } #endif + list_del(&page->lru); balloon_append(page); } @@ -492,7 +566,7 @@ static void balloon_process(struct work_struct *work) if (balloon_is_inflated()) state = increase_reservation(credit); else - state = reserve_additional_memory(credit); + state = reserve_additional_memory(); } if (credit < 0) @@ -520,41 +594,71 @@ void balloon_set_new_target(unsigned long target) } EXPORT_SYMBOL_GPL(balloon_set_new_target); +static int add_ballooned_pages(int nr_pages) +{ + enum bp_state st; + + if (xen_hotplug_unpopulated) { + st = reserve_additional_memory(); + if (st != BP_ECANCELED) { + mutex_unlock(&balloon_mutex); + wait_event(balloon_wq, + !list_empty(&ballooned_pages)); + mutex_lock(&balloon_mutex); + return 0; + } + } + + st = decrease_reservation(nr_pages, GFP_USER); + if (st != BP_DONE) + return -ENOMEM; + + return 0; +} + /** * alloc_xenballooned_pages - get pages that have been ballooned out * @nr_pages: Number of pages to get * @pages: pages returned - * @highmem: allow highmem pages * @return 0 on success, error otherwise */ -int alloc_xenballooned_pages(int nr_pages, struct page **pages, bool highmem) +int alloc_xenballooned_pages(int nr_pages, struct page **pages) { int pgno = 0; struct page *page; + int ret; + mutex_lock(&balloon_mutex); + + balloon_stats.target_unpopulated += nr_pages; + while (pgno < nr_pages) { - page = balloon_retrieve(highmem); - if (page && (highmem || !PageHighMem(page))) { + page = balloon_retrieve(true); + if (page) { pages[pgno++] = page; +#ifdef CONFIG_XEN_HAVE_PVMMU + /* + * We don't support PV MMU when Linux and Xen is using + * different page granularity. + */ + BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE); + + ret = xen_alloc_p2m_entry(page_to_pfn(page)); + if (ret < 0) + goto out_undo; +#endif } else { - enum bp_state st; - if (page) - balloon_append(page); - st = decrease_reservation(nr_pages - pgno, - highmem ? GFP_HIGHUSER : GFP_USER); - if (st != BP_DONE) + ret = add_ballooned_pages(nr_pages - pgno); + if (ret < 0) goto out_undo; } } mutex_unlock(&balloon_mutex); return 0; out_undo: - while (pgno) - balloon_append(pages[--pgno]); - /* Free the memory back to the kernel soon */ - schedule_delayed_work(&balloon_worker, 0); mutex_unlock(&balloon_mutex); - return -ENOMEM; + free_xenballooned_pages(pgno, pages); + return ret; } EXPORT_SYMBOL(alloc_xenballooned_pages); @@ -574,6 +678,8 @@ void free_xenballooned_pages(int nr_pages, struct page **pages) balloon_append(pages[i]); } + balloon_stats.target_unpopulated -= nr_pages; + /* The balloon may be too large now. Shrink it if needed. */ if (current_credit()) schedule_delayed_work(&balloon_worker, 0); @@ -602,6 +708,8 @@ static void __init balloon_add_region(unsigned long start_pfn, don't subtract from it. */ __balloon_append(page); } + + balloon_stats.total_pages += extra_pfn_end - start_pfn; } static int __init balloon_init(void) @@ -619,6 +727,7 @@ static int __init balloon_init(void) balloon_stats.target_pages = balloon_stats.current_pages; balloon_stats.balloon_low = 0; balloon_stats.balloon_high = 0; + balloon_stats.total_pages = balloon_stats.current_pages; balloon_stats.schedule_delay = 1; balloon_stats.max_schedule_delay = 32; @@ -626,11 +735,9 @@ static int __init balloon_init(void) balloon_stats.max_retry_count = RETRY_UNLIMITED; #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG - balloon_stats.hotplug_pages = 0; - balloon_stats.balloon_hotplug = 0; - set_online_page_callback(&xen_online_page); register_memory_notifier(&xen_memory_nb); + register_sysctl_table(xen_root); #endif /* diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c index 8ae2fc90e1ea..4da69dbf7dca 100644 --- a/drivers/xen/biomerge.c +++ b/drivers/xen/biomerge.c @@ -6,10 +6,18 @@ bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, const struct bio_vec *vec2) { +#if XEN_PAGE_SIZE == PAGE_SIZE unsigned long bfn1 = pfn_to_bfn(page_to_pfn(vec1->bv_page)); unsigned long bfn2 = pfn_to_bfn(page_to_pfn(vec2->bv_page)); return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) && ((bfn1 == bfn2) || ((bfn1+1) == bfn2)); +#else + /* + * XXX: Add support for merging bio_vec when using different page + * size in Xen and Linux. + */ + return 0; +#endif } EXPORT_SYMBOL(xen_biovec_phys_mergeable); diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index cc6513a176b0..5676aefdf2bc 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -11,15 +11,20 @@ static void enable_hotplug_cpu(int cpu) { if (!cpu_present(cpu)) - arch_register_cpu(cpu); + xen_arch_register_cpu(cpu); set_cpu_present(cpu, true); } static void disable_hotplug_cpu(int cpu) { + if (cpu_online(cpu)) { + lock_device_hotplug(); + device_offline(get_cpu_device(cpu)); + unlock_device_hotplug(); + } if (cpu_present(cpu)) - arch_unregister_cpu(cpu); + xen_arch_unregister_cpu(cpu); set_cpu_present(cpu, false); } @@ -55,7 +60,6 @@ static void vcpu_hotplug(unsigned int cpu) enable_hotplug_cpu(cpu); break; case 0: - (void)cpu_down(cpu); disable_hotplug_cpu(cpu); break; default: @@ -102,7 +106,11 @@ static int __init setup_vcpu_hotplug_event(void) static struct notifier_block xsn_cpu = { .notifier_call = setup_cpu_watcher }; +#ifdef CONFIG_X86 if (!xen_pv_domain()) +#else + if (!xen_domain()) +#endif return -ENODEV; register_xenstore_notifier(&xsn_cpu); diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 6cd5e65c4aff..849500e4e14d 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -40,11 +40,11 @@ #include <asm/idle.h> #include <asm/io_apic.h> #include <asm/xen/pci.h> -#include <xen/page.h> #endif #include <asm/sync_bitops.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> +#include <xen/page.h> #include <xen/xen.h> #include <xen/hvm.h> diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c index 1d4baf56c36b..e3e9e3d46d1b 100644 --- a/drivers/xen/events/events_fifo.c +++ b/drivers/xen/events/events_fifo.c @@ -54,7 +54,7 @@ #include "events_internal.h" -#define EVENT_WORDS_PER_PAGE (PAGE_SIZE / sizeof(event_word_t)) +#define EVENT_WORDS_PER_PAGE (XEN_PAGE_SIZE / sizeof(event_word_t)) #define MAX_EVENT_ARRAY_PAGES (EVTCHN_FIFO_NR_CHANNELS / EVENT_WORDS_PER_PAGE) struct evtchn_fifo_queue { diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 62f591f8763c..c49f79ed58c5 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -642,7 +642,7 @@ int gnttab_setup_auto_xlat_frames(phys_addr_t addr) if (xen_auto_xlat_grant_frames.count) return -EINVAL; - vaddr = xen_remap(addr, PAGE_SIZE * max_nr_gframes); + vaddr = xen_remap(addr, XEN_PAGE_SIZE * max_nr_gframes); if (vaddr == NULL) { pr_warn("Failed to ioremap gnttab share frames (addr=%pa)!\n", &addr); @@ -654,7 +654,7 @@ int gnttab_setup_auto_xlat_frames(phys_addr_t addr) return -ENOMEM; } for (i = 0; i < max_nr_gframes; i++) - pfn[i] = PFN_DOWN(addr) + i; + pfn[i] = XEN_PFN_DOWN(addr) + i; xen_auto_xlat_grant_frames.vaddr = vaddr; xen_auto_xlat_grant_frames.pfn = pfn; @@ -687,7 +687,7 @@ int gnttab_alloc_pages(int nr_pages, struct page **pages) int i; int ret; - ret = alloc_xenballooned_pages(nr_pages, pages, false); + ret = alloc_xenballooned_pages(nr_pages, pages); if (ret < 0) return ret; @@ -776,6 +776,54 @@ void gnttab_batch_copy(struct gnttab_copy *batch, unsigned count) } EXPORT_SYMBOL_GPL(gnttab_batch_copy); +void gnttab_foreach_grant_in_range(struct page *page, + unsigned int offset, + unsigned int len, + xen_grant_fn_t fn, + void *data) +{ + unsigned int goffset; + unsigned int glen; + unsigned long xen_pfn; + + len = min_t(unsigned int, PAGE_SIZE - offset, len); + goffset = xen_offset_in_page(offset); + + xen_pfn = page_to_xen_pfn(page) + XEN_PFN_DOWN(offset); + + while (len) { + glen = min_t(unsigned int, XEN_PAGE_SIZE - goffset, len); + fn(pfn_to_gfn(xen_pfn), goffset, glen, data); + + goffset = 0; + xen_pfn++; + len -= glen; + } +} +EXPORT_SYMBOL_GPL(gnttab_foreach_grant_in_range); + +void gnttab_foreach_grant(struct page **pages, + unsigned int nr_grefs, + xen_grant_fn_t fn, + void *data) +{ + unsigned int goffset = 0; + unsigned long xen_pfn = 0; + unsigned int i; + + for (i = 0; i < nr_grefs; i++) { + if ((i % XEN_PFN_PER_PAGE) == 0) { + xen_pfn = page_to_xen_pfn(pages[i / XEN_PFN_PER_PAGE]); + goffset = 0; + } + + fn(pfn_to_gfn(xen_pfn), goffset, XEN_PAGE_SIZE, data); + + goffset += XEN_PAGE_SIZE; + xen_pfn++; + } +} + int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count) @@ -978,7 +1026,7 @@ static void gnttab_request_version(void) { /* Only version 1 is used, which will always be available. */ grant_table_version = 1; - grefs_per_grant_frame = PAGE_SIZE / sizeof(struct grant_entry_v1); + grefs_per_grant_frame = XEN_PAGE_SIZE / sizeof(struct grant_entry_v1); gnttab_interface = &gnttab_v1_ops; pr_info("Grant tables using version %d layout\n", grant_table_version); diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 5e9adac928e6..df2e6f783318 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -401,7 +401,7 @@ static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs) if (pages == NULL) return -ENOMEM; - rc = alloc_xenballooned_pages(numpgs, pages, 0); + rc = alloc_xenballooned_pages(numpgs, pages); if (rc != 0) { pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__, numpgs, rc); @@ -446,7 +446,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) return -EINVAL; } - nr_pages = m.num; + nr_pages = DIV_ROUND_UP(m.num, XEN_PFN_PER_PAGE); if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) return -EINVAL; @@ -494,7 +494,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) goto out_unlock; } if (xen_feature(XENFEAT_auto_translated_physmap)) { - ret = alloc_empty_pages(vma, m.num); + ret = alloc_empty_pages(vma, nr_pages); if (ret < 0) goto out_unlock; } else @@ -518,6 +518,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) state.global_error = 0; state.version = version; + BUILD_BUG_ON(((PAGE_SIZE / sizeof(xen_pfn_t)) % XEN_PFN_PER_PAGE) != 0); /* mmap_batch_fn guarantees ret == 0 */ BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t), &pagelist, mmap_batch_fn, &state)); @@ -582,12 +583,13 @@ static void privcmd_close(struct vm_area_struct *vma) { struct page **pages = vma->vm_private_data; int numpgs = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + int numgfns = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT; int rc; if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages) return; - rc = xen_unmap_domain_gfn_range(vma, numpgs, pages); + rc = xen_unmap_domain_gfn_range(vma, numgfns, pages); if (rc == 0) free_xenballooned_pages(numpgs, pages); else diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 79bc4933b13e..7399782c0998 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -76,27 +76,27 @@ static unsigned long xen_io_tlb_nslabs; static u64 start_dma_addr; /* - * Both of these functions should avoid PFN_PHYS because phys_addr_t + * Both of these functions should avoid XEN_PFN_PHYS because phys_addr_t * can be 32bit when dma_addr_t is 64bit leading to a loss in * information if the shift is done before casting to 64bit. */ static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr) { - unsigned long bfn = pfn_to_bfn(PFN_DOWN(paddr)); - dma_addr_t dma = (dma_addr_t)bfn << PAGE_SHIFT; + unsigned long bfn = pfn_to_bfn(XEN_PFN_DOWN(paddr)); + dma_addr_t dma = (dma_addr_t)bfn << XEN_PAGE_SHIFT; - dma |= paddr & ~PAGE_MASK; + dma |= paddr & ~XEN_PAGE_MASK; return dma; } static inline phys_addr_t xen_bus_to_phys(dma_addr_t baddr) { - unsigned long pfn = bfn_to_pfn(PFN_DOWN(baddr)); - dma_addr_t dma = (dma_addr_t)pfn << PAGE_SHIFT; + unsigned long xen_pfn = bfn_to_pfn(XEN_PFN_DOWN(baddr)); + dma_addr_t dma = (dma_addr_t)xen_pfn << XEN_PAGE_SHIFT; phys_addr_t paddr = dma; - paddr |= baddr & ~PAGE_MASK; + paddr |= baddr & ~XEN_PAGE_MASK; return paddr; } @@ -106,7 +106,7 @@ static inline dma_addr_t xen_virt_to_bus(void *address) return xen_phys_to_bus(virt_to_phys(address)); } -static int check_pages_physically_contiguous(unsigned long pfn, +static int check_pages_physically_contiguous(unsigned long xen_pfn, unsigned int offset, size_t length) { @@ -114,11 +114,11 @@ static int check_pages_physically_contiguous(unsigned long pfn, int i; int nr_pages; - next_bfn = pfn_to_bfn(pfn); - nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT; + next_bfn = pfn_to_bfn(xen_pfn); + nr_pages = (offset + length + XEN_PAGE_SIZE-1) >> XEN_PAGE_SHIFT; for (i = 1; i < nr_pages; i++) { - if (pfn_to_bfn(++pfn) != ++next_bfn) + if (pfn_to_bfn(++xen_pfn) != ++next_bfn) return 0; } return 1; @@ -126,28 +126,27 @@ static int check_pages_physically_contiguous(unsigned long pfn, static inline int range_straddles_page_boundary(phys_addr_t p, size_t size) { - unsigned long pfn = PFN_DOWN(p); - unsigned int offset = p & ~PAGE_MASK; + unsigned long xen_pfn = XEN_PFN_DOWN(p); + unsigned int offset = p & ~XEN_PAGE_MASK; - if (offset + size <= PAGE_SIZE) + if (offset + size <= XEN_PAGE_SIZE) return 0; - if (check_pages_physically_contiguous(pfn, offset, size)) + if (check_pages_physically_contiguous(xen_pfn, offset, size)) return 0; return 1; } static int is_xen_swiotlb_buffer(dma_addr_t dma_addr) { - unsigned long bfn = PFN_DOWN(dma_addr); - unsigned long pfn = bfn_to_local_pfn(bfn); - phys_addr_t paddr; + unsigned long bfn = XEN_PFN_DOWN(dma_addr); + unsigned long xen_pfn = bfn_to_local_pfn(bfn); + phys_addr_t paddr = XEN_PFN_PHYS(xen_pfn); /* If the address is outside our domain, it CAN * have the same virtual address as another address * in our domain. Therefore _only_ check address within our domain. */ - if (pfn_valid(pfn)) { - paddr = PFN_PHYS(pfn); + if (pfn_valid(PFN_DOWN(paddr))) { return paddr >= virt_to_phys(xen_io_tlb_start) && paddr < virt_to_phys(xen_io_tlb_end); } @@ -392,7 +391,7 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, */ if (dma_capable(dev, dev_addr, size) && !range_straddles_page_boundary(phys, size) && - !xen_arch_need_swiotlb(dev, PFN_DOWN(phys), PFN_DOWN(dev_addr)) && + !xen_arch_need_swiotlb(dev, phys, dev_addr) && !swiotlb_force) { /* we are not interested in the dma_addr returned by * xen_dma_map_page, only in the potential cache flushes executed @@ -551,7 +550,7 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, dma_addr_t dev_addr = xen_phys_to_bus(paddr); if (swiotlb_force || - xen_arch_need_swiotlb(hwdev, PFN_DOWN(paddr), PFN_DOWN(dev_addr)) || + xen_arch_need_swiotlb(hwdev, paddr, dev_addr) || !dma_capable(hwdev, dev_addr, sg->length) || range_straddles_page_boundary(paddr, sg->length)) { phys_addr_t map = swiotlb_tbl_map_single(hwdev, diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 2ba09c1195c8..056da6ee1a35 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -49,6 +49,10 @@ #include "xenbus_probe.h" +#define XENBUS_PAGES(_grants) (DIV_ROUND_UP(_grants, XEN_PFN_PER_PAGE)) + +#define XENBUS_MAX_RING_PAGES (XENBUS_PAGES(XENBUS_MAX_RING_GRANTS)) + struct xenbus_map_node { struct list_head next; union { @@ -57,10 +61,11 @@ struct xenbus_map_node { } pv; struct { struct page *pages[XENBUS_MAX_RING_PAGES]; + unsigned long addrs[XENBUS_MAX_RING_GRANTS]; void *addr; } hvm; }; - grant_handle_t handles[XENBUS_MAX_RING_PAGES]; + grant_handle_t handles[XENBUS_MAX_RING_GRANTS]; unsigned int nr_handles; }; @@ -388,7 +393,7 @@ int xenbus_grant_ring(struct xenbus_device *dev, void *vaddr, } grefs[i] = err; - vaddr = vaddr + PAGE_SIZE; + vaddr = vaddr + XEN_PAGE_SIZE; } return 0; @@ -479,12 +484,12 @@ static int __xenbus_map_ring(struct xenbus_device *dev, unsigned int flags, bool *leaked) { - struct gnttab_map_grant_ref map[XENBUS_MAX_RING_PAGES]; - struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_PAGES]; + struct gnttab_map_grant_ref map[XENBUS_MAX_RING_GRANTS]; + struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS]; int i, j; int err = GNTST_okay; - if (nr_grefs > XENBUS_MAX_RING_PAGES) + if (nr_grefs > XENBUS_MAX_RING_GRANTS) return -EINVAL; for (i = 0; i < nr_grefs; i++) { @@ -540,22 +545,22 @@ static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev, { struct xenbus_map_node *node; struct vm_struct *area; - pte_t *ptes[XENBUS_MAX_RING_PAGES]; - phys_addr_t phys_addrs[XENBUS_MAX_RING_PAGES]; + pte_t *ptes[XENBUS_MAX_RING_GRANTS]; + phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS]; int err = GNTST_okay; int i; bool leaked; *vaddr = NULL; - if (nr_grefs > XENBUS_MAX_RING_PAGES) + if (nr_grefs > XENBUS_MAX_RING_GRANTS) return -EINVAL; node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; - area = alloc_vm_area(PAGE_SIZE * nr_grefs, ptes); + area = alloc_vm_area(XEN_PAGE_SIZE * nr_grefs, ptes); if (!area) { kfree(node); return -ENOMEM; @@ -591,21 +596,44 @@ failed: return err; } +struct map_ring_valloc_hvm +{ + unsigned int idx; + + /* Why do we need two arrays? See comment of __xenbus_map_ring */ + phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS]; + unsigned long addrs[XENBUS_MAX_RING_GRANTS]; +}; + +static void xenbus_map_ring_setup_grant_hvm(unsigned long gfn, + unsigned int goffset, + unsigned int len, + void *data) +{ + struct map_ring_valloc_hvm *info = data; + unsigned long vaddr = (unsigned long)gfn_to_virt(gfn); + + info->phys_addrs[info->idx] = vaddr; + info->addrs[info->idx] = vaddr; + + info->idx++; +} + static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev, grant_ref_t *gnt_ref, unsigned int nr_grefs, void **vaddr) { struct xenbus_map_node *node; - int i; int err; void *addr; bool leaked = false; - /* Why do we need two arrays? See comment of __xenbus_map_ring */ - phys_addr_t phys_addrs[XENBUS_MAX_RING_PAGES]; - unsigned long addrs[XENBUS_MAX_RING_PAGES]; + struct map_ring_valloc_hvm info = { + .idx = 0, + }; + unsigned int nr_pages = XENBUS_PAGES(nr_grefs); - if (nr_grefs > XENBUS_MAX_RING_PAGES) + if (nr_grefs > XENBUS_MAX_RING_GRANTS) return -EINVAL; *vaddr = NULL; @@ -614,25 +642,22 @@ static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev, if (!node) return -ENOMEM; - err = alloc_xenballooned_pages(nr_grefs, node->hvm.pages, - false /* lowmem */); + err = alloc_xenballooned_pages(nr_pages, node->hvm.pages); if (err) goto out_err; - for (i = 0; i < nr_grefs; i++) { - unsigned long pfn = page_to_pfn(node->hvm.pages[i]); - phys_addrs[i] = (unsigned long)pfn_to_kaddr(pfn); - addrs[i] = (unsigned long)pfn_to_kaddr(pfn); - } + gnttab_foreach_grant(node->hvm.pages, nr_grefs, + xenbus_map_ring_setup_grant_hvm, + &info); err = __xenbus_map_ring(dev, gnt_ref, nr_grefs, node->handles, - phys_addrs, GNTMAP_host_map, &leaked); + info.phys_addrs, GNTMAP_host_map, &leaked); node->nr_handles = nr_grefs; if (err) goto out_free_ballooned_pages; - addr = vmap(node->hvm.pages, nr_grefs, VM_MAP | VM_IOREMAP, + addr = vmap(node->hvm.pages, nr_pages, VM_MAP | VM_IOREMAP, PAGE_KERNEL); if (!addr) { err = -ENOMEM; @@ -650,14 +675,13 @@ static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev, out_xenbus_unmap_ring: if (!leaked) - xenbus_unmap_ring(dev, node->handles, node->nr_handles, - addrs); + xenbus_unmap_ring(dev, node->handles, nr_grefs, info.addrs); else pr_alert("leaking %p size %u page(s)", - addr, nr_grefs); + addr, nr_pages); out_free_ballooned_pages: if (!leaked) - free_xenballooned_pages(nr_grefs, node->hvm.pages); + free_xenballooned_pages(nr_pages, node->hvm.pages); out_err: kfree(node); return err; @@ -687,10 +711,10 @@ int xenbus_map_ring(struct xenbus_device *dev, grant_ref_t *gnt_refs, unsigned int nr_grefs, grant_handle_t *handles, unsigned long *vaddrs, bool *leaked) { - phys_addr_t phys_addrs[XENBUS_MAX_RING_PAGES]; + phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS]; int i; - if (nr_grefs > XENBUS_MAX_RING_PAGES) + if (nr_grefs > XENBUS_MAX_RING_GRANTS) return -EINVAL; for (i = 0; i < nr_grefs; i++) @@ -723,7 +747,7 @@ EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree); static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr) { struct xenbus_map_node *node; - struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_PAGES]; + struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS]; unsigned int level; int i; bool leaked = false; @@ -750,7 +774,7 @@ static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr) unsigned long addr; memset(&unmap[i], 0, sizeof(unmap[i])); - addr = (unsigned long)vaddr + (PAGE_SIZE * i); + addr = (unsigned long)vaddr + (XEN_PAGE_SIZE * i); unmap[i].host_addr = arbitrary_virt_to_machine( lookup_address(addr, &level)).maddr; unmap[i].dev_bus_addr = 0; @@ -783,13 +807,33 @@ static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr) return err; } +struct unmap_ring_vfree_hvm +{ + unsigned int idx; + unsigned long addrs[XENBUS_MAX_RING_GRANTS]; +}; + +static void xenbus_unmap_ring_setup_grant_hvm(unsigned long gfn, + unsigned int goffset, + unsigned int len, + void *data) +{ + struct unmap_ring_vfree_hvm *info = data; + + info->addrs[info->idx] = (unsigned long)gfn_to_virt(gfn); + + info->idx++; +} + static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr) { int rv; struct xenbus_map_node *node; void *addr; - unsigned long addrs[XENBUS_MAX_RING_PAGES]; - int i; + struct unmap_ring_vfree_hvm info = { + .idx = 0, + }; + unsigned int nr_pages; spin_lock(&xenbus_valloc_lock); list_for_each_entry(node, &xenbus_valloc_pages, next) { @@ -809,18 +853,20 @@ static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr) return GNTST_bad_virt_addr; } - for (i = 0; i < node->nr_handles; i++) - addrs[i] = (unsigned long)pfn_to_kaddr(page_to_pfn(node->hvm.pages[i])); + nr_pages = XENBUS_PAGES(node->nr_handles); + + gnttab_foreach_grant(node->hvm.pages, node->nr_handles, + xenbus_unmap_ring_setup_grant_hvm, + &info); rv = xenbus_unmap_ring(dev, node->handles, node->nr_handles, - addrs); + info.addrs); if (!rv) { vunmap(vaddr); - free_xenballooned_pages(node->nr_handles, node->hvm.pages); + free_xenballooned_pages(nr_pages, node->hvm.pages); } else - WARN(1, "Leaking %p, size %u page(s)\n", vaddr, - node->nr_handles); + WARN(1, "Leaking %p, size %u page(s)\n", vaddr, nr_pages); kfree(node); return rv; @@ -841,11 +887,11 @@ int xenbus_unmap_ring(struct xenbus_device *dev, grant_handle_t *handles, unsigned int nr_handles, unsigned long *vaddrs) { - struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_PAGES]; + struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS]; int i; int err; - if (nr_handles > XENBUS_MAX_RING_PAGES) + if (nr_handles > XENBUS_MAX_RING_GRANTS) return -EINVAL; for (i = 0; i < nr_handles; i++) diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 3cbe0556de26..33a31cfef55d 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -802,7 +802,8 @@ static int __init xenbus_init(void) goto out_error; xen_store_gfn = (unsigned long)v; xen_store_interface = - xen_remap(xen_store_gfn << PAGE_SHIFT, PAGE_SIZE); + xen_remap(xen_store_gfn << XEN_PAGE_SHIFT, + XEN_PAGE_SIZE); break; default: pr_warn("Xenstore state unknown\n"); diff --git a/drivers/xen/xlate_mmu.c b/drivers/xen/xlate_mmu.c index cff23872c5a9..5063c5e796b7 100644 --- a/drivers/xen/xlate_mmu.c +++ b/drivers/xen/xlate_mmu.c @@ -38,31 +38,28 @@ #include <xen/interface/xen.h> #include <xen/interface/memory.h> -/* map fgfn of domid to lpfn in the current domain */ -static int map_foreign_page(unsigned long lpfn, unsigned long fgfn, - unsigned int domid) -{ - int rc; - struct xen_add_to_physmap_range xatp = { - .domid = DOMID_SELF, - .foreign_domid = domid, - .size = 1, - .space = XENMAPSPACE_gmfn_foreign, - }; - xen_ulong_t idx = fgfn; - xen_pfn_t gpfn = lpfn; - int err = 0; +typedef void (*xen_gfn_fn_t)(unsigned long gfn, void *data); - set_xen_guest_handle(xatp.idxs, &idx); - set_xen_guest_handle(xatp.gpfns, &gpfn); - set_xen_guest_handle(xatp.errs, &err); +/* Break down the pages in 4KB chunk and call fn for each gfn */ +static void xen_for_each_gfn(struct page **pages, unsigned nr_gfn, + xen_gfn_fn_t fn, void *data) +{ + unsigned long xen_pfn = 0; + struct page *page; + int i; - rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp); - return rc < 0 ? rc : err; + for (i = 0; i < nr_gfn; i++) { + if ((i % XEN_PFN_PER_PAGE) == 0) { + page = pages[i / XEN_PFN_PER_PAGE]; + xen_pfn = page_to_xen_pfn(page); + } + fn(pfn_to_gfn(xen_pfn++), data); + } } struct remap_data { xen_pfn_t *fgfn; /* foreign domain's gfn */ + int nr_fgfn; /* Number of foreign gfn left to map */ pgprot_t prot; domid_t domid; struct vm_area_struct *vma; @@ -71,24 +68,71 @@ struct remap_data { struct xen_remap_gfn_info *info; int *err_ptr; int mapped; + + /* Hypercall parameters */ + int h_errs[XEN_PFN_PER_PAGE]; + xen_ulong_t h_idxs[XEN_PFN_PER_PAGE]; + xen_pfn_t h_gpfns[XEN_PFN_PER_PAGE]; + + int h_iter; /* Iterator */ }; +static void setup_hparams(unsigned long gfn, void *data) +{ + struct remap_data *info = data; + + info->h_idxs[info->h_iter] = *info->fgfn; + info->h_gpfns[info->h_iter] = gfn; + info->h_errs[info->h_iter] = 0; + + info->h_iter++; + info->fgfn++; +} + static int remap_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, void *data) { struct remap_data *info = data; struct page *page = info->pages[info->index++]; - unsigned long pfn = page_to_pfn(page); - pte_t pte = pte_mkspecial(pfn_pte(pfn, info->prot)); - int rc; + pte_t pte = pte_mkspecial(pfn_pte(page_to_pfn(page), info->prot)); + int rc, nr_gfn; + uint32_t i; + struct xen_add_to_physmap_range xatp = { + .domid = DOMID_SELF, + .foreign_domid = info->domid, + .space = XENMAPSPACE_gmfn_foreign, + }; - rc = map_foreign_page(pfn, *info->fgfn, info->domid); - *info->err_ptr++ = rc; - if (!rc) { - set_pte_at(info->vma->vm_mm, addr, ptep, pte); - info->mapped++; + nr_gfn = min_t(typeof(info->nr_fgfn), XEN_PFN_PER_PAGE, info->nr_fgfn); + info->nr_fgfn -= nr_gfn; + + info->h_iter = 0; + xen_for_each_gfn(&page, nr_gfn, setup_hparams, info); + BUG_ON(info->h_iter != nr_gfn); + + set_xen_guest_handle(xatp.idxs, info->h_idxs); + set_xen_guest_handle(xatp.gpfns, info->h_gpfns); + set_xen_guest_handle(xatp.errs, info->h_errs); + xatp.size = nr_gfn; + + rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp); + + /* info->err_ptr expect to have one error status per Xen PFN */ + for (i = 0; i < nr_gfn; i++) { + int err = (rc < 0) ? rc : info->h_errs[i]; + + *(info->err_ptr++) = err; + if (!err) + info->mapped++; } - info->fgfn++; + + /* + * Note: The hypercall will return 0 in most of the case if even if + * all the fgmfn are not mapped. We still have to update the pte + * as the userspace may decide to continue. + */ + if (!rc) + set_pte_at(info->vma->vm_mm, addr, ptep, pte); return 0; } @@ -102,13 +146,14 @@ int xen_xlate_remap_gfn_array(struct vm_area_struct *vma, { int err; struct remap_data data; - unsigned long range = nr << PAGE_SHIFT; + unsigned long range = DIV_ROUND_UP(nr, XEN_PFN_PER_PAGE) << PAGE_SHIFT; /* Kept here for the purpose of making sure code doesn't break x86 PVOPS */ BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); data.fgfn = gfn; + data.nr_fgfn = nr; data.prot = prot; data.domid = domid; data.vma = vma; @@ -123,21 +168,20 @@ int xen_xlate_remap_gfn_array(struct vm_area_struct *vma, } EXPORT_SYMBOL_GPL(xen_xlate_remap_gfn_array); -int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma, - int nr, struct page **pages) +static void unmap_gfn(unsigned long gfn, void *data) { - int i; + struct xen_remove_from_physmap xrp; - for (i = 0; i < nr; i++) { - struct xen_remove_from_physmap xrp; - unsigned long pfn; + xrp.domid = DOMID_SELF; + xrp.gpfn = gfn; + (void)HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp); +} - pfn = page_to_pfn(pages[i]); +int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma, + int nr, struct page **pages) +{ + xen_for_each_gfn(pages, nr, unmap_gfn, NULL); - xrp.domid = DOMID_SELF; - xrp.gpfn = pfn; - (void)HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp); - } return 0; } EXPORT_SYMBOL_GPL(xen_xlate_unmap_gfn_range); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 8f60e899b33c..2ea574ff9714 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -11,6 +11,7 @@ struct zone; struct pglist_data; struct mem_section; struct memory_block; +struct resource; #ifdef CONFIG_MEMORY_HOTPLUG @@ -266,6 +267,7 @@ static inline void remove_memory(int nid, u64 start, u64 size) {} extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, void *arg, int (*func)(struct memory_block *, void *)); extern int add_memory(int nid, u64 start, u64 size); +extern int add_memory_resource(int nid, struct resource *resource); extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, bool for_device); extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device); diff --git a/include/uapi/xen/gntalloc.h b/include/uapi/xen/gntalloc.h index 76bd58065f4f..48d2790ef928 100644 --- a/include/uapi/xen/gntalloc.h +++ b/include/uapi/xen/gntalloc.h @@ -11,6 +11,8 @@ #ifndef __LINUX_PUBLIC_GNTALLOC_H__ #define __LINUX_PUBLIC_GNTALLOC_H__ +#include <linux/types.h> + /* * Allocates a new page and creates a new grant reference. */ @@ -19,17 +21,17 @@ _IOC(_IOC_NONE, 'G', 5, sizeof(struct ioctl_gntalloc_alloc_gref)) struct ioctl_gntalloc_alloc_gref { /* IN parameters */ /* The ID of the domain to be given access to the grants. */ - uint16_t domid; + __u16 domid; /* Flags for this mapping */ - uint16_t flags; + __u16 flags; /* Number of pages to map */ - uint32_t count; + __u32 count; /* OUT parameters */ /* The offset to be used on a subsequent call to mmap(). */ - uint64_t index; + __u64 index; /* The grant references of the newly created grant, one per page */ /* Variable size, depending on count */ - uint32_t gref_ids[1]; + __u32 gref_ids[1]; }; #define GNTALLOC_FLAG_WRITABLE 1 @@ -43,9 +45,9 @@ _IOC(_IOC_NONE, 'G', 6, sizeof(struct ioctl_gntalloc_dealloc_gref)) struct ioctl_gntalloc_dealloc_gref { /* IN parameters */ /* The offset returned in the map operation */ - uint64_t index; + __u64 index; /* Number of references to unmap */ - uint32_t count; + __u32 count; }; /* @@ -67,11 +69,11 @@ struct ioctl_gntalloc_unmap_notify { * be cleared. Otherwise, it can be any byte in the page whose * notification we are adjusting. */ - uint64_t index; + __u64 index; /* Action(s) to take on unmap */ - uint32_t action; + __u32 action; /* Event channel to notify */ - uint32_t event_channel_port; + __u32 event_channel_port; }; /* Clear (set to zero) the byte specified by index */ diff --git a/include/uapi/xen/gntdev.h b/include/uapi/xen/gntdev.h index 5304bd3c84c5..aa7610a9b867 100644 --- a/include/uapi/xen/gntdev.h +++ b/include/uapi/xen/gntdev.h @@ -33,11 +33,13 @@ #ifndef __LINUX_PUBLIC_GNTDEV_H__ #define __LINUX_PUBLIC_GNTDEV_H__ +#include <linux/types.h> + struct ioctl_gntdev_grant_ref { /* The domain ID of the grant to be mapped. */ - uint32_t domid; + __u32 domid; /* The grant reference of the grant to be mapped. */ - uint32_t ref; + __u32 ref; }; /* @@ -50,11 +52,11 @@ _IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref)) struct ioctl_gntdev_map_grant_ref { /* IN parameters */ /* The number of grants to be mapped. */ - uint32_t count; - uint32_t pad; + __u32 count; + __u32 pad; /* OUT parameters */ /* The offset to be used on a subsequent call to mmap(). */ - uint64_t index; + __u64 index; /* Variable IN parameter. */ /* Array of grant references, of size @count. */ struct ioctl_gntdev_grant_ref refs[1]; @@ -70,10 +72,10 @@ _IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) struct ioctl_gntdev_unmap_grant_ref { /* IN parameters */ /* The offset was returned by the corresponding map operation. */ - uint64_t index; + __u64 index; /* The number of pages to be unmapped. */ - uint32_t count; - uint32_t pad; + __u32 count; + __u32 pad; }; /* @@ -93,13 +95,13 @@ _IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr)) struct ioctl_gntdev_get_offset_for_vaddr { /* IN parameters */ /* The virtual address of the first mapped page in a range. */ - uint64_t vaddr; + __u64 vaddr; /* OUT parameters */ /* The offset that was used in the initial mmap() operation. */ - uint64_t offset; + __u64 offset; /* The number of pages mapped in the VM area that begins at @vaddr. */ - uint32_t count; - uint32_t pad; + __u32 count; + __u32 pad; }; /* @@ -113,7 +115,7 @@ _IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants)) struct ioctl_gntdev_set_max_grants { /* IN parameter */ /* The maximum number of grants that may be mapped at once. */ - uint32_t count; + __u32 count; }; /* @@ -135,11 +137,11 @@ struct ioctl_gntdev_unmap_notify { * be cleared. Otherwise, it can be any byte in the page whose * notification we are adjusting. */ - uint64_t index; + __u64 index; /* Action(s) to take on unmap */ - uint32_t action; + __u32 action; /* Event channel to notify */ - uint32_t event_channel_port; + __u32 event_channel_port; }; /* Clear (set to zero) the byte specified by index */ diff --git a/include/xen/balloon.h b/include/xen/balloon.h index a4c1c6a93691..d1767dfb0d95 100644 --- a/include/xen/balloon.h +++ b/include/xen/balloon.h @@ -8,30 +8,24 @@ struct balloon_stats { /* We aim for 'current allocation' == 'target allocation'. */ unsigned long current_pages; unsigned long target_pages; + unsigned long target_unpopulated; /* Number of pages in high- and low-memory balloons. */ unsigned long balloon_low; unsigned long balloon_high; + unsigned long total_pages; unsigned long schedule_delay; unsigned long max_schedule_delay; unsigned long retry_count; unsigned long max_retry_count; -#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG - unsigned long hotplug_pages; - unsigned long balloon_hotplug; -#endif }; extern struct balloon_stats balloon_stats; void balloon_set_new_target(unsigned long target); -int alloc_xenballooned_pages(int nr_pages, struct page **pages, - bool highmem); +int alloc_xenballooned_pages(int nr_pages, struct page **pages); void free_xenballooned_pages(int nr_pages, struct page **pages); -struct page *get_balloon_scratch_page(void); -void put_balloon_scratch_page(void); - struct device; #ifdef CONFIG_XEN_SELFBALLOONING extern int register_xen_selfballooning(struct device *dev); diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 4478f4b4aae2..34b1379f9777 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -45,8 +45,10 @@ #include <asm/xen/hypervisor.h> #include <xen/features.h> +#include <xen/page.h> #include <linux/mm_types.h> #include <linux/page-flags.h> +#include <linux/kernel.h> #define GNTTAB_RESERVED_XENSTORE 1 @@ -129,6 +131,15 @@ void gnttab_cancel_free_callback(struct gnttab_free_callback *callback); void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, unsigned long frame, int readonly); +/* Give access to the first 4K of the page */ +static inline void gnttab_page_grant_foreign_access_ref_one( + grant_ref_t ref, domid_t domid, + struct page *page, int readonly) +{ + gnttab_grant_foreign_access_ref(ref, domid, xen_page_to_gfn(page), + readonly); +} + void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid, unsigned long pfn); @@ -224,4 +235,50 @@ static inline struct xen_page_foreign *xen_page_foreign(struct page *page) #endif } +/* Split Linux page in chunk of the size of the grant and call fn + * + * Parameters of fn: + * gfn: guest frame number + * offset: offset in the grant + * len: length of the data in the grant. + * data: internal information + */ +typedef void (*xen_grant_fn_t)(unsigned long gfn, unsigned int offset, + unsigned int len, void *data); + +void gnttab_foreach_grant_in_range(struct page *page, + unsigned int offset, + unsigned int len, + xen_grant_fn_t fn, + void *data); + +/* Helper to get to call fn only on the first "grant chunk" */ +static inline void gnttab_for_one_grant(struct page *page, unsigned int offset, + unsigned len, xen_grant_fn_t fn, + void *data) +{ + /* The first request is limited to the size of one grant */ + len = min_t(unsigned int, XEN_PAGE_SIZE - (offset & ~XEN_PAGE_MASK), + len); + + gnttab_foreach_grant_in_range(page, offset, len, fn, data); +} + +/* Get @nr_grefs grants from an array of page and call fn for each grant */ +void gnttab_foreach_grant(struct page **pages, + unsigned int nr_grefs, + xen_grant_fn_t fn, + void *data); + +/* Get the number of grant in a specified region + * + * start: Offset from the beginning of the first page + * len: total length of data (can cross multiple page) + */ +static inline unsigned int gnttab_count_grant(unsigned int start, + unsigned int len) +{ + return XEN_PFN_UP(xen_offset_in_page(start) + len); +} + #endif /* __ASM_GNTTAB_H__ */ diff --git a/include/xen/page.h b/include/xen/page.h index 1daae485e336..96294ac93755 100644 --- a/include/xen/page.h +++ b/include/xen/page.h @@ -1,11 +1,36 @@ #ifndef _XEN_PAGE_H #define _XEN_PAGE_H +#include <asm/page.h> + +/* The hypercall interface supports only 4KB page */ +#define XEN_PAGE_SHIFT 12 +#define XEN_PAGE_SIZE (_AC(1, UL) << XEN_PAGE_SHIFT) +#define XEN_PAGE_MASK (~(XEN_PAGE_SIZE-1)) +#define xen_offset_in_page(p) ((unsigned long)(p) & ~XEN_PAGE_MASK) + +/* + * We assume that PAGE_SIZE is a multiple of XEN_PAGE_SIZE + * XXX: Add a BUILD_BUG_ON? + */ + +#define xen_pfn_to_page(xen_pfn) \ + ((pfn_to_page(((unsigned long)(xen_pfn) << XEN_PAGE_SHIFT) >> PAGE_SHIFT))) +#define page_to_xen_pfn(page) \ + (((page_to_pfn(page)) << PAGE_SHIFT) >> XEN_PAGE_SHIFT) + +#define XEN_PFN_PER_PAGE (PAGE_SIZE / XEN_PAGE_SIZE) + +#define XEN_PFN_DOWN(x) ((x) >> XEN_PAGE_SHIFT) +#define XEN_PFN_UP(x) (((x) + XEN_PAGE_SIZE-1) >> XEN_PAGE_SHIFT) +#define XEN_PFN_PHYS(x) ((phys_addr_t)(x) << XEN_PAGE_SHIFT) + #include <asm/xen/page.h> +/* Return the GFN associated to the first 4KB of the page */ static inline unsigned long xen_page_to_gfn(struct page *page) { - return pfn_to_gfn(page_to_pfn(page)); + return pfn_to_gfn(page_to_xen_pfn(page)); } struct xen_memory_region { diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index 289c0b5f08fe..32b944b7cebd 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -46,8 +46,8 @@ #include <xen/interface/io/xenbus.h> #include <xen/interface/io/xs_wire.h> -#define XENBUS_MAX_RING_PAGE_ORDER 4 -#define XENBUS_MAX_RING_PAGES (1U << XENBUS_MAX_RING_PAGE_ORDER) +#define XENBUS_MAX_RING_GRANT_ORDER 4 +#define XENBUS_MAX_RING_GRANTS (1U << XENBUS_MAX_RING_GRANT_ORDER) #define INVALID_GRANT_HANDLE (~0U) /* Register callback to watch this node. */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index aa992e2df58a..0780d118d26e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1232,23 +1232,21 @@ int zone_for_memory(int nid, u64 start, u64 size, int zone_default, } /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -int __ref add_memory(int nid, u64 start, u64 size) +int __ref add_memory_resource(int nid, struct resource *res) { + u64 start, size; pg_data_t *pgdat = NULL; bool new_pgdat; bool new_node; - struct resource *res; int ret; + start = res->start; + size = resource_size(res); + ret = check_hotplug_memory_range(start, size); if (ret) return ret; - res = register_memory_resource(start, size); - ret = -EEXIST; - if (!res) - return ret; - { /* Stupid hack to suppress address-never-null warning */ void *p = NODE_DATA(nid); new_pgdat = !p; @@ -1300,13 +1298,28 @@ error: /* rollback pgdat allocation and others */ if (new_pgdat) rollback_node_hotadd(nid, pgdat); - release_memory_resource(res); memblock_remove(start, size); out: mem_hotplug_done(); return ret; } +EXPORT_SYMBOL_GPL(add_memory_resource); + +int __ref add_memory(int nid, u64 start, u64 size) +{ + struct resource *res; + int ret; + + res = register_memory_resource(start, size); + if (!res) + return -EEXIST; + + ret = add_memory_resource(nid, res); + if (ret < 0) + release_memory_resource(res); + return ret; +} EXPORT_SYMBOL_GPL(add_memory); #ifdef CONFIG_MEMORY_HOTREMOVE |