From 62cedb9f135794ec26a93ae29e5f0231ab263c84 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 25 Jun 2015 16:35:49 +0100 Subject: mm: memory hotplug with an existing resource Add add_memory_resource() to add memory using an existing "System RAM" resource. This is useful if the memory region is being located by finding a free resource slot with allocate_resource(). Xen guests will make use of this in their balloon driver to hotplug arbitrary amounts of memory in response to toolstack requests. Signed-off-by: David Vrabel Reviewed-by: Daniel Kiper Reviewed-by: Tang Chen --- include/linux/memory_hotplug.h | 2 ++ mm/memory_hotplug.c | 29 +++++++++++++++++++++-------- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 8f60e899b33c..2ea574ff9714 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -11,6 +11,7 @@ struct zone; struct pglist_data; struct mem_section; struct memory_block; +struct resource; #ifdef CONFIG_MEMORY_HOTPLUG @@ -266,6 +267,7 @@ static inline void remove_memory(int nid, u64 start, u64 size) {} extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, void *arg, int (*func)(struct memory_block *, void *)); extern int add_memory(int nid, u64 start, u64 size); +extern int add_memory_resource(int nid, struct resource *resource); extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, bool for_device); extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index aa992e2df58a..0780d118d26e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1232,23 +1232,21 @@ int zone_for_memory(int nid, u64 start, u64 size, int zone_default, } /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -int __ref add_memory(int nid, u64 start, u64 size) +int __ref add_memory_resource(int nid, struct resource *res) { + u64 start, size; pg_data_t *pgdat = NULL; bool new_pgdat; bool new_node; - struct resource *res; int ret; + start = res->start; + size = resource_size(res); + ret = check_hotplug_memory_range(start, size); if (ret) return ret; - res = register_memory_resource(start, size); - ret = -EEXIST; - if (!res) - return ret; - { /* Stupid hack to suppress address-never-null warning */ void *p = NODE_DATA(nid); new_pgdat = !p; @@ -1300,13 +1298,28 @@ error: /* rollback pgdat allocation and others */ if (new_pgdat) rollback_node_hotadd(nid, pgdat); - release_memory_resource(res); memblock_remove(start, size); out: mem_hotplug_done(); return ret; } +EXPORT_SYMBOL_GPL(add_memory_resource); + +int __ref add_memory(int nid, u64 start, u64 size) +{ + struct resource *res; + int ret; + + res = register_memory_resource(start, size); + if (!res) + return -EEXIST; + + ret = add_memory_resource(nid, res); + if (ret < 0) + release_memory_resource(res); + return ret; +} EXPORT_SYMBOL_GPL(add_memory); #ifdef CONFIG_MEMORY_HOTREMOVE -- cgit v1.2.3 From f6a6cb1afe74d6ccc81aa70aa4ac3953762e7e6e Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 25 Jun 2015 13:18:12 +0100 Subject: xen/balloon: remove scratch page left overs Commit 0bb599fd30108883b00c7d4a226eeb49111e6932 (xen: remove scratch frames for ballooned pages and m2p override) removed the use of the scratch page for ballooned out pages. Remove some left over function definitions. Signed-off-by: David Vrabel Reviewed-by: Daniel Kiper --- include/xen/balloon.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/xen/balloon.h b/include/xen/balloon.h index a4c1c6a93691..cc2e1a7e44ec 100644 --- a/include/xen/balloon.h +++ b/include/xen/balloon.h @@ -29,9 +29,6 @@ int alloc_xenballooned_pages(int nr_pages, struct page **pages, bool highmem); void free_xenballooned_pages(int nr_pages, struct page **pages); -struct page *get_balloon_scratch_page(void); -void put_balloon_scratch_page(void); - struct device; #ifdef CONFIG_XEN_SELFBALLOONING extern int register_xen_selfballooning(struct device *dev); -- cgit v1.2.3 From f5775e0b6116b7e2425ccf535243b21768566d87 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 19 Jan 2015 11:08:05 +0000 Subject: x86/xen: discard RAM regions above the maximum reservation During setup, discard RAM regions that are above the maximum reservation (instead of marking them as E820_UNUSABLE). This allows hotplug memory to be placed at these addresses. Signed-off-by: David Vrabel Reviewed-by: Daniel Kiper --- arch/x86/xen/setup.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 1c30e4ab1022..387b60d9bd0e 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -829,6 +829,8 @@ char * __init xen_memory_setup(void) addr = xen_e820_map[0].addr; size = xen_e820_map[0].size; while (i < xen_e820_map_entries) { + bool discard = false; + chunk_size = size; type = xen_e820_map[i].type; @@ -843,10 +845,11 @@ char * __init xen_memory_setup(void) xen_add_extra_mem(pfn_s, n_pfns); xen_max_p2m_pfn = pfn_s + n_pfns; } else - type = E820_UNUSABLE; + discard = true; } - xen_align_and_add_e820_region(addr, chunk_size, type); + if (!discard) + xen_align_and_add_e820_region(addr, chunk_size, type); addr += chunk_size; size -= chunk_size; -- cgit v1.2.3 From 55b3da98a40dbb3776f7454daf0d95dde25c33d2 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Wed, 24 Jun 2015 15:58:42 +0100 Subject: xen/balloon: find non-conflicting regions to place hotplugged memory Instead of placing hotplugged memory at the end of RAM (which may conflict with PCI devices or reserved regions) use allocate_resource() to get a new, suitably aligned resource that does not conflict. Signed-off-by: David Vrabel Reviewed-by: Daniel Kiper --- v3: - Remove stale comment. --- drivers/xen/balloon.c | 73 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 53 insertions(+), 20 deletions(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index c79329fcfa78..095bb6789731 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -208,26 +209,56 @@ static bool balloon_is_inflated(void) return false; } -/* - * reserve_additional_memory() adds memory region of size >= credit above - * max_pfn. New region is section aligned and size is modified to be multiple - * of section size. Those features allow optimal use of address space and - * establish proper alignment when this function is called first time after - * boot (last section not fully populated at boot time contains unused memory - * pages with PG_reserved bit not set; online_pages_range() does not allow page - * onlining in whole range if first onlined page does not have PG_reserved - * bit set). Real size of added memory is established at page onlining stage. - */ +static struct resource *additional_memory_resource(phys_addr_t size) +{ + struct resource *res; + int ret; + + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) + return NULL; + + res->name = "System RAM"; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + + ret = allocate_resource(&iomem_resource, res, + size, 0, -1, + PAGES_PER_SECTION * PAGE_SIZE, NULL, NULL); + if (ret < 0) { + pr_err("Cannot allocate new System RAM resource\n"); + kfree(res); + return NULL; + } + + return res; +} + +static void release_memory_resource(struct resource *resource) +{ + if (!resource) + return; + + /* + * No need to reset region to identity mapped since we now + * know that no I/O can be in this region + */ + release_resource(resource); + kfree(resource); +} static enum bp_state reserve_additional_memory(long credit) { + struct resource *resource; int nid, rc; - u64 hotplug_start_paddr; - unsigned long balloon_hotplug = credit; + unsigned long balloon_hotplug; + + balloon_hotplug = round_up(credit, PAGES_PER_SECTION); + + resource = additional_memory_resource(balloon_hotplug * PAGE_SIZE); + if (!resource) + goto err; - hotplug_start_paddr = PFN_PHYS(SECTION_ALIGN_UP(max_pfn)); - balloon_hotplug = round_up(balloon_hotplug, PAGES_PER_SECTION); - nid = memory_add_physaddr_to_nid(hotplug_start_paddr); + nid = memory_add_physaddr_to_nid(resource->start); #ifdef CONFIG_XEN_HAVE_PVMMU /* @@ -242,21 +273,20 @@ static enum bp_state reserve_additional_memory(long credit) if (!xen_feature(XENFEAT_auto_translated_physmap)) { unsigned long pfn, i; - pfn = PFN_DOWN(hotplug_start_paddr); + pfn = PFN_DOWN(resource->start); for (i = 0; i < balloon_hotplug; i++) { if (!set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY)) { pr_warn("set_phys_to_machine() failed, no memory added\n"); - return BP_ECANCELED; + goto err; } } } #endif - rc = add_memory(nid, hotplug_start_paddr, balloon_hotplug << PAGE_SHIFT); - + rc = add_memory_resource(nid, resource); if (rc) { pr_warn("Cannot add additional memory (%i)\n", rc); - return BP_ECANCELED; + goto err; } balloon_hotplug -= credit; @@ -265,6 +295,9 @@ static enum bp_state reserve_additional_memory(long credit) balloon_stats.balloon_hotplug = balloon_hotplug; return BP_DONE; + err: + release_memory_resource(resource); + return BP_ECANCELED; } static void xen_online_page(struct page *page) -- cgit v1.2.3 From de5a77d8422fc7ed0b2f4349bceb65a1a639e5b2 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 25 Jun 2015 12:08:20 +0100 Subject: xen/balloon: rationalize memory hotplug stats The stats used for memory hotplug make no sense and are fiddled with in odd ways. Remove them and introduce total_pages to track the total number of pages (both populated and unpopulated) including those within hotplugged regions (note that this includes not yet onlined pages). This will be used in a subsequent commit (xen/balloon: only hotplug additional memory if required) when deciding whether additional memory needs to be hotplugged. Signed-off-by: David Vrabel Reviewed-by: Daniel Kiper --- drivers/xen/balloon.c | 75 +++++++++------------------------------------------ include/xen/balloon.h | 5 +--- 2 files changed, 13 insertions(+), 67 deletions(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 095bb6789731..ac8054765127 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -194,21 +194,6 @@ static enum bp_state update_schedule(enum bp_state state) } #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG -static long current_credit(void) -{ - return balloon_stats.target_pages - balloon_stats.current_pages - - balloon_stats.hotplug_pages; -} - -static bool balloon_is_inflated(void) -{ - if (balloon_stats.balloon_low || balloon_stats.balloon_high || - balloon_stats.balloon_hotplug) - return true; - else - return false; -} - static struct resource *additional_memory_resource(phys_addr_t size) { struct resource *res; @@ -289,10 +274,7 @@ static enum bp_state reserve_additional_memory(long credit) goto err; } - balloon_hotplug -= credit; - - balloon_stats.hotplug_pages += credit; - balloon_stats.balloon_hotplug = balloon_hotplug; + balloon_stats.total_pages += balloon_hotplug; return BP_DONE; err: @@ -308,11 +290,6 @@ static void xen_online_page(struct page *page) __balloon_append(page); - if (balloon_stats.hotplug_pages) - --balloon_stats.hotplug_pages; - else - --balloon_stats.balloon_hotplug; - mutex_unlock(&balloon_mutex); } @@ -329,32 +306,22 @@ static struct notifier_block xen_memory_nb = { .priority = 0 }; #else -static long current_credit(void) +static enum bp_state reserve_additional_memory(long credit) { - unsigned long target = balloon_stats.target_pages; - - target = min(target, - balloon_stats.current_pages + - balloon_stats.balloon_low + - balloon_stats.balloon_high); - - return target - balloon_stats.current_pages; + balloon_stats.target_pages = balloon_stats.current_pages; + return BP_DONE; } +#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */ -static bool balloon_is_inflated(void) +static long current_credit(void) { - if (balloon_stats.balloon_low || balloon_stats.balloon_high) - return true; - else - return false; + return balloon_stats.target_pages - balloon_stats.current_pages; } -static enum bp_state reserve_additional_memory(long credit) +static bool balloon_is_inflated(void) { - balloon_stats.target_pages = balloon_stats.current_pages; - return BP_DONE; + return balloon_stats.balloon_low || balloon_stats.balloon_high; } -#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */ static enum bp_state increase_reservation(unsigned long nr_pages) { @@ -367,15 +334,6 @@ static enum bp_state increase_reservation(unsigned long nr_pages) .domid = DOMID_SELF }; -#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG - if (!balloon_stats.balloon_low && !balloon_stats.balloon_high) { - nr_pages = min(nr_pages, balloon_stats.balloon_hotplug); - balloon_stats.hotplug_pages += nr_pages; - balloon_stats.balloon_hotplug -= nr_pages; - return BP_DONE; - } -#endif - if (nr_pages > ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); @@ -438,15 +396,6 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) .domid = DOMID_SELF }; -#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG - if (balloon_stats.hotplug_pages) { - nr_pages = min(nr_pages, balloon_stats.hotplug_pages); - balloon_stats.hotplug_pages -= nr_pages; - balloon_stats.balloon_hotplug += nr_pages; - return BP_DONE; - } -#endif - if (nr_pages > ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); @@ -635,6 +584,8 @@ static void __init balloon_add_region(unsigned long start_pfn, don't subtract from it. */ __balloon_append(page); } + + balloon_stats.total_pages += extra_pfn_end - start_pfn; } static int __init balloon_init(void) @@ -652,6 +603,7 @@ static int __init balloon_init(void) balloon_stats.target_pages = balloon_stats.current_pages; balloon_stats.balloon_low = 0; balloon_stats.balloon_high = 0; + balloon_stats.total_pages = balloon_stats.current_pages; balloon_stats.schedule_delay = 1; balloon_stats.max_schedule_delay = 32; @@ -659,9 +611,6 @@ static int __init balloon_init(void) balloon_stats.max_retry_count = RETRY_UNLIMITED; #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG - balloon_stats.hotplug_pages = 0; - balloon_stats.balloon_hotplug = 0; - set_online_page_callback(&xen_online_page); register_memory_notifier(&xen_memory_nb); #endif diff --git a/include/xen/balloon.h b/include/xen/balloon.h index cc2e1a7e44ec..c8aee7a8b8d2 100644 --- a/include/xen/balloon.h +++ b/include/xen/balloon.h @@ -11,14 +11,11 @@ struct balloon_stats { /* Number of pages in high- and low-memory balloons. */ unsigned long balloon_low; unsigned long balloon_high; + unsigned long total_pages; unsigned long schedule_delay; unsigned long max_schedule_delay; unsigned long retry_count; unsigned long max_retry_count; -#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG - unsigned long hotplug_pages; - unsigned long balloon_hotplug; -#endif }; extern struct balloon_stats balloon_stats; -- cgit v1.2.3 From b2ac6aa8f71bf57b948ee68cd913c350b932da88 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 25 Jun 2015 12:10:28 +0100 Subject: xen/balloon: only hotplug additional memory if required Now that we track the total number of pages (included hotplugged regions), it is easy to determine if more memory needs to be hotplugged. Add a new BP_WAIT state to signal that the balloon process needs to wait until kicked by the memory add notifier (when the new section is onlined by userspace). Signed-off-by: David Vrabel Reviewed-by: Daniel Kiper --- v3: - Return BP_WAIT if enough sections are already hotplugged. v2: - New BP_WAIT status after adding new memory sections. --- drivers/xen/balloon.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index ac8054765127..ac6391bd8029 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -75,12 +75,14 @@ * balloon_process() state: * * BP_DONE: done or nothing to do, + * BP_WAIT: wait to be rescheduled, * BP_EAGAIN: error, go to sleep, * BP_ECANCELED: error, balloon operation canceled. */ enum bp_state { BP_DONE, + BP_WAIT, BP_EAGAIN, BP_ECANCELED }; @@ -167,6 +169,9 @@ static struct page *balloon_next_page(struct page *page) static enum bp_state update_schedule(enum bp_state state) { + if (state == BP_WAIT) + return BP_WAIT; + if (state == BP_ECANCELED) return BP_ECANCELED; @@ -231,12 +236,22 @@ static void release_memory_resource(struct resource *resource) kfree(resource); } -static enum bp_state reserve_additional_memory(long credit) +static enum bp_state reserve_additional_memory(void) { + long credit; struct resource *resource; int nid, rc; unsigned long balloon_hotplug; + credit = balloon_stats.target_pages - balloon_stats.total_pages; + + /* + * Already hotplugged enough pages? Wait for them to be + * onlined. + */ + if (credit <= 0) + return BP_WAIT; + balloon_hotplug = round_up(credit, PAGES_PER_SECTION); resource = additional_memory_resource(balloon_hotplug * PAGE_SIZE); @@ -276,7 +291,7 @@ static enum bp_state reserve_additional_memory(long credit) balloon_stats.total_pages += balloon_hotplug; - return BP_DONE; + return BP_WAIT; err: release_memory_resource(resource); return BP_ECANCELED; @@ -306,7 +321,7 @@ static struct notifier_block xen_memory_nb = { .priority = 0 }; #else -static enum bp_state reserve_additional_memory(long credit) +static enum bp_state reserve_additional_memory(void) { balloon_stats.target_pages = balloon_stats.current_pages; return BP_DONE; @@ -474,7 +489,7 @@ static void balloon_process(struct work_struct *work) if (balloon_is_inflated()) state = increase_reservation(credit); else - state = reserve_additional_memory(credit); + state = reserve_additional_memory(); } if (credit < 0) -- cgit v1.2.3 From 81b286e0f1fe520f2a96f736ffa7e508ac9139ba Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 25 Jun 2015 13:12:46 +0100 Subject: xen/balloon: make alloc_xenballoon_pages() always allocate low pages All users of alloc_xenballoon_pages() wanted low memory pages, so remove the option for high memory. Signed-off-by: David Vrabel Reviewed-by: Daniel Kiper --- arch/x86/xen/grant-table.c | 2 +- drivers/xen/balloon.c | 21 ++++++++------------- drivers/xen/grant-table.c | 2 +- drivers/xen/privcmd.c | 2 +- drivers/xen/xenbus/xenbus_client.c | 3 +-- include/xen/balloon.h | 3 +-- 6 files changed, 13 insertions(+), 20 deletions(-) diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 1580e7a5a4cf..e079500b17f3 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -133,7 +133,7 @@ static int __init xlated_setup_gnttab_pages(void) kfree(pages); return -ENOMEM; } - rc = alloc_xenballooned_pages(nr_grant_frames, pages, 0 /* lowmem */); + rc = alloc_xenballooned_pages(nr_grant_frames, pages); if (rc) { pr_warn("%s Couldn't balloon alloc %ld pfns rc:%d\n", __func__, nr_grant_frames, rc); diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index ac6391bd8029..7ec933d505d2 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -136,17 +136,16 @@ static void balloon_append(struct page *page) } /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */ -static struct page *balloon_retrieve(bool prefer_highmem) +static struct page *balloon_retrieve(bool require_lowmem) { struct page *page; if (list_empty(&ballooned_pages)) return NULL; - if (prefer_highmem) - page = list_entry(ballooned_pages.prev, struct page, lru); - else - page = list_entry(ballooned_pages.next, struct page, lru); + page = list_entry(ballooned_pages.next, struct page, lru); + if (require_lowmem && PageHighMem(page)) + return NULL; list_del(&page->lru); if (PageHighMem(page)) @@ -521,24 +520,20 @@ EXPORT_SYMBOL_GPL(balloon_set_new_target); * alloc_xenballooned_pages - get pages that have been ballooned out * @nr_pages: Number of pages to get * @pages: pages returned - * @highmem: allow highmem pages * @return 0 on success, error otherwise */ -int alloc_xenballooned_pages(int nr_pages, struct page **pages, bool highmem) +int alloc_xenballooned_pages(int nr_pages, struct page **pages) { int pgno = 0; struct page *page; mutex_lock(&balloon_mutex); while (pgno < nr_pages) { - page = balloon_retrieve(highmem); - if (page && (highmem || !PageHighMem(page))) { + page = balloon_retrieve(true); + if (page) { pages[pgno++] = page; } else { enum bp_state st; - if (page) - balloon_append(page); - st = decrease_reservation(nr_pages - pgno, - highmem ? GFP_HIGHUSER : GFP_USER); + st = decrease_reservation(nr_pages - pgno, GFP_USER); if (st != BP_DONE) goto out_undo; } diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 62f591f8763c..a4b702c9ac68 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -687,7 +687,7 @@ int gnttab_alloc_pages(int nr_pages, struct page **pages) int i; int ret; - ret = alloc_xenballooned_pages(nr_pages, pages, false); + ret = alloc_xenballooned_pages(nr_pages, pages); if (ret < 0) return ret; diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 5e9adac928e6..b199ad3d4587 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -401,7 +401,7 @@ static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs) if (pages == NULL) return -ENOMEM; - rc = alloc_xenballooned_pages(numpgs, pages, 0); + rc = alloc_xenballooned_pages(numpgs, pages); if (rc != 0) { pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__, numpgs, rc); diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 2ba09c1195c8..aa304d05101b 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -614,8 +614,7 @@ static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev, if (!node) return -ENOMEM; - err = alloc_xenballooned_pages(nr_grefs, node->hvm.pages, - false /* lowmem */); + err = alloc_xenballooned_pages(nr_grefs, node->hvm.pages); if (err) goto out_err; diff --git a/include/xen/balloon.h b/include/xen/balloon.h index c8aee7a8b8d2..83efdeb243bf 100644 --- a/include/xen/balloon.h +++ b/include/xen/balloon.h @@ -22,8 +22,7 @@ extern struct balloon_stats balloon_stats; void balloon_set_new_target(unsigned long target); -int alloc_xenballooned_pages(int nr_pages, struct page **pages, - bool highmem); +int alloc_xenballooned_pages(int nr_pages, struct page **pages); void free_xenballooned_pages(int nr_pages, struct page **pages); struct device; -- cgit v1.2.3 From 1cf6a6c82918c9aad4bb73a7e7379a649e4d8e50 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 25 Jun 2015 16:29:18 +0100 Subject: xen/balloon: use hotplugged pages for foreign mappings etc. alloc_xenballooned_pages() is used to get ballooned pages to back foreign mappings etc. Instead of having to balloon out real pages, use (if supported) hotplugged memory. This makes more memory available to the guest and reduces fragmentation in the p2m. This is only enabled if the xen.balloon.hotplug_unpopulated sysctl is set to 1. This sysctl defaults to 0 in case the udev rules to automatically online hotplugged memory do not exist. Signed-off-by: David Vrabel Reviewed-by: Daniel Kiper --- v3: - Add xen.balloon.hotplug_unpopulated sysctl to enable use of hotplug for unpopulated pages. --- drivers/xen/balloon.c | 90 +++++++++++++++++++++++++++++++++++++++++++++------ include/xen/balloon.h | 1 + 2 files changed, 81 insertions(+), 10 deletions(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 7ec933d505d2..25fd1bd949d8 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -71,6 +72,46 @@ #include #include +static int xen_hotplug_unpopulated; + +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + +static int zero; +static int one = 1; + +static struct ctl_table balloon_table[] = { + { + .procname = "hotplug_unpopulated", + .data = &xen_hotplug_unpopulated, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { } +}; + +static struct ctl_table balloon_root[] = { + { + .procname = "balloon", + .mode = 0555, + .child = balloon_table, + }, + { } +}; + +static struct ctl_table xen_root[] = { + { + .procname = "xen", + .mode = 0555, + .child = balloon_root, + }, + { } +}; + +#endif + /* * balloon_process() state: * @@ -99,6 +140,7 @@ static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)]; /* List of ballooned pages, threaded through the mem_map array. */ static LIST_HEAD(ballooned_pages); +static DECLARE_WAIT_QUEUE_HEAD(balloon_wq); /* Main work function, always executed in process context. */ static void balloon_process(struct work_struct *work); @@ -127,6 +169,7 @@ static void __balloon_append(struct page *page) list_add(&page->lru, &ballooned_pages); balloon_stats.balloon_low++; } + wake_up(&balloon_wq); } static void balloon_append(struct page *page) @@ -242,7 +285,8 @@ static enum bp_state reserve_additional_memory(void) int nid, rc; unsigned long balloon_hotplug; - credit = balloon_stats.target_pages - balloon_stats.total_pages; + credit = balloon_stats.target_pages + balloon_stats.target_unpopulated + - balloon_stats.total_pages; /* * Already hotplugged enough pages? Wait for them to be @@ -323,7 +367,7 @@ static struct notifier_block xen_memory_nb = { static enum bp_state reserve_additional_memory(void) { balloon_stats.target_pages = balloon_stats.current_pages; - return BP_DONE; + return BP_ECANCELED; } #endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */ @@ -516,6 +560,28 @@ void balloon_set_new_target(unsigned long target) } EXPORT_SYMBOL_GPL(balloon_set_new_target); +static int add_ballooned_pages(int nr_pages) +{ + enum bp_state st; + + if (xen_hotplug_unpopulated) { + st = reserve_additional_memory(); + if (st != BP_ECANCELED) { + mutex_unlock(&balloon_mutex); + wait_event(balloon_wq, + !list_empty(&ballooned_pages)); + mutex_lock(&balloon_mutex); + return 0; + } + } + + st = decrease_reservation(nr_pages, GFP_USER); + if (st != BP_DONE) + return -ENOMEM; + + return 0; +} + /** * alloc_xenballooned_pages - get pages that have been ballooned out * @nr_pages: Number of pages to get @@ -526,27 +592,28 @@ int alloc_xenballooned_pages(int nr_pages, struct page **pages) { int pgno = 0; struct page *page; + int ret; + mutex_lock(&balloon_mutex); + + balloon_stats.target_unpopulated += nr_pages; + while (pgno < nr_pages) { page = balloon_retrieve(true); if (page) { pages[pgno++] = page; } else { - enum bp_state st; - st = decrease_reservation(nr_pages - pgno, GFP_USER); - if (st != BP_DONE) + ret = add_ballooned_pages(nr_pages - pgno); + if (ret < 0) goto out_undo; } } mutex_unlock(&balloon_mutex); return 0; out_undo: - while (pgno) - balloon_append(pages[--pgno]); - /* Free the memory back to the kernel soon */ - schedule_delayed_work(&balloon_worker, 0); mutex_unlock(&balloon_mutex); - return -ENOMEM; + free_xenballooned_pages(pgno, pages); + return ret; } EXPORT_SYMBOL(alloc_xenballooned_pages); @@ -566,6 +633,8 @@ void free_xenballooned_pages(int nr_pages, struct page **pages) balloon_append(pages[i]); } + balloon_stats.target_unpopulated -= nr_pages; + /* The balloon may be too large now. Shrink it if needed. */ if (current_credit()) schedule_delayed_work(&balloon_worker, 0); @@ -623,6 +692,7 @@ static int __init balloon_init(void) #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG set_online_page_callback(&xen_online_page); register_memory_notifier(&xen_memory_nb); + register_sysctl_table(xen_root); #endif /* diff --git a/include/xen/balloon.h b/include/xen/balloon.h index 83efdeb243bf..d1767dfb0d95 100644 --- a/include/xen/balloon.h +++ b/include/xen/balloon.h @@ -8,6 +8,7 @@ struct balloon_stats { /* We aim for 'current allocation' == 'target allocation'. */ unsigned long current_pages; unsigned long target_pages; + unsigned long target_unpopulated; /* Number of pages in high- and low-memory balloons. */ unsigned long balloon_low; unsigned long balloon_high; -- cgit v1.2.3 From 8edfcf882eb91ec9028c7334f90f6ef3db5b0fcf Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Wed, 22 Jul 2015 14:48:09 +0100 Subject: x86/xen: export xen_alloc_p2m_entry() Rename alloc_p2m() to xen_alloc_p2m_entry() and export it. This is useful for ensuring that a p2m entry is allocated (i.e., not a shared missing or identity entry) so that subsequent set_phys_to_machine() calls will require no further allocations. Signed-off-by: David Vrabel Reviewed-by: Daniel Kiper --- v3: - Make xen_alloc_p2m_entry() a nop on auto-xlate guests. --- arch/x86/include/asm/xen/page.h | 2 ++ arch/x86/xen/p2m.c | 19 +++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 0679e11d2cf7..b922fa4bb4a1 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -43,6 +43,8 @@ extern unsigned long *xen_p2m_addr; extern unsigned long xen_p2m_size; extern unsigned long xen_max_p2m_pfn; +extern int xen_alloc_p2m_entry(unsigned long pfn); + extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 660b3cfef234..cab9f766bb06 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -530,7 +530,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg) * the new pages are installed with cmpxchg; if we lose the race then * simply free the page we allocated and use the one that's there. */ -static bool alloc_p2m(unsigned long pfn) +int xen_alloc_p2m_entry(unsigned long pfn) { unsigned topidx; unsigned long *top_mfn_p, *mid_mfn; @@ -540,6 +540,9 @@ static bool alloc_p2m(unsigned long pfn) unsigned long addr = (unsigned long)(xen_p2m_addr + pfn); unsigned long p2m_pfn; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + ptep = lookup_address(addr, &level); BUG_ON(!ptep || level != PG_LEVEL_4K); pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1)); @@ -548,7 +551,7 @@ static bool alloc_p2m(unsigned long pfn) /* PMD level is missing, allocate a new one */ ptep = alloc_p2m_pmd(addr, pte_pg); if (!ptep) - return false; + return -ENOMEM; } if (p2m_top_mfn && pfn < MAX_P2M_PFN) { @@ -566,7 +569,7 @@ static bool alloc_p2m(unsigned long pfn) mid_mfn = alloc_p2m_page(); if (!mid_mfn) - return false; + return -ENOMEM; p2m_mid_mfn_init(mid_mfn, p2m_missing); @@ -592,7 +595,7 @@ static bool alloc_p2m(unsigned long pfn) p2m = alloc_p2m_page(); if (!p2m) - return false; + return -ENOMEM; if (p2m_pfn == PFN_DOWN(__pa(p2m_missing))) p2m_init(p2m); @@ -625,8 +628,9 @@ static bool alloc_p2m(unsigned long pfn) HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn; } - return true; + return 0; } +EXPORT_SYMBOL(xen_alloc_p2m_entry); unsigned long __init set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e) @@ -688,7 +692,10 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) { if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - if (!alloc_p2m(pfn)) + int ret; + + ret = xen_alloc_p2m_entry(pfn); + if (ret < 0) return false; return __set_phys_to_machine(pfn, mfn); -- cgit v1.2.3 From 4a69c909deb0dd3cae653d14ac0ff52d5440a19c Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Wed, 22 Jul 2015 14:50:37 +0100 Subject: xen/balloon: pre-allocate p2m entries for ballooned pages Pages returned by alloc_xenballooned_pages() will be used for grant mapping which will call set_phys_to_machine() (in PV guests). Ballooned pages are set as INVALID_P2M_ENTRY in the p2m and thus may be using the (shared) missing tables and a subsequent set_phys_to_machine() will need to allocate new tables. Since the grant mapping may be done from a context that cannot sleep, the p2m entries must already be allocated. Signed-off-by: David Vrabel Reviewed-by: Daniel Kiper --- drivers/xen/balloon.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 25fd1bd949d8..f56662324a47 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -602,6 +602,11 @@ int alloc_xenballooned_pages(int nr_pages, struct page **pages) page = balloon_retrieve(true); if (page) { pages[pgno++] = page; +#ifdef CONFIG_XEN_HAVE_PVMMU + ret = xen_alloc_p2m_entry(page_to_pfn(page)); + if (ret < 0) + goto out_undo; +#endif } else { ret = add_ballooned_pages(nr_pages - pgno); if (ret < 0) -- cgit v1.2.3 From a0f2e80fcd3b6b1369527828b02caab01af60c36 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Wed, 24 Jun 2015 18:03:14 +0100 Subject: net/xen-netback: xenvif_gop_frag_copy: move GSO check out of the loop The skb doesn't change within the function. Therefore it's only necessary to check if we need GSO once at the beginning. Signed-off-by: Julien Grall Acked-by: Wei Liu Signed-off-by: David Vrabel --- drivers/net/xen-netback/netback.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index ec98d43916a8..c4e6c025d64d 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -288,6 +288,13 @@ static void xenvif_gop_frag_copy(struct xenvif_queue *queue, struct sk_buff *skb unsigned long bytes; int gso_type = XEN_NETIF_GSO_TYPE_NONE; + if (skb_is_gso(skb)) { + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) + gso_type = XEN_NETIF_GSO_TYPE_TCPV4; + else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) + gso_type = XEN_NETIF_GSO_TYPE_TCPV6; + } + /* Data must not cross a page boundary. */ BUG_ON(size + offset > PAGE_SIZE<gso_type & SKB_GSO_TCPV4) - gso_type = XEN_NETIF_GSO_TYPE_TCPV4; - else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) - gso_type = XEN_NETIF_GSO_TYPE_TCPV6; - } - if (*head && ((1 << gso_type) & queue->vif->gso_mask)) queue->rx.req_cons++; -- cgit v1.2.3 From 5031612b5eaf15cb6471c6e936a515090810c8f1 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Tue, 21 Jul 2015 17:57:20 +0100 Subject: arm/xen: Drop pte_mfn and mfn_pte They are not used in common code expect in one place in balloon.c which is only compiled when Linux is using PV MMU. It's not the case on ARM. Rather than worrying how to handle the 64KB case, drop them. Signed-off-by: Julien Grall Reviewed-by: Stefano Stabellini Signed-off-by: David Vrabel --- arch/arm/include/asm/xen/page.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm/include/asm/xen/page.h b/arch/arm/include/asm/xen/page.h index 127956353b00..98c9fc38b945 100644 --- a/arch/arm/include/asm/xen/page.h +++ b/arch/arm/include/asm/xen/page.h @@ -13,9 +13,6 @@ #define phys_to_machine_mapping_valid(pfn) (1) -#define pte_mfn pte_pfn -#define mfn_pte pfn_pte - /* Xen machine address */ typedef struct xmaddr { phys_addr_t maddr; -- cgit v1.2.3 From 1084b1988d22dc165c9dbbc2b0e057f9248ac4db Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Mon, 4 May 2015 15:47:16 +0100 Subject: xen: Add Xen specific page definition The Xen hypercall interface is always using 4K page granularity on ARM and x86 architecture. With the incoming support of 64K page granularity for ARM64 guest, it won't be possible to re-use the Linux page definition in Xen drivers. Introduce Xen page definition helpers based on the Linux page definition. They have exactly the same name but prefixed with XEN_/xen_ prefix. Also modify xen_page_to_gfn to use new Xen page definition. Signed-off-by: Julien Grall Reviewed-by: Stefano Stabellini Signed-off-by: David Vrabel --- include/xen/page.h | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/include/xen/page.h b/include/xen/page.h index 1daae485e336..96294ac93755 100644 --- a/include/xen/page.h +++ b/include/xen/page.h @@ -1,11 +1,36 @@ #ifndef _XEN_PAGE_H #define _XEN_PAGE_H +#include + +/* The hypercall interface supports only 4KB page */ +#define XEN_PAGE_SHIFT 12 +#define XEN_PAGE_SIZE (_AC(1, UL) << XEN_PAGE_SHIFT) +#define XEN_PAGE_MASK (~(XEN_PAGE_SIZE-1)) +#define xen_offset_in_page(p) ((unsigned long)(p) & ~XEN_PAGE_MASK) + +/* + * We assume that PAGE_SIZE is a multiple of XEN_PAGE_SIZE + * XXX: Add a BUILD_BUG_ON? + */ + +#define xen_pfn_to_page(xen_pfn) \ + ((pfn_to_page(((unsigned long)(xen_pfn) << XEN_PAGE_SHIFT) >> PAGE_SHIFT))) +#define page_to_xen_pfn(page) \ + (((page_to_pfn(page)) << PAGE_SHIFT) >> XEN_PAGE_SHIFT) + +#define XEN_PFN_PER_PAGE (PAGE_SIZE / XEN_PAGE_SIZE) + +#define XEN_PFN_DOWN(x) ((x) >> XEN_PAGE_SHIFT) +#define XEN_PFN_UP(x) (((x) + XEN_PAGE_SIZE-1) >> XEN_PAGE_SHIFT) +#define XEN_PFN_PHYS(x) ((phys_addr_t)(x) << XEN_PAGE_SHIFT) + #include +/* Return the GFN associated to the first 4KB of the page */ static inline unsigned long xen_page_to_gfn(struct page *page) { - return pfn_to_gfn(page_to_pfn(page)); + return pfn_to_gfn(page_to_xen_pfn(page)); } struct xen_memory_region { -- cgit v1.2.3 From 008c320a96d218712043f8db0111d5472697785c Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Fri, 19 Jun 2015 17:49:03 +0100 Subject: xen/grant: Introduce helpers to split a page into grant Currently, a grant is always based on the Xen page granularity (i.e 4KB). When Linux is using a different page granularity, a single page will be split between multiple grants. The new helpers will be in charge of splitting the Linux page into grants and call a function given by the caller on each grant. Also provide an helper to count the number of grants within a given contiguous region. Note that the x86/include/asm/xen/page.h is now including xen/interface/grant_table.h rather than xen/grant_table.h. It's necessary because xen/grant_table.h depends on asm/xen/page.h and will break the compilation. Furthermore, only definition in interface/grant_table.h is required. Signed-off-by: Julien Grall Reviewed-by: David Vrabel Reviewed-by: Stefano Stabellini Signed-off-by: David Vrabel --- arch/x86/include/asm/xen/page.h | 2 +- drivers/xen/grant-table.c | 26 +++++++++++++++++++++++++ include/xen/grant_table.h | 42 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index b922fa4bb4a1..fe58e3a935de 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -12,7 +12,7 @@ #include #include -#include +#include #include /* Xen machine address */ diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index a4b702c9ac68..dbeaa67dec47 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -776,6 +776,32 @@ void gnttab_batch_copy(struct gnttab_copy *batch, unsigned count) } EXPORT_SYMBOL_GPL(gnttab_batch_copy); +void gnttab_foreach_grant_in_range(struct page *page, + unsigned int offset, + unsigned int len, + xen_grant_fn_t fn, + void *data) +{ + unsigned int goffset; + unsigned int glen; + unsigned long xen_pfn; + + len = min_t(unsigned int, PAGE_SIZE - offset, len); + goffset = xen_offset_in_page(offset); + + xen_pfn = page_to_xen_pfn(page) + XEN_PFN_DOWN(offset); + + while (len) { + glen = min_t(unsigned int, XEN_PAGE_SIZE - goffset, len); + fn(pfn_to_gfn(xen_pfn), goffset, glen, data); + + goffset = 0; + xen_pfn++; + len -= glen; + } +} +EXPORT_SYMBOL_GPL(gnttab_foreach_grant_in_range); + int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count) diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 4478f4b4aae2..05b5b08c2afc 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -45,8 +45,10 @@ #include #include +#include #include #include +#include #define GNTTAB_RESERVED_XENSTORE 1 @@ -224,4 +226,44 @@ static inline struct xen_page_foreign *xen_page_foreign(struct page *page) #endif } +/* Split Linux page in chunk of the size of the grant and call fn + * + * Parameters of fn: + * gfn: guest frame number + * offset: offset in the grant + * len: length of the data in the grant. + * data: internal information + */ +typedef void (*xen_grant_fn_t)(unsigned long gfn, unsigned int offset, + unsigned int len, void *data); + +void gnttab_foreach_grant_in_range(struct page *page, + unsigned int offset, + unsigned int len, + xen_grant_fn_t fn, + void *data); + +/* Helper to get to call fn only on the first "grant chunk" */ +static inline void gnttab_for_one_grant(struct page *page, unsigned int offset, + unsigned len, xen_grant_fn_t fn, + void *data) +{ + /* The first request is limited to the size of one grant */ + len = min_t(unsigned int, XEN_PAGE_SIZE - (offset & ~XEN_PAGE_MASK), + len); + + gnttab_foreach_grant_in_range(page, offset, len, fn, data); +} + +/* Get the number of grant in a specified region + * + * start: Offset from the beginning of the first page + * len: total length of data (can cross multiple page) + */ +static inline unsigned int gnttab_count_grant(unsigned int start, + unsigned int len) +{ + return XEN_PFN_UP(xen_offset_in_page(start) + len); +} + #endif /* __ASM_GNTTAB_H__ */ -- cgit v1.2.3 From 3922f32c1e6db2e096ff095a5b8af0b940b97508 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Fri, 19 Jun 2015 18:05:06 +0100 Subject: xen/grant: Add helper gnttab_page_grant_foreign_access_ref_one Many PV drivers contain the idiom: pfn = page_to_gfn(...) /* Or similar */ gnttab_grant_foreign_access_ref Replace it by a new helper. Note that when Linux is using a different page granularity than Xen, the helper only gives access to the first 4KB grant. This is useful where drivers are allocating a full Linux page for each grant. Also include xen/interface/grant_table.h rather than xen/grant_table.h in asm/page.h for x86 to fix a compilation issue [1]. Only the former is useful in order to get the structure definition. [1] Interdependency between asm/page.h and xen/grant_table.h which result to page_mfn not being defined when necessary. Signed-off-by: Julien Grall Reviewed-by: David Vrabel Reviewed-by: Stefano Stabellini Signed-off-by: David Vrabel --- include/xen/grant_table.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 05b5b08c2afc..e17a4b381a16 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -131,6 +131,15 @@ void gnttab_cancel_free_callback(struct gnttab_free_callback *callback); void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, unsigned long frame, int readonly); +/* Give access to the first 4K of the page */ +static inline void gnttab_page_grant_foreign_access_ref_one( + grant_ref_t ref, domid_t domid, + struct page *page, int readonly) +{ + gnttab_grant_foreign_access_ref(ref, domid, xen_page_to_gfn(page), + readonly); +} + void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid, unsigned long pfn); -- cgit v1.2.3 From 33204663ef85d9ce79009b2246afe6a2fef8eb1b Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Mon, 29 Jun 2015 17:35:24 +0100 Subject: block/xen-blkfront: Split blkif_queue_request in 2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, blkif_queue_request has 2 distinct execution path: - Send a discard request - Send a read/write request The function is also allocating grants to use for generating the request. Although, this is only used for read/write request. Rather than having a function with 2 distinct execution path, separate the function in 2. This will also remove one level of tabulation. Signed-off-by: Julien Grall Reviewed-by: Roger Pau Monné Signed-off-by: David Vrabel --- drivers/block/xen-blkfront.c | 277 ++++++++++++++++++++++++------------------- 1 file changed, 153 insertions(+), 124 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 611170896b8c..bf9ccfe03f45 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -394,13 +394,35 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode, return 0; } -/* - * Generate a Xen blkfront IO request from a blk layer request. Reads - * and writes are handled as expected. - * - * @req: a request struct - */ -static int blkif_queue_request(struct request *req) +static int blkif_queue_discard_req(struct request *req) +{ + struct blkfront_info *info = req->rq_disk->private_data; + struct blkif_request *ring_req; + unsigned long id; + + /* Fill out a communications ring structure. */ + ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); + id = get_id_from_freelist(info); + info->shadow[id].request = req; + + ring_req->operation = BLKIF_OP_DISCARD; + ring_req->u.discard.nr_sectors = blk_rq_sectors(req); + ring_req->u.discard.id = id; + ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req); + if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) + ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; + else + ring_req->u.discard.flag = 0; + + info->ring.req_prod_pvt++; + + /* Keep a private copy so we can reissue requests when recovering. */ + info->shadow[id].req = *ring_req; + + return 0; +} + +static int blkif_queue_rw_req(struct request *req) { struct blkfront_info *info = req->rq_disk->private_data; struct blkif_request *ring_req; @@ -420,9 +442,6 @@ static int blkif_queue_request(struct request *req) struct scatterlist *sg; int nseg, max_grefs; - if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) - return 1; - max_grefs = req->nr_phys_segments; if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST) /* @@ -452,139 +471,131 @@ static int blkif_queue_request(struct request *req) id = get_id_from_freelist(info); info->shadow[id].request = req; - if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) { - ring_req->operation = BLKIF_OP_DISCARD; - ring_req->u.discard.nr_sectors = blk_rq_sectors(req); - ring_req->u.discard.id = id; - ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req); - if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) - ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; - else - ring_req->u.discard.flag = 0; + BUG_ON(info->max_indirect_segments == 0 && + req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); + BUG_ON(info->max_indirect_segments && + req->nr_phys_segments > info->max_indirect_segments); + nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); + ring_req->u.rw.id = id; + if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) { + /* + * The indirect operation can only be a BLKIF_OP_READ or + * BLKIF_OP_WRITE + */ + BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA)); + ring_req->operation = BLKIF_OP_INDIRECT; + ring_req->u.indirect.indirect_op = rq_data_dir(req) ? + BLKIF_OP_WRITE : BLKIF_OP_READ; + ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req); + ring_req->u.indirect.handle = info->handle; + ring_req->u.indirect.nr_segments = nseg; } else { - BUG_ON(info->max_indirect_segments == 0 && - req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); - BUG_ON(info->max_indirect_segments && - req->nr_phys_segments > info->max_indirect_segments); - nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); - ring_req->u.rw.id = id; - if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) { + ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); + ring_req->u.rw.handle = info->handle; + ring_req->operation = rq_data_dir(req) ? + BLKIF_OP_WRITE : BLKIF_OP_READ; + if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { /* - * The indirect operation can only be a BLKIF_OP_READ or - * BLKIF_OP_WRITE + * Ideally we can do an unordered flush-to-disk. + * In case the backend onlysupports barriers, use that. + * A barrier request a superset of FUA, so we can + * implement it the same way. (It's also a FLUSH+FUA, + * since it is guaranteed ordered WRT previous writes.) */ - BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA)); - ring_req->operation = BLKIF_OP_INDIRECT; - ring_req->u.indirect.indirect_op = rq_data_dir(req) ? - BLKIF_OP_WRITE : BLKIF_OP_READ; - ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req); - ring_req->u.indirect.handle = info->handle; - ring_req->u.indirect.nr_segments = nseg; - } else { - ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); - ring_req->u.rw.handle = info->handle; - ring_req->operation = rq_data_dir(req) ? - BLKIF_OP_WRITE : BLKIF_OP_READ; - if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { - /* - * Ideally we can do an unordered flush-to-disk. In case the - * backend onlysupports barriers, use that. A barrier request - * a superset of FUA, so we can implement it the same - * way. (It's also a FLUSH+FUA, since it is - * guaranteed ordered WRT previous writes.) - */ - switch (info->feature_flush & - ((REQ_FLUSH|REQ_FUA))) { - case REQ_FLUSH|REQ_FUA: - ring_req->operation = - BLKIF_OP_WRITE_BARRIER; - break; - case REQ_FLUSH: - ring_req->operation = - BLKIF_OP_FLUSH_DISKCACHE; - break; - default: - ring_req->operation = 0; - } + switch (info->feature_flush & + ((REQ_FLUSH|REQ_FUA))) { + case REQ_FLUSH|REQ_FUA: + ring_req->operation = + BLKIF_OP_WRITE_BARRIER; + break; + case REQ_FLUSH: + ring_req->operation = + BLKIF_OP_FLUSH_DISKCACHE; + break; + default: + ring_req->operation = 0; } - ring_req->u.rw.nr_segments = nseg; } - for_each_sg(info->shadow[id].sg, sg, nseg, i) { - fsect = sg->offset >> 9; - lsect = fsect + (sg->length >> 9) - 1; + ring_req->u.rw.nr_segments = nseg; + } + for_each_sg(info->shadow[id].sg, sg, nseg, i) { + fsect = sg->offset >> 9; + lsect = fsect + (sg->length >> 9) - 1; - if ((ring_req->operation == BLKIF_OP_INDIRECT) && - (i % SEGS_PER_INDIRECT_FRAME == 0)) { - unsigned long uninitialized_var(pfn); + if ((ring_req->operation == BLKIF_OP_INDIRECT) && + (i % SEGS_PER_INDIRECT_FRAME == 0)) { + unsigned long uninitialized_var(pfn); - if (segments) - kunmap_atomic(segments); + if (segments) + kunmap_atomic(segments); - n = i / SEGS_PER_INDIRECT_FRAME; - if (!info->feature_persistent) { - struct page *indirect_page; - - /* Fetch a pre-allocated page to use for indirect grefs */ - BUG_ON(list_empty(&info->indirect_pages)); - indirect_page = list_first_entry(&info->indirect_pages, - struct page, lru); - list_del(&indirect_page->lru); - pfn = page_to_pfn(indirect_page); - } - gnt_list_entry = get_grant(&gref_head, pfn, info); - info->shadow[id].indirect_grants[n] = gnt_list_entry; - segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); - ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; + n = i / SEGS_PER_INDIRECT_FRAME; + if (!info->feature_persistent) { + struct page *indirect_page; + + /* + * Fetch a pre-allocated page to use for + * indirect grefs + */ + BUG_ON(list_empty(&info->indirect_pages)); + indirect_page = list_first_entry(&info->indirect_pages, + struct page, lru); + list_del(&indirect_page->lru); + pfn = page_to_pfn(indirect_page); } + gnt_list_entry = get_grant(&gref_head, pfn, info); + info->shadow[id].indirect_grants[n] = gnt_list_entry; + segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); + ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; + } - gnt_list_entry = get_grant(&gref_head, page_to_pfn(sg_page(sg)), info); - ref = gnt_list_entry->gref; + gnt_list_entry = get_grant(&gref_head, page_to_pfn(sg_page(sg)), info); + ref = gnt_list_entry->gref; - info->shadow[id].grants_used[i] = gnt_list_entry; + info->shadow[id].grants_used[i] = gnt_list_entry; - if (rq_data_dir(req) && info->feature_persistent) { - char *bvec_data; - void *shared_data; + if (rq_data_dir(req) && info->feature_persistent) { + char *bvec_data; + void *shared_data; - BUG_ON(sg->offset + sg->length > PAGE_SIZE); + BUG_ON(sg->offset + sg->length > PAGE_SIZE); - shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); - bvec_data = kmap_atomic(sg_page(sg)); + shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); + bvec_data = kmap_atomic(sg_page(sg)); - /* - * this does not wipe data stored outside the - * range sg->offset..sg->offset+sg->length. - * Therefore, blkback *could* see data from - * previous requests. This is OK as long as - * persistent grants are shared with just one - * domain. It may need refactoring if this - * changes - */ - memcpy(shared_data + sg->offset, - bvec_data + sg->offset, - sg->length); + /* + * this does not wipe data stored outside the + * range sg->offset..sg->offset+sg->length. + * Therefore, blkback *could* see data from + * previous requests. This is OK as long as + * persistent grants are shared with just one + * domain. It may need refactoring if this + * changes + */ + memcpy(shared_data + sg->offset, + bvec_data + sg->offset, + sg->length); - kunmap_atomic(bvec_data); - kunmap_atomic(shared_data); - } - if (ring_req->operation != BLKIF_OP_INDIRECT) { - ring_req->u.rw.seg[i] = - (struct blkif_request_segment) { - .gref = ref, - .first_sect = fsect, - .last_sect = lsect }; - } else { - n = i % SEGS_PER_INDIRECT_FRAME; - segments[n] = + kunmap_atomic(bvec_data); + kunmap_atomic(shared_data); + } + if (ring_req->operation != BLKIF_OP_INDIRECT) { + ring_req->u.rw.seg[i] = (struct blkif_request_segment) { - .gref = ref, - .first_sect = fsect, - .last_sect = lsect }; - } + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; + } else { + n = i % SEGS_PER_INDIRECT_FRAME; + segments[n] = + (struct blkif_request_segment) { + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; } - if (segments) - kunmap_atomic(segments); } + if (segments) + kunmap_atomic(segments); info->ring.req_prod_pvt++; @@ -597,6 +608,24 @@ static int blkif_queue_request(struct request *req) return 0; } +/* + * Generate a Xen blkfront IO request from a blk layer request. Reads + * and writes are handled as expected. + * + * @req: a request struct + */ +static int blkif_queue_request(struct request *req) +{ + struct blkfront_info *info = req->rq_disk->private_data; + + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) + return 1; + + if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) + return blkif_queue_discard_req(req); + else + return blkif_queue_rw_req(req); +} static inline void flush_requests(struct blkfront_info *info) { -- cgit v1.2.3 From a7a6df222351de23791bb64165f14c21ff4d1653 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Tue, 30 Jun 2015 11:58:51 +0100 Subject: block/xen-blkfront: Store a page rather a pfn in the grant structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All the usage of the field pfn are done using the same idiom: pfn_to_page(grant->pfn) This will return always the same page. Store directly the page in the grant to clean up the code. Signed-off-by: Julien Grall Acked-by: Roger Pau Monné Reviewed-by: Stefano Stabellini Signed-off-by: David Vrabel --- drivers/block/xen-blkfront.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index bf9ccfe03f45..7552ca8f8ffe 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -68,7 +68,7 @@ enum blkif_state { struct grant { grant_ref_t gref; - unsigned long pfn; + struct page *page; struct list_head node; }; @@ -221,7 +221,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num) kfree(gnt_list_entry); goto out_of_memory; } - gnt_list_entry->pfn = page_to_pfn(granted_page); + gnt_list_entry->page = granted_page; } gnt_list_entry->gref = GRANT_INVALID_REF; @@ -236,7 +236,7 @@ out_of_memory: &info->grants, node) { list_del(&gnt_list_entry->node); if (info->feature_persistent) - __free_page(pfn_to_page(gnt_list_entry->pfn)); + __free_page(gnt_list_entry->page); kfree(gnt_list_entry); i--; } @@ -245,8 +245,8 @@ out_of_memory: } static struct grant *get_grant(grant_ref_t *gref_head, - unsigned long pfn, - struct blkfront_info *info) + struct page *page, + struct blkfront_info *info) { struct grant *gnt_list_entry; unsigned long buffer_gfn; @@ -265,10 +265,10 @@ static struct grant *get_grant(grant_ref_t *gref_head, gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); BUG_ON(gnt_list_entry->gref == -ENOSPC); if (!info->feature_persistent) { - BUG_ON(!pfn); - gnt_list_entry->pfn = pfn; + BUG_ON(!page); + gnt_list_entry->page = page; } - buffer_gfn = pfn_to_gfn(gnt_list_entry->pfn); + buffer_gfn = xen_page_to_gfn(gnt_list_entry->page); gnttab_grant_foreign_access_ref(gnt_list_entry->gref, info->xbdev->otherend_id, buffer_gfn, 0); @@ -524,7 +524,7 @@ static int blkif_queue_rw_req(struct request *req) if ((ring_req->operation == BLKIF_OP_INDIRECT) && (i % SEGS_PER_INDIRECT_FRAME == 0)) { - unsigned long uninitialized_var(pfn); + struct page *uninitialized_var(page); if (segments) kunmap_atomic(segments); @@ -541,15 +541,15 @@ static int blkif_queue_rw_req(struct request *req) indirect_page = list_first_entry(&info->indirect_pages, struct page, lru); list_del(&indirect_page->lru); - pfn = page_to_pfn(indirect_page); + page = indirect_page; } - gnt_list_entry = get_grant(&gref_head, pfn, info); + gnt_list_entry = get_grant(&gref_head, page, info); info->shadow[id].indirect_grants[n] = gnt_list_entry; - segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); + segments = kmap_atomic(gnt_list_entry->page); ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; } - gnt_list_entry = get_grant(&gref_head, page_to_pfn(sg_page(sg)), info); + gnt_list_entry = get_grant(&gref_head, sg_page(sg), info); ref = gnt_list_entry->gref; info->shadow[id].grants_used[i] = gnt_list_entry; @@ -560,7 +560,7 @@ static int blkif_queue_rw_req(struct request *req) BUG_ON(sg->offset + sg->length > PAGE_SIZE); - shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); + shared_data = kmap_atomic(gnt_list_entry->page); bvec_data = kmap_atomic(sg_page(sg)); /* @@ -1001,7 +1001,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) info->persistent_gnts_c--; } if (info->feature_persistent) - __free_page(pfn_to_page(persistent_gnt->pfn)); + __free_page(persistent_gnt->page); kfree(persistent_gnt); } } @@ -1036,7 +1036,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) persistent_gnt = info->shadow[i].grants_used[j]; gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); if (info->feature_persistent) - __free_page(pfn_to_page(persistent_gnt->pfn)); + __free_page(persistent_gnt->page); kfree(persistent_gnt); } @@ -1050,7 +1050,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) for (j = 0; j < INDIRECT_GREFS(segs); j++) { persistent_gnt = info->shadow[i].indirect_grants[j]; gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); - __free_page(pfn_to_page(persistent_gnt->pfn)); + __free_page(persistent_gnt->page); kfree(persistent_gnt); } @@ -1101,8 +1101,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, if (bret->operation == BLKIF_OP_READ && info->feature_persistent) { for_each_sg(s->sg, sg, nseg, i) { BUG_ON(sg->offset + sg->length > PAGE_SIZE); - shared_data = kmap_atomic( - pfn_to_page(s->grants_used[i]->pfn)); + shared_data = kmap_atomic(s->grants_used[i]->page); bvec_data = kmap_atomic(sg_page(sg)); memcpy(bvec_data + sg->offset, shared_data + sg->offset, @@ -1154,7 +1153,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, * available pages for indirect grefs. */ if (!info->feature_persistent) { - indirect_page = pfn_to_page(s->indirect_grants[i]->pfn); + indirect_page = s->indirect_grants[i]->page; list_add(&indirect_page->lru, &info->indirect_pages); } s->indirect_grants[i]->gref = GRANT_INVALID_REF; -- cgit v1.2.3 From 4f503fbdf319e4411aa48852b8922c93a9cc0c5d Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Wed, 1 Jul 2015 14:10:38 +0100 Subject: block/xen-blkfront: split get_grant in 2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prepare the code to support 64KB page granularity. The first implementation will use a full Linux page per indirect and persistent grant. When non-persistent grant is used, each page of a bio request may be split in multiple grant. Furthermore, the field page of the grant structure is only used to copy data from persistent grant or indirect grant. Avoid to set it for other use case as it will have no meaning given the page will be split in multiple grant. Provide 2 functions, to setup indirect grant, the other for bio page. Signed-off-by: Julien Grall Acked-by: Roger Pau Monné Signed-off-by: David Vrabel --- drivers/block/xen-blkfront.c | 88 +++++++++++++++++++++++++++++--------------- 1 file changed, 59 insertions(+), 29 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 7552ca8f8ffe..f5dfc1694679 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -244,34 +244,77 @@ out_of_memory: return -ENOMEM; } -static struct grant *get_grant(grant_ref_t *gref_head, - struct page *page, - struct blkfront_info *info) +static struct grant *get_free_grant(struct blkfront_info *info) { struct grant *gnt_list_entry; - unsigned long buffer_gfn; BUG_ON(list_empty(&info->grants)); gnt_list_entry = list_first_entry(&info->grants, struct grant, - node); + node); list_del(&gnt_list_entry->node); - if (gnt_list_entry->gref != GRANT_INVALID_REF) { + if (gnt_list_entry->gref != GRANT_INVALID_REF) info->persistent_gnts_c--; + + return gnt_list_entry; +} + +static inline void grant_foreign_access(const struct grant *gnt_list_entry, + const struct blkfront_info *info) +{ + gnttab_page_grant_foreign_access_ref_one(gnt_list_entry->gref, + info->xbdev->otherend_id, + gnt_list_entry->page, + 0); +} + +static struct grant *get_grant(grant_ref_t *gref_head, + unsigned long gfn, + struct blkfront_info *info) +{ + struct grant *gnt_list_entry = get_free_grant(info); + + if (gnt_list_entry->gref != GRANT_INVALID_REF) return gnt_list_entry; + + /* Assign a gref to this page */ + gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); + BUG_ON(gnt_list_entry->gref == -ENOSPC); + if (info->feature_persistent) + grant_foreign_access(gnt_list_entry, info); + else { + /* Grant access to the GFN passed by the caller */ + gnttab_grant_foreign_access_ref(gnt_list_entry->gref, + info->xbdev->otherend_id, + gfn, 0); } + return gnt_list_entry; +} + +static struct grant *get_indirect_grant(grant_ref_t *gref_head, + struct blkfront_info *info) +{ + struct grant *gnt_list_entry = get_free_grant(info); + + if (gnt_list_entry->gref != GRANT_INVALID_REF) + return gnt_list_entry; + /* Assign a gref to this page */ gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); BUG_ON(gnt_list_entry->gref == -ENOSPC); if (!info->feature_persistent) { - BUG_ON(!page); - gnt_list_entry->page = page; + struct page *indirect_page; + + /* Fetch a pre-allocated page to use for indirect grefs */ + BUG_ON(list_empty(&info->indirect_pages)); + indirect_page = list_first_entry(&info->indirect_pages, + struct page, lru); + list_del(&indirect_page->lru); + gnt_list_entry->page = indirect_page; } - buffer_gfn = xen_page_to_gfn(gnt_list_entry->page); - gnttab_grant_foreign_access_ref(gnt_list_entry->gref, - info->xbdev->otherend_id, - buffer_gfn, 0); + grant_foreign_access(gnt_list_entry, info); + return gnt_list_entry; } @@ -524,32 +567,19 @@ static int blkif_queue_rw_req(struct request *req) if ((ring_req->operation == BLKIF_OP_INDIRECT) && (i % SEGS_PER_INDIRECT_FRAME == 0)) { - struct page *uninitialized_var(page); - if (segments) kunmap_atomic(segments); n = i / SEGS_PER_INDIRECT_FRAME; - if (!info->feature_persistent) { - struct page *indirect_page; - - /* - * Fetch a pre-allocated page to use for - * indirect grefs - */ - BUG_ON(list_empty(&info->indirect_pages)); - indirect_page = list_first_entry(&info->indirect_pages, - struct page, lru); - list_del(&indirect_page->lru); - page = indirect_page; - } - gnt_list_entry = get_grant(&gref_head, page, info); + gnt_list_entry = get_indirect_grant(&gref_head, info); info->shadow[id].indirect_grants[n] = gnt_list_entry; segments = kmap_atomic(gnt_list_entry->page); ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; } - gnt_list_entry = get_grant(&gref_head, sg_page(sg), info); + gnt_list_entry = get_grant(&gref_head, + xen_page_to_gfn(sg_page(sg)), + info); ref = gnt_list_entry->gref; info->shadow[id].grants_used[i] = gnt_list_entry; -- cgit v1.2.3 From 36f8abd36febf1c6f67dae26ec7a87be44629138 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Mon, 11 May 2015 13:44:21 +0100 Subject: xen/biomerge: Don't allow biovec's to be merged when Linux is not using 4KB pages On ARM all dma-capable devices on a same platform may not be protected by an IOMMU. The DMA requests have to use the BFN (i.e MFN on ARM) in order to use correctly the device. While the DOM0 memory is allocated in a 1:1 fashion (PFN == MFN), grant mapping will screw this contiguous mapping. When Linux is using 64KB page granularitary, the page may be split accross multiple non-contiguous MFN (Xen is using 4KB page granularity). Therefore a DMA request will likely fail. Checking that a 64KB page is using contiguous MFN is tedious. For now, always says that biovec are not mergeable. Signed-off-by: Julien Grall Reviewed-by: Stefano Stabellini Signed-off-by: David Vrabel --- drivers/xen/biomerge.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c index 8ae2fc90e1ea..4da69dbf7dca 100644 --- a/drivers/xen/biomerge.c +++ b/drivers/xen/biomerge.c @@ -6,10 +6,18 @@ bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, const struct bio_vec *vec2) { +#if XEN_PAGE_SIZE == PAGE_SIZE unsigned long bfn1 = pfn_to_bfn(page_to_pfn(vec1->bv_page)); unsigned long bfn2 = pfn_to_bfn(page_to_pfn(vec2->bv_page)); return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) && ((bfn1 == bfn2) || ((bfn1+1) == bfn2)); +#else + /* + * XXX: Add support for merging bio_vec when using different page + * size in Xen and Linux. + */ + return 0; +#endif } EXPORT_SYMBOL(xen_biovec_phys_mergeable); -- cgit v1.2.3 From 7d567928db59cb249e5539ebb63890e24431669a Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Tue, 5 May 2015 16:38:27 +0100 Subject: xen/xenbus: Use Xen page definition All the ring (xenstore, and PV rings) are always based on the page granularity of Xen. Signed-off-by: Julien Grall Reviewed-by: David Vrabel Reviewed-by: Stefano Stabellini Signed-off-by: David Vrabel --- drivers/xen/xenbus/xenbus_client.c | 6 +++--- drivers/xen/xenbus/xenbus_probe.c | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index aa304d05101b..42abee3bbb27 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -388,7 +388,7 @@ int xenbus_grant_ring(struct xenbus_device *dev, void *vaddr, } grefs[i] = err; - vaddr = vaddr + PAGE_SIZE; + vaddr = vaddr + XEN_PAGE_SIZE; } return 0; @@ -555,7 +555,7 @@ static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev, if (!node) return -ENOMEM; - area = alloc_vm_area(PAGE_SIZE * nr_grefs, ptes); + area = alloc_vm_area(XEN_PAGE_SIZE * nr_grefs, ptes); if (!area) { kfree(node); return -ENOMEM; @@ -749,7 +749,7 @@ static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr) unsigned long addr; memset(&unmap[i], 0, sizeof(unmap[i])); - addr = (unsigned long)vaddr + (PAGE_SIZE * i); + addr = (unsigned long)vaddr + (XEN_PAGE_SIZE * i); unmap[i].host_addr = arbitrary_virt_to_machine( lookup_address(addr, &level)).maddr; unmap[i].dev_bus_addr = 0; diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 3cbe0556de26..33a31cfef55d 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -802,7 +802,8 @@ static int __init xenbus_init(void) goto out_error; xen_store_gfn = (unsigned long)v; xen_store_interface = - xen_remap(xen_store_gfn << PAGE_SHIFT, PAGE_SIZE); + xen_remap(xen_store_gfn << XEN_PAGE_SHIFT, + XEN_PAGE_SIZE); break; default: pr_warn("Xenstore state unknown\n"); -- cgit v1.2.3 From 9652c08012580c9961c77fc8726a877e0d437324 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Tue, 5 May 2015 16:58:43 +0100 Subject: tty/hvc: xen: Use xen page definition The console ring is always based on the page granularity of Xen. Signed-off-by: Julien Grall Reviewed-by: Stefano Stabellini Signed-off-by: David Vrabel --- drivers/tty/hvc/hvc_xen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/tty/hvc/hvc_xen.c b/drivers/tty/hvc/hvc_xen.c index 10beb1589d83..fa816b7193b6 100644 --- a/drivers/tty/hvc/hvc_xen.c +++ b/drivers/tty/hvc/hvc_xen.c @@ -230,7 +230,7 @@ static int xen_hvm_console_init(void) if (r < 0 || v == 0) goto err; gfn = v; - info->intf = xen_remap(gfn << PAGE_SHIFT, PAGE_SIZE); + info->intf = xen_remap(gfn << XEN_PAGE_SHIFT, XEN_PAGE_SIZE); if (info->intf == NULL) goto err; info->vtermno = HVC_COOKIE; @@ -472,7 +472,7 @@ static int xencons_resume(struct xenbus_device *dev) struct xencons_info *info = dev_get_drvdata(&dev->dev); xencons_disconnect_backend(info); - memset(info->intf, 0, PAGE_SIZE); + memset(info->intf, 0, XEN_PAGE_SIZE); return xencons_connect_backend(dev, info); } -- cgit v1.2.3 From 30756c62997822894fb34e2114f5dc727a12af30 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Mon, 4 May 2015 15:39:08 +0100 Subject: xen/balloon: Don't rely on the page granularity is the same for Xen and Linux For ARM64 guests, Linux is able to support either 64K or 4K page granularity. Although, the hypercall interface is always based on 4K page granularity. With 64K page granularity, a single page will be spread over multiple Xen frame. To avoid splitting the page into 4K frame, take advantage of the extent_order field to directly allocate/free chunk of the Linux page size. Note that PVMMU is only used for PV guest (which is x86) and the page granularity is always 4KB. Some BUILD_BUG_ON has been added to ensure that because the code has not been modified. Signed-off-by: Julien Grall Reviewed-by: Stefano Stabellini Signed-off-by: David Vrabel --- drivers/xen/balloon.c | 70 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 15 deletions(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index f56662324a47..b50d22960ce7 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -112,6 +112,12 @@ static struct ctl_table xen_root[] = { #endif +/* + * Use one extent per PAGE_SIZE to avoid to break down the page into + * multiple frame. + */ +#define EXTENT_ORDER (fls(XEN_PFN_PER_PAGE) - 1) + /* * balloon_process() state: * @@ -304,6 +310,12 @@ static enum bp_state reserve_additional_memory(void) nid = memory_add_physaddr_to_nid(resource->start); #ifdef CONFIG_XEN_HAVE_PVMMU + /* + * We don't support PV MMU when Linux and Xen is using + * different page granularity. + */ + BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE); + /* * add_memory() will build page tables for the new memory so * the p2m must contain invalid entries so the correct @@ -384,11 +396,11 @@ static bool balloon_is_inflated(void) static enum bp_state increase_reservation(unsigned long nr_pages) { int rc; - unsigned long pfn, i; + unsigned long i; struct page *page; struct xen_memory_reservation reservation = { .address_bits = 0, - .extent_order = 0, + .extent_order = EXTENT_ORDER, .domid = DOMID_SELF }; @@ -401,7 +413,11 @@ static enum bp_state increase_reservation(unsigned long nr_pages) nr_pages = i; break; } - frame_list[i] = page_to_pfn(page); + + /* XENMEM_populate_physmap requires a PFN based on Xen + * granularity. + */ + frame_list[i] = page_to_xen_pfn(page); page = balloon_next_page(page); } @@ -415,10 +431,16 @@ static enum bp_state increase_reservation(unsigned long nr_pages) page = balloon_retrieve(false); BUG_ON(page == NULL); - pfn = page_to_pfn(page); - #ifdef CONFIG_XEN_HAVE_PVMMU + /* + * We don't support PV MMU when Linux and Xen is using + * different page granularity. + */ + BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE); + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + unsigned long pfn = page_to_pfn(page); + set_phys_to_machine(pfn, frame_list[i]); /* Link back into the page tables if not highmem. */ @@ -445,14 +467,15 @@ static enum bp_state increase_reservation(unsigned long nr_pages) static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) { enum bp_state state = BP_DONE; - unsigned long pfn, i; - struct page *page; + unsigned long i; + struct page *page, *tmp; int ret; struct xen_memory_reservation reservation = { .address_bits = 0, - .extent_order = 0, + .extent_order = EXTENT_ORDER, .domid = DOMID_SELF }; + LIST_HEAD(pages); if (nr_pages > ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); @@ -465,8 +488,7 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) break; } scrub_page(page); - - frame_list[i] = page_to_pfn(page); + list_add(&page->lru, &pages); } /* @@ -478,14 +500,25 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) */ kmap_flush_unused(); - /* Update direct mapping, invalidate P2M, and add to balloon. */ - for (i = 0; i < nr_pages; i++) { - pfn = frame_list[i]; - frame_list[i] = pfn_to_gfn(pfn); - page = pfn_to_page(pfn); + /* + * Setup the frame, update direct mapping, invalidate P2M, + * and add to balloon. + */ + i = 0; + list_for_each_entry_safe(page, tmp, &pages, lru) { + /* XENMEM_decrease_reservation requires a GFN */ + frame_list[i++] = xen_page_to_gfn(page); #ifdef CONFIG_XEN_HAVE_PVMMU + /* + * We don't support PV MMU when Linux and Xen is using + * different page granularity. + */ + BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE); + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + unsigned long pfn = page_to_pfn(page); + if (!PageHighMem(page)) { ret = HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), @@ -495,6 +528,7 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } #endif + list_del(&page->lru); balloon_append(page); } @@ -603,6 +637,12 @@ int alloc_xenballooned_pages(int nr_pages, struct page **pages) if (page) { pages[pgno++] = page; #ifdef CONFIG_XEN_HAVE_PVMMU + /* + * We don't support PV MMU when Linux and Xen is using + * different page granularity. + */ + BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE); + ret = xen_alloc_p2m_entry(page_to_pfn(page)); if (ret < 0) goto out_undo; -- cgit v1.2.3 From a001c9d95c4ea96589461d58e77c96416a303e2c Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Tue, 5 May 2015 16:37:30 +0100 Subject: xen/events: fifo: Make it running on 64KB granularity Only use the first 4KB of the page to store the events channel info. It means that we will waste 60KB every time we allocate page for: * control block: a page is allocating per CPU * event array: a page is allocating everytime we need to expand it I think we can reduce the memory waste for the 2 areas by: * control block: sharing between multiple vCPUs. Although it will require some bookkeeping in order to not free the page when the CPU goes offline and the other CPUs sharing the page still there * event array: always extend the array event by 64K (i.e 16 4K chunk). That would require more care when we fail to expand the event channel. Signed-off-by: Julien Grall Reviewed-by: David Vrabel Reviewed-by: Stefano Stabellini Signed-off-by: David Vrabel --- drivers/xen/events/events_base.c | 2 +- drivers/xen/events/events_fifo.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 6cd5e65c4aff..849500e4e14d 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -40,11 +40,11 @@ #include #include #include -#include #endif #include #include #include +#include #include #include diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c index 1d4baf56c36b..e3e9e3d46d1b 100644 --- a/drivers/xen/events/events_fifo.c +++ b/drivers/xen/events/events_fifo.c @@ -54,7 +54,7 @@ #include "events_internal.h" -#define EVENT_WORDS_PER_PAGE (PAGE_SIZE / sizeof(event_word_t)) +#define EVENT_WORDS_PER_PAGE (XEN_PAGE_SIZE / sizeof(event_word_t)) #define MAX_EVENT_ARRAY_PAGES (EVTCHN_FIFO_NR_CHANNELS / EVENT_WORDS_PER_PAGE) struct evtchn_fifo_queue { -- cgit v1.2.3 From 5ed5451d997f7a86c62a5557efc00dc3836dc559 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Tue, 5 May 2015 16:37:49 +0100 Subject: xen/grant-table: Make it running on 64KB granularity The Xen interface is using 4KB page granularity. This means that each grant is 4KB. The current implementation allocates a Linux page per grant. On Linux using 64KB page granularity, only the first 4KB of the page will be used. We could decrease the memory wasted by sharing the page with multiple grant. It will require some care with the {Set,Clear}ForeignPage macro. Note that no changes has been made in the x86 code because both Linux and Xen will only use 4KB page granularity. Signed-off-by: Julien Grall Reviewed-by: David Vrabel Reviewed-by: Stefano Stabellini Signed-off-by: David Vrabel --- arch/arm/xen/p2m.c | 6 +++--- drivers/xen/grant-table.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm/xen/p2m.c b/arch/arm/xen/p2m.c index 887596c67b12..0ed01f2d5ee4 100644 --- a/arch/arm/xen/p2m.c +++ b/arch/arm/xen/p2m.c @@ -93,8 +93,8 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, for (i = 0; i < count; i++) { if (map_ops[i].status) continue; - set_phys_to_machine(map_ops[i].host_addr >> PAGE_SHIFT, - map_ops[i].dev_bus_addr >> PAGE_SHIFT); + set_phys_to_machine(map_ops[i].host_addr >> XEN_PAGE_SHIFT, + map_ops[i].dev_bus_addr >> XEN_PAGE_SHIFT); } return 0; @@ -108,7 +108,7 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, int i; for (i = 0; i < count; i++) { - set_phys_to_machine(unmap_ops[i].host_addr >> PAGE_SHIFT, + set_phys_to_machine(unmap_ops[i].host_addr >> XEN_PAGE_SHIFT, INVALID_P2M_ENTRY); } diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index dbeaa67dec47..72d633962095 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -642,7 +642,7 @@ int gnttab_setup_auto_xlat_frames(phys_addr_t addr) if (xen_auto_xlat_grant_frames.count) return -EINVAL; - vaddr = xen_remap(addr, PAGE_SIZE * max_nr_gframes); + vaddr = xen_remap(addr, XEN_PAGE_SIZE * max_nr_gframes); if (vaddr == NULL) { pr_warn("Failed to ioremap gnttab share frames (addr=%pa)!\n", &addr); @@ -654,7 +654,7 @@ int gnttab_setup_auto_xlat_frames(phys_addr_t addr) return -ENOMEM; } for (i = 0; i < max_nr_gframes; i++) - pfn[i] = PFN_DOWN(addr) + i; + pfn[i] = XEN_PFN_DOWN(addr) + i; xen_auto_xlat_grant_frames.vaddr = vaddr; xen_auto_xlat_grant_frames.pfn = pfn; @@ -1004,7 +1004,7 @@ static void gnttab_request_version(void) { /* Only version 1 is used, which will always be available. */ grant_table_version = 1; - grefs_per_grant_frame = PAGE_SIZE / sizeof(struct grant_entry_v1); + grefs_per_grant_frame = XEN_PAGE_SIZE / sizeof(struct grant_entry_v1); gnttab_interface = &gnttab_v1_ops; pr_info("Grant tables using version %d layout\n", grant_table_version); -- cgit v1.2.3 From c004a6fe0c405e2aa91b2a88aa1428724e6d06f6 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Wed, 22 Jul 2015 16:44:54 +0100 Subject: block/xen-blkfront: Make it running on 64KB page granularity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PV block protocol is using 4KB page granularity. The goal of this patch is to allow a Linux using 64KB page granularity using block device on a non-modified Xen. The block API is using segment which should at least be the size of a Linux page. Therefore, the driver will have to break the page in chunk of 4K before giving the page to the backend. When breaking a 64KB segment in 4KB chunks, it is possible that some chunks are empty. As the PV protocol always require to have data in the chunk, we have to count the number of Xen page which will be in use and avoid sending empty chunks. Note that, a pre-defined number of grants are reserved before preparing the request. This pre-defined number is based on the number and the maximum size of the segments. If each segment contains a very small amount of data, the driver may reserve too many grants (16 grants is reserved per segment with 64KB page granularity). Furthermore, in the case of persistent grants we allocate one Linux page per grant although only the first 4KB of the page will be effectively in use. This could be improved by sharing the page with multiple grants. Signed-off-by: Julien Grall Acked-by: Roger Pau Monné Signed-off-by: David Vrabel --- drivers/block/xen-blkfront.c | 324 ++++++++++++++++++++++++++++--------------- 1 file changed, 213 insertions(+), 111 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index f5dfc1694679..057e05da83d1 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -78,6 +78,7 @@ struct blk_shadow { struct grant **grants_used; struct grant **indirect_grants; struct scatterlist *sg; + unsigned int num_sg; }; struct split_bio { @@ -106,8 +107,12 @@ static unsigned int xen_blkif_max_ring_order; module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO); MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring"); -#define BLK_RING_SIZE(info) __CONST_RING_SIZE(blkif, PAGE_SIZE * (info)->nr_ring_pages) -#define BLK_MAX_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * XENBUS_MAX_RING_PAGES) +#define BLK_RING_SIZE(info) \ + __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * (info)->nr_ring_pages) + +#define BLK_MAX_RING_SIZE \ + __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_PAGES) + /* * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 * characters are enough. Define to 20 to keep consist with backend. @@ -146,6 +151,7 @@ struct blkfront_info unsigned int discard_granularity; unsigned int discard_alignment; unsigned int feature_persistent:1; + /* Number of 4KB segments handled */ unsigned int max_indirect_segments; int is_ready; struct blk_mq_tag_set tag_set; @@ -174,10 +180,23 @@ static DEFINE_SPINLOCK(minor_lock); #define DEV_NAME "xvd" /* name in /dev */ -#define SEGS_PER_INDIRECT_FRAME \ - (PAGE_SIZE/sizeof(struct blkif_request_segment)) -#define INDIRECT_GREFS(_segs) \ - ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) +/* + * Grants are always the same size as a Xen page (i.e 4KB). + * A physical segment is always the same size as a Linux page. + * Number of grants per physical segment + */ +#define GRANTS_PER_PSEG (PAGE_SIZE / XEN_PAGE_SIZE) + +#define GRANTS_PER_INDIRECT_FRAME \ + (XEN_PAGE_SIZE / sizeof(struct blkif_request_segment)) + +#define PSEGS_PER_INDIRECT_FRAME \ + (GRANTS_INDIRECT_FRAME / GRANTS_PSEGS) + +#define INDIRECT_GREFS(_grants) \ + DIV_ROUND_UP(_grants, GRANTS_PER_INDIRECT_FRAME) + +#define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG) static int blkfront_setup_indirect(struct blkfront_info *info); static int blkfront_gather_backend_features(struct blkfront_info *info); @@ -465,14 +484,100 @@ static int blkif_queue_discard_req(struct request *req) return 0; } +struct setup_rw_req { + unsigned int grant_idx; + struct blkif_request_segment *segments; + struct blkfront_info *info; + struct blkif_request *ring_req; + grant_ref_t gref_head; + unsigned int id; + /* Only used when persistent grant is used and it's a read request */ + bool need_copy; + unsigned int bvec_off; + char *bvec_data; +}; + +static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, + unsigned int len, void *data) +{ + struct setup_rw_req *setup = data; + int n, ref; + struct grant *gnt_list_entry; + unsigned int fsect, lsect; + /* Convenient aliases */ + unsigned int grant_idx = setup->grant_idx; + struct blkif_request *ring_req = setup->ring_req; + struct blkfront_info *info = setup->info; + struct blk_shadow *shadow = &info->shadow[setup->id]; + + if ((ring_req->operation == BLKIF_OP_INDIRECT) && + (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) { + if (setup->segments) + kunmap_atomic(setup->segments); + + n = grant_idx / GRANTS_PER_INDIRECT_FRAME; + gnt_list_entry = get_indirect_grant(&setup->gref_head, info); + shadow->indirect_grants[n] = gnt_list_entry; + setup->segments = kmap_atomic(gnt_list_entry->page); + ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; + } + + gnt_list_entry = get_grant(&setup->gref_head, gfn, info); + ref = gnt_list_entry->gref; + shadow->grants_used[grant_idx] = gnt_list_entry; + + if (setup->need_copy) { + void *shared_data; + + shared_data = kmap_atomic(gnt_list_entry->page); + /* + * this does not wipe data stored outside the + * range sg->offset..sg->offset+sg->length. + * Therefore, blkback *could* see data from + * previous requests. This is OK as long as + * persistent grants are shared with just one + * domain. It may need refactoring if this + * changes + */ + memcpy(shared_data + offset, + setup->bvec_data + setup->bvec_off, + len); + + kunmap_atomic(shared_data); + setup->bvec_off += len; + } + + fsect = offset >> 9; + lsect = fsect + (len >> 9) - 1; + if (ring_req->operation != BLKIF_OP_INDIRECT) { + ring_req->u.rw.seg[grant_idx] = + (struct blkif_request_segment) { + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; + } else { + setup->segments[grant_idx % GRANTS_PER_INDIRECT_FRAME] = + (struct blkif_request_segment) { + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; + } + + (setup->grant_idx)++; +} + static int blkif_queue_rw_req(struct request *req) { struct blkfront_info *info = req->rq_disk->private_data; struct blkif_request *ring_req; unsigned long id; - unsigned int fsect, lsect; - int i, ref, n; - struct blkif_request_segment *segments = NULL; + int i; + struct setup_rw_req setup = { + .grant_idx = 0, + .segments = NULL, + .info = info, + .need_copy = rq_data_dir(req) && info->feature_persistent, + }; /* * Used to store if we are able to queue the request by just using @@ -480,25 +585,23 @@ static int blkif_queue_rw_req(struct request *req) * as there are not sufficiently many free. */ bool new_persistent_gnts; - grant_ref_t gref_head; - struct grant *gnt_list_entry = NULL; struct scatterlist *sg; - int nseg, max_grefs; + int num_sg, max_grefs, num_grant; - max_grefs = req->nr_phys_segments; + max_grefs = req->nr_phys_segments * GRANTS_PER_PSEG; if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST) /* * If we are using indirect segments we need to account * for the indirect grefs used in the request. */ - max_grefs += INDIRECT_GREFS(req->nr_phys_segments); + max_grefs += INDIRECT_GREFS(max_grefs); /* Check if we have enough grants to allocate a requests */ if (info->persistent_gnts_c < max_grefs) { new_persistent_gnts = 1; if (gnttab_alloc_grant_references( max_grefs - info->persistent_gnts_c, - &gref_head) < 0) { + &setup.gref_head) < 0) { gnttab_request_free_callback( &info->callback, blkif_restart_queue_callback, @@ -515,12 +618,19 @@ static int blkif_queue_rw_req(struct request *req) info->shadow[id].request = req; BUG_ON(info->max_indirect_segments == 0 && - req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); + GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST); BUG_ON(info->max_indirect_segments && - req->nr_phys_segments > info->max_indirect_segments); - nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); + GREFS(req->nr_phys_segments) > info->max_indirect_segments); + + num_sg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); + num_grant = 0; + /* Calculate the number of grant used */ + for_each_sg(info->shadow[id].sg, sg, num_sg, i) + num_grant += gnttab_count_grant(sg->offset, sg->length); + ring_req->u.rw.id = id; - if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) { + info->shadow[id].num_sg = num_sg; + if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) { /* * The indirect operation can only be a BLKIF_OP_READ or * BLKIF_OP_WRITE @@ -531,7 +641,7 @@ static int blkif_queue_rw_req(struct request *req) BLKIF_OP_WRITE : BLKIF_OP_READ; ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req); ring_req->u.indirect.handle = info->handle; - ring_req->u.indirect.nr_segments = nseg; + ring_req->u.indirect.nr_segments = num_grant; } else { ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); ring_req->u.rw.handle = info->handle; @@ -559,73 +669,30 @@ static int blkif_queue_rw_req(struct request *req) ring_req->operation = 0; } } - ring_req->u.rw.nr_segments = nseg; - } - for_each_sg(info->shadow[id].sg, sg, nseg, i) { - fsect = sg->offset >> 9; - lsect = fsect + (sg->length >> 9) - 1; - - if ((ring_req->operation == BLKIF_OP_INDIRECT) && - (i % SEGS_PER_INDIRECT_FRAME == 0)) { - if (segments) - kunmap_atomic(segments); - - n = i / SEGS_PER_INDIRECT_FRAME; - gnt_list_entry = get_indirect_grant(&gref_head, info); - info->shadow[id].indirect_grants[n] = gnt_list_entry; - segments = kmap_atomic(gnt_list_entry->page); - ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; - } - - gnt_list_entry = get_grant(&gref_head, - xen_page_to_gfn(sg_page(sg)), - info); - ref = gnt_list_entry->gref; - - info->shadow[id].grants_used[i] = gnt_list_entry; - - if (rq_data_dir(req) && info->feature_persistent) { - char *bvec_data; - void *shared_data; + ring_req->u.rw.nr_segments = num_grant; + } - BUG_ON(sg->offset + sg->length > PAGE_SIZE); + setup.ring_req = ring_req; + setup.id = id; + for_each_sg(info->shadow[id].sg, sg, num_sg, i) { + BUG_ON(sg->offset + sg->length > PAGE_SIZE); - shared_data = kmap_atomic(gnt_list_entry->page); - bvec_data = kmap_atomic(sg_page(sg)); + if (setup.need_copy) { + setup.bvec_off = sg->offset; + setup.bvec_data = kmap_atomic(sg_page(sg)); + } - /* - * this does not wipe data stored outside the - * range sg->offset..sg->offset+sg->length. - * Therefore, blkback *could* see data from - * previous requests. This is OK as long as - * persistent grants are shared with just one - * domain. It may need refactoring if this - * changes - */ - memcpy(shared_data + sg->offset, - bvec_data + sg->offset, - sg->length); + gnttab_foreach_grant_in_range(sg_page(sg), + sg->offset, + sg->length, + blkif_setup_rw_req_grant, + &setup); - kunmap_atomic(bvec_data); - kunmap_atomic(shared_data); - } - if (ring_req->operation != BLKIF_OP_INDIRECT) { - ring_req->u.rw.seg[i] = - (struct blkif_request_segment) { - .gref = ref, - .first_sect = fsect, - .last_sect = lsect }; - } else { - n = i % SEGS_PER_INDIRECT_FRAME; - segments[n] = - (struct blkif_request_segment) { - .gref = ref, - .first_sect = fsect, - .last_sect = lsect }; - } + if (setup.need_copy) + kunmap_atomic(setup.bvec_data); } - if (segments) - kunmap_atomic(segments); + if (setup.segments) + kunmap_atomic(setup.segments); info->ring.req_prod_pvt++; @@ -633,7 +700,7 @@ static int blkif_queue_rw_req(struct request *req) info->shadow[id].req = *ring_req; if (new_persistent_gnts) - gnttab_free_grant_references(gref_head); + gnttab_free_grant_references(setup.gref_head); return 0; } @@ -750,14 +817,14 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, /* Hard sector size and max sectors impersonate the equiv. hardware. */ blk_queue_logical_block_size(rq, sector_size); blk_queue_physical_block_size(rq, physical_sector_size); - blk_queue_max_hw_sectors(rq, (segments * PAGE_SIZE) / 512); + blk_queue_max_hw_sectors(rq, (segments * XEN_PAGE_SIZE) / 512); /* Each segment in a request is up to an aligned page in size. */ blk_queue_segment_boundary(rq, PAGE_SIZE - 1); blk_queue_max_segment_size(rq, PAGE_SIZE); /* Ensure a merged request will fit in a single I/O ring slot. */ - blk_queue_max_segments(rq, segments); + blk_queue_max_segments(rq, segments / GRANTS_PER_PSEG); /* Make sure buffer addresses are sector-aligned. */ blk_queue_dma_alignment(rq, 511); @@ -1116,32 +1183,65 @@ free_shadow: } +struct copy_from_grant { + const struct blk_shadow *s; + unsigned int grant_idx; + unsigned int bvec_offset; + char *bvec_data; +}; + +static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset, + unsigned int len, void *data) +{ + struct copy_from_grant *info = data; + char *shared_data; + /* Convenient aliases */ + const struct blk_shadow *s = info->s; + + shared_data = kmap_atomic(s->grants_used[info->grant_idx]->page); + + memcpy(info->bvec_data + info->bvec_offset, + shared_data + offset, len); + + info->bvec_offset += len; + info->grant_idx++; + + kunmap_atomic(shared_data); +} + static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, struct blkif_response *bret) { int i = 0; struct scatterlist *sg; - char *bvec_data; - void *shared_data; - int nseg; + int num_sg, num_grant; + struct copy_from_grant data = { + .s = s, + .grant_idx = 0, + }; - nseg = s->req.operation == BLKIF_OP_INDIRECT ? + num_grant = s->req.operation == BLKIF_OP_INDIRECT ? s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; + num_sg = s->num_sg; if (bret->operation == BLKIF_OP_READ && info->feature_persistent) { - for_each_sg(s->sg, sg, nseg, i) { + for_each_sg(s->sg, sg, num_sg, i) { BUG_ON(sg->offset + sg->length > PAGE_SIZE); - shared_data = kmap_atomic(s->grants_used[i]->page); - bvec_data = kmap_atomic(sg_page(sg)); - memcpy(bvec_data + sg->offset, - shared_data + sg->offset, - sg->length); - kunmap_atomic(bvec_data); - kunmap_atomic(shared_data); + + data.bvec_offset = sg->offset; + data.bvec_data = kmap_atomic(sg_page(sg)); + + gnttab_foreach_grant_in_range(sg_page(sg), + sg->offset, + sg->length, + blkif_copy_from_grant, + &data); + + kunmap_atomic(data.bvec_data); } } /* Add the persistent grant into the list of free grants */ - for (i = 0; i < nseg; i++) { + for (i = 0; i < num_grant; i++) { if (gnttab_query_foreign_access(s->grants_used[i]->gref)) { /* * If the grant is still mapped by the backend (the @@ -1167,7 +1267,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, } } if (s->req.operation == BLKIF_OP_INDIRECT) { - for (i = 0; i < INDIRECT_GREFS(nseg); i++) { + for (i = 0; i < INDIRECT_GREFS(num_grant); i++) { if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) { if (!info->feature_persistent) pr_alert_ratelimited("backed has not unmapped grant: %u\n", @@ -1312,7 +1412,7 @@ static int setup_blkring(struct xenbus_device *dev, { struct blkif_sring *sring; int err, i; - unsigned long ring_size = info->nr_ring_pages * PAGE_SIZE; + unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE; grant_ref_t gref[XENBUS_MAX_RING_PAGES]; for (i = 0; i < info->nr_ring_pages; i++) @@ -1641,8 +1741,8 @@ static int blkif_recover(struct blkfront_info *info) atomic_set(&split_bio->pending, pending); split_bio->bio = bio; for (i = 0; i < pending; i++) { - offset = (i * segs * PAGE_SIZE) >> 9; - size = min((unsigned int)(segs * PAGE_SIZE) >> 9, + offset = (i * segs * XEN_PAGE_SIZE) >> 9; + size = min((unsigned int)(segs * XEN_PAGE_SIZE) >> 9, (unsigned int)bio_sectors(bio) - offset); cloned_bio = bio_clone(bio, GFP_NOIO); BUG_ON(cloned_bio == NULL); @@ -1753,15 +1853,17 @@ static void blkfront_setup_discard(struct blkfront_info *info) static int blkfront_setup_indirect(struct blkfront_info *info) { - unsigned int segs; + unsigned int psegs, grants; int err, i; if (info->max_indirect_segments == 0) - segs = BLKIF_MAX_SEGMENTS_PER_REQUEST; + grants = BLKIF_MAX_SEGMENTS_PER_REQUEST; else - segs = info->max_indirect_segments; + grants = info->max_indirect_segments; + psegs = grants / GRANTS_PER_PSEG; - err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE(info)); + err = fill_grant_buffer(info, + (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info)); if (err) goto out_of_memory; @@ -1771,7 +1873,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info) * grants, we need to allocate a set of pages that can be * used for mapping indirect grefs */ - int num = INDIRECT_GREFS(segs) * BLK_RING_SIZE(info); + int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info); BUG_ON(!list_empty(&info->indirect_pages)); for (i = 0; i < num; i++) { @@ -1784,20 +1886,20 @@ static int blkfront_setup_indirect(struct blkfront_info *info) for (i = 0; i < BLK_RING_SIZE(info); i++) { info->shadow[i].grants_used = kzalloc( - sizeof(info->shadow[i].grants_used[0]) * segs, + sizeof(info->shadow[i].grants_used[0]) * grants, GFP_NOIO); - info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * segs, GFP_NOIO); + info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * psegs, GFP_NOIO); if (info->max_indirect_segments) info->shadow[i].indirect_grants = kzalloc( sizeof(info->shadow[i].indirect_grants[0]) * - INDIRECT_GREFS(segs), + INDIRECT_GREFS(grants), GFP_NOIO); if ((info->shadow[i].grants_used == NULL) || (info->shadow[i].sg == NULL) || (info->max_indirect_segments && (info->shadow[i].indirect_grants == NULL))) goto out_of_memory; - sg_init_table(info->shadow[i].sg, segs); + sg_init_table(info->shadow[i].sg, psegs); } -- cgit v1.2.3 From 67de5dfbc176ea86ab0278658b5d55f64207ff2d Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Tue, 5 May 2015 16:25:56 +0100 Subject: block/xen-blkback: Make it running on 64KB page granularity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PV block protocol is using 4KB page granularity. The goal of this patch is to allow a Linux using 64KB page granularity behaving as a block backend on a non-modified Xen. It's only necessary to adapt the ring size and the number of request per indirect frames. The rest of the code is relying on the grant table code. Note that the grant table code is allocating a Linux page per grant which will result to waste 6OKB for every grant when Linux is using 64KB page granularity. This could be improved by sharing the page between multiple grants. Signed-off-by: Julien Grall Acked-by: "Roger Pau Monné" Signed-off-by: David Vrabel --- drivers/block/xen-blkback/blkback.c | 5 +++-- drivers/block/xen-blkback/common.h | 17 +++++++++++++---- drivers/block/xen-blkback/xenbus.c | 9 ++++++--- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 6a685aec6994..809634ce3b67 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -961,7 +961,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, seg[n].nsec = segments[i].last_sect - segments[i].first_sect + 1; seg[n].offset = (segments[i].first_sect << 9); - if ((segments[i].last_sect >= (PAGE_SIZE >> 9)) || + if ((segments[i].last_sect >= (XEN_PAGE_SIZE >> 9)) || (segments[i].last_sect < segments[i].first_sect)) { rc = -EINVAL; goto unmap; @@ -1210,6 +1210,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, req_operation = req->operation == BLKIF_OP_INDIRECT ? req->u.indirect.indirect_op : req->operation; + if ((req->operation == BLKIF_OP_INDIRECT) && (req_operation != BLKIF_OP_READ) && (req_operation != BLKIF_OP_WRITE)) { @@ -1268,7 +1269,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, seg[i].nsec = req->u.rw.seg[i].last_sect - req->u.rw.seg[i].first_sect + 1; seg[i].offset = (req->u.rw.seg[i].first_sect << 9); - if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) || + if ((req->u.rw.seg[i].last_sect >= (XEN_PAGE_SIZE >> 9)) || (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect)) goto fail_response; diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 45a044a53d1e..68e87a037b99 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -51,12 +52,20 @@ extern unsigned int xen_blkif_max_ring_order; */ #define MAX_INDIRECT_SEGMENTS 256 -#define SEGS_PER_INDIRECT_FRAME \ - (PAGE_SIZE/sizeof(struct blkif_request_segment)) +/* + * Xen use 4K pages. The guest may use different page size (4K or 64K) + * Number of Xen pages per segment + */ +#define XEN_PAGES_PER_SEGMENT (PAGE_SIZE / XEN_PAGE_SIZE) + +#define XEN_PAGES_PER_INDIRECT_FRAME \ + (XEN_PAGE_SIZE/sizeof(struct blkif_request_segment)) +#define SEGS_PER_INDIRECT_FRAME \ + (XEN_PAGES_PER_INDIRECT_FRAME / XEN_PAGES_PER_SEGMENT) + #define MAX_INDIRECT_PAGES \ ((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) -#define INDIRECT_PAGES(_segs) \ - ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) +#define INDIRECT_PAGES(_segs) DIV_ROUND_UP(_segs, XEN_PAGES_PER_INDIRECT_FRAME) /* Not a real protocol. Used to generate ring structs which contain * the elements common to all protocols only. This way we get a diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 767657565de6..01c6b41de4e5 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -176,21 +176,24 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, { struct blkif_sring *sring; sring = (struct blkif_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE * nr_grefs); + BACK_RING_INIT(&blkif->blk_rings.native, sring, + XEN_PAGE_SIZE * nr_grefs); break; } case BLKIF_PROTOCOL_X86_32: { struct blkif_x86_32_sring *sring_x86_32; sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE * nr_grefs); + BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, + XEN_PAGE_SIZE * nr_grefs); break; } case BLKIF_PROTOCOL_X86_64: { struct blkif_x86_64_sring *sring_x86_64; sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE * nr_grefs); + BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, + XEN_PAGE_SIZE * nr_grefs); break; } default: -- cgit v1.2.3 From 30c5d7f0da82f55c86c0a09bf21c0623474bb17f Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Fri, 10 Apr 2015 14:42:21 +0100 Subject: net/xen-netfront: Make it running on 64KB page granularity The PV network protocol is using 4KB page granularity. The goal of this patch is to allow a Linux using 64KB page granularity using network device on a non-modified Xen. It's only necessary to adapt the ring size and break skb data in small chunk of 4KB. The rest of the code is relying on the grant table code. Note that we allocate a Linux page for each rx skb but only the first 4KB is used. We may improve the memory usage by extending the size of the rx skb. Signed-off-by: Julien Grall Reviewed-by: David Vrabel Signed-off-by: David Vrabel --- drivers/net/xen-netfront.c | 122 ++++++++++++++++++++++++++++++++------------- 1 file changed, 86 insertions(+), 36 deletions(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index f821a97d7827..badca3101b62 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -74,8 +74,8 @@ struct netfront_cb { #define GRANT_INVALID_REF 0 -#define NET_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE) -#define NET_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE) +#define NET_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, XEN_PAGE_SIZE) +#define NET_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, XEN_PAGE_SIZE) /* Minimum number of Rx slots (includes slot for GSO metadata). */ #define NET_RX_SLOTS_MIN (XEN_NETIF_NR_SLOTS_MIN + 1) @@ -291,7 +291,7 @@ static void xennet_alloc_rx_buffers(struct netfront_queue *queue) struct sk_buff *skb; unsigned short id; grant_ref_t ref; - unsigned long gfn; + struct page *page; struct xen_netif_rx_request *req; skb = xennet_alloc_one_rx_buffer(queue); @@ -307,14 +307,13 @@ static void xennet_alloc_rx_buffers(struct netfront_queue *queue) BUG_ON((signed short)ref < 0); queue->grant_rx_ref[id] = ref; - gfn = xen_page_to_gfn(skb_frag_page(&skb_shinfo(skb)->frags[0])); + page = skb_frag_page(&skb_shinfo(skb)->frags[0]); req = RING_GET_REQUEST(&queue->rx, req_prod); - gnttab_grant_foreign_access_ref(ref, - queue->info->xbdev->otherend_id, - gfn, - 0); - + gnttab_page_grant_foreign_access_ref_one(ref, + queue->info->xbdev->otherend_id, + page, + 0); req->id = id; req->gref = ref; } @@ -415,25 +414,33 @@ static void xennet_tx_buf_gc(struct netfront_queue *queue) xennet_maybe_wake_tx(queue); } -static struct xen_netif_tx_request *xennet_make_one_txreq( - struct netfront_queue *queue, struct sk_buff *skb, - struct page *page, unsigned int offset, unsigned int len) +struct xennet_gnttab_make_txreq { + struct netfront_queue *queue; + struct sk_buff *skb; + struct page *page; + struct xen_netif_tx_request *tx; /* Last request */ + unsigned int size; +}; + +static void xennet_tx_setup_grant(unsigned long gfn, unsigned int offset, + unsigned int len, void *data) { + struct xennet_gnttab_make_txreq *info = data; unsigned int id; struct xen_netif_tx_request *tx; grant_ref_t ref; - - len = min_t(unsigned int, PAGE_SIZE - offset, len); + /* convenient aliases */ + struct page *page = info->page; + struct netfront_queue *queue = info->queue; + struct sk_buff *skb = info->skb; id = get_id_from_freelist(&queue->tx_skb_freelist, queue->tx_skbs); tx = RING_GET_REQUEST(&queue->tx, queue->tx.req_prod_pvt++); ref = gnttab_claim_grant_reference(&queue->gref_tx_head); BUG_ON((signed short)ref < 0); - gnttab_grant_foreign_access_ref(ref, - queue->info->xbdev->otherend_id, - xen_page_to_gfn(page), - GNTMAP_readonly); + gnttab_grant_foreign_access_ref(ref, queue->info->xbdev->otherend_id, + gfn, GNTMAP_readonly); queue->tx_skbs[id].skb = skb; queue->grant_tx_page[id] = page; @@ -445,7 +452,34 @@ static struct xen_netif_tx_request *xennet_make_one_txreq( tx->size = len; tx->flags = 0; - return tx; + info->tx = tx; + info->size += tx->size; +} + +static struct xen_netif_tx_request *xennet_make_first_txreq( + struct netfront_queue *queue, struct sk_buff *skb, + struct page *page, unsigned int offset, unsigned int len) +{ + struct xennet_gnttab_make_txreq info = { + .queue = queue, + .skb = skb, + .page = page, + .size = 0, + }; + + gnttab_for_one_grant(page, offset, len, xennet_tx_setup_grant, &info); + + return info.tx; +} + +static void xennet_make_one_txreq(unsigned long gfn, unsigned int offset, + unsigned int len, void *data) +{ + struct xennet_gnttab_make_txreq *info = data; + + info->tx->flags |= XEN_NETTXF_more_data; + skb_get(info->skb); + xennet_tx_setup_grant(gfn, offset, len, data); } static struct xen_netif_tx_request *xennet_make_txreqs( @@ -453,20 +487,30 @@ static struct xen_netif_tx_request *xennet_make_txreqs( struct sk_buff *skb, struct page *page, unsigned int offset, unsigned int len) { + struct xennet_gnttab_make_txreq info = { + .queue = queue, + .skb = skb, + .tx = tx, + }; + /* Skip unused frames from start of page */ page += offset >> PAGE_SHIFT; offset &= ~PAGE_MASK; while (len) { - tx->flags |= XEN_NETTXF_more_data; - tx = xennet_make_one_txreq(queue, skb_get(skb), - page, offset, len); + info.page = page; + info.size = 0; + + gnttab_foreach_grant_in_range(page, offset, len, + xennet_make_one_txreq, + &info); + page++; offset = 0; - len -= tx->size; + len -= info.size; } - return tx; + return info.tx; } /* @@ -476,9 +520,10 @@ static struct xen_netif_tx_request *xennet_make_txreqs( static int xennet_count_skb_slots(struct sk_buff *skb) { int i, frags = skb_shinfo(skb)->nr_frags; - int pages; + int slots; - pages = PFN_UP(offset_in_page(skb->data) + skb_headlen(skb)); + slots = gnttab_count_grant(offset_in_page(skb->data), + skb_headlen(skb)); for (i = 0; i < frags; i++) { skb_frag_t *frag = skb_shinfo(skb)->frags + i; @@ -488,10 +533,10 @@ static int xennet_count_skb_slots(struct sk_buff *skb) /* Skip unused frames from start of page */ offset &= ~PAGE_MASK; - pages += PFN_UP(offset + size); + slots += gnttab_count_grant(offset, size); } - return pages; + return slots; } static u16 xennet_select_queue(struct net_device *dev, struct sk_buff *skb, @@ -512,6 +557,8 @@ static u16 xennet_select_queue(struct net_device *dev, struct sk_buff *skb, return queue_idx; } +#define MAX_XEN_SKB_FRAGS (65536 / XEN_PAGE_SIZE + 1) + static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct netfront_info *np = netdev_priv(dev); @@ -546,7 +593,7 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev) } slots = xennet_count_skb_slots(skb); - if (unlikely(slots > MAX_SKB_FRAGS + 1)) { + if (unlikely(slots > MAX_XEN_SKB_FRAGS + 1)) { net_dbg_ratelimited("xennet: skb rides the rocket: %d slots, %d bytes\n", slots, skb->len); if (skb_linearize(skb)) @@ -567,10 +614,13 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev) } /* First request for the linear area. */ - first_tx = tx = xennet_make_one_txreq(queue, skb, - page, offset, len); - page++; - offset = 0; + first_tx = tx = xennet_make_first_txreq(queue, skb, + page, offset, len); + offset += tx->size; + if (offset == PAGE_SIZE) { + page++; + offset = 0; + } len -= tx->size; if (skb->ip_summed == CHECKSUM_PARTIAL) @@ -732,7 +782,7 @@ static int xennet_get_responses(struct netfront_queue *queue, for (;;) { if (unlikely(rx->status < 0 || - rx->offset + rx->status > PAGE_SIZE)) { + rx->offset + rx->status > XEN_PAGE_SIZE)) { if (net_ratelimit()) dev_warn(dev, "rx->offset: %u, size: %d\n", rx->offset, rx->status); @@ -1496,7 +1546,7 @@ static int setup_netfront(struct xenbus_device *dev, goto fail; } SHARED_RING_INIT(txs); - FRONT_RING_INIT(&queue->tx, txs, PAGE_SIZE); + FRONT_RING_INIT(&queue->tx, txs, XEN_PAGE_SIZE); err = xenbus_grant_ring(dev, txs, 1, &gref); if (err < 0) @@ -1510,7 +1560,7 @@ static int setup_netfront(struct xenbus_device *dev, goto alloc_rx_ring_fail; } SHARED_RING_INIT(rxs); - FRONT_RING_INIT(&queue->rx, rxs, PAGE_SIZE); + FRONT_RING_INIT(&queue->rx, rxs, XEN_PAGE_SIZE); err = xenbus_grant_ring(dev, rxs, 1, &gref); if (err < 0) -- cgit v1.2.3 From d0089e8a0e4c9723d85b01713671358e3d6960df Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Tue, 5 May 2015 13:15:29 +0100 Subject: net/xen-netback: Make it running on 64KB page granularity The PV network protocol is using 4KB page granularity. The goal of this patch is to allow a Linux using 64KB page granularity working as a network backend on a non-modified Xen. It's only necessary to adapt the ring size and break skb data in small chunk of 4KB. The rest of the code is relying on the grant table code. Signed-off-by: Julien Grall Reviewed-by: Wei Liu Signed-off-by: David Vrabel --- drivers/net/xen-netback/common.h | 16 ++-- drivers/net/xen-netback/netback.c | 157 ++++++++++++++++++++++++-------------- 2 files changed, 111 insertions(+), 62 deletions(-) diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index a7bf74727116..0333ab0fd926 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -44,6 +44,7 @@ #include #include #include +#include #include typedef unsigned int pending_ring_idx_t; @@ -64,8 +65,8 @@ struct pending_tx_info { struct ubuf_info callback_struct; }; -#define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE) -#define XEN_NETIF_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE) +#define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, XEN_PAGE_SIZE) +#define XEN_NETIF_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, XEN_PAGE_SIZE) struct xenvif_rx_meta { int id; @@ -80,16 +81,21 @@ struct xenvif_rx_meta { /* Discriminate from any valid pending_idx value. */ #define INVALID_PENDING_IDX 0xFFFF -#define MAX_BUFFER_OFFSET PAGE_SIZE +#define MAX_BUFFER_OFFSET XEN_PAGE_SIZE #define MAX_PENDING_REQS XEN_NETIF_TX_RING_SIZE +/* The maximum number of frags is derived from the size of a grant (same + * as a Xen page size for now). + */ +#define MAX_XEN_SKB_FRAGS (65536 / XEN_PAGE_SIZE + 1) + /* It's possible for an skb to have a maximal number of frags * but still be less than MAX_BUFFER_OFFSET in size. Thus the - * worst-case number of copy operations is MAX_SKB_FRAGS per + * worst-case number of copy operations is MAX_XEN_SKB_FRAGS per * ring slot. */ -#define MAX_GRANT_COPY_OPS (MAX_SKB_FRAGS * XEN_NETIF_RX_RING_SIZE) +#define MAX_GRANT_COPY_OPS (MAX_XEN_SKB_FRAGS * XEN_NETIF_RX_RING_SIZE) #define NETBACK_INVALID_HANDLE -1 diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index c4e6c025d64d..e481f3710bd3 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -152,9 +152,9 @@ static inline pending_ring_idx_t pending_index(unsigned i) static int xenvif_rx_ring_slots_needed(struct xenvif *vif) { if (vif->gso_mask) - return DIV_ROUND_UP(vif->dev->gso_max_size, PAGE_SIZE) + 1; + return DIV_ROUND_UP(vif->dev->gso_max_size, XEN_PAGE_SIZE) + 1; else - return DIV_ROUND_UP(vif->dev->mtu, PAGE_SIZE); + return DIV_ROUND_UP(vif->dev->mtu, XEN_PAGE_SIZE); } static bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue) @@ -274,6 +274,80 @@ static struct xenvif_rx_meta *get_next_rx_buffer(struct xenvif_queue *queue, return meta; } +struct gop_frag_copy { + struct xenvif_queue *queue; + struct netrx_pending_operations *npo; + struct xenvif_rx_meta *meta; + int head; + int gso_type; + + struct page *page; +}; + +static void xenvif_setup_copy_gop(unsigned long gfn, + unsigned int offset, + unsigned int *len, + struct gop_frag_copy *info) +{ + struct gnttab_copy *copy_gop; + struct xen_page_foreign *foreign; + /* Convenient aliases */ + struct xenvif_queue *queue = info->queue; + struct netrx_pending_operations *npo = info->npo; + struct page *page = info->page; + + BUG_ON(npo->copy_off > MAX_BUFFER_OFFSET); + + if (npo->copy_off == MAX_BUFFER_OFFSET) + info->meta = get_next_rx_buffer(queue, npo); + + if (npo->copy_off + *len > MAX_BUFFER_OFFSET) + *len = MAX_BUFFER_OFFSET - npo->copy_off; + + copy_gop = npo->copy + npo->copy_prod++; + copy_gop->flags = GNTCOPY_dest_gref; + copy_gop->len = *len; + + foreign = xen_page_foreign(page); + if (foreign) { + copy_gop->source.domid = foreign->domid; + copy_gop->source.u.ref = foreign->gref; + copy_gop->flags |= GNTCOPY_source_gref; + } else { + copy_gop->source.domid = DOMID_SELF; + copy_gop->source.u.gmfn = gfn; + } + copy_gop->source.offset = offset; + + copy_gop->dest.domid = queue->vif->domid; + copy_gop->dest.offset = npo->copy_off; + copy_gop->dest.u.ref = npo->copy_gref; + + npo->copy_off += *len; + info->meta->size += *len; + + /* Leave a gap for the GSO descriptor. */ + if (info->head && ((1 << info->gso_type) & queue->vif->gso_mask)) + queue->rx.req_cons++; + + info->head = 0; /* There must be something in this buffer now */ +} + +static void xenvif_gop_frag_copy_grant(unsigned long gfn, + unsigned offset, + unsigned int len, + void *data) +{ + unsigned int bytes; + + while (len) { + bytes = len; + xenvif_setup_copy_gop(gfn, offset, &bytes, data); + offset += bytes; + len -= bytes; + } +} + /* * Set up the grant operations for this fragment. If it's a flipping * interface, we also set up the unmap request from here. @@ -283,83 +357,52 @@ static void xenvif_gop_frag_copy(struct xenvif_queue *queue, struct sk_buff *skb struct page *page, unsigned long size, unsigned long offset, int *head) { - struct gnttab_copy *copy_gop; - struct xenvif_rx_meta *meta; + struct gop_frag_copy info = { + .queue = queue, + .npo = npo, + .head = *head, + .gso_type = XEN_NETIF_GSO_TYPE_NONE, + }; unsigned long bytes; - int gso_type = XEN_NETIF_GSO_TYPE_NONE; if (skb_is_gso(skb)) { if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) - gso_type = XEN_NETIF_GSO_TYPE_TCPV4; + info.gso_type = XEN_NETIF_GSO_TYPE_TCPV4; else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) - gso_type = XEN_NETIF_GSO_TYPE_TCPV6; + info.gso_type = XEN_NETIF_GSO_TYPE_TCPV6; } /* Data must not cross a page boundary. */ BUG_ON(size + offset > PAGE_SIZE<meta + npo->meta_prod - 1; + info.meta = npo->meta + npo->meta_prod - 1; /* Skip unused frames from start of page */ page += offset >> PAGE_SHIFT; offset &= ~PAGE_MASK; while (size > 0) { - struct xen_page_foreign *foreign; - BUG_ON(offset >= PAGE_SIZE); - BUG_ON(npo->copy_off > MAX_BUFFER_OFFSET); - - if (npo->copy_off == MAX_BUFFER_OFFSET) - meta = get_next_rx_buffer(queue, npo); bytes = PAGE_SIZE - offset; if (bytes > size) bytes = size; - if (npo->copy_off + bytes > MAX_BUFFER_OFFSET) - bytes = MAX_BUFFER_OFFSET - npo->copy_off; - - copy_gop = npo->copy + npo->copy_prod++; - copy_gop->flags = GNTCOPY_dest_gref; - copy_gop->len = bytes; - - foreign = xen_page_foreign(page); - if (foreign) { - copy_gop->source.domid = foreign->domid; - copy_gop->source.u.ref = foreign->gref; - copy_gop->flags |= GNTCOPY_source_gref; - } else { - copy_gop->source.domid = DOMID_SELF; - copy_gop->source.u.gmfn = - virt_to_gfn(page_address(page)); - } - copy_gop->source.offset = offset; - - copy_gop->dest.domid = queue->vif->domid; - copy_gop->dest.offset = npo->copy_off; - copy_gop->dest.u.ref = npo->copy_gref; - - npo->copy_off += bytes; - meta->size += bytes; - - offset += bytes; + info.page = page; + gnttab_foreach_grant_in_range(page, offset, bytes, + xenvif_gop_frag_copy_grant, + &info); size -= bytes; + offset = 0; - /* Next frame */ - if (offset == PAGE_SIZE && size) { + /* Next page */ + if (size) { BUG_ON(!PageCompound(page)); page++; - offset = 0; } - - /* Leave a gap for the GSO descriptor. */ - if (*head && ((1 << gso_type) & queue->vif->gso_mask)) - queue->rx.req_cons++; - - *head = 0; /* There must be something in this buffer now. */ - } + + *head = info.head; } /* @@ -758,7 +801,7 @@ static int xenvif_count_requests(struct xenvif_queue *queue, first->size -= txp->size; slots++; - if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) { + if (unlikely((txp->offset + txp->size) > XEN_PAGE_SIZE)) { netdev_err(queue->vif->dev, "Cross page boundary, txp->offset: %u, size: %u\n", txp->offset, txp->size); xenvif_fatal_tx_err(queue->vif); @@ -1339,11 +1382,11 @@ static void xenvif_tx_build_gops(struct xenvif_queue *queue, } /* No crossing a page as the payload mustn't fragment. */ - if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) { + if (unlikely((txreq.offset + txreq.size) > XEN_PAGE_SIZE)) { netdev_err(queue->vif->dev, "txreq.offset: %u, size: %u, end: %lu\n", txreq.offset, txreq.size, - (unsigned long)(txreq.offset&~PAGE_MASK) + txreq.size); + (unsigned long)(txreq.offset&~XEN_PAGE_MASK) + txreq.size); xenvif_fatal_tx_err(queue->vif); break; } @@ -1409,7 +1452,7 @@ static void xenvif_tx_build_gops(struct xenvif_queue *queue, virt_to_gfn(skb->data); queue->tx_copy_ops[*copy_ops].dest.domid = DOMID_SELF; queue->tx_copy_ops[*copy_ops].dest.offset = - offset_in_page(skb->data); + offset_in_page(skb->data) & ~XEN_PAGE_MASK; queue->tx_copy_ops[*copy_ops].len = data_len; queue->tx_copy_ops[*copy_ops].flags = GNTCOPY_source_gref; @@ -1894,7 +1937,7 @@ int xenvif_map_frontend_rings(struct xenvif_queue *queue, goto err; txs = (struct xen_netif_tx_sring *)addr; - BACK_RING_INIT(&queue->tx, txs, PAGE_SIZE); + BACK_RING_INIT(&queue->tx, txs, XEN_PAGE_SIZE); err = xenbus_map_ring_valloc(xenvif_to_xenbus_device(queue->vif), &rx_ring_ref, 1, &addr); @@ -1902,7 +1945,7 @@ int xenvif_map_frontend_rings(struct xenvif_queue *queue, goto err; rxs = (struct xen_netif_rx_sring *)addr; - BACK_RING_INIT(&queue->rx, rxs, PAGE_SIZE); + BACK_RING_INIT(&queue->rx, rxs, XEN_PAGE_SIZE); return 0; -- cgit v1.2.3 From 5995a68a6272e4e8f4fe4de82cdc877e650fe8be Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Tue, 5 May 2015 16:54:12 +0100 Subject: xen/privcmd: Add support for Linux 64KB page granularity The hypercall interface (as well as the toolstack) is always using 4KB page granularity. When the toolstack is asking for mapping a series of guest PFN in a batch, it expects to have the page map contiguously in its virtual memory. When Linux is using 64KB page granularity, the privcmd driver will have to map multiple Xen PFN in a single Linux page. Note that this solution works on page granularity which is a multiple of 4KB. Signed-off-by: Julien Grall Reviewed-by: David Vrabel Signed-off-by: David Vrabel --- drivers/xen/privcmd.c | 8 ++-- drivers/xen/xlate_mmu.c | 124 ++++++++++++++++++++++++++++++++---------------- 2 files changed, 89 insertions(+), 43 deletions(-) diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index b199ad3d4587..df2e6f783318 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -446,7 +446,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) return -EINVAL; } - nr_pages = m.num; + nr_pages = DIV_ROUND_UP(m.num, XEN_PFN_PER_PAGE); if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) return -EINVAL; @@ -494,7 +494,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) goto out_unlock; } if (xen_feature(XENFEAT_auto_translated_physmap)) { - ret = alloc_empty_pages(vma, m.num); + ret = alloc_empty_pages(vma, nr_pages); if (ret < 0) goto out_unlock; } else @@ -518,6 +518,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) state.global_error = 0; state.version = version; + BUILD_BUG_ON(((PAGE_SIZE / sizeof(xen_pfn_t)) % XEN_PFN_PER_PAGE) != 0); /* mmap_batch_fn guarantees ret == 0 */ BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t), &pagelist, mmap_batch_fn, &state)); @@ -582,12 +583,13 @@ static void privcmd_close(struct vm_area_struct *vma) { struct page **pages = vma->vm_private_data; int numpgs = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + int numgfns = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT; int rc; if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages) return; - rc = xen_unmap_domain_gfn_range(vma, numpgs, pages); + rc = xen_unmap_domain_gfn_range(vma, numgfns, pages); if (rc == 0) free_xenballooned_pages(numpgs, pages); else diff --git a/drivers/xen/xlate_mmu.c b/drivers/xen/xlate_mmu.c index cff23872c5a9..5063c5e796b7 100644 --- a/drivers/xen/xlate_mmu.c +++ b/drivers/xen/xlate_mmu.c @@ -38,31 +38,28 @@ #include #include -/* map fgfn of domid to lpfn in the current domain */ -static int map_foreign_page(unsigned long lpfn, unsigned long fgfn, - unsigned int domid) -{ - int rc; - struct xen_add_to_physmap_range xatp = { - .domid = DOMID_SELF, - .foreign_domid = domid, - .size = 1, - .space = XENMAPSPACE_gmfn_foreign, - }; - xen_ulong_t idx = fgfn; - xen_pfn_t gpfn = lpfn; - int err = 0; +typedef void (*xen_gfn_fn_t)(unsigned long gfn, void *data); - set_xen_guest_handle(xatp.idxs, &idx); - set_xen_guest_handle(xatp.gpfns, &gpfn); - set_xen_guest_handle(xatp.errs, &err); +/* Break down the pages in 4KB chunk and call fn for each gfn */ +static void xen_for_each_gfn(struct page **pages, unsigned nr_gfn, + xen_gfn_fn_t fn, void *data) +{ + unsigned long xen_pfn = 0; + struct page *page; + int i; - rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp); - return rc < 0 ? rc : err; + for (i = 0; i < nr_gfn; i++) { + if ((i % XEN_PFN_PER_PAGE) == 0) { + page = pages[i / XEN_PFN_PER_PAGE]; + xen_pfn = page_to_xen_pfn(page); + } + fn(pfn_to_gfn(xen_pfn++), data); + } } struct remap_data { xen_pfn_t *fgfn; /* foreign domain's gfn */ + int nr_fgfn; /* Number of foreign gfn left to map */ pgprot_t prot; domid_t domid; struct vm_area_struct *vma; @@ -71,24 +68,71 @@ struct remap_data { struct xen_remap_gfn_info *info; int *err_ptr; int mapped; + + /* Hypercall parameters */ + int h_errs[XEN_PFN_PER_PAGE]; + xen_ulong_t h_idxs[XEN_PFN_PER_PAGE]; + xen_pfn_t h_gpfns[XEN_PFN_PER_PAGE]; + + int h_iter; /* Iterator */ }; +static void setup_hparams(unsigned long gfn, void *data) +{ + struct remap_data *info = data; + + info->h_idxs[info->h_iter] = *info->fgfn; + info->h_gpfns[info->h_iter] = gfn; + info->h_errs[info->h_iter] = 0; + + info->h_iter++; + info->fgfn++; +} + static int remap_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, void *data) { struct remap_data *info = data; struct page *page = info->pages[info->index++]; - unsigned long pfn = page_to_pfn(page); - pte_t pte = pte_mkspecial(pfn_pte(pfn, info->prot)); - int rc; + pte_t pte = pte_mkspecial(pfn_pte(page_to_pfn(page), info->prot)); + int rc, nr_gfn; + uint32_t i; + struct xen_add_to_physmap_range xatp = { + .domid = DOMID_SELF, + .foreign_domid = info->domid, + .space = XENMAPSPACE_gmfn_foreign, + }; - rc = map_foreign_page(pfn, *info->fgfn, info->domid); - *info->err_ptr++ = rc; - if (!rc) { - set_pte_at(info->vma->vm_mm, addr, ptep, pte); - info->mapped++; + nr_gfn = min_t(typeof(info->nr_fgfn), XEN_PFN_PER_PAGE, info->nr_fgfn); + info->nr_fgfn -= nr_gfn; + + info->h_iter = 0; + xen_for_each_gfn(&page, nr_gfn, setup_hparams, info); + BUG_ON(info->h_iter != nr_gfn); + + set_xen_guest_handle(xatp.idxs, info->h_idxs); + set_xen_guest_handle(xatp.gpfns, info->h_gpfns); + set_xen_guest_handle(xatp.errs, info->h_errs); + xatp.size = nr_gfn; + + rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp); + + /* info->err_ptr expect to have one error status per Xen PFN */ + for (i = 0; i < nr_gfn; i++) { + int err = (rc < 0) ? rc : info->h_errs[i]; + + *(info->err_ptr++) = err; + if (!err) + info->mapped++; } - info->fgfn++; + + /* + * Note: The hypercall will return 0 in most of the case if even if + * all the fgmfn are not mapped. We still have to update the pte + * as the userspace may decide to continue. + */ + if (!rc) + set_pte_at(info->vma->vm_mm, addr, ptep, pte); return 0; } @@ -102,13 +146,14 @@ int xen_xlate_remap_gfn_array(struct vm_area_struct *vma, { int err; struct remap_data data; - unsigned long range = nr << PAGE_SHIFT; + unsigned long range = DIV_ROUND_UP(nr, XEN_PFN_PER_PAGE) << PAGE_SHIFT; /* Kept here for the purpose of making sure code doesn't break x86 PVOPS */ BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); data.fgfn = gfn; + data.nr_fgfn = nr; data.prot = prot; data.domid = domid; data.vma = vma; @@ -123,21 +168,20 @@ int xen_xlate_remap_gfn_array(struct vm_area_struct *vma, } EXPORT_SYMBOL_GPL(xen_xlate_remap_gfn_array); -int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma, - int nr, struct page **pages) +static void unmap_gfn(unsigned long gfn, void *data) { - int i; + struct xen_remove_from_physmap xrp; - for (i = 0; i < nr; i++) { - struct xen_remove_from_physmap xrp; - unsigned long pfn; + xrp.domid = DOMID_SELF; + xrp.gpfn = gfn; + (void)HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp); +} - pfn = page_to_pfn(pages[i]); +int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma, + int nr, struct page **pages) +{ + xen_for_each_gfn(pages, nr, unmap_gfn, NULL); - xrp.domid = DOMID_SELF; - xrp.gpfn = pfn; - (void)HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp); - } return 0; } EXPORT_SYMBOL_GPL(xen_xlate_unmap_gfn_range); -- cgit v1.2.3 From 250c9af3d831139317009eaebbe82e20d23a581f Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Tue, 5 May 2015 16:36:56 +0100 Subject: arm/xen: Add support for 64KB page granularity The hypercall interface is always using 4KB page granularity. This is requiring to use xen page definition macro when we deal with hypercall. Note that pfn_to_gfn is working with a Xen pfn (i.e 4KB). We may want to rename pfn_gfn to make this explicit. We also allocate a 64KB page for the shared page even though only the first 4KB is used. I don't think this is really important for now as it helps to have the pointer 4KB aligned (XENMEM_add_to_physmap is taking a Xen PFN). Signed-off-by: Julien Grall Reviewed-by: Stefano Stabellini Signed-off-by: David Vrabel --- arch/arm/include/asm/xen/page.h | 15 +++++++++++++-- arch/arm/xen/enlighten.c | 6 +++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/arch/arm/include/asm/xen/page.h b/arch/arm/include/asm/xen/page.h index 98c9fc38b945..e3d94cfa4d46 100644 --- a/arch/arm/include/asm/xen/page.h +++ b/arch/arm/include/asm/xen/page.h @@ -28,6 +28,17 @@ typedef struct xpaddr { #define INVALID_P2M_ENTRY (~0UL) +/* + * The pseudo-physical frame (pfn) used in all the helpers is always based + * on Xen page granularity (i.e 4KB). + * + * A Linux page may be split across multiple non-contiguous Xen page so we + * have to keep track with frame based on 4KB page granularity. + * + * PV drivers should never make a direct usage of those helpers (particularly + * pfn_to_gfn and gfn_to_pfn). + */ + unsigned long __pfn_to_mfn(unsigned long pfn); extern struct rb_root phys_to_mach; @@ -64,8 +75,8 @@ static inline unsigned long bfn_to_pfn(unsigned long bfn) #define bfn_to_local_pfn(bfn) bfn_to_pfn(bfn) /* VIRT <-> GUEST conversion */ -#define virt_to_gfn(v) (pfn_to_gfn(virt_to_pfn(v))) -#define gfn_to_virt(m) (__va(gfn_to_pfn(m) << PAGE_SHIFT)) +#define virt_to_gfn(v) (pfn_to_gfn(virt_to_phys(v) >> XEN_PAGE_SHIFT)) +#define gfn_to_virt(m) (__va(gfn_to_pfn(m) << XEN_PAGE_SHIFT)) /* Only used in PV code. But ARM guests are always HVM. */ static inline xmaddr_t arbitrary_virt_to_machine(void *vaddr) diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index eeeab074e154..50b4769aad8b 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -89,8 +89,8 @@ static void xen_percpu_init(void) pr_info("Xen: initializing cpu%d\n", cpu); vcpup = per_cpu_ptr(xen_vcpu_info, cpu); - info.mfn = __pa(vcpup) >> PAGE_SHIFT; - info.offset = offset_in_page(vcpup); + info.mfn = virt_to_gfn(vcpup); + info.offset = xen_offset_in_page(vcpup); err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info); BUG_ON(err); @@ -213,7 +213,7 @@ static int __init xen_guest_init(void) xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; - xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; + xatp.gpfn = virt_to_gfn(shared_info_page); if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) BUG(); -- cgit v1.2.3 From 291be10fd7511101d44cf98166d049bd31bc7600 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Wed, 9 Sep 2015 15:17:33 +0100 Subject: xen/swiotlb: Pass addresses rather than frame numbers to xen_arch_need_swiotlb With 64KB page granularity support, the frame number will be different. It will be easier to modify the behavior in a single place rather than in each caller. Signed-off-by: Julien Grall Reviewed-by: Stefano Stabellini Signed-off-by: David Vrabel --- arch/arm/include/asm/xen/page.h | 4 ++-- arch/arm/xen/mm.c | 7 +++++-- arch/x86/include/asm/xen/page.h | 4 ++-- drivers/xen/swiotlb-xen.c | 4 ++-- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/arm/include/asm/xen/page.h b/arch/arm/include/asm/xen/page.h index e3d94cfa4d46..415dbc6e43fd 100644 --- a/arch/arm/include/asm/xen/page.h +++ b/arch/arm/include/asm/xen/page.h @@ -115,8 +115,8 @@ static inline bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) #define xen_unmap(cookie) iounmap((cookie)) bool xen_arch_need_swiotlb(struct device *dev, - unsigned long pfn, - unsigned long bfn); + phys_addr_t phys, + dma_addr_t dev_addr); unsigned long xen_get_swiotlb_free_pages(unsigned int order); #endif /* _ASM_ARM_XEN_PAGE_H */ diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index 6dd911d1f0ac..7b517e913762 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -138,9 +138,12 @@ void __xen_dma_sync_single_for_device(struct device *hwdev, } bool xen_arch_need_swiotlb(struct device *dev, - unsigned long pfn, - unsigned long bfn) + phys_addr_t phys, + dma_addr_t dev_addr) { + unsigned long pfn = PFN_DOWN(phys); + unsigned long bfn = PFN_DOWN(dev_addr); + return (!hypercall_cflush && (pfn != bfn) && !is_device_dma_coherent(dev)); } diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index fe58e3a935de..f5fb840b43e8 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -298,8 +298,8 @@ void make_lowmem_page_readwrite(void *vaddr); #define xen_unmap(cookie) iounmap((cookie)) static inline bool xen_arch_need_swiotlb(struct device *dev, - unsigned long pfn, - unsigned long bfn) + phys_addr_t phys, + dma_addr_t dev_addr) { return false; } diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 79bc4933b13e..0a5a0e949862 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -392,7 +392,7 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, */ if (dma_capable(dev, dev_addr, size) && !range_straddles_page_boundary(phys, size) && - !xen_arch_need_swiotlb(dev, PFN_DOWN(phys), PFN_DOWN(dev_addr)) && + !xen_arch_need_swiotlb(dev, phys, dev_addr) && !swiotlb_force) { /* we are not interested in the dma_addr returned by * xen_dma_map_page, only in the potential cache flushes executed @@ -551,7 +551,7 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, dma_addr_t dev_addr = xen_phys_to_bus(paddr); if (swiotlb_force || - xen_arch_need_swiotlb(hwdev, PFN_DOWN(paddr), PFN_DOWN(dev_addr)) || + xen_arch_need_swiotlb(hwdev, paddr, dev_addr) || !dma_capable(hwdev, dev_addr, sg->length) || range_straddles_page_boundary(paddr, sg->length)) { phys_addr_t map = swiotlb_tbl_map_single(hwdev, -- cgit v1.2.3 From 9435cce87950d805e6c8315410f2cb8ff6b2c6a2 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Wed, 9 Sep 2015 15:18:45 +0100 Subject: xen/swiotlb: Add support for 64KB page granularity Swiotlb is used on ARM64 to support DMA on platform where devices are not protected by an SMMU. Furthermore it's only enabled for DOM0. While Xen is always using 4KB page granularity in the stage-2 page table, Linux ARM64 may either use 4KB or 64KB. This means that a Linux page can be spanned accross multiple Xen page. The Swiotlb code has to validate that the buffer used for DMA is physically contiguous in the memory. As a Linux page can't be shared between local memory and foreign page by design (the balloon code always removing entirely a Linux page), the changes in the code are very minimal because we only need to check the first Xen PFN. Note that it may be possible to optimize the function check_page_physically_contiguous to avoid looping over every Xen PFN for local memory. Although I will let this optimization for a follow-up. Signed-off-by: Julien Grall Reviewed-by: Stefano Stabellini Signed-off-by: David Vrabel --- arch/arm/include/asm/xen/page-coherent.h | 26 +++++++++++++-------- arch/arm/xen/mm.c | 38 ++++++++++++++++++++++--------- drivers/xen/swiotlb-xen.c | 39 ++++++++++++++++---------------- 3 files changed, 63 insertions(+), 40 deletions(-) diff --git a/arch/arm/include/asm/xen/page-coherent.h b/arch/arm/include/asm/xen/page-coherent.h index efd562412850..0375c8caa061 100644 --- a/arch/arm/include/asm/xen/page-coherent.h +++ b/arch/arm/include/asm/xen/page-coherent.h @@ -35,11 +35,15 @@ static inline void xen_dma_map_page(struct device *hwdev, struct page *page, dma_addr_t dev_addr, unsigned long offset, size_t size, enum dma_data_direction dir, struct dma_attrs *attrs) { - bool local = PFN_DOWN(dev_addr) == page_to_pfn(page); - /* Dom0 is mapped 1:1, so if pfn == mfn the page is local otherwise - * is a foreign page grant-mapped in dom0. If the page is local we - * can safely call the native dma_ops function, otherwise we call - * the xen specific function. */ + bool local = XEN_PFN_DOWN(dev_addr) == page_to_xen_pfn(page); + /* + * Dom0 is mapped 1:1, while the Linux page can be spanned accross + * multiple Xen page, it's not possible to have a mix of local and + * foreign Xen page. So if the first xen_pfn == mfn the page is local + * otherwise it's a foreign page grant-mapped in dom0. If the page is + * local we can safely call the native dma_ops function, otherwise we + * call the xen specific function. + */ if (local) __generic_dma_ops(hwdev)->map_page(hwdev, page, offset, size, dir, attrs); else @@ -51,10 +55,14 @@ static inline void xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle, struct dma_attrs *attrs) { unsigned long pfn = PFN_DOWN(handle); - /* Dom0 is mapped 1:1, so calling pfn_valid on a foreign mfn will - * always return false. If the page is local we can safely call the - * native dma_ops function, otherwise we call the xen specific - * function. */ + /* + * Dom0 is mapped 1:1, while the Linux page can be spanned accross + * multiple Xen page, it's not possible to have a mix of local and + * foreign Xen page. Dom0 is mapped 1:1, so calling pfn_valid on a + * foreign mfn will always return false. If the page is local we can + * safely call the native dma_ops function, otherwise we call the xen + * specific function. + */ if (pfn_valid(pfn)) { if (__generic_dma_ops(hwdev)->unmap_page) __generic_dma_ops(hwdev)->unmap_page(hwdev, handle, size, dir, attrs); diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index 7b517e913762..7c34f7126b04 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -48,22 +48,22 @@ static void dma_cache_maint(dma_addr_t handle, unsigned long offset, size_t size, enum dma_data_direction dir, enum dma_cache_op op) { struct gnttab_cache_flush cflush; - unsigned long pfn; + unsigned long xen_pfn; size_t left = size; - pfn = (handle >> PAGE_SHIFT) + offset / PAGE_SIZE; - offset %= PAGE_SIZE; + xen_pfn = (handle >> XEN_PAGE_SHIFT) + offset / XEN_PAGE_SIZE; + offset %= XEN_PAGE_SIZE; do { size_t len = left; /* buffers in highmem or foreign pages cannot cross page * boundaries */ - if (len + offset > PAGE_SIZE) - len = PAGE_SIZE - offset; + if (len + offset > XEN_PAGE_SIZE) + len = XEN_PAGE_SIZE - offset; cflush.op = 0; - cflush.a.dev_bus_addr = pfn << PAGE_SHIFT; + cflush.a.dev_bus_addr = xen_pfn << XEN_PAGE_SHIFT; cflush.offset = offset; cflush.length = len; @@ -79,7 +79,7 @@ static void dma_cache_maint(dma_addr_t handle, unsigned long offset, HYPERVISOR_grant_table_op(GNTTABOP_cache_flush, &cflush, 1); offset = 0; - pfn++; + xen_pfn++; left -= len; } while (left); } @@ -141,10 +141,26 @@ bool xen_arch_need_swiotlb(struct device *dev, phys_addr_t phys, dma_addr_t dev_addr) { - unsigned long pfn = PFN_DOWN(phys); - unsigned long bfn = PFN_DOWN(dev_addr); - - return (!hypercall_cflush && (pfn != bfn) && !is_device_dma_coherent(dev)); + unsigned int xen_pfn = XEN_PFN_DOWN(phys); + unsigned int bfn = XEN_PFN_DOWN(dev_addr); + + /* + * The swiotlb buffer should be used if + * - Xen doesn't have the cache flush hypercall + * - The Linux page refers to foreign memory + * - The device doesn't support coherent DMA request + * + * The Linux page may be spanned acrros multiple Xen page, although + * it's not possible to have a mix of local and foreign Xen page. + * Furthermore, range_straddles_page_boundary is already checking + * if buffer is physically contiguous in the host RAM. + * + * Therefore we only need to check the first Xen page to know if we + * require a bounce buffer because the device doesn't support coherent + * memory and we are not able to flush the cache. + */ + return (!hypercall_cflush && (xen_pfn != bfn) && + !is_device_dma_coherent(dev)); } int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 0a5a0e949862..7399782c0998 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -76,27 +76,27 @@ static unsigned long xen_io_tlb_nslabs; static u64 start_dma_addr; /* - * Both of these functions should avoid PFN_PHYS because phys_addr_t + * Both of these functions should avoid XEN_PFN_PHYS because phys_addr_t * can be 32bit when dma_addr_t is 64bit leading to a loss in * information if the shift is done before casting to 64bit. */ static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr) { - unsigned long bfn = pfn_to_bfn(PFN_DOWN(paddr)); - dma_addr_t dma = (dma_addr_t)bfn << PAGE_SHIFT; + unsigned long bfn = pfn_to_bfn(XEN_PFN_DOWN(paddr)); + dma_addr_t dma = (dma_addr_t)bfn << XEN_PAGE_SHIFT; - dma |= paddr & ~PAGE_MASK; + dma |= paddr & ~XEN_PAGE_MASK; return dma; } static inline phys_addr_t xen_bus_to_phys(dma_addr_t baddr) { - unsigned long pfn = bfn_to_pfn(PFN_DOWN(baddr)); - dma_addr_t dma = (dma_addr_t)pfn << PAGE_SHIFT; + unsigned long xen_pfn = bfn_to_pfn(XEN_PFN_DOWN(baddr)); + dma_addr_t dma = (dma_addr_t)xen_pfn << XEN_PAGE_SHIFT; phys_addr_t paddr = dma; - paddr |= baddr & ~PAGE_MASK; + paddr |= baddr & ~XEN_PAGE_MASK; return paddr; } @@ -106,7 +106,7 @@ static inline dma_addr_t xen_virt_to_bus(void *address) return xen_phys_to_bus(virt_to_phys(address)); } -static int check_pages_physically_contiguous(unsigned long pfn, +static int check_pages_physically_contiguous(unsigned long xen_pfn, unsigned int offset, size_t length) { @@ -114,11 +114,11 @@ static int check_pages_physically_contiguous(unsigned long pfn, int i; int nr_pages; - next_bfn = pfn_to_bfn(pfn); - nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT; + next_bfn = pfn_to_bfn(xen_pfn); + nr_pages = (offset + length + XEN_PAGE_SIZE-1) >> XEN_PAGE_SHIFT; for (i = 1; i < nr_pages; i++) { - if (pfn_to_bfn(++pfn) != ++next_bfn) + if (pfn_to_bfn(++xen_pfn) != ++next_bfn) return 0; } return 1; @@ -126,28 +126,27 @@ static int check_pages_physically_contiguous(unsigned long pfn, static inline int range_straddles_page_boundary(phys_addr_t p, size_t size) { - unsigned long pfn = PFN_DOWN(p); - unsigned int offset = p & ~PAGE_MASK; + unsigned long xen_pfn = XEN_PFN_DOWN(p); + unsigned int offset = p & ~XEN_PAGE_MASK; - if (offset + size <= PAGE_SIZE) + if (offset + size <= XEN_PAGE_SIZE) return 0; - if (check_pages_physically_contiguous(pfn, offset, size)) + if (check_pages_physically_contiguous(xen_pfn, offset, size)) return 0; return 1; } static int is_xen_swiotlb_buffer(dma_addr_t dma_addr) { - unsigned long bfn = PFN_DOWN(dma_addr); - unsigned long pfn = bfn_to_local_pfn(bfn); - phys_addr_t paddr; + unsigned long bfn = XEN_PFN_DOWN(dma_addr); + unsigned long xen_pfn = bfn_to_local_pfn(bfn); + phys_addr_t paddr = XEN_PFN_PHYS(xen_pfn); /* If the address is outside our domain, it CAN * have the same virtual address as another address * in our domain. Therefore _only_ check address within our domain. */ - if (pfn_valid(pfn)) { - paddr = PFN_PHYS(pfn); + if (pfn_valid(PFN_DOWN(paddr))) { return paddr >= virt_to_phys(xen_io_tlb_start) && paddr < virt_to_phys(xen_io_tlb_end); } -- cgit v1.2.3 From 3990dd27034606312429a09c807ea74a6ec32dde Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Wed, 7 Oct 2015 14:04:33 +0100 Subject: xen/balloon: Use the correct sizeof when declaring frame_list The type of the item in frame_list is xen_pfn_t which is not an unsigned long on ARM but an uint64_t. With the current computation, the size of frame_list will be 2 * PAGE_SIZE rather than PAGE_SIZE. I bet it's just mistake when the type has been switched from "unsigned long" to "xen_pfn_t" in commit 965c0aaafe3e75d4e65cd4ec862915869bde3abd "xen: balloon: use correct type for frame_list". Signed-off-by: Julien Grall Signed-off-by: David Vrabel --- drivers/xen/balloon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index b50d22960ce7..12eab503efd1 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -141,7 +141,7 @@ struct balloon_stats balloon_stats; EXPORT_SYMBOL_GPL(balloon_stats); /* We increase/decrease in batches which fit in a page */ -static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)]; +static xen_pfn_t frame_list[PAGE_SIZE / sizeof(xen_pfn_t)]; /* List of ballooned pages, threaded through the mem_map array. */ -- cgit v1.2.3 From 91afb7c373e881d5038a78e1206a0f6469440ec3 Mon Sep 17 00:00:00 2001 From: Mikko Rapeli Date: Thu, 15 Oct 2015 07:56:07 +0200 Subject: xen/gntalloc: use types from linux/types.h in userspace headers __u32, __u64 etc. are preferred for userspace API headers. Signed-off-by: Mikko Rapeli Signed-off-by: David Vrabel --- include/uapi/xen/gntalloc.h | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/include/uapi/xen/gntalloc.h b/include/uapi/xen/gntalloc.h index 76bd58065f4f..48d2790ef928 100644 --- a/include/uapi/xen/gntalloc.h +++ b/include/uapi/xen/gntalloc.h @@ -11,6 +11,8 @@ #ifndef __LINUX_PUBLIC_GNTALLOC_H__ #define __LINUX_PUBLIC_GNTALLOC_H__ +#include + /* * Allocates a new page and creates a new grant reference. */ @@ -19,17 +21,17 @@ _IOC(_IOC_NONE, 'G', 5, sizeof(struct ioctl_gntalloc_alloc_gref)) struct ioctl_gntalloc_alloc_gref { /* IN parameters */ /* The ID of the domain to be given access to the grants. */ - uint16_t domid; + __u16 domid; /* Flags for this mapping */ - uint16_t flags; + __u16 flags; /* Number of pages to map */ - uint32_t count; + __u32 count; /* OUT parameters */ /* The offset to be used on a subsequent call to mmap(). */ - uint64_t index; + __u64 index; /* The grant references of the newly created grant, one per page */ /* Variable size, depending on count */ - uint32_t gref_ids[1]; + __u32 gref_ids[1]; }; #define GNTALLOC_FLAG_WRITABLE 1 @@ -43,9 +45,9 @@ _IOC(_IOC_NONE, 'G', 6, sizeof(struct ioctl_gntalloc_dealloc_gref)) struct ioctl_gntalloc_dealloc_gref { /* IN parameters */ /* The offset returned in the map operation */ - uint64_t index; + __u64 index; /* Number of references to unmap */ - uint32_t count; + __u32 count; }; /* @@ -67,11 +69,11 @@ struct ioctl_gntalloc_unmap_notify { * be cleared. Otherwise, it can be any byte in the page whose * notification we are adjusting. */ - uint64_t index; + __u64 index; /* Action(s) to take on unmap */ - uint32_t action; + __u32 action; /* Event channel to notify */ - uint32_t event_channel_port; + __u32 event_channel_port; }; /* Clear (set to zero) the byte specified by index */ -- cgit v1.2.3 From a36012be64e65760d208c23ea68dc12a895001d8 Mon Sep 17 00:00:00 2001 From: Mikko Rapeli Date: Thu, 15 Oct 2015 07:56:08 +0200 Subject: xen/gntdev: use types from linux/types.h in userspace headers __u32, __u64 etc. are preferred for userspace API headers. Signed-off-by: Mikko Rapeli Signed-off-by: David Vrabel --- include/uapi/xen/gntdev.h | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/include/uapi/xen/gntdev.h b/include/uapi/xen/gntdev.h index 5304bd3c84c5..aa7610a9b867 100644 --- a/include/uapi/xen/gntdev.h +++ b/include/uapi/xen/gntdev.h @@ -33,11 +33,13 @@ #ifndef __LINUX_PUBLIC_GNTDEV_H__ #define __LINUX_PUBLIC_GNTDEV_H__ +#include + struct ioctl_gntdev_grant_ref { /* The domain ID of the grant to be mapped. */ - uint32_t domid; + __u32 domid; /* The grant reference of the grant to be mapped. */ - uint32_t ref; + __u32 ref; }; /* @@ -50,11 +52,11 @@ _IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref)) struct ioctl_gntdev_map_grant_ref { /* IN parameters */ /* The number of grants to be mapped. */ - uint32_t count; - uint32_t pad; + __u32 count; + __u32 pad; /* OUT parameters */ /* The offset to be used on a subsequent call to mmap(). */ - uint64_t index; + __u64 index; /* Variable IN parameter. */ /* Array of grant references, of size @count. */ struct ioctl_gntdev_grant_ref refs[1]; @@ -70,10 +72,10 @@ _IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) struct ioctl_gntdev_unmap_grant_ref { /* IN parameters */ /* The offset was returned by the corresponding map operation. */ - uint64_t index; + __u64 index; /* The number of pages to be unmapped. */ - uint32_t count; - uint32_t pad; + __u32 count; + __u32 pad; }; /* @@ -93,13 +95,13 @@ _IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr)) struct ioctl_gntdev_get_offset_for_vaddr { /* IN parameters */ /* The virtual address of the first mapped page in a range. */ - uint64_t vaddr; + __u64 vaddr; /* OUT parameters */ /* The offset that was used in the initial mmap() operation. */ - uint64_t offset; + __u64 offset; /* The number of pages mapped in the VM area that begins at @vaddr. */ - uint32_t count; - uint32_t pad; + __u32 count; + __u32 pad; }; /* @@ -113,7 +115,7 @@ _IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants)) struct ioctl_gntdev_set_max_grants { /* IN parameter */ /* The maximum number of grants that may be mapped at once. */ - uint32_t count; + __u32 count; }; /* @@ -135,11 +137,11 @@ struct ioctl_gntdev_unmap_notify { * be cleared. Otherwise, it can be any byte in the page whose * notification we are adjusting. */ - uint64_t index; + __u64 index; /* Action(s) to take on unmap */ - uint32_t action; + __u32 action; /* Event channel to notify */ - uint32_t event_channel_port; + __u32 event_channel_port; }; /* Clear (set to zero) the byte specified by index */ -- cgit v1.2.3 From d5f985c834fbfdedcb629b009ba272ad50add5ac Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 14 Sep 2015 15:20:52 +0200 Subject: xen/arm: correct comment in enlighten.c Correct a comment in arch/arm/xen/enlighten.c referencing a wrong source file. Signed-off-by: Juergen Gross Acked-by: Stefano Stabellini Signed-off-by: David Vrabel --- arch/arm/xen/enlighten.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index 50b4769aad8b..dad4ffb0934c 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -284,7 +284,7 @@ void xen_arch_resume(void) { } void xen_arch_suspend(void) { } -/* In the hypervisor.S file. */ +/* In the hypercall.S file. */ EXPORT_SYMBOL_GPL(HYPERVISOR_event_channel_op); EXPORT_SYMBOL_GPL(HYPERVISOR_grant_table_op); EXPORT_SYMBOL_GPL(HYPERVISOR_xen_version); -- cgit v1.2.3 From 9cce2914e2b21339dca12c91dc9f35790366cc4c Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Tue, 13 Oct 2015 17:50:11 +0100 Subject: xen/xenbus: Rename *RING_PAGE* to *RING_GRANT* Linux may use a different page size than the size of grant. So make clear that the order is actually in number of grant. Signed-off-by: Julien Grall Signed-off-by: David Vrabel --- drivers/block/xen-blkback/blkback.c | 8 ++++---- drivers/block/xen-blkback/xenbus.c | 2 +- drivers/block/xen-blkfront.c | 10 +++++----- drivers/xen/xenbus/xenbus_client.c | 34 +++++++++++++++++----------------- include/xen/xenbus.h | 4 ++-- 5 files changed, 29 insertions(+), 29 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 809634ce3b67..f9099940c272 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -87,7 +87,7 @@ MODULE_PARM_DESC(max_persistent_grants, * Maximum order of pages to be used for the shared ring between front and * backend, 4KB page granularity is used. */ -unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_PAGE_ORDER; +unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO); MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring"); /* @@ -1446,10 +1446,10 @@ static int __init xen_blkif_init(void) if (!xen_domain()) return -ENODEV; - if (xen_blkif_max_ring_order > XENBUS_MAX_RING_PAGE_ORDER) { + if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) { pr_info("Invalid max_ring_order (%d), will use default max: %d.\n", - xen_blkif_max_ring_order, XENBUS_MAX_RING_PAGE_ORDER); - xen_blkif_max_ring_order = XENBUS_MAX_RING_PAGE_ORDER; + xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER); + xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; } rc = xen_blkif_interface_init(); diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 01c6b41de4e5..f53cff42f8da 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -829,7 +829,7 @@ again: static int connect_ring(struct backend_info *be) { struct xenbus_device *dev = be->dev; - unsigned int ring_ref[XENBUS_MAX_RING_PAGES]; + unsigned int ring_ref[XENBUS_MAX_RING_GRANTS]; unsigned int evtchn, nr_grefs, ring_page_order; unsigned int pers_grants; char protocol[64] = ""; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 057e05da83d1..833955f32430 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -111,7 +111,7 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * (info)->nr_ring_pages) #define BLK_MAX_RING_SIZE \ - __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_PAGES) + __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS) /* * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 @@ -133,7 +133,7 @@ struct blkfront_info int vdevice; blkif_vdev_t handle; enum blkif_state connected; - int ring_ref[XENBUS_MAX_RING_PAGES]; + int ring_ref[XENBUS_MAX_RING_GRANTS]; unsigned int nr_ring_pages; struct blkif_front_ring ring; unsigned int evtchn, irq; @@ -1413,7 +1413,7 @@ static int setup_blkring(struct xenbus_device *dev, struct blkif_sring *sring; int err, i; unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE; - grant_ref_t gref[XENBUS_MAX_RING_PAGES]; + grant_ref_t gref[XENBUS_MAX_RING_GRANTS]; for (i = 0; i < info->nr_ring_pages; i++) info->ring_ref[i] = GRANT_INVALID_REF; @@ -2284,9 +2284,9 @@ static int __init xlblk_init(void) if (!xen_domain()) return -ENODEV; - if (xen_blkif_max_ring_order > XENBUS_MAX_RING_PAGE_ORDER) { + if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) { pr_info("Invalid max_ring_order (%d), will use default max: %d.\n", - xen_blkif_max_ring_order, XENBUS_MAX_RING_PAGE_ORDER); + xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER); xen_blkif_max_ring_order = 0; } diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 42abee3bbb27..b77643361853 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -56,11 +56,11 @@ struct xenbus_map_node { struct vm_struct *area; } pv; struct { - struct page *pages[XENBUS_MAX_RING_PAGES]; + struct page *pages[XENBUS_MAX_RING_GRANTS]; void *addr; } hvm; }; - grant_handle_t handles[XENBUS_MAX_RING_PAGES]; + grant_handle_t handles[XENBUS_MAX_RING_GRANTS]; unsigned int nr_handles; }; @@ -479,12 +479,12 @@ static int __xenbus_map_ring(struct xenbus_device *dev, unsigned int flags, bool *leaked) { - struct gnttab_map_grant_ref map[XENBUS_MAX_RING_PAGES]; - struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_PAGES]; + struct gnttab_map_grant_ref map[XENBUS_MAX_RING_GRANTS]; + struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS]; int i, j; int err = GNTST_okay; - if (nr_grefs > XENBUS_MAX_RING_PAGES) + if (nr_grefs > XENBUS_MAX_RING_GRANTS) return -EINVAL; for (i = 0; i < nr_grefs; i++) { @@ -540,15 +540,15 @@ static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev, { struct xenbus_map_node *node; struct vm_struct *area; - pte_t *ptes[XENBUS_MAX_RING_PAGES]; - phys_addr_t phys_addrs[XENBUS_MAX_RING_PAGES]; + pte_t *ptes[XENBUS_MAX_RING_GRANTS]; + phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS]; int err = GNTST_okay; int i; bool leaked; *vaddr = NULL; - if (nr_grefs > XENBUS_MAX_RING_PAGES) + if (nr_grefs > XENBUS_MAX_RING_GRANTS) return -EINVAL; node = kzalloc(sizeof(*node), GFP_KERNEL); @@ -602,10 +602,10 @@ static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev, void *addr; bool leaked = false; /* Why do we need two arrays? See comment of __xenbus_map_ring */ - phys_addr_t phys_addrs[XENBUS_MAX_RING_PAGES]; - unsigned long addrs[XENBUS_MAX_RING_PAGES]; + phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS]; + unsigned long addrs[XENBUS_MAX_RING_GRANTS]; - if (nr_grefs > XENBUS_MAX_RING_PAGES) + if (nr_grefs > XENBUS_MAX_RING_GRANTS) return -EINVAL; *vaddr = NULL; @@ -686,10 +686,10 @@ int xenbus_map_ring(struct xenbus_device *dev, grant_ref_t *gnt_refs, unsigned int nr_grefs, grant_handle_t *handles, unsigned long *vaddrs, bool *leaked) { - phys_addr_t phys_addrs[XENBUS_MAX_RING_PAGES]; + phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS]; int i; - if (nr_grefs > XENBUS_MAX_RING_PAGES) + if (nr_grefs > XENBUS_MAX_RING_GRANTS) return -EINVAL; for (i = 0; i < nr_grefs; i++) @@ -722,7 +722,7 @@ EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree); static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr) { struct xenbus_map_node *node; - struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_PAGES]; + struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS]; unsigned int level; int i; bool leaked = false; @@ -787,7 +787,7 @@ static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr) int rv; struct xenbus_map_node *node; void *addr; - unsigned long addrs[XENBUS_MAX_RING_PAGES]; + unsigned long addrs[XENBUS_MAX_RING_GRANTS]; int i; spin_lock(&xenbus_valloc_lock); @@ -840,11 +840,11 @@ int xenbus_unmap_ring(struct xenbus_device *dev, grant_handle_t *handles, unsigned int nr_handles, unsigned long *vaddrs) { - struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_PAGES]; + struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS]; int i; int err; - if (nr_handles > XENBUS_MAX_RING_PAGES) + if (nr_handles > XENBUS_MAX_RING_GRANTS) return -EINVAL; for (i = 0; i < nr_handles; i++) diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index 289c0b5f08fe..32b944b7cebd 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -46,8 +46,8 @@ #include #include -#define XENBUS_MAX_RING_PAGE_ORDER 4 -#define XENBUS_MAX_RING_PAGES (1U << XENBUS_MAX_RING_PAGE_ORDER) +#define XENBUS_MAX_RING_GRANT_ORDER 4 +#define XENBUS_MAX_RING_GRANTS (1U << XENBUS_MAX_RING_GRANT_ORDER) #define INVALID_GRANT_HANDLE (~0U) /* Register callback to watch this node. */ -- cgit v1.2.3 From f73314b28148f9ee9f89a0ae961c8fb36e3269fa Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Tue, 13 Oct 2015 17:50:12 +0100 Subject: xen/grant-table: Add an helper to iterate over a specific number of grants With the 64KB page granularity support on ARM64, a Linux page may be split accross multiple grant. Currently we have the helper gnttab_foreach_grant_in_grant to break a Linux page based on an offset and a len, but it doesn't fit when we only have a number of grants in hand. Introduce a new helper which take an array of Linux page and a number of grant and will figure out the address of each grant. Signed-off-by: Julien Grall Signed-off-by: David Vrabel --- drivers/xen/grant-table.c | 22 ++++++++++++++++++++++ include/xen/grant_table.h | 6 ++++++ 2 files changed, 28 insertions(+) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 72d633962095..c49f79ed58c5 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -802,6 +802,28 @@ void gnttab_foreach_grant_in_range(struct page *page, } EXPORT_SYMBOL_GPL(gnttab_foreach_grant_in_range); +void gnttab_foreach_grant(struct page **pages, + unsigned int nr_grefs, + xen_grant_fn_t fn, + void *data) +{ + unsigned int goffset = 0; + unsigned long xen_pfn = 0; + unsigned int i; + + for (i = 0; i < nr_grefs; i++) { + if ((i % XEN_PFN_PER_PAGE) == 0) { + xen_pfn = page_to_xen_pfn(pages[i / XEN_PFN_PER_PAGE]); + goffset = 0; + } + + fn(pfn_to_gfn(xen_pfn), goffset, XEN_PAGE_SIZE, data); + + goffset += XEN_PAGE_SIZE; + xen_pfn++; + } +} + int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count) diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index e17a4b381a16..34b1379f9777 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -264,6 +264,12 @@ static inline void gnttab_for_one_grant(struct page *page, unsigned int offset, gnttab_foreach_grant_in_range(page, offset, len, fn, data); } +/* Get @nr_grefs grants from an array of page and call fn for each grant */ +void gnttab_foreach_grant(struct page **pages, + unsigned int nr_grefs, + xen_grant_fn_t fn, + void *data); + /* Get the number of grant in a specified region * * start: Offset from the beginning of the first page -- cgit v1.2.3 From 89bf4b4e4a8d9ab219cd03aada24e782cf0ac359 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Tue, 13 Oct 2015 17:50:13 +0100 Subject: xenbus: Support multiple grants ring with 64KB The PV ring may use multiple grants and expect them to be mapped contiguously in the virtual memory. Although, the current code is relying on a Linux page will be mapped to a single grant. On build where Linux is using a different page size than the grant (i.e other than 4KB), the grant will always be mapped on the first 4KB of each Linux page which make the final ring not contiguous in the memory. This can be fixed by mapping multiple grant in a same Linux page. Signed-off-by: Julien Grall Signed-off-by: David Vrabel --- drivers/xen/xenbus/xenbus_client.c | 97 ++++++++++++++++++++++++++++---------- 1 file changed, 72 insertions(+), 25 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index b77643361853..056da6ee1a35 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -49,6 +49,10 @@ #include "xenbus_probe.h" +#define XENBUS_PAGES(_grants) (DIV_ROUND_UP(_grants, XEN_PFN_PER_PAGE)) + +#define XENBUS_MAX_RING_PAGES (XENBUS_PAGES(XENBUS_MAX_RING_GRANTS)) + struct xenbus_map_node { struct list_head next; union { @@ -56,7 +60,8 @@ struct xenbus_map_node { struct vm_struct *area; } pv; struct { - struct page *pages[XENBUS_MAX_RING_GRANTS]; + struct page *pages[XENBUS_MAX_RING_PAGES]; + unsigned long addrs[XENBUS_MAX_RING_GRANTS]; void *addr; } hvm; }; @@ -591,19 +596,42 @@ failed: return err; } +struct map_ring_valloc_hvm +{ + unsigned int idx; + + /* Why do we need two arrays? See comment of __xenbus_map_ring */ + phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS]; + unsigned long addrs[XENBUS_MAX_RING_GRANTS]; +}; + +static void xenbus_map_ring_setup_grant_hvm(unsigned long gfn, + unsigned int goffset, + unsigned int len, + void *data) +{ + struct map_ring_valloc_hvm *info = data; + unsigned long vaddr = (unsigned long)gfn_to_virt(gfn); + + info->phys_addrs[info->idx] = vaddr; + info->addrs[info->idx] = vaddr; + + info->idx++; +} + static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev, grant_ref_t *gnt_ref, unsigned int nr_grefs, void **vaddr) { struct xenbus_map_node *node; - int i; int err; void *addr; bool leaked = false; - /* Why do we need two arrays? See comment of __xenbus_map_ring */ - phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS]; - unsigned long addrs[XENBUS_MAX_RING_GRANTS]; + struct map_ring_valloc_hvm info = { + .idx = 0, + }; + unsigned int nr_pages = XENBUS_PAGES(nr_grefs); if (nr_grefs > XENBUS_MAX_RING_GRANTS) return -EINVAL; @@ -614,24 +642,22 @@ static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev, if (!node) return -ENOMEM; - err = alloc_xenballooned_pages(nr_grefs, node->hvm.pages); + err = alloc_xenballooned_pages(nr_pages, node->hvm.pages); if (err) goto out_err; - for (i = 0; i < nr_grefs; i++) { - unsigned long pfn = page_to_pfn(node->hvm.pages[i]); - phys_addrs[i] = (unsigned long)pfn_to_kaddr(pfn); - addrs[i] = (unsigned long)pfn_to_kaddr(pfn); - } + gnttab_foreach_grant(node->hvm.pages, nr_grefs, + xenbus_map_ring_setup_grant_hvm, + &info); err = __xenbus_map_ring(dev, gnt_ref, nr_grefs, node->handles, - phys_addrs, GNTMAP_host_map, &leaked); + info.phys_addrs, GNTMAP_host_map, &leaked); node->nr_handles = nr_grefs; if (err) goto out_free_ballooned_pages; - addr = vmap(node->hvm.pages, nr_grefs, VM_MAP | VM_IOREMAP, + addr = vmap(node->hvm.pages, nr_pages, VM_MAP | VM_IOREMAP, PAGE_KERNEL); if (!addr) { err = -ENOMEM; @@ -649,14 +675,13 @@ static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev, out_xenbus_unmap_ring: if (!leaked) - xenbus_unmap_ring(dev, node->handles, node->nr_handles, - addrs); + xenbus_unmap_ring(dev, node->handles, nr_grefs, info.addrs); else pr_alert("leaking %p size %u page(s)", - addr, nr_grefs); + addr, nr_pages); out_free_ballooned_pages: if (!leaked) - free_xenballooned_pages(nr_grefs, node->hvm.pages); + free_xenballooned_pages(nr_pages, node->hvm.pages); out_err: kfree(node); return err; @@ -782,13 +807,33 @@ static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr) return err; } +struct unmap_ring_vfree_hvm +{ + unsigned int idx; + unsigned long addrs[XENBUS_MAX_RING_GRANTS]; +}; + +static void xenbus_unmap_ring_setup_grant_hvm(unsigned long gfn, + unsigned int goffset, + unsigned int len, + void *data) +{ + struct unmap_ring_vfree_hvm *info = data; + + info->addrs[info->idx] = (unsigned long)gfn_to_virt(gfn); + + info->idx++; +} + static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr) { int rv; struct xenbus_map_node *node; void *addr; - unsigned long addrs[XENBUS_MAX_RING_GRANTS]; - int i; + struct unmap_ring_vfree_hvm info = { + .idx = 0, + }; + unsigned int nr_pages; spin_lock(&xenbus_valloc_lock); list_for_each_entry(node, &xenbus_valloc_pages, next) { @@ -808,18 +853,20 @@ static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr) return GNTST_bad_virt_addr; } - for (i = 0; i < node->nr_handles; i++) - addrs[i] = (unsigned long)pfn_to_kaddr(page_to_pfn(node->hvm.pages[i])); + nr_pages = XENBUS_PAGES(node->nr_handles); + + gnttab_foreach_grant(node->hvm.pages, node->nr_handles, + xenbus_unmap_ring_setup_grant_hvm, + &info); rv = xenbus_unmap_ring(dev, node->handles, node->nr_handles, - addrs); + info.addrs); if (!rv) { vunmap(vaddr); - free_xenballooned_pages(node->nr_handles, node->hvm.pages); + free_xenballooned_pages(nr_pages, node->hvm.pages); } else - WARN(1, "Leaking %p, size %u page(s)\n", vaddr, - node->nr_handles); + WARN(1, "Leaking %p, size %u page(s)\n", vaddr, nr_pages); kfree(node); return rv; -- cgit v1.2.3 From a314e3eb845389b8f68130c79a63832229dea87b Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 22 Oct 2015 16:20:46 +0000 Subject: xen/arm: Enable cpu_hotplug.c Build cpu_hotplug for ARM and ARM64 guests. Rename arch_(un)register_cpu to xen_(un)register_cpu and provide an empty implementation on ARM and ARM64. On x86 just call arch_(un)register_cpu as we are already doing. Initialize cpu_hotplug on ARM. Signed-off-by: Stefano Stabellini Reviewed-by: Julien Grall Reviewed-by: Boris Ostrovsky --- arch/arm/include/asm/xen/hypervisor.h | 10 ++++++++++ arch/x86/include/asm/xen/hypervisor.h | 5 +++++ arch/x86/xen/enlighten.c | 15 +++++++++++++++ drivers/xen/Makefile | 2 -- drivers/xen/cpu_hotplug.c | 8 ++++++-- 5 files changed, 36 insertions(+), 4 deletions(-) diff --git a/arch/arm/include/asm/xen/hypervisor.h b/arch/arm/include/asm/xen/hypervisor.h index 04ff8e7b37df..95251512e2c4 100644 --- a/arch/arm/include/asm/xen/hypervisor.h +++ b/arch/arm/include/asm/xen/hypervisor.h @@ -26,4 +26,14 @@ void __init xen_early_init(void); static inline void xen_early_init(void) { return; } #endif +#ifdef CONFIG_HOTPLUG_CPU +static inline void xen_arch_register_cpu(int num) +{ +} + +static inline void xen_arch_unregister_cpu(int num) +{ +} +#endif + #endif /* _ASM_ARM_XEN_HYPERVISOR_H */ diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index d866959e5685..8b2d4bea9962 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -57,4 +57,9 @@ static inline bool xen_x2apic_para_available(void) } #endif +#ifdef CONFIG_HOTPLUG_CPU +void xen_arch_register_cpu(int num); +void xen_arch_unregister_cpu(int num); +#endif + #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 993b7a71386d..5774800ff583 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -75,6 +75,7 @@ #include #include #include +#include #ifdef CONFIG_ACPI #include @@ -1899,3 +1900,17 @@ const struct hypervisor_x86 x86_hyper_xen = { .set_cpu_features = xen_set_cpu_features, }; EXPORT_SYMBOL(x86_hyper_xen); + +#ifdef CONFIG_HOTPLUG_CPU +void xen_arch_register_cpu(int num) +{ + arch_register_cpu(num); +} +EXPORT_SYMBOL(xen_arch_register_cpu); + +void xen_arch_unregister_cpu(int num) +{ + arch_unregister_cpu(num); +} +EXPORT_SYMBOL(xen_arch_unregister_cpu); +#endif diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index e293bc507cbc..aa8a7f71f310 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -1,6 +1,4 @@ -ifeq ($(filter y, $(CONFIG_ARM) $(CONFIG_ARM64)),) obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o -endif obj-$(CONFIG_X86) += fallback.o obj-y += grant-table.o features.o balloon.o manage.o preempt.o obj-y += events/ diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index cc6513a176b0..43de1f51b53f 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -11,7 +11,7 @@ static void enable_hotplug_cpu(int cpu) { if (!cpu_present(cpu)) - arch_register_cpu(cpu); + xen_arch_register_cpu(cpu); set_cpu_present(cpu, true); } @@ -19,7 +19,7 @@ static void enable_hotplug_cpu(int cpu) static void disable_hotplug_cpu(int cpu) { if (cpu_present(cpu)) - arch_unregister_cpu(cpu); + xen_arch_unregister_cpu(cpu); set_cpu_present(cpu, false); } @@ -102,7 +102,11 @@ static int __init setup_vcpu_hotplug_event(void) static struct notifier_block xsn_cpu = { .notifier_call = setup_cpu_watcher }; +#ifdef CONFIG_X86 if (!xen_pv_domain()) +#else + if (!xen_domain()) +#endif return -ENODEV; register_xenstore_notifier(&xsn_cpu); -- cgit v1.2.3 From 1c7a62137bb23bc8a2c05d1dad6105afa569b20e Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 22 Oct 2015 16:21:46 +0000 Subject: xen, cpu_hotplug: call device_offline instead of cpu_down When offlining a cpu, instead of cpu_down, call device_offline, which also takes care of updating the cpu.dev.offline field. This keeps the sysfs file /sys/devices/system/cpu/cpuN/online, up to date. Also move the call to disable_hotplug_cpu, because it makes more sense to have it there. We don't call device_online at cpu-hotplug time, because that would immediately take the cpu online, while we want to retain the current behaviour: the user needs to explicitly enable the cpu after it has been hotplugged. Signed-off-by: Stefano Stabellini Reviewed-by: Boris Ostrovsky CC: konrad.wilk@oracle.com CC: boris.ostrovsky@oracle.com CC: david.vrabel@citrix.com --- drivers/xen/cpu_hotplug.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index 43de1f51b53f..5676aefdf2bc 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -18,6 +18,11 @@ static void enable_hotplug_cpu(int cpu) static void disable_hotplug_cpu(int cpu) { + if (cpu_online(cpu)) { + lock_device_hotplug(); + device_offline(get_cpu_device(cpu)); + unlock_device_hotplug(); + } if (cpu_present(cpu)) xen_arch_unregister_cpu(cpu); @@ -55,7 +60,6 @@ static void vcpu_hotplug(unsigned int cpu) enable_hotplug_cpu(cpu); break; case 0: - (void)cpu_down(cpu); disable_hotplug_cpu(cpu); break; default: -- cgit v1.2.3 From cb9644bf3b549d20656cca02e8a6332c67cf37d6 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 22 Oct 2015 16:22:46 +0000 Subject: xen/arm: don't try to re-register vcpu_info on cpu_hotplug. Call disable_percpu_irq on CPU_DYING and enable_percpu_irq when the cpu is coming up. Signed-off-by: Stefano Stabellini Reviewed-by: Julien Grall --- arch/arm/xen/enlighten.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index dad4ffb0934c..fc7ea529f462 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -86,6 +86,14 @@ static void xen_percpu_init(void) int err; int cpu = get_cpu(); + /* + * VCPUOP_register_vcpu_info cannot be called twice for the same + * vcpu, so if vcpu_info is already registered, just get out. This + * can happen with cpu-hotplug. + */ + if (per_cpu(xen_vcpu, cpu) != NULL) + goto after_register_vcpu_info; + pr_info("Xen: initializing cpu%d\n", cpu); vcpup = per_cpu_ptr(xen_vcpu_info, cpu); @@ -96,6 +104,7 @@ static void xen_percpu_init(void) BUG_ON(err); per_cpu(xen_vcpu, cpu) = vcpup; +after_register_vcpu_info: enable_percpu_irq(xen_events_irq, 0); put_cpu(); } @@ -124,6 +133,9 @@ static int xen_cpu_notification(struct notifier_block *self, case CPU_STARTING: xen_percpu_init(); break; + case CPU_DYING: + disable_percpu_irq(xen_events_irq); + break; default: break; } -- cgit v1.2.3 From 914beb9fc26d6225295b8315ab54026f8f22755c Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Wed, 28 Oct 2015 13:39:05 +0000 Subject: x86/xen: add reschedule point when mapping foreign GFNs Mapping a large range of foreign GFNs can take a long time, add a reschedule point after each batch of 16 GFNs. Signed-off-by: David Vrabel Reviewed-by: Boris Ostrovsky --- arch/x86/xen/mmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 9c479fe40459..ac161db63388 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2888,6 +2888,7 @@ static int do_remap_gfn(struct vm_area_struct *vma, addr += range; if (err_ptr) err_ptr += batch; + cond_resched(); } out: -- cgit v1.2.3 From abed7d0710e8f892c267932a9492ccf447674fb8 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Tue, 27 Oct 2015 15:19:52 -0400 Subject: xen: fix the check of e_pfn in xen_find_pfn_range On some NUMA system, after dom0 up, we see below warning even if there are enough pfn ranges that could be used for remapping: "Unable to find available pfn range, not remapping identity pages" Fix it to avoid getting a memory region of zero size in xen_find_pfn_range. Signed-off-by: Zhenzhong Duan Reviewed-by: Juergen Gross Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- arch/x86/xen/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 387b60d9bd0e..d1fac0e53d57 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -212,7 +212,7 @@ static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn) e_pfn = PFN_DOWN(entry->addr + entry->size); /* We only care about E820 after this */ - if (e_pfn < *min_pfn) + if (e_pfn <= *min_pfn) continue; s_pfn = PFN_UP(entry->addr); -- cgit v1.2.3