From 4fbb67e3c87b806ad54445a1b4a9c6bde2359c98 Mon Sep 17 00:00:00 2001 From: Matt Rushton Date: Mon, 11 Aug 2014 11:57:57 -0700 Subject: xen/setup: Remap Xen Identity Mapped RAM Instead of ballooning up and down dom0 memory this remaps the existing mfns that were replaced by the identity map. The reason for this is that the existing implementation ballooned memory up and and down which caused dom0 to have discontiguous pages. In some cases this resulted in the use of bounce buffers which reduced network I/O performance significantly. This change will honor the existing order of the pages with the exception of some boundary conditions. To do this we need to update both the Linux p2m table and the Xen m2p table. Particular care must be taken when updating the p2m table since it's important to limit table memory consumption and reuse the existing leaf pages which get freed when an entire leaf page is set to the identity map. To implement this, mapping updates are grouped into blocks with table entries getting cached temporarily and then released. On my test system before: Total pages: 2105014 Total contiguous: 1640635 After: Total pages: 2105014 Total contiguous: 2098904 Signed-off-by: Matthew Rushton Signed-off-by: David Vrabel --- arch/x86/xen/p2m.c | 23 +--- arch/x86/xen/p2m.h | 15 +++ arch/x86/xen/setup.c | 370 ++++++++++++++++++++++++++++++++++++++++----------- 3 files changed, 314 insertions(+), 94 deletions(-) create mode 100644 arch/x86/xen/p2m.h diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 3172692381ae..9f5983b01ed9 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -173,6 +173,7 @@ #include #include +#include "p2m.h" #include "multicalls.h" #include "xen-ops.h" @@ -180,12 +181,6 @@ static void __init m2p_override_init(void); unsigned long xen_max_p2m_pfn __read_mostly; -#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) -#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) - -#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) - /* Placeholders for holes in the address space */ static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); @@ -202,16 +197,12 @@ static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_identity_mfn, P2M_MID_PER_PAGE); RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); -/* We might hit two boundary violations at the start and end, at max each - * boundary violation will require three middle nodes. */ -RESERVE_BRK(p2m_mid_extra, PAGE_SIZE * 2 * 3); - -/* When we populate back during bootup, the amount of pages can vary. The - * max we have is seen is 395979, but that does not mean it can't be more. - * Some machines can have 3GB I/O holes even. With early_can_reuse_p2m_middle - * it can re-use Xen provided mfn_list array, so we only need to allocate at - * most three P2M top nodes. */ -RESERVE_BRK(p2m_populated, PAGE_SIZE * 3); +/* For each I/O range remapped we may lose up to two leaf pages for the boundary + * violations and three mid pages to cover up to 3GB. With + * early_can_reuse_p2m_middle() most of the leaf pages will be reused by the + * remapped region. + */ +RESERVE_BRK(p2m_identity_remap, PAGE_SIZE * 2 * 3 * MAX_REMAP_RANGES); static inline unsigned p2m_top_index(unsigned long pfn) { diff --git a/arch/x86/xen/p2m.h b/arch/x86/xen/p2m.h new file mode 100644 index 000000000000..ad8aee24ab72 --- /dev/null +++ b/arch/x86/xen/p2m.h @@ -0,0 +1,15 @@ +#ifndef _XEN_P2M_H +#define _XEN_P2M_H + +#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) +#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) + +#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) + +#define MAX_REMAP_RANGES 10 + +extern unsigned long __init set_phys_range_identity(unsigned long pfn_s, + unsigned long pfn_e); + +#endif /* _XEN_P2M_H */ diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 2e555163c2fe..af7216128d93 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -29,6 +29,7 @@ #include #include "xen-ops.h" #include "vdso.h" +#include "p2m.h" /* These are code, but not functions. Defined in entry.S */ extern const char xen_hypervisor_callback[]; @@ -46,6 +47,9 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; /* Number of pages released from the initial allocation. */ unsigned long xen_released_pages; +/* Buffer used to remap identity mapped pages */ +unsigned long xen_remap_buf[P2M_PER_PAGE] __initdata; + /* * The maximum amount of extra memory compared to the base size. The * main scaling factor is the size of struct page. At extreme ratios @@ -151,107 +155,325 @@ static unsigned long __init xen_do_chunk(unsigned long start, return len; } -static unsigned long __init xen_release_chunk(unsigned long start, - unsigned long end) -{ - return xen_do_chunk(start, end, true); -} - -static unsigned long __init xen_populate_chunk( +/* + * Finds the next RAM pfn available in the E820 map after min_pfn. + * This function updates min_pfn with the pfn found and returns + * the size of that range or zero if not found. + */ +static unsigned long __init xen_find_pfn_range( const struct e820entry *list, size_t map_size, - unsigned long max_pfn, unsigned long *last_pfn, - unsigned long credits_left) + unsigned long *min_pfn) { const struct e820entry *entry; unsigned int i; unsigned long done = 0; - unsigned long dest_pfn; for (i = 0, entry = list; i < map_size; i++, entry++) { unsigned long s_pfn; unsigned long e_pfn; - unsigned long pfns; - long capacity; - - if (credits_left <= 0) - break; if (entry->type != E820_RAM) continue; e_pfn = PFN_DOWN(entry->addr + entry->size); - /* We only care about E820 after the xen_start_info->nr_pages */ - if (e_pfn <= max_pfn) + /* We only care about E820 after this */ + if (e_pfn < *min_pfn) continue; s_pfn = PFN_UP(entry->addr); - /* If the E820 falls within the nr_pages, we want to start - * at the nr_pages PFN. - * If that would mean going past the E820 entry, skip it + + /* If min_pfn falls within the E820 entry, we want to start + * at the min_pfn PFN. */ - if (s_pfn <= max_pfn) { - capacity = e_pfn - max_pfn; - dest_pfn = max_pfn; + if (s_pfn <= *min_pfn) { + done = e_pfn - *min_pfn; } else { - capacity = e_pfn - s_pfn; - dest_pfn = s_pfn; + done = e_pfn - s_pfn; + *min_pfn = s_pfn; } + break; + } - if (credits_left < capacity) - capacity = credits_left; + return done; +} - pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false); - done += pfns; - *last_pfn = (dest_pfn + pfns); - if (pfns < capacity) - break; - credits_left -= pfns; +/* + * This releases a chunk of memory and then does the identity map. It's used as + * as a fallback if the remapping fails. + */ +static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, + unsigned long end_pfn, unsigned long nr_pages, unsigned long *identity, + unsigned long *released) +{ + WARN_ON(start_pfn > end_pfn); + + /* Need to release pages first */ + *released += xen_do_chunk(start_pfn, min(end_pfn, nr_pages), true); + *identity += set_phys_range_identity(start_pfn, end_pfn); +} + +/* + * Helper function to update both the p2m and m2p tables. + */ +static unsigned long __init xen_update_mem_tables(unsigned long pfn, + unsigned long mfn) +{ + struct mmu_update update = { + .ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, + .val = pfn + }; + + /* Update p2m */ + if (!early_set_phys_to_machine(pfn, mfn)) { + WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n", + pfn, mfn); + return false; } - return done; + + /* Update m2p */ + if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) { + WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n", + mfn, pfn); + return false; + } + + return true; } -static void __init xen_set_identity_and_release_chunk( - unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, - unsigned long *released, unsigned long *identity) +/* + * This function updates the p2m and m2p tables with an identity map from + * start_pfn to start_pfn+size and remaps the underlying RAM of the original + * allocation at remap_pfn. It must do so carefully in P2M_PER_PAGE sized blocks + * to not exhaust the reserved brk space. Doing it in properly aligned blocks + * ensures we only allocate the minimum required leaf pages in the p2m table. It + * copies the existing mfns from the p2m table under the 1:1 map, overwrites + * them with the identity map and then updates the p2m and m2p tables with the + * remapped memory. + */ +static unsigned long __init xen_do_set_identity_and_remap_chunk( + unsigned long start_pfn, unsigned long size, unsigned long remap_pfn) { - unsigned long pfn; + unsigned long ident_pfn_iter, remap_pfn_iter; + unsigned long ident_start_pfn_align, remap_start_pfn_align; + unsigned long ident_end_pfn_align, remap_end_pfn_align; + unsigned long ident_boundary_pfn, remap_boundary_pfn; + unsigned long ident_cnt = 0; + unsigned long remap_cnt = 0; + unsigned long left = size; + unsigned long mod; + int i; + + WARN_ON(size == 0); + + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); /* - * If the PFNs are currently mapped, clear the mappings - * (except for the ISA region which must be 1:1 mapped) to - * release the refcounts (in Xen) on the original frames. + * Determine the proper alignment to remap memory in P2M_PER_PAGE sized + * blocks. We need to keep track of both the existing pfn mapping and + * the new pfn remapping. */ - for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) { - pte_t pte = __pte_ma(0); + mod = start_pfn % P2M_PER_PAGE; + ident_start_pfn_align = + mod ? (start_pfn - mod + P2M_PER_PAGE) : start_pfn; + mod = remap_pfn % P2M_PER_PAGE; + remap_start_pfn_align = + mod ? (remap_pfn - mod + P2M_PER_PAGE) : remap_pfn; + mod = (start_pfn + size) % P2M_PER_PAGE; + ident_end_pfn_align = start_pfn + size - mod; + mod = (remap_pfn + size) % P2M_PER_PAGE; + remap_end_pfn_align = remap_pfn + size - mod; + + /* Iterate over each p2m leaf node in each range */ + for (ident_pfn_iter = ident_start_pfn_align, remap_pfn_iter = remap_start_pfn_align; + ident_pfn_iter < ident_end_pfn_align && remap_pfn_iter < remap_end_pfn_align; + ident_pfn_iter += P2M_PER_PAGE, remap_pfn_iter += P2M_PER_PAGE) { + /* Check we aren't past the end */ + BUG_ON(ident_pfn_iter + P2M_PER_PAGE > start_pfn + size); + BUG_ON(remap_pfn_iter + P2M_PER_PAGE > remap_pfn + size); + + /* Save p2m mappings */ + for (i = 0; i < P2M_PER_PAGE; i++) + xen_remap_buf[i] = pfn_to_mfn(ident_pfn_iter + i); + + /* Set identity map which will free a p2m leaf */ + ident_cnt += set_phys_range_identity(ident_pfn_iter, + ident_pfn_iter + P2M_PER_PAGE); + +#ifdef DEBUG + /* Helps verify a p2m leaf has been freed */ + for (i = 0; i < P2M_PER_PAGE; i++) { + unsigned int pfn = ident_pfn_iter + i; + BUG_ON(pfn_to_mfn(pfn) != pfn); + } +#endif + /* Now remap memory */ + for (i = 0; i < P2M_PER_PAGE; i++) { + unsigned long mfn = xen_remap_buf[i]; + + /* This will use the p2m leaf freed above */ + if (!xen_update_mem_tables(remap_pfn_iter + i, mfn)) { + WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n", + remap_pfn_iter + i, mfn); + return 0; + } + + remap_cnt++; + } - if (pfn < PFN_UP(ISA_END_ADDRESS)) - pte = mfn_pte(pfn, PAGE_KERNEL_IO); + left -= P2M_PER_PAGE; + } - (void)HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), pte, 0); + /* Max boundary space possible */ + BUG_ON(left > (P2M_PER_PAGE - 1) * 2); + + /* Now handle the boundary conditions */ + ident_boundary_pfn = start_pfn; + remap_boundary_pfn = remap_pfn; + for (i = 0; i < left; i++) { + unsigned long mfn; + + /* These two checks move from the start to end boundaries */ + if (ident_boundary_pfn == ident_start_pfn_align) + ident_boundary_pfn = ident_pfn_iter; + if (remap_boundary_pfn == remap_start_pfn_align) + remap_boundary_pfn = remap_pfn_iter; + + /* Check we aren't past the end */ + BUG_ON(ident_boundary_pfn >= start_pfn + size); + BUG_ON(remap_boundary_pfn >= remap_pfn + size); + + mfn = pfn_to_mfn(ident_boundary_pfn); + + if (!xen_update_mem_tables(remap_boundary_pfn, mfn)) { + WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n", + remap_pfn_iter + i, mfn); + return 0; + } + remap_cnt++; + + ident_boundary_pfn++; + remap_boundary_pfn++; } - if (start_pfn < nr_pages) - *released += xen_release_chunk( - start_pfn, min(end_pfn, nr_pages)); + /* Finish up the identity map */ + if (ident_start_pfn_align >= ident_end_pfn_align) { + /* + * In this case we have an identity range which does not span an + * aligned block so everything needs to be identity mapped here. + * If we didn't check this we might remap too many pages since + * the align boundaries are not meaningful in this case. + */ + ident_cnt += set_phys_range_identity(start_pfn, + start_pfn + size); + } else { + /* Remapped above so check each end of the chunk */ + if (start_pfn < ident_start_pfn_align) + ident_cnt += set_phys_range_identity(start_pfn, + ident_start_pfn_align); + if (start_pfn + size > ident_pfn_iter) + ident_cnt += set_phys_range_identity(ident_pfn_iter, + start_pfn + size); + } - *identity += set_phys_range_identity(start_pfn, end_pfn); + BUG_ON(ident_cnt != size); + BUG_ON(remap_cnt != size); + + return size; } -static unsigned long __init xen_set_identity_and_release( - const struct e820entry *list, size_t map_size, unsigned long nr_pages) +/* + * This function takes a contiguous pfn range that needs to be identity mapped + * and: + * + * 1) Finds a new range of pfns to use to remap based on E820 and remap_pfn. + * 2) Calls the do_ function to actually do the mapping/remapping work. + * + * The goal is to not allocate additional memory but to remap the existing + * pages. In the case of an error the underlying memory is simply released back + * to Xen and not remapped. + */ +static unsigned long __init xen_set_identity_and_remap_chunk( + const struct e820entry *list, size_t map_size, unsigned long start_pfn, + unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn, + unsigned long *identity, unsigned long *remapped, + unsigned long *released) +{ + unsigned long pfn; + unsigned long i = 0; + unsigned long n = end_pfn - start_pfn; + + while (i < n) { + unsigned long cur_pfn = start_pfn + i; + unsigned long left = n - i; + unsigned long size = left; + unsigned long remap_range_size; + + /* Do not remap pages beyond the current allocation */ + if (cur_pfn >= nr_pages) { + /* Identity map remaining pages */ + *identity += set_phys_range_identity(cur_pfn, + cur_pfn + size); + break; + } + if (cur_pfn + size > nr_pages) + size = nr_pages - cur_pfn; + + remap_range_size = xen_find_pfn_range(list, map_size, + &remap_pfn); + if (!remap_range_size) { + pr_warning("Unable to find available pfn range, not remapping identity pages\n"); + xen_set_identity_and_release_chunk(cur_pfn, + cur_pfn + left, nr_pages, identity, released); + break; + } + /* Adjust size to fit in current e820 RAM region */ + if (size > remap_range_size) + size = remap_range_size; + + if (!xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn)) { + WARN(1, "Failed to remap 1:1 memory cur_pfn=%ld size=%ld remap_pfn=%ld\n", + cur_pfn, size, remap_pfn); + xen_set_identity_and_release_chunk(cur_pfn, + cur_pfn + left, nr_pages, identity, released); + break; + } + + /* Update variables to reflect new mappings. */ + i += size; + remap_pfn += size; + *identity += size; + *remapped += size; + } + + /* + * If the PFNs are currently mapped, the VA mapping also needs + * to be updated to be 1:1. + */ + for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) + (void)HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + mfn_pte(pfn, PAGE_KERNEL_IO), 0); + + return remap_pfn; +} + +static unsigned long __init xen_set_identity_and_remap( + const struct e820entry *list, size_t map_size, unsigned long nr_pages, + unsigned long *released) { phys_addr_t start = 0; - unsigned long released = 0; unsigned long identity = 0; + unsigned long remapped = 0; + unsigned long last_pfn = nr_pages; const struct e820entry *entry; + unsigned long num_released = 0; int i; /* * Combine non-RAM regions and gaps until a RAM region (or the * end of the map) is reached, then set the 1:1 map and - * release the pages (if available) in those non-RAM regions. + * remap the memory in those non-RAM regions. * * The combined non-RAM regions are rounded to a whole number * of pages so any partial pages are accessible via the 1:1 @@ -269,22 +491,24 @@ static unsigned long __init xen_set_identity_and_release( end_pfn = PFN_UP(entry->addr); if (start_pfn < end_pfn) - xen_set_identity_and_release_chunk( - start_pfn, end_pfn, nr_pages, - &released, &identity); - + last_pfn = xen_set_identity_and_remap_chunk( + list, map_size, start_pfn, + end_pfn, nr_pages, last_pfn, + &identity, &remapped, + &num_released); start = end; } } - if (released) - printk(KERN_INFO "Released %lu pages of unused memory\n", released); - if (identity) - printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity); + *released = num_released; - return released; -} + pr_info("Set %ld page(s) to 1-1 mapping\n", identity); + pr_info("Remapped %ld page(s), last_pfn=%ld\n", remapped, + last_pfn); + pr_info("Released %ld page(s)\n", num_released); + return last_pfn; +} static unsigned long __init xen_get_max_pages(void) { unsigned long max_pages = MAX_DOMAIN_PAGES; @@ -347,7 +571,6 @@ char * __init xen_memory_setup(void) unsigned long max_pages; unsigned long last_pfn = 0; unsigned long extra_pages = 0; - unsigned long populated; int i; int op; @@ -392,20 +615,11 @@ char * __init xen_memory_setup(void) extra_pages += max_pages - max_pfn; /* - * Set P2M for all non-RAM pages and E820 gaps to be identity - * type PFNs. Any RAM pages that would be made inaccesible by - * this are first released. + * Set identity map on non-RAM pages and remap the underlying RAM. */ - xen_released_pages = xen_set_identity_and_release( - map, memmap.nr_entries, max_pfn); - - /* - * Populate back the non-RAM pages and E820 gaps that had been - * released. */ - populated = xen_populate_chunk(map, memmap.nr_entries, - max_pfn, &last_pfn, xen_released_pages); + last_pfn = xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn, + &xen_released_pages); - xen_released_pages -= populated; extra_pages += xen_released_pages; if (last_pfn > max_pfn) { -- cgit v1.2.3 From 46e3626adb86be2ff1321d26107cb75df07a7f4e Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Tue, 26 Aug 2014 23:38:44 +0800 Subject: xen/grant-table: refactor error cleanup in grow_gnttab_list() The cleanup loop in grow_gnttab_list() is safe from the underflow of the unsigned 'i' since nr_glist_frames is >= 1, but refactor it anyway. Signed-off-by: Chen Gang Signed-off-by: David Vrabel --- drivers/xen/grant-table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index c254ae036f18..7786291ba229 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -592,7 +592,7 @@ static int grow_gnttab_list(unsigned int more_frames) return 0; grow_nomem: - for ( ; i >= nr_glist_frames; i--) + while (i-- > nr_glist_frames) free_page((unsigned long) gnttab_list[i]); return -ENOMEM; } -- cgit v1.2.3 From 854072dd0f73b8e4238c88bdf0dd2a8db0239c1c Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 28 Aug 2014 06:44:09 +0200 Subject: xen/events: support threaded irqs for interdomain event channels Export bind_interdomain_evtchn_to_irq() so drivers can use threaded interrupt handlers with: irq = bind_interdomain_evtchn_to_irq(remote_dom, remote_port); if (irq < 0) /* error */ ret = request_threaded_irq(...); Signed-off-by: Juergen Gross Acked-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- drivers/xen/events/events_base.c | 5 +++-- include/xen/events.h | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 5b5c5ff273fd..b4bca2d4a7e5 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -900,8 +900,8 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) return irq; } -static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, - unsigned int remote_port) +int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, + unsigned int remote_port) { struct evtchn_bind_interdomain bind_interdomain; int err; @@ -914,6 +914,7 @@ static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, return err ? : bind_evtchn_to_irq(bind_interdomain.local_port); } +EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq); static int find_virq(unsigned int virq, unsigned int cpu) { diff --git a/include/xen/events.h b/include/xen/events.h index 8bee7a75e850..5321cd9636e6 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -28,6 +28,8 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi, unsigned long irqflags, const char *devname, void *dev_id); +int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, + unsigned int remote_port); int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, unsigned int remote_port, irq_handler_t handler, -- cgit v1.2.3 From e124c9a2c3c4c51555fe6f0bc214fe1b5cce3666 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 28 Aug 2014 06:44:10 +0200 Subject: xen: Add Xen pvSCSI protocol description Add the definition of pvSCSI protocol used between the pvSCSI frontend in a XEN domU and the pvSCSI backend in a XEN driver domain (usually Dom0). This header was originally provided by Fujitsu for Xen based on Linux 2.6.18. Changes are: - Added comments. - Adapt to Linux style guide. - Add support for larger SG-lists by putting them in an own granted page. - Remove stale definitions. Signed-off-by: Juergen Gross Signed-off-by: David Vrabel --- include/xen/interface/io/vscsiif.h | 229 +++++++++++++++++++++++++++++++++++++ 1 file changed, 229 insertions(+) create mode 100644 include/xen/interface/io/vscsiif.h diff --git a/include/xen/interface/io/vscsiif.h b/include/xen/interface/io/vscsiif.h new file mode 100644 index 000000000000..d07d7aca8d1c --- /dev/null +++ b/include/xen/interface/io/vscsiif.h @@ -0,0 +1,229 @@ +/****************************************************************************** + * vscsiif.h + * + * Based on the blkif.h code. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright(c) FUJITSU Limited 2008. + */ + +#ifndef __XEN__PUBLIC_IO_SCSI_H__ +#define __XEN__PUBLIC_IO_SCSI_H__ + +#include "ring.h" +#include "../grant_table.h" + +/* + * Feature and Parameter Negotiation + * ================================= + * The two halves of a Xen pvSCSI driver utilize nodes within the XenStore to + * communicate capabilities and to negotiate operating parameters. This + * section enumerates these nodes which reside in the respective front and + * backend portions of the XenStore, following the XenBus convention. + * + * Any specified default value is in effect if the corresponding XenBus node + * is not present in the XenStore. + * + * XenStore nodes in sections marked "PRIVATE" are solely for use by the + * driver side whose XenBus tree contains them. + * + ***************************************************************************** + * Backend XenBus Nodes + ***************************************************************************** + * + *------------------ Backend Device Identification (PRIVATE) ------------------ + * + * p-devname + * Values: string + * + * A free string used to identify the physical device (e.g. a disk name). + * + * p-dev + * Values: string + * + * A string specifying the backend device: either a 4-tuple "h:c:t:l" + * (host, controller, target, lun, all integers), or a WWN (e.g. + * "naa.60014054ac780582"). + * + * v-dev + * Values: string + * + * A string specifying the frontend device in form of a 4-tuple "h:c:t:l" + * (host, controller, target, lun, all integers). + * + *--------------------------------- Features --------------------------------- + * + * feature-sg-grant + * Values: unsigned [VSCSIIF_SG_TABLESIZE...65535] + * Default Value: 0 + * + * Specifies the maximum number of scatter/gather elements in grant pages + * supported. If not set, the backend supports up to VSCSIIF_SG_TABLESIZE + * SG elements specified directly in the request. + * + ***************************************************************************** + * Frontend XenBus Nodes + ***************************************************************************** + * + *----------------------- Request Transport Parameters ----------------------- + * + * event-channel + * Values: unsigned + * + * The identifier of the Xen event channel used to signal activity + * in the ring buffer. + * + * ring-ref + * Values: unsigned + * + * The Xen grant reference granting permission for the backend to map + * the sole page in a single page sized ring buffer. + * + * protocol + * Values: string (XEN_IO_PROTO_ABI_*) + * Default Value: XEN_IO_PROTO_ABI_NATIVE + * + * The machine ABI rules governing the format of all ring request and + * response structures. + */ + +/* Requests from the frontend to the backend */ + +/* + * Request a SCSI operation specified via a CDB in vscsiif_request.cmnd. + * The target is specified via channel, id and lun. + * + * The operation to be performed is specified via a CDB in cmnd[], the length + * of the CDB is in cmd_len. sc_data_direction specifies the direction of data + * (to the device, from the device, or none at all). + * + * If data is to be transferred to or from the device the buffer(s) in the + * guest memory is/are specified via one or multiple scsiif_request_segment + * descriptors each specifying a memory page via a grant_ref_t, a offset into + * the page and the length of the area in that page. All scsiif_request_segment + * areas concatenated form the resulting data buffer used by the operation. + * If the number of scsiif_request_segment areas is not too large (less than + * or equal VSCSIIF_SG_TABLESIZE) the areas can be specified directly in the + * seg[] array and the number of valid scsiif_request_segment elements is to be + * set in nr_segments. + * + * If "feature-sg-grant" in the Xenstore is set it is possible to specify more + * than VSCSIIF_SG_TABLESIZE scsiif_request_segment elements via indirection. + * The maximum number of allowed scsiif_request_segment elements is the value + * of the "feature-sg-grant" entry from Xenstore. When using indirection the + * seg[] array doesn't contain specifications of the data buffers, but + * references to scsiif_request_segment arrays, which in turn reference the + * data buffers. While nr_segments holds the number of populated seg[] entries + * (plus the set VSCSIIF_SG_GRANT bit), the number of scsiif_request_segment + * elements referencing the target data buffers is calculated from the lengths + * of the seg[] elements (the sum of all valid seg[].length divided by the + * size of one scsiif_request_segment structure). + */ +#define VSCSIIF_ACT_SCSI_CDB 1 + +/* + * Request abort of a running operation for the specified target given by + * channel, id, lun and the operation's rqid in ref_rqid. + */ +#define VSCSIIF_ACT_SCSI_ABORT 2 + +/* + * Request a device reset of the specified target (channel and id). + */ +#define VSCSIIF_ACT_SCSI_RESET 3 + +/* + * Preset scatter/gather elements for a following request. Deprecated. + * Keeping the define only to avoid usage of the value "4" for other actions. + */ +#define VSCSIIF_ACT_SCSI_SG_PRESET 4 + +/* + * Maximum scatter/gather segments per request. + * + * Considering balance between allocating at least 16 "vscsiif_request" + * structures on one page (4096 bytes) and the number of scatter/gather + * elements needed, we decided to use 26 as a magic number. + * + * If "feature-sg-grant" is set, more scatter/gather elements can be specified + * by placing them in one or more (up to VSCSIIF_SG_TABLESIZE) granted pages. + * In this case the vscsiif_request seg elements don't contain references to + * the user data, but to the SG elements referencing the user data. + */ +#define VSCSIIF_SG_TABLESIZE 26 + +/* + * based on Linux kernel 2.6.18, still valid + * Changing these values requires support of multiple protocols via the rings + * as "old clients" will blindly use these values and the resulting structure + * sizes. + */ +#define VSCSIIF_MAX_COMMAND_SIZE 16 +#define VSCSIIF_SENSE_BUFFERSIZE 96 + +struct scsiif_request_segment { + grant_ref_t gref; + uint16_t offset; + uint16_t length; +}; + +#define VSCSIIF_SG_PER_PAGE (PAGE_SIZE / sizeof(struct scsiif_request_segment)) + +/* Size of one request is 252 bytes */ +struct vscsiif_request { + uint16_t rqid; /* private guest value, echoed in resp */ + uint8_t act; /* command between backend and frontend */ + uint8_t cmd_len; /* valid CDB bytes */ + + uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE]; /* the CDB */ + uint16_t timeout_per_command; /* deprecated */ + uint16_t channel, id, lun; /* (virtual) device specification */ + uint16_t ref_rqid; /* command abort reference */ + uint8_t sc_data_direction; /* for DMA_TO_DEVICE(1) + DMA_FROM_DEVICE(2) + DMA_NONE(3) requests */ + uint8_t nr_segments; /* Number of pieces of scatter-gather */ +/* + * flag in nr_segments: SG elements via grant page + * + * If VSCSIIF_SG_GRANT is set, the low 7 bits of nr_segments specify the number + * of grant pages containing SG elements. Usable if "feature-sg-grant" set. + */ +#define VSCSIIF_SG_GRANT 0x80 + + struct scsiif_request_segment seg[VSCSIIF_SG_TABLESIZE]; + uint32_t reserved[3]; +}; + +/* Size of one response is 252 bytes */ +struct vscsiif_response { + uint16_t rqid; /* identifies request */ + uint8_t padding; + uint8_t sense_len; + uint8_t sense_buffer[VSCSIIF_SENSE_BUFFERSIZE]; + int32_t rslt; + uint32_t residual_len; /* request bufflen - + return the value from physical device */ + uint32_t reserved[36]; +}; + +DEFINE_RING_TYPES(vscsiif, struct vscsiif_request, struct vscsiif_response); + +#endif /*__XEN__PUBLIC_IO_SCSI_H__*/ -- cgit v1.2.3 From 8beb8d4c91d652617dc992de565ec9904361c33e Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 28 Aug 2014 06:44:11 +0200 Subject: xen-scsifront: Add Xen PV SCSI frontend driver Introduces the Xen pvSCSI frontend. With pvSCSI it is possible for a Xen domU to issue SCSI commands to a SCSI LUN assigned to that domU. The SCSI commands are passed to the pvSCSI backend in a driver domain (usually Dom0) which is owner of the physical device. This allows e.g. to use SCSI tape drives in a Xen domU. The code is taken from the pvSCSI implementation in Xen done by Fujitsu based on Linux kernel 2.6.18. Changes from the original version are: - port to upstream kernel - put all code in just one source file - move module to appropriate location in kernel tree - adapt to Linux style guide - some minor code simplifications - replace constants with defines - remove not used defines - add support for larger SG lists by putting them in a granted page Signed-off-by: Juergen Gross Acked-by: Christoph Hellwig Signed-off-by: David Vrabel --- drivers/scsi/Kconfig | 9 + drivers/scsi/Makefile | 1 + drivers/scsi/xen-scsifront.c | 1024 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1034 insertions(+) create mode 100644 drivers/scsi/xen-scsifront.c diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 18a3358eb1d4..9130df14f742 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -576,6 +576,15 @@ config VMWARE_PVSCSI To compile this driver as a module, choose M here: the module will be called vmw_pvscsi. +config XEN_SCSI_FRONTEND + tristate "XEN SCSI frontend driver" + depends on SCSI && XEN + help + The XEN SCSI frontend driver allows the kernel to access SCSI Devices + within another guest OS (usually Dom0). + Only needed if the kernel is running in a XEN guest and generic + SCSI access to a device is needed. + config HYPERV_STORAGE tristate "Microsoft Hyper-V virtual storage driver" depends on SCSI && HYPERV diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile index 5f0d299b0093..59f1ce6df2d6 100644 --- a/drivers/scsi/Makefile +++ b/drivers/scsi/Makefile @@ -141,6 +141,7 @@ obj-$(CONFIG_SCSI_ESAS2R) += esas2r/ obj-$(CONFIG_SCSI_PMCRAID) += pmcraid.o obj-$(CONFIG_SCSI_VIRTIO) += virtio_scsi.o obj-$(CONFIG_VMWARE_PVSCSI) += vmw_pvscsi.o +obj-$(CONFIG_XEN_SCSI_FRONTEND) += xen-scsifront.o obj-$(CONFIG_HYPERV_STORAGE) += hv_storvsc.o obj-$(CONFIG_ARM) += arm/ diff --git a/drivers/scsi/xen-scsifront.c b/drivers/scsi/xen-scsifront.c new file mode 100644 index 000000000000..0aceb70e1d83 --- /dev/null +++ b/drivers/scsi/xen-scsifront.c @@ -0,0 +1,1024 @@ +/* + * Xen SCSI frontend driver + * + * Copyright (c) 2008, FUJITSU Limited + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + + +#define GRANT_INVALID_REF 0 + +#define VSCSIFRONT_OP_ADD_LUN 1 +#define VSCSIFRONT_OP_DEL_LUN 2 + +/* Tuning point. */ +#define VSCSIIF_DEFAULT_CMD_PER_LUN 10 +#define VSCSIIF_MAX_TARGET 64 +#define VSCSIIF_MAX_LUN 255 + +#define VSCSIIF_RING_SIZE __CONST_RING_SIZE(vscsiif, PAGE_SIZE) +#define VSCSIIF_MAX_REQS VSCSIIF_RING_SIZE + +#define vscsiif_grants_sg(_sg) (PFN_UP((_sg) * \ + sizeof(struct scsiif_request_segment))) + +struct vscsifrnt_shadow { + /* command between backend and frontend */ + unsigned char act; + uint16_t rqid; + + unsigned int nr_grants; /* number of grants in gref[] */ + struct scsiif_request_segment *sg; /* scatter/gather elements */ + + /* Do reset or abort function. */ + wait_queue_head_t wq_reset; /* reset work queue */ + int wait_reset; /* reset work queue condition */ + int32_t rslt_reset; /* reset response status: */ + /* SUCCESS or FAILED or: */ +#define RSLT_RESET_WAITING 0 +#define RSLT_RESET_ERR -1 + + /* Requested struct scsi_cmnd is stored from kernel. */ + struct scsi_cmnd *sc; + int gref[vscsiif_grants_sg(SG_ALL) + SG_ALL]; +}; + +struct vscsifrnt_info { + struct xenbus_device *dev; + + struct Scsi_Host *host; + int host_active; + + unsigned int evtchn; + unsigned int irq; + + grant_ref_t ring_ref; + struct vscsiif_front_ring ring; + struct vscsiif_response ring_rsp; + + spinlock_t shadow_lock; + DECLARE_BITMAP(shadow_free_bitmap, VSCSIIF_MAX_REQS); + struct vscsifrnt_shadow *shadow[VSCSIIF_MAX_REQS]; + + wait_queue_head_t wq_sync; + unsigned int wait_ring_available:1; + + char dev_state_path[64]; + struct task_struct *curr; +}; + +static DEFINE_MUTEX(scsifront_mutex); + +static void scsifront_wake_up(struct vscsifrnt_info *info) +{ + info->wait_ring_available = 0; + wake_up(&info->wq_sync); +} + +static int scsifront_get_rqid(struct vscsifrnt_info *info) +{ + unsigned long flags; + int free; + + spin_lock_irqsave(&info->shadow_lock, flags); + + free = find_first_bit(info->shadow_free_bitmap, VSCSIIF_MAX_REQS); + __clear_bit(free, info->shadow_free_bitmap); + + spin_unlock_irqrestore(&info->shadow_lock, flags); + + return free; +} + +static int _scsifront_put_rqid(struct vscsifrnt_info *info, uint32_t id) +{ + int empty = bitmap_empty(info->shadow_free_bitmap, VSCSIIF_MAX_REQS); + + __set_bit(id, info->shadow_free_bitmap); + info->shadow[id] = NULL; + + return empty || info->wait_ring_available; +} + +static void scsifront_put_rqid(struct vscsifrnt_info *info, uint32_t id) +{ + unsigned long flags; + int kick; + + spin_lock_irqsave(&info->shadow_lock, flags); + kick = _scsifront_put_rqid(info, id); + spin_unlock_irqrestore(&info->shadow_lock, flags); + + if (kick) + scsifront_wake_up(info); +} + +static struct vscsiif_request *scsifront_pre_req(struct vscsifrnt_info *info) +{ + struct vscsiif_front_ring *ring = &(info->ring); + struct vscsiif_request *ring_req; + uint32_t id; + + id = scsifront_get_rqid(info); /* use id in response */ + if (id >= VSCSIIF_MAX_REQS) + return NULL; + + ring_req = RING_GET_REQUEST(&(info->ring), ring->req_prod_pvt); + + ring->req_prod_pvt++; + + ring_req->rqid = (uint16_t)id; + + return ring_req; +} + +static void scsifront_do_request(struct vscsifrnt_info *info) +{ + struct vscsiif_front_ring *ring = &(info->ring); + int notify; + + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(ring, notify); + if (notify) + notify_remote_via_irq(info->irq); +} + +static void scsifront_gnttab_done(struct vscsifrnt_info *info, uint32_t id) +{ + struct vscsifrnt_shadow *s = info->shadow[id]; + int i; + + if (s->sc->sc_data_direction == DMA_NONE) + return; + + for (i = 0; i < s->nr_grants; i++) { + if (unlikely(gnttab_query_foreign_access(s->gref[i]) != 0)) { + shost_printk(KERN_ALERT, info->host, KBUILD_MODNAME + "grant still in use by backend\n"); + BUG(); + } + gnttab_end_foreign_access(s->gref[i], 0, 0UL); + } + + kfree(s->sg); +} + +static void scsifront_cdb_cmd_done(struct vscsifrnt_info *info, + struct vscsiif_response *ring_rsp) +{ + struct scsi_cmnd *sc; + uint32_t id; + uint8_t sense_len; + + id = ring_rsp->rqid; + sc = info->shadow[id]->sc; + + BUG_ON(sc == NULL); + + scsifront_gnttab_done(info, id); + scsifront_put_rqid(info, id); + + sc->result = ring_rsp->rslt; + scsi_set_resid(sc, ring_rsp->residual_len); + + sense_len = min_t(uint8_t, VSCSIIF_SENSE_BUFFERSIZE, + ring_rsp->sense_len); + + if (sense_len) + memcpy(sc->sense_buffer, ring_rsp->sense_buffer, sense_len); + + sc->scsi_done(sc); +} + +static void scsifront_sync_cmd_done(struct vscsifrnt_info *info, + struct vscsiif_response *ring_rsp) +{ + uint16_t id = ring_rsp->rqid; + unsigned long flags; + struct vscsifrnt_shadow *shadow = info->shadow[id]; + int kick; + + spin_lock_irqsave(&info->shadow_lock, flags); + shadow->wait_reset = 1; + switch (shadow->rslt_reset) { + case RSLT_RESET_WAITING: + shadow->rslt_reset = ring_rsp->rslt; + break; + case RSLT_RESET_ERR: + kick = _scsifront_put_rqid(info, id); + spin_unlock_irqrestore(&info->shadow_lock, flags); + kfree(shadow); + if (kick) + scsifront_wake_up(info); + return; + default: + shost_printk(KERN_ERR, info->host, KBUILD_MODNAME + "bad reset state %d, possibly leaking %u\n", + shadow->rslt_reset, id); + break; + } + spin_unlock_irqrestore(&info->shadow_lock, flags); + + wake_up(&shadow->wq_reset); +} + +static int scsifront_cmd_done(struct vscsifrnt_info *info) +{ + struct vscsiif_response *ring_rsp; + RING_IDX i, rp; + int more_to_do = 0; + unsigned long flags; + + spin_lock_irqsave(info->host->host_lock, flags); + + rp = info->ring.sring->rsp_prod; + rmb(); /* ordering required respective to dom0 */ + for (i = info->ring.rsp_cons; i != rp; i++) { + + ring_rsp = RING_GET_RESPONSE(&info->ring, i); + + if (WARN(ring_rsp->rqid >= VSCSIIF_MAX_REQS || + test_bit(ring_rsp->rqid, info->shadow_free_bitmap), + "illegal rqid %u returned by backend!\n", + ring_rsp->rqid)) + continue; + + if (info->shadow[ring_rsp->rqid]->act == VSCSIIF_ACT_SCSI_CDB) + scsifront_cdb_cmd_done(info, ring_rsp); + else + scsifront_sync_cmd_done(info, ring_rsp); + } + + info->ring.rsp_cons = i; + + if (i != info->ring.req_prod_pvt) + RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do); + else + info->ring.sring->rsp_event = i + 1; + + info->wait_ring_available = 0; + + spin_unlock_irqrestore(info->host->host_lock, flags); + + wake_up(&info->wq_sync); + + return more_to_do; +} + +static irqreturn_t scsifront_irq_fn(int irq, void *dev_id) +{ + struct vscsifrnt_info *info = dev_id; + + while (scsifront_cmd_done(info)) + /* Yield point for this unbounded loop. */ + cond_resched(); + + return IRQ_HANDLED; +} + +static int map_data_for_request(struct vscsifrnt_info *info, + struct scsi_cmnd *sc, + struct vscsiif_request *ring_req, + struct vscsifrnt_shadow *shadow) +{ + grant_ref_t gref_head; + struct page *page; + int err, ref, ref_cnt = 0; + int grant_ro = (sc->sc_data_direction == DMA_TO_DEVICE); + unsigned int i, off, len, bytes; + unsigned int data_len = scsi_bufflen(sc); + unsigned int data_grants = 0, seg_grants = 0; + struct scatterlist *sg; + unsigned long mfn; + struct scsiif_request_segment *seg; + + ring_req->nr_segments = 0; + if (sc->sc_data_direction == DMA_NONE || !data_len) + return 0; + + scsi_for_each_sg(sc, sg, scsi_sg_count(sc), i) + data_grants += PFN_UP(sg->offset + sg->length); + + if (data_grants > VSCSIIF_SG_TABLESIZE) { + if (data_grants > info->host->sg_tablesize) { + shost_printk(KERN_ERR, info->host, KBUILD_MODNAME + "Unable to map request_buffer for command!\n"); + return -E2BIG; + } + seg_grants = vscsiif_grants_sg(data_grants); + shadow->sg = kcalloc(data_grants, + sizeof(struct scsiif_request_segment), GFP_NOIO); + if (!shadow->sg) + return -ENOMEM; + } + seg = shadow->sg ? : ring_req->seg; + + err = gnttab_alloc_grant_references(seg_grants + data_grants, + &gref_head); + if (err) { + kfree(shadow->sg); + shost_printk(KERN_ERR, info->host, KBUILD_MODNAME + "gnttab_alloc_grant_references() error\n"); + return -ENOMEM; + } + + if (seg_grants) { + page = virt_to_page(seg); + off = (unsigned long)seg & ~PAGE_MASK; + len = sizeof(struct scsiif_request_segment) * data_grants; + while (len > 0) { + bytes = min_t(unsigned int, len, PAGE_SIZE - off); + + ref = gnttab_claim_grant_reference(&gref_head); + BUG_ON(ref == -ENOSPC); + + mfn = pfn_to_mfn(page_to_pfn(page)); + gnttab_grant_foreign_access_ref(ref, + info->dev->otherend_id, mfn, 1); + shadow->gref[ref_cnt] = ref; + ring_req->seg[ref_cnt].gref = ref; + ring_req->seg[ref_cnt].offset = (uint16_t)off; + ring_req->seg[ref_cnt].length = (uint16_t)bytes; + + page++; + len -= bytes; + off = 0; + ref_cnt++; + } + BUG_ON(seg_grants < ref_cnt); + seg_grants = ref_cnt; + } + + scsi_for_each_sg(sc, sg, scsi_sg_count(sc), i) { + page = sg_page(sg); + off = sg->offset; + len = sg->length; + + while (len > 0 && data_len > 0) { + /* + * sg sends a scatterlist that is larger than + * the data_len it wants transferred for certain + * IO sizes. + */ + bytes = min_t(unsigned int, len, PAGE_SIZE - off); + bytes = min(bytes, data_len); + + ref = gnttab_claim_grant_reference(&gref_head); + BUG_ON(ref == -ENOSPC); + + mfn = pfn_to_mfn(page_to_pfn(page)); + gnttab_grant_foreign_access_ref(ref, + info->dev->otherend_id, mfn, grant_ro); + + shadow->gref[ref_cnt] = ref; + seg->gref = ref; + seg->offset = (uint16_t)off; + seg->length = (uint16_t)bytes; + + page++; + seg++; + len -= bytes; + data_len -= bytes; + off = 0; + ref_cnt++; + } + } + + if (seg_grants) + ring_req->nr_segments = VSCSIIF_SG_GRANT | seg_grants; + else + ring_req->nr_segments = (uint8_t)ref_cnt; + shadow->nr_grants = ref_cnt; + + return 0; +} + +static struct vscsiif_request *scsifront_command2ring( + struct vscsifrnt_info *info, struct scsi_cmnd *sc, + struct vscsifrnt_shadow *shadow) +{ + struct vscsiif_request *ring_req; + + memset(shadow, 0, sizeof(*shadow)); + + ring_req = scsifront_pre_req(info); + if (!ring_req) + return NULL; + + info->shadow[ring_req->rqid] = shadow; + shadow->rqid = ring_req->rqid; + + ring_req->id = sc->device->id; + ring_req->lun = sc->device->lun; + ring_req->channel = sc->device->channel; + ring_req->cmd_len = sc->cmd_len; + + BUG_ON(sc->cmd_len > VSCSIIF_MAX_COMMAND_SIZE); + + memcpy(ring_req->cmnd, sc->cmnd, sc->cmd_len); + + ring_req->sc_data_direction = (uint8_t)sc->sc_data_direction; + ring_req->timeout_per_command = sc->request->timeout / HZ; + + return ring_req; +} + +static int scsifront_queuecommand(struct Scsi_Host *shost, + struct scsi_cmnd *sc) +{ + struct vscsifrnt_info *info = shost_priv(shost); + struct vscsiif_request *ring_req; + struct vscsifrnt_shadow *shadow = scsi_cmd_priv(sc); + unsigned long flags; + int err; + uint16_t rqid; + + spin_lock_irqsave(shost->host_lock, flags); + if (RING_FULL(&info->ring)) + goto busy; + + ring_req = scsifront_command2ring(info, sc, shadow); + if (!ring_req) + goto busy; + + sc->result = 0; + + rqid = ring_req->rqid; + ring_req->act = VSCSIIF_ACT_SCSI_CDB; + + shadow->sc = sc; + shadow->act = VSCSIIF_ACT_SCSI_CDB; + + err = map_data_for_request(info, sc, ring_req, shadow); + if (err < 0) { + pr_debug("%s: err %d\n", __func__, err); + scsifront_put_rqid(info, rqid); + spin_unlock_irqrestore(shost->host_lock, flags); + if (err == -ENOMEM) + return SCSI_MLQUEUE_HOST_BUSY; + sc->result = DID_ERROR << 16; + sc->scsi_done(sc); + return 0; + } + + scsifront_do_request(info); + spin_unlock_irqrestore(shost->host_lock, flags); + + return 0; + +busy: + spin_unlock_irqrestore(shost->host_lock, flags); + pr_debug("%s: busy\n", __func__); + return SCSI_MLQUEUE_HOST_BUSY; +} + +/* + * Any exception handling (reset or abort) must be forwarded to the backend. + * We have to wait until an answer is returned. This answer contains the + * result to be returned to the requestor. + */ +static int scsifront_action_handler(struct scsi_cmnd *sc, uint8_t act) +{ + struct Scsi_Host *host = sc->device->host; + struct vscsifrnt_info *info = shost_priv(host); + struct vscsifrnt_shadow *shadow, *s = scsi_cmd_priv(sc); + struct vscsiif_request *ring_req; + int err = 0; + + shadow = kmalloc(sizeof(*shadow), GFP_NOIO); + if (!shadow) + return FAILED; + + for (;;) { + spin_lock_irq(host->host_lock); + if (!RING_FULL(&info->ring)) { + ring_req = scsifront_command2ring(info, sc, shadow); + if (ring_req) + break; + } + if (err) { + spin_unlock_irq(host->host_lock); + kfree(shadow); + return FAILED; + } + info->wait_ring_available = 1; + spin_unlock_irq(host->host_lock); + err = wait_event_interruptible(info->wq_sync, + !info->wait_ring_available); + spin_lock_irq(host->host_lock); + } + + ring_req->act = act; + ring_req->ref_rqid = s->rqid; + + shadow->act = act; + shadow->rslt_reset = RSLT_RESET_WAITING; + init_waitqueue_head(&shadow->wq_reset); + + ring_req->nr_segments = 0; + + scsifront_do_request(info); + + spin_unlock_irq(host->host_lock); + err = wait_event_interruptible(shadow->wq_reset, shadow->wait_reset); + spin_lock_irq(host->host_lock); + + if (!err) { + err = shadow->rslt_reset; + scsifront_put_rqid(info, shadow->rqid); + kfree(shadow); + } else { + spin_lock(&info->shadow_lock); + shadow->rslt_reset = RSLT_RESET_ERR; + spin_unlock(&info->shadow_lock); + err = FAILED; + } + + spin_unlock_irq(host->host_lock); + return err; +} + +static int scsifront_eh_abort_handler(struct scsi_cmnd *sc) +{ + pr_debug("%s\n", __func__); + return scsifront_action_handler(sc, VSCSIIF_ACT_SCSI_ABORT); +} + +static int scsifront_dev_reset_handler(struct scsi_cmnd *sc) +{ + pr_debug("%s\n", __func__); + return scsifront_action_handler(sc, VSCSIIF_ACT_SCSI_RESET); +} + +static int scsifront_sdev_configure(struct scsi_device *sdev) +{ + struct vscsifrnt_info *info = shost_priv(sdev->host); + + if (info && current == info->curr) + xenbus_printf(XBT_NIL, info->dev->nodename, + info->dev_state_path, "%d", XenbusStateConnected); + + return 0; +} + +static void scsifront_sdev_destroy(struct scsi_device *sdev) +{ + struct vscsifrnt_info *info = shost_priv(sdev->host); + + if (info && current == info->curr) + xenbus_printf(XBT_NIL, info->dev->nodename, + info->dev_state_path, "%d", XenbusStateClosed); +} + +static struct scsi_host_template scsifront_sht = { + .module = THIS_MODULE, + .name = "Xen SCSI frontend driver", + .queuecommand = scsifront_queuecommand, + .eh_abort_handler = scsifront_eh_abort_handler, + .eh_device_reset_handler = scsifront_dev_reset_handler, + .slave_configure = scsifront_sdev_configure, + .slave_destroy = scsifront_sdev_destroy, + .cmd_per_lun = VSCSIIF_DEFAULT_CMD_PER_LUN, + .can_queue = VSCSIIF_MAX_REQS, + .this_id = -1, + .cmd_size = sizeof(struct vscsifrnt_shadow), + .sg_tablesize = VSCSIIF_SG_TABLESIZE, + .use_clustering = DISABLE_CLUSTERING, + .proc_name = "scsifront", +}; + +static int scsifront_alloc_ring(struct vscsifrnt_info *info) +{ + struct xenbus_device *dev = info->dev; + struct vscsiif_sring *sring; + int err = -ENOMEM; + + /***** Frontend to Backend ring start *****/ + sring = (struct vscsiif_sring *)__get_free_page(GFP_KERNEL); + if (!sring) { + xenbus_dev_fatal(dev, err, + "fail to allocate shared ring (Front to Back)"); + return err; + } + SHARED_RING_INIT(sring); + FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); + + err = xenbus_grant_ring(dev, virt_to_mfn(sring)); + if (err < 0) { + free_page((unsigned long)sring); + xenbus_dev_fatal(dev, err, + "fail to grant shared ring (Front to Back)"); + return err; + } + info->ring_ref = err; + + err = xenbus_alloc_evtchn(dev, &info->evtchn); + if (err) { + xenbus_dev_fatal(dev, err, "xenbus_alloc_evtchn"); + goto free_gnttab; + } + + err = bind_evtchn_to_irq(info->evtchn); + if (err <= 0) { + xenbus_dev_fatal(dev, err, "bind_evtchn_to_irq"); + goto free_gnttab; + } + + info->irq = err; + + err = request_threaded_irq(info->irq, NULL, scsifront_irq_fn, + IRQF_ONESHOT, "scsifront", info); + if (err) { + xenbus_dev_fatal(dev, err, "request_threaded_irq"); + goto free_irq; + } + + return 0; + +/* free resource */ +free_irq: + unbind_from_irqhandler(info->irq, info); +free_gnttab: + gnttab_end_foreign_access(info->ring_ref, 0, + (unsigned long)info->ring.sring); + + return err; +} + +static int scsifront_init_ring(struct vscsifrnt_info *info) +{ + struct xenbus_device *dev = info->dev; + struct xenbus_transaction xbt; + int err; + + pr_debug("%s\n", __func__); + + err = scsifront_alloc_ring(info); + if (err) + return err; + pr_debug("%s: %u %u\n", __func__, info->ring_ref, info->evtchn); + +again: + err = xenbus_transaction_start(&xbt); + if (err) + xenbus_dev_fatal(dev, err, "starting transaction"); + + err = xenbus_printf(xbt, dev->nodename, "ring-ref", "%u", + info->ring_ref); + if (err) { + xenbus_dev_fatal(dev, err, "%s", "writing ring-ref"); + goto fail; + } + + err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u", + info->evtchn); + + if (err) { + xenbus_dev_fatal(dev, err, "%s", "writing event-channel"); + goto fail; + } + + err = xenbus_transaction_end(xbt, 0); + if (err) { + if (err == -EAGAIN) + goto again; + xenbus_dev_fatal(dev, err, "completing transaction"); + goto free_sring; + } + + return 0; + +fail: + xenbus_transaction_end(xbt, 1); +free_sring: + unbind_from_irqhandler(info->irq, info); + gnttab_end_foreign_access(info->ring_ref, 0, + (unsigned long)info->ring.sring); + + return err; +} + + +static int scsifront_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + struct vscsifrnt_info *info; + struct Scsi_Host *host; + int err = -ENOMEM; + char name[TASK_COMM_LEN]; + + host = scsi_host_alloc(&scsifront_sht, sizeof(*info)); + if (!host) { + xenbus_dev_fatal(dev, err, "fail to allocate scsi host"); + return err; + } + info = (struct vscsifrnt_info *)host->hostdata; + + dev_set_drvdata(&dev->dev, info); + info->dev = dev; + + bitmap_fill(info->shadow_free_bitmap, VSCSIIF_MAX_REQS); + + err = scsifront_init_ring(info); + if (err) { + scsi_host_put(host); + return err; + } + + init_waitqueue_head(&info->wq_sync); + spin_lock_init(&info->shadow_lock); + + snprintf(name, TASK_COMM_LEN, "vscsiif.%d", host->host_no); + + host->max_id = VSCSIIF_MAX_TARGET; + host->max_channel = 0; + host->max_lun = VSCSIIF_MAX_LUN; + host->max_sectors = (host->sg_tablesize - 1) * PAGE_SIZE / 512; + host->max_cmd_len = VSCSIIF_MAX_COMMAND_SIZE; + + err = scsi_add_host(host, &dev->dev); + if (err) { + dev_err(&dev->dev, "fail to add scsi host %d\n", err); + goto free_sring; + } + info->host = host; + info->host_active = 1; + + xenbus_switch_state(dev, XenbusStateInitialised); + + return 0; + +free_sring: + unbind_from_irqhandler(info->irq, info); + gnttab_end_foreign_access(info->ring_ref, 0, + (unsigned long)info->ring.sring); + scsi_host_put(host); + return err; +} + +static int scsifront_remove(struct xenbus_device *dev) +{ + struct vscsifrnt_info *info = dev_get_drvdata(&dev->dev); + + pr_debug("%s: %s removed\n", __func__, dev->nodename); + + mutex_lock(&scsifront_mutex); + if (info->host_active) { + /* Scsi_host not yet removed */ + scsi_remove_host(info->host); + info->host_active = 0; + } + mutex_unlock(&scsifront_mutex); + + gnttab_end_foreign_access(info->ring_ref, 0, + (unsigned long)info->ring.sring); + unbind_from_irqhandler(info->irq, info); + + scsi_host_put(info->host); + + return 0; +} + +static void scsifront_disconnect(struct vscsifrnt_info *info) +{ + struct xenbus_device *dev = info->dev; + struct Scsi_Host *host = info->host; + + pr_debug("%s: %s disconnect\n", __func__, dev->nodename); + + /* + * When this function is executed, all devices of + * Frontend have been deleted. + * Therefore, it need not block I/O before remove_host. + */ + + mutex_lock(&scsifront_mutex); + if (info->host_active) { + scsi_remove_host(host); + info->host_active = 0; + } + mutex_unlock(&scsifront_mutex); + + xenbus_frontend_closed(dev); +} + +static void scsifront_do_lun_hotplug(struct vscsifrnt_info *info, int op) +{ + struct xenbus_device *dev = info->dev; + int i, err = 0; + char str[64]; + char **dir; + unsigned int dir_n = 0; + unsigned int device_state; + unsigned int hst, chn, tgt, lun; + struct scsi_device *sdev; + + dir = xenbus_directory(XBT_NIL, dev->otherend, "vscsi-devs", &dir_n); + if (IS_ERR(dir)) + return; + + /* mark current task as the one allowed to modify device states */ + BUG_ON(info->curr); + info->curr = current; + + for (i = 0; i < dir_n; i++) { + /* read status */ + snprintf(str, sizeof(str), "vscsi-devs/%s/state", dir[i]); + err = xenbus_scanf(XBT_NIL, dev->otherend, str, "%u", + &device_state); + if (XENBUS_EXIST_ERR(err)) + continue; + + /* virtual SCSI device */ + snprintf(str, sizeof(str), "vscsi-devs/%s/v-dev", dir[i]); + err = xenbus_scanf(XBT_NIL, dev->otherend, str, + "%u:%u:%u:%u", &hst, &chn, &tgt, &lun); + if (XENBUS_EXIST_ERR(err)) + continue; + + /* + * Front device state path, used in slave_configure called + * on successfull scsi_add_device, and in slave_destroy called + * on remove of a device. + */ + snprintf(info->dev_state_path, sizeof(info->dev_state_path), + "vscsi-devs/%s/state", dir[i]); + + switch (op) { + case VSCSIFRONT_OP_ADD_LUN: + if (device_state != XenbusStateInitialised) + break; + + if (scsi_add_device(info->host, chn, tgt, lun)) { + dev_err(&dev->dev, "scsi_add_device\n"); + xenbus_printf(XBT_NIL, dev->nodename, + info->dev_state_path, + "%d", XenbusStateClosed); + } + break; + case VSCSIFRONT_OP_DEL_LUN: + if (device_state != XenbusStateClosing) + break; + + sdev = scsi_device_lookup(info->host, chn, tgt, lun); + if (sdev) { + scsi_remove_device(sdev); + scsi_device_put(sdev); + } + break; + default: + break; + } + } + + info->curr = NULL; + + kfree(dir); +} + +static void scsifront_read_backend_params(struct xenbus_device *dev, + struct vscsifrnt_info *info) +{ + unsigned int sg_grant; + int ret; + struct Scsi_Host *host = info->host; + + ret = xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg-grant", "%u", + &sg_grant); + if (ret == 1 && sg_grant) { + sg_grant = min_t(unsigned int, sg_grant, SG_ALL); + sg_grant = max_t(unsigned int, sg_grant, VSCSIIF_SG_TABLESIZE); + host->sg_tablesize = min_t(unsigned int, sg_grant, + VSCSIIF_SG_TABLESIZE * PAGE_SIZE / + sizeof(struct scsiif_request_segment)); + host->max_sectors = (host->sg_tablesize - 1) * PAGE_SIZE / 512; + } + dev_info(&dev->dev, "using up to %d SG entries\n", host->sg_tablesize); +} + +static void scsifront_backend_changed(struct xenbus_device *dev, + enum xenbus_state backend_state) +{ + struct vscsifrnt_info *info = dev_get_drvdata(&dev->dev); + + pr_debug("%s: %p %u %u\n", __func__, dev, dev->state, backend_state); + + switch (backend_state) { + case XenbusStateUnknown: + case XenbusStateInitialising: + case XenbusStateInitWait: + case XenbusStateInitialised: + break; + + case XenbusStateConnected: + scsifront_read_backend_params(dev, info); + if (xenbus_read_driver_state(dev->nodename) == + XenbusStateInitialised) + scsifront_do_lun_hotplug(info, VSCSIFRONT_OP_ADD_LUN); + + if (dev->state != XenbusStateConnected) + xenbus_switch_state(dev, XenbusStateConnected); + break; + + case XenbusStateClosed: + if (dev->state == XenbusStateClosed) + break; + /* Missed the backend's Closing state -- fallthrough */ + case XenbusStateClosing: + scsifront_disconnect(info); + break; + + case XenbusStateReconfiguring: + scsifront_do_lun_hotplug(info, VSCSIFRONT_OP_DEL_LUN); + xenbus_switch_state(dev, XenbusStateReconfiguring); + break; + + case XenbusStateReconfigured: + scsifront_do_lun_hotplug(info, VSCSIFRONT_OP_ADD_LUN); + xenbus_switch_state(dev, XenbusStateConnected); + break; + } +} + +static const struct xenbus_device_id scsifront_ids[] = { + { "vscsi" }, + { "" } +}; + +static DEFINE_XENBUS_DRIVER(scsifront, , + .probe = scsifront_probe, + .remove = scsifront_remove, + .otherend_changed = scsifront_backend_changed, +); + +static int __init scsifront_init(void) +{ + if (!xen_domain()) + return -ENODEV; + + return xenbus_register_frontend(&scsifront_driver); +} +module_init(scsifront_init); + +static void __exit scsifront_exit(void) +{ + xenbus_unregister_driver(&scsifront_driver); +} +module_exit(scsifront_exit); + +MODULE_DESCRIPTION("Xen SCSI frontend driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("xen:vscsi"); +MODULE_AUTHOR("Juergen Gross "); -- cgit v1.2.3 From d9d660f6e562a47b4065eeb7e538910b0471b988 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 28 Aug 2014 06:44:12 +0200 Subject: xen-scsiback: Add Xen PV SCSI backend driver Introduces the Xen pvSCSI backend. With pvSCSI it is possible for a Xen domU to issue SCSI commands to a SCSI LUN assigned to that domU. The SCSI commands are passed to the pvSCSI backend in a driver domain (usually Dom0) which is owner of the physical device. This allows e.g. to use SCSI tape drives in a Xen domU. The code is taken from the pvSCSI implementation in Xen done by Fujitsu based on Linux kernel 2.6.18. Changes from the original version are: - port to upstream kernel - put all code in just one source file - adapt to Linux style guide - use target core infrastructure instead doing pure pass-through - enable module unloading - support SG-list in grant page(s) - support task abort - remove redundant struct backend - allocate resources dynamically - correct minor error in scsiback_fast_flush_area - free allocated resources in case of error during I/O preparation - remove CDB emulation, now handled by target core infrastructure Signed-off-by: Juergen Gross Reviewed-by: Nicholas Bellinger Signed-off-by: David Vrabel --- drivers/xen/Kconfig | 9 + drivers/xen/Makefile | 1 + drivers/xen/xen-scsiback.c | 2124 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 2134 insertions(+) create mode 100644 drivers/xen/xen-scsiback.c diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 8bc01838daf9..b812462083fc 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -172,6 +172,15 @@ config XEN_PCIDEV_BACKEND If in doubt, say m. +config XEN_SCSI_BACKEND + tristate "XEN SCSI backend driver" + depends on XEN && XEN_BACKEND && TARGET_CORE + help + The SCSI backend driver allows the kernel to export its SCSI Devices + to other guests via a high-performance shared-memory interface. + Only needed for systems running as XEN driver domains (e.g. Dom0) and + if guests need generic access to SCSI devices. + config XEN_PRIVCMD tristate depends on XEN diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 84044b554e33..2140398a2a8c 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_XEN_ACPI_HOTPLUG_MEMORY) += xen-acpi-memhotplug.o obj-$(CONFIG_XEN_ACPI_HOTPLUG_CPU) += xen-acpi-cpuhotplug.o obj-$(CONFIG_XEN_ACPI_PROCESSOR) += xen-acpi-processor.o obj-$(CONFIG_XEN_EFI) += efi.o +obj-$(CONFIG_XEN_SCSI_BACKEND) += xen-scsiback.o xen-evtchn-y := evtchn.o xen-gntdev-y := gntdev.o xen-gntalloc-y := gntalloc.o diff --git a/drivers/xen/xen-scsiback.c b/drivers/xen/xen-scsiback.c new file mode 100644 index 000000000000..7b5656323739 --- /dev/null +++ b/drivers/xen/xen-scsiback.c @@ -0,0 +1,2124 @@ +/* + * Xen SCSI backend driver + * + * Copyright (c) 2008, FUJITSU Limited + * + * Based on the blkback driver code. + * Adaption to kernel taget core infrastructure taken from vhost/scsi.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define DPRINTK(_f, _a...) \ + pr_debug("(file=%s, line=%d) " _f, __FILE__ , __LINE__ , ## _a) + +#define VSCSI_VERSION "v0.1" +#define VSCSI_NAMELEN 32 + +struct ids_tuple { + unsigned int hst; /* host */ + unsigned int chn; /* channel */ + unsigned int tgt; /* target */ + unsigned int lun; /* LUN */ +}; + +struct v2p_entry { + struct ids_tuple v; /* translate from */ + struct scsiback_tpg *tpg; /* translate to */ + unsigned int lun; + struct kref kref; + struct list_head l; +}; + +struct vscsibk_info { + struct xenbus_device *dev; + + domid_t domid; + unsigned int irq; + + struct vscsiif_back_ring ring; + int ring_error; + + spinlock_t ring_lock; + atomic_t nr_unreplied_reqs; + + spinlock_t v2p_lock; + struct list_head v2p_entry_lists; + + wait_queue_head_t waiting_to_free; +}; + +/* theoretical maximum of grants for one request */ +#define VSCSI_MAX_GRANTS (SG_ALL + VSCSIIF_SG_TABLESIZE) + +/* + * VSCSI_GRANT_BATCH is the maximum number of grants to be processed in one + * call to map/unmap grants. Don't choose it too large, as there are arrays + * with VSCSI_GRANT_BATCH elements allocated on the stack. + */ +#define VSCSI_GRANT_BATCH 16 + +struct vscsibk_pend { + uint16_t rqid; + + uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE]; + uint8_t cmd_len; + + uint8_t sc_data_direction; + uint16_t n_sg; /* real length of SG list */ + uint16_t n_grants; /* SG pages and potentially SG list */ + uint32_t data_len; + uint32_t result; + + struct vscsibk_info *info; + struct v2p_entry *v2p; + struct scatterlist *sgl; + + uint8_t sense_buffer[VSCSIIF_SENSE_BUFFERSIZE]; + + grant_handle_t grant_handles[VSCSI_MAX_GRANTS]; + struct page *pages[VSCSI_MAX_GRANTS]; + + struct se_cmd se_cmd; +}; + +struct scsiback_tmr { + atomic_t tmr_complete; + wait_queue_head_t tmr_wait; +}; + +struct scsiback_nexus { + /* Pointer to TCM session for I_T Nexus */ + struct se_session *tvn_se_sess; +}; + +struct scsiback_tport { + /* SCSI protocol the tport is providing */ + u8 tport_proto_id; + /* Binary World Wide unique Port Name for pvscsi Target port */ + u64 tport_wwpn; + /* ASCII formatted WWPN for pvscsi Target port */ + char tport_name[VSCSI_NAMELEN]; + /* Returned by scsiback_make_tport() */ + struct se_wwn tport_wwn; +}; + +struct scsiback_tpg { + /* scsiback port target portal group tag for TCM */ + u16 tport_tpgt; + /* track number of TPG Port/Lun Links wrt explicit I_T Nexus shutdown */ + int tv_tpg_port_count; + /* xen-pvscsi references to tpg_nexus, protected by tv_tpg_mutex */ + int tv_tpg_fe_count; + /* list for scsiback_list */ + struct list_head tv_tpg_list; + /* Used to protect access for tpg_nexus */ + struct mutex tv_tpg_mutex; + /* Pointer to the TCM pvscsi I_T Nexus for this TPG endpoint */ + struct scsiback_nexus *tpg_nexus; + /* Pointer back to scsiback_tport */ + struct scsiback_tport *tport; + /* Returned by scsiback_make_tpg() */ + struct se_portal_group se_tpg; + /* alias used in xenstore */ + char param_alias[VSCSI_NAMELEN]; + /* list of info structures related to this target portal group */ + struct list_head info_list; +}; + +#define SCSIBACK_INVALID_HANDLE (~0) + +static bool log_print_stat; +module_param(log_print_stat, bool, 0644); + +static int scsiback_max_buffer_pages = 1024; +module_param_named(max_buffer_pages, scsiback_max_buffer_pages, int, 0644); +MODULE_PARM_DESC(max_buffer_pages, +"Maximum number of free pages to keep in backend buffer"); + +static struct kmem_cache *scsiback_cachep; +static DEFINE_SPINLOCK(free_pages_lock); +static int free_pages_num; +static LIST_HEAD(scsiback_free_pages); + +/* Global spinlock to protect scsiback TPG list */ +static DEFINE_MUTEX(scsiback_mutex); +static LIST_HEAD(scsiback_list); + +/* Local pointer to allocated TCM configfs fabric module */ +static struct target_fabric_configfs *scsiback_fabric_configfs; + +static void scsiback_get(struct vscsibk_info *info) +{ + atomic_inc(&info->nr_unreplied_reqs); +} + +static void scsiback_put(struct vscsibk_info *info) +{ + if (atomic_dec_and_test(&info->nr_unreplied_reqs)) + wake_up(&info->waiting_to_free); +} + +static void put_free_pages(struct page **page, int num) +{ + unsigned long flags; + int i = free_pages_num + num, n = num; + + if (num == 0) + return; + if (i > scsiback_max_buffer_pages) { + n = min(num, i - scsiback_max_buffer_pages); + free_xenballooned_pages(n, page + num - n); + n = num - n; + } + spin_lock_irqsave(&free_pages_lock, flags); + for (i = 0; i < n; i++) + list_add(&page[i]->lru, &scsiback_free_pages); + free_pages_num += n; + spin_unlock_irqrestore(&free_pages_lock, flags); +} + +static int get_free_page(struct page **page) +{ + unsigned long flags; + + spin_lock_irqsave(&free_pages_lock, flags); + if (list_empty(&scsiback_free_pages)) { + spin_unlock_irqrestore(&free_pages_lock, flags); + return alloc_xenballooned_pages(1, page, false); + } + page[0] = list_first_entry(&scsiback_free_pages, struct page, lru); + list_del(&page[0]->lru); + free_pages_num--; + spin_unlock_irqrestore(&free_pages_lock, flags); + return 0; +} + +static unsigned long vaddr_page(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + + return (unsigned long)pfn_to_kaddr(pfn); +} + +static unsigned long vaddr(struct vscsibk_pend *req, int seg) +{ + return vaddr_page(req->pages[seg]); +} + +static void scsiback_print_status(char *sense_buffer, int errors, + struct vscsibk_pend *pending_req) +{ + struct scsiback_tpg *tpg = pending_req->v2p->tpg; + + pr_err("xen-pvscsi[%s:%d] cmnd[0]=%02x -> st=%02x msg=%02x host=%02x drv=%02x\n", + tpg->tport->tport_name, pending_req->v2p->lun, + pending_req->cmnd[0], status_byte(errors), msg_byte(errors), + host_byte(errors), driver_byte(errors)); + + if (CHECK_CONDITION & status_byte(errors)) + __scsi_print_sense("xen-pvscsi", sense_buffer, + SCSI_SENSE_BUFFERSIZE); +} + +static void scsiback_fast_flush_area(struct vscsibk_pend *req) +{ + struct gnttab_unmap_grant_ref unmap[VSCSI_GRANT_BATCH]; + struct page *pages[VSCSI_GRANT_BATCH]; + unsigned int i, invcount = 0; + grant_handle_t handle; + int err; + + kfree(req->sgl); + req->sgl = NULL; + req->n_sg = 0; + + if (!req->n_grants) + return; + + for (i = 0; i < req->n_grants; i++) { + handle = req->grant_handles[i]; + if (handle == SCSIBACK_INVALID_HANDLE) + continue; + gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i), + GNTMAP_host_map, handle); + req->grant_handles[i] = SCSIBACK_INVALID_HANDLE; + pages[invcount] = req->pages[i]; + put_page(pages[invcount]); + invcount++; + if (invcount < VSCSI_GRANT_BATCH) + continue; + err = gnttab_unmap_refs(unmap, NULL, pages, invcount); + BUG_ON(err); + invcount = 0; + } + + if (invcount) { + err = gnttab_unmap_refs(unmap, NULL, pages, invcount); + BUG_ON(err); + } + + put_free_pages(req->pages, req->n_grants); + req->n_grants = 0; +} + +static void scsiback_free_translation_entry(struct kref *kref) +{ + struct v2p_entry *entry = container_of(kref, struct v2p_entry, kref); + struct scsiback_tpg *tpg = entry->tpg; + + mutex_lock(&tpg->tv_tpg_mutex); + tpg->tv_tpg_fe_count--; + mutex_unlock(&tpg->tv_tpg_mutex); + + kfree(entry); +} + +static void scsiback_do_resp_with_sense(char *sense_buffer, int32_t result, + uint32_t resid, struct vscsibk_pend *pending_req) +{ + struct vscsiif_response *ring_res; + struct vscsibk_info *info = pending_req->info; + int notify; + struct scsi_sense_hdr sshdr; + unsigned long flags; + unsigned len; + + spin_lock_irqsave(&info->ring_lock, flags); + + ring_res = RING_GET_RESPONSE(&info->ring, info->ring.rsp_prod_pvt); + info->ring.rsp_prod_pvt++; + + ring_res->rslt = result; + ring_res->rqid = pending_req->rqid; + + if (sense_buffer != NULL && + scsi_normalize_sense(sense_buffer, VSCSIIF_SENSE_BUFFERSIZE, + &sshdr)) { + len = min_t(unsigned, 8 + sense_buffer[7], + VSCSIIF_SENSE_BUFFERSIZE); + memcpy(ring_res->sense_buffer, sense_buffer, len); + ring_res->sense_len = len; + } else { + ring_res->sense_len = 0; + } + + ring_res->residual_len = resid; + + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&info->ring, notify); + spin_unlock_irqrestore(&info->ring_lock, flags); + + if (notify) + notify_remote_via_irq(info->irq); + + if (pending_req->v2p) + kref_put(&pending_req->v2p->kref, + scsiback_free_translation_entry); +} + +static void scsiback_cmd_done(struct vscsibk_pend *pending_req) +{ + struct vscsibk_info *info = pending_req->info; + unsigned char *sense_buffer; + unsigned int resid; + int errors; + + sense_buffer = pending_req->sense_buffer; + resid = pending_req->se_cmd.residual_count; + errors = pending_req->result; + + if (errors && log_print_stat) + scsiback_print_status(sense_buffer, errors, pending_req); + + scsiback_fast_flush_area(pending_req); + scsiback_do_resp_with_sense(sense_buffer, errors, resid, pending_req); + scsiback_put(info); +} + +static void scsiback_cmd_exec(struct vscsibk_pend *pending_req) +{ + struct se_cmd *se_cmd = &pending_req->se_cmd; + struct se_session *sess = pending_req->v2p->tpg->tpg_nexus->tvn_se_sess; + int rc; + + memset(pending_req->sense_buffer, 0, VSCSIIF_SENSE_BUFFERSIZE); + + memset(se_cmd, 0, sizeof(*se_cmd)); + + scsiback_get(pending_req->info); + rc = target_submit_cmd_map_sgls(se_cmd, sess, pending_req->cmnd, + pending_req->sense_buffer, pending_req->v2p->lun, + pending_req->data_len, 0, + pending_req->sc_data_direction, 0, + pending_req->sgl, pending_req->n_sg, + NULL, 0, NULL, 0); + if (rc < 0) { + transport_send_check_condition_and_sense(se_cmd, + TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE, 0); + transport_generic_free_cmd(se_cmd, 0); + } +} + +static int scsiback_gnttab_data_map_batch(struct gnttab_map_grant_ref *map, + struct page **pg, grant_handle_t *grant, int cnt) +{ + int err, i; + + if (!cnt) + return 0; + + err = gnttab_map_refs(map, NULL, pg, cnt); + BUG_ON(err); + for (i = 0; i < cnt; i++) { + if (unlikely(map[i].status != GNTST_okay)) { + pr_err("xen-pvscsi: invalid buffer -- could not remap it\n"); + map[i].handle = SCSIBACK_INVALID_HANDLE; + err = -ENOMEM; + } else { + get_page(pg[i]); + } + grant[i] = map[i].handle; + } + return err; +} + +static int scsiback_gnttab_data_map_list(struct vscsibk_pend *pending_req, + struct scsiif_request_segment *seg, struct page **pg, + grant_handle_t *grant, int cnt, u32 flags) +{ + int mapcount = 0, i, err = 0; + struct gnttab_map_grant_ref map[VSCSI_GRANT_BATCH]; + struct vscsibk_info *info = pending_req->info; + + for (i = 0; i < cnt; i++) { + if (get_free_page(pg + mapcount)) { + put_free_pages(pg, mapcount); + pr_err("xen-pvscsi: no grant page\n"); + return -ENOMEM; + } + gnttab_set_map_op(&map[mapcount], vaddr_page(pg[mapcount]), + flags, seg[i].gref, info->domid); + mapcount++; + if (mapcount < VSCSI_GRANT_BATCH) + continue; + err = scsiback_gnttab_data_map_batch(map, pg, grant, mapcount); + pg += mapcount; + grant += mapcount; + pending_req->n_grants += mapcount; + if (err) + return err; + mapcount = 0; + } + err = scsiback_gnttab_data_map_batch(map, pg, grant, mapcount); + pending_req->n_grants += mapcount; + return err; +} + +static int scsiback_gnttab_data_map(struct vscsiif_request *ring_req, + struct vscsibk_pend *pending_req) +{ + u32 flags; + int i, err, n_segs, i_seg = 0; + struct page **pg; + struct scsiif_request_segment *seg; + unsigned long end_seg = 0; + unsigned int nr_segments = (unsigned int)ring_req->nr_segments; + unsigned int nr_sgl = 0; + struct scatterlist *sg; + grant_handle_t *grant; + + pending_req->n_sg = 0; + pending_req->n_grants = 0; + pending_req->data_len = 0; + + nr_segments &= ~VSCSIIF_SG_GRANT; + if (!nr_segments) + return 0; + + if (nr_segments > VSCSIIF_SG_TABLESIZE) { + DPRINTK("xen-pvscsi: invalid parameter nr_seg = %d\n", + ring_req->nr_segments); + return -EINVAL; + } + + if (ring_req->nr_segments & VSCSIIF_SG_GRANT) { + err = scsiback_gnttab_data_map_list(pending_req, ring_req->seg, + pending_req->pages, pending_req->grant_handles, + nr_segments, GNTMAP_host_map | GNTMAP_readonly); + if (err) + return err; + nr_sgl = nr_segments; + nr_segments = 0; + for (i = 0; i < nr_sgl; i++) { + n_segs = ring_req->seg[i].length / + sizeof(struct scsiif_request_segment); + if ((unsigned)ring_req->seg[i].offset + + (unsigned)ring_req->seg[i].length > PAGE_SIZE || + n_segs * sizeof(struct scsiif_request_segment) != + ring_req->seg[i].length) + return -EINVAL; + nr_segments += n_segs; + } + if (nr_segments > SG_ALL) { + DPRINTK("xen-pvscsi: invalid nr_seg = %d\n", + nr_segments); + return -EINVAL; + } + } + + /* free of (sgl) in fast_flush_area()*/ + pending_req->sgl = kmalloc_array(nr_segments, + sizeof(struct scatterlist), GFP_KERNEL); + if (!pending_req->sgl) + return -ENOMEM; + + sg_init_table(pending_req->sgl, nr_segments); + pending_req->n_sg = nr_segments; + + flags = GNTMAP_host_map; + if (pending_req->sc_data_direction == DMA_TO_DEVICE) + flags |= GNTMAP_readonly; + + pg = pending_req->pages + nr_sgl; + grant = pending_req->grant_handles + nr_sgl; + if (!nr_sgl) { + seg = ring_req->seg; + err = scsiback_gnttab_data_map_list(pending_req, seg, + pg, grant, nr_segments, flags); + if (err) + return err; + } else { + for (i = 0; i < nr_sgl; i++) { + seg = (struct scsiif_request_segment *)( + vaddr(pending_req, i) + ring_req->seg[i].offset); + n_segs = ring_req->seg[i].length / + sizeof(struct scsiif_request_segment); + err = scsiback_gnttab_data_map_list(pending_req, seg, + pg, grant, n_segs, flags); + if (err) + return err; + pg += n_segs; + grant += n_segs; + } + end_seg = vaddr(pending_req, 0) + ring_req->seg[0].offset; + seg = (struct scsiif_request_segment *)end_seg; + end_seg += ring_req->seg[0].length; + pg = pending_req->pages + nr_sgl; + } + + for_each_sg(pending_req->sgl, sg, nr_segments, i) { + sg_set_page(sg, pg[i], seg->length, seg->offset); + pending_req->data_len += seg->length; + seg++; + if (nr_sgl && (unsigned long)seg >= end_seg) { + i_seg++; + end_seg = vaddr(pending_req, i_seg) + + ring_req->seg[i_seg].offset; + seg = (struct scsiif_request_segment *)end_seg; + end_seg += ring_req->seg[i_seg].length; + } + if (sg->offset >= PAGE_SIZE || + sg->length > PAGE_SIZE || + sg->offset + sg->length > PAGE_SIZE) + return -EINVAL; + } + + return 0; +} + +static void scsiback_disconnect(struct vscsibk_info *info) +{ + wait_event(info->waiting_to_free, + atomic_read(&info->nr_unreplied_reqs) == 0); + + unbind_from_irqhandler(info->irq, info); + info->irq = 0; + xenbus_unmap_ring_vfree(info->dev, info->ring.sring); +} + +static void scsiback_device_action(struct vscsibk_pend *pending_req, + enum tcm_tmreq_table act, int tag) +{ + int rc, err = FAILED; + struct scsiback_tpg *tpg = pending_req->v2p->tpg; + struct se_cmd *se_cmd = &pending_req->se_cmd; + struct scsiback_tmr *tmr; + + tmr = kzalloc(sizeof(struct scsiback_tmr), GFP_KERNEL); + if (!tmr) + goto out; + + init_waitqueue_head(&tmr->tmr_wait); + + transport_init_se_cmd(se_cmd, tpg->se_tpg.se_tpg_tfo, + tpg->tpg_nexus->tvn_se_sess, 0, DMA_NONE, MSG_SIMPLE_TAG, + &pending_req->sense_buffer[0]); + + rc = core_tmr_alloc_req(se_cmd, tmr, act, GFP_KERNEL); + if (rc < 0) + goto out; + + se_cmd->se_tmr_req->ref_task_tag = tag; + + if (transport_lookup_tmr_lun(se_cmd, pending_req->v2p->lun) < 0) + goto out; + + transport_generic_handle_tmr(se_cmd); + wait_event(tmr->tmr_wait, atomic_read(&tmr->tmr_complete)); + + err = (se_cmd->se_tmr_req->response == TMR_FUNCTION_COMPLETE) ? + SUCCESS : FAILED; + +out: + if (tmr) { + transport_generic_free_cmd(&pending_req->se_cmd, 1); + kfree(tmr); + } + + scsiback_do_resp_with_sense(NULL, err, 0, pending_req); + + kmem_cache_free(scsiback_cachep, pending_req); +} + +/* + Perform virtual to physical translation +*/ +static struct v2p_entry *scsiback_do_translation(struct vscsibk_info *info, + struct ids_tuple *v) +{ + struct v2p_entry *entry; + struct list_head *head = &(info->v2p_entry_lists); + unsigned long flags; + + spin_lock_irqsave(&info->v2p_lock, flags); + list_for_each_entry(entry, head, l) { + if ((entry->v.chn == v->chn) && + (entry->v.tgt == v->tgt) && + (entry->v.lun == v->lun)) { + kref_get(&entry->kref); + goto out; + } + } + entry = NULL; + +out: + spin_unlock_irqrestore(&info->v2p_lock, flags); + return entry; +} + +static int prepare_pending_reqs(struct vscsibk_info *info, + struct vscsiif_request *ring_req, + struct vscsibk_pend *pending_req) +{ + struct v2p_entry *v2p; + struct ids_tuple vir; + + pending_req->rqid = ring_req->rqid; + pending_req->info = info; + + vir.chn = ring_req->channel; + vir.tgt = ring_req->id; + vir.lun = ring_req->lun; + + v2p = scsiback_do_translation(info, &vir); + if (!v2p) { + pending_req->v2p = NULL; + DPRINTK("xen-pvscsi: doesn't exist.\n"); + return -ENODEV; + } + pending_req->v2p = v2p; + + /* request range check from frontend */ + pending_req->sc_data_direction = ring_req->sc_data_direction; + if ((pending_req->sc_data_direction != DMA_BIDIRECTIONAL) && + (pending_req->sc_data_direction != DMA_TO_DEVICE) && + (pending_req->sc_data_direction != DMA_FROM_DEVICE) && + (pending_req->sc_data_direction != DMA_NONE)) { + DPRINTK("xen-pvscsi: invalid parameter data_dir = %d\n", + pending_req->sc_data_direction); + return -EINVAL; + } + + pending_req->cmd_len = ring_req->cmd_len; + if (pending_req->cmd_len > VSCSIIF_MAX_COMMAND_SIZE) { + DPRINTK("xen-pvscsi: invalid parameter cmd_len = %d\n", + pending_req->cmd_len); + return -EINVAL; + } + memcpy(pending_req->cmnd, ring_req->cmnd, pending_req->cmd_len); + + return 0; +} + +static int scsiback_do_cmd_fn(struct vscsibk_info *info) +{ + struct vscsiif_back_ring *ring = &info->ring; + struct vscsiif_request *ring_req; + struct vscsibk_pend *pending_req; + RING_IDX rc, rp; + int err, more_to_do; + uint32_t result; + uint8_t act; + + rc = ring->req_cons; + rp = ring->sring->req_prod; + rmb(); /* guest system is accessing ring, too */ + + if (RING_REQUEST_PROD_OVERFLOW(ring, rp)) { + rc = ring->rsp_prod_pvt; + pr_warn("xen-pvscsi: Dom%d provided bogus ring requests (%#x - %#x = %u). Halting ring processing\n", + info->domid, rp, rc, rp - rc); + info->ring_error = 1; + return 0; + } + + while ((rc != rp)) { + if (RING_REQUEST_CONS_OVERFLOW(ring, rc)) + break; + pending_req = kmem_cache_alloc(scsiback_cachep, GFP_KERNEL); + if (!pending_req) + return 1; + + ring_req = RING_GET_REQUEST(ring, rc); + ring->req_cons = ++rc; + + act = ring_req->act; + err = prepare_pending_reqs(info, ring_req, pending_req); + if (err) { + switch (err) { + case -ENODEV: + result = DID_NO_CONNECT; + break; + default: + result = DRIVER_ERROR; + break; + } + scsiback_do_resp_with_sense(NULL, result << 24, 0, + pending_req); + kmem_cache_free(scsiback_cachep, pending_req); + return 1; + } + + switch (act) { + case VSCSIIF_ACT_SCSI_CDB: + if (scsiback_gnttab_data_map(ring_req, pending_req)) { + scsiback_fast_flush_area(pending_req); + scsiback_do_resp_with_sense(NULL, + DRIVER_ERROR << 24, 0, pending_req); + kmem_cache_free(scsiback_cachep, pending_req); + } else { + scsiback_cmd_exec(pending_req); + } + break; + case VSCSIIF_ACT_SCSI_ABORT: + scsiback_device_action(pending_req, TMR_ABORT_TASK, + ring_req->ref_rqid); + break; + case VSCSIIF_ACT_SCSI_RESET: + scsiback_device_action(pending_req, TMR_LUN_RESET, 0); + break; + default: + pr_err_ratelimited("xen-pvscsi: invalid request\n"); + scsiback_do_resp_with_sense(NULL, DRIVER_ERROR << 24, + 0, pending_req); + kmem_cache_free(scsiback_cachep, pending_req); + break; + } + + /* Yield point for this unbounded loop. */ + cond_resched(); + } + + RING_FINAL_CHECK_FOR_REQUESTS(&info->ring, more_to_do); + return more_to_do; +} + +static irqreturn_t scsiback_irq_fn(int irq, void *dev_id) +{ + struct vscsibk_info *info = dev_id; + + if (info->ring_error) + return IRQ_HANDLED; + + while (scsiback_do_cmd_fn(info)) + cond_resched(); + + return IRQ_HANDLED; +} + +static int scsiback_init_sring(struct vscsibk_info *info, grant_ref_t ring_ref, + evtchn_port_t evtchn) +{ + void *area; + struct vscsiif_sring *sring; + int err; + + if (info->irq) + return -1; + + err = xenbus_map_ring_valloc(info->dev, ring_ref, &area); + if (err) + return err; + + sring = (struct vscsiif_sring *)area; + BACK_RING_INIT(&info->ring, sring, PAGE_SIZE); + + err = bind_interdomain_evtchn_to_irq(info->domid, evtchn); + if (err < 0) + goto unmap_page; + + info->irq = err; + + err = request_threaded_irq(info->irq, NULL, scsiback_irq_fn, + IRQF_ONESHOT, "vscsiif-backend", info); + if (err) + goto free_irq; + + return 0; + +free_irq: + unbind_from_irqhandler(info->irq, info); + info->irq = 0; +unmap_page: + xenbus_unmap_ring_vfree(info->dev, area); + + return err; +} + +static int scsiback_map(struct vscsibk_info *info) +{ + struct xenbus_device *dev = info->dev; + unsigned int ring_ref, evtchn; + int err; + + err = xenbus_gather(XBT_NIL, dev->otherend, + "ring-ref", "%u", &ring_ref, + "event-channel", "%u", &evtchn, NULL); + if (err) { + xenbus_dev_fatal(dev, err, "reading %s ring", dev->otherend); + return err; + } + + return scsiback_init_sring(info, ring_ref, evtchn); +} + +/* + Add a new translation entry +*/ +static int scsiback_add_translation_entry(struct vscsibk_info *info, + char *phy, struct ids_tuple *v) +{ + int err = 0; + struct v2p_entry *entry; + struct v2p_entry *new; + struct list_head *head = &(info->v2p_entry_lists); + unsigned long flags; + char *lunp; + unsigned int lun; + struct scsiback_tpg *tpg_entry, *tpg = NULL; + char *error = "doesn't exist"; + + lunp = strrchr(phy, ':'); + if (!lunp) { + pr_err("xen-pvscsi: illegal format of physical device %s\n", + phy); + return -EINVAL; + } + *lunp = 0; + lunp++; + if (kstrtouint(lunp, 10, &lun) || lun >= TRANSPORT_MAX_LUNS_PER_TPG) { + pr_err("xen-pvscsi: lun number not valid: %s\n", lunp); + return -EINVAL; + } + + mutex_lock(&scsiback_mutex); + list_for_each_entry(tpg_entry, &scsiback_list, tv_tpg_list) { + if (!strcmp(phy, tpg_entry->tport->tport_name) || + !strcmp(phy, tpg_entry->param_alias)) { + spin_lock(&tpg_entry->se_tpg.tpg_lun_lock); + if (tpg_entry->se_tpg.tpg_lun_list[lun]->lun_status == + TRANSPORT_LUN_STATUS_ACTIVE) { + if (!tpg_entry->tpg_nexus) + error = "nexus undefined"; + else + tpg = tpg_entry; + } + spin_unlock(&tpg_entry->se_tpg.tpg_lun_lock); + break; + } + } + if (tpg) { + mutex_lock(&tpg->tv_tpg_mutex); + tpg->tv_tpg_fe_count++; + mutex_unlock(&tpg->tv_tpg_mutex); + } + mutex_unlock(&scsiback_mutex); + + if (!tpg) { + pr_err("xen-pvscsi: %s:%d %s\n", phy, lun, error); + return -ENODEV; + } + + new = kmalloc(sizeof(struct v2p_entry), GFP_KERNEL); + if (new == NULL) { + err = -ENOMEM; + goto out_free; + } + + spin_lock_irqsave(&info->v2p_lock, flags); + + /* Check double assignment to identical virtual ID */ + list_for_each_entry(entry, head, l) { + if ((entry->v.chn == v->chn) && + (entry->v.tgt == v->tgt) && + (entry->v.lun == v->lun)) { + pr_warn("xen-pvscsi: Virtual ID is already used. Assignment was not performed.\n"); + err = -EEXIST; + goto out; + } + + } + + /* Create a new translation entry and add to the list */ + kref_init(&new->kref); + new->v = *v; + new->tpg = tpg; + new->lun = lun; + list_add_tail(&new->l, head); + +out: + spin_unlock_irqrestore(&info->v2p_lock, flags); + +out_free: + mutex_lock(&tpg->tv_tpg_mutex); + tpg->tv_tpg_fe_count--; + mutex_unlock(&tpg->tv_tpg_mutex); + + if (err) + kfree(new); + + return err; +} + +static void __scsiback_del_translation_entry(struct v2p_entry *entry) +{ + list_del(&entry->l); + kref_put(&entry->kref, scsiback_free_translation_entry); +} + +/* + Delete the translation entry specfied +*/ +static int scsiback_del_translation_entry(struct vscsibk_info *info, + struct ids_tuple *v) +{ + struct v2p_entry *entry; + struct list_head *head = &(info->v2p_entry_lists); + unsigned long flags; + + spin_lock_irqsave(&info->v2p_lock, flags); + /* Find out the translation entry specified */ + list_for_each_entry(entry, head, l) { + if ((entry->v.chn == v->chn) && + (entry->v.tgt == v->tgt) && + (entry->v.lun == v->lun)) { + goto found; + } + } + + spin_unlock_irqrestore(&info->v2p_lock, flags); + return 1; + +found: + /* Delete the translation entry specfied */ + __scsiback_del_translation_entry(entry); + + spin_unlock_irqrestore(&info->v2p_lock, flags); + return 0; +} + +static void scsiback_do_add_lun(struct vscsibk_info *info, const char *state, + char *phy, struct ids_tuple *vir) +{ + if (!scsiback_add_translation_entry(info, phy, vir)) { + if (xenbus_printf(XBT_NIL, info->dev->nodename, state, + "%d", XenbusStateInitialised)) { + pr_err("xen-pvscsi: xenbus_printf error %s\n", state); + scsiback_del_translation_entry(info, vir); + } + } else { + xenbus_printf(XBT_NIL, info->dev->nodename, state, + "%d", XenbusStateClosed); + } +} + +static void scsiback_do_del_lun(struct vscsibk_info *info, const char *state, + struct ids_tuple *vir) +{ + if (!scsiback_del_translation_entry(info, vir)) { + if (xenbus_printf(XBT_NIL, info->dev->nodename, state, + "%d", XenbusStateClosed)) + pr_err("xen-pvscsi: xenbus_printf error %s\n", state); + } +} + +#define VSCSIBACK_OP_ADD_OR_DEL_LUN 1 +#define VSCSIBACK_OP_UPDATEDEV_STATE 2 + +static void scsiback_do_1lun_hotplug(struct vscsibk_info *info, int op, + char *ent) +{ + int err; + struct ids_tuple vir; + char *val; + int device_state; + char phy[VSCSI_NAMELEN]; + char str[64]; + char state[64]; + struct xenbus_device *dev = info->dev; + + /* read status */ + snprintf(state, sizeof(state), "vscsi-devs/%s/state", ent); + err = xenbus_scanf(XBT_NIL, dev->nodename, state, "%u", &device_state); + if (XENBUS_EXIST_ERR(err)) + return; + + /* physical SCSI device */ + snprintf(str, sizeof(str), "vscsi-devs/%s/p-dev", ent); + val = xenbus_read(XBT_NIL, dev->nodename, str, NULL); + if (IS_ERR(val)) { + xenbus_printf(XBT_NIL, dev->nodename, state, + "%d", XenbusStateClosed); + return; + } + strlcpy(phy, val, VSCSI_NAMELEN); + kfree(val); + + /* virtual SCSI device */ + snprintf(str, sizeof(str), "vscsi-devs/%s/v-dev", ent); + err = xenbus_scanf(XBT_NIL, dev->nodename, str, "%u:%u:%u:%u", + &vir.hst, &vir.chn, &vir.tgt, &vir.lun); + if (XENBUS_EXIST_ERR(err)) { + xenbus_printf(XBT_NIL, dev->nodename, state, + "%d", XenbusStateClosed); + return; + } + + switch (op) { + case VSCSIBACK_OP_ADD_OR_DEL_LUN: + if (device_state == XenbusStateInitialising) + scsiback_do_add_lun(info, state, phy, &vir); + if (device_state == XenbusStateClosing) + scsiback_do_del_lun(info, state, &vir); + break; + + case VSCSIBACK_OP_UPDATEDEV_STATE: + if (device_state == XenbusStateInitialised) { + /* modify vscsi-devs/dev-x/state */ + if (xenbus_printf(XBT_NIL, dev->nodename, state, + "%d", XenbusStateConnected)) { + pr_err("xen-pvscsi: xenbus_printf error %s\n", + str); + scsiback_del_translation_entry(info, &vir); + xenbus_printf(XBT_NIL, dev->nodename, state, + "%d", XenbusStateClosed); + } + } + break; + /*When it is necessary, processing is added here.*/ + default: + break; + } +} + +static void scsiback_do_lun_hotplug(struct vscsibk_info *info, int op) +{ + int i; + char **dir; + unsigned int ndir = 0; + + dir = xenbus_directory(XBT_NIL, info->dev->nodename, "vscsi-devs", + &ndir); + if (IS_ERR(dir)) + return; + + for (i = 0; i < ndir; i++) + scsiback_do_1lun_hotplug(info, op, dir[i]); + + kfree(dir); +} + +static void scsiback_frontend_changed(struct xenbus_device *dev, + enum xenbus_state frontend_state) +{ + struct vscsibk_info *info = dev_get_drvdata(&dev->dev); + + switch (frontend_state) { + case XenbusStateInitialising: + break; + + case XenbusStateInitialised: + if (scsiback_map(info)) + break; + + scsiback_do_lun_hotplug(info, VSCSIBACK_OP_ADD_OR_DEL_LUN); + xenbus_switch_state(dev, XenbusStateConnected); + break; + + case XenbusStateConnected: + scsiback_do_lun_hotplug(info, VSCSIBACK_OP_UPDATEDEV_STATE); + + if (dev->state == XenbusStateConnected) + break; + + xenbus_switch_state(dev, XenbusStateConnected); + break; + + case XenbusStateClosing: + if (info->irq) + scsiback_disconnect(info); + + xenbus_switch_state(dev, XenbusStateClosing); + break; + + case XenbusStateClosed: + xenbus_switch_state(dev, XenbusStateClosed); + if (xenbus_dev_is_online(dev)) + break; + /* fall through if not online */ + case XenbusStateUnknown: + device_unregister(&dev->dev); + break; + + case XenbusStateReconfiguring: + scsiback_do_lun_hotplug(info, VSCSIBACK_OP_ADD_OR_DEL_LUN); + xenbus_switch_state(dev, XenbusStateReconfigured); + + break; + + default: + xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend", + frontend_state); + break; + } +} + +/* + Release the translation entry specfied +*/ +static void scsiback_release_translation_entry(struct vscsibk_info *info) +{ + struct v2p_entry *entry, *tmp; + struct list_head *head = &(info->v2p_entry_lists); + unsigned long flags; + + spin_lock_irqsave(&info->v2p_lock, flags); + + list_for_each_entry_safe(entry, tmp, head, l) + __scsiback_del_translation_entry(entry); + + spin_unlock_irqrestore(&info->v2p_lock, flags); +} + +static int scsiback_remove(struct xenbus_device *dev) +{ + struct vscsibk_info *info = dev_get_drvdata(&dev->dev); + + if (info->irq) + scsiback_disconnect(info); + + scsiback_release_translation_entry(info); + + dev_set_drvdata(&dev->dev, NULL); + + return 0; +} + +static int scsiback_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err; + + struct vscsibk_info *info = kzalloc(sizeof(struct vscsibk_info), + GFP_KERNEL); + + DPRINTK("%p %d\n", dev, dev->otherend_id); + + if (!info) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating backend structure"); + return -ENOMEM; + } + info->dev = dev; + dev_set_drvdata(&dev->dev, info); + + info->domid = dev->otherend_id; + spin_lock_init(&info->ring_lock); + info->ring_error = 0; + atomic_set(&info->nr_unreplied_reqs, 0); + init_waitqueue_head(&info->waiting_to_free); + info->dev = dev; + info->irq = 0; + INIT_LIST_HEAD(&info->v2p_entry_lists); + spin_lock_init(&info->v2p_lock); + + err = xenbus_printf(XBT_NIL, dev->nodename, "feature-sg-grant", "%u", + SG_ALL); + if (err) + xenbus_dev_error(dev, err, "writing feature-sg-grant"); + + err = xenbus_switch_state(dev, XenbusStateInitWait); + if (err) + goto fail; + + return 0; + +fail: + pr_warn("xen-pvscsi: %s failed\n", __func__); + scsiback_remove(dev); + + return err; +} + +static char *scsiback_dump_proto_id(struct scsiback_tport *tport) +{ + switch (tport->tport_proto_id) { + case SCSI_PROTOCOL_SAS: + return "SAS"; + case SCSI_PROTOCOL_FCP: + return "FCP"; + case SCSI_PROTOCOL_ISCSI: + return "iSCSI"; + default: + break; + } + + return "Unknown"; +} + +static u8 scsiback_get_fabric_proto_ident(struct se_portal_group *se_tpg) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_tport *tport = tpg->tport; + + switch (tport->tport_proto_id) { + case SCSI_PROTOCOL_SAS: + return sas_get_fabric_proto_ident(se_tpg); + case SCSI_PROTOCOL_FCP: + return fc_get_fabric_proto_ident(se_tpg); + case SCSI_PROTOCOL_ISCSI: + return iscsi_get_fabric_proto_ident(se_tpg); + default: + pr_err("Unknown tport_proto_id: 0x%02x, using SAS emulation\n", + tport->tport_proto_id); + break; + } + + return sas_get_fabric_proto_ident(se_tpg); +} + +static char *scsiback_get_fabric_wwn(struct se_portal_group *se_tpg) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_tport *tport = tpg->tport; + + return &tport->tport_name[0]; +} + +static u16 scsiback_get_tag(struct se_portal_group *se_tpg) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + return tpg->tport_tpgt; +} + +static u32 scsiback_get_default_depth(struct se_portal_group *se_tpg) +{ + return 1; +} + +static u32 +scsiback_get_pr_transport_id(struct se_portal_group *se_tpg, + struct se_node_acl *se_nacl, + struct t10_pr_registration *pr_reg, + int *format_code, + unsigned char *buf) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_tport *tport = tpg->tport; + + switch (tport->tport_proto_id) { + case SCSI_PROTOCOL_SAS: + return sas_get_pr_transport_id(se_tpg, se_nacl, pr_reg, + format_code, buf); + case SCSI_PROTOCOL_FCP: + return fc_get_pr_transport_id(se_tpg, se_nacl, pr_reg, + format_code, buf); + case SCSI_PROTOCOL_ISCSI: + return iscsi_get_pr_transport_id(se_tpg, se_nacl, pr_reg, + format_code, buf); + default: + pr_err("Unknown tport_proto_id: 0x%02x, using SAS emulation\n", + tport->tport_proto_id); + break; + } + + return sas_get_pr_transport_id(se_tpg, se_nacl, pr_reg, + format_code, buf); +} + +static u32 +scsiback_get_pr_transport_id_len(struct se_portal_group *se_tpg, + struct se_node_acl *se_nacl, + struct t10_pr_registration *pr_reg, + int *format_code) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_tport *tport = tpg->tport; + + switch (tport->tport_proto_id) { + case SCSI_PROTOCOL_SAS: + return sas_get_pr_transport_id_len(se_tpg, se_nacl, pr_reg, + format_code); + case SCSI_PROTOCOL_FCP: + return fc_get_pr_transport_id_len(se_tpg, se_nacl, pr_reg, + format_code); + case SCSI_PROTOCOL_ISCSI: + return iscsi_get_pr_transport_id_len(se_tpg, se_nacl, pr_reg, + format_code); + default: + pr_err("Unknown tport_proto_id: 0x%02x, using SAS emulation\n", + tport->tport_proto_id); + break; + } + + return sas_get_pr_transport_id_len(se_tpg, se_nacl, pr_reg, + format_code); +} + +static char * +scsiback_parse_pr_out_transport_id(struct se_portal_group *se_tpg, + const char *buf, + u32 *out_tid_len, + char **port_nexus_ptr) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_tport *tport = tpg->tport; + + switch (tport->tport_proto_id) { + case SCSI_PROTOCOL_SAS: + return sas_parse_pr_out_transport_id(se_tpg, buf, out_tid_len, + port_nexus_ptr); + case SCSI_PROTOCOL_FCP: + return fc_parse_pr_out_transport_id(se_tpg, buf, out_tid_len, + port_nexus_ptr); + case SCSI_PROTOCOL_ISCSI: + return iscsi_parse_pr_out_transport_id(se_tpg, buf, out_tid_len, + port_nexus_ptr); + default: + pr_err("Unknown tport_proto_id: 0x%02x, using SAS emulation\n", + tport->tport_proto_id); + break; + } + + return sas_parse_pr_out_transport_id(se_tpg, buf, out_tid_len, + port_nexus_ptr); +} + +static struct se_wwn * +scsiback_make_tport(struct target_fabric_configfs *tf, + struct config_group *group, + const char *name) +{ + struct scsiback_tport *tport; + char *ptr; + u64 wwpn = 0; + int off = 0; + + tport = kzalloc(sizeof(struct scsiback_tport), GFP_KERNEL); + if (!tport) + return ERR_PTR(-ENOMEM); + + tport->tport_wwpn = wwpn; + /* + * Determine the emulated Protocol Identifier and Target Port Name + * based on the incoming configfs directory name. + */ + ptr = strstr(name, "naa."); + if (ptr) { + tport->tport_proto_id = SCSI_PROTOCOL_SAS; + goto check_len; + } + ptr = strstr(name, "fc."); + if (ptr) { + tport->tport_proto_id = SCSI_PROTOCOL_FCP; + off = 3; /* Skip over "fc." */ + goto check_len; + } + ptr = strstr(name, "iqn."); + if (ptr) { + tport->tport_proto_id = SCSI_PROTOCOL_ISCSI; + goto check_len; + } + + pr_err("Unable to locate prefix for emulated Target Port: %s\n", name); + kfree(tport); + return ERR_PTR(-EINVAL); + +check_len: + if (strlen(name) >= VSCSI_NAMELEN) { + pr_err("Emulated %s Address: %s, exceeds max: %d\n", name, + scsiback_dump_proto_id(tport), VSCSI_NAMELEN); + kfree(tport); + return ERR_PTR(-EINVAL); + } + snprintf(&tport->tport_name[0], VSCSI_NAMELEN, "%s", &name[off]); + + pr_debug("xen-pvscsi: Allocated emulated Target %s Address: %s\n", + scsiback_dump_proto_id(tport), name); + + return &tport->tport_wwn; +} + +static void scsiback_drop_tport(struct se_wwn *wwn) +{ + struct scsiback_tport *tport = container_of(wwn, + struct scsiback_tport, tport_wwn); + + pr_debug("xen-pvscsi: Deallocating emulated Target %s Address: %s\n", + scsiback_dump_proto_id(tport), tport->tport_name); + + kfree(tport); +} + +static struct se_node_acl * +scsiback_alloc_fabric_acl(struct se_portal_group *se_tpg) +{ + return kzalloc(sizeof(struct se_node_acl), GFP_KERNEL); +} + +static void +scsiback_release_fabric_acl(struct se_portal_group *se_tpg, + struct se_node_acl *se_nacl) +{ + kfree(se_nacl); +} + +static u32 scsiback_tpg_get_inst_index(struct se_portal_group *se_tpg) +{ + return 1; +} + +static int scsiback_check_stop_free(struct se_cmd *se_cmd) +{ + /* + * Do not release struct se_cmd's containing a valid TMR + * pointer. These will be released directly in scsiback_device_action() + * with transport_generic_free_cmd(). + */ + if (se_cmd->se_cmd_flags & SCF_SCSI_TMR_CDB) + return 0; + + transport_generic_free_cmd(se_cmd, 0); + return 1; +} + +static void scsiback_release_cmd(struct se_cmd *se_cmd) +{ + struct vscsibk_pend *pending_req = container_of(se_cmd, + struct vscsibk_pend, se_cmd); + + kmem_cache_free(scsiback_cachep, pending_req); +} + +static int scsiback_shutdown_session(struct se_session *se_sess) +{ + return 0; +} + +static void scsiback_close_session(struct se_session *se_sess) +{ +} + +static u32 scsiback_sess_get_index(struct se_session *se_sess) +{ + return 0; +} + +static int scsiback_write_pending(struct se_cmd *se_cmd) +{ + /* Go ahead and process the write immediately */ + target_execute_cmd(se_cmd); + + return 0; +} + +static int scsiback_write_pending_status(struct se_cmd *se_cmd) +{ + return 0; +} + +static void scsiback_set_default_node_attrs(struct se_node_acl *nacl) +{ +} + +static u32 scsiback_get_task_tag(struct se_cmd *se_cmd) +{ + struct vscsibk_pend *pending_req = container_of(se_cmd, + struct vscsibk_pend, se_cmd); + + return pending_req->rqid; +} + +static int scsiback_get_cmd_state(struct se_cmd *se_cmd) +{ + return 0; +} + +static int scsiback_queue_data_in(struct se_cmd *se_cmd) +{ + struct vscsibk_pend *pending_req = container_of(se_cmd, + struct vscsibk_pend, se_cmd); + + pending_req->result = SAM_STAT_GOOD; + scsiback_cmd_done(pending_req); + return 0; +} + +static int scsiback_queue_status(struct se_cmd *se_cmd) +{ + struct vscsibk_pend *pending_req = container_of(se_cmd, + struct vscsibk_pend, se_cmd); + + if (se_cmd->sense_buffer && + ((se_cmd->se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) || + (se_cmd->se_cmd_flags & SCF_EMULATED_TASK_SENSE))) + pending_req->result = (DRIVER_SENSE << 24) | + SAM_STAT_CHECK_CONDITION; + else + pending_req->result = se_cmd->scsi_status; + + scsiback_cmd_done(pending_req); + return 0; +} + +static void scsiback_queue_tm_rsp(struct se_cmd *se_cmd) +{ + struct se_tmr_req *se_tmr = se_cmd->se_tmr_req; + struct scsiback_tmr *tmr = se_tmr->fabric_tmr_ptr; + + atomic_set(&tmr->tmr_complete, 1); + wake_up(&tmr->tmr_wait); +} + +static void scsiback_aborted_task(struct se_cmd *se_cmd) +{ +} + +static ssize_t scsiback_tpg_param_show_alias(struct se_portal_group *se_tpg, + char *page) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, struct scsiback_tpg, + se_tpg); + ssize_t rb; + + mutex_lock(&tpg->tv_tpg_mutex); + rb = snprintf(page, PAGE_SIZE, "%s\n", tpg->param_alias); + mutex_unlock(&tpg->tv_tpg_mutex); + + return rb; +} + +static ssize_t scsiback_tpg_param_store_alias(struct se_portal_group *se_tpg, + const char *page, size_t count) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, struct scsiback_tpg, + se_tpg); + int len; + + if (strlen(page) >= VSCSI_NAMELEN) { + pr_err("param alias: %s, exceeds max: %d\n", page, + VSCSI_NAMELEN); + return -EINVAL; + } + + mutex_lock(&tpg->tv_tpg_mutex); + len = snprintf(tpg->param_alias, VSCSI_NAMELEN, "%s", page); + if (tpg->param_alias[len - 1] == '\n') + tpg->param_alias[len - 1] = '\0'; + mutex_unlock(&tpg->tv_tpg_mutex); + + return count; +} + +TF_TPG_PARAM_ATTR(scsiback, alias, S_IRUGO | S_IWUSR); + +static struct configfs_attribute *scsiback_param_attrs[] = { + &scsiback_tpg_param_alias.attr, + NULL, +}; + +static int scsiback_make_nexus(struct scsiback_tpg *tpg, + const char *name) +{ + struct se_portal_group *se_tpg; + struct se_session *se_sess; + struct scsiback_nexus *tv_nexus; + + mutex_lock(&tpg->tv_tpg_mutex); + if (tpg->tpg_nexus) { + mutex_unlock(&tpg->tv_tpg_mutex); + pr_debug("tpg->tpg_nexus already exists\n"); + return -EEXIST; + } + se_tpg = &tpg->se_tpg; + + tv_nexus = kzalloc(sizeof(struct scsiback_nexus), GFP_KERNEL); + if (!tv_nexus) { + mutex_unlock(&tpg->tv_tpg_mutex); + return -ENOMEM; + } + /* + * Initialize the struct se_session pointer + */ + tv_nexus->tvn_se_sess = transport_init_session(TARGET_PROT_NORMAL); + if (IS_ERR(tv_nexus->tvn_se_sess)) { + mutex_unlock(&tpg->tv_tpg_mutex); + kfree(tv_nexus); + return -ENOMEM; + } + se_sess = tv_nexus->tvn_se_sess; + /* + * Since we are running in 'demo mode' this call with generate a + * struct se_node_acl for the scsiback struct se_portal_group with + * the SCSI Initiator port name of the passed configfs group 'name'. + */ + tv_nexus->tvn_se_sess->se_node_acl = core_tpg_check_initiator_node_acl( + se_tpg, (unsigned char *)name); + if (!tv_nexus->tvn_se_sess->se_node_acl) { + mutex_unlock(&tpg->tv_tpg_mutex); + pr_debug("core_tpg_check_initiator_node_acl() failed for %s\n", + name); + goto out; + } + /* + * Now register the TCM pvscsi virtual I_T Nexus as active with the + * call to __transport_register_session() + */ + __transport_register_session(se_tpg, tv_nexus->tvn_se_sess->se_node_acl, + tv_nexus->tvn_se_sess, tv_nexus); + tpg->tpg_nexus = tv_nexus; + + mutex_unlock(&tpg->tv_tpg_mutex); + return 0; + +out: + transport_free_session(se_sess); + kfree(tv_nexus); + return -ENOMEM; +} + +static int scsiback_drop_nexus(struct scsiback_tpg *tpg) +{ + struct se_session *se_sess; + struct scsiback_nexus *tv_nexus; + + mutex_lock(&tpg->tv_tpg_mutex); + tv_nexus = tpg->tpg_nexus; + if (!tv_nexus) { + mutex_unlock(&tpg->tv_tpg_mutex); + return -ENODEV; + } + + se_sess = tv_nexus->tvn_se_sess; + if (!se_sess) { + mutex_unlock(&tpg->tv_tpg_mutex); + return -ENODEV; + } + + if (tpg->tv_tpg_port_count != 0) { + mutex_unlock(&tpg->tv_tpg_mutex); + pr_err("Unable to remove xen-pvscsi I_T Nexus with active TPG port count: %d\n", + tpg->tv_tpg_port_count); + return -EBUSY; + } + + if (tpg->tv_tpg_fe_count != 0) { + mutex_unlock(&tpg->tv_tpg_mutex); + pr_err("Unable to remove xen-pvscsi I_T Nexus with active TPG frontend count: %d\n", + tpg->tv_tpg_fe_count); + return -EBUSY; + } + + pr_debug("xen-pvscsi: Removing I_T Nexus to emulated %s Initiator Port: %s\n", + scsiback_dump_proto_id(tpg->tport), + tv_nexus->tvn_se_sess->se_node_acl->initiatorname); + + /* + * Release the SCSI I_T Nexus to the emulated xen-pvscsi Target Port + */ + transport_deregister_session(tv_nexus->tvn_se_sess); + tpg->tpg_nexus = NULL; + mutex_unlock(&tpg->tv_tpg_mutex); + + kfree(tv_nexus); + return 0; +} + +static ssize_t scsiback_tpg_show_nexus(struct se_portal_group *se_tpg, + char *page) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_nexus *tv_nexus; + ssize_t ret; + + mutex_lock(&tpg->tv_tpg_mutex); + tv_nexus = tpg->tpg_nexus; + if (!tv_nexus) { + mutex_unlock(&tpg->tv_tpg_mutex); + return -ENODEV; + } + ret = snprintf(page, PAGE_SIZE, "%s\n", + tv_nexus->tvn_se_sess->se_node_acl->initiatorname); + mutex_unlock(&tpg->tv_tpg_mutex); + + return ret; +} + +static ssize_t scsiback_tpg_store_nexus(struct se_portal_group *se_tpg, + const char *page, + size_t count) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_tport *tport_wwn = tpg->tport; + unsigned char i_port[VSCSI_NAMELEN], *ptr, *port_ptr; + int ret; + /* + * Shutdown the active I_T nexus if 'NULL' is passed.. + */ + if (!strncmp(page, "NULL", 4)) { + ret = scsiback_drop_nexus(tpg); + return (!ret) ? count : ret; + } + /* + * Otherwise make sure the passed virtual Initiator port WWN matches + * the fabric protocol_id set in scsiback_make_tport(), and call + * scsiback_make_nexus(). + */ + if (strlen(page) >= VSCSI_NAMELEN) { + pr_err("Emulated NAA Sas Address: %s, exceeds max: %d\n", + page, VSCSI_NAMELEN); + return -EINVAL; + } + snprintf(&i_port[0], VSCSI_NAMELEN, "%s", page); + + ptr = strstr(i_port, "naa."); + if (ptr) { + if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_SAS) { + pr_err("Passed SAS Initiator Port %s does not match target port protoid: %s\n", + i_port, scsiback_dump_proto_id(tport_wwn)); + return -EINVAL; + } + port_ptr = &i_port[0]; + goto check_newline; + } + ptr = strstr(i_port, "fc."); + if (ptr) { + if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_FCP) { + pr_err("Passed FCP Initiator Port %s does not match target port protoid: %s\n", + i_port, scsiback_dump_proto_id(tport_wwn)); + return -EINVAL; + } + port_ptr = &i_port[3]; /* Skip over "fc." */ + goto check_newline; + } + ptr = strstr(i_port, "iqn."); + if (ptr) { + if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_ISCSI) { + pr_err("Passed iSCSI Initiator Port %s does not match target port protoid: %s\n", + i_port, scsiback_dump_proto_id(tport_wwn)); + return -EINVAL; + } + port_ptr = &i_port[0]; + goto check_newline; + } + pr_err("Unable to locate prefix for emulated Initiator Port: %s\n", + i_port); + return -EINVAL; + /* + * Clear any trailing newline for the NAA WWN + */ +check_newline: + if (i_port[strlen(i_port) - 1] == '\n') + i_port[strlen(i_port) - 1] = '\0'; + + ret = scsiback_make_nexus(tpg, port_ptr); + if (ret < 0) + return ret; + + return count; +} + +TF_TPG_BASE_ATTR(scsiback, nexus, S_IRUGO | S_IWUSR); + +static struct configfs_attribute *scsiback_tpg_attrs[] = { + &scsiback_tpg_nexus.attr, + NULL, +}; + +static ssize_t +scsiback_wwn_show_attr_version(struct target_fabric_configfs *tf, + char *page) +{ + return sprintf(page, "xen-pvscsi fabric module %s on %s/%s on " + UTS_RELEASE"\n", + VSCSI_VERSION, utsname()->sysname, utsname()->machine); +} + +TF_WWN_ATTR_RO(scsiback, version); + +static struct configfs_attribute *scsiback_wwn_attrs[] = { + &scsiback_wwn_version.attr, + NULL, +}; + +static char *scsiback_get_fabric_name(void) +{ + return "xen-pvscsi"; +} + +static int scsiback_port_link(struct se_portal_group *se_tpg, + struct se_lun *lun) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + + mutex_lock(&tpg->tv_tpg_mutex); + tpg->tv_tpg_port_count++; + mutex_unlock(&tpg->tv_tpg_mutex); + + return 0; +} + +static void scsiback_port_unlink(struct se_portal_group *se_tpg, + struct se_lun *lun) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + + mutex_lock(&tpg->tv_tpg_mutex); + tpg->tv_tpg_port_count--; + mutex_unlock(&tpg->tv_tpg_mutex); +} + +static struct se_portal_group * +scsiback_make_tpg(struct se_wwn *wwn, + struct config_group *group, + const char *name) +{ + struct scsiback_tport *tport = container_of(wwn, + struct scsiback_tport, tport_wwn); + + struct scsiback_tpg *tpg; + unsigned long tpgt; + int ret; + + if (strstr(name, "tpgt_") != name) + return ERR_PTR(-EINVAL); + if (kstrtoul(name + 5, 10, &tpgt) || tpgt > UINT_MAX) + return ERR_PTR(-EINVAL); + + tpg = kzalloc(sizeof(struct scsiback_tpg), GFP_KERNEL); + if (!tpg) + return ERR_PTR(-ENOMEM); + + mutex_init(&tpg->tv_tpg_mutex); + INIT_LIST_HEAD(&tpg->tv_tpg_list); + INIT_LIST_HEAD(&tpg->info_list); + tpg->tport = tport; + tpg->tport_tpgt = tpgt; + + ret = core_tpg_register(&scsiback_fabric_configfs->tf_ops, wwn, + &tpg->se_tpg, tpg, TRANSPORT_TPG_TYPE_NORMAL); + if (ret < 0) { + kfree(tpg); + return NULL; + } + mutex_lock(&scsiback_mutex); + list_add_tail(&tpg->tv_tpg_list, &scsiback_list); + mutex_unlock(&scsiback_mutex); + + return &tpg->se_tpg; +} + +static void scsiback_drop_tpg(struct se_portal_group *se_tpg) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + + mutex_lock(&scsiback_mutex); + list_del(&tpg->tv_tpg_list); + mutex_unlock(&scsiback_mutex); + /* + * Release the virtual I_T Nexus for this xen-pvscsi TPG + */ + scsiback_drop_nexus(tpg); + /* + * Deregister the se_tpg from TCM.. + */ + core_tpg_deregister(se_tpg); + kfree(tpg); +} + +static int scsiback_check_true(struct se_portal_group *se_tpg) +{ + return 1; +} + +static int scsiback_check_false(struct se_portal_group *se_tpg) +{ + return 0; +} + +static struct target_core_fabric_ops scsiback_ops = { + .get_fabric_name = scsiback_get_fabric_name, + .get_fabric_proto_ident = scsiback_get_fabric_proto_ident, + .tpg_get_wwn = scsiback_get_fabric_wwn, + .tpg_get_tag = scsiback_get_tag, + .tpg_get_default_depth = scsiback_get_default_depth, + .tpg_get_pr_transport_id = scsiback_get_pr_transport_id, + .tpg_get_pr_transport_id_len = scsiback_get_pr_transport_id_len, + .tpg_parse_pr_out_transport_id = scsiback_parse_pr_out_transport_id, + .tpg_check_demo_mode = scsiback_check_true, + .tpg_check_demo_mode_cache = scsiback_check_true, + .tpg_check_demo_mode_write_protect = scsiback_check_false, + .tpg_check_prod_mode_write_protect = scsiback_check_false, + .tpg_alloc_fabric_acl = scsiback_alloc_fabric_acl, + .tpg_release_fabric_acl = scsiback_release_fabric_acl, + .tpg_get_inst_index = scsiback_tpg_get_inst_index, + .check_stop_free = scsiback_check_stop_free, + .release_cmd = scsiback_release_cmd, + .put_session = NULL, + .shutdown_session = scsiback_shutdown_session, + .close_session = scsiback_close_session, + .sess_get_index = scsiback_sess_get_index, + .sess_get_initiator_sid = NULL, + .write_pending = scsiback_write_pending, + .write_pending_status = scsiback_write_pending_status, + .set_default_node_attributes = scsiback_set_default_node_attrs, + .get_task_tag = scsiback_get_task_tag, + .get_cmd_state = scsiback_get_cmd_state, + .queue_data_in = scsiback_queue_data_in, + .queue_status = scsiback_queue_status, + .queue_tm_rsp = scsiback_queue_tm_rsp, + .aborted_task = scsiback_aborted_task, + /* + * Setup callers for generic logic in target_core_fabric_configfs.c + */ + .fabric_make_wwn = scsiback_make_tport, + .fabric_drop_wwn = scsiback_drop_tport, + .fabric_make_tpg = scsiback_make_tpg, + .fabric_drop_tpg = scsiback_drop_tpg, + .fabric_post_link = scsiback_port_link, + .fabric_pre_unlink = scsiback_port_unlink, + .fabric_make_np = NULL, + .fabric_drop_np = NULL, +#if 0 + .fabric_make_nodeacl = scsiback_make_nodeacl, + .fabric_drop_nodeacl = scsiback_drop_nodeacl, +#endif +}; + +static int scsiback_register_configfs(void) +{ + struct target_fabric_configfs *fabric; + int ret; + + pr_debug("xen-pvscsi: fabric module %s on %s/%s on "UTS_RELEASE"\n", + VSCSI_VERSION, utsname()->sysname, utsname()->machine); + /* + * Register the top level struct config_item_type with TCM core + */ + fabric = target_fabric_configfs_init(THIS_MODULE, "xen-pvscsi"); + if (IS_ERR(fabric)) + return PTR_ERR(fabric); + + /* + * Setup fabric->tf_ops from our local scsiback_ops + */ + fabric->tf_ops = scsiback_ops; + /* + * Setup default attribute lists for various fabric->tf_cit_tmpl + */ + fabric->tf_cit_tmpl.tfc_wwn_cit.ct_attrs = scsiback_wwn_attrs; + fabric->tf_cit_tmpl.tfc_tpg_base_cit.ct_attrs = scsiback_tpg_attrs; + fabric->tf_cit_tmpl.tfc_tpg_attrib_cit.ct_attrs = NULL; + fabric->tf_cit_tmpl.tfc_tpg_param_cit.ct_attrs = scsiback_param_attrs; + fabric->tf_cit_tmpl.tfc_tpg_np_base_cit.ct_attrs = NULL; + fabric->tf_cit_tmpl.tfc_tpg_nacl_base_cit.ct_attrs = NULL; + fabric->tf_cit_tmpl.tfc_tpg_nacl_attrib_cit.ct_attrs = NULL; + fabric->tf_cit_tmpl.tfc_tpg_nacl_auth_cit.ct_attrs = NULL; + fabric->tf_cit_tmpl.tfc_tpg_nacl_param_cit.ct_attrs = NULL; + /* + * Register the fabric for use within TCM + */ + ret = target_fabric_configfs_register(fabric); + if (ret < 0) { + target_fabric_configfs_free(fabric); + return ret; + } + /* + * Setup our local pointer to *fabric + */ + scsiback_fabric_configfs = fabric; + pr_debug("xen-pvscsi: Set fabric -> scsiback_fabric_configfs\n"); + return 0; +}; + +static void scsiback_deregister_configfs(void) +{ + if (!scsiback_fabric_configfs) + return; + + target_fabric_configfs_deregister(scsiback_fabric_configfs); + scsiback_fabric_configfs = NULL; + pr_debug("xen-pvscsi: Cleared scsiback_fabric_configfs\n"); +}; + +static const struct xenbus_device_id scsiback_ids[] = { + { "vscsi" }, + { "" } +}; + +static DEFINE_XENBUS_DRIVER(scsiback, , + .probe = scsiback_probe, + .remove = scsiback_remove, + .otherend_changed = scsiback_frontend_changed +); + +static void scsiback_init_pend(void *p) +{ + struct vscsibk_pend *pend = p; + int i; + + memset(pend, 0, sizeof(*pend)); + for (i = 0; i < VSCSI_MAX_GRANTS; i++) + pend->grant_handles[i] = SCSIBACK_INVALID_HANDLE; +} + +static int __init scsiback_init(void) +{ + int ret; + + if (!xen_domain()) + return -ENODEV; + + scsiback_cachep = kmem_cache_create("vscsiif_cache", + sizeof(struct vscsibk_pend), 0, 0, scsiback_init_pend); + if (!scsiback_cachep) + return -ENOMEM; + + ret = xenbus_register_backend(&scsiback_driver); + if (ret) + goto out_cache_destroy; + + ret = scsiback_register_configfs(); + if (ret) + goto out_unregister_xenbus; + + return 0; + +out_unregister_xenbus: + xenbus_unregister_driver(&scsiback_driver); +out_cache_destroy: + kmem_cache_destroy(scsiback_cachep); + pr_err("xen-pvscsi: %s: error %d\n", __func__, ret); + return ret; +} + +static void __exit scsiback_exit(void) +{ + struct page *page; + + while (free_pages_num) { + if (get_free_page(&page)) + BUG(); + free_xenballooned_pages(1, &page); + } + scsiback_deregister_configfs(); + xenbus_unregister_driver(&scsiback_driver); + kmem_cache_destroy(scsiback_cachep); +} + +module_init(scsiback_init); +module_exit(scsiback_exit); + +MODULE_DESCRIPTION("Xen SCSI backend driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS("xen-backend:vscsi"); +MODULE_AUTHOR("Juergen Gross "); -- cgit v1.2.3 From 15d036094b75e63a3817ec4f836959d38cecb1e6 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 28 Aug 2014 06:44:13 +0200 Subject: MAINTAINERS: Add xen pvscsi maintainer Add myself as maintainer for the Xen pvSCSI drivers. Signed-off-by: Juergen Gross Acked-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- MAINTAINERS | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 670b3dcce2de..52a5fb69dbf5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10156,6 +10156,15 @@ S: Supported F: drivers/block/xen-blkback/* F: drivers/block/xen* +XEN PVSCSI DRIVERS +M: Juergen Gross +L: xen-devel@lists.xenproject.org (moderated for non-subscribers) +L: linux-scsi@vger.kernel.org +S: Supported +F: drivers/scsi/xen-scsifront.c +F: drivers/xen/xen-scsiback.c +F: include/xen/interface/io/vscsiif.h + XEN SWIOTLB SUBSYSTEM M: Konrad Rzeszutek Wilk L: xen-devel@lists.xenproject.org (moderated for non-subscribers) -- cgit v1.2.3 From d9b1e6374b3a9b88774e30d0c6bf6e394cd10b76 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 8 Sep 2014 14:15:42 +0300 Subject: xen-scsifront: use GFP_ATOMIC under spin_lock This function is only called with a spin_lock held and IRQs disabled. The allocation is not allowed to sleep and NOIO is not sufficient, it has to be ATOMIC. Signed-off-by: Dan Carpenter Reviewed-by: Juergen Gross Signed-off-by: David Vrabel --- drivers/scsi/xen-scsifront.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/xen-scsifront.c b/drivers/scsi/xen-scsifront.c index 0aceb70e1d83..7e88659bf5af 100644 --- a/drivers/scsi/xen-scsifront.c +++ b/drivers/scsi/xen-scsifront.c @@ -359,7 +359,7 @@ static int map_data_for_request(struct vscsifrnt_info *info, } seg_grants = vscsiif_grants_sg(data_grants); shadow->sg = kcalloc(data_grants, - sizeof(struct scsiif_request_segment), GFP_NOIO); + sizeof(struct scsiif_request_segment), GFP_ATOMIC); if (!shadow->sg) return -ENOMEM; } -- cgit v1.2.3 From 495daef902425e241a0b95791f3aeb737928256a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 8 Sep 2014 14:17:35 +0300 Subject: xen-scsiback: clean up a type issue in scsiback_make_tpg() This code was confusing because we had an unsigned long and then we compared it to UINT_MAX and then we stored it in a u16. How many bytes is this supposed to have: 2, 4 or 16??? I've made it a u16 throughout. Signed-off-by: Dan Carpenter Reviewed-by: Juergen Gross Signed-off-by: David Vrabel --- drivers/xen/xen-scsiback.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/xen/xen-scsiback.c b/drivers/xen/xen-scsiback.c index 7b5656323739..ad11258a78d6 100644 --- a/drivers/xen/xen-scsiback.c +++ b/drivers/xen/xen-scsiback.c @@ -1885,13 +1885,14 @@ scsiback_make_tpg(struct se_wwn *wwn, struct scsiback_tport, tport_wwn); struct scsiback_tpg *tpg; - unsigned long tpgt; + u16 tpgt; int ret; if (strstr(name, "tpgt_") != name) return ERR_PTR(-EINVAL); - if (kstrtoul(name + 5, 10, &tpgt) || tpgt > UINT_MAX) - return ERR_PTR(-EINVAL); + ret = kstrtou16(name + 5, 10, &tpgt); + if (ret) + return ERR_PTR(ret); tpg = kzalloc(sizeof(struct scsiback_tpg), GFP_KERNEL); if (!tpg) -- cgit v1.2.3 From 342cd340f6e73a974053dd09ed1bf8a9c1ed4458 Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Mon, 8 Sep 2014 15:22:18 +0200 Subject: xen/efi: Directly include needed headers I discovered that some needed stuff is defined/declared in headers which are not included directly. Currently it works but if somebody remove required headers from currently included headers then build will break. So, just in case directly include all needed headers. Signed-off-by: Daniel Kiper Signed-off-by: David Vrabel --- arch/x86/xen/efi.c | 2 ++ drivers/xen/efi.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index a02e09e18f57..be14cc3e48d5 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -15,12 +15,14 @@ * with this program. If not, see . */ +#include #include #include #include #include +#include #include void __init xen_efi_init(void) diff --git a/drivers/xen/efi.c b/drivers/xen/efi.c index 31f618a49661..1f850c97482f 100644 --- a/drivers/xen/efi.c +++ b/drivers/xen/efi.c @@ -27,6 +27,8 @@ #include #include +#include + #include #define INIT_EFI_OP(name) \ -- cgit v1.2.3 From 31668511424110ad470315c6a63dec9a10f1a7ba Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 10 Apr 2014 18:46:45 +0100 Subject: x86: skip check for spurious faults for non-present faults If a fault on a kernel address is due to a non-present page, then it cannot be the result of stale TLB entry from a protection change (RO to RW or NX to X). Thus the pagetable walk in spurious_fault() can be skipped. See the initial if in spurious_fault() and the tests in spurious_fault_check()) for the set of possible error codes checked for spurious faults. These are: IRUWP Before x00xx && ( 1xxxx || xxx1x ) After ( 10001 || 00011 ) && ( 1xxxx || xxx1x ) Thus the new condition is a subset of the previous one, excluding only non-present faults (I == 1 and W == 1 are mutually exclusive). This avoids spurious_fault() oopsing in some cases if the pagetables it attempts to walk are not accessible. This obscures the location of the original fault. This also fixes a crash with Xen PV guests when they access entries in the M2P corresponding to device MMIO regions. The M2P is mapped (read-only) by Xen into the kernel address space of the guest and this mapping may contains holes for non-RAM regions. Read faults will result in calls to spurious_fault(), but because the page tables for the M2P mappings are not accessible by the guest the pagetable walk would fault. This was not normally a problem as MMIO mappings would not normally result in a M2P lookup because of the use of the _PAGE_IOMAP bit the PTE. However, removing the _PAGE_IOMAP bit requires M2P lookups for MMIO mappings as well. Signed-off-by: David Vrabel Reported-by: Konrad Rzeszutek Wilk Tested-by: Konrad Rzeszutek Wilk Acked-by: Dave Hansen --- arch/x86/mm/fault.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a24194681513..83bb03bfa259 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -933,8 +933,17 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) * cross-processor TLB flush, even if no stale TLB entries exist * on other processors. * + * Spurious faults may only occur if the TLB contains an entry with + * fewer permission than the page table entry. Non-present (P = 0) + * and reserved bit (R = 1) faults are never spurious. + * * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. + * + * Returns non-zero if a spurious fault was handled, zero otherwise. + * + * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3 + * (Optional Invalidation). */ static noinline int spurious_fault(unsigned long error_code, unsigned long address) @@ -945,8 +954,17 @@ spurious_fault(unsigned long error_code, unsigned long address) pte_t *pte; int ret; - /* Reserved-bit violation or user access to kernel space? */ - if (error_code & (PF_USER | PF_RSVD)) + /* + * Only writes to RO or instruction fetches from NX may cause + * spurious faults. + * + * These could be from user or supervisor accesses but the TLB + * is only lazily flushed after a kernel mapping protection + * change, so user accesses are not expected to cause spurious + * faults. + */ + if (error_code != (PF_WRITE | PF_PROT) + && error_code != (PF_INSTR | PF_PROT)) return 0; pgd = init_mm.pgd + pgd_index(address); -- cgit v1.2.3 From 7f2f88224517cdaad68b772b2a2095b87dc72886 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Wed, 8 Jan 2014 14:01:01 +0000 Subject: x86/xen: do not use _PAGE_IOMAP PTE flag for I/O mappings Since mfn_to_pfn() returns the correct PFN for identity mappings (as used for MMIO regions), the use of _PAGE_IOMAP is not required in pte_mfn_to_pfn(). Do not set the _PAGE_IOMAP flag in pte_pfn_to_mfn() and do not use it in pte_mfn_to_pfn(). This will allow _PAGE_IOMAP to be removed, making it available for future use. Signed-off-by: David Vrabel Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 48 ++++-------------------------------------------- 1 file changed, 4 insertions(+), 44 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 16fb0099b7f2..f62af7647ec9 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -399,38 +399,14 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) if (unlikely(mfn == INVALID_P2M_ENTRY)) { mfn = 0; flags = 0; - } else { - /* - * Paramount to do this test _after_ the - * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY & - * IDENTITY_FRAME_BIT resolves to true. - */ - mfn &= ~FOREIGN_FRAME_BIT; - if (mfn & IDENTITY_FRAME_BIT) { - mfn &= ~IDENTITY_FRAME_BIT; - flags |= _PAGE_IOMAP; - } - } + } else + mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); val = ((pteval_t)mfn << PAGE_SHIFT) | flags; } return val; } -static pteval_t iomap_pte(pteval_t val) -{ - if (val & _PAGE_PRESENT) { - unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; - pteval_t flags = val & PTE_FLAGS_MASK; - - /* We assume the pte frame number is a MFN, so - just use it as-is. */ - val = ((pteval_t)pfn << PAGE_SHIFT) | flags; - } - - return val; -} - __visible pteval_t xen_pte_val(pte_t pte) { pteval_t pteval = pte.pte; @@ -441,9 +417,6 @@ __visible pteval_t xen_pte_val(pte_t pte) pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT; } #endif - if (xen_initial_domain() && (pteval & _PAGE_IOMAP)) - return pteval; - return pte_mfn_to_pfn(pteval); } PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); @@ -481,7 +454,6 @@ void xen_set_pat(u64 pat) __visible pte_t xen_make_pte(pteval_t pte) { - phys_addr_t addr = (pte & PTE_PFN_MASK); #if 0 /* If Linux is trying to set a WC pte, then map to the Xen WC. * If _PAGE_PAT is set, then it probably means it is really @@ -496,19 +468,7 @@ __visible pte_t xen_make_pte(pteval_t pte) pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT; } #endif - /* - * Unprivileged domains are allowed to do IOMAPpings for - * PCI passthrough, but not map ISA space. The ISA - * mappings are just dummy local mappings to keep other - * parts of the kernel happy. - */ - if (unlikely(pte & _PAGE_IOMAP) && - (xen_initial_domain() || addr >= ISA_END_ADDRESS)) { - pte = iomap_pte(pte); - } else { - pte &= ~_PAGE_IOMAP; - pte = pte_pfn_to_mfn(pte); - } + pte = pte_pfn_to_mfn(pte); return native_make_pte(pte); } @@ -2091,7 +2051,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) default: /* By default, set_fixmap is used for hardware mappings */ - pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP)); + pte = mfn_pte(phys, prot); break; } -- cgit v1.2.3 From f955371ca9d3986bca100666041fcfa9b6d21962 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Tue, 7 Jan 2014 17:03:06 +0000 Subject: x86: remove the Xen-specific _PAGE_IOMAP PTE flag The _PAGE_IO_MAP PTE flag was only used by Xen PV guests to mark PTEs that were used to map I/O regions that are 1:1 in the p2m. This allowed Xen to obtain the correct PFN when converting the MFNs read from a PTE back to their PFN. Xen guests no longer use _PAGE_IOMAP for this. Instead mfn_to_pfn() returns the correct PFN by using a combination of the m2p and p2m to determine if an MFN corresponds to a 1:1 mapping in the the p2m. Remove _PAGE_IOMAP, replacing it with _PAGE_UNUSED2 to allow for future uses of the PTE flag. Signed-off-by: David Vrabel Acked-by: "H. Peter Anvin" --- arch/x86/include/asm/pgtable_types.h | 11 +++++------ arch/x86/mm/init_32.c | 2 +- arch/x86/mm/init_64.c | 2 +- arch/x86/pci/i386.c | 2 -- arch/x86/xen/enlighten.c | 2 -- 5 files changed, 7 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index f216963760e5..cee83b26916f 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -23,7 +23,6 @@ #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 #define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */ -#define _PAGE_BIT_IOMAP _PAGE_BIT_SOFTW2 /* flag used to indicate IO mapping */ #define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ @@ -52,7 +51,7 @@ #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) -#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) +#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) @@ -168,10 +167,10 @@ #define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) -#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP) -#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP) -#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP) -#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO (__PAGE_KERNEL) +#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) +#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS) +#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC) #define PAGE_KERNEL __pgprot(__PAGE_KERNEL) #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 7d05565ba781..c8140e12816a 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -537,7 +537,7 @@ static void __init pagetable_init(void) permanent_kmaps_init(pgd_base); } -pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); +pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL); EXPORT_SYMBOL_GPL(__supported_pte_mask); /* user-defined highmem size */ diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5621c47d7a1a..5d984769cbd8 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -151,7 +151,7 @@ early_param("gbpages", parse_direct_gbpages_on); * around without checking the pgd every time. */ -pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; +pteval_t __supported_pte_mask __read_mostly = ~0; EXPORT_SYMBOL_GPL(__supported_pte_mask); int force_personality32; diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 2ae525e0d8ba..37c1435889ce 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -442,8 +442,6 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, */ prot |= _PAGE_CACHE_UC_MINUS; - prot |= _PAGE_IOMAP; /* creating a mapping for IO */ - vma->vm_page_prot = __pgprot(prot); if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c0cb11fb5008..be1cff572ccf 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1559,8 +1559,6 @@ asmlinkage __visible void __init xen_start_kernel(void) #endif __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); - __supported_pte_mask |= _PAGE_IOMAP; - /* * Prevent page tables from being allocated in highmem, even * if CONFIG_HIGHPTE is enabled. -- cgit v1.2.3 From 7921a11c7b2929f3ed6fe9081edcf695c60b23c6 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 11 Sep 2014 14:20:04 +0100 Subject: xen-scsifront: don't deadlock if the ring becomes full scsifront_action_handler() will deadlock on host->host_lock, if the ring is full and it has to wait for entries to become available. Signed-off-by: David Vrabel Reviewed-by: Juergen Gross --- drivers/scsi/xen-scsifront.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/xen-scsifront.c b/drivers/scsi/xen-scsifront.c index 7e88659bf5af..cc14c8ddd369 100644 --- a/drivers/scsi/xen-scsifront.c +++ b/drivers/scsi/xen-scsifront.c @@ -541,8 +541,9 @@ static int scsifront_action_handler(struct scsi_cmnd *sc, uint8_t act) if (!shadow) return FAILED; + spin_lock_irq(host->host_lock); + for (;;) { - spin_lock_irq(host->host_lock); if (!RING_FULL(&info->ring)) { ring_req = scsifront_command2ring(info, sc, shadow); if (ring_req) -- cgit v1.2.3 From c2ba1f7d39ff4f3666f0fe0f9f55a41a81e745b6 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Wed, 17 Sep 2014 14:07:06 -0700 Subject: arm{,64}/xen: Remove "EXPERIMENTAL" in the description of the Xen options The Xen ARM API is stable since Xen 4.4 and everything has been upstreamed in Linux for ARM and ARM64. Therefore we can drop "EXPERIMENTAL" from the Xen option in the both Kconfig. Signed-off-by: Julien Grall Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org --- arch/arm/Kconfig | 2 +- arch/arm64/Kconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 32cbbd565902..aa971080567b 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1758,7 +1758,7 @@ config XEN_DOM0 depends on XEN config XEN - bool "Xen guest support on ARM (EXPERIMENTAL)" + bool "Xen guest support on ARM" depends on ARM && AEABI && OF depends on CPU_V7 && !CPU_V6 depends on !GENERIC_ATOMIC64 diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index fd4e81a4e1ce..82bd86c44540 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -317,7 +317,7 @@ config XEN_DOM0 depends on XEN config XEN - bool "Xen guest support on ARM64 (EXPERIMENTAL)" + bool "Xen guest support on ARM64" depends on ARM64 && OF select SWIOTLB_XEN help -- cgit v1.2.3 From 8ab85eba49c6f4ad67b65fb61104b912f86a1546 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 15 Sep 2014 06:45:01 +0200 Subject: xen: make pvscsi frontend dependant on xenbus frontend The pvscsi frontend driver requires the xenbus frontend driver. Reflect this in Kconfig. Signed-off-by: Juergen Gross Signed-off-by: Stefano Stabellini --- drivers/scsi/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 9130df14f742..ff62dc137021 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -579,6 +579,7 @@ config VMWARE_PVSCSI config XEN_SCSI_FRONTEND tristate "XEN SCSI frontend driver" depends on SCSI && XEN + select XEN_XENBUS_FRONTEND help The XEN SCSI frontend driver allows the kernel to access SCSI Devices within another guest OS (usually Dom0). -- cgit v1.2.3 From bca9b6855888d978d6e5399b0a8a8ed9a11e9236 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 17 Sep 2014 06:12:35 +0200 Subject: xen: sync some headers with xen tree To be able to use an initially unmapped initrd with xen the following header files must be synced to a newer version from the xen tree: include/xen/interface/elfnote.h include/xen/interface/xen.h As the KEXEC and DUMPCORE related ELFNOTES are not relevant for the kernel they are omitted from elfnote.h. Signed-off-by: Juergen Gross Signed-off-by: David Vrabel --- include/xen/interface/elfnote.h | 48 ++++++- include/xen/interface/xen.h | 272 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 294 insertions(+), 26 deletions(-) diff --git a/include/xen/interface/elfnote.h b/include/xen/interface/elfnote.h index 6f4eae328ca7..f90b03454659 100644 --- a/include/xen/interface/elfnote.h +++ b/include/xen/interface/elfnote.h @@ -3,6 +3,24 @@ * * Definitions used for the Xen ELF notes. * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * * Copyright (c) 2006, Ian Campbell, XenSource Ltd. */ @@ -18,12 +36,13 @@ * * LEGACY indicated the fields in the legacy __xen_guest string which * this a note type replaces. + * + * String values (for non-legacy) are NULL terminated ASCII, also known + * as ASCIZ type. */ /* * NAME=VALUE pair (string). - * - * LEGACY: FEATURES and PAE */ #define XEN_ELFNOTE_INFO 0 @@ -137,9 +156,29 @@ /* * Whether or not the guest supports cooperative suspend cancellation. + * This is a numeric value. + * + * Default is 0 */ #define XEN_ELFNOTE_SUSPEND_CANCEL 14 +/* + * The (non-default) location the initial phys-to-machine map should be + * placed at by the hypervisor (Dom0) or the tools (DomU). + * The kernel must be prepared for this mapping to be established using + * large pages, despite such otherwise not being available to guests. + * The kernel must also be able to handle the page table pages used for + * this mapping not being accessible through the initial mapping. + * (Only x86-64 supports this at present.) + */ +#define XEN_ELFNOTE_INIT_P2M 15 + +/* + * Whether or not the guest can deal with being passed an initrd not + * mapped through its initial page tables. + */ +#define XEN_ELFNOTE_MOD_START_PFN 16 + /* * The features supported by this kernel (numeric). * @@ -153,6 +192,11 @@ */ #define XEN_ELFNOTE_SUPPORTED_FEATURES 17 +/* + * The number of the highest elfnote defined. + */ +#define XEN_ELFNOTE_MAX XEN_ELFNOTE_SUPPORTED_FEATURES + #endif /* __XEN_PUBLIC_ELFNOTE_H__ */ /* diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index de082130ba4b..f68719f405af 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -3,6 +3,24 @@ * * Guest OS interface to Xen. * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * * Copyright (c) 2004, K A Fraser */ @@ -73,13 +91,23 @@ * VIRTUAL INTERRUPTS * * Virtual interrupts that a guest OS may receive from Xen. + * In the side comments, 'V.' denotes a per-VCPU VIRQ while 'G.' denotes a + * global VIRQ. The former can be bound once per VCPU and cannot be re-bound. + * The latter can be allocated only once per guest: they must initially be + * allocated to VCPU0 but can subsequently be re-bound. */ -#define VIRQ_TIMER 0 /* Timebase update, and/or requested timeout. */ -#define VIRQ_DEBUG 1 /* Request guest to dump debug info. */ -#define VIRQ_CONSOLE 2 /* (DOM0) Bytes received on emergency console. */ -#define VIRQ_DOM_EXC 3 /* (DOM0) Exceptional event for some domain. */ -#define VIRQ_DEBUGGER 6 /* (DOM0) A domain has paused for debugging. */ -#define VIRQ_PCPU_STATE 9 /* (DOM0) PCPU state changed */ +#define VIRQ_TIMER 0 /* V. Timebase update, and/or requested timeout. */ +#define VIRQ_DEBUG 1 /* V. Request guest to dump debug info. */ +#define VIRQ_CONSOLE 2 /* G. (DOM0) Bytes received on emergency console. */ +#define VIRQ_DOM_EXC 3 /* G. (DOM0) Exceptional event for some domain. */ +#define VIRQ_TBUF 4 /* G. (DOM0) Trace buffer has records available. */ +#define VIRQ_DEBUGGER 6 /* G. (DOM0) A domain has paused for debugging. */ +#define VIRQ_XENOPROF 7 /* V. XenOprofile interrupt: new sample available */ +#define VIRQ_CON_RING 8 /* G. (DOM0) Bytes received on console */ +#define VIRQ_PCPU_STATE 9 /* G. (DOM0) PCPU state changed */ +#define VIRQ_MEM_EVENT 10 /* G. (DOM0) A memory event has occured */ +#define VIRQ_XC_RESERVED 11 /* G. Reserved for XenClient */ +#define VIRQ_ENOMEM 12 /* G. (DOM0) Low on heap memory */ /* Architecture-specific VIRQ definitions. */ #define VIRQ_ARCH_0 16 @@ -92,24 +120,68 @@ #define VIRQ_ARCH_7 23 #define NR_VIRQS 24 + /* - * MMU-UPDATE REQUESTS - * - * HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs. - * A foreigndom (FD) can be specified (or DOMID_SELF for none). - * Where the FD has some effect, it is described below. - * ptr[1:0] specifies the appropriate MMU_* command. + * enum neg_errnoval HYPERVISOR_mmu_update(const struct mmu_update reqs[], + * unsigned count, unsigned *done_out, + * unsigned foreigndom) + * @reqs is an array of mmu_update_t structures ((ptr, val) pairs). + * @count is the length of the above array. + * @pdone is an output parameter indicating number of completed operations + * @foreigndom[15:0]: FD, the expected owner of data pages referenced in this + * hypercall invocation. Can be DOMID_SELF. + * @foreigndom[31:16]: PFD, the expected owner of pagetable pages referenced + * in this hypercall invocation. The value of this field + * (x) encodes the PFD as follows: + * x == 0 => PFD == DOMID_SELF + * x != 0 => PFD == x - 1 * + * Sub-commands: ptr[1:0] specifies the appropriate MMU_* command. + * ------------- * ptr[1:0] == MMU_NORMAL_PT_UPDATE: - * Updates an entry in a page table. If updating an L1 table, and the new - * table entry is valid/present, the mapped frame must belong to the FD, if - * an FD has been specified. If attempting to map an I/O page then the - * caller assumes the privilege of the FD. + * Updates an entry in a page table belonging to PFD. If updating an L1 table, + * and the new table entry is valid/present, the mapped frame must belong to + * FD. If attempting to map an I/O page then the caller assumes the privilege + * of the FD. * FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller. * FD == DOMID_XEN: Map restricted areas of Xen's heap space. * ptr[:2] -- Machine address of the page-table entry to modify. * val -- Value to write. * + * There also certain implicit requirements when using this hypercall. The + * pages that make up a pagetable must be mapped read-only in the guest. + * This prevents uncontrolled guest updates to the pagetable. Xen strictly + * enforces this, and will disallow any pagetable update which will end up + * mapping pagetable page RW, and will disallow using any writable page as a + * pagetable. In practice it means that when constructing a page table for a + * process, thread, etc, we MUST be very dilligient in following these rules: + * 1). Start with top-level page (PGD or in Xen language: L4). Fill out + * the entries. + * 2). Keep on going, filling out the upper (PUD or L3), and middle (PMD + * or L2). + * 3). Start filling out the PTE table (L1) with the PTE entries. Once + * done, make sure to set each of those entries to RO (so writeable bit + * is unset). Once that has been completed, set the PMD (L2) for this + * PTE table as RO. + * 4). When completed with all of the PMD (L2) entries, and all of them have + * been set to RO, make sure to set RO the PUD (L3). Do the same + * operation on PGD (L4) pagetable entries that have a PUD (L3) entry. + * 5). Now before you can use those pages (so setting the cr3), you MUST also + * pin them so that the hypervisor can verify the entries. This is done + * via the HYPERVISOR_mmuext_op(MMUEXT_PIN_L4_TABLE, guest physical frame + * number of the PGD (L4)). And this point the HYPERVISOR_mmuext_op( + * MMUEXT_NEW_BASEPTR, guest physical frame number of the PGD (L4)) can be + * issued. + * For 32-bit guests, the L4 is not used (as there is less pagetables), so + * instead use L3. + * At this point the pagetables can be modified using the MMU_NORMAL_PT_UPDATE + * hypercall. Also if so desired the OS can also try to write to the PTE + * and be trapped by the hypervisor (as the PTE entry is RO). + * + * To deallocate the pages, the operations are the reverse of the steps + * mentioned above. The argument is MMUEXT_UNPIN_TABLE for all levels and the + * pagetable MUST not be in use (meaning that the cr3 is not set to it). + * * ptr[1:0] == MMU_MACHPHYS_UPDATE: * Updates an entry in the machine->pseudo-physical mapping table. * ptr[:2] -- Machine address within the frame whose mapping to modify. @@ -119,6 +191,72 @@ * ptr[1:0] == MMU_PT_UPDATE_PRESERVE_AD: * As MMU_NORMAL_PT_UPDATE above, but A/D bits currently in the PTE are ORed * with those in @val. + * + * @val is usually the machine frame number along with some attributes. + * The attributes by default follow the architecture defined bits. Meaning that + * if this is a X86_64 machine and four page table layout is used, the layout + * of val is: + * - 63 if set means No execute (NX) + * - 46-13 the machine frame number + * - 12 available for guest + * - 11 available for guest + * - 10 available for guest + * - 9 available for guest + * - 8 global + * - 7 PAT (PSE is disabled, must use hypercall to make 4MB or 2MB pages) + * - 6 dirty + * - 5 accessed + * - 4 page cached disabled + * - 3 page write through + * - 2 userspace accessible + * - 1 writeable + * - 0 present + * + * The one bits that does not fit with the default layout is the PAGE_PSE + * also called PAGE_PAT). The MMUEXT_[UN]MARK_SUPER arguments to the + * HYPERVISOR_mmuext_op serve as mechanism to set a pagetable to be 4MB + * (or 2MB) instead of using the PAGE_PSE bit. + * + * The reason that the PAGE_PSE (bit 7) is not being utilized is due to Xen + * using it as the Page Attribute Table (PAT) bit - for details on it please + * refer to Intel SDM 10.12. The PAT allows to set the caching attributes of + * pages instead of using MTRRs. + * + * The PAT MSR is as follows (it is a 64-bit value, each entry is 8 bits): + * PAT4 PAT0 + * +-----+-----+----+----+----+-----+----+----+ + * | UC | UC- | WC | WB | UC | UC- | WC | WB | <= Linux + * +-----+-----+----+----+----+-----+----+----+ + * | UC | UC- | WT | WB | UC | UC- | WT | WB | <= BIOS (default when machine boots) + * +-----+-----+----+----+----+-----+----+----+ + * | rsv | rsv | WP | WC | UC | UC- | WT | WB | <= Xen + * +-----+-----+----+----+----+-----+----+----+ + * + * The lookup of this index table translates to looking up + * Bit 7, Bit 4, and Bit 3 of val entry: + * + * PAT/PSE (bit 7) ... PCD (bit 4) .. PWT (bit 3). + * + * If all bits are off, then we are using PAT0. If bit 3 turned on, + * then we are using PAT1, if bit 3 and bit 4, then PAT2.. + * + * As you can see, the Linux PAT1 translates to PAT4 under Xen. Which means + * that if a guest that follows Linux's PAT setup and would like to set Write + * Combined on pages it MUST use PAT4 entry. Meaning that Bit 7 (PAGE_PAT) is + * set. For example, under Linux it only uses PAT0, PAT1, and PAT2 for the + * caching as: + * + * WB = none (so PAT0) + * WC = PWT (bit 3 on) + * UC = PWT | PCD (bit 3 and 4 are on). + * + * To make it work with Xen, it needs to translate the WC bit as so: + * + * PWT (so bit 3 on) --> PAT (so bit 7 is on) and clear bit 3 + * + * And to translate back it would: + * + * PAT (bit 7 on) --> PWT (bit 3 on) and clear bit 7. */ #define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */ #define MMU_MACHPHYS_UPDATE 1 /* ptr = MA of frame to modify entry for */ @@ -127,7 +265,12 @@ /* * MMU EXTENDED OPERATIONS * - * HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures. + * enum neg_errnoval HYPERVISOR_mmuext_op(mmuext_op_t uops[], + * unsigned int count, + * unsigned int *pdone, + * unsigned int foreigndom) + */ +/* HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures. * A foreigndom (FD) can be specified (or DOMID_SELF for none). * Where the FD has some effect, it is described below. * @@ -164,9 +307,23 @@ * cmd: MMUEXT_FLUSH_CACHE * No additional arguments. Writes back and flushes cache contents. * + * cmd: MMUEXT_FLUSH_CACHE_GLOBAL + * No additional arguments. Writes back and flushes cache contents + * on all CPUs in the system. + * * cmd: MMUEXT_SET_LDT * linear_addr: Linear address of LDT base (NB. must be page-aligned). * nr_ents: Number of entries in LDT. + * + * cmd: MMUEXT_CLEAR_PAGE + * mfn: Machine frame number to be cleared. + * + * cmd: MMUEXT_COPY_PAGE + * mfn: Machine frame number of the destination page. + * src_mfn: Machine frame number of the source page. + * + * cmd: MMUEXT_[UN]MARK_SUPER + * mfn: Machine frame number of head of superpage to be [un]marked. */ #define MMUEXT_PIN_L1_TABLE 0 #define MMUEXT_PIN_L2_TABLE 1 @@ -183,12 +340,18 @@ #define MMUEXT_FLUSH_CACHE 12 #define MMUEXT_SET_LDT 13 #define MMUEXT_NEW_USER_BASEPTR 15 +#define MMUEXT_CLEAR_PAGE 16 +#define MMUEXT_COPY_PAGE 17 +#define MMUEXT_FLUSH_CACHE_GLOBAL 18 +#define MMUEXT_MARK_SUPER 19 +#define MMUEXT_UNMARK_SUPER 20 #ifndef __ASSEMBLY__ struct mmuext_op { unsigned int cmd; union { - /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */ + /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR + * CLEAR_PAGE, COPY_PAGE, [UN]MARK_SUPER */ xen_pfn_t mfn; /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */ unsigned long linear_addr; @@ -198,6 +361,8 @@ struct mmuext_op { unsigned int nr_ents; /* TLB_FLUSH_MULTI, INVLPG_MULTI */ void *vcpumask; + /* COPY_PAGE */ + xen_pfn_t src_mfn; } arg2; }; DEFINE_GUEST_HANDLE_STRUCT(mmuext_op); @@ -225,10 +390,23 @@ DEFINE_GUEST_HANDLE_STRUCT(mmuext_op); */ #define VMASST_CMD_enable 0 #define VMASST_CMD_disable 1 + +/* x86/32 guests: simulate full 4GB segment limits. */ #define VMASST_TYPE_4gb_segments 0 + +/* x86/32 guests: trap (vector 15) whenever above vmassist is used. */ #define VMASST_TYPE_4gb_segments_notify 1 + +/* + * x86 guests: support writes to bottom-level PTEs. + * NB1. Page-directory entries cannot be written. + * NB2. Guest must continue to remove all writable mappings of PTEs. + */ #define VMASST_TYPE_writable_pagetables 2 + +/* x86/PAE guests: support PDPTs above 4GB. */ #define VMASST_TYPE_pae_extended_cr3 3 + #define MAX_VMASST_TYPE 3 #ifndef __ASSEMBLY__ @@ -260,6 +438,15 @@ typedef uint16_t domid_t; */ #define DOMID_XEN (0x7FF2U) +/* DOMID_COW is used as the owner of sharable pages */ +#define DOMID_COW (0x7FF3U) + +/* DOMID_INVALID is used to identify pages with unknown owner. */ +#define DOMID_INVALID (0x7FF4U) + +/* Idle domain. */ +#define DOMID_IDLE (0x7FFFU) + /* * Send an array of these to HYPERVISOR_mmu_update(). * NB. The fields are natural pointer/address size for this architecture. @@ -272,7 +459,9 @@ DEFINE_GUEST_HANDLE_STRUCT(mmu_update); /* * Send an array of these to HYPERVISOR_multicall(). - * NB. The fields are natural register size for this architecture. + * NB. The fields are logically the natural register size for this + * architecture. In cases where xen_ulong_t is larger than this then + * any unused bits in the upper portion must be zero. */ struct multicall_entry { xen_ulong_t op; @@ -442,8 +631,48 @@ struct start_info { unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */ unsigned long mod_len; /* Size (bytes) of pre-loaded module. */ int8_t cmd_line[MAX_GUEST_CMDLINE]; + /* The pfn range here covers both page table and p->m table frames. */ + unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table. */ + unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table. */ }; +/* These flags are passed in the 'flags' field of start_info_t. */ +#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */ +#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */ +#define SIF_MULTIBOOT_MOD (1<<2) /* Is mod_start a multiboot module? */ +#define SIF_MOD_START_PFN (1<<3) /* Is mod_start a PFN? */ +#define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */ + +/* + * A multiboot module is a package containing modules very similar to a + * multiboot module array. The only differences are: + * - the array of module descriptors is by convention simply at the beginning + * of the multiboot module, + * - addresses in the module descriptors are based on the beginning of the + * multiboot module, + * - the number of modules is determined by a termination descriptor that has + * mod_start == 0. + * + * This permits to both build it statically and reference it in a configuration + * file, and let the PV guest easily rebase the addresses to virtual addresses + * and at the same time count the number of modules. + */ +struct xen_multiboot_mod_list { + /* Address of first byte of the module */ + uint32_t mod_start; + /* Address of last byte of the module (inclusive) */ + uint32_t mod_end; + /* Address of zero-terminated command line */ + uint32_t cmdline; + /* Unused, must be zero */ + uint32_t pad; +}; +/* + * The console structure in start_info.console.dom0 + * + * This structure includes a variety of information required to + * have a working VGA/VESA console. + */ struct dom0_vga_console_info { uint8_t video_type; #define XEN_VGATYPE_TEXT_MODE_3 0x03 @@ -484,11 +713,6 @@ struct dom0_vga_console_info { } u; }; -/* These flags are passed in the 'flags' field of start_info_t. */ -#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */ -#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */ -#define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */ - typedef uint64_t cpumap_t; typedef uint8_t xen_domain_handle_t[16]; -- cgit v1.2.3 From d1e9abd630fe66046087f6501a4e4b8de55f7ab9 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 17 Sep 2014 06:12:36 +0200 Subject: xen: eliminate scalability issues from initrd handling Size restrictions native kernels wouldn't have resulted from the initrd getting mapped into the initial mapping. The kernel doesn't really need the initrd to be mapped, so use infrastructure available in Xen to avoid the mapping and hence the restriction. Signed-off-by: Juergen Gross Signed-off-by: David Vrabel --- arch/x86/xen/enlighten.c | 11 +++++++++-- arch/x86/xen/xen-head.S | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index be1cff572ccf..f7e01fa1545f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1519,6 +1519,7 @@ static void __init xen_pvh_early_guest_init(void) asmlinkage __visible void __init xen_start_kernel(void) { struct physdev_set_iopl set_iopl; + unsigned long initrd_start = 0; int rc; if (!xen_start_info) @@ -1665,10 +1666,16 @@ asmlinkage __visible void __init xen_start_kernel(void) new_cpu_data.x86_capability[0] = cpuid_edx(1); #endif + if (xen_start_info->mod_start) { + if (xen_start_info->flags & SIF_MOD_START_PFN) + initrd_start = PFN_PHYS(xen_start_info->mod_start); + else + initrd_start = __pa(xen_start_info->mod_start); + } + /* Poke various useful things into boot_params */ boot_params.hdr.type_of_loader = (9 << 4) | 0; - boot_params.hdr.ramdisk_image = xen_start_info->mod_start - ? __pa(xen_start_info->mod_start) : 0; + boot_params.hdr.ramdisk_image = initrd_start; boot_params.hdr.ramdisk_size = xen_start_info->mod_len; boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 485b69585540..46408e51c1dc 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -124,6 +124,7 @@ NEXT_HYPERCALL(arch_6) ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT; .quad _PAGE_PRESENT) ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) + ELFNOTE(Xen, XEN_ELFNOTE_MOD_START_PFN, .long 1) ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START) ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0) -- cgit v1.2.3 From a2ef5dc2c7cbedbeb4c847039845afaea5e63745 Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Wed, 10 Sep 2014 16:36:06 -0700 Subject: x86/xen: Set EFER.NX and EFER.SCE in PVH guests This fixes two bugs in PVH guests: - Not setting EFER.NX means the NX bit in page table entries is ignored on Intel processors and causes reserved bit page faults on AMD processors. - After the Xen commit 7645640d6ff1 ("x86/PVH: don't set EFER_SCE for pvh guest") PVH guests are required to set EFER.SCE to enable the SYSCALL instruction. Secondary VCPUs are started with pagetables with the NX bit set so EFER.NX must be set before using any stack or data segment. xen_pvh_cpu_early_init() is the new secondary VCPU entry point that sets EFER before jumping to cpu_bringup_and_idle(). Signed-off-by: Mukesh Rathor Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- arch/x86/xen/enlighten.c | 6 ++++++ arch/x86/xen/smp.c | 29 ++++++++++++++++++----------- arch/x86/xen/smp.h | 8 ++++++++ arch/x86/xen/xen-head.S | 35 +++++++++++++++++++++++++++++++++++ 4 files changed, 67 insertions(+), 11 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index f7e01fa1545f..acb0effd8077 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1463,6 +1463,7 @@ static void __ref xen_setup_gdt(int cpu) pv_cpu_ops.load_gdt = xen_load_gdt; } +#ifdef CONFIG_XEN_PVH /* * A PV guest starts with default flags that are not set for PVH, set them * here asap. @@ -1508,12 +1509,15 @@ static void __init xen_pvh_early_guest_init(void) return; xen_have_vector_callback = 1; + + xen_pvh_early_cpu_init(0, false); xen_pvh_set_cr_flags(0); #ifdef CONFIG_X86_32 BUG(); /* PVH: Implement proper support. */ #endif } +#endif /* CONFIG_XEN_PVH */ /* First C function to be called on Xen boot */ asmlinkage __visible void __init xen_start_kernel(void) @@ -1528,7 +1532,9 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_domain_type = XEN_PV_DOMAIN; xen_setup_features(); +#ifdef CONFIG_XEN_PVH xen_pvh_early_guest_init(); +#endif xen_setup_machphys_mapping(); /* Install Xen paravirt ops */ diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 7005974c3ff3..c670d7518cf4 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -37,6 +37,7 @@ #include #include "xen-ops.h" #include "mmu.h" +#include "smp.h" cpumask_var_t xen_cpu_initialized_map; @@ -99,10 +100,14 @@ static void cpu_bringup(void) wmb(); /* make sure everything is out */ } -/* Note: cpu parameter is only relevant for PVH */ -static void cpu_bringup_and_idle(int cpu) +/* + * Note: cpu parameter is only relevant for PVH. The reason for passing it + * is we can't do smp_processor_id until the percpu segments are loaded, for + * which we need the cpu number! So we pass it in rdi as first parameter. + */ +asmlinkage __visible void cpu_bringup_and_idle(int cpu) { -#ifdef CONFIG_X86_64 +#ifdef CONFIG_XEN_PVH if (xen_feature(XENFEAT_auto_translated_physmap) && xen_feature(XENFEAT_supervisor_mode_kernel)) xen_pvh_secondary_vcpu_init(cpu); @@ -374,11 +379,10 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->user_regs.fs = __KERNEL_PERCPU; ctxt->user_regs.gs = __KERNEL_STACK_CANARY; #endif - ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; - memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); if (!xen_feature(XENFEAT_auto_translated_physmap)) { + ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; ctxt->flags = VGCF_IN_KERNEL; ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ ctxt->user_regs.ds = __USER_DS; @@ -413,15 +417,18 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) (unsigned long)xen_failsafe_callback; ctxt->user_regs.cs = __KERNEL_CS; per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); -#ifdef CONFIG_X86_32 } -#else - } else - /* N.B. The user_regs.eip (cpu_bringup_and_idle) is called with - * %rdi having the cpu number - which means are passing in - * as the first parameter the cpu. Subtle! +#ifdef CONFIG_XEN_PVH + else { + /* + * The vcpu comes on kernel page tables which have the NX pte + * bit set. This means before DS/SS is touched, NX in + * EFER must be set. Hence the following assembly glue code. */ + ctxt->user_regs.eip = (unsigned long)xen_pvh_early_cpu_init; ctxt->user_regs.rdi = cpu; + ctxt->user_regs.rsi = true; /* entry == true */ + } #endif ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h index c7c2d89efd76..963d62a35c82 100644 --- a/arch/x86/xen/smp.h +++ b/arch/x86/xen/smp.h @@ -8,4 +8,12 @@ extern void xen_send_IPI_allbutself(int vector); extern void xen_send_IPI_all(int vector); extern void xen_send_IPI_self(int vector); +#ifdef CONFIG_XEN_PVH +extern void xen_pvh_early_cpu_init(int cpu, bool entry); +#else +static inline void xen_pvh_early_cpu_init(int cpu, bool entry) +{ +} +#endif + #endif diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 46408e51c1dc..674b222544b7 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -47,6 +47,41 @@ ENTRY(startup_xen) __FINIT +#ifdef CONFIG_XEN_PVH +/* + * xen_pvh_early_cpu_init() - early PVH VCPU initialization + * @cpu: this cpu number (%rdi) + * @entry: true if this is a secondary vcpu coming up on this entry + * point, false if this is the boot CPU being initialized for + * the first time (%rsi) + * + * Note: This is called as a function on the boot CPU, and is the entry point + * on the secondary CPU. + */ +ENTRY(xen_pvh_early_cpu_init) + mov %rsi, %r11 + + /* Gather features to see if NX implemented. */ + mov $0x80000001, %eax + cpuid + mov %edx, %esi + + mov $MSR_EFER, %ecx + rdmsr + bts $_EFER_SCE, %eax + + bt $20, %esi + jnc 1f /* No NX, skip setting it */ + bts $_EFER_NX, %eax +1: wrmsr +#ifdef CONFIG_SMP + cmp $0, %r11b + jne cpu_bringup_and_idle +#endif + ret + +#endif /* CONFIG_XEN_PVH */ + .pushsection .text .balign PAGE_SIZE ENTRY(hypercall_page) -- cgit v1.2.3 From c7440a2f225e3b37abbe27f069465cd31ba94b3c Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Fri, 26 Sep 2014 23:34:29 +0800 Subject: xen/xenbus: Correct the comments for xenbus_grant_ring() A grant reference (which is a positive number) can indicate success, so the original comments need be improved. Signed-off-by: Chen Gang Signed-off-by: David Vrabel --- drivers/xen/xenbus/xenbus_client.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 439c9dca9eee..aa9b2fcdaeb2 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -361,8 +361,8 @@ static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err, * @ring_mfn: mfn of ring to grant * Grant access to the given @ring_mfn to the peer of the given device. Return - * 0 on success, or -errno on error. On error, the device will switch to - * XenbusStateClosing, and the error will be saved in the store. + * a grant reference on success, or -errno on error. On error, the device will + * switch to XenbusStateClosing, and the error will be saved in the store. */ int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn) { -- cgit v1.2.3 From 305559f16538708b603ceeb317ebaed9c4da9ce9 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Fri, 26 Sep 2014 23:36:03 +0800 Subject: xen/xenbus: Remove BUG_ON() when error string trucated xenbus_va_dev_error() is for printing error, so when error string is too long to be truncated, need not BUG_ON(), still return truncation string is OK. Signed-off-by: Chen Gang Signed-off-by: David Vrabel --- drivers/xen/xenbus/xenbus_client.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index aa9b2fcdaeb2..ca744102b666 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -259,7 +259,6 @@ static char *error_path(struct xenbus_device *dev) static void xenbus_va_dev_error(struct xenbus_device *dev, int err, const char *fmt, va_list ap) { - int ret; unsigned int len; char *printf_buffer = NULL; char *path_buffer = NULL; @@ -270,9 +269,7 @@ static void xenbus_va_dev_error(struct xenbus_device *dev, int err, goto fail; len = sprintf(printf_buffer, "%i ", -err); - ret = vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap); - - BUG_ON(len + ret > PRINTF_BUFFER_SIZE-1); + vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap); dev_err(&dev->dev, "%s\n", printf_buffer); -- cgit v1.2.3 From 95afae481414cbdb0567bf82d5e5077c3ac9da20 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 8 Sep 2014 17:30:41 +0100 Subject: xen: remove DEFINE_XENBUS_DRIVER() macro The DEFINE_XENBUS_DRIVER() macro looks a bit weird and causes sparse errors. Replace the uses with standard structure definitions instead. This is similar to pci and usb device registration. Signed-off-by: David Vrabel --- drivers/block/xen-blkback/xenbus.c | 11 +++-------- drivers/block/xen-blkfront.c | 5 +++-- drivers/char/tpm/xen-tpmfront.c | 13 +++++++------ drivers/input/misc/xen-kbdfront.c | 5 +++-- drivers/net/xen-netback/xenbus.c | 10 +++------- drivers/net/xen-netfront.c | 16 ++++++++-------- drivers/pci/xen-pcifront.c | 6 ++++-- drivers/scsi/xen-scsifront.c | 5 +++-- drivers/tty/hvc/hvc_xen.c | 9 ++++----- drivers/video/fbdev/xen-fbfront.c | 5 +++-- drivers/xen/xen-pciback/xenbus.c | 6 ++++-- drivers/xen/xen-scsiback.c | 5 +++-- drivers/xen/xenbus/xenbus_probe.c | 6 +++++- drivers/xen/xenbus/xenbus_probe.h | 4 +++- drivers/xen/xenbus/xenbus_probe_backend.c | 8 +++++--- drivers/xen/xenbus/xenbus_probe_frontend.c | 8 +++++--- include/xen/xenbus.h | 21 ++++++++++++--------- 17 files changed, 78 insertions(+), 65 deletions(-) diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 3a8b810b4980..0b13b1c9a01e 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -907,22 +907,17 @@ static int connect_ring(struct backend_info *be) return 0; } - -/* ** Driver Registration ** */ - - static const struct xenbus_device_id xen_blkbk_ids[] = { { "vbd" }, { "" } }; - -static DEFINE_XENBUS_DRIVER(xen_blkbk, , +static struct xenbus_driver xen_blkbk_driver = { + .ids = xen_blkbk_ids, .probe = xen_blkbk_probe, .remove = xen_blkbk_remove, .otherend_changed = frontend_changed -); - +}; int xen_blkif_xenbus_init(void) { diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 5deb235bd18f..37af03e9d859 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2055,13 +2055,14 @@ static const struct xenbus_device_id blkfront_ids[] = { { "" } }; -static DEFINE_XENBUS_DRIVER(blkfront, , +static struct xenbus_driver blkfront_driver = { + .ids = blkfront_ids, .probe = blkfront_probe, .remove = blkfront_remove, .resume = blkfront_resume, .otherend_changed = blkback_changed, .is_ready = blkfront_is_ready, -); +}; static int __init xlblk_init(void) { diff --git a/drivers/char/tpm/xen-tpmfront.c b/drivers/char/tpm/xen-tpmfront.c index 2064b4527040..441b44e54226 100644 --- a/drivers/char/tpm/xen-tpmfront.c +++ b/drivers/char/tpm/xen-tpmfront.c @@ -367,12 +367,13 @@ static const struct xenbus_device_id tpmfront_ids[] = { }; MODULE_ALIAS("xen:vtpm"); -static DEFINE_XENBUS_DRIVER(tpmfront, , - .probe = tpmfront_probe, - .remove = tpmfront_remove, - .resume = tpmfront_resume, - .otherend_changed = backend_changed, - ); +static struct xenbus_driver tpmfront_driver = { + .ids = tpmfront_ids, + .probe = tpmfront_probe, + .remove = tpmfront_remove, + .resume = tpmfront_resume, + .otherend_changed = backend_changed, +}; static int __init xen_tpmfront_init(void) { diff --git a/drivers/input/misc/xen-kbdfront.c b/drivers/input/misc/xen-kbdfront.c index fbfdc10573be..1af28b06c713 100644 --- a/drivers/input/misc/xen-kbdfront.c +++ b/drivers/input/misc/xen-kbdfront.c @@ -365,12 +365,13 @@ static const struct xenbus_device_id xenkbd_ids[] = { { "" } }; -static DEFINE_XENBUS_DRIVER(xenkbd, , +static struct xenbus_driver xenkbd_driver = { + .ids = xenkbd_ids, .probe = xenkbd_probe, .remove = xenkbd_remove, .resume = xenkbd_resume, .otherend_changed = xenkbd_backend_changed, -); +}; static int __init xenkbd_init(void) { diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index 9c47b897b6d2..8079c31ac5e6 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -937,22 +937,18 @@ static int read_xenbus_vif_flags(struct backend_info *be) return 0; } - -/* ** Driver Registration ** */ - - static const struct xenbus_device_id netback_ids[] = { { "vif" }, { "" } }; - -static DEFINE_XENBUS_DRIVER(netback, , +static struct xenbus_driver netback_driver = { + .ids = netback_ids, .probe = netback_probe, .remove = netback_remove, .uevent = netback_uevent, .otherend_changed = frontend_changed, -); +}; int xenvif_xenbus_init(void) { diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index ca82f545ec2c..fa671442f420 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -2300,12 +2300,6 @@ static void xennet_sysfs_delif(struct net_device *netdev) #endif /* CONFIG_SYSFS */ -static const struct xenbus_device_id netfront_ids[] = { - { "vif" }, - { "" } -}; - - static int xennet_remove(struct xenbus_device *dev) { struct netfront_info *info = dev_get_drvdata(&dev->dev); @@ -2338,12 +2332,18 @@ static int xennet_remove(struct xenbus_device *dev) return 0; } -static DEFINE_XENBUS_DRIVER(netfront, , +static const struct xenbus_device_id netfront_ids[] = { + { "vif" }, + { "" } +}; + +static struct xenbus_driver netfront_driver = { + .ids = netfront_ids, .probe = netfront_probe, .remove = xennet_remove, .resume = netfront_resume, .otherend_changed = netback_changed, -); +}; static int __init netif_init(void) { diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c index 53df39a22c8a..116ca3746adb 100644 --- a/drivers/pci/xen-pcifront.c +++ b/drivers/pci/xen-pcifront.c @@ -1136,11 +1136,13 @@ static const struct xenbus_device_id xenpci_ids[] = { {""}, }; -static DEFINE_XENBUS_DRIVER(xenpci, "pcifront", +static struct xenbus_driver xenpci_driver = { + .name = "pcifront", + .ids = xenpci_ids, .probe = pcifront_xenbus_probe, .remove = pcifront_xenbus_remove, .otherend_changed = pcifront_backend_changed, -); +}; static int __init pcifront_init(void) { diff --git a/drivers/scsi/xen-scsifront.c b/drivers/scsi/xen-scsifront.c index cc14c8ddd369..34199d206ba6 100644 --- a/drivers/scsi/xen-scsifront.c +++ b/drivers/scsi/xen-scsifront.c @@ -998,11 +998,12 @@ static const struct xenbus_device_id scsifront_ids[] = { { "" } }; -static DEFINE_XENBUS_DRIVER(scsifront, , +static struct xenbus_driver scsifront_driver = { + .ids = scsifront_ids, .probe = scsifront_probe, .remove = scsifront_remove, .otherend_changed = scsifront_backend_changed, -); +}; static int __init scsifront_init(void) { diff --git a/drivers/tty/hvc/hvc_xen.c b/drivers/tty/hvc/hvc_xen.c index 2dc2831840ca..c3d8af917ffc 100644 --- a/drivers/tty/hvc/hvc_xen.c +++ b/drivers/tty/hvc/hvc_xen.c @@ -347,8 +347,6 @@ static int xen_console_remove(struct xencons_info *info) } #ifdef CONFIG_HVC_XEN_FRONTEND -static struct xenbus_driver xencons_driver; - static int xencons_remove(struct xenbus_device *dev) { return xen_console_remove(dev_get_drvdata(&dev->dev)); @@ -502,13 +500,14 @@ static const struct xenbus_device_id xencons_ids[] = { { "" } }; - -static DEFINE_XENBUS_DRIVER(xencons, "xenconsole", +static struct xenbus_driver xencons_driver = { + .name = "xenconsole", + .ids = xencons_ids, .probe = xencons_probe, .remove = xencons_remove, .resume = xencons_resume, .otherend_changed = xencons_backend_changed, -); +}; #endif /* CONFIG_HVC_XEN_FRONTEND */ static int __init xen_hvc_init(void) diff --git a/drivers/video/fbdev/xen-fbfront.c b/drivers/video/fbdev/xen-fbfront.c index 901014bbc821..09dc44736c1a 100644 --- a/drivers/video/fbdev/xen-fbfront.c +++ b/drivers/video/fbdev/xen-fbfront.c @@ -684,12 +684,13 @@ static const struct xenbus_device_id xenfb_ids[] = { { "" } }; -static DEFINE_XENBUS_DRIVER(xenfb, , +static struct xenbus_driver xenfb_driver = { + .ids = xenfb_ids, .probe = xenfb_probe, .remove = xenfb_remove, .resume = xenfb_resume, .otherend_changed = xenfb_backend_changed, -); +}; static int __init xenfb_init(void) { diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c index c214daab4829..ad8d30c088fe 100644 --- a/drivers/xen/xen-pciback/xenbus.c +++ b/drivers/xen/xen-pciback/xenbus.c @@ -719,11 +719,13 @@ static const struct xenbus_device_id xen_pcibk_ids[] = { {""}, }; -static DEFINE_XENBUS_DRIVER(xen_pcibk, DRV_NAME, +static struct xenbus_driver xen_pcibk_driver = { + .name = DRV_NAME, + .ids = xen_pcibk_ids, .probe = xen_pcibk_xenbus_probe, .remove = xen_pcibk_xenbus_remove, .otherend_changed = xen_pcibk_frontend_changed, -); +}; const struct xen_pcibk_backend *__read_mostly xen_pcibk_backend; diff --git a/drivers/xen/xen-scsiback.c b/drivers/xen/xen-scsiback.c index ad11258a78d6..3e32146472a5 100644 --- a/drivers/xen/xen-scsiback.c +++ b/drivers/xen/xen-scsiback.c @@ -2056,11 +2056,12 @@ static const struct xenbus_device_id scsiback_ids[] = { { "" } }; -static DEFINE_XENBUS_DRIVER(scsiback, , +static struct xenbus_driver scsiback_driver = { + .ids = scsiback_ids, .probe = scsiback_probe, .remove = scsiback_remove, .otherend_changed = scsiback_frontend_changed -); +}; static void scsiback_init_pend(void *p) { diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 3c0a74b3e9b1..564b31584860 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -297,9 +297,13 @@ void xenbus_dev_shutdown(struct device *_dev) EXPORT_SYMBOL_GPL(xenbus_dev_shutdown); int xenbus_register_driver_common(struct xenbus_driver *drv, - struct xen_bus_type *bus) + struct xen_bus_type *bus, + struct module *owner, const char *mod_name) { + drv->driver.name = drv->name ? drv->name : drv->ids[0].devicetype; drv->driver.bus = &bus->bus; + drv->driver.owner = owner; + drv->driver.mod_name = mod_name; return driver_register(&drv->driver); } diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h index 1085ec294a19..c9ec7ca1f7ab 100644 --- a/drivers/xen/xenbus/xenbus_probe.h +++ b/drivers/xen/xenbus/xenbus_probe.h @@ -60,7 +60,9 @@ extern int xenbus_match(struct device *_dev, struct device_driver *_drv); extern int xenbus_dev_probe(struct device *_dev); extern int xenbus_dev_remove(struct device *_dev); extern int xenbus_register_driver_common(struct xenbus_driver *drv, - struct xen_bus_type *bus); + struct xen_bus_type *bus, + struct module *owner, + const char *mod_name); extern int xenbus_probe_node(struct xen_bus_type *bus, const char *type, const char *nodename); diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c index 5125dce11a60..04f7f85a5edf 100644 --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -234,13 +234,15 @@ int xenbus_dev_is_online(struct xenbus_device *dev) } EXPORT_SYMBOL_GPL(xenbus_dev_is_online); -int xenbus_register_backend(struct xenbus_driver *drv) +int __xenbus_register_backend(struct xenbus_driver *drv, struct module *owner, + const char *mod_name) { drv->read_otherend_details = read_frontend_details; - return xenbus_register_driver_common(drv, &xenbus_backend); + return xenbus_register_driver_common(drv, &xenbus_backend, + owner, mod_name); } -EXPORT_SYMBOL_GPL(xenbus_register_backend); +EXPORT_SYMBOL_GPL(__xenbus_register_backend); static int backend_probe_and_watch(struct notifier_block *notifier, unsigned long event, diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c index cb385c10d2b1..bcb53bdc469c 100644 --- a/drivers/xen/xenbus/xenbus_probe_frontend.c +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -317,13 +317,15 @@ static void wait_for_devices(struct xenbus_driver *xendrv) print_device_status); } -int xenbus_register_frontend(struct xenbus_driver *drv) +int __xenbus_register_frontend(struct xenbus_driver *drv, struct module *owner, + const char *mod_name) { int ret; drv->read_otherend_details = read_backend_details; - ret = xenbus_register_driver_common(drv, &xenbus_frontend); + ret = xenbus_register_driver_common(drv, &xenbus_frontend, + owner, mod_name); if (ret) return ret; @@ -332,7 +334,7 @@ int xenbus_register_frontend(struct xenbus_driver *drv) return 0; } -EXPORT_SYMBOL_GPL(xenbus_register_frontend); +EXPORT_SYMBOL_GPL(__xenbus_register_frontend); static DECLARE_WAIT_QUEUE_HEAD(backend_state_wq); static int backend_state; diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index 0324c6d340c1..b78f21caf55a 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -86,6 +86,7 @@ struct xenbus_device_id /* A xenbus driver. */ struct xenbus_driver { + const char *name; /* defaults to ids[0].devicetype */ const struct xenbus_device_id *ids; int (*probe)(struct xenbus_device *dev, const struct xenbus_device_id *id); @@ -100,20 +101,22 @@ struct xenbus_driver { int (*is_ready)(struct xenbus_device *dev); }; -#define DEFINE_XENBUS_DRIVER(var, drvname, methods...) \ -struct xenbus_driver var ## _driver = { \ - .driver.name = drvname + 0 ?: var ## _ids->devicetype, \ - .driver.owner = THIS_MODULE, \ - .ids = var ## _ids, ## methods \ -} - static inline struct xenbus_driver *to_xenbus_driver(struct device_driver *drv) { return container_of(drv, struct xenbus_driver, driver); } -int __must_check xenbus_register_frontend(struct xenbus_driver *); -int __must_check xenbus_register_backend(struct xenbus_driver *); +int __must_check __xenbus_register_frontend(struct xenbus_driver *drv, + struct module *owner, + const char *mod_name); +int __must_check __xenbus_register_backend(struct xenbus_driver *drv, + struct module *owner, + const char *mod_name); + +#define xenbus_register_frontend(drv) \ + __xenbus_register_frontend(drv, THIS_MODULE, KBUILD_MODNAME); +#define xenbus_register_backend(drv) \ + __xenbus_register_backend(drv, THIS_MODULE, KBUILD_MODNAME); void xenbus_unregister_driver(struct xenbus_driver *drv); -- cgit v1.2.3