diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-10-11 20:29:01 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-10-11 20:29:01 -0400 |
commit | 81ae31d78239318610d7c2acb3e2610d622a5aa4 (patch) | |
tree | 1e31b300f1574fceaff065a9bd92460b7c466f7c | |
parent | ef4a48c513211d842c55e84f7a1c31884b91dcf7 (diff) | |
parent | 95afae481414cbdb0567bf82d5e5077c3ac9da20 (diff) | |
download | linux-81ae31d78239318610d7c2acb3e2610d622a5aa4.tar.bz2 |
Merge tag 'stable/for-linus-3.18-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip
Pull Xen updates from David Vrabel:
"Features and fixes:
- Add pvscsi frontend and backend drivers.
- Remove _PAGE_IOMAP PTE flag, freeing it for alternate uses.
- Try and keep memory contiguous during PV memory setup (reduces
SWIOTLB usage).
- Allow front/back drivers to use threaded irqs.
- Support large initrds in PV guests.
- Fix PVH guests in preparation for Xen 4.5"
* tag 'stable/for-linus-3.18-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip: (22 commits)
xen: remove DEFINE_XENBUS_DRIVER() macro
xen/xenbus: Remove BUG_ON() when error string trucated
xen/xenbus: Correct the comments for xenbus_grant_ring()
x86/xen: Set EFER.NX and EFER.SCE in PVH guests
xen: eliminate scalability issues from initrd handling
xen: sync some headers with xen tree
xen: make pvscsi frontend dependant on xenbus frontend
arm{,64}/xen: Remove "EXPERIMENTAL" in the description of the Xen options
xen-scsifront: don't deadlock if the ring becomes full
x86: remove the Xen-specific _PAGE_IOMAP PTE flag
x86/xen: do not use _PAGE_IOMAP PTE flag for I/O mappings
x86: skip check for spurious faults for non-present faults
xen/efi: Directly include needed headers
xen-scsiback: clean up a type issue in scsiback_make_tpg()
xen-scsifront: use GFP_ATOMIC under spin_lock
MAINTAINERS: Add xen pvscsi maintainer
xen-scsiback: Add Xen PV SCSI backend driver
xen-scsifront: Add Xen PV SCSI frontend driver
xen: Add Xen pvSCSI protocol description
xen/events: support threaded irqs for interdomain event channels
...
46 files changed, 4214 insertions, 263 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index b28dc111d4a7..f8d882e13200 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10268,6 +10268,15 @@ S: Supported F: drivers/block/xen-blkback/* F: drivers/block/xen* +XEN PVSCSI DRIVERS +M: Juergen Gross <jgross@suse.com> +L: xen-devel@lists.xenproject.org (moderated for non-subscribers) +L: linux-scsi@vger.kernel.org +S: Supported +F: drivers/scsi/xen-scsifront.c +F: drivers/xen/xen-scsiback.c +F: include/xen/interface/io/vscsiif.h + XEN SWIOTLB SUBSYSTEM M: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> L: xen-devel@lists.xenproject.org (moderated for non-subscribers) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 18f392f8b744..89c4b5ccc68d 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1779,7 +1779,7 @@ config XEN_DOM0 depends on XEN config XEN - bool "Xen guest support on ARM (EXPERIMENTAL)" + bool "Xen guest support on ARM" depends on ARM && AEABI && OF depends on CPU_V7 && !CPU_V6 depends on !GENERIC_ATOMIC64 diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c49ca4c738bb..ac9afde76dea 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -349,7 +349,7 @@ config XEN_DOM0 depends on XEN config XEN - bool "Xen guest support on ARM64 (EXPERIMENTAL)" + bool "Xen guest support on ARM64" depends on ARM64 && OF select SWIOTLB_XEN help diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 0f9724c9c510..07789647bf33 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -23,7 +23,6 @@ #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 #define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */ -#define _PAGE_BIT_IOMAP _PAGE_BIT_SOFTW2 /* flag used to indicate IO mapping */ #define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ @@ -52,7 +51,7 @@ #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) -#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) +#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) @@ -168,10 +167,10 @@ #define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) -#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP) -#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP) -#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP) -#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO (__PAGE_KERNEL) +#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) +#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS) +#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC) #define PAGE_KERNEL __pgprot(__PAGE_KERNEL) #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a24194681513..83bb03bfa259 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -933,8 +933,17 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) * cross-processor TLB flush, even if no stale TLB entries exist * on other processors. * + * Spurious faults may only occur if the TLB contains an entry with + * fewer permission than the page table entry. Non-present (P = 0) + * and reserved bit (R = 1) faults are never spurious. + * * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. + * + * Returns non-zero if a spurious fault was handled, zero otherwise. + * + * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3 + * (Optional Invalidation). */ static noinline int spurious_fault(unsigned long error_code, unsigned long address) @@ -945,8 +954,17 @@ spurious_fault(unsigned long error_code, unsigned long address) pte_t *pte; int ret; - /* Reserved-bit violation or user access to kernel space? */ - if (error_code & (PF_USER | PF_RSVD)) + /* + * Only writes to RO or instruction fetches from NX may cause + * spurious faults. + * + * These could be from user or supervisor accesses but the TLB + * is only lazily flushed after a kernel mapping protection + * change, so user accesses are not expected to cause spurious + * faults. + */ + if (error_code != (PF_WRITE | PF_PROT) + && error_code != (PF_INSTR | PF_PROT)) return 0; pgd = init_mm.pgd + pgd_index(address); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 7d05565ba781..c8140e12816a 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -537,7 +537,7 @@ static void __init pagetable_init(void) permanent_kmaps_init(pgd_base); } -pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); +pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL); EXPORT_SYMBOL_GPL(__supported_pte_mask); /* user-defined highmem size */ diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5621c47d7a1a..5d984769cbd8 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -151,7 +151,7 @@ early_param("gbpages", parse_direct_gbpages_on); * around without checking the pgd every time. */ -pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; +pteval_t __supported_pte_mask __read_mostly = ~0; EXPORT_SYMBOL_GPL(__supported_pte_mask); int force_personality32; diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 2ae525e0d8ba..37c1435889ce 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -442,8 +442,6 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, */ prot |= _PAGE_CACHE_UC_MINUS; - prot |= _PAGE_IOMAP; /* creating a mapping for IO */ - vma->vm_page_prot = __pgprot(prot); if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index a02e09e18f57..be14cc3e48d5 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -15,12 +15,14 @@ * with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <linux/bitops.h> #include <linux/efi.h> #include <linux/init.h> #include <linux/string.h> #include <xen/xen-ops.h> +#include <asm/page.h> #include <asm/setup.h> void __init xen_efi_init(void) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c0cb11fb5008..acb0effd8077 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1463,6 +1463,7 @@ static void __ref xen_setup_gdt(int cpu) pv_cpu_ops.load_gdt = xen_load_gdt; } +#ifdef CONFIG_XEN_PVH /* * A PV guest starts with default flags that are not set for PVH, set them * here asap. @@ -1508,17 +1509,21 @@ static void __init xen_pvh_early_guest_init(void) return; xen_have_vector_callback = 1; + + xen_pvh_early_cpu_init(0, false); xen_pvh_set_cr_flags(0); #ifdef CONFIG_X86_32 BUG(); /* PVH: Implement proper support. */ #endif } +#endif /* CONFIG_XEN_PVH */ /* First C function to be called on Xen boot */ asmlinkage __visible void __init xen_start_kernel(void) { struct physdev_set_iopl set_iopl; + unsigned long initrd_start = 0; int rc; if (!xen_start_info) @@ -1527,7 +1532,9 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_domain_type = XEN_PV_DOMAIN; xen_setup_features(); +#ifdef CONFIG_XEN_PVH xen_pvh_early_guest_init(); +#endif xen_setup_machphys_mapping(); /* Install Xen paravirt ops */ @@ -1559,8 +1566,6 @@ asmlinkage __visible void __init xen_start_kernel(void) #endif __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); - __supported_pte_mask |= _PAGE_IOMAP; - /* * Prevent page tables from being allocated in highmem, even * if CONFIG_HIGHPTE is enabled. @@ -1667,10 +1672,16 @@ asmlinkage __visible void __init xen_start_kernel(void) new_cpu_data.x86_capability[0] = cpuid_edx(1); #endif + if (xen_start_info->mod_start) { + if (xen_start_info->flags & SIF_MOD_START_PFN) + initrd_start = PFN_PHYS(xen_start_info->mod_start); + else + initrd_start = __pa(xen_start_info->mod_start); + } + /* Poke various useful things into boot_params */ boot_params.hdr.type_of_loader = (9 << 4) | 0; - boot_params.hdr.ramdisk_image = xen_start_info->mod_start - ? __pa(xen_start_info->mod_start) : 0; + boot_params.hdr.ramdisk_image = initrd_start; boot_params.hdr.ramdisk_size = xen_start_info->mod_len; boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 16fb0099b7f2..f62af7647ec9 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -399,38 +399,14 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) if (unlikely(mfn == INVALID_P2M_ENTRY)) { mfn = 0; flags = 0; - } else { - /* - * Paramount to do this test _after_ the - * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY & - * IDENTITY_FRAME_BIT resolves to true. - */ - mfn &= ~FOREIGN_FRAME_BIT; - if (mfn & IDENTITY_FRAME_BIT) { - mfn &= ~IDENTITY_FRAME_BIT; - flags |= _PAGE_IOMAP; - } - } + } else + mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); val = ((pteval_t)mfn << PAGE_SHIFT) | flags; } return val; } -static pteval_t iomap_pte(pteval_t val) -{ - if (val & _PAGE_PRESENT) { - unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; - pteval_t flags = val & PTE_FLAGS_MASK; - - /* We assume the pte frame number is a MFN, so - just use it as-is. */ - val = ((pteval_t)pfn << PAGE_SHIFT) | flags; - } - - return val; -} - __visible pteval_t xen_pte_val(pte_t pte) { pteval_t pteval = pte.pte; @@ -441,9 +417,6 @@ __visible pteval_t xen_pte_val(pte_t pte) pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT; } #endif - if (xen_initial_domain() && (pteval & _PAGE_IOMAP)) - return pteval; - return pte_mfn_to_pfn(pteval); } PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); @@ -481,7 +454,6 @@ void xen_set_pat(u64 pat) __visible pte_t xen_make_pte(pteval_t pte) { - phys_addr_t addr = (pte & PTE_PFN_MASK); #if 0 /* If Linux is trying to set a WC pte, then map to the Xen WC. * If _PAGE_PAT is set, then it probably means it is really @@ -496,19 +468,7 @@ __visible pte_t xen_make_pte(pteval_t pte) pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT; } #endif - /* - * Unprivileged domains are allowed to do IOMAPpings for - * PCI passthrough, but not map ISA space. The ISA - * mappings are just dummy local mappings to keep other - * parts of the kernel happy. - */ - if (unlikely(pte & _PAGE_IOMAP) && - (xen_initial_domain() || addr >= ISA_END_ADDRESS)) { - pte = iomap_pte(pte); - } else { - pte &= ~_PAGE_IOMAP; - pte = pte_pfn_to_mfn(pte); - } + pte = pte_pfn_to_mfn(pte); return native_make_pte(pte); } @@ -2091,7 +2051,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) default: /* By default, set_fixmap is used for hardware mappings */ - pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP)); + pte = mfn_pte(phys, prot); break; } diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 3172692381ae..9f5983b01ed9 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -173,6 +173,7 @@ #include <xen/balloon.h> #include <xen/grant_table.h> +#include "p2m.h" #include "multicalls.h" #include "xen-ops.h" @@ -180,12 +181,6 @@ static void __init m2p_override_init(void); unsigned long xen_max_p2m_pfn __read_mostly; -#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) -#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) - -#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) - /* Placeholders for holes in the address space */ static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); @@ -202,16 +197,12 @@ static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_identity_mfn, P2M_MID_PER_PAGE); RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); -/* We might hit two boundary violations at the start and end, at max each - * boundary violation will require three middle nodes. */ -RESERVE_BRK(p2m_mid_extra, PAGE_SIZE * 2 * 3); - -/* When we populate back during bootup, the amount of pages can vary. The - * max we have is seen is 395979, but that does not mean it can't be more. - * Some machines can have 3GB I/O holes even. With early_can_reuse_p2m_middle - * it can re-use Xen provided mfn_list array, so we only need to allocate at - * most three P2M top nodes. */ -RESERVE_BRK(p2m_populated, PAGE_SIZE * 3); +/* For each I/O range remapped we may lose up to two leaf pages for the boundary + * violations and three mid pages to cover up to 3GB. With + * early_can_reuse_p2m_middle() most of the leaf pages will be reused by the + * remapped region. + */ +RESERVE_BRK(p2m_identity_remap, PAGE_SIZE * 2 * 3 * MAX_REMAP_RANGES); static inline unsigned p2m_top_index(unsigned long pfn) { diff --git a/arch/x86/xen/p2m.h b/arch/x86/xen/p2m.h new file mode 100644 index 000000000000..ad8aee24ab72 --- /dev/null +++ b/arch/x86/xen/p2m.h @@ -0,0 +1,15 @@ +#ifndef _XEN_P2M_H +#define _XEN_P2M_H + +#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) +#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) + +#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) + +#define MAX_REMAP_RANGES 10 + +extern unsigned long __init set_phys_range_identity(unsigned long pfn_s, + unsigned long pfn_e); + +#endif /* _XEN_P2M_H */ diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 2e555163c2fe..af7216128d93 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -29,6 +29,7 @@ #include <xen/features.h> #include "xen-ops.h" #include "vdso.h" +#include "p2m.h" /* These are code, but not functions. Defined in entry.S */ extern const char xen_hypervisor_callback[]; @@ -46,6 +47,9 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; /* Number of pages released from the initial allocation. */ unsigned long xen_released_pages; +/* Buffer used to remap identity mapped pages */ +unsigned long xen_remap_buf[P2M_PER_PAGE] __initdata; + /* * The maximum amount of extra memory compared to the base size. The * main scaling factor is the size of struct page. At extreme ratios @@ -151,107 +155,325 @@ static unsigned long __init xen_do_chunk(unsigned long start, return len; } -static unsigned long __init xen_release_chunk(unsigned long start, - unsigned long end) -{ - return xen_do_chunk(start, end, true); -} - -static unsigned long __init xen_populate_chunk( +/* + * Finds the next RAM pfn available in the E820 map after min_pfn. + * This function updates min_pfn with the pfn found and returns + * the size of that range or zero if not found. + */ +static unsigned long __init xen_find_pfn_range( const struct e820entry *list, size_t map_size, - unsigned long max_pfn, unsigned long *last_pfn, - unsigned long credits_left) + unsigned long *min_pfn) { const struct e820entry *entry; unsigned int i; unsigned long done = 0; - unsigned long dest_pfn; for (i = 0, entry = list; i < map_size; i++, entry++) { unsigned long s_pfn; unsigned long e_pfn; - unsigned long pfns; - long capacity; - - if (credits_left <= 0) - break; if (entry->type != E820_RAM) continue; e_pfn = PFN_DOWN(entry->addr + entry->size); - /* We only care about E820 after the xen_start_info->nr_pages */ - if (e_pfn <= max_pfn) + /* We only care about E820 after this */ + if (e_pfn < *min_pfn) continue; s_pfn = PFN_UP(entry->addr); - /* If the E820 falls within the nr_pages, we want to start - * at the nr_pages PFN. - * If that would mean going past the E820 entry, skip it + + /* If min_pfn falls within the E820 entry, we want to start + * at the min_pfn PFN. */ - if (s_pfn <= max_pfn) { - capacity = e_pfn - max_pfn; - dest_pfn = max_pfn; + if (s_pfn <= *min_pfn) { + done = e_pfn - *min_pfn; } else { - capacity = e_pfn - s_pfn; - dest_pfn = s_pfn; + done = e_pfn - s_pfn; + *min_pfn = s_pfn; } + break; + } - if (credits_left < capacity) - capacity = credits_left; + return done; +} - pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false); - done += pfns; - *last_pfn = (dest_pfn + pfns); - if (pfns < capacity) - break; - credits_left -= pfns; +/* + * This releases a chunk of memory and then does the identity map. It's used as + * as a fallback if the remapping fails. + */ +static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, + unsigned long end_pfn, unsigned long nr_pages, unsigned long *identity, + unsigned long *released) +{ + WARN_ON(start_pfn > end_pfn); + + /* Need to release pages first */ + *released += xen_do_chunk(start_pfn, min(end_pfn, nr_pages), true); + *identity += set_phys_range_identity(start_pfn, end_pfn); +} + +/* + * Helper function to update both the p2m and m2p tables. + */ +static unsigned long __init xen_update_mem_tables(unsigned long pfn, + unsigned long mfn) +{ + struct mmu_update update = { + .ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, + .val = pfn + }; + + /* Update p2m */ + if (!early_set_phys_to_machine(pfn, mfn)) { + WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n", + pfn, mfn); + return false; } - return done; + + /* Update m2p */ + if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) { + WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n", + mfn, pfn); + return false; + } + + return true; } -static void __init xen_set_identity_and_release_chunk( - unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, - unsigned long *released, unsigned long *identity) +/* + * This function updates the p2m and m2p tables with an identity map from + * start_pfn to start_pfn+size and remaps the underlying RAM of the original + * allocation at remap_pfn. It must do so carefully in P2M_PER_PAGE sized blocks + * to not exhaust the reserved brk space. Doing it in properly aligned blocks + * ensures we only allocate the minimum required leaf pages in the p2m table. It + * copies the existing mfns from the p2m table under the 1:1 map, overwrites + * them with the identity map and then updates the p2m and m2p tables with the + * remapped memory. + */ +static unsigned long __init xen_do_set_identity_and_remap_chunk( + unsigned long start_pfn, unsigned long size, unsigned long remap_pfn) { - unsigned long pfn; + unsigned long ident_pfn_iter, remap_pfn_iter; + unsigned long ident_start_pfn_align, remap_start_pfn_align; + unsigned long ident_end_pfn_align, remap_end_pfn_align; + unsigned long ident_boundary_pfn, remap_boundary_pfn; + unsigned long ident_cnt = 0; + unsigned long remap_cnt = 0; + unsigned long left = size; + unsigned long mod; + int i; + + WARN_ON(size == 0); + + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); /* - * If the PFNs are currently mapped, clear the mappings - * (except for the ISA region which must be 1:1 mapped) to - * release the refcounts (in Xen) on the original frames. + * Determine the proper alignment to remap memory in P2M_PER_PAGE sized + * blocks. We need to keep track of both the existing pfn mapping and + * the new pfn remapping. */ - for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) { - pte_t pte = __pte_ma(0); + mod = start_pfn % P2M_PER_PAGE; + ident_start_pfn_align = + mod ? (start_pfn - mod + P2M_PER_PAGE) : start_pfn; + mod = remap_pfn % P2M_PER_PAGE; + remap_start_pfn_align = + mod ? (remap_pfn - mod + P2M_PER_PAGE) : remap_pfn; + mod = (start_pfn + size) % P2M_PER_PAGE; + ident_end_pfn_align = start_pfn + size - mod; + mod = (remap_pfn + size) % P2M_PER_PAGE; + remap_end_pfn_align = remap_pfn + size - mod; + + /* Iterate over each p2m leaf node in each range */ + for (ident_pfn_iter = ident_start_pfn_align, remap_pfn_iter = remap_start_pfn_align; + ident_pfn_iter < ident_end_pfn_align && remap_pfn_iter < remap_end_pfn_align; + ident_pfn_iter += P2M_PER_PAGE, remap_pfn_iter += P2M_PER_PAGE) { + /* Check we aren't past the end */ + BUG_ON(ident_pfn_iter + P2M_PER_PAGE > start_pfn + size); + BUG_ON(remap_pfn_iter + P2M_PER_PAGE > remap_pfn + size); + + /* Save p2m mappings */ + for (i = 0; i < P2M_PER_PAGE; i++) + xen_remap_buf[i] = pfn_to_mfn(ident_pfn_iter + i); + + /* Set identity map which will free a p2m leaf */ + ident_cnt += set_phys_range_identity(ident_pfn_iter, + ident_pfn_iter + P2M_PER_PAGE); + +#ifdef DEBUG + /* Helps verify a p2m leaf has been freed */ + for (i = 0; i < P2M_PER_PAGE; i++) { + unsigned int pfn = ident_pfn_iter + i; + BUG_ON(pfn_to_mfn(pfn) != pfn); + } +#endif + /* Now remap memory */ + for (i = 0; i < P2M_PER_PAGE; i++) { + unsigned long mfn = xen_remap_buf[i]; + + /* This will use the p2m leaf freed above */ + if (!xen_update_mem_tables(remap_pfn_iter + i, mfn)) { + WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n", + remap_pfn_iter + i, mfn); + return 0; + } + + remap_cnt++; + } - if (pfn < PFN_UP(ISA_END_ADDRESS)) - pte = mfn_pte(pfn, PAGE_KERNEL_IO); + left -= P2M_PER_PAGE; + } - (void)HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), pte, 0); + /* Max boundary space possible */ + BUG_ON(left > (P2M_PER_PAGE - 1) * 2); + + /* Now handle the boundary conditions */ + ident_boundary_pfn = start_pfn; + remap_boundary_pfn = remap_pfn; + for (i = 0; i < left; i++) { + unsigned long mfn; + + /* These two checks move from the start to end boundaries */ + if (ident_boundary_pfn == ident_start_pfn_align) + ident_boundary_pfn = ident_pfn_iter; + if (remap_boundary_pfn == remap_start_pfn_align) + remap_boundary_pfn = remap_pfn_iter; + + /* Check we aren't past the end */ + BUG_ON(ident_boundary_pfn >= start_pfn + size); + BUG_ON(remap_boundary_pfn >= remap_pfn + size); + + mfn = pfn_to_mfn(ident_boundary_pfn); + + if (!xen_update_mem_tables(remap_boundary_pfn, mfn)) { + WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n", + remap_pfn_iter + i, mfn); + return 0; + } + remap_cnt++; + + ident_boundary_pfn++; + remap_boundary_pfn++; } - if (start_pfn < nr_pages) - *released += xen_release_chunk( - start_pfn, min(end_pfn, nr_pages)); + /* Finish up the identity map */ + if (ident_start_pfn_align >= ident_end_pfn_align) { + /* + * In this case we have an identity range which does not span an + * aligned block so everything needs to be identity mapped here. + * If we didn't check this we might remap too many pages since + * the align boundaries are not meaningful in this case. + */ + ident_cnt += set_phys_range_identity(start_pfn, + start_pfn + size); + } else { + /* Remapped above so check each end of the chunk */ + if (start_pfn < ident_start_pfn_align) + ident_cnt += set_phys_range_identity(start_pfn, + ident_start_pfn_align); + if (start_pfn + size > ident_pfn_iter) + ident_cnt += set_phys_range_identity(ident_pfn_iter, + start_pfn + size); + } - *identity += set_phys_range_identity(start_pfn, end_pfn); + BUG_ON(ident_cnt != size); + BUG_ON(remap_cnt != size); + + return size; } -static unsigned long __init xen_set_identity_and_release( - const struct e820entry *list, size_t map_size, unsigned long nr_pages) +/* + * This function takes a contiguous pfn range that needs to be identity mapped + * and: + * + * 1) Finds a new range of pfns to use to remap based on E820 and remap_pfn. + * 2) Calls the do_ function to actually do the mapping/remapping work. + * + * The goal is to not allocate additional memory but to remap the existing + * pages. In the case of an error the underlying memory is simply released back + * to Xen and not remapped. + */ +static unsigned long __init xen_set_identity_and_remap_chunk( + const struct e820entry *list, size_t map_size, unsigned long start_pfn, + unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn, + unsigned long *identity, unsigned long *remapped, + unsigned long *released) +{ + unsigned long pfn; + unsigned long i = 0; + unsigned long n = end_pfn - start_pfn; + + while (i < n) { + unsigned long cur_pfn = start_pfn + i; + unsigned long left = n - i; + unsigned long size = left; + unsigned long remap_range_size; + + /* Do not remap pages beyond the current allocation */ + if (cur_pfn >= nr_pages) { + /* Identity map remaining pages */ + *identity += set_phys_range_identity(cur_pfn, + cur_pfn + size); + break; + } + if (cur_pfn + size > nr_pages) + size = nr_pages - cur_pfn; + + remap_range_size = xen_find_pfn_range(list, map_size, + &remap_pfn); + if (!remap_range_size) { + pr_warning("Unable to find available pfn range, not remapping identity pages\n"); + xen_set_identity_and_release_chunk(cur_pfn, + cur_pfn + left, nr_pages, identity, released); + break; + } + /* Adjust size to fit in current e820 RAM region */ + if (size > remap_range_size) + size = remap_range_size; + + if (!xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn)) { + WARN(1, "Failed to remap 1:1 memory cur_pfn=%ld size=%ld remap_pfn=%ld\n", + cur_pfn, size, remap_pfn); + xen_set_identity_and_release_chunk(cur_pfn, + cur_pfn + left, nr_pages, identity, released); + break; + } + + /* Update variables to reflect new mappings. */ + i += size; + remap_pfn += size; + *identity += size; + *remapped += size; + } + + /* + * If the PFNs are currently mapped, the VA mapping also needs + * to be updated to be 1:1. + */ + for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) + (void)HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + mfn_pte(pfn, PAGE_KERNEL_IO), 0); + + return remap_pfn; +} + +static unsigned long __init xen_set_identity_and_remap( + const struct e820entry *list, size_t map_size, unsigned long nr_pages, + unsigned long *released) { phys_addr_t start = 0; - unsigned long released = 0; unsigned long identity = 0; + unsigned long remapped = 0; + unsigned long last_pfn = nr_pages; const struct e820entry *entry; + unsigned long num_released = 0; int i; /* * Combine non-RAM regions and gaps until a RAM region (or the * end of the map) is reached, then set the 1:1 map and - * release the pages (if available) in those non-RAM regions. + * remap the memory in those non-RAM regions. * * The combined non-RAM regions are rounded to a whole number * of pages so any partial pages are accessible via the 1:1 @@ -269,22 +491,24 @@ static unsigned long __init xen_set_identity_and_release( end_pfn = PFN_UP(entry->addr); if (start_pfn < end_pfn) - xen_set_identity_and_release_chunk( - start_pfn, end_pfn, nr_pages, - &released, &identity); - + last_pfn = xen_set_identity_and_remap_chunk( + list, map_size, start_pfn, + end_pfn, nr_pages, last_pfn, + &identity, &remapped, + &num_released); start = end; } } - if (released) - printk(KERN_INFO "Released %lu pages of unused memory\n", released); - if (identity) - printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity); + *released = num_released; - return released; -} + pr_info("Set %ld page(s) to 1-1 mapping\n", identity); + pr_info("Remapped %ld page(s), last_pfn=%ld\n", remapped, + last_pfn); + pr_info("Released %ld page(s)\n", num_released); + return last_pfn; +} static unsigned long __init xen_get_max_pages(void) { unsigned long max_pages = MAX_DOMAIN_PAGES; @@ -347,7 +571,6 @@ char * __init xen_memory_setup(void) unsigned long max_pages; unsigned long last_pfn = 0; unsigned long extra_pages = 0; - unsigned long populated; int i; int op; @@ -392,20 +615,11 @@ char * __init xen_memory_setup(void) extra_pages += max_pages - max_pfn; /* - * Set P2M for all non-RAM pages and E820 gaps to be identity - * type PFNs. Any RAM pages that would be made inaccesible by - * this are first released. + * Set identity map on non-RAM pages and remap the underlying RAM. */ - xen_released_pages = xen_set_identity_and_release( - map, memmap.nr_entries, max_pfn); - - /* - * Populate back the non-RAM pages and E820 gaps that had been - * released. */ - populated = xen_populate_chunk(map, memmap.nr_entries, - max_pfn, &last_pfn, xen_released_pages); + last_pfn = xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn, + &xen_released_pages); - xen_released_pages -= populated; extra_pages += xen_released_pages; if (last_pfn > max_pfn) { diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 7005974c3ff3..c670d7518cf4 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -37,6 +37,7 @@ #include <xen/hvc-console.h> #include "xen-ops.h" #include "mmu.h" +#include "smp.h" cpumask_var_t xen_cpu_initialized_map; @@ -99,10 +100,14 @@ static void cpu_bringup(void) wmb(); /* make sure everything is out */ } -/* Note: cpu parameter is only relevant for PVH */ -static void cpu_bringup_and_idle(int cpu) +/* + * Note: cpu parameter is only relevant for PVH. The reason for passing it + * is we can't do smp_processor_id until the percpu segments are loaded, for + * which we need the cpu number! So we pass it in rdi as first parameter. + */ +asmlinkage __visible void cpu_bringup_and_idle(int cpu) { -#ifdef CONFIG_X86_64 +#ifdef CONFIG_XEN_PVH if (xen_feature(XENFEAT_auto_translated_physmap) && xen_feature(XENFEAT_supervisor_mode_kernel)) xen_pvh_secondary_vcpu_init(cpu); @@ -374,11 +379,10 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->user_regs.fs = __KERNEL_PERCPU; ctxt->user_regs.gs = __KERNEL_STACK_CANARY; #endif - ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; - memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); if (!xen_feature(XENFEAT_auto_translated_physmap)) { + ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; ctxt->flags = VGCF_IN_KERNEL; ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ ctxt->user_regs.ds = __USER_DS; @@ -413,15 +417,18 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) (unsigned long)xen_failsafe_callback; ctxt->user_regs.cs = __KERNEL_CS; per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); -#ifdef CONFIG_X86_32 } -#else - } else - /* N.B. The user_regs.eip (cpu_bringup_and_idle) is called with - * %rdi having the cpu number - which means are passing in - * as the first parameter the cpu. Subtle! +#ifdef CONFIG_XEN_PVH + else { + /* + * The vcpu comes on kernel page tables which have the NX pte + * bit set. This means before DS/SS is touched, NX in + * EFER must be set. Hence the following assembly glue code. */ + ctxt->user_regs.eip = (unsigned long)xen_pvh_early_cpu_init; ctxt->user_regs.rdi = cpu; + ctxt->user_regs.rsi = true; /* entry == true */ + } #endif ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h index c7c2d89efd76..963d62a35c82 100644 --- a/arch/x86/xen/smp.h +++ b/arch/x86/xen/smp.h @@ -8,4 +8,12 @@ extern void xen_send_IPI_allbutself(int vector); extern void xen_send_IPI_all(int vector); extern void xen_send_IPI_self(int vector); +#ifdef CONFIG_XEN_PVH +extern void xen_pvh_early_cpu_init(int cpu, bool entry); +#else +static inline void xen_pvh_early_cpu_init(int cpu, bool entry) +{ +} +#endif + #endif diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 485b69585540..674b222544b7 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -47,6 +47,41 @@ ENTRY(startup_xen) __FINIT +#ifdef CONFIG_XEN_PVH +/* + * xen_pvh_early_cpu_init() - early PVH VCPU initialization + * @cpu: this cpu number (%rdi) + * @entry: true if this is a secondary vcpu coming up on this entry + * point, false if this is the boot CPU being initialized for + * the first time (%rsi) + * + * Note: This is called as a function on the boot CPU, and is the entry point + * on the secondary CPU. + */ +ENTRY(xen_pvh_early_cpu_init) + mov %rsi, %r11 + + /* Gather features to see if NX implemented. */ + mov $0x80000001, %eax + cpuid + mov %edx, %esi + + mov $MSR_EFER, %ecx + rdmsr + bts $_EFER_SCE, %eax + + bt $20, %esi + jnc 1f /* No NX, skip setting it */ + bts $_EFER_NX, %eax +1: wrmsr +#ifdef CONFIG_SMP + cmp $0, %r11b + jne cpu_bringup_and_idle +#endif + ret + +#endif /* CONFIG_XEN_PVH */ + .pushsection .text .balign PAGE_SIZE ENTRY(hypercall_page) @@ -124,6 +159,7 @@ NEXT_HYPERCALL(arch_6) ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT; .quad _PAGE_PRESENT) ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) + ELFNOTE(Xen, XEN_ELFNOTE_MOD_START_PFN, .long 1) ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START) ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0) diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 3a8b810b4980..0b13b1c9a01e 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -907,22 +907,17 @@ static int connect_ring(struct backend_info *be) return 0; } - -/* ** Driver Registration ** */ - - static const struct xenbus_device_id xen_blkbk_ids[] = { { "vbd" }, { "" } }; - -static DEFINE_XENBUS_DRIVER(xen_blkbk, , +static struct xenbus_driver xen_blkbk_driver = { + .ids = xen_blkbk_ids, .probe = xen_blkbk_probe, .remove = xen_blkbk_remove, .otherend_changed = frontend_changed -); - +}; int xen_blkif_xenbus_init(void) { diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 5deb235bd18f..37af03e9d859 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2055,13 +2055,14 @@ static const struct xenbus_device_id blkfront_ids[] = { { "" } }; -static DEFINE_XENBUS_DRIVER(blkfront, , +static struct xenbus_driver blkfront_driver = { + .ids = blkfront_ids, .probe = blkfront_probe, .remove = blkfront_remove, .resume = blkfront_resume, .otherend_changed = blkback_changed, .is_ready = blkfront_is_ready, -); +}; static int __init xlblk_init(void) { diff --git a/drivers/char/tpm/xen-tpmfront.c b/drivers/char/tpm/xen-tpmfront.c index 2064b4527040..441b44e54226 100644 --- a/drivers/char/tpm/xen-tpmfront.c +++ b/drivers/char/tpm/xen-tpmfront.c @@ -367,12 +367,13 @@ static const struct xenbus_device_id tpmfront_ids[] = { }; MODULE_ALIAS("xen:vtpm"); -static DEFINE_XENBUS_DRIVER(tpmfront, , - .probe = tpmfront_probe, - .remove = tpmfront_remove, - .resume = tpmfront_resume, - .otherend_changed = backend_changed, - ); +static struct xenbus_driver tpmfront_driver = { + .ids = tpmfront_ids, + .probe = tpmfront_probe, + .remove = tpmfront_remove, + .resume = tpmfront_resume, + .otherend_changed = backend_changed, +}; static int __init xen_tpmfront_init(void) { diff --git a/drivers/input/misc/xen-kbdfront.c b/drivers/input/misc/xen-kbdfront.c index fbfdc10573be..1af28b06c713 100644 --- a/drivers/input/misc/xen-kbdfront.c +++ b/drivers/input/misc/xen-kbdfront.c @@ -365,12 +365,13 @@ static const struct xenbus_device_id xenkbd_ids[] = { { "" } }; -static DEFINE_XENBUS_DRIVER(xenkbd, , +static struct xenbus_driver xenkbd_driver = { + .ids = xenkbd_ids, .probe = xenkbd_probe, .remove = xenkbd_remove, .resume = xenkbd_resume, .otherend_changed = xenkbd_backend_changed, -); +}; static int __init xenkbd_init(void) { diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index 9c47b897b6d2..8079c31ac5e6 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -937,22 +937,18 @@ static int read_xenbus_vif_flags(struct backend_info *be) return 0; } - -/* ** Driver Registration ** */ - - static const struct xenbus_device_id netback_ids[] = { { "vif" }, { "" } }; - -static DEFINE_XENBUS_DRIVER(netback, , +static struct xenbus_driver netback_driver = { + .ids = netback_ids, .probe = netback_probe, .remove = netback_remove, .uevent = netback_uevent, .otherend_changed = frontend_changed, -); +}; int xenvif_xenbus_init(void) { diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index ca82f545ec2c..fa671442f420 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -2300,12 +2300,6 @@ static void xennet_sysfs_delif(struct net_device *netdev) #endif /* CONFIG_SYSFS */ -static const struct xenbus_device_id netfront_ids[] = { - { "vif" }, - { "" } -}; - - static int xennet_remove(struct xenbus_device *dev) { struct netfront_info *info = dev_get_drvdata(&dev->dev); @@ -2338,12 +2332,18 @@ static int xennet_remove(struct xenbus_device *dev) return 0; } -static DEFINE_XENBUS_DRIVER(netfront, , +static const struct xenbus_device_id netfront_ids[] = { + { "vif" }, + { "" } +}; + +static struct xenbus_driver netfront_driver = { + .ids = netfront_ids, .probe = netfront_probe, .remove = xennet_remove, .resume = netfront_resume, .otherend_changed = netback_changed, -); +}; static int __init netif_init(void) { diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c index 53df39a22c8a..116ca3746adb 100644 --- a/drivers/pci/xen-pcifront.c +++ b/drivers/pci/xen-pcifront.c @@ -1136,11 +1136,13 @@ static const struct xenbus_device_id xenpci_ids[] = { {""}, }; -static DEFINE_XENBUS_DRIVER(xenpci, "pcifront", +static struct xenbus_driver xenpci_driver = { + .name = "pcifront", + .ids = xenpci_ids, .probe = pcifront_xenbus_probe, .remove = pcifront_xenbus_remove, .otherend_changed = pcifront_backend_changed, -); +}; static int __init pcifront_init(void) { diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index e85e64a07d02..296619b7426c 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -587,6 +587,16 @@ config VMWARE_PVSCSI To compile this driver as a module, choose M here: the module will be called vmw_pvscsi. +config XEN_SCSI_FRONTEND + tristate "XEN SCSI frontend driver" + depends on SCSI && XEN + select XEN_XENBUS_FRONTEND + help + The XEN SCSI frontend driver allows the kernel to access SCSI Devices + within another guest OS (usually Dom0). + Only needed if the kernel is running in a XEN guest and generic + SCSI access to a device is needed. + config HYPERV_STORAGE tristate "Microsoft Hyper-V virtual storage driver" depends on SCSI && HYPERV diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile index 5f0d299b0093..59f1ce6df2d6 100644 --- a/drivers/scsi/Makefile +++ b/drivers/scsi/Makefile @@ -141,6 +141,7 @@ obj-$(CONFIG_SCSI_ESAS2R) += esas2r/ obj-$(CONFIG_SCSI_PMCRAID) += pmcraid.o obj-$(CONFIG_SCSI_VIRTIO) += virtio_scsi.o obj-$(CONFIG_VMWARE_PVSCSI) += vmw_pvscsi.o +obj-$(CONFIG_XEN_SCSI_FRONTEND) += xen-scsifront.o obj-$(CONFIG_HYPERV_STORAGE) += hv_storvsc.o obj-$(CONFIG_ARM) += arm/ diff --git a/drivers/scsi/xen-scsifront.c b/drivers/scsi/xen-scsifront.c new file mode 100644 index 000000000000..34199d206ba6 --- /dev/null +++ b/drivers/scsi/xen-scsifront.c @@ -0,0 +1,1026 @@ +/* + * Xen SCSI frontend driver + * + * Copyright (c) 2008, FUJITSU Limited + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/device.h> +#include <linux/wait.h> +#include <linux/interrupt.h> +#include <linux/mutex.h> +#include <linux/spinlock.h> +#include <linux/sched.h> +#include <linux/blkdev.h> +#include <linux/pfn.h> +#include <linux/slab.h> +#include <linux/bitops.h> + +#include <scsi/scsi_cmnd.h> +#include <scsi/scsi_device.h> +#include <scsi/scsi.h> +#include <scsi/scsi_host.h> + +#include <xen/xen.h> +#include <xen/xenbus.h> +#include <xen/grant_table.h> +#include <xen/events.h> +#include <xen/page.h> + +#include <xen/interface/grant_table.h> +#include <xen/interface/io/vscsiif.h> +#include <xen/interface/io/protocols.h> + +#include <asm/xen/hypervisor.h> + + +#define GRANT_INVALID_REF 0 + +#define VSCSIFRONT_OP_ADD_LUN 1 +#define VSCSIFRONT_OP_DEL_LUN 2 + +/* Tuning point. */ +#define VSCSIIF_DEFAULT_CMD_PER_LUN 10 +#define VSCSIIF_MAX_TARGET 64 +#define VSCSIIF_MAX_LUN 255 + +#define VSCSIIF_RING_SIZE __CONST_RING_SIZE(vscsiif, PAGE_SIZE) +#define VSCSIIF_MAX_REQS VSCSIIF_RING_SIZE + +#define vscsiif_grants_sg(_sg) (PFN_UP((_sg) * \ + sizeof(struct scsiif_request_segment))) + +struct vscsifrnt_shadow { + /* command between backend and frontend */ + unsigned char act; + uint16_t rqid; + + unsigned int nr_grants; /* number of grants in gref[] */ + struct scsiif_request_segment *sg; /* scatter/gather elements */ + + /* Do reset or abort function. */ + wait_queue_head_t wq_reset; /* reset work queue */ + int wait_reset; /* reset work queue condition */ + int32_t rslt_reset; /* reset response status: */ + /* SUCCESS or FAILED or: */ +#define RSLT_RESET_WAITING 0 +#define RSLT_RESET_ERR -1 + + /* Requested struct scsi_cmnd is stored from kernel. */ + struct scsi_cmnd *sc; + int gref[vscsiif_grants_sg(SG_ALL) + SG_ALL]; +}; + +struct vscsifrnt_info { + struct xenbus_device *dev; + + struct Scsi_Host *host; + int host_active; + + unsigned int evtchn; + unsigned int irq; + + grant_ref_t ring_ref; + struct vscsiif_front_ring ring; + struct vscsiif_response ring_rsp; + + spinlock_t shadow_lock; + DECLARE_BITMAP(shadow_free_bitmap, VSCSIIF_MAX_REQS); + struct vscsifrnt_shadow *shadow[VSCSIIF_MAX_REQS]; + + wait_queue_head_t wq_sync; + unsigned int wait_ring_available:1; + + char dev_state_path[64]; + struct task_struct *curr; +}; + +static DEFINE_MUTEX(scsifront_mutex); + +static void scsifront_wake_up(struct vscsifrnt_info *info) +{ + info->wait_ring_available = 0; + wake_up(&info->wq_sync); +} + +static int scsifront_get_rqid(struct vscsifrnt_info *info) +{ + unsigned long flags; + int free; + + spin_lock_irqsave(&info->shadow_lock, flags); + + free = find_first_bit(info->shadow_free_bitmap, VSCSIIF_MAX_REQS); + __clear_bit(free, info->shadow_free_bitmap); + + spin_unlock_irqrestore(&info->shadow_lock, flags); + + return free; +} + +static int _scsifront_put_rqid(struct vscsifrnt_info *info, uint32_t id) +{ + int empty = bitmap_empty(info->shadow_free_bitmap, VSCSIIF_MAX_REQS); + + __set_bit(id, info->shadow_free_bitmap); + info->shadow[id] = NULL; + + return empty || info->wait_ring_available; +} + +static void scsifront_put_rqid(struct vscsifrnt_info *info, uint32_t id) +{ + unsigned long flags; + int kick; + + spin_lock_irqsave(&info->shadow_lock, flags); + kick = _scsifront_put_rqid(info, id); + spin_unlock_irqrestore(&info->shadow_lock, flags); + + if (kick) + scsifront_wake_up(info); +} + +static struct vscsiif_request *scsifront_pre_req(struct vscsifrnt_info *info) +{ + struct vscsiif_front_ring *ring = &(info->ring); + struct vscsiif_request *ring_req; + uint32_t id; + + id = scsifront_get_rqid(info); /* use id in response */ + if (id >= VSCSIIF_MAX_REQS) + return NULL; + + ring_req = RING_GET_REQUEST(&(info->ring), ring->req_prod_pvt); + + ring->req_prod_pvt++; + + ring_req->rqid = (uint16_t)id; + + return ring_req; +} + +static void scsifront_do_request(struct vscsifrnt_info *info) +{ + struct vscsiif_front_ring *ring = &(info->ring); + int notify; + + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(ring, notify); + if (notify) + notify_remote_via_irq(info->irq); +} + +static void scsifront_gnttab_done(struct vscsifrnt_info *info, uint32_t id) +{ + struct vscsifrnt_shadow *s = info->shadow[id]; + int i; + + if (s->sc->sc_data_direction == DMA_NONE) + return; + + for (i = 0; i < s->nr_grants; i++) { + if (unlikely(gnttab_query_foreign_access(s->gref[i]) != 0)) { + shost_printk(KERN_ALERT, info->host, KBUILD_MODNAME + "grant still in use by backend\n"); + BUG(); + } + gnttab_end_foreign_access(s->gref[i], 0, 0UL); + } + + kfree(s->sg); +} + +static void scsifront_cdb_cmd_done(struct vscsifrnt_info *info, + struct vscsiif_response *ring_rsp) +{ + struct scsi_cmnd *sc; + uint32_t id; + uint8_t sense_len; + + id = ring_rsp->rqid; + sc = info->shadow[id]->sc; + + BUG_ON(sc == NULL); + + scsifront_gnttab_done(info, id); + scsifront_put_rqid(info, id); + + sc->result = ring_rsp->rslt; + scsi_set_resid(sc, ring_rsp->residual_len); + + sense_len = min_t(uint8_t, VSCSIIF_SENSE_BUFFERSIZE, + ring_rsp->sense_len); + + if (sense_len) + memcpy(sc->sense_buffer, ring_rsp->sense_buffer, sense_len); + + sc->scsi_done(sc); +} + +static void scsifront_sync_cmd_done(struct vscsifrnt_info *info, + struct vscsiif_response *ring_rsp) +{ + uint16_t id = ring_rsp->rqid; + unsigned long flags; + struct vscsifrnt_shadow *shadow = info->shadow[id]; + int kick; + + spin_lock_irqsave(&info->shadow_lock, flags); + shadow->wait_reset = 1; + switch (shadow->rslt_reset) { + case RSLT_RESET_WAITING: + shadow->rslt_reset = ring_rsp->rslt; + break; + case RSLT_RESET_ERR: + kick = _scsifront_put_rqid(info, id); + spin_unlock_irqrestore(&info->shadow_lock, flags); + kfree(shadow); + if (kick) + scsifront_wake_up(info); + return; + default: + shost_printk(KERN_ERR, info->host, KBUILD_MODNAME + "bad reset state %d, possibly leaking %u\n", + shadow->rslt_reset, id); + break; + } + spin_unlock_irqrestore(&info->shadow_lock, flags); + + wake_up(&shadow->wq_reset); +} + +static int scsifront_cmd_done(struct vscsifrnt_info *info) +{ + struct vscsiif_response *ring_rsp; + RING_IDX i, rp; + int more_to_do = 0; + unsigned long flags; + + spin_lock_irqsave(info->host->host_lock, flags); + + rp = info->ring.sring->rsp_prod; + rmb(); /* ordering required respective to dom0 */ + for (i = info->ring.rsp_cons; i != rp; i++) { + + ring_rsp = RING_GET_RESPONSE(&info->ring, i); + + if (WARN(ring_rsp->rqid >= VSCSIIF_MAX_REQS || + test_bit(ring_rsp->rqid, info->shadow_free_bitmap), + "illegal rqid %u returned by backend!\n", + ring_rsp->rqid)) + continue; + + if (info->shadow[ring_rsp->rqid]->act == VSCSIIF_ACT_SCSI_CDB) + scsifront_cdb_cmd_done(info, ring_rsp); + else + scsifront_sync_cmd_done(info, ring_rsp); + } + + info->ring.rsp_cons = i; + + if (i != info->ring.req_prod_pvt) + RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do); + else + info->ring.sring->rsp_event = i + 1; + + info->wait_ring_available = 0; + + spin_unlock_irqrestore(info->host->host_lock, flags); + + wake_up(&info->wq_sync); + + return more_to_do; +} + +static irqreturn_t scsifront_irq_fn(int irq, void *dev_id) +{ + struct vscsifrnt_info *info = dev_id; + + while (scsifront_cmd_done(info)) + /* Yield point for this unbounded loop. */ + cond_resched(); + + return IRQ_HANDLED; +} + +static int map_data_for_request(struct vscsifrnt_info *info, + struct scsi_cmnd *sc, + struct vscsiif_request *ring_req, + struct vscsifrnt_shadow *shadow) +{ + grant_ref_t gref_head; + struct page *page; + int err, ref, ref_cnt = 0; + int grant_ro = (sc->sc_data_direction == DMA_TO_DEVICE); + unsigned int i, off, len, bytes; + unsigned int data_len = scsi_bufflen(sc); + unsigned int data_grants = 0, seg_grants = 0; + struct scatterlist *sg; + unsigned long mfn; + struct scsiif_request_segment *seg; + + ring_req->nr_segments = 0; + if (sc->sc_data_direction == DMA_NONE || !data_len) + return 0; + + scsi_for_each_sg(sc, sg, scsi_sg_count(sc), i) + data_grants += PFN_UP(sg->offset + sg->length); + + if (data_grants > VSCSIIF_SG_TABLESIZE) { + if (data_grants > info->host->sg_tablesize) { + shost_printk(KERN_ERR, info->host, KBUILD_MODNAME + "Unable to map request_buffer for command!\n"); + return -E2BIG; + } + seg_grants = vscsiif_grants_sg(data_grants); + shadow->sg = kcalloc(data_grants, + sizeof(struct scsiif_request_segment), GFP_ATOMIC); + if (!shadow->sg) + return -ENOMEM; + } + seg = shadow->sg ? : ring_req->seg; + + err = gnttab_alloc_grant_references(seg_grants + data_grants, + &gref_head); + if (err) { + kfree(shadow->sg); + shost_printk(KERN_ERR, info->host, KBUILD_MODNAME + "gnttab_alloc_grant_references() error\n"); + return -ENOMEM; + } + + if (seg_grants) { + page = virt_to_page(seg); + off = (unsigned long)seg & ~PAGE_MASK; + len = sizeof(struct scsiif_request_segment) * data_grants; + while (len > 0) { + bytes = min_t(unsigned int, len, PAGE_SIZE - off); + + ref = gnttab_claim_grant_reference(&gref_head); + BUG_ON(ref == -ENOSPC); + + mfn = pfn_to_mfn(page_to_pfn(page)); + gnttab_grant_foreign_access_ref(ref, + info->dev->otherend_id, mfn, 1); + shadow->gref[ref_cnt] = ref; + ring_req->seg[ref_cnt].gref = ref; + ring_req->seg[ref_cnt].offset = (uint16_t)off; + ring_req->seg[ref_cnt].length = (uint16_t)bytes; + + page++; + len -= bytes; + off = 0; + ref_cnt++; + } + BUG_ON(seg_grants < ref_cnt); + seg_grants = ref_cnt; + } + + scsi_for_each_sg(sc, sg, scsi_sg_count(sc), i) { + page = sg_page(sg); + off = sg->offset; + len = sg->length; + + while (len > 0 && data_len > 0) { + /* + * sg sends a scatterlist that is larger than + * the data_len it wants transferred for certain + * IO sizes. + */ + bytes = min_t(unsigned int, len, PAGE_SIZE - off); + bytes = min(bytes, data_len); + + ref = gnttab_claim_grant_reference(&gref_head); + BUG_ON(ref == -ENOSPC); + + mfn = pfn_to_mfn(page_to_pfn(page)); + gnttab_grant_foreign_access_ref(ref, + info->dev->otherend_id, mfn, grant_ro); + + shadow->gref[ref_cnt] = ref; + seg->gref = ref; + seg->offset = (uint16_t)off; + seg->length = (uint16_t)bytes; + + page++; + seg++; + len -= bytes; + data_len -= bytes; + off = 0; + ref_cnt++; + } + } + + if (seg_grants) + ring_req->nr_segments = VSCSIIF_SG_GRANT | seg_grants; + else + ring_req->nr_segments = (uint8_t)ref_cnt; + shadow->nr_grants = ref_cnt; + + return 0; +} + +static struct vscsiif_request *scsifront_command2ring( + struct vscsifrnt_info *info, struct scsi_cmnd *sc, + struct vscsifrnt_shadow *shadow) +{ + struct vscsiif_request *ring_req; + + memset(shadow, 0, sizeof(*shadow)); + + ring_req = scsifront_pre_req(info); + if (!ring_req) + return NULL; + + info->shadow[ring_req->rqid] = shadow; + shadow->rqid = ring_req->rqid; + + ring_req->id = sc->device->id; + ring_req->lun = sc->device->lun; + ring_req->channel = sc->device->channel; + ring_req->cmd_len = sc->cmd_len; + + BUG_ON(sc->cmd_len > VSCSIIF_MAX_COMMAND_SIZE); + + memcpy(ring_req->cmnd, sc->cmnd, sc->cmd_len); + + ring_req->sc_data_direction = (uint8_t)sc->sc_data_direction; + ring_req->timeout_per_command = sc->request->timeout / HZ; + + return ring_req; +} + +static int scsifront_queuecommand(struct Scsi_Host *shost, + struct scsi_cmnd *sc) +{ + struct vscsifrnt_info *info = shost_priv(shost); + struct vscsiif_request *ring_req; + struct vscsifrnt_shadow *shadow = scsi_cmd_priv(sc); + unsigned long flags; + int err; + uint16_t rqid; + + spin_lock_irqsave(shost->host_lock, flags); + if (RING_FULL(&info->ring)) + goto busy; + + ring_req = scsifront_command2ring(info, sc, shadow); + if (!ring_req) + goto busy; + + sc->result = 0; + + rqid = ring_req->rqid; + ring_req->act = VSCSIIF_ACT_SCSI_CDB; + + shadow->sc = sc; + shadow->act = VSCSIIF_ACT_SCSI_CDB; + + err = map_data_for_request(info, sc, ring_req, shadow); + if (err < 0) { + pr_debug("%s: err %d\n", __func__, err); + scsifront_put_rqid(info, rqid); + spin_unlock_irqrestore(shost->host_lock, flags); + if (err == -ENOMEM) + return SCSI_MLQUEUE_HOST_BUSY; + sc->result = DID_ERROR << 16; + sc->scsi_done(sc); + return 0; + } + + scsifront_do_request(info); + spin_unlock_irqrestore(shost->host_lock, flags); + + return 0; + +busy: + spin_unlock_irqrestore(shost->host_lock, flags); + pr_debug("%s: busy\n", __func__); + return SCSI_MLQUEUE_HOST_BUSY; +} + +/* + * Any exception handling (reset or abort) must be forwarded to the backend. + * We have to wait until an answer is returned. This answer contains the + * result to be returned to the requestor. + */ +static int scsifront_action_handler(struct scsi_cmnd *sc, uint8_t act) +{ + struct Scsi_Host *host = sc->device->host; + struct vscsifrnt_info *info = shost_priv(host); + struct vscsifrnt_shadow *shadow, *s = scsi_cmd_priv(sc); + struct vscsiif_request *ring_req; + int err = 0; + + shadow = kmalloc(sizeof(*shadow), GFP_NOIO); + if (!shadow) + return FAILED; + + spin_lock_irq(host->host_lock); + + for (;;) { + if (!RING_FULL(&info->ring)) { + ring_req = scsifront_command2ring(info, sc, shadow); + if (ring_req) + break; + } + if (err) { + spin_unlock_irq(host->host_lock); + kfree(shadow); + return FAILED; + } + info->wait_ring_available = 1; + spin_unlock_irq(host->host_lock); + err = wait_event_interruptible(info->wq_sync, + !info->wait_ring_available); + spin_lock_irq(host->host_lock); + } + + ring_req->act = act; + ring_req->ref_rqid = s->rqid; + + shadow->act = act; + shadow->rslt_reset = RSLT_RESET_WAITING; + init_waitqueue_head(&shadow->wq_reset); + + ring_req->nr_segments = 0; + + scsifront_do_request(info); + + spin_unlock_irq(host->host_lock); + err = wait_event_interruptible(shadow->wq_reset, shadow->wait_reset); + spin_lock_irq(host->host_lock); + + if (!err) { + err = shadow->rslt_reset; + scsifront_put_rqid(info, shadow->rqid); + kfree(shadow); + } else { + spin_lock(&info->shadow_lock); + shadow->rslt_reset = RSLT_RESET_ERR; + spin_unlock(&info->shadow_lock); + err = FAILED; + } + + spin_unlock_irq(host->host_lock); + return err; +} + +static int scsifront_eh_abort_handler(struct scsi_cmnd *sc) +{ + pr_debug("%s\n", __func__); + return scsifront_action_handler(sc, VSCSIIF_ACT_SCSI_ABORT); +} + +static int scsifront_dev_reset_handler(struct scsi_cmnd *sc) +{ + pr_debug("%s\n", __func__); + return scsifront_action_handler(sc, VSCSIIF_ACT_SCSI_RESET); +} + +static int scsifront_sdev_configure(struct scsi_device *sdev) +{ + struct vscsifrnt_info *info = shost_priv(sdev->host); + + if (info && current == info->curr) + xenbus_printf(XBT_NIL, info->dev->nodename, + info->dev_state_path, "%d", XenbusStateConnected); + + return 0; +} + +static void scsifront_sdev_destroy(struct scsi_device *sdev) +{ + struct vscsifrnt_info *info = shost_priv(sdev->host); + + if (info && current == info->curr) + xenbus_printf(XBT_NIL, info->dev->nodename, + info->dev_state_path, "%d", XenbusStateClosed); +} + +static struct scsi_host_template scsifront_sht = { + .module = THIS_MODULE, + .name = "Xen SCSI frontend driver", + .queuecommand = scsifront_queuecommand, + .eh_abort_handler = scsifront_eh_abort_handler, + .eh_device_reset_handler = scsifront_dev_reset_handler, + .slave_configure = scsifront_sdev_configure, + .slave_destroy = scsifront_sdev_destroy, + .cmd_per_lun = VSCSIIF_DEFAULT_CMD_PER_LUN, + .can_queue = VSCSIIF_MAX_REQS, + .this_id = -1, + .cmd_size = sizeof(struct vscsifrnt_shadow), + .sg_tablesize = VSCSIIF_SG_TABLESIZE, + .use_clustering = DISABLE_CLUSTERING, + .proc_name = "scsifront", +}; + +static int scsifront_alloc_ring(struct vscsifrnt_info *info) +{ + struct xenbus_device *dev = info->dev; + struct vscsiif_sring *sring; + int err = -ENOMEM; + + /***** Frontend to Backend ring start *****/ + sring = (struct vscsiif_sring *)__get_free_page(GFP_KERNEL); + if (!sring) { + xenbus_dev_fatal(dev, err, + "fail to allocate shared ring (Front to Back)"); + return err; + } + SHARED_RING_INIT(sring); + FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); + + err = xenbus_grant_ring(dev, virt_to_mfn(sring)); + if (err < 0) { + free_page((unsigned long)sring); + xenbus_dev_fatal(dev, err, + "fail to grant shared ring (Front to Back)"); + return err; + } + info->ring_ref = err; + + err = xenbus_alloc_evtchn(dev, &info->evtchn); + if (err) { + xenbus_dev_fatal(dev, err, "xenbus_alloc_evtchn"); + goto free_gnttab; + } + + err = bind_evtchn_to_irq(info->evtchn); + if (err <= 0) { + xenbus_dev_fatal(dev, err, "bind_evtchn_to_irq"); + goto free_gnttab; + } + + info->irq = err; + + err = request_threaded_irq(info->irq, NULL, scsifront_irq_fn, + IRQF_ONESHOT, "scsifront", info); + if (err) { + xenbus_dev_fatal(dev, err, "request_threaded_irq"); + goto free_irq; + } + + return 0; + +/* free resource */ +free_irq: + unbind_from_irqhandler(info->irq, info); +free_gnttab: + gnttab_end_foreign_access(info->ring_ref, 0, + (unsigned long)info->ring.sring); + + return err; +} + +static int scsifront_init_ring(struct vscsifrnt_info *info) +{ + struct xenbus_device *dev = info->dev; + struct xenbus_transaction xbt; + int err; + + pr_debug("%s\n", __func__); + + err = scsifront_alloc_ring(info); + if (err) + return err; + pr_debug("%s: %u %u\n", __func__, info->ring_ref, info->evtchn); + +again: + err = xenbus_transaction_start(&xbt); + if (err) + xenbus_dev_fatal(dev, err, "starting transaction"); + + err = xenbus_printf(xbt, dev->nodename, "ring-ref", "%u", + info->ring_ref); + if (err) { + xenbus_dev_fatal(dev, err, "%s", "writing ring-ref"); + goto fail; + } + + err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u", + info->evtchn); + + if (err) { + xenbus_dev_fatal(dev, err, "%s", "writing event-channel"); + goto fail; + } + + err = xenbus_transaction_end(xbt, 0); + if (err) { + if (err == -EAGAIN) + goto again; + xenbus_dev_fatal(dev, err, "completing transaction"); + goto free_sring; + } + + return 0; + +fail: + xenbus_transaction_end(xbt, 1); +free_sring: + unbind_from_irqhandler(info->irq, info); + gnttab_end_foreign_access(info->ring_ref, 0, + (unsigned long)info->ring.sring); + + return err; +} + + +static int scsifront_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + struct vscsifrnt_info *info; + struct Scsi_Host *host; + int err = -ENOMEM; + char name[TASK_COMM_LEN]; + + host = scsi_host_alloc(&scsifront_sht, sizeof(*info)); + if (!host) { + xenbus_dev_fatal(dev, err, "fail to allocate scsi host"); + return err; + } + info = (struct vscsifrnt_info *)host->hostdata; + + dev_set_drvdata(&dev->dev, info); + info->dev = dev; + + bitmap_fill(info->shadow_free_bitmap, VSCSIIF_MAX_REQS); + + err = scsifront_init_ring(info); + if (err) { + scsi_host_put(host); + return err; + } + + init_waitqueue_head(&info->wq_sync); + spin_lock_init(&info->shadow_lock); + + snprintf(name, TASK_COMM_LEN, "vscsiif.%d", host->host_no); + + host->max_id = VSCSIIF_MAX_TARGET; + host->max_channel = 0; + host->max_lun = VSCSIIF_MAX_LUN; + host->max_sectors = (host->sg_tablesize - 1) * PAGE_SIZE / 512; + host->max_cmd_len = VSCSIIF_MAX_COMMAND_SIZE; + + err = scsi_add_host(host, &dev->dev); + if (err) { + dev_err(&dev->dev, "fail to add scsi host %d\n", err); + goto free_sring; + } + info->host = host; + info->host_active = 1; + + xenbus_switch_state(dev, XenbusStateInitialised); + + return 0; + +free_sring: + unbind_from_irqhandler(info->irq, info); + gnttab_end_foreign_access(info->ring_ref, 0, + (unsigned long)info->ring.sring); + scsi_host_put(host); + return err; +} + +static int scsifront_remove(struct xenbus_device *dev) +{ + struct vscsifrnt_info *info = dev_get_drvdata(&dev->dev); + + pr_debug("%s: %s removed\n", __func__, dev->nodename); + + mutex_lock(&scsifront_mutex); + if (info->host_active) { + /* Scsi_host not yet removed */ + scsi_remove_host(info->host); + info->host_active = 0; + } + mutex_unlock(&scsifront_mutex); + + gnttab_end_foreign_access(info->ring_ref, 0, + (unsigned long)info->ring.sring); + unbind_from_irqhandler(info->irq, info); + + scsi_host_put(info->host); + + return 0; +} + +static void scsifront_disconnect(struct vscsifrnt_info *info) +{ + struct xenbus_device *dev = info->dev; + struct Scsi_Host *host = info->host; + + pr_debug("%s: %s disconnect\n", __func__, dev->nodename); + + /* + * When this function is executed, all devices of + * Frontend have been deleted. + * Therefore, it need not block I/O before remove_host. + */ + + mutex_lock(&scsifront_mutex); + if (info->host_active) { + scsi_remove_host(host); + info->host_active = 0; + } + mutex_unlock(&scsifront_mutex); + + xenbus_frontend_closed(dev); +} + +static void scsifront_do_lun_hotplug(struct vscsifrnt_info *info, int op) +{ + struct xenbus_device *dev = info->dev; + int i, err = 0; + char str[64]; + char **dir; + unsigned int dir_n = 0; + unsigned int device_state; + unsigned int hst, chn, tgt, lun; + struct scsi_device *sdev; + + dir = xenbus_directory(XBT_NIL, dev->otherend, "vscsi-devs", &dir_n); + if (IS_ERR(dir)) + return; + + /* mark current task as the one allowed to modify device states */ + BUG_ON(info->curr); + info->curr = current; + + for (i = 0; i < dir_n; i++) { + /* read status */ + snprintf(str, sizeof(str), "vscsi-devs/%s/state", dir[i]); + err = xenbus_scanf(XBT_NIL, dev->otherend, str, "%u", + &device_state); + if (XENBUS_EXIST_ERR(err)) + continue; + + /* virtual SCSI device */ + snprintf(str, sizeof(str), "vscsi-devs/%s/v-dev", dir[i]); + err = xenbus_scanf(XBT_NIL, dev->otherend, str, + "%u:%u:%u:%u", &hst, &chn, &tgt, &lun); + if (XENBUS_EXIST_ERR(err)) + continue; + + /* + * Front device state path, used in slave_configure called + * on successfull scsi_add_device, and in slave_destroy called + * on remove of a device. + */ + snprintf(info->dev_state_path, sizeof(info->dev_state_path), + "vscsi-devs/%s/state", dir[i]); + + switch (op) { + case VSCSIFRONT_OP_ADD_LUN: + if (device_state != XenbusStateInitialised) + break; + + if (scsi_add_device(info->host, chn, tgt, lun)) { + dev_err(&dev->dev, "scsi_add_device\n"); + xenbus_printf(XBT_NIL, dev->nodename, + info->dev_state_path, + "%d", XenbusStateClosed); + } + break; + case VSCSIFRONT_OP_DEL_LUN: + if (device_state != XenbusStateClosing) + break; + + sdev = scsi_device_lookup(info->host, chn, tgt, lun); + if (sdev) { + scsi_remove_device(sdev); + scsi_device_put(sdev); + } + break; + default: + break; + } + } + + info->curr = NULL; + + kfree(dir); +} + +static void scsifront_read_backend_params(struct xenbus_device *dev, + struct vscsifrnt_info *info) +{ + unsigned int sg_grant; + int ret; + struct Scsi_Host *host = info->host; + + ret = xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg-grant", "%u", + &sg_grant); + if (ret == 1 && sg_grant) { + sg_grant = min_t(unsigned int, sg_grant, SG_ALL); + sg_grant = max_t(unsigned int, sg_grant, VSCSIIF_SG_TABLESIZE); + host->sg_tablesize = min_t(unsigned int, sg_grant, + VSCSIIF_SG_TABLESIZE * PAGE_SIZE / + sizeof(struct scsiif_request_segment)); + host->max_sectors = (host->sg_tablesize - 1) * PAGE_SIZE / 512; + } + dev_info(&dev->dev, "using up to %d SG entries\n", host->sg_tablesize); +} + +static void scsifront_backend_changed(struct xenbus_device *dev, + enum xenbus_state backend_state) +{ + struct vscsifrnt_info *info = dev_get_drvdata(&dev->dev); + + pr_debug("%s: %p %u %u\n", __func__, dev, dev->state, backend_state); + + switch (backend_state) { + case XenbusStateUnknown: + case XenbusStateInitialising: + case XenbusStateInitWait: + case XenbusStateInitialised: + break; + + case XenbusStateConnected: + scsifront_read_backend_params(dev, info); + if (xenbus_read_driver_state(dev->nodename) == + XenbusStateInitialised) + scsifront_do_lun_hotplug(info, VSCSIFRONT_OP_ADD_LUN); + + if (dev->state != XenbusStateConnected) + xenbus_switch_state(dev, XenbusStateConnected); + break; + + case XenbusStateClosed: + if (dev->state == XenbusStateClosed) + break; + /* Missed the backend's Closing state -- fallthrough */ + case XenbusStateClosing: + scsifront_disconnect(info); + break; + + case XenbusStateReconfiguring: + scsifront_do_lun_hotplug(info, VSCSIFRONT_OP_DEL_LUN); + xenbus_switch_state(dev, XenbusStateReconfiguring); + break; + + case XenbusStateReconfigured: + scsifront_do_lun_hotplug(info, VSCSIFRONT_OP_ADD_LUN); + xenbus_switch_state(dev, XenbusStateConnected); + break; + } +} + +static const struct xenbus_device_id scsifront_ids[] = { + { "vscsi" }, + { "" } +}; + +static struct xenbus_driver scsifront_driver = { + .ids = scsifront_ids, + .probe = scsifront_probe, + .remove = scsifront_remove, + .otherend_changed = scsifront_backend_changed, +}; + +static int __init scsifront_init(void) +{ + if (!xen_domain()) + return -ENODEV; + + return xenbus_register_frontend(&scsifront_driver); +} +module_init(scsifront_init); + +static void __exit scsifront_exit(void) +{ + xenbus_unregister_driver(&scsifront_driver); +} +module_exit(scsifront_exit); + +MODULE_DESCRIPTION("Xen SCSI frontend driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("xen:vscsi"); +MODULE_AUTHOR("Juergen Gross <jgross@suse.com>"); diff --git a/drivers/tty/hvc/hvc_xen.c b/drivers/tty/hvc/hvc_xen.c index 2967f0388d2c..f1e57425e39f 100644 --- a/drivers/tty/hvc/hvc_xen.c +++ b/drivers/tty/hvc/hvc_xen.c @@ -347,8 +347,6 @@ static int xen_console_remove(struct xencons_info *info) } #ifdef CONFIG_HVC_XEN_FRONTEND -static struct xenbus_driver xencons_driver; - static int xencons_remove(struct xenbus_device *dev) { return xen_console_remove(dev_get_drvdata(&dev->dev)); @@ -499,13 +497,14 @@ static const struct xenbus_device_id xencons_ids[] = { { "" } }; - -static DEFINE_XENBUS_DRIVER(xencons, "xenconsole", +static struct xenbus_driver xencons_driver = { + .name = "xenconsole", + .ids = xencons_ids, .probe = xencons_probe, .remove = xencons_remove, .resume = xencons_resume, .otherend_changed = xencons_backend_changed, -); +}; #endif /* CONFIG_HVC_XEN_FRONTEND */ static int __init xen_hvc_init(void) diff --git a/drivers/video/fbdev/xen-fbfront.c b/drivers/video/fbdev/xen-fbfront.c index 901014bbc821..09dc44736c1a 100644 --- a/drivers/video/fbdev/xen-fbfront.c +++ b/drivers/video/fbdev/xen-fbfront.c @@ -684,12 +684,13 @@ static const struct xenbus_device_id xenfb_ids[] = { { "" } }; -static DEFINE_XENBUS_DRIVER(xenfb, , +static struct xenbus_driver xenfb_driver = { + .ids = xenfb_ids, .probe = xenfb_probe, .remove = xenfb_remove, .resume = xenfb_resume, .otherend_changed = xenfb_backend_changed, -); +}; static int __init xenfb_init(void) { diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 8bc01838daf9..b812462083fc 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -172,6 +172,15 @@ config XEN_PCIDEV_BACKEND If in doubt, say m. +config XEN_SCSI_BACKEND + tristate "XEN SCSI backend driver" + depends on XEN && XEN_BACKEND && TARGET_CORE + help + The SCSI backend driver allows the kernel to export its SCSI Devices + to other guests via a high-performance shared-memory interface. + Only needed for systems running as XEN driver domains (e.g. Dom0) and + if guests need generic access to SCSI devices. + config XEN_PRIVCMD tristate depends on XEN diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 84044b554e33..2140398a2a8c 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_XEN_ACPI_HOTPLUG_MEMORY) += xen-acpi-memhotplug.o obj-$(CONFIG_XEN_ACPI_HOTPLUG_CPU) += xen-acpi-cpuhotplug.o obj-$(CONFIG_XEN_ACPI_PROCESSOR) += xen-acpi-processor.o obj-$(CONFIG_XEN_EFI) += efi.o +obj-$(CONFIG_XEN_SCSI_BACKEND) += xen-scsiback.o xen-evtchn-y := evtchn.o xen-gntdev-y := gntdev.o xen-gntalloc-y := gntalloc.o diff --git a/drivers/xen/efi.c b/drivers/xen/efi.c index 31f618a49661..1f850c97482f 100644 --- a/drivers/xen/efi.c +++ b/drivers/xen/efi.c @@ -27,6 +27,8 @@ #include <xen/interface/platform.h> #include <xen/xen.h> +#include <asm/page.h> + #include <asm/xen/hypercall.h> #define INIT_EFI_OP(name) \ diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 5b5c5ff273fd..b4bca2d4a7e5 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -900,8 +900,8 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) return irq; } -static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, - unsigned int remote_port) +int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, + unsigned int remote_port) { struct evtchn_bind_interdomain bind_interdomain; int err; @@ -914,6 +914,7 @@ static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, return err ? : bind_evtchn_to_irq(bind_interdomain.local_port); } +EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq); static int find_virq(unsigned int virq, unsigned int cpu) { diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index c254ae036f18..7786291ba229 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -592,7 +592,7 @@ static int grow_gnttab_list(unsigned int more_frames) return 0; grow_nomem: - for ( ; i >= nr_glist_frames; i--) + while (i-- > nr_glist_frames) free_page((unsigned long) gnttab_list[i]); return -ENOMEM; } diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c index c214daab4829..ad8d30c088fe 100644 --- a/drivers/xen/xen-pciback/xenbus.c +++ b/drivers/xen/xen-pciback/xenbus.c @@ -719,11 +719,13 @@ static const struct xenbus_device_id xen_pcibk_ids[] = { {""}, }; -static DEFINE_XENBUS_DRIVER(xen_pcibk, DRV_NAME, +static struct xenbus_driver xen_pcibk_driver = { + .name = DRV_NAME, + .ids = xen_pcibk_ids, .probe = xen_pcibk_xenbus_probe, .remove = xen_pcibk_xenbus_remove, .otherend_changed = xen_pcibk_frontend_changed, -); +}; const struct xen_pcibk_backend *__read_mostly xen_pcibk_backend; diff --git a/drivers/xen/xen-scsiback.c b/drivers/xen/xen-scsiback.c new file mode 100644 index 000000000000..3e32146472a5 --- /dev/null +++ b/drivers/xen/xen-scsiback.c @@ -0,0 +1,2126 @@ +/* + * Xen SCSI backend driver + * + * Copyright (c) 2008, FUJITSU Limited + * + * Based on the blkback driver code. + * Adaption to kernel taget core infrastructure taken from vhost/scsi.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <stdarg.h> + +#include <linux/module.h> +#include <linux/utsname.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/wait.h> +#include <linux/sched.h> +#include <linux/list.h> +#include <linux/gfp.h> +#include <linux/delay.h> +#include <linux/spinlock.h> +#include <linux/configfs.h> + +#include <generated/utsrelease.h> + +#include <scsi/scsi_dbg.h> +#include <scsi/scsi_eh.h> +#include <scsi/scsi_tcq.h> + +#include <target/target_core_base.h> +#include <target/target_core_fabric.h> +#include <target/target_core_configfs.h> +#include <target/target_core_fabric_configfs.h> + +#include <asm/hypervisor.h> + +#include <xen/xen.h> +#include <xen/balloon.h> +#include <xen/events.h> +#include <xen/xenbus.h> +#include <xen/grant_table.h> +#include <xen/page.h> + +#include <xen/interface/grant_table.h> +#include <xen/interface/io/vscsiif.h> + +#define DPRINTK(_f, _a...) \ + pr_debug("(file=%s, line=%d) " _f, __FILE__ , __LINE__ , ## _a) + +#define VSCSI_VERSION "v0.1" +#define VSCSI_NAMELEN 32 + +struct ids_tuple { + unsigned int hst; /* host */ + unsigned int chn; /* channel */ + unsigned int tgt; /* target */ + unsigned int lun; /* LUN */ +}; + +struct v2p_entry { + struct ids_tuple v; /* translate from */ + struct scsiback_tpg *tpg; /* translate to */ + unsigned int lun; + struct kref kref; + struct list_head l; +}; + +struct vscsibk_info { + struct xenbus_device *dev; + + domid_t domid; + unsigned int irq; + + struct vscsiif_back_ring ring; + int ring_error; + + spinlock_t ring_lock; + atomic_t nr_unreplied_reqs; + + spinlock_t v2p_lock; + struct list_head v2p_entry_lists; + + wait_queue_head_t waiting_to_free; +}; + +/* theoretical maximum of grants for one request */ +#define VSCSI_MAX_GRANTS (SG_ALL + VSCSIIF_SG_TABLESIZE) + +/* + * VSCSI_GRANT_BATCH is the maximum number of grants to be processed in one + * call to map/unmap grants. Don't choose it too large, as there are arrays + * with VSCSI_GRANT_BATCH elements allocated on the stack. + */ +#define VSCSI_GRANT_BATCH 16 + +struct vscsibk_pend { + uint16_t rqid; + + uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE]; + uint8_t cmd_len; + + uint8_t sc_data_direction; + uint16_t n_sg; /* real length of SG list */ + uint16_t n_grants; /* SG pages and potentially SG list */ + uint32_t data_len; + uint32_t result; + + struct vscsibk_info *info; + struct v2p_entry *v2p; + struct scatterlist *sgl; + + uint8_t sense_buffer[VSCSIIF_SENSE_BUFFERSIZE]; + + grant_handle_t grant_handles[VSCSI_MAX_GRANTS]; + struct page *pages[VSCSI_MAX_GRANTS]; + + struct se_cmd se_cmd; +}; + +struct scsiback_tmr { + atomic_t tmr_complete; + wait_queue_head_t tmr_wait; +}; + +struct scsiback_nexus { + /* Pointer to TCM session for I_T Nexus */ + struct se_session *tvn_se_sess; +}; + +struct scsiback_tport { + /* SCSI protocol the tport is providing */ + u8 tport_proto_id; + /* Binary World Wide unique Port Name for pvscsi Target port */ + u64 tport_wwpn; + /* ASCII formatted WWPN for pvscsi Target port */ + char tport_name[VSCSI_NAMELEN]; + /* Returned by scsiback_make_tport() */ + struct se_wwn tport_wwn; +}; + +struct scsiback_tpg { + /* scsiback port target portal group tag for TCM */ + u16 tport_tpgt; + /* track number of TPG Port/Lun Links wrt explicit I_T Nexus shutdown */ + int tv_tpg_port_count; + /* xen-pvscsi references to tpg_nexus, protected by tv_tpg_mutex */ + int tv_tpg_fe_count; + /* list for scsiback_list */ + struct list_head tv_tpg_list; + /* Used to protect access for tpg_nexus */ + struct mutex tv_tpg_mutex; + /* Pointer to the TCM pvscsi I_T Nexus for this TPG endpoint */ + struct scsiback_nexus *tpg_nexus; + /* Pointer back to scsiback_tport */ + struct scsiback_tport *tport; + /* Returned by scsiback_make_tpg() */ + struct se_portal_group se_tpg; + /* alias used in xenstore */ + char param_alias[VSCSI_NAMELEN]; + /* list of info structures related to this target portal group */ + struct list_head info_list; +}; + +#define SCSIBACK_INVALID_HANDLE (~0) + +static bool log_print_stat; +module_param(log_print_stat, bool, 0644); + +static int scsiback_max_buffer_pages = 1024; +module_param_named(max_buffer_pages, scsiback_max_buffer_pages, int, 0644); +MODULE_PARM_DESC(max_buffer_pages, +"Maximum number of free pages to keep in backend buffer"); + +static struct kmem_cache *scsiback_cachep; +static DEFINE_SPINLOCK(free_pages_lock); +static int free_pages_num; +static LIST_HEAD(scsiback_free_pages); + +/* Global spinlock to protect scsiback TPG list */ +static DEFINE_MUTEX(scsiback_mutex); +static LIST_HEAD(scsiback_list); + +/* Local pointer to allocated TCM configfs fabric module */ +static struct target_fabric_configfs *scsiback_fabric_configfs; + +static void scsiback_get(struct vscsibk_info *info) +{ + atomic_inc(&info->nr_unreplied_reqs); +} + +static void scsiback_put(struct vscsibk_info *info) +{ + if (atomic_dec_and_test(&info->nr_unreplied_reqs)) + wake_up(&info->waiting_to_free); +} + +static void put_free_pages(struct page **page, int num) +{ + unsigned long flags; + int i = free_pages_num + num, n = num; + + if (num == 0) + return; + if (i > scsiback_max_buffer_pages) { + n = min(num, i - scsiback_max_buffer_pages); + free_xenballooned_pages(n, page + num - n); + n = num - n; + } + spin_lock_irqsave(&free_pages_lock, flags); + for (i = 0; i < n; i++) + list_add(&page[i]->lru, &scsiback_free_pages); + free_pages_num += n; + spin_unlock_irqrestore(&free_pages_lock, flags); +} + +static int get_free_page(struct page **page) +{ + unsigned long flags; + + spin_lock_irqsave(&free_pages_lock, flags); + if (list_empty(&scsiback_free_pages)) { + spin_unlock_irqrestore(&free_pages_lock, flags); + return alloc_xenballooned_pages(1, page, false); + } + page[0] = list_first_entry(&scsiback_free_pages, struct page, lru); + list_del(&page[0]->lru); + free_pages_num--; + spin_unlock_irqrestore(&free_pages_lock, flags); + return 0; +} + +static unsigned long vaddr_page(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + + return (unsigned long)pfn_to_kaddr(pfn); +} + +static unsigned long vaddr(struct vscsibk_pend *req, int seg) +{ + return vaddr_page(req->pages[seg]); +} + +static void scsiback_print_status(char *sense_buffer, int errors, + struct vscsibk_pend *pending_req) +{ + struct scsiback_tpg *tpg = pending_req->v2p->tpg; + + pr_err("xen-pvscsi[%s:%d] cmnd[0]=%02x -> st=%02x msg=%02x host=%02x drv=%02x\n", + tpg->tport->tport_name, pending_req->v2p->lun, + pending_req->cmnd[0], status_byte(errors), msg_byte(errors), + host_byte(errors), driver_byte(errors)); + + if (CHECK_CONDITION & status_byte(errors)) + __scsi_print_sense("xen-pvscsi", sense_buffer, + SCSI_SENSE_BUFFERSIZE); +} + +static void scsiback_fast_flush_area(struct vscsibk_pend *req) +{ + struct gnttab_unmap_grant_ref unmap[VSCSI_GRANT_BATCH]; + struct page *pages[VSCSI_GRANT_BATCH]; + unsigned int i, invcount = 0; + grant_handle_t handle; + int err; + + kfree(req->sgl); + req->sgl = NULL; + req->n_sg = 0; + + if (!req->n_grants) + return; + + for (i = 0; i < req->n_grants; i++) { + handle = req->grant_handles[i]; + if (handle == SCSIBACK_INVALID_HANDLE) + continue; + gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i), + GNTMAP_host_map, handle); + req->grant_handles[i] = SCSIBACK_INVALID_HANDLE; + pages[invcount] = req->pages[i]; + put_page(pages[invcount]); + invcount++; + if (invcount < VSCSI_GRANT_BATCH) + continue; + err = gnttab_unmap_refs(unmap, NULL, pages, invcount); + BUG_ON(err); + invcount = 0; + } + + if (invcount) { + err = gnttab_unmap_refs(unmap, NULL, pages, invcount); + BUG_ON(err); + } + + put_free_pages(req->pages, req->n_grants); + req->n_grants = 0; +} + +static void scsiback_free_translation_entry(struct kref *kref) +{ + struct v2p_entry *entry = container_of(kref, struct v2p_entry, kref); + struct scsiback_tpg *tpg = entry->tpg; + + mutex_lock(&tpg->tv_tpg_mutex); + tpg->tv_tpg_fe_count--; + mutex_unlock(&tpg->tv_tpg_mutex); + + kfree(entry); +} + +static void scsiback_do_resp_with_sense(char *sense_buffer, int32_t result, + uint32_t resid, struct vscsibk_pend *pending_req) +{ + struct vscsiif_response *ring_res; + struct vscsibk_info *info = pending_req->info; + int notify; + struct scsi_sense_hdr sshdr; + unsigned long flags; + unsigned len; + + spin_lock_irqsave(&info->ring_lock, flags); + + ring_res = RING_GET_RESPONSE(&info->ring, info->ring.rsp_prod_pvt); + info->ring.rsp_prod_pvt++; + + ring_res->rslt = result; + ring_res->rqid = pending_req->rqid; + + if (sense_buffer != NULL && + scsi_normalize_sense(sense_buffer, VSCSIIF_SENSE_BUFFERSIZE, + &sshdr)) { + len = min_t(unsigned, 8 + sense_buffer[7], + VSCSIIF_SENSE_BUFFERSIZE); + memcpy(ring_res->sense_buffer, sense_buffer, len); + ring_res->sense_len = len; + } else { + ring_res->sense_len = 0; + } + + ring_res->residual_len = resid; + + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&info->ring, notify); + spin_unlock_irqrestore(&info->ring_lock, flags); + + if (notify) + notify_remote_via_irq(info->irq); + + if (pending_req->v2p) + kref_put(&pending_req->v2p->kref, + scsiback_free_translation_entry); +} + +static void scsiback_cmd_done(struct vscsibk_pend *pending_req) +{ + struct vscsibk_info *info = pending_req->info; + unsigned char *sense_buffer; + unsigned int resid; + int errors; + + sense_buffer = pending_req->sense_buffer; + resid = pending_req->se_cmd.residual_count; + errors = pending_req->result; + + if (errors && log_print_stat) + scsiback_print_status(sense_buffer, errors, pending_req); + + scsiback_fast_flush_area(pending_req); + scsiback_do_resp_with_sense(sense_buffer, errors, resid, pending_req); + scsiback_put(info); +} + +static void scsiback_cmd_exec(struct vscsibk_pend *pending_req) +{ + struct se_cmd *se_cmd = &pending_req->se_cmd; + struct se_session *sess = pending_req->v2p->tpg->tpg_nexus->tvn_se_sess; + int rc; + + memset(pending_req->sense_buffer, 0, VSCSIIF_SENSE_BUFFERSIZE); + + memset(se_cmd, 0, sizeof(*se_cmd)); + + scsiback_get(pending_req->info); + rc = target_submit_cmd_map_sgls(se_cmd, sess, pending_req->cmnd, + pending_req->sense_buffer, pending_req->v2p->lun, + pending_req->data_len, 0, + pending_req->sc_data_direction, 0, + pending_req->sgl, pending_req->n_sg, + NULL, 0, NULL, 0); + if (rc < 0) { + transport_send_check_condition_and_sense(se_cmd, + TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE, 0); + transport_generic_free_cmd(se_cmd, 0); + } +} + +static int scsiback_gnttab_data_map_batch(struct gnttab_map_grant_ref *map, + struct page **pg, grant_handle_t *grant, int cnt) +{ + int err, i; + + if (!cnt) + return 0; + + err = gnttab_map_refs(map, NULL, pg, cnt); + BUG_ON(err); + for (i = 0; i < cnt; i++) { + if (unlikely(map[i].status != GNTST_okay)) { + pr_err("xen-pvscsi: invalid buffer -- could not remap it\n"); + map[i].handle = SCSIBACK_INVALID_HANDLE; + err = -ENOMEM; + } else { + get_page(pg[i]); + } + grant[i] = map[i].handle; + } + return err; +} + +static int scsiback_gnttab_data_map_list(struct vscsibk_pend *pending_req, + struct scsiif_request_segment *seg, struct page **pg, + grant_handle_t *grant, int cnt, u32 flags) +{ + int mapcount = 0, i, err = 0; + struct gnttab_map_grant_ref map[VSCSI_GRANT_BATCH]; + struct vscsibk_info *info = pending_req->info; + + for (i = 0; i < cnt; i++) { + if (get_free_page(pg + mapcount)) { + put_free_pages(pg, mapcount); + pr_err("xen-pvscsi: no grant page\n"); + return -ENOMEM; + } + gnttab_set_map_op(&map[mapcount], vaddr_page(pg[mapcount]), + flags, seg[i].gref, info->domid); + mapcount++; + if (mapcount < VSCSI_GRANT_BATCH) + continue; + err = scsiback_gnttab_data_map_batch(map, pg, grant, mapcount); + pg += mapcount; + grant += mapcount; + pending_req->n_grants += mapcount; + if (err) + return err; + mapcount = 0; + } + err = scsiback_gnttab_data_map_batch(map, pg, grant, mapcount); + pending_req->n_grants += mapcount; + return err; +} + +static int scsiback_gnttab_data_map(struct vscsiif_request *ring_req, + struct vscsibk_pend *pending_req) +{ + u32 flags; + int i, err, n_segs, i_seg = 0; + struct page **pg; + struct scsiif_request_segment *seg; + unsigned long end_seg = 0; + unsigned int nr_segments = (unsigned int)ring_req->nr_segments; + unsigned int nr_sgl = 0; + struct scatterlist *sg; + grant_handle_t *grant; + + pending_req->n_sg = 0; + pending_req->n_grants = 0; + pending_req->data_len = 0; + + nr_segments &= ~VSCSIIF_SG_GRANT; + if (!nr_segments) + return 0; + + if (nr_segments > VSCSIIF_SG_TABLESIZE) { + DPRINTK("xen-pvscsi: invalid parameter nr_seg = %d\n", + ring_req->nr_segments); + return -EINVAL; + } + + if (ring_req->nr_segments & VSCSIIF_SG_GRANT) { + err = scsiback_gnttab_data_map_list(pending_req, ring_req->seg, + pending_req->pages, pending_req->grant_handles, + nr_segments, GNTMAP_host_map | GNTMAP_readonly); + if (err) + return err; + nr_sgl = nr_segments; + nr_segments = 0; + for (i = 0; i < nr_sgl; i++) { + n_segs = ring_req->seg[i].length / + sizeof(struct scsiif_request_segment); + if ((unsigned)ring_req->seg[i].offset + + (unsigned)ring_req->seg[i].length > PAGE_SIZE || + n_segs * sizeof(struct scsiif_request_segment) != + ring_req->seg[i].length) + return -EINVAL; + nr_segments += n_segs; + } + if (nr_segments > SG_ALL) { + DPRINTK("xen-pvscsi: invalid nr_seg = %d\n", + nr_segments); + return -EINVAL; + } + } + + /* free of (sgl) in fast_flush_area()*/ + pending_req->sgl = kmalloc_array(nr_segments, + sizeof(struct scatterlist), GFP_KERNEL); + if (!pending_req->sgl) + return -ENOMEM; + + sg_init_table(pending_req->sgl, nr_segments); + pending_req->n_sg = nr_segments; + + flags = GNTMAP_host_map; + if (pending_req->sc_data_direction == DMA_TO_DEVICE) + flags |= GNTMAP_readonly; + + pg = pending_req->pages + nr_sgl; + grant = pending_req->grant_handles + nr_sgl; + if (!nr_sgl) { + seg = ring_req->seg; + err = scsiback_gnttab_data_map_list(pending_req, seg, + pg, grant, nr_segments, flags); + if (err) + return err; + } else { + for (i = 0; i < nr_sgl; i++) { + seg = (struct scsiif_request_segment *)( + vaddr(pending_req, i) + ring_req->seg[i].offset); + n_segs = ring_req->seg[i].length / + sizeof(struct scsiif_request_segment); + err = scsiback_gnttab_data_map_list(pending_req, seg, + pg, grant, n_segs, flags); + if (err) + return err; + pg += n_segs; + grant += n_segs; + } + end_seg = vaddr(pending_req, 0) + ring_req->seg[0].offset; + seg = (struct scsiif_request_segment *)end_seg; + end_seg += ring_req->seg[0].length; + pg = pending_req->pages + nr_sgl; + } + + for_each_sg(pending_req->sgl, sg, nr_segments, i) { + sg_set_page(sg, pg[i], seg->length, seg->offset); + pending_req->data_len += seg->length; + seg++; + if (nr_sgl && (unsigned long)seg >= end_seg) { + i_seg++; + end_seg = vaddr(pending_req, i_seg) + + ring_req->seg[i_seg].offset; + seg = (struct scsiif_request_segment *)end_seg; + end_seg += ring_req->seg[i_seg].length; + } + if (sg->offset >= PAGE_SIZE || + sg->length > PAGE_SIZE || + sg->offset + sg->length > PAGE_SIZE) + return -EINVAL; + } + + return 0; +} + +static void scsiback_disconnect(struct vscsibk_info *info) +{ + wait_event(info->waiting_to_free, + atomic_read(&info->nr_unreplied_reqs) == 0); + + unbind_from_irqhandler(info->irq, info); + info->irq = 0; + xenbus_unmap_ring_vfree(info->dev, info->ring.sring); +} + +static void scsiback_device_action(struct vscsibk_pend *pending_req, + enum tcm_tmreq_table act, int tag) +{ + int rc, err = FAILED; + struct scsiback_tpg *tpg = pending_req->v2p->tpg; + struct se_cmd *se_cmd = &pending_req->se_cmd; + struct scsiback_tmr *tmr; + + tmr = kzalloc(sizeof(struct scsiback_tmr), GFP_KERNEL); + if (!tmr) + goto out; + + init_waitqueue_head(&tmr->tmr_wait); + + transport_init_se_cmd(se_cmd, tpg->se_tpg.se_tpg_tfo, + tpg->tpg_nexus->tvn_se_sess, 0, DMA_NONE, MSG_SIMPLE_TAG, + &pending_req->sense_buffer[0]); + + rc = core_tmr_alloc_req(se_cmd, tmr, act, GFP_KERNEL); + if (rc < 0) + goto out; + + se_cmd->se_tmr_req->ref_task_tag = tag; + + if (transport_lookup_tmr_lun(se_cmd, pending_req->v2p->lun) < 0) + goto out; + + transport_generic_handle_tmr(se_cmd); + wait_event(tmr->tmr_wait, atomic_read(&tmr->tmr_complete)); + + err = (se_cmd->se_tmr_req->response == TMR_FUNCTION_COMPLETE) ? + SUCCESS : FAILED; + +out: + if (tmr) { + transport_generic_free_cmd(&pending_req->se_cmd, 1); + kfree(tmr); + } + + scsiback_do_resp_with_sense(NULL, err, 0, pending_req); + + kmem_cache_free(scsiback_cachep, pending_req); +} + +/* + Perform virtual to physical translation +*/ +static struct v2p_entry *scsiback_do_translation(struct vscsibk_info *info, + struct ids_tuple *v) +{ + struct v2p_entry *entry; + struct list_head *head = &(info->v2p_entry_lists); + unsigned long flags; + + spin_lock_irqsave(&info->v2p_lock, flags); + list_for_each_entry(entry, head, l) { + if ((entry->v.chn == v->chn) && + (entry->v.tgt == v->tgt) && + (entry->v.lun == v->lun)) { + kref_get(&entry->kref); + goto out; + } + } + entry = NULL; + +out: + spin_unlock_irqrestore(&info->v2p_lock, flags); + return entry; +} + +static int prepare_pending_reqs(struct vscsibk_info *info, + struct vscsiif_request *ring_req, + struct vscsibk_pend *pending_req) +{ + struct v2p_entry *v2p; + struct ids_tuple vir; + + pending_req->rqid = ring_req->rqid; + pending_req->info = info; + + vir.chn = ring_req->channel; + vir.tgt = ring_req->id; + vir.lun = ring_req->lun; + + v2p = scsiback_do_translation(info, &vir); + if (!v2p) { + pending_req->v2p = NULL; + DPRINTK("xen-pvscsi: doesn't exist.\n"); + return -ENODEV; + } + pending_req->v2p = v2p; + + /* request range check from frontend */ + pending_req->sc_data_direction = ring_req->sc_data_direction; + if ((pending_req->sc_data_direction != DMA_BIDIRECTIONAL) && + (pending_req->sc_data_direction != DMA_TO_DEVICE) && + (pending_req->sc_data_direction != DMA_FROM_DEVICE) && + (pending_req->sc_data_direction != DMA_NONE)) { + DPRINTK("xen-pvscsi: invalid parameter data_dir = %d\n", + pending_req->sc_data_direction); + return -EINVAL; + } + + pending_req->cmd_len = ring_req->cmd_len; + if (pending_req->cmd_len > VSCSIIF_MAX_COMMAND_SIZE) { + DPRINTK("xen-pvscsi: invalid parameter cmd_len = %d\n", + pending_req->cmd_len); + return -EINVAL; + } + memcpy(pending_req->cmnd, ring_req->cmnd, pending_req->cmd_len); + + return 0; +} + +static int scsiback_do_cmd_fn(struct vscsibk_info *info) +{ + struct vscsiif_back_ring *ring = &info->ring; + struct vscsiif_request *ring_req; + struct vscsibk_pend *pending_req; + RING_IDX rc, rp; + int err, more_to_do; + uint32_t result; + uint8_t act; + + rc = ring->req_cons; + rp = ring->sring->req_prod; + rmb(); /* guest system is accessing ring, too */ + + if (RING_REQUEST_PROD_OVERFLOW(ring, rp)) { + rc = ring->rsp_prod_pvt; + pr_warn("xen-pvscsi: Dom%d provided bogus ring requests (%#x - %#x = %u). Halting ring processing\n", + info->domid, rp, rc, rp - rc); + info->ring_error = 1; + return 0; + } + + while ((rc != rp)) { + if (RING_REQUEST_CONS_OVERFLOW(ring, rc)) + break; + pending_req = kmem_cache_alloc(scsiback_cachep, GFP_KERNEL); + if (!pending_req) + return 1; + + ring_req = RING_GET_REQUEST(ring, rc); + ring->req_cons = ++rc; + + act = ring_req->act; + err = prepare_pending_reqs(info, ring_req, pending_req); + if (err) { + switch (err) { + case -ENODEV: + result = DID_NO_CONNECT; + break; + default: + result = DRIVER_ERROR; + break; + } + scsiback_do_resp_with_sense(NULL, result << 24, 0, + pending_req); + kmem_cache_free(scsiback_cachep, pending_req); + return 1; + } + + switch (act) { + case VSCSIIF_ACT_SCSI_CDB: + if (scsiback_gnttab_data_map(ring_req, pending_req)) { + scsiback_fast_flush_area(pending_req); + scsiback_do_resp_with_sense(NULL, + DRIVER_ERROR << 24, 0, pending_req); + kmem_cache_free(scsiback_cachep, pending_req); + } else { + scsiback_cmd_exec(pending_req); + } + break; + case VSCSIIF_ACT_SCSI_ABORT: + scsiback_device_action(pending_req, TMR_ABORT_TASK, + ring_req->ref_rqid); + break; + case VSCSIIF_ACT_SCSI_RESET: + scsiback_device_action(pending_req, TMR_LUN_RESET, 0); + break; + default: + pr_err_ratelimited("xen-pvscsi: invalid request\n"); + scsiback_do_resp_with_sense(NULL, DRIVER_ERROR << 24, + 0, pending_req); + kmem_cache_free(scsiback_cachep, pending_req); + break; + } + + /* Yield point for this unbounded loop. */ + cond_resched(); + } + + RING_FINAL_CHECK_FOR_REQUESTS(&info->ring, more_to_do); + return more_to_do; +} + +static irqreturn_t scsiback_irq_fn(int irq, void *dev_id) +{ + struct vscsibk_info *info = dev_id; + + if (info->ring_error) + return IRQ_HANDLED; + + while (scsiback_do_cmd_fn(info)) + cond_resched(); + + return IRQ_HANDLED; +} + +static int scsiback_init_sring(struct vscsibk_info *info, grant_ref_t ring_ref, + evtchn_port_t evtchn) +{ + void *area; + struct vscsiif_sring *sring; + int err; + + if (info->irq) + return -1; + + err = xenbus_map_ring_valloc(info->dev, ring_ref, &area); + if (err) + return err; + + sring = (struct vscsiif_sring *)area; + BACK_RING_INIT(&info->ring, sring, PAGE_SIZE); + + err = bind_interdomain_evtchn_to_irq(info->domid, evtchn); + if (err < 0) + goto unmap_page; + + info->irq = err; + + err = request_threaded_irq(info->irq, NULL, scsiback_irq_fn, + IRQF_ONESHOT, "vscsiif-backend", info); + if (err) + goto free_irq; + + return 0; + +free_irq: + unbind_from_irqhandler(info->irq, info); + info->irq = 0; +unmap_page: + xenbus_unmap_ring_vfree(info->dev, area); + + return err; +} + +static int scsiback_map(struct vscsibk_info *info) +{ + struct xenbus_device *dev = info->dev; + unsigned int ring_ref, evtchn; + int err; + + err = xenbus_gather(XBT_NIL, dev->otherend, + "ring-ref", "%u", &ring_ref, + "event-channel", "%u", &evtchn, NULL); + if (err) { + xenbus_dev_fatal(dev, err, "reading %s ring", dev->otherend); + return err; + } + + return scsiback_init_sring(info, ring_ref, evtchn); +} + +/* + Add a new translation entry +*/ +static int scsiback_add_translation_entry(struct vscsibk_info *info, + char *phy, struct ids_tuple *v) +{ + int err = 0; + struct v2p_entry *entry; + struct v2p_entry *new; + struct list_head *head = &(info->v2p_entry_lists); + unsigned long flags; + char *lunp; + unsigned int lun; + struct scsiback_tpg *tpg_entry, *tpg = NULL; + char *error = "doesn't exist"; + + lunp = strrchr(phy, ':'); + if (!lunp) { + pr_err("xen-pvscsi: illegal format of physical device %s\n", + phy); + return -EINVAL; + } + *lunp = 0; + lunp++; + if (kstrtouint(lunp, 10, &lun) || lun >= TRANSPORT_MAX_LUNS_PER_TPG) { + pr_err("xen-pvscsi: lun number not valid: %s\n", lunp); + return -EINVAL; + } + + mutex_lock(&scsiback_mutex); + list_for_each_entry(tpg_entry, &scsiback_list, tv_tpg_list) { + if (!strcmp(phy, tpg_entry->tport->tport_name) || + !strcmp(phy, tpg_entry->param_alias)) { + spin_lock(&tpg_entry->se_tpg.tpg_lun_lock); + if (tpg_entry->se_tpg.tpg_lun_list[lun]->lun_status == + TRANSPORT_LUN_STATUS_ACTIVE) { + if (!tpg_entry->tpg_nexus) + error = "nexus undefined"; + else + tpg = tpg_entry; + } + spin_unlock(&tpg_entry->se_tpg.tpg_lun_lock); + break; + } + } + if (tpg) { + mutex_lock(&tpg->tv_tpg_mutex); + tpg->tv_tpg_fe_count++; + mutex_unlock(&tpg->tv_tpg_mutex); + } + mutex_unlock(&scsiback_mutex); + + if (!tpg) { + pr_err("xen-pvscsi: %s:%d %s\n", phy, lun, error); + return -ENODEV; + } + + new = kmalloc(sizeof(struct v2p_entry), GFP_KERNEL); + if (new == NULL) { + err = -ENOMEM; + goto out_free; + } + + spin_lock_irqsave(&info->v2p_lock, flags); + + /* Check double assignment to identical virtual ID */ + list_for_each_entry(entry, head, l) { + if ((entry->v.chn == v->chn) && + (entry->v.tgt == v->tgt) && + (entry->v.lun == v->lun)) { + pr_warn("xen-pvscsi: Virtual ID is already used. Assignment was not performed.\n"); + err = -EEXIST; + goto out; + } + + } + + /* Create a new translation entry and add to the list */ + kref_init(&new->kref); + new->v = *v; + new->tpg = tpg; + new->lun = lun; + list_add_tail(&new->l, head); + +out: + spin_unlock_irqrestore(&info->v2p_lock, flags); + +out_free: + mutex_lock(&tpg->tv_tpg_mutex); + tpg->tv_tpg_fe_count--; + mutex_unlock(&tpg->tv_tpg_mutex); + + if (err) + kfree(new); + + return err; +} + +static void __scsiback_del_translation_entry(struct v2p_entry *entry) +{ + list_del(&entry->l); + kref_put(&entry->kref, scsiback_free_translation_entry); +} + +/* + Delete the translation entry specfied +*/ +static int scsiback_del_translation_entry(struct vscsibk_info *info, + struct ids_tuple *v) +{ + struct v2p_entry *entry; + struct list_head *head = &(info->v2p_entry_lists); + unsigned long flags; + + spin_lock_irqsave(&info->v2p_lock, flags); + /* Find out the translation entry specified */ + list_for_each_entry(entry, head, l) { + if ((entry->v.chn == v->chn) && + (entry->v.tgt == v->tgt) && + (entry->v.lun == v->lun)) { + goto found; + } + } + + spin_unlock_irqrestore(&info->v2p_lock, flags); + return 1; + +found: + /* Delete the translation entry specfied */ + __scsiback_del_translation_entry(entry); + + spin_unlock_irqrestore(&info->v2p_lock, flags); + return 0; +} + +static void scsiback_do_add_lun(struct vscsibk_info *info, const char *state, + char *phy, struct ids_tuple *vir) +{ + if (!scsiback_add_translation_entry(info, phy, vir)) { + if (xenbus_printf(XBT_NIL, info->dev->nodename, state, + "%d", XenbusStateInitialised)) { + pr_err("xen-pvscsi: xenbus_printf error %s\n", state); + scsiback_del_translation_entry(info, vir); + } + } else { + xenbus_printf(XBT_NIL, info->dev->nodename, state, + "%d", XenbusStateClosed); + } +} + +static void scsiback_do_del_lun(struct vscsibk_info *info, const char *state, + struct ids_tuple *vir) +{ + if (!scsiback_del_translation_entry(info, vir)) { + if (xenbus_printf(XBT_NIL, info->dev->nodename, state, + "%d", XenbusStateClosed)) + pr_err("xen-pvscsi: xenbus_printf error %s\n", state); + } +} + +#define VSCSIBACK_OP_ADD_OR_DEL_LUN 1 +#define VSCSIBACK_OP_UPDATEDEV_STATE 2 + +static void scsiback_do_1lun_hotplug(struct vscsibk_info *info, int op, + char *ent) +{ + int err; + struct ids_tuple vir; + char *val; + int device_state; + char phy[VSCSI_NAMELEN]; + char str[64]; + char state[64]; + struct xenbus_device *dev = info->dev; + + /* read status */ + snprintf(state, sizeof(state), "vscsi-devs/%s/state", ent); + err = xenbus_scanf(XBT_NIL, dev->nodename, state, "%u", &device_state); + if (XENBUS_EXIST_ERR(err)) + return; + + /* physical SCSI device */ + snprintf(str, sizeof(str), "vscsi-devs/%s/p-dev", ent); + val = xenbus_read(XBT_NIL, dev->nodename, str, NULL); + if (IS_ERR(val)) { + xenbus_printf(XBT_NIL, dev->nodename, state, + "%d", XenbusStateClosed); + return; + } + strlcpy(phy, val, VSCSI_NAMELEN); + kfree(val); + + /* virtual SCSI device */ + snprintf(str, sizeof(str), "vscsi-devs/%s/v-dev", ent); + err = xenbus_scanf(XBT_NIL, dev->nodename, str, "%u:%u:%u:%u", + &vir.hst, &vir.chn, &vir.tgt, &vir.lun); + if (XENBUS_EXIST_ERR(err)) { + xenbus_printf(XBT_NIL, dev->nodename, state, + "%d", XenbusStateClosed); + return; + } + + switch (op) { + case VSCSIBACK_OP_ADD_OR_DEL_LUN: + if (device_state == XenbusStateInitialising) + scsiback_do_add_lun(info, state, phy, &vir); + if (device_state == XenbusStateClosing) + scsiback_do_del_lun(info, state, &vir); + break; + + case VSCSIBACK_OP_UPDATEDEV_STATE: + if (device_state == XenbusStateInitialised) { + /* modify vscsi-devs/dev-x/state */ + if (xenbus_printf(XBT_NIL, dev->nodename, state, + "%d", XenbusStateConnected)) { + pr_err("xen-pvscsi: xenbus_printf error %s\n", + str); + scsiback_del_translation_entry(info, &vir); + xenbus_printf(XBT_NIL, dev->nodename, state, + "%d", XenbusStateClosed); + } + } + break; + /*When it is necessary, processing is added here.*/ + default: + break; + } +} + +static void scsiback_do_lun_hotplug(struct vscsibk_info *info, int op) +{ + int i; + char **dir; + unsigned int ndir = 0; + + dir = xenbus_directory(XBT_NIL, info->dev->nodename, "vscsi-devs", + &ndir); + if (IS_ERR(dir)) + return; + + for (i = 0; i < ndir; i++) + scsiback_do_1lun_hotplug(info, op, dir[i]); + + kfree(dir); +} + +static void scsiback_frontend_changed(struct xenbus_device *dev, + enum xenbus_state frontend_state) +{ + struct vscsibk_info *info = dev_get_drvdata(&dev->dev); + + switch (frontend_state) { + case XenbusStateInitialising: + break; + + case XenbusStateInitialised: + if (scsiback_map(info)) + break; + + scsiback_do_lun_hotplug(info, VSCSIBACK_OP_ADD_OR_DEL_LUN); + xenbus_switch_state(dev, XenbusStateConnected); + break; + + case XenbusStateConnected: + scsiback_do_lun_hotplug(info, VSCSIBACK_OP_UPDATEDEV_STATE); + + if (dev->state == XenbusStateConnected) + break; + + xenbus_switch_state(dev, XenbusStateConnected); + break; + + case XenbusStateClosing: + if (info->irq) + scsiback_disconnect(info); + + xenbus_switch_state(dev, XenbusStateClosing); + break; + + case XenbusStateClosed: + xenbus_switch_state(dev, XenbusStateClosed); + if (xenbus_dev_is_online(dev)) + break; + /* fall through if not online */ + case XenbusStateUnknown: + device_unregister(&dev->dev); + break; + + case XenbusStateReconfiguring: + scsiback_do_lun_hotplug(info, VSCSIBACK_OP_ADD_OR_DEL_LUN); + xenbus_switch_state(dev, XenbusStateReconfigured); + + break; + + default: + xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend", + frontend_state); + break; + } +} + +/* + Release the translation entry specfied +*/ +static void scsiback_release_translation_entry(struct vscsibk_info *info) +{ + struct v2p_entry *entry, *tmp; + struct list_head *head = &(info->v2p_entry_lists); + unsigned long flags; + + spin_lock_irqsave(&info->v2p_lock, flags); + + list_for_each_entry_safe(entry, tmp, head, l) + __scsiback_del_translation_entry(entry); + + spin_unlock_irqrestore(&info->v2p_lock, flags); +} + +static int scsiback_remove(struct xenbus_device *dev) +{ + struct vscsibk_info *info = dev_get_drvdata(&dev->dev); + + if (info->irq) + scsiback_disconnect(info); + + scsiback_release_translation_entry(info); + + dev_set_drvdata(&dev->dev, NULL); + + return 0; +} + +static int scsiback_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err; + + struct vscsibk_info *info = kzalloc(sizeof(struct vscsibk_info), + GFP_KERNEL); + + DPRINTK("%p %d\n", dev, dev->otherend_id); + + if (!info) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating backend structure"); + return -ENOMEM; + } + info->dev = dev; + dev_set_drvdata(&dev->dev, info); + + info->domid = dev->otherend_id; + spin_lock_init(&info->ring_lock); + info->ring_error = 0; + atomic_set(&info->nr_unreplied_reqs, 0); + init_waitqueue_head(&info->waiting_to_free); + info->dev = dev; + info->irq = 0; + INIT_LIST_HEAD(&info->v2p_entry_lists); + spin_lock_init(&info->v2p_lock); + + err = xenbus_printf(XBT_NIL, dev->nodename, "feature-sg-grant", "%u", + SG_ALL); + if (err) + xenbus_dev_error(dev, err, "writing feature-sg-grant"); + + err = xenbus_switch_state(dev, XenbusStateInitWait); + if (err) + goto fail; + + return 0; + +fail: + pr_warn("xen-pvscsi: %s failed\n", __func__); + scsiback_remove(dev); + + return err; +} + +static char *scsiback_dump_proto_id(struct scsiback_tport *tport) +{ + switch (tport->tport_proto_id) { + case SCSI_PROTOCOL_SAS: + return "SAS"; + case SCSI_PROTOCOL_FCP: + return "FCP"; + case SCSI_PROTOCOL_ISCSI: + return "iSCSI"; + default: + break; + } + + return "Unknown"; +} + +static u8 scsiback_get_fabric_proto_ident(struct se_portal_group *se_tpg) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_tport *tport = tpg->tport; + + switch (tport->tport_proto_id) { + case SCSI_PROTOCOL_SAS: + return sas_get_fabric_proto_ident(se_tpg); + case SCSI_PROTOCOL_FCP: + return fc_get_fabric_proto_ident(se_tpg); + case SCSI_PROTOCOL_ISCSI: + return iscsi_get_fabric_proto_ident(se_tpg); + default: + pr_err("Unknown tport_proto_id: 0x%02x, using SAS emulation\n", + tport->tport_proto_id); + break; + } + + return sas_get_fabric_proto_ident(se_tpg); +} + +static char *scsiback_get_fabric_wwn(struct se_portal_group *se_tpg) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_tport *tport = tpg->tport; + + return &tport->tport_name[0]; +} + +static u16 scsiback_get_tag(struct se_portal_group *se_tpg) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + return tpg->tport_tpgt; +} + +static u32 scsiback_get_default_depth(struct se_portal_group *se_tpg) +{ + return 1; +} + +static u32 +scsiback_get_pr_transport_id(struct se_portal_group *se_tpg, + struct se_node_acl *se_nacl, + struct t10_pr_registration *pr_reg, + int *format_code, + unsigned char *buf) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_tport *tport = tpg->tport; + + switch (tport->tport_proto_id) { + case SCSI_PROTOCOL_SAS: + return sas_get_pr_transport_id(se_tpg, se_nacl, pr_reg, + format_code, buf); + case SCSI_PROTOCOL_FCP: + return fc_get_pr_transport_id(se_tpg, se_nacl, pr_reg, + format_code, buf); + case SCSI_PROTOCOL_ISCSI: + return iscsi_get_pr_transport_id(se_tpg, se_nacl, pr_reg, + format_code, buf); + default: + pr_err("Unknown tport_proto_id: 0x%02x, using SAS emulation\n", + tport->tport_proto_id); + break; + } + + return sas_get_pr_transport_id(se_tpg, se_nacl, pr_reg, + format_code, buf); +} + +static u32 +scsiback_get_pr_transport_id_len(struct se_portal_group *se_tpg, + struct se_node_acl *se_nacl, + struct t10_pr_registration *pr_reg, + int *format_code) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_tport *tport = tpg->tport; + + switch (tport->tport_proto_id) { + case SCSI_PROTOCOL_SAS: + return sas_get_pr_transport_id_len(se_tpg, se_nacl, pr_reg, + format_code); + case SCSI_PROTOCOL_FCP: + return fc_get_pr_transport_id_len(se_tpg, se_nacl, pr_reg, + format_code); + case SCSI_PROTOCOL_ISCSI: + return iscsi_get_pr_transport_id_len(se_tpg, se_nacl, pr_reg, + format_code); + default: + pr_err("Unknown tport_proto_id: 0x%02x, using SAS emulation\n", + tport->tport_proto_id); + break; + } + + return sas_get_pr_transport_id_len(se_tpg, se_nacl, pr_reg, + format_code); +} + +static char * +scsiback_parse_pr_out_transport_id(struct se_portal_group *se_tpg, + const char *buf, + u32 *out_tid_len, + char **port_nexus_ptr) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_tport *tport = tpg->tport; + + switch (tport->tport_proto_id) { + case SCSI_PROTOCOL_SAS: + return sas_parse_pr_out_transport_id(se_tpg, buf, out_tid_len, + port_nexus_ptr); + case SCSI_PROTOCOL_FCP: + return fc_parse_pr_out_transport_id(se_tpg, buf, out_tid_len, + port_nexus_ptr); + case SCSI_PROTOCOL_ISCSI: + return iscsi_parse_pr_out_transport_id(se_tpg, buf, out_tid_len, + port_nexus_ptr); + default: + pr_err("Unknown tport_proto_id: 0x%02x, using SAS emulation\n", + tport->tport_proto_id); + break; + } + + return sas_parse_pr_out_transport_id(se_tpg, buf, out_tid_len, + port_nexus_ptr); +} + +static struct se_wwn * +scsiback_make_tport(struct target_fabric_configfs *tf, + struct config_group *group, + const char *name) +{ + struct scsiback_tport *tport; + char *ptr; + u64 wwpn = 0; + int off = 0; + + tport = kzalloc(sizeof(struct scsiback_tport), GFP_KERNEL); + if (!tport) + return ERR_PTR(-ENOMEM); + + tport->tport_wwpn = wwpn; + /* + * Determine the emulated Protocol Identifier and Target Port Name + * based on the incoming configfs directory name. + */ + ptr = strstr(name, "naa."); + if (ptr) { + tport->tport_proto_id = SCSI_PROTOCOL_SAS; + goto check_len; + } + ptr = strstr(name, "fc."); + if (ptr) { + tport->tport_proto_id = SCSI_PROTOCOL_FCP; + off = 3; /* Skip over "fc." */ + goto check_len; + } + ptr = strstr(name, "iqn."); + if (ptr) { + tport->tport_proto_id = SCSI_PROTOCOL_ISCSI; + goto check_len; + } + + pr_err("Unable to locate prefix for emulated Target Port: %s\n", name); + kfree(tport); + return ERR_PTR(-EINVAL); + +check_len: + if (strlen(name) >= VSCSI_NAMELEN) { + pr_err("Emulated %s Address: %s, exceeds max: %d\n", name, + scsiback_dump_proto_id(tport), VSCSI_NAMELEN); + kfree(tport); + return ERR_PTR(-EINVAL); + } + snprintf(&tport->tport_name[0], VSCSI_NAMELEN, "%s", &name[off]); + + pr_debug("xen-pvscsi: Allocated emulated Target %s Address: %s\n", + scsiback_dump_proto_id(tport), name); + + return &tport->tport_wwn; +} + +static void scsiback_drop_tport(struct se_wwn *wwn) +{ + struct scsiback_tport *tport = container_of(wwn, + struct scsiback_tport, tport_wwn); + + pr_debug("xen-pvscsi: Deallocating emulated Target %s Address: %s\n", + scsiback_dump_proto_id(tport), tport->tport_name); + + kfree(tport); +} + +static struct se_node_acl * +scsiback_alloc_fabric_acl(struct se_portal_group *se_tpg) +{ + return kzalloc(sizeof(struct se_node_acl), GFP_KERNEL); +} + +static void +scsiback_release_fabric_acl(struct se_portal_group *se_tpg, + struct se_node_acl *se_nacl) +{ + kfree(se_nacl); +} + +static u32 scsiback_tpg_get_inst_index(struct se_portal_group *se_tpg) +{ + return 1; +} + +static int scsiback_check_stop_free(struct se_cmd *se_cmd) +{ + /* + * Do not release struct se_cmd's containing a valid TMR + * pointer. These will be released directly in scsiback_device_action() + * with transport_generic_free_cmd(). + */ + if (se_cmd->se_cmd_flags & SCF_SCSI_TMR_CDB) + return 0; + + transport_generic_free_cmd(se_cmd, 0); + return 1; +} + +static void scsiback_release_cmd(struct se_cmd *se_cmd) +{ + struct vscsibk_pend *pending_req = container_of(se_cmd, + struct vscsibk_pend, se_cmd); + + kmem_cache_free(scsiback_cachep, pending_req); +} + +static int scsiback_shutdown_session(struct se_session *se_sess) +{ + return 0; +} + +static void scsiback_close_session(struct se_session *se_sess) +{ +} + +static u32 scsiback_sess_get_index(struct se_session *se_sess) +{ + return 0; +} + +static int scsiback_write_pending(struct se_cmd *se_cmd) +{ + /* Go ahead and process the write immediately */ + target_execute_cmd(se_cmd); + + return 0; +} + +static int scsiback_write_pending_status(struct se_cmd *se_cmd) +{ + return 0; +} + +static void scsiback_set_default_node_attrs(struct se_node_acl *nacl) +{ +} + +static u32 scsiback_get_task_tag(struct se_cmd *se_cmd) +{ + struct vscsibk_pend *pending_req = container_of(se_cmd, + struct vscsibk_pend, se_cmd); + + return pending_req->rqid; +} + +static int scsiback_get_cmd_state(struct se_cmd *se_cmd) +{ + return 0; +} + +static int scsiback_queue_data_in(struct se_cmd *se_cmd) +{ + struct vscsibk_pend *pending_req = container_of(se_cmd, + struct vscsibk_pend, se_cmd); + + pending_req->result = SAM_STAT_GOOD; + scsiback_cmd_done(pending_req); + return 0; +} + +static int scsiback_queue_status(struct se_cmd *se_cmd) +{ + struct vscsibk_pend *pending_req = container_of(se_cmd, + struct vscsibk_pend, se_cmd); + + if (se_cmd->sense_buffer && + ((se_cmd->se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) || + (se_cmd->se_cmd_flags & SCF_EMULATED_TASK_SENSE))) + pending_req->result = (DRIVER_SENSE << 24) | + SAM_STAT_CHECK_CONDITION; + else + pending_req->result = se_cmd->scsi_status; + + scsiback_cmd_done(pending_req); + return 0; +} + +static void scsiback_queue_tm_rsp(struct se_cmd *se_cmd) +{ + struct se_tmr_req *se_tmr = se_cmd->se_tmr_req; + struct scsiback_tmr *tmr = se_tmr->fabric_tmr_ptr; + + atomic_set(&tmr->tmr_complete, 1); + wake_up(&tmr->tmr_wait); +} + +static void scsiback_aborted_task(struct se_cmd *se_cmd) +{ +} + +static ssize_t scsiback_tpg_param_show_alias(struct se_portal_group *se_tpg, + char *page) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, struct scsiback_tpg, + se_tpg); + ssize_t rb; + + mutex_lock(&tpg->tv_tpg_mutex); + rb = snprintf(page, PAGE_SIZE, "%s\n", tpg->param_alias); + mutex_unlock(&tpg->tv_tpg_mutex); + + return rb; +} + +static ssize_t scsiback_tpg_param_store_alias(struct se_portal_group *se_tpg, + const char *page, size_t count) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, struct scsiback_tpg, + se_tpg); + int len; + + if (strlen(page) >= VSCSI_NAMELEN) { + pr_err("param alias: %s, exceeds max: %d\n", page, + VSCSI_NAMELEN); + return -EINVAL; + } + + mutex_lock(&tpg->tv_tpg_mutex); + len = snprintf(tpg->param_alias, VSCSI_NAMELEN, "%s", page); + if (tpg->param_alias[len - 1] == '\n') + tpg->param_alias[len - 1] = '\0'; + mutex_unlock(&tpg->tv_tpg_mutex); + + return count; +} + +TF_TPG_PARAM_ATTR(scsiback, alias, S_IRUGO | S_IWUSR); + +static struct configfs_attribute *scsiback_param_attrs[] = { + &scsiback_tpg_param_alias.attr, + NULL, +}; + +static int scsiback_make_nexus(struct scsiback_tpg *tpg, + const char *name) +{ + struct se_portal_group *se_tpg; + struct se_session *se_sess; + struct scsiback_nexus *tv_nexus; + + mutex_lock(&tpg->tv_tpg_mutex); + if (tpg->tpg_nexus) { + mutex_unlock(&tpg->tv_tpg_mutex); + pr_debug("tpg->tpg_nexus already exists\n"); + return -EEXIST; + } + se_tpg = &tpg->se_tpg; + + tv_nexus = kzalloc(sizeof(struct scsiback_nexus), GFP_KERNEL); + if (!tv_nexus) { + mutex_unlock(&tpg->tv_tpg_mutex); + return -ENOMEM; + } + /* + * Initialize the struct se_session pointer + */ + tv_nexus->tvn_se_sess = transport_init_session(TARGET_PROT_NORMAL); + if (IS_ERR(tv_nexus->tvn_se_sess)) { + mutex_unlock(&tpg->tv_tpg_mutex); + kfree(tv_nexus); + return -ENOMEM; + } + se_sess = tv_nexus->tvn_se_sess; + /* + * Since we are running in 'demo mode' this call with generate a + * struct se_node_acl for the scsiback struct se_portal_group with + * the SCSI Initiator port name of the passed configfs group 'name'. + */ + tv_nexus->tvn_se_sess->se_node_acl = core_tpg_check_initiator_node_acl( + se_tpg, (unsigned char *)name); + if (!tv_nexus->tvn_se_sess->se_node_acl) { + mutex_unlock(&tpg->tv_tpg_mutex); + pr_debug("core_tpg_check_initiator_node_acl() failed for %s\n", + name); + goto out; + } + /* + * Now register the TCM pvscsi virtual I_T Nexus as active with the + * call to __transport_register_session() + */ + __transport_register_session(se_tpg, tv_nexus->tvn_se_sess->se_node_acl, + tv_nexus->tvn_se_sess, tv_nexus); + tpg->tpg_nexus = tv_nexus; + + mutex_unlock(&tpg->tv_tpg_mutex); + return 0; + +out: + transport_free_session(se_sess); + kfree(tv_nexus); + return -ENOMEM; +} + +static int scsiback_drop_nexus(struct scsiback_tpg *tpg) +{ + struct se_session *se_sess; + struct scsiback_nexus *tv_nexus; + + mutex_lock(&tpg->tv_tpg_mutex); + tv_nexus = tpg->tpg_nexus; + if (!tv_nexus) { + mutex_unlock(&tpg->tv_tpg_mutex); + return -ENODEV; + } + + se_sess = tv_nexus->tvn_se_sess; + if (!se_sess) { + mutex_unlock(&tpg->tv_tpg_mutex); + return -ENODEV; + } + + if (tpg->tv_tpg_port_count != 0) { + mutex_unlock(&tpg->tv_tpg_mutex); + pr_err("Unable to remove xen-pvscsi I_T Nexus with active TPG port count: %d\n", + tpg->tv_tpg_port_count); + return -EBUSY; + } + + if (tpg->tv_tpg_fe_count != 0) { + mutex_unlock(&tpg->tv_tpg_mutex); + pr_err("Unable to remove xen-pvscsi I_T Nexus with active TPG frontend count: %d\n", + tpg->tv_tpg_fe_count); + return -EBUSY; + } + + pr_debug("xen-pvscsi: Removing I_T Nexus to emulated %s Initiator Port: %s\n", + scsiback_dump_proto_id(tpg->tport), + tv_nexus->tvn_se_sess->se_node_acl->initiatorname); + + /* + * Release the SCSI I_T Nexus to the emulated xen-pvscsi Target Port + */ + transport_deregister_session(tv_nexus->tvn_se_sess); + tpg->tpg_nexus = NULL; + mutex_unlock(&tpg->tv_tpg_mutex); + + kfree(tv_nexus); + return 0; +} + +static ssize_t scsiback_tpg_show_nexus(struct se_portal_group *se_tpg, + char *page) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_nexus *tv_nexus; + ssize_t ret; + + mutex_lock(&tpg->tv_tpg_mutex); + tv_nexus = tpg->tpg_nexus; + if (!tv_nexus) { + mutex_unlock(&tpg->tv_tpg_mutex); + return -ENODEV; + } + ret = snprintf(page, PAGE_SIZE, "%s\n", + tv_nexus->tvn_se_sess->se_node_acl->initiatorname); + mutex_unlock(&tpg->tv_tpg_mutex); + + return ret; +} + +static ssize_t scsiback_tpg_store_nexus(struct se_portal_group *se_tpg, + const char *page, + size_t count) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_tport *tport_wwn = tpg->tport; + unsigned char i_port[VSCSI_NAMELEN], *ptr, *port_ptr; + int ret; + /* + * Shutdown the active I_T nexus if 'NULL' is passed.. + */ + if (!strncmp(page, "NULL", 4)) { + ret = scsiback_drop_nexus(tpg); + return (!ret) ? count : ret; + } + /* + * Otherwise make sure the passed virtual Initiator port WWN matches + * the fabric protocol_id set in scsiback_make_tport(), and call + * scsiback_make_nexus(). + */ + if (strlen(page) >= VSCSI_NAMELEN) { + pr_err("Emulated NAA Sas Address: %s, exceeds max: %d\n", + page, VSCSI_NAMELEN); + return -EINVAL; + } + snprintf(&i_port[0], VSCSI_NAMELEN, "%s", page); + + ptr = strstr(i_port, "naa."); + if (ptr) { + if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_SAS) { + pr_err("Passed SAS Initiator Port %s does not match target port protoid: %s\n", + i_port, scsiback_dump_proto_id(tport_wwn)); + return -EINVAL; + } + port_ptr = &i_port[0]; + goto check_newline; + } + ptr = strstr(i_port, "fc."); + if (ptr) { + if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_FCP) { + pr_err("Passed FCP Initiator Port %s does not match target port protoid: %s\n", + i_port, scsiback_dump_proto_id(tport_wwn)); + return -EINVAL; + } + port_ptr = &i_port[3]; /* Skip over "fc." */ + goto check_newline; + } + ptr = strstr(i_port, "iqn."); + if (ptr) { + if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_ISCSI) { + pr_err("Passed iSCSI Initiator Port %s does not match target port protoid: %s\n", + i_port, scsiback_dump_proto_id(tport_wwn)); + return -EINVAL; + } + port_ptr = &i_port[0]; + goto check_newline; + } + pr_err("Unable to locate prefix for emulated Initiator Port: %s\n", + i_port); + return -EINVAL; + /* + * Clear any trailing newline for the NAA WWN + */ +check_newline: + if (i_port[strlen(i_port) - 1] == '\n') + i_port[strlen(i_port) - 1] = '\0'; + + ret = scsiback_make_nexus(tpg, port_ptr); + if (ret < 0) + return ret; + + return count; +} + +TF_TPG_BASE_ATTR(scsiback, nexus, S_IRUGO | S_IWUSR); + +static struct configfs_attribute *scsiback_tpg_attrs[] = { + &scsiback_tpg_nexus.attr, + NULL, +}; + +static ssize_t +scsiback_wwn_show_attr_version(struct target_fabric_configfs *tf, + char *page) +{ + return sprintf(page, "xen-pvscsi fabric module %s on %s/%s on " + UTS_RELEASE"\n", + VSCSI_VERSION, utsname()->sysname, utsname()->machine); +} + +TF_WWN_ATTR_RO(scsiback, version); + +static struct configfs_attribute *scsiback_wwn_attrs[] = { + &scsiback_wwn_version.attr, + NULL, +}; + +static char *scsiback_get_fabric_name(void) +{ + return "xen-pvscsi"; +} + +static int scsiback_port_link(struct se_portal_group *se_tpg, + struct se_lun *lun) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + + mutex_lock(&tpg->tv_tpg_mutex); + tpg->tv_tpg_port_count++; + mutex_unlock(&tpg->tv_tpg_mutex); + + return 0; +} + +static void scsiback_port_unlink(struct se_portal_group *se_tpg, + struct se_lun *lun) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + + mutex_lock(&tpg->tv_tpg_mutex); + tpg->tv_tpg_port_count--; + mutex_unlock(&tpg->tv_tpg_mutex); +} + +static struct se_portal_group * +scsiback_make_tpg(struct se_wwn *wwn, + struct config_group *group, + const char *name) +{ + struct scsiback_tport *tport = container_of(wwn, + struct scsiback_tport, tport_wwn); + + struct scsiback_tpg *tpg; + u16 tpgt; + int ret; + + if (strstr(name, "tpgt_") != name) + return ERR_PTR(-EINVAL); + ret = kstrtou16(name + 5, 10, &tpgt); + if (ret) + return ERR_PTR(ret); + + tpg = kzalloc(sizeof(struct scsiback_tpg), GFP_KERNEL); + if (!tpg) + return ERR_PTR(-ENOMEM); + + mutex_init(&tpg->tv_tpg_mutex); + INIT_LIST_HEAD(&tpg->tv_tpg_list); + INIT_LIST_HEAD(&tpg->info_list); + tpg->tport = tport; + tpg->tport_tpgt = tpgt; + + ret = core_tpg_register(&scsiback_fabric_configfs->tf_ops, wwn, + &tpg->se_tpg, tpg, TRANSPORT_TPG_TYPE_NORMAL); + if (ret < 0) { + kfree(tpg); + return NULL; + } + mutex_lock(&scsiback_mutex); + list_add_tail(&tpg->tv_tpg_list, &scsiback_list); + mutex_unlock(&scsiback_mutex); + + return &tpg->se_tpg; +} + +static void scsiback_drop_tpg(struct se_portal_group *se_tpg) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + + mutex_lock(&scsiback_mutex); + list_del(&tpg->tv_tpg_list); + mutex_unlock(&scsiback_mutex); + /* + * Release the virtual I_T Nexus for this xen-pvscsi TPG + */ + scsiback_drop_nexus(tpg); + /* + * Deregister the se_tpg from TCM.. + */ + core_tpg_deregister(se_tpg); + kfree(tpg); +} + +static int scsiback_check_true(struct se_portal_group *se_tpg) +{ + return 1; +} + +static int scsiback_check_false(struct se_portal_group *se_tpg) +{ + return 0; +} + +static struct target_core_fabric_ops scsiback_ops = { + .get_fabric_name = scsiback_get_fabric_name, + .get_fabric_proto_ident = scsiback_get_fabric_proto_ident, + .tpg_get_wwn = scsiback_get_fabric_wwn, + .tpg_get_tag = scsiback_get_tag, + .tpg_get_default_depth = scsiback_get_default_depth, + .tpg_get_pr_transport_id = scsiback_get_pr_transport_id, + .tpg_get_pr_transport_id_len = scsiback_get_pr_transport_id_len, + .tpg_parse_pr_out_transport_id = scsiback_parse_pr_out_transport_id, + .tpg_check_demo_mode = scsiback_check_true, + .tpg_check_demo_mode_cache = scsiback_check_true, + .tpg_check_demo_mode_write_protect = scsiback_check_false, + .tpg_check_prod_mode_write_protect = scsiback_check_false, + .tpg_alloc_fabric_acl = scsiback_alloc_fabric_acl, + .tpg_release_fabric_acl = scsiback_release_fabric_acl, + .tpg_get_inst_index = scsiback_tpg_get_inst_index, + .check_stop_free = scsiback_check_stop_free, + .release_cmd = scsiback_release_cmd, + .put_session = NULL, + .shutdown_session = scsiback_shutdown_session, + .close_session = scsiback_close_session, + .sess_get_index = scsiback_sess_get_index, + .sess_get_initiator_sid = NULL, + .write_pending = scsiback_write_pending, + .write_pending_status = scsiback_write_pending_status, + .set_default_node_attributes = scsiback_set_default_node_attrs, + .get_task_tag = scsiback_get_task_tag, + .get_cmd_state = scsiback_get_cmd_state, + .queue_data_in = scsiback_queue_data_in, + .queue_status = scsiback_queue_status, + .queue_tm_rsp = scsiback_queue_tm_rsp, + .aborted_task = scsiback_aborted_task, + /* + * Setup callers for generic logic in target_core_fabric_configfs.c + */ + .fabric_make_wwn = scsiback_make_tport, + .fabric_drop_wwn = scsiback_drop_tport, + .fabric_make_tpg = scsiback_make_tpg, + .fabric_drop_tpg = scsiback_drop_tpg, + .fabric_post_link = scsiback_port_link, + .fabric_pre_unlink = scsiback_port_unlink, + .fabric_make_np = NULL, + .fabric_drop_np = NULL, +#if 0 + .fabric_make_nodeacl = scsiback_make_nodeacl, + .fabric_drop_nodeacl = scsiback_drop_nodeacl, +#endif +}; + +static int scsiback_register_configfs(void) +{ + struct target_fabric_configfs *fabric; + int ret; + + pr_debug("xen-pvscsi: fabric module %s on %s/%s on "UTS_RELEASE"\n", + VSCSI_VERSION, utsname()->sysname, utsname()->machine); + /* + * Register the top level struct config_item_type with TCM core + */ + fabric = target_fabric_configfs_init(THIS_MODULE, "xen-pvscsi"); + if (IS_ERR(fabric)) + return PTR_ERR(fabric); + + /* + * Setup fabric->tf_ops from our local scsiback_ops + */ + fabric->tf_ops = scsiback_ops; + /* + * Setup default attribute lists for various fabric->tf_cit_tmpl + */ + fabric->tf_cit_tmpl.tfc_wwn_cit.ct_attrs = scsiback_wwn_attrs; + fabric->tf_cit_tmpl.tfc_tpg_base_cit.ct_attrs = scsiback_tpg_attrs; + fabric->tf_cit_tmpl.tfc_tpg_attrib_cit.ct_attrs = NULL; + fabric->tf_cit_tmpl.tfc_tpg_param_cit.ct_attrs = scsiback_param_attrs; + fabric->tf_cit_tmpl.tfc_tpg_np_base_cit.ct_attrs = NULL; + fabric->tf_cit_tmpl.tfc_tpg_nacl_base_cit.ct_attrs = NULL; + fabric->tf_cit_tmpl.tfc_tpg_nacl_attrib_cit.ct_attrs = NULL; + fabric->tf_cit_tmpl.tfc_tpg_nacl_auth_cit.ct_attrs = NULL; + fabric->tf_cit_tmpl.tfc_tpg_nacl_param_cit.ct_attrs = NULL; + /* + * Register the fabric for use within TCM + */ + ret = target_fabric_configfs_register(fabric); + if (ret < 0) { + target_fabric_configfs_free(fabric); + return ret; + } + /* + * Setup our local pointer to *fabric + */ + scsiback_fabric_configfs = fabric; + pr_debug("xen-pvscsi: Set fabric -> scsiback_fabric_configfs\n"); + return 0; +}; + +static void scsiback_deregister_configfs(void) +{ + if (!scsiback_fabric_configfs) + return; + + target_fabric_configfs_deregister(scsiback_fabric_configfs); + scsiback_fabric_configfs = NULL; + pr_debug("xen-pvscsi: Cleared scsiback_fabric_configfs\n"); +}; + +static const struct xenbus_device_id scsiback_ids[] = { + { "vscsi" }, + { "" } +}; + +static struct xenbus_driver scsiback_driver = { + .ids = scsiback_ids, + .probe = scsiback_probe, + .remove = scsiback_remove, + .otherend_changed = scsiback_frontend_changed +}; + +static void scsiback_init_pend(void *p) +{ + struct vscsibk_pend *pend = p; + int i; + + memset(pend, 0, sizeof(*pend)); + for (i = 0; i < VSCSI_MAX_GRANTS; i++) + pend->grant_handles[i] = SCSIBACK_INVALID_HANDLE; +} + +static int __init scsiback_init(void) +{ + int ret; + + if (!xen_domain()) + return -ENODEV; + + scsiback_cachep = kmem_cache_create("vscsiif_cache", + sizeof(struct vscsibk_pend), 0, 0, scsiback_init_pend); + if (!scsiback_cachep) + return -ENOMEM; + + ret = xenbus_register_backend(&scsiback_driver); + if (ret) + goto out_cache_destroy; + + ret = scsiback_register_configfs(); + if (ret) + goto out_unregister_xenbus; + + return 0; + +out_unregister_xenbus: + xenbus_unregister_driver(&scsiback_driver); +out_cache_destroy: + kmem_cache_destroy(scsiback_cachep); + pr_err("xen-pvscsi: %s: error %d\n", __func__, ret); + return ret; +} + +static void __exit scsiback_exit(void) +{ + struct page *page; + + while (free_pages_num) { + if (get_free_page(&page)) + BUG(); + free_xenballooned_pages(1, &page); + } + scsiback_deregister_configfs(); + xenbus_unregister_driver(&scsiback_driver); + kmem_cache_destroy(scsiback_cachep); +} + +module_init(scsiback_init); +module_exit(scsiback_exit); + +MODULE_DESCRIPTION("Xen SCSI backend driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS("xen-backend:vscsi"); +MODULE_AUTHOR("Juergen Gross <jgross@suse.com>"); diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 439c9dca9eee..ca744102b666 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -259,7 +259,6 @@ static char *error_path(struct xenbus_device *dev) static void xenbus_va_dev_error(struct xenbus_device *dev, int err, const char *fmt, va_list ap) { - int ret; unsigned int len; char *printf_buffer = NULL; char *path_buffer = NULL; @@ -270,9 +269,7 @@ static void xenbus_va_dev_error(struct xenbus_device *dev, int err, goto fail; len = sprintf(printf_buffer, "%i ", -err); - ret = vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap); - - BUG_ON(len + ret > PRINTF_BUFFER_SIZE-1); + vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap); dev_err(&dev->dev, "%s\n", printf_buffer); @@ -361,8 +358,8 @@ static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err, * @ring_mfn: mfn of ring to grant * Grant access to the given @ring_mfn to the peer of the given device. Return - * 0 on success, or -errno on error. On error, the device will switch to - * XenbusStateClosing, and the error will be saved in the store. + * a grant reference on success, or -errno on error. On error, the device will + * switch to XenbusStateClosing, and the error will be saved in the store. */ int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn) { diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 3c0a74b3e9b1..564b31584860 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -297,9 +297,13 @@ void xenbus_dev_shutdown(struct device *_dev) EXPORT_SYMBOL_GPL(xenbus_dev_shutdown); int xenbus_register_driver_common(struct xenbus_driver *drv, - struct xen_bus_type *bus) + struct xen_bus_type *bus, + struct module *owner, const char *mod_name) { + drv->driver.name = drv->name ? drv->name : drv->ids[0].devicetype; drv->driver.bus = &bus->bus; + drv->driver.owner = owner; + drv->driver.mod_name = mod_name; return driver_register(&drv->driver); } diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h index 1085ec294a19..c9ec7ca1f7ab 100644 --- a/drivers/xen/xenbus/xenbus_probe.h +++ b/drivers/xen/xenbus/xenbus_probe.h @@ -60,7 +60,9 @@ extern int xenbus_match(struct device *_dev, struct device_driver *_drv); extern int xenbus_dev_probe(struct device *_dev); extern int xenbus_dev_remove(struct device *_dev); extern int xenbus_register_driver_common(struct xenbus_driver *drv, - struct xen_bus_type *bus); + struct xen_bus_type *bus, + struct module *owner, + const char *mod_name); extern int xenbus_probe_node(struct xen_bus_type *bus, const char *type, const char *nodename); diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c index 5125dce11a60..04f7f85a5edf 100644 --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -234,13 +234,15 @@ int xenbus_dev_is_online(struct xenbus_device *dev) } EXPORT_SYMBOL_GPL(xenbus_dev_is_online); -int xenbus_register_backend(struct xenbus_driver *drv) +int __xenbus_register_backend(struct xenbus_driver *drv, struct module *owner, + const char *mod_name) { drv->read_otherend_details = read_frontend_details; - return xenbus_register_driver_common(drv, &xenbus_backend); + return xenbus_register_driver_common(drv, &xenbus_backend, + owner, mod_name); } -EXPORT_SYMBOL_GPL(xenbus_register_backend); +EXPORT_SYMBOL_GPL(__xenbus_register_backend); static int backend_probe_and_watch(struct notifier_block *notifier, unsigned long event, diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c index cb385c10d2b1..bcb53bdc469c 100644 --- a/drivers/xen/xenbus/xenbus_probe_frontend.c +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -317,13 +317,15 @@ static void wait_for_devices(struct xenbus_driver *xendrv) print_device_status); } -int xenbus_register_frontend(struct xenbus_driver *drv) +int __xenbus_register_frontend(struct xenbus_driver *drv, struct module *owner, + const char *mod_name) { int ret; drv->read_otherend_details = read_backend_details; - ret = xenbus_register_driver_common(drv, &xenbus_frontend); + ret = xenbus_register_driver_common(drv, &xenbus_frontend, + owner, mod_name); if (ret) return ret; @@ -332,7 +334,7 @@ int xenbus_register_frontend(struct xenbus_driver *drv) return 0; } -EXPORT_SYMBOL_GPL(xenbus_register_frontend); +EXPORT_SYMBOL_GPL(__xenbus_register_frontend); static DECLARE_WAIT_QUEUE_HEAD(backend_state_wq); static int backend_state; diff --git a/include/xen/events.h b/include/xen/events.h index 8bee7a75e850..5321cd9636e6 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -28,6 +28,8 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi, unsigned long irqflags, const char *devname, void *dev_id); +int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, + unsigned int remote_port); int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, unsigned int remote_port, irq_handler_t handler, diff --git a/include/xen/interface/elfnote.h b/include/xen/interface/elfnote.h index 6f4eae328ca7..f90b03454659 100644 --- a/include/xen/interface/elfnote.h +++ b/include/xen/interface/elfnote.h @@ -3,6 +3,24 @@ * * Definitions used for the Xen ELF notes. * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * * Copyright (c) 2006, Ian Campbell, XenSource Ltd. */ @@ -18,12 +36,13 @@ * * LEGACY indicated the fields in the legacy __xen_guest string which * this a note type replaces. + * + * String values (for non-legacy) are NULL terminated ASCII, also known + * as ASCIZ type. */ /* * NAME=VALUE pair (string). - * - * LEGACY: FEATURES and PAE */ #define XEN_ELFNOTE_INFO 0 @@ -137,10 +156,30 @@ /* * Whether or not the guest supports cooperative suspend cancellation. + * This is a numeric value. + * + * Default is 0 */ #define XEN_ELFNOTE_SUSPEND_CANCEL 14 /* + * The (non-default) location the initial phys-to-machine map should be + * placed at by the hypervisor (Dom0) or the tools (DomU). + * The kernel must be prepared for this mapping to be established using + * large pages, despite such otherwise not being available to guests. + * The kernel must also be able to handle the page table pages used for + * this mapping not being accessible through the initial mapping. + * (Only x86-64 supports this at present.) + */ +#define XEN_ELFNOTE_INIT_P2M 15 + +/* + * Whether or not the guest can deal with being passed an initrd not + * mapped through its initial page tables. + */ +#define XEN_ELFNOTE_MOD_START_PFN 16 + +/* * The features supported by this kernel (numeric). * * Other than XEN_ELFNOTE_FEATURES on pre-4.2 Xen, this note allows a @@ -153,6 +192,11 @@ */ #define XEN_ELFNOTE_SUPPORTED_FEATURES 17 +/* + * The number of the highest elfnote defined. + */ +#define XEN_ELFNOTE_MAX XEN_ELFNOTE_SUPPORTED_FEATURES + #endif /* __XEN_PUBLIC_ELFNOTE_H__ */ /* diff --git a/include/xen/interface/io/vscsiif.h b/include/xen/interface/io/vscsiif.h new file mode 100644 index 000000000000..d07d7aca8d1c --- /dev/null +++ b/include/xen/interface/io/vscsiif.h @@ -0,0 +1,229 @@ +/****************************************************************************** + * vscsiif.h + * + * Based on the blkif.h code. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright(c) FUJITSU Limited 2008. + */ + +#ifndef __XEN__PUBLIC_IO_SCSI_H__ +#define __XEN__PUBLIC_IO_SCSI_H__ + +#include "ring.h" +#include "../grant_table.h" + +/* + * Feature and Parameter Negotiation + * ================================= + * The two halves of a Xen pvSCSI driver utilize nodes within the XenStore to + * communicate capabilities and to negotiate operating parameters. This + * section enumerates these nodes which reside in the respective front and + * backend portions of the XenStore, following the XenBus convention. + * + * Any specified default value is in effect if the corresponding XenBus node + * is not present in the XenStore. + * + * XenStore nodes in sections marked "PRIVATE" are solely for use by the + * driver side whose XenBus tree contains them. + * + ***************************************************************************** + * Backend XenBus Nodes + ***************************************************************************** + * + *------------------ Backend Device Identification (PRIVATE) ------------------ + * + * p-devname + * Values: string + * + * A free string used to identify the physical device (e.g. a disk name). + * + * p-dev + * Values: string + * + * A string specifying the backend device: either a 4-tuple "h:c:t:l" + * (host, controller, target, lun, all integers), or a WWN (e.g. + * "naa.60014054ac780582"). + * + * v-dev + * Values: string + * + * A string specifying the frontend device in form of a 4-tuple "h:c:t:l" + * (host, controller, target, lun, all integers). + * + *--------------------------------- Features --------------------------------- + * + * feature-sg-grant + * Values: unsigned [VSCSIIF_SG_TABLESIZE...65535] + * Default Value: 0 + * + * Specifies the maximum number of scatter/gather elements in grant pages + * supported. If not set, the backend supports up to VSCSIIF_SG_TABLESIZE + * SG elements specified directly in the request. + * + ***************************************************************************** + * Frontend XenBus Nodes + ***************************************************************************** + * + *----------------------- Request Transport Parameters ----------------------- + * + * event-channel + * Values: unsigned + * + * The identifier of the Xen event channel used to signal activity + * in the ring buffer. + * + * ring-ref + * Values: unsigned + * + * The Xen grant reference granting permission for the backend to map + * the sole page in a single page sized ring buffer. + * + * protocol + * Values: string (XEN_IO_PROTO_ABI_*) + * Default Value: XEN_IO_PROTO_ABI_NATIVE + * + * The machine ABI rules governing the format of all ring request and + * response structures. + */ + +/* Requests from the frontend to the backend */ + +/* + * Request a SCSI operation specified via a CDB in vscsiif_request.cmnd. + * The target is specified via channel, id and lun. + * + * The operation to be performed is specified via a CDB in cmnd[], the length + * of the CDB is in cmd_len. sc_data_direction specifies the direction of data + * (to the device, from the device, or none at all). + * + * If data is to be transferred to or from the device the buffer(s) in the + * guest memory is/are specified via one or multiple scsiif_request_segment + * descriptors each specifying a memory page via a grant_ref_t, a offset into + * the page and the length of the area in that page. All scsiif_request_segment + * areas concatenated form the resulting data buffer used by the operation. + * If the number of scsiif_request_segment areas is not too large (less than + * or equal VSCSIIF_SG_TABLESIZE) the areas can be specified directly in the + * seg[] array and the number of valid scsiif_request_segment elements is to be + * set in nr_segments. + * + * If "feature-sg-grant" in the Xenstore is set it is possible to specify more + * than VSCSIIF_SG_TABLESIZE scsiif_request_segment elements via indirection. + * The maximum number of allowed scsiif_request_segment elements is the value + * of the "feature-sg-grant" entry from Xenstore. When using indirection the + * seg[] array doesn't contain specifications of the data buffers, but + * references to scsiif_request_segment arrays, which in turn reference the + * data buffers. While nr_segments holds the number of populated seg[] entries + * (plus the set VSCSIIF_SG_GRANT bit), the number of scsiif_request_segment + * elements referencing the target data buffers is calculated from the lengths + * of the seg[] elements (the sum of all valid seg[].length divided by the + * size of one scsiif_request_segment structure). + */ +#define VSCSIIF_ACT_SCSI_CDB 1 + +/* + * Request abort of a running operation for the specified target given by + * channel, id, lun and the operation's rqid in ref_rqid. + */ +#define VSCSIIF_ACT_SCSI_ABORT 2 + +/* + * Request a device reset of the specified target (channel and id). + */ +#define VSCSIIF_ACT_SCSI_RESET 3 + +/* + * Preset scatter/gather elements for a following request. Deprecated. + * Keeping the define only to avoid usage of the value "4" for other actions. + */ +#define VSCSIIF_ACT_SCSI_SG_PRESET 4 + +/* + * Maximum scatter/gather segments per request. + * + * Considering balance between allocating at least 16 "vscsiif_request" + * structures on one page (4096 bytes) and the number of scatter/gather + * elements needed, we decided to use 26 as a magic number. + * + * If "feature-sg-grant" is set, more scatter/gather elements can be specified + * by placing them in one or more (up to VSCSIIF_SG_TABLESIZE) granted pages. + * In this case the vscsiif_request seg elements don't contain references to + * the user data, but to the SG elements referencing the user data. + */ +#define VSCSIIF_SG_TABLESIZE 26 + +/* + * based on Linux kernel 2.6.18, still valid + * Changing these values requires support of multiple protocols via the rings + * as "old clients" will blindly use these values and the resulting structure + * sizes. + */ +#define VSCSIIF_MAX_COMMAND_SIZE 16 +#define VSCSIIF_SENSE_BUFFERSIZE 96 + +struct scsiif_request_segment { + grant_ref_t gref; + uint16_t offset; + uint16_t length; +}; + +#define VSCSIIF_SG_PER_PAGE (PAGE_SIZE / sizeof(struct scsiif_request_segment)) + +/* Size of one request is 252 bytes */ +struct vscsiif_request { + uint16_t rqid; /* private guest value, echoed in resp */ + uint8_t act; /* command between backend and frontend */ + uint8_t cmd_len; /* valid CDB bytes */ + + uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE]; /* the CDB */ + uint16_t timeout_per_command; /* deprecated */ + uint16_t channel, id, lun; /* (virtual) device specification */ + uint16_t ref_rqid; /* command abort reference */ + uint8_t sc_data_direction; /* for DMA_TO_DEVICE(1) + DMA_FROM_DEVICE(2) + DMA_NONE(3) requests */ + uint8_t nr_segments; /* Number of pieces of scatter-gather */ +/* + * flag in nr_segments: SG elements via grant page + * + * If VSCSIIF_SG_GRANT is set, the low 7 bits of nr_segments specify the number + * of grant pages containing SG elements. Usable if "feature-sg-grant" set. + */ +#define VSCSIIF_SG_GRANT 0x80 + + struct scsiif_request_segment seg[VSCSIIF_SG_TABLESIZE]; + uint32_t reserved[3]; +}; + +/* Size of one response is 252 bytes */ +struct vscsiif_response { + uint16_t rqid; /* identifies request */ + uint8_t padding; + uint8_t sense_len; + uint8_t sense_buffer[VSCSIIF_SENSE_BUFFERSIZE]; + int32_t rslt; + uint32_t residual_len; /* request bufflen - + return the value from physical device */ + uint32_t reserved[36]; +}; + +DEFINE_RING_TYPES(vscsiif, struct vscsiif_request, struct vscsiif_response); + +#endif /*__XEN__PUBLIC_IO_SCSI_H__*/ diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index de082130ba4b..f68719f405af 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -3,6 +3,24 @@ * * Guest OS interface to Xen. * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * * Copyright (c) 2004, K A Fraser */ @@ -73,13 +91,23 @@ * VIRTUAL INTERRUPTS * * Virtual interrupts that a guest OS may receive from Xen. + * In the side comments, 'V.' denotes a per-VCPU VIRQ while 'G.' denotes a + * global VIRQ. The former can be bound once per VCPU and cannot be re-bound. + * The latter can be allocated only once per guest: they must initially be + * allocated to VCPU0 but can subsequently be re-bound. */ -#define VIRQ_TIMER 0 /* Timebase update, and/or requested timeout. */ -#define VIRQ_DEBUG 1 /* Request guest to dump debug info. */ -#define VIRQ_CONSOLE 2 /* (DOM0) Bytes received on emergency console. */ -#define VIRQ_DOM_EXC 3 /* (DOM0) Exceptional event for some domain. */ -#define VIRQ_DEBUGGER 6 /* (DOM0) A domain has paused for debugging. */ -#define VIRQ_PCPU_STATE 9 /* (DOM0) PCPU state changed */ +#define VIRQ_TIMER 0 /* V. Timebase update, and/or requested timeout. */ +#define VIRQ_DEBUG 1 /* V. Request guest to dump debug info. */ +#define VIRQ_CONSOLE 2 /* G. (DOM0) Bytes received on emergency console. */ +#define VIRQ_DOM_EXC 3 /* G. (DOM0) Exceptional event for some domain. */ +#define VIRQ_TBUF 4 /* G. (DOM0) Trace buffer has records available. */ +#define VIRQ_DEBUGGER 6 /* G. (DOM0) A domain has paused for debugging. */ +#define VIRQ_XENOPROF 7 /* V. XenOprofile interrupt: new sample available */ +#define VIRQ_CON_RING 8 /* G. (DOM0) Bytes received on console */ +#define VIRQ_PCPU_STATE 9 /* G. (DOM0) PCPU state changed */ +#define VIRQ_MEM_EVENT 10 /* G. (DOM0) A memory event has occured */ +#define VIRQ_XC_RESERVED 11 /* G. Reserved for XenClient */ +#define VIRQ_ENOMEM 12 /* G. (DOM0) Low on heap memory */ /* Architecture-specific VIRQ definitions. */ #define VIRQ_ARCH_0 16 @@ -92,24 +120,68 @@ #define VIRQ_ARCH_7 23 #define NR_VIRQS 24 + /* - * MMU-UPDATE REQUESTS - * - * HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs. - * A foreigndom (FD) can be specified (or DOMID_SELF for none). - * Where the FD has some effect, it is described below. - * ptr[1:0] specifies the appropriate MMU_* command. + * enum neg_errnoval HYPERVISOR_mmu_update(const struct mmu_update reqs[], + * unsigned count, unsigned *done_out, + * unsigned foreigndom) + * @reqs is an array of mmu_update_t structures ((ptr, val) pairs). + * @count is the length of the above array. + * @pdone is an output parameter indicating number of completed operations + * @foreigndom[15:0]: FD, the expected owner of data pages referenced in this + * hypercall invocation. Can be DOMID_SELF. + * @foreigndom[31:16]: PFD, the expected owner of pagetable pages referenced + * in this hypercall invocation. The value of this field + * (x) encodes the PFD as follows: + * x == 0 => PFD == DOMID_SELF + * x != 0 => PFD == x - 1 * + * Sub-commands: ptr[1:0] specifies the appropriate MMU_* command. + * ------------- * ptr[1:0] == MMU_NORMAL_PT_UPDATE: - * Updates an entry in a page table. If updating an L1 table, and the new - * table entry is valid/present, the mapped frame must belong to the FD, if - * an FD has been specified. If attempting to map an I/O page then the - * caller assumes the privilege of the FD. + * Updates an entry in a page table belonging to PFD. If updating an L1 table, + * and the new table entry is valid/present, the mapped frame must belong to + * FD. If attempting to map an I/O page then the caller assumes the privilege + * of the FD. * FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller. * FD == DOMID_XEN: Map restricted areas of Xen's heap space. * ptr[:2] -- Machine address of the page-table entry to modify. * val -- Value to write. * + * There also certain implicit requirements when using this hypercall. The + * pages that make up a pagetable must be mapped read-only in the guest. + * This prevents uncontrolled guest updates to the pagetable. Xen strictly + * enforces this, and will disallow any pagetable update which will end up + * mapping pagetable page RW, and will disallow using any writable page as a + * pagetable. In practice it means that when constructing a page table for a + * process, thread, etc, we MUST be very dilligient in following these rules: + * 1). Start with top-level page (PGD or in Xen language: L4). Fill out + * the entries. + * 2). Keep on going, filling out the upper (PUD or L3), and middle (PMD + * or L2). + * 3). Start filling out the PTE table (L1) with the PTE entries. Once + * done, make sure to set each of those entries to RO (so writeable bit + * is unset). Once that has been completed, set the PMD (L2) for this + * PTE table as RO. + * 4). When completed with all of the PMD (L2) entries, and all of them have + * been set to RO, make sure to set RO the PUD (L3). Do the same + * operation on PGD (L4) pagetable entries that have a PUD (L3) entry. + * 5). Now before you can use those pages (so setting the cr3), you MUST also + * pin them so that the hypervisor can verify the entries. This is done + * via the HYPERVISOR_mmuext_op(MMUEXT_PIN_L4_TABLE, guest physical frame + * number of the PGD (L4)). And this point the HYPERVISOR_mmuext_op( + * MMUEXT_NEW_BASEPTR, guest physical frame number of the PGD (L4)) can be + * issued. + * For 32-bit guests, the L4 is not used (as there is less pagetables), so + * instead use L3. + * At this point the pagetables can be modified using the MMU_NORMAL_PT_UPDATE + * hypercall. Also if so desired the OS can also try to write to the PTE + * and be trapped by the hypervisor (as the PTE entry is RO). + * + * To deallocate the pages, the operations are the reverse of the steps + * mentioned above. The argument is MMUEXT_UNPIN_TABLE for all levels and the + * pagetable MUST not be in use (meaning that the cr3 is not set to it). + * * ptr[1:0] == MMU_MACHPHYS_UPDATE: * Updates an entry in the machine->pseudo-physical mapping table. * ptr[:2] -- Machine address within the frame whose mapping to modify. @@ -119,6 +191,72 @@ * ptr[1:0] == MMU_PT_UPDATE_PRESERVE_AD: * As MMU_NORMAL_PT_UPDATE above, but A/D bits currently in the PTE are ORed * with those in @val. + * + * @val is usually the machine frame number along with some attributes. + * The attributes by default follow the architecture defined bits. Meaning that + * if this is a X86_64 machine and four page table layout is used, the layout + * of val is: + * - 63 if set means No execute (NX) + * - 46-13 the machine frame number + * - 12 available for guest + * - 11 available for guest + * - 10 available for guest + * - 9 available for guest + * - 8 global + * - 7 PAT (PSE is disabled, must use hypercall to make 4MB or 2MB pages) + * - 6 dirty + * - 5 accessed + * - 4 page cached disabled + * - 3 page write through + * - 2 userspace accessible + * - 1 writeable + * - 0 present + * + * The one bits that does not fit with the default layout is the PAGE_PSE + * also called PAGE_PAT). The MMUEXT_[UN]MARK_SUPER arguments to the + * HYPERVISOR_mmuext_op serve as mechanism to set a pagetable to be 4MB + * (or 2MB) instead of using the PAGE_PSE bit. + * + * The reason that the PAGE_PSE (bit 7) is not being utilized is due to Xen + * using it as the Page Attribute Table (PAT) bit - for details on it please + * refer to Intel SDM 10.12. The PAT allows to set the caching attributes of + * pages instead of using MTRRs. + * + * The PAT MSR is as follows (it is a 64-bit value, each entry is 8 bits): + * PAT4 PAT0 + * +-----+-----+----+----+----+-----+----+----+ + * | UC | UC- | WC | WB | UC | UC- | WC | WB | <= Linux + * +-----+-----+----+----+----+-----+----+----+ + * | UC | UC- | WT | WB | UC | UC- | WT | WB | <= BIOS (default when machine boots) + * +-----+-----+----+----+----+-----+----+----+ + * | rsv | rsv | WP | WC | UC | UC- | WT | WB | <= Xen + * +-----+-----+----+----+----+-----+----+----+ + * + * The lookup of this index table translates to looking up + * Bit 7, Bit 4, and Bit 3 of val entry: + * + * PAT/PSE (bit 7) ... PCD (bit 4) .. PWT (bit 3). + * + * If all bits are off, then we are using PAT0. If bit 3 turned on, + * then we are using PAT1, if bit 3 and bit 4, then PAT2.. + * + * As you can see, the Linux PAT1 translates to PAT4 under Xen. Which means + * that if a guest that follows Linux's PAT setup and would like to set Write + * Combined on pages it MUST use PAT4 entry. Meaning that Bit 7 (PAGE_PAT) is + * set. For example, under Linux it only uses PAT0, PAT1, and PAT2 for the + * caching as: + * + * WB = none (so PAT0) + * WC = PWT (bit 3 on) + * UC = PWT | PCD (bit 3 and 4 are on). + * + * To make it work with Xen, it needs to translate the WC bit as so: + * + * PWT (so bit 3 on) --> PAT (so bit 7 is on) and clear bit 3 + * + * And to translate back it would: + * + * PAT (bit 7 on) --> PWT (bit 3 on) and clear bit 7. */ #define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */ #define MMU_MACHPHYS_UPDATE 1 /* ptr = MA of frame to modify entry for */ @@ -127,7 +265,12 @@ /* * MMU EXTENDED OPERATIONS * - * HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures. + * enum neg_errnoval HYPERVISOR_mmuext_op(mmuext_op_t uops[], + * unsigned int count, + * unsigned int *pdone, + * unsigned int foreigndom) + */ +/* HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures. * A foreigndom (FD) can be specified (or DOMID_SELF for none). * Where the FD has some effect, it is described below. * @@ -164,9 +307,23 @@ * cmd: MMUEXT_FLUSH_CACHE * No additional arguments. Writes back and flushes cache contents. * + * cmd: MMUEXT_FLUSH_CACHE_GLOBAL + * No additional arguments. Writes back and flushes cache contents + * on all CPUs in the system. + * * cmd: MMUEXT_SET_LDT * linear_addr: Linear address of LDT base (NB. must be page-aligned). * nr_ents: Number of entries in LDT. + * + * cmd: MMUEXT_CLEAR_PAGE + * mfn: Machine frame number to be cleared. + * + * cmd: MMUEXT_COPY_PAGE + * mfn: Machine frame number of the destination page. + * src_mfn: Machine frame number of the source page. + * + * cmd: MMUEXT_[UN]MARK_SUPER + * mfn: Machine frame number of head of superpage to be [un]marked. */ #define MMUEXT_PIN_L1_TABLE 0 #define MMUEXT_PIN_L2_TABLE 1 @@ -183,12 +340,18 @@ #define MMUEXT_FLUSH_CACHE 12 #define MMUEXT_SET_LDT 13 #define MMUEXT_NEW_USER_BASEPTR 15 +#define MMUEXT_CLEAR_PAGE 16 +#define MMUEXT_COPY_PAGE 17 +#define MMUEXT_FLUSH_CACHE_GLOBAL 18 +#define MMUEXT_MARK_SUPER 19 +#define MMUEXT_UNMARK_SUPER 20 #ifndef __ASSEMBLY__ struct mmuext_op { unsigned int cmd; union { - /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */ + /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR + * CLEAR_PAGE, COPY_PAGE, [UN]MARK_SUPER */ xen_pfn_t mfn; /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */ unsigned long linear_addr; @@ -198,6 +361,8 @@ struct mmuext_op { unsigned int nr_ents; /* TLB_FLUSH_MULTI, INVLPG_MULTI */ void *vcpumask; + /* COPY_PAGE */ + xen_pfn_t src_mfn; } arg2; }; DEFINE_GUEST_HANDLE_STRUCT(mmuext_op); @@ -225,10 +390,23 @@ DEFINE_GUEST_HANDLE_STRUCT(mmuext_op); */ #define VMASST_CMD_enable 0 #define VMASST_CMD_disable 1 + +/* x86/32 guests: simulate full 4GB segment limits. */ #define VMASST_TYPE_4gb_segments 0 + +/* x86/32 guests: trap (vector 15) whenever above vmassist is used. */ #define VMASST_TYPE_4gb_segments_notify 1 + +/* + * x86 guests: support writes to bottom-level PTEs. + * NB1. Page-directory entries cannot be written. + * NB2. Guest must continue to remove all writable mappings of PTEs. + */ #define VMASST_TYPE_writable_pagetables 2 + +/* x86/PAE guests: support PDPTs above 4GB. */ #define VMASST_TYPE_pae_extended_cr3 3 + #define MAX_VMASST_TYPE 3 #ifndef __ASSEMBLY__ @@ -260,6 +438,15 @@ typedef uint16_t domid_t; */ #define DOMID_XEN (0x7FF2U) +/* DOMID_COW is used as the owner of sharable pages */ +#define DOMID_COW (0x7FF3U) + +/* DOMID_INVALID is used to identify pages with unknown owner. */ +#define DOMID_INVALID (0x7FF4U) + +/* Idle domain. */ +#define DOMID_IDLE (0x7FFFU) + /* * Send an array of these to HYPERVISOR_mmu_update(). * NB. The fields are natural pointer/address size for this architecture. @@ -272,7 +459,9 @@ DEFINE_GUEST_HANDLE_STRUCT(mmu_update); /* * Send an array of these to HYPERVISOR_multicall(). - * NB. The fields are natural register size for this architecture. + * NB. The fields are logically the natural register size for this + * architecture. In cases where xen_ulong_t is larger than this then + * any unused bits in the upper portion must be zero. */ struct multicall_entry { xen_ulong_t op; @@ -442,8 +631,48 @@ struct start_info { unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */ unsigned long mod_len; /* Size (bytes) of pre-loaded module. */ int8_t cmd_line[MAX_GUEST_CMDLINE]; + /* The pfn range here covers both page table and p->m table frames. */ + unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table. */ + unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table. */ }; +/* These flags are passed in the 'flags' field of start_info_t. */ +#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */ +#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */ +#define SIF_MULTIBOOT_MOD (1<<2) /* Is mod_start a multiboot module? */ +#define SIF_MOD_START_PFN (1<<3) /* Is mod_start a PFN? */ +#define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */ + +/* + * A multiboot module is a package containing modules very similar to a + * multiboot module array. The only differences are: + * - the array of module descriptors is by convention simply at the beginning + * of the multiboot module, + * - addresses in the module descriptors are based on the beginning of the + * multiboot module, + * - the number of modules is determined by a termination descriptor that has + * mod_start == 0. + * + * This permits to both build it statically and reference it in a configuration + * file, and let the PV guest easily rebase the addresses to virtual addresses + * and at the same time count the number of modules. + */ +struct xen_multiboot_mod_list { + /* Address of first byte of the module */ + uint32_t mod_start; + /* Address of last byte of the module (inclusive) */ + uint32_t mod_end; + /* Address of zero-terminated command line */ + uint32_t cmdline; + /* Unused, must be zero */ + uint32_t pad; +}; +/* + * The console structure in start_info.console.dom0 + * + * This structure includes a variety of information required to + * have a working VGA/VESA console. + */ struct dom0_vga_console_info { uint8_t video_type; #define XEN_VGATYPE_TEXT_MODE_3 0x03 @@ -484,11 +713,6 @@ struct dom0_vga_console_info { } u; }; -/* These flags are passed in the 'flags' field of start_info_t. */ -#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */ -#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */ -#define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */ - typedef uint64_t cpumap_t; typedef uint8_t xen_domain_handle_t[16]; diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index 0324c6d340c1..b78f21caf55a 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -86,6 +86,7 @@ struct xenbus_device_id /* A xenbus driver. */ struct xenbus_driver { + const char *name; /* defaults to ids[0].devicetype */ const struct xenbus_device_id *ids; int (*probe)(struct xenbus_device *dev, const struct xenbus_device_id *id); @@ -100,20 +101,22 @@ struct xenbus_driver { int (*is_ready)(struct xenbus_device *dev); }; -#define DEFINE_XENBUS_DRIVER(var, drvname, methods...) \ -struct xenbus_driver var ## _driver = { \ - .driver.name = drvname + 0 ?: var ## _ids->devicetype, \ - .driver.owner = THIS_MODULE, \ - .ids = var ## _ids, ## methods \ -} - static inline struct xenbus_driver *to_xenbus_driver(struct device_driver *drv) { return container_of(drv, struct xenbus_driver, driver); } -int __must_check xenbus_register_frontend(struct xenbus_driver *); -int __must_check xenbus_register_backend(struct xenbus_driver *); +int __must_check __xenbus_register_frontend(struct xenbus_driver *drv, + struct module *owner, + const char *mod_name); +int __must_check __xenbus_register_backend(struct xenbus_driver *drv, + struct module *owner, + const char *mod_name); + +#define xenbus_register_frontend(drv) \ + __xenbus_register_frontend(drv, THIS_MODULE, KBUILD_MODNAME); +#define xenbus_register_backend(drv) \ + __xenbus_register_backend(drv, THIS_MODULE, KBUILD_MODNAME); void xenbus_unregister_driver(struct xenbus_driver *drv); |