diff options
Diffstat (limited to 'drivers/vfio')
-rw-r--r-- | drivers/vfio/Kconfig | 6 | ||||
-rw-r--r-- | drivers/vfio/Makefile | 1 | ||||
-rw-r--r-- | drivers/vfio/pci/vfio_pci.c | 24 | ||||
-rw-r--r-- | drivers/vfio/vfio.c | 52 | ||||
-rw-r--r-- | drivers/vfio/vfio_iommu_spapr_tce.c | 377 | ||||
-rw-r--r-- | drivers/vfio/vfio_iommu_type1.c | 626 |
6 files changed, 831 insertions, 255 deletions
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 7cd5dec0abd1..26b3d9d1409f 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1 depends on VFIO default n +config VFIO_IOMMU_SPAPR_TCE + tristate + depends on VFIO && SPAPR_TCE_IOMMU + default n + menuconfig VFIO tristate "VFIO Non-Privileged userspace driver framework" depends on IOMMU_API select VFIO_IOMMU_TYPE1 if X86 + select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES) help VFIO provides a framework for secure userspace device drivers. See Documentation/vfio.txt for more details. diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index 2398d4a0e38b..72bfabc8629e 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_VFIO) += vfio.o obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o obj-$(CONFIG_VFIO_PCI) += pci/ diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index ac3725440d64..cef6002acbd4 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -137,8 +137,27 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) */ pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); - if (vdev->reset_works) - __pci_reset_function(pdev); + /* + * Careful, device_lock may already be held. This is the case if + * a driver unbind is blocked. Try to get the locks ourselves to + * prevent a deadlock. + */ + if (vdev->reset_works) { + bool reset_done = false; + + if (pci_cfg_access_trylock(pdev)) { + if (device_trylock(&pdev->dev)) { + __pci_reset_function_locked(pdev); + reset_done = true; + device_unlock(&pdev->dev); + } + pci_cfg_access_unlock(pdev); + } + + if (!reset_done) + pr_warn("%s: Unable to acquire locks for reset of %s\n", + __func__, dev_name(&pdev->dev)); + } pci_restore_state(pdev); } @@ -499,7 +518,6 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) } vma->vm_private_data = vdev; - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff; diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 6d78736563de..842f4507883e 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -76,6 +76,7 @@ struct vfio_group { struct notifier_block nb; struct list_head vfio_next; struct list_head container_next; + atomic_t opened; }; struct vfio_device { @@ -206,6 +207,7 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group) INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); atomic_set(&group->container_users, 0); + atomic_set(&group->opened, 0); group->iommu_group = iommu_group; group->nb.notifier_call = vfio_iommu_group_notifier; @@ -492,27 +494,6 @@ static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev) return 0; } -static int vfio_group_nb_del_dev(struct vfio_group *group, struct device *dev) -{ - struct vfio_device *device; - - /* - * Expect to fall out here. If a device was in use, it would - * have been bound to a vfio sub-driver, which would have blocked - * in .remove at vfio_del_group_dev. Sanity check that we no - * longer track the device, so it's safe to remove. - */ - device = vfio_group_get_device(group, dev); - if (likely(!device)) - return 0; - - WARN("Device %s removed from live group %d!\n", dev_name(dev), - iommu_group_id(group->iommu_group)); - - vfio_device_put(device); - return 0; -} - static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev) { /* We don't care what happens when the group isn't in use */ @@ -529,13 +510,11 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb, struct device *dev = data; /* - * Need to go through a group_lock lookup to get a reference or - * we risk racing a group being removed. Leave a WARN_ON for - * debuging, but if the group no longer exists, a spurious notify - * is harmless. + * Need to go through a group_lock lookup to get a reference or we + * risk racing a group being removed. Ignore spurious notifies. */ group = vfio_group_try_get(group); - if (WARN_ON(!group)) + if (!group) return NOTIFY_OK; switch (action) { @@ -543,7 +522,13 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb, vfio_group_nb_add_dev(group, dev); break; case IOMMU_GROUP_NOTIFY_DEL_DEVICE: - vfio_group_nb_del_dev(group, dev); + /* + * Nothing to do here. If the device is in use, then the + * vfio sub-driver should block the remove callback until + * it is unused. If the device is unused or attached to a + * stub driver, then it should be released and we don't + * care that it will be going away. + */ break; case IOMMU_GROUP_NOTIFY_BIND_DRIVER: pr_debug("%s: Device %s, group %d binding to driver\n", @@ -1236,12 +1221,22 @@ static long vfio_group_fops_compat_ioctl(struct file *filep, static int vfio_group_fops_open(struct inode *inode, struct file *filep) { struct vfio_group *group; + int opened; group = vfio_group_get_from_minor(iminor(inode)); if (!group) return -ENODEV; + /* Do we need multiple instances of the group open? Seems not. */ + opened = atomic_cmpxchg(&group->opened, 0, 1); + if (opened) { + vfio_group_put(group); + return -EBUSY; + } + + /* Is something still in use from a previous open? */ if (group->container) { + atomic_dec(&group->opened); vfio_group_put(group); return -EBUSY; } @@ -1259,6 +1254,8 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) vfio_group_try_dissolve_container(group); + atomic_dec(&group->opened); + vfio_group_put(group); return 0; @@ -1415,6 +1412,7 @@ static int __init vfio_init(void) * drivers. */ request_module_nowait("vfio_iommu_type1"); + request_module_nowait("vfio_iommu_spapr_tce"); return 0; diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c new file mode 100644 index 000000000000..bdae7a04af75 --- /dev/null +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -0,0 +1,377 @@ +/* + * VFIO: IOMMU DMA mapping support for TCE on POWER + * + * Copyright (C) 2013 IBM Corp. All rights reserved. + * Author: Alexey Kardashevskiy <aik@ozlabs.ru> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Derived from original vfio_iommu_type1.c: + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson <alex.williamson@redhat.com> + */ + +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/err.h> +#include <linux/vfio.h> +#include <asm/iommu.h> +#include <asm/tce.h> + +#define DRIVER_VERSION "0.1" +#define DRIVER_AUTHOR "aik@ozlabs.ru" +#define DRIVER_DESC "VFIO IOMMU SPAPR TCE" + +static void tce_iommu_detach_group(void *iommu_data, + struct iommu_group *iommu_group); + +/* + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation + * + * This code handles mapping and unmapping of user data buffers + * into DMA'ble space using the IOMMU + */ + +/* + * The container descriptor supports only a single group per container. + * Required by the API as the container is not supplied with the IOMMU group + * at the moment of initialization. + */ +struct tce_container { + struct mutex lock; + struct iommu_table *tbl; + bool enabled; +}; + +static int tce_iommu_enable(struct tce_container *container) +{ + int ret = 0; + unsigned long locked, lock_limit, npages; + struct iommu_table *tbl = container->tbl; + + if (!container->tbl) + return -ENXIO; + + if (!current->mm) + return -ESRCH; /* process exited */ + + if (container->enabled) + return -EBUSY; + + /* + * When userspace pages are mapped into the IOMMU, they are effectively + * locked memory, so, theoretically, we need to update the accounting + * of locked pages on each map and unmap. For powerpc, the map unmap + * paths can be very hot, though, and the accounting would kill + * performance, especially since it would be difficult to impossible + * to handle the accounting in real mode only. + * + * To address that, rather than precisely accounting every page, we + * instead account for a worst case on locked memory when the iommu is + * enabled and disabled. The worst case upper bound on locked memory + * is the size of the whole iommu window, which is usually relatively + * small (compared to total memory sizes) on POWER hardware. + * + * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits, + * that would effectively kill the guest at random points, much better + * enforcing the limit based on the max that the guest can map. + */ + down_write(¤t->mm->mmap_sem); + npages = (tbl->it_size << IOMMU_PAGE_SHIFT) >> PAGE_SHIFT; + locked = current->mm->locked_vm + npages; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { + pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n", + rlimit(RLIMIT_MEMLOCK)); + ret = -ENOMEM; + } else { + + current->mm->locked_vm += npages; + container->enabled = true; + } + up_write(¤t->mm->mmap_sem); + + return ret; +} + +static void tce_iommu_disable(struct tce_container *container) +{ + if (!container->enabled) + return; + + container->enabled = false; + + if (!container->tbl || !current->mm) + return; + + down_write(¤t->mm->mmap_sem); + current->mm->locked_vm -= (container->tbl->it_size << + IOMMU_PAGE_SHIFT) >> PAGE_SHIFT; + up_write(¤t->mm->mmap_sem); +} + +static void *tce_iommu_open(unsigned long arg) +{ + struct tce_container *container; + + if (arg != VFIO_SPAPR_TCE_IOMMU) { + pr_err("tce_vfio: Wrong IOMMU type\n"); + return ERR_PTR(-EINVAL); + } + + container = kzalloc(sizeof(*container), GFP_KERNEL); + if (!container) + return ERR_PTR(-ENOMEM); + + mutex_init(&container->lock); + + return container; +} + +static void tce_iommu_release(void *iommu_data) +{ + struct tce_container *container = iommu_data; + + WARN_ON(container->tbl && !container->tbl->it_group); + tce_iommu_disable(container); + + if (container->tbl && container->tbl->it_group) + tce_iommu_detach_group(iommu_data, container->tbl->it_group); + + mutex_destroy(&container->lock); + + kfree(container); +} + +static long tce_iommu_ioctl(void *iommu_data, + unsigned int cmd, unsigned long arg) +{ + struct tce_container *container = iommu_data; + unsigned long minsz; + long ret; + + switch (cmd) { + case VFIO_CHECK_EXTENSION: + return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0; + + case VFIO_IOMMU_SPAPR_TCE_GET_INFO: { + struct vfio_iommu_spapr_tce_info info; + struct iommu_table *tbl = container->tbl; + + if (WARN_ON(!tbl)) + return -ENXIO; + + minsz = offsetofend(struct vfio_iommu_spapr_tce_info, + dma32_window_size); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT; + info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT; + info.flags = 0; + + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + + return 0; + } + case VFIO_IOMMU_MAP_DMA: { + struct vfio_iommu_type1_dma_map param; + struct iommu_table *tbl = container->tbl; + unsigned long tce, i; + + if (!tbl) + return -ENXIO; + + BUG_ON(!tbl->it_group); + + minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); + + if (copy_from_user(¶m, (void __user *)arg, minsz)) + return -EFAULT; + + if (param.argsz < minsz) + return -EINVAL; + + if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ | + VFIO_DMA_MAP_FLAG_WRITE)) + return -EINVAL; + + if ((param.size & ~IOMMU_PAGE_MASK) || + (param.vaddr & ~IOMMU_PAGE_MASK)) + return -EINVAL; + + /* iova is checked by the IOMMU API */ + tce = param.vaddr; + if (param.flags & VFIO_DMA_MAP_FLAG_READ) + tce |= TCE_PCI_READ; + if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) + tce |= TCE_PCI_WRITE; + + ret = iommu_tce_put_param_check(tbl, param.iova, tce); + if (ret) + return ret; + + for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT); ++i) { + ret = iommu_put_tce_user_mode(tbl, + (param.iova >> IOMMU_PAGE_SHIFT) + i, + tce); + if (ret) + break; + tce += IOMMU_PAGE_SIZE; + } + if (ret) + iommu_clear_tces_and_put_pages(tbl, + param.iova >> IOMMU_PAGE_SHIFT, i); + + iommu_flush_tce(tbl); + + return ret; + } + case VFIO_IOMMU_UNMAP_DMA: { + struct vfio_iommu_type1_dma_unmap param; + struct iommu_table *tbl = container->tbl; + + if (WARN_ON(!tbl)) + return -ENXIO; + + minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, + size); + + if (copy_from_user(¶m, (void __user *)arg, minsz)) + return -EFAULT; + + if (param.argsz < minsz) + return -EINVAL; + + /* No flag is supported now */ + if (param.flags) + return -EINVAL; + + if (param.size & ~IOMMU_PAGE_MASK) + return -EINVAL; + + ret = iommu_tce_clear_param_check(tbl, param.iova, 0, + param.size >> IOMMU_PAGE_SHIFT); + if (ret) + return ret; + + ret = iommu_clear_tces_and_put_pages(tbl, + param.iova >> IOMMU_PAGE_SHIFT, + param.size >> IOMMU_PAGE_SHIFT); + iommu_flush_tce(tbl); + + return ret; + } + case VFIO_IOMMU_ENABLE: + mutex_lock(&container->lock); + ret = tce_iommu_enable(container); + mutex_unlock(&container->lock); + return ret; + + + case VFIO_IOMMU_DISABLE: + mutex_lock(&container->lock); + tce_iommu_disable(container); + mutex_unlock(&container->lock); + return 0; + } + + return -ENOTTY; +} + +static int tce_iommu_attach_group(void *iommu_data, + struct iommu_group *iommu_group) +{ + int ret; + struct tce_container *container = iommu_data; + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group); + + BUG_ON(!tbl); + mutex_lock(&container->lock); + + /* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n", + iommu_group_id(iommu_group), iommu_group); */ + if (container->tbl) { + pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n", + iommu_group_id(container->tbl->it_group), + iommu_group_id(iommu_group)); + ret = -EBUSY; + } else if (container->enabled) { + pr_err("tce_vfio: attaching group #%u to enabled container\n", + iommu_group_id(iommu_group)); + ret = -EBUSY; + } else { + ret = iommu_take_ownership(tbl); + if (!ret) + container->tbl = tbl; + } + + mutex_unlock(&container->lock); + + return ret; +} + +static void tce_iommu_detach_group(void *iommu_data, + struct iommu_group *iommu_group) +{ + struct tce_container *container = iommu_data; + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group); + + BUG_ON(!tbl); + mutex_lock(&container->lock); + if (tbl != container->tbl) { + pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n", + iommu_group_id(iommu_group), + iommu_group_id(tbl->it_group)); + } else { + if (container->enabled) { + pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n", + iommu_group_id(tbl->it_group)); + tce_iommu_disable(container); + } + + /* pr_debug("tce_vfio: detaching group #%u from iommu %p\n", + iommu_group_id(iommu_group), iommu_group); */ + container->tbl = NULL; + iommu_release_ownership(tbl); + } + mutex_unlock(&container->lock); +} + +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = { + .name = "iommu-vfio-powerpc", + .owner = THIS_MODULE, + .open = tce_iommu_open, + .release = tce_iommu_release, + .ioctl = tce_iommu_ioctl, + .attach_group = tce_iommu_attach_group, + .detach_group = tce_iommu_detach_group, +}; + +static int __init tce_iommu_init(void) +{ + return vfio_register_iommu_driver(&tce_iommu_driver_ops); +} + +static void __exit tce_iommu_cleanup(void) +{ + vfio_unregister_iommu_driver(&tce_iommu_driver_ops); +} + +module_init(tce_iommu_init); +module_exit(tce_iommu_cleanup); + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); + diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 6f3fbc48a6c7..a9807dea3887 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -31,6 +31,7 @@ #include <linux/module.h> #include <linux/mm.h> #include <linux/pci.h> /* pci_bus_type */ +#include <linux/rbtree.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/uaccess.h> @@ -47,19 +48,25 @@ module_param_named(allow_unsafe_interrupts, MODULE_PARM_DESC(allow_unsafe_interrupts, "Enable VFIO IOMMU support for on platforms without interrupt remapping support."); +static bool disable_hugepages; +module_param_named(disable_hugepages, + disable_hugepages, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(disable_hugepages, + "Disable VFIO IOMMU support for IOMMU hugepages."); + struct vfio_iommu { struct iommu_domain *domain; struct mutex lock; - struct list_head dma_list; + struct rb_root dma_list; struct list_head group_list; bool cache; }; struct vfio_dma { - struct list_head next; + struct rb_node node; dma_addr_t iova; /* Device address */ unsigned long vaddr; /* Process virtual addr */ - long npage; /* Number of pages */ + size_t size; /* Map size (bytes) */ int prot; /* IOMMU_READ/WRITE */ }; @@ -73,7 +80,48 @@ struct vfio_group { * into DMA'ble space using the IOMMU */ -#define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT) +static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, + dma_addr_t start, size_t size) +{ + struct rb_node *node = iommu->dma_list.rb_node; + + while (node) { + struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); + + if (start + size <= dma->iova) + node = node->rb_left; + else if (start >= dma->iova + dma->size) + node = node->rb_right; + else + return dma; + } + + return NULL; +} + +static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new) +{ + struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL; + struct vfio_dma *dma; + + while (*link) { + parent = *link; + dma = rb_entry(parent, struct vfio_dma, node); + + if (new->iova + new->size <= dma->iova) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, &iommu->dma_list); +} + +static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old) +{ + rb_erase(&old->node, &iommu->dma_list); +} struct vwork { struct mm_struct *mm; @@ -100,8 +148,8 @@ static void vfio_lock_acct(long npage) struct vwork *vwork; struct mm_struct *mm; - if (!current->mm) - return; /* process exited */ + if (!current->mm || !npage) + return; /* process exited or nothing to do */ if (down_write_trylock(¤t->mm->mmap_sem)) { current->mm->locked_vm += npage; @@ -173,33 +221,6 @@ static int put_pfn(unsigned long pfn, int prot) return 0; } -/* Unmap DMA region */ -static long __vfio_dma_do_unmap(struct vfio_iommu *iommu, dma_addr_t iova, - long npage, int prot) -{ - long i, unlocked = 0; - - for (i = 0; i < npage; i++, iova += PAGE_SIZE) { - unsigned long pfn; - - pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT; - if (pfn) { - iommu_unmap(iommu->domain, iova, PAGE_SIZE); - unlocked += put_pfn(pfn, prot); - } - } - return unlocked; -} - -static void vfio_dma_unmap(struct vfio_iommu *iommu, dma_addr_t iova, - long npage, int prot) -{ - long unlocked; - - unlocked = __vfio_dma_do_unmap(iommu, iova, npage, prot); - vfio_lock_acct(-unlocked); -} - static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) { struct page *page[1]; @@ -226,198 +247,306 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) return ret; } -/* Map DMA region */ -static int __vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova, - unsigned long vaddr, long npage, int prot) +/* + * Attempt to pin pages. We really don't want to track all the pfns and + * the iommu can only map chunks of consecutive pfns anyway, so get the + * first page and all consecutive pages with the same locking. + */ +static long vfio_pin_pages(unsigned long vaddr, long npage, + int prot, unsigned long *pfn_base) { - dma_addr_t start = iova; - long i, locked = 0; - int ret; + unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + bool lock_cap = capable(CAP_IPC_LOCK); + long ret, i; - /* Verify that pages are not already mapped */ - for (i = 0; i < npage; i++, iova += PAGE_SIZE) - if (iommu_iova_to_phys(iommu->domain, iova)) - return -EBUSY; + if (!current->mm) + return -ENODEV; - iova = start; + ret = vaddr_get_pfn(vaddr, prot, pfn_base); + if (ret) + return ret; - if (iommu->cache) - prot |= IOMMU_CACHE; + if (is_invalid_reserved_pfn(*pfn_base)) + return 1; - /* - * XXX We break mappings into pages and use get_user_pages_fast to - * pin the pages in memory. It's been suggested that mlock might - * provide a more efficient mechanism, but nothing prevents the - * user from munlocking the pages, which could then allow the user - * access to random host memory. We also have no guarantee from the - * IOMMU API that the iommu driver can unmap sub-pages of previous - * mappings. This means we might lose an entire range if a single - * page within it is unmapped. Single page mappings are inefficient, - * but provide the most flexibility for now. - */ - for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) { + if (!lock_cap && current->mm->locked_vm + 1 > limit) { + put_pfn(*pfn_base, prot); + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, + limit << PAGE_SHIFT); + return -ENOMEM; + } + + if (unlikely(disable_hugepages)) { + vfio_lock_acct(1); + return 1; + } + + /* Lock all the consecutive pages from pfn_base */ + for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) { unsigned long pfn = 0; ret = vaddr_get_pfn(vaddr, prot, &pfn); - if (ret) { - __vfio_dma_do_unmap(iommu, start, i, prot); - return ret; - } + if (ret) + break; - /* - * Only add actual locked pages to accounting - * XXX We're effectively marking a page locked for every - * IOVA page even though it's possible the user could be - * backing multiple IOVAs with the same vaddr. This over- - * penalizes the user process, but we currently have no - * easy way to do this properly. - */ - if (!is_invalid_reserved_pfn(pfn)) - locked++; + if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) { + put_pfn(pfn, prot); + break; + } - ret = iommu_map(iommu->domain, iova, - (phys_addr_t)pfn << PAGE_SHIFT, - PAGE_SIZE, prot); - if (ret) { - /* Back out mappings on error */ + if (!lock_cap && current->mm->locked_vm + i + 1 > limit) { put_pfn(pfn, prot); - __vfio_dma_do_unmap(iommu, start, i, prot); - return ret; + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", + __func__, limit << PAGE_SHIFT); + break; } } - vfio_lock_acct(locked); - return 0; + + vfio_lock_acct(i); + + return i; } -static inline bool ranges_overlap(dma_addr_t start1, size_t size1, - dma_addr_t start2, size_t size2) +static long vfio_unpin_pages(unsigned long pfn, long npage, + int prot, bool do_accounting) { - if (start1 < start2) - return (start2 - start1 < size1); - else if (start2 < start1) - return (start1 - start2 < size2); - return (size1 > 0 && size2 > 0); + unsigned long unlocked = 0; + long i; + + for (i = 0; i < npage; i++) + unlocked += put_pfn(pfn++, prot); + + if (do_accounting) + vfio_lock_acct(-unlocked); + + return unlocked; } -static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, - dma_addr_t start, size_t size) +static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, + dma_addr_t iova, size_t *size) { - struct vfio_dma *dma; + dma_addr_t start = iova, end = iova + *size; + long unlocked = 0; - list_for_each_entry(dma, &iommu->dma_list, next) { - if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage), - start, size)) - return dma; + while (iova < end) { + size_t unmapped; + phys_addr_t phys; + + /* + * We use the IOMMU to track the physical address. This + * saves us from having a lot more entries in our mapping + * tree. The downside is that we don't track the size + * used to do the mapping. We request unmap of a single + * page, but expect IOMMUs that support large pages to + * unmap a larger chunk. + */ + phys = iommu_iova_to_phys(iommu->domain, iova); + if (WARN_ON(!phys)) { + iova += PAGE_SIZE; + continue; + } + + unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE); + if (!unmapped) + break; + + unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT, + unmapped >> PAGE_SHIFT, + dma->prot, false); + iova += unmapped; } - return NULL; + + vfio_lock_acct(-unlocked); + + *size = iova - start; + + return 0; } -static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, - size_t size, struct vfio_dma *dma) +static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, + size_t *size, struct vfio_dma *dma) { + size_t offset, overlap, tmp; struct vfio_dma *split; - long npage_lo, npage_hi; - - /* Existing dma region is completely covered, unmap all */ - if (start <= dma->iova && - start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) { - vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot); - list_del(&dma->next); - npage_lo = dma->npage; + int ret; + + if (!*size) + return 0; + + /* + * Existing dma region is completely covered, unmap all. This is + * the likely case since userspace tends to map and unmap buffers + * in one shot rather than multiple mappings within a buffer. + */ + if (likely(start <= dma->iova && + start + *size >= dma->iova + dma->size)) { + *size = dma->size; + ret = vfio_unmap_unpin(iommu, dma, dma->iova, size); + if (ret) + return ret; + + /* + * Did we remove more than we have? Should never happen + * since a vfio_dma is contiguous in iova and vaddr. + */ + WARN_ON(*size != dma->size); + + vfio_remove_dma(iommu, dma); kfree(dma); - return npage_lo; + return 0; } /* Overlap low address of existing range */ if (start <= dma->iova) { - size_t overlap; + overlap = start + *size - dma->iova; + ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap); + if (ret) + return ret; - overlap = start + size - dma->iova; - npage_lo = overlap >> PAGE_SHIFT; + vfio_remove_dma(iommu, dma); - vfio_dma_unmap(iommu, dma->iova, npage_lo, dma->prot); - dma->iova += overlap; - dma->vaddr += overlap; - dma->npage -= npage_lo; - return npage_lo; + /* + * Check, we may have removed to whole vfio_dma. If not + * fixup and re-insert. + */ + if (overlap < dma->size) { + dma->iova += overlap; + dma->vaddr += overlap; + dma->size -= overlap; + vfio_insert_dma(iommu, dma); + } else + kfree(dma); + + *size = overlap; + return 0; } /* Overlap high address of existing range */ - if (start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) { - size_t overlap; + if (start + *size >= dma->iova + dma->size) { + offset = start - dma->iova; + overlap = dma->size - offset; - overlap = dma->iova + NPAGE_TO_SIZE(dma->npage) - start; - npage_hi = overlap >> PAGE_SHIFT; + ret = vfio_unmap_unpin(iommu, dma, start, &overlap); + if (ret) + return ret; - vfio_dma_unmap(iommu, start, npage_hi, dma->prot); - dma->npage -= npage_hi; - return npage_hi; + dma->size -= overlap; + *size = overlap; + return 0; } /* Split existing */ - npage_lo = (start - dma->iova) >> PAGE_SHIFT; - npage_hi = dma->npage - (size >> PAGE_SHIFT) - npage_lo; - split = kzalloc(sizeof *split, GFP_KERNEL); + /* + * Allocate our tracking structure early even though it may not + * be used. An Allocation failure later loses track of pages and + * is more difficult to unwind. + */ + split = kzalloc(sizeof(*split), GFP_KERNEL); if (!split) return -ENOMEM; - vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, dma->prot); + offset = start - dma->iova; + + ret = vfio_unmap_unpin(iommu, dma, start, size); + if (ret || !*size) { + kfree(split); + return ret; + } + + tmp = dma->size; - dma->npage = npage_lo; + /* Resize the lower vfio_dma in place, before the below insert */ + dma->size = offset; - split->npage = npage_hi; - split->iova = start + size; - split->vaddr = dma->vaddr + NPAGE_TO_SIZE(npage_lo) + size; - split->prot = dma->prot; - list_add(&split->next, &iommu->dma_list); - return size >> PAGE_SHIFT; + /* Insert new for remainder, assuming it didn't all get unmapped */ + if (likely(offset + *size < tmp)) { + split->size = tmp - offset - *size; + split->iova = dma->iova + offset + *size; + split->vaddr = dma->vaddr + offset + *size; + split->prot = dma->prot; + vfio_insert_dma(iommu, split); + } else + kfree(split); + + return 0; } static int vfio_dma_do_unmap(struct vfio_iommu *iommu, struct vfio_iommu_type1_dma_unmap *unmap) { - long ret = 0, npage = unmap->size >> PAGE_SHIFT; - struct vfio_dma *dma, *tmp; uint64_t mask; + struct vfio_dma *dma; + size_t unmapped = 0, size; + int ret = 0; mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; if (unmap->iova & mask) return -EINVAL; - if (unmap->size & mask) + if (!unmap->size || unmap->size & mask) return -EINVAL; - /* XXX We still break these down into PAGE_SIZE */ WARN_ON(mask & PAGE_MASK); mutex_lock(&iommu->lock); - list_for_each_entry_safe(dma, tmp, &iommu->dma_list, next) { - if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage), - unmap->iova, unmap->size)) { - ret = vfio_remove_dma_overlap(iommu, unmap->iova, - unmap->size, dma); - if (ret > 0) - npage -= ret; - if (ret < 0 || npage == 0) - break; - } + while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) { + size = unmap->size; + ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma); + if (ret || !size) + break; + unmapped += size; } + mutex_unlock(&iommu->lock); - return ret > 0 ? 0 : (int)ret; + + /* + * We may unmap more than requested, update the unmap struct so + * userspace can know. + */ + unmap->size = unmapped; + + return ret; +} + +/* + * Turns out AMD IOMMU has a page table bug where it won't map large pages + * to a region that previously mapped smaller pages. This should be fixed + * soon, so this is just a temporary workaround to break mappings down into + * PAGE_SIZE. Better to map smaller pages than nothing. + */ +static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova, + unsigned long pfn, long npage, int prot) +{ + long i; + int ret; + + for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) { + ret = iommu_map(iommu->domain, iova, + (phys_addr_t)pfn << PAGE_SHIFT, + PAGE_SIZE, prot); + if (ret) + break; + } + + for (; i < npage && i > 0; i--, iova -= PAGE_SIZE) + iommu_unmap(iommu->domain, iova, PAGE_SIZE); + + return ret; } static int vfio_dma_do_map(struct vfio_iommu *iommu, struct vfio_iommu_type1_dma_map *map) { - struct vfio_dma *dma, *pdma = NULL; - dma_addr_t iova = map->iova; - unsigned long locked, lock_limit, vaddr = map->vaddr; + dma_addr_t end, iova; + unsigned long vaddr = map->vaddr; size_t size = map->size; + long npage; int ret = 0, prot = 0; uint64_t mask; - long npage; + + end = map->iova + map->size; mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; @@ -430,104 +559,144 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, if (!prot) return -EINVAL; /* No READ/WRITE? */ + if (iommu->cache) + prot |= IOMMU_CACHE; + if (vaddr & mask) return -EINVAL; - if (iova & mask) + if (map->iova & mask) return -EINVAL; - if (size & mask) + if (!map->size || map->size & mask) return -EINVAL; - /* XXX We still break these down into PAGE_SIZE */ WARN_ON(mask & PAGE_MASK); /* Don't allow IOVA wrap */ - if (iova + size && iova + size < iova) + if (end && end < map->iova) return -EINVAL; /* Don't allow virtual address wrap */ - if (vaddr + size && vaddr + size < vaddr) - return -EINVAL; - - npage = size >> PAGE_SHIFT; - if (!npage) + if (vaddr + map->size && vaddr + map->size < vaddr) return -EINVAL; mutex_lock(&iommu->lock); - if (vfio_find_dma(iommu, iova, size)) { - ret = -EBUSY; - goto out_lock; + if (vfio_find_dma(iommu, map->iova, map->size)) { + mutex_unlock(&iommu->lock); + return -EEXIST; } - /* account for locked pages */ - locked = current->mm->locked_vm + npage; - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { - pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", - __func__, rlimit(RLIMIT_MEMLOCK)); - ret = -ENOMEM; - goto out_lock; - } + for (iova = map->iova; iova < end; iova += size, vaddr += size) { + struct vfio_dma *dma = NULL; + unsigned long pfn; + long i; + + /* Pin a contiguous chunk of memory */ + npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT, + prot, &pfn); + if (npage <= 0) { + WARN_ON(!npage); + ret = (int)npage; + break; + } - ret = __vfio_dma_map(iommu, iova, vaddr, npage, prot); - if (ret) - goto out_lock; + /* Verify pages are not already mapped */ + for (i = 0; i < npage; i++) { + if (iommu_iova_to_phys(iommu->domain, + iova + (i << PAGE_SHIFT))) { + vfio_unpin_pages(pfn, npage, prot, true); + ret = -EBUSY; + break; + } + } - /* Check if we abut a region below - nothing below 0 */ - if (iova) { - dma = vfio_find_dma(iommu, iova - 1, 1); - if (dma && dma->prot == prot && - dma->vaddr + NPAGE_TO_SIZE(dma->npage) == vaddr) { + ret = iommu_map(iommu->domain, iova, + (phys_addr_t)pfn << PAGE_SHIFT, + npage << PAGE_SHIFT, prot); + if (ret) { + if (ret != -EBUSY || + map_try_harder(iommu, iova, pfn, npage, prot)) { + vfio_unpin_pages(pfn, npage, prot, true); + break; + } + } - dma->npage += npage; - iova = dma->iova; - vaddr = dma->vaddr; - npage = dma->npage; - size = NPAGE_TO_SIZE(npage); + size = npage << PAGE_SHIFT; - pdma = dma; + /* + * Check if we abut a region below - nothing below 0. + * This is the most likely case when mapping chunks of + * physically contiguous regions within a virtual address + * range. Update the abutting entry in place since iova + * doesn't change. + */ + if (likely(iova)) { + struct vfio_dma *tmp; + tmp = vfio_find_dma(iommu, iova - 1, 1); + if (tmp && tmp->prot == prot && + tmp->vaddr + tmp->size == vaddr) { + tmp->size += size; + iova = tmp->iova; + size = tmp->size; + vaddr = tmp->vaddr; + dma = tmp; + } + } + + /* + * Check if we abut a region above - nothing above ~0 + 1. + * If we abut above and below, remove and free. If only + * abut above, remove, modify, reinsert. + */ + if (likely(iova + size)) { + struct vfio_dma *tmp; + tmp = vfio_find_dma(iommu, iova + size, 1); + if (tmp && tmp->prot == prot && + tmp->vaddr == vaddr + size) { + vfio_remove_dma(iommu, tmp); + if (dma) { + dma->size += tmp->size; + kfree(tmp); + } else { + size += tmp->size; + tmp->size = size; + tmp->iova = iova; + tmp->vaddr = vaddr; + vfio_insert_dma(iommu, tmp); + dma = tmp; + } + } } - } - /* Check if we abut a region above - nothing above ~0 + 1 */ - if (iova + size) { - dma = vfio_find_dma(iommu, iova + size, 1); - if (dma && dma->prot == prot && - dma->vaddr == vaddr + size) { + if (!dma) { + dma = kzalloc(sizeof(*dma), GFP_KERNEL); + if (!dma) { + iommu_unmap(iommu->domain, iova, size); + vfio_unpin_pages(pfn, npage, prot, true); + ret = -ENOMEM; + break; + } - dma->npage += npage; + dma->size = size; dma->iova = iova; dma->vaddr = vaddr; - - /* - * If merged above and below, remove previously - * merged entry. New entry covers it. - */ - if (pdma) { - list_del(&pdma->next); - kfree(pdma); - } - pdma = dma; + dma->prot = prot; + vfio_insert_dma(iommu, dma); } } - /* Isolated, new region */ - if (!pdma) { - dma = kzalloc(sizeof *dma, GFP_KERNEL); - if (!dma) { - ret = -ENOMEM; - vfio_dma_unmap(iommu, iova, npage, prot); - goto out_lock; + if (ret) { + struct vfio_dma *tmp; + iova = map->iova; + size = map->size; + while ((tmp = vfio_find_dma(iommu, iova, size))) { + int r = vfio_remove_dma_overlap(iommu, iova, + &size, tmp); + if (WARN_ON(r || !size)) + break; } - - dma->npage = npage; - dma->iova = iova; - dma->vaddr = vaddr; - dma->prot = prot; - list_add(&dma->next, &iommu->dma_list); } -out_lock: mutex_unlock(&iommu->lock); return ret; } @@ -606,7 +775,7 @@ static void *vfio_iommu_type1_open(unsigned long arg) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&iommu->group_list); - INIT_LIST_HEAD(&iommu->dma_list); + iommu->dma_list = RB_ROOT; mutex_init(&iommu->lock); /* @@ -640,7 +809,7 @@ static void vfio_iommu_type1_release(void *iommu_data) { struct vfio_iommu *iommu = iommu_data; struct vfio_group *group, *group_tmp; - struct vfio_dma *dma, *dma_tmp; + struct rb_node *node; list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) { iommu_detach_group(iommu->domain, group->iommu_group); @@ -648,10 +817,12 @@ static void vfio_iommu_type1_release(void *iommu_data) kfree(group); } - list_for_each_entry_safe(dma, dma_tmp, &iommu->dma_list, next) { - vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot); - list_del(&dma->next); - kfree(dma); + while ((node = rb_first(&iommu->dma_list))) { + struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); + size_t size = dma->size; + vfio_remove_dma_overlap(iommu, dma->iova, &size, dma); + if (WARN_ON(!size)) + break; } iommu_domain_free(iommu->domain); @@ -706,6 +877,7 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, } else if (cmd == VFIO_IOMMU_UNMAP_DMA) { struct vfio_iommu_type1_dma_unmap unmap; + long ret; minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); @@ -715,7 +887,11 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, if (unmap.argsz < minsz || unmap.flags) return -EINVAL; - return vfio_dma_do_unmap(iommu, &unmap); + ret = vfio_dma_do_unmap(iommu, &unmap); + if (ret) + return ret; + + return copy_to_user((void __user *)arg, &unmap, minsz); } return -ENOTTY; |