summaryrefslogtreecommitdiffstats
path: root/drivers/vfio
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/vfio')
-rw-r--r--drivers/vfio/Kconfig1
-rw-r--r--drivers/vfio/Makefile1
-rw-r--r--drivers/vfio/mdev/Kconfig17
-rw-r--r--drivers/vfio/mdev/Makefile5
-rw-r--r--drivers/vfio/mdev/mdev_core.c385
-rw-r--r--drivers/vfio/mdev/mdev_driver.c119
-rw-r--r--drivers/vfio/mdev/mdev_private.h41
-rw-r--r--drivers/vfio/mdev/mdev_sysfs.c286
-rw-r--r--drivers/vfio/mdev/vfio_mdev.c148
-rw-r--r--drivers/vfio/pci/vfio_pci.c78
-rw-r--r--drivers/vfio/pci/vfio_pci_config.c10
-rw-r--r--drivers/vfio/pci/vfio_pci_intrs.c2
-rw-r--r--drivers/vfio/platform/vfio_platform_common.c31
-rw-r--r--drivers/vfio/vfio.c461
-rw-r--r--drivers/vfio/vfio_iommu_type1.c885
15 files changed, 2212 insertions, 258 deletions
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index da6e2ce77495..23eced02aaf6 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -48,4 +48,5 @@ menuconfig VFIO_NOIOMMU
source "drivers/vfio/pci/Kconfig"
source "drivers/vfio/platform/Kconfig"
+source "drivers/vfio/mdev/Kconfig"
source "virt/lib/Kconfig"
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 7b8a31f63fea..4a23c13b6be4 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -7,3 +7,4 @@ obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o
obj-$(CONFIG_VFIO_PCI) += pci/
obj-$(CONFIG_VFIO_PLATFORM) += platform/
+obj-$(CONFIG_VFIO_MDEV) += mdev/
diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig
new file mode 100644
index 000000000000..14fdb106a827
--- /dev/null
+++ b/drivers/vfio/mdev/Kconfig
@@ -0,0 +1,17 @@
+
+config VFIO_MDEV
+ tristate "Mediated device driver framework"
+ depends on VFIO
+ default n
+ help
+ Provides a framework to virtualize devices.
+ See Documentation/vfio-mediated-device.txt for more details.
+
+ If you don't know what to do here, say N.
+
+config VFIO_MDEV_DEVICE
+ tristate "VFIO driver for Mediated devices"
+ depends on VFIO && VFIO_MDEV
+ default n
+ help
+ VFIO based driver for Mediated devices.
diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile
new file mode 100644
index 000000000000..fa2d5ea466ee
--- /dev/null
+++ b/drivers/vfio/mdev/Makefile
@@ -0,0 +1,5 @@
+
+mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o
+
+obj-$(CONFIG_VFIO_MDEV) += mdev.o
+obj-$(CONFIG_VFIO_MDEV_DEVICE) += vfio_mdev.o
diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
new file mode 100644
index 000000000000..be1ee89ee917
--- /dev/null
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -0,0 +1,385 @@
+/*
+ * Mediated device Core Driver
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ * Author: Neo Jia <cjia@nvidia.com>
+ * Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/uuid.h>
+#include <linux/sysfs.h>
+#include <linux/mdev.h>
+
+#include "mdev_private.h"
+
+#define DRIVER_VERSION "0.1"
+#define DRIVER_AUTHOR "NVIDIA Corporation"
+#define DRIVER_DESC "Mediated device Core Driver"
+
+static LIST_HEAD(parent_list);
+static DEFINE_MUTEX(parent_list_lock);
+static struct class_compat *mdev_bus_compat_class;
+
+/*
+ * Match callback handed to device_find_child(): report a hit (1) when @dev
+ * is a mediated device whose UUID equals the uuid_le that @data points to,
+ * otherwise 0.
+ */
+static int _find_mdev_device(struct device *dev, void *data)
+{
+	const uuid_le *wanted = data;
+
+	if (!dev_is_mdev(dev))
+		return 0;
+
+	return uuid_le_cmp(to_mdev_device(dev)->uuid, *wanted) == 0;
+}
+
+/*
+ * Tell whether the device behind @parent already has a mediated child
+ * carrying @uuid.
+ */
+static bool mdev_device_exist(struct parent_device *parent, uuid_le uuid)
+{
+	struct device *child;
+
+	child = device_find_child(parent->dev, &uuid, _find_mdev_device);
+	if (!child)
+		return false;
+
+	/* Only existence matters here, so release the found device again. */
+	put_device(child);
+	return true;
+}
+
+/*
+ * Look up the parent_list entry registered for @dev, or NULL if there is none.
+ * Should be called holding parent_list_lock.
+ */
+static struct parent_device *__find_parent_device(struct device *dev)
+{
+	struct parent_device *entry, *found = NULL;
+
+	list_for_each_entry(entry, &parent_list, next) {
+		if (entry->dev == dev) {
+			found = entry;
+			break;
+		}
+	}
+
+	return found;
+}
+
+/*
+ * kref release callback for a parent_device: free the record and drop the
+ * reference it held on the underlying struct device.
+ */
+static void mdev_release_parent(struct kref *kref)
+{
+	struct parent_device *parent;
+	struct device *dev;
+
+	parent = container_of(kref, struct parent_device, ref);
+
+	/* Remember the device first; @parent is gone after kfree(). */
+	dev = parent->dev;
+
+	kfree(parent);
+	put_device(dev);
+}
+
+/* Take a reference on @parent (NULL is tolerated) and hand @parent back. */
+static
+inline struct parent_device *mdev_get_parent(struct parent_device *parent)
+{
+	if (!parent)
+		return NULL;
+
+	kref_get(&parent->ref);
+	return parent;
+}
+
+/*
+ * Drop a reference on @parent (NULL is tolerated); mdev_release_parent() is
+ * the kref release function.
+ */
+static inline void mdev_put_parent(struct parent_device *parent)
+{
+	if (!parent)
+		return;
+
+	kref_put(&parent->ref, mdev_release_parent);
+}
+
+/*
+ * Run the vendor create() callback for @mdev and then publish the vendor's
+ * per-mdev attribute groups. If the groups cannot be created the vendor
+ * remove() callback is invoked to undo create().
+ * Returns 0 on success or the error from whichever step failed.
+ */
+static int mdev_device_create_ops(struct kobject *kobj,
+				  struct mdev_device *mdev)
+{
+	const struct parent_ops *ops = mdev->parent->ops;
+	int err;
+
+	err = ops->create(kobj, mdev);
+	if (err)
+		return err;
+
+	err = sysfs_create_groups(&mdev->dev.kobj, ops->mdev_attr_groups);
+	if (!err)
+		return 0;
+
+	ops->remove(mdev);
+	return err;
+}
+
+/*
+ * mdev_device_remove_ops is reached from the sysfs 'remove' attribute and
+ * from parent device unregistration.
+ * - From sysfs, 'force_remove' is 'false': the vendor driver may refuse with
+ *   an error while a VMM or userspace application still uses the mdev device,
+ *   in which case nothing is removed and -EBUSY is returned.
+ * - From mdev_unregister_device(), 'force_remove' is 'true': the parent is
+ *   leaving the framework, so a vendor error is ignored and removal proceeds.
+ */
+static int mdev_device_remove_ops(struct mdev_device *mdev, bool force_remove)
+{
+	const struct parent_ops *ops = mdev->parent->ops;
+
+	/* The vendor callback always runs; only its verdict may be ignored. */
+	if (ops->remove(mdev) && !force_remove)
+		return -EBUSY;
+
+	sysfs_remove_groups(&mdev->dev.kobj, ops->mdev_attr_groups);
+	return 0;
+}
+
+/*
+ * device_for_each_child() callback: remove @dev if it is a mediated device.
+ * @data optionally points to a bool carrying force_remove; without it the
+ * removal is forced.
+ */
+static int mdev_device_remove_cb(struct device *dev, void *data)
+{
+	bool force = true;
+
+	if (!dev_is_mdev(dev))
+		return 0;
+
+	if (data)
+		force = *(bool *)data;
+
+	return mdev_device_remove(dev, force);
+}
+
+/*
+ * mdev_register_device : Register a device
+ * @dev: device structure representing parent device.
+ * @ops: Parent device operation structure to be registered.
+ *
+ * Add device to list of registered parent devices. A reference on @dev is
+ * taken and owned by the new parent entry; it is dropped again when the entry
+ * is released or when registration fails.
+ *
+ * Returns a negative value on error, otherwise 0:
+ *   -EINVAL  @ops lacks a mandatory callback or @dev is NULL
+ *   -EEXIST  @dev is already registered
+ *   -ENOMEM  allocation of the entry or of the compat class failed
+ *   or the error reported by parent_create_sysfs_files().
+ */
+int mdev_register_device(struct device *dev, const struct parent_ops *ops)
+{
+	int ret;
+	struct parent_device *parent;
+
+	/* check for mandatory ops */
+	if (!ops || !ops->create || !ops->remove || !ops->supported_type_groups)
+		return -EINVAL;
+
+	dev = get_device(dev);
+	if (!dev)
+		return -EINVAL;
+
+	mutex_lock(&parent_list_lock);
+
+	/* Check for duplicate */
+	parent = __find_parent_device(dev);
+	if (parent) {
+		/*
+		 * The entry found belongs to the earlier registration and must
+		 * not be touched. Forget it so the error path only drops the
+		 * device reference taken above, not a parent reference this
+		 * call never acquired.
+		 */
+		parent = NULL;
+		ret = -EEXIST;
+		goto add_dev_err;
+	}
+
+	parent = kzalloc(sizeof(*parent), GFP_KERNEL);
+	if (!parent) {
+		ret = -ENOMEM;
+		goto add_dev_err;
+	}
+
+	kref_init(&parent->ref);
+	mutex_init(&parent->lock);
+
+	/* From here on the device reference is owned by @parent. */
+	parent->dev = dev;
+	parent->ops = ops;
+
+	/* The compat class is created lazily by the first registration. */
+	if (!mdev_bus_compat_class) {
+		mdev_bus_compat_class = class_compat_register("mdev_bus");
+		if (!mdev_bus_compat_class) {
+			ret = -ENOMEM;
+			goto add_dev_err;
+		}
+	}
+
+	ret = parent_create_sysfs_files(parent);
+	if (ret)
+		goto add_dev_err;
+
+	/* A missing compat link is only worth a warning, not a failure. */
+	ret = class_compat_create_link(mdev_bus_compat_class, dev, NULL);
+	if (ret)
+		dev_warn(dev, "Failed to create compatibility class link\n");
+
+	list_add(&parent->next, &parent_list);
+	mutex_unlock(&parent_list_lock);
+
+	dev_info(dev, "MDEV: Registered\n");
+	return 0;
+
+add_dev_err:
+	mutex_unlock(&parent_list_lock);
+	/*
+	 * A non-NULL @parent here is always one allocated by this call; its
+	 * release function also drops the device reference.
+	 */
+	if (parent)
+		mdev_put_parent(parent);
+	else
+		put_device(dev);
+	return ret;
+}
+EXPORT_SYMBOL(mdev_register_device);
+
+/*
+ * mdev_unregister_device : Unregister a parent device
+ * @dev: device structure representing parent device.
+ *
+ * Take the device off the list of registered parent devices, force-remove
+ * every mediated device still hanging off it and tear down its sysfs files.
+ * Nothing happens if @dev was never registered.
+ */
+void mdev_unregister_device(struct device *dev)
+{
+	struct parent_device *parent;
+	bool force = true;
+
+	mutex_lock(&parent_list_lock);
+
+	parent = __find_parent_device(dev);
+	if (parent) {
+		dev_info(dev, "MDEV: Unregistering\n");
+
+		list_del(&parent->next);
+		class_compat_remove_link(mdev_bus_compat_class, dev, NULL);
+
+		device_for_each_child(dev, &force, mdev_device_remove_cb);
+
+		parent_remove_sysfs_files(parent);
+	}
+
+	mutex_unlock(&parent_list_lock);
+
+	/* No-op when no entry was found (mdev_put_parent() accepts NULL). */
+	mdev_put_parent(parent);
+}
+EXPORT_SYMBOL(mdev_unregister_device);
+
+/* struct device release callback: free the mdev_device embedding @dev. */
+static void mdev_device_release(struct device *dev)
+{
+	struct mdev_device *mdev;
+
+	mdev = to_mdev_device(dev);
+	dev_dbg(&mdev->dev, "MDEV: destroying\n");
+	kfree(mdev);
+}
+
+/*
+ * mdev_device_create : Create a mediated device
+ * @kobj: kobject embedded in the mdev_type the device is created for.
+ * @dev: device that becomes the parent of the new mdev device.
+ * @uuid: UUID identifying the new device; also used as its device name.
+ *
+ * Runs under parent->lock. On success a reference on the parent_device stays
+ * held for the lifetime of the mdev device (mdev_device_remove() drops it).
+ *
+ * Returns 0 on success, -EINVAL if the type has no parent, -EEXIST if a
+ * device with @uuid already exists, -ENOMEM on allocation failure, or the
+ * error of the registration/vendor/sysfs step that failed.
+ */
+int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid)
+{
+ int ret;
+ struct mdev_device *mdev;
+ struct parent_device *parent;
+ struct mdev_type *type = to_mdev_type(kobj);
+
+ /* mdev_get_parent() returns NULL when type->parent is not set. */
+ parent = mdev_get_parent(type->parent);
+ if (!parent)
+ return -EINVAL;
+
+ mutex_lock(&parent->lock);
+
+ /* Check for duplicate */
+ if (mdev_device_exist(parent, uuid)) {
+ ret = -EEXIST;
+ goto create_err;
+ }
+
+ mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
+ if (!mdev) {
+ ret = -ENOMEM;
+ goto create_err;
+ }
+
+ memcpy(&mdev->uuid, &uuid, sizeof(uuid_le));
+ mdev->parent = parent;
+ kref_init(&mdev->ref);
+
+ mdev->dev.parent = dev;
+ mdev->dev.bus = &mdev_bus_type;
+ mdev->dev.release = mdev_device_release;
+ dev_set_name(&mdev->dev, "%pUl", uuid.b);
+
+ ret = device_register(&mdev->dev);
+ if (ret) {
+ /* mdev is not kfree()d directly; the release callback set above does it. */
+ put_device(&mdev->dev);
+ goto create_err;
+ }
+
+ ret = mdev_device_create_ops(kobj, mdev);
+ if (ret)
+ goto create_failed;
+
+ ret = mdev_create_sysfs_files(&mdev->dev, type);
+ if (ret) {
+ /* Undo the vendor create() before unregistering the device. */
+ mdev_device_remove_ops(mdev, true);
+ goto create_failed;
+ }
+
+ mdev->type_kobj = kobj;
+ dev_dbg(&mdev->dev, "MDEV: created\n");
+
+ /* Success: the parent reference taken above is intentionally kept. */
+ mutex_unlock(&parent->lock);
+ return ret;
+
+create_failed:
+ device_unregister(&mdev->dev);
+
+create_err:
+ mutex_unlock(&parent->lock);
+ mdev_put_parent(parent);
+ return ret;
+}
+
+/*
+ * mdev_device_remove : Remove a mediated device
+ * @dev: the struct device embedded in the mdev device to remove.
+ * @force_remove: passed on to mdev_device_remove_ops(); when false a vendor
+ *                refusal aborts the removal.
+ *
+ * Returns 0 on success or the error from mdev_device_remove_ops(), in which
+ * case the device is left in place.
+ */
+int mdev_device_remove(struct device *dev, bool force_remove)
+{
+ struct mdev_device *mdev;
+ struct parent_device *parent;
+ struct mdev_type *type;
+ int ret;
+
+ mdev = to_mdev_device(dev);
+ type = to_mdev_type(mdev->type_kobj);
+ /* Keep the parent in a local; it is still needed after device_unregister(). */
+ parent = mdev->parent;
+ mutex_lock(&parent->lock);
+
+ ret = mdev_device_remove_ops(mdev, force_remove);
+ if (ret) {
+ mutex_unlock(&parent->lock);
+ return ret;
+ }
+
+ mdev_remove_sysfs_files(dev, type);
+ device_unregister(dev);
+ mutex_unlock(&parent->lock);
+ /* Drops the parent reference that mdev_device_create() kept on success. */
+ mdev_put_parent(parent);
+ return ret;
+}
+
+/* Module init: register the mdev bus and ask for vfio_mdev to be loaded. */
+static int __init mdev_init(void)
+{
+	int ret = mdev_bus_register();
+
+	if (ret)
+		return ret;
+
+	/*
+	 * Attempt to load known vfio_mdev. This gives us a working environment
+	 * without the user needing to explicitly load vfio_mdev driver.
+	 */
+	request_module_nowait("vfio_mdev");
+
+	return 0;
+}
+
+/*
+ * Module exit: unregister the compat class if a registration ever created
+ * it, then unregister the mdev bus.
+ */
+static void __exit mdev_exit(void)
+{
+	struct class_compat *compat = mdev_bus_compat_class;
+
+	if (compat)
+		class_compat_unregister(compat);
+
+	mdev_bus_unregister();
+}
+
+module_init(mdev_init)
+module_exit(mdev_exit)
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c
new file mode 100644
index 000000000000..6f0391f6f9b6
--- /dev/null
+++ b/drivers/vfio/mdev/mdev_driver.c
@@ -0,0 +1,119 @@
+/*
+ * MDEV driver
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ * Author: Neo Jia <cjia@nvidia.com>
+ * Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/device.h>
+#include <linux/iommu.h>
+#include <linux/mdev.h>
+
+#include "mdev_private.h"
+
+/*
+ * Allocate a fresh IOMMU group and add @mdev's device to it.
+ * Returns 0 on success, the PTR_ERR() of a failed allocation, or the error
+ * from iommu_group_add_device().
+ */
+static int mdev_attach_iommu(struct mdev_device *mdev)
+{
+	struct iommu_group *grp = iommu_group_alloc();
+	int err;
+
+	if (IS_ERR(grp))
+		return PTR_ERR(grp);
+
+	err = iommu_group_add_device(grp, &mdev->dev);
+	if (err == 0)
+		dev_info(&mdev->dev, "MDEV: group_id = %d\n",
+			 iommu_group_id(grp));
+
+	/* This function's own group reference is dropped on both outcomes. */
+	iommu_group_put(grp);
+	return err;
+}
+
+/* Remove @mdev's device from its IOMMU group and log the detach. */
+static void mdev_detach_iommu(struct mdev_device *mdev)
+{
+ iommu_group_remove_device(&mdev->dev);
+ dev_info(&mdev->dev, "MDEV: detaching iommu\n");
+}
+
+/*
+ * Bus probe hook: put the device into a new IOMMU group, then hand it to the
+ * bound mdev driver's probe(), if it has one. A failing driver probe undoes
+ * the IOMMU attachment.
+ */
+static int mdev_probe(struct device *dev)
+{
+	struct mdev_driver *drv = to_mdev_driver(dev->driver);
+	struct mdev_device *mdev = to_mdev_device(dev);
+	int ret;
+
+	ret = mdev_attach_iommu(mdev);
+	if (ret)
+		return ret;
+
+	/* Nothing more to do without a driver-level probe callback. */
+	if (!drv || !drv->probe)
+		return 0;
+
+	ret = drv->probe(dev);
+	if (ret)
+		mdev_detach_iommu(mdev);
+
+	return ret;
+}
+
+/*
+ * Bus remove hook: let the bound mdev driver clean up first, then take the
+ * device out of its IOMMU group. Always returns 0.
+ */
+static int mdev_remove(struct device *dev)
+{
+	struct mdev_driver *drv = to_mdev_driver(dev->driver);
+
+	if (drv && drv->remove)
+		drv->remove(dev);
+
+	mdev_detach_iommu(to_mdev_device(dev));
+	return 0;
+}
+
+/*
+ * The "mdev" bus. mdev_probe()/mdev_remove() are installed as its probe and
+ * remove hooks; the symbol is exported (GPL-only) for other modules.
+ */
+struct bus_type mdev_bus_type = {
+ .name = "mdev",
+ .probe = mdev_probe,
+ .remove = mdev_remove,
+};
+EXPORT_SYMBOL_GPL(mdev_bus_type);
+
+/**
+ * mdev_register_driver - register a new MDEV driver
+ * @drv: the driver to register
+ * @owner: module owner of driver to be registered
+ *
+ * Fills in the name, bus and owner of the embedded driver structure from
+ * @drv and @owner before registering it.
+ *
+ * Returns a negative value on error, otherwise 0.
+ **/
+int mdev_register_driver(struct mdev_driver *drv, struct module *owner)
+{
+	struct device_driver *core = &drv->driver;
+
+	/* initialize common driver fields */
+	core->name = drv->name;
+	core->bus = &mdev_bus_type;
+	core->owner = owner;
+
+	/* register with core */
+	return driver_register(core);
+}
+EXPORT_SYMBOL(mdev_register_driver);
+
+/*
+ * mdev_unregister_driver - unregister MDEV driver
+ * @drv: the driver to unregister
+ *
+ * Passes the driver structure embedded in @drv to driver_unregister().
+ */
+void mdev_unregister_driver(struct mdev_driver *drv)
+{
+ driver_unregister(&drv->driver);
+}
+EXPORT_SYMBOL(mdev_unregister_driver);
+
+/* Register mdev_bus_type; returns the result of bus_register(). */
+int mdev_bus_register(void)
+{
+ return bus_register(&mdev_bus_type);
+}
+
+/* Unregister mdev_bus_type again. */
+void mdev_bus_unregister(void)
+{
+ bus_unregister(&mdev_bus_type);
+}
diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
new file mode 100644
index 000000000000..d35097cbf3d7
--- /dev/null
+++ b/drivers/vfio/mdev/mdev_private.h
@@ -0,0 +1,41 @@
+/*
+ * Mediated device internal definitions
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ * Author: Neo Jia <cjia@nvidia.com>
+ * Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef MDEV_PRIVATE_H
+#define MDEV_PRIVATE_H
+
+int mdev_bus_register(void);
+void mdev_bus_unregister(void);
+
+/* One supported mediated device type exposed below a parent device. */
+struct mdev_type {
+ /* Embedded kobject; to_mdev_type() maps it back to this structure. */
+ struct kobject kobj;
+ /* Child kobject named "devices" that collects links to created mdevs. */
+ struct kobject *devices_kobj;
+ /* Parent device this type was registered for. */
+ struct parent_device *parent;
+ /* Entry in the parent's type_list. */
+ struct list_head next;
+ /* Vendor attribute group whose attrs are created on kobj. */
+ struct attribute_group *group;
+};
+
+/* Map a struct attribute pointer to the mdev_type_attribute embedding it. */
+#define to_mdev_type_attr(_attr) \
+ container_of(_attr, struct mdev_type_attribute, attr)
+/* Map a struct kobject pointer to the mdev_type embedding it. */
+#define to_mdev_type(_kobj) \
+ container_of(_kobj, struct mdev_type, kobj)
+
+int parent_create_sysfs_files(struct parent_device *parent);
+void parent_remove_sysfs_files(struct parent_device *parent);
+
+int mdev_create_sysfs_files(struct device *dev, struct mdev_type *type);
+void mdev_remove_sysfs_files(struct device *dev, struct mdev_type *type);
+
+int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid);
+int mdev_device_remove(struct device *dev, bool force_remove);
+
+#endif /* MDEV_PRIVATE_H */
diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c
new file mode 100644
index 000000000000..1a53deb2ee10
--- /dev/null
+++ b/drivers/vfio/mdev/mdev_sysfs.c
@@ -0,0 +1,286 @@
+/*
+ * File attributes for Mediated devices
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ * Author: Neo Jia <cjia@nvidia.com>
+ * Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/sysfs.h>
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/uuid.h>
+#include <linux/mdev.h>
+
+#include "mdev_private.h"
+
+/* Static functions */
+
+/*
+ * sysfs show dispatcher for mdev type attributes: forward to the attribute's
+ * own show() together with the parent device, or fail with -EIO when the
+ * attribute has no show().
+ */
+static ssize_t mdev_type_attr_show(struct kobject *kobj,
+				   struct attribute *__attr, char *buf)
+{
+	struct mdev_type_attribute *attr = to_mdev_type_attr(__attr);
+	struct mdev_type *type = to_mdev_type(kobj);
+
+	if (!attr->show)
+		return -EIO;
+
+	return attr->show(kobj, type->parent->dev, buf);
+}
+
+/*
+ * sysfs store dispatcher for mdev type attributes: forward to the attribute's
+ * own store() together with the parent device, or fail with -EIO when the
+ * attribute has no store().
+ */
+static ssize_t mdev_type_attr_store(struct kobject *kobj,
+				    struct attribute *__attr,
+				    const char *buf, size_t count)
+{
+	struct mdev_type_attribute *attr = to_mdev_type_attr(__attr);
+	struct mdev_type *type = to_mdev_type(kobj);
+
+	if (!attr->store)
+		return -EIO;
+
+	return attr->store(&type->kobj, type->parent->dev, buf, count);
+}
+
+/* sysfs_ops used by mdev_type_ktype; both hooks dispatch per attribute. */
+static const struct sysfs_ops mdev_type_sysfs_ops = {
+ .show = mdev_type_attr_show,
+ .store = mdev_type_attr_store,
+};
+
+/*
+ * Store handler of the per-type 'create' attribute: parse a UUID string from
+ * @buf and create a mediated device of the type behind @kobj under @dev.
+ * Input must be exactly UUID_STRING_LEN characters, optionally followed by
+ * one more character. Returns @count on success or a negative error.
+ */
+static ssize_t create_store(struct kobject *kobj, struct device *dev,
+			    const char *buf, size_t count)
+{
+	uuid_le uuid;
+	char *copy;
+	int ret;
+
+	if (count != UUID_STRING_LEN && count != UUID_STRING_LEN + 1)
+		return -EINVAL;
+
+	copy = kstrndup(buf, count, GFP_KERNEL);
+	if (!copy)
+		return -ENOMEM;
+
+	ret = uuid_le_to_bin(copy, &uuid);
+	kfree(copy);
+	if (!ret)
+		ret = mdev_device_create(kobj, dev, uuid);
+
+	if (ret)
+		return ret;
+
+	return count;
+}
+
+MDEV_TYPE_ATTR_WO(create);
+
+/* kobject release callback of mdev_type_ktype: free the enclosing mdev_type. */
+static void mdev_type_release(struct kobject *kobj)
+{
+	pr_debug("Releasing group %s\n", kobj->name);
+	kfree(to_mdev_type(kobj));
+}
+
+/*
+ * kobj_type of every mdev_type kobject: attribute access goes through
+ * mdev_type_sysfs_ops and mdev_type_release() frees the structure.
+ */
+static struct kobj_type mdev_type_ktype = {
+ .sysfs_ops = &mdev_type_sysfs_ops,
+ .release = mdev_type_release,
+};
+
+/*
+ * add_mdev_supported_type : Publish one supported type of a parent device
+ * @parent: parent device the type belongs to.
+ * @group: vendor attribute group describing the type; @group->name must be
+ *         set and becomes part of the kobject name.
+ *
+ * Creates the "<driver>-<group name>" kobject in the parent's
+ * mdev_supported_types kset together with its 'create' attribute, its
+ * "devices" directory and the vendor attributes from @group.
+ *
+ * Returns the new mdev_type, or an ERR_PTR() on failure.
+ */
+struct mdev_type *add_mdev_supported_type(struct parent_device *parent,
+					  struct attribute_group *group)
+{
+	struct mdev_type *type;
+	int ret;
+
+	if (!group->name) {
+		pr_err("%s: Type name empty!\n", __func__);
+		return ERR_PTR(-EINVAL);
+	}
+
+	type = kzalloc(sizeof(*type), GFP_KERNEL);
+	if (!type)
+		return ERR_PTR(-ENOMEM);
+
+	type->kobj.kset = parent->mdev_types_kset;
+	/*
+	 * Fill these in before anything becomes visible in sysfs: the
+	 * attribute show/store dispatchers dereference type->parent as soon
+	 * as a file can be opened.
+	 */
+	type->group = group;
+	type->parent = parent;
+
+	ret = kobject_init_and_add(&type->kobj, &mdev_type_ktype, NULL,
+				   "%s-%s", dev_driver_string(parent->dev),
+				   group->name);
+	if (ret) {
+		/*
+		 * The kobject is initialised even when adding it failed, so it
+		 * has to be released with kobject_put(); that frees the name
+		 * and, through mdev_type_release(), @type itself.
+		 */
+		kobject_put(&type->kobj);
+		return ERR_PTR(ret);
+	}
+
+	ret = sysfs_create_file(&type->kobj, &mdev_type_attr_create.attr);
+	if (ret)
+		goto attr_create_failed;
+
+	type->devices_kobj = kobject_create_and_add("devices", &type->kobj);
+	if (!type->devices_kobj) {
+		ret = -ENOMEM;
+		goto attr_devices_failed;
+	}
+
+	/* Report the real error instead of overriding it with -ENOMEM. */
+	ret = sysfs_create_files(&type->kobj,
+				 (const struct attribute **)group->attrs);
+	if (ret)
+		goto attrs_failed;
+
+	return type;
+
+attrs_failed:
+	kobject_put(type->devices_kobj);
+attr_devices_failed:
+	sysfs_remove_file(&type->kobj, &mdev_type_attr_create.attr);
+attr_create_failed:
+	kobject_del(&type->kobj);
+	kobject_put(&type->kobj);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Tear down one supported type, undoing add_mdev_supported_type() in reverse
+ * order: vendor attributes, "devices" kobject, 'create' attribute, and
+ * finally the type kobject itself.
+ */
+static void remove_mdev_supported_type(struct mdev_type *type)
+{
+ sysfs_remove_files(&type->kobj,
+ (const struct attribute **)type->group->attrs);
+ kobject_put(type->devices_kobj);
+ sysfs_remove_file(&type->kobj, &mdev_type_attr_create.attr);
+ kobject_del(&type->kobj);
+ /* @type is freed by mdev_type_release() once the kobject is released. */
+ kobject_put(&type->kobj);
+}
+
+/*
+ * Publish every entry of the NULL-terminated supported_type_groups array of
+ * @parent and queue the resulting types on parent->type_list. If one entry
+ * fails, all types queued so far are removed again and its error is returned.
+ */
+static int add_mdev_supported_type_groups(struct parent_device *parent)
+{
+	struct mdev_type *type, *pos, *n;
+	int i = 0;
+
+	while (parent->ops->supported_type_groups[i]) {
+		type = add_mdev_supported_type(parent,
+					parent->ops->supported_type_groups[i]);
+		if (IS_ERR(type))
+			goto unwind;
+
+		list_add(&type->next, &parent->type_list);
+		i++;
+	}
+
+	return 0;
+
+unwind:
+	list_for_each_entry_safe(pos, n, &parent->type_list, next) {
+		list_del(&pos->next);
+		remove_mdev_supported_type(pos);
+	}
+
+	return PTR_ERR(type);
+}
+
+/* mdev sysfs functions */
+/*
+ * Remove everything parent_create_sysfs_files() set up for @parent: all
+ * supported types, the vendor's parent attribute groups and the
+ * mdev_supported_types kset.
+ */
+void parent_remove_sysfs_files(struct parent_device *parent)
+{
+	struct mdev_type *type;
+
+	while (!list_empty(&parent->type_list)) {
+		type = list_first_entry(&parent->type_list, struct mdev_type,
+					next);
+		list_del(&type->next);
+		remove_mdev_supported_type(type);
+	}
+
+	sysfs_remove_groups(&parent->dev->kobj, parent->ops->dev_attr_groups);
+	kset_unregister(parent->mdev_types_kset);
+}
+
+/*
+ * Create the sysfs presence of a parent device: the "mdev_supported_types"
+ * kset below the device, the vendor's parent attribute groups and one entry
+ * per supported type. Also initialises parent->type_list.
+ * Returns 0 on success; on failure everything created here is removed again
+ * and a negative error is returned.
+ */
+int parent_create_sysfs_files(struct parent_device *parent)
+{
+	struct kobject *pkobj = &parent->dev->kobj;
+	int ret;
+
+	parent->mdev_types_kset = kset_create_and_add("mdev_supported_types",
+						      NULL, pkobj);
+	if (!parent->mdev_types_kset)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&parent->type_list);
+
+	ret = sysfs_create_groups(pkobj, parent->ops->dev_attr_groups);
+	if (ret)
+		goto unregister_kset;
+
+	ret = add_mdev_supported_type_groups(parent);
+	if (ret)
+		goto remove_groups;
+
+	return 0;
+
+remove_groups:
+	sysfs_remove_groups(pkobj, parent->ops->dev_attr_groups);
+unregister_kset:
+	kset_unregister(parent->mdev_types_kset);
+	return ret;
+}
+
+/*
+ * Store handler of the per-mdev 'remove' attribute. A non-zero value removes
+ * the attribute file itself and then the mediated device (not forced). If
+ * the removal is refused, the attribute file is re-created and the error is
+ * returned. A zero value is accepted and does nothing.
+ */
+static ssize_t remove_store(struct device *dev, struct device_attribute *attr,
+			    const char *buf, size_t count)
+{
+	unsigned long val;
+	int ret;
+
+	if (kstrtoul(buf, 0, &val) < 0)
+		return -EINVAL;
+
+	if (!val || !device_remove_file_self(dev, attr))
+		return count;
+
+	ret = mdev_device_remove(dev, false);
+	if (ret) {
+		device_create_file(dev, attr);
+		return ret;
+	}
+
+	return count;
+}
+
+static DEVICE_ATTR_WO(remove);
+
+/* NULL-terminated list of attributes created on every mdev device. */
+static const struct attribute *mdev_device_attrs[] = {
+ &dev_attr_remove.attr,
+ NULL,
+};
+
+/*
+ * Create the framework-owned sysfs entries of an mdev device: its attribute
+ * files, a link to the device inside the type's "devices" directory and an
+ * "mdev_type" link from the device back to its type.
+ * Returns 0 on success; on failure the entries created so far are removed.
+ */
+int mdev_create_sysfs_files(struct device *dev, struct mdev_type *type)
+{
+	int ret;
+
+	ret = sysfs_create_files(&dev->kobj, mdev_device_attrs);
+	if (ret)
+		return ret;
+
+	ret = sysfs_create_link(type->devices_kobj, &dev->kobj, dev_name(dev));
+	if (ret)
+		goto drop_attrs;
+
+	ret = sysfs_create_link(&dev->kobj, &type->kobj, "mdev_type");
+	if (!ret)
+		return 0;
+
+	sysfs_remove_link(type->devices_kobj, dev_name(dev));
+drop_attrs:
+	sysfs_remove_files(&dev->kobj, mdev_device_attrs);
+	return ret;
+}
+
+/*
+ * Remove the entries made by mdev_create_sysfs_files(), in reverse order of
+ * their creation.
+ */
+void mdev_remove_sysfs_files(struct device *dev, struct mdev_type *type)
+{
+ sysfs_remove_link(&dev->kobj, "mdev_type");
+ sysfs_remove_link(type->devices_kobj, dev_name(dev));
+ sysfs_remove_files(&dev->kobj, mdev_device_attrs);
+}
diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c
new file mode 100644
index 000000000000..ffc36758cb84
--- /dev/null
+++ b/drivers/vfio/mdev/vfio_mdev.c
@@ -0,0 +1,148 @@
+/*
+ * VFIO based driver for Mediated device
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ * Author: Neo Jia <cjia@nvidia.com>
+ * Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/vfio.h>
+#include <linux/mdev.h>
+
+#include "mdev_private.h"
+
+#define DRIVER_VERSION "0.1"
+#define DRIVER_AUTHOR "NVIDIA Corporation"
+#define DRIVER_DESC "VFIO based driver for Mediated device"
+
+/*
+ * VFIO open hook: pin this module and forward to the parent's open().
+ * Returns -EINVAL when the parent has no open(), -ENODEV when the module
+ * reference cannot be taken, otherwise the parent's result; the module
+ * reference is dropped again if the parent's open() fails.
+ */
+static int vfio_mdev_open(void *device_data)
+{
+	struct mdev_device *mdev = device_data;
+	const struct parent_ops *ops = mdev->parent->ops;
+	int ret;
+
+	if (unlikely(!ops->open))
+		return -EINVAL;
+
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	ret = ops->open(mdev);
+	if (ret)
+		module_put(THIS_MODULE);
+
+	return ret;
+}
+
+/*
+ * VFIO release hook: forward to the parent's release(), if any, and drop the
+ * module reference taken in vfio_mdev_open().
+ */
+static void vfio_mdev_release(void *device_data)
+{
+	struct mdev_device *mdev = device_data;
+	const struct parent_ops *ops = mdev->parent->ops;
+
+	if (likely(ops->release))
+		ops->release(mdev);
+
+	module_put(THIS_MODULE);
+}
+
+/* VFIO ioctl hook: forward to the parent's ioctl(), -EINVAL if it has none. */
+static long vfio_mdev_unlocked_ioctl(void *device_data,
+				     unsigned int cmd, unsigned long arg)
+{
+	struct mdev_device *mdev = device_data;
+	const struct parent_ops *ops = mdev->parent->ops;
+
+	if (unlikely(!ops->ioctl))
+		return -EINVAL;
+
+	return ops->ioctl(mdev, cmd, arg);
+}
+
+/* VFIO read hook: forward to the parent's read(), -EINVAL if it has none. */
+static ssize_t vfio_mdev_read(void *device_data, char __user *buf,
+			      size_t count, loff_t *ppos)
+{
+	struct mdev_device *mdev = device_data;
+	const struct parent_ops *ops = mdev->parent->ops;
+
+	if (unlikely(!ops->read))
+		return -EINVAL;
+
+	return ops->read(mdev, buf, count, ppos);
+}
+
+/* VFIO write hook: forward to the parent's write(), -EINVAL if it has none. */
+static ssize_t vfio_mdev_write(void *device_data, const char __user *buf,
+			       size_t count, loff_t *ppos)
+{
+	struct mdev_device *mdev = device_data;
+	const struct parent_ops *ops = mdev->parent->ops;
+
+	if (unlikely(!ops->write))
+		return -EINVAL;
+
+	return ops->write(mdev, buf, count, ppos);
+}
+
+/* VFIO mmap hook: forward to the parent's mmap(), -EINVAL if it has none. */
+static int vfio_mdev_mmap(void *device_data, struct vm_area_struct *vma)
+{
+	struct mdev_device *mdev = device_data;
+	const struct parent_ops *ops = mdev->parent->ops;
+
+	if (unlikely(!ops->mmap))
+		return -EINVAL;
+
+	return ops->mmap(mdev, vma);
+}
+
+/*
+ * VFIO device operations for mediated devices; each hook forwards to the
+ * matching callback in the parent's ops.
+ */
+static const struct vfio_device_ops vfio_mdev_dev_ops = {
+ .name = "vfio-mdev",
+ .open = vfio_mdev_open,
+ .release = vfio_mdev_release,
+ .ioctl = vfio_mdev_unlocked_ioctl,
+ .read = vfio_mdev_read,
+ .write = vfio_mdev_write,
+ .mmap = vfio_mdev_mmap,
+};
+
+/*
+ * mdev driver probe: add @dev to its VFIO group with vfio_mdev_dev_ops,
+ * using the enclosing mdev_device as the per-device data.
+ */
+int vfio_mdev_probe(struct device *dev)
+{
+	return vfio_add_group_dev(dev, &vfio_mdev_dev_ops,
+				  to_mdev_device(dev));
+}
+
+/* mdev driver remove: take @dev out of its VFIO group again. */
+void vfio_mdev_remove(struct device *dev)
+{
+ vfio_del_group_dev(dev);
+}
+
+/* The mdev bus driver registered by this module. */
+struct mdev_driver vfio_mdev_driver = {
+ .name = "vfio_mdev",
+ .probe = vfio_mdev_probe,
+ .remove = vfio_mdev_remove,
+};
+
+/* Module init: register vfio_mdev_driver with this module as its owner. */
+static int __init vfio_mdev_init(void)
+{
+ return mdev_register_driver(&vfio_mdev_driver, THIS_MODULE);
+}
+
+/* Module exit: unregister vfio_mdev_driver. */
+static void __exit vfio_mdev_exit(void)
+{
+ mdev_unregister_driver(&vfio_mdev_driver);
+}
+
+module_init(vfio_mdev_init)
+module_exit(vfio_mdev_exit)
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index d624a527777f..dcd7c2a99618 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -558,10 +558,9 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev,
struct vfio_info_cap *caps)
{
- struct vfio_info_cap_header *header;
struct vfio_region_info_cap_sparse_mmap *sparse;
size_t end, size;
- int nr_areas = 2, i = 0;
+ int nr_areas = 2, i = 0, ret;
end = pci_resource_len(vdev->pdev, vdev->msix_bar);
@@ -572,13 +571,10 @@ static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev,
size = sizeof(*sparse) + (nr_areas * sizeof(*sparse->areas));
- header = vfio_info_cap_add(caps, size,
- VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
- if (IS_ERR(header))
- return PTR_ERR(header);
+ sparse = kzalloc(size, GFP_KERNEL);
+ if (!sparse)
+ return -ENOMEM;
- sparse = container_of(header,
- struct vfio_region_info_cap_sparse_mmap, header);
sparse->nr_areas = nr_areas;
if (vdev->msix_offset & PAGE_MASK) {
@@ -594,26 +590,11 @@ static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev,
i++;
}
- return 0;
-}
-
-static int region_type_cap(struct vfio_pci_device *vdev,
- struct vfio_info_cap *caps,
- unsigned int type, unsigned int subtype)
-{
- struct vfio_info_cap_header *header;
- struct vfio_region_info_cap_type *cap;
-
- header = vfio_info_cap_add(caps, sizeof(*cap),
- VFIO_REGION_INFO_CAP_TYPE, 1);
- if (IS_ERR(header))
- return PTR_ERR(header);
+ ret = vfio_info_add_capability(caps, VFIO_REGION_INFO_CAP_SPARSE_MMAP,
+ sparse);
+ kfree(sparse);
- cap = container_of(header, struct vfio_region_info_cap_type, header);
- cap->type = type;
- cap->subtype = subtype;
-
- return 0;
+ return ret;
}
int vfio_pci_register_dev_region(struct vfio_pci_device *vdev,
@@ -752,6 +733,9 @@ static long vfio_pci_ioctl(void *device_data,
break;
default:
+ {
+ struct vfio_region_info_cap_type cap_type;
+
if (info.index >=
VFIO_PCI_NUM_REGIONS + vdev->num_regions)
return -EINVAL;
@@ -762,11 +746,16 @@ static long vfio_pci_ioctl(void *device_data,
info.size = vdev->region[i].size;
info.flags = vdev->region[i].flags;
- ret = region_type_cap(vdev, &caps,
- vdev->region[i].type,
- vdev->region[i].subtype);
+ cap_type.type = vdev->region[i].type;
+ cap_type.subtype = vdev->region[i].subtype;
+
+ ret = vfio_info_add_capability(&caps,
+ VFIO_REGION_INFO_CAP_TYPE,
+ &cap_type);
if (ret)
return ret;
+
+ }
}
if (caps.size) {
@@ -830,35 +819,24 @@ static long vfio_pci_ioctl(void *device_data,
} else if (cmd == VFIO_DEVICE_SET_IRQS) {
struct vfio_irq_set hdr;
u8 *data = NULL;
- int ret = 0;
+ int max, ret = 0;
+ size_t data_size = 0;
minsz = offsetofend(struct vfio_irq_set, count);
if (copy_from_user(&hdr, (void __user *)arg, minsz))
return -EFAULT;
- if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
- hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
- VFIO_IRQ_SET_ACTION_TYPE_MASK))
- return -EINVAL;
-
- if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
- size_t size;
- int max = vfio_pci_get_irq_count(vdev, hdr.index);
+ max = vfio_pci_get_irq_count(vdev, hdr.index);
- if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
- size = sizeof(uint8_t);
- else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
- size = sizeof(int32_t);
- else
- return -EINVAL;
-
- if (hdr.argsz - minsz < hdr.count * size ||
- hdr.start >= max || hdr.start + hdr.count > max)
- return -EINVAL;
+ ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
+ VFIO_PCI_NUM_IRQS, &data_size);
+ if (ret)
+ return ret;
+ if (data_size) {
data = memdup_user((void __user *)(arg + minsz),
- hdr.count * size);
+ data_size);
if (IS_ERR(data))
return PTR_ERR(data);
}
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 871af74fc4ce..330a57024cbc 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -150,7 +150,7 @@ static int vfio_user_config_read(struct pci_dev *pdev, int offset,
*val = cpu_to_le32(tmp_val);
- return pcibios_err_to_errno(ret);
+ return ret;
}
static int vfio_user_config_write(struct pci_dev *pdev, int offset,
@@ -171,7 +171,7 @@ static int vfio_user_config_write(struct pci_dev *pdev, int offset,
break;
}
- return pcibios_err_to_errno(ret);
+ return ret;
}
static int vfio_default_config_read(struct vfio_pci_device *vdev, int pos,
@@ -255,7 +255,7 @@ static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos,
ret = vfio_user_config_read(vdev->pdev, pos, val, count);
if (ret)
- return pcibios_err_to_errno(ret);
+ return ret;
if (pos >= PCI_CFG_SPACE_SIZE) { /* Extended cap header mangling */
if (offset < 4)
@@ -293,7 +293,7 @@ static int vfio_raw_config_read(struct vfio_pci_device *vdev, int pos,
ret = vfio_user_config_read(vdev->pdev, pos, val, count);
if (ret)
- return pcibios_err_to_errno(ret);
+ return ret;
return count;
}
@@ -1087,7 +1087,7 @@ static int vfio_msi_config_write(struct vfio_pci_device *vdev, int pos,
start + PCI_MSI_FLAGS,
flags);
if (ret)
- return pcibios_err_to_errno(ret);
+ return ret;
}
return count;
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index c2e60893cd09..1c46045b0e7f 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -256,7 +256,7 @@ static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix)
if (!is_irq_none(vdev))
return -EINVAL;
- vdev->ctx = kzalloc(nvec * sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL);
+ vdev->ctx = kcalloc(nvec, sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL);
if (!vdev->ctx)
return -ENOMEM;
diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c
index d78142830754..4c27f4be3c3d 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -364,36 +364,21 @@ static long vfio_platform_ioctl(void *device_data,
struct vfio_irq_set hdr;
u8 *data = NULL;
int ret = 0;
+ size_t data_size = 0;
minsz = offsetofend(struct vfio_irq_set, count);
if (copy_from_user(&hdr, (void __user *)arg, minsz))
return -EFAULT;
- if (hdr.argsz < minsz)
- return -EINVAL;
-
- if (hdr.index >= vdev->num_irqs)
- return -EINVAL;
-
- if (hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
- VFIO_IRQ_SET_ACTION_TYPE_MASK))
- return -EINVAL;
-
- if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
- size_t size;
-
- if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
- size = sizeof(uint8_t);
- else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
- size = sizeof(int32_t);
- else
- return -EINVAL;
-
- if (hdr.argsz - minsz < size)
- return -EINVAL;
+ ret = vfio_set_irqs_validate_and_prepare(&hdr, vdev->num_irqs,
+ vdev->num_irqs, &data_size);
+ if (ret)
+ return ret;
- data = memdup_user((void __user *)(arg + minsz), size);
+ if (data_size) {
+ data = memdup_user((void __user *)(arg + minsz),
+ data_size);
if (IS_ERR(data))
return PTR_ERR(data);
}
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index d1d70e0b011b..9901c4671e2f 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -86,6 +86,8 @@ struct vfio_group {
struct mutex unbound_lock;
atomic_t opened;
bool noiommu;
+ struct kvm *kvm;
+ struct blocking_notifier_head notifier;
};
struct vfio_device {
@@ -339,6 +341,7 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
#ifdef CONFIG_VFIO_NOIOMMU
group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
#endif
+ BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
group->nb.notifier_call = vfio_iommu_group_notifier;
@@ -480,6 +483,21 @@ static struct vfio_group *vfio_group_get_from_minor(int minor)
return group;
}
+/*
+ * Resolve the vfio_group that corresponds to @dev's IOMMU group.
+ * Returns whatever vfio_group_get_from_iommu() yields, or NULL when
+ * iommu_group_get() finds no IOMMU group for @dev.
+ */
+static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
+{
+	struct iommu_group *grp = iommu_group_get(dev);
+	struct vfio_group *vgroup = NULL;
+
+	if (grp) {
+		vgroup = vfio_group_get_from_iommu(grp);
+		/* Balance the iommu_group_get() above */
+		iommu_group_put(grp);
+	}
+
+	return vgroup;
+}
+
/**
* Device objects - create, release, get, put, search
*/
@@ -811,16 +829,10 @@ EXPORT_SYMBOL_GPL(vfio_add_group_dev);
*/
struct vfio_device *vfio_device_get_from_dev(struct device *dev)
{
- struct iommu_group *iommu_group;
struct vfio_group *group;
struct vfio_device *device;
- iommu_group = iommu_group_get(dev);
- if (!iommu_group)
- return NULL;
-
- group = vfio_group_get_from_iommu(iommu_group);
- iommu_group_put(iommu_group);
+ group = vfio_group_get_from_dev(dev);
if (!group)
return NULL;
@@ -1376,6 +1388,23 @@ static bool vfio_group_viable(struct vfio_group *group)
group, vfio_dev_viable) == 0);
}
+/*
+ * Take a container_users count on @group on behalf of an in-kernel user.
+ *
+ * Returns 0 with the count held, -EINVAL if the count was already zero, the
+ * container has no IOMMU driver or the group is not viable, and -EPERM for
+ * a no-IOMMU group.  On every failure after the increment the count is
+ * dropped again.
+ */
+static int vfio_group_add_container_user(struct vfio_group *group)
+{
+	int err = 0;
+
+	if (!atomic_inc_not_zero(&group->container_users))
+		return -EINVAL;
+
+	if (group->noiommu)
+		err = -EPERM;
+	else if (!group->container->iommu_driver || !vfio_group_viable(group))
+		err = -EINVAL;
+
+	if (err)
+		atomic_dec(&group->container_users);
+
+	return err;
+}
+
static const struct file_operations vfio_device_fops;
static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
@@ -1555,6 +1584,9 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep)
filep->private_data = NULL;
+ /* Any user didn't unregister? */
+ WARN_ON(group->notifier.head);
+
vfio_group_try_dissolve_container(group);
atomic_dec(&group->opened);
@@ -1685,23 +1717,14 @@ static const struct file_operations vfio_device_fops = {
struct vfio_group *vfio_group_get_external_user(struct file *filep)
{
struct vfio_group *group = filep->private_data;
+ int ret;
if (filep->f_op != &vfio_group_fops)
return ERR_PTR(-EINVAL);
- if (!atomic_inc_not_zero(&group->container_users))
- return ERR_PTR(-EINVAL);
-
- if (group->noiommu) {
- atomic_dec(&group->container_users);
- return ERR_PTR(-EPERM);
- }
-
- if (!group->container->iommu_driver ||
- !vfio_group_viable(group)) {
- atomic_dec(&group->container_users);
- return ERR_PTR(-EINVAL);
- }
+ ret = vfio_group_add_container_user(group);
+ if (ret)
+ return ERR_PTR(ret);
vfio_group_get(group);
@@ -1763,7 +1786,7 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
header->version = version;
/* Add to the end of the capability chain */
- for (tmp = caps->buf; tmp->next; tmp = (void *)tmp + tmp->next)
+ for (tmp = buf; tmp->next; tmp = buf + tmp->next)
; /* nothing */
tmp->next = caps->size;
@@ -1776,11 +1799,403 @@ EXPORT_SYMBOL_GPL(vfio_info_cap_add);
void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
struct vfio_info_cap_header *tmp;
+ void *buf = (void *)caps->buf;
- for (tmp = caps->buf; tmp->next; tmp = (void *)tmp + tmp->next - offset)
+ for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
tmp->next += offset;
}
-EXPORT_SYMBOL_GPL(vfio_info_cap_shift);
+EXPORT_SYMBOL(vfio_info_cap_shift);
+
+/*
+ * Append a VFIO_REGION_INFO_CAP_SPARSE_MMAP capability to @caps.
+ * @cap_type points to a struct vfio_region_info_cap_sparse_mmap whose
+ * nr_areas and areas[] are copied into the newly added capability.
+ * Returns 0, or the error encoded in vfio_info_cap_add()'s result.
+ */
+static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type)
+{
+	struct vfio_region_info_cap_sparse_mmap *src = cap_type;
+	struct vfio_region_info_cap_sparse_mmap *dst;
+	struct vfio_info_cap_header *hdr;
+	size_t areas_len = src->nr_areas * sizeof(*src->areas);
+
+	hdr = vfio_info_cap_add(caps, sizeof(*src) + areas_len,
+				VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
+	if (IS_ERR(hdr))
+		return PTR_ERR(hdr);
+
+	dst = container_of(hdr, struct vfio_region_info_cap_sparse_mmap,
+			   header);
+	dst->nr_areas = src->nr_areas;
+	memcpy(dst->areas, src->areas, areas_len);
+
+	return 0;
+}
+
+/*
+ * Append a VFIO_REGION_INFO_CAP_TYPE capability to @caps, copying the type
+ * and subtype from the struct vfio_region_info_cap_type at @cap_type.
+ * Returns 0, or the error encoded in vfio_info_cap_add()'s result.
+ */
+static int region_type_cap(struct vfio_info_cap *caps, void *cap_type)
+{
+	struct vfio_region_info_cap_type *src = cap_type;
+	struct vfio_region_info_cap_type *dst;
+	struct vfio_info_cap_header *hdr;
+
+	hdr = vfio_info_cap_add(caps, sizeof(*src),
+				VFIO_REGION_INFO_CAP_TYPE, 1);
+	if (IS_ERR(hdr))
+		return PTR_ERR(hdr);
+
+	dst = container_of(hdr, struct vfio_region_info_cap_type, header);
+	dst->subtype = src->subtype;
+	dst->type = src->type;
+
+	return 0;
+}
+
+/*
+ * Add the capability identified by @cap_type_id to @caps, using @cap_type
+ * as the template.  A NULL @cap_type is treated as "nothing to add" and
+ * returns 0; an unrecognised @cap_type_id returns -EINVAL.
+ */
+int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
+			     void *cap_type)
+{
+	if (!cap_type)
+		return 0;
+
+	switch (cap_type_id) {
+	case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
+		return sparse_mmap_cap(caps, cap_type);
+	case VFIO_REGION_INFO_CAP_TYPE:
+		return region_type_cap(caps, cap_type);
+	default:
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL(vfio_info_add_capability);
+
+/*
+ * Validate a VFIO_DEVICE_SET_IRQS header and compute the payload size.
+ * @hdr:          header already copied from the caller
+ * @num_irqs:     upper bound for hdr->start + hdr->count
+ * @max_irq_type: upper bound (exclusive) for hdr->index
+ * @data_size:    if non-NULL, receives the number of payload bytes that
+ *                follow the header (0 for VFIO_IRQ_SET_DATA_NONE)
+ *
+ * Returns 0 on success and -EINVAL for any inconsistency, including a
+ * data-carrying request when @data_size is NULL.
+ */
+int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
+				       int max_irq_type, size_t *data_size)
+{
+	const unsigned long minsz = offsetofend(struct vfio_irq_set, count);
+	size_t elem;
+
+	if (hdr->argsz < minsz)
+		return -EINVAL;
+	if (hdr->index >= max_irq_type)
+		return -EINVAL;
+	/* Reject ranges whose start + count would wrap */
+	if (hdr->count >= (U32_MAX - hdr->start))
+		return -EINVAL;
+	if (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
+			   VFIO_IRQ_SET_ACTION_TYPE_MASK))
+		return -EINVAL;
+
+	if (data_size)
+		*data_size = 0;
+
+	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
+		return -EINVAL;
+
+	/* Exactly one data type bit must be set */
+	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
+	case VFIO_IRQ_SET_DATA_NONE:
+		return 0;
+	case VFIO_IRQ_SET_DATA_BOOL:
+		elem = sizeof(uint8_t);
+		break;
+	case VFIO_IRQ_SET_DATA_EVENTFD:
+		elem = sizeof(int32_t);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (hdr->argsz - minsz < hdr->count * elem)
+		return -EINVAL;
+
+	if (!data_size)
+		return -EINVAL;
+
+	*data_size = hdr->count * elem;
+
+	return 0;
+}
+EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
+
+/*
+ * Pin a set of guest PFNs and return their associated host PFNs for local
+ * domain only.
+ * @dev [in]     : device
+ * @user_pfn [in]: array of user/guest PFNs to be pinned.
+ * @npage [in]   : count of elements in user_pfn array.  This count should not
+ *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
+ * @prot [in]    : protection flags
+ * @phys_pfn[out]: array of host PFNs
+ * Return error or number of pages pinned.
+ */
+int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
+		   int prot, unsigned long *phys_pfn)
+{
+	struct vfio_iommu_driver *drv;
+	struct vfio_container *ctr;
+	struct vfio_group *grp;
+	int rc;
+
+	if (!dev || !user_pfn || !phys_pfn || !npage)
+		return -EINVAL;
+
+	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
+		return -E2BIG;
+
+	grp = vfio_group_get_from_dev(dev);
+	if (!grp)
+		return -ENODEV;
+
+	rc = vfio_group_add_container_user(grp);
+	if (!rc) {
+		ctr = grp->container;
+		down_read(&ctr->group_lock);
+
+		drv = ctr->iommu_driver;
+		if (unlikely(!drv || !drv->ops->pin_pages))
+			rc = -ENOTTY;
+		else
+			rc = drv->ops->pin_pages(ctr->iommu_data, user_pfn,
+						 npage, prot, phys_pfn);
+
+		up_read(&ctr->group_lock);
+		/* Pairs with vfio_group_add_container_user() above */
+		vfio_group_try_dissolve_container(grp);
+	}
+
+	vfio_group_put(grp);
+	return rc;
+}
+EXPORT_SYMBOL(vfio_pin_pages);
+
+/*
+ * Unpin set of host PFNs for local domain only.
+ * @dev [in]     : device
+ * @user_pfn [in]: array of user/guest PFNs to be unpinned.
+ * @npage [in]   : count of elements in user_pfn array.  This count should not
+ *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
+ * Return error or number of pages unpinned.
+ */
+int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
+{
+	struct vfio_iommu_driver *drv;
+	struct vfio_container *ctr;
+	struct vfio_group *grp;
+	int rc;
+
+	if (!dev || !user_pfn || !npage)
+		return -EINVAL;
+
+	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
+		return -E2BIG;
+
+	grp = vfio_group_get_from_dev(dev);
+	if (!grp)
+		return -ENODEV;
+
+	rc = vfio_group_add_container_user(grp);
+	if (!rc) {
+		ctr = grp->container;
+		down_read(&ctr->group_lock);
+
+		drv = ctr->iommu_driver;
+		if (unlikely(!drv || !drv->ops->unpin_pages))
+			rc = -ENOTTY;
+		else
+			rc = drv->ops->unpin_pages(ctr->iommu_data, user_pfn,
+						   npage);
+
+		up_read(&ctr->group_lock);
+		/* Pairs with vfio_group_add_container_user() above */
+		vfio_group_try_dissolve_container(grp);
+	}
+
+	vfio_group_put(grp);
+	return rc;
+}
+EXPORT_SYMBOL(vfio_unpin_pages);
+
+/*
+ * Forward a notifier registration to the IOMMU driver of @group's container.
+ * Returns -EINVAL if a container user count cannot be taken, -ENOTTY if the
+ * driver has no register_notifier op, otherwise the op's return value.
+ */
+static int vfio_register_iommu_notifier(struct vfio_group *group,
+					unsigned long *events,
+					struct notifier_block *nb)
+{
+	struct vfio_iommu_driver *drv;
+	struct vfio_container *ctr;
+	int rc;
+
+	if (vfio_group_add_container_user(group))
+		return -EINVAL;
+
+	ctr = group->container;
+	down_read(&ctr->group_lock);
+
+	drv = ctr->iommu_driver;
+	if (unlikely(!drv || !drv->ops->register_notifier))
+		rc = -ENOTTY;
+	else
+		rc = drv->ops->register_notifier(ctr->iommu_data, events, nb);
+
+	up_read(&ctr->group_lock);
+	vfio_group_try_dissolve_container(group);
+
+	return rc;
+}
+
+/*
+ * Forward a notifier removal to the IOMMU driver of @group's container.
+ * Returns -EINVAL if a container user count cannot be taken, -ENOTTY if the
+ * driver has no unregister_notifier op, otherwise the op's return value.
+ */
+static int vfio_unregister_iommu_notifier(struct vfio_group *group,
+					  struct notifier_block *nb)
+{
+	struct vfio_iommu_driver *drv;
+	struct vfio_container *ctr;
+	int rc;
+
+	if (vfio_group_add_container_user(group))
+		return -EINVAL;
+
+	ctr = group->container;
+	down_read(&ctr->group_lock);
+
+	drv = ctr->iommu_driver;
+	if (unlikely(!drv || !drv->ops->unregister_notifier))
+		rc = -ENOTTY;
+	else
+		rc = drv->ops->unregister_notifier(ctr->iommu_data, nb);
+
+	up_read(&ctr->group_lock);
+	vfio_group_try_dissolve_container(group);
+
+	return rc;
+}
+
+/*
+ * Record @kvm in @group and then tell everyone on the group notifier chain
+ * about it via VFIO_GROUP_NOTIFY_SET_KVM.
+ */
+void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
+{
+	struct blocking_notifier_head *chain = &group->notifier;
+
+	/* Store first so the value is in place when the callbacks run */
+	group->kvm = kvm;
+	blocking_notifier_call_chain(chain, VFIO_GROUP_NOTIFY_SET_KVM, kvm);
+}
+EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
+
+/*
+ * Add @nb to @group's notifier chain.
+ *
+ * Only VFIO_GROUP_NOTIFY_SET_KVM is understood; it is cleared from *@events
+ * and any bit left over makes the call fail with -EINVAL.  If SET_KVM was
+ * requested and the group already has a kvm pointer, the chain is invoked
+ * once right after a successful registration.
+ */
+static int vfio_register_group_notifier(struct vfio_group *group,
+					unsigned long *events,
+					struct notifier_block *nb)
+{
+	bool replay_kvm = !!(*events & VFIO_GROUP_NOTIFY_SET_KVM);
+	struct vfio_container *ctr;
+	int rc;
+
+	/* Strip the known event; whatever remains is unsupported */
+	*events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
+	if (*events)
+		return -EINVAL;
+
+	if (vfio_group_add_container_user(group))
+		return -EINVAL;
+
+	ctr = group->container;
+	down_read(&ctr->group_lock);
+
+	rc = blocking_notifier_chain_register(&group->notifier, nb);
+
+	/*
+	 * kvm may have been attached to the group before this registration,
+	 * so replay the event once for the benefit of the new listener.
+	 */
+	if (!rc && replay_kvm && group->kvm)
+		blocking_notifier_call_chain(&group->notifier,
+					     VFIO_GROUP_NOTIFY_SET_KVM,
+					     group->kvm);
+
+	up_read(&ctr->group_lock);
+	vfio_group_try_dissolve_container(group);
+
+	return rc;
+}
+
+/*
+ * Remove @nb from @group's notifier chain.  Returns -EINVAL if a container
+ * user count cannot be taken, otherwise the result of
+ * blocking_notifier_chain_unregister().
+ */
+static int vfio_unregister_group_notifier(struct vfio_group *group,
+					  struct notifier_block *nb)
+{
+	struct vfio_container *ctr;
+	int rc;
+
+	if (vfio_group_add_container_user(group))
+		return -EINVAL;
+
+	ctr = group->container;
+	down_read(&ctr->group_lock);
+	rc = blocking_notifier_chain_unregister(&group->notifier, nb);
+	up_read(&ctr->group_lock);
+
+	vfio_group_try_dissolve_container(group);
+
+	return rc;
+}
+
+/*
+ * Register @nb for the events in *@events on the notifier selected by @type
+ * (VFIO_IOMMU_NOTIFY or VFIO_GROUP_NOTIFY) for the group that @dev belongs
+ * to.  Returns -EINVAL for bad arguments or an unknown @type, -ENODEV when
+ * no group is found for @dev, otherwise the per-type helper's result.
+ */
+int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
+			   unsigned long *events, struct notifier_block *nb)
+{
+	struct vfio_group *grp;
+	int rc;
+
+	if (!dev || !nb || !events || (*events == 0))
+		return -EINVAL;
+
+	grp = vfio_group_get_from_dev(dev);
+	if (!grp)
+		return -ENODEV;
+
+	if (type == VFIO_IOMMU_NOTIFY)
+		rc = vfio_register_iommu_notifier(grp, events, nb);
+	else if (type == VFIO_GROUP_NOTIFY)
+		rc = vfio_register_group_notifier(grp, events, nb);
+	else
+		rc = -EINVAL;
+
+	vfio_group_put(grp);
+	return rc;
+}
+EXPORT_SYMBOL(vfio_register_notifier);
+
+/*
+ * Remove @nb from the notifier selected by @type (VFIO_IOMMU_NOTIFY or
+ * VFIO_GROUP_NOTIFY) for the group that @dev belongs to.  Returns -EINVAL
+ * for bad arguments or an unknown @type, -ENODEV when no group is found for
+ * @dev, otherwise the per-type helper's result.
+ */
+int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
+			     struct notifier_block *nb)
+{
+	struct vfio_group *grp;
+	int rc;
+
+	if (!dev || !nb)
+		return -EINVAL;
+
+	grp = vfio_group_get_from_dev(dev);
+	if (!grp)
+		return -ENODEV;
+
+	if (type == VFIO_IOMMU_NOTIFY)
+		rc = vfio_unregister_iommu_notifier(grp, nb);
+	else if (type == VFIO_GROUP_NOTIFY)
+		rc = vfio_unregister_group_notifier(grp, nb);
+	else
+		rc = -EINVAL;
+
+	vfio_group_put(grp);
+	return rc;
+}
+EXPORT_SYMBOL(vfio_unregister_notifier);
/**
* Module/class support
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 2ba19424e4a1..f3726ba12aa6 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -36,6 +36,9 @@
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/workqueue.h>
+#include <linux/pid_namespace.h>
+#include <linux/mdev.h>
+#include <linux/notifier.h>
#define DRIVER_VERSION "0.2"
#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
@@ -55,8 +58,10 @@ MODULE_PARM_DESC(disable_hugepages,
struct vfio_iommu {
struct list_head domain_list;
+ struct vfio_domain *external_domain; /* domain for external user */
struct mutex lock;
struct rb_root dma_list;
+ struct blocking_notifier_head notifier;
bool v2;
bool nesting;
};
@@ -75,6 +80,9 @@ struct vfio_dma {
unsigned long vaddr; /* Process virtual addr */
size_t size; /* Map size (bytes) */
int prot; /* IOMMU_READ/WRITE */
+ bool iommu_mapped;
+ struct task_struct *task;
+ struct rb_root pfn_list; /* Ex-user pinned pfn list */
};
struct vfio_group {
@@ -83,6 +91,21 @@ struct vfio_group {
};
/*
+ * Guest RAM pinning working set or DMA target
+ */
+struct vfio_pfn {
+ struct rb_node node; /* Link in the owning vfio_dma's pfn_list */
+ dma_addr_t iova; /* Device address */
+ unsigned long pfn; /* Host pfn */
+ atomic_t ref_count; /* Starts at 1; entry is unpinned and freed at 0 */
+};
+
+/*
+ * True when the container has at least one entry on iommu->domain_list.
+ * The argument is parenthesised so that any pointer expression expands
+ * correctly.
+ */
+#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
+					(!list_empty(&(iommu)->domain_list))
+
+static int put_pfn(unsigned long pfn, int prot);
+
+/*
* This code handles mapping and unmapping of user data buffers
* into DMA'ble space using the IOMMU
*/
@@ -130,6 +153,97 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
rb_erase(&old->node, &iommu->dma_list);
}
+/*
+ * Helper Functions for host iova-pfn list
+ */
+/*
+ * Search @dma's pfn_list rbtree for the entry whose iova equals @iova.
+ * Returns the entry, or NULL if there is none.  No reference is taken.
+ */
+static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
+{
+	struct rb_node *cur = dma->pfn_list.rb_node;
+
+	while (cur) {
+		struct vfio_pfn *entry = rb_entry(cur, struct vfio_pfn, node);
+
+		if (iova == entry->iova)
+			return entry;
+
+		cur = (iova < entry->iova) ? cur->rb_left : cur->rb_right;
+	}
+
+	return NULL;
+}
+
+/*
+ * Insert @new into @dma's pfn_list rbtree, ordered by iova.  An iova that
+ * is not smaller than an existing one descends to the right.
+ */
+static void vfio_link_pfn(struct vfio_dma *dma,
+			  struct vfio_pfn *new)
+{
+	struct rb_node **slot = &dma->pfn_list.rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*slot) {
+		struct vfio_pfn *entry;
+
+		parent = *slot;
+		entry = rb_entry(parent, struct vfio_pfn, node);
+
+		slot = (new->iova < entry->iova) ? &parent->rb_left
+						 : &parent->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, slot);
+	rb_insert_color(&new->node, &dma->pfn_list);
+}
+
+/* Detach @old from @dma's pfn_list rbtree; @old itself is not freed here. */
+static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
+{
+	struct rb_root *root = &dma->pfn_list;
+
+	rb_erase(&old->node, root);
+}
+
+/*
+ * Allocate a vfio_pfn for (@iova, @pfn) with an initial reference count of
+ * one and link it into @dma's pfn_list.  Returns 0, or -ENOMEM if the
+ * allocation fails.
+ */
+static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
+				unsigned long pfn)
+{
+	struct vfio_pfn *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+
+	if (!entry)
+		return -ENOMEM;
+
+	atomic_set(&entry->ref_count, 1);
+	entry->pfn = pfn;
+	entry->iova = iova;
+	vfio_link_pfn(dma, entry);
+
+	return 0;
+}
+
+/* Unlink @vpfn from @dma's pfn_list and free it. */
+static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
+				      struct vfio_pfn *vpfn)
+{
+	/* Must come off the tree before the memory goes away */
+	vfio_unlink_pfn(dma, vpfn);
+
+	kfree(vpfn);
+}
+
+/*
+ * Look up the pfn_list entry for @iova in @dma and, if one exists, take an
+ * additional reference on it.  Returns the entry or NULL.
+ *
+ * @iova is a dma_addr_t, matching vfio_find_vpfn() and the caller, so the
+ * address is not truncated where dma_addr_t is wider than unsigned long.
+ */
+static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
+					       dma_addr_t iova)
+{
+	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
+
+	if (vpfn)
+		atomic_inc(&vpfn->ref_count);
+	return vpfn;
+}
+
+/*
+ * Drop one reference on @vpfn.  When it was the last one, release the page
+ * with put_pfn() and remove the entry from @dma's pfn_list.
+ * Returns put_pfn()'s result in that case, 0 otherwise.
+ */
+static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
+{
+	int unpinned;
+
+	if (!atomic_dec_and_test(&vpfn->ref_count))
+		return 0;
+
+	unpinned = put_pfn(vpfn->pfn, dma->prot);
+	vfio_remove_from_pfn_list(dma, vpfn);
+
+	return unpinned;
+}
+
struct vwork {
struct mm_struct *mm;
long npage;
@@ -150,17 +264,22 @@ static void vfio_lock_acct_bg(struct work_struct *work)
kfree(vwork);
}
-static void vfio_lock_acct(long npage)
+static void vfio_lock_acct(struct task_struct *task, long npage)
{
struct vwork *vwork;
struct mm_struct *mm;
- if (!current->mm || !npage)
+ if (!npage)
+ return;
+
+ mm = get_task_mm(task);
+ if (!mm)
return; /* process exited or nothing to do */
- if (down_write_trylock(&current->mm->mmap_sem)) {
- current->mm->locked_vm += npage;
- up_write(&current->mm->mmap_sem);
+ if (down_write_trylock(&mm->mmap_sem)) {
+ mm->locked_vm += npage;
+ up_write(&mm->mmap_sem);
+ mmput(mm);
return;
}
@@ -170,11 +289,8 @@ static void vfio_lock_acct(long npage)
* wouldn't need this silliness
*/
vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
- if (!vwork)
- return;
- mm = get_task_mm(current);
- if (!mm) {
- kfree(vwork);
+ if (!vwork) {
+ mmput(mm);
return;
}
INIT_WORK(&vwork->work, vfio_lock_acct_bg);
@@ -228,20 +344,36 @@ static int put_pfn(unsigned long pfn, int prot)
return 0;
}
-static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
+static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
+ int prot, unsigned long *pfn)
{
struct page *page[1];
struct vm_area_struct *vma;
- int ret = -EFAULT;
+ int ret;
+
+ if (mm == current->mm) {
+ ret = get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE),
+ page);
+ } else {
+ unsigned int flags = 0;
+
+ if (prot & IOMMU_WRITE)
+ flags |= FOLL_WRITE;
+
+ down_read(&mm->mmap_sem);
+ ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page,
+ NULL, NULL);
+ up_read(&mm->mmap_sem);
+ }
- if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
+ if (ret == 1) {
*pfn = page_to_pfn(page[0]);
return 0;
}
- down_read(&current->mm->mmap_sem);
+ down_read(&mm->mmap_sem);
- vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
+ vma = find_vma_intersection(mm, vaddr, vaddr + 1);
if (vma && vma->vm_flags & VM_PFNMAP) {
*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -249,8 +381,7 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
ret = 0;
}
- up_read(&current->mm->mmap_sem);
-
+ up_read(&mm->mmap_sem);
return ret;
}
@@ -259,88 +390,299 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
* the iommu can only map chunks of consecutive pfns anyway, so get the
* first page and all consecutive pages with the same locking.
*/
-static long vfio_pin_pages(unsigned long vaddr, long npage,
- int prot, unsigned long *pfn_base)
+static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
+ long npage, unsigned long *pfn_base)
{
- unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- bool lock_cap = capable(CAP_IPC_LOCK);
- long ret, i;
+ unsigned long limit;
+ bool lock_cap = ns_capable(task_active_pid_ns(dma->task)->user_ns,
+ CAP_IPC_LOCK);
+ struct mm_struct *mm;
+ long ret, i = 0, lock_acct = 0;
bool rsvd;
+ dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
- if (!current->mm)
+ mm = get_task_mm(dma->task);
+ if (!mm)
return -ENODEV;
- ret = vaddr_get_pfn(vaddr, prot, pfn_base);
+ ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
if (ret)
- return ret;
+ goto pin_pg_remote_exit;
rsvd = is_invalid_reserved_pfn(*pfn_base);
+ limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) {
- put_pfn(*pfn_base, prot);
- pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
- limit << PAGE_SHIFT);
- return -ENOMEM;
+ /*
+ * Reserved pages aren't counted against the user, externally pinned
+ * pages are already counted against the user.
+ */
+ if (!rsvd && !vfio_find_vpfn(dma, iova)) {
+ if (!lock_cap && mm->locked_vm + 1 > limit) {
+ put_pfn(*pfn_base, dma->prot);
+ pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
+ limit << PAGE_SHIFT);
+ ret = -ENOMEM;
+ goto pin_pg_remote_exit;
+ }
+ lock_acct++;
}
- if (unlikely(disable_hugepages)) {
- if (!rsvd)
- vfio_lock_acct(1);
- return 1;
- }
+ i++;
+ if (likely(!disable_hugepages)) {
+ /* Lock all the consecutive pages from pfn_base */
+ for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; i < npage;
+ i++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
+ unsigned long pfn = 0;
- /* Lock all the consecutive pages from pfn_base */
- for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
- unsigned long pfn = 0;
+ ret = vaddr_get_pfn(mm, vaddr, dma->prot, &pfn);
+ if (ret)
+ break;
- ret = vaddr_get_pfn(vaddr, prot, &pfn);
- if (ret)
- break;
+ if (pfn != *pfn_base + i ||
+ rsvd != is_invalid_reserved_pfn(pfn)) {
+ put_pfn(pfn, dma->prot);
+ break;
+ }
- if (pfn != *pfn_base + i ||
- rsvd != is_invalid_reserved_pfn(pfn)) {
- put_pfn(pfn, prot);
- break;
+ if (!rsvd && !vfio_find_vpfn(dma, iova)) {
+ if (!lock_cap &&
+ mm->locked_vm + lock_acct + 1 > limit) {
+ put_pfn(pfn, dma->prot);
+ pr_warn("%s: RLIMIT_MEMLOCK (%ld) "
+ "exceeded\n", __func__,
+ limit << PAGE_SHIFT);
+ break;
+ }
+ lock_acct++;
+ }
}
+ }
- if (!rsvd && !lock_cap &&
- current->mm->locked_vm + i + 1 > limit) {
- put_pfn(pfn, prot);
- pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
- __func__, limit << PAGE_SHIFT);
- break;
+ vfio_lock_acct(dma->task, lock_acct);
+ ret = i;
+
+pin_pg_remote_exit:
+ mmput(mm);
+ return ret;
+}
+
+static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
+ unsigned long pfn, long npage,
+ bool do_accounting)
+{
+ long unlocked = 0, locked = 0;
+ long i;
+
+ for (i = 0; i < npage; i++) {
+ if (put_pfn(pfn++, dma->prot)) {
+ unlocked++;
+ if (vfio_find_vpfn(dma, iova + (i << PAGE_SHIFT)))
+ locked++;
}
}
- if (!rsvd)
- vfio_lock_acct(i);
+ if (do_accounting)
+ vfio_lock_acct(dma->task, locked - unlocked);
- return i;
+ return unlocked;
}
-static long vfio_unpin_pages(unsigned long pfn, long npage,
- int prot, bool do_accounting)
+static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
+ unsigned long *pfn_base, bool do_accounting)
{
- unsigned long unlocked = 0;
- long i;
+ unsigned long limit;
+ bool lock_cap = ns_capable(task_active_pid_ns(dma->task)->user_ns,
+ CAP_IPC_LOCK);
+ struct mm_struct *mm;
+ int ret;
+ bool rsvd;
- for (i = 0; i < npage; i++)
- unlocked += put_pfn(pfn++, prot);
+ mm = get_task_mm(dma->task);
+ if (!mm)
+ return -ENODEV;
+
+ ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
+ if (ret)
+ goto pin_page_exit;
+
+ rsvd = is_invalid_reserved_pfn(*pfn_base);
+ limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
+ put_pfn(*pfn_base, dma->prot);
+ pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
+ __func__, dma->task->comm, task_pid_nr(dma->task),
+ limit << PAGE_SHIFT);
+ ret = -ENOMEM;
+ goto pin_page_exit;
+ }
+
+ if (!rsvd && do_accounting)
+ vfio_lock_acct(dma->task, 1);
+ ret = 1;
+
+pin_page_exit:
+ mmput(mm);
+ return ret;
+}
+
+static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
+ bool do_accounting)
+{
+ int unlocked;
+ struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
+
+ if (!vpfn)
+ return 0;
+
+ unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
if (do_accounting)
- vfio_lock_acct(-unlocked);
+ vfio_lock_acct(dma->task, -unlocked);
return unlocked;
}
-static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
+/*
+ * Pin @npage guest pages on behalf of an external (mdev) user.
+ * @iommu_data: the struct vfio_iommu of the container
+ * @user_pfn:   array of guest PFNs to pin
+ * @npage:      number of entries in @user_pfn / @phys_pfn
+ * @prot:       requested IOMMU_READ/IOMMU_WRITE access; must be a subset of
+ *              the mapping's own protection
+ * @phys_pfn:   filled with the host PFN for each entry
+ *
+ * Each pinned page is tracked in the owning vfio_dma's pfn_list; a page
+ * that is already tracked only gains a reference.  On failure every page
+ * handled so far is released again and its @phys_pfn slot is zeroed.
+ *
+ * Returns the number of pages pinned or a negative errno.
+ */
+static int vfio_iommu_type1_pin_pages(void *iommu_data,
+				      unsigned long *user_pfn,
+				      int npage, int prot,
+				      unsigned long *phys_pfn)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	int i, j, ret;
+	unsigned long remote_vaddr;
+	struct vfio_dma *dma;
+	bool do_accounting;
+
+	if (!iommu || !user_pfn || !phys_pfn)
+		return -EINVAL;
+
+	/* Supported for v2 version only */
+	if (!iommu->v2)
+		return -EACCES;
+
+	mutex_lock(&iommu->lock);
+
+	/* Fail if there is no external domain or the notifier list is empty */
+	if ((!iommu->external_domain) || (!iommu->notifier.head)) {
+		ret = -EINVAL;
+		goto pin_done;
+	}
+
+	/*
+	 * If iommu capable domain exist in the container then all pages are
+	 * already pinned and accounted. Accounting should be done if there is
+	 * no iommu capable domain in the container.
+	 */
+	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
+
+	for (i = 0; i < npage; i++) {
+		dma_addr_t iova;
+		struct vfio_pfn *vpfn;
+
+		iova = user_pfn[i] << PAGE_SHIFT;
+		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
+		if (!dma) {
+			ret = -EINVAL;
+			goto pin_unwind;
+		}
+
+		if ((dma->prot & prot) != prot) {
+			ret = -EPERM;
+			goto pin_unwind;
+		}
+
+		/* Already pinned by an external user: just take a reference */
+		vpfn = vfio_iova_get_vfio_pfn(dma, iova);
+		if (vpfn) {
+			phys_pfn[i] = vpfn->pfn;
+			continue;
+		}
+
+		remote_vaddr = dma->vaddr + iova - dma->iova;
+		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
+					     do_accounting);
+		if (ret <= 0) {
+			WARN_ON(!ret);
+			goto pin_unwind;
+		}
+
+		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
+		if (ret) {
+			/*
+			 * The page is pinned but never made it onto the
+			 * pfn_list, so vfio_unpin_page_external() would not
+			 * find it.  Release it and undo the accounting
+			 * directly, otherwise the pin is leaked.
+			 */
+			if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
+				vfio_lock_acct(dma->task, -1);
+			goto pin_unwind;
+		}
+	}
+
+	ret = i;
+	goto pin_done;
+
+pin_unwind:
+	phys_pfn[i] = 0;
+	/* Entries [0, i) are all on a pfn_list; drop them again */
+	for (j = 0; j < i; j++) {
+		dma_addr_t iova;
+
+		iova = user_pfn[j] << PAGE_SHIFT;
+		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
+		vfio_unpin_page_external(dma, iova, do_accounting);
+		phys_pfn[j] = 0;
+	}
+pin_done:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
+/*
+ * Release pages previously pinned through vfio_iommu_type1_pin_pages().
+ * @iommu_data: the struct vfio_iommu of the container
+ * @user_pfn:   array of guest PFNs to unpin
+ * @npage:      number of entries in @user_pfn
+ *
+ * Processing stops at the first PFN that has no vfio_dma mapping.
+ * Returns the number of entries processed, -EINVAL if none were (or the
+ * container has no external domain), or -EACCES for a non-v2 container.
+ */
+static int vfio_iommu_type1_unpin_pages(void *iommu_data,
+					unsigned long *user_pfn,
+					int npage)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	bool account;
+	int done = 0;
+
+	if (!iommu || !user_pfn)
+		return -EINVAL;
+
+	/* Supported for v2 version only */
+	if (!iommu->v2)
+		return -EACCES;
+
+	mutex_lock(&iommu->lock);
+
+	if (!iommu->external_domain) {
+		mutex_unlock(&iommu->lock);
+		return -EINVAL;
+	}
+
+	account = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
+	while (done < npage) {
+		dma_addr_t iova = user_pfn[done] << PAGE_SHIFT;
+		struct vfio_dma *dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
+
+		if (!dma)
+			break;
+
+		vfio_unpin_page_external(dma, iova, account);
+		done++;
+	}
+
+	mutex_unlock(&iommu->lock);
+
+	if (done > npage)
+		return npage;
+	return done > 0 ? done : -EINVAL;
+}
+
+static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
+ bool do_accounting)
{
dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
struct vfio_domain *domain, *d;
long unlocked = 0;
if (!dma->size)
- return;
+ return 0;
+
+ if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
+ return 0;
+
/*
* We use the IOMMU to track the physical addresses, otherwise we'd
* need a much more complicated tracking system. Unfortunately that
@@ -382,21 +724,28 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
if (WARN_ON(!unmapped))
break;
- unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
- unmapped >> PAGE_SHIFT,
- dma->prot, false);
+ unlocked += vfio_unpin_pages_remote(dma, iova,
+ phys >> PAGE_SHIFT,
+ unmapped >> PAGE_SHIFT,
+ false);
iova += unmapped;
cond_resched();
}
- vfio_lock_acct(-unlocked);
+ dma->iommu_mapped = false;
+ if (do_accounting) {
+ vfio_lock_acct(dma->task, -unlocked);
+ return 0;
+ }
+ return unlocked;
}
static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
{
- vfio_unmap_unpin(iommu, dma);
+ vfio_unmap_unpin(iommu, dma, true);
vfio_unlink_dma(iommu, dma);
+ put_task_struct(dma->task);
kfree(dma);
}
@@ -430,9 +779,9 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
struct vfio_iommu_type1_dma_unmap *unmap)
{
uint64_t mask;
- struct vfio_dma *dma;
+ struct vfio_dma *dma, *dma_last = NULL;
size_t unmapped = 0;
- int ret = 0;
+ int ret = 0, retries = 0;
mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
@@ -442,7 +791,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
return -EINVAL;
WARN_ON(mask & PAGE_MASK);
-
+again:
mutex_lock(&iommu->lock);
/*
@@ -477,7 +826,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
* mappings within the range.
*/
if (iommu->v2) {
- dma = vfio_find_dma(iommu, unmap->iova, 0);
+ dma = vfio_find_dma(iommu, unmap->iova, 1);
if (dma && dma->iova != unmap->iova) {
ret = -EINVAL;
goto unlock;
@@ -492,6 +841,38 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
if (!iommu->v2 && unmap->iova > dma->iova)
break;
+ /*
+ * Task with same address space who mapped this iova range is
+ * allowed to unmap the iova range.
+ */
+ if (dma->task->mm != current->mm)
+ break;
+
+ if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
+ struct vfio_iommu_type1_dma_unmap nb_unmap;
+
+ if (dma_last == dma) {
+ BUG_ON(++retries > 10);
+ } else {
+ dma_last = dma;
+ retries = 0;
+ }
+
+ nb_unmap.iova = dma->iova;
+ nb_unmap.size = dma->size;
+
+ /*
+ * Notify anyone (mdev vendor drivers) to invalidate and
+ * unmap iovas within the range we're about to unmap.
+ * Vendor drivers MUST unpin pages in response to an
+ * invalidation.
+ */
+ mutex_unlock(&iommu->lock);
+ blocking_notifier_call_chain(&iommu->notifier,
+ VFIO_IOMMU_NOTIFY_DMA_UNMAP,
+ &nb_unmap);
+ goto again;
+ }
unmapped += dma->size;
vfio_remove_dma(iommu, dma);
}
@@ -558,17 +939,56 @@ unwind:
return ret;
}
+static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
+ size_t map_size)
+{
+ dma_addr_t iova = dma->iova;
+ unsigned long vaddr = dma->vaddr;
+ size_t size = map_size;
+ long npage;
+ unsigned long pfn;
+ int ret = 0;
+
+ while (size) {
+ /* Pin a contiguous chunk of memory */
+ npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
+ size >> PAGE_SHIFT, &pfn);
+ if (npage <= 0) {
+ WARN_ON(!npage);
+ ret = (int)npage;
+ break;
+ }
+
+ /* Map it! */
+ ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
+ dma->prot);
+ if (ret) {
+ vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
+ npage, true);
+ break;
+ }
+
+ size -= npage << PAGE_SHIFT;
+ dma->size += npage << PAGE_SHIFT;
+ }
+
+ dma->iommu_mapped = true;
+
+ if (ret)
+ vfio_remove_dma(iommu, dma);
+
+ return ret;
+}
+
static int vfio_dma_do_map(struct vfio_iommu *iommu,
struct vfio_iommu_type1_dma_map *map)
{
dma_addr_t iova = map->iova;
unsigned long vaddr = map->vaddr;
size_t size = map->size;
- long npage;
int ret = 0, prot = 0;
uint64_t mask;
struct vfio_dma *dma;
- unsigned long pfn;
/* Verify that none of our __u64 fields overflow */
if (map->size != size || map->vaddr != vaddr || map->iova != iova)
@@ -594,47 +1014,33 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
mutex_lock(&iommu->lock);
if (vfio_find_dma(iommu, iova, size)) {
- mutex_unlock(&iommu->lock);
- return -EEXIST;
+ ret = -EEXIST;
+ goto out_unlock;
}
dma = kzalloc(sizeof(*dma), GFP_KERNEL);
if (!dma) {
- mutex_unlock(&iommu->lock);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out_unlock;
}
dma->iova = iova;
dma->vaddr = vaddr;
dma->prot = prot;
+ get_task_struct(current);
+ dma->task = current;
+ dma->pfn_list = RB_ROOT;
/* Insert zero-sized and grow as we map chunks of it */
vfio_link_dma(iommu, dma);
- while (size) {
- /* Pin a contiguous chunk of memory */
- npage = vfio_pin_pages(vaddr + dma->size,
- size >> PAGE_SHIFT, prot, &pfn);
- if (npage <= 0) {
- WARN_ON(!npage);
- ret = (int)npage;
- break;
- }
-
- /* Map it! */
- ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
- if (ret) {
- vfio_unpin_pages(pfn, npage, prot, true);
- break;
- }
-
- size -= npage << PAGE_SHIFT;
- dma->size += npage << PAGE_SHIFT;
- }
-
- if (ret)
- vfio_remove_dma(iommu, dma);
+ /* Don't pin and map if container doesn't contain an IOMMU capable domain */
+ if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
+ dma->size = size;
+ else
+ ret = vfio_pin_map_dma(iommu, dma, size);
+out_unlock:
mutex_unlock(&iommu->lock);
return ret;
}
@@ -662,10 +1068,6 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
n = rb_first(&iommu->dma_list);
- /* If there's not a domain, there better not be any mappings */
- if (WARN_ON(n && !d))
- return -EINVAL;
-
for (; n; n = rb_next(n)) {
struct vfio_dma *dma;
dma_addr_t iova;
@@ -674,21 +1076,49 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
iova = dma->iova;
while (iova < dma->iova + dma->size) {
- phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
+ phys_addr_t phys;
size_t size;
- if (WARN_ON(!phys)) {
- iova += PAGE_SIZE;
- continue;
+ if (dma->iommu_mapped) {
+ phys_addr_t p;
+ dma_addr_t i;
+
+ phys = iommu_iova_to_phys(d->domain, iova);
+
+ if (WARN_ON(!phys)) {
+ iova += PAGE_SIZE;
+ continue;
+ }
+
+ size = PAGE_SIZE;
+ p = phys + size;
+ i = iova + size;
+ while (i < dma->iova + dma->size &&
+ p == iommu_iova_to_phys(d->domain, i)) {
+ size += PAGE_SIZE;
+ p += PAGE_SIZE;
+ i += PAGE_SIZE;
+ }
+ } else {
+ unsigned long pfn;
+ unsigned long vaddr = dma->vaddr +
+ (iova - dma->iova);
+ size_t n = dma->iova + dma->size - iova;
+ long npage;
+
+ npage = vfio_pin_pages_remote(dma, vaddr,
+ n >> PAGE_SHIFT,
+ &pfn);
+ if (npage <= 0) {
+ WARN_ON(!npage);
+ ret = (int)npage;
+ return ret;
+ }
+
+ phys = pfn << PAGE_SHIFT;
+ size = npage << PAGE_SHIFT;
}
- size = PAGE_SIZE;
-
- while (iova + size < dma->iova + dma->size &&
- phys + size == iommu_iova_to_phys(d->domain,
- iova + size))
- size += PAGE_SIZE;
-
ret = iommu_map(domain->domain, iova, phys,
size, dma->prot | domain->prot);
if (ret)
@@ -696,8 +1126,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
iova += size;
}
+ dma->iommu_mapped = true;
}
-
return 0;
}
@@ -734,22 +1164,39 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain)
__free_pages(pages, order);
}
+static struct vfio_group *find_iommu_group(struct vfio_domain *domain,
+ struct iommu_group *iommu_group)
+{
+ struct vfio_group *g;
+
+ list_for_each_entry(g, &domain->group_list, next) {
+ if (g->iommu_group == iommu_group)
+ return g;
+ }
+
+ return NULL;
+}
+
static int vfio_iommu_type1_attach_group(void *iommu_data,
struct iommu_group *iommu_group)
{
struct vfio_iommu *iommu = iommu_data;
- struct vfio_group *group, *g;
+ struct vfio_group *group;
struct vfio_domain *domain, *d;
- struct bus_type *bus = NULL;
+ struct bus_type *bus = NULL, *mdev_bus;
int ret;
mutex_lock(&iommu->lock);
list_for_each_entry(d, &iommu->domain_list, next) {
- list_for_each_entry(g, &d->group_list, next) {
- if (g->iommu_group != iommu_group)
- continue;
+ if (find_iommu_group(d, iommu_group)) {
+ mutex_unlock(&iommu->lock);
+ return -EINVAL;
+ }
+ }
+ if (iommu->external_domain) {
+ if (find_iommu_group(iommu->external_domain, iommu_group)) {
mutex_unlock(&iommu->lock);
return -EINVAL;
}
@@ -769,6 +1216,25 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
if (ret)
goto out_free;
+ mdev_bus = symbol_get(mdev_bus_type);
+
+ if (mdev_bus) {
+ if ((bus == mdev_bus) && !iommu_present(bus)) {
+ symbol_put(mdev_bus_type);
+ if (!iommu->external_domain) {
+ INIT_LIST_HEAD(&domain->group_list);
+ iommu->external_domain = domain;
+ } else
+ kfree(domain);
+
+ list_add(&group->next,
+ &iommu->external_domain->group_list);
+ mutex_unlock(&iommu->lock);
+ return 0;
+ }
+ symbol_put(mdev_bus_type);
+ }
+
domain->domain = iommu_domain_alloc(bus);
if (!domain->domain) {
ret = -EIO;
@@ -859,6 +1325,46 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
}
+static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
+{
+ struct rb_node *n, *p;
+
+ n = rb_first(&iommu->dma_list);
+ for (; n; n = rb_next(n)) {
+ struct vfio_dma *dma;
+ long locked = 0, unlocked = 0;
+
+ dma = rb_entry(n, struct vfio_dma, node);
+ unlocked += vfio_unmap_unpin(iommu, dma, false);
+ p = rb_first(&dma->pfn_list);
+ for (; p; p = rb_next(p)) {
+ struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
+ node);
+
+ if (!is_invalid_reserved_pfn(vpfn->pfn))
+ locked++;
+ }
+ vfio_lock_acct(dma->task, locked - unlocked);
+ }
+}
+
+static void vfio_sanity_check_pfn_list(struct vfio_iommu *iommu)
+{
+ struct rb_node *n;
+
+ n = rb_first(&iommu->dma_list);
+ for (; n; n = rb_next(n)) {
+ struct vfio_dma *dma;
+
+ dma = rb_entry(n, struct vfio_dma, node);
+
+ if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list)))
+ break;
+ }
+ /* mdev vendor driver must unregister notifier */
+ WARN_ON(iommu->notifier.head);
+}
+
static void vfio_iommu_type1_detach_group(void *iommu_data,
struct iommu_group *iommu_group)
{
@@ -868,31 +1374,55 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
mutex_lock(&iommu->lock);
- list_for_each_entry(domain, &iommu->domain_list, next) {
- list_for_each_entry(group, &domain->group_list, next) {
- if (group->iommu_group != iommu_group)
- continue;
-
- iommu_detach_group(domain->domain, iommu_group);
+ if (iommu->external_domain) {
+ group = find_iommu_group(iommu->external_domain, iommu_group);
+ if (group) {
list_del(&group->next);
kfree(group);
- /*
- * Group ownership provides privilege, if the group
- * list is empty, the domain goes away. If it's the
- * last domain, then all the mappings go away too.
- */
- if (list_empty(&domain->group_list)) {
- if (list_is_singular(&iommu->domain_list))
+
+ if (list_empty(&iommu->external_domain->group_list)) {
+ vfio_sanity_check_pfn_list(iommu);
+
+ if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
vfio_iommu_unmap_unpin_all(iommu);
- iommu_domain_free(domain->domain);
- list_del(&domain->next);
- kfree(domain);
+
+ kfree(iommu->external_domain);
+ iommu->external_domain = NULL;
+ }
+ goto detach_group_done;
+ }
+ }
+
+ list_for_each_entry(domain, &iommu->domain_list, next) {
+ group = find_iommu_group(domain, iommu_group);
+ if (!group)
+ continue;
+
+ iommu_detach_group(domain->domain, iommu_group);
+ list_del(&group->next);
+ kfree(group);
+ /*
+ * Group ownership provides privilege; if the group list is
+ * empty, the domain goes away. If it's the last domain with
+ * an iommu and no external domain exists, then all the
+ * mappings go away too. If it's the last domain with an iommu
+ * and an external domain exists, update accounting.
+ */
+ if (list_empty(&domain->group_list)) {
+ if (list_is_singular(&iommu->domain_list)) {
+ if (!iommu->external_domain)
+ vfio_iommu_unmap_unpin_all(iommu);
+ else
+ vfio_iommu_unmap_unpin_reaccount(iommu);
}
- goto done;
+ iommu_domain_free(domain->domain);
+ list_del(&domain->next);
+ kfree(domain);
}
+ break;
}
-done:
+detach_group_done:
mutex_unlock(&iommu->lock);
}
@@ -920,31 +1450,46 @@ static void *vfio_iommu_type1_open(unsigned long arg)
INIT_LIST_HEAD(&iommu->domain_list);
iommu->dma_list = RB_ROOT;
mutex_init(&iommu->lock);
+ BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
return iommu;
}
+static void vfio_release_domain(struct vfio_domain *domain, bool external)
+{
+ struct vfio_group *group, *group_tmp;
+
+ list_for_each_entry_safe(group, group_tmp,
+ &domain->group_list, next) {
+ if (!external)
+ iommu_detach_group(domain->domain, group->iommu_group);
+ list_del(&group->next);
+ kfree(group);
+ }
+
+ if (!external)
+ iommu_domain_free(domain->domain);
+}
+
static void vfio_iommu_type1_release(void *iommu_data)
{
struct vfio_iommu *iommu = iommu_data;
struct vfio_domain *domain, *domain_tmp;
- struct vfio_group *group, *group_tmp;
+
+ if (iommu->external_domain) {
+ vfio_release_domain(iommu->external_domain, true);
+ vfio_sanity_check_pfn_list(iommu);
+ kfree(iommu->external_domain);
+ }
vfio_iommu_unmap_unpin_all(iommu);
list_for_each_entry_safe(domain, domain_tmp,
&iommu->domain_list, next) {
- list_for_each_entry_safe(group, group_tmp,
- &domain->group_list, next) {
- iommu_detach_group(domain->domain, group->iommu_group);
- list_del(&group->next);
- kfree(group);
- }
- iommu_domain_free(domain->domain);
+ vfio_release_domain(domain, false);
list_del(&domain->next);
kfree(domain);
}
-
kfree(iommu);
}
@@ -1040,14 +1585,42 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
return -ENOTTY;
}
+static int vfio_iommu_type1_register_notifier(void *iommu_data,
+ unsigned long *events,
+ struct notifier_block *nb)
+{
+ struct vfio_iommu *iommu = iommu_data;
+
+ /* clear known events */
+ *events &= ~VFIO_IOMMU_NOTIFY_DMA_UNMAP;
+
+ /* refuse to register if any unknown events remain */
+ if (*events)
+ return -EINVAL;
+
+ return blocking_notifier_chain_register(&iommu->notifier, nb);
+}
+
+static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
+ struct notifier_block *nb)
+{
+ struct vfio_iommu *iommu = iommu_data;
+
+ return blocking_notifier_chain_unregister(&iommu->notifier, nb);
+}
+
static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
- .name = "vfio-iommu-type1",
- .owner = THIS_MODULE,
- .open = vfio_iommu_type1_open,
- .release = vfio_iommu_type1_release,
- .ioctl = vfio_iommu_type1_ioctl,
- .attach_group = vfio_iommu_type1_attach_group,
- .detach_group = vfio_iommu_type1_detach_group,
+ .name = "vfio-iommu-type1",
+ .owner = THIS_MODULE,
+ .open = vfio_iommu_type1_open,
+ .release = vfio_iommu_type1_release,
+ .ioctl = vfio_iommu_type1_ioctl,
+ .attach_group = vfio_iommu_type1_attach_group,
+ .detach_group = vfio_iommu_type1_detach_group,
+ .pin_pages = vfio_iommu_type1_pin_pages,
+ .unpin_pages = vfio_iommu_type1_unpin_pages,
+ .register_notifier = vfio_iommu_type1_register_notifier,
+ .unregister_notifier = vfio_iommu_type1_unregister_notifier,
};
static int __init vfio_iommu_type1_init(void)