-rw-r--r-- | Documentation/ABI/testing/sysfs-bus-vfio-mdev |  111
-rw-r--r-- | Documentation/vfio-mediated-device.txt        |  398
-rw-r--r-- | MAINTAINERS                                   |    9
-rw-r--r-- | drivers/vfio/Kconfig                          |    1
-rw-r--r-- | drivers/vfio/Makefile                         |    1
-rw-r--r-- | drivers/vfio/mdev/Kconfig                     |   17
-rw-r--r-- | drivers/vfio/mdev/Makefile                    |    5
-rw-r--r-- | drivers/vfio/mdev/mdev_core.c                 |  385
-rw-r--r-- | drivers/vfio/mdev/mdev_driver.c               |  119
-rw-r--r-- | drivers/vfio/mdev/mdev_private.h              |   41
-rw-r--r-- | drivers/vfio/mdev/mdev_sysfs.c                |  286
-rw-r--r-- | drivers/vfio/mdev/vfio_mdev.c                 |  148
-rw-r--r-- | drivers/vfio/pci/vfio_pci.c                   |   83
-rw-r--r-- | drivers/vfio/pci/vfio_pci_config.c            |   10
-rw-r--r-- | drivers/vfio/platform/vfio_platform_common.c  |   31
-rw-r--r-- | drivers/vfio/vfio.c                           |  461
-rw-r--r-- | drivers/vfio/vfio_iommu_type1.c               |  885
-rw-r--r-- | include/linux/mdev.h                          |  168
-rw-r--r-- | include/linux/vfio.h                          |   48
-rw-r--r-- | include/uapi/linux/vfio.h                     |   10
-rw-r--r-- | samples/vfio-mdev/Makefile                    |   13
-rw-r--r-- | samples/vfio-mdev/mtty.c                      | 1503
-rw-r--r-- | virt/kvm/vfio.c                               |   18
23 files changed, 4486 insertions, 265 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-vfio-mdev b/Documentation/ABI/testing/sysfs-bus-vfio-mdev
new file mode 100644
index 000000000000..452dbe39270e
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-vfio-mdev
@@ -0,0 +1,111 @@
+What:          /sys/.../<device>/mdev_supported_types/
+Date:          October 2016
+Contact:       Kirti Wankhede <kwankhede@nvidia.com>
+Description:
+               This directory contains a list of directories of the
+               currently supported mediated device types for <device> and
+               their details. Supported type attributes are defined by the
+               vendor driver that registers with the mediated device
+               framework. Each supported type is a directory whose name is
+               created by adding the device driver string as a prefix to
+               the string provided by the vendor driver.
+
+What:          /sys/.../<device>/mdev_supported_types/<type-id>/
+Date:          October 2016
+Contact:       Kirti Wankhede <kwankhede@nvidia.com>
+Description:
+               This directory gives details of the supported type, such as
+               name, description, available_instances, and device_api.
+               'device_api' and 'available_instances' are mandatory
+               attributes to be provided by the vendor driver. 'name',
+               'description' and other vendor-driver-specific attributes
+               are optional.
+
+What:          /sys/.../mdev_supported_types/<type-id>/create
+Date:          October 2016
+Contact:       Kirti Wankhede <kwankhede@nvidia.com>
+Description:
+               Writing a UUID to this file creates a mediated device of
+               type <type-id> for parent device <device>. This is a
+               write-only file.
+               For example:
+               # echo "83b8f4f2-509f-382f-3c1e-e6bfe0fa1001" > \
+                      /sys/devices/foo/mdev_supported_types/foo-1/create
+
+What:          /sys/.../mdev_supported_types/<type-id>/devices/
+Date:          October 2016
+Contact:       Kirti Wankhede <kwankhede@nvidia.com>
+Description:
+               This directory contains symbolic links pointing to the
+               sysfs entries of mdev devices created of this <type-id>.
+
+What:          /sys/.../mdev_supported_types/<type-id>/available_instances
+Date:          October 2016
+Contact:       Kirti Wankhede <kwankhede@nvidia.com>
+Description:
+               Reading this attribute shows the number of mediated devices
+               of type <type-id> that can still be created. This is a
+               read-only file.
+Users:
+               Userspace applications interested in creating a mediated
+               device of this type. Userspace applications should check
+               the number of available instances before creating a
+               mediated device of this type.
+
+What:          /sys/.../mdev_supported_types/<type-id>/device_api
+Date:          October 2016
+Contact:       Kirti Wankhede <kwankhede@nvidia.com>
+Description:
+               Reading this attribute shows the VFIO device API supported
+               by this type, for example "vfio-pci" for a PCI device or
+               "vfio-platform" for a platform device.
+
+What:          /sys/.../mdev_supported_types/<type-id>/name
+Date:          October 2016
+Contact:       Kirti Wankhede <kwankhede@nvidia.com>
+Description:
+               Reading this attribute shows a human-readable name of the
+               mediated device that will be created of type <type-id>.
+               This is an optional attribute. For example: "Grid M60-0Q"
+Users:
+               Userspace applications interested in knowing the name of
+               a particular <type-id>, which can help in understanding
+               the type of mediated device.
+
+What:          /sys/.../mdev_supported_types/<type-id>/description
+Date:          October 2016
+Contact:       Kirti Wankhede <kwankhede@nvidia.com>
+Description:
+               Reading this attribute shows a description of the type of
+               mediated device that will be created of type <type-id>.
+               This is an optional attribute. For example:
+               "2 heads, 512M FB, 2560x1600 maximum resolution"
+Users:
+               Userspace applications interested in knowing the details of
+               a particular <type-id>, which can help in understanding the
+               features provided by that type of mediated device.
+
+What:          /sys/.../<device>/<UUID>/
+Date:          October 2016
+Contact:       Kirti Wankhede <kwankhede@nvidia.com>
+Description:
+               This directory is the device directory of a mediated
+               device. It contains all the attributes related to the
+               mediated device.
+
+What:          /sys/.../<device>/<UUID>/mdev_type
+Date:          October 2016
+Contact:       Kirti Wankhede <kwankhede@nvidia.com>
+Description:
+               This is a symbolic link pointing to the <type-id> directory
+               of the supported type from which this mediated device was
+               created.
+
+What:          /sys/.../<device>/<UUID>/remove
+Date:          October 2016
+Contact:       Kirti Wankhede <kwankhede@nvidia.com>
+Description:
+               Writing '1' to this file destroys the mediated device. The
+               vendor driver can fail the remove() callback if that device
+               is active and the vendor driver doesn't support hot unplug.
+               Example:
+               # echo 1 > /sys/bus/mdev/devices/<UUID>/remove
diff --git a/Documentation/vfio-mediated-device.txt b/Documentation/vfio-mediated-device.txt
new file mode 100644
index 000000000000..b38afec35edc
--- /dev/null
+++ b/Documentation/vfio-mediated-device.txt
@@ -0,0 +1,398 @@
+/*
+ * VFIO Mediated devices
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *             Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+Virtual Function I/O (VFIO) Mediated devices[1]
+===============================================
+
+The number of use cases for virtualizing DMA devices that do not have
+built-in SR-IOV capability is increasing. Previously, to virtualize such
+devices, developers had to create their own management interfaces and APIs,
+and then integrate them with user space software. To simplify integration
+with user space software, we have identified common requirements and a
+unified management interface for such devices.
+
+The VFIO driver framework provides unified APIs for direct device access. It
+is an IOMMU/device-agnostic framework for exposing direct device access to
+user space in a secure, IOMMU-protected environment. This framework is used
+for multiple devices, such as GPUs, network adapters, and compute
+accelerators. With direct device access, virtual machines or user space
+applications have direct access to the physical device. This framework is
+reused for mediated devices.
+
+The mediated core driver provides a common interface for mediated device
+management that can be used by drivers of different devices. This module
+provides a generic interface to perform these operations:
+
+* Create and destroy a mediated device
+* Add a mediated device to and remove it from a mediated bus driver
+* Add a mediated device to and remove it from an IOMMU group
+
+The mediated core driver also provides an interface to register a bus driver.
+For example, the mediated VFIO mdev driver is designed for mediated devices
+and supports VFIO APIs. The mediated bus driver adds a mediated device to and
+removes it from a VFIO group.
+
+The following high-level block diagram shows the main components and
+interfaces in the VFIO mediated driver framework.
+The diagram shows NVIDIA, Intel, and IBM devices as examples, as these
+devices are the first to use this module.
+
+     +---------------+
+     |               |
+     | +-----------+ |  mdev_register_driver() +--------------+
+     | |           | +<------------------------+              |
+     | |  mdev     | |                         |              |
+     | |  bus      | +------------------------>+ vfio_mdev.ko |<-> VFIO user
+     | |  driver   | |     probe()/remove()    |              |    APIs
+     | |           | |                         +--------------+
+     | +-----------+ |
+     |               |
+     |  MDEV CORE    |
+     |   MODULE      |
+     |   mdev.ko     |
+     | +-----------+ |  mdev_register_device() +--------------+
+     | |           | +<------------------------+              |
+     | |           | |                         |  nvidia.ko   |<-> physical
+     | |           | +------------------------>+              |    device
+     | |           | |        callbacks        +--------------+
+     | | Physical  | |
+     | |  device   | |  mdev_register_device() +--------------+
+     | | interface | |<------------------------+              |
+     | |           | |                         |  i915.ko     |<-> physical
+     | |           | +------------------------>+              |    device
+     | |           | |        callbacks        +--------------+
+     | |           | |
+     | |           | |  mdev_register_device() +--------------+
+     | |           | +<------------------------+              |
+     | |           | |                         | ccw_device.ko|<-> physical
+     | |           | +------------------------>+              |    device
+     | |           | |        callbacks        +--------------+
+     | +-----------+ |
+     +---------------+
+
+
+Registration Interfaces
+=======================
+
+The mediated core driver provides the following types of registration
+interfaces:
+
+* Registration interface for a mediated bus driver
+* Physical device driver interface
+
+Registration Interface for a Mediated Bus Driver
+------------------------------------------------
+
+The registration interface for a mediated bus driver provides the following
+structure to represent a mediated device's driver:
+
+	/*
+	 * struct mdev_driver [2] - Mediated device's driver
+	 * @name: driver name
+	 * @probe: called when new device created
+	 * @remove: called when device removed
+	 * @driver: device driver structure
+	 */
+	struct mdev_driver {
+		const char *name;
+		int  (*probe)(struct device *dev);
+		void (*remove)(struct device *dev);
+		struct device_driver driver;
+	};
+
+A mediated bus driver for mdev should use this structure in the function
+calls to register and unregister itself with the core driver:
+
+* Register:
+
+	extern int mdev_register_driver(struct mdev_driver *drv,
+					struct module *owner);
+
+* Unregister:
+
+	extern void mdev_unregister_driver(struct mdev_driver *drv);
+
+The mediated bus driver is responsible for adding mediated devices to the
+VFIO group when devices are bound to the driver and removing mediated
+devices from the VFIO group when devices are unbound from the driver.
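As a rough illustration of this interface (not part of this patch), the
following minimal sketch shows how a mediated bus driver might register
itself, modeled on the vfio_mdev driver added later in this series. The
my_mdev_* names are hypothetical:

	#include <linux/device.h>
	#include <linux/module.h>
	#include <linux/mdev.h>

	static int my_mdev_probe(struct device *dev)
	{
		struct mdev_device *mdev = to_mdev_device(dev);

		/*
		 * Bind the mdev to whatever user-facing interface this bus
		 * driver offers (vfio_mdev, for example, calls
		 * vfio_add_group_dev() here).
		 */
		(void)mdev;
		return 0;
	}

	static void my_mdev_remove(struct device *dev)
	{
		/* Undo whatever probe() set up. */
	}

	static struct mdev_driver my_mdev_driver = {
		.name   = "my_mdev",
		.probe  = my_mdev_probe,
		.remove = my_mdev_remove,
	};

	static int __init my_mdev_init(void)
	{
		return mdev_register_driver(&my_mdev_driver, THIS_MODULE);
	}

	static void __exit my_mdev_exit(void)
	{
		mdev_unregister_driver(&my_mdev_driver);
	}

	module_init(my_mdev_init);
	module_exit(my_mdev_exit);
	MODULE_LICENSE("GPL v2");

Note that the mdev core (mdev_probe() in mdev_driver.c below) attaches the
device to an IOMMU group before invoking this probe(), so the bus driver
itself does not manage IOMMU groups.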
+
+Physical Device Driver Interface
+--------------------------------
+
+The physical device driver interface provides the parent_ops[3] structure to
+define the APIs that manage work in the mediated core driver related to the
+physical device.
+
+The structures in the parent_ops structure are as follows:
+
+* dev_attr_groups: attributes of the parent device
+* mdev_attr_groups: attributes of the mediated device
+* supported_type_groups: attributes to define supported mediated device
+  types (this member is mandatory; the core rejects registration without it)
+
+The functions in the parent_ops structure are as follows:
+
+* create: allocate basic resources in a driver for a mediated device
+* remove: free resources in a driver when a mediated device is destroyed
+
+The callbacks in the parent_ops structure are as follows:
+
+* open: open callback of mediated device
+* release: release callback of mediated device
+* ioctl: ioctl callback of mediated device
+* read: read emulation callback
+* write: write emulation callback
+* mmap: mmap emulation callback
+
+A driver should use the parent_ops structure in the function call to
+register itself with the mdev core driver:
+
+	extern int mdev_register_device(struct device *dev,
+					const struct parent_ops *ops);
+
+However, the parent_ops structure is not required in the function call that
+a driver should use to unregister itself with the mdev core driver:
+
+	extern void mdev_unregister_device(struct device *dev);
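To show how a vendor driver might tie these pieces together, here is a
condensed, hypothetical sketch: it supplies the mandatory
supported_type_groups, create, and remove members and registers against a
parent device. All my_* names and the instance budget are illustrative, and
the sketch assumes this patch's definitions of MDEV_TYPE_ATTR_RO,
VFIO_DEVICE_API_PCI_STRING, and the owner field of parent_ops; the type
attributes used here (name, device_api, available_instances) are described
in the sysfs section below:

	#include <linux/device.h>
	#include <linux/module.h>
	#include <linux/mdev.h>
	#include <linux/vfio.h>
	#include <linux/atomic.h>

	static atomic_t my_avail = ATOMIC_INIT(4);	/* illustrative budget */

	static ssize_t name_show(struct kobject *kobj, struct device *dev,
				 char *buf)
	{
		return sprintf(buf, "%s\n", kobj->name);
	}
	static MDEV_TYPE_ATTR_RO(name);

	static ssize_t device_api_show(struct kobject *kobj,
				       struct device *dev, char *buf)
	{
		return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
	}
	static MDEV_TYPE_ATTR_RO(device_api);

	static ssize_t available_instances_show(struct kobject *kobj,
						struct device *dev, char *buf)
	{
		return sprintf(buf, "%d\n", atomic_read(&my_avail));
	}
	static MDEV_TYPE_ATTR_RO(available_instances);

	static struct attribute *my_type_attrs[] = {
		&mdev_type_attr_name.attr,
		&mdev_type_attr_device_api.attr,
		&mdev_type_attr_available_instances.attr,
		NULL,
	};

	static struct attribute_group my_type_group = {
		.name  = "fast",	/* appears as "<driver>-fast" in sysfs */
		.attrs = my_type_attrs,
	};

	static struct attribute_group *my_type_groups[] = {
		&my_type_group,
		NULL,
	};

	static int my_create(struct kobject *kobj, struct mdev_device *mdev)
	{
		if (atomic_dec_if_positive(&my_avail) < 0)
			return -EAGAIN;
		/* allocate per-mdev state here */
		return 0;
	}

	static int my_remove(struct mdev_device *mdev)
	{
		/* free per-mdev state; may return an error to refuse removal */
		atomic_inc(&my_avail);
		return 0;
	}

	static const struct parent_ops my_parent_ops = {
		.owner			= THIS_MODULE,
		.supported_type_groups	= my_type_groups,
		.create			= my_create,
		.remove			= my_remove,
		/* open/release/read/write/ioctl/mmap as needed */
	};

	/* from the vendor driver's own device probe path: */
	/*	ret = mdev_register_device(parent_dev, &my_parent_ops);  */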
+
+Mediated Device Management Interface Through sysfs
+==================================================
+
+The management interface through sysfs enables user space software, such as
+libvirt, to query and configure mediated devices in a hardware-agnostic
+fashion. This management interface provides flexibility to the underlying
+physical device's driver to support features such as:
+
+* Mediated device hot plug
+* Multiple mediated devices in a single virtual machine
+* Multiple mediated devices from different physical devices
+
+Links in the mdev_bus Class Directory
+-------------------------------------
+The /sys/class/mdev_bus/ directory contains links to devices that are
+registered with the mdev core driver.
+
+Directories and files under the sysfs for Each Physical Device
+--------------------------------------------------------------
+
+|- [parent physical device]
+|--- Vendor-specific-attributes [optional]
+|--- [mdev_supported_types]
+|     |--- [<type-id>]
+|     |   |--- create
+|     |   |--- name
+|     |   |--- available_instances
+|     |   |--- device_api
+|     |   |--- description
+|     |   |--- [devices]
+|     |--- [<type-id>]
+|     |   |--- create
+|     |   |--- name
+|     |   |--- available_instances
+|     |   |--- device_api
+|     |   |--- description
+|     |   |--- [devices]
+|     |--- [<type-id>]
+|          |--- create
+|          |--- name
+|          |--- available_instances
+|          |--- device_api
+|          |--- description
+|          |--- [devices]
+
+* [mdev_supported_types]
+
+  The list of currently supported mediated device types and their details.
+
+  [<type-id>], device_api, and available_instances are mandatory attributes
+  that should be provided by the vendor driver.
+
+* [<type-id>]
+
+  The [<type-id>] name is created by adding the device driver string as a
+  prefix to the string provided by the vendor driver. The format of this
+  name is as follows:
+
+	sprintf(buf, "%s-%s", dev_driver_string(parent->dev), group->name);
+
+* device_api
+
+  This attribute should show which device API is being created, for
+  example, "vfio-pci" for a PCI device.
+
+* available_instances
+
+  This attribute should show the number of devices of type <type-id> that
+  can be created.
+
+* [devices]
+
+  This directory contains links to the devices of type <type-id> that have
+  been created.
+
+* name
+
+  This attribute should show a human-readable name. This is an optional
+  attribute.
+
+* description
+
+  This attribute should show a brief description of the features of the
+  type. This is an optional attribute.
+
+Directories and Files Under the sysfs for Each mdev Device
+----------------------------------------------------------
+
+|- [parent phy device]
+|--- [$MDEV_UUID]
+         |--- remove
+         |--- mdev_type {link to its type}
+         |--- vendor-specific-attributes [optional]
+
+* remove (write only)
+
+Writing '1' to the 'remove' file destroys the mdev device. The vendor driver
+can fail the remove() callback if that device is active and the vendor
+driver doesn't support hot unplug.
+
+Example:
+
+	# echo 1 > /sys/bus/mdev/devices/$mdev_UUID/remove
+
+Mediated device Hot plug
+------------------------
+
+Mediated devices can be created and assigned at runtime. The procedure to
+hot plug a mediated device is the same as the procedure to hot plug a PCI
+device.
+
+Translation APIs for Mediated Devices
+=====================================
+
+The following APIs are provided for translating a user pfn to a host pfn in
+a VFIO driver:
+
+	extern int vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
+				  int npage, int prot,
+				  unsigned long *phys_pfn);
+
+	extern int vfio_unpin_pages(struct device *dev,
+				    unsigned long *user_pfn, int npage);
+
+These functions call back into the back-end IOMMU module by using the
+pin_pages and unpin_pages callbacks of struct vfio_iommu_driver_ops[4].
+Currently these callbacks are supported only in the TYPE1 IOMMU module. To
+enable them for other IOMMU backend modules, such as the PPC64 sPAPR module,
+those modules need to provide these two callback functions.
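As a usage sketch of these translation APIs from a vendor driver (the my_*
helpers are hypothetical; a real driver would typically batch up to
VFIO_PIN_PAGES_MAX_ENTRIES entries per call rather than pin one page at a
time):

	#include <linux/vfio.h>
	#include <linux/mdev.h>
	#include <linux/iommu.h>

	/* Pin one guest page frame and return the backing host pfn. */
	static int my_dma_map_one(struct mdev_device *mdev, unsigned long gfn,
				  unsigned long *hpfn)
	{
		unsigned long user_pfn = gfn;
		int ret;

		/*
		 * vfio_pin_pages() returns the number of pages pinned
		 * (1 here) or a negative errno.
		 */
		ret = vfio_pin_pages(&mdev->dev, &user_pfn, 1,
				     IOMMU_READ | IOMMU_WRITE, hpfn);
		if (ret < 0)
			return ret;
		return ret == 1 ? 0 : -EFAULT;
	}

	static void my_dma_unmap_one(struct mdev_device *mdev,
				     unsigned long gfn)
	{
		unsigned long user_pfn = gfn;

		vfio_unpin_pages(&mdev->dev, &user_pfn, 1);
	}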
+
+Using the Sample Code
+=====================
+
+mtty.c in the samples/vfio-mdev/ directory is a sample driver program that
+demonstrates how to use the mediated device framework.
+
+The sample driver creates an mdev device that simulates a serial port over
+a PCI card.
+
+1. Build and load the mtty.ko module.
+
+   This step creates a dummy device, /sys/devices/virtual/mtty/mtty/
+
+   Files in this device directory in sysfs are similar to the following:
+
+	# tree /sys/devices/virtual/mtty/mtty/
+	/sys/devices/virtual/mtty/mtty/
+	|-- mdev_supported_types
+	|   |-- mtty-1
+	|   |   |-- available_instances
+	|   |   |-- create
+	|   |   |-- device_api
+	|   |   |-- devices
+	|   |   `-- name
+	|   `-- mtty-2
+	|       |-- available_instances
+	|       |-- create
+	|       |-- device_api
+	|       |-- devices
+	|       `-- name
+	|-- mtty_dev
+	|   `-- sample_mtty_dev
+	|-- power
+	|   |-- autosuspend_delay_ms
+	|   |-- control
+	|   |-- runtime_active_time
+	|   |-- runtime_status
+	|   `-- runtime_suspended_time
+	|-- subsystem -> ../../../../class/mtty
+	`-- uevent
+
+2. Create a mediated device by using the dummy device that you created in
+   the previous step.
+
+	# echo "83b8f4f2-509f-382f-3c1e-e6bfe0fa1001" > \
+	       /sys/devices/virtual/mtty/mtty/mdev_supported_types/mtty-2/create
+
+3. Add parameters to qemu-kvm.
+
+	-device vfio-pci,\
+	 sysfsdev=/sys/bus/mdev/devices/83b8f4f2-509f-382f-3c1e-e6bfe0fa1001
+
+4. Boot the VM.
+
+   In the Linux guest VM, with no hardware on the host, the device appears
+   as follows:
+
+	# lspci -s 00:05.0 -xxvv
+	00:05.0 Serial controller: Device 4348:3253 (rev 10) (prog-if 02 [16550])
+		Subsystem: Device 4348:3253
+		Physical Slot: 5
+		Control: I/O+ Mem- BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr-
+			Stepping- SERR- FastB2B- DisINTx-
+		Status: Cap- 66MHz- UDF- FastB2B- ParErr- DEVSEL=medium >TAbort-
+			<TAbort- <MAbort- >SERR- <PERR- INTx-
+		Interrupt: pin A routed to IRQ 10
+		Region 0: I/O ports at c150 [size=8]
+		Region 1: I/O ports at c158 [size=8]
+		Kernel driver in use: serial
+	00: 48 43 53 32 01 00 00 02 10 02 00 07 00 00 00 00
+	10: 51 c1 00 00 59 c1 00 00 00 00 00 00 00 00 00 00
+	20: 00 00 00 00 00 00 00 00 00 00 00 00 48 43 53 32
+	30: 00 00 00 00 00 00 00 00 00 00 00 00 0a 01 00 00
+
+   In the Linux guest VM, dmesg output for the device is as follows:
+
+	serial 0000:00:05.0: PCI INT A -> Link[LNKA] -> GSI 10 (level, high) -> IRQ 10
+	0000:00:05.0: ttyS1 at I/O 0xc150 (irq = 10) is a 16550A
+	0000:00:05.0: ttyS2 at I/O 0xc158 (irq = 10) is a 16550A
+
+5. In the Linux guest VM, check the serial ports.
+
+	# setserial -g /dev/ttyS*
+	/dev/ttyS0, UART: 16550A, Port: 0x03f8, IRQ: 4
+	/dev/ttyS1, UART: 16550A, Port: 0xc150, IRQ: 10
+	/dev/ttyS2, UART: 16550A, Port: 0xc158, IRQ: 10
+
+6. Using minicom or any other terminal emulation program, open port
+   /dev/ttyS1 or /dev/ttyS2 with hardware flow control disabled.
+
+7. Type data on the minicom terminal or send data to the terminal emulation
+   program and read the data.
+
+   Data is looped back through the host's mtty driver.
+
+8. Destroy the mediated device that you created.
+
+	# echo 1 > /sys/bus/mdev/devices/83b8f4f2-509f-382f-3c1e-e6bfe0fa1001/remove
+
+References
+==========
+
+[1] See Documentation/vfio.txt for more information on VFIO.
+[2] struct mdev_driver in include/linux/mdev.h +[3] struct parent_ops in include/linux/mdev.h +[4] struct vfio_iommu_driver_ops in include/linux/vfio.h diff --git a/MAINTAINERS b/MAINTAINERS index fde08b1b5b7f..cffba3e40199 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12791,6 +12791,15 @@ F: drivers/vfio/ F: include/linux/vfio.h F: include/uapi/linux/vfio.h +VFIO MEDIATED DEVICE DRIVERS +M: Kirti Wankhede <kwankhede@nvidia.com> +L: kvm@vger.kernel.org +S: Maintained +F: Documentation/vfio-mediated-device.txt +F: drivers/vfio/mdev/ +F: include/linux/mdev.h +F: samples/vfio-mdev/ + VFIO PLATFORM DRIVER M: Baptiste Reynal <b.reynal@virtualopensystems.com> L: kvm@vger.kernel.org diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index da6e2ce77495..23eced02aaf6 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -48,4 +48,5 @@ menuconfig VFIO_NOIOMMU source "drivers/vfio/pci/Kconfig" source "drivers/vfio/platform/Kconfig" +source "drivers/vfio/mdev/Kconfig" source "virt/lib/Kconfig" diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index 7b8a31f63fea..4a23c13b6be4 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o obj-$(CONFIG_VFIO_PCI) += pci/ obj-$(CONFIG_VFIO_PLATFORM) += platform/ +obj-$(CONFIG_VFIO_MDEV) += mdev/ diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig new file mode 100644 index 000000000000..14fdb106a827 --- /dev/null +++ b/drivers/vfio/mdev/Kconfig @@ -0,0 +1,17 @@ + +config VFIO_MDEV + tristate "Mediated device driver framework" + depends on VFIO + default n + help + Provides a framework to virtualize devices. + See Documentation/vfio-mediated-device.txt for more details. + + If you don't know what do here, say N. + +config VFIO_MDEV_DEVICE + tristate "VFIO driver for Mediated devices" + depends on VFIO && VFIO_MDEV + default n + help + VFIO based driver for Mediated devices. diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile new file mode 100644 index 000000000000..fa2d5ea466ee --- /dev/null +++ b/drivers/vfio/mdev/Makefile @@ -0,0 +1,5 @@ + +mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o + +obj-$(CONFIG_VFIO_MDEV) += mdev.o +obj-$(CONFIG_VFIO_MDEV_DEVICE) += vfio_mdev.o diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c new file mode 100644 index 000000000000..be1ee89ee917 --- /dev/null +++ b/drivers/vfio/mdev/mdev_core.c @@ -0,0 +1,385 @@ +/* + * Mediated device Core Driver + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/uuid.h> +#include <linux/sysfs.h> +#include <linux/mdev.h> + +#include "mdev_private.h" + +#define DRIVER_VERSION "0.1" +#define DRIVER_AUTHOR "NVIDIA Corporation" +#define DRIVER_DESC "Mediated device Core Driver" + +static LIST_HEAD(parent_list); +static DEFINE_MUTEX(parent_list_lock); +static struct class_compat *mdev_bus_compat_class; + +static int _find_mdev_device(struct device *dev, void *data) +{ + struct mdev_device *mdev; + + if (!dev_is_mdev(dev)) + return 0; + + mdev = to_mdev_device(dev); + + if (uuid_le_cmp(mdev->uuid, *(uuid_le *)data) == 0) + return 1; + + return 0; +} + +static bool mdev_device_exist(struct parent_device *parent, uuid_le uuid) +{ + struct device *dev; + + dev = device_find_child(parent->dev, &uuid, _find_mdev_device); + if (dev) { + put_device(dev); + return true; + } + + return false; +} + +/* Should be called holding parent_list_lock */ +static struct parent_device *__find_parent_device(struct device *dev) +{ + struct parent_device *parent; + + list_for_each_entry(parent, &parent_list, next) { + if (parent->dev == dev) + return parent; + } + return NULL; +} + +static void mdev_release_parent(struct kref *kref) +{ + struct parent_device *parent = container_of(kref, struct parent_device, + ref); + struct device *dev = parent->dev; + + kfree(parent); + put_device(dev); +} + +static +inline struct parent_device *mdev_get_parent(struct parent_device *parent) +{ + if (parent) + kref_get(&parent->ref); + + return parent; +} + +static inline void mdev_put_parent(struct parent_device *parent) +{ + if (parent) + kref_put(&parent->ref, mdev_release_parent); +} + +static int mdev_device_create_ops(struct kobject *kobj, + struct mdev_device *mdev) +{ + struct parent_device *parent = mdev->parent; + int ret; + + ret = parent->ops->create(kobj, mdev); + if (ret) + return ret; + + ret = sysfs_create_groups(&mdev->dev.kobj, + parent->ops->mdev_attr_groups); + if (ret) + parent->ops->remove(mdev); + + return ret; +} + +/* + * mdev_device_remove_ops gets called from sysfs's 'remove' and when parent + * device is being unregistered from mdev device framework. + * - 'force_remove' is set to 'false' when called from sysfs's 'remove' which + * indicates that if the mdev device is active, used by VMM or userspace + * application, vendor driver could return error then don't remove the device. + * - 'force_remove' is set to 'true' when called from mdev_unregister_device() + * which indicate that parent device is being removed from mdev device + * framework so remove mdev device forcefully. + */ +static int mdev_device_remove_ops(struct mdev_device *mdev, bool force_remove) +{ + struct parent_device *parent = mdev->parent; + int ret; + + /* + * Vendor driver can return error if VMM or userspace application is + * using this mdev device. + */ + ret = parent->ops->remove(mdev); + if (ret && !force_remove) + return -EBUSY; + + sysfs_remove_groups(&mdev->dev.kobj, parent->ops->mdev_attr_groups); + return 0; +} + +static int mdev_device_remove_cb(struct device *dev, void *data) +{ + if (!dev_is_mdev(dev)) + return 0; + + return mdev_device_remove(dev, data ? *(bool *)data : true); +} + +/* + * mdev_register_device : Register a device + * @dev: device structure representing parent device. + * @ops: Parent device operation structure to be registered. + * + * Add device to list of registered parent devices. + * Returns a negative value on error, otherwise 0. 
+ */ +int mdev_register_device(struct device *dev, const struct parent_ops *ops) +{ + int ret; + struct parent_device *parent; + + /* check for mandatory ops */ + if (!ops || !ops->create || !ops->remove || !ops->supported_type_groups) + return -EINVAL; + + dev = get_device(dev); + if (!dev) + return -EINVAL; + + mutex_lock(&parent_list_lock); + + /* Check for duplicate */ + parent = __find_parent_device(dev); + if (parent) { + ret = -EEXIST; + goto add_dev_err; + } + + parent = kzalloc(sizeof(*parent), GFP_KERNEL); + if (!parent) { + ret = -ENOMEM; + goto add_dev_err; + } + + kref_init(&parent->ref); + mutex_init(&parent->lock); + + parent->dev = dev; + parent->ops = ops; + + if (!mdev_bus_compat_class) { + mdev_bus_compat_class = class_compat_register("mdev_bus"); + if (!mdev_bus_compat_class) { + ret = -ENOMEM; + goto add_dev_err; + } + } + + ret = parent_create_sysfs_files(parent); + if (ret) + goto add_dev_err; + + ret = class_compat_create_link(mdev_bus_compat_class, dev, NULL); + if (ret) + dev_warn(dev, "Failed to create compatibility class link\n"); + + list_add(&parent->next, &parent_list); + mutex_unlock(&parent_list_lock); + + dev_info(dev, "MDEV: Registered\n"); + return 0; + +add_dev_err: + mutex_unlock(&parent_list_lock); + if (parent) + mdev_put_parent(parent); + else + put_device(dev); + return ret; +} +EXPORT_SYMBOL(mdev_register_device); + +/* + * mdev_unregister_device : Unregister a parent device + * @dev: device structure representing parent device. + * + * Remove device from list of registered parent devices. Give a chance to free + * existing mediated devices for given device. + */ + +void mdev_unregister_device(struct device *dev) +{ + struct parent_device *parent; + bool force_remove = true; + + mutex_lock(&parent_list_lock); + parent = __find_parent_device(dev); + + if (!parent) { + mutex_unlock(&parent_list_lock); + return; + } + dev_info(dev, "MDEV: Unregistering\n"); + + list_del(&parent->next); + class_compat_remove_link(mdev_bus_compat_class, dev, NULL); + + device_for_each_child(dev, (void *)&force_remove, + mdev_device_remove_cb); + + parent_remove_sysfs_files(parent); + + mutex_unlock(&parent_list_lock); + mdev_put_parent(parent); +} +EXPORT_SYMBOL(mdev_unregister_device); + +static void mdev_device_release(struct device *dev) +{ + struct mdev_device *mdev = to_mdev_device(dev); + + dev_dbg(&mdev->dev, "MDEV: destroying\n"); + kfree(mdev); +} + +int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid) +{ + int ret; + struct mdev_device *mdev; + struct parent_device *parent; + struct mdev_type *type = to_mdev_type(kobj); + + parent = mdev_get_parent(type->parent); + if (!parent) + return -EINVAL; + + mutex_lock(&parent->lock); + + /* Check for duplicate */ + if (mdev_device_exist(parent, uuid)) { + ret = -EEXIST; + goto create_err; + } + + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); + if (!mdev) { + ret = -ENOMEM; + goto create_err; + } + + memcpy(&mdev->uuid, &uuid, sizeof(uuid_le)); + mdev->parent = parent; + kref_init(&mdev->ref); + + mdev->dev.parent = dev; + mdev->dev.bus = &mdev_bus_type; + mdev->dev.release = mdev_device_release; + dev_set_name(&mdev->dev, "%pUl", uuid.b); + + ret = device_register(&mdev->dev); + if (ret) { + put_device(&mdev->dev); + goto create_err; + } + + ret = mdev_device_create_ops(kobj, mdev); + if (ret) + goto create_failed; + + ret = mdev_create_sysfs_files(&mdev->dev, type); + if (ret) { + mdev_device_remove_ops(mdev, true); + goto create_failed; + } + + mdev->type_kobj = kobj; + dev_dbg(&mdev->dev, 
"MDEV: created\n"); + + mutex_unlock(&parent->lock); + return ret; + +create_failed: + device_unregister(&mdev->dev); + +create_err: + mutex_unlock(&parent->lock); + mdev_put_parent(parent); + return ret; +} + +int mdev_device_remove(struct device *dev, bool force_remove) +{ + struct mdev_device *mdev; + struct parent_device *parent; + struct mdev_type *type; + int ret; + + mdev = to_mdev_device(dev); + type = to_mdev_type(mdev->type_kobj); + parent = mdev->parent; + mutex_lock(&parent->lock); + + ret = mdev_device_remove_ops(mdev, force_remove); + if (ret) { + mutex_unlock(&parent->lock); + return ret; + } + + mdev_remove_sysfs_files(dev, type); + device_unregister(dev); + mutex_unlock(&parent->lock); + mdev_put_parent(parent); + return ret; +} + +static int __init mdev_init(void) +{ + int ret; + + ret = mdev_bus_register(); + + /* + * Attempt to load known vfio_mdev. This gives us a working environment + * without the user needing to explicitly load vfio_mdev driver. + */ + if (!ret) + request_module_nowait("vfio_mdev"); + + return ret; +} + +static void __exit mdev_exit(void) +{ + if (mdev_bus_compat_class) + class_compat_unregister(mdev_bus_compat_class); + + mdev_bus_unregister(); +} + +module_init(mdev_init) +module_exit(mdev_exit) + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c new file mode 100644 index 000000000000..6f0391f6f9b6 --- /dev/null +++ b/drivers/vfio/mdev/mdev_driver.c @@ -0,0 +1,119 @@ +/* + * MDEV driver + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/device.h> +#include <linux/iommu.h> +#include <linux/mdev.h> + +#include "mdev_private.h" + +static int mdev_attach_iommu(struct mdev_device *mdev) +{ + int ret; + struct iommu_group *group; + + group = iommu_group_alloc(); + if (IS_ERR(group)) + return PTR_ERR(group); + + ret = iommu_group_add_device(group, &mdev->dev); + if (!ret) + dev_info(&mdev->dev, "MDEV: group_id = %d\n", + iommu_group_id(group)); + + iommu_group_put(group); + return ret; +} + +static void mdev_detach_iommu(struct mdev_device *mdev) +{ + iommu_group_remove_device(&mdev->dev); + dev_info(&mdev->dev, "MDEV: detaching iommu\n"); +} + +static int mdev_probe(struct device *dev) +{ + struct mdev_driver *drv = to_mdev_driver(dev->driver); + struct mdev_device *mdev = to_mdev_device(dev); + int ret; + + ret = mdev_attach_iommu(mdev); + if (ret) + return ret; + + if (drv && drv->probe) { + ret = drv->probe(dev); + if (ret) + mdev_detach_iommu(mdev); + } + + return ret; +} + +static int mdev_remove(struct device *dev) +{ + struct mdev_driver *drv = to_mdev_driver(dev->driver); + struct mdev_device *mdev = to_mdev_device(dev); + + if (drv && drv->remove) + drv->remove(dev); + + mdev_detach_iommu(mdev); + + return 0; +} + +struct bus_type mdev_bus_type = { + .name = "mdev", + .probe = mdev_probe, + .remove = mdev_remove, +}; +EXPORT_SYMBOL_GPL(mdev_bus_type); + +/** + * mdev_register_driver - register a new MDEV driver + * @drv: the driver to register + * @owner: module owner of driver to be registered + * + * Returns a negative value on error, otherwise 0. 
+ **/ +int mdev_register_driver(struct mdev_driver *drv, struct module *owner) +{ + /* initialize common driver fields */ + drv->driver.name = drv->name; + drv->driver.bus = &mdev_bus_type; + drv->driver.owner = owner; + + /* register with core */ + return driver_register(&drv->driver); +} +EXPORT_SYMBOL(mdev_register_driver); + +/* + * mdev_unregister_driver - unregister MDEV driver + * @drv: the driver to unregister + */ +void mdev_unregister_driver(struct mdev_driver *drv) +{ + driver_unregister(&drv->driver); +} +EXPORT_SYMBOL(mdev_unregister_driver); + +int mdev_bus_register(void) +{ + return bus_register(&mdev_bus_type); +} + +void mdev_bus_unregister(void) +{ + bus_unregister(&mdev_bus_type); +} diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h new file mode 100644 index 000000000000..d35097cbf3d7 --- /dev/null +++ b/drivers/vfio/mdev/mdev_private.h @@ -0,0 +1,41 @@ +/* + * Mediated device interal definitions + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef MDEV_PRIVATE_H +#define MDEV_PRIVATE_H + +int mdev_bus_register(void); +void mdev_bus_unregister(void); + +struct mdev_type { + struct kobject kobj; + struct kobject *devices_kobj; + struct parent_device *parent; + struct list_head next; + struct attribute_group *group; +}; + +#define to_mdev_type_attr(_attr) \ + container_of(_attr, struct mdev_type_attribute, attr) +#define to_mdev_type(_kobj) \ + container_of(_kobj, struct mdev_type, kobj) + +int parent_create_sysfs_files(struct parent_device *parent); +void parent_remove_sysfs_files(struct parent_device *parent); + +int mdev_create_sysfs_files(struct device *dev, struct mdev_type *type); +void mdev_remove_sysfs_files(struct device *dev, struct mdev_type *type); + +int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid); +int mdev_device_remove(struct device *dev, bool force_remove); + +#endif /* MDEV_PRIVATE_H */ diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c new file mode 100644 index 000000000000..1a53deb2ee10 --- /dev/null +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -0,0 +1,286 @@ +/* + * File attributes for Mediated devices + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/sysfs.h> +#include <linux/ctype.h> +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/uuid.h> +#include <linux/mdev.h> + +#include "mdev_private.h" + +/* Static functions */ + +static ssize_t mdev_type_attr_show(struct kobject *kobj, + struct attribute *__attr, char *buf) +{ + struct mdev_type_attribute *attr = to_mdev_type_attr(__attr); + struct mdev_type *type = to_mdev_type(kobj); + ssize_t ret = -EIO; + + if (attr->show) + ret = attr->show(kobj, type->parent->dev, buf); + return ret; +} + +static ssize_t mdev_type_attr_store(struct kobject *kobj, + struct attribute *__attr, + const char *buf, size_t count) +{ + struct mdev_type_attribute *attr = to_mdev_type_attr(__attr); + struct mdev_type *type = to_mdev_type(kobj); + ssize_t ret = -EIO; + + if (attr->store) + ret = attr->store(&type->kobj, type->parent->dev, buf, count); + return ret; +} + +static const struct sysfs_ops mdev_type_sysfs_ops = { + .show = mdev_type_attr_show, + .store = mdev_type_attr_store, +}; + +static ssize_t create_store(struct kobject *kobj, struct device *dev, + const char *buf, size_t count) +{ + char *str; + uuid_le uuid; + int ret; + + if ((count < UUID_STRING_LEN) || (count > UUID_STRING_LEN + 1)) + return -EINVAL; + + str = kstrndup(buf, count, GFP_KERNEL); + if (!str) + return -ENOMEM; + + ret = uuid_le_to_bin(str, &uuid); + kfree(str); + if (ret) + return ret; + + ret = mdev_device_create(kobj, dev, uuid); + if (ret) + return ret; + + return count; +} + +MDEV_TYPE_ATTR_WO(create); + +static void mdev_type_release(struct kobject *kobj) +{ + struct mdev_type *type = to_mdev_type(kobj); + + pr_debug("Releasing group %s\n", kobj->name); + kfree(type); +} + +static struct kobj_type mdev_type_ktype = { + .sysfs_ops = &mdev_type_sysfs_ops, + .release = mdev_type_release, +}; + +struct mdev_type *add_mdev_supported_type(struct parent_device *parent, + struct attribute_group *group) +{ + struct mdev_type *type; + int ret; + + if (!group->name) { + pr_err("%s: Type name empty!\n", __func__); + return ERR_PTR(-EINVAL); + } + + type = kzalloc(sizeof(*type), GFP_KERNEL); + if (!type) + return ERR_PTR(-ENOMEM); + + type->kobj.kset = parent->mdev_types_kset; + + ret = kobject_init_and_add(&type->kobj, &mdev_type_ktype, NULL, + "%s-%s", dev_driver_string(parent->dev), + group->name); + if (ret) { + kfree(type); + return ERR_PTR(ret); + } + + ret = sysfs_create_file(&type->kobj, &mdev_type_attr_create.attr); + if (ret) + goto attr_create_failed; + + type->devices_kobj = kobject_create_and_add("devices", &type->kobj); + if (!type->devices_kobj) { + ret = -ENOMEM; + goto attr_devices_failed; + } + + ret = sysfs_create_files(&type->kobj, + (const struct attribute **)group->attrs); + if (ret) { + ret = -ENOMEM; + goto attrs_failed; + } + + type->group = group; + type->parent = parent; + return type; + +attrs_failed: + kobject_put(type->devices_kobj); +attr_devices_failed: + sysfs_remove_file(&type->kobj, &mdev_type_attr_create.attr); +attr_create_failed: + kobject_del(&type->kobj); + kobject_put(&type->kobj); + return ERR_PTR(ret); +} + +static void remove_mdev_supported_type(struct mdev_type *type) +{ + sysfs_remove_files(&type->kobj, + (const struct attribute **)type->group->attrs); + kobject_put(type->devices_kobj); + sysfs_remove_file(&type->kobj, &mdev_type_attr_create.attr); + kobject_del(&type->kobj); + kobject_put(&type->kobj); +} + +static int add_mdev_supported_type_groups(struct parent_device *parent) +{ + int i; + + for (i = 0; parent->ops->supported_type_groups[i]; i++) 
{ + struct mdev_type *type; + + type = add_mdev_supported_type(parent, + parent->ops->supported_type_groups[i]); + if (IS_ERR(type)) { + struct mdev_type *ltype, *tmp; + + list_for_each_entry_safe(ltype, tmp, &parent->type_list, + next) { + list_del(<ype->next); + remove_mdev_supported_type(ltype); + } + return PTR_ERR(type); + } + list_add(&type->next, &parent->type_list); + } + return 0; +} + +/* mdev sysfs functions */ +void parent_remove_sysfs_files(struct parent_device *parent) +{ + struct mdev_type *type, *tmp; + + list_for_each_entry_safe(type, tmp, &parent->type_list, next) { + list_del(&type->next); + remove_mdev_supported_type(type); + } + + sysfs_remove_groups(&parent->dev->kobj, parent->ops->dev_attr_groups); + kset_unregister(parent->mdev_types_kset); +} + +int parent_create_sysfs_files(struct parent_device *parent) +{ + int ret; + + parent->mdev_types_kset = kset_create_and_add("mdev_supported_types", + NULL, &parent->dev->kobj); + + if (!parent->mdev_types_kset) + return -ENOMEM; + + INIT_LIST_HEAD(&parent->type_list); + + ret = sysfs_create_groups(&parent->dev->kobj, + parent->ops->dev_attr_groups); + if (ret) + goto create_err; + + ret = add_mdev_supported_type_groups(parent); + if (ret) + sysfs_remove_groups(&parent->dev->kobj, + parent->ops->dev_attr_groups); + else + return ret; + +create_err: + kset_unregister(parent->mdev_types_kset); + return ret; +} + +static ssize_t remove_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val; + + if (kstrtoul(buf, 0, &val) < 0) + return -EINVAL; + + if (val && device_remove_file_self(dev, attr)) { + int ret; + + ret = mdev_device_remove(dev, false); + if (ret) { + device_create_file(dev, attr); + return ret; + } + } + + return count; +} + +static DEVICE_ATTR_WO(remove); + +static const struct attribute *mdev_device_attrs[] = { + &dev_attr_remove.attr, + NULL, +}; + +int mdev_create_sysfs_files(struct device *dev, struct mdev_type *type) +{ + int ret; + + ret = sysfs_create_files(&dev->kobj, mdev_device_attrs); + if (ret) + return ret; + + ret = sysfs_create_link(type->devices_kobj, &dev->kobj, dev_name(dev)); + if (ret) + goto device_link_failed; + + ret = sysfs_create_link(&dev->kobj, &type->kobj, "mdev_type"); + if (ret) + goto type_link_failed; + + return ret; + +type_link_failed: + sysfs_remove_link(type->devices_kobj, dev_name(dev)); +device_link_failed: + sysfs_remove_files(&dev->kobj, mdev_device_attrs); + return ret; +} + +void mdev_remove_sysfs_files(struct device *dev, struct mdev_type *type) +{ + sysfs_remove_link(&dev->kobj, "mdev_type"); + sysfs_remove_link(type->devices_kobj, dev_name(dev)); + sysfs_remove_files(&dev->kobj, mdev_device_attrs); +} diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c new file mode 100644 index 000000000000..ffc36758cb84 --- /dev/null +++ b/drivers/vfio/mdev/vfio_mdev.c @@ -0,0 +1,148 @@ +/* + * VFIO based driver for Mediated device + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/vfio.h> +#include <linux/mdev.h> + +#include "mdev_private.h" + +#define DRIVER_VERSION "0.1" +#define DRIVER_AUTHOR "NVIDIA Corporation" +#define DRIVER_DESC "VFIO based driver for Mediated device" + +static int vfio_mdev_open(void *device_data) +{ + struct mdev_device *mdev = device_data; + struct parent_device *parent = mdev->parent; + int ret; + + if (unlikely(!parent->ops->open)) + return -EINVAL; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + ret = parent->ops->open(mdev); + if (ret) + module_put(THIS_MODULE); + + return ret; +} + +static void vfio_mdev_release(void *device_data) +{ + struct mdev_device *mdev = device_data; + struct parent_device *parent = mdev->parent; + + if (likely(parent->ops->release)) + parent->ops->release(mdev); + + module_put(THIS_MODULE); +} + +static long vfio_mdev_unlocked_ioctl(void *device_data, + unsigned int cmd, unsigned long arg) +{ + struct mdev_device *mdev = device_data; + struct parent_device *parent = mdev->parent; + + if (unlikely(!parent->ops->ioctl)) + return -EINVAL; + + return parent->ops->ioctl(mdev, cmd, arg); +} + +static ssize_t vfio_mdev_read(void *device_data, char __user *buf, + size_t count, loff_t *ppos) +{ + struct mdev_device *mdev = device_data; + struct parent_device *parent = mdev->parent; + + if (unlikely(!parent->ops->read)) + return -EINVAL; + + return parent->ops->read(mdev, buf, count, ppos); +} + +static ssize_t vfio_mdev_write(void *device_data, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct mdev_device *mdev = device_data; + struct parent_device *parent = mdev->parent; + + if (unlikely(!parent->ops->write)) + return -EINVAL; + + return parent->ops->write(mdev, buf, count, ppos); +} + +static int vfio_mdev_mmap(void *device_data, struct vm_area_struct *vma) +{ + struct mdev_device *mdev = device_data; + struct parent_device *parent = mdev->parent; + + if (unlikely(!parent->ops->mmap)) + return -EINVAL; + + return parent->ops->mmap(mdev, vma); +} + +static const struct vfio_device_ops vfio_mdev_dev_ops = { + .name = "vfio-mdev", + .open = vfio_mdev_open, + .release = vfio_mdev_release, + .ioctl = vfio_mdev_unlocked_ioctl, + .read = vfio_mdev_read, + .write = vfio_mdev_write, + .mmap = vfio_mdev_mmap, +}; + +int vfio_mdev_probe(struct device *dev) +{ + struct mdev_device *mdev = to_mdev_device(dev); + + return vfio_add_group_dev(dev, &vfio_mdev_dev_ops, mdev); +} + +void vfio_mdev_remove(struct device *dev) +{ + vfio_del_group_dev(dev); +} + +struct mdev_driver vfio_mdev_driver = { + .name = "vfio_mdev", + .probe = vfio_mdev_probe, + .remove = vfio_mdev_remove, +}; + +static int __init vfio_mdev_init(void) +{ + return mdev_register_driver(&vfio_mdev_driver, THIS_MODULE); +} + +static void __exit vfio_mdev_exit(void) +{ + mdev_unregister_driver(&vfio_mdev_driver); +} + +module_init(vfio_mdev_init) +module_exit(vfio_mdev_exit) + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 031bc08d000d..dcd7c2a99618 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -558,10 +558,9 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev, static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev, struct vfio_info_cap *caps) { - struct vfio_info_cap_header *header; 
struct vfio_region_info_cap_sparse_mmap *sparse; size_t end, size; - int nr_areas = 2, i = 0; + int nr_areas = 2, i = 0, ret; end = pci_resource_len(vdev->pdev, vdev->msix_bar); @@ -572,13 +571,10 @@ static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev, size = sizeof(*sparse) + (nr_areas * sizeof(*sparse->areas)); - header = vfio_info_cap_add(caps, size, - VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1); - if (IS_ERR(header)) - return PTR_ERR(header); + sparse = kzalloc(size, GFP_KERNEL); + if (!sparse) + return -ENOMEM; - sparse = container_of(header, - struct vfio_region_info_cap_sparse_mmap, header); sparse->nr_areas = nr_areas; if (vdev->msix_offset & PAGE_MASK) { @@ -594,26 +590,11 @@ static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev, i++; } - return 0; -} - -static int region_type_cap(struct vfio_pci_device *vdev, - struct vfio_info_cap *caps, - unsigned int type, unsigned int subtype) -{ - struct vfio_info_cap_header *header; - struct vfio_region_info_cap_type *cap; - - header = vfio_info_cap_add(caps, sizeof(*cap), - VFIO_REGION_INFO_CAP_TYPE, 1); - if (IS_ERR(header)) - return PTR_ERR(header); - - cap = container_of(header, struct vfio_region_info_cap_type, header); - cap->type = type; - cap->subtype = subtype; + ret = vfio_info_add_capability(caps, VFIO_REGION_INFO_CAP_SPARSE_MMAP, + sparse); + kfree(sparse); - return 0; + return ret; } int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, @@ -752,6 +733,9 @@ static long vfio_pci_ioctl(void *device_data, break; default: + { + struct vfio_region_info_cap_type cap_type; + if (info.index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) return -EINVAL; @@ -762,11 +746,16 @@ static long vfio_pci_ioctl(void *device_data, info.size = vdev->region[i].size; info.flags = vdev->region[i].flags; - ret = region_type_cap(vdev, &caps, - vdev->region[i].type, - vdev->region[i].subtype); + cap_type.type = vdev->region[i].type; + cap_type.subtype = vdev->region[i].subtype; + + ret = vfio_info_add_capability(&caps, + VFIO_REGION_INFO_CAP_TYPE, + &cap_type); if (ret) return ret; + + } } if (caps.size) { @@ -829,45 +818,25 @@ static long vfio_pci_ioctl(void *device_data, } else if (cmd == VFIO_DEVICE_SET_IRQS) { struct vfio_irq_set hdr; - size_t size; u8 *data = NULL; int max, ret = 0; + size_t data_size = 0; minsz = offsetofend(struct vfio_irq_set, count); if (copy_from_user(&hdr, (void __user *)arg, minsz)) return -EFAULT; - if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS || - hdr.count >= (U32_MAX - hdr.start) || - hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | - VFIO_IRQ_SET_ACTION_TYPE_MASK)) - return -EINVAL; - max = vfio_pci_get_irq_count(vdev, hdr.index); - if (hdr.start >= max || hdr.start + hdr.count > max) - return -EINVAL; - switch (hdr.flags & VFIO_IRQ_SET_DATA_TYPE_MASK) { - case VFIO_IRQ_SET_DATA_NONE: - size = 0; - break; - case VFIO_IRQ_SET_DATA_BOOL: - size = sizeof(uint8_t); - break; - case VFIO_IRQ_SET_DATA_EVENTFD: - size = sizeof(int32_t); - break; - default: - return -EINVAL; - } - - if (size) { - if (hdr.argsz - minsz < hdr.count * size) - return -EINVAL; + ret = vfio_set_irqs_validate_and_prepare(&hdr, max, + VFIO_PCI_NUM_IRQS, &data_size); + if (ret) + return ret; + if (data_size) { data = memdup_user((void __user *)(arg + minsz), - hdr.count * size); + data_size); if (IS_ERR(data)) return PTR_ERR(data); } diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 65d4a3015542..e4220ca8ca27 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ 
b/drivers/vfio/pci/vfio_pci_config.c @@ -152,7 +152,7 @@ static int vfio_user_config_read(struct pci_dev *pdev, int offset, *val = cpu_to_le32(tmp_val); - return pcibios_err_to_errno(ret); + return ret; } static int vfio_user_config_write(struct pci_dev *pdev, int offset, @@ -173,7 +173,7 @@ static int vfio_user_config_write(struct pci_dev *pdev, int offset, break; } - return pcibios_err_to_errno(ret); + return ret; } static int vfio_default_config_read(struct vfio_pci_device *vdev, int pos, @@ -257,7 +257,7 @@ static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos, ret = vfio_user_config_read(vdev->pdev, pos, val, count); if (ret) - return pcibios_err_to_errno(ret); + return ret; if (pos >= PCI_CFG_SPACE_SIZE) { /* Extended cap header mangling */ if (offset < 4) @@ -295,7 +295,7 @@ static int vfio_raw_config_read(struct vfio_pci_device *vdev, int pos, ret = vfio_user_config_read(vdev->pdev, pos, val, count); if (ret) - return pcibios_err_to_errno(ret); + return ret; return count; } @@ -1089,7 +1089,7 @@ static int vfio_msi_config_write(struct vfio_pci_device *vdev, int pos, start + PCI_MSI_FLAGS, flags); if (ret) - return pcibios_err_to_errno(ret); + return ret; } return count; diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index d78142830754..4c27f4be3c3d 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -364,36 +364,21 @@ static long vfio_platform_ioctl(void *device_data, struct vfio_irq_set hdr; u8 *data = NULL; int ret = 0; + size_t data_size = 0; minsz = offsetofend(struct vfio_irq_set, count); if (copy_from_user(&hdr, (void __user *)arg, minsz)) return -EFAULT; - if (hdr.argsz < minsz) - return -EINVAL; - - if (hdr.index >= vdev->num_irqs) - return -EINVAL; - - if (hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | - VFIO_IRQ_SET_ACTION_TYPE_MASK)) - return -EINVAL; - - if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) { - size_t size; - - if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL) - size = sizeof(uint8_t); - else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD) - size = sizeof(int32_t); - else - return -EINVAL; - - if (hdr.argsz - minsz < size) - return -EINVAL; + ret = vfio_set_irqs_validate_and_prepare(&hdr, vdev->num_irqs, + vdev->num_irqs, &data_size); + if (ret) + return ret; - data = memdup_user((void __user *)(arg + minsz), size); + if (data_size) { + data = memdup_user((void __user *)(arg + minsz), + data_size); if (IS_ERR(data)) return PTR_ERR(data); } diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index d1d70e0b011b..9901c4671e2f 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -86,6 +86,8 @@ struct vfio_group { struct mutex unbound_lock; atomic_t opened; bool noiommu; + struct kvm *kvm; + struct blocking_notifier_head notifier; }; struct vfio_device { @@ -339,6 +341,7 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group) #ifdef CONFIG_VFIO_NOIOMMU group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu); #endif + BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); group->nb.notifier_call = vfio_iommu_group_notifier; @@ -480,6 +483,21 @@ static struct vfio_group *vfio_group_get_from_minor(int minor) return group; } +static struct vfio_group *vfio_group_get_from_dev(struct device *dev) +{ + struct iommu_group *iommu_group; + struct vfio_group *group; + + iommu_group = iommu_group_get(dev); + if (!iommu_group) + return NULL; + + group = vfio_group_get_from_iommu(iommu_group); + iommu_group_put(iommu_group); 
+ + return group; +} + /** * Device objects - create, release, get, put, search */ @@ -811,16 +829,10 @@ EXPORT_SYMBOL_GPL(vfio_add_group_dev); */ struct vfio_device *vfio_device_get_from_dev(struct device *dev) { - struct iommu_group *iommu_group; struct vfio_group *group; struct vfio_device *device; - iommu_group = iommu_group_get(dev); - if (!iommu_group) - return NULL; - - group = vfio_group_get_from_iommu(iommu_group); - iommu_group_put(iommu_group); + group = vfio_group_get_from_dev(dev); if (!group) return NULL; @@ -1376,6 +1388,23 @@ static bool vfio_group_viable(struct vfio_group *group) group, vfio_dev_viable) == 0); } +static int vfio_group_add_container_user(struct vfio_group *group) +{ + if (!atomic_inc_not_zero(&group->container_users)) + return -EINVAL; + + if (group->noiommu) { + atomic_dec(&group->container_users); + return -EPERM; + } + if (!group->container->iommu_driver || !vfio_group_viable(group)) { + atomic_dec(&group->container_users); + return -EINVAL; + } + + return 0; +} + static const struct file_operations vfio_device_fops; static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) @@ -1555,6 +1584,9 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) filep->private_data = NULL; + /* Any user didn't unregister? */ + WARN_ON(group->notifier.head); + vfio_group_try_dissolve_container(group); atomic_dec(&group->opened); @@ -1685,23 +1717,14 @@ static const struct file_operations vfio_device_fops = { struct vfio_group *vfio_group_get_external_user(struct file *filep) { struct vfio_group *group = filep->private_data; + int ret; if (filep->f_op != &vfio_group_fops) return ERR_PTR(-EINVAL); - if (!atomic_inc_not_zero(&group->container_users)) - return ERR_PTR(-EINVAL); - - if (group->noiommu) { - atomic_dec(&group->container_users); - return ERR_PTR(-EPERM); - } - - if (!group->container->iommu_driver || - !vfio_group_viable(group)) { - atomic_dec(&group->container_users); - return ERR_PTR(-EINVAL); - } + ret = vfio_group_add_container_user(group); + if (ret) + return ERR_PTR(ret); vfio_group_get(group); @@ -1763,7 +1786,7 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps, header->version = version; /* Add to the end of the capability chain */ - for (tmp = caps->buf; tmp->next; tmp = (void *)tmp + tmp->next) + for (tmp = buf; tmp->next; tmp = buf + tmp->next) ; /* nothing */ tmp->next = caps->size; @@ -1776,11 +1799,403 @@ EXPORT_SYMBOL_GPL(vfio_info_cap_add); void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset) { struct vfio_info_cap_header *tmp; + void *buf = (void *)caps->buf; - for (tmp = caps->buf; tmp->next; tmp = (void *)tmp + tmp->next - offset) + for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset) tmp->next += offset; } -EXPORT_SYMBOL_GPL(vfio_info_cap_shift); +EXPORT_SYMBOL(vfio_info_cap_shift); + +static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type) +{ + struct vfio_info_cap_header *header; + struct vfio_region_info_cap_sparse_mmap *sparse_cap, *sparse = cap_type; + size_t size; + + size = sizeof(*sparse) + sparse->nr_areas * sizeof(*sparse->areas); + header = vfio_info_cap_add(caps, size, + VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1); + if (IS_ERR(header)) + return PTR_ERR(header); + + sparse_cap = container_of(header, + struct vfio_region_info_cap_sparse_mmap, header); + sparse_cap->nr_areas = sparse->nr_areas; + memcpy(sparse_cap->areas, sparse->areas, + sparse->nr_areas * sizeof(*sparse->areas)); + return 0; +} + +static int region_type_cap(struct 
vfio_info_cap *caps, void *cap_type) +{ + struct vfio_info_cap_header *header; + struct vfio_region_info_cap_type *type_cap, *cap = cap_type; + + header = vfio_info_cap_add(caps, sizeof(*cap), + VFIO_REGION_INFO_CAP_TYPE, 1); + if (IS_ERR(header)) + return PTR_ERR(header); + + type_cap = container_of(header, struct vfio_region_info_cap_type, + header); + type_cap->type = cap->type; + type_cap->subtype = cap->subtype; + return 0; +} + +int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id, + void *cap_type) +{ + int ret = -EINVAL; + + if (!cap_type) + return 0; + + switch (cap_type_id) { + case VFIO_REGION_INFO_CAP_SPARSE_MMAP: + ret = sparse_mmap_cap(caps, cap_type); + break; + + case VFIO_REGION_INFO_CAP_TYPE: + ret = region_type_cap(caps, cap_type); + break; + } + + return ret; +} +EXPORT_SYMBOL(vfio_info_add_capability); + +int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs, + int max_irq_type, size_t *data_size) +{ + unsigned long minsz; + size_t size; + + minsz = offsetofend(struct vfio_irq_set, count); + + if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) || + (hdr->count >= (U32_MAX - hdr->start)) || + (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | + VFIO_IRQ_SET_ACTION_TYPE_MASK))) + return -EINVAL; + + if (data_size) + *data_size = 0; + + if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs) + return -EINVAL; + + switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) { + case VFIO_IRQ_SET_DATA_NONE: + size = 0; + break; + case VFIO_IRQ_SET_DATA_BOOL: + size = sizeof(uint8_t); + break; + case VFIO_IRQ_SET_DATA_EVENTFD: + size = sizeof(int32_t); + break; + default: + return -EINVAL; + } + + if (size) { + if (hdr->argsz - minsz < hdr->count * size) + return -EINVAL; + + if (!data_size) + return -EINVAL; + + *data_size = hdr->count * size; + } + + return 0; +} +EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare); + +/* + * Pin a set of guest PFNs and return their associated host PFNs for the local + * domain only. + * @dev [in] : device + * @user_pfn [in]: array of user/guest PFNs to be pinned. + * @npage [in] : count of elements in user_pfn array. This count should not + * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. + * @prot [in] : protection flags + * @phys_pfn[out]: array of host PFNs + * Return error or number of pages pinned. + */ +int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage, + int prot, unsigned long *phys_pfn) +{ + struct vfio_container *container; + struct vfio_group *group; + struct vfio_iommu_driver *driver; + int ret; + + if (!dev || !user_pfn || !phys_pfn || !npage) + return -EINVAL; + + if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) + return -E2BIG; + + group = vfio_group_get_from_dev(dev); + if (!group) + return -ENODEV; + + ret = vfio_group_add_container_user(group); + if (ret) + goto err_pin_pages; + + container = group->container; + down_read(&container->group_lock); + + driver = container->iommu_driver; + if (likely(driver && driver->ops->pin_pages)) + ret = driver->ops->pin_pages(container->iommu_data, user_pfn, + npage, prot, phys_pfn); + else + ret = -ENOTTY; + + up_read(&container->group_lock); + vfio_group_try_dissolve_container(group); + +err_pin_pages: + vfio_group_put(group); + return ret; +} +EXPORT_SYMBOL(vfio_pin_pages); + +/* + * Unpin a set of host PFNs for the local domain only. + * @dev [in] : device + * @user_pfn [in]: array of user/guest PFNs to be unpinned. + * @npage [in] : count of elements in user_pfn array. This count should not + * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. + * Return error or number of pages unpinned. + */ +int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage) +{ + struct vfio_container *container; + struct vfio_group *group; + struct vfio_iommu_driver *driver; + int ret; + + if (!dev || !user_pfn || !npage) + return -EINVAL; + + if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) + return -E2BIG; + + group = vfio_group_get_from_dev(dev); + if (!group) + return -ENODEV; + + ret = vfio_group_add_container_user(group); + if (ret) + goto err_unpin_pages; + + container = group->container; + down_read(&container->group_lock); + + driver = container->iommu_driver; + if (likely(driver && driver->ops->unpin_pages)) + ret = driver->ops->unpin_pages(container->iommu_data, user_pfn, + npage); + else + ret = -ENOTTY; + + up_read(&container->group_lock); + vfio_group_try_dissolve_container(group); + +err_unpin_pages: + vfio_group_put(group); + return ret; +} +EXPORT_SYMBOL(vfio_unpin_pages);
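To illustrate how these exported helpers are meant to be consumed, here is a rough sketch of a vendor driver's VFIO_DEVICE_SET_IRQS ioctl path built on vfio_set_irqs_validate_and_prepare(); the my_apply_irqs() helper and the surrounding ioctl plumbing are assumptions for illustration, not part of this patch:

static int my_set_irqs_ioctl(struct mdev_device *mdev, unsigned long arg)
{
	struct vfio_irq_set hdr;
	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
	size_t data_size = 0;
	u8 *data = NULL;
	int ret;

	if (copy_from_user(&hdr, (void __user *)arg, minsz))
		return -EFAULT;

	/* Validates argsz/index/flags and computes the trailing data size */
	ret = vfio_set_irqs_validate_and_prepare(&hdr, VFIO_PCI_NUM_IRQS,
						 VFIO_PCI_NUM_IRQS, &data_size);
	if (ret)
		return ret;

	if (data_size) {
		data = memdup_user((void __user *)(arg + minsz), data_size);
		if (IS_ERR(data))
			return PTR_ERR(data);
	}

	ret = my_apply_irqs(mdev, hdr.flags, hdr.index, hdr.start,
			    hdr.count, data);	/* hypothetical helper */
	kfree(data);
	return ret;
}

In the same spirit, a minimal sketch of pinning a single guest page for DMA and releasing it again; the gfn is assumed to come from the vendor driver's own translation state, and &mdev->dev is the struct device embedded in the mediated device:

static int my_map_one_page(struct mdev_device *mdev, unsigned long gfn,
			   unsigned long *hpfn)
{
	int ret;

	/* On success the return value is the number of pages pinned, here 1 */
	ret = vfio_pin_pages(&mdev->dev, &gfn, 1,
			     IOMMU_READ | IOMMU_WRITE, hpfn);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	return 0;
}

static void my_unmap_one_page(struct mdev_device *mdev, unsigned long gfn)
{
	vfio_unpin_pages(&mdev->dev, &gfn, 1);
}

+ +static int vfio_register_iommu_notifier(struct vfio_group *group, + unsigned long *events, + struct notifier_block *nb) +{ + struct vfio_container *container; + struct vfio_iommu_driver *driver; + int ret; + + ret = vfio_group_add_container_user(group); + if (ret) + return -EINVAL; + + container = group->container; + down_read(&container->group_lock); + + driver = container->iommu_driver; + if (likely(driver && driver->ops->register_notifier)) + ret = driver->ops->register_notifier(container->iommu_data, + events, nb); + else + ret = -ENOTTY; + + up_read(&container->group_lock); + vfio_group_try_dissolve_container(group); + + return ret; +} + +static int vfio_unregister_iommu_notifier(struct vfio_group *group, + struct notifier_block *nb) +{ + struct vfio_container *container; + struct vfio_iommu_driver *driver; + int ret; + + ret = vfio_group_add_container_user(group); + if (ret) + return -EINVAL; + + container = group->container; + down_read(&container->group_lock); + + driver = container->iommu_driver; + if (likely(driver && driver->ops->unregister_notifier)) + ret = driver->ops->unregister_notifier(container->iommu_data, + nb); + else + ret = -ENOTTY; + + up_read(&container->group_lock); + vfio_group_try_dissolve_container(group); + + return ret; +} + +void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm) +{ + group->kvm = kvm; + blocking_notifier_call_chain(&group->notifier, + VFIO_GROUP_NOTIFY_SET_KVM, kvm); +} +EXPORT_SYMBOL_GPL(vfio_group_set_kvm); + +static int vfio_register_group_notifier(struct vfio_group *group, + unsigned long *events, + struct notifier_block *nb) +{ + struct vfio_container *container; + int ret; + bool set_kvm = false; + + if (*events & VFIO_GROUP_NOTIFY_SET_KVM) + set_kvm = true; + + /* clear known events */ + *events &= ~VFIO_GROUP_NOTIFY_SET_KVM; + + /* refuse to continue if still events remaining */ + if (*events) + return -EINVAL; + + ret = vfio_group_add_container_user(group); + if (ret) + return -EINVAL; + + container = group->container; + down_read(&container->group_lock); + + ret = blocking_notifier_chain_register(&group->notifier, nb); + + /* + * The attach of kvm to the vfio_group may have already happened, so + * replay it here upon registration. 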
+ */ + if (!ret && set_kvm && group->kvm) + blocking_notifier_call_chain(&group->notifier, + VFIO_GROUP_NOTIFY_SET_KVM, group->kvm); + + up_read(&container->group_lock); + vfio_group_try_dissolve_container(group); + + return ret; +} + +static int vfio_unregister_group_notifier(struct vfio_group *group, + struct notifier_block *nb) +{ + struct vfio_container *container; + int ret; + + ret = vfio_group_add_container_user(group); + if (ret) + return -EINVAL; + + container = group->container; + down_read(&container->group_lock); + + ret = blocking_notifier_chain_unregister(&group->notifier, nb); + + up_read(&container->group_lock); + vfio_group_try_dissolve_container(group); + + return ret; +} + +int vfio_register_notifier(struct device *dev, enum vfio_notify_type type, + unsigned long *events, struct notifier_block *nb) +{ + struct vfio_group *group; + int ret; + + if (!dev || !nb || !events || (*events == 0)) + return -EINVAL; + + group = vfio_group_get_from_dev(dev); + if (!group) + return -ENODEV; + + switch (type) { + case VFIO_IOMMU_NOTIFY: + ret = vfio_register_iommu_notifier(group, events, nb); + break; + case VFIO_GROUP_NOTIFY: + ret = vfio_register_group_notifier(group, events, nb); + break; + default: + ret = -EINVAL; + } + + vfio_group_put(group); + return ret; +} +EXPORT_SYMBOL(vfio_register_notifier); + +int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type, + struct notifier_block *nb) +{ + struct vfio_group *group; + int ret; + + if (!dev || !nb) + return -EINVAL; + + group = vfio_group_get_from_dev(dev); + if (!group) + return -ENODEV; + + switch (type) { + case VFIO_IOMMU_NOTIFY: + ret = vfio_unregister_iommu_notifier(group, nb); + break; + case VFIO_GROUP_NOTIFY: + ret = vfio_unregister_group_notifier(group, nb); + break; + default: + ret = -EINVAL; + } + + vfio_group_put(group); + return ret; +} +EXPORT_SYMBOL(vfio_unregister_notifier); /** * Module/class support diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 2ba19424e4a1..9815e45c23c4 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -36,6 +36,9 @@ #include <linux/uaccess.h> #include <linux/vfio.h> #include <linux/workqueue.h> +#include <linux/pid_namespace.h> +#include <linux/mdev.h> +#include <linux/notifier.h> #define DRIVER_VERSION "0.2" #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" @@ -55,8 +58,10 @@ MODULE_PARM_DESC(disable_hugepages, struct vfio_iommu { struct list_head domain_list; + struct vfio_domain *external_domain; /* domain for external user */ struct mutex lock; struct rb_root dma_list; + struct blocking_notifier_head notifier; bool v2; bool nesting; }; @@ -75,6 +80,9 @@ struct vfio_dma { unsigned long vaddr; /* Process virtual addr */ size_t size; /* Map size (bytes) */ int prot; /* IOMMU_READ/WRITE */ + bool iommu_mapped; + struct task_struct *task; + struct rb_root pfn_list; /* Ex-user pinned pfn list */ }; struct vfio_group { @@ -83,6 +91,21 @@ struct vfio_group { }; /* + * Guest RAM pinning working set or DMA target + */ +struct vfio_pfn { + struct rb_node node; + dma_addr_t iova; /* Device address */ + unsigned long pfn; /* Host pfn */ + atomic_t ref_count; +}; + +#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \ + (!list_empty(&iommu->domain_list)) + +static int put_pfn(unsigned long pfn, int prot); + +/* * This code handles mapping and unmapping of user data buffers * into DMA'ble space using the IOMMU */ @@ -130,6 +153,97 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old) rb_erase(&old->node, &iommu->dma_list); } +/* + * Helper Functions for host iova-pfn list + */ +static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova) +{ + struct vfio_pfn *vpfn; + struct rb_node *node = dma->pfn_list.rb_node; + + while (node) { + vpfn = rb_entry(node, struct vfio_pfn, node); + + if (iova < vpfn->iova) + node = node->rb_left; + else if (iova > vpfn->iova) + node = node->rb_right; + else + return vpfn; + } + return NULL; +} + +static void vfio_link_pfn(struct vfio_dma *dma, + struct vfio_pfn *new) +{ + struct rb_node **link, *parent = NULL; + struct vfio_pfn *vpfn; + + link = &dma->pfn_list.rb_node; + while (*link) { + parent = *link; + vpfn = rb_entry(parent, struct vfio_pfn, node); + + if (new->iova < vpfn->iova) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, &dma->pfn_list); +} + +static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old) +{ + rb_erase(&old->node, &dma->pfn_list); +} + +static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova, + unsigned long pfn) +{ + struct vfio_pfn *vpfn; + + vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL); + if (!vpfn) + return -ENOMEM; + + vpfn->iova = iova; + vpfn->pfn = pfn; + atomic_set(&vpfn->ref_count, 1); + vfio_link_pfn(dma, vpfn); + return 0; +} + +static void vfio_remove_from_pfn_list(struct vfio_dma *dma, + struct vfio_pfn *vpfn) +{ + vfio_unlink_pfn(dma, vpfn); + kfree(vpfn); +} + +static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma, + unsigned long iova) +{ + struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova); + + if (vpfn) + atomic_inc(&vpfn->ref_count); + return vpfn; +} + +static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn) +{ + int ret = 0; + + if (atomic_dec_and_test(&vpfn->ref_count)) { + ret = put_pfn(vpfn->pfn, dma->prot); + vfio_remove_from_pfn_list(dma, vpfn); + } + return ret; +} + struct vwork { struct mm_struct *mm; long npage; @@ -150,17 +264,22 @@ static void vfio_lock_acct_bg(struct work_struct *work) kfree(vwork); } -static void vfio_lock_acct(long npage) +static void vfio_lock_acct(struct task_struct *task, long npage) { struct vwork *vwork; struct mm_struct *mm; - if (!current->mm || !npage) + if (!npage) + return; + + mm = get_task_mm(task); + if (!mm) return; /* process exited or nothing to do */ - if (down_write_trylock(¤t->mm->mmap_sem)) { - current->mm->locked_vm += npage; - up_write(¤t->mm->mmap_sem); + if (down_write_trylock(&mm->mmap_sem)) { + mm->locked_vm += npage; + up_write(&mm->mmap_sem); + mmput(mm); return; } @@ -170,11 +289,8 @@ static void vfio_lock_acct(long npage) * wouldn't need this silliness */ vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL); - if (!vwork) - return; - mm = get_task_mm(current); - if (!mm) { - kfree(vwork); + if (!vwork) { + mmput(mm); return; } INIT_WORK(&vwork->work, vfio_lock_acct_bg); @@ -228,20 +344,36 @@ static int put_pfn(unsigned long pfn, int prot) return 0; } -static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) +static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, + int prot, unsigned long *pfn) { struct page *page[1]; struct vm_area_struct *vma; - int ret = -EFAULT; + int ret; + + if (mm == current->mm) { + ret = get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), + page); + } else { + unsigned int flags = 0; + + if (prot & IOMMU_WRITE) + flags |= FOLL_WRITE; + + down_read(&mm->mmap_sem); + ret = 
get_user_pages_remote(NULL, mm, vaddr, 1, flags, page, + NULL); + up_read(&mm->mmap_sem); + } - if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) { + if (ret == 1) { *pfn = page_to_pfn(page[0]); return 0; } - down_read(¤t->mm->mmap_sem); + down_read(&mm->mmap_sem); - vma = find_vma_intersection(current->mm, vaddr, vaddr + 1); + vma = find_vma_intersection(mm, vaddr, vaddr + 1); if (vma && vma->vm_flags & VM_PFNMAP) { *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; @@ -249,8 +381,7 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) ret = 0; } - up_read(¤t->mm->mmap_sem); - + up_read(&mm->mmap_sem); return ret; } @@ -259,88 +390,299 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) * the iommu can only map chunks of consecutive pfns anyway, so get the * first page and all consecutive pages with the same locking. */ -static long vfio_pin_pages(unsigned long vaddr, long npage, - int prot, unsigned long *pfn_base) +static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, + long npage, unsigned long *pfn_base) { - unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - bool lock_cap = capable(CAP_IPC_LOCK); - long ret, i; + unsigned long limit; + bool lock_cap = ns_capable(task_active_pid_ns(dma->task)->user_ns, + CAP_IPC_LOCK); + struct mm_struct *mm; + long ret, i = 0, lock_acct = 0; bool rsvd; + dma_addr_t iova = vaddr - dma->vaddr + dma->iova; - if (!current->mm) + mm = get_task_mm(dma->task); + if (!mm) return -ENODEV; - ret = vaddr_get_pfn(vaddr, prot, pfn_base); + ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base); if (ret) - return ret; + goto pin_pg_remote_exit; rsvd = is_invalid_reserved_pfn(*pfn_base); + limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) { - put_pfn(*pfn_base, prot); - pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, - limit << PAGE_SHIFT); - return -ENOMEM; + /* + * Reserved pages aren't counted against the user, externally pinned + * pages are already counted against the user. 
+ */ + if (!rsvd && !vfio_find_vpfn(dma, iova)) { + if (!lock_cap && mm->locked_vm + 1 > limit) { + put_pfn(*pfn_base, dma->prot); + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, + limit << PAGE_SHIFT); + ret = -ENOMEM; + goto pin_pg_remote_exit; + } + lock_acct++; } - if (unlikely(disable_hugepages)) { - if (!rsvd) - vfio_lock_acct(1); - return 1; - } + i++; + if (likely(!disable_hugepages)) { + /* Lock all the consecutive pages from pfn_base */ + for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; i < npage; + i++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) { + unsigned long pfn = 0; - /* Lock all the consecutive pages from pfn_base */ - for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) { - unsigned long pfn = 0; + ret = vaddr_get_pfn(mm, vaddr, dma->prot, &pfn); + if (ret) + break; - ret = vaddr_get_pfn(vaddr, prot, &pfn); - if (ret) - break; + if (pfn != *pfn_base + i || + rsvd != is_invalid_reserved_pfn(pfn)) { + put_pfn(pfn, dma->prot); + break; + } - if (pfn != *pfn_base + i || - rsvd != is_invalid_reserved_pfn(pfn)) { - put_pfn(pfn, prot); - break; + if (!rsvd && !vfio_find_vpfn(dma, iova)) { + if (!lock_cap && + mm->locked_vm + lock_acct + 1 > limit) { + put_pfn(pfn, dma->prot); + pr_warn("%s: RLIMIT_MEMLOCK (%ld) " + "exceeded\n", __func__, + limit << PAGE_SHIFT); + break; + } + lock_acct++; + } } + } - if (!rsvd && !lock_cap && - current->mm->locked_vm + i + 1 > limit) { - put_pfn(pfn, prot); - pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", - __func__, limit << PAGE_SHIFT); - break; + vfio_lock_acct(dma->task, lock_acct); + ret = i; + +pin_pg_remote_exit: + mmput(mm); + return ret; +} + +static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova, + unsigned long pfn, long npage, + bool do_accounting) +{ + long unlocked = 0, locked = 0; + long i; + + for (i = 0; i < npage; i++) { + if (put_pfn(pfn++, dma->prot)) { + unlocked++; + if (vfio_find_vpfn(dma, iova + (i << PAGE_SHIFT))) + locked++; } } - if (!rsvd) - vfio_lock_acct(i); + if (do_accounting) + vfio_lock_acct(dma->task, locked - unlocked); - return i; + return unlocked; } -static long vfio_unpin_pages(unsigned long pfn, long npage, - int prot, bool do_accounting) +static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, + unsigned long *pfn_base, bool do_accounting) { - unsigned long unlocked = 0; - long i; + unsigned long limit; + bool lock_cap = ns_capable(task_active_pid_ns(dma->task)->user_ns, + CAP_IPC_LOCK); + struct mm_struct *mm; + int ret; + bool rsvd; - for (i = 0; i < npage; i++) - unlocked += put_pfn(pfn++, prot); + mm = get_task_mm(dma->task); + if (!mm) + return -ENODEV; + + ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base); + if (ret) + goto pin_page_exit; + + rsvd = is_invalid_reserved_pfn(*pfn_base); + limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) { + put_pfn(*pfn_base, dma->prot); + pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n", + __func__, dma->task->comm, task_pid_nr(dma->task), + limit << PAGE_SHIFT); + ret = -ENOMEM; + goto pin_page_exit; + } + + if (!rsvd && do_accounting) + vfio_lock_acct(dma->task, 1); + ret = 1; + +pin_page_exit: + mmput(mm); + return ret; +} + +static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova, + bool do_accounting) +{ + int unlocked; + struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova); + + if (!vpfn) + return 0; + + unlocked = vfio_iova_put_vfio_pfn(dma, vpfn); if (do_accounting) - vfio_lock_acct(-unlocked); + 
vfio_lock_acct(dma->task, -unlocked); + + return unlocked; +} + +static int vfio_iommu_type1_pin_pages(void *iommu_data, + unsigned long *user_pfn, + int npage, int prot, + unsigned long *phys_pfn) +{ + struct vfio_iommu *iommu = iommu_data; + int i, j, ret; + unsigned long remote_vaddr; + struct vfio_dma *dma; + bool do_accounting; + + if (!iommu || !user_pfn || !phys_pfn) + return -EINVAL; + + /* Supported for v2 version only */ + if (!iommu->v2) + return -EACCES; + + mutex_lock(&iommu->lock); + + /* Fail if notifier list is empty */ + if ((!iommu->external_domain) || (!iommu->notifier.head)) { + ret = -EINVAL; + goto pin_done; + } + + /* + * If an iommu capable domain exists in the container then all pages are + * already pinned and accounted. Accounting should be done only if there + * is no iommu capable domain in the container. + */ + do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu); + + for (i = 0; i < npage; i++) { + dma_addr_t iova; + struct vfio_pfn *vpfn; + + iova = user_pfn[i] << PAGE_SHIFT; + dma = vfio_find_dma(iommu, iova, PAGE_SIZE); + if (!dma) { + ret = -EINVAL; + goto pin_unwind; + } + + if ((dma->prot & prot) != prot) { + ret = -EPERM; + goto pin_unwind; + } + + vpfn = vfio_iova_get_vfio_pfn(dma, iova); + if (vpfn) { + phys_pfn[i] = vpfn->pfn; + continue; + } + + remote_vaddr = dma->vaddr + iova - dma->iova; + ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i], + do_accounting); + if (ret <= 0) { + WARN_ON(!ret); + goto pin_unwind; + } + + ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]); + if (ret) { + vfio_unpin_page_external(dma, iova, do_accounting); + goto pin_unwind; + } + } + + ret = i; + goto pin_done; + +pin_unwind: + phys_pfn[i] = 0; + for (j = 0; j < i; j++) { + dma_addr_t iova; + + iova = user_pfn[j] << PAGE_SHIFT; + dma = vfio_find_dma(iommu, iova, PAGE_SIZE); + vfio_unpin_page_external(dma, iova, do_accounting); + phys_pfn[j] = 0; + } +pin_done: + mutex_unlock(&iommu->lock); + return ret; +} + +static int vfio_iommu_type1_unpin_pages(void *iommu_data, + unsigned long *user_pfn, + int npage) +{ + struct vfio_iommu *iommu = iommu_data; + bool do_accounting; + int i; + + if (!iommu || !user_pfn) + return -EINVAL; + + /* Supported for v2 version only */ + if (!iommu->v2) + return -EACCES; + + mutex_lock(&iommu->lock); + + if (!iommu->external_domain) { + mutex_unlock(&iommu->lock); + return -EINVAL; + } + + do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu); + for (i = 0; i < npage; i++) { + struct vfio_dma *dma; + dma_addr_t iova; + + iova = user_pfn[i] << PAGE_SHIFT; + dma = vfio_find_dma(iommu, iova, PAGE_SIZE); + if (!dma) + goto unpin_exit; + vfio_unpin_page_external(dma, iova, do_accounting); + } + +unpin_exit: + mutex_unlock(&iommu->lock); + return i > npage ? npage : (i > 0 ? i : -EINVAL); +} + +static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, + bool do_accounting) +{ + dma_addr_t iova = dma->iova, end = dma->iova + dma->size; + struct vfio_domain *domain, *d; + long unlocked = 0; + + if (!dma->size) - return; + return 0; + + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) + return 0; + + /* * We use the IOMMU to track the physical addresses, otherwise we'd * need a much more complicated tracking system. 
Unfortunately that @@ -382,21 +724,28 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma) if (WARN_ON(!unmapped)) break; - unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT, - unmapped >> PAGE_SHIFT, - dma->prot, false); + unlocked += vfio_unpin_pages_remote(dma, iova, + phys >> PAGE_SHIFT, + unmapped >> PAGE_SHIFT, + false); iova += unmapped; cond_resched(); } - vfio_lock_acct(-unlocked); + dma->iommu_mapped = false; + if (do_accounting) { + vfio_lock_acct(dma->task, -unlocked); + return 0; + } + return unlocked; } static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma) { - vfio_unmap_unpin(iommu, dma); + vfio_unmap_unpin(iommu, dma, true); vfio_unlink_dma(iommu, dma); + put_task_struct(dma->task); kfree(dma); } @@ -430,9 +779,9 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, struct vfio_iommu_type1_dma_unmap *unmap) { uint64_t mask; - struct vfio_dma *dma; + struct vfio_dma *dma, *dma_last = NULL; size_t unmapped = 0; - int ret = 0; + int ret = 0, retries = 0; mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1; @@ -442,7 +791,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, return -EINVAL; WARN_ON(mask & PAGE_MASK); - +again: mutex_lock(&iommu->lock); /* @@ -477,7 +826,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, * mappings within the range. */ if (iommu->v2) { - dma = vfio_find_dma(iommu, unmap->iova, 0); + dma = vfio_find_dma(iommu, unmap->iova, 1); if (dma && dma->iova != unmap->iova) { ret = -EINVAL; goto unlock; @@ -492,6 +841,38 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) { if (!iommu->v2 && unmap->iova > dma->iova) break; + /* + * Task with same address space who mapped this iova range is + * allowed to unmap the iova range. + */ + if (dma->task->mm != current->mm) + break; + + if (!RB_EMPTY_ROOT(&dma->pfn_list)) { + struct vfio_iommu_type1_dma_unmap nb_unmap; + + if (dma_last == dma) { + BUG_ON(++retries > 10); + } else { + dma_last = dma; + retries = 0; + } + + nb_unmap.iova = dma->iova; + nb_unmap.size = dma->size; + + /* + * Notify anyone (mdev vendor drivers) to invalidate and + * unmap iovas within the range we're about to unmap. + * Vendor drivers MUST unpin pages in response to an + * invalidation. + */ + mutex_unlock(&iommu->lock); + blocking_notifier_call_chain(&iommu->notifier, + VFIO_IOMMU_NOTIFY_DMA_UNMAP, + &nb_unmap); + goto again; + } unmapped += dma->size; vfio_remove_dma(iommu, dma); } @@ -558,17 +939,56 @@ unwind: return ret; } +static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma, + size_t map_size) +{ + dma_addr_t iova = dma->iova; + unsigned long vaddr = dma->vaddr; + size_t size = map_size; + long npage; + unsigned long pfn; + int ret = 0; + + while (size) { + /* Pin a contiguous chunk of memory */ + npage = vfio_pin_pages_remote(dma, vaddr + dma->size, + size >> PAGE_SHIFT, &pfn); + if (npage <= 0) { + WARN_ON(!npage); + ret = (int)npage; + break; + } + + /* Map it! 
*/ + ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, + dma->prot); + if (ret) { + vfio_unpin_pages_remote(dma, iova + dma->size, pfn, + npage, true); + break; + } + + size -= npage << PAGE_SHIFT; + dma->size += npage << PAGE_SHIFT; + } + + dma->iommu_mapped = true; + + if (ret) + vfio_remove_dma(iommu, dma); + + return ret; +} + static int vfio_dma_do_map(struct vfio_iommu *iommu, struct vfio_iommu_type1_dma_map *map) { dma_addr_t iova = map->iova; unsigned long vaddr = map->vaddr; size_t size = map->size; - long npage; int ret = 0, prot = 0; uint64_t mask; struct vfio_dma *dma; - unsigned long pfn; /* Verify that none of our __u64 fields overflow */ if (map->size != size || map->vaddr != vaddr || map->iova != iova) @@ -594,47 +1014,33 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, mutex_lock(&iommu->lock); if (vfio_find_dma(iommu, iova, size)) { - mutex_unlock(&iommu->lock); - return -EEXIST; + ret = -EEXIST; + goto out_unlock; } dma = kzalloc(sizeof(*dma), GFP_KERNEL); if (!dma) { - mutex_unlock(&iommu->lock); - return -ENOMEM; + ret = -ENOMEM; + goto out_unlock; } dma->iova = iova; dma->vaddr = vaddr; dma->prot = prot; + get_task_struct(current); + dma->task = current; + dma->pfn_list = RB_ROOT; /* Insert zero-sized and grow as we map chunks of it */ vfio_link_dma(iommu, dma); - while (size) { - /* Pin a contiguous chunk of memory */ - npage = vfio_pin_pages(vaddr + dma->size, - size >> PAGE_SHIFT, prot, &pfn); - if (npage <= 0) { - WARN_ON(!npage); - ret = (int)npage; - break; - } - - /* Map it! */ - ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot); - if (ret) { - vfio_unpin_pages(pfn, npage, prot, true); - break; - } - - size -= npage << PAGE_SHIFT; - dma->size += npage << PAGE_SHIFT; - } - - if (ret) - vfio_remove_dma(iommu, dma); + /* Don't pin and map if container doesn't contain IOMMU capable domain*/ + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) + dma->size = size; + else + ret = vfio_pin_map_dma(iommu, dma, size); +out_unlock: mutex_unlock(&iommu->lock); return ret; } @@ -662,10 +1068,6 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, d = list_first_entry(&iommu->domain_list, struct vfio_domain, next); n = rb_first(&iommu->dma_list); - /* If there's not a domain, there better not be any mappings */ - if (WARN_ON(n && !d)) - return -EINVAL; - for (; n; n = rb_next(n)) { struct vfio_dma *dma; dma_addr_t iova; @@ -674,21 +1076,49 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, iova = dma->iova; while (iova < dma->iova + dma->size) { - phys_addr_t phys = iommu_iova_to_phys(d->domain, iova); + phys_addr_t phys; size_t size; - if (WARN_ON(!phys)) { - iova += PAGE_SIZE; - continue; + if (dma->iommu_mapped) { + phys_addr_t p; + dma_addr_t i; + + phys = iommu_iova_to_phys(d->domain, iova); + + if (WARN_ON(!phys)) { + iova += PAGE_SIZE; + continue; + } + + size = PAGE_SIZE; + p = phys + size; + i = iova + size; + while (i < dma->iova + dma->size && + p == iommu_iova_to_phys(d->domain, i)) { + size += PAGE_SIZE; + p += PAGE_SIZE; + i += PAGE_SIZE; + } + } else { + unsigned long pfn; + unsigned long vaddr = dma->vaddr + + (iova - dma->iova); + size_t n = dma->iova + dma->size - iova; + long npage; + + npage = vfio_pin_pages_remote(dma, vaddr, + n >> PAGE_SHIFT, + &pfn); + if (npage <= 0) { + WARN_ON(!npage); + ret = (int)npage; + return ret; + } + + phys = pfn << PAGE_SHIFT; + size = npage << PAGE_SHIFT; } - size = PAGE_SIZE; - - while (iova + size < dma->iova + dma->size && - phys + size == iommu_iova_to_phys(d->domain, - iova + size)) 
- size += PAGE_SIZE; - ret = iommu_map(domain->domain, iova, phys, size, dma->prot | domain->prot); if (ret) @@ -696,8 +1126,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, iova += size; } + dma->iommu_mapped = true; } - return 0; } @@ -734,22 +1164,39 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain) __free_pages(pages, order); } +static struct vfio_group *find_iommu_group(struct vfio_domain *domain, + struct iommu_group *iommu_group) +{ + struct vfio_group *g; + + list_for_each_entry(g, &domain->group_list, next) { + if (g->iommu_group == iommu_group) + return g; + } + + return NULL; +} + static int vfio_iommu_type1_attach_group(void *iommu_data, struct iommu_group *iommu_group) { struct vfio_iommu *iommu = iommu_data; - struct vfio_group *group, *g; + struct vfio_group *group; struct vfio_domain *domain, *d; - struct bus_type *bus = NULL; + struct bus_type *bus = NULL, *mdev_bus; int ret; mutex_lock(&iommu->lock); list_for_each_entry(d, &iommu->domain_list, next) { - list_for_each_entry(g, &d->group_list, next) { - if (g->iommu_group != iommu_group) - continue; + if (find_iommu_group(d, iommu_group)) { + mutex_unlock(&iommu->lock); + return -EINVAL; + } + } + if (iommu->external_domain) { + if (find_iommu_group(iommu->external_domain, iommu_group)) { mutex_unlock(&iommu->lock); return -EINVAL; } @@ -769,6 +1216,25 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, if (ret) goto out_free; + mdev_bus = symbol_get(mdev_bus_type); + + if (mdev_bus) { + if ((bus == mdev_bus) && !iommu_present(bus)) { + symbol_put(mdev_bus_type); + if (!iommu->external_domain) { + INIT_LIST_HEAD(&domain->group_list); + iommu->external_domain = domain; + } else + kfree(domain); + + list_add(&group->next, + &iommu->external_domain->group_list); + mutex_unlock(&iommu->lock); + return 0; + } + symbol_put(mdev_bus_type); + } + domain->domain = iommu_domain_alloc(bus); if (!domain->domain) { ret = -EIO; @@ -859,6 +1325,46 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu) vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node)); } +static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu) +{ + struct rb_node *n, *p; + + n = rb_first(&iommu->dma_list); + for (; n; n = rb_next(n)) { + struct vfio_dma *dma; + long locked = 0, unlocked = 0; + + dma = rb_entry(n, struct vfio_dma, node); + unlocked += vfio_unmap_unpin(iommu, dma, false); + p = rb_first(&dma->pfn_list); + for (; p; p = rb_next(p)) { + struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, + node); + + if (!is_invalid_reserved_pfn(vpfn->pfn)) + locked++; + } + vfio_lock_acct(dma->task, locked - unlocked); + } +} + +static void vfio_sanity_check_pfn_list(struct vfio_iommu *iommu) +{ + struct rb_node *n; + + n = rb_first(&iommu->dma_list); + for (; n; n = rb_next(n)) { + struct vfio_dma *dma; + + dma = rb_entry(n, struct vfio_dma, node); + + if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list))) + break; + } + /* mdev vendor driver must unregister notifier */ + WARN_ON(iommu->notifier.head); +} + static void vfio_iommu_type1_detach_group(void *iommu_data, struct iommu_group *iommu_group) { @@ -868,31 +1374,55 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, mutex_lock(&iommu->lock); - list_for_each_entry(domain, &iommu->domain_list, next) { - list_for_each_entry(group, &domain->group_list, next) { - if (group->iommu_group != iommu_group) - continue; - - iommu_detach_group(domain->domain, iommu_group); + if (iommu->external_domain) { + group = find_iommu_group(iommu->external_domain, 
iommu_group); + if (group) { list_del(&group->next); kfree(group); - /* - * Group ownership provides privilege, if the group - * list is empty, the domain goes away. If it's the - * last domain, then all the mappings go away too. - */ - if (list_empty(&domain->group_list)) { - if (list_is_singular(&iommu->domain_list)) + + if (list_empty(&iommu->external_domain->group_list)) { + vfio_sanity_check_pfn_list(iommu); + + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) vfio_iommu_unmap_unpin_all(iommu); - iommu_domain_free(domain->domain); - list_del(&domain->next); - kfree(domain); + + kfree(iommu->external_domain); + iommu->external_domain = NULL; + } + goto detach_group_done; + } + } + + list_for_each_entry(domain, &iommu->domain_list, next) { + group = find_iommu_group(domain, iommu_group); + if (!group) + continue; + + iommu_detach_group(domain->domain, iommu_group); + list_del(&group->next); + kfree(group); + /* + * Group ownership provides privilege, if the group list is + * empty, the domain goes away. If it's the last domain with + * iommu and external domain doesn't exist, then all the + * mappings go away too. If it's the last domain with iommu and + * external domain exist, update accounting + */ + if (list_empty(&domain->group_list)) { + if (list_is_singular(&iommu->domain_list)) { + if (!iommu->external_domain) + vfio_iommu_unmap_unpin_all(iommu); + else + vfio_iommu_unmap_unpin_reaccount(iommu); } - goto done; + iommu_domain_free(domain->domain); + list_del(&domain->next); + kfree(domain); } + break; } -done: +detach_group_done: mutex_unlock(&iommu->lock); } @@ -920,31 +1450,46 @@ static void *vfio_iommu_type1_open(unsigned long arg) INIT_LIST_HEAD(&iommu->domain_list); iommu->dma_list = RB_ROOT; mutex_init(&iommu->lock); + BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier); return iommu; } +static void vfio_release_domain(struct vfio_domain *domain, bool external) +{ + struct vfio_group *group, *group_tmp; + + list_for_each_entry_safe(group, group_tmp, + &domain->group_list, next) { + if (!external) + iommu_detach_group(domain->domain, group->iommu_group); + list_del(&group->next); + kfree(group); + } + + if (!external) + iommu_domain_free(domain->domain); +} + static void vfio_iommu_type1_release(void *iommu_data) { struct vfio_iommu *iommu = iommu_data; struct vfio_domain *domain, *domain_tmp; - struct vfio_group *group, *group_tmp; + + if (iommu->external_domain) { + vfio_release_domain(iommu->external_domain, true); + vfio_sanity_check_pfn_list(iommu); + kfree(iommu->external_domain); + } vfio_iommu_unmap_unpin_all(iommu); list_for_each_entry_safe(domain, domain_tmp, &iommu->domain_list, next) { - list_for_each_entry_safe(group, group_tmp, - &domain->group_list, next) { - iommu_detach_group(domain->domain, group->iommu_group); - list_del(&group->next); - kfree(group); - } - iommu_domain_free(domain->domain); + vfio_release_domain(domain, false); list_del(&domain->next); kfree(domain); } - kfree(iommu); } @@ -1040,14 +1585,42 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, return -ENOTTY; } +static int vfio_iommu_type1_register_notifier(void *iommu_data, + unsigned long *events, + struct notifier_block *nb) +{ + struct vfio_iommu *iommu = iommu_data; + + /* clear known events */ + *events &= ~VFIO_IOMMU_NOTIFY_DMA_UNMAP; + + /* refuse to register if still events remaining */ + if (*events) + return -EINVAL; + + return blocking_notifier_chain_register(&iommu->notifier, nb); +} + +static int vfio_iommu_type1_unregister_notifier(void *iommu_data, + struct notifier_block *nb) +{ 
+ struct vfio_iommu *iommu = iommu_data; + + return blocking_notifier_chain_unregister(&iommu->notifier, nb); +} + static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { - .name = "vfio-iommu-type1", - .owner = THIS_MODULE, - .open = vfio_iommu_type1_open, - .release = vfio_iommu_type1_release, - .ioctl = vfio_iommu_type1_ioctl, - .attach_group = vfio_iommu_type1_attach_group, - .detach_group = vfio_iommu_type1_detach_group, + .name = "vfio-iommu-type1", + .owner = THIS_MODULE, + .open = vfio_iommu_type1_open, + .release = vfio_iommu_type1_release, + .ioctl = vfio_iommu_type1_ioctl, + .attach_group = vfio_iommu_type1_attach_group, + .detach_group = vfio_iommu_type1_detach_group, + .pin_pages = vfio_iommu_type1_pin_pages, + .unpin_pages = vfio_iommu_type1_unpin_pages, + .register_notifier = vfio_iommu_type1_register_notifier, + .unregister_notifier = vfio_iommu_type1_unregister_notifier, }; static int __init vfio_iommu_type1_init(void) diff --git a/include/linux/mdev.h b/include/linux/mdev.h new file mode 100644 index 000000000000..ec819e9a115a --- /dev/null +++ b/include/linux/mdev.h @@ -0,0 +1,168 @@ +/* + * Mediated device definition + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef MDEV_H +#define MDEV_H + +/* Parent device */ +struct parent_device { + struct device *dev; + const struct parent_ops *ops; + + /* internal */ + struct kref ref; + struct mutex lock; + struct list_head next; + struct kset *mdev_types_kset; + struct list_head type_list; +}; + +/* Mediated device */ +struct mdev_device { + struct device dev; + struct parent_device *parent; + uuid_le uuid; + void *driver_data; + + /* internal */ + struct kref ref; + struct list_head next; + struct kobject *type_kobj; +}; + +/** + * struct parent_ops - Structure to be registered for each parent device to + * register the device with the mdev module. + * + * @owner: The module owner. + * @dev_attr_groups: Attributes of the parent device. + * @mdev_attr_groups: Attributes of the mediated device. + * @supported_type_groups: Attributes to define supported types. It is mandatory + * to provide supported types. + * @create: Called to allocate basic resources in parent device's + * driver for a particular mediated device. It is + * mandatory to provide create ops. + * @kobj: kobject of type for which 'create' is called. + * @mdev: mdev_device structure of the mediated device + * that is being created + * Returns integer: success (0) or error (< 0) + * @remove: Called to free resources in parent device's driver for + * a mediated device. It is mandatory to provide 'remove' + * ops. + * @mdev: mdev_device structure which is being + * destroyed + * Returns integer: success (0) or error (< 0) + * @open: Open mediated device. + * @mdev: mediated device. + * Returns integer: success (0) or error (< 0) + * @release: release mediated device + * @mdev: mediated device. + * @read: Read emulation callback + * @mdev: mediated device structure + * @buf: read buffer + * @count: number of bytes to read + * @ppos: address. + * Returns number of bytes read on success or error. + * @write: Write emulation callback + * @mdev: mediated device structure + * @buf: write buffer + * @count: number of bytes to be written + * @ppos: address. + * Returns number of bytes written on success or error. + * @ioctl: IOCTL callback + * @mdev: mediated device structure + * @cmd: ioctl command + * @arg: arguments to ioctl + * @mmap: mmap callback + * @mdev: mediated device structure + * @vma: vma structure + * A parent device that supports mediated devices should be registered with the + * mdev module along with its parent_ops structure. + **/ + +struct parent_ops { + struct module *owner; + const struct attribute_group **dev_attr_groups; + const struct attribute_group **mdev_attr_groups; + struct attribute_group **supported_type_groups; + + int (*create)(struct kobject *kobj, struct mdev_device *mdev); + int (*remove)(struct mdev_device *mdev); + int (*open)(struct mdev_device *mdev); + void (*release)(struct mdev_device *mdev); + ssize_t (*read)(struct mdev_device *mdev, char __user *buf, + size_t count, loff_t *ppos); + ssize_t (*write)(struct mdev_device *mdev, const char __user *buf, + size_t count, loff_t *ppos); + ssize_t (*ioctl)(struct mdev_device *mdev, unsigned int cmd, + unsigned long arg); + int (*mmap)(struct mdev_device *mdev, struct vm_area_struct *vma); +}; + +/* interface for exporting mdev supported type attributes */ +struct mdev_type_attribute { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, struct device *dev, char *buf); + ssize_t (*store)(struct kobject *kobj, struct device *dev, + const char *buf, size_t count); +}; + +#define MDEV_TYPE_ATTR(_name, _mode, _show, _store) \ +struct mdev_type_attribute mdev_type_attr_##_name = \ + __ATTR(_name, _mode, _show, _store) +#define MDEV_TYPE_ATTR_RW(_name) \ + struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_RW(_name) +#define MDEV_TYPE_ATTR_RO(_name) \ + struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_RO(_name) +#define MDEV_TYPE_ATTR_WO(_name) \ + struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_WO(_name) + +/** + * struct mdev_driver - Mediated device driver + * @name: driver name + * @probe: called when new device created + * @remove: called when device removed + * @driver: device driver structure + * + **/ +struct mdev_driver { + const char *name; + int (*probe)(struct device *dev); + void (*remove)(struct device *dev); + struct device_driver driver; +}; + +#define to_mdev_driver(drv) container_of(drv, struct mdev_driver, driver) +#define to_mdev_device(dev) container_of(dev, struct mdev_device, dev) + +static inline void *mdev_get_drvdata(struct mdev_device *mdev) +{ + return mdev->driver_data; +} + +static inline void mdev_set_drvdata(struct mdev_device *mdev, void *data) +{ + mdev->driver_data = data; +} + +extern struct bus_type mdev_bus_type; + +#define dev_is_mdev(d) ((d)->bus == &mdev_bus_type) + +extern int mdev_register_device(struct device *dev, + const struct parent_ops *ops); +extern void mdev_unregister_device(struct device *dev); + +extern int mdev_register_driver(struct mdev_driver *drv, struct module *owner); +extern void mdev_unregister_driver(struct mdev_driver *drv); + +#endif /* MDEV_H */
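To make the registration flow concrete, here is a compact sketch of a vendor driver wiring up parent_ops and registering its physical device. Every my_* name is illustrative; only the mandatory pieces (supported_type_groups, create, remove) plus the device_api and available_instances type attributes are shown, and the fixed instance count is an assumption:

#include <linux/mdev.h>
#include <linux/slab.h>
#include <linux/vfio.h>

struct my_state {
	int dummy;		/* per-mdev emulation state would live here */
};

static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
			       char *buf)
{
	return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
}
static MDEV_TYPE_ATTR_RO(device_api);

static ssize_t available_instances_show(struct kobject *kobj,
					struct device *dev, char *buf)
{
	return sprintf(buf, "%d\n", 1);	/* illustrative fixed count */
}
static MDEV_TYPE_ATTR_RO(available_instances);

static struct attribute *my_type_attrs[] = {
	&mdev_type_attr_device_api.attr,
	&mdev_type_attr_available_instances.attr,
	NULL,
};

static struct attribute_group my_type_group = {
	.name  = "my-type-1",	/* directory under mdev_supported_types/ */
	.attrs = my_type_attrs,
};

static struct attribute_group *my_type_groups[] = {
	&my_type_group,
	NULL,
};

static int my_create(struct kobject *kobj, struct mdev_device *mdev)
{
	struct my_state *state = kzalloc(sizeof(*state), GFP_KERNEL);

	if (!state)
		return -ENOMEM;
	mdev_set_drvdata(mdev, state);
	return 0;
}

static int my_remove(struct mdev_device *mdev)
{
	kfree(mdev_get_drvdata(mdev));
	return 0;
}

static const struct parent_ops my_parent_ops = {
	.owner			= THIS_MODULE,
	.supported_type_groups	= my_type_groups,	/* mandatory */
	.create			= my_create,		/* mandatory */
	.remove			= my_remove,		/* mandatory */
};

/* typically called from the physical device driver's probe path */
static int my_probe(struct device *dev)
{
	return mdev_register_device(dev, &my_parent_ops);
}

diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 0ecae0b1cd34..edf9b2cad277 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -75,7 +75,16 @@ struct vfio_iommu_driver_ops { struct iommu_group *group); void (*detach_group)(void *iommu_data, struct iommu_group *group); - + int (*pin_pages)(void *iommu_data, unsigned long *user_pfn, + int npage, int 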
prot, + unsigned long *phys_pfn); + int (*unpin_pages)(void *iommu_data, + unsigned long *user_pfn, int npage); + int (*register_notifier)(void *iommu_data, + unsigned long *events, + struct notifier_block *nb); + int (*unregister_notifier)(void *iommu_data, + struct notifier_block *nb); }; extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); @@ -92,6 +101,36 @@ extern int vfio_external_user_iommu_id(struct vfio_group *group); extern long vfio_external_check_extension(struct vfio_group *group, unsigned long arg); +#define VFIO_PIN_PAGES_MAX_ENTRIES (PAGE_SIZE/sizeof(unsigned long)) + +extern int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, + int npage, int prot, unsigned long *phys_pfn); +extern int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, + int npage); + +/* each type has independent events */ +enum vfio_notify_type { + VFIO_IOMMU_NOTIFY = 0, + VFIO_GROUP_NOTIFY = 1, +}; + +/* events for VFIO_IOMMU_NOTIFY */ +#define VFIO_IOMMU_NOTIFY_DMA_UNMAP BIT(0) + +/* events for VFIO_GROUP_NOTIFY */ +#define VFIO_GROUP_NOTIFY_SET_KVM BIT(0) + +extern int vfio_register_notifier(struct device *dev, + enum vfio_notify_type type, + unsigned long *required_events, + struct notifier_block *nb); +extern int vfio_unregister_notifier(struct device *dev, + enum vfio_notify_type type, + struct notifier_block *nb); + +struct kvm; +extern void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm); + /* * Sub-module helpers */ @@ -103,6 +142,13 @@ extern struct vfio_info_cap_header *vfio_info_cap_add( struct vfio_info_cap *caps, size_t size, u16 id, u16 version); extern void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset); +extern int vfio_info_add_capability(struct vfio_info_cap *caps, + int cap_type_id, void *cap_type); + +extern int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, + int num_irqs, int max_irq_type, + size_t *data_size); + struct pci_dev; #ifdef CONFIG_EEH extern void vfio_spapr_pci_eeh_open(struct pci_dev *pdev); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 255a2113f53c..519eff362c1c 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -203,6 +203,16 @@ struct vfio_device_info { }; #define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7) +/* + * A vendor driver using the mediated device framework should provide a + * device_api attribute in its supported type attribute groups. The device API + * string should be one of the following, corresponding to the device flags in + * the vfio_device_info structure. + */ + +#define VFIO_DEVICE_API_PCI_STRING "vfio-pci" +#define VFIO_DEVICE_API_PLATFORM_STRING "vfio-platform" +#define VFIO_DEVICE_API_AMBA_STRING "vfio-amba" + /** * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8, * struct vfio_region_info)
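Tying the pieces together, a sketch of how a vendor driver might subscribe to unmap notifications with the interfaces declared above; the callback body and my_unpin_range() are assumptions, and note that the events argument is in/out (known bits are cleared, leftover bits cause -EINVAL):

#include <linux/notifier.h>
#include <linux/vfio.h>

static int my_iommu_notifier(struct notifier_block *nb, unsigned long action,
			     void *data)
{
	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
		struct vfio_iommu_type1_dma_unmap *unmap = data;

		/*
		 * Unpin everything previously pinned in [iova, iova + size);
		 * type1 depends on vendor drivers doing this in response.
		 */
		my_unpin_range(unmap->iova, unmap->size);	/* hypothetical */
	}
	return NOTIFY_OK;
}

static struct notifier_block my_nb = {
	.notifier_call = my_iommu_notifier,
};

static int my_open(struct mdev_device *mdev)
{
	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;

	return vfio_register_notifier(&mdev->dev, VFIO_IOMMU_NOTIFY,
				      &events, &my_nb);
}

static void my_release(struct mdev_device *mdev)
{
	vfio_unregister_notifier(&mdev->dev, VFIO_IOMMU_NOTIFY, &my_nb);
}

diff --git a/samples/vfio-mdev/Makefile b/samples/vfio-mdev/Makefile new file mode 100644 index 000000000000..a932edbe38eb --- /dev/null +++ b/samples/vfio-mdev/Makefile @@ -0,0 +1,13 @@ +# +# Makefile for mtty.c file +# +KERNEL_DIR:=/lib/modules/$(shell uname -r)/build + +obj-m:=mtty.o + +modules clean modules_install: + $(MAKE) -C $(KERNEL_DIR) SUBDIRS=$(PWD) $@ + +default: modules + +module: modules diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c new file mode 100644 index 000000000000..6b633a4ea333 --- /dev/null +++ b/samples/vfio-mdev/mtty.c @@ -0,0 +1,1503 @@ +/* + * Mediated virtual PCI serial host device driver + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 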
+ * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Sample driver that creates mdev device that simulates serial port over PCI + * card. + * + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/poll.h> +#include <linux/slab.h> +#include <linux/cdev.h> +#include <linux/sched.h> +#include <linux/wait.h> +#include <linux/uuid.h> +#include <linux/vfio.h> +#include <linux/iommu.h> +#include <linux/sysfs.h> +#include <linux/ctype.h> +#include <linux/file.h> +#include <linux/mdev.h> +#include <linux/pci.h> +#include <linux/serial.h> +#include <uapi/linux/serial_reg.h> +#include <linux/eventfd.h> +/* + * #defines + */ + +#define VERSION_STRING "0.1" +#define DRIVER_AUTHOR "NVIDIA Corporation" + +#define MTTY_CLASS_NAME "mtty" + +#define MTTY_NAME "mtty" + +#define MTTY_STRING_LEN 16 + +#define MTTY_CONFIG_SPACE_SIZE 0xff +#define MTTY_IO_BAR_SIZE 0x8 +#define MTTY_MMIO_BAR_SIZE 0x100000 + +#define STORE_LE16(addr, val) (*(u16 *)addr = val) +#define STORE_LE32(addr, val) (*(u32 *)addr = val) + +#define MAX_FIFO_SIZE 16 + +#define CIRCULAR_BUF_INC_IDX(idx) (idx = (idx + 1) & (MAX_FIFO_SIZE - 1)) + +#define MTTY_VFIO_PCI_OFFSET_SHIFT 40 + +#define MTTY_VFIO_PCI_OFFSET_TO_INDEX(off) (off >> MTTY_VFIO_PCI_OFFSET_SHIFT) +#define MTTY_VFIO_PCI_INDEX_TO_OFFSET(index) \ + ((u64)(index) << MTTY_VFIO_PCI_OFFSET_SHIFT) +#define MTTY_VFIO_PCI_OFFSET_MASK \ + (((u64)(1) << MTTY_VFIO_PCI_OFFSET_SHIFT) - 1) +#define MAX_MTTYS 24 + +/* + * Global Structures + */ + +struct mtty_dev { + dev_t vd_devt; + struct class *vd_class; + struct cdev vd_cdev; + struct idr vd_idr; + struct device dev; +} mtty_dev; + +struct mdev_region_info { + u64 start; + u64 phys_start; + u32 size; + u64 vfio_offset; +}; + +#if defined(DEBUG_REGS) +const char *wr_reg[] = { + "TX", + "IER", + "FCR", + "LCR", + "MCR", + "LSR", + "MSR", + "SCR" +}; + +const char *rd_reg[] = { + "RX", + "IER", + "IIR", + "LCR", + "MCR", + "LSR", + "MSR", + "SCR" +}; +#endif + +/* loop back buffer */ +struct rxtx { + u8 fifo[MAX_FIFO_SIZE]; + u8 head, tail; + u8 count; +}; + +struct serial_port { + u8 uart_reg[8]; /* 8 registers */ + struct rxtx rxtx; /* loop back buffer */ + bool dlab; + bool overrun; + u16 divisor; + u8 fcr; /* FIFO control register */ + u8 max_fifo_size; + u8 intr_trigger_level; /* interrupt trigger level */ +}; + +/* State of each mdev device */ +struct mdev_state { + int irq_fd; + struct eventfd_ctx *intx_evtfd; + struct eventfd_ctx *msi_evtfd; + int irq_index; + u8 *vconfig; + struct mutex ops_lock; + struct mdev_device *mdev; + struct mdev_region_info region_info[VFIO_PCI_NUM_REGIONS]; + u32 bar_mask[VFIO_PCI_NUM_REGIONS]; + struct list_head next; + struct serial_port s[2]; + struct mutex rxtx_lock; + struct vfio_device_info dev_info; + int nr_ports; +}; + +struct mutex mdev_list_lock; +struct list_head mdev_devices_list; + +static const struct file_operations vd_fops = { + .owner = THIS_MODULE, +}; + +/* function prototypes */ + +static int mtty_trigger_interrupt(uuid_le uuid); + +/* Helper functions */ +static struct mdev_state *find_mdev_state_by_uuid(uuid_le uuid) +{ + struct mdev_state *mds; + + list_for_each_entry(mds, &mdev_devices_list, next) { + if (uuid_le_cmp(mds->mdev->uuid, uuid) == 0) + return mds; + 
} + + return NULL; +} + +void dump_buffer(char *buf, uint32_t count) +{ +#if defined(DEBUG) + int i; + + pr_info("Buffer:\n"); + for (i = 0; i < count; i++) { + pr_info("%2x ", *(buf + i)); + if ((i + 1) % 16 == 0) + pr_info("\n"); + } +#endif +} + +static void mtty_create_config_space(struct mdev_state *mdev_state) +{ + /* PCI dev ID */ + STORE_LE32((u32 *) &mdev_state->vconfig[0x0], 0x32534348); + + /* Control: I/O+, Mem-, BusMaster- */ + STORE_LE16((u16 *) &mdev_state->vconfig[0x4], 0x0001); + + /* Status: capabilities list absent */ + STORE_LE16((u16 *) &mdev_state->vconfig[0x6], 0x0200); + + /* Rev ID */ + mdev_state->vconfig[0x8] = 0x10; + + /* programming interface class : 16550-compatible serial controller */ + mdev_state->vconfig[0x9] = 0x02; + + /* Sub class : 00 */ + mdev_state->vconfig[0xa] = 0x00; + + /* Base class : Simple Communication controllers */ + mdev_state->vconfig[0xb] = 0x07; + + /* base address registers */ + /* BAR0: IO space */ + STORE_LE32((u32 *) &mdev_state->vconfig[0x10], 0x000001); + mdev_state->bar_mask[0] = ~(MTTY_IO_BAR_SIZE) + 1; + + if (mdev_state->nr_ports == 2) { + /* BAR1: IO space */ + STORE_LE32((u32 *) &mdev_state->vconfig[0x14], 0x000001); + mdev_state->bar_mask[1] = ~(MTTY_IO_BAR_SIZE) + 1; + } + + /* Subsystem ID */ + STORE_LE32((u32 *) &mdev_state->vconfig[0x2c], 0x32534348); + + mdev_state->vconfig[0x34] = 0x00; /* Cap Ptr */ + mdev_state->vconfig[0x3d] = 0x01; /* interrupt pin (INTA#) */ + + /* Vendor specific data */ + mdev_state->vconfig[0x40] = 0x23; + mdev_state->vconfig[0x43] = 0x80; + mdev_state->vconfig[0x44] = 0x23; + mdev_state->vconfig[0x48] = 0x23; + mdev_state->vconfig[0x4c] = 0x23; + + mdev_state->vconfig[0x60] = 0x50; + mdev_state->vconfig[0x61] = 0x43; + mdev_state->vconfig[0x62] = 0x49; + mdev_state->vconfig[0x63] = 0x20; + mdev_state->vconfig[0x64] = 0x53; + mdev_state->vconfig[0x65] = 0x65; + mdev_state->vconfig[0x66] = 0x72; + mdev_state->vconfig[0x67] = 0x69; + mdev_state->vconfig[0x68] = 0x61; + mdev_state->vconfig[0x69] = 0x6c; + mdev_state->vconfig[0x6a] = 0x2f; + mdev_state->vconfig[0x6b] = 0x55; + mdev_state->vconfig[0x6c] = 0x41; + mdev_state->vconfig[0x6d] = 0x52; + mdev_state->vconfig[0x6e] = 0x54; +} + +static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset, + char *buf, u32 count) +{ + u32 cfg_addr, bar_mask, bar_index = 0; + + switch (offset) { + case 0x04: /* device control */ + case 0x06: /* device status */ + /* do nothing */ + break; + case 0x3c: /* interrupt line */ + mdev_state->vconfig[0x3c] = buf[0]; + break; + case 0x3d: + /* + * Interrupt Pin is hardwired to INTA. 
+ * This field is write protected by hardware + */ + break; + case 0x10: /* BAR0 */ + case 0x14: /* BAR1 */ + if (offset == 0x10) + bar_index = 0; + else if (offset == 0x14) + bar_index = 1; + + if ((mdev_state->nr_ports == 1) && (bar_index == 1)) { + STORE_LE32(&mdev_state->vconfig[offset], 0); + break; + } + + cfg_addr = *(u32 *)buf; + pr_info("BAR%d addr 0x%x\n", bar_index, cfg_addr); + + if (cfg_addr == 0xffffffff) { + bar_mask = mdev_state->bar_mask[bar_index]; + cfg_addr = (cfg_addr & bar_mask); + } + + cfg_addr |= (mdev_state->vconfig[offset] & 0x3ul); + STORE_LE32(&mdev_state->vconfig[offset], cfg_addr); + break; + case 0x18: /* BAR2 */ + case 0x1c: /* BAR3 */ + case 0x20: /* BAR4 */ + STORE_LE32(&mdev_state->vconfig[offset], 0); + break; + default: + pr_info("PCI config write @0x%x of %d bytes not handled\n", + offset, count); + break; + } +} + +static void handle_bar_write(unsigned int index, struct mdev_state *mdev_state, + u16 offset, char *buf, u32 count) +{ + u8 data = *buf; + + /* Handle data written by guest */ + switch (offset) { + case UART_TX: + /* if DLAB set, data is LSB of divisor */ + if (mdev_state->s[index].dlab) { + mdev_state->s[index].divisor |= data; + break; + } + + mutex_lock(&mdev_state->rxtx_lock); + + /* save in TX buffer */ + if (mdev_state->s[index].rxtx.count < + mdev_state->s[index].max_fifo_size) { + mdev_state->s[index].rxtx.fifo[ + mdev_state->s[index].rxtx.head] = data; + mdev_state->s[index].rxtx.count++; + CIRCULAR_BUF_INC_IDX(mdev_state->s[index].rxtx.head); + mdev_state->s[index].overrun = false; + + /* + * Trigger interrupt if receive data interrupt is + * enabled and fifo reached trigger level + */ + if ((mdev_state->s[index].uart_reg[UART_IER] & + UART_IER_RDI) && + (mdev_state->s[index].rxtx.count == + mdev_state->s[index].intr_trigger_level)) { + /* trigger interrupt */ +#if defined(DEBUG_INTR) + pr_err("Serial port %d: Fifo level trigger\n", + index); +#endif + mtty_trigger_interrupt(mdev_state->mdev->uuid); + } + } else { +#if defined(DEBUG_INTR) + pr_err("Serial port %d: Buffer Overflow\n", index); +#endif + mdev_state->s[index].overrun = true; + + /* + * Trigger interrupt if receiver line status interrupt + * is enabled + */ + if (mdev_state->s[index].uart_reg[UART_IER] & + UART_IER_RLSI) + mtty_trigger_interrupt(mdev_state->mdev->uuid); + } + mutex_unlock(&mdev_state->rxtx_lock); + break; + + case UART_IER: + /* if DLAB set, data is MSB of divisor */ + if (mdev_state->s[index].dlab) + mdev_state->s[index].divisor |= (u16)data << 8; + else { + mdev_state->s[index].uart_reg[offset] = data; + mutex_lock(&mdev_state->rxtx_lock); + if ((data & UART_IER_THRI) && + (mdev_state->s[index].rxtx.head == + mdev_state->s[index].rxtx.tail)) { +#if defined(DEBUG_INTR) + pr_err("Serial port %d: IER_THRI write\n", + index); +#endif + mtty_trigger_interrupt(mdev_state->mdev->uuid); + } + + mutex_unlock(&mdev_state->rxtx_lock); + } + + break; + + case UART_FCR: + mdev_state->s[index].fcr = data; + + mutex_lock(&mdev_state->rxtx_lock); + if (data & (UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT)) { + /* clear loop back FIFO */ + mdev_state->s[index].rxtx.count = 0; + mdev_state->s[index].rxtx.head = 0; + mdev_state->s[index].rxtx.tail = 0; + } + mutex_unlock(&mdev_state->rxtx_lock); + + switch (data & UART_FCR_TRIGGER_MASK) { + case UART_FCR_TRIGGER_1: + mdev_state->s[index].intr_trigger_level = 1; + break; + + case UART_FCR_TRIGGER_4: + mdev_state->s[index].intr_trigger_level = 4; + break; + + case UART_FCR_TRIGGER_8: + 
mdev_state->s[index].intr_trigger_level = 8; + break; + + case UART_FCR_TRIGGER_14: + mdev_state->s[index].intr_trigger_level = 14; + break; + } + + /* + * Otherwise set the trigger level to 1, or implement a timer with a + * timeout of 4 characters that sets the receive-data timeout in the + * IIR register when it expires. + */ + mdev_state->s[index].intr_trigger_level = 1; + if (data & UART_FCR_ENABLE_FIFO) + mdev_state->s[index].max_fifo_size = MAX_FIFO_SIZE; + else { + mdev_state->s[index].max_fifo_size = 1; + mdev_state->s[index].intr_trigger_level = 1; + } + + break; + + case UART_LCR: + if (data & UART_LCR_DLAB) { + mdev_state->s[index].dlab = true; + mdev_state->s[index].divisor = 0; + } else + mdev_state->s[index].dlab = false; + + mdev_state->s[index].uart_reg[offset] = data; + break; + + case UART_MCR: + mdev_state->s[index].uart_reg[offset] = data; + + if ((mdev_state->s[index].uart_reg[UART_IER] & UART_IER_MSI) && + (data & UART_MCR_OUT2)) { +#if defined(DEBUG_INTR) + pr_err("Serial port %d: MCR_OUT2 write\n", index); +#endif + mtty_trigger_interrupt(mdev_state->mdev->uuid); + } + + if ((mdev_state->s[index].uart_reg[UART_IER] & UART_IER_MSI) && + (data & (UART_MCR_RTS | UART_MCR_DTR))) { +#if defined(DEBUG_INTR) + pr_err("Serial port %d: MCR RTS/DTR write\n", index); +#endif + mtty_trigger_interrupt(mdev_state->mdev->uuid); + } + break; + + case UART_LSR: + case UART_MSR: + /* do nothing */ + break; + + case UART_SCR: + mdev_state->s[index].uart_reg[offset] = data; + break; + + default: + break; + } +} + +static void handle_bar_read(unsigned int index, struct mdev_state *mdev_state, + u16 offset, char *buf, u32 count) +{ + /* Handle read requests by guest */ + switch (offset) { + case UART_RX: + /* if DLAB set, data is LSB of divisor */ + if (mdev_state->s[index].dlab) { + *buf = (u8)mdev_state->s[index].divisor; + break; + } + + mutex_lock(&mdev_state->rxtx_lock); + /* return data in tx buffer */ + if (mdev_state->s[index].rxtx.head != + mdev_state->s[index].rxtx.tail) { + *buf = mdev_state->s[index].rxtx.fifo[ + mdev_state->s[index].rxtx.tail]; + mdev_state->s[index].rxtx.count--; + CIRCULAR_BUF_INC_IDX(mdev_state->s[index].rxtx.tail); + } + + if (mdev_state->s[index].rxtx.head == + mdev_state->s[index].rxtx.tail) { + /* + * Trigger interrupt if tx buffer empty interrupt is + * enabled and fifo is empty + */ +#if defined(DEBUG_INTR) + pr_err("Serial port %d: Buffer Empty\n", index); +#endif + if (mdev_state->s[index].uart_reg[UART_IER] & + UART_IER_THRI) + mtty_trigger_interrupt(mdev_state->mdev->uuid); + } + mutex_unlock(&mdev_state->rxtx_lock); + + break; + + case UART_IER: + if (mdev_state->s[index].dlab) { + *buf = (u8)(mdev_state->s[index].divisor >> 8); + break; + } + *buf = mdev_state->s[index].uart_reg[offset] & 0x0f; + break; + + case UART_IIR: + { + u8 ier = mdev_state->s[index].uart_reg[UART_IER]; + *buf = 0; + + mutex_lock(&mdev_state->rxtx_lock); + /* Interrupt priority 1: Parity, overrun, framing or break */ + if ((ier & UART_IER_RLSI) && mdev_state->s[index].overrun) + *buf |= UART_IIR_RLSI; + + /* Interrupt priority 2: Fifo trigger level reached */ + if ((ier & UART_IER_RDI) && + (mdev_state->s[index].rxtx.count == + mdev_state->s[index].intr_trigger_level)) + *buf |= UART_IIR_RDI; + + /* Interrupt priority 3: transmitter holding register empty */ + if ((ier & UART_IER_THRI) && + (mdev_state->s[index].rxtx.head == + mdev_state->s[index].rxtx.tail)) + *buf |= UART_IIR_THRI; + + /* Interrupt priority 4: Modem status: CTS, DSR, RI or DCD */ + if ((ier & UART_IER_MSI) && + (mdev_state->s[index].uart_reg[UART_MCR] & + (UART_MCR_RTS | UART_MCR_DTR))) + *buf |= UART_IIR_MSI; + + /* bit0: 0=> interrupt pending, 1=> no interrupt is pending */ + if (*buf == 0) + *buf = UART_IIR_NO_INT; + + /* set bit 6 & 7 to be 16550 compatible */ + *buf |= 0xC0; + mutex_unlock(&mdev_state->rxtx_lock); + } + break; + + case UART_LCR: + case UART_MCR: + *buf = mdev_state->s[index].uart_reg[offset]; + break; + + case UART_LSR: + { + u8 lsr = 0; + + mutex_lock(&mdev_state->rxtx_lock); + /* at least one char in FIFO */ + if (mdev_state->s[index].rxtx.head != + mdev_state->s[index].rxtx.tail) + lsr |= UART_LSR_DR; + + /* if FIFO overrun */ + if (mdev_state->s[index].overrun) + lsr |= UART_LSR_OE; + + /* transmit FIFO empty and transmitter empty */ + if (mdev_state->s[index].rxtx.head == + mdev_state->s[index].rxtx.tail) + lsr |= UART_LSR_TEMT | UART_LSR_THRE; + + mutex_unlock(&mdev_state->rxtx_lock); + *buf = lsr; + break; + } + case UART_MSR: + *buf = UART_MSR_DSR | UART_MSR_DDSR | UART_MSR_DCD; + + mutex_lock(&mdev_state->rxtx_lock); + /* if AFE is 1 and FIFO has space, set CTS bit */ + if (mdev_state->s[index].uart_reg[UART_MCR] & + UART_MCR_AFE) { + if (mdev_state->s[index].rxtx.count < + mdev_state->s[index].max_fifo_size) + *buf |= UART_MSR_CTS | UART_MSR_DCTS; + } else + *buf |= UART_MSR_CTS | UART_MSR_DCTS; + mutex_unlock(&mdev_state->rxtx_lock); + + break; + + case UART_SCR: + *buf = mdev_state->s[index].uart_reg[offset]; + break; + + default: + break; + } +} + +static void mdev_read_base(struct mdev_state *mdev_state) +{ + int index, pos; + u32 start_lo, start_hi; + u32 mem_type; + + pos = PCI_BASE_ADDRESS_0; + + for (index = 0; index <= VFIO_PCI_BAR5_REGION_INDEX; index++) { + + if (!mdev_state->region_info[index].size) + continue; + + start_lo = (*(u32 *)(mdev_state->vconfig + pos)) & + PCI_BASE_ADDRESS_MEM_MASK; + mem_type = (*(u32 *)(mdev_state->vconfig + pos)) & + PCI_BASE_ADDRESS_MEM_TYPE_MASK; + + switch (mem_type) { + case PCI_BASE_ADDRESS_MEM_TYPE_64: + start_hi = (*(u32 *)(mdev_state->vconfig + pos + 4)); + pos += 4; + break; + case PCI_BASE_ADDRESS_MEM_TYPE_32: + case PCI_BASE_ADDRESS_MEM_TYPE_1M: + /* 1M mem BAR treated as 32-bit BAR */ + default: + /* mem unknown type treated as 32-bit BAR */ + start_hi = 0; + break; + } + pos += 4; + mdev_state->region_info[index].start = ((u64)start_hi << 32) | + start_lo; + } +} + +static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, + loff_t pos, bool is_write) +{ + struct mdev_state *mdev_state; + unsigned int index; + loff_t offset; + int ret = 0; + + if (!mdev || !buf) + return -EINVAL; + + mdev_state = mdev_get_drvdata(mdev); + if (!mdev_state) { + pr_err("%s mdev_state not found\n", __func__); + return -EINVAL; + } + + mutex_lock(&mdev_state->ops_lock); + + index = MTTY_VFIO_PCI_OFFSET_TO_INDEX(pos); + offset = pos & MTTY_VFIO_PCI_OFFSET_MASK; + switch (index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + +#if defined(DEBUG) + pr_info("%s: PCI config space %s at offset 0x%llx\n", + __func__, is_write ? "write" : "read", offset); +#endif + if (is_write) { + dump_buffer(buf, count); + handle_pci_cfg_write(mdev_state, offset, buf, count); + } else { + memcpy(buf, (mdev_state->vconfig + offset), count); + dump_buffer(buf, count); + } + + break; + + case VFIO_PCI_BAR0_REGION_INDEX ... 
VFIO_PCI_BAR5_REGION_INDEX: + if (!mdev_state->region_info[index].start) + mdev_read_base(mdev_state); + + if (is_write) { + dump_buffer(buf, count); + +#if defined(DEBUG_REGS) + pr_info("%s: BAR%d WR @0x%llx %s val:0x%02x dlab:%d\n", + __func__, index, offset, wr_reg[offset], + (u8)*buf, mdev_state->s[index].dlab); +#endif + handle_bar_write(index, mdev_state, offset, buf, count); + } else { + handle_bar_read(index, mdev_state, offset, buf, count); + dump_buffer(buf, count); + +#if defined(DEBUG_REGS) + pr_info("%s: BAR%d RD @0x%llx %s val:0x%02x dlab:%d\n", + __func__, index, offset, rd_reg[offset], + (u8)*buf, mdev_state->s[index].dlab); +#endif + } + break; + + default: + ret = -1; + goto accessfailed; + } + + ret = count; + + +accessfailed: + mutex_unlock(&mdev_state->ops_lock); + + return ret; +} + +int mtty_create(struct kobject *kobj, struct mdev_device *mdev) +{ + struct mdev_state *mdev_state; + char name[MTTY_STRING_LEN]; + int nr_ports = 0, i; + + if (!mdev) + return -EINVAL; + + for (i = 0; i < 2; i++) { + snprintf(name, MTTY_STRING_LEN, "%s-%d", + dev_driver_string(mdev->parent->dev), i + 1); + if (!strcmp(kobj->name, name)) { + nr_ports = i + 1; + break; + } + } + + if (!nr_ports) + return -EINVAL; + + mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); + if (mdev_state == NULL) + return -ENOMEM; + + mdev_state->nr_ports = nr_ports; + mdev_state->irq_index = -1; + mdev_state->s[0].max_fifo_size = MAX_FIFO_SIZE; + mdev_state->s[1].max_fifo_size = MAX_FIFO_SIZE; + mutex_init(&mdev_state->rxtx_lock); + mdev_state->vconfig = kzalloc(MTTY_CONFIG_SPACE_SIZE, GFP_KERNEL); + + if (mdev_state->vconfig == NULL) { + kfree(mdev_state); + return -ENOMEM; + } + + mutex_init(&mdev_state->ops_lock); + mdev_state->mdev = mdev; + mdev_set_drvdata(mdev, mdev_state); + + mtty_create_config_space(mdev_state); + + mutex_lock(&mdev_list_lock); + list_add(&mdev_state->next, &mdev_devices_list); + mutex_unlock(&mdev_list_lock); + + return 0; +} + +int mtty_remove(struct mdev_device *mdev) +{ + struct mdev_state *mds, *tmp_mds; + struct mdev_state *mdev_state = mdev_get_drvdata(mdev); + int ret = -EINVAL; + + mutex_lock(&mdev_list_lock); + list_for_each_entry_safe(mds, tmp_mds, &mdev_devices_list, next) { + if (mdev_state == mds) { + list_del(&mdev_state->next); + mdev_set_drvdata(mdev, NULL); + kfree(mdev_state->vconfig); + kfree(mdev_state); + ret = 0; + break; + } + } + mutex_unlock(&mdev_list_lock); + + return ret; +} + +int mtty_reset(struct mdev_device *mdev) +{ + struct mdev_state *mdev_state; + + if (!mdev) + return -EINVAL; + + mdev_state = mdev_get_drvdata(mdev); + if (!mdev_state) + return -EINVAL; + + pr_info("%s: called\n", __func__); + + return 0; +} + +ssize_t mtty_read(struct mdev_device *mdev, char __user *buf, size_t count, + loff_t *ppos) +{ + unsigned int done = 0; + int ret; + + while (count) { + size_t filled; + + if (count >= 4 && !(*ppos % 4)) { + u32 val; + + ret = mdev_access(mdev, (char *)&val, sizeof(val), + *ppos, false); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 4; + } else if (count >= 2 && !(*ppos % 2)) { + u16 val; + + ret = mdev_access(mdev, (char *)&val, sizeof(val), + *ppos, false); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 2; + } else { + u8 val; + + ret = mdev_access(mdev, (char *)&val, sizeof(val), + *ppos, false); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 
1; + } + + count -= filled; + done += filled; + *ppos += filled; + buf += filled; + } + + return done; + +read_err: + return -EFAULT; +} + +ssize_t mtty_write(struct mdev_device *mdev, const char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned int done = 0; + int ret; + + while (count) { + size_t filled; + + if (count >= 4 && !(*ppos % 4)) { + u32 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = mdev_access(mdev, (char *)&val, sizeof(val), + *ppos, true); + if (ret <= 0) + goto write_err; + + filled = 4; + } else if (count >= 2 && !(*ppos % 2)) { + u16 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = mdev_access(mdev, (char *)&val, sizeof(val), + *ppos, true); + if (ret <= 0) + goto write_err; + + filled = 2; + } else { + u8 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = mdev_access(mdev, (char *)&val, sizeof(val), + *ppos, true); + if (ret <= 0) + goto write_err; + + filled = 1; + } + count -= filled; + done += filled; + *ppos += filled; + buf += filled; + } + + return done; +write_err: + return -EFAULT; +} + +static int mtty_set_irqs(struct mdev_device *mdev, uint32_t flags, + unsigned int index, unsigned int start, + unsigned int count, void *data) +{ + int ret = 0; + struct mdev_state *mdev_state; + + if (!mdev) + return -EINVAL; + + mdev_state = mdev_get_drvdata(mdev); + if (!mdev_state) + return -EINVAL; + + mutex_lock(&mdev_state->ops_lock); + switch (index) { + case VFIO_PCI_INTX_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_MASK: + case VFIO_IRQ_SET_ACTION_UNMASK: + break; + case VFIO_IRQ_SET_ACTION_TRIGGER: + { + if (flags & VFIO_IRQ_SET_DATA_NONE) { + pr_info("%s: disable INTx\n", __func__); + if (mdev_state->intx_evtfd) + eventfd_ctx_put(mdev_state->intx_evtfd); + break; + } + + if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + int fd = *(int *)data; + + if (fd > 0) { + struct eventfd_ctx *evt; + + evt = eventfd_ctx_fdget(fd); + if (IS_ERR(evt)) { + ret = PTR_ERR(evt); + break; + } + mdev_state->intx_evtfd = evt; + mdev_state->irq_fd = fd; + mdev_state->irq_index = index; + break; + } + } + break; + } + } + break; + case VFIO_PCI_MSI_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_MASK: + case VFIO_IRQ_SET_ACTION_UNMASK: + break; + case VFIO_IRQ_SET_ACTION_TRIGGER: + if (flags & VFIO_IRQ_SET_DATA_NONE) { + if (mdev_state->msi_evtfd) + eventfd_ctx_put(mdev_state->msi_evtfd); + pr_info("%s: disable MSI\n", __func__); + mdev_state->irq_index = VFIO_PCI_INTX_IRQ_INDEX; + break; + } + if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + int fd = *(int *)data; + struct eventfd_ctx *evt; + + if (fd <= 0) + break; + + if (mdev_state->msi_evtfd) + break; + + evt = eventfd_ctx_fdget(fd); + if (IS_ERR(evt)) { + ret = PTR_ERR(evt); + break; + } + mdev_state->msi_evtfd = evt; + mdev_state->irq_fd = fd; + mdev_state->irq_index = index; + } + break; + } + break; + case VFIO_PCI_MSIX_IRQ_INDEX: + pr_info("%s: MSIX_IRQ\n", __func__); + break; + case VFIO_PCI_ERR_IRQ_INDEX: + pr_info("%s: ERR_IRQ\n", __func__); + break; + case VFIO_PCI_REQ_IRQ_INDEX: + pr_info("%s: REQ_IRQ\n", __func__); + break; + } + + mutex_unlock(&mdev_state->ops_lock); + return ret; +} + +static int mtty_trigger_interrupt(uuid_le uuid) +{ + int ret = -1; + struct mdev_state *mdev_state; + + mdev_state = find_mdev_state_by_uuid(uuid); + + if (!mdev_state) { + pr_info("%s: mdev not found\n", __func__); + return -EINVAL; + } + + if ((mdev_state->irq_index 
== VFIO_PCI_MSI_IRQ_INDEX) && + (!mdev_state->msi_evtfd)) + return -EINVAL; + else if ((mdev_state->irq_index == VFIO_PCI_INTX_IRQ_INDEX) && + (!mdev_state->intx_evtfd)) { + pr_info("%s: Intr eventfd not found\n", __func__); + return -EINVAL; + } + + if (mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX) + ret = eventfd_signal(mdev_state->msi_evtfd, 1); + else + ret = eventfd_signal(mdev_state->intx_evtfd, 1); + +#if defined(DEBUG_INTR) + pr_info("Intx triggered\n"); +#endif + if (ret != 1) + pr_err("%s: eventfd signal failed (%d)\n", __func__, ret); + + return ret; +} + +int mtty_get_region_info(struct mdev_device *mdev, + struct vfio_region_info *region_info, + u16 *cap_type_id, void **cap_type) +{ + unsigned int size = 0; + struct mdev_state *mdev_state; + int bar_index; + + if (!mdev) + return -EINVAL; + + mdev_state = mdev_get_drvdata(mdev); + if (!mdev_state) + return -EINVAL; + + mutex_lock(&mdev_state->ops_lock); + bar_index = region_info->index; + + switch (bar_index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + size = MTTY_CONFIG_SPACE_SIZE; + break; + case VFIO_PCI_BAR0_REGION_INDEX: + size = MTTY_IO_BAR_SIZE; + break; + case VFIO_PCI_BAR1_REGION_INDEX: + if (mdev_state->nr_ports == 2) + size = MTTY_IO_BAR_SIZE; + break; + default: + size = 0; + break; + } + + mdev_state->region_info[bar_index].size = size; + mdev_state->region_info[bar_index].vfio_offset = + MTTY_VFIO_PCI_INDEX_TO_OFFSET(bar_index); + + region_info->size = size; + region_info->offset = MTTY_VFIO_PCI_INDEX_TO_OFFSET(bar_index); + region_info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + mutex_unlock(&mdev_state->ops_lock); + return 0; +} + +int mtty_get_irq_info(struct mdev_device *mdev, struct vfio_irq_info *irq_info) +{ + switch (irq_info->index) { + case VFIO_PCI_INTX_IRQ_INDEX: + case VFIO_PCI_MSI_IRQ_INDEX: + case VFIO_PCI_REQ_IRQ_INDEX: + break; + + default: + return -EINVAL; + } + + irq_info->flags = VFIO_IRQ_INFO_EVENTFD; + irq_info->count = 1; + + if (irq_info->index == VFIO_PCI_INTX_IRQ_INDEX) + irq_info->flags |= (VFIO_IRQ_INFO_MASKABLE | + VFIO_IRQ_INFO_AUTOMASKED); + else + irq_info->flags |= VFIO_IRQ_INFO_NORESIZE; + + return 0; +} + +int mtty_get_device_info(struct mdev_device *mdev, + struct vfio_device_info *dev_info) +{ + dev_info->flags = VFIO_DEVICE_FLAGS_PCI; + dev_info->num_regions = VFIO_PCI_NUM_REGIONS; + dev_info->num_irqs = VFIO_PCI_NUM_IRQS; + + return 0; +} + +static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, + unsigned long arg) +{ + int ret = 0; + unsigned long minsz; + struct mdev_state *mdev_state; + + if (!mdev) + return -EINVAL; + + mdev_state = mdev_get_drvdata(mdev); + if (!mdev_state) + return -ENODEV; + + switch (cmd) { + case VFIO_DEVICE_GET_INFO: + { + struct vfio_device_info info; + + minsz = offsetofend(struct vfio_device_info, num_irqs); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + ret = mtty_get_device_info(mdev, &info); + if (ret) + return ret; + + memcpy(&mdev_state->dev_info, &info, sizeof(info)); + + return copy_to_user((void __user *)arg, &info, minsz); + } + case VFIO_DEVICE_GET_REGION_INFO: + { + struct vfio_region_info info; + u16 cap_type_id = 0; + void *cap_type = NULL; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + ret = mtty_get_region_info(mdev, &info, &cap_type_id, + &cap_type); + if (ret) + return ret; + 
+		return copy_to_user((void __user *)arg, &info, minsz);
+	}
+
+	case VFIO_DEVICE_GET_IRQ_INFO:
+	{
+		struct vfio_irq_info info;
+
+		minsz = offsetofend(struct vfio_irq_info, count);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if ((info.argsz < minsz) ||
+		    (info.index >= mdev_state->dev_info.num_irqs))
+			return -EINVAL;
+
+		ret = mtty_get_irq_info(mdev, &info);
+		if (ret)
+			return ret;
+
+		if (info.count == -1)
+			return -EINVAL;
+
+		return copy_to_user((void __user *)arg, &info, minsz);
+	}
+	case VFIO_DEVICE_SET_IRQS:
+	{
+		struct vfio_irq_set hdr;
+		u8 *data = NULL, *ptr = NULL;
+		size_t data_size = 0;
+
+		minsz = offsetofend(struct vfio_irq_set, count);
+
+		if (copy_from_user(&hdr, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		ret = vfio_set_irqs_validate_and_prepare(&hdr,
+						mdev_state->dev_info.num_irqs,
+						VFIO_PCI_NUM_IRQS,
+						&data_size);
+		if (ret)
+			return ret;
+
+		if (data_size) {
+			ptr = data = memdup_user((void __user *)(arg + minsz),
+						 data_size);
+			if (IS_ERR(data))
+				return PTR_ERR(data);
+		}
+
+		ret = mtty_set_irqs(mdev, hdr.flags, hdr.index, hdr.start,
+				    hdr.count, data);
+
+		kfree(ptr);
+		return ret;
+	}
+	case VFIO_DEVICE_RESET:
+		return mtty_reset(mdev);
+	}
+	return -ENOTTY;
+}
+
+int mtty_open(struct mdev_device *mdev)
+{
+	pr_info("%s\n", __func__);
+	return 0;
+}
+
+void mtty_close(struct mdev_device *mdev)
+{
+	pr_info("%s\n", __func__);
+}
+
+static ssize_t
+sample_mtty_dev_show(struct device *dev, struct device_attribute *attr,
+		     char *buf)
+{
+	return sprintf(buf, "This is a phy device\n");
+}
+
+static DEVICE_ATTR_RO(sample_mtty_dev);
+
+static struct attribute *mtty_dev_attrs[] = {
+	&dev_attr_sample_mtty_dev.attr,
+	NULL,
+};
+
+static const struct attribute_group mtty_dev_group = {
+	.name  = "mtty_dev",
+	.attrs = mtty_dev_attrs,
+};
+
+const struct attribute_group *mtty_dev_groups[] = {
+	&mtty_dev_group,
+	NULL,
+};
+
+static ssize_t
+sample_mdev_dev_show(struct device *dev, struct device_attribute *attr,
+		     char *buf)
+{
+	struct mdev_device *mdev = to_mdev_device(dev);
+
+	if (mdev)
+		return sprintf(buf, "This is MDEV %s\n", dev_name(&mdev->dev));
+
+	return sprintf(buf, "\n");
+}
+
+static DEVICE_ATTR_RO(sample_mdev_dev);
+
+static struct attribute *mdev_dev_attrs[] = {
+	&dev_attr_sample_mdev_dev.attr,
+	NULL,
+};
+
+static const struct attribute_group mdev_dev_group = {
+	.name  = "vendor",
+	.attrs = mdev_dev_attrs,
+};
+
+const struct attribute_group *mdev_dev_groups[] = {
+	&mdev_dev_group,
+	NULL,
+};
+
+static ssize_t
+name_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+	char name[MTTY_STRING_LEN];
+	int i;
+	const char *name_str[2] = {"Single port serial", "Dual port serial"};
+
+	for (i = 0; i < 2; i++) {
+		snprintf(name, MTTY_STRING_LEN, "%s-%d",
+			 dev_driver_string(dev), i + 1);
+		if (!strcmp(kobj->name, name))
+			return sprintf(buf, "%s\n", name_str[i]);
+	}
+
+	return -EINVAL;
+}
+
+MDEV_TYPE_ATTR_RO(name);
+
+static ssize_t
+available_instances_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+	char name[MTTY_STRING_LEN];
+	int i;
+	struct mdev_state *mds;
+	int ports = 0, used = 0;
+
+	for (i = 0; i < 2; i++) {
+		snprintf(name, MTTY_STRING_LEN, "%s-%d",
+			 dev_driver_string(dev), i + 1);
+		if (!strcmp(kobj->name, name)) {
+			ports = i + 1;
+			break;
+		}
+	}
+
+	if (!ports)
+		return -EINVAL;
+
+	list_for_each_entry(mds, &mdev_devices_list, next)
+		used += mds->nr_ports;
+
+	return sprintf(buf, "%d\n", (MAX_MTTYS - used)/ports);
+}
+
+MDEV_TYPE_ATTR_RO(available_instances);
+
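+/*
+ * A worked example of the arithmetic above, assuming MAX_MTTYS is 24 as
+ * defined earlier in this file: with one dual-port mdev already created,
+ * used == 2, so the single-port type reports (24 - 2) / 1 = 22 available
+ * instances while the dual-port type reports (24 - 2) / 2 = 11.
+ */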
+static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
+			       char *buf)
+{
+	return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
+}
+
+MDEV_TYPE_ATTR_RO(device_api);
+
+static struct attribute *mdev_types_attrs[] = {
+	&mdev_type_attr_name.attr,
+	&mdev_type_attr_device_api.attr,
+	&mdev_type_attr_available_instances.attr,
+	NULL,
+};
+
+static struct attribute_group mdev_type_group1 = {
+	.name  = "1",
+	.attrs = mdev_types_attrs,
+};
+
+static struct attribute_group mdev_type_group2 = {
+	.name  = "2",
+	.attrs = mdev_types_attrs,
+};
+
+struct attribute_group *mdev_type_groups[] = {
+	&mdev_type_group1,
+	&mdev_type_group2,
+	NULL,
+};
+
+struct parent_ops mdev_fops = {
+	.owner                 = THIS_MODULE,
+	.dev_attr_groups       = mtty_dev_groups,
+	.mdev_attr_groups      = mdev_dev_groups,
+	.supported_type_groups = mdev_type_groups,
+	.create                = mtty_create,
+	.remove                = mtty_remove,
+	.open                  = mtty_open,
+	.release               = mtty_close,
+	.read                  = mtty_read,
+	.write                 = mtty_write,
+	.ioctl                 = mtty_ioctl,
+};
+
+static void mtty_device_release(struct device *dev)
+{
+	dev_dbg(dev, "mtty: released\n");
+}
+
+static int __init mtty_dev_init(void)
+{
+	int ret = 0;
+
+	pr_info("mtty_dev: %s\n", __func__);
+
+	memset(&mtty_dev, 0, sizeof(mtty_dev));
+
+	idr_init(&mtty_dev.vd_idr);
+
+	ret = alloc_chrdev_region(&mtty_dev.vd_devt, 0, MINORMASK, MTTY_NAME);
+
+	if (ret < 0) {
+		pr_err("Error: failed to register mtty_dev, err:%d\n", ret);
+		return ret;
+	}
+
+	cdev_init(&mtty_dev.vd_cdev, &vd_fops);
+	cdev_add(&mtty_dev.vd_cdev, mtty_dev.vd_devt, MINORMASK);
+
+	pr_info("major_number:%d\n", MAJOR(mtty_dev.vd_devt));
+
+	mtty_dev.vd_class = class_create(THIS_MODULE, MTTY_CLASS_NAME);
+
+	if (IS_ERR(mtty_dev.vd_class)) {
+		pr_err("Error: failed to register mtty_dev class\n");
+		goto failed1;
+	}
+
+	mtty_dev.dev.class = mtty_dev.vd_class;
+	mtty_dev.dev.release = mtty_device_release;
+	dev_set_name(&mtty_dev.dev, "%s", MTTY_NAME);
+
+	ret = device_register(&mtty_dev.dev);
+	if (ret)
+		goto failed2;
+
+	/*
+	 * Initialize the device list before registering with the mdev
+	 * framework: once mdev_register_device() returns, userspace can
+	 * write the 'create' sysfs attribute and reach mtty_create(),
+	 * which takes mdev_list_lock.
+	 */
+	mutex_init(&mdev_list_lock);
+	INIT_LIST_HEAD(&mdev_devices_list);
+
+	if (mdev_register_device(&mtty_dev.dev, &mdev_fops) != 0)
+		goto failed3;
+
+	goto all_done;
+
+failed3:
+
+	device_unregister(&mtty_dev.dev);
+failed2:
+	class_destroy(mtty_dev.vd_class);
+
+failed1:
+	cdev_del(&mtty_dev.vd_cdev);
+	unregister_chrdev_region(mtty_dev.vd_devt, MINORMASK);
+
+all_done:
+	return ret;
+}
+
+static void __exit mtty_dev_exit(void)
+{
+	mtty_dev.dev.bus = NULL;
+	mdev_unregister_device(&mtty_dev.dev);
+
+	device_unregister(&mtty_dev.dev);
+	idr_destroy(&mtty_dev.vd_idr);
+	cdev_del(&mtty_dev.vd_cdev);
+	unregister_chrdev_region(mtty_dev.vd_devt, MINORMASK);
+	class_destroy(mtty_dev.vd_class);
+	mtty_dev.vd_class = NULL;
+	pr_info("mtty_dev: Unloaded!\n");
+}
+
+module_init(mtty_dev_init)
+module_exit(mtty_dev_exit)
+
+MODULE_LICENSE("GPL v2");
+MODULE_INFO(supported, "Test driver that simulates a serial port over PCI");
+MODULE_VERSION(VERSION_STRING);
+MODULE_AUTHOR(DRIVER_AUTHOR);
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
index 1dd087da6f31..d32f239eb471 100644
--- a/virt/kvm/vfio.c
+++ b/virt/kvm/vfio.c
@@ -60,6 +60,19 @@ static void kvm_vfio_group_put_external_user(struct vfio_group *vfio_group)
 	symbol_put(vfio_group_put_external_user);
 }
 
+static void kvm_vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
+{
+	void (*fn)(struct vfio_group *, struct kvm *);
+
+	fn = symbol_get(vfio_group_set_kvm);
+	if (!fn)
+		return;
+
+	fn(group, kvm);
+
+	symbol_put(vfio_group_set_kvm);
+}
+
 static bool kvm_vfio_group_is_coherent(struct vfio_group *vfio_group)
 {
 	long (*fn)(struct vfio_group *, unsigned long);
@@ -159,6 +172,8 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
 
 		mutex_unlock(&kv->lock);
 
+		kvm_vfio_group_set_kvm(vfio_group, dev->kvm);
+
 		kvm_vfio_update_coherency(dev);
 
 		return 0;
@@ -196,6 +211,8 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
 
 		mutex_unlock(&kv->lock);
 
+		kvm_vfio_group_set_kvm(vfio_group, NULL);
+
 		kvm_vfio_group_put_external_user(vfio_group);
 
 		kvm_vfio_update_coherency(dev);
@@ -240,6 +257,7 @@ static void kvm_vfio_destroy(struct kvm_device *dev)
 	struct kvm_vfio_group *kvg, *tmp;
 
 	list_for_each_entry_safe(kvg, tmp, &kv->group_list, node) {
+		kvm_vfio_group_set_kvm(kvg->vfio_group, NULL);
 		kvm_vfio_group_put_external_user(kvg->vfio_group);
 		list_del(&kvg->node);
 		kfree(kvg);
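As a usage sketch (not part of this patch): once an mtty mdev has been created and its device fd opened through VFIO, userspace reaches the regions that mdev_access() decodes via the offsets reported by VFIO_DEVICE_GET_REGION_INFO. The helper below is hypothetical; it assumes a device_fd obtained via VFIO_GROUP_GET_DEVICE_FD and reads the emulated UART scratch register (UART_SCR, byte 7 of BAR0).

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/vfio.h>

	static int mtty_read_scr(int device_fd, uint8_t *val)
	{
		struct vfio_region_info reg = {
			.argsz = sizeof(reg),
			.index = VFIO_PCI_BAR0_REGION_INDEX,
		};

		/* ask the kernel where BAR0 lives in the device fd's space */
		if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &reg))
			return -1;

		/* UART_SCR is register 7; mtty serves it via mdev_access() */
		return pread(device_fd, val, 1, reg.offset + 7) == 1 ? 0 : -1;
	}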