From 22b436c9b5682e877d34425d05576db74a8647e1 Mon Sep 17 00:00:00 2001 From: Yuri Benditovich Date: Mon, 2 Mar 2020 13:50:01 +0200 Subject: virtio-net: Introduce extended RSC feature VIRTIO_NET_F_RSC_EXT feature bit indicates that the device is able to provide extended RSC information. When the feature is negotiated and 'gso_type' field in received packet is not GSO_NONE, the device reports number of coalesced packets in 'csum_start' field and number of duplicated acks in 'csum_offset' field and sets VIRTIO_NET_HDR_F_RSC_INFO in 'flags' field. Signed-off-by: Yuri Benditovich Link: https://lore.kernel.org/r/20200302115003.14877-2-yuri.benditovich@daynix.com Signed-off-by: Michael S. Tsirkin --- include/uapi/linux/virtio_net.h | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index a3715a3224c1..6466c5979a93 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -57,6 +57,7 @@ * Steering */ #define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */ +#define VIRTIO_NET_F_RSC_EXT 61 /* extended coalescing info */ #define VIRTIO_NET_F_STANDBY 62 /* Act as standby for another device * with the same MAC. 
*/ @@ -104,6 +105,7 @@ struct virtio_net_config { struct virtio_net_hdr_v1 { #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start, csum_offset */ #define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */ +#define VIRTIO_NET_HDR_F_RSC_INFO 4 /* rsc info in csum_ fields */ __u8 flags; #define VIRTIO_NET_HDR_GSO_NONE 0 /* Not a GSO frame */ #define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */ @@ -113,8 +115,26 @@ struct virtio_net_hdr_v1 { __u8 gso_type; __virtio16 hdr_len; /* Ethernet + IP + tcp/udp hdrs */ __virtio16 gso_size; /* Bytes to append to hdr_len per frame */ - __virtio16 csum_start; /* Position to start checksumming from */ - __virtio16 csum_offset; /* Offset after that to place checksum */ + union { + struct { + __virtio16 csum_start; + __virtio16 csum_offset; + }; + /* Checksum calculation */ + struct { + /* Position to start checksumming from */ + __virtio16 start; + /* Offset after that to place checksum */ + __virtio16 offset; + } csum; + /* Receive Segment Coalescing */ + struct { + /* Number of coalesced segments */ + __le16 segments; + /* Number of duplicated acks */ + __le16 dup_acks; + } rsc; + }; __virtio16 num_buffers; /* Number of merged rx buffers */ }; -- cgit v1.2.3 From fd58bf674564f1731ca8a61a5150d40383f3df60 Mon Sep 17 00:00:00 2001 From: Yuri Benditovich Date: Mon, 2 Mar 2020 13:50:02 +0200 Subject: virtio-net: Introduce RSS receive steering feature RSS (Receive-side scaling) defines hash calculation rules and decision on receive virtqueue according to the calculated hash, provided mask to apply and provided indirection table containing indices of receive virtqueues. The driver sends the control command to enable multiqueue and provide parameters for receive steering. Signed-off-by: Yuri Benditovich Link: https://lore.kernel.org/r/20200302115003.14877-3-yuri.benditovich@daynix.com Signed-off-by: Michael S. 
Tsirkin --- include/uapi/linux/virtio_net.h | 42 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 6466c5979a93..aec6fac3666a 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -57,6 +57,7 @@ * Steering */ #define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */ +#define VIRTIO_NET_F_RSS 60 /* Supports RSS RX steering */ #define VIRTIO_NET_F_RSC_EXT 61 /* extended coalescing info */ #define VIRTIO_NET_F_STANDBY 62 /* Act as standby for another device * with the same MAC. @@ -70,6 +71,17 @@ #define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ #define VIRTIO_NET_S_ANNOUNCE 2 /* Announcement is needed */ +/* supported/enabled hash types */ +#define VIRTIO_NET_RSS_HASH_TYPE_IPv4 (1 << 0) +#define VIRTIO_NET_RSS_HASH_TYPE_TCPv4 (1 << 1) +#define VIRTIO_NET_RSS_HASH_TYPE_UDPv4 (1 << 2) +#define VIRTIO_NET_RSS_HASH_TYPE_IPv6 (1 << 3) +#define VIRTIO_NET_RSS_HASH_TYPE_TCPv6 (1 << 4) +#define VIRTIO_NET_RSS_HASH_TYPE_UDPv6 (1 << 5) +#define VIRTIO_NET_RSS_HASH_TYPE_IP_EX (1 << 6) +#define VIRTIO_NET_RSS_HASH_TYPE_TCP_EX (1 << 7) +#define VIRTIO_NET_RSS_HASH_TYPE_UDP_EX (1 << 8) + struct virtio_net_config { /* The config defining mac address (if VIRTIO_NET_F_MAC) */ __u8 mac[ETH_ALEN]; @@ -93,6 +105,12 @@ struct virtio_net_config { * Any other value stands for unknown. 
*/ __u8 duplex; + /* maximum size of RSS key */ + __u8 rss_max_key_size; + /* maximum number of indirection table entries */ + __le16 rss_max_indirection_table_length; + /* bitmask of supported VIRTIO_NET_RSS_HASH_ types */ + __le32 supported_hash_types; } __attribute__((packed)); /* @@ -248,7 +266,9 @@ struct virtio_net_ctrl_mac { /* * Control Receive Flow Steering - * + */ +#define VIRTIO_NET_CTRL_MQ 4 +/* * The command VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET * enables Receive Flow Steering, specifying the number of the transmit and * receive queues that will be used. After the command is consumed and acked by @@ -261,11 +281,29 @@ struct virtio_net_ctrl_mq { __virtio16 virtqueue_pairs; }; -#define VIRTIO_NET_CTRL_MQ 4 #define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET 0 #define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN 1 #define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX 0x8000 +/* + * The command VIRTIO_NET_CTRL_MQ_RSS_CONFIG has the same effect as + * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET does and additionally configures + * the receive steering to use a hash calculated for incoming packet + * to decide on receive virtqueue to place the packet. The command + * also provides parameters to calculate a hash and receive virtqueue. + */ +struct virtio_net_rss_config { + __le32 hash_types; + __le16 indirection_table_mask; + __le16 unclassified_queue; + __le16 indirection_table[1/* + indirection_table_mask */]; + __le16 max_tx_vq; + __u8 hash_key_length; + __u8 hash_key_data[/* hash_key_length */]; +}; + + #define VIRTIO_NET_CTRL_MQ_RSS_CONFIG 1 + /* * Control network offloads * -- cgit v1.2.3 From 3024e20958ee9e1554951df4d26aaf9f5cb7c210 Mon Sep 17 00:00:00 2001 From: Yuri Benditovich Date: Mon, 2 Mar 2020 13:50:03 +0200 Subject: virtio-net: Introduce hash report feature The feature VIRTIO_NET_F_HASH_REPORT extends the layout of the packet and requests the device to calculate hash on incoming packets and report it in the packet header. 
Signed-off-by: Yuri Benditovich Link: https://lore.kernel.org/r/20200302115003.14877-4-yuri.benditovich@daynix.com Signed-off-by: Michael S. Tsirkin --- include/uapi/linux/virtio_net.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index aec6fac3666a..19d23e5baa4e 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -57,6 +57,7 @@ * Steering */ #define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */ +#define VIRTIO_NET_F_HASH_REPORT 57 /* Supports hash report */ #define VIRTIO_NET_F_RSS 60 /* Supports RSS RX steering */ #define VIRTIO_NET_F_RSC_EXT 61 /* extended coalescing info */ #define VIRTIO_NET_F_STANDBY 62 /* Act as standby for another device @@ -156,6 +157,23 @@ struct virtio_net_hdr_v1 { __virtio16 num_buffers; /* Number of merged rx buffers */ }; +struct virtio_net_hdr_v1_hash { + struct virtio_net_hdr_v1 hdr; + __le32 hash_value; +#define VIRTIO_NET_HASH_REPORT_NONE 0 +#define VIRTIO_NET_HASH_REPORT_IPv4 1 +#define VIRTIO_NET_HASH_REPORT_TCPv4 2 +#define VIRTIO_NET_HASH_REPORT_UDPv4 3 +#define VIRTIO_NET_HASH_REPORT_IPv6 4 +#define VIRTIO_NET_HASH_REPORT_TCPv6 5 +#define VIRTIO_NET_HASH_REPORT_UDPv6 6 +#define VIRTIO_NET_HASH_REPORT_IPv6_EX 7 +#define VIRTIO_NET_HASH_REPORT_TCPv6_EX 8 +#define VIRTIO_NET_HASH_REPORT_UDPv6_EX 9 + __le16 hash_report; + __le16 padding; +}; + #ifndef VIRTIO_NET_NO_LEGACY /* This header comes first in the scatter-gather list. * For legacy virtio, if VIRTIO_F_ANY_LAYOUT is not negotiated, it must @@ -304,6 +322,24 @@ struct virtio_net_rss_config { #define VIRTIO_NET_CTRL_MQ_RSS_CONFIG 1 +/* + * The command VIRTIO_NET_CTRL_MQ_HASH_CONFIG requests the device + * to include in the virtio header of the packet the value of the + * calculated hash and the report type of hash. It also provides + * parameters for hash calculation. 
The command requires feature + * VIRTIO_NET_F_HASH_REPORT to be negotiated to extend the + * layout of virtio header as defined in virtio_net_hdr_v1_hash. + */ +struct virtio_net_hash_config { + __le32 hash_types; + /* for compatibility with virtio_net_rss_config */ + __le16 reserved[4]; + __u8 hash_key_length; + __u8 hash_key_data[/* hash_key_length */]; +}; + + #define VIRTIO_NET_CTRL_MQ_HASH_CONFIG 2 + /* * Control network offloads * -- cgit v1.2.3 From 4c8cf31885f69e86be0b5b9e6677a26797365e1d Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Thu, 26 Mar 2020 22:01:23 +0800 Subject: vhost: introduce vDPA-based backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces a vDPA-based vhost backend. This backend is built on top of the same interface defined in virtio-vDPA and provides a generic vhost interface for userspace to accelerate the virtio devices in guest. This backend is implemented as a vDPA device driver on top of the same ops used in virtio-vDPA. It will create char device entry named vhost-vdpa-$index for userspace to use. Userspace can use vhost ioctls on top of this char device to setup the backend. Vhost ioctls are extended to make it type agnostic and behave like a virtio device, this helps to eliminate type specific API like what vhost_net/scsi/vsock did: - VHOST_VDPA_GET_DEVICE_ID: get the virtio device ID which is defined by virtio specification to distinguish between different types of devices - VHOST_VDPA_GET_VRING_NUM: get the maximum size of virtqueue supported by the vDPA device - VHOST_VDPA_SET/GET_STATUS: set and get virtio status of vDPA device - VHOST_VDPA_SET/GET_CONFIG: access virtio config space - VHOST_VDPA_SET_VRING_ENABLE: enable a specific virtqueue For memory mapping, IOTLB API is mandated for vhost-vDPA which means userspace drivers are required to use VHOST_IOTLB_UPDATE/VHOST_IOTLB_INVALIDATE to add or remove mapping for a specific userspace memory region. 
The vhost-vDPA API is designed to be type agnostic, but it allows net device only in current stage. Due to the lack of control virtqueue support, some features were filtered out by vhost-vdpa. We will enable more features and devices in the near future. Signed-off-by: Tiwei Bie Signed-off-by: Eugenio Pérez Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20200326140125.19794-8-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vhost/Kconfig | 12 + drivers/vhost/Makefile | 3 + drivers/vhost/vdpa.c | 883 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/vhost.h | 24 ++ include/uapi/linux/vhost_types.h | 8 + 5 files changed, 930 insertions(+) create mode 100644 drivers/vhost/vdpa.c (limited to 'include/uapi') diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 128238488078..362b832f5338 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -59,6 +59,18 @@ config VHOST_VSOCK To compile this driver as a module, choose M here: the module will be called vhost_vsock. +config VHOST_VDPA + tristate "Vhost driver for vDPA-based backend" + depends on EVENTFD + select VHOST + select VDPA + help + This kernel module can be loaded in host kernel to accelerate + guest virtio devices with the vDPA-based backends. + + To compile this driver as a module, choose M here: the module + will be called vhost_vdpa. 
+ config VHOST_CROSS_ENDIAN_LEGACY bool "Cross-endian support for vhost" default n diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index fb831002bcf0..f3e1897cce85 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -10,6 +10,9 @@ vhost_vsock-y := vsock.o obj-$(CONFIG_VHOST_RING) += vringh.o +obj-$(CONFIG_VHOST_VDPA) += vhost_vdpa.o +vhost_vdpa-y := vdpa.o + obj-$(CONFIG_VHOST) += vhost.o obj-$(CONFIG_VHOST_IOTLB) += vhost_iotlb.o diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c new file mode 100644 index 000000000000..421f02a8530a --- /dev/null +++ b/drivers/vhost/vdpa.c @@ -0,0 +1,883 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2018-2020 Intel Corporation. + * Copyright (C) 2020 Red Hat, Inc. + * + * Author: Tiwei Bie + * Jason Wang + * + * Thanks Michael S. Tsirkin for the valuable comments and + * suggestions. And thanks to Cunming Liang and Zhihong Wang for all + * their supports. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vhost.h" + +enum { + VHOST_VDPA_FEATURES = + (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | + (1ULL << VIRTIO_F_ANY_LAYOUT) | + (1ULL << VIRTIO_F_VERSION_1) | + (1ULL << VIRTIO_F_IOMMU_PLATFORM) | + (1ULL << VIRTIO_F_RING_PACKED) | + (1ULL << VIRTIO_F_ORDER_PLATFORM) | + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | + (1ULL << VIRTIO_RING_F_EVENT_IDX), + + VHOST_VDPA_NET_FEATURES = VHOST_VDPA_FEATURES | + (1ULL << VIRTIO_NET_F_CSUM) | + (1ULL << VIRTIO_NET_F_GUEST_CSUM) | + (1ULL << VIRTIO_NET_F_MTU) | + (1ULL << VIRTIO_NET_F_MAC) | + (1ULL << VIRTIO_NET_F_GUEST_TSO4) | + (1ULL << VIRTIO_NET_F_GUEST_TSO6) | + (1ULL << VIRTIO_NET_F_GUEST_ECN) | + (1ULL << VIRTIO_NET_F_GUEST_UFO) | + (1ULL << VIRTIO_NET_F_HOST_TSO4) | + (1ULL << VIRTIO_NET_F_HOST_TSO6) | + (1ULL << VIRTIO_NET_F_HOST_ECN) | + (1ULL << VIRTIO_NET_F_HOST_UFO) | + (1ULL << VIRTIO_NET_F_MRG_RXBUF) | + (1ULL << VIRTIO_NET_F_STATUS) | + (1ULL << 
VIRTIO_NET_F_SPEED_DUPLEX), +}; + +/* Currently, only network backend w/o multiqueue is supported. */ +#define VHOST_VDPA_VQ_MAX 2 + +#define VHOST_VDPA_DEV_MAX (1U << MINORBITS) + +struct vhost_vdpa { + struct vhost_dev vdev; + struct iommu_domain *domain; + struct vhost_virtqueue *vqs; + struct completion completion; + struct vdpa_device *vdpa; + struct device dev; + struct cdev cdev; + atomic_t opened; + int nvqs; + int virtio_id; + int minor; +}; + +static DEFINE_IDA(vhost_vdpa_ida); + +static dev_t vhost_vdpa_major; + +static const u64 vhost_vdpa_features[] = { + [VIRTIO_ID_NET] = VHOST_VDPA_NET_FEATURES, +}; + +static void handle_vq_kick(struct vhost_work *work) +{ + struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, + poll.work); + struct vhost_vdpa *v = container_of(vq->dev, struct vhost_vdpa, vdev); + const struct vdpa_config_ops *ops = v->vdpa->config; + + ops->kick_vq(v->vdpa, vq - v->vqs); +} + +static irqreturn_t vhost_vdpa_virtqueue_cb(void *private) +{ + struct vhost_virtqueue *vq = private; + struct eventfd_ctx *call_ctx = vq->call_ctx; + + if (call_ctx) + eventfd_signal(call_ctx, 1); + + return IRQ_HANDLED; +} + +static void vhost_vdpa_reset(struct vhost_vdpa *v) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + + ops->set_status(vdpa, 0); +} + +static long vhost_vdpa_get_device_id(struct vhost_vdpa *v, u8 __user *argp) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + u32 device_id; + + device_id = ops->get_device_id(vdpa); + + if (copy_to_user(argp, &device_id, sizeof(device_id))) + return -EFAULT; + + return 0; +} + +static long vhost_vdpa_get_status(struct vhost_vdpa *v, u8 __user *statusp) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + u8 status; + + status = ops->get_status(vdpa); + + if (copy_to_user(statusp, &status, sizeof(status))) + return -EFAULT; + + return 0; +} + +static long 
vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + u8 status; + + if (copy_from_user(&status, statusp, sizeof(status))) + return -EFAULT; + + /* + * Userspace shouldn't remove status bits unless reset the + * status to 0. + */ + if (status != 0 && (ops->get_status(vdpa) & ~status) != 0) + return -EINVAL; + + ops->set_status(vdpa, status); + + return 0; +} + +static int vhost_vdpa_config_validate(struct vhost_vdpa *v, + struct vhost_vdpa_config *c) +{ + long size = 0; + + switch (v->virtio_id) { + case VIRTIO_ID_NET: + size = sizeof(struct virtio_net_config); + break; + } + + if (c->len == 0) + return -EINVAL; + + if (c->len > size - c->off) + return -E2BIG; + + return 0; +} + +static long vhost_vdpa_get_config(struct vhost_vdpa *v, + struct vhost_vdpa_config __user *c) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + struct vhost_vdpa_config config; + unsigned long size = offsetof(struct vhost_vdpa_config, buf); + u8 *buf; + + if (copy_from_user(&config, c, size)) + return -EFAULT; + if (vhost_vdpa_config_validate(v, &config)) + return -EINVAL; + buf = kvzalloc(config.len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + ops->get_config(vdpa, config.off, buf, config.len); + + if (copy_to_user(c->buf, buf, config.len)) { + kvfree(buf); + return -EFAULT; + } + + kvfree(buf); + return 0; +} + +static long vhost_vdpa_set_config(struct vhost_vdpa *v, + struct vhost_vdpa_config __user *c) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + struct vhost_vdpa_config config; + unsigned long size = offsetof(struct vhost_vdpa_config, buf); + u8 *buf; + + if (copy_from_user(&config, c, size)) + return -EFAULT; + if (vhost_vdpa_config_validate(v, &config)) + return -EINVAL; + buf = kvzalloc(config.len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, c->buf, 
config.len)) { + kvfree(buf); + return -EFAULT; + } + + ops->set_config(vdpa, config.off, buf, config.len); + + kvfree(buf); + return 0; +} + +static long vhost_vdpa_get_features(struct vhost_vdpa *v, u64 __user *featurep) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + u64 features; + + features = ops->get_features(vdpa); + features &= vhost_vdpa_features[v->virtio_id]; + + if (copy_to_user(featurep, &features, sizeof(features))) + return -EFAULT; + + return 0; +} + +static long vhost_vdpa_set_features(struct vhost_vdpa *v, u64 __user *featurep) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + u64 features; + + /* + * It's not allowed to change the features after they have + * been negotiated. + */ + if (ops->get_status(vdpa) & VIRTIO_CONFIG_S_FEATURES_OK) + return -EBUSY; + + if (copy_from_user(&features, featurep, sizeof(features))) + return -EFAULT; + + if (features & ~vhost_vdpa_features[v->virtio_id]) + return -EINVAL; + + if (ops->set_features(vdpa, features)) + return -EINVAL; + + return 0; +} + +static long vhost_vdpa_get_vring_num(struct vhost_vdpa *v, u16 __user *argp) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + u16 num; + + num = ops->get_vq_num_max(vdpa); + + if (copy_to_user(argp, &num, sizeof(num))) + return -EFAULT; + + return 0; +} + +static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, + void __user *argp) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + struct vdpa_callback cb; + struct vhost_virtqueue *vq; + struct vhost_vring_state s; + u8 status; + u32 idx; + long r; + + r = get_user(idx, (u32 __user *)argp); + if (r < 0) + return r; + + if (idx >= v->nvqs) + return -ENOBUFS; + + idx = array_index_nospec(idx, v->nvqs); + vq = &v->vqs[idx]; + + status = ops->get_status(vdpa); + + if (cmd == VHOST_VDPA_SET_VRING_ENABLE) { + if 
(copy_from_user(&s, argp, sizeof(s))) + return -EFAULT; + ops->set_vq_ready(vdpa, idx, s.num); + return 0; + } + + if (cmd == VHOST_GET_VRING_BASE) + vq->last_avail_idx = ops->get_vq_state(v->vdpa, idx); + + r = vhost_vring_ioctl(&v->vdev, cmd, argp); + if (r) + return r; + + switch (cmd) { + case VHOST_SET_VRING_ADDR: + if (ops->set_vq_address(vdpa, idx, + (u64)(uintptr_t)vq->desc, + (u64)(uintptr_t)vq->avail, + (u64)(uintptr_t)vq->used)) + r = -EINVAL; + break; + + case VHOST_SET_VRING_BASE: + if (ops->set_vq_state(vdpa, idx, vq->last_avail_idx)) + r = -EINVAL; + break; + + case VHOST_SET_VRING_CALL: + if (vq->call_ctx) { + cb.callback = vhost_vdpa_virtqueue_cb; + cb.private = vq; + } else { + cb.callback = NULL; + cb.private = NULL; + } + ops->set_vq_cb(vdpa, idx, &cb); + break; + + case VHOST_SET_VRING_NUM: + ops->set_vq_num(vdpa, idx, vq->num); + break; + } + + return r; +} + +static long vhost_vdpa_unlocked_ioctl(struct file *filep, + unsigned int cmd, unsigned long arg) +{ + struct vhost_vdpa *v = filep->private_data; + struct vhost_dev *d = &v->vdev; + void __user *argp = (void __user *)arg; + long r; + + mutex_lock(&d->mutex); + + switch (cmd) { + case VHOST_VDPA_GET_DEVICE_ID: + r = vhost_vdpa_get_device_id(v, argp); + break; + case VHOST_VDPA_GET_STATUS: + r = vhost_vdpa_get_status(v, argp); + break; + case VHOST_VDPA_SET_STATUS: + r = vhost_vdpa_set_status(v, argp); + break; + case VHOST_VDPA_GET_CONFIG: + r = vhost_vdpa_get_config(v, argp); + break; + case VHOST_VDPA_SET_CONFIG: + r = vhost_vdpa_set_config(v, argp); + break; + case VHOST_GET_FEATURES: + r = vhost_vdpa_get_features(v, argp); + break; + case VHOST_SET_FEATURES: + r = vhost_vdpa_set_features(v, argp); + break; + case VHOST_VDPA_GET_VRING_NUM: + r = vhost_vdpa_get_vring_num(v, argp); + break; + case VHOST_SET_LOG_BASE: + case VHOST_SET_LOG_FD: + r = -ENOIOCTLCMD; + break; + default: + r = vhost_dev_ioctl(&v->vdev, cmd, argp); + if (r == -ENOIOCTLCMD) + r = vhost_vdpa_vring_ioctl(v, cmd, 
argp); + break; + } + + mutex_unlock(&d->mutex); + return r; +} + +static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, u64 start, u64 last) +{ + struct vhost_dev *dev = &v->vdev; + struct vhost_iotlb *iotlb = dev->iotlb; + struct vhost_iotlb_map *map; + struct page *page; + unsigned long pfn, pinned; + + while ((map = vhost_iotlb_itree_first(iotlb, start, last)) != NULL) { + pinned = map->size >> PAGE_SHIFT; + for (pfn = map->addr >> PAGE_SHIFT; + pinned > 0; pfn++, pinned--) { + page = pfn_to_page(pfn); + if (map->perm & VHOST_ACCESS_WO) + set_page_dirty_lock(page); + unpin_user_page(page); + } + atomic64_sub(map->size >> PAGE_SHIFT, &dev->mm->pinned_vm); + vhost_iotlb_map_free(iotlb, map); + } +} + +static void vhost_vdpa_iotlb_free(struct vhost_vdpa *v) +{ + struct vhost_dev *dev = &v->vdev; + + vhost_vdpa_iotlb_unmap(v, 0ULL, 0ULL - 1); + kfree(dev->iotlb); + dev->iotlb = NULL; +} + +static int perm_to_iommu_flags(u32 perm) +{ + int flags = 0; + + switch (perm) { + case VHOST_ACCESS_WO: + flags |= IOMMU_WRITE; + break; + case VHOST_ACCESS_RO: + flags |= IOMMU_READ; + break; + case VHOST_ACCESS_RW: + flags |= (IOMMU_WRITE | IOMMU_READ); + break; + default: + WARN(1, "invalidate vhost IOTLB permission\n"); + break; + } + + return flags | IOMMU_CACHE; +} + +static int vhost_vdpa_map(struct vhost_vdpa *v, + u64 iova, u64 size, u64 pa, u32 perm) +{ + struct vhost_dev *dev = &v->vdev; + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + int r = 0; + + r = vhost_iotlb_add_range(dev->iotlb, iova, iova + size - 1, + pa, perm); + if (r) + return r; + + if (ops->dma_map) + r = ops->dma_map(vdpa, iova, size, pa, perm); + else if (ops->set_map) + r = ops->set_map(vdpa, dev->iotlb); + else + r = iommu_map(v->domain, iova, pa, size, + perm_to_iommu_flags(perm)); + + return r; +} + +static void vhost_vdpa_unmap(struct vhost_vdpa *v, u64 iova, u64 size) +{ + struct vhost_dev *dev = &v->vdev; + struct vdpa_device *vdpa = v->vdpa; + 
const struct vdpa_config_ops *ops = vdpa->config; + + vhost_vdpa_iotlb_unmap(v, iova, iova + size - 1); + + if (ops->dma_map) + ops->dma_unmap(vdpa, iova, size); + else if (ops->set_map) + ops->set_map(vdpa, dev->iotlb); + else + iommu_unmap(v->domain, iova, size); +} + +static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, + struct vhost_iotlb_msg *msg) +{ + struct vhost_dev *dev = &v->vdev; + struct vhost_iotlb *iotlb = dev->iotlb; + struct page **page_list; + unsigned long list_size = PAGE_SIZE / sizeof(struct page *); + unsigned int gup_flags = FOLL_LONGTERM; + unsigned long npages, cur_base, map_pfn, last_pfn = 0; + unsigned long locked, lock_limit, pinned, i; + u64 iova = msg->iova; + int ret = 0; + + if (vhost_iotlb_itree_first(iotlb, msg->iova, + msg->iova + msg->size - 1)) + return -EEXIST; + + page_list = (struct page **) __get_free_page(GFP_KERNEL); + if (!page_list) + return -ENOMEM; + + if (msg->perm & VHOST_ACCESS_WO) + gup_flags |= FOLL_WRITE; + + npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT; + if (!npages) + return -EINVAL; + + down_read(&dev->mm->mmap_sem); + + locked = atomic64_add_return(npages, &dev->mm->pinned_vm); + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if (locked > lock_limit) { + ret = -ENOMEM; + goto out; + } + + cur_base = msg->uaddr & PAGE_MASK; + iova &= PAGE_MASK; + + while (npages) { + pinned = min_t(unsigned long, npages, list_size); + ret = pin_user_pages(cur_base, pinned, + gup_flags, page_list, NULL); + if (ret != pinned) + goto out; + + if (!last_pfn) + map_pfn = page_to_pfn(page_list[0]); + + for (i = 0; i < ret; i++) { + unsigned long this_pfn = page_to_pfn(page_list[i]); + u64 csize; + + if (last_pfn && (this_pfn != last_pfn + 1)) { + /* Pin a contiguous chunk of memory */ + csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT; + if (vhost_vdpa_map(v, iova, csize, + map_pfn << PAGE_SHIFT, + msg->perm)) + goto out; + map_pfn = this_pfn; + iova += csize; + } + + last_pfn = this_pfn; + } 
+ + cur_base += ret << PAGE_SHIFT; + npages -= ret; + } + + /* Pin the rest chunk */ + ret = vhost_vdpa_map(v, iova, (last_pfn - map_pfn + 1) << PAGE_SHIFT, + map_pfn << PAGE_SHIFT, msg->perm); +out: + if (ret) { + vhost_vdpa_unmap(v, msg->iova, msg->size); + atomic64_sub(npages, &dev->mm->pinned_vm); + } + up_read(&dev->mm->mmap_sem); + free_page((unsigned long)page_list); + return ret; +} + +static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, + struct vhost_iotlb_msg *msg) +{ + struct vhost_vdpa *v = container_of(dev, struct vhost_vdpa, vdev); + int r = 0; + + r = vhost_dev_check_owner(dev); + if (r) + return r; + + switch (msg->type) { + case VHOST_IOTLB_UPDATE: + r = vhost_vdpa_process_iotlb_update(v, msg); + break; + case VHOST_IOTLB_INVALIDATE: + vhost_vdpa_unmap(v, msg->iova, msg->size); + break; + default: + r = -EINVAL; + break; + } + + return r; +} + +static ssize_t vhost_vdpa_chr_write_iter(struct kiocb *iocb, + struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct vhost_vdpa *v = file->private_data; + struct vhost_dev *dev = &v->vdev; + + return vhost_chr_write_iter(dev, from); +} + +static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + struct device *dma_dev = vdpa_get_dma_dev(vdpa); + struct bus_type *bus; + int ret; + + /* Device want to do DMA by itself */ + if (ops->set_map || ops->dma_map) + return 0; + + bus = dma_dev->bus; + if (!bus) + return -EFAULT; + + if (!iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY)) + return -ENOTSUPP; + + v->domain = iommu_domain_alloc(bus); + if (!v->domain) + return -EIO; + + ret = iommu_attach_device(v->domain, dma_dev); + if (ret) + goto err_attach; + + return 0; + +err_attach: + iommu_domain_free(v->domain); + return ret; +} + +static void vhost_vdpa_free_domain(struct vhost_vdpa *v) +{ + struct vdpa_device *vdpa = v->vdpa; + struct device *dma_dev = vdpa_get_dma_dev(vdpa); + + if 
(v->domain) { + iommu_detach_device(v->domain, dma_dev); + iommu_domain_free(v->domain); + } + + v->domain = NULL; +} + +static int vhost_vdpa_open(struct inode *inode, struct file *filep) +{ + struct vhost_vdpa *v; + struct vhost_dev *dev; + struct vhost_virtqueue **vqs; + int nvqs, i, r, opened; + + v = container_of(inode->i_cdev, struct vhost_vdpa, cdev); + if (!v) + return -ENODEV; + + opened = atomic_cmpxchg(&v->opened, 0, 1); + if (opened) + return -EBUSY; + + nvqs = v->nvqs; + vhost_vdpa_reset(v); + + vqs = kmalloc_array(nvqs, sizeof(*vqs), GFP_KERNEL); + if (!vqs) { + r = -ENOMEM; + goto err; + } + + dev = &v->vdev; + for (i = 0; i < nvqs; i++) { + vqs[i] = &v->vqs[i]; + vqs[i]->handle_kick = handle_vq_kick; + } + vhost_dev_init(dev, vqs, nvqs, 0, 0, 0, + vhost_vdpa_process_iotlb_msg); + + dev->iotlb = vhost_iotlb_alloc(0, 0); + if (!dev->iotlb) { + r = -ENOMEM; + goto err_init_iotlb; + } + + r = vhost_vdpa_alloc_domain(v); + if (r) + goto err_init_iotlb; + + filep->private_data = v; + + return 0; + +err_init_iotlb: + vhost_dev_cleanup(&v->vdev); +err: + atomic_dec(&v->opened); + return r; +} + +static int vhost_vdpa_release(struct inode *inode, struct file *filep) +{ + struct vhost_vdpa *v = filep->private_data; + struct vhost_dev *d = &v->vdev; + + mutex_lock(&d->mutex); + filep->private_data = NULL; + vhost_vdpa_reset(v); + vhost_dev_stop(&v->vdev); + vhost_vdpa_iotlb_free(v); + vhost_vdpa_free_domain(v); + vhost_dev_cleanup(&v->vdev); + kfree(v->vdev.vqs); + mutex_unlock(&d->mutex); + + atomic_dec(&v->opened); + complete(&v->completion); + + return 0; +} + +static const struct file_operations vhost_vdpa_fops = { + .owner = THIS_MODULE, + .open = vhost_vdpa_open, + .release = vhost_vdpa_release, + .write_iter = vhost_vdpa_chr_write_iter, + .unlocked_ioctl = vhost_vdpa_unlocked_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; + +static void vhost_vdpa_release_dev(struct device *device) +{ + struct vhost_vdpa *v = + container_of(device, struct vhost_vdpa, 
dev); + + ida_simple_remove(&vhost_vdpa_ida, v->minor); + kfree(v->vqs); + kfree(v); +} + +static int vhost_vdpa_probe(struct vdpa_device *vdpa) +{ + const struct vdpa_config_ops *ops = vdpa->config; + struct vhost_vdpa *v; + int minor, nvqs = VHOST_VDPA_VQ_MAX; + int r; + + /* Currently, we only accept the network devices. */ + if (ops->get_device_id(vdpa) != VIRTIO_ID_NET) + return -ENOTSUPP; + + v = kzalloc(sizeof(*v), GFP_KERNEL | __GFP_RETRY_MAYFAIL); + if (!v) + return -ENOMEM; + + minor = ida_simple_get(&vhost_vdpa_ida, 0, + VHOST_VDPA_DEV_MAX, GFP_KERNEL); + if (minor < 0) { + kfree(v); + return minor; + } + + atomic_set(&v->opened, 0); + v->minor = minor; + v->vdpa = vdpa; + v->nvqs = nvqs; + v->virtio_id = ops->get_device_id(vdpa); + + device_initialize(&v->dev); + v->dev.release = vhost_vdpa_release_dev; + v->dev.parent = &vdpa->dev; + v->dev.devt = MKDEV(MAJOR(vhost_vdpa_major), minor); + v->vqs = kmalloc_array(nvqs, sizeof(struct vhost_virtqueue), + GFP_KERNEL); + if (!v->vqs) { + r = -ENOMEM; + goto err; + } + + r = dev_set_name(&v->dev, "vhost-vdpa-%u", minor); + if (r) + goto err; + + cdev_init(&v->cdev, &vhost_vdpa_fops); + v->cdev.owner = THIS_MODULE; + + r = cdev_device_add(&v->cdev, &v->dev); + if (r) + goto err; + + init_completion(&v->completion); + vdpa_set_drvdata(vdpa, v); + + return 0; + +err: + put_device(&v->dev); + return r; +} + +static void vhost_vdpa_remove(struct vdpa_device *vdpa) +{ + struct vhost_vdpa *v = vdpa_get_drvdata(vdpa); + int opened; + + cdev_device_del(&v->cdev, &v->dev); + + do { + opened = atomic_cmpxchg(&v->opened, 0, 1); + if (!opened) + break; + wait_for_completion(&v->completion); + } while (1); + + put_device(&v->dev); +} + +static struct vdpa_driver vhost_vdpa_driver = { + .driver = { + .name = "vhost_vdpa", + }, + .probe = vhost_vdpa_probe, + .remove = vhost_vdpa_remove, +}; + +static int __init vhost_vdpa_init(void) +{ + int r; + + r = alloc_chrdev_region(&vhost_vdpa_major, 0, VHOST_VDPA_DEV_MAX, + 
"vhost-vdpa"); + if (r) + goto err_alloc_chrdev; + + r = vdpa_register_driver(&vhost_vdpa_driver); + if (r) + goto err_vdpa_register_driver; + + return 0; + +err_vdpa_register_driver: + unregister_chrdev_region(vhost_vdpa_major, VHOST_VDPA_DEV_MAX); +err_alloc_chrdev: + return r; +} +module_init(vhost_vdpa_init); + +static void __exit vhost_vdpa_exit(void) +{ + vdpa_unregister_driver(&vhost_vdpa_driver); + unregister_chrdev_region(vhost_vdpa_major, VHOST_VDPA_DEV_MAX); +} +module_exit(vhost_vdpa_exit); + +MODULE_VERSION("0.0.1"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("vDPA-based vhost backend for virtio"); diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 40d028eed645..9fe72e4b1373 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -116,4 +116,28 @@ #define VHOST_VSOCK_SET_GUEST_CID _IOW(VHOST_VIRTIO, 0x60, __u64) #define VHOST_VSOCK_SET_RUNNING _IOW(VHOST_VIRTIO, 0x61, int) +/* VHOST_VDPA specific defines */ + +/* Get the device id. The device ids follow the same definition of + * the device id defined in virtio-spec. + */ +#define VHOST_VDPA_GET_DEVICE_ID _IOR(VHOST_VIRTIO, 0x70, __u32) +/* Get and set the status. The status bits follow the same definition + * of the device status defined in virtio-spec. + */ +#define VHOST_VDPA_GET_STATUS _IOR(VHOST_VIRTIO, 0x71, __u8) +#define VHOST_VDPA_SET_STATUS _IOW(VHOST_VIRTIO, 0x72, __u8) +/* Get and set the device config. The device config follows the same + * definition of the device config defined in virtio-spec. + */ +#define VHOST_VDPA_GET_CONFIG _IOR(VHOST_VIRTIO, 0x73, \ + struct vhost_vdpa_config) +#define VHOST_VDPA_SET_CONFIG _IOW(VHOST_VIRTIO, 0x74, \ + struct vhost_vdpa_config) +/* Enable/disable the ring. */ +#define VHOST_VDPA_SET_VRING_ENABLE _IOW(VHOST_VIRTIO, 0x75, \ + struct vhost_vring_state) +/* Get the max ring size. 
*/ +#define VHOST_VDPA_GET_VRING_NUM _IOR(VHOST_VIRTIO, 0x76, __u16) + #endif diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h index c907290ff065..669457ce5c48 100644 --- a/include/uapi/linux/vhost_types.h +++ b/include/uapi/linux/vhost_types.h @@ -119,6 +119,14 @@ struct vhost_scsi_target { unsigned short reserved; }; +/* VHOST_VDPA specific definitions */ + +struct vhost_vdpa_config { + __u32 off; + __u32 len; + __u8 buf[0]; +}; + /* Feature bits */ /* Log all write descriptors. Can be changed while device is active. */ #define VHOST_F_LOG_ALL 26 -- cgit v1.2.3