summaryrefslogtreecommitdiffstats
path: root/drivers/vhost
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-08-15 15:04:25 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2018-08-15 15:04:25 -0700
commit9a76aba02a37718242d7cdc294f0a3901928aa57 (patch)
tree2040d038f85d2120f21af83b0793efd5af1864e3 /drivers/vhost
parent0a957467c5fd46142bc9c52758ffc552d4c5e2f7 (diff)
parent26a1ccc6c117be8e33e0410fce8c5298b0015b99 (diff)
downloadlinux-9a76aba02a37718242d7cdc294f0a3901928aa57.tar.bz2
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: "Highlights: - Gustavo A. R. Silva keeps working on the implicit switch fallthru changes. - Support 802.11ax High-Efficiency wireless in cfg80211 et al, From Luca Coelho. - Re-enable ASPM in r8169, from Kai-Heng Feng. - Add virtual XFRM interfaces, which avoids all of the limitations of existing IPSEC tunnels. From Steffen Klassert. - Convert GRO over to use a hash table, so that when we have many flows active we don't traverse a long list during accumluation. - Many new self tests for routing, TC, tunnels, etc. Too many contributors to mention them all, but I'm really happy to keep seeing this stuff. - Hardware timestamping support for dpaa_eth/fsl-fman from Yangbo Lu. - Lots of cleanups and fixes in L2TP code from Guillaume Nault. - Add IPSEC offload support to netdevsim, from Shannon Nelson. - Add support for slotting with non-uniform distribution to netem packet scheduler, from Yousuk Seung. - Add UDP GSO support to mlx5e, from Boris Pismenny. - Support offloading of Team LAG in NFP, from John Hurley. - Allow to configure TX queue selection based upon RX queue, from Amritha Nambiar. - Support ethtool ring size configuration in aquantia, from Anton Mikaev. - Support DSCP and flowlabel per-transport in SCTP, from Xin Long. - Support list based batching and stack traversal of SKBs, this is very exciting work. From Edward Cree. - Busyloop optimizations in vhost_net, from Toshiaki Makita. - Introduce the ETF qdisc, which allows time based transmissions. IGB can offload this in hardware. From Vinicius Costa Gomes. - Add parameter support to devlink, from Moshe Shemesh. - Several multiplication and division optimizations for BPF JIT in nfp driver, from Jiong Wang. - Lots of prepatory work to make more of the packet scheduler layer lockless, when possible, from Vlad Buslov. - Add ACK filter and NAT awareness to sch_cake packet scheduler, from Toke Høiland-Jørgensen. - Support regions and region snapshots in devlink, from Alex Vesker. - Allow to attach XDP programs to both HW and SW at the same time on a given device, with initial support in nfp. From Jakub Kicinski. - Add TLS RX offload and support in mlx5, from Ilya Lesokhin. - Use PHYLIB in r8169 driver, from Heiner Kallweit. - All sorts of changes to support Spectrum 2 in mlxsw driver, from Ido Schimmel. - PTP support in mv88e6xxx DSA driver, from Andrew Lunn. - Make TCP_USER_TIMEOUT socket option more accurate, from Jon Maxwell. - Support for templates in packet scheduler classifier, from Jiri Pirko. - IPV6 support in RDS, from Ka-Cheong Poon. - Native tproxy support in nf_tables, from Máté Eckl. - Maintain IP fragment queue in an rbtree, but optimize properly for in-order frags. From Peter Oskolkov. - Improvde handling of ACKs on hole repairs, from Yuchung Cheng" * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1996 commits) bpf: test: fix spelling mistake "REUSEEPORT" -> "REUSEPORT" hv/netvsc: Fix NULL dereference at single queue mode fallback net: filter: mark expected switch fall-through xen-netfront: fix warn message as irq device name has '/' cxgb4: Add new T5 PCI device ids 0x50af and 0x50b0 net: dsa: mv88e6xxx: missing unlock on error path rds: fix building with IPV6=m inet/connection_sock: prefer _THIS_IP_ to current_text_addr net: dsa: mv88e6xxx: bitwise vs logical bug net: sock_diag: Fix spectre v1 gadget in __sock_diag_cmd() ieee802154: hwsim: using right kind of iteration net: hns3: Add vlan filter setting by ethtool command -K net: hns3: Set tx ring' tc info when netdev is up net: hns3: Remove tx ring BD len register in hns3_enet net: hns3: Fix desc num set to default when setting channel net: hns3: Fix for phy link issue when using marvell phy driver net: hns3: Fix for information of phydev lost problem when down/up net: hns3: Fix for command format parsing error in hclge_is_all_function_id_zero net: hns3: Add support for serdes loopback selftest bnxt_en: take coredump_record structure off stack ...
Diffstat (limited to 'drivers/vhost')
-rw-r--r--drivers/vhost/net.c370
-rw-r--r--drivers/vhost/vhost.c71
-rw-r--r--drivers/vhost/vhost.h11
3 files changed, 330 insertions, 122 deletions
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 29756d88799b..4e656f89cb22 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -78,6 +78,10 @@ enum {
};
enum {
+ VHOST_NET_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2)
+};
+
+enum {
VHOST_NET_VQ_RX = 0,
VHOST_NET_VQ_TX = 1,
VHOST_NET_VQ_MAX = 2,
@@ -94,7 +98,7 @@ struct vhost_net_ubuf_ref {
struct vhost_virtqueue *vq;
};
-#define VHOST_RX_BATCH 64
+#define VHOST_NET_BATCH 64
struct vhost_net_buf {
void **queue;
int tail;
@@ -168,7 +172,7 @@ static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
rxq->head = 0;
rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue,
- VHOST_RX_BATCH);
+ VHOST_NET_BATCH);
return rxq->tail;
}
@@ -396,13 +400,10 @@ static inline unsigned long busy_clock(void)
return local_clock() >> 10;
}
-static bool vhost_can_busy_poll(struct vhost_dev *dev,
- unsigned long endtime)
+static bool vhost_can_busy_poll(unsigned long endtime)
{
- return likely(!need_resched()) &&
- likely(!time_after(busy_clock(), endtime)) &&
- likely(!signal_pending(current)) &&
- !vhost_has_work(dev);
+ return likely(!need_resched() && !time_after(busy_clock(), endtime) &&
+ !signal_pending(current));
}
static void vhost_net_disable_vq(struct vhost_net *n,
@@ -431,21 +432,42 @@ static int vhost_net_enable_vq(struct vhost_net *n,
return vhost_poll_start(poll, sock->file);
}
+static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
+{
+ struct vhost_virtqueue *vq = &nvq->vq;
+ struct vhost_dev *dev = vq->dev;
+
+ if (!nvq->done_idx)
+ return;
+
+ vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx);
+ nvq->done_idx = 0;
+}
+
static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
- struct vhost_virtqueue *vq,
- struct iovec iov[], unsigned int iov_size,
- unsigned int *out_num, unsigned int *in_num)
+ struct vhost_net_virtqueue *nvq,
+ unsigned int *out_num, unsigned int *in_num,
+ bool *busyloop_intr)
{
+ struct vhost_virtqueue *vq = &nvq->vq;
unsigned long uninitialized_var(endtime);
int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
out_num, in_num, NULL, NULL);
if (r == vq->num && vq->busyloop_timeout) {
+ if (!vhost_sock_zcopy(vq->private_data))
+ vhost_net_signal_used(nvq);
preempt_disable();
endtime = busy_clock() + vq->busyloop_timeout;
- while (vhost_can_busy_poll(vq->dev, endtime) &&
- vhost_vq_avail_empty(vq->dev, vq))
+ while (vhost_can_busy_poll(endtime)) {
+ if (vhost_has_work(vq->dev)) {
+ *busyloop_intr = true;
+ break;
+ }
+ if (!vhost_vq_avail_empty(vq->dev, vq))
+ break;
cpu_relax();
+ }
preempt_enable();
r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
out_num, in_num, NULL, NULL);
@@ -463,9 +485,62 @@ static bool vhost_exceeds_maxpend(struct vhost_net *net)
min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2);
}
-/* Expects to be always run from workqueue - which acts as
- * read-size critical section for our kind of RCU. */
-static void handle_tx(struct vhost_net *net)
+static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,
+ size_t hdr_size, int out)
+{
+ /* Skip header. TODO: support TSO. */
+ size_t len = iov_length(vq->iov, out);
+
+ iov_iter_init(iter, WRITE, vq->iov, out, len);
+ iov_iter_advance(iter, hdr_size);
+
+ return iov_iter_count(iter);
+}
+
+static bool vhost_exceeds_weight(int pkts, int total_len)
+{
+ return total_len >= VHOST_NET_WEIGHT ||
+ pkts >= VHOST_NET_PKT_WEIGHT;
+}
+
+static int get_tx_bufs(struct vhost_net *net,
+ struct vhost_net_virtqueue *nvq,
+ struct msghdr *msg,
+ unsigned int *out, unsigned int *in,
+ size_t *len, bool *busyloop_intr)
+{
+ struct vhost_virtqueue *vq = &nvq->vq;
+ int ret;
+
+ ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, busyloop_intr);
+
+ if (ret < 0 || ret == vq->num)
+ return ret;
+
+ if (*in) {
+ vq_err(vq, "Unexpected descriptor format for TX: out %d, int %d\n",
+ *out, *in);
+ return -EFAULT;
+ }
+
+ /* Sanity check */
+ *len = init_iov_iter(vq, &msg->msg_iter, nvq->vhost_hlen, *out);
+ if (*len == 0) {
+ vq_err(vq, "Unexpected header len for TX: %zd expected %zd\n",
+ *len, nvq->vhost_hlen);
+ return -EFAULT;
+ }
+
+ return ret;
+}
+
+static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len)
+{
+ return total_len < VHOST_NET_WEIGHT &&
+ !vhost_vq_avail_empty(vq->dev, vq);
+}
+
+static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
{
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
struct vhost_virtqueue *vq = &nvq->vq;
@@ -480,67 +555,103 @@ static void handle_tx(struct vhost_net *net)
};
size_t len, total_len = 0;
int err;
- size_t hdr_size;
- struct socket *sock;
- struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
- bool zcopy, zcopy_used;
int sent_pkts = 0;
- mutex_lock(&vq->mutex);
- sock = vq->private_data;
- if (!sock)
- goto out;
+ for (;;) {
+ bool busyloop_intr = false;
- if (!vq_iotlb_prefetch(vq))
- goto out;
+ head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
+ &busyloop_intr);
+ /* On error, stop handling until the next kick. */
+ if (unlikely(head < 0))
+ break;
+ /* Nothing new? Wait for eventfd to tell us they refilled. */
+ if (head == vq->num) {
+ if (unlikely(busyloop_intr)) {
+ vhost_poll_queue(&vq->poll);
+ } else if (unlikely(vhost_enable_notify(&net->dev,
+ vq))) {
+ vhost_disable_notify(&net->dev, vq);
+ continue;
+ }
+ break;
+ }
- vhost_disable_notify(&net->dev, vq);
- vhost_net_disable_vq(net, vq);
+ vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
+ vq->heads[nvq->done_idx].len = 0;
- hdr_size = nvq->vhost_hlen;
- zcopy = nvq->ubufs;
+ total_len += len;
+ if (tx_can_batch(vq, total_len))
+ msg.msg_flags |= MSG_MORE;
+ else
+ msg.msg_flags &= ~MSG_MORE;
+
+ /* TODO: Check specific error and bomb out unless ENOBUFS? */
+ err = sock->ops->sendmsg(sock, &msg, len);
+ if (unlikely(err < 0)) {
+ vhost_discard_vq_desc(vq, 1);
+ vhost_net_enable_vq(net, vq);
+ break;
+ }
+ if (err != len)
+ pr_debug("Truncated TX packet: len %d != %zd\n",
+ err, len);
+ if (++nvq->done_idx >= VHOST_NET_BATCH)
+ vhost_net_signal_used(nvq);
+ if (vhost_exceeds_weight(++sent_pkts, total_len)) {
+ vhost_poll_queue(&vq->poll);
+ break;
+ }
+ }
+
+ vhost_net_signal_used(nvq);
+}
+
+static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
+{
+ struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
+ struct vhost_virtqueue *vq = &nvq->vq;
+ unsigned out, in;
+ int head;
+ struct msghdr msg = {
+ .msg_name = NULL,
+ .msg_namelen = 0,
+ .msg_control = NULL,
+ .msg_controllen = 0,
+ .msg_flags = MSG_DONTWAIT,
+ };
+ size_t len, total_len = 0;
+ int err;
+ struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
+ bool zcopy_used;
+ int sent_pkts = 0;
for (;;) {
- /* Release DMAs done buffers first */
- if (zcopy)
- vhost_zerocopy_signal_used(net, vq);
+ bool busyloop_intr;
+ /* Release DMAs done buffers first */
+ vhost_zerocopy_signal_used(net, vq);
- head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
- ARRAY_SIZE(vq->iov),
- &out, &in);
+ busyloop_intr = false;
+ head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
+ &busyloop_intr);
/* On error, stop handling until the next kick. */
if (unlikely(head < 0))
break;
/* Nothing new? Wait for eventfd to tell us they refilled. */
if (head == vq->num) {
- if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+ if (unlikely(busyloop_intr)) {
+ vhost_poll_queue(&vq->poll);
+ } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
vhost_disable_notify(&net->dev, vq);
continue;
}
break;
}
- if (in) {
- vq_err(vq, "Unexpected descriptor format for TX: "
- "out %d, int %d\n", out, in);
- break;
- }
- /* Skip header. TODO: support TSO. */
- len = iov_length(vq->iov, out);
- iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
- iov_iter_advance(&msg.msg_iter, hdr_size);
- /* Sanity check */
- if (!msg_data_left(&msg)) {
- vq_err(vq, "Unexpected header len for TX: "
- "%zd expected %zd\n",
- len, hdr_size);
- break;
- }
- len = msg_data_left(&msg);
- zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
- && !vhost_exceeds_maxpend(net)
- && vhost_net_tx_select_zcopy(net);
+ zcopy_used = len >= VHOST_GOODCOPY_LEN
+ && !vhost_exceeds_maxpend(net)
+ && vhost_net_tx_select_zcopy(net);
/* use msg_control to pass vhost zerocopy ubuf info to skb */
if (zcopy_used) {
@@ -562,10 +673,8 @@ static void handle_tx(struct vhost_net *net)
msg.msg_control = NULL;
ubufs = NULL;
}
-
total_len += len;
- if (total_len < VHOST_NET_WEIGHT &&
- !vhost_vq_avail_empty(&net->dev, vq) &&
+ if (tx_can_batch(vq, total_len) &&
likely(!vhost_exceeds_maxpend(net))) {
msg.msg_flags |= MSG_MORE;
} else {
@@ -592,12 +701,37 @@ static void handle_tx(struct vhost_net *net)
else
vhost_zerocopy_signal_used(net, vq);
vhost_net_tx_packet(net);
- if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
- unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT)) {
+ if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) {
vhost_poll_queue(&vq->poll);
break;
}
}
+}
+
+/* Expects to be always run from workqueue - which acts as
+ * read-size critical section for our kind of RCU. */
+static void handle_tx(struct vhost_net *net)
+{
+ struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
+ struct vhost_virtqueue *vq = &nvq->vq;
+ struct socket *sock;
+
+ mutex_lock(&vq->mutex);
+ sock = vq->private_data;
+ if (!sock)
+ goto out;
+
+ if (!vq_iotlb_prefetch(vq))
+ goto out;
+
+ vhost_disable_notify(&net->dev, vq);
+ vhost_net_disable_vq(net, vq);
+
+ if (vhost_sock_zcopy(sock))
+ handle_tx_zerocopy(net, sock);
+ else
+ handle_tx_copy(net, sock);
+
out:
mutex_unlock(&vq->mutex);
}
@@ -633,53 +767,50 @@ static int sk_has_rx_data(struct sock *sk)
return skb_queue_empty(&sk->sk_receive_queue);
}
-static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
+static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
+ bool *busyloop_intr)
{
- struct vhost_virtqueue *vq = &nvq->vq;
- struct vhost_dev *dev = vq->dev;
-
- if (!nvq->done_idx)
- return;
-
- vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx);
- nvq->done_idx = 0;
-}
-
-static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
-{
- struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
- struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
- struct vhost_virtqueue *vq = &nvq->vq;
+ struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
+ struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
+ struct vhost_virtqueue *rvq = &rnvq->vq;
+ struct vhost_virtqueue *tvq = &tnvq->vq;
unsigned long uninitialized_var(endtime);
- int len = peek_head_len(rvq, sk);
+ int len = peek_head_len(rnvq, sk);
- if (!len && vq->busyloop_timeout) {
+ if (!len && tvq->busyloop_timeout) {
/* Flush batched heads first */
- vhost_rx_signal_used(rvq);
+ vhost_net_signal_used(rnvq);
/* Both tx vq and rx socket were polled here */
- mutex_lock_nested(&vq->mutex, 1);
- vhost_disable_notify(&net->dev, vq);
+ mutex_lock_nested(&tvq->mutex, 1);
+ vhost_disable_notify(&net->dev, tvq);
preempt_disable();
- endtime = busy_clock() + vq->busyloop_timeout;
+ endtime = busy_clock() + tvq->busyloop_timeout;
- while (vhost_can_busy_poll(&net->dev, endtime) &&
- !sk_has_rx_data(sk) &&
- vhost_vq_avail_empty(&net->dev, vq))
+ while (vhost_can_busy_poll(endtime)) {
+ if (vhost_has_work(&net->dev)) {
+ *busyloop_intr = true;
+ break;
+ }
+ if ((sk_has_rx_data(sk) &&
+ !vhost_vq_avail_empty(&net->dev, rvq)) ||
+ !vhost_vq_avail_empty(&net->dev, tvq))
+ break;
cpu_relax();
+ }
preempt_enable();
- if (!vhost_vq_avail_empty(&net->dev, vq))
- vhost_poll_queue(&vq->poll);
- else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
- vhost_disable_notify(&net->dev, vq);
- vhost_poll_queue(&vq->poll);
+ if (!vhost_vq_avail_empty(&net->dev, tvq)) {
+ vhost_poll_queue(&tvq->poll);
+ } else if (unlikely(vhost_enable_notify(&net->dev, tvq))) {
+ vhost_disable_notify(&net->dev, tvq);
+ vhost_poll_queue(&tvq->poll);
}
- mutex_unlock(&vq->mutex);
+ mutex_unlock(&tvq->mutex);
- len = peek_head_len(rvq, sk);
+ len = peek_head_len(rnvq, sk);
}
return len;
@@ -786,6 +917,7 @@ static void handle_rx(struct vhost_net *net)
s16 headcount;
size_t vhost_hlen, sock_hlen;
size_t vhost_len, sock_len;
+ bool busyloop_intr = false;
struct socket *sock;
struct iov_iter fixup;
__virtio16 num_buffers;
@@ -809,7 +941,8 @@ static void handle_rx(struct vhost_net *net)
vq->log : NULL;
mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
- while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
+ while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
+ &busyloop_intr))) {
sock_len += sock_hlen;
vhost_len = sock_len + vhost_hlen;
headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
@@ -820,7 +953,9 @@ static void handle_rx(struct vhost_net *net)
goto out;
/* OK, now we need to know about added descriptors. */
if (!headcount) {
- if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+ if (unlikely(busyloop_intr)) {
+ vhost_poll_queue(&vq->poll);
+ } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
/* They have slipped one in as we were
* doing that: check again. */
vhost_disable_notify(&net->dev, vq);
@@ -830,6 +965,7 @@ static void handle_rx(struct vhost_net *net)
* they refilled. */
goto out;
}
+ busyloop_intr = false;
if (nvq->rx_ring)
msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
/* On overrun, truncate and discard */
@@ -885,20 +1021,22 @@ static void handle_rx(struct vhost_net *net)
goto out;
}
nvq->done_idx += headcount;
- if (nvq->done_idx > VHOST_RX_BATCH)
- vhost_rx_signal_used(nvq);
+ if (nvq->done_idx > VHOST_NET_BATCH)
+ vhost_net_signal_used(nvq);
if (unlikely(vq_log))
vhost_log_write(vq, vq_log, log, vhost_len);
total_len += vhost_len;
- if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
- unlikely(++recv_pkts >= VHOST_NET_PKT_WEIGHT)) {
+ if (unlikely(vhost_exceeds_weight(++recv_pkts, total_len))) {
vhost_poll_queue(&vq->poll);
goto out;
}
}
- vhost_net_enable_vq(net, vq);
+ if (unlikely(busyloop_intr))
+ vhost_poll_queue(&vq->poll);
+ else
+ vhost_net_enable_vq(net, vq);
out:
- vhost_rx_signal_used(nvq);
+ vhost_net_signal_used(nvq);
mutex_unlock(&vq->mutex);
}
@@ -951,7 +1089,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
return -ENOMEM;
}
- queue = kmalloc_array(VHOST_RX_BATCH, sizeof(void *),
+ queue = kmalloc_array(VHOST_NET_BATCH, sizeof(void *),
GFP_KERNEL);
if (!queue) {
kfree(vqs);
@@ -1265,6 +1403,21 @@ done:
return err;
}
+static int vhost_net_set_backend_features(struct vhost_net *n, u64 features)
+{
+ int i;
+
+ mutex_lock(&n->dev.mutex);
+ for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
+ mutex_lock(&n->vqs[i].vq.mutex);
+ n->vqs[i].vq.acked_backend_features = features;
+ mutex_unlock(&n->vqs[i].vq.mutex);
+ }
+ mutex_unlock(&n->dev.mutex);
+
+ return 0;
+}
+
static int vhost_net_set_features(struct vhost_net *n, u64 features)
{
size_t vhost_hlen, sock_hlen, hdr_len;
@@ -1355,6 +1508,17 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
if (features & ~VHOST_NET_FEATURES)
return -EOPNOTSUPP;
return vhost_net_set_features(n, features);
+ case VHOST_GET_BACKEND_FEATURES:
+ features = VHOST_NET_BACKEND_FEATURES;
+ if (copy_to_user(featurep, &features, sizeof(features)))
+ return -EFAULT;
+ return 0;
+ case VHOST_SET_BACKEND_FEATURES:
+ if (copy_from_user(&features, featurep, sizeof(features)))
+ return -EFAULT;
+ if (features & ~VHOST_NET_BACKEND_FEATURES)
+ return -EOPNOTSUPP;
+ return vhost_net_set_backend_features(n, features);
case VHOST_RESET_OWNER:
return vhost_net_reset_owner(n);
case VHOST_SET_OWNER:
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index ed3114556fda..96c1d8400822 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -315,6 +315,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq->log_addr = -1ull;
vq->private_data = NULL;
vq->acked_features = 0;
+ vq->acked_backend_features = 0;
vq->log_base = NULL;
vq->error_ctx = NULL;
vq->kick = NULL;
@@ -1027,28 +1028,40 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
struct iov_iter *from)
{
- struct vhost_msg_node node;
- unsigned size = sizeof(struct vhost_msg);
- size_t ret;
- int err;
+ struct vhost_iotlb_msg msg;
+ size_t offset;
+ int type, ret;
- if (iov_iter_count(from) < size)
- return 0;
- ret = copy_from_iter(&node.msg, size, from);
- if (ret != size)
+ ret = copy_from_iter(&type, sizeof(type), from);
+ if (ret != sizeof(type))
goto done;
- switch (node.msg.type) {
+ switch (type) {
case VHOST_IOTLB_MSG:
- err = vhost_process_iotlb_msg(dev, &node.msg.iotlb);
- if (err)
- ret = err;
+ /* There maybe a hole after type for V1 message type,
+ * so skip it here.
+ */
+ offset = offsetof(struct vhost_msg, iotlb) - sizeof(int);
+ break;
+ case VHOST_IOTLB_MSG_V2:
+ offset = sizeof(__u32);
break;
default:
ret = -EINVAL;
- break;
+ goto done;
+ }
+
+ iov_iter_advance(from, offset);
+ ret = copy_from_iter(&msg, sizeof(msg), from);
+ if (ret != sizeof(msg))
+ goto done;
+ if (vhost_process_iotlb_msg(dev, &msg)) {
+ ret = -EFAULT;
+ goto done;
}
+ ret = (type == VHOST_IOTLB_MSG) ? sizeof(struct vhost_msg) :
+ sizeof(struct vhost_msg_v2);
done:
return ret;
}
@@ -1107,13 +1120,28 @@ ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
finish_wait(&dev->wait, &wait);
if (node) {
- ret = copy_to_iter(&node->msg, size, to);
+ struct vhost_iotlb_msg *msg;
+ void *start = &node->msg;
+
+ switch (node->msg.type) {
+ case VHOST_IOTLB_MSG:
+ size = sizeof(node->msg);
+ msg = &node->msg.iotlb;
+ break;
+ case VHOST_IOTLB_MSG_V2:
+ size = sizeof(node->msg_v2);
+ msg = &node->msg_v2.iotlb;
+ break;
+ default:
+ BUG();
+ break;
+ }
- if (ret != size || node->msg.type != VHOST_IOTLB_MISS) {
+ ret = copy_to_iter(start, size, to);
+ if (ret != size || msg->type != VHOST_IOTLB_MISS) {
kfree(node);
return ret;
}
-
vhost_enqueue_msg(dev, &dev->pending_list, node);
}
@@ -1126,12 +1154,19 @@ static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access)
struct vhost_dev *dev = vq->dev;
struct vhost_msg_node *node;
struct vhost_iotlb_msg *msg;
+ bool v2 = vhost_backend_has_feature(vq, VHOST_BACKEND_F_IOTLB_MSG_V2);
- node = vhost_new_msg(vq, VHOST_IOTLB_MISS);
+ node = vhost_new_msg(vq, v2 ? VHOST_IOTLB_MSG_V2 : VHOST_IOTLB_MSG);
if (!node)
return -ENOMEM;
- msg = &node->msg.iotlb;
+ if (v2) {
+ node->msg_v2.type = VHOST_IOTLB_MSG_V2;
+ msg = &node->msg_v2.iotlb;
+ } else {
+ msg = &node->msg.iotlb;
+ }
+
msg->type = VHOST_IOTLB_MISS;
msg->iova = iova;
msg->perm = access;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 6c844b90a168..466ef7542291 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -132,6 +132,7 @@ struct vhost_virtqueue {
struct vhost_umem *iotlb;
void *private_data;
u64 acked_features;
+ u64 acked_backend_features;
/* Log write descriptors */
void __user *log_base;
struct vhost_log *log;
@@ -147,7 +148,10 @@ struct vhost_virtqueue {
};
struct vhost_msg_node {
- struct vhost_msg msg;
+ union {
+ struct vhost_msg msg;
+ struct vhost_msg_v2 msg_v2;
+ };
struct vhost_virtqueue *vq;
struct list_head node;
};
@@ -238,6 +242,11 @@ static inline bool vhost_has_feature(struct vhost_virtqueue *vq, int bit)
return vq->acked_features & (1ULL << bit);
}
+static inline bool vhost_backend_has_feature(struct vhost_virtqueue *vq, int bit)
+{
+ return vq->acked_backend_features & (1ULL << bit);
+}
+
#ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
static inline bool vhost_is_little_endian(struct vhost_virtqueue *vq)
{