summaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-03-20 13:28:18 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2016-03-20 13:28:18 -0700
commitf0691533b756931089902464ca15afc218a49d70 (patch)
tree7d72b43866be5ae5507efb9c2976059d5d5cc0f1 /drivers
parent2b2f72d8ce59acce95ceb156f0f31b848e32e6d4 (diff)
parentc67f5db82027ba6d2ea4ac9176bc45996a03ae6a (diff)
downloadlinux-f0691533b756931089902464ca15afc218a49d70.tar.bz2
Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost
Pull virtio/vhost updates from Michael Tsirkin: "New features, performance improvements, cleanups: - basic polling support for vhost - rework virtio to optionally use DMA API, fixing it on Xen - balloon stats gained a new entry - using the new napi_alloc_skb speeds up virtio net - virtio blk stats can now be read while another VCPU is busy inflating or deflating the balloon plus misc cleanups in various places" * tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost: virtio_net: replace netdev_alloc_skb_ip_align() with napi_alloc_skb() vhost_net: basic polling support vhost: introduce vhost_vq_avail_empty() vhost: introduce vhost_has_work() virtio_balloon: Allow to resize and update the balloon stats in parallel virtio_balloon: Use a workqueue instead of "vballoon" kthread virtio/s390: size of SET_IND payload virtio/s390: use dev_to_virtio vhost: rename vhost_init_used() vhost: rename cross-endian helpers virtio_blk: VIRTIO_BLK_F_WCE->VIRTIO_BLK_F_FLUSH vring: Use the DMA API on Xen virtio_pci: Use the DMA API if enabled virtio_mmio: Use the DMA API if enabled virtio: Add improved queue allocation API virtio_ring: Support DMA APIs vring: Introduce vring_use_dma_api() s390/dma: Allow per device dma ops alpha/dma: use common noop dma ops dma: Provide simple noop dma ops
Diffstat (limited to 'drivers')
-rw-r--r--drivers/block/virtio_blk.c11
-rw-r--r--drivers/net/virtio_net.c2
-rw-r--r--drivers/s390/virtio/virtio_ccw.c15
-rw-r--r--drivers/vhost/net.c80
-rw-r--r--drivers/vhost/scsi.c2
-rw-r--r--drivers/vhost/test.c2
-rw-r--r--drivers/vhost/vhost.c69
-rw-r--r--drivers/vhost/vhost.h5
-rw-r--r--drivers/virtio/Kconfig2
-rw-r--r--drivers/virtio/virtio_balloon.c122
-rw-r--r--drivers/virtio/virtio_mmio.c67
-rw-r--r--drivers/virtio/virtio_pci_common.h6
-rw-r--r--drivers/virtio/virtio_pci_legacy.c42
-rw-r--r--drivers/virtio/virtio_pci_modern.c61
-rw-r--r--drivers/virtio/virtio_ring.c439
15 files changed, 646 insertions, 279 deletions
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 6ca35495a5be..28cff0d23d82 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -477,8 +477,13 @@ static int virtblk_get_cache_mode(struct virtio_device *vdev)
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE,
struct virtio_blk_config, wce,
&writeback);
+
+ /*
+ * If WCE is not configurable and flush is not available,
+ * assume no writeback cache is in use.
+ */
if (err)
- writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE);
+ writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH);
return writeback;
}
@@ -833,14 +838,14 @@ static const struct virtio_device_id id_table[] = {
static unsigned int features_legacy[] = {
VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
- VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
+ VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
VIRTIO_BLK_F_MQ,
}
;
static unsigned int features[] = {
VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
- VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
+ VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
VIRTIO_BLK_F_MQ,
};
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index fb0eae42bf39..49d84e540343 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -260,7 +260,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
p = page_address(page) + offset;
/* copy small packet so we can reuse these pages for small data */
- skb = netdev_alloc_skb_ip_align(vi->dev, GOOD_COPY_LEN);
+ skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
if (unlikely(!skb))
return NULL;
diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
index bf2d1300a957..8688ad4c825f 100644
--- a/drivers/s390/virtio/virtio_ccw.c
+++ b/drivers/s390/virtio/virtio_ccw.c
@@ -342,13 +342,14 @@ static void virtio_ccw_drop_indicator(struct virtio_ccw_device *vcdev,
ccw->count = sizeof(*thinint_area);
ccw->cda = (__u32)(unsigned long) thinint_area;
} else {
+ /* payload is the address of the indicators */
indicatorp = kmalloc(sizeof(&vcdev->indicators),
GFP_DMA | GFP_KERNEL);
if (!indicatorp)
return;
*indicatorp = 0;
ccw->cmd_code = CCW_CMD_SET_IND;
- ccw->count = sizeof(vcdev->indicators);
+ ccw->count = sizeof(&vcdev->indicators);
ccw->cda = (__u32)(unsigned long) indicatorp;
}
/* Deregister indicators from host. */
@@ -656,7 +657,10 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs,
}
}
ret = -ENOMEM;
- /* We need a data area under 2G to communicate. */
+ /*
+ * We need a data area under 2G to communicate. Our payload is
+ * the address of the indicators.
+ */
indicatorp = kmalloc(sizeof(&vcdev->indicators), GFP_DMA | GFP_KERNEL);
if (!indicatorp)
goto out;
@@ -672,7 +676,7 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs,
vcdev->indicators = 0;
ccw->cmd_code = CCW_CMD_SET_IND;
ccw->flags = 0;
- ccw->count = sizeof(vcdev->indicators);
+ ccw->count = sizeof(&vcdev->indicators);
ccw->cda = (__u32)(unsigned long) indicatorp;
ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_IND);
if (ret)
@@ -683,7 +687,7 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs,
vcdev->indicators2 = 0;
ccw->cmd_code = CCW_CMD_SET_CONF_IND;
ccw->flags = 0;
- ccw->count = sizeof(vcdev->indicators2);
+ ccw->count = sizeof(&vcdev->indicators2);
ccw->cda = (__u32)(unsigned long) indicatorp;
ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_CONF_IND);
if (ret)
@@ -945,8 +949,7 @@ static struct virtio_config_ops virtio_ccw_config_ops = {
static void virtio_ccw_release_dev(struct device *_d)
{
- struct virtio_device *dev = container_of(_d, struct virtio_device,
- dev);
+ struct virtio_device *dev = dev_to_virtio(_d);
struct virtio_ccw_device *vcdev = to_vc_device(dev);
kfree(vcdev->status);
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 9eda69e40678..f744eeb3e2b4 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -287,6 +287,43 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
rcu_read_unlock_bh();
}
+static inline unsigned long busy_clock(void)
+{
+ return local_clock() >> 10;
+}
+
+static bool vhost_can_busy_poll(struct vhost_dev *dev,
+ unsigned long endtime)
+{
+ return likely(!need_resched()) &&
+ likely(!time_after(busy_clock(), endtime)) &&
+ likely(!signal_pending(current)) &&
+ !vhost_has_work(dev);
+}
+
+static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
+ struct vhost_virtqueue *vq,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num)
+{
+ unsigned long uninitialized_var(endtime);
+ int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+ out_num, in_num, NULL, NULL);
+
+ if (r == vq->num && vq->busyloop_timeout) {
+ preempt_disable();
+ endtime = busy_clock() + vq->busyloop_timeout;
+ while (vhost_can_busy_poll(vq->dev, endtime) &&
+ vhost_vq_avail_empty(vq->dev, vq))
+ cpu_relax_lowlatency();
+ preempt_enable();
+ r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+ out_num, in_num, NULL, NULL);
+ }
+
+ return r;
+}
+
/* Expects to be always run from workqueue - which acts as
* read-size critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
@@ -331,10 +368,9 @@ static void handle_tx(struct vhost_net *net)
% UIO_MAXIOV == nvq->done_idx))
break;
- head = vhost_get_vq_desc(vq, vq->iov,
- ARRAY_SIZE(vq->iov),
- &out, &in,
- NULL, NULL);
+ head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
+ ARRAY_SIZE(vq->iov),
+ &out, &in);
/* On error, stop handling until the next kick. */
if (unlikely(head < 0))
break;
@@ -435,6 +471,38 @@ static int peek_head_len(struct sock *sk)
return len;
}
+static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
+{
+ struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
+ struct vhost_virtqueue *vq = &nvq->vq;
+ unsigned long uninitialized_var(endtime);
+ int len = peek_head_len(sk);
+
+ if (!len && vq->busyloop_timeout) {
+ /* Both tx vq and rx socket were polled here */
+ mutex_lock(&vq->mutex);
+ vhost_disable_notify(&net->dev, vq);
+
+ preempt_disable();
+ endtime = busy_clock() + vq->busyloop_timeout;
+
+ while (vhost_can_busy_poll(&net->dev, endtime) &&
+ skb_queue_empty(&sk->sk_receive_queue) &&
+ vhost_vq_avail_empty(&net->dev, vq))
+ cpu_relax_lowlatency();
+
+ preempt_enable();
+
+ if (vhost_enable_notify(&net->dev, vq))
+ vhost_poll_queue(&vq->poll);
+ mutex_unlock(&vq->mutex);
+
+ len = peek_head_len(sk);
+ }
+
+ return len;
+}
+
/* This is a multi-buffer version of vhost_get_desc, that works if
* vq has read descriptors only.
* @vq - the relevant virtqueue
@@ -553,7 +621,7 @@ static void handle_rx(struct vhost_net *net)
vq->log : NULL;
mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
- while ((sock_len = peek_head_len(sock->sk))) {
+ while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
sock_len += sock_hlen;
vhost_len = sock_len + vhost_hlen;
headcount = get_rx_bufs(vq, vq->heads, vhost_len,
@@ -917,7 +985,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
vhost_net_disable_vq(n, vq);
vq->private_data = sock;
- r = vhost_init_used(vq);
+ r = vhost_vq_init_access(vq);
if (r)
goto err_used;
r = vhost_net_enable_vq(n, vq);
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 29cfc57d496e..f898686cdd93 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -1274,7 +1274,7 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs,
vq = &vs->vqs[i].vq;
mutex_lock(&vq->mutex);
vq->private_data = vs_tpg;
- vhost_init_used(vq);
+ vhost_vq_init_access(vq);
mutex_unlock(&vq->mutex);
}
ret = 0;
diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
index f2882ac98726..388eec4e1a90 100644
--- a/drivers/vhost/test.c
+++ b/drivers/vhost/test.c
@@ -196,7 +196,7 @@ static long vhost_test_run(struct vhost_test *n, int test)
oldpriv = vq->private_data;
vq->private_data = priv;
- r = vhost_init_used(&n->vqs[index]);
+ r = vhost_vq_init_access(&n->vqs[index]);
mutex_unlock(&vq->mutex);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 236553e81027..669fef1e2bb6 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -43,11 +43,21 @@ enum {
#define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
#ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
-static void vhost_vq_reset_user_be(struct vhost_virtqueue *vq)
+static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
{
vq->user_be = !virtio_legacy_is_little_endian();
}
+static void vhost_enable_cross_endian_big(struct vhost_virtqueue *vq)
+{
+ vq->user_be = true;
+}
+
+static void vhost_enable_cross_endian_little(struct vhost_virtqueue *vq)
+{
+ vq->user_be = false;
+}
+
static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
{
struct vhost_vring_state s;
@@ -62,7 +72,10 @@ static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
s.num != VHOST_VRING_BIG_ENDIAN)
return -EINVAL;
- vq->user_be = s.num;
+ if (s.num == VHOST_VRING_BIG_ENDIAN)
+ vhost_enable_cross_endian_big(vq);
+ else
+ vhost_enable_cross_endian_little(vq);
return 0;
}
@@ -91,7 +104,7 @@ static void vhost_init_is_le(struct vhost_virtqueue *vq)
vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1) || !vq->user_be;
}
#else
-static void vhost_vq_reset_user_be(struct vhost_virtqueue *vq)
+static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
{
}
@@ -113,6 +126,11 @@ static void vhost_init_is_le(struct vhost_virtqueue *vq)
}
#endif /* CONFIG_VHOST_CROSS_ENDIAN_LEGACY */
+static void vhost_reset_is_le(struct vhost_virtqueue *vq)
+{
+ vq->is_le = virtio_legacy_is_little_endian();
+}
+
static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
poll_table *pt)
{
@@ -245,6 +263,13 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
}
EXPORT_SYMBOL_GPL(vhost_work_queue);
+/* A lockless hint for busy polling code to exit the loop */
+bool vhost_has_work(struct vhost_dev *dev)
+{
+ return !list_empty(&dev->work_list);
+}
+EXPORT_SYMBOL_GPL(vhost_has_work);
+
void vhost_poll_queue(struct vhost_poll *poll)
{
vhost_work_queue(poll->dev, &poll->work);
@@ -276,8 +301,9 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq->call = NULL;
vq->log_ctx = NULL;
vq->memory = NULL;
- vq->is_le = virtio_legacy_is_little_endian();
- vhost_vq_reset_user_be(vq);
+ vhost_reset_is_le(vq);
+ vhost_disable_cross_endian(vq);
+ vq->busyloop_timeout = 0;
}
static int vhost_worker(void *data)
@@ -912,6 +938,19 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp)
case VHOST_GET_VRING_ENDIAN:
r = vhost_get_vring_endian(vq, idx, argp);
break;
+ case VHOST_SET_VRING_BUSYLOOP_TIMEOUT:
+ if (copy_from_user(&s, argp, sizeof(s))) {
+ r = -EFAULT;
+ break;
+ }
+ vq->busyloop_timeout = s.num;
+ break;
+ case VHOST_GET_VRING_BUSYLOOP_TIMEOUT:
+ s.index = idx;
+ s.num = vq->busyloop_timeout;
+ if (copy_to_user(argp, &s, sizeof(s)))
+ r = -EFAULT;
+ break;
default:
r = -ENOIOCTLCMD;
}
@@ -1152,14 +1191,14 @@ static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
return 0;
}
-int vhost_init_used(struct vhost_virtqueue *vq)
+int vhost_vq_init_access(struct vhost_virtqueue *vq)
{
__virtio16 last_used_idx;
int r;
bool is_le = vq->is_le;
if (!vq->private_data) {
- vq->is_le = virtio_legacy_is_little_endian();
+ vhost_reset_is_le(vq);
return 0;
}
@@ -1182,7 +1221,7 @@ err:
vq->is_le = is_le;
return r;
}
-EXPORT_SYMBOL_GPL(vhost_init_used);
+EXPORT_SYMBOL_GPL(vhost_vq_init_access);
static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
struct iovec iov[], int iov_size)
@@ -1633,6 +1672,20 @@ void vhost_add_used_and_signal_n(struct vhost_dev *dev,
}
EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
+/* return true if we're sure that avaiable ring is empty */
+bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+ __virtio16 avail_idx;
+ int r;
+
+ r = __get_user(avail_idx, &vq->avail->idx);
+ if (r)
+ return false;
+
+ return vhost16_to_cpu(vq, avail_idx) == vq->avail_idx;
+}
+EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
+
/* OK, now we need to know about added descriptors. */
bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index d3f767448a72..d36d8beb3351 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -37,6 +37,7 @@ struct vhost_poll {
void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
+bool vhost_has_work(struct vhost_dev *dev);
void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
unsigned long mask, struct vhost_dev *dev);
@@ -114,6 +115,7 @@ struct vhost_virtqueue {
/* Ring endianness requested by userspace for cross-endian support. */
bool user_be;
#endif
+ u32 busyloop_timeout;
};
struct vhost_dev {
@@ -148,7 +150,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
struct vhost_log *log, unsigned int *log_num);
void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
-int vhost_init_used(struct vhost_virtqueue *);
+int vhost_vq_init_access(struct vhost_virtqueue *);
int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
unsigned count);
@@ -158,6 +160,7 @@ void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
struct vring_used_elem *heads, unsigned count);
void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *);
+bool vhost_vq_avail_empty(struct vhost_dev *, struct vhost_virtqueue *);
bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *);
int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index cab9f3f63a38..77590320d44c 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -60,7 +60,7 @@ config VIRTIO_INPUT
config VIRTIO_MMIO
tristate "Platform bus driver for memory mapped virtio devices"
- depends on HAS_IOMEM
+ depends on HAS_IOMEM && HAS_DMA
select VIRTIO
---help---
This drivers provides support for memory mapped virtio
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index f2b77dea8d3c..7b6d74f0c72f 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -22,8 +22,7 @@
#include <linux/virtio.h>
#include <linux/virtio_balloon.h>
#include <linux/swap.h>
-#include <linux/kthread.h>
-#include <linux/freezer.h>
+#include <linux/workqueue.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/module.h>
@@ -50,11 +49,13 @@ struct virtio_balloon {
struct virtio_device *vdev;
struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
- /* Where the ballooning thread waits for config to change. */
- wait_queue_head_t config_change;
+ /* The balloon servicing is delegated to a freezable workqueue. */
+ struct work_struct update_balloon_stats_work;
+ struct work_struct update_balloon_size_work;
- /* The thread servicing the balloon. */
- struct task_struct *thread;
+ /* Prevent updating balloon when it is being canceled. */
+ spinlock_t stop_update_lock;
+ bool stop_update;
/* Waiting for host to ack the pages we released. */
wait_queue_head_t acked;
@@ -77,7 +78,6 @@ struct virtio_balloon {
u32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX];
/* Memory statistics */
- int need_stats_update;
struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
/* To register callback in oom notifier call chain */
@@ -124,6 +124,7 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
/* When host has read buffer, this completes via balloon_ack */
wait_event(vb->acked, virtqueue_get_buf(vq, &len));
+
}
static void set_page_pfns(u32 pfns[], struct page *page)
@@ -136,9 +137,10 @@ static void set_page_pfns(u32 pfns[], struct page *page)
pfns[i] = page_to_balloon_pfn(page) + i;
}
-static void fill_balloon(struct virtio_balloon *vb, size_t num)
+static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
{
struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info;
+ unsigned num_allocated_pages;
/* We can only do one array worth at a time. */
num = min(num, ARRAY_SIZE(vb->pfns));
@@ -163,10 +165,13 @@ static void fill_balloon(struct virtio_balloon *vb, size_t num)
adjust_managed_page_count(page, -1);
}
+ num_allocated_pages = vb->num_pfns;
/* Did we get any? */
if (vb->num_pfns != 0)
tell_host(vb, vb->inflate_vq);
mutex_unlock(&vb->balloon_lock);
+
+ return num_allocated_pages;
}
static void release_pages_balloon(struct virtio_balloon *vb)
@@ -257,14 +262,17 @@ static void update_balloon_stats(struct virtio_balloon *vb)
* with a single buffer. From that point forward, all conversations consist of
* a hypervisor request (a call to this function) which directs us to refill
* the virtqueue with a fresh stats buffer. Since stats collection can sleep,
- * we notify our kthread which does the actual work via stats_handle_request().
+ * we delegate the job to a freezable workqueue that will do the actual work via
+ * stats_handle_request().
*/
static void stats_request(struct virtqueue *vq)
{
struct virtio_balloon *vb = vq->vdev->priv;
- vb->need_stats_update = 1;
- wake_up(&vb->config_change);
+ spin_lock(&vb->stop_update_lock);
+ if (!vb->stop_update)
+ queue_work(system_freezable_wq, &vb->update_balloon_stats_work);
+ spin_unlock(&vb->stop_update_lock);
}
static void stats_handle_request(struct virtio_balloon *vb)
@@ -273,7 +281,6 @@ static void stats_handle_request(struct virtio_balloon *vb)
struct scatterlist sg;
unsigned int len;
- vb->need_stats_update = 0;
update_balloon_stats(vb);
vq = vb->stats_vq;
@@ -287,8 +294,12 @@ static void stats_handle_request(struct virtio_balloon *vb)
static void virtballoon_changed(struct virtio_device *vdev)
{
struct virtio_balloon *vb = vdev->priv;
+ unsigned long flags;
- wake_up(&vb->config_change);
+ spin_lock_irqsave(&vb->stop_update_lock, flags);
+ if (!vb->stop_update)
+ queue_work(system_freezable_wq, &vb->update_balloon_size_work);
+ spin_unlock_irqrestore(&vb->stop_update_lock, flags);
}
static inline s64 towards_target(struct virtio_balloon *vb)
@@ -351,43 +362,32 @@ static int virtballoon_oom_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static int balloon(void *_vballoon)
+static void update_balloon_stats_func(struct work_struct *work)
{
- struct virtio_balloon *vb = _vballoon;
- DEFINE_WAIT_FUNC(wait, woken_wake_function);
-
- set_freezable();
- while (!kthread_should_stop()) {
- s64 diff;
-
- try_to_freeze();
-
- add_wait_queue(&vb->config_change, &wait);
- for (;;) {
- if ((diff = towards_target(vb)) != 0 ||
- vb->need_stats_update ||
- kthread_should_stop() ||
- freezing(current))
- break;
- wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
- }
- remove_wait_queue(&vb->config_change, &wait);
+ struct virtio_balloon *vb;
- if (vb->need_stats_update)
- stats_handle_request(vb);
- if (diff > 0)
- fill_balloon(vb, diff);
- else if (diff < 0)
- leak_balloon(vb, -diff);
- update_balloon_size(vb);
+ vb = container_of(work, struct virtio_balloon,
+ update_balloon_stats_work);
+ stats_handle_request(vb);
+}
- /*
- * For large balloon changes, we could spend a lot of time
- * and always have work to do. Be nice if preempt disabled.
- */
- cond_resched();
- }
- return 0;
+static void update_balloon_size_func(struct work_struct *work)
+{
+ struct virtio_balloon *vb;
+ s64 diff;
+
+ vb = container_of(work, struct virtio_balloon,
+ update_balloon_size_work);
+ diff = towards_target(vb);
+
+ if (diff > 0)
+ diff -= fill_balloon(vb, diff);
+ else if (diff < 0)
+ diff += leak_balloon(vb, -diff);
+ update_balloon_size(vb);
+
+ if (diff)
+ queue_work(system_freezable_wq, work);
}
static int init_vqs(struct virtio_balloon *vb)
@@ -505,12 +505,14 @@ static int virtballoon_probe(struct virtio_device *vdev)
goto out;
}
+ INIT_WORK(&vb->update_balloon_stats_work, update_balloon_stats_func);
+ INIT_WORK(&vb->update_balloon_size_work, update_balloon_size_func);
+ spin_lock_init(&vb->stop_update_lock);
+ vb->stop_update = false;
vb->num_pages = 0;
mutex_init(&vb->balloon_lock);
- init_waitqueue_head(&vb->config_change);
init_waitqueue_head(&vb->acked);
vb->vdev = vdev;
- vb->need_stats_update = 0;
balloon_devinfo_init(&vb->vb_dev_info);
#ifdef CONFIG_BALLOON_COMPACTION
@@ -529,16 +531,8 @@ static int virtballoon_probe(struct virtio_device *vdev)
virtio_device_ready(vdev);
- vb->thread = kthread_run(balloon, vb, "vballoon");
- if (IS_ERR(vb->thread)) {
- err = PTR_ERR(vb->thread);
- goto out_del_vqs;
- }
-
return 0;
-out_del_vqs:
- unregister_oom_notifier(&vb->nb);
out_oom_notify:
vdev->config->del_vqs(vdev);
out_free_vb:
@@ -565,7 +559,13 @@ static void virtballoon_remove(struct virtio_device *vdev)
struct virtio_balloon *vb = vdev->priv;
unregister_oom_notifier(&vb->nb);
- kthread_stop(vb->thread);
+
+ spin_lock_irq(&vb->stop_update_lock);
+ vb->stop_update = true;
+ spin_unlock_irq(&vb->stop_update_lock);
+ cancel_work_sync(&vb->update_balloon_size_work);
+ cancel_work_sync(&vb->update_balloon_stats_work);
+
remove_common(vb);
kfree(vb);
}
@@ -576,10 +576,9 @@ static int virtballoon_freeze(struct virtio_device *vdev)
struct virtio_balloon *vb = vdev->priv;
/*
- * The kthread is already frozen by the PM core before this
+ * The workqueue is already frozen by the PM core before this
* function is called.
*/
-
remove_common(vb);
return 0;
}
@@ -595,7 +594,8 @@ static int virtballoon_restore(struct virtio_device *vdev)
virtio_device_ready(vdev);
- fill_balloon(vb, towards_target(vb));
+ if (towards_target(vb))
+ virtballoon_changed(vdev);
update_balloon_size(vb);
return 0;
}
diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index 745c6ee1bb3e..48bfea91dbca 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -99,12 +99,6 @@ struct virtio_mmio_vq_info {
/* the actual virtqueue */
struct virtqueue *vq;
- /* the number of entries in the queue */
- unsigned int num;
-
- /* the virtual address of the ring queue */
- void *queue;
-
/* the list node for the virtqueues list */
struct list_head node;
};
@@ -322,15 +316,13 @@ static void vm_del_vq(struct virtqueue *vq)
{
struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev);
struct virtio_mmio_vq_info *info = vq->priv;
- unsigned long flags, size;
+ unsigned long flags;
unsigned int index = vq->index;
spin_lock_irqsave(&vm_dev->lock, flags);
list_del(&info->node);
spin_unlock_irqrestore(&vm_dev->lock, flags);
- vring_del_virtqueue(vq);
-
/* Select and deactivate the queue */
writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL);
if (vm_dev->version == 1) {
@@ -340,8 +332,8 @@ static void vm_del_vq(struct virtqueue *vq)
WARN_ON(readl(vm_dev->base + VIRTIO_MMIO_QUEUE_READY));
}
- size = PAGE_ALIGN(vring_size(info->num, VIRTIO_MMIO_VRING_ALIGN));
- free_pages_exact(info->queue, size);
+ vring_del_virtqueue(vq);
+
kfree(info);
}
@@ -356,8 +348,6 @@ static void vm_del_vqs(struct virtio_device *vdev)
free_irq(platform_get_irq(vm_dev->pdev, 0), vm_dev);
}
-
-
static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index,
void (*callback)(struct virtqueue *vq),
const char *name)
@@ -365,7 +355,8 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index,
struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
struct virtio_mmio_vq_info *info;
struct virtqueue *vq;
- unsigned long flags, size;
+ unsigned long flags;
+ unsigned int num;
int err;
if (!name)
@@ -388,66 +379,40 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index,
goto error_kmalloc;
}
- /* Allocate pages for the queue - start with a queue as big as
- * possible (limited by maximum size allowed by device), drop down
- * to a minimal size, just big enough to fit descriptor table
- * and two rings (which makes it "alignment_size * 2")
- */
- info->num = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NUM_MAX);
-
- /* If the device reports a 0 entry queue, we won't be able to
- * use it to perform I/O, and vring_new_virtqueue() can't create
- * empty queues anyway, so don't bother to set up the device.
- */
- if (info->num == 0) {
+ num = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NUM_MAX);
+ if (num == 0) {
err = -ENOENT;
- goto error_alloc_pages;
- }
-
- while (1) {
- size = PAGE_ALIGN(vring_size(info->num,
- VIRTIO_MMIO_VRING_ALIGN));
- /* Did the last iter shrink the queue below minimum size? */
- if (size < VIRTIO_MMIO_VRING_ALIGN * 2) {
- err = -ENOMEM;
- goto error_alloc_pages;
- }
-
- info->queue = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
- if (info->queue)
- break;
-
- info->num /= 2;
+ goto error_new_virtqueue;
}
/* Create the vring */
- vq = vring_new_virtqueue(index, info->num, VIRTIO_MMIO_VRING_ALIGN, vdev,
- true, info->queue, vm_notify, callback, name);
+ vq = vring_create_virtqueue(index, num, VIRTIO_MMIO_VRING_ALIGN, vdev,
+ true, true, vm_notify, callback, name);
if (!vq) {
err = -ENOMEM;
goto error_new_virtqueue;
}
/* Activate the queue */
- writel(info->num, vm_dev->base + VIRTIO_MMIO_QUEUE_NUM);
+ writel(virtqueue_get_vring_size(vq), vm_dev->base + VIRTIO_MMIO_QUEUE_NUM);
if (vm_dev->version == 1) {
writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_QUEUE_ALIGN);
- writel(virt_to_phys(info->queue) >> PAGE_SHIFT,
+ writel(virtqueue_get_desc_addr(vq) >> PAGE_SHIFT,
vm_dev->base + VIRTIO_MMIO_QUEUE_PFN);
} else {
u64 addr;
- addr = virt_to_phys(info->queue);
+ addr = virtqueue_get_desc_addr(vq);
writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_DESC_LOW);
writel((u32)(addr >> 32),
vm_dev->base + VIRTIO_MMIO_QUEUE_DESC_HIGH);
- addr = virt_to_phys(virtqueue_get_avail(vq));
+ addr = virtqueue_get_avail_addr(vq);
writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_AVAIL_LOW);
writel((u32)(addr >> 32),
vm_dev->base + VIRTIO_MMIO_QUEUE_AVAIL_HIGH);
- addr = virt_to_phys(virtqueue_get_used(vq));
+ addr = virtqueue_get_used_addr(vq);
writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_USED_LOW);
writel((u32)(addr >> 32),
vm_dev->base + VIRTIO_MMIO_QUEUE_USED_HIGH);
@@ -471,8 +436,6 @@ error_new_virtqueue:
writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_READY);
WARN_ON(readl(vm_dev->base + VIRTIO_MMIO_QUEUE_READY));
}
- free_pages_exact(info->queue, size);
-error_alloc_pages:
kfree(info);
error_kmalloc:
error_available:
diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index 2cc252270b2d..28263200ed42 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -35,12 +35,6 @@ struct virtio_pci_vq_info {
/* the actual virtqueue */
struct virtqueue *vq;
- /* the number of entries in the queue */
- int num;
-
- /* the virtual address of the ring queue */
- void *queue;
-
/* the list node for the virtqueues list */
struct list_head node;
diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c
index 48bc9797e530..8c4e61783441 100644
--- a/drivers/virtio/virtio_pci_legacy.c
+++ b/drivers/virtio/virtio_pci_legacy.c
@@ -119,7 +119,6 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
u16 msix_vec)
{
struct virtqueue *vq;
- unsigned long size;
u16 num;
int err;
@@ -131,27 +130,19 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
if (!num || ioread32(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN))
return ERR_PTR(-ENOENT);
- info->num = num;
info->msix_vector = msix_vec;
- size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN));
- info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO);
- if (info->queue == NULL)
+ /* create the vring */
+ vq = vring_create_virtqueue(index, num,
+ VIRTIO_PCI_VRING_ALIGN, &vp_dev->vdev,
+ true, false, vp_notify, callback, name);
+ if (!vq)
return ERR_PTR(-ENOMEM);
/* activate the queue */
- iowrite32(virt_to_phys(info->queue) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT,
+ iowrite32(virtqueue_get_desc_addr(vq) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT,
vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
- /* create the vring */
- vq = vring_new_virtqueue(index, info->num,
- VIRTIO_PCI_VRING_ALIGN, &vp_dev->vdev,
- true, info->queue, vp_notify, callback, name);
- if (!vq) {
- err = -ENOMEM;
- goto out_activate_queue;
- }
-
vq->priv = (void __force *)vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY;
if (msix_vec != VIRTIO_MSI_NO_VECTOR) {
@@ -159,17 +150,15 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
msix_vec = ioread16(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
if (msix_vec == VIRTIO_MSI_NO_VECTOR) {
err = -EBUSY;
- goto out_assign;
+ goto out_deactivate;
}
}
return vq;
-out_assign:
- vring_del_virtqueue(vq);
-out_activate_queue:
+out_deactivate:
iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
- free_pages_exact(info->queue, size);
+ vring_del_virtqueue(vq);
return ERR_PTR(err);
}
@@ -177,7 +166,6 @@ static void del_vq(struct virtio_pci_vq_info *info)
{
struct virtqueue *vq = info->vq;
struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
- unsigned long size;
iowrite16(vq->index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
@@ -188,13 +176,10 @@ static void del_vq(struct virtio_pci_vq_info *info)
ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR);
}
- vring_del_virtqueue(vq);
-
/* Select and deactivate the queue */
iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
- size = PAGE_ALIGN(vring_size(info->num, VIRTIO_PCI_VRING_ALIGN));
- free_pages_exact(info->queue, size);
+ vring_del_virtqueue(vq);
}
static const struct virtio_config_ops virtio_pci_config_ops = {
@@ -227,6 +212,13 @@ int virtio_pci_legacy_probe(struct virtio_pci_device *vp_dev)
return -ENODEV;
}
+ rc = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64));
+ if (rc)
+ rc = dma_set_mask_and_coherent(&pci_dev->dev,
+ DMA_BIT_MASK(32));
+ if (rc)
+ dev_warn(&pci_dev->dev, "Failed to enable 64-bit or 32-bit DMA. Trying to continue, but this might not work.\n");
+
rc = pci_request_region(pci_dev, 0, "virtio-pci-legacy");
if (rc)
return rc;
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index 7760fc1a2218..f6f28cc7eb45 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -287,31 +287,6 @@ static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector)
return vp_ioread16(&vp_dev->common->msix_config);
}
-static size_t vring_pci_size(u16 num)
-{
- /* We only need a cacheline separation. */
- return PAGE_ALIGN(vring_size(num, SMP_CACHE_BYTES));
-}
-
-static void *alloc_virtqueue_pages(int *num)
-{
- void *pages;
-
- /* TODO: allocate each queue chunk individually */
- for (; *num && vring_pci_size(*num) > PAGE_SIZE; *num /= 2) {
- pages = alloc_pages_exact(vring_pci_size(*num),
- GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
- if (pages)
- return pages;
- }
-
- if (!*num)
- return NULL;
-
- /* Try to get a single page. You are my only hope! */
- return alloc_pages_exact(vring_pci_size(*num), GFP_KERNEL|__GFP_ZERO);
-}
-
static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
struct virtio_pci_vq_info *info,
unsigned index,
@@ -343,29 +318,22 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
/* get offset of notification word for this vq */
off = vp_ioread16(&cfg->queue_notify_off);
- info->num = num;
info->msix_vector = msix_vec;
- info->queue = alloc_virtqueue_pages(&info->num);
- if (info->queue == NULL)
- return ERR_PTR(-ENOMEM);
-
/* create the vring */
- vq = vring_new_virtqueue(index, info->num,
- SMP_CACHE_BYTES, &vp_dev->vdev,
- true, info->queue, vp_notify, callback, name);
- if (!vq) {
- err = -ENOMEM;
- goto err_new_queue;
- }
+ vq = vring_create_virtqueue(index, num,
+ SMP_CACHE_BYTES, &vp_dev->vdev,
+ true, true, vp_notify, callback, name);
+ if (!vq)
+ return ERR_PTR(-ENOMEM);
/* activate the queue */
- vp_iowrite16(num, &cfg->queue_size);
- vp_iowrite64_twopart(virt_to_phys(info->queue),
+ vp_iowrite16(virtqueue_get_vring_size(vq), &cfg->queue_size);
+ vp_iowrite64_twopart(virtqueue_get_desc_addr(vq),
&cfg->queue_desc_lo, &cfg->queue_desc_hi);
- vp_iowrite64_twopart(virt_to_phys(virtqueue_get_avail(vq)),
+ vp_iowrite64_twopart(virtqueue_get_avail_addr(vq),
&cfg->queue_avail_lo, &cfg->queue_avail_hi);
- vp_iowrite64_twopart(virt_to_phys(virtqueue_get_used(vq)),
+ vp_iowrite64_twopart(virtqueue_get_used_addr(vq),
&cfg->queue_used_lo, &cfg->queue_used_hi);
if (vp_dev->notify_base) {
@@ -410,8 +378,6 @@ err_assign_vector:
pci_iounmap(vp_dev->pci_dev, (void __iomem __force *)vq->priv);
err_map_notify:
vring_del_virtqueue(vq);
-err_new_queue:
- free_pages_exact(info->queue, vring_pci_size(info->num));
return ERR_PTR(err);
}
@@ -456,8 +422,6 @@ static void del_vq(struct virtio_pci_vq_info *info)
pci_iounmap(vp_dev->pci_dev, (void __force __iomem *)vq->priv);
vring_del_virtqueue(vq);
-
- free_pages_exact(info->queue, vring_pci_size(info->num));
}
static const struct virtio_config_ops virtio_pci_config_nodev_ops = {
@@ -641,6 +605,13 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev)
return -EINVAL;
}
+ err = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64));
+ if (err)
+ err = dma_set_mask_and_coherent(&pci_dev->dev,
+ DMA_BIT_MASK(32));
+ if (err)
+ dev_warn(&pci_dev->dev, "Failed to enable 64-bit or 32-bit DMA. Trying to continue, but this might not work.\n");
+
/* Device capability is only mandatory for devices that have
* device-specific configuration.
*/
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index e12e385f7ac3..5c802d47892c 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -24,6 +24,8 @@
#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/kmemleak.h>
+#include <linux/dma-mapping.h>
+#include <xen/xen.h>
#ifdef DEBUG
/* For development, we want to crash whenever the ring is screwed. */
@@ -54,6 +56,11 @@
#define END_USE(vq)
#endif
+struct vring_desc_state {
+ void *data; /* Data for callback. */
+ struct vring_desc *indir_desc; /* Indirect descriptor, if any. */
+};
+
struct vring_virtqueue {
struct virtqueue vq;
@@ -89,6 +96,11 @@ struct vring_virtqueue {
/* How to notify other side. FIXME: commonalize hcalls! */
bool (*notify)(struct virtqueue *vq);
+ /* DMA, allocation, and size information */
+ bool we_own_ring;
+ size_t queue_size_in_bytes;
+ dma_addr_t queue_dma_addr;
+
#ifdef DEBUG
/* They're supposed to lock for us. */
unsigned int in_use;
@@ -98,12 +110,120 @@ struct vring_virtqueue {
ktime_t last_add_time;
#endif
- /* Tokens for callbacks. */
- void *data[];
+ /* Per-descriptor state. */
+ struct vring_desc_state desc_state[];
};
#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
+/*
+ * The interaction between virtio and a possible IOMMU is a mess.
+ *
+ * On most systems with virtio, physical addresses match bus addresses,
+ * and it doesn't particularly matter whether we use the DMA API.
+ *
+ * On some systems, including Xen and any system with a physical device
+ * that speaks virtio behind a physical IOMMU, we must use the DMA API
+ * for virtio DMA to work at all.
+ *
+ * On other systems, including SPARC and PPC64, virtio-pci devices are
+ * enumerated as though they are behind an IOMMU, but the virtio host
+ * ignores the IOMMU, so we must either pretend that the IOMMU isn't
+ * there or somehow map everything as the identity.
+ *
+ * For the time being, we preserve historic behavior and bypass the DMA
+ * API.
+ */
+
+static bool vring_use_dma_api(struct virtio_device *vdev)
+{
+ /*
+ * In theory, it's possible to have a buggy QEMU-supposed
+ * emulated Q35 IOMMU and Xen enabled at the same time. On
+ * such a configuration, virtio has never worked and will
+ * not work without an even larger kludge. Instead, enable
+ * the DMA API if we're a Xen guest, which at least allows
+ * all of the sensible Xen configurations to work correctly.
+ */
+ if (xen_domain())
+ return true;
+
+ return false;
+}
+
+/*
+ * The DMA ops on various arches are rather gnarly right now, and
+ * making all of the arch DMA ops work on the vring device itself
+ * is a mess. For now, we use the parent device for DMA ops.
+ */
+struct device *vring_dma_dev(const struct vring_virtqueue *vq)
+{
+ return vq->vq.vdev->dev.parent;
+}
+
+/* Map one sg entry. */
+static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq,
+ struct scatterlist *sg,
+ enum dma_data_direction direction)
+{
+ if (!vring_use_dma_api(vq->vq.vdev))
+ return (dma_addr_t)sg_phys(sg);
+
+ /*
+ * We can't use dma_map_sg, because we don't use scatterlists in
+ * the way it expects (we don't guarantee that the scatterlist
+ * will exist for the lifetime of the mapping).
+ */
+ return dma_map_page(vring_dma_dev(vq),
+ sg_page(sg), sg->offset, sg->length,
+ direction);
+}
+
+static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
+ void *cpu_addr, size_t size,
+ enum dma_data_direction direction)
+{
+ if (!vring_use_dma_api(vq->vq.vdev))
+ return (dma_addr_t)virt_to_phys(cpu_addr);
+
+ return dma_map_single(vring_dma_dev(vq),
+ cpu_addr, size, direction);
+}
+
+static void vring_unmap_one(const struct vring_virtqueue *vq,
+ struct vring_desc *desc)
+{
+ u16 flags;
+
+ if (!vring_use_dma_api(vq->vq.vdev))
+ return;
+
+ flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
+
+ if (flags & VRING_DESC_F_INDIRECT) {
+ dma_unmap_single(vring_dma_dev(vq),
+ virtio64_to_cpu(vq->vq.vdev, desc->addr),
+ virtio32_to_cpu(vq->vq.vdev, desc->len),
+ (flags & VRING_DESC_F_WRITE) ?
+ DMA_FROM_DEVICE : DMA_TO_DEVICE);
+ } else {
+ dma_unmap_page(vring_dma_dev(vq),
+ virtio64_to_cpu(vq->vq.vdev, desc->addr),
+ virtio32_to_cpu(vq->vq.vdev, desc->len),
+ (flags & VRING_DESC_F_WRITE) ?
+ DMA_FROM_DEVICE : DMA_TO_DEVICE);
+ }
+}
+
+static int vring_mapping_error(const struct vring_virtqueue *vq,
+ dma_addr_t addr)
+{
+ if (!vring_use_dma_api(vq->vq.vdev))
+ return 0;
+
+ return dma_mapping_error(vring_dma_dev(vq), addr);
+}
+
static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
unsigned int total_sg, gfp_t gfp)
{
@@ -137,7 +257,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
struct vring_virtqueue *vq = to_vvq(_vq);
struct scatterlist *sg;
struct vring_desc *desc;
- unsigned int i, n, avail, descs_used, uninitialized_var(prev);
+ unsigned int i, n, avail, descs_used, uninitialized_var(prev), err_idx;
int head;
bool indirect;
@@ -177,21 +297,15 @@ static inline int virtqueue_add(struct virtqueue *_vq,
if (desc) {
/* Use a single buffer which doesn't continue */
- vq->vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT);
- vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, virt_to_phys(desc));
- /* avoid kmemleak false positive (hidden by virt_to_phys) */
- kmemleak_ignore(desc);
- vq->vring.desc[head].len = cpu_to_virtio32(_vq->vdev, total_sg * sizeof(struct vring_desc));
-
+ indirect = true;
/* Set up rest to use this indirect table. */
i = 0;
descs_used = 1;
- indirect = true;
} else {
+ indirect = false;
desc = vq->vring.desc;
i = head;
descs_used = total_sg;
- indirect = false;
}
if (vq->vq.num_free < descs_used) {
@@ -206,13 +320,14 @@ static inline int virtqueue_add(struct virtqueue *_vq,
return -ENOSPC;
}
- /* We're about to use some buffers from the free list. */
- vq->vq.num_free -= descs_used;
-
for (n = 0; n < out_sgs; n++) {
for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+ dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE);
+ if (vring_mapping_error(vq, addr))
+ goto unmap_release;
+
desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT);
- desc[i].addr = cpu_to_virtio64(_vq->vdev, sg_phys(sg));
+ desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
prev = i;
i = virtio16_to_cpu(_vq->vdev, desc[i].next);
@@ -220,8 +335,12 @@ static inline int virtqueue_add(struct virtqueue *_vq,
}
for (; n < (out_sgs + in_sgs); n++) {
for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+ dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
+ if (vring_mapping_error(vq, addr))
+ goto unmap_release;
+
desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE);
- desc[i].addr = cpu_to_virtio64(_vq->vdev, sg_phys(sg));
+ desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
prev = i;
i = virtio16_to_cpu(_vq->vdev, desc[i].next);
@@ -230,14 +349,33 @@ static inline int virtqueue_add(struct virtqueue *_vq,
/* Last one doesn't continue. */
desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
+ if (indirect) {
+ /* Now that the indirect table is filled in, map it. */
+ dma_addr_t addr = vring_map_single(
+ vq, desc, total_sg * sizeof(struct vring_desc),
+ DMA_TO_DEVICE);
+ if (vring_mapping_error(vq, addr))
+ goto unmap_release;
+
+ vq->vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT);
+ vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr);
+
+ vq->vring.desc[head].len = cpu_to_virtio32(_vq->vdev, total_sg * sizeof(struct vring_desc));
+ }
+
+ /* We're using some buffers from the free list. */
+ vq->vq.num_free -= descs_used;
+
/* Update free pointer */
if (indirect)
vq->free_head = virtio16_to_cpu(_vq->vdev, vq->vring.desc[head].next);
else
vq->free_head = i;
- /* Set token. */
- vq->data[head] = data;
+ /* Store token and indirect buffer state. */
+ vq->desc_state[head].data = data;
+ if (indirect)
+ vq->desc_state[head].indir_desc = desc;
/* Put entry in available array (but don't update avail->idx until they
* do sync). */
@@ -260,6 +398,24 @@ static inline int virtqueue_add(struct virtqueue *_vq,
virtqueue_kick(_vq);
return 0;
+
+unmap_release:
+ err_idx = i;
+ i = head;
+
+ for (n = 0; n < total_sg; n++) {
+ if (i == err_idx)
+ break;
+ vring_unmap_one(vq, &desc[i]);
+ i = vq->vring.desc[i].next;
+ }
+
+ vq->vq.num_free += total_sg;
+
+ if (indirect)
+ kfree(desc);
+
+ return -EIO;
}
/**
@@ -430,27 +586,43 @@ EXPORT_SYMBOL_GPL(virtqueue_kick);
static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
{
- unsigned int i;
+ unsigned int i, j;
+ u16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
/* Clear data ptr. */
- vq->data[head] = NULL;
+ vq->desc_state[head].data = NULL;
- /* Put back on free list: find end */
+ /* Put back on free list: unmap first-level descriptors and find end */
i = head;
- /* Free the indirect table */
- if (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT))
- kfree(phys_to_virt(virtio64_to_cpu(vq->vq.vdev, vq->vring.desc[i].addr)));
-
- while (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT)) {
+ while (vq->vring.desc[i].flags & nextflag) {
+ vring_unmap_one(vq, &vq->vring.desc[i]);
i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
vq->vq.num_free++;
}
+ vring_unmap_one(vq, &vq->vring.desc[i]);
vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
vq->free_head = head;
+
/* Plus final descriptor */
vq->vq.num_free++;
+
+ /* Free the indirect table, if any, now that it's unmapped. */
+ if (vq->desc_state[head].indir_desc) {
+ struct vring_desc *indir_desc = vq->desc_state[head].indir_desc;
+ u32 len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len);
+
+ BUG_ON(!(vq->vring.desc[head].flags &
+ cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
+ BUG_ON(len == 0 || len % sizeof(struct vring_desc));
+
+ for (j = 0; j < len / sizeof(struct vring_desc); j++)
+ vring_unmap_one(vq, &indir_desc[j]);
+
+ kfree(vq->desc_state[head].indir_desc);
+ vq->desc_state[head].indir_desc = NULL;
+ }
}
static inline bool more_used(const struct vring_virtqueue *vq)
@@ -505,13 +677,13 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
BAD_RING(vq, "id %u out of range\n", i);
return NULL;
}
- if (unlikely(!vq->data[i])) {
+ if (unlikely(!vq->desc_state[i].data)) {
BAD_RING(vq, "id %u is not a head!\n", i);
return NULL;
}
/* detach_buf clears data, so grab it now. */
- ret = vq->data[i];
+ ret = vq->desc_state[i].data;
detach_buf(vq, i);
vq->last_used_idx++;
/* If we expect an interrupt for the next entry, tell host
@@ -685,10 +857,10 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
START_USE(vq);
for (i = 0; i < vq->vring.num; i++) {
- if (!vq->data[i])
+ if (!vq->desc_state[i].data)
continue;
/* detach_buf clears data, so grab it now. */
- buf = vq->data[i];
+ buf = vq->desc_state[i].data;
detach_buf(vq, i);
vq->avail_idx_shadow--;
vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
@@ -723,35 +895,31 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
}
EXPORT_SYMBOL_GPL(vring_interrupt);
-struct virtqueue *vring_new_virtqueue(unsigned int index,
- unsigned int num,
- unsigned int vring_align,
- struct virtio_device *vdev,
- bool weak_barriers,
- void *pages,
- bool (*notify)(struct virtqueue *),
- void (*callback)(struct virtqueue *),
- const char *name)
+struct virtqueue *__vring_new_virtqueue(unsigned int index,
+ struct vring vring,
+ struct virtio_device *vdev,
+ bool weak_barriers,
+ bool (*notify)(struct virtqueue *),
+ void (*callback)(struct virtqueue *),
+ const char *name)
{
- struct vring_virtqueue *vq;
unsigned int i;
+ struct vring_virtqueue *vq;
- /* We assume num is a power of 2. */
- if (num & (num - 1)) {
- dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num);
- return NULL;
- }
-
- vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL);
+ vq = kmalloc(sizeof(*vq) + vring.num * sizeof(struct vring_desc_state),
+ GFP_KERNEL);
if (!vq)
return NULL;
- vring_init(&vq->vring, num, pages, vring_align);
+ vq->vring = vring;
vq->vq.callback = callback;
vq->vq.vdev = vdev;
vq->vq.name = name;
- vq->vq.num_free = num;
+ vq->vq.num_free = vring.num;
vq->vq.index = index;
+ vq->we_own_ring = false;
+ vq->queue_dma_addr = 0;
+ vq->queue_size_in_bytes = 0;
vq->notify = notify;
vq->weak_barriers = weak_barriers;
vq->broken = false;
@@ -776,20 +944,145 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
/* Put everything in free lists. */
vq->free_head = 0;
- for (i = 0; i < num-1; i++) {
+ for (i = 0; i < vring.num-1; i++)
vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
- vq->data[i] = NULL;
- }
- vq->data[i] = NULL;
+ memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state));
return &vq->vq;
}
+EXPORT_SYMBOL_GPL(__vring_new_virtqueue);
+
+static void *vring_alloc_queue(struct virtio_device *vdev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flag)
+{
+ if (vring_use_dma_api(vdev)) {
+ return dma_alloc_coherent(vdev->dev.parent, size,
+ dma_handle, flag);
+ } else {
+ void *queue = alloc_pages_exact(PAGE_ALIGN(size), flag);
+ if (queue) {
+ phys_addr_t phys_addr = virt_to_phys(queue);
+ *dma_handle = (dma_addr_t)phys_addr;
+
+ /*
+ * Sanity check: make sure we dind't truncate
+ * the address. The only arches I can find that
+ * have 64-bit phys_addr_t but 32-bit dma_addr_t
+ * are certain non-highmem MIPS and x86
+ * configurations, but these configurations
+ * should never allocate physical pages above 32
+ * bits, so this is fine. Just in case, throw a
+ * warning and abort if we end up with an
+ * unrepresentable address.
+ */
+ if (WARN_ON_ONCE(*dma_handle != phys_addr)) {
+ free_pages_exact(queue, PAGE_ALIGN(size));
+ return NULL;
+ }
+ }
+ return queue;
+ }
+}
+
+static void vring_free_queue(struct virtio_device *vdev, size_t size,
+ void *queue, dma_addr_t dma_handle)
+{
+ if (vring_use_dma_api(vdev)) {
+ dma_free_coherent(vdev->dev.parent, size, queue, dma_handle);
+ } else {
+ free_pages_exact(queue, PAGE_ALIGN(size));
+ }
+}
+
+struct virtqueue *vring_create_virtqueue(
+ unsigned int index,
+ unsigned int num,
+ unsigned int vring_align,
+ struct virtio_device *vdev,
+ bool weak_barriers,
+ bool may_reduce_num,
+ bool (*notify)(struct virtqueue *),
+ void (*callback)(struct virtqueue *),
+ const char *name)
+{
+ struct virtqueue *vq;
+ void *queue;
+ dma_addr_t dma_addr;
+ size_t queue_size_in_bytes;
+ struct vring vring;
+
+ /* We assume num is a power of 2. */
+ if (num & (num - 1)) {
+ dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num);
+ return NULL;
+ }
+
+ /* TODO: allocate each queue chunk individually */
+ for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) {
+ queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
+ &dma_addr,
+ GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO);
+ if (queue)
+ break;
+ }
+
+ if (!num)
+ return NULL;
+
+ if (!queue) {
+ /* Try to get a single page. You are my only hope! */
+ queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
+ &dma_addr, GFP_KERNEL|__GFP_ZERO);
+ }
+ if (!queue)
+ return NULL;
+
+ queue_size_in_bytes = vring_size(num, vring_align);
+ vring_init(&vring, num, queue, vring_align);
+
+ vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers,
+ notify, callback, name);
+ if (!vq) {
+ vring_free_queue(vdev, queue_size_in_bytes, queue,
+ dma_addr);
+ return NULL;
+ }
+
+ to_vvq(vq)->queue_dma_addr = dma_addr;
+ to_vvq(vq)->queue_size_in_bytes = queue_size_in_bytes;
+ to_vvq(vq)->we_own_ring = true;
+
+ return vq;
+}
+EXPORT_SYMBOL_GPL(vring_create_virtqueue);
+
+struct virtqueue *vring_new_virtqueue(unsigned int index,
+ unsigned int num,
+ unsigned int vring_align,
+ struct virtio_device *vdev,
+ bool weak_barriers,
+ void *pages,
+ bool (*notify)(struct virtqueue *vq),
+ void (*callback)(struct virtqueue *vq),
+ const char *name)
+{
+ struct vring vring;
+ vring_init(&vring, num, pages, vring_align);
+ return __vring_new_virtqueue(index, vring, vdev, weak_barriers,
+ notify, callback, name);
+}
EXPORT_SYMBOL_GPL(vring_new_virtqueue);
-void vring_del_virtqueue(struct virtqueue *vq)
+void vring_del_virtqueue(struct virtqueue *_vq)
{
- list_del(&vq->list);
- kfree(to_vvq(vq));
+ struct vring_virtqueue *vq = to_vvq(_vq);
+
+ if (vq->we_own_ring) {
+ vring_free_queue(vq->vq.vdev, vq->queue_size_in_bytes,
+ vq->vring.desc, vq->queue_dma_addr);
+ }
+ list_del(&_vq->list);
+ kfree(vq);
}
EXPORT_SYMBOL_GPL(vring_del_virtqueue);
@@ -853,20 +1146,42 @@ void virtio_break_device(struct virtio_device *dev)
}
EXPORT_SYMBOL_GPL(virtio_break_device);
-void *virtqueue_get_avail(struct virtqueue *_vq)
+dma_addr_t virtqueue_get_desc_addr(struct virtqueue *_vq)
{
struct vring_virtqueue *vq = to_vvq(_vq);
- return vq->vring.avail;
+ BUG_ON(!vq->we_own_ring);
+
+ return vq->queue_dma_addr;
}
-EXPORT_SYMBOL_GPL(virtqueue_get_avail);
+EXPORT_SYMBOL_GPL(virtqueue_get_desc_addr);
-void *virtqueue_get_used(struct virtqueue *_vq)
+dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
{
struct vring_virtqueue *vq = to_vvq(_vq);
- return vq->vring.used;
+ BUG_ON(!vq->we_own_ring);
+
+ return vq->queue_dma_addr +
+ ((char *)vq->vring.avail - (char *)vq->vring.desc);
+}
+EXPORT_SYMBOL_GPL(virtqueue_get_avail_addr);
+
+dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+
+ BUG_ON(!vq->we_own_ring);
+
+ return vq->queue_dma_addr +
+ ((char *)vq->vring.used - (char *)vq->vring.desc);
+}
+EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);
+
+const struct vring *virtqueue_get_vring(struct virtqueue *vq)
+{
+ return &to_vvq(vq)->vring;
}
-EXPORT_SYMBOL_GPL(virtqueue_get_used);
+EXPORT_SYMBOL_GPL(virtqueue_get_vring);
MODULE_LICENSE("GPL");