diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-05-02 14:14:04 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-05-02 14:14:04 -0700 |
commit | 736a2dd2571ac56b11ed95a7814d838d5311be04 (patch) | |
tree | de10d107025970c6e51d5b6faeba799ed4b9caae /drivers | |
parent | 0b2e3b6bb4a415379f16e38fc92db42379be47a1 (diff) | |
parent | 01d779a14ef800b74684d9692add4944df052461 (diff) | |
download | linux-736a2dd2571ac56b11ed95a7814d838d5311be04.tar.bz2 |
Merge tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux
Pull virtio & lguest updates from Rusty Russell:
"Lots of virtio work which wasn't quite ready for last merge window.
Plus I dived into lguest again, reworking the pagetable code so we can
move the switcher page: our fixmaps sometimes take more than 2MB now..."
Ugh. Annoying conflicts with the tcm_vhost -> vhost_scsi rename.
Hopefully correctly resolved.
* tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux: (57 commits)
caif_virtio: Remove bouncing email addresses
lguest: improve code readability in lg_cpu_start.
virtio-net: fill only rx queues which are being used
lguest: map Switcher below fixmap.
lguest: cache last cpu we ran on.
lguest: map Switcher text whenever we allocate a new pagetable.
lguest: don't share Switcher PTE pages between guests.
lguest: expost switcher_pages array (as lg_switcher_pages).
lguest: extract shadow PTE walking / allocating.
lguest: make check_gpte et. al return bool.
lguest: assume Switcher text is a single page.
lguest: rename switcher_page to switcher_pages.
lguest: remove RESERVE_MEM constant.
lguest: check vaddr not pgd for Switcher protection.
lguest: prepare to make SWITCHER_ADDR a variable.
virtio: console: replace EMFILE with EBUSY for already-open port
virtio-scsi: reset virtqueue affinity when doing cpu hotplug
virtio-scsi: introduce multiqueue support
virtio-scsi: push vq lock/unlock into virtscsi_vq_done
virtio-scsi: pass struct virtio_scsi to virtqueue completion function
...
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/Makefile | 2 | ||||
-rw-r--r-- | drivers/block/virtio_blk.c | 148 | ||||
-rw-r--r-- | drivers/char/hw_random/virtio-rng.c | 2 | ||||
-rw-r--r-- | drivers/char/virtio_console.c | 14 | ||||
-rw-r--r-- | drivers/lguest/Kconfig | 5 | ||||
-rw-r--r-- | drivers/lguest/core.c | 67 | ||||
-rw-r--r-- | drivers/lguest/lg.h | 6 | ||||
-rw-r--r-- | drivers/lguest/lguest_user.c | 6 | ||||
-rw-r--r-- | drivers/lguest/page_tables.c | 567 | ||||
-rw-r--r-- | drivers/lguest/x86/core.c | 7 | ||||
-rw-r--r-- | drivers/net/caif/Kconfig | 14 | ||||
-rw-r--r-- | drivers/net/caif/Makefile | 3 | ||||
-rw-r--r-- | drivers/net/caif/caif_virtio.c | 790 | ||||
-rw-r--r-- | drivers/net/virtio_net.c | 77 | ||||
-rw-r--r-- | drivers/rpmsg/virtio_rpmsg_bus.c | 8 | ||||
-rw-r--r-- | drivers/scsi/virtio_scsi.c | 487 | ||||
-rw-r--r-- | drivers/vhost/Kconfig | 8 | ||||
-rw-r--r-- | drivers/vhost/Makefile | 2 | ||||
-rw-r--r-- | drivers/vhost/test.c | 4 | ||||
-rw-r--r-- | drivers/vhost/vringh.c | 1007 | ||||
-rw-r--r-- | drivers/virtio/virtio_balloon.c | 6 | ||||
-rw-r--r-- | drivers/virtio/virtio_ring.c | 297 |
22 files changed, 2850 insertions, 677 deletions
diff --git a/drivers/Makefile b/drivers/Makefile index 33360de63650..8e57688ebd95 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -124,7 +124,7 @@ obj-$(CONFIG_PPC_PS3) += ps3/ obj-$(CONFIG_OF) += of/ obj-$(CONFIG_SSB) += ssb/ obj-$(CONFIG_BCMA) += bcma/ -obj-$(CONFIG_VHOST_NET) += vhost/ +obj-$(CONFIG_VHOST_RING) += vhost/ obj-$(CONFIG_VLYNQ) += vlynq/ obj-$(CONFIG_STAGING) += staging/ obj-y += platform/ diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 8ad21a25bc0d..64723953e1c9 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -100,96 +100,103 @@ static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk, return vbr; } -static void virtblk_add_buf_wait(struct virtio_blk *vblk, - struct virtblk_req *vbr, - unsigned long out, - unsigned long in) +static int __virtblk_add_req(struct virtqueue *vq, + struct virtblk_req *vbr, + struct scatterlist *data_sg, + bool have_data) { - DEFINE_WAIT(wait); + struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6]; + unsigned int num_out = 0, num_in = 0; + int type = vbr->out_hdr.type & ~VIRTIO_BLK_T_OUT; - for (;;) { - prepare_to_wait_exclusive(&vblk->queue_wait, &wait, - TASK_UNINTERRUPTIBLE); + sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr)); + sgs[num_out++] = &hdr; - spin_lock_irq(vblk->disk->queue->queue_lock); - if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, - GFP_ATOMIC) < 0) { - spin_unlock_irq(vblk->disk->queue->queue_lock); - io_schedule(); - } else { - virtqueue_kick(vblk->vq); - spin_unlock_irq(vblk->disk->queue->queue_lock); - break; - } + /* + * If this is a packet command we need a couple of additional headers. + * Behind the normal outhdr we put a segment with the scsi command + * block, and before the normal inhdr we put the sense data and the + * inhdr with additional status information. + */ + if (type == VIRTIO_BLK_T_SCSI_CMD) { + sg_init_one(&cmd, vbr->req->cmd, vbr->req->cmd_len); + sgs[num_out++] = &cmd; + } + if (have_data) { + if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT) + sgs[num_out++] = data_sg; + else + sgs[num_out + num_in++] = data_sg; } - finish_wait(&vblk->queue_wait, &wait); + if (type == VIRTIO_BLK_T_SCSI_CMD) { + sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE); + sgs[num_out + num_in++] = &sense; + sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr)); + sgs[num_out + num_in++] = &inhdr; + } + + sg_init_one(&status, &vbr->status, sizeof(vbr->status)); + sgs[num_out + num_in++] = &status; + + return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); } -static inline void virtblk_add_req(struct virtblk_req *vbr, - unsigned int out, unsigned int in) +static void virtblk_add_req(struct virtblk_req *vbr, bool have_data) { struct virtio_blk *vblk = vbr->vblk; + DEFINE_WAIT(wait); + int ret; spin_lock_irq(vblk->disk->queue->queue_lock); - if (unlikely(virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, - GFP_ATOMIC) < 0)) { + while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, vbr->sg, + have_data)) < 0)) { + prepare_to_wait_exclusive(&vblk->queue_wait, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock_irq(vblk->disk->queue->queue_lock); - virtblk_add_buf_wait(vblk, vbr, out, in); - return; + io_schedule(); + spin_lock_irq(vblk->disk->queue->queue_lock); + + finish_wait(&vblk->queue_wait, &wait); } + virtqueue_kick(vblk->vq); spin_unlock_irq(vblk->disk->queue->queue_lock); } -static int virtblk_bio_send_flush(struct virtblk_req *vbr) +static void virtblk_bio_send_flush(struct virtblk_req *vbr) { - unsigned int out = 0, in = 0; - vbr->flags |= VBLK_IS_FLUSH; vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; vbr->out_hdr.sector = 0; vbr->out_hdr.ioprio = 0; - sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); - sg_set_buf(&vbr->sg[out + in++], &vbr->status, sizeof(vbr->status)); - - virtblk_add_req(vbr, out, in); - return 0; + virtblk_add_req(vbr, false); } -static int virtblk_bio_send_data(struct virtblk_req *vbr) +static void virtblk_bio_send_data(struct virtblk_req *vbr) { struct virtio_blk *vblk = vbr->vblk; - unsigned int num, out = 0, in = 0; struct bio *bio = vbr->bio; + bool have_data; vbr->flags &= ~VBLK_IS_FLUSH; vbr->out_hdr.type = 0; vbr->out_hdr.sector = bio->bi_sector; vbr->out_hdr.ioprio = bio_prio(bio); - sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); - - num = blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg + out); - - sg_set_buf(&vbr->sg[num + out + in++], &vbr->status, - sizeof(vbr->status)); - - if (num) { - if (bio->bi_rw & REQ_WRITE) { + if (blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg)) { + have_data = true; + if (bio->bi_rw & REQ_WRITE) vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; - out += num; - } else { + else vbr->out_hdr.type |= VIRTIO_BLK_T_IN; - in += num; - } - } + } else + have_data = false; - virtblk_add_req(vbr, out, in); - - return 0; + virtblk_add_req(vbr, have_data); } static void virtblk_bio_send_data_work(struct work_struct *work) @@ -298,7 +305,7 @@ static void virtblk_done(struct virtqueue *vq) static bool do_req(struct request_queue *q, struct virtio_blk *vblk, struct request *req) { - unsigned long num, out = 0, in = 0; + unsigned int num; struct virtblk_req *vbr; vbr = virtblk_alloc_req(vblk, GFP_ATOMIC); @@ -335,40 +342,15 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, } } - sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); - - /* - * If this is a packet command we need a couple of additional headers. - * Behind the normal outhdr we put a segment with the scsi command - * block, and before the normal inhdr we put the sense data and the - * inhdr with additional status information before the normal inhdr. - */ - if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) - sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len); - - num = blk_rq_map_sg(q, vbr->req, vblk->sg + out); - - if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) { - sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, SCSI_SENSE_BUFFERSIZE); - sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr, - sizeof(vbr->in_hdr)); - } - - sg_set_buf(&vblk->sg[num + out + in++], &vbr->status, - sizeof(vbr->status)); - + num = blk_rq_map_sg(q, vbr->req, vblk->sg); if (num) { - if (rq_data_dir(vbr->req) == WRITE) { + if (rq_data_dir(vbr->req) == WRITE) vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; - out += num; - } else { + else vbr->out_hdr.type |= VIRTIO_BLK_T_IN; - in += num; - } } - if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr, - GFP_ATOMIC) < 0) { + if (__virtblk_add_req(vblk->vq, vbr, vblk->sg, num) < 0) { mempool_free(vbr, vblk->pool); return false; } @@ -539,6 +521,7 @@ static void virtblk_config_changed_work(struct work_struct *work) struct virtio_device *vdev = vblk->vdev; struct request_queue *q = vblk->disk->queue; char cap_str_2[10], cap_str_10[10]; + char *envp[] = { "RESIZE=1", NULL }; u64 capacity, size; mutex_lock(&vblk->config_lock); @@ -568,6 +551,7 @@ static void virtblk_config_changed_work(struct work_struct *work) set_capacity(vblk->disk, capacity); revalidate_disk(vblk->disk); + kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp); done: mutex_unlock(&vblk->config_lock); } diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 6bf4d47324eb..ef46a9cfd832 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -47,7 +47,7 @@ static void register_buffer(u8 *buf, size_t size) sg_init_one(&sg, buf, size); /* There should always be room for one buffer. */ - if (virtqueue_add_buf(vq, &sg, 0, 1, buf, GFP_KERNEL) < 0) + if (virtqueue_add_inbuf(vq, &sg, 1, buf, GFP_KERNEL) < 0) BUG(); virtqueue_kick(vq); diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index ce5f3fc25d6d..1b456fe9b87a 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -78,8 +78,8 @@ struct ports_driver_data { }; static struct ports_driver_data pdrvdata; -DEFINE_SPINLOCK(pdrvdata_lock); -DECLARE_COMPLETION(early_console_added); +static DEFINE_SPINLOCK(pdrvdata_lock); +static DECLARE_COMPLETION(early_console_added); /* This struct holds information that's relevant only for console ports */ struct console { @@ -503,7 +503,7 @@ static int add_inbuf(struct virtqueue *vq, struct port_buffer *buf) sg_init_one(sg, buf->buf, buf->size); - ret = virtqueue_add_buf(vq, sg, 0, 1, buf, GFP_ATOMIC); + ret = virtqueue_add_inbuf(vq, sg, 1, buf, GFP_ATOMIC); virtqueue_kick(vq); if (!ret) ret = vq->num_free; @@ -572,7 +572,7 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id, sg_init_one(sg, &cpkt, sizeof(cpkt)); spin_lock(&portdev->c_ovq_lock); - if (virtqueue_add_buf(vq, sg, 1, 0, &cpkt, GFP_ATOMIC) == 0) { + if (virtqueue_add_outbuf(vq, sg, 1, &cpkt, GFP_ATOMIC) == 0) { virtqueue_kick(vq); while (!virtqueue_get_buf(vq, &len)) cpu_relax(); @@ -622,7 +622,7 @@ static ssize_t __send_to_port(struct port *port, struct scatterlist *sg, reclaim_consumed_buffers(port); - err = virtqueue_add_buf(out_vq, sg, nents, 0, data, GFP_ATOMIC); + err = virtqueue_add_outbuf(out_vq, sg, nents, data, GFP_ATOMIC); /* Tell Host to go! */ virtqueue_kick(out_vq); @@ -1040,7 +1040,7 @@ static int port_fops_open(struct inode *inode, struct file *filp) spin_lock_irq(&port->inbuf_lock); if (port->guest_connected) { spin_unlock_irq(&port->inbuf_lock); - ret = -EMFILE; + ret = -EBUSY; goto out; } @@ -1202,7 +1202,7 @@ int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int)) return hvc_instantiate(0, 0, &hv_ops); } -int init_port_console(struct port *port) +static int init_port_console(struct port *port) { int ret; diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig index 89875ea19ade..ee035ec4526b 100644 --- a/drivers/lguest/Kconfig +++ b/drivers/lguest/Kconfig @@ -5,10 +5,9 @@ config LGUEST ---help--- This is a very simple module which allows you to run multiple instances of the same Linux kernel, using the - "lguest" command found in the Documentation/virtual/lguest - directory. + "lguest" command found in the tools/lguest directory. Note that "lguest" is pronounced to rhyme with "fell quest", - not "rustyvisor". See Documentation/virtual/lguest/lguest.txt. + not "rustyvisor". See tools/lguest/lguest.txt. If unsure, say N. If curious, say M. If masochistic, say Y. diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index a5ebc0083d87..0bf1e4edf04d 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -20,9 +20,9 @@ #include <asm/asm-offsets.h> #include "lg.h" - +unsigned long switcher_addr; +struct page **lg_switcher_pages; static struct vm_struct *switcher_vma; -static struct page **switcher_page; /* This One Big lock protects all inter-guest data structures. */ DEFINE_MUTEX(lguest_lock); @@ -52,13 +52,21 @@ static __init int map_switcher(void) * easy. */ + /* We assume Switcher text fits into a single page. */ + if (end_switcher_text - start_switcher_text > PAGE_SIZE) { + printk(KERN_ERR "lguest: switcher text too large (%zu)\n", + end_switcher_text - start_switcher_text); + return -EINVAL; + } + /* * We allocate an array of struct page pointers. map_vm_area() wants * this, rather than just an array of pages. */ - switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, - GFP_KERNEL); - if (!switcher_page) { + lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0]) + * TOTAL_SWITCHER_PAGES, + GFP_KERNEL); + if (!lg_switcher_pages) { err = -ENOMEM; goto out; } @@ -68,32 +76,29 @@ static __init int map_switcher(void) * so we make sure they're zeroed. */ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { - switcher_page[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); - if (!switcher_page[i]) { + lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (!lg_switcher_pages[i]) { err = -ENOMEM; goto free_some_pages; } } /* - * First we check that the Switcher won't overlap the fixmap area at - * the top of memory. It's currently nowhere near, but it could have - * very strange effects if it ever happened. + * We place the Switcher underneath the fixmap area, which is the + * highest virtual address we can get. This is important, since we + * tell the Guest it can't access this memory, so we want its ceiling + * as high as possible. */ - if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){ - err = -ENOMEM; - printk("lguest: mapping switcher would thwack fixmap\n"); - goto free_pages; - } + switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE; /* - * Now we reserve the "virtual memory area" we want: 0xFFC00000 - * (SWITCHER_ADDR). We might not get it in theory, but in practice - * it's worked so far. The end address needs +1 because __get_vm_area - * allocates an extra guard page, so we need space for that. + * Now we reserve the "virtual memory area" we want. We might + * not get it in theory, but in practice it's worked so far. + * The end address needs +1 because __get_vm_area allocates an + * extra guard page, so we need space for that. */ switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, - VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR + VM_ALLOC, switcher_addr, switcher_addr + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); if (!switcher_vma) { err = -ENOMEM; @@ -103,12 +108,12 @@ static __init int map_switcher(void) /* * This code actually sets up the pages we've allocated to appear at - * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the + * switcher_addr. map_vm_area() takes the vma we allocated above, the * kind of pages we're mapping (kernel pages), and a pointer to our * array of struct pages. It increments that pointer, but we don't * care. */ - pagep = switcher_page; + pagep = lg_switcher_pages; err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); if (err) { printk("lguest: map_vm_area failed: %i\n", err); @@ -133,8 +138,8 @@ free_pages: i = TOTAL_SWITCHER_PAGES; free_some_pages: for (--i; i >= 0; i--) - __free_pages(switcher_page[i], 0); - kfree(switcher_page); + __free_pages(lg_switcher_pages[i], 0); + kfree(lg_switcher_pages); out: return err; } @@ -149,8 +154,8 @@ static void unmap_switcher(void) vunmap(switcher_vma->addr); /* Now we just need to free the pages we copied the switcher into */ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) - __free_pages(switcher_page[i], 0); - kfree(switcher_page); + __free_pages(lg_switcher_pages[i], 0); + kfree(lg_switcher_pages); } /*H:032 @@ -323,15 +328,10 @@ static int __init init(void) if (err) goto out; - /* Now we set up the pagetable implementation for the Guests. */ - err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES); - if (err) - goto unmap; - /* We might need to reserve an interrupt vector. */ err = init_interrupts(); if (err) - goto free_pgtables; + goto unmap; /* /dev/lguest needs to be registered. */ err = lguest_device_init(); @@ -346,8 +346,6 @@ static int __init init(void) free_interrupts: free_interrupts(); -free_pgtables: - free_pagetables(); unmap: unmap_switcher(); out: @@ -359,7 +357,6 @@ static void __exit fini(void) { lguest_device_remove(); free_interrupts(); - free_pagetables(); unmap_switcher(); lguest_arch_host_fini(); diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 295df06e6590..2eef40be4c04 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -14,11 +14,10 @@ #include <asm/lguest.h> -void free_pagetables(void); -int init_pagetables(struct page **switcher_page, unsigned int pages); - struct pgdir { unsigned long gpgdir; + bool switcher_mapped; + int last_host_cpu; pgd_t *pgdir; }; @@ -124,6 +123,7 @@ bool lguest_address_ok(const struct lguest *lg, unsigned long addr, unsigned long len); void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); +extern struct page **lg_switcher_pages; /*H:035 * Using memory-copy operations like that is usually inconvient, so we diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index ff4a0bc9904d..4263f4cc8c55 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -250,13 +250,13 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) */ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) { - /* We have a limited number the number of CPUs in the lguest struct. */ + /* We have a limited number of CPUs in the lguest struct. */ if (id >= ARRAY_SIZE(cpu->lg->cpus)) return -EINVAL; /* Set up this CPU's id, and pointer back to the lguest struct. */ cpu->id = id; - cpu->lg = container_of((cpu - id), struct lguest, cpus[0]); + cpu->lg = container_of(cpu, struct lguest, cpus[id]); cpu->lg->nr_cpus++; /* Each CPU has a timer it can set. */ @@ -270,7 +270,7 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) if (!cpu->regs_page) return -ENOMEM; - /* We actually put the registers at the bottom of the page. */ + /* We actually put the registers at the end of the page. */ cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); /* diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 864baabaee25..699187ab3800 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -7,7 +7,7 @@ * converted Guest pages when running the Guest. :*/ -/* Copyright (C) Rusty Russell IBM Corporation 2006. +/* Copyright (C) Rusty Russell IBM Corporation 2013. * GPL v2 and any later version */ #include <linux/mm.h> #include <linux/gfp.h> @@ -62,22 +62,11 @@ * will need the last pmd entry of the last pmd page. */ #ifdef CONFIG_X86_PAE -#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) -#define RESERVE_MEM 2U #define CHECK_GPGD_MASK _PAGE_PRESENT #else -#define RESERVE_MEM 4U #define CHECK_GPGD_MASK _PAGE_TABLE #endif -/* - * We actually need a separate PTE page for each CPU. Remember that after the - * Switcher code itself comes two pages for each CPU, and we don't want this - * CPU's guest to see the pages of any other CPU. - */ -static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); -#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) - /*H:320 * The page table code is curly enough to need helper functions to keep it * clear and clean. The kernel itself provides many of them; one advantage @@ -95,13 +84,6 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) { unsigned int index = pgd_index(vaddr); -#ifndef CONFIG_X86_PAE - /* We kill any Guest trying to touch the Switcher addresses. */ - if (index >= SWITCHER_PGD_INDEX) { - kill_guest(cpu, "attempt to access switcher pages"); - index = 0; - } -#endif /* Return a pointer index'th pgd entry for the i'th page table. */ return &cpu->lg->pgdirs[i].pgdir[index]; } @@ -117,13 +99,6 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) unsigned int index = pmd_index(vaddr); pmd_t *page; - /* We kill any Guest trying to touch the Switcher addresses. */ - if (pgd_index(vaddr) == SWITCHER_PGD_INDEX && - index >= SWITCHER_PMD_INDEX) { - kill_guest(cpu, "attempt to access switcher pages"); - index = 0; - } - /* You should never call this if the PGD entry wasn't valid */ BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); page = __va(pgd_pfn(spgd) << PAGE_SHIFT); @@ -275,122 +250,177 @@ static void release_pte(pte_t pte) } /*:*/ -static void check_gpte(struct lg_cpu *cpu, pte_t gpte) +static bool check_gpte(struct lg_cpu *cpu, pte_t gpte) { if ((pte_flags(gpte) & _PAGE_PSE) || - pte_pfn(gpte) >= cpu->lg->pfn_limit) + pte_pfn(gpte) >= cpu->lg->pfn_limit) { kill_guest(cpu, "bad page table entry"); + return false; + } + return true; } -static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) +static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) { if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || - (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) + (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) { kill_guest(cpu, "bad page directory entry"); + return false; + } + return true; } #ifdef CONFIG_X86_PAE -static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) +static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) { if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || - (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) + (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) { kill_guest(cpu, "bad page middle directory entry"); + return false; + } + return true; } #endif -/*H:330 - * (i) Looking up a page table entry when the Guest faults. - * - * We saw this call in run_guest(): when we see a page fault in the Guest, we - * come here. That's because we only set up the shadow page tables lazily as - * they're needed, so we get page faults all the time and quietly fix them up - * and return to the Guest without it knowing. +/*H:331 + * This is the core routine to walk the shadow page tables and find the page + * table entry for a specific address. * - * If we fixed up the fault (ie. we mapped the address), this routine returns - * true. Otherwise, it was a real fault and we need to tell the Guest. + * If allocate is set, then we allocate any missing levels, setting the flags + * on the new page directory and mid-level directories using the arguments + * (which are copied from the Guest's page table entries). */ -bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) +static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate, + int pgd_flags, int pmd_flags) { - pgd_t gpgd; pgd_t *spgd; - unsigned long gpte_ptr; - pte_t gpte; - pte_t *spte; - /* Mid level for PAE. */ #ifdef CONFIG_X86_PAE pmd_t *spmd; - pmd_t gpmd; #endif - /* First step: get the top-level Guest page table entry. */ - if (unlikely(cpu->linear_pages)) { - /* Faking up a linear mapping. */ - gpgd = __pgd(CHECK_GPGD_MASK); - } else { - gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); - /* Toplevel not present? We can't map it in. */ - if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) - return false; - } - - /* Now look at the matching shadow entry. */ + /* Get top level entry. */ spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { /* No shadow entry: allocate a new shadow PTE page. */ - unsigned long ptepage = get_zeroed_page(GFP_KERNEL); + unsigned long ptepage; + + /* If they didn't want us to allocate anything, stop. */ + if (!allocate) + return NULL; + + ptepage = get_zeroed_page(GFP_KERNEL); /* * This is not really the Guest's fault, but killing it is * simple for this corner case. */ if (!ptepage) { kill_guest(cpu, "out of memory allocating pte page"); - return false; + return NULL; } - /* We check that the Guest pgd is OK. */ - check_gpgd(cpu, gpgd); /* * And we copy the flags to the shadow PGD entry. The page * number in the shadow PGD is the page we just allocated. */ - set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); + set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags)); } + /* + * Intel's Physical Address Extension actually uses three levels of + * page tables, so we need to look in the mid-level. + */ #ifdef CONFIG_X86_PAE - if (unlikely(cpu->linear_pages)) { - /* Faking up a linear mapping. */ - gpmd = __pmd(_PAGE_TABLE); - } else { - gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); - /* Middle level not present? We can't map it in. */ - if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) - return false; - } - - /* Now look at the matching shadow entry. */ + /* Now look at the mid-level shadow entry. */ spmd = spmd_addr(cpu, *spgd, vaddr); if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { /* No shadow entry: allocate a new shadow PTE page. */ - unsigned long ptepage = get_zeroed_page(GFP_KERNEL); + unsigned long ptepage; + + /* If they didn't want us to allocate anything, stop. */ + if (!allocate) + return NULL; + + ptepage = get_zeroed_page(GFP_KERNEL); /* * This is not really the Guest's fault, but killing it is * simple for this corner case. */ if (!ptepage) { - kill_guest(cpu, "out of memory allocating pte page"); - return false; + kill_guest(cpu, "out of memory allocating pmd page"); + return NULL; } - /* We check that the Guest pmd is OK. */ - check_gpmd(cpu, gpmd); - /* * And we copy the flags to the shadow PMD entry. The page * number in the shadow PMD is the page we just allocated. */ - set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); + set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags)); + } +#endif + + /* Get the pointer to the shadow PTE entry we're going to set. */ + return spte_addr(cpu, *spgd, vaddr); +} + +/*H:330 + * (i) Looking up a page table entry when the Guest faults. + * + * We saw this call in run_guest(): when we see a page fault in the Guest, we + * come here. That's because we only set up the shadow page tables lazily as + * they're needed, so we get page faults all the time and quietly fix them up + * and return to the Guest without it knowing. + * + * If we fixed up the fault (ie. we mapped the address), this routine returns + * true. Otherwise, it was a real fault and we need to tell the Guest. + */ +bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) +{ + unsigned long gpte_ptr; + pte_t gpte; + pte_t *spte; + pmd_t gpmd; + pgd_t gpgd; + + /* We never demand page the Switcher, so trying is a mistake. */ + if (vaddr >= switcher_addr) + return false; + + /* First step: get the top-level Guest page table entry. */ + if (unlikely(cpu->linear_pages)) { + /* Faking up a linear mapping. */ + gpgd = __pgd(CHECK_GPGD_MASK); + } else { + gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); + /* Toplevel not present? We can't map it in. */ + if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) + return false; + + /* + * This kills the Guest if it has weird flags or tries to + * refer to a "physical" address outside the bounds. + */ + if (!check_gpgd(cpu, gpgd)) + return false; + } + + /* This "mid-level" entry is only used for non-linear, PAE mode. */ + gpmd = __pmd(_PAGE_TABLE); + +#ifdef CONFIG_X86_PAE + if (likely(!cpu->linear_pages)) { + gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); + /* Middle level not present? We can't map it in. */ + if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) + return false; + + /* + * This kills the Guest if it has weird flags or tries to + * refer to a "physical" address outside the bounds. + */ + if (!check_gpmd(cpu, gpmd)) + return false; } /* @@ -433,7 +463,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) * Check that the Guest PTE flags are OK, and the page number is below * the pfn_limit (ie. not mapping the Launcher binary). */ - check_gpte(cpu, gpte); + if (!check_gpte(cpu, gpte)) + return false; /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ gpte = pte_mkyoung(gpte); @@ -441,7 +472,9 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) gpte = pte_mkdirty(gpte); /* Get the pointer to the shadow PTE entry we're going to set. */ - spte = spte_addr(cpu, *spgd, vaddr); + spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd)); + if (!spte) + return false; /* * If there was a valid shadow PTE entry here before, we release it. @@ -493,29 +526,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) */ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) { - pgd_t *spgd; + pte_t *spte; unsigned long flags; -#ifdef CONFIG_X86_PAE - pmd_t *spmd; -#endif - /* Look at the current top level entry: is it present? */ - spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); - if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) + /* You can't put your stack in the Switcher! */ + if (vaddr >= switcher_addr) return false; -#ifdef CONFIG_X86_PAE - spmd = spmd_addr(cpu, *spgd, vaddr); - if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) + /* If there's no shadow PTE, it's not writable. */ + spte = find_spte(cpu, vaddr, false, 0, 0); + if (!spte) return false; -#endif /* * Check the flags on the pte entry itself: it must be present and * writable. */ - flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); - + flags = pte_flags(*spte); return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); } @@ -678,9 +705,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, int *blank_pgdir) { unsigned int next; -#ifdef CONFIG_X86_PAE - pmd_t *pmd_table; -#endif /* * We pick one entry at random to throw out. Choosing the Least @@ -695,29 +719,11 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, if (!cpu->lg->pgdirs[next].pgdir) next = cpu->cpu_pgd; else { -#ifdef CONFIG_X86_PAE /* - * In PAE mode, allocate a pmd page and populate the - * last pgd entry. + * This is a blank page, so there are no kernel + * mappings: caller must map the stack! */ - pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); - if (!pmd_table) { - free_page((long)cpu->lg->pgdirs[next].pgdir); - set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0)); - next = cpu->cpu_pgd; - } else { - set_pgd(cpu->lg->pgdirs[next].pgdir + - SWITCHER_PGD_INDEX, - __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - /* - * This is a blank page, so there are no kernel - * mappings: caller must map the stack! - */ - *blank_pgdir = 1; - } -#else *blank_pgdir = 1; -#endif } } /* Record which Guest toplevel this shadows. */ @@ -725,9 +731,50 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, /* Release all the non-kernel mappings. */ flush_user_mappings(cpu->lg, next); + /* This hasn't run on any CPU at all. */ + cpu->lg->pgdirs[next].last_host_cpu = -1; + return next; } +/*H:501 + * We do need the Switcher code mapped at all times, so we allocate that + * part of the Guest page table here. We map the Switcher code immediately, + * but defer mapping of the guest register page and IDT/LDT etc page until + * just before we run the guest in map_switcher_in_guest(). + * + * We *could* do this setup in map_switcher_in_guest(), but at that point + * we've interrupts disabled, and allocating pages like that is fraught: we + * can't sleep if we need to free up some memory. + */ +static bool allocate_switcher_mapping(struct lg_cpu *cpu) +{ + int i; + + for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { + pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true, + CHECK_GPGD_MASK, _PAGE_TABLE); + if (!pte) + return false; + + /* + * Map the switcher page if not already there. It might + * already be there because we call allocate_switcher_mapping() + * in guest_set_pgd() just in case it did discard our Switcher + * mapping, but it probably didn't. + */ + if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) { + /* Get a reference to the Switcher page. */ + get_page(lg_switcher_pages[0]); + /* Create a read-only, exectuable, kernel-style PTE */ + set_pte(pte, + mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX)); + } + } + cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true; + return true; +} + /*H:470 * Finally, a routine which throws away everything: all PGD entries in all * the shadow page tables, including the Guest's kernel mappings. This is used @@ -738,28 +785,16 @@ static void release_all_pagetables(struct lguest *lg) unsigned int i, j; /* Every shadow pagetable this Guest has */ - for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) - if (lg->pgdirs[i].pgdir) { -#ifdef CONFIG_X86_PAE - pgd_t *spgd; - pmd_t *pmdpage; - unsigned int k; - - /* Get the last pmd page. */ - spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; - pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); - - /* - * And release the pmd entries of that pmd page, - * except for the switcher pmd. - */ - for (k = 0; k < SWITCHER_PMD_INDEX; k++) - release_pmd(&pmdpage[k]); -#endif - /* Every PGD entry except the Switcher at the top */ - for (j = 0; j < SWITCHER_PGD_INDEX; j++) - release_pgd(lg->pgdirs[i].pgdir + j); - } + for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) { + if (!lg->pgdirs[i].pgdir) + continue; + + /* Every PGD entry. */ + for (j = 0; j < PTRS_PER_PGD; j++) + release_pgd(lg->pgdirs[i].pgdir + j); + lg->pgdirs[i].switcher_mapped = false; + lg->pgdirs[i].last_host_cpu = -1; + } } /* @@ -773,6 +808,9 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu) release_all_pagetables(cpu->lg); /* We need the Guest kernel stack mapped again. */ pin_stack_pages(cpu); + /* And we need Switcher allocated. */ + if (!allocate_switcher_mapping(cpu)) + kill_guest(cpu, "Cannot populate switcher mapping"); } /*H:430 @@ -808,9 +846,17 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) newpgdir = new_pgdir(cpu, pgtable, &repin); /* Change the current pgd index to the new one. */ cpu->cpu_pgd = newpgdir; - /* If it was completely blank, we map in the Guest kernel stack */ + /* + * If it was completely blank, we map in the Guest kernel stack and + * the Switcher. + */ if (repin) pin_stack_pages(cpu); + + if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) { + if (!allocate_switcher_mapping(cpu)) + kill_guest(cpu, "Cannot populate switcher mapping"); + } } /*:*/ @@ -865,7 +911,8 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, * micro-benchmark. */ if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { - check_gpte(cpu, gpte); + if (!check_gpte(cpu, gpte)) + return; set_pte(spte, gpte_to_spte(cpu, gpte, pte_flags(gpte) & _PAGE_DIRTY)); @@ -897,6 +944,12 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, unsigned long vaddr, pte_t gpte) { + /* We don't let you remap the Switcher; we need it to get back! */ + if (vaddr >= switcher_addr) { + kill_guest(cpu, "attempt to set pte into Switcher pages"); + return; + } + /* * Kernel mappings must be changed on all top levels. Slow, but doesn't * happen often. @@ -933,14 +986,23 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) { int pgdir; - if (idx >= SWITCHER_PGD_INDEX) + if (idx > PTRS_PER_PGD) { + kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u", + idx, PTRS_PER_PGD); return; + } /* If they're talking about a page table we have a shadow for... */ pgdir = find_pgdir(lg, gpgdir); - if (pgdir < ARRAY_SIZE(lg->pgdirs)) + if (pgdir < ARRAY_SIZE(lg->pgdirs)) { /* ... throw it away. */ release_pgd(lg->pgdirs[pgdir].pgdir + idx); + /* That might have been the Switcher mapping, remap it. */ + if (!allocate_switcher_mapping(&lg->cpus[0])) { + kill_guest(&lg->cpus[0], + "Cannot populate switcher mapping"); + } + } } #ifdef CONFIG_X86_PAE @@ -958,6 +1020,9 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) * we will populate on future faults. The Guest doesn't have any actual * pagetables yet, so we set linear_pages to tell demand_page() to fake it * for the moment. + * + * We do need the Switcher to be mapped at all times, so we allocate that + * part of the Guest page table here. */ int init_guest_pagetable(struct lguest *lg) { @@ -971,21 +1036,34 @@ int init_guest_pagetable(struct lguest *lg) /* We start with a linear mapping until the initialize. */ cpu->linear_pages = true; + + /* Allocate the page tables for the Switcher. */ + if (!allocate_switcher_mapping(cpu)) { + release_all_pagetables(lg); + return -ENOMEM; + } + return 0; } /*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ void page_table_guest_data_init(struct lg_cpu *cpu) { + /* + * We tell the Guest that it can't use the virtual addresses + * used by the Switcher. This trick is equivalent to 4GB - + * switcher_addr. + */ + u32 top = ~switcher_addr + 1; + /* We get the kernel address: above this is all kernel memory. */ if (get_user(cpu->lg->kernel_address, - &cpu->lg->lguest_data->kernel_address) + &cpu->lg->lguest_data->kernel_address) /* - * We tell the Guest that it can't use the top 2 or 4 MB - * of virtual addresses used by the Switcher. + * We tell the Guest that it can't use the top virtual + * addresses (used by the Switcher). */ - || put_user(RESERVE_MEM * 1024 * 1024, - &cpu->lg->lguest_data->reserve_mem)) { + || put_user(top, &cpu->lg->lguest_data->reserve_mem)) { kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); return; } @@ -995,12 +1073,7 @@ void page_table_guest_data_init(struct lg_cpu *cpu) * "pgd_index(lg->kernel_address)". This assumes it won't hit the * Switcher mappings, so check that now. */ -#ifdef CONFIG_X86_PAE - if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && - pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) -#else - if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) -#endif + if (cpu->lg->kernel_address >= switcher_addr) kill_guest(cpu, "bad kernel address %#lx", cpu->lg->kernel_address); } @@ -1017,102 +1090,96 @@ void free_guest_pagetable(struct lguest *lg) free_page((long)lg->pgdirs[i].pgdir); } -/*H:480 - * (vi) Mapping the Switcher when the Guest is about to run. - * - * The Switcher and the two pages for this CPU need to be visible in the - * Guest (and not the pages for other CPUs). We have the appropriate PTE pages - * for each CPU already set up, we just need to hook them in now we know which - * Guest is about to run on this CPU. +/*H:481 + * This clears the Switcher mappings for cpu #i. */ -void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) +static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i) { - pte_t *switcher_pte_page = __this_cpu_read(switcher_pte_pages); - pte_t regs_pte; + unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2; + pte_t *pte; -#ifdef CONFIG_X86_PAE - pmd_t switcher_pmd; - pmd_t *pmd_table; - - switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT, - PAGE_KERNEL_EXEC); - - /* Figure out where the pmd page is, by reading the PGD, and converting - * it to a virtual address. */ - pmd_table = __va(pgd_pfn(cpu->lg-> - pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) - << PAGE_SHIFT); - /* Now write it into the shadow page table. */ - set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); -#else - pgd_t switcher_pgd; + /* Clear the mappings for both pages. */ + pte = find_spte(cpu, base, false, 0, 0); + release_pte(*pte); + set_pte(pte, __pte(0)); - /* - * Make the last PGD entry for this Guest point to the Switcher's PTE - * page for this CPU (with appropriate flags). - */ - switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); - - cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; - -#endif - /* - * We also change the Switcher PTE page. When we're running the Guest, - * we want the Guest's "regs" page to appear where the first Switcher - * page for this CPU is. This is an optimization: when the Switcher - * saves the Guest registers, it saves them into the first page of this - * CPU's "struct lguest_pages": if we make sure the Guest's register - * page is already mapped there, we don't have to copy them out - * again. - */ - regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL); - set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte); + pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); + release_pte(*pte); + set_pte(pte, __pte(0)); } -/*:*/ -static void free_switcher_pte_pages(void) -{ - unsigned int i; - - for_each_possible_cpu(i) - free_page((long)switcher_pte_page(i)); -} - -/*H:520 - * Setting up the Switcher PTE page for given CPU is fairly easy, given - * the CPU number and the "struct page"s for the Switcher code itself. +/*H:480 + * (vi) Mapping the Switcher when the Guest is about to run. + * + * The Switcher and the two pages for this CPU need to be visible in the Guest + * (and not the pages for other CPUs). * - * Currently the Switcher is less than a page long, so "pages" is always 1. + * The pages for the pagetables have all been allocated before: we just need + * to make sure the actual PTEs are up-to-date for the CPU we're about to run + * on. */ -static __init void populate_switcher_pte_page(unsigned int cpu, - struct page *switcher_page[], - unsigned int pages) +void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) { - unsigned int i; - pte_t *pte = switcher_pte_page(cpu); + unsigned long base; + struct page *percpu_switcher_page, *regs_page; + pte_t *pte; + struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd]; + + /* Switcher page should always be mapped by now! */ + BUG_ON(!pgdir->switcher_mapped); + + /* + * Remember that we have two pages for each Host CPU, so we can run a + * Guest on each CPU without them interfering. We need to make sure + * those pages are mapped correctly in the Guest, but since we usually + * run on the same CPU, we cache that, and only update the mappings + * when we move. + */ + if (pgdir->last_host_cpu == raw_smp_processor_id()) + return; - /* The first entries are easy: they map the Switcher code. */ - for (i = 0; i < pages; i++) { - set_pte(&pte[i], mk_pte(switcher_page[i], - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); + /* -1 means unknown so we remove everything. */ + if (pgdir->last_host_cpu == -1) { + unsigned int i; + for_each_possible_cpu(i) + remove_switcher_percpu_map(cpu, i); + } else { + /* We know exactly what CPU mapping to remove. */ + remove_switcher_percpu_map(cpu, pgdir->last_host_cpu); } - /* The only other thing we map is this CPU's pair of pages. */ - i = pages + cpu*2; - - /* First page (Guest registers) is writable from the Guest */ - set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); + /* + * When we're running the Guest, we want the Guest's "regs" page to + * appear where the first Switcher page for this CPU is. This is an + * optimization: when the Switcher saves the Guest registers, it saves + * them into the first page of this CPU's "struct lguest_pages": if we + * make sure the Guest's register page is already mapped there, we + * don't have to copy them out again. + */ + /* Find the shadow PTE for this regs page. */ + base = switcher_addr + PAGE_SIZE + + raw_smp_processor_id() * sizeof(struct lguest_pages); + pte = find_spte(cpu, base, false, 0, 0); + regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT); + get_page(regs_page); + set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL))); /* - * The second page contains the "struct lguest_ro_state", and is - * read-only. + * We map the second page of the struct lguest_pages read-only in + * the Guest: the IDT, GDT and other things it's not supposed to + * change. */ - set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); + pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); + percpu_switcher_page + = lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1]; + get_page(percpu_switcher_page); + set_pte(pte, mk_pte(percpu_switcher_page, + __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL))); + + pgdir->last_host_cpu = raw_smp_processor_id(); } -/* +/*H:490 * We've made it through the page table code. Perhaps our tired brains are * still processing the details, or perhaps we're simply glad it's over. * @@ -1124,29 +1191,3 @@ static __init void populate_switcher_pte_page(unsigned int cpu, * * There is just one file remaining in the Host. */ - -/*H:510 - * At boot or module load time, init_pagetables() allocates and populates - * the Switcher PTE page for each CPU. - */ -__init int init_pagetables(struct page **switcher_page, unsigned int pages) -{ - unsigned int i; - - for_each_possible_cpu(i) { - switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL); - if (!switcher_pte_page(i)) { - free_switcher_pte_pages(); - return -ENOMEM; - } - populate_switcher_pte_page(i, switcher_page, pages); - } - return 0; -} -/*:*/ - -/* Cleaning up simply involves freeing the PTE page for each CPU. */ -void free_pagetables(void) -{ - free_switcher_pte_pages(); -} diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 4af12e1844d5..f0a3347b6441 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -59,14 +59,13 @@ static struct { /* Offset from where switcher.S was compiled to where we've copied it */ static unsigned long switcher_offset(void) { - return SWITCHER_ADDR - (unsigned long)start_switcher_text; + return switcher_addr - (unsigned long)start_switcher_text; } -/* This cpu's struct lguest_pages. */ +/* This cpu's struct lguest_pages (after the Switcher text page) */ static struct lguest_pages *lguest_pages(unsigned int cpu) { - return &(((struct lguest_pages *) - (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); + return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]); } static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu); diff --git a/drivers/net/caif/Kconfig b/drivers/net/caif/Kconfig index a966128c2a7a..7ffc756131a2 100644 --- a/drivers/net/caif/Kconfig +++ b/drivers/net/caif/Kconfig @@ -40,3 +40,17 @@ config CAIF_HSI The caif low level driver for CAIF over HSI. Be aware that if you enable this then you also need to enable a low-level HSI driver. + +config CAIF_VIRTIO + tristate "CAIF virtio transport driver" + depends on CAIF + select VHOST_RING + select VIRTIO + select GENERIC_ALLOCATOR + default n + ---help--- + The caif driver for CAIF over Virtio. + +if CAIF_VIRTIO +source "drivers/vhost/Kconfig" +endif diff --git a/drivers/net/caif/Makefile b/drivers/net/caif/Makefile index 15a9d2fc753d..9bbd45391f6c 100644 --- a/drivers/net/caif/Makefile +++ b/drivers/net/caif/Makefile @@ -9,3 +9,6 @@ obj-$(CONFIG_CAIF_SPI_SLAVE) += cfspi_slave.o # HSI interface obj-$(CONFIG_CAIF_HSI) += caif_hsi.o + +# Virtio interface +obj-$(CONFIG_CAIF_VIRTIO) += caif_virtio.o diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c new file mode 100644 index 000000000000..b9ed1288ce2d --- /dev/null +++ b/drivers/net/caif/caif_virtio.c @@ -0,0 +1,790 @@ +/* + * Copyright (C) ST-Ericsson AB 2013 + * Authors: Vicram Arv + * Dmitry Tarnyagin <dmitry.tarnyagin@lockless.no> + * Sjur Brendeland + * License terms: GNU General Public License (GPL) version 2 + */ +#include <linux/module.h> +#include <linux/if_arp.h> +#include <linux/virtio.h> +#include <linux/vringh.h> +#include <linux/debugfs.h> +#include <linux/spinlock.h> +#include <linux/genalloc.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/virtio_ids.h> +#include <linux/virtio_caif.h> +#include <linux/virtio_ring.h> +#include <linux/dma-mapping.h> +#include <net/caif/caif_dev.h> +#include <linux/virtio_config.h> + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Vicram Arv"); +MODULE_AUTHOR("Sjur Brendeland"); +MODULE_DESCRIPTION("Virtio CAIF Driver"); + +/* NAPI schedule quota */ +#define CFV_DEFAULT_QUOTA 32 + +/* Defaults used if virtio config space is unavailable */ +#define CFV_DEF_MTU_SIZE 4096 +#define CFV_DEF_HEADROOM 32 +#define CFV_DEF_TAILROOM 32 + +/* Required IP header alignment */ +#define IP_HDR_ALIGN 4 + +/* struct cfv_napi_contxt - NAPI context info + * @riov: IOV holding data read from the ring. Note that riov may + * still hold data when cfv_rx_poll() returns. + * @head: Last descriptor ID we received from vringh_getdesc_kern. + * We use this to put descriptor back on the used ring. USHRT_MAX is + * used to indicate invalid head-id. + */ +struct cfv_napi_context { + struct vringh_kiov riov; + unsigned short head; +}; + +/* struct cfv_stats - statistics for debugfs + * @rx_napi_complete: Number of NAPI completions (RX) + * @rx_napi_resched: Number of calls where the full quota was used (RX) + * @rx_nomem: Number of SKB alloc failures (RX) + * @rx_kicks: Number of RX kicks + * @tx_full_ring: Number times TX ring was full + * @tx_no_mem: Number of times TX went out of memory + * @tx_flow_on: Number of flow on (TX) + * @tx_kicks: Number of TX kicks + */ +struct cfv_stats { + u32 rx_napi_complete; + u32 rx_napi_resched; + u32 rx_nomem; + u32 rx_kicks; + u32 tx_full_ring; + u32 tx_no_mem; + u32 tx_flow_on; + u32 tx_kicks; +}; + +/* struct cfv_info - Caif Virtio control structure + * @cfdev: caif common header + * @vdev: Associated virtio device + * @vr_rx: rx/downlink host vring + * @vq_tx: tx/uplink virtqueue + * @ndev: CAIF link layer device + * @watermark_tx: indicates number of free descriptors we need + * to reopen the tx-queues after overload. + * @tx_lock: protects vq_tx from concurrent use + * @tx_release_tasklet: Tasklet for freeing consumed TX buffers + * @napi: Napi context used in cfv_rx_poll() + * @ctx: Context data used in cfv_rx_poll() + * @tx_hr: transmit headroom + * @rx_hr: receive headroom + * @tx_tr: transmit tail room + * @rx_tr: receive tail room + * @mtu: transmit max size + * @mru: receive max size + * @allocsz: size of dma memory reserved for TX buffers + * @alloc_addr: virtual address to dma memory for TX buffers + * @alloc_dma: dma address to dma memory for TX buffers + * @genpool: Gen Pool used for allocating TX buffers + * @reserved_mem: Pointer to memory reserve allocated from genpool + * @reserved_size: Size of memory reserve allocated from genpool + * @stats: Statistics exposed in sysfs + * @debugfs: Debugfs dentry for statistic counters + */ +struct cfv_info { + struct caif_dev_common cfdev; + struct virtio_device *vdev; + struct vringh *vr_rx; + struct virtqueue *vq_tx; + struct net_device *ndev; + unsigned int watermark_tx; + /* Protect access to vq_tx */ + spinlock_t tx_lock; + struct tasklet_struct tx_release_tasklet; + struct napi_struct napi; + struct cfv_napi_context ctx; + u16 tx_hr; + u16 rx_hr; + u16 tx_tr; + u16 rx_tr; + u32 mtu; + u32 mru; + size_t allocsz; + void *alloc_addr; + dma_addr_t alloc_dma; + struct gen_pool *genpool; + unsigned long reserved_mem; + size_t reserved_size; + struct cfv_stats stats; + struct dentry *debugfs; +}; + +/* struct buf_info - maintains transmit buffer data handle + * @size: size of transmit buffer + * @dma_handle: handle to allocated dma device memory area + * @vaddr: virtual address mapping to allocated memory area + */ +struct buf_info { + size_t size; + u8 *vaddr; +}; + +/* Called from virtio device, in IRQ context */ +static void cfv_release_cb(struct virtqueue *vq_tx) +{ + struct cfv_info *cfv = vq_tx->vdev->priv; + + ++cfv->stats.tx_kicks; + tasklet_schedule(&cfv->tx_release_tasklet); +} + +static void free_buf_info(struct cfv_info *cfv, struct buf_info *buf_info) +{ + if (!buf_info) + return; + gen_pool_free(cfv->genpool, (unsigned long) buf_info->vaddr, + buf_info->size); + kfree(buf_info); +} + +/* This is invoked whenever the remote processor completed processing + * a TX msg we just sent, and the buffer is put back to the used ring. + */ +static void cfv_release_used_buf(struct virtqueue *vq_tx) +{ + struct cfv_info *cfv = vq_tx->vdev->priv; + unsigned long flags; + + BUG_ON(vq_tx != cfv->vq_tx); + + for (;;) { + unsigned int len; + struct buf_info *buf_info; + + /* Get used buffer from used ring to recycle used descriptors */ + spin_lock_irqsave(&cfv->tx_lock, flags); + buf_info = virtqueue_get_buf(vq_tx, &len); + spin_unlock_irqrestore(&cfv->tx_lock, flags); + + /* Stop looping if there are no more buffers to free */ + if (!buf_info) + break; + + free_buf_info(cfv, buf_info); + + /* watermark_tx indicates if we previously stopped the tx + * queues. If we have enough free stots in the virtio ring, + * re-establish memory reserved and open up tx queues. + */ + if (cfv->vq_tx->num_free <= cfv->watermark_tx) + continue; + + /* Re-establish memory reserve */ + if (cfv->reserved_mem == 0 && cfv->genpool) + cfv->reserved_mem = + gen_pool_alloc(cfv->genpool, + cfv->reserved_size); + + /* Open up the tx queues */ + if (cfv->reserved_mem) { + cfv->watermark_tx = + virtqueue_get_vring_size(cfv->vq_tx); + netif_tx_wake_all_queues(cfv->ndev); + /* Buffers are recycled in cfv_netdev_tx, so + * disable notifications when queues are opened. + */ + virtqueue_disable_cb(cfv->vq_tx); + ++cfv->stats.tx_flow_on; + } else { + /* if no memory reserve, wait for more free slots */ + WARN_ON(cfv->watermark_tx > + virtqueue_get_vring_size(cfv->vq_tx)); + cfv->watermark_tx += + virtqueue_get_vring_size(cfv->vq_tx) / 4; + } + } +} + +/* Allocate a SKB and copy packet data to it */ +static struct sk_buff *cfv_alloc_and_copy_skb(int *err, + struct cfv_info *cfv, + u8 *frm, u32 frm_len) +{ + struct sk_buff *skb; + u32 cfpkt_len, pad_len; + + *err = 0; + /* Verify that packet size with down-link header and mtu size */ + if (frm_len > cfv->mru || frm_len <= cfv->rx_hr + cfv->rx_tr) { + netdev_err(cfv->ndev, + "Invalid frmlen:%u mtu:%u hr:%d tr:%d\n", + frm_len, cfv->mru, cfv->rx_hr, + cfv->rx_tr); + *err = -EPROTO; + return NULL; + } + + cfpkt_len = frm_len - (cfv->rx_hr + cfv->rx_tr); + pad_len = (unsigned long)(frm + cfv->rx_hr) & (IP_HDR_ALIGN - 1); + + skb = netdev_alloc_skb(cfv->ndev, frm_len + pad_len); + if (!skb) { + *err = -ENOMEM; + return NULL; + } + + skb_reserve(skb, cfv->rx_hr + pad_len); + + memcpy(skb_put(skb, cfpkt_len), frm + cfv->rx_hr, cfpkt_len); + return skb; +} + +/* Get packets from the host vring */ +static int cfv_rx_poll(struct napi_struct *napi, int quota) +{ + struct cfv_info *cfv = container_of(napi, struct cfv_info, napi); + int rxcnt = 0; + int err = 0; + void *buf; + struct sk_buff *skb; + struct vringh_kiov *riov = &cfv->ctx.riov; + unsigned int skb_len; + +again: + do { + skb = NULL; + + /* Put the previous iovec back on the used ring and + * fetch a new iovec if we have processed all elements. + */ + if (riov->i == riov->used) { + if (cfv->ctx.head != USHRT_MAX) { + vringh_complete_kern(cfv->vr_rx, + cfv->ctx.head, + 0); + cfv->ctx.head = USHRT_MAX; + } + + err = vringh_getdesc_kern( + cfv->vr_rx, + riov, + NULL, + &cfv->ctx.head, + GFP_ATOMIC); + + if (err <= 0) + goto exit; + } + + buf = phys_to_virt((unsigned long) riov->iov[riov->i].iov_base); + /* TODO: Add check on valid buffer address */ + + skb = cfv_alloc_and_copy_skb(&err, cfv, buf, + riov->iov[riov->i].iov_len); + if (unlikely(err)) + goto exit; + + /* Push received packet up the stack. */ + skb_len = skb->len; + skb->protocol = htons(ETH_P_CAIF); + skb_reset_mac_header(skb); + skb->dev = cfv->ndev; + err = netif_receive_skb(skb); + if (unlikely(err)) { + ++cfv->ndev->stats.rx_dropped; + } else { + ++cfv->ndev->stats.rx_packets; + cfv->ndev->stats.rx_bytes += skb_len; + } + + ++riov->i; + ++rxcnt; + } while (rxcnt < quota); + + ++cfv->stats.rx_napi_resched; + goto out; + +exit: + switch (err) { + case 0: + ++cfv->stats.rx_napi_complete; + + /* Really out of patckets? (stolen from virtio_net)*/ + napi_complete(napi); + if (unlikely(!vringh_notify_enable_kern(cfv->vr_rx)) && + napi_schedule_prep(napi)) { + vringh_notify_disable_kern(cfv->vr_rx); + __napi_schedule(napi); + goto again; + } + break; + + case -ENOMEM: + ++cfv->stats.rx_nomem; + dev_kfree_skb(skb); + /* Stop NAPI poll on OOM, we hope to be polled later */ + napi_complete(napi); + vringh_notify_enable_kern(cfv->vr_rx); + break; + + default: + /* We're doomed, any modem fault is fatal */ + netdev_warn(cfv->ndev, "Bad ring, disable device\n"); + cfv->ndev->stats.rx_dropped = riov->used - riov->i; + napi_complete(napi); + vringh_notify_disable_kern(cfv->vr_rx); + netif_carrier_off(cfv->ndev); + break; + } +out: + if (rxcnt && vringh_need_notify_kern(cfv->vr_rx) > 0) + vringh_notify(cfv->vr_rx); + return rxcnt; +} + +static void cfv_recv(struct virtio_device *vdev, struct vringh *vr_rx) +{ + struct cfv_info *cfv = vdev->priv; + + ++cfv->stats.rx_kicks; + vringh_notify_disable_kern(cfv->vr_rx); + napi_schedule(&cfv->napi); +} + +static void cfv_destroy_genpool(struct cfv_info *cfv) +{ + if (cfv->alloc_addr) + dma_free_coherent(cfv->vdev->dev.parent->parent, + cfv->allocsz, cfv->alloc_addr, + cfv->alloc_dma); + + if (!cfv->genpool) + return; + gen_pool_free(cfv->genpool, cfv->reserved_mem, + cfv->reserved_size); + gen_pool_destroy(cfv->genpool); + cfv->genpool = NULL; +} + +static int cfv_create_genpool(struct cfv_info *cfv) +{ + int err; + + /* dma_alloc can only allocate whole pages, and we need a more + * fine graned allocation so we use genpool. We ask for space needed + * by IP and a full ring. If the dma allcoation fails we retry with a + * smaller allocation size. + */ + err = -ENOMEM; + cfv->allocsz = (virtqueue_get_vring_size(cfv->vq_tx) * + (ETH_DATA_LEN + cfv->tx_hr + cfv->tx_tr) * 11)/10; + if (cfv->allocsz <= (num_possible_cpus() + 1) * cfv->ndev->mtu) + return -EINVAL; + + for (;;) { + if (cfv->allocsz <= num_possible_cpus() * cfv->ndev->mtu) { + netdev_info(cfv->ndev, "Not enough device memory\n"); + return -ENOMEM; + } + + cfv->alloc_addr = dma_alloc_coherent( + cfv->vdev->dev.parent->parent, + cfv->allocsz, &cfv->alloc_dma, + GFP_ATOMIC); + if (cfv->alloc_addr) + break; + + cfv->allocsz = (cfv->allocsz * 3) >> 2; + } + + netdev_dbg(cfv->ndev, "Allocated %zd bytes from dma-memory\n", + cfv->allocsz); + + /* Allocate on 128 bytes boundaries (1 << 7)*/ + cfv->genpool = gen_pool_create(7, -1); + if (!cfv->genpool) + goto err; + + err = gen_pool_add_virt(cfv->genpool, (unsigned long)cfv->alloc_addr, + (phys_addr_t)virt_to_phys(cfv->alloc_addr), + cfv->allocsz, -1); + if (err) + goto err; + + /* Reserve some memory for low memory situations. If we hit the roof + * in the memory pool, we stop TX flow and release the reserve. + */ + cfv->reserved_size = num_possible_cpus() * cfv->ndev->mtu; + cfv->reserved_mem = gen_pool_alloc(cfv->genpool, + cfv->reserved_size); + if (!cfv->reserved_mem) { + err = -ENOMEM; + goto err; + } + + cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx); + return 0; +err: + cfv_destroy_genpool(cfv); + return err; +} + +/* Enable the CAIF interface and allocate the memory-pool */ +static int cfv_netdev_open(struct net_device *netdev) +{ + struct cfv_info *cfv = netdev_priv(netdev); + + if (cfv_create_genpool(cfv)) + return -ENOMEM; + + netif_carrier_on(netdev); + napi_enable(&cfv->napi); + + /* Schedule NAPI to read any pending packets */ + napi_schedule(&cfv->napi); + return 0; +} + +/* Disable the CAIF interface and free the memory-pool */ +static int cfv_netdev_close(struct net_device *netdev) +{ + struct cfv_info *cfv = netdev_priv(netdev); + unsigned long flags; + struct buf_info *buf_info; + + /* Disable interrupts, queues and NAPI polling */ + netif_carrier_off(netdev); + virtqueue_disable_cb(cfv->vq_tx); + vringh_notify_disable_kern(cfv->vr_rx); + napi_disable(&cfv->napi); + + /* Release any TX buffers on both used and avilable rings */ + cfv_release_used_buf(cfv->vq_tx); + spin_lock_irqsave(&cfv->tx_lock, flags); + while ((buf_info = virtqueue_detach_unused_buf(cfv->vq_tx))) + free_buf_info(cfv, buf_info); + spin_unlock_irqrestore(&cfv->tx_lock, flags); + + /* Release all dma allocated memory and destroy the pool */ + cfv_destroy_genpool(cfv); + return 0; +} + +/* Allocate a buffer in dma-memory and copy skb to it */ +static struct buf_info *cfv_alloc_and_copy_to_shm(struct cfv_info *cfv, + struct sk_buff *skb, + struct scatterlist *sg) +{ + struct caif_payload_info *info = (void *)&skb->cb; + struct buf_info *buf_info = NULL; + u8 pad_len, hdr_ofs; + + if (!cfv->genpool) + goto err; + + if (unlikely(cfv->tx_hr + skb->len + cfv->tx_tr > cfv->mtu)) { + netdev_warn(cfv->ndev, "Invalid packet len (%d > %d)\n", + cfv->tx_hr + skb->len + cfv->tx_tr, cfv->mtu); + goto err; + } + + buf_info = kmalloc(sizeof(struct buf_info), GFP_ATOMIC); + if (unlikely(!buf_info)) + goto err; + + /* Make the IP header aligned in tbe buffer */ + hdr_ofs = cfv->tx_hr + info->hdr_len; + pad_len = hdr_ofs & (IP_HDR_ALIGN - 1); + buf_info->size = cfv->tx_hr + skb->len + cfv->tx_tr + pad_len; + + /* allocate dma memory buffer */ + buf_info->vaddr = (void *)gen_pool_alloc(cfv->genpool, buf_info->size); + if (unlikely(!buf_info->vaddr)) + goto err; + + /* copy skbuf contents to send buffer */ + skb_copy_bits(skb, 0, buf_info->vaddr + cfv->tx_hr + pad_len, skb->len); + sg_init_one(sg, buf_info->vaddr + pad_len, + skb->len + cfv->tx_hr + cfv->rx_hr); + + return buf_info; +err: + kfree(buf_info); + return NULL; +} + +/* Put the CAIF packet on the virtio ring and kick the receiver */ +static int cfv_netdev_tx(struct sk_buff *skb, struct net_device *netdev) +{ + struct cfv_info *cfv = netdev_priv(netdev); + struct buf_info *buf_info; + struct scatterlist sg; + unsigned long flags; + bool flow_off = false; + int ret; + + /* garbage collect released buffers */ + cfv_release_used_buf(cfv->vq_tx); + spin_lock_irqsave(&cfv->tx_lock, flags); + + /* Flow-off check takes into account number of cpus to make sure + * virtqueue will not be overfilled in any possible smp conditions. + * + * Flow-on is triggered when sufficient buffers are freed + */ + if (unlikely(cfv->vq_tx->num_free <= num_present_cpus())) { + flow_off = true; + cfv->stats.tx_full_ring++; + } + + /* If we run out of memory, we release the memory reserve and retry + * allocation. + */ + buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg); + if (unlikely(!buf_info)) { + cfv->stats.tx_no_mem++; + flow_off = true; + + if (cfv->reserved_mem && cfv->genpool) { + gen_pool_free(cfv->genpool, cfv->reserved_mem, + cfv->reserved_size); + cfv->reserved_mem = 0; + buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg); + } + } + + if (unlikely(flow_off)) { + /* Turn flow on when a 1/4 of the descriptors are released */ + cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx) / 4; + /* Enable notifications of recycled TX buffers */ + virtqueue_enable_cb(cfv->vq_tx); + netif_tx_stop_all_queues(netdev); + } + + if (unlikely(!buf_info)) { + /* If the memory reserve does it's job, this shouldn't happen */ + netdev_warn(cfv->ndev, "Out of gen_pool memory\n"); + goto err; + } + + ret = virtqueue_add_outbuf(cfv->vq_tx, &sg, 1, buf_info, GFP_ATOMIC); + if (unlikely((ret < 0))) { + /* If flow control works, this shouldn't happen */ + netdev_warn(cfv->ndev, "Failed adding buffer to TX vring:%d\n", + ret); + goto err; + } + + /* update netdev statistics */ + cfv->ndev->stats.tx_packets++; + cfv->ndev->stats.tx_bytes += skb->len; + spin_unlock_irqrestore(&cfv->tx_lock, flags); + + /* tell the remote processor it has a pending message to read */ + virtqueue_kick(cfv->vq_tx); + + dev_kfree_skb(skb); + return NETDEV_TX_OK; +err: + spin_unlock_irqrestore(&cfv->tx_lock, flags); + cfv->ndev->stats.tx_dropped++; + free_buf_info(cfv, buf_info); + dev_kfree_skb(skb); + return NETDEV_TX_OK; +} + +static void cfv_tx_release_tasklet(unsigned long drv) +{ + struct cfv_info *cfv = (struct cfv_info *)drv; + cfv_release_used_buf(cfv->vq_tx); +} + +static const struct net_device_ops cfv_netdev_ops = { + .ndo_open = cfv_netdev_open, + .ndo_stop = cfv_netdev_close, + .ndo_start_xmit = cfv_netdev_tx, +}; + +static void cfv_netdev_setup(struct net_device *netdev) +{ + netdev->netdev_ops = &cfv_netdev_ops; + netdev->type = ARPHRD_CAIF; + netdev->tx_queue_len = 100; + netdev->flags = IFF_POINTOPOINT | IFF_NOARP; + netdev->mtu = CFV_DEF_MTU_SIZE; + netdev->destructor = free_netdev; +} + +/* Create debugfs counters for the device */ +static inline void debugfs_init(struct cfv_info *cfv) +{ + cfv->debugfs = + debugfs_create_dir(netdev_name(cfv->ndev), NULL); + + if (IS_ERR(cfv->debugfs)) + return; + + debugfs_create_u32("rx-napi-complete", S_IRUSR, cfv->debugfs, + &cfv->stats.rx_napi_complete); + debugfs_create_u32("rx-napi-resched", S_IRUSR, cfv->debugfs, + &cfv->stats.rx_napi_resched); + debugfs_create_u32("rx-nomem", S_IRUSR, cfv->debugfs, + &cfv->stats.rx_nomem); + debugfs_create_u32("rx-kicks", S_IRUSR, cfv->debugfs, + &cfv->stats.rx_kicks); + debugfs_create_u32("tx-full-ring", S_IRUSR, cfv->debugfs, + &cfv->stats.tx_full_ring); + debugfs_create_u32("tx-no-mem", S_IRUSR, cfv->debugfs, + &cfv->stats.tx_no_mem); + debugfs_create_u32("tx-kicks", S_IRUSR, cfv->debugfs, + &cfv->stats.tx_kicks); + debugfs_create_u32("tx-flow-on", S_IRUSR, cfv->debugfs, + &cfv->stats.tx_flow_on); +} + +/* Setup CAIF for the a virtio device */ +static int cfv_probe(struct virtio_device *vdev) +{ + vq_callback_t *vq_cbs = cfv_release_cb; + vrh_callback_t *vrh_cbs = cfv_recv; + const char *names = "output"; + const char *cfv_netdev_name = "cfvrt"; + struct net_device *netdev; + struct cfv_info *cfv; + int err = -EINVAL; + + netdev = alloc_netdev(sizeof(struct cfv_info), cfv_netdev_name, + cfv_netdev_setup); + if (!netdev) + return -ENOMEM; + + cfv = netdev_priv(netdev); + cfv->vdev = vdev; + cfv->ndev = netdev; + + spin_lock_init(&cfv->tx_lock); + + /* Get the RX virtio ring. This is a "host side vring". */ + err = -ENODEV; + if (!vdev->vringh_config || !vdev->vringh_config->find_vrhs) + goto err; + + err = vdev->vringh_config->find_vrhs(vdev, 1, &cfv->vr_rx, &vrh_cbs); + if (err) + goto err; + + /* Get the TX virtio ring. This is a "guest side vring". */ + err = vdev->config->find_vqs(vdev, 1, &cfv->vq_tx, &vq_cbs, &names); + if (err) + goto err; + + /* Get the CAIF configuration from virtio config space, if available */ +#define GET_VIRTIO_CONFIG_OPS(_v, _var, _f) \ + ((_v)->config->get(_v, offsetof(struct virtio_caif_transf_config, _f), \ + &_var, \ + FIELD_SIZEOF(struct virtio_caif_transf_config, _f))) + + if (vdev->config->get) { + GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_hr, headroom); + GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_hr, headroom); + GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_tr, tailroom); + GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_tr, tailroom); + GET_VIRTIO_CONFIG_OPS(vdev, cfv->mtu, mtu); + GET_VIRTIO_CONFIG_OPS(vdev, cfv->mru, mtu); + } else { + cfv->tx_hr = CFV_DEF_HEADROOM; + cfv->rx_hr = CFV_DEF_HEADROOM; + cfv->tx_tr = CFV_DEF_TAILROOM; + cfv->rx_tr = CFV_DEF_TAILROOM; + cfv->mtu = CFV_DEF_MTU_SIZE; + cfv->mru = CFV_DEF_MTU_SIZE; + } + + netdev->needed_headroom = cfv->tx_hr; + netdev->needed_tailroom = cfv->tx_tr; + + /* Disable buffer release interrupts unless we have stopped TX queues */ + virtqueue_disable_cb(cfv->vq_tx); + + netdev->mtu = cfv->mtu - cfv->tx_tr; + vdev->priv = cfv; + + /* Initialize NAPI poll context data */ + vringh_kiov_init(&cfv->ctx.riov, NULL, 0); + cfv->ctx.head = USHRT_MAX; + netif_napi_add(netdev, &cfv->napi, cfv_rx_poll, CFV_DEFAULT_QUOTA); + + tasklet_init(&cfv->tx_release_tasklet, + cfv_tx_release_tasklet, + (unsigned long)cfv); + + /* Carrier is off until netdevice is opened */ + netif_carrier_off(netdev); + + /* register Netdev */ + err = register_netdev(netdev); + if (err) { + dev_err(&vdev->dev, "Unable to register netdev (%d)\n", err); + goto err; + } + + debugfs_init(cfv); + + return 0; +err: + netdev_warn(cfv->ndev, "CAIF Virtio probe failed:%d\n", err); + + if (cfv->vr_rx) + vdev->vringh_config->del_vrhs(cfv->vdev); + if (cfv->vdev) + vdev->config->del_vqs(cfv->vdev); + free_netdev(netdev); + return err; +} + +static void cfv_remove(struct virtio_device *vdev) +{ + struct cfv_info *cfv = vdev->priv; + + rtnl_lock(); + dev_close(cfv->ndev); + rtnl_unlock(); + + tasklet_kill(&cfv->tx_release_tasklet); + debugfs_remove_recursive(cfv->debugfs); + + vringh_kiov_cleanup(&cfv->ctx.riov); + vdev->config->reset(vdev); + vdev->vringh_config->del_vrhs(cfv->vdev); + cfv->vr_rx = NULL; + vdev->config->del_vqs(cfv->vdev); + unregister_netdev(cfv->ndev); +} + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_CAIF, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static unsigned int features[] = { +}; + +static struct virtio_driver caif_virtio_driver = { + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = cfv_probe, + .remove = cfv_remove, +}; + +module_virtio_driver(caif_virtio_driver); +MODULE_DEVICE_TABLE(virtio, id_table); diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 50077753a0e5..3c23fdc27bf0 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -39,7 +39,6 @@ module_param(gso, bool, 0444); #define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) #define GOOD_COPY_LEN 128 -#define VIRTNET_SEND_COMMAND_SG_MAX 2 #define VIRTNET_DRIVER_VERSION "1.0.0" struct virtnet_stats { @@ -444,7 +443,7 @@ static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp) skb_to_sgvec(skb, rq->sg + 1, 0, skb->len); - err = virtqueue_add_buf(rq->vq, rq->sg, 0, 2, skb, gfp); + err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp); if (err < 0) dev_kfree_skb(skb); @@ -489,8 +488,8 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp) /* chain first in list head */ first->private = (unsigned long)list; - err = virtqueue_add_buf(rq->vq, rq->sg, 0, MAX_SKB_FRAGS + 2, - first, gfp); + err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2, + first, gfp); if (err < 0) give_pages(rq, first); @@ -508,7 +507,7 @@ static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp) sg_init_one(rq->sg, page_address(page), PAGE_SIZE); - err = virtqueue_add_buf(rq->vq, rq->sg, 0, 1, page, gfp); + err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, page, gfp); if (err < 0) give_pages(rq, page); @@ -582,7 +581,7 @@ static void refill_work(struct work_struct *work) bool still_empty; int i; - for (i = 0; i < vi->max_queue_pairs; i++) { + for (i = 0; i < vi->curr_queue_pairs; i++) { struct receive_queue *rq = &vi->rq[i]; napi_disable(&rq->napi); @@ -637,7 +636,7 @@ static int virtnet_open(struct net_device *dev) struct virtnet_info *vi = netdev_priv(dev); int i; - for (i = 0; i < vi->max_queue_pairs; i++) { + for (i = 0; i < vi->curr_queue_pairs; i++) { /* Make sure we have some buffers: if oom use wq. */ if (!try_fill_recv(&vi->rq[i], GFP_KERNEL)) schedule_delayed_work(&vi->refill, 0); @@ -711,8 +710,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb) sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr); num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1; - return virtqueue_add_buf(sq->vq, sq->sg, num_sg, - 0, skb, GFP_ATOMIC); + return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC); } static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) @@ -767,32 +765,35 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) * never fail unless improperly formated. */ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, - struct scatterlist *data, int out, int in) + struct scatterlist *out, + struct scatterlist *in) { - struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2]; + struct scatterlist *sgs[4], hdr, stat; struct virtio_net_ctrl_hdr ctrl; virtio_net_ctrl_ack status = ~0; - unsigned int tmp; - int i; + unsigned out_num = 0, in_num = 0, tmp; /* Caller should know better */ - BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) || - (out + in > VIRTNET_SEND_COMMAND_SG_MAX)); - - out++; /* Add header */ - in++; /* Add return status */ + BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)); ctrl.class = class; ctrl.cmd = cmd; + /* Add header */ + sg_init_one(&hdr, &ctrl, sizeof(ctrl)); + sgs[out_num++] = &hdr; - sg_init_table(sg, out + in); + if (out) + sgs[out_num++] = out; + if (in) + sgs[out_num + in_num++] = in; - sg_set_buf(&sg[0], &ctrl, sizeof(ctrl)); - for_each_sg(data, s, out + in - 2, i) - sg_set_buf(&sg[i + 1], sg_virt(s), s->length); - sg_set_buf(&sg[out + in - 1], &status, sizeof(status)); + /* Add return status. */ + sg_init_one(&stat, &status, sizeof(status)); + sgs[out_num + in_num++] = &stat; - BUG_ON(virtqueue_add_buf(vi->cvq, sg, out, in, vi, GFP_ATOMIC) < 0); + BUG_ON(out_num + in_num > ARRAY_SIZE(sgs)); + BUG_ON(virtqueue_add_sgs(vi->cvq, sgs, out_num, in_num, vi, GFP_ATOMIC) + < 0); virtqueue_kick(vi->cvq); @@ -821,7 +822,7 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p) sg_init_one(&sg, addr->sa_data, dev->addr_len); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, VIRTIO_NET_CTRL_MAC_ADDR_SET, - &sg, 1, 0)) { + &sg, NULL)) { dev_warn(&vdev->dev, "Failed to set mac address by vq command.\n"); return -EINVAL; @@ -889,8 +890,7 @@ static void virtnet_ack_link_announce(struct virtnet_info *vi) { rtnl_lock(); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE, - VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL, - 0, 0)) + VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL, NULL)) dev_warn(&vi->dev->dev, "Failed to ack link announce.\n"); rtnl_unlock(); } @@ -900,6 +900,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) struct scatterlist sg; struct virtio_net_ctrl_mq s; struct net_device *dev = vi->dev; + int i; if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ)) return 0; @@ -908,12 +909,16 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) sg_init_one(&sg, &s, sizeof(s)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ, - VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, 1, 0)){ + VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, NULL)) { dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n", queue_pairs); return -EINVAL; - } else + } else { + for (i = vi->curr_queue_pairs; i < queue_pairs; i++) + if (!try_fill_recv(&vi->rq[i], GFP_KERNEL)) + schedule_delayed_work(&vi->refill, 0); vi->curr_queue_pairs = queue_pairs; + } return 0; } @@ -955,7 +960,7 @@ static void virtnet_set_rx_mode(struct net_device *dev) if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, VIRTIO_NET_CTRL_RX_PROMISC, - sg, 1, 0)) + sg, NULL)) dev_warn(&dev->dev, "Failed to %sable promisc mode.\n", promisc ? "en" : "dis"); @@ -963,7 +968,7 @@ static void virtnet_set_rx_mode(struct net_device *dev) if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, VIRTIO_NET_CTRL_RX_ALLMULTI, - sg, 1, 0)) + sg, NULL)) dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n", allmulti ? "en" : "dis"); @@ -1000,7 +1005,7 @@ static void virtnet_set_rx_mode(struct net_device *dev) if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, VIRTIO_NET_CTRL_MAC_TABLE_SET, - sg, 2, 0)) + sg, NULL)) dev_warn(&dev->dev, "Failed to set MAC fitler table.\n"); kfree(buf); @@ -1015,7 +1020,7 @@ static int virtnet_vlan_rx_add_vid(struct net_device *dev, sg_init_one(&sg, &vid, sizeof(vid)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, - VIRTIO_NET_CTRL_VLAN_ADD, &sg, 1, 0)) + VIRTIO_NET_CTRL_VLAN_ADD, &sg, NULL)) dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid); return 0; } @@ -1029,7 +1034,7 @@ static int virtnet_vlan_rx_kill_vid(struct net_device *dev, sg_init_one(&sg, &vid, sizeof(vid)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, - VIRTIO_NET_CTRL_VLAN_DEL, &sg, 1, 0)) + VIRTIO_NET_CTRL_VLAN_DEL, &sg, NULL)) dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid); return 0; } @@ -1570,7 +1575,7 @@ static int virtnet_probe(struct virtio_device *vdev) } /* Last of all, set up some receive buffers. */ - for (i = 0; i < vi->max_queue_pairs; i++) { + for (i = 0; i < vi->curr_queue_pairs; i++) { try_fill_recv(&vi->rq[i], GFP_KERNEL); /* If we didn't even get one input buffer, we're useless. */ @@ -1694,7 +1699,7 @@ static int virtnet_restore(struct virtio_device *vdev) netif_device_attach(vi->dev); - for (i = 0; i < vi->max_queue_pairs; i++) + for (i = 0; i < vi->curr_queue_pairs; i++) if (!try_fill_recv(&vi->rq[i], GFP_KERNEL)) schedule_delayed_work(&vi->refill, 0); diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c index 7861f1119b7d..56fceafec9ec 100644 --- a/drivers/rpmsg/virtio_rpmsg_bus.c +++ b/drivers/rpmsg/virtio_rpmsg_bus.c @@ -757,14 +757,14 @@ int rpmsg_send_offchannel_raw(struct rpmsg_channel *rpdev, u32 src, u32 dst, mutex_lock(&vrp->tx_lock); /* add message to the remote processor's virtqueue */ - err = virtqueue_add_buf(vrp->svq, &sg, 1, 0, msg, GFP_KERNEL); + err = virtqueue_add_outbuf(vrp->svq, &sg, 1, msg, GFP_KERNEL); if (err) { /* * need to reclaim the buffer here, otherwise it's lost * (memory won't leak, but rpmsg won't use it again for TX). * this will wait for a buffer management overhaul. */ - dev_err(dev, "virtqueue_add_buf failed: %d\n", err); + dev_err(dev, "virtqueue_add_outbuf failed: %d\n", err); goto out; } @@ -839,7 +839,7 @@ static void rpmsg_recv_done(struct virtqueue *rvq) sg_init_one(&sg, msg, RPMSG_BUF_SIZE); /* add the buffer back to the remote processor's virtqueue */ - err = virtqueue_add_buf(vrp->rvq, &sg, 0, 1, msg, GFP_KERNEL); + err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, msg, GFP_KERNEL); if (err < 0) { dev_err(dev, "failed to add a virtqueue buffer: %d\n", err); return; @@ -972,7 +972,7 @@ static int rpmsg_probe(struct virtio_device *vdev) sg_init_one(&sg, cpu_addr, RPMSG_BUF_SIZE); - err = virtqueue_add_buf(vrp->rvq, &sg, 0, 1, cpu_addr, + err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, cpu_addr, GFP_KERNEL); WARN_ON(err); /* sanity check; this can't really happen */ } diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 3449a1f8c656..2168258fb2c3 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -13,6 +13,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/slab.h> #include <linux/mempool.h> @@ -20,12 +22,14 @@ #include <linux/virtio_ids.h> #include <linux/virtio_config.h> #include <linux/virtio_scsi.h> +#include <linux/cpu.h> #include <scsi/scsi_host.h> #include <scsi/scsi_device.h> #include <scsi/scsi_cmnd.h> #define VIRTIO_SCSI_MEMPOOL_SZ 64 #define VIRTIO_SCSI_EVENT_LEN 8 +#define VIRTIO_SCSI_VQ_BASE 2 /* Command queue element */ struct virtio_scsi_cmd { @@ -57,27 +61,61 @@ struct virtio_scsi_vq { struct virtqueue *vq; }; -/* Per-target queue state */ +/* + * Per-target queue state. + * + * This struct holds the data needed by the queue steering policy. When a + * target is sent multiple requests, we need to drive them to the same queue so + * that FIFO processing order is kept. However, if a target was idle, we can + * choose a queue arbitrarily. In this case the queue is chosen according to + * the current VCPU, so the driver expects the number of request queues to be + * equal to the number of VCPUs. This makes it easy and fast to select the + * queue, and also lets the driver optimize the IRQ affinity for the virtqueues + * (each virtqueue's affinity is set to the CPU that "owns" the queue). + * + * An interesting effect of this policy is that only writes to req_vq need to + * take the tgt_lock. Read can be done outside the lock because: + * + * - writes of req_vq only occur when atomic_inc_return(&tgt->reqs) returns 1. + * In that case, no other CPU is reading req_vq: even if they were in + * virtscsi_queuecommand_multi, they would be spinning on tgt_lock. + * + * - reads of req_vq only occur when the target is not idle (reqs != 0). + * A CPU that enters virtscsi_queuecommand_multi will not modify req_vq. + * + * Similarly, decrements of reqs are never concurrent with writes of req_vq. + * Thus they can happen outside the tgt_lock, provided of course we make reqs + * an atomic_t. + */ struct virtio_scsi_target_state { - /* Protects sg. Lock hierarchy is tgt_lock -> vq_lock. */ + /* This spinlock never held at the same time as vq_lock. */ spinlock_t tgt_lock; - /* For sglist construction when adding commands to the virtqueue. */ - struct scatterlist sg[]; + /* Count of outstanding requests. */ + atomic_t reqs; + + /* Currently active virtqueue for requests sent to this target. */ + struct virtio_scsi_vq *req_vq; }; /* Driver instance state */ struct virtio_scsi { struct virtio_device *vdev; - struct virtio_scsi_vq ctrl_vq; - struct virtio_scsi_vq event_vq; - struct virtio_scsi_vq req_vq; - /* Get some buffers ready for event vq */ struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN]; - struct virtio_scsi_target_state *tgt[]; + u32 num_queues; + + /* If the affinity hint is set for virtqueues */ + bool affinity_hint_set; + + /* CPU hotplug notifier */ + struct notifier_block nb; + + struct virtio_scsi_vq ctrl_vq; + struct virtio_scsi_vq event_vq; + struct virtio_scsi_vq req_vqs[]; }; static struct kmem_cache *virtscsi_cmd_cache; @@ -107,11 +145,13 @@ static void virtscsi_compute_resid(struct scsi_cmnd *sc, u32 resid) * * Called with vq_lock held. */ -static void virtscsi_complete_cmd(void *buf) +static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf) { struct virtio_scsi_cmd *cmd = buf; struct scsi_cmnd *sc = cmd->sc; struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd; + struct virtio_scsi_target_state *tgt = + scsi_target(sc->device)->hostdata; dev_dbg(&sc->device->sdev_gendev, "cmd %p response %u status %#02x sense_len %u\n", @@ -166,32 +206,71 @@ static void virtscsi_complete_cmd(void *buf) mempool_free(cmd, virtscsi_cmd_pool); sc->scsi_done(sc); + + atomic_dec(&tgt->reqs); } -static void virtscsi_vq_done(struct virtqueue *vq, void (*fn)(void *buf)) +static void virtscsi_vq_done(struct virtio_scsi *vscsi, + struct virtio_scsi_vq *virtscsi_vq, + void (*fn)(struct virtio_scsi *vscsi, void *buf)) { void *buf; unsigned int len; + unsigned long flags; + struct virtqueue *vq = virtscsi_vq->vq; + spin_lock_irqsave(&virtscsi_vq->vq_lock, flags); do { virtqueue_disable_cb(vq); while ((buf = virtqueue_get_buf(vq, &len)) != NULL) - fn(buf); + fn(vscsi, buf); } while (!virtqueue_enable_cb(vq)); + spin_unlock_irqrestore(&virtscsi_vq->vq_lock, flags); } static void virtscsi_req_done(struct virtqueue *vq) { struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); struct virtio_scsi *vscsi = shost_priv(sh); - unsigned long flags; + int index = vq->index - VIRTIO_SCSI_VQ_BASE; + struct virtio_scsi_vq *req_vq = &vscsi->req_vqs[index]; - spin_lock_irqsave(&vscsi->req_vq.vq_lock, flags); - virtscsi_vq_done(vq, virtscsi_complete_cmd); - spin_unlock_irqrestore(&vscsi->req_vq.vq_lock, flags); + /* + * Read req_vq before decrementing the reqs field in + * virtscsi_complete_cmd. + * + * With barriers: + * + * CPU #0 virtscsi_queuecommand_multi (CPU #1) + * ------------------------------------------------------------ + * lock vq_lock + * read req_vq + * read reqs (reqs = 1) + * write reqs (reqs = 0) + * increment reqs (reqs = 1) + * write req_vq + * + * Possible reordering without barriers: + * + * CPU #0 virtscsi_queuecommand_multi (CPU #1) + * ------------------------------------------------------------ + * lock vq_lock + * read reqs (reqs = 1) + * write reqs (reqs = 0) + * increment reqs (reqs = 1) + * write req_vq + * read (wrong) req_vq + * + * We do not need a full smp_rmb, because req_vq is required to get + * to tgt->reqs: tgt is &vscsi->tgt[sc->device->id], where sc is stored + * in the virtqueue as the user token. + */ + smp_read_barrier_depends(); + + virtscsi_vq_done(vscsi, req_vq, virtscsi_complete_cmd); }; -static void virtscsi_complete_free(void *buf) +static void virtscsi_complete_free(struct virtio_scsi *vscsi, void *buf) { struct virtio_scsi_cmd *cmd = buf; @@ -205,11 +284,8 @@ static void virtscsi_ctrl_done(struct virtqueue *vq) { struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); struct virtio_scsi *vscsi = shost_priv(sh); - unsigned long flags; - spin_lock_irqsave(&vscsi->ctrl_vq.vq_lock, flags); - virtscsi_vq_done(vq, virtscsi_complete_free); - spin_unlock_irqrestore(&vscsi->ctrl_vq.vq_lock, flags); + virtscsi_vq_done(vscsi, &vscsi->ctrl_vq, virtscsi_complete_free); }; static int virtscsi_kick_event(struct virtio_scsi *vscsi, @@ -223,8 +299,8 @@ static int virtscsi_kick_event(struct virtio_scsi *vscsi, spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags); - err = virtqueue_add_buf(vscsi->event_vq.vq, &sg, 0, 1, event_node, - GFP_ATOMIC); + err = virtqueue_add_inbuf(vscsi->event_vq.vq, &sg, 1, event_node, + GFP_ATOMIC); if (!err) virtqueue_kick(vscsi->event_vq.vq); @@ -254,7 +330,7 @@ static void virtscsi_cancel_event_work(struct virtio_scsi *vscsi) } static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi, - struct virtio_scsi_event *event) + struct virtio_scsi_event *event) { struct scsi_device *sdev; struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev); @@ -332,7 +408,7 @@ static void virtscsi_handle_event(struct work_struct *work) virtscsi_kick_event(vscsi, event_node); } -static void virtscsi_complete_event(void *buf) +static void virtscsi_complete_event(struct virtio_scsi *vscsi, void *buf) { struct virtio_scsi_event_node *event_node = buf; @@ -344,82 +420,65 @@ static void virtscsi_event_done(struct virtqueue *vq) { struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); struct virtio_scsi *vscsi = shost_priv(sh); - unsigned long flags; - spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags); - virtscsi_vq_done(vq, virtscsi_complete_event); - spin_unlock_irqrestore(&vscsi->event_vq.vq_lock, flags); + virtscsi_vq_done(vscsi, &vscsi->event_vq, virtscsi_complete_event); }; -static void virtscsi_map_sgl(struct scatterlist *sg, unsigned int *p_idx, - struct scsi_data_buffer *sdb) -{ - struct sg_table *table = &sdb->table; - struct scatterlist *sg_elem; - unsigned int idx = *p_idx; - int i; - - for_each_sg(table->sgl, sg_elem, table->nents, i) - sg[idx++] = *sg_elem; - - *p_idx = idx; -} - /** - * virtscsi_map_cmd - map a scsi_cmd to a virtqueue scatterlist - * @vscsi : virtio_scsi state + * virtscsi_add_cmd - add a virtio_scsi_cmd to a virtqueue + * @vq : the struct virtqueue we're talking about * @cmd : command structure - * @out_num : number of read-only elements - * @in_num : number of write-only elements * @req_size : size of the request buffer * @resp_size : size of the response buffer - * - * Called with tgt_lock held. + * @gfp : flags to use for memory allocations */ -static void virtscsi_map_cmd(struct virtio_scsi_target_state *tgt, - struct virtio_scsi_cmd *cmd, - unsigned *out_num, unsigned *in_num, - size_t req_size, size_t resp_size) +static int virtscsi_add_cmd(struct virtqueue *vq, + struct virtio_scsi_cmd *cmd, + size_t req_size, size_t resp_size, gfp_t gfp) { struct scsi_cmnd *sc = cmd->sc; - struct scatterlist *sg = tgt->sg; - unsigned int idx = 0; + struct scatterlist *sgs[4], req, resp; + struct sg_table *out, *in; + unsigned out_num = 0, in_num = 0; + + out = in = NULL; + + if (sc && sc->sc_data_direction != DMA_NONE) { + if (sc->sc_data_direction != DMA_FROM_DEVICE) + out = &scsi_out(sc)->table; + if (sc->sc_data_direction != DMA_TO_DEVICE) + in = &scsi_in(sc)->table; + } /* Request header. */ - sg_set_buf(&sg[idx++], &cmd->req, req_size); + sg_init_one(&req, &cmd->req, req_size); + sgs[out_num++] = &req; /* Data-out buffer. */ - if (sc && sc->sc_data_direction != DMA_FROM_DEVICE) - virtscsi_map_sgl(sg, &idx, scsi_out(sc)); - - *out_num = idx; + if (out) + sgs[out_num++] = out->sgl; /* Response header. */ - sg_set_buf(&sg[idx++], &cmd->resp, resp_size); + sg_init_one(&resp, &cmd->resp, resp_size); + sgs[out_num + in_num++] = &resp; /* Data-in buffer */ - if (sc && sc->sc_data_direction != DMA_TO_DEVICE) - virtscsi_map_sgl(sg, &idx, scsi_in(sc)); + if (in) + sgs[out_num + in_num++] = in->sgl; - *in_num = idx - *out_num; + return virtqueue_add_sgs(vq, sgs, out_num, in_num, cmd, gfp); } -static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt, - struct virtio_scsi_vq *vq, +static int virtscsi_kick_cmd(struct virtio_scsi_vq *vq, struct virtio_scsi_cmd *cmd, size_t req_size, size_t resp_size, gfp_t gfp) { - unsigned int out_num, in_num; unsigned long flags; int err; bool needs_kick = false; - spin_lock_irqsave(&tgt->tgt_lock, flags); - virtscsi_map_cmd(tgt, cmd, &out_num, &in_num, req_size, resp_size); - - spin_lock(&vq->vq_lock); - err = virtqueue_add_buf(vq->vq, tgt->sg, out_num, in_num, cmd, gfp); - spin_unlock(&tgt->tgt_lock); + spin_lock_irqsave(&vq->vq_lock, flags); + err = virtscsi_add_cmd(vq->vq, cmd, req_size, resp_size, gfp); if (!err) needs_kick = virtqueue_kick_prepare(vq->vq); @@ -430,10 +489,10 @@ static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt, return err; } -static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc) +static int virtscsi_queuecommand(struct virtio_scsi *vscsi, + struct virtio_scsi_vq *req_vq, + struct scsi_cmnd *sc) { - struct virtio_scsi *vscsi = shost_priv(sh); - struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id]; struct virtio_scsi_cmd *cmd; int ret; @@ -467,7 +526,7 @@ static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc) BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE); memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len); - if (virtscsi_kick_cmd(tgt, &vscsi->req_vq, cmd, + if (virtscsi_kick_cmd(req_vq, cmd, sizeof cmd->req.cmd, sizeof cmd->resp.cmd, GFP_ATOMIC) == 0) ret = 0; @@ -478,14 +537,62 @@ out: return ret; } +static int virtscsi_queuecommand_single(struct Scsi_Host *sh, + struct scsi_cmnd *sc) +{ + struct virtio_scsi *vscsi = shost_priv(sh); + struct virtio_scsi_target_state *tgt = + scsi_target(sc->device)->hostdata; + + atomic_inc(&tgt->reqs); + return virtscsi_queuecommand(vscsi, &vscsi->req_vqs[0], sc); +} + +static struct virtio_scsi_vq *virtscsi_pick_vq(struct virtio_scsi *vscsi, + struct virtio_scsi_target_state *tgt) +{ + struct virtio_scsi_vq *vq; + unsigned long flags; + u32 queue_num; + + spin_lock_irqsave(&tgt->tgt_lock, flags); + + /* + * The memory barrier after atomic_inc_return matches + * the smp_read_barrier_depends() in virtscsi_req_done. + */ + if (atomic_inc_return(&tgt->reqs) > 1) + vq = ACCESS_ONCE(tgt->req_vq); + else { + queue_num = smp_processor_id(); + while (unlikely(queue_num >= vscsi->num_queues)) + queue_num -= vscsi->num_queues; + + tgt->req_vq = vq = &vscsi->req_vqs[queue_num]; + } + + spin_unlock_irqrestore(&tgt->tgt_lock, flags); + return vq; +} + +static int virtscsi_queuecommand_multi(struct Scsi_Host *sh, + struct scsi_cmnd *sc) +{ + struct virtio_scsi *vscsi = shost_priv(sh); + struct virtio_scsi_target_state *tgt = + scsi_target(sc->device)->hostdata; + struct virtio_scsi_vq *req_vq = virtscsi_pick_vq(vscsi, tgt); + + return virtscsi_queuecommand(vscsi, req_vq, sc); +} + static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd) { DECLARE_COMPLETION_ONSTACK(comp); - struct virtio_scsi_target_state *tgt = vscsi->tgt[cmd->sc->device->id]; int ret = FAILED; cmd->comp = ∁ - if (virtscsi_kick_cmd(tgt, &vscsi->ctrl_vq, cmd, + if (virtscsi_kick_cmd(&vscsi->ctrl_vq, cmd, sizeof cmd->req.tmf, sizeof cmd->resp.tmf, GFP_NOIO) < 0) goto out; @@ -547,18 +654,57 @@ static int virtscsi_abort(struct scsi_cmnd *sc) return virtscsi_tmf(vscsi, cmd); } -static struct scsi_host_template virtscsi_host_template = { +static int virtscsi_target_alloc(struct scsi_target *starget) +{ + struct virtio_scsi_target_state *tgt = + kmalloc(sizeof(*tgt), GFP_KERNEL); + if (!tgt) + return -ENOMEM; + + spin_lock_init(&tgt->tgt_lock); + atomic_set(&tgt->reqs, 0); + tgt->req_vq = NULL; + + starget->hostdata = tgt; + return 0; +} + +static void virtscsi_target_destroy(struct scsi_target *starget) +{ + struct virtio_scsi_target_state *tgt = starget->hostdata; + kfree(tgt); +} + +static struct scsi_host_template virtscsi_host_template_single = { + .module = THIS_MODULE, + .name = "Virtio SCSI HBA", + .proc_name = "virtio_scsi", + .this_id = -1, + .queuecommand = virtscsi_queuecommand_single, + .eh_abort_handler = virtscsi_abort, + .eh_device_reset_handler = virtscsi_device_reset, + + .can_queue = 1024, + .dma_boundary = UINT_MAX, + .use_clustering = ENABLE_CLUSTERING, + .target_alloc = virtscsi_target_alloc, + .target_destroy = virtscsi_target_destroy, +}; + +static struct scsi_host_template virtscsi_host_template_multi = { .module = THIS_MODULE, .name = "Virtio SCSI HBA", .proc_name = "virtio_scsi", - .queuecommand = virtscsi_queuecommand, .this_id = -1, + .queuecommand = virtscsi_queuecommand_multi, .eh_abort_handler = virtscsi_abort, .eh_device_reset_handler = virtscsi_device_reset, .can_queue = 1024, .dma_boundary = UINT_MAX, .use_clustering = ENABLE_CLUSTERING, + .target_alloc = virtscsi_target_alloc, + .target_destroy = virtscsi_target_destroy, }; #define virtscsi_config_get(vdev, fld) \ @@ -578,29 +724,69 @@ static struct scsi_host_template virtscsi_host_template = { &__val, sizeof(__val)); \ }) -static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq, - struct virtqueue *vq) +static void __virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity) { - spin_lock_init(&virtscsi_vq->vq_lock); - virtscsi_vq->vq = vq; + int i; + int cpu; + + /* In multiqueue mode, when the number of cpu is equal + * to the number of request queues, we let the qeueues + * to be private to one cpu by setting the affinity hint + * to eliminate the contention. + */ + if ((vscsi->num_queues == 1 || + vscsi->num_queues != num_online_cpus()) && affinity) { + if (vscsi->affinity_hint_set) + affinity = false; + else + return; + } + + if (affinity) { + i = 0; + for_each_online_cpu(cpu) { + virtqueue_set_affinity(vscsi->req_vqs[i].vq, cpu); + i++; + } + + vscsi->affinity_hint_set = true; + } else { + for (i = 0; i < vscsi->num_queues - VIRTIO_SCSI_VQ_BASE; i++) + virtqueue_set_affinity(vscsi->req_vqs[i].vq, -1); + + vscsi->affinity_hint_set = false; + } } -static struct virtio_scsi_target_state *virtscsi_alloc_tgt( - struct virtio_device *vdev, int sg_elems) +static void virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity) { - struct virtio_scsi_target_state *tgt; - gfp_t gfp_mask = GFP_KERNEL; - - /* We need extra sg elements at head and tail. */ - tgt = kmalloc(sizeof(*tgt) + sizeof(tgt->sg[0]) * (sg_elems + 2), - gfp_mask); + get_online_cpus(); + __virtscsi_set_affinity(vscsi, affinity); + put_online_cpus(); +} - if (!tgt) - return NULL; +static int virtscsi_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + struct virtio_scsi *vscsi = container_of(nfb, struct virtio_scsi, nb); + switch(action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + __virtscsi_set_affinity(vscsi, true); + break; + default: + break; + } + return NOTIFY_OK; +} - spin_lock_init(&tgt->tgt_lock); - sg_init_table(tgt->sg, sg_elems + 2); - return tgt; +static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq, + struct virtqueue *vq) +{ + spin_lock_init(&virtscsi_vq->vq_lock); + virtscsi_vq->vq = vq; } static void virtscsi_scan(struct virtio_device *vdev) @@ -614,46 +800,56 @@ static void virtscsi_remove_vqs(struct virtio_device *vdev) { struct Scsi_Host *sh = virtio_scsi_host(vdev); struct virtio_scsi *vscsi = shost_priv(sh); - u32 i, num_targets; + + virtscsi_set_affinity(vscsi, false); /* Stop all the virtqueues. */ vdev->config->reset(vdev); - num_targets = sh->max_id; - for (i = 0; i < num_targets; i++) { - kfree(vscsi->tgt[i]); - vscsi->tgt[i] = NULL; - } - vdev->config->del_vqs(vdev); } static int virtscsi_init(struct virtio_device *vdev, - struct virtio_scsi *vscsi, int num_targets) + struct virtio_scsi *vscsi) { int err; - struct virtqueue *vqs[3]; - u32 i, sg_elems; + u32 i; + u32 num_vqs; + vq_callback_t **callbacks; + const char **names; + struct virtqueue **vqs; + + num_vqs = vscsi->num_queues + VIRTIO_SCSI_VQ_BASE; + vqs = kmalloc(num_vqs * sizeof(struct virtqueue *), GFP_KERNEL); + callbacks = kmalloc(num_vqs * sizeof(vq_callback_t *), GFP_KERNEL); + names = kmalloc(num_vqs * sizeof(char *), GFP_KERNEL); + + if (!callbacks || !vqs || !names) { + err = -ENOMEM; + goto out; + } - vq_callback_t *callbacks[] = { - virtscsi_ctrl_done, - virtscsi_event_done, - virtscsi_req_done - }; - const char *names[] = { - "control", - "event", - "request" - }; + callbacks[0] = virtscsi_ctrl_done; + callbacks[1] = virtscsi_event_done; + names[0] = "control"; + names[1] = "event"; + for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++) { + callbacks[i] = virtscsi_req_done; + names[i] = "request"; + } /* Discover virtqueues and write information to configuration. */ - err = vdev->config->find_vqs(vdev, 3, vqs, callbacks, names); + err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names); if (err) - return err; + goto out; virtscsi_init_vq(&vscsi->ctrl_vq, vqs[0]); virtscsi_init_vq(&vscsi->event_vq, vqs[1]); - virtscsi_init_vq(&vscsi->req_vq, vqs[2]); + for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++) + virtscsi_init_vq(&vscsi->req_vqs[i - VIRTIO_SCSI_VQ_BASE], + vqs[i]); + + virtscsi_set_affinity(vscsi, true); virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE); virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE); @@ -661,19 +857,12 @@ static int virtscsi_init(struct virtio_device *vdev, if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) virtscsi_kick_event_all(vscsi); - /* We need to know how many segments before we allocate. */ - sg_elems = virtscsi_config_get(vdev, seg_max) ?: 1; - - for (i = 0; i < num_targets; i++) { - vscsi->tgt[i] = virtscsi_alloc_tgt(vdev, sg_elems); - if (!vscsi->tgt[i]) { - err = -ENOMEM; - goto out; - } - } err = 0; out: + kfree(names); + kfree(callbacks); + kfree(vqs); if (err) virtscsi_remove_vqs(vdev); return err; @@ -686,13 +875,21 @@ static int virtscsi_probe(struct virtio_device *vdev) int err; u32 sg_elems, num_targets; u32 cmd_per_lun; + u32 num_queues; + struct scsi_host_template *hostt; + + /* We need to know how many queues before we allocate. */ + num_queues = virtscsi_config_get(vdev, num_queues) ? : 1; - /* Allocate memory and link the structs together. */ num_targets = virtscsi_config_get(vdev, max_target) + 1; - shost = scsi_host_alloc(&virtscsi_host_template, - sizeof(*vscsi) - + num_targets * sizeof(struct virtio_scsi_target_state)); + if (num_queues == 1) + hostt = &virtscsi_host_template_single; + else + hostt = &virtscsi_host_template_multi; + + shost = scsi_host_alloc(hostt, + sizeof(*vscsi) + sizeof(vscsi->req_vqs[0]) * num_queues); if (!shost) return -ENOMEM; @@ -700,12 +897,20 @@ static int virtscsi_probe(struct virtio_device *vdev) shost->sg_tablesize = sg_elems; vscsi = shost_priv(shost); vscsi->vdev = vdev; + vscsi->num_queues = num_queues; vdev->priv = shost; - err = virtscsi_init(vdev, vscsi, num_targets); + err = virtscsi_init(vdev, vscsi); if (err) goto virtscsi_init_failed; + vscsi->nb.notifier_call = &virtscsi_cpu_callback; + err = register_hotcpu_notifier(&vscsi->nb); + if (err) { + pr_err("registering cpu notifier failed\n"); + goto scsi_add_host_failed; + } + cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1; shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue); shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF; @@ -743,6 +948,8 @@ static void virtscsi_remove(struct virtio_device *vdev) scsi_remove_host(shost); + unregister_hotcpu_notifier(&vscsi->nb); + virtscsi_remove_vqs(vdev); scsi_host_put(shost); } @@ -759,7 +966,7 @@ static int virtscsi_restore(struct virtio_device *vdev) struct Scsi_Host *sh = virtio_scsi_host(vdev); struct virtio_scsi *vscsi = shost_priv(sh); - return virtscsi_init(vdev, vscsi, sh->max_id); + return virtscsi_init(vdev, vscsi); } #endif @@ -794,8 +1001,7 @@ static int __init init(void) virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0); if (!virtscsi_cmd_cache) { - printk(KERN_ERR "kmem_cache_create() for " - "virtscsi_cmd_cache failed\n"); + pr_err("kmem_cache_create() for virtscsi_cmd_cache failed\n"); goto error; } @@ -804,8 +1010,7 @@ static int __init init(void) mempool_create_slab_pool(VIRTIO_SCSI_MEMPOOL_SZ, virtscsi_cmd_cache); if (!virtscsi_cmd_pool) { - printk(KERN_ERR "mempool_create() for" - "virtscsi_cmd_pool failed\n"); + pr_err("mempool_create() for virtscsi_cmd_pool failed\n"); goto error; } ret = register_virtio_driver(&virtio_scsi_driver); diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 26a64e5b8a58..8b9226da3f54 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -1,6 +1,7 @@ config VHOST_NET tristate "Host kernel accelerator for virtio net" depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP) + select VHOST_RING ---help--- This kernel module can be loaded in host kernel to accelerate guest networking with virtio_net. Not to be confused with virtio_net @@ -12,7 +13,14 @@ config VHOST_NET config VHOST_SCSI tristate "VHOST_SCSI TCM fabric driver" depends on TARGET_CORE && EVENTFD && m + select VHOST_RING default n ---help--- Say M here to enable the vhost_scsi TCM fabric module for use with virtio-scsi guests + +config VHOST_RING + tristate + ---help--- + This option is selected by any driver which needs to access + the host side of a virtio ring. diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index ef21d5fdfa7d..654e9afb11f5 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -3,3 +3,5 @@ vhost_net-y := vhost.o net.o obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o vhost_scsi-y := scsi.o + +obj-$(CONFIG_VHOST_RING) += vringh.o diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c index be65414d5bb1..1ee45bc85f67 100644 --- a/drivers/vhost/test.c +++ b/drivers/vhost/test.c @@ -282,7 +282,9 @@ static long vhost_test_ioctl(struct file *f, unsigned int ioctl, return vhost_test_reset_owner(n); default: mutex_lock(&n->dev.mutex); - r = vhost_dev_ioctl(&n->dev, ioctl, arg); + r = vhost_dev_ioctl(&n->dev, ioctl, argp); + if (r == -ENOIOCTLCMD) + r = vhost_vring_ioctl(&n->dev, ioctl, argp); vhost_test_flush(n); mutex_unlock(&n->dev.mutex); return r; diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c new file mode 100644 index 000000000000..bff0775e258c --- /dev/null +++ b/drivers/vhost/vringh.c @@ -0,0 +1,1007 @@ +/* + * Helpers for the host side of a virtio ring. + * + * Since these may be in userspace, we use (inline) accessors. + */ +#include <linux/vringh.h> +#include <linux/virtio_ring.h> +#include <linux/kernel.h> +#include <linux/ratelimit.h> +#include <linux/uaccess.h> +#include <linux/slab.h> +#include <linux/export.h> + +static __printf(1,2) __cold void vringh_bad(const char *fmt, ...) +{ + static DEFINE_RATELIMIT_STATE(vringh_rs, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + if (__ratelimit(&vringh_rs)) { + va_list ap; + va_start(ap, fmt); + printk(KERN_NOTICE "vringh:"); + vprintk(fmt, ap); + va_end(ap); + } +} + +/* Returns vring->num if empty, -ve on error. */ +static inline int __vringh_get_head(const struct vringh *vrh, + int (*getu16)(u16 *val, const u16 *p), + u16 *last_avail_idx) +{ + u16 avail_idx, i, head; + int err; + + err = getu16(&avail_idx, &vrh->vring.avail->idx); + if (err) { + vringh_bad("Failed to access avail idx at %p", + &vrh->vring.avail->idx); + return err; + } + + if (*last_avail_idx == avail_idx) + return vrh->vring.num; + + /* Only get avail ring entries after they have been exposed by guest. */ + virtio_rmb(vrh->weak_barriers); + + i = *last_avail_idx & (vrh->vring.num - 1); + + err = getu16(&head, &vrh->vring.avail->ring[i]); + if (err) { + vringh_bad("Failed to read head: idx %d address %p", + *last_avail_idx, &vrh->vring.avail->ring[i]); + return err; + } + + if (head >= vrh->vring.num) { + vringh_bad("Guest says index %u > %u is available", + head, vrh->vring.num); + return -EINVAL; + } + + (*last_avail_idx)++; + return head; +} + +/* Copy some bytes to/from the iovec. Returns num copied. */ +static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov, + void *ptr, size_t len, + int (*xfer)(void *addr, void *ptr, + size_t len)) +{ + int err, done = 0; + + while (len && iov->i < iov->used) { + size_t partlen; + + partlen = min(iov->iov[iov->i].iov_len, len); + err = xfer(iov->iov[iov->i].iov_base, ptr, partlen); + if (err) + return err; + done += partlen; + len -= partlen; + ptr += partlen; + iov->consumed += partlen; + iov->iov[iov->i].iov_len -= partlen; + iov->iov[iov->i].iov_base += partlen; + + if (!iov->iov[iov->i].iov_len) { + /* Fix up old iov element then increment. */ + iov->iov[iov->i].iov_len = iov->consumed; + iov->iov[iov->i].iov_base -= iov->consumed; + + iov->consumed = 0; + iov->i++; + } + } + return done; +} + +/* May reduce *len if range is shorter. */ +static inline bool range_check(struct vringh *vrh, u64 addr, size_t *len, + struct vringh_range *range, + bool (*getrange)(struct vringh *, + u64, struct vringh_range *)) +{ + if (addr < range->start || addr > range->end_incl) { + if (!getrange(vrh, addr, range)) + return false; + } + BUG_ON(addr < range->start || addr > range->end_incl); + + /* To end of memory? */ + if (unlikely(addr + *len == 0)) { + if (range->end_incl == -1ULL) + return true; + goto truncate; + } + + /* Otherwise, don't wrap. */ + if (addr + *len < addr) { + vringh_bad("Wrapping descriptor %zu@0x%llx", + *len, (unsigned long long)addr); + return false; + } + + if (unlikely(addr + *len - 1 > range->end_incl)) + goto truncate; + return true; + +truncate: + *len = range->end_incl + 1 - addr; + return true; +} + +static inline bool no_range_check(struct vringh *vrh, u64 addr, size_t *len, + struct vringh_range *range, + bool (*getrange)(struct vringh *, + u64, struct vringh_range *)) +{ + return true; +} + +/* No reason for this code to be inline. */ +static int move_to_indirect(int *up_next, u16 *i, void *addr, + const struct vring_desc *desc, + struct vring_desc **descs, int *desc_max) +{ + /* Indirect tables can't have indirect. */ + if (*up_next != -1) { + vringh_bad("Multilevel indirect %u->%u", *up_next, *i); + return -EINVAL; + } + + if (unlikely(desc->len % sizeof(struct vring_desc))) { + vringh_bad("Strange indirect len %u", desc->len); + return -EINVAL; + } + + /* We will check this when we follow it! */ + if (desc->flags & VRING_DESC_F_NEXT) + *up_next = desc->next; + else + *up_next = -2; + *descs = addr; + *desc_max = desc->len / sizeof(struct vring_desc); + + /* Now, start at the first indirect. */ + *i = 0; + return 0; +} + +static int resize_iovec(struct vringh_kiov *iov, gfp_t gfp) +{ + struct kvec *new; + unsigned int flag, new_num = (iov->max_num & ~VRINGH_IOV_ALLOCATED) * 2; + + if (new_num < 8) + new_num = 8; + + flag = (iov->max_num & VRINGH_IOV_ALLOCATED); + if (flag) + new = krealloc(iov->iov, new_num * sizeof(struct iovec), gfp); + else { + new = kmalloc(new_num * sizeof(struct iovec), gfp); + if (new) { + memcpy(new, iov->iov, + iov->max_num * sizeof(struct iovec)); + flag = VRINGH_IOV_ALLOCATED; + } + } + if (!new) + return -ENOMEM; + iov->iov = new; + iov->max_num = (new_num | flag); + return 0; +} + +static u16 __cold return_from_indirect(const struct vringh *vrh, int *up_next, + struct vring_desc **descs, int *desc_max) +{ + u16 i = *up_next; + + *up_next = -1; + *descs = vrh->vring.desc; + *desc_max = vrh->vring.num; + return i; +} + +static int slow_copy(struct vringh *vrh, void *dst, const void *src, + bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len, + struct vringh_range *range, + bool (*getrange)(struct vringh *vrh, + u64, + struct vringh_range *)), + bool (*getrange)(struct vringh *vrh, + u64 addr, + struct vringh_range *r), + struct vringh_range *range, + int (*copy)(void *dst, const void *src, size_t len)) +{ + size_t part, len = sizeof(struct vring_desc); + + do { + u64 addr; + int err; + + part = len; + addr = (u64)(unsigned long)src - range->offset; + + if (!rcheck(vrh, addr, &part, range, getrange)) + return -EINVAL; + + err = copy(dst, src, part); + if (err) + return err; + + dst += part; + src += part; + len -= part; + } while (len); + return 0; +} + +static inline int +__vringh_iov(struct vringh *vrh, u16 i, + struct vringh_kiov *riov, + struct vringh_kiov *wiov, + bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len, + struct vringh_range *range, + bool (*getrange)(struct vringh *, u64, + struct vringh_range *)), + bool (*getrange)(struct vringh *, u64, struct vringh_range *), + gfp_t gfp, + int (*copy)(void *dst, const void *src, size_t len)) +{ + int err, count = 0, up_next, desc_max; + struct vring_desc desc, *descs; + struct vringh_range range = { -1ULL, 0 }, slowrange; + bool slow = false; + + /* We start traversing vring's descriptor table. */ + descs = vrh->vring.desc; + desc_max = vrh->vring.num; + up_next = -1; + + if (riov) + riov->i = riov->used = 0; + else if (wiov) + wiov->i = wiov->used = 0; + else + /* You must want something! */ + BUG(); + + for (;;) { + void *addr; + struct vringh_kiov *iov; + size_t len; + + if (unlikely(slow)) + err = slow_copy(vrh, &desc, &descs[i], rcheck, getrange, + &slowrange, copy); + else + err = copy(&desc, &descs[i], sizeof(desc)); + if (unlikely(err)) + goto fail; + + if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) { + /* Make sure it's OK, and get offset. */ + len = desc.len; + if (!rcheck(vrh, desc.addr, &len, &range, getrange)) { + err = -EINVAL; + goto fail; + } + + if (unlikely(len != desc.len)) { + slow = true; + /* We need to save this range to use offset */ + slowrange = range; + } + + addr = (void *)(long)(desc.addr + range.offset); + err = move_to_indirect(&up_next, &i, addr, &desc, + &descs, &desc_max); + if (err) + goto fail; + continue; + } + + if (count++ == vrh->vring.num) { + vringh_bad("Descriptor loop in %p", descs); + err = -ELOOP; + goto fail; + } + + if (desc.flags & VRING_DESC_F_WRITE) + iov = wiov; + else { + iov = riov; + if (unlikely(wiov && wiov->i)) { + vringh_bad("Readable desc %p after writable", + &descs[i]); + err = -EINVAL; + goto fail; + } + } + + if (!iov) { + vringh_bad("Unexpected %s desc", + !wiov ? "writable" : "readable"); + err = -EPROTO; + goto fail; + } + + again: + /* Make sure it's OK, and get offset. */ + len = desc.len; + if (!rcheck(vrh, desc.addr, &len, &range, getrange)) { + err = -EINVAL; + goto fail; + } + addr = (void *)(unsigned long)(desc.addr + range.offset); + + if (unlikely(iov->used == (iov->max_num & ~VRINGH_IOV_ALLOCATED))) { + err = resize_iovec(iov, gfp); + if (err) + goto fail; + } + + iov->iov[iov->used].iov_base = addr; + iov->iov[iov->used].iov_len = len; + iov->used++; + + if (unlikely(len != desc.len)) { + desc.len -= len; + desc.addr += len; + goto again; + } + + if (desc.flags & VRING_DESC_F_NEXT) { + i = desc.next; + } else { + /* Just in case we need to finish traversing above. */ + if (unlikely(up_next > 0)) { + i = return_from_indirect(vrh, &up_next, + &descs, &desc_max); + slow = false; + } else + break; + } + + if (i >= desc_max) { + vringh_bad("Chained index %u > %u", i, desc_max); + err = -EINVAL; + goto fail; + } + } + + return 0; + +fail: + return err; +} + +static inline int __vringh_complete(struct vringh *vrh, + const struct vring_used_elem *used, + unsigned int num_used, + int (*putu16)(u16 *p, u16 val), + int (*putused)(struct vring_used_elem *dst, + const struct vring_used_elem + *src, unsigned num)) +{ + struct vring_used *used_ring; + int err; + u16 used_idx, off; + + used_ring = vrh->vring.used; + used_idx = vrh->last_used_idx + vrh->completed; + + off = used_idx % vrh->vring.num; + + /* Compiler knows num_used == 1 sometimes, hence extra check */ + if (num_used > 1 && unlikely(off + num_used >= vrh->vring.num)) { + u16 part = vrh->vring.num - off; + err = putused(&used_ring->ring[off], used, part); + if (!err) + err = putused(&used_ring->ring[0], used + part, + num_used - part); + } else + err = putused(&used_ring->ring[off], used, num_used); + + if (err) { + vringh_bad("Failed to write %u used entries %u at %p", + num_used, off, &used_ring->ring[off]); + return err; + } + + /* Make sure buffer is written before we update index. */ + virtio_wmb(vrh->weak_barriers); + + err = putu16(&vrh->vring.used->idx, used_idx + num_used); + if (err) { + vringh_bad("Failed to update used index at %p", + &vrh->vring.used->idx); + return err; + } + + vrh->completed += num_used; + return 0; +} + + +static inline int __vringh_need_notify(struct vringh *vrh, + int (*getu16)(u16 *val, const u16 *p)) +{ + bool notify; + u16 used_event; + int err; + + /* Flush out used index update. This is paired with the + * barrier that the Guest executes when enabling + * interrupts. */ + virtio_mb(vrh->weak_barriers); + + /* Old-style, without event indices. */ + if (!vrh->event_indices) { + u16 flags; + err = getu16(&flags, &vrh->vring.avail->flags); + if (err) { + vringh_bad("Failed to get flags at %p", + &vrh->vring.avail->flags); + return err; + } + return (!(flags & VRING_AVAIL_F_NO_INTERRUPT)); + } + + /* Modern: we know when other side wants to know. */ + err = getu16(&used_event, &vring_used_event(&vrh->vring)); + if (err) { + vringh_bad("Failed to get used event idx at %p", + &vring_used_event(&vrh->vring)); + return err; + } + + /* Just in case we added so many that we wrap. */ + if (unlikely(vrh->completed > 0xffff)) + notify = true; + else + notify = vring_need_event(used_event, + vrh->last_used_idx + vrh->completed, + vrh->last_used_idx); + + vrh->last_used_idx += vrh->completed; + vrh->completed = 0; + return notify; +} + +static inline bool __vringh_notify_enable(struct vringh *vrh, + int (*getu16)(u16 *val, const u16 *p), + int (*putu16)(u16 *p, u16 val)) +{ + u16 avail; + + if (!vrh->event_indices) { + /* Old-school; update flags. */ + if (putu16(&vrh->vring.used->flags, 0) != 0) { + vringh_bad("Clearing used flags %p", + &vrh->vring.used->flags); + return true; + } + } else { + if (putu16(&vring_avail_event(&vrh->vring), + vrh->last_avail_idx) != 0) { + vringh_bad("Updating avail event index %p", + &vring_avail_event(&vrh->vring)); + return true; + } + } + + /* They could have slipped one in as we were doing that: make + * sure it's written, then check again. */ + virtio_mb(vrh->weak_barriers); + + if (getu16(&avail, &vrh->vring.avail->idx) != 0) { + vringh_bad("Failed to check avail idx at %p", + &vrh->vring.avail->idx); + return true; + } + + /* This is unlikely, so we just leave notifications enabled + * (if we're using event_indices, we'll only get one + * notification anyway). */ + return avail == vrh->last_avail_idx; +} + +static inline void __vringh_notify_disable(struct vringh *vrh, + int (*putu16)(u16 *p, u16 val)) +{ + if (!vrh->event_indices) { + /* Old-school; update flags. */ + if (putu16(&vrh->vring.used->flags, VRING_USED_F_NO_NOTIFY)) { + vringh_bad("Setting used flags %p", + &vrh->vring.used->flags); + } + } +} + +/* Userspace access helpers: in this case, addresses are really userspace. */ +static inline int getu16_user(u16 *val, const u16 *p) +{ + return get_user(*val, (__force u16 __user *)p); +} + +static inline int putu16_user(u16 *p, u16 val) +{ + return put_user(val, (__force u16 __user *)p); +} + +static inline int copydesc_user(void *dst, const void *src, size_t len) +{ + return copy_from_user(dst, (__force void __user *)src, len) ? + -EFAULT : 0; +} + +static inline int putused_user(struct vring_used_elem *dst, + const struct vring_used_elem *src, + unsigned int num) +{ + return copy_to_user((__force void __user *)dst, src, + sizeof(*dst) * num) ? -EFAULT : 0; +} + +static inline int xfer_from_user(void *src, void *dst, size_t len) +{ + return copy_from_user(dst, (__force void __user *)src, len) ? + -EFAULT : 0; +} + +static inline int xfer_to_user(void *dst, void *src, size_t len) +{ + return copy_to_user((__force void __user *)dst, src, len) ? + -EFAULT : 0; +} + +/** + * vringh_init_user - initialize a vringh for a userspace vring. + * @vrh: the vringh to initialize. + * @features: the feature bits for this ring. + * @num: the number of elements. + * @weak_barriers: true if we only need memory barriers, not I/O. + * @desc: the userpace descriptor pointer. + * @avail: the userpace avail pointer. + * @used: the userpace used pointer. + * + * Returns an error if num is invalid: you should check pointers + * yourself! + */ +int vringh_init_user(struct vringh *vrh, u32 features, + unsigned int num, bool weak_barriers, + struct vring_desc __user *desc, + struct vring_avail __user *avail, + struct vring_used __user *used) +{ + /* Sane power of 2 please! */ + if (!num || num > 0xffff || (num & (num - 1))) { + vringh_bad("Bad ring size %u", num); + return -EINVAL; + } + + vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX)); + vrh->weak_barriers = weak_barriers; + vrh->completed = 0; + vrh->last_avail_idx = 0; + vrh->last_used_idx = 0; + vrh->vring.num = num; + /* vring expects kernel addresses, but only used via accessors. */ + vrh->vring.desc = (__force struct vring_desc *)desc; + vrh->vring.avail = (__force struct vring_avail *)avail; + vrh->vring.used = (__force struct vring_used *)used; + return 0; +} +EXPORT_SYMBOL(vringh_init_user); + +/** + * vringh_getdesc_user - get next available descriptor from userspace ring. + * @vrh: the userspace vring. + * @riov: where to put the readable descriptors (or NULL) + * @wiov: where to put the writable descriptors (or NULL) + * @getrange: function to call to check ranges. + * @head: head index we received, for passing to vringh_complete_user(). + * + * Returns 0 if there was no descriptor, 1 if there was, or -errno. + * + * Note that on error return, you can tell the difference between an + * invalid ring and a single invalid descriptor: in the former case, + * *head will be vrh->vring.num. You may be able to ignore an invalid + * descriptor, but there's not much you can do with an invalid ring. + * + * Note that you may need to clean up riov and wiov, even on error! + */ +int vringh_getdesc_user(struct vringh *vrh, + struct vringh_iov *riov, + struct vringh_iov *wiov, + bool (*getrange)(struct vringh *vrh, + u64 addr, struct vringh_range *r), + u16 *head) +{ + int err; + + *head = vrh->vring.num; + err = __vringh_get_head(vrh, getu16_user, &vrh->last_avail_idx); + if (err < 0) + return err; + + /* Empty... */ + if (err == vrh->vring.num) + return 0; + + /* We need the layouts to be the identical for this to work */ + BUILD_BUG_ON(sizeof(struct vringh_kiov) != sizeof(struct vringh_iov)); + BUILD_BUG_ON(offsetof(struct vringh_kiov, iov) != + offsetof(struct vringh_iov, iov)); + BUILD_BUG_ON(offsetof(struct vringh_kiov, i) != + offsetof(struct vringh_iov, i)); + BUILD_BUG_ON(offsetof(struct vringh_kiov, used) != + offsetof(struct vringh_iov, used)); + BUILD_BUG_ON(offsetof(struct vringh_kiov, max_num) != + offsetof(struct vringh_iov, max_num)); + BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec)); + BUILD_BUG_ON(offsetof(struct iovec, iov_base) != + offsetof(struct kvec, iov_base)); + BUILD_BUG_ON(offsetof(struct iovec, iov_len) != + offsetof(struct kvec, iov_len)); + BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_base) + != sizeof(((struct kvec *)NULL)->iov_base)); + BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_len) + != sizeof(((struct kvec *)NULL)->iov_len)); + + *head = err; + err = __vringh_iov(vrh, *head, (struct vringh_kiov *)riov, + (struct vringh_kiov *)wiov, + range_check, getrange, GFP_KERNEL, copydesc_user); + if (err) + return err; + + return 1; +} +EXPORT_SYMBOL(vringh_getdesc_user); + +/** + * vringh_iov_pull_user - copy bytes from vring_iov. + * @riov: the riov as passed to vringh_getdesc_user() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. + */ +ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len) +{ + return vringh_iov_xfer((struct vringh_kiov *)riov, + dst, len, xfer_from_user); +} +EXPORT_SYMBOL(vringh_iov_pull_user); + +/** + * vringh_iov_push_user - copy bytes into vring_iov. + * @wiov: the wiov as passed to vringh_getdesc_user() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. + */ +ssize_t vringh_iov_push_user(struct vringh_iov *wiov, + const void *src, size_t len) +{ + return vringh_iov_xfer((struct vringh_kiov *)wiov, + (void *)src, len, xfer_to_user); +} +EXPORT_SYMBOL(vringh_iov_push_user); + +/** + * vringh_abandon_user - we've decided not to handle the descriptor(s). + * @vrh: the vring. + * @num: the number of descriptors to put back (ie. num + * vringh_get_user() to undo). + * + * The next vringh_get_user() will return the old descriptor(s) again. + */ +void vringh_abandon_user(struct vringh *vrh, unsigned int num) +{ + /* We only update vring_avail_event(vr) when we want to be notified, + * so we haven't changed that yet. */ + vrh->last_avail_idx -= num; +} +EXPORT_SYMBOL(vringh_abandon_user); + +/** + * vringh_complete_user - we've finished with descriptor, publish it. + * @vrh: the vring. + * @head: the head as filled in by vringh_getdesc_user. + * @len: the length of data we have written. + * + * You should check vringh_need_notify_user() after one or more calls + * to this function. + */ +int vringh_complete_user(struct vringh *vrh, u16 head, u32 len) +{ + struct vring_used_elem used; + + used.id = head; + used.len = len; + return __vringh_complete(vrh, &used, 1, putu16_user, putused_user); +} +EXPORT_SYMBOL(vringh_complete_user); + +/** + * vringh_complete_multi_user - we've finished with many descriptors. + * @vrh: the vring. + * @used: the head, length pairs. + * @num_used: the number of used elements. + * + * You should check vringh_need_notify_user() after one or more calls + * to this function. + */ +int vringh_complete_multi_user(struct vringh *vrh, + const struct vring_used_elem used[], + unsigned num_used) +{ + return __vringh_complete(vrh, used, num_used, + putu16_user, putused_user); +} +EXPORT_SYMBOL(vringh_complete_multi_user); + +/** + * vringh_notify_enable_user - we want to know if something changes. + * @vrh: the vring. + * + * This always enables notifications, but returns false if there are + * now more buffers available in the vring. + */ +bool vringh_notify_enable_user(struct vringh *vrh) +{ + return __vringh_notify_enable(vrh, getu16_user, putu16_user); +} +EXPORT_SYMBOL(vringh_notify_enable_user); + +/** + * vringh_notify_disable_user - don't tell us if something changes. + * @vrh: the vring. + * + * This is our normal running state: we disable and then only enable when + * we're going to sleep. + */ +void vringh_notify_disable_user(struct vringh *vrh) +{ + __vringh_notify_disable(vrh, putu16_user); +} +EXPORT_SYMBOL(vringh_notify_disable_user); + +/** + * vringh_need_notify_user - must we tell the other side about used buffers? + * @vrh: the vring we've called vringh_complete_user() on. + * + * Returns -errno or 0 if we don't need to tell the other side, 1 if we do. + */ +int vringh_need_notify_user(struct vringh *vrh) +{ + return __vringh_need_notify(vrh, getu16_user); +} +EXPORT_SYMBOL(vringh_need_notify_user); + +/* Kernelspace access helpers. */ +static inline int getu16_kern(u16 *val, const u16 *p) +{ + *val = ACCESS_ONCE(*p); + return 0; +} + +static inline int putu16_kern(u16 *p, u16 val) +{ + ACCESS_ONCE(*p) = val; + return 0; +} + +static inline int copydesc_kern(void *dst, const void *src, size_t len) +{ + memcpy(dst, src, len); + return 0; +} + +static inline int putused_kern(struct vring_used_elem *dst, + const struct vring_used_elem *src, + unsigned int num) +{ + memcpy(dst, src, num * sizeof(*dst)); + return 0; +} + +static inline int xfer_kern(void *src, void *dst, size_t len) +{ + memcpy(dst, src, len); + return 0; +} + +/** + * vringh_init_kern - initialize a vringh for a kernelspace vring. + * @vrh: the vringh to initialize. + * @features: the feature bits for this ring. + * @num: the number of elements. + * @weak_barriers: true if we only need memory barriers, not I/O. + * @desc: the userpace descriptor pointer. + * @avail: the userpace avail pointer. + * @used: the userpace used pointer. + * + * Returns an error if num is invalid. + */ +int vringh_init_kern(struct vringh *vrh, u32 features, + unsigned int num, bool weak_barriers, + struct vring_desc *desc, + struct vring_avail *avail, + struct vring_used *used) +{ + /* Sane power of 2 please! */ + if (!num || num > 0xffff || (num & (num - 1))) { + vringh_bad("Bad ring size %u", num); + return -EINVAL; + } + + vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX)); + vrh->weak_barriers = weak_barriers; + vrh->completed = 0; + vrh->last_avail_idx = 0; + vrh->last_used_idx = 0; + vrh->vring.num = num; + vrh->vring.desc = desc; + vrh->vring.avail = avail; + vrh->vring.used = used; + return 0; +} +EXPORT_SYMBOL(vringh_init_kern); + +/** + * vringh_getdesc_kern - get next available descriptor from kernelspace ring. + * @vrh: the kernelspace vring. + * @riov: where to put the readable descriptors (or NULL) + * @wiov: where to put the writable descriptors (or NULL) + * @head: head index we received, for passing to vringh_complete_kern(). + * @gfp: flags for allocating larger riov/wiov. + * + * Returns 0 if there was no descriptor, 1 if there was, or -errno. + * + * Note that on error return, you can tell the difference between an + * invalid ring and a single invalid descriptor: in the former case, + * *head will be vrh->vring.num. You may be able to ignore an invalid + * descriptor, but there's not much you can do with an invalid ring. + * + * Note that you may need to clean up riov and wiov, even on error! + */ +int vringh_getdesc_kern(struct vringh *vrh, + struct vringh_kiov *riov, + struct vringh_kiov *wiov, + u16 *head, + gfp_t gfp) +{ + int err; + + err = __vringh_get_head(vrh, getu16_kern, &vrh->last_avail_idx); + if (err < 0) + return err; + + /* Empty... */ + if (err == vrh->vring.num) + return 0; + + *head = err; + err = __vringh_iov(vrh, *head, riov, wiov, no_range_check, NULL, + gfp, copydesc_kern); + if (err) + return err; + + return 1; +} +EXPORT_SYMBOL(vringh_getdesc_kern); + +/** + * vringh_iov_pull_kern - copy bytes from vring_iov. + * @riov: the riov as passed to vringh_getdesc_kern() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. + */ +ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len) +{ + return vringh_iov_xfer(riov, dst, len, xfer_kern); +} +EXPORT_SYMBOL(vringh_iov_pull_kern); + +/** + * vringh_iov_push_kern - copy bytes into vring_iov. + * @wiov: the wiov as passed to vringh_getdesc_kern() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. + */ +ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov, + const void *src, size_t len) +{ + return vringh_iov_xfer(wiov, (void *)src, len, xfer_kern); +} +EXPORT_SYMBOL(vringh_iov_push_kern); + +/** + * vringh_abandon_kern - we've decided not to handle the descriptor(s). + * @vrh: the vring. + * @num: the number of descriptors to put back (ie. num + * vringh_get_kern() to undo). + * + * The next vringh_get_kern() will return the old descriptor(s) again. + */ +void vringh_abandon_kern(struct vringh *vrh, unsigned int num) +{ + /* We only update vring_avail_event(vr) when we want to be notified, + * so we haven't changed that yet. */ + vrh->last_avail_idx -= num; +} +EXPORT_SYMBOL(vringh_abandon_kern); + +/** + * vringh_complete_kern - we've finished with descriptor, publish it. + * @vrh: the vring. + * @head: the head as filled in by vringh_getdesc_kern. + * @len: the length of data we have written. + * + * You should check vringh_need_notify_kern() after one or more calls + * to this function. + */ +int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len) +{ + struct vring_used_elem used; + + used.id = head; + used.len = len; + + return __vringh_complete(vrh, &used, 1, putu16_kern, putused_kern); +} +EXPORT_SYMBOL(vringh_complete_kern); + +/** + * vringh_notify_enable_kern - we want to know if something changes. + * @vrh: the vring. + * + * This always enables notifications, but returns false if there are + * now more buffers available in the vring. + */ +bool vringh_notify_enable_kern(struct vringh *vrh) +{ + return __vringh_notify_enable(vrh, getu16_kern, putu16_kern); +} +EXPORT_SYMBOL(vringh_notify_enable_kern); + +/** + * vringh_notify_disable_kern - don't tell us if something changes. + * @vrh: the vring. + * + * This is our normal running state: we disable and then only enable when + * we're going to sleep. + */ +void vringh_notify_disable_kern(struct vringh *vrh) +{ + __vringh_notify_disable(vrh, putu16_kern); +} +EXPORT_SYMBOL(vringh_notify_disable_kern); + +/** + * vringh_need_notify_kern - must we tell the other side about used buffers? + * @vrh: the vring we've called vringh_complete_kern() on. + * + * Returns -errno or 0 if we don't need to tell the other side, 1 if we do. + */ +int vringh_need_notify_kern(struct vringh *vrh) +{ + return __vringh_need_notify(vrh, getu16_kern); +} +EXPORT_SYMBOL(vringh_need_notify_kern); diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 8dab163c5ef0..bd3ae324a1a2 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -108,7 +108,7 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq) sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns); /* We should always be able to add one buffer to an empty queue. */ - if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0) + if (virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL) < 0) BUG(); virtqueue_kick(vq); @@ -256,7 +256,7 @@ static void stats_handle_request(struct virtio_balloon *vb) if (!virtqueue_get_buf(vq, &len)) return; sg_init_one(&sg, vb->stats, sizeof(vb->stats)); - if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0) + if (virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL) < 0) BUG(); virtqueue_kick(vq); } @@ -341,7 +341,7 @@ static int init_vqs(struct virtio_balloon *vb) * use it to signal us later. */ sg_init_one(&sg, vb->stats, sizeof vb->stats); - if (virtqueue_add_buf(vb->stats_vq, &sg, 1, 0, vb, GFP_KERNEL) + if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL) < 0) BUG(); virtqueue_kick(vb->stats_vq); diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index ffd7e7da5d3b..5217baf5528c 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -24,27 +24,6 @@ #include <linux/module.h> #include <linux/hrtimer.h> -/* virtio guest is communicating with a virtual "device" that actually runs on - * a host processor. Memory barriers are used to control SMP effects. */ -#ifdef CONFIG_SMP -/* Where possible, use SMP barriers which are more lightweight than mandatory - * barriers, because mandatory barriers control MMIO effects on accesses - * through relaxed memory I/O windows (which virtio-pci does not use). */ -#define virtio_mb(vq) \ - do { if ((vq)->weak_barriers) smp_mb(); else mb(); } while(0) -#define virtio_rmb(vq) \ - do { if ((vq)->weak_barriers) smp_rmb(); else rmb(); } while(0) -#define virtio_wmb(vq) \ - do { if ((vq)->weak_barriers) smp_wmb(); else wmb(); } while(0) -#else -/* We must force memory ordering even if guest is UP since host could be - * running on another CPU, but SMP barriers are defined to barrier() in that - * configuration. So fall back to mandatory barriers instead. */ -#define virtio_mb(vq) mb() -#define virtio_rmb(vq) rmb() -#define virtio_wmb(vq) wmb() -#endif - #ifdef DEBUG /* For development, we want to crash whenever the ring is screwed. */ #define BAD_RING(_vq, fmt, args...) \ @@ -119,16 +98,36 @@ struct vring_virtqueue #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) +static inline struct scatterlist *sg_next_chained(struct scatterlist *sg, + unsigned int *count) +{ + return sg_next(sg); +} + +static inline struct scatterlist *sg_next_arr(struct scatterlist *sg, + unsigned int *count) +{ + if (--(*count) == 0) + return NULL; + return sg + 1; +} + /* Set up an indirect table of descriptors and add it to the queue. */ -static int vring_add_indirect(struct vring_virtqueue *vq, - struct scatterlist sg[], - unsigned int out, - unsigned int in, - gfp_t gfp) +static inline int vring_add_indirect(struct vring_virtqueue *vq, + struct scatterlist *sgs[], + struct scatterlist *(*next) + (struct scatterlist *, unsigned int *), + unsigned int total_sg, + unsigned int total_out, + unsigned int total_in, + unsigned int out_sgs, + unsigned int in_sgs, + gfp_t gfp) { struct vring_desc *desc; unsigned head; - int i; + struct scatterlist *sg; + int i, n; /* * We require lowmem mappings for the descriptors because @@ -137,25 +136,31 @@ static int vring_add_indirect(struct vring_virtqueue *vq, */ gfp &= ~(__GFP_HIGHMEM | __GFP_HIGH); - desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp); + desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp); if (!desc) return -ENOMEM; - /* Transfer entries from the sg list into the indirect page */ - for (i = 0; i < out; i++) { - desc[i].flags = VRING_DESC_F_NEXT; - desc[i].addr = sg_phys(sg); - desc[i].len = sg->length; - desc[i].next = i+1; - sg++; + /* Transfer entries from the sg lists into the indirect page */ + i = 0; + for (n = 0; n < out_sgs; n++) { + for (sg = sgs[n]; sg; sg = next(sg, &total_out)) { + desc[i].flags = VRING_DESC_F_NEXT; + desc[i].addr = sg_phys(sg); + desc[i].len = sg->length; + desc[i].next = i+1; + i++; + } } - for (; i < (out + in); i++) { - desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; - desc[i].addr = sg_phys(sg); - desc[i].len = sg->length; - desc[i].next = i+1; - sg++; + for (; n < (out_sgs + in_sgs); n++) { + for (sg = sgs[n]; sg; sg = next(sg, &total_in)) { + desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; + desc[i].addr = sg_phys(sg); + desc[i].len = sg->length; + desc[i].next = i+1; + i++; + } } + BUG_ON(i != total_sg); /* Last one doesn't continue. */ desc[i-1].flags &= ~VRING_DESC_F_NEXT; @@ -176,29 +181,20 @@ static int vring_add_indirect(struct vring_virtqueue *vq, return head; } -/** - * virtqueue_add_buf - expose buffer to other end - * @vq: the struct virtqueue we're talking about. - * @sg: the description of the buffer(s). - * @out_num: the number of sg readable by other side - * @in_num: the number of sg which are writable (after readable ones) - * @data: the token identifying the buffer. - * @gfp: how to do memory allocations (if necessary). - * - * Caller must ensure we don't call this with other virtqueue operations - * at the same time (except where noted). - * - * Returns zero or a negative error (ie. ENOSPC, ENOMEM). - */ -int virtqueue_add_buf(struct virtqueue *_vq, - struct scatterlist sg[], - unsigned int out, - unsigned int in, - void *data, - gfp_t gfp) +static inline int virtqueue_add(struct virtqueue *_vq, + struct scatterlist *sgs[], + struct scatterlist *(*next) + (struct scatterlist *, unsigned int *), + unsigned int total_out, + unsigned int total_in, + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + gfp_t gfp) { struct vring_virtqueue *vq = to_vvq(_vq); - unsigned int i, avail, uninitialized_var(prev); + struct scatterlist *sg; + unsigned int i, n, avail, uninitialized_var(prev), total_sg; int head; START_USE(vq); @@ -218,46 +214,54 @@ int virtqueue_add_buf(struct virtqueue *_vq, } #endif + total_sg = total_in + total_out; + /* If the host supports indirect descriptor tables, and we have multiple * buffers, then go indirect. FIXME: tune this threshold */ - if (vq->indirect && (out + in) > 1 && vq->vq.num_free) { - head = vring_add_indirect(vq, sg, out, in, gfp); + if (vq->indirect && total_sg > 1 && vq->vq.num_free) { + head = vring_add_indirect(vq, sgs, next, total_sg, total_out, + total_in, + out_sgs, in_sgs, gfp); if (likely(head >= 0)) goto add_head; } - BUG_ON(out + in > vq->vring.num); - BUG_ON(out + in == 0); + BUG_ON(total_sg > vq->vring.num); + BUG_ON(total_sg == 0); - if (vq->vq.num_free < out + in) { + if (vq->vq.num_free < total_sg) { pr_debug("Can't add buf len %i - avail = %i\n", - out + in, vq->vq.num_free); + total_sg, vq->vq.num_free); /* FIXME: for historical reasons, we force a notify here if * there are outgoing parts to the buffer. Presumably the * host should service the ring ASAP. */ - if (out) + if (out_sgs) vq->notify(&vq->vq); END_USE(vq); return -ENOSPC; } /* We're about to use some buffers from the free list. */ - vq->vq.num_free -= out + in; - - head = vq->free_head; - for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) { - vq->vring.desc[i].flags = VRING_DESC_F_NEXT; - vq->vring.desc[i].addr = sg_phys(sg); - vq->vring.desc[i].len = sg->length; - prev = i; - sg++; + vq->vq.num_free -= total_sg; + + head = i = vq->free_head; + for (n = 0; n < out_sgs; n++) { + for (sg = sgs[n]; sg; sg = next(sg, &total_out)) { + vq->vring.desc[i].flags = VRING_DESC_F_NEXT; + vq->vring.desc[i].addr = sg_phys(sg); + vq->vring.desc[i].len = sg->length; + prev = i; + i = vq->vring.desc[i].next; + } } - for (; in; i = vq->vring.desc[i].next, in--) { - vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; - vq->vring.desc[i].addr = sg_phys(sg); - vq->vring.desc[i].len = sg->length; - prev = i; - sg++; + for (; n < (out_sgs + in_sgs); n++) { + for (sg = sgs[n]; sg; sg = next(sg, &total_in)) { + vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; + vq->vring.desc[i].addr = sg_phys(sg); + vq->vring.desc[i].len = sg->length; + prev = i; + i = vq->vring.desc[i].next; + } } /* Last one doesn't continue. */ vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT; @@ -276,7 +280,7 @@ add_head: /* Descriptors and available array need to be set before we expose the * new available array entries. */ - virtio_wmb(vq); + virtio_wmb(vq->weak_barriers); vq->vring.avail->idx++; vq->num_added++; @@ -290,9 +294,122 @@ add_head: return 0; } + +/** + * virtqueue_add_buf - expose buffer to other end + * @vq: the struct virtqueue we're talking about. + * @sg: the description of the buffer(s). + * @out_num: the number of sg readable by other side + * @in_num: the number of sg which are writable (after readable ones) + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM). + */ +int virtqueue_add_buf(struct virtqueue *_vq, + struct scatterlist sg[], + unsigned int out, + unsigned int in, + void *data, + gfp_t gfp) +{ + struct scatterlist *sgs[2]; + + sgs[0] = sg; + sgs[1] = sg + out; + + return virtqueue_add(_vq, sgs, sg_next_arr, + out, in, out ? 1 : 0, in ? 1 : 0, data, gfp); +} EXPORT_SYMBOL_GPL(virtqueue_add_buf); /** + * virtqueue_add_sgs - expose buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sgs: array of terminated scatterlists. + * @out_num: the number of scatterlists readable by other side + * @in_num: the number of scatterlists which are writable (after readable ones) + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM). + */ +int virtqueue_add_sgs(struct virtqueue *_vq, + struct scatterlist *sgs[], + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + gfp_t gfp) +{ + unsigned int i, total_out, total_in; + + /* Count them first. */ + for (i = total_out = total_in = 0; i < out_sgs; i++) { + struct scatterlist *sg; + for (sg = sgs[i]; sg; sg = sg_next(sg)) + total_out++; + } + for (; i < out_sgs + in_sgs; i++) { + struct scatterlist *sg; + for (sg = sgs[i]; sg; sg = sg_next(sg)) + total_in++; + } + return virtqueue_add(_vq, sgs, sg_next_chained, + total_out, total_in, out_sgs, in_sgs, data, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_sgs); + +/** + * virtqueue_add_outbuf - expose output buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sgs: array of scatterlists (need not be terminated!) + * @num: the number of scatterlists readable by other side + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM). + */ +int virtqueue_add_outbuf(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + gfp_t gfp) +{ + return virtqueue_add(vq, &sg, sg_next_arr, num, 0, 1, 0, data, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_outbuf); + +/** + * virtqueue_add_inbuf - expose input buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sgs: array of scatterlists (need not be terminated!) + * @num: the number of scatterlists writable by other side + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM). + */ +int virtqueue_add_inbuf(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + gfp_t gfp) +{ + return virtqueue_add(vq, &sg, sg_next_arr, 0, num, 0, 1, data, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_inbuf); + +/** * virtqueue_kick_prepare - first half of split virtqueue_kick call. * @vq: the struct virtqueue * @@ -312,7 +429,7 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq) START_USE(vq); /* We need to expose available array entries before checking avail * event. */ - virtio_mb(vq); + virtio_mb(vq->weak_barriers); old = vq->vring.avail->idx - vq->num_added; new = vq->vring.avail->idx; @@ -436,7 +553,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) } /* Only get used array entries after they have been exposed by host. */ - virtio_rmb(vq); + virtio_rmb(vq->weak_barriers); last_used = (vq->last_used_idx & (vq->vring.num - 1)); i = vq->vring.used->ring[last_used].id; @@ -460,7 +577,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) * the read in the next get_buf call. */ if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { vring_used_event(&vq->vring) = vq->last_used_idx; - virtio_mb(vq); + virtio_mb(vq->weak_barriers); } #ifdef DEBUG @@ -513,7 +630,7 @@ bool virtqueue_enable_cb(struct virtqueue *_vq) * entry. Always do both to keep code simple. */ vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT; vring_used_event(&vq->vring) = vq->last_used_idx; - virtio_mb(vq); + virtio_mb(vq->weak_barriers); if (unlikely(more_used(vq))) { END_USE(vq); return false; @@ -553,7 +670,7 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq) /* TODO: tune this threshold */ bufs = (u16)(vq->vring.avail->idx - vq->last_used_idx) * 3 / 4; vring_used_event(&vq->vring) = vq->last_used_idx + bufs; - virtio_mb(vq); + virtio_mb(vq->weak_barriers); if (unlikely((u16)(vq->vring.used->idx - vq->last_used_idx) > bufs)) { END_USE(vq); return false; |