diff options
99 files changed, 849 insertions, 595 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 15853d522941..2c391338c675 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -440,7 +440,9 @@ prototypes: ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iterate) (struct file *, struct dir_context *); int (*iterate_shared) (struct file *, struct dir_context *); - unsigned int (*poll) (struct file *, struct poll_table_struct *); + __poll_t (*poll) (struct file *, struct poll_table_struct *); + struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t); + __poll_t (*poll_mask) (struct file *, __poll_t); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); @@ -471,7 +473,7 @@ prototypes: }; locking rules: - All may block. + All except for ->poll_mask may block. ->llseek() locking has moved from llseek to the individual llseek implementations. If your fs is not using generic_file_llseek, you @@ -503,6 +505,9 @@ in sys_read() and friends. the lease within the individual filesystem to record the result of the operation +->poll_mask can be called with or without the waitqueue lock for the waitqueue +returned from ->get_poll_head. + --------------------------- dquot_operations ------------------------------- prototypes: int (*write_dquot) (struct dquot *); diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 5fd325df59e2..829a7b7857a4 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -856,7 +856,9 @@ struct file_operations { ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iterate) (struct file *, struct dir_context *); - unsigned int (*poll) (struct file *, struct poll_table_struct *); + __poll_t (*poll) (struct file *, struct poll_table_struct *); + struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t); + __poll_t (*poll_mask) (struct file *, __poll_t); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); @@ -901,6 +903,17 @@ otherwise noted. activity on this file and (optionally) go to sleep until there is activity. Called by the select(2) and poll(2) system calls + get_poll_head: Returns the struct wait_queue_head that callers can + wait on. Callers need to check the returned events using ->poll_mask + once woken. Can return NULL to indicate polling is not supported, + or any error code using the ERR_PTR convention to indicate that a + grave error occured and ->poll_mask shall not be called. + + poll_mask: return the mask of EPOLL* values describing the file descriptor + state. Called either before going to sleep on the waitqueue returned by + get_poll_head, or after it has been woken. If ->get_poll_head and + ->poll_mask are implemented ->poll does not need to be implement. + unlocked_ioctl: called by the ioctl(2) system call. compat_ioctl: called by the ioctl(2) system call when 32 bit system calls diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index d6b27dab1b30..14a2f996e543 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -396,3 +396,4 @@ 382 i386 pkey_free sys_pkey_free __ia32_sys_pkey_free 383 i386 statx sys_statx __ia32_sys_statx 384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl +385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 4dfe42666d0c..cd36232ab62f 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -341,6 +341,7 @@ 330 common pkey_alloc __x64_sys_pkey_alloc 331 common pkey_free __x64_sys_pkey_free 332 common statx __x64_sys_statx +333 common io_pgetevents __x64_sys_io_pgetevents # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 7846c0c20cfe..89ed613c017e 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -347,7 +347,6 @@ static const struct proto_ops alg_proto_ops = { .sendpage = sock_no_sendpage, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, - .poll = sock_no_poll, .bind = alg_bind, .release = af_alg_release, @@ -1061,19 +1060,12 @@ void af_alg_async_cb(struct crypto_async_request *_req, int err) } EXPORT_SYMBOL_GPL(af_alg_async_cb); -/** - * af_alg_poll - poll system call handler - */ -__poll_t af_alg_poll(struct file *file, struct socket *sock, - poll_table *wait) +__poll_t af_alg_poll_mask(struct socket *sock, __poll_t events) { struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); struct af_alg_ctx *ctx = ask->private; - __poll_t mask; - - sock_poll_wait(file, sk_sleep(sk), wait); - mask = 0; + __poll_t mask = 0; if (!ctx->more || ctx->used) mask |= EPOLLIN | EPOLLRDNORM; @@ -1083,7 +1075,7 @@ __poll_t af_alg_poll(struct file *file, struct socket *sock, return mask; } -EXPORT_SYMBOL_GPL(af_alg_poll); +EXPORT_SYMBOL_GPL(af_alg_poll_mask); /** * af_alg_alloc_areq - allocate struct af_alg_async_req diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c index 4b07edd5a9ff..330cf9f2b767 100644 --- a/crypto/algif_aead.c +++ b/crypto/algif_aead.c @@ -375,7 +375,7 @@ static struct proto_ops algif_aead_ops = { .sendmsg = aead_sendmsg, .sendpage = af_alg_sendpage, .recvmsg = aead_recvmsg, - .poll = af_alg_poll, + .poll_mask = af_alg_poll_mask, }; static int aead_check_key(struct socket *sock) @@ -471,7 +471,7 @@ static struct proto_ops algif_aead_ops_nokey = { .sendmsg = aead_sendmsg_nokey, .sendpage = aead_sendpage_nokey, .recvmsg = aead_recvmsg_nokey, - .poll = af_alg_poll, + .poll_mask = af_alg_poll_mask, }; static void *aead_bind(const char *name, u32 type, u32 mask) diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c index 6c9b1927a520..bfcf595fd8f9 100644 --- a/crypto/algif_hash.c +++ b/crypto/algif_hash.c @@ -288,7 +288,6 @@ static struct proto_ops algif_hash_ops = { .mmap = sock_no_mmap, .bind = sock_no_bind, .setsockopt = sock_no_setsockopt, - .poll = sock_no_poll, .release = af_alg_release, .sendmsg = hash_sendmsg, @@ -396,7 +395,6 @@ static struct proto_ops algif_hash_ops_nokey = { .mmap = sock_no_mmap, .bind = sock_no_bind, .setsockopt = sock_no_setsockopt, - .poll = sock_no_poll, .release = af_alg_release, .sendmsg = hash_sendmsg_nokey, diff --git a/crypto/algif_rng.c b/crypto/algif_rng.c index 150c2b6480ed..22df3799a17b 100644 --- a/crypto/algif_rng.c +++ b/crypto/algif_rng.c @@ -106,7 +106,6 @@ static struct proto_ops algif_rng_ops = { .bind = sock_no_bind, .accept = sock_no_accept, .setsockopt = sock_no_setsockopt, - .poll = sock_no_poll, .sendmsg = sock_no_sendmsg, .sendpage = sock_no_sendpage, diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index c4e885df4564..15cf3c5222e0 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -205,7 +205,7 @@ static struct proto_ops algif_skcipher_ops = { .sendmsg = skcipher_sendmsg, .sendpage = af_alg_sendpage, .recvmsg = skcipher_recvmsg, - .poll = af_alg_poll, + .poll_mask = af_alg_poll_mask, }; static int skcipher_check_key(struct socket *sock) @@ -301,7 +301,7 @@ static struct proto_ops algif_skcipher_ops_nokey = { .sendmsg = skcipher_sendmsg_nokey, .sendpage = skcipher_sendpage_nokey, .recvmsg = skcipher_recvmsg_nokey, - .poll = af_alg_poll, + .poll_mask = af_alg_poll_mask, }; static void *skcipher_bind(const char *name, u32 type, u32 mask) diff --git a/drivers/char/random.c b/drivers/char/random.c index cd888d4ee605..a8fb0020ba5c 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -402,8 +402,7 @@ static struct poolinfo { /* * Static global variables */ -static DECLARE_WAIT_QUEUE_HEAD(random_read_wait); -static DECLARE_WAIT_QUEUE_HEAD(random_write_wait); +static DECLARE_WAIT_QUEUE_HEAD(random_wait); static struct fasync_struct *fasync; static DEFINE_SPINLOCK(random_ready_list_lock); @@ -722,8 +721,8 @@ retry: /* should we wake readers? */ if (entropy_bits >= random_read_wakeup_bits && - wq_has_sleeper(&random_read_wait)) { - wake_up_interruptible(&random_read_wait); + wq_has_sleeper(&random_wait)) { + wake_up_interruptible_poll(&random_wait, POLLIN); kill_fasync(&fasync, SIGIO, POLL_IN); } /* If the input pool is getting full, send some @@ -1397,7 +1396,7 @@ retry: trace_debit_entropy(r->name, 8 * ibytes); if (ibytes && (r->entropy_count >> ENTROPY_SHIFT) < random_write_wakeup_bits) { - wake_up_interruptible(&random_write_wait); + wake_up_interruptible_poll(&random_wait, POLLOUT); kill_fasync(&fasync, SIGIO, POLL_OUT); } @@ -1839,7 +1838,7 @@ _random_read(int nonblock, char __user *buf, size_t nbytes) if (nonblock) return -EAGAIN; - wait_event_interruptible(random_read_wait, + wait_event_interruptible(random_wait, ENTROPY_BITS(&input_pool) >= random_read_wakeup_bits); if (signal_pending(current)) @@ -1876,14 +1875,17 @@ urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) return ret; } +static struct wait_queue_head * +random_get_poll_head(struct file *file, __poll_t events) +{ + return &random_wait; +} + static __poll_t -random_poll(struct file *file, poll_table * wait) +random_poll_mask(struct file *file, __poll_t events) { - __poll_t mask; + __poll_t mask = 0; - poll_wait(file, &random_read_wait, wait); - poll_wait(file, &random_write_wait, wait); - mask = 0; if (ENTROPY_BITS(&input_pool) >= random_read_wakeup_bits) mask |= EPOLLIN | EPOLLRDNORM; if (ENTROPY_BITS(&input_pool) < random_write_wakeup_bits) @@ -1990,7 +1992,8 @@ static int random_fasync(int fd, struct file *filp, int on) const struct file_operations random_fops = { .read = random_read, .write = random_write, - .poll = random_poll, + .get_poll_head = random_get_poll_head, + .poll_mask = random_poll_mask, .unlocked_ioctl = random_ioctl, .fasync = random_fasync, .llseek = noop_llseek, @@ -2323,7 +2326,7 @@ void add_hwgenerator_randomness(const char *buffer, size_t count, * We'll be woken up again once below random_write_wakeup_thresh, * or when the calling thread is about to terminate. */ - wait_event_interruptible(random_write_wait, kthread_should_stop() || + wait_event_interruptible(random_wait, kthread_should_stop() || ENTROPY_BITS(&input_pool) <= random_write_wakeup_bits); mix_pool_bytes(poolp, buffer, count); credit_entropy_bits(poolp, entropy); diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index 1f8f489b4167..98f90aadd141 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -588,7 +588,7 @@ static const struct proto_ops data_sock_ops = { .getname = data_sock_getname, .sendmsg = mISDN_sock_sendmsg, .recvmsg = mISDN_sock_recvmsg, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = data_sock_setsockopt, @@ -745,7 +745,6 @@ static const struct proto_ops base_sock_ops = { .getname = sock_no_getname, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, - .poll = sock_no_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index ce61231e96ea..de51e8f70f44 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -1107,7 +1107,7 @@ static const struct proto_ops pppoe_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pppoe_getname, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index c4267ecefd85..157b67c1bf8e 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -624,7 +624,6 @@ static const struct proto_ops pptp_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pptp_getname, - .poll = sock_no_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, diff --git a/drivers/staging/comedi/drivers/serial2002.c b/drivers/staging/comedi/drivers/serial2002.c index b3f3b4a201af..5471b2212a62 100644 --- a/drivers/staging/comedi/drivers/serial2002.c +++ b/drivers/staging/comedi/drivers/serial2002.c @@ -113,7 +113,7 @@ static void serial2002_tty_read_poll_wait(struct file *f, int timeout) long elapsed; __poll_t mask; - mask = f->f_op->poll(f, &table.pt); + mask = vfs_poll(f, &table.pt); if (mask & (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR)) { break; @@ -136,7 +136,7 @@ static int serial2002_tty_read(struct file *f, int timeout) result = -1; if (!IS_ERR(f)) { - if (f->f_op->poll) { + if (file_can_poll(f)) { serial2002_tty_read_poll_wait(f, timeout); if (kernel_read(f, &ch, 1, &pos) == 1) diff --git a/drivers/staging/ipx/af_ipx.c b/drivers/staging/ipx/af_ipx.c index 5703dd176787..208b5c161631 100644 --- a/drivers/staging/ipx/af_ipx.c +++ b/drivers/staging/ipx/af_ipx.c @@ -1965,7 +1965,7 @@ static const struct proto_ops ipx_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = ipx_getname, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .ioctl = ipx_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ipx_compat_ioctl, diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c index 085700f1be10..2a1be859ee71 100644 --- a/drivers/vfio/virqfd.c +++ b/drivers/vfio/virqfd.c @@ -166,7 +166,7 @@ int vfio_virqfd_enable(void *opaque, init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup); init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc); - events = irqfd.file->f_op->poll(irqfd.file, &virqfd->pt); + events = vfs_poll(irqfd.file, &virqfd->pt); /* * Check if there was an event already pending on the eventfd diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index f0be5f35ab28..895eaa25807c 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -208,7 +208,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file) if (poll->wqh) return 0; - mask = file->f_op->poll(file, &poll->table); + mask = vfs_poll(file, &poll->table); if (mask) vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask)); if (mask & EPOLLERR) { @@ -5,6 +5,7 @@ * Implements an efficient asynchronous io interface. * * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved. + * Copyright 2018 Christoph Hellwig. * * See ../COPYING for licensing terms. */ @@ -46,6 +47,8 @@ #include "internal.h" +#define KIOCB_KEY 0 + #define AIO_RING_MAGIC 0xa10a10a1 #define AIO_RING_COMPAT_FEATURES 1 #define AIO_RING_INCOMPAT_FEATURES 0 @@ -156,21 +159,29 @@ struct kioctx { unsigned id; }; -/* - * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either - * cancelled or completed (this makes a certain amount of sense because - * successful cancellation - io_cancel() - does deliver the completion to - * userspace). - * - * And since most things don't implement kiocb cancellation and we'd really like - * kiocb completion to be lockless when possible, we use ki_cancel to - * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED - * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel(). - */ -#define KIOCB_CANCELLED ((void *) (~0ULL)) +struct fsync_iocb { + struct work_struct work; + struct file *file; + bool datasync; +}; + +struct poll_iocb { + struct file *file; + __poll_t events; + struct wait_queue_head *head; + + union { + struct wait_queue_entry wait; + struct work_struct work; + }; +}; struct aio_kiocb { - struct kiocb common; + union { + struct kiocb rw; + struct fsync_iocb fsync; + struct poll_iocb poll; + }; struct kioctx *ki_ctx; kiocb_cancel_fn *ki_cancel; @@ -264,9 +275,6 @@ static int __init aio_setup(void) kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); - - pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page)); - return 0; } __initcall(aio_setup); @@ -552,42 +560,20 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel) { - struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, common); + struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw); struct kioctx *ctx = req->ki_ctx; unsigned long flags; - spin_lock_irqsave(&ctx->ctx_lock, flags); - - if (!req->ki_list.next) - list_add(&req->ki_list, &ctx->active_reqs); + if (WARN_ON_ONCE(!list_empty(&req->ki_list))) + return; + spin_lock_irqsave(&ctx->ctx_lock, flags); + list_add_tail(&req->ki_list, &ctx->active_reqs); req->ki_cancel = cancel; - spin_unlock_irqrestore(&ctx->ctx_lock, flags); } EXPORT_SYMBOL(kiocb_set_cancel_fn); -static int kiocb_cancel(struct aio_kiocb *kiocb) -{ - kiocb_cancel_fn *old, *cancel; - - /* - * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it - * actually has a cancel function, hence the cmpxchg() - */ - - cancel = READ_ONCE(kiocb->ki_cancel); - do { - if (!cancel || cancel == KIOCB_CANCELLED) - return -EINVAL; - - old = cancel; - cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); - } while (cancel != old); - - return cancel(&kiocb->common); -} - /* * free_ioctx() should be RCU delayed to synchronize against the RCU * protected lookup_ioctx() and also needs process context to call @@ -634,7 +620,7 @@ static void free_ioctx_users(struct percpu_ref *ref) while (!list_empty(&ctx->active_reqs)) { req = list_first_entry(&ctx->active_reqs, struct aio_kiocb, ki_list); - kiocb_cancel(req); + req->ki_cancel(&req->rw); list_del_init(&req->ki_list); } @@ -1041,7 +1027,7 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) goto out_put; percpu_ref_get(&ctx->reqs); - + INIT_LIST_HEAD(&req->ki_list); req->ki_ctx = ctx; return req; out_put: @@ -1049,15 +1035,6 @@ out_put: return NULL; } -static void kiocb_free(struct aio_kiocb *req) -{ - if (req->common.ki_filp) - fput(req->common.ki_filp); - if (req->ki_eventfd != NULL) - eventfd_ctx_put(req->ki_eventfd); - kmem_cache_free(kiocb_cachep, req); -} - static struct kioctx *lookup_ioctx(unsigned long ctx_id) { struct aio_ring __user *ring = (void __user *)ctx_id; @@ -1088,44 +1065,14 @@ out: /* aio_complete * Called when the io request on the given iocb is complete. */ -static void aio_complete(struct kiocb *kiocb, long res, long res2) +static void aio_complete(struct aio_kiocb *iocb, long res, long res2) { - struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common); struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; struct io_event *ev_page, *event; unsigned tail, pos, head; unsigned long flags; - if (kiocb->ki_flags & IOCB_WRITE) { - struct file *file = kiocb->ki_filp; - - /* - * Tell lockdep we inherited freeze protection from submission - * thread. - */ - if (S_ISREG(file_inode(file)->i_mode)) - __sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE); - file_end_write(file); - } - - /* - * Special case handling for sync iocbs: - * - events go directly into the iocb for fast handling - * - the sync task with the iocb in its stack holds the single iocb - * ref, no other paths have a way to get another ref - * - the sync task helpfully left a reference to itself in the iocb - */ - BUG_ON(is_sync_kiocb(kiocb)); - - if (iocb->ki_list.next) { - unsigned long flags; - - spin_lock_irqsave(&ctx->ctx_lock, flags); - list_del(&iocb->ki_list); - spin_unlock_irqrestore(&ctx->ctx_lock, flags); - } - /* * Add a completion event to the ring buffer. Must be done holding * ctx->completion_lock to prevent other code from messing with the tail @@ -1179,11 +1126,12 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2) * eventfd. The eventfd_signal() function is safe to be called * from IRQ context. */ - if (iocb->ki_eventfd != NULL) + if (iocb->ki_eventfd) { eventfd_signal(iocb->ki_eventfd, 1); + eventfd_ctx_put(iocb->ki_eventfd); + } - /* everything turned out well, dispose of the aiocb. */ - kiocb_free(iocb); + kmem_cache_free(kiocb_cachep, iocb); /* * We have to order our ring_info tail store above and test @@ -1249,14 +1197,13 @@ static long aio_read_events_ring(struct kioctx *ctx, if (head == tail) break; - avail = min(avail, nr - ret); - avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - - ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE)); - pos = head + AIO_EVENTS_OFFSET; page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]; pos %= AIO_EVENTS_PER_PAGE; + avail = min(avail, nr - ret); + avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos); + ev = kmap(page); copy_ret = copy_to_user(event + ret, ev + pos, sizeof(*ev) * avail); @@ -1327,10 +1274,6 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, wait_event_interruptible_hrtimeout(ctx->wait, aio_read_events(ctx, min_nr, nr, event, &ret), until); - - if (!ret && signal_pending(current)) - ret = -EINTR; - return ret; } @@ -1446,6 +1389,58 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) return -EINVAL; } +static void aio_remove_iocb(struct aio_kiocb *iocb) +{ + struct kioctx *ctx = iocb->ki_ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->ctx_lock, flags); + list_del(&iocb->ki_list); + spin_unlock_irqrestore(&ctx->ctx_lock, flags); +} + +static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) +{ + struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw); + + if (!list_empty_careful(&iocb->ki_list)) + aio_remove_iocb(iocb); + + if (kiocb->ki_flags & IOCB_WRITE) { + struct inode *inode = file_inode(kiocb->ki_filp); + + /* + * Tell lockdep we inherited freeze protection from submission + * thread. + */ + if (S_ISREG(inode->i_mode)) + __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); + file_end_write(kiocb->ki_filp); + } + + fput(kiocb->ki_filp); + aio_complete(iocb, res, res2); +} + +static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) +{ + int ret; + + req->ki_filp = fget(iocb->aio_fildes); + if (unlikely(!req->ki_filp)) + return -EBADF; + req->ki_complete = aio_complete_rw; + req->ki_pos = iocb->aio_offset; + req->ki_flags = iocb_flags(req->ki_filp); + if (iocb->aio_flags & IOCB_FLAG_RESFD) + req->ki_flags |= IOCB_EVENTFD; + req->ki_hint = file_write_hint(req->ki_filp); + ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); + if (unlikely(ret)) + fput(req->ki_filp); + return ret; +} + static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec, bool vectored, bool compat, struct iov_iter *iter) { @@ -1465,11 +1460,11 @@ static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec, return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter); } -static inline ssize_t aio_ret(struct kiocb *req, ssize_t ret) +static inline void aio_rw_done(struct kiocb *req, ssize_t ret) { switch (ret) { case -EIOCBQUEUED: - return ret; + break; case -ERESTARTSYS: case -ERESTARTNOINTR: case -ERESTARTNOHAND: @@ -1481,85 +1476,270 @@ static inline ssize_t aio_ret(struct kiocb *req, ssize_t ret) ret = -EINTR; /*FALLTHRU*/ default: - aio_complete(req, ret, 0); - return 0; + aio_complete_rw(req, ret, 0); } } static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored, bool compat) { - struct file *file = req->ki_filp; struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; + struct file *file; ssize_t ret; + ret = aio_prep_rw(req, iocb); + if (ret) + return ret; + file = req->ki_filp; + + ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_READ))) - return -EBADF; + goto out_fput; + ret = -EINVAL; if (unlikely(!file->f_op->read_iter)) - return -EINVAL; + goto out_fput; ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter); if (ret) - return ret; + goto out_fput; ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) - ret = aio_ret(req, call_read_iter(file, req, &iter)); + aio_rw_done(req, call_read_iter(file, req, &iter)); kfree(iovec); +out_fput: + if (unlikely(ret)) + fput(file); return ret; } static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored, bool compat) { - struct file *file = req->ki_filp; struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; + struct file *file; ssize_t ret; + ret = aio_prep_rw(req, iocb); + if (ret) + return ret; + file = req->ki_filp; + + ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_WRITE))) - return -EBADF; + goto out_fput; + ret = -EINVAL; if (unlikely(!file->f_op->write_iter)) - return -EINVAL; + goto out_fput; ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter); if (ret) - return ret; + goto out_fput; ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) { - req->ki_flags |= IOCB_WRITE; - file_start_write(file); - ret = aio_ret(req, call_write_iter(file, req, &iter)); /* - * We release freeze protection in aio_complete(). Fool lockdep - * by telling it the lock got released so that it doesn't - * complain about held lock when we return to userspace. + * Open-code file_start_write here to grab freeze protection, + * which will be released by another thread in + * aio_complete_rw(). Fool lockdep by telling it the lock got + * released so that it doesn't complain about the held lock when + * we return to userspace. */ - if (S_ISREG(file_inode(file)->i_mode)) + if (S_ISREG(file_inode(file)->i_mode)) { + __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true); __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); + } + req->ki_flags |= IOCB_WRITE; + aio_rw_done(req, call_write_iter(file, req, &iter)); } kfree(iovec); +out_fput: + if (unlikely(ret)) + fput(file); return ret; } +static void aio_fsync_work(struct work_struct *work) +{ + struct fsync_iocb *req = container_of(work, struct fsync_iocb, work); + int ret; + + ret = vfs_fsync(req->file, req->datasync); + fput(req->file); + aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0); +} + +static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync) +{ + if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes || + iocb->aio_rw_flags)) + return -EINVAL; + req->file = fget(iocb->aio_fildes); + if (unlikely(!req->file)) + return -EBADF; + if (unlikely(!req->file->f_op->fsync)) { + fput(req->file); + return -EINVAL; + } + + req->datasync = datasync; + INIT_WORK(&req->work, aio_fsync_work); + schedule_work(&req->work); + return 0; +} + +/* need to use list_del_init so we can check if item was present */ +static inline bool __aio_poll_remove(struct poll_iocb *req) +{ + if (list_empty(&req->wait.entry)) + return false; + list_del_init(&req->wait.entry); + return true; +} + +static inline void __aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask) +{ + fput(iocb->poll.file); + aio_complete(iocb, mangle_poll(mask), 0); +} + +static void aio_poll_work(struct work_struct *work) +{ + struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, poll.work); + + if (!list_empty_careful(&iocb->ki_list)) + aio_remove_iocb(iocb); + __aio_poll_complete(iocb, iocb->poll.events); +} + +static int aio_poll_cancel(struct kiocb *iocb) +{ + struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw); + struct poll_iocb *req = &aiocb->poll; + struct wait_queue_head *head = req->head; + bool found = false; + + spin_lock(&head->lock); + found = __aio_poll_remove(req); + spin_unlock(&head->lock); + + if (found) { + req->events = 0; + INIT_WORK(&req->work, aio_poll_work); + schedule_work(&req->work); + } + return 0; +} + +static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, + void *key) +{ + struct poll_iocb *req = container_of(wait, struct poll_iocb, wait); + struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); + struct file *file = req->file; + __poll_t mask = key_to_poll(key); + + assert_spin_locked(&req->head->lock); + + /* for instances that support it check for an event match first: */ + if (mask && !(mask & req->events)) + return 0; + + mask = file->f_op->poll_mask(file, req->events); + if (!mask) + return 0; + + __aio_poll_remove(req); + + /* + * Try completing without a context switch if we can acquire ctx_lock + * without spinning. Otherwise we need to defer to a workqueue to + * avoid a deadlock due to the lock order. + */ + if (spin_trylock(&iocb->ki_ctx->ctx_lock)) { + list_del_init(&iocb->ki_list); + spin_unlock(&iocb->ki_ctx->ctx_lock); + + __aio_poll_complete(iocb, mask); + } else { + req->events = mask; + INIT_WORK(&req->work, aio_poll_work); + schedule_work(&req->work); + } + + return 1; +} + +static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb) +{ + struct kioctx *ctx = aiocb->ki_ctx; + struct poll_iocb *req = &aiocb->poll; + __poll_t mask; + + /* reject any unknown events outside the normal event mask. */ + if ((u16)iocb->aio_buf != iocb->aio_buf) + return -EINVAL; + /* reject fields that are not defined for poll */ + if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags) + return -EINVAL; + + req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; + req->file = fget(iocb->aio_fildes); + if (unlikely(!req->file)) + return -EBADF; + if (!file_has_poll_mask(req->file)) + goto out_fail; + + req->head = req->file->f_op->get_poll_head(req->file, req->events); + if (!req->head) + goto out_fail; + if (IS_ERR(req->head)) { + mask = EPOLLERR; + goto done; + } + + init_waitqueue_func_entry(&req->wait, aio_poll_wake); + aiocb->ki_cancel = aio_poll_cancel; + + spin_lock_irq(&ctx->ctx_lock); + spin_lock(&req->head->lock); + mask = req->file->f_op->poll_mask(req->file, req->events); + if (!mask) { + __add_wait_queue(req->head, &req->wait); + list_add_tail(&aiocb->ki_list, &ctx->active_reqs); + } + spin_unlock(&req->head->lock); + spin_unlock_irq(&ctx->ctx_lock); +done: + if (mask) + __aio_poll_complete(aiocb, mask); + return 0; +out_fail: + fput(req->file); + return -EINVAL; /* same as no support for IOCB_CMD_POLL */ +} + static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, - struct iocb *iocb, bool compat) + bool compat) { struct aio_kiocb *req; - struct file *file; + struct iocb iocb; ssize_t ret; + if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb)))) + return -EFAULT; + /* enforce forwards compatibility on users */ - if (unlikely(iocb->aio_reserved2)) { + if (unlikely(iocb.aio_reserved2)) { pr_debug("EINVAL: reserve field set\n"); return -EINVAL; } /* prevent overflows */ if (unlikely( - (iocb->aio_buf != (unsigned long)iocb->aio_buf) || - (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) || - ((ssize_t)iocb->aio_nbytes < 0) + (iocb.aio_buf != (unsigned long)iocb.aio_buf) || + (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) || + ((ssize_t)iocb.aio_nbytes < 0) )) { pr_debug("EINVAL: overflow check\n"); return -EINVAL; @@ -1569,37 +1749,19 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, if (unlikely(!req)) return -EAGAIN; - req->common.ki_filp = file = fget(iocb->aio_fildes); - if (unlikely(!req->common.ki_filp)) { - ret = -EBADF; - goto out_put_req; - } - req->common.ki_pos = iocb->aio_offset; - req->common.ki_complete = aio_complete; - req->common.ki_flags = iocb_flags(req->common.ki_filp); - req->common.ki_hint = file_write_hint(file); - - if (iocb->aio_flags & IOCB_FLAG_RESFD) { + if (iocb.aio_flags & IOCB_FLAG_RESFD) { /* * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an * instance of the file* now. The file descriptor must be * an eventfd() fd, and will be signaled for each completed * event using the eventfd_signal() function. */ - req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd); + req->ki_eventfd = eventfd_ctx_fdget((int) iocb.aio_resfd); if (IS_ERR(req->ki_eventfd)) { ret = PTR_ERR(req->ki_eventfd); req->ki_eventfd = NULL; goto out_put_req; } - - req->common.ki_flags |= IOCB_EVENTFD; - } - - ret = kiocb_set_rw_flags(&req->common, iocb->aio_rw_flags); - if (unlikely(ret)) { - pr_debug("EINVAL: aio_rw_flags\n"); - goto out_put_req; } ret = put_user(KIOCB_KEY, &user_iocb->aio_key); @@ -1609,41 +1771,67 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, } req->ki_user_iocb = user_iocb; - req->ki_user_data = iocb->aio_data; + req->ki_user_data = iocb.aio_data; - get_file(file); - switch (iocb->aio_lio_opcode) { + switch (iocb.aio_lio_opcode) { case IOCB_CMD_PREAD: - ret = aio_read(&req->common, iocb, false, compat); + ret = aio_read(&req->rw, &iocb, false, compat); break; case IOCB_CMD_PWRITE: - ret = aio_write(&req->common, iocb, false, compat); + ret = aio_write(&req->rw, &iocb, false, compat); break; case IOCB_CMD_PREADV: - ret = aio_read(&req->common, iocb, true, compat); + ret = aio_read(&req->rw, &iocb, true, compat); break; case IOCB_CMD_PWRITEV: - ret = aio_write(&req->common, iocb, true, compat); + ret = aio_write(&req->rw, &iocb, true, compat); + break; + case IOCB_CMD_FSYNC: + ret = aio_fsync(&req->fsync, &iocb, false); + break; + case IOCB_CMD_FDSYNC: + ret = aio_fsync(&req->fsync, &iocb, true); + break; + case IOCB_CMD_POLL: + ret = aio_poll(req, &iocb); break; default: - pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode); + pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode); ret = -EINVAL; break; } - fput(file); - if (ret && ret != -EIOCBQUEUED) + /* + * If ret is 0, we'd either done aio_complete() ourselves or have + * arranged for that to be done asynchronously. Anything non-zero + * means that we need to destroy req ourselves. + */ + if (ret) goto out_put_req; return 0; out_put_req: put_reqs_available(ctx, 1); percpu_ref_put(&ctx->reqs); - kiocb_free(req); + if (req->ki_eventfd) + eventfd_ctx_put(req->ki_eventfd); + kmem_cache_free(kiocb_cachep, req); return ret; } -static long do_io_submit(aio_context_t ctx_id, long nr, - struct iocb __user *__user *iocbpp, bool compat) +/* sys_io_submit: + * Queue the nr iocbs pointed to by iocbpp for processing. Returns + * the number of iocbs queued. May return -EINVAL if the aio_context + * specified by ctx_id is invalid, if nr is < 0, if the iocb at + * *iocbpp[0] is not properly initialized, if the operation specified + * is invalid for the file descriptor in the iocb. May fail with + * -EFAULT if any of the data structures point to invalid data. May + * fail with -EBADF if the file descriptor specified in the first + * iocb is invalid. May fail with -EAGAIN if insufficient resources + * are available to queue any iocbs. Will return 0 if nr is 0. Will + * fail with -ENOSYS if not implemented. + */ +SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, + struct iocb __user * __user *, iocbpp) { struct kioctx *ctx; long ret = 0; @@ -1653,39 +1841,25 @@ static long do_io_submit(aio_context_t ctx_id, long nr, if (unlikely(nr < 0)) return -EINVAL; - if (unlikely(nr > LONG_MAX/sizeof(*iocbpp))) - nr = LONG_MAX/sizeof(*iocbpp); - - if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp))))) - return -EFAULT; - ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) { pr_debug("EINVAL: invalid context id\n"); return -EINVAL; } - blk_start_plug(&plug); + if (nr > ctx->nr_events) + nr = ctx->nr_events; - /* - * AKPM: should this return a partial result if some of the IOs were - * successfully submitted? - */ - for (i=0; i<nr; i++) { + blk_start_plug(&plug); + for (i = 0; i < nr; i++) { struct iocb __user *user_iocb; - struct iocb tmp; - if (unlikely(__get_user(user_iocb, iocbpp + i))) { + if (unlikely(get_user(user_iocb, iocbpp + i))) { ret = -EFAULT; break; } - if (unlikely(copy_from_user(&tmp, user_iocb, sizeof(tmp)))) { - ret = -EFAULT; - break; - } - - ret = io_submit_one(ctx, user_iocb, &tmp, compat); + ret = io_submit_one(ctx, user_iocb, false); if (ret) break; } @@ -1695,59 +1869,44 @@ static long do_io_submit(aio_context_t ctx_id, long nr, return i ? i : ret; } -/* sys_io_submit: - * Queue the nr iocbs pointed to by iocbpp for processing. Returns - * the number of iocbs queued. May return -EINVAL if the aio_context - * specified by ctx_id is invalid, if nr is < 0, if the iocb at - * *iocbpp[0] is not properly initialized, if the operation specified - * is invalid for the file descriptor in the iocb. May fail with - * -EFAULT if any of the data structures point to invalid data. May - * fail with -EBADF if the file descriptor specified in the first - * iocb is invalid. May fail with -EAGAIN if insufficient resources - * are available to queue any iocbs. Will return 0 if nr is 0. Will - * fail with -ENOSYS if not implemented. - */ -SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, - struct iocb __user * __user *, iocbpp) -{ - return do_io_submit(ctx_id, nr, iocbpp, 0); -} - #ifdef CONFIG_COMPAT -static inline long -copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) +COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, + int, nr, compat_uptr_t __user *, iocbpp) { - compat_uptr_t uptr; - int i; + struct kioctx *ctx; + long ret = 0; + int i = 0; + struct blk_plug plug; - for (i = 0; i < nr; ++i) { - if (get_user(uptr, ptr32 + i)) - return -EFAULT; - if (put_user(compat_ptr(uptr), ptr64 + i)) - return -EFAULT; + if (unlikely(nr < 0)) + return -EINVAL; + + ctx = lookup_ioctx(ctx_id); + if (unlikely(!ctx)) { + pr_debug("EINVAL: invalid context id\n"); + return -EINVAL; } - return 0; -} -#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *)) + if (nr > ctx->nr_events) + nr = ctx->nr_events; -COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, - int, nr, u32 __user *, iocb) -{ - struct iocb __user * __user *iocb64; - long ret; + blk_start_plug(&plug); + for (i = 0; i < nr; i++) { + compat_uptr_t user_iocb; - if (unlikely(nr < 0)) - return -EINVAL; + if (unlikely(get_user(user_iocb, iocbpp + i))) { + ret = -EFAULT; + break; + } - if (nr > MAX_AIO_SUBMITS) - nr = MAX_AIO_SUBMITS; + ret = io_submit_one(ctx, compat_ptr(user_iocb), true); + if (ret) + break; + } + blk_finish_plug(&plug); - iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); - ret = copy_iocb(nr, iocb, iocb64); - if (!ret) - ret = do_io_submit(ctx_id, nr, iocb64, 1); - return ret; + percpu_ref_put(&ctx->users); + return i ? i : ret; } #endif @@ -1755,15 +1914,12 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, * Finds a given iocb for cancellation. */ static struct aio_kiocb * -lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key) +lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb) { struct aio_kiocb *kiocb; assert_spin_locked(&ctx->ctx_lock); - if (key != KIOCB_KEY) - return NULL; - /* TODO: use a hash or array, this sucks. */ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { if (kiocb->ki_user_iocb == iocb) @@ -1787,25 +1943,24 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, { struct kioctx *ctx; struct aio_kiocb *kiocb; + int ret = -EINVAL; u32 key; - int ret; - ret = get_user(key, &iocb->aio_key); - if (unlikely(ret)) + if (unlikely(get_user(key, &iocb->aio_key))) return -EFAULT; + if (unlikely(key != KIOCB_KEY)) + return -EINVAL; ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) return -EINVAL; spin_lock_irq(&ctx->ctx_lock); - - kiocb = lookup_kiocb(ctx, iocb, key); - if (kiocb) - ret = kiocb_cancel(kiocb); - else - ret = -EINVAL; - + kiocb = lookup_kiocb(ctx, iocb); + if (kiocb) { + ret = kiocb->ki_cancel(&kiocb->rw); + list_del_init(&kiocb->ki_list); + } spin_unlock_irq(&ctx->ctx_lock); if (!ret) { @@ -1860,13 +2015,60 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, struct timespec __user *, timeout) { struct timespec64 ts; + int ret; + + if (timeout && unlikely(get_timespec64(&ts, timeout))) + return -EFAULT; + + ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); + if (!ret && signal_pending(current)) + ret = -EINTR; + return ret; +} + +SYSCALL_DEFINE6(io_pgetevents, + aio_context_t, ctx_id, + long, min_nr, + long, nr, + struct io_event __user *, events, + struct timespec __user *, timeout, + const struct __aio_sigset __user *, usig) +{ + struct __aio_sigset ksig = { NULL, }; + sigset_t ksigmask, sigsaved; + struct timespec64 ts; + int ret; + + if (timeout && unlikely(get_timespec64(&ts, timeout))) + return -EFAULT; + + if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) + return -EFAULT; - if (timeout) { - if (unlikely(get_timespec64(&ts, timeout))) + if (ksig.sigmask) { + if (ksig.sigsetsize != sizeof(sigset_t)) + return -EINVAL; + if (copy_from_user(&ksigmask, ksig.sigmask, sizeof(ksigmask))) return -EFAULT; + sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); + if (signal_pending(current)) { + if (ksig.sigmask) { + current->saved_sigmask = sigsaved; + set_restore_sigmask(); + } + + if (!ret) + ret = -ERESTARTNOHAND; + } else { + if (ksig.sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); } - return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); + return ret; } #ifdef CONFIG_COMPAT @@ -1877,13 +2079,64 @@ COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, struct compat_timespec __user *, timeout) { struct timespec64 t; + int ret; + + if (timeout && compat_get_timespec64(&t, timeout)) + return -EFAULT; + + ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); + if (!ret && signal_pending(current)) + ret = -EINTR; + return ret; +} + + +struct __compat_aio_sigset { + compat_sigset_t __user *sigmask; + compat_size_t sigsetsize; +}; + +COMPAT_SYSCALL_DEFINE6(io_pgetevents, + compat_aio_context_t, ctx_id, + compat_long_t, min_nr, + compat_long_t, nr, + struct io_event __user *, events, + struct compat_timespec __user *, timeout, + const struct __compat_aio_sigset __user *, usig) +{ + struct __compat_aio_sigset ksig = { NULL, }; + sigset_t ksigmask, sigsaved; + struct timespec64 t; + int ret; + + if (timeout && compat_get_timespec64(&t, timeout)) + return -EFAULT; + + if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) + return -EFAULT; - if (timeout) { - if (compat_get_timespec64(&t, timeout)) + if (ksig.sigmask) { + if (ksig.sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (get_compat_sigset(&ksigmask, ksig.sigmask)) return -EFAULT; + sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); + if (signal_pending(current)) { + if (ksig.sigmask) { + current->saved_sigmask = sigsaved; + set_restore_sigmask(); + } + if (!ret) + ret = -ERESTARTNOHAND; + } else { + if (ksig.sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); } - return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); + return ret; } #endif diff --git a/fs/eventfd.c b/fs/eventfd.c index 08d3bd602f73..61c9514da5e9 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -101,14 +101,20 @@ static int eventfd_release(struct inode *inode, struct file *file) return 0; } -static __poll_t eventfd_poll(struct file *file, poll_table *wait) +static struct wait_queue_head * +eventfd_get_poll_head(struct file *file, __poll_t events) +{ + struct eventfd_ctx *ctx = file->private_data; + + return &ctx->wqh; +} + +static __poll_t eventfd_poll_mask(struct file *file, __poll_t eventmask) { struct eventfd_ctx *ctx = file->private_data; __poll_t events = 0; u64 count; - poll_wait(file, &ctx->wqh, wait); - /* * All writes to ctx->count occur within ctx->wqh.lock. This read * can be done outside ctx->wqh.lock because we know that poll_wait @@ -305,7 +311,8 @@ static const struct file_operations eventfd_fops = { .show_fdinfo = eventfd_show_fdinfo, #endif .release = eventfd_release, - .poll = eventfd_poll, + .get_poll_head = eventfd_get_poll_head, + .poll_mask = eventfd_poll_mask, .read = eventfd_read, .write = eventfd_write, .llseek = noop_llseek, diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 602ca4285b2e..67db22fe99c5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -884,8 +884,7 @@ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, pt->_key = epi->event.events; if (!is_file_epoll(epi->ffd.file)) - return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & - epi->event.events; + return vfs_poll(epi->ffd.file, pt) & epi->event.events; ep = epi->ffd.file->private_data; poll_wait(epi->ffd.file, &ep->poll_wait, pt); @@ -2025,7 +2024,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, /* The target file descriptor must support poll */ error = -EPERM; - if (!tf.file->f_op->poll) + if (!file_can_poll(tf.file)) goto error_tgt_fput; /* Check if EPOLLWAKEUP is allowed */ diff --git a/fs/pipe.c b/fs/pipe.c index 39d6f431da83..bb0840e234f3 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -509,19 +509,22 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } } -/* No kernel lock held - fine */ -static __poll_t -pipe_poll(struct file *filp, poll_table *wait) +static struct wait_queue_head * +pipe_get_poll_head(struct file *filp, __poll_t events) { - __poll_t mask; struct pipe_inode_info *pipe = filp->private_data; - int nrbufs; - poll_wait(filp, &pipe->wait, wait); + return &pipe->wait; +} + +/* No kernel lock held - fine */ +static __poll_t pipe_poll_mask(struct file *filp, __poll_t events) +{ + struct pipe_inode_info *pipe = filp->private_data; + int nrbufs = pipe->nrbufs; + __poll_t mask = 0; /* Reading only -- no need for acquiring the semaphore. */ - nrbufs = pipe->nrbufs; - mask = 0; if (filp->f_mode & FMODE_READ) { mask = (nrbufs > 0) ? EPOLLIN | EPOLLRDNORM : 0; if (!pipe->writers && filp->f_version != pipe->w_counter) @@ -1020,7 +1023,8 @@ const struct file_operations pipefifo_fops = { .llseek = no_llseek, .read_iter = pipe_read, .write_iter = pipe_write, - .poll = pipe_poll, + .get_poll_head = pipe_get_poll_head, + .poll_mask = pipe_poll_mask, .unlocked_ioctl = pipe_ioctl, .release = pipe_release, .fasync = pipe_fasync, diff --git a/fs/select.c b/fs/select.c index ba879c51288f..bc3cc0f98896 100644 --- a/fs/select.c +++ b/fs/select.c @@ -34,6 +34,29 @@ #include <linux/uaccess.h> +__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt) +{ + if (file->f_op->poll) { + return file->f_op->poll(file, pt); + } else if (file_has_poll_mask(file)) { + unsigned int events = poll_requested_events(pt); + struct wait_queue_head *head; + + if (pt && pt->_qproc) { + head = file->f_op->get_poll_head(file, events); + if (!head) + return DEFAULT_POLLMASK; + if (IS_ERR(head)) + return EPOLLERR; + pt->_qproc(file, head, pt); + } + + return file->f_op->poll_mask(file, events); + } else { + return DEFAULT_POLLMASK; + } +} +EXPORT_SYMBOL_GPL(vfs_poll); /* * Estimate expected accuracy in ns from a timeval. @@ -233,7 +256,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, add_wait_queue(wait_address, &entry->wait); } -int poll_schedule_timeout(struct poll_wqueues *pwq, int state, +static int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack) { int rc = -EINTR; @@ -258,7 +281,6 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state, return rc; } -EXPORT_SYMBOL(poll_schedule_timeout); /** * poll_select_set_timeout - helper function to setup the timeout value @@ -503,14 +525,10 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) continue; f = fdget(i); if (f.file) { - const struct file_operations *f_op; - f_op = f.file->f_op; - mask = DEFAULT_POLLMASK; - if (f_op->poll) { - wait_key_set(wait, in, out, - bit, busy_flag); - mask = (*f_op->poll)(f.file, wait); - } + wait_key_set(wait, in, out, bit, + busy_flag); + mask = vfs_poll(f.file, wait); + fdput(f); if ((mask & POLLIN_SET) && (in & bit)) { res_in |= bit; @@ -813,34 +831,29 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait, bool *can_busy_poll, __poll_t busy_flag) { - __poll_t mask; - int fd; - - mask = 0; - fd = pollfd->fd; - if (fd >= 0) { - struct fd f = fdget(fd); - mask = EPOLLNVAL; - if (f.file) { - /* userland u16 ->events contains POLL... bitmap */ - __poll_t filter = demangle_poll(pollfd->events) | - EPOLLERR | EPOLLHUP; - mask = DEFAULT_POLLMASK; - if (f.file->f_op->poll) { - pwait->_key = filter; - pwait->_key |= busy_flag; - mask = f.file->f_op->poll(f.file, pwait); - if (mask & busy_flag) - *can_busy_poll = true; - } - /* Mask out unneeded events. */ - mask &= filter; - fdput(f); - } - } + int fd = pollfd->fd; + __poll_t mask = 0, filter; + struct fd f; + + if (fd < 0) + goto out; + mask = EPOLLNVAL; + f = fdget(fd); + if (!f.file) + goto out; + + /* userland u16 ->events contains POLL... bitmap */ + filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP; + pwait->_key = filter | busy_flag; + mask = vfs_poll(f.file, pwait); + if (mask & busy_flag) + *can_busy_poll = true; + mask &= filter; /* Mask out unneeded events. */ + fdput(f); + +out: /* ... and so does ->revents */ pollfd->revents = mangle_poll(mask); - return mask; } diff --git a/fs/timerfd.c b/fs/timerfd.c index cdad49da3ff7..d84a2bee4f82 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -226,21 +226,20 @@ static int timerfd_release(struct inode *inode, struct file *file) kfree_rcu(ctx, rcu); return 0; } - -static __poll_t timerfd_poll(struct file *file, poll_table *wait) + +static struct wait_queue_head *timerfd_get_poll_head(struct file *file, + __poll_t eventmask) { struct timerfd_ctx *ctx = file->private_data; - __poll_t events = 0; - unsigned long flags; - poll_wait(file, &ctx->wqh, wait); + return &ctx->wqh; +} - spin_lock_irqsave(&ctx->wqh.lock, flags); - if (ctx->ticks) - events |= EPOLLIN; - spin_unlock_irqrestore(&ctx->wqh.lock, flags); +static __poll_t timerfd_poll_mask(struct file *file, __poll_t eventmask) +{ + struct timerfd_ctx *ctx = file->private_data; - return events; + return ctx->ticks ? EPOLLIN : 0; } static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, @@ -364,7 +363,8 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg static const struct file_operations timerfd_fops = { .release = timerfd_release, - .poll = timerfd_poll, + .get_poll_head = timerfd_get_poll_head, + .poll_mask = timerfd_poll_mask, .read = timerfd_read, .llseek = noop_llseek, .show_fdinfo = timerfd_show, diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h index 482461d8931d..cc414db9da0a 100644 --- a/include/crypto/if_alg.h +++ b/include/crypto/if_alg.h @@ -245,8 +245,7 @@ ssize_t af_alg_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); void af_alg_free_resources(struct af_alg_async_req *areq); void af_alg_async_cb(struct crypto_async_request *_req, int err); -__poll_t af_alg_poll(struct file *file, struct socket *sock, - poll_table *wait); +__poll_t af_alg_poll_mask(struct socket *sock, __poll_t events); struct af_alg_async_req *af_alg_alloc_areq(struct sock *sk, unsigned int areqlen); int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags, diff --git a/include/linux/aio.h b/include/linux/aio.h index 9d8aabecfe2d..b83e68dd006f 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -8,8 +8,6 @@ struct kioctx; struct kiocb; struct mm_struct; -#define KIOCB_KEY 0 - typedef int (kiocb_cancel_fn)(struct kiocb *); /* prototypes */ diff --git a/include/linux/compat.h b/include/linux/compat.h index 081281ad5772..ad192057b887 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -330,6 +330,7 @@ extern int put_compat_rusage(const struct rusage *, struct compat_rusage __user *); struct compat_siginfo; +struct __compat_aio_sigset; struct compat_dirent { u32 d_ino; @@ -553,6 +554,12 @@ asmlinkage long compat_sys_io_getevents(compat_aio_context_t ctx_id, compat_long_t nr, struct io_event __user *events, struct compat_timespec __user *timeout); +asmlinkage long compat_sys_io_pgetevents(compat_aio_context_t ctx_id, + compat_long_t min_nr, + compat_long_t nr, + struct io_event __user *events, + struct compat_timespec __user *timeout, + const struct __compat_aio_sigset __user *usig); /* fs/cookies.c */ asmlinkage long compat_sys_lookup_dcookie(u32, u32, char __user *, compat_size_t); diff --git a/include/linux/fs.h b/include/linux/fs.h index 7f86730b67a9..d4c37d371da5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1711,6 +1711,8 @@ struct file_operations { int (*iterate) (struct file *, struct dir_context *); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); + struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t); + __poll_t (*poll_mask) (struct file *, __poll_t); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); diff --git a/include/linux/net.h b/include/linux/net.h index 2248a052061d..3fd9d8c16581 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -147,6 +147,7 @@ struct proto_ops { int (*getname) (struct socket *sock, struct sockaddr *addr, int peer); + __poll_t (*poll_mask) (struct socket *sock, __poll_t events); __poll_t (*poll) (struct file *file, struct socket *sock, struct poll_table_struct *wait); int (*ioctl) (struct socket *sock, unsigned int cmd, diff --git a/include/linux/poll.h b/include/linux/poll.h index f45ebd017eaa..fdf86b4cbc71 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -74,6 +74,18 @@ static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc) pt->_key = ~(__poll_t)0; /* all events enabled */ } +static inline bool file_has_poll_mask(struct file *file) +{ + return file->f_op->get_poll_head && file->f_op->poll_mask; +} + +static inline bool file_can_poll(struct file *file) +{ + return file->f_op->poll || file_has_poll_mask(file); +} + +__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt); + struct poll_table_entry { struct file *filp; __poll_t key; @@ -96,8 +108,6 @@ struct poll_wqueues { extern void poll_initwait(struct poll_wqueues *pwq); extern void poll_freewait(struct poll_wqueues *pwq); -extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state, - ktime_t *expires, unsigned long slack); extern u64 select_estimate_accuracy(struct timespec64 *tv); #define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9065477ed255..89198379b39d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3250,8 +3250,7 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags, int *peeked, int *off, int *err); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, int *err); -__poll_t datagram_poll(struct file *file, struct socket *sock, - struct poll_table_struct *wait); +__poll_t datagram_poll_mask(struct socket *sock, __poll_t events); int skb_copy_datagram_iter(const struct sk_buff *from, int offset, struct iov_iter *to, int size); static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset, diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 70fcda1a9049..811172fcb916 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -290,6 +290,12 @@ asmlinkage long sys_io_getevents(aio_context_t ctx_id, long nr, struct io_event __user *events, struct timespec __user *timeout); +asmlinkage long sys_io_pgetevents(aio_context_t ctx_id, + long min_nr, + long nr, + struct io_event __user *events, + struct timespec __user *timeout, + const struct __aio_sigset *sig); /* fs/xattr.c */ asmlinkage long sys_setxattr(const char __user *path, const char __user *name, diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index ec9d6bc65855..53ce8176c313 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -271,7 +271,7 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); int bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); -__poll_t bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait); +__poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events); int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo); int bt_sock_wait_ready(struct sock *sk, unsigned long flags); diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 71c72a939bf8..c5187438af38 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -121,6 +121,21 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) #endif } +static inline void sock_poll_busy_loop(struct socket *sock, __poll_t events) +{ + if (sk_can_busy_loop(sock->sk) && + events && (events & POLL_BUSY_LOOP)) { + /* once, only if requested by syscall */ + sk_busy_loop(sock->sk, 1); + } +} + +/* if this socket can poll_ll, tell the system call */ +static inline __poll_t sock_poll_busy_flag(struct socket *sock) +{ + return sk_can_busy_loop(sock->sk) ? POLL_BUSY_LOOP : 0; +} + /* used in the NIC receive handler to mark the skb */ static inline void skb_mark_napi_id(struct sk_buff *skb, struct napi_struct *napi) diff --git a/include/net/iucv/af_iucv.h b/include/net/iucv/af_iucv.h index f4c21b5a1242..b0eaeb02d46d 100644 --- a/include/net/iucv/af_iucv.h +++ b/include/net/iucv/af_iucv.h @@ -153,8 +153,6 @@ struct iucv_sock_list { atomic_t autobind_name; }; -__poll_t iucv_sock_poll(struct file *file, struct socket *sock, - poll_table *wait); void iucv_sock_link(struct iucv_sock_list *l, struct sock *s); void iucv_sock_unlink(struct iucv_sock_list *l, struct sock *s); void iucv_accept_enqueue(struct sock *parent, struct sock *sk); diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 35498e613ff5..e6d349b2a791 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -109,8 +109,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb); int sctp_inet_listen(struct socket *sock, int backlog); void sctp_write_space(struct sock *sk); void sctp_data_ready(struct sock *sk); -__poll_t sctp_poll(struct file *file, struct socket *sock, - poll_table *wait); +__poll_t sctp_poll_mask(struct socket *sock, __poll_t events); void sctp_sock_rfree(struct sk_buff *skb); void sctp_copy_sock(struct sock *newsk, struct sock *sk, struct sctp_association *asoc); diff --git a/include/net/sock.h b/include/net/sock.h index 74d725fdbe0f..4d2e8ad98985 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1591,8 +1591,6 @@ int sock_no_connect(struct socket *, struct sockaddr *, int, int); int sock_no_socketpair(struct socket *, struct socket *); int sock_no_accept(struct socket *, struct socket *, int, bool); int sock_no_getname(struct socket *, struct sockaddr *, int); -__poll_t sock_no_poll(struct file *, struct socket *, - struct poll_table_struct *); int sock_no_ioctl(struct socket *, unsigned int, unsigned long); int sock_no_listen(struct socket *, int); int sock_no_shutdown(struct socket *, int); diff --git a/include/net/tcp.h b/include/net/tcp.h index 51dc7a26a2fa..f88f8a2cab0d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -388,8 +388,7 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); void tcp_init_transfer(struct sock *sk, int bpf_op); -__poll_t tcp_poll(struct file *file, struct socket *sock, - struct poll_table_struct *wait); +__poll_t tcp_poll_mask(struct socket *sock, __poll_t events); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int tcp_setsockopt(struct sock *sk, int level, int optname, diff --git a/include/net/udp.h b/include/net/udp.h index 621778b80e3d..d8ca3b26964d 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -276,7 +276,7 @@ int udp_init_sock(struct sock *sk); int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int __udp_disconnect(struct sock *sk, int flags); int udp_disconnect(struct sock *sk, int flags); -__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait); +__poll_t udp_poll_mask(struct socket *sock, __poll_t events); struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, bool is_ipv6); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 8bcb186c6f67..42990676a55e 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -732,9 +732,11 @@ __SYSCALL(__NR_pkey_alloc, sys_pkey_alloc) __SYSCALL(__NR_pkey_free, sys_pkey_free) #define __NR_statx 291 __SYSCALL(__NR_statx, sys_statx) +#define __NR_io_pgetevents 292 +__SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents) #undef __NR_syscalls -#define __NR_syscalls 292 +#define __NR_syscalls 293 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h index a04adbc70ddf..ed0185945bb2 100644 --- a/include/uapi/linux/aio_abi.h +++ b/include/uapi/linux/aio_abi.h @@ -29,6 +29,7 @@ #include <linux/types.h> #include <linux/fs.h> +#include <linux/signal.h> #include <asm/byteorder.h> typedef __kernel_ulong_t aio_context_t; @@ -38,10 +39,8 @@ enum { IOCB_CMD_PWRITE = 1, IOCB_CMD_FSYNC = 2, IOCB_CMD_FDSYNC = 3, - /* These two are experimental. - * IOCB_CMD_PREADX = 4, - * IOCB_CMD_POLL = 5, - */ + /* 4 was the experimental IOCB_CMD_PREADX */ + IOCB_CMD_POLL = 5, IOCB_CMD_NOOP = 6, IOCB_CMD_PREADV = 7, IOCB_CMD_PWRITEV = 8, @@ -108,5 +107,10 @@ struct iocb { #undef IFBIG #undef IFLITTLE +struct __aio_sigset { + sigset_t __user *sigmask; + size_t sigsetsize; +}; + #endif /* __LINUX__AIO_ABI_H */ diff --git a/include/uapi/linux/types.h b/include/uapi/linux/types.h index cd4f0b897a48..2fce8b6876e9 100644 --- a/include/uapi/linux/types.h +++ b/include/uapi/linux/types.h @@ -49,11 +49,7 @@ typedef __u32 __bitwise __wsum; #define __aligned_be64 __be64 __attribute__((aligned(8))) #define __aligned_le64 __le64 __attribute__((aligned(8))) -#ifdef __CHECK_POLL typedef unsigned __bitwise __poll_t; -#else -typedef unsigned __poll_t; -#endif #endif /* __ASSEMBLY__ */ #endif /* _UAPI_LINUX_TYPES_H */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 9791364925dc..183169c2a75b 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -43,7 +43,9 @@ COND_SYSCALL(io_submit); COND_SYSCALL_COMPAT(io_submit); COND_SYSCALL(io_cancel); COND_SYSCALL(io_getevents); +COND_SYSCALL(io_pgetevents); COND_SYSCALL_COMPAT(io_getevents); +COND_SYSCALL_COMPAT(io_pgetevents); /* fs/xattr.c */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2bd3df3d101a..1695f38630f1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3849,7 +3849,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, if (ret) goto out_put_css; - efile.file->f_op->poll(efile.file, &event->pt); + vfs_poll(efile.file, &event->pt); spin_lock(&memcg->event_list_lock); list_add(&event->list, &memcg->event_list); diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 848969fe7979..588bf88c3305 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -231,7 +231,7 @@ static void p9_conn_cancel(struct p9_conn *m, int err) static __poll_t p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt, int *err) { - __poll_t ret, n; + __poll_t ret; struct p9_trans_fd *ts = NULL; if (client && client->status == Connected) @@ -243,19 +243,9 @@ p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt, int *err) return EPOLLERR; } - if (!ts->rd->f_op->poll) - ret = DEFAULT_POLLMASK; - else - ret = ts->rd->f_op->poll(ts->rd, pt); - - if (ts->rd != ts->wr) { - if (!ts->wr->f_op->poll) - n = DEFAULT_POLLMASK; - else - n = ts->wr->f_op->poll(ts->wr, pt); - ret = (ret & ~EPOLLOUT) | (n & ~EPOLLIN); - } - + ret = vfs_poll(ts->rd, pt); + if (ts->rd != ts->wr) + ret = (ret & ~EPOLLOUT) | (vfs_poll(ts->wr, pt) & ~EPOLLIN); return ret; } diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 9b6bc5abe946..55fdba05d7d9 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1869,7 +1869,7 @@ static const struct proto_ops atalk_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = atalk_getname, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .ioctl = atalk_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = atalk_compat_ioctl, diff --git a/net/atm/common.c b/net/atm/common.c index fc78a0508ae1..1f2af59935db 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -648,16 +648,11 @@ out: return error; } -__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait) +__poll_t vcc_poll_mask(struct socket *sock, __poll_t events) { struct sock *sk = sock->sk; - struct atm_vcc *vcc; - __poll_t mask; - - sock_poll_wait(file, sk_sleep(sk), wait); - mask = 0; - - vcc = ATM_SD(sock); + struct atm_vcc *vcc = ATM_SD(sock); + __poll_t mask = 0; /* exceptional events */ if (sk->sk_err) diff --git a/net/atm/common.h b/net/atm/common.h index 5850649068bb..526796ad230f 100644 --- a/net/atm/common.h +++ b/net/atm/common.h @@ -17,7 +17,7 @@ int vcc_connect(struct socket *sock, int itf, short vpi, int vci); int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len); -__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait); +__poll_t vcc_poll_mask(struct socket *sock, __poll_t events); int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int vcc_setsockopt(struct socket *sock, int level, int optname, diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 2cb10af16afc..9f75092fe778 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -113,7 +113,7 @@ static const struct proto_ops pvc_proto_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pvc_getname, - .poll = vcc_poll, + .poll_mask = vcc_poll_mask, .ioctl = vcc_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = vcc_compat_ioctl, diff --git a/net/atm/svc.c b/net/atm/svc.c index 2f91b766ac42..53f4ad7087b1 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -636,7 +636,7 @@ static const struct proto_ops svc_proto_ops = { .socketpair = sock_no_socketpair, .accept = svc_accept, .getname = svc_getname, - .poll = vcc_poll, + .poll_mask = vcc_poll_mask, .ioctl = svc_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = svc_compat_ioctl, diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index c603d33d5410..d1d2442ce573 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1941,7 +1941,7 @@ static const struct proto_ops ax25_proto_ops = { .socketpair = sock_no_socketpair, .accept = ax25_accept, .getname = ax25_getname, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .ioctl = ax25_ioctl, .listen = ax25_listen, .shutdown = ax25_shutdown, diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 3264e1873219..510ab4f55df5 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -437,16 +437,13 @@ static inline __poll_t bt_accept_poll(struct sock *parent) return 0; } -__poll_t bt_sock_poll(struct file *file, struct socket *sock, - poll_table *wait) +__poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events) { struct sock *sk = sock->sk; __poll_t mask = 0; BT_DBG("sock %p, sk %p", sock, sk); - poll_wait(file, sk_sleep(sk), wait); - if (sk->sk_state == BT_LISTEN) return bt_accept_poll(sk); @@ -478,7 +475,7 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock, return mask; } -EXPORT_SYMBOL(bt_sock_poll); +EXPORT_SYMBOL(bt_sock_poll_mask); int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c index b5116fa9835e..00deacdcb51c 100644 --- a/net/bluetooth/bnep/sock.c +++ b/net/bluetooth/bnep/sock.c @@ -175,7 +175,6 @@ static const struct proto_ops bnep_sock_ops = { .getname = sock_no_getname, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, - .poll = sock_no_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c index ce86a7bae844..e08f28fadd65 100644 --- a/net/bluetooth/cmtp/sock.c +++ b/net/bluetooth/cmtp/sock.c @@ -178,7 +178,6 @@ static const struct proto_ops cmtp_sock_ops = { .getname = sock_no_getname, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, - .poll = sock_no_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 1506e1632394..d6c099861538 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1975,7 +1975,7 @@ static const struct proto_ops hci_sock_ops = { .sendmsg = hci_sock_sendmsg, .recvmsg = hci_sock_recvmsg, .ioctl = hci_sock_ioctl, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = hci_sock_setsockopt, diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index 008ba439bd62..1eaac01f85de 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -208,7 +208,6 @@ static const struct proto_ops hidp_sock_ops = { .getname = sock_no_getname, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, - .poll = sock_no_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 686bdc6b35b0..742a190034e6 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1653,7 +1653,7 @@ static const struct proto_ops l2cap_sock_ops = { .getname = l2cap_sock_getname, .sendmsg = l2cap_sock_sendmsg, .recvmsg = l2cap_sock_recvmsg, - .poll = bt_sock_poll, + .poll_mask = bt_sock_poll_mask, .ioctl = bt_sock_ioctl, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index d606e9212291..1cf57622473a 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -1049,7 +1049,7 @@ static const struct proto_ops rfcomm_sock_ops = { .setsockopt = rfcomm_sock_setsockopt, .getsockopt = rfcomm_sock_getsockopt, .ioctl = rfcomm_sock_ioctl, - .poll = bt_sock_poll, + .poll_mask = bt_sock_poll_mask, .socketpair = sock_no_socketpair, .mmap = sock_no_mmap }; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 413b8ee49fec..d60dbc61d170 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -1197,7 +1197,7 @@ static const struct proto_ops sco_sock_ops = { .getname = sco_sock_getname, .sendmsg = sco_sock_sendmsg, .recvmsg = sco_sock_recvmsg, - .poll = bt_sock_poll, + .poll_mask = bt_sock_poll_mask, .ioctl = bt_sock_ioctl, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index a6fb1b3bcad9..c7991867d622 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -934,15 +934,11 @@ static int caif_release(struct socket *sock) } /* Copied from af_unix.c:unix_poll(), added CAIF tx_flow handling */ -static __poll_t caif_poll(struct file *file, - struct socket *sock, poll_table *wait) +static __poll_t caif_poll_mask(struct socket *sock, __poll_t events) { struct sock *sk = sock->sk; - __poll_t mask; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); - - sock_poll_wait(file, sk_sleep(sk), wait); - mask = 0; + __poll_t mask = 0; /* exceptional events? */ if (sk->sk_err) @@ -976,7 +972,7 @@ static const struct proto_ops caif_seqpacket_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll = caif_poll, + .poll_mask = caif_poll_mask, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -997,7 +993,7 @@ static const struct proto_ops caif_stream_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll = caif_poll, + .poll_mask = caif_poll_mask, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/can/bcm.c b/net/can/bcm.c index 6ad89f49b341..97fedff3f0c4 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1657,7 +1657,7 @@ static const struct proto_ops bcm_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/can/raw.c b/net/can/raw.c index 1051eee82581..fd7e2f49ea6a 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -843,7 +843,7 @@ static const struct proto_ops raw_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = raw_getname, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/core/datagram.c b/net/core/datagram.c index 9938952c5c78..f19bf3dc2bd6 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -819,9 +819,8 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); /** * datagram_poll - generic datagram poll - * @file: file struct * @sock: socket - * @wait: poll table + * @events to wait for * * Datagram poll: Again totally generic. This also handles * sequenced packet sockets providing the socket receive queue @@ -831,14 +830,10 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); * and you use a different write policy from sock_writeable() * then please supply your own write_space callback. */ -__poll_t datagram_poll(struct file *file, struct socket *sock, - poll_table *wait) +__poll_t datagram_poll_mask(struct socket *sock, __poll_t events) { struct sock *sk = sock->sk; - __poll_t mask; - - sock_poll_wait(file, sk_sleep(sk), wait); - mask = 0; + __poll_t mask = 0; /* exceptional events? */ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) @@ -871,4 +866,4 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, return mask; } -EXPORT_SYMBOL(datagram_poll); +EXPORT_SYMBOL(datagram_poll_mask); diff --git a/net/core/sock.c b/net/core/sock.c index 815770333d91..2aed99a541d5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2567,12 +2567,6 @@ int sock_no_getname(struct socket *sock, struct sockaddr *saddr, } EXPORT_SYMBOL(sock_no_getname); -__poll_t sock_no_poll(struct file *file, struct socket *sock, poll_table *pt) -{ - return 0; -} -EXPORT_SYMBOL(sock_no_poll); - int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return -EOPNOTSUPP; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index f91e3816806b..0ea2ee56ac1b 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -316,8 +316,7 @@ int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); void dccp_shutdown(struct sock *sk, int how); int inet_dccp_listen(struct socket *sock, int backlog); -__poll_t dccp_poll(struct file *file, struct socket *sock, - poll_table *wait); +__poll_t dccp_poll_mask(struct socket *sock, __poll_t events); int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); void dccp_req_err(struct sock *sk, u64 seq); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index b08feb219b44..a9e478cd3787 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -984,7 +984,7 @@ static const struct proto_ops inet_dccp_ops = { .accept = inet_accept, .getname = inet_getname, /* FIXME: work on tcp_poll to rename it to inet_csk_poll */ - .poll = dccp_poll, + .poll_mask = dccp_poll_mask, .ioctl = inet_ioctl, /* FIXME: work on inet_listen to rename it to sock_common_listen */ .listen = inet_dccp_listen, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 6344f1b18a6a..17fc4e0166ba 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1070,7 +1070,7 @@ static const struct proto_ops inet6_dccp_ops = { .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet6_getname, - .poll = dccp_poll, + .poll_mask = dccp_poll_mask, .ioctl = inet6_ioctl, .listen = inet_dccp_listen, .shutdown = inet_shutdown, diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 0d56e36a6db7..ca21c1c76da0 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -312,20 +312,11 @@ int dccp_disconnect(struct sock *sk, int flags) EXPORT_SYMBOL_GPL(dccp_disconnect); -/* - * Wait for a DCCP event. - * - * Note that we don't need to lock the socket, as the upper poll layers - * take care of normal races (between the test and the event) and we don't - * go look at any of the socket buffers directly. - */ -__poll_t dccp_poll(struct file *file, struct socket *sock, - poll_table *wait) +__poll_t dccp_poll_mask(struct socket *sock, __poll_t events) { __poll_t mask; struct sock *sk = sock->sk; - sock_poll_wait(file, sk_sleep(sk), wait); if (sk->sk_state == DCCP_LISTEN) return inet_csk_listen_poll(sk); @@ -367,7 +358,7 @@ __poll_t dccp_poll(struct file *file, struct socket *sock, return mask; } -EXPORT_SYMBOL_GPL(dccp_poll); +EXPORT_SYMBOL_GPL(dccp_poll_mask); int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) { diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 7d6ff983ba2c..9a686d890bfa 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1207,11 +1207,11 @@ static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int peer) } -static __poll_t dn_poll(struct file *file, struct socket *sock, poll_table *wait) +static __poll_t dn_poll_mask(struct socket *sock, __poll_t events) { struct sock *sk = sock->sk; struct dn_scp *scp = DN_SK(sk); - __poll_t mask = datagram_poll(file, sock, wait); + __poll_t mask = datagram_poll_mask(sock, events); if (!skb_queue_empty(&scp->other_receive_queue)) mask |= EPOLLRDBAND; @@ -2331,7 +2331,7 @@ static const struct proto_ops dn_proto_ops = { .socketpair = sock_no_socketpair, .accept = dn_accept, .getname = dn_getname, - .poll = dn_poll, + .poll_mask = dn_poll_mask, .ioctl = dn_ioctl, .listen = dn_listen, .shutdown = dn_shutdown, diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index a60658c85a9a..a0768d2759b8 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -423,7 +423,7 @@ static const struct proto_ops ieee802154_raw_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .ioctl = ieee802154_sock_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -969,7 +969,7 @@ static const struct proto_ops ieee802154_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .ioctl = ieee802154_sock_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index eaed0367e669..8a59428e63ab 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -986,7 +986,7 @@ const struct proto_ops inet_stream_ops = { .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet_getname, - .poll = tcp_poll, + .poll_mask = tcp_poll_mask, .ioctl = inet_ioctl, .listen = inet_listen, .shutdown = inet_shutdown, @@ -1018,7 +1018,7 @@ const struct proto_ops inet_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = inet_getname, - .poll = udp_poll, + .poll_mask = udp_poll_mask, .ioctl = inet_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, @@ -1039,7 +1039,7 @@ EXPORT_SYMBOL(inet_dgram_ops); /* * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without - * udp_poll + * udp_poll_mask */ static const struct proto_ops inet_sockraw_ops = { .family = PF_INET, @@ -1050,7 +1050,7 @@ static const struct proto_ops inet_sockraw_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = inet_getname, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .ioctl = inet_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c9d00ef54dec..dec47e6789e7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -494,32 +494,21 @@ static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, } /* - * Wait for a TCP event. - * - * Note that we don't need to lock the socket, as the upper poll layers - * take care of normal races (between the test and the event) and we don't - * go look at any of the socket buffers directly. + * Socket is not locked. We are protected from async events by poll logic and + * correct handling of state changes made by other threads is impossible in + * any case. */ -__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) +__poll_t tcp_poll_mask(struct socket *sock, __poll_t events) { - __poll_t mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); + __poll_t mask = 0; int state; - sock_poll_wait(file, sk_sleep(sk), wait); - state = inet_sk_state_load(sk); if (state == TCP_LISTEN) return inet_csk_listen_poll(sk); - /* Socket is not locked. We are protected from async events - * by poll logic and correct handling of state changes - * made by other threads is impossible in any case. - */ - - mask = 0; - /* * EPOLLHUP is certainly not done right. But poll() doesn't * have a notion of HUP in just one direction, and for a @@ -600,7 +589,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) return mask; } -EXPORT_SYMBOL(tcp_poll); +EXPORT_SYMBOL(tcp_poll_mask); int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 051a43ff3fb8..675433eb53a8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2501,7 +2501,7 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname, * udp_poll - wait for a UDP event. * @file - file struct * @sock - socket - * @wait - poll table + * @events - events to wait for * * This is same as datagram poll, except for the special case of * blocking sockets. If application is using a blocking fd @@ -2510,23 +2510,23 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname, * but then block when reading it. Add special case code * to work around these arguably broken applications. */ -__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait) +__poll_t udp_poll_mask(struct socket *sock, __poll_t events) { - __poll_t mask = datagram_poll(file, sock, wait); + __poll_t mask = datagram_poll_mask(sock, events); struct sock *sk = sock->sk; if (!skb_queue_empty(&udp_sk(sk)->reader_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Check for false positives due to checksum errors */ - if ((mask & EPOLLRDNORM) && !(file->f_flags & O_NONBLOCK) && + if ((mask & EPOLLRDNORM) && !(sock->file->f_flags & O_NONBLOCK) && !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1) mask &= ~(EPOLLIN | EPOLLRDNORM); return mask; } -EXPORT_SYMBOL(udp_poll); +EXPORT_SYMBOL(udp_poll_mask); int udp_abort(struct sock *sk, int err) { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 8da0b513f188..d443c18b45fe 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -571,7 +571,7 @@ const struct proto_ops inet6_stream_ops = { .socketpair = sock_no_socketpair, /* a do nothing */ .accept = inet_accept, /* ok */ .getname = inet6_getname, - .poll = tcp_poll, /* ok */ + .poll_mask = tcp_poll_mask, /* ok */ .ioctl = inet6_ioctl, /* must change */ .listen = inet_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ @@ -601,7 +601,7 @@ const struct proto_ops inet6_dgram_ops = { .socketpair = sock_no_socketpair, /* a do nothing */ .accept = sock_no_accept, /* a do nothing */ .getname = inet6_getname, - .poll = udp_poll, /* ok */ + .poll_mask = udp_poll_mask, /* ok */ .ioctl = inet6_ioctl, /* must change */ .listen = sock_no_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index afc307c89d1a..ce6f0d15b5dd 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1334,7 +1334,7 @@ void raw6_proc_exit(void) } #endif /* CONFIG_PROC_FS */ -/* Same as inet6_dgram_ops, sans udp_poll. */ +/* Same as inet6_dgram_ops, sans udp_poll_mask. */ const struct proto_ops inet6_sockraw_ops = { .family = PF_INET6, .owner = THIS_MODULE, @@ -1344,7 +1344,7 @@ const struct proto_ops inet6_sockraw_ops = { .socketpair = sock_no_socketpair, /* a do nothing */ .accept = sock_no_accept, /* a do nothing */ .getname = inet6_getname, - .poll = datagram_poll, /* ok */ + .poll_mask = datagram_poll_mask, /* ok */ .ioctl = inet6_ioctl, /* must change */ .listen = sock_no_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 893a022f9620..68e86257a549 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1488,14 +1488,11 @@ static inline __poll_t iucv_accept_poll(struct sock *parent) return 0; } -__poll_t iucv_sock_poll(struct file *file, struct socket *sock, - poll_table *wait) +static __poll_t iucv_sock_poll_mask(struct socket *sock, __poll_t events) { struct sock *sk = sock->sk; __poll_t mask = 0; - sock_poll_wait(file, sk_sleep(sk), wait); - if (sk->sk_state == IUCV_LISTEN) return iucv_accept_poll(sk); @@ -2388,7 +2385,7 @@ static const struct proto_ops iucv_sock_ops = { .getname = iucv_sock_getname, .sendmsg = iucv_sock_sendmsg, .recvmsg = iucv_sock_recvmsg, - .poll = iucv_sock_poll, + .poll_mask = iucv_sock_poll_mask, .ioctl = sock_no_ioctl, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index d3601d421571..84b7d5c6fec8 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1336,9 +1336,9 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux) struct list_head *head; int index = 0; - /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so - * we set sk_state, otherwise epoll_wait always returns right away with - * EPOLLHUP + /* For SOCK_SEQPACKET sock type, datagram_poll_mask checks the sk_state, + * so we set sk_state, otherwise epoll_wait always returns right away + * with EPOLLHUP */ kcm->sk.sk_state = TCP_ESTABLISHED; @@ -1903,7 +1903,7 @@ static const struct proto_ops kcm_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .ioctl = kcm_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -1924,7 +1924,7 @@ static const struct proto_ops kcm_seqpacket_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .ioctl = kcm_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/key/af_key.c b/net/key/af_key.c index 5e1d2946ffbf..8bdc1cbe490a 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3751,7 +3751,7 @@ static const struct proto_ops pfkey_ops = { /* Now the operations that really occur. */ .release = pfkey_release, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .sendmsg = pfkey_sendmsg, .recvmsg = pfkey_recvmsg, }; diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index a9c05b2bc1b0..181073bf6925 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -613,7 +613,7 @@ static const struct proto_ops l2tp_ip_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = l2tp_ip_getname, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .ioctl = inet_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 957369192ca1..336e4c00abbc 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -754,7 +754,7 @@ static const struct proto_ops l2tp_ip6_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = l2tp_ip6_getname, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .ioctl = inet6_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 830469766c1f..3d8ca1231f8f 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1788,7 +1788,7 @@ static const struct proto_ops pppol2tp_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pppol2tp_getname, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = pppol2tp_setsockopt, diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 1beeea9549fa..804de8490186 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -1192,7 +1192,7 @@ static const struct proto_ops llc_ui_ops = { .socketpair = sock_no_socketpair, .accept = llc_ui_accept, .getname = llc_ui_getname, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .ioctl = llc_ui_ioctl, .listen = llc_ui_listen, .shutdown = llc_ui_shutdown, diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 393573a99a5a..1189b84413d5 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2658,7 +2658,7 @@ static const struct proto_ops netlink_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = netlink_getname, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .ioctl = netlink_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index c2888c78d4c1..b97eb766a1d5 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1355,7 +1355,7 @@ static const struct proto_ops nr_proto_ops = { .socketpair = sock_no_socketpair, .accept = nr_accept, .getname = nr_getname, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .ioctl = nr_ioctl, .listen = nr_listen, .shutdown = sock_no_shutdown, diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index ea0c0c6f1874..ab5bb14b49af 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -548,16 +548,13 @@ static inline __poll_t llcp_accept_poll(struct sock *parent) return 0; } -static __poll_t llcp_sock_poll(struct file *file, struct socket *sock, - poll_table *wait) +static __poll_t llcp_sock_poll_mask(struct socket *sock, __poll_t events) { struct sock *sk = sock->sk; __poll_t mask = 0; pr_debug("%p\n", sk); - sock_poll_wait(file, sk_sleep(sk), wait); - if (sk->sk_state == LLCP_LISTEN) return llcp_accept_poll(sk); @@ -899,7 +896,7 @@ static const struct proto_ops llcp_sock_ops = { .socketpair = sock_no_socketpair, .accept = llcp_sock_accept, .getname = llcp_sock_getname, - .poll = llcp_sock_poll, + .poll_mask = llcp_sock_poll_mask, .ioctl = sock_no_ioctl, .listen = llcp_sock_listen, .shutdown = sock_no_shutdown, @@ -919,7 +916,7 @@ static const struct proto_ops llcp_rawsock_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = llcp_sock_getname, - .poll = llcp_sock_poll, + .poll_mask = llcp_sock_poll_mask, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index e2188deb08dc..60c322531c49 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -284,7 +284,7 @@ static const struct proto_ops rawsock_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -304,7 +304,7 @@ static const struct proto_ops rawsock_raw_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index f9cdd27a7f6f..674390b1f084 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4110,12 +4110,11 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd, return 0; } -static __poll_t packet_poll(struct file *file, struct socket *sock, - poll_table *wait) +static __poll_t packet_poll_mask(struct socket *sock, __poll_t events) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); - __poll_t mask = datagram_poll(file, sock, wait); + __poll_t mask = datagram_poll_mask(sock, events); spin_lock_bh(&sk->sk_receive_queue.lock); if (po->rx_ring.pg_vec) { @@ -4457,7 +4456,7 @@ static const struct proto_ops packet_ops_spkt = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname_spkt, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .ioctl = packet_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -4478,7 +4477,7 @@ static const struct proto_ops packet_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname, - .poll = packet_poll, + .poll_mask = packet_poll_mask, .ioctl = packet_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 30187990257f..c295c4e20f01 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -340,15 +340,12 @@ static int pn_socket_getname(struct socket *sock, struct sockaddr *addr, return sizeof(struct sockaddr_pn); } -static __poll_t pn_socket_poll(struct file *file, struct socket *sock, - poll_table *wait) +static __poll_t pn_socket_poll_mask(struct socket *sock, __poll_t events) { struct sock *sk = sock->sk; struct pep_sock *pn = pep_sk(sk); __poll_t mask = 0; - poll_wait(file, sk_sleep(sk), wait); - if (sk->sk_state == TCP_CLOSE) return EPOLLERR; if (!skb_queue_empty(&sk->sk_receive_queue)) @@ -448,7 +445,7 @@ const struct proto_ops phonet_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pn_socket_getname, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .ioctl = pn_socket_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -473,7 +470,7 @@ const struct proto_ops phonet_stream_ops = { .socketpair = sock_no_socketpair, .accept = pn_socket_accept, .getname = pn_socket_getname, - .poll = pn_socket_poll, + .poll_mask = pn_socket_poll_mask, .ioctl = pn_socket_ioctl, .listen = pn_socket_listen, .shutdown = sock_no_shutdown, diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 2aa07b547b16..1b5025ea5b04 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -1023,7 +1023,7 @@ static const struct proto_ops qrtr_proto_ops = { .recvmsg = qrtr_recvmsg, .getname = qrtr_getname, .ioctl = qrtr_ioctl, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, .getsockopt = sock_no_getsockopt, diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 22a7f2b413ac..5b73fea849df 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1470,7 +1470,7 @@ static const struct proto_ops rose_proto_ops = { .socketpair = sock_no_socketpair, .accept = rose_accept, .getname = rose_getname, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .ioctl = rose_ioctl, .listen = rose_listen, .shutdown = sock_no_shutdown, diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 2b463047dd7b..3b1ac93efee2 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -734,15 +734,11 @@ static int rxrpc_getsockopt(struct socket *sock, int level, int optname, /* * permit an RxRPC socket to be polled */ -static __poll_t rxrpc_poll(struct file *file, struct socket *sock, - poll_table *wait) +static __poll_t rxrpc_poll_mask(struct socket *sock, __poll_t events) { struct sock *sk = sock->sk; struct rxrpc_sock *rx = rxrpc_sk(sk); - __poll_t mask; - - sock_poll_wait(file, sk_sleep(sk), wait); - mask = 0; + __poll_t mask = 0; /* the socket is readable if there are any messages waiting on the Rx * queue */ @@ -949,7 +945,7 @@ static const struct proto_ops rxrpc_rpc_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll = rxrpc_poll, + .poll_mask = rxrpc_poll_mask, .ioctl = sock_no_ioctl, .listen = rxrpc_listen, .shutdown = rxrpc_shutdown, diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 0cd2e764f47f..7339918a805d 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -1010,7 +1010,7 @@ static const struct proto_ops inet6_seqpacket_ops = { .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = sctp_getname, - .poll = sctp_poll, + .poll_mask = sctp_poll_mask, .ioctl = inet6_ioctl, .listen = sctp_inet_listen, .shutdown = inet_shutdown, diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 6bf0a9971888..11d93377ba5e 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1016,7 +1016,7 @@ static const struct proto_ops inet_seqpacket_ops = { .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet_getname, /* Semantics are different. */ - .poll = sctp_poll, + .poll_mask = sctp_poll_mask, .ioctl = inet_ioctl, .listen = sctp_inet_listen, .shutdown = inet_shutdown, /* Looks harmless. */ diff --git a/net/sctp/socket.c b/net/sctp/socket.c index ae7e7c606f72..bf747094d26b 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -7722,14 +7722,12 @@ out: * here, again, by modeling the current TCP/UDP code. We don't have * a good way to test with it yet. */ -__poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait) +__poll_t sctp_poll_mask(struct socket *sock, __poll_t events) { struct sock *sk = sock->sk; struct sctp_sock *sp = sctp_sk(sk); __poll_t mask; - poll_wait(file, sk_sleep(sk), wait); - sock_rps_record_flow(sk); /* A TCP-style listening socket becomes readable when the accept queue diff --git a/net/socket.c b/net/socket.c index f10f1d947c78..2d752e9eb3f9 100644 --- a/net/socket.c +++ b/net/socket.c @@ -117,8 +117,10 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from); static int sock_mmap(struct file *file, struct vm_area_struct *vma); static int sock_close(struct inode *inode, struct file *file); -static __poll_t sock_poll(struct file *file, - struct poll_table_struct *wait); +static struct wait_queue_head *sock_get_poll_head(struct file *file, + __poll_t events); +static __poll_t sock_poll_mask(struct file *file, __poll_t); +static __poll_t sock_poll(struct file *file, struct poll_table_struct *wait); static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT static long compat_sock_ioctl(struct file *file, @@ -141,6 +143,8 @@ static const struct file_operations socket_file_ops = { .llseek = no_llseek, .read_iter = sock_read_iter, .write_iter = sock_write_iter, + .get_poll_head = sock_get_poll_head, + .poll_mask = sock_poll_mask, .poll = sock_poll, .unlocked_ioctl = sock_ioctl, #ifdef CONFIG_COMPAT @@ -1114,27 +1118,48 @@ out_release: } EXPORT_SYMBOL(sock_create_lite); -/* No kernel lock held - perfect */ -static __poll_t sock_poll(struct file *file, poll_table *wait) +static struct wait_queue_head *sock_get_poll_head(struct file *file, + __poll_t events) { - __poll_t busy_flag = 0; - struct socket *sock; + struct socket *sock = file->private_data; + + if (!sock->ops->poll_mask) + return NULL; + sock_poll_busy_loop(sock, events); + return sk_sleep(sock->sk); +} + +static __poll_t sock_poll_mask(struct file *file, __poll_t events) +{ + struct socket *sock = file->private_data; /* - * We can't return errors to poll, so it's either yes or no. + * We need to be sure we are in sync with the socket flags modification. + * + * This memory barrier is paired in the wq_has_sleeper. */ - sock = file->private_data; + smp_mb(); + + /* this socket can poll_ll so tell the system call */ + return sock->ops->poll_mask(sock, events) | + (sk_can_busy_loop(sock->sk) ? POLL_BUSY_LOOP : 0); +} - if (sk_can_busy_loop(sock->sk)) { - /* this socket can poll_ll so tell the system call */ - busy_flag = POLL_BUSY_LOOP; +/* No kernel lock held - perfect */ +static __poll_t sock_poll(struct file *file, poll_table *wait) +{ + struct socket *sock = file->private_data; + __poll_t events = poll_requested_events(wait), mask = 0; - /* once, only if requested by syscall */ - if (wait && (wait->_key & POLL_BUSY_LOOP)) - sk_busy_loop(sock->sk, 1); + if (sock->ops->poll) { + sock_poll_busy_loop(sock, events); + mask = sock->ops->poll(file, sock, wait); + } else if (sock->ops->poll_mask) { + sock_poll_wait(file, sock_get_poll_head(file, events), wait); + mask = sock->ops->poll_mask(sock, events); } - return busy_flag | sock->ops->poll(file, sock, wait); + return mask | sock_poll_busy_flag(sock); } static int sock_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 6be21575503a..3bb45042e833 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -692,10 +692,9 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, } /** - * tipc_poll - read and possibly block on pollmask + * tipc_poll - read pollmask * @file: file structure associated with the socket * @sock: socket for which to calculate the poll bits - * @wait: ??? * * Returns pollmask value * @@ -709,15 +708,12 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, * imply that the operation will succeed, merely that it should be performed * and will not block. */ -static __poll_t tipc_poll(struct file *file, struct socket *sock, - poll_table *wait) +static __poll_t tipc_poll_mask(struct socket *sock, __poll_t events) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); __poll_t revents = 0; - sock_poll_wait(file, sk_sleep(sk), wait); - if (sk->sk_shutdown & RCV_SHUTDOWN) revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) @@ -3028,7 +3024,7 @@ static const struct proto_ops msg_ops = { .socketpair = tipc_socketpair, .accept = sock_no_accept, .getname = tipc_getname, - .poll = tipc_poll, + .poll_mask = tipc_poll_mask, .ioctl = tipc_ioctl, .listen = sock_no_listen, .shutdown = tipc_shutdown, @@ -3049,7 +3045,7 @@ static const struct proto_ops packet_ops = { .socketpair = tipc_socketpair, .accept = tipc_accept, .getname = tipc_getname, - .poll = tipc_poll, + .poll_mask = tipc_poll_mask, .ioctl = tipc_ioctl, .listen = tipc_listen, .shutdown = tipc_shutdown, @@ -3070,7 +3066,7 @@ static const struct proto_ops stream_ops = { .socketpair = tipc_socketpair, .accept = tipc_accept, .getname = tipc_getname, - .poll = tipc_poll, + .poll_mask = tipc_poll_mask, .ioctl = tipc_ioctl, .listen = tipc_listen, .shutdown = tipc_shutdown, diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e5473c03d667..95b02a71fd47 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -638,9 +638,8 @@ static int unix_stream_connect(struct socket *, struct sockaddr *, static int unix_socketpair(struct socket *, struct socket *); static int unix_accept(struct socket *, struct socket *, int, bool); static int unix_getname(struct socket *, struct sockaddr *, int); -static __poll_t unix_poll(struct file *, struct socket *, poll_table *); -static __poll_t unix_dgram_poll(struct file *, struct socket *, - poll_table *); +static __poll_t unix_poll_mask(struct socket *, __poll_t); +static __poll_t unix_dgram_poll_mask(struct socket *, __poll_t); static int unix_ioctl(struct socket *, unsigned int, unsigned long); static int unix_shutdown(struct socket *, int); static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); @@ -681,7 +680,7 @@ static const struct proto_ops unix_stream_ops = { .socketpair = unix_socketpair, .accept = unix_accept, .getname = unix_getname, - .poll = unix_poll, + .poll_mask = unix_poll_mask, .ioctl = unix_ioctl, .listen = unix_listen, .shutdown = unix_shutdown, @@ -704,7 +703,7 @@ static const struct proto_ops unix_dgram_ops = { .socketpair = unix_socketpair, .accept = sock_no_accept, .getname = unix_getname, - .poll = unix_dgram_poll, + .poll_mask = unix_dgram_poll_mask, .ioctl = unix_ioctl, .listen = sock_no_listen, .shutdown = unix_shutdown, @@ -726,7 +725,7 @@ static const struct proto_ops unix_seqpacket_ops = { .socketpair = unix_socketpair, .accept = unix_accept, .getname = unix_getname, - .poll = unix_dgram_poll, + .poll_mask = unix_dgram_poll_mask, .ioctl = unix_ioctl, .listen = unix_listen, .shutdown = unix_shutdown, @@ -2630,13 +2629,10 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return err; } -static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) +static __poll_t unix_poll_mask(struct socket *sock, __poll_t events) { struct sock *sk = sock->sk; - __poll_t mask; - - sock_poll_wait(file, sk_sleep(sk), wait); - mask = 0; + __poll_t mask = 0; /* exceptional events? */ if (sk->sk_err) @@ -2665,15 +2661,11 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa return mask; } -static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, - poll_table *wait) +static __poll_t unix_dgram_poll_mask(struct socket *sock, __poll_t events) { struct sock *sk = sock->sk, *other; - unsigned int writable; - __poll_t mask; - - sock_poll_wait(file, sk_sleep(sk), wait); - mask = 0; + int writable; + __poll_t mask = 0; /* exceptional events? */ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) @@ -2699,7 +2691,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, } /* No write status requested, avoid expensive OUT tests. */ - if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) + if (!(events & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) return mask; writable = unix_writable(sk); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index c1076c19b858..bb5d5fa68c35 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -850,18 +850,11 @@ static int vsock_shutdown(struct socket *sock, int mode) return err; } -static __poll_t vsock_poll(struct file *file, struct socket *sock, - poll_table *wait) +static __poll_t vsock_poll_mask(struct socket *sock, __poll_t events) { - struct sock *sk; - __poll_t mask; - struct vsock_sock *vsk; - - sk = sock->sk; - vsk = vsock_sk(sk); - - poll_wait(file, sk_sleep(sk), wait); - mask = 0; + struct sock *sk = sock->sk; + struct vsock_sock *vsk = vsock_sk(sk); + __poll_t mask = 0; if (sk->sk_err) /* Signify that there has been an error on this socket. */ @@ -1091,7 +1084,7 @@ static const struct proto_ops vsock_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = vsock_getname, - .poll = vsock_poll, + .poll_mask = vsock_poll_mask, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = vsock_shutdown, @@ -1849,7 +1842,7 @@ static const struct proto_ops vsock_stream_ops = { .socketpair = sock_no_socketpair, .accept = vsock_accept, .getname = vsock_getname, - .poll = vsock_poll, + .poll_mask = vsock_poll_mask, .ioctl = sock_no_ioctl, .listen = vsock_listen, .shutdown = vsock_shutdown, diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index d49aa79b7997..f93365ae0fdd 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1750,7 +1750,7 @@ static const struct proto_ops x25_proto_ops = { .socketpair = sock_no_socketpair, .accept = x25_accept, .getname = x25_getname, - .poll = datagram_poll, + .poll_mask = datagram_poll_mask, .ioctl = x25_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_x25_ioctl, diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 6e865e8b5b10..90d30fbe95ae 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -397,7 +397,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) * Check if there was an event already pending on the eventfd * before we registered, and trigger it as if we didn't miss it. */ - events = f.file->f_op->poll(f.file, &irqfd->pt); + events = vfs_poll(f.file, &irqfd->pt); if (events & EPOLLIN) schedule_work(&irqfd->inject); |