From 7c8f4e6dc30996bff806285730a0bb4e714d3d52 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 26 Jun 2018 01:39:32 +0200 Subject: fib_rules: match rules based on suppress_* properties too Two rules with different values of suppress_prefix or suppress_ifgroup are not the same. This fixes an -EEXIST when running: $ ip -4 rule add table main suppress_prefixlength 0 Signed-off-by: Jason A. Donenfeld Fixes: f9d4b0c1e969 ("fib_rules: move common handling of newrule delrule msgs into fib_nl2rule") Signed-off-by: David S. Miller --- net/core/fib_rules.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 126ffc5bc630..bc8425d81022 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -416,6 +416,14 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops, if (rule->mark && r->mark != rule->mark) continue; + if (rule->suppress_ifgroup != -1 && + r->suppress_ifgroup != rule->suppress_ifgroup) + continue; + + if (rule->suppress_prefixlen != -1 && + r->suppress_prefixlen != rule->suppress_prefixlen) + continue; + if (rule->mark_mask && r->mark_mask != rule->mark_mask) continue; -- cgit v1.2.3 From a11e1d432b51f63ba698d044441284a661f01144 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 28 Jun 2018 09:43:44 -0700 Subject: Revert changes to convert to ->poll_mask() and aio IOCB_CMD_POLL The poll() changes were not well thought out, and completely unexplained. They also caused a huge performance regression, because "->poll()" was no longer a trivial file operation that just called down to the underlying file operations, but instead did at least two indirect calls. Indirect calls are sadly slow now with the Spectre mitigation, but the performance problem could at least be largely mitigated by changing the "->get_poll_head()" operation to just have a per-file-descriptor pointer to the poll head instead. That gets rid of one of the new indirections. But that doesn't fix the new complexity that is completely unwarranted for the regular case. The (undocumented) reason for the poll() changes was some alleged AIO poll race fixing, but we don't make the common case slower and more complex for some uncommon special case, so this all really needs way more explanations and most likely a fundamental redesign. [ This revert is a revert of about 30 different commits, not reverted individually because that would just be unnecessarily messy - Linus ] Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 7 +- Documentation/filesystems/vfs.txt | 13 ---- crypto/af_alg.c | 13 +++- crypto/algif_aead.c | 4 +- crypto/algif_skcipher.c | 4 +- drivers/char/random.c | 29 ++++---- drivers/isdn/mISDN/socket.c | 2 +- drivers/net/ppp/pppoe.c | 2 +- fs/aio.c | 148 +------------------------------------- fs/eventfd.c | 19 ++--- fs/eventpoll.c | 15 ++-- fs/pipe.c | 22 +++--- fs/select.c | 23 ------ fs/timerfd.c | 22 +++--- include/crypto/if_alg.h | 3 +- include/linux/fs.h | 2 - include/linux/net.h | 1 - include/linux/poll.h | 12 ++-- include/linux/skbuff.h | 3 +- include/net/bluetooth/bluetooth.h | 2 +- include/net/iucv/af_iucv.h | 2 + include/net/sctp/sctp.h | 3 +- include/net/tcp.h | 3 +- include/net/tls.h | 6 +- include/net/udp.h | 2 +- include/uapi/linux/aio_abi.h | 8 ++- net/appletalk/ddp.c | 2 +- net/atm/common.c | 11 ++- net/atm/common.h | 2 +- net/atm/pvc.c | 2 +- net/atm/svc.c | 2 +- net/ax25/af_ax25.c | 2 +- net/bluetooth/af_bluetooth.c | 7 +- net/bluetooth/hci_sock.c | 2 +- net/bluetooth/l2cap_sock.c | 2 +- net/bluetooth/rfcomm/sock.c | 2 +- net/bluetooth/sco.c | 2 +- net/caif/caif_socket.c | 12 ++-- net/can/bcm.c | 2 +- net/can/raw.c | 2 +- net/core/datagram.c | 13 ++-- net/dccp/dccp.h | 3 +- net/dccp/ipv4.c | 2 +- net/dccp/ipv6.c | 2 +- net/dccp/proto.c | 13 +++- net/decnet/af_decnet.c | 6 +- net/ieee802154/socket.c | 4 +- net/ipv4/af_inet.c | 8 +-- net/ipv4/tcp.c | 23 ++++-- net/ipv4/udp.c | 10 +-- net/ipv6/af_inet6.c | 4 +- net/ipv6/raw.c | 4 +- net/iucv/af_iucv.c | 7 +- net/kcm/kcmsock.c | 10 +-- net/key/af_key.c | 2 +- net/l2tp/l2tp_ip.c | 2 +- net/l2tp/l2tp_ip6.c | 2 +- net/l2tp/l2tp_ppp.c | 2 +- net/llc/af_llc.c | 2 +- net/netlink/af_netlink.c | 2 +- net/netrom/af_netrom.c | 2 +- net/nfc/llcp_sock.c | 9 ++- net/nfc/rawsock.c | 4 +- net/packet/af_packet.c | 9 +-- net/phonet/socket.c | 9 ++- net/qrtr/qrtr.c | 2 +- net/rose/af_rose.c | 2 +- net/rxrpc/af_rxrpc.c | 10 ++- net/sctp/ipv6.c | 2 +- net/sctp/protocol.c | 2 +- net/sctp/socket.c | 4 +- net/smc/af_smc.c | 12 +++- net/socket.c | 48 ++----------- net/tipc/socket.c | 14 ++-- net/tls/tls_main.c | 2 +- net/tls/tls_sw.c | 19 ++--- net/unix/af_unix.c | 30 +++++--- net/vmw_vsock/af_vsock.c | 19 +++-- net/x25/af_x25.c | 2 +- net/xdp/xsk.c | 7 +- 80 files changed, 301 insertions(+), 450 deletions(-) (limited to 'net/core') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 2c391338c675..37bf0a9de75c 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -441,8 +441,6 @@ prototypes: int (*iterate) (struct file *, struct dir_context *); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); - struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t); - __poll_t (*poll_mask) (struct file *, __poll_t); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); @@ -473,7 +471,7 @@ prototypes: }; locking rules: - All except for ->poll_mask may block. + All may block. ->llseek() locking has moved from llseek to the individual llseek implementations. If your fs is not using generic_file_llseek, you @@ -505,9 +503,6 @@ in sys_read() and friends. the lease within the individual filesystem to record the result of the operation -->poll_mask can be called with or without the waitqueue lock for the waitqueue -returned from ->get_poll_head. - --------------------------- dquot_operations ------------------------------- prototypes: int (*write_dquot) (struct dquot *); diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 829a7b7857a4..f608180ad59d 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -857,8 +857,6 @@ struct file_operations { ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iterate) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); - struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t); - __poll_t (*poll_mask) (struct file *, __poll_t); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); @@ -903,17 +901,6 @@ otherwise noted. activity on this file and (optionally) go to sleep until there is activity. Called by the select(2) and poll(2) system calls - get_poll_head: Returns the struct wait_queue_head that callers can - wait on. Callers need to check the returned events using ->poll_mask - once woken. Can return NULL to indicate polling is not supported, - or any error code using the ERR_PTR convention to indicate that a - grave error occured and ->poll_mask shall not be called. - - poll_mask: return the mask of EPOLL* values describing the file descriptor - state. Called either before going to sleep on the waitqueue returned by - get_poll_head, or after it has been woken. If ->get_poll_head and - ->poll_mask are implemented ->poll does not need to be implement. - unlocked_ioctl: called by the ioctl(2) system call. compat_ioctl: called by the ioctl(2) system call when 32 bit system calls diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 49fa8582138b..314c52c967e5 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -1060,12 +1060,19 @@ void af_alg_async_cb(struct crypto_async_request *_req, int err) } EXPORT_SYMBOL_GPL(af_alg_async_cb); -__poll_t af_alg_poll_mask(struct socket *sock, __poll_t events) +/** + * af_alg_poll - poll system call handler + */ +__poll_t af_alg_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); struct af_alg_ctx *ctx = ask->private; - __poll_t mask = 0; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; if (!ctx->more || ctx->used) mask |= EPOLLIN | EPOLLRDNORM; @@ -1075,7 +1082,7 @@ __poll_t af_alg_poll_mask(struct socket *sock, __poll_t events) return mask; } -EXPORT_SYMBOL_GPL(af_alg_poll_mask); +EXPORT_SYMBOL_GPL(af_alg_poll); /** * af_alg_alloc_areq - allocate struct af_alg_async_req diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c index 825524f27438..c40a8c7ee8ae 100644 --- a/crypto/algif_aead.c +++ b/crypto/algif_aead.c @@ -375,7 +375,7 @@ static struct proto_ops algif_aead_ops = { .sendmsg = aead_sendmsg, .sendpage = af_alg_sendpage, .recvmsg = aead_recvmsg, - .poll_mask = af_alg_poll_mask, + .poll = af_alg_poll, }; static int aead_check_key(struct socket *sock) @@ -471,7 +471,7 @@ static struct proto_ops algif_aead_ops_nokey = { .sendmsg = aead_sendmsg_nokey, .sendpage = aead_sendpage_nokey, .recvmsg = aead_recvmsg_nokey, - .poll_mask = af_alg_poll_mask, + .poll = af_alg_poll, }; static void *aead_bind(const char *name, u32 type, u32 mask) diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index 4c04eb9888ad..cfdaab2b7d76 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -206,7 +206,7 @@ static struct proto_ops algif_skcipher_ops = { .sendmsg = skcipher_sendmsg, .sendpage = af_alg_sendpage, .recvmsg = skcipher_recvmsg, - .poll_mask = af_alg_poll_mask, + .poll = af_alg_poll, }; static int skcipher_check_key(struct socket *sock) @@ -302,7 +302,7 @@ static struct proto_ops algif_skcipher_ops_nokey = { .sendmsg = skcipher_sendmsg_nokey, .sendpage = skcipher_sendpage_nokey, .recvmsg = skcipher_recvmsg_nokey, - .poll_mask = af_alg_poll_mask, + .poll = af_alg_poll, }; static void *skcipher_bind(const char *name, u32 type, u32 mask) diff --git a/drivers/char/random.c b/drivers/char/random.c index a8fb0020ba5c..cd888d4ee605 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -402,7 +402,8 @@ static struct poolinfo { /* * Static global variables */ -static DECLARE_WAIT_QUEUE_HEAD(random_wait); +static DECLARE_WAIT_QUEUE_HEAD(random_read_wait); +static DECLARE_WAIT_QUEUE_HEAD(random_write_wait); static struct fasync_struct *fasync; static DEFINE_SPINLOCK(random_ready_list_lock); @@ -721,8 +722,8 @@ retry: /* should we wake readers? */ if (entropy_bits >= random_read_wakeup_bits && - wq_has_sleeper(&random_wait)) { - wake_up_interruptible_poll(&random_wait, POLLIN); + wq_has_sleeper(&random_read_wait)) { + wake_up_interruptible(&random_read_wait); kill_fasync(&fasync, SIGIO, POLL_IN); } /* If the input pool is getting full, send some @@ -1396,7 +1397,7 @@ retry: trace_debit_entropy(r->name, 8 * ibytes); if (ibytes && (r->entropy_count >> ENTROPY_SHIFT) < random_write_wakeup_bits) { - wake_up_interruptible_poll(&random_wait, POLLOUT); + wake_up_interruptible(&random_write_wait); kill_fasync(&fasync, SIGIO, POLL_OUT); } @@ -1838,7 +1839,7 @@ _random_read(int nonblock, char __user *buf, size_t nbytes) if (nonblock) return -EAGAIN; - wait_event_interruptible(random_wait, + wait_event_interruptible(random_read_wait, ENTROPY_BITS(&input_pool) >= random_read_wakeup_bits); if (signal_pending(current)) @@ -1875,17 +1876,14 @@ urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) return ret; } -static struct wait_queue_head * -random_get_poll_head(struct file *file, __poll_t events) -{ - return &random_wait; -} - static __poll_t -random_poll_mask(struct file *file, __poll_t events) +random_poll(struct file *file, poll_table * wait) { - __poll_t mask = 0; + __poll_t mask; + poll_wait(file, &random_read_wait, wait); + poll_wait(file, &random_write_wait, wait); + mask = 0; if (ENTROPY_BITS(&input_pool) >= random_read_wakeup_bits) mask |= EPOLLIN | EPOLLRDNORM; if (ENTROPY_BITS(&input_pool) < random_write_wakeup_bits) @@ -1992,8 +1990,7 @@ static int random_fasync(int fd, struct file *filp, int on) const struct file_operations random_fops = { .read = random_read, .write = random_write, - .get_poll_head = random_get_poll_head, - .poll_mask = random_poll_mask, + .poll = random_poll, .unlocked_ioctl = random_ioctl, .fasync = random_fasync, .llseek = noop_llseek, @@ -2326,7 +2323,7 @@ void add_hwgenerator_randomness(const char *buffer, size_t count, * We'll be woken up again once below random_write_wakeup_thresh, * or when the calling thread is about to terminate. */ - wait_event_interruptible(random_wait, kthread_should_stop() || + wait_event_interruptible(random_write_wait, kthread_should_stop() || ENTROPY_BITS(&input_pool) <= random_write_wakeup_bits); mix_pool_bytes(poolp, buffer, count); credit_entropy_bits(poolp, entropy); diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index 98f90aadd141..18c0a1281914 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -588,7 +588,7 @@ static const struct proto_ops data_sock_ops = { .getname = data_sock_getname, .sendmsg = mISDN_sock_sendmsg, .recvmsg = mISDN_sock_recvmsg, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = data_sock_setsockopt, diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index de51e8f70f44..ce61231e96ea 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -1107,7 +1107,7 @@ static const struct proto_ops pppoe_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pppoe_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, diff --git a/fs/aio.c b/fs/aio.c index e1d20124ec0e..210df9da1283 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -5,7 +5,6 @@ * Implements an efficient asynchronous io interface. * * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved. - * Copyright 2018 Christoph Hellwig. * * See ../COPYING for licensing terms. */ @@ -165,22 +164,10 @@ struct fsync_iocb { bool datasync; }; -struct poll_iocb { - struct file *file; - __poll_t events; - struct wait_queue_head *head; - - union { - struct wait_queue_entry wait; - struct work_struct work; - }; -}; - struct aio_kiocb { union { struct kiocb rw; struct fsync_iocb fsync; - struct poll_iocb poll; }; struct kioctx *ki_ctx; @@ -1590,6 +1577,7 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync) if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)) return -EINVAL; + req->file = fget(iocb->aio_fildes); if (unlikely(!req->file)) return -EBADF; @@ -1604,137 +1592,6 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync) return 0; } -/* need to use list_del_init so we can check if item was present */ -static inline bool __aio_poll_remove(struct poll_iocb *req) -{ - if (list_empty(&req->wait.entry)) - return false; - list_del_init(&req->wait.entry); - return true; -} - -static inline void __aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask) -{ - fput(iocb->poll.file); - aio_complete(iocb, mangle_poll(mask), 0); -} - -static void aio_poll_work(struct work_struct *work) -{ - struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, poll.work); - - if (!list_empty_careful(&iocb->ki_list)) - aio_remove_iocb(iocb); - __aio_poll_complete(iocb, iocb->poll.events); -} - -static int aio_poll_cancel(struct kiocb *iocb) -{ - struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw); - struct poll_iocb *req = &aiocb->poll; - struct wait_queue_head *head = req->head; - bool found = false; - - spin_lock(&head->lock); - found = __aio_poll_remove(req); - spin_unlock(&head->lock); - - if (found) { - req->events = 0; - INIT_WORK(&req->work, aio_poll_work); - schedule_work(&req->work); - } - return 0; -} - -static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, - void *key) -{ - struct poll_iocb *req = container_of(wait, struct poll_iocb, wait); - struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); - struct file *file = req->file; - __poll_t mask = key_to_poll(key); - - assert_spin_locked(&req->head->lock); - - /* for instances that support it check for an event match first: */ - if (mask && !(mask & req->events)) - return 0; - - mask = file->f_op->poll_mask(file, req->events) & req->events; - if (!mask) - return 0; - - __aio_poll_remove(req); - - /* - * Try completing without a context switch if we can acquire ctx_lock - * without spinning. Otherwise we need to defer to a workqueue to - * avoid a deadlock due to the lock order. - */ - if (spin_trylock(&iocb->ki_ctx->ctx_lock)) { - list_del_init(&iocb->ki_list); - spin_unlock(&iocb->ki_ctx->ctx_lock); - - __aio_poll_complete(iocb, mask); - } else { - req->events = mask; - INIT_WORK(&req->work, aio_poll_work); - schedule_work(&req->work); - } - - return 1; -} - -static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb) -{ - struct kioctx *ctx = aiocb->ki_ctx; - struct poll_iocb *req = &aiocb->poll; - __poll_t mask; - - /* reject any unknown events outside the normal event mask. */ - if ((u16)iocb->aio_buf != iocb->aio_buf) - return -EINVAL; - /* reject fields that are not defined for poll */ - if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags) - return -EINVAL; - - req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; - req->file = fget(iocb->aio_fildes); - if (unlikely(!req->file)) - return -EBADF; - if (!file_has_poll_mask(req->file)) - goto out_fail; - - req->head = req->file->f_op->get_poll_head(req->file, req->events); - if (!req->head) - goto out_fail; - if (IS_ERR(req->head)) { - mask = EPOLLERR; - goto done; - } - - init_waitqueue_func_entry(&req->wait, aio_poll_wake); - aiocb->ki_cancel = aio_poll_cancel; - - spin_lock_irq(&ctx->ctx_lock); - spin_lock(&req->head->lock); - mask = req->file->f_op->poll_mask(req->file, req->events) & req->events; - if (!mask) { - __add_wait_queue(req->head, &req->wait); - list_add_tail(&aiocb->ki_list, &ctx->active_reqs); - } - spin_unlock(&req->head->lock); - spin_unlock_irq(&ctx->ctx_lock); -done: - if (mask) - __aio_poll_complete(aiocb, mask); - return 0; -out_fail: - fput(req->file); - return -EINVAL; /* same as no support for IOCB_CMD_POLL */ -} - static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, bool compat) { @@ -1808,9 +1665,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, case IOCB_CMD_FDSYNC: ret = aio_fsync(&req->fsync, &iocb, true); break; - case IOCB_CMD_POLL: - ret = aio_poll(req, &iocb); - break; default: pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode); ret = -EINVAL; diff --git a/fs/eventfd.c b/fs/eventfd.c index ceb1031f1cac..08d3bd602f73 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -101,20 +101,14 @@ static int eventfd_release(struct inode *inode, struct file *file) return 0; } -static struct wait_queue_head * -eventfd_get_poll_head(struct file *file, __poll_t events) -{ - struct eventfd_ctx *ctx = file->private_data; - - return &ctx->wqh; -} - -static __poll_t eventfd_poll_mask(struct file *file, __poll_t eventmask) +static __poll_t eventfd_poll(struct file *file, poll_table *wait) { struct eventfd_ctx *ctx = file->private_data; __poll_t events = 0; u64 count; + poll_wait(file, &ctx->wqh, wait); + /* * All writes to ctx->count occur within ctx->wqh.lock. This read * can be done outside ctx->wqh.lock because we know that poll_wait @@ -156,11 +150,11 @@ static __poll_t eventfd_poll_mask(struct file *file, __poll_t eventmask) count = READ_ONCE(ctx->count); if (count > 0) - events |= (EPOLLIN & eventmask); + events |= EPOLLIN; if (count == ULLONG_MAX) events |= EPOLLERR; if (ULLONG_MAX - 1 > count) - events |= (EPOLLOUT & eventmask); + events |= EPOLLOUT; return events; } @@ -311,8 +305,7 @@ static const struct file_operations eventfd_fops = { .show_fdinfo = eventfd_show_fdinfo, #endif .release = eventfd_release, - .get_poll_head = eventfd_get_poll_head, - .poll_mask = eventfd_poll_mask, + .poll = eventfd_poll, .read = eventfd_read, .write = eventfd_write, .llseek = noop_llseek, diff --git a/fs/eventpoll.c b/fs/eventpoll.c index ea4436f409fb..67db22fe99c5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -922,18 +922,14 @@ static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head return 0; } -static struct wait_queue_head *ep_eventpoll_get_poll_head(struct file *file, - __poll_t eventmask) -{ - struct eventpoll *ep = file->private_data; - return &ep->poll_wait; -} - -static __poll_t ep_eventpoll_poll_mask(struct file *file, __poll_t eventmask) +static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait) { struct eventpoll *ep = file->private_data; int depth = 0; + /* Insert inside our poll wait queue */ + poll_wait(file, &ep->poll_wait, wait); + /* * Proceed to find out if wanted events are really available inside * the ready list. @@ -972,8 +968,7 @@ static const struct file_operations eventpoll_fops = { .show_fdinfo = ep_show_fdinfo, #endif .release = ep_eventpoll_release, - .get_poll_head = ep_eventpoll_get_poll_head, - .poll_mask = ep_eventpoll_poll_mask, + .poll = ep_eventpoll_poll, .llseek = noop_llseek, }; diff --git a/fs/pipe.c b/fs/pipe.c index bb0840e234f3..39d6f431da83 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -509,22 +509,19 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } } -static struct wait_queue_head * -pipe_get_poll_head(struct file *filp, __poll_t events) -{ - struct pipe_inode_info *pipe = filp->private_data; - - return &pipe->wait; -} - /* No kernel lock held - fine */ -static __poll_t pipe_poll_mask(struct file *filp, __poll_t events) +static __poll_t +pipe_poll(struct file *filp, poll_table *wait) { + __poll_t mask; struct pipe_inode_info *pipe = filp->private_data; - int nrbufs = pipe->nrbufs; - __poll_t mask = 0; + int nrbufs; + + poll_wait(filp, &pipe->wait, wait); /* Reading only -- no need for acquiring the semaphore. */ + nrbufs = pipe->nrbufs; + mask = 0; if (filp->f_mode & FMODE_READ) { mask = (nrbufs > 0) ? EPOLLIN | EPOLLRDNORM : 0; if (!pipe->writers && filp->f_version != pipe->w_counter) @@ -1023,8 +1020,7 @@ const struct file_operations pipefifo_fops = { .llseek = no_llseek, .read_iter = pipe_read, .write_iter = pipe_write, - .get_poll_head = pipe_get_poll_head, - .poll_mask = pipe_poll_mask, + .poll = pipe_poll, .unlocked_ioctl = pipe_ioctl, .release = pipe_release, .fasync = pipe_fasync, diff --git a/fs/select.c b/fs/select.c index 317891ff8165..4a6b6e4b21cb 100644 --- a/fs/select.c +++ b/fs/select.c @@ -34,29 +34,6 @@ #include -__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt) -{ - if (file->f_op->poll) { - return file->f_op->poll(file, pt); - } else if (file_has_poll_mask(file)) { - unsigned int events = poll_requested_events(pt); - struct wait_queue_head *head; - - if (pt && pt->_qproc) { - head = file->f_op->get_poll_head(file, events); - if (!head) - return DEFAULT_POLLMASK; - if (IS_ERR(head)) - return EPOLLERR; - pt->_qproc(file, head, pt); - } - - return file->f_op->poll_mask(file, events); - } else { - return DEFAULT_POLLMASK; - } -} -EXPORT_SYMBOL_GPL(vfs_poll); /* * Estimate expected accuracy in ns from a timeval. diff --git a/fs/timerfd.c b/fs/timerfd.c index d84a2bee4f82..cdad49da3ff7 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -226,20 +226,21 @@ static int timerfd_release(struct inode *inode, struct file *file) kfree_rcu(ctx, rcu); return 0; } - -static struct wait_queue_head *timerfd_get_poll_head(struct file *file, - __poll_t eventmask) + +static __poll_t timerfd_poll(struct file *file, poll_table *wait) { struct timerfd_ctx *ctx = file->private_data; + __poll_t events = 0; + unsigned long flags; - return &ctx->wqh; -} + poll_wait(file, &ctx->wqh, wait); -static __poll_t timerfd_poll_mask(struct file *file, __poll_t eventmask) -{ - struct timerfd_ctx *ctx = file->private_data; + spin_lock_irqsave(&ctx->wqh.lock, flags); + if (ctx->ticks) + events |= EPOLLIN; + spin_unlock_irqrestore(&ctx->wqh.lock, flags); - return ctx->ticks ? EPOLLIN : 0; + return events; } static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, @@ -363,8 +364,7 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg static const struct file_operations timerfd_fops = { .release = timerfd_release, - .get_poll_head = timerfd_get_poll_head, - .poll_mask = timerfd_poll_mask, + .poll = timerfd_poll, .read = timerfd_read, .llseek = noop_llseek, .show_fdinfo = timerfd_show, diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h index cc414db9da0a..482461d8931d 100644 --- a/include/crypto/if_alg.h +++ b/include/crypto/if_alg.h @@ -245,7 +245,8 @@ ssize_t af_alg_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); void af_alg_free_resources(struct af_alg_async_req *areq); void af_alg_async_cb(struct crypto_async_request *_req, int err); -__poll_t af_alg_poll_mask(struct socket *sock, __poll_t events); +__poll_t af_alg_poll(struct file *file, struct socket *sock, + poll_table *wait); struct af_alg_async_req *af_alg_alloc_areq(struct sock *sk, unsigned int areqlen); int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags, diff --git a/include/linux/fs.h b/include/linux/fs.h index 5c91108846db..d78d146a98da 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1720,8 +1720,6 @@ struct file_operations { int (*iterate) (struct file *, struct dir_context *); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); - struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t); - __poll_t (*poll_mask) (struct file *, __poll_t); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); diff --git a/include/linux/net.h b/include/linux/net.h index 08b6eb964dd6..6554d3ba4396 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -147,7 +147,6 @@ struct proto_ops { int (*getname) (struct socket *sock, struct sockaddr *addr, int peer); - __poll_t (*poll_mask) (struct socket *sock, __poll_t events); __poll_t (*poll) (struct file *file, struct socket *sock, struct poll_table_struct *wait); int (*ioctl) (struct socket *sock, unsigned int cmd, diff --git a/include/linux/poll.h b/include/linux/poll.h index fdf86b4cbc71..7e0fdcf905d2 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -74,18 +74,18 @@ static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc) pt->_key = ~(__poll_t)0; /* all events enabled */ } -static inline bool file_has_poll_mask(struct file *file) +static inline bool file_can_poll(struct file *file) { - return file->f_op->get_poll_head && file->f_op->poll_mask; + return file->f_op->poll; } -static inline bool file_can_poll(struct file *file) +static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt) { - return file->f_op->poll || file_has_poll_mask(file); + if (unlikely(!file->f_op->poll)) + return DEFAULT_POLLMASK; + return file->f_op->poll(file, pt); } -__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt); - struct poll_table_entry { struct file *filp; __poll_t key; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c86885954994..164cdedf6012 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3252,7 +3252,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags, int *peeked, int *off, int *err); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, int *err); -__poll_t datagram_poll_mask(struct socket *sock, __poll_t events); +__poll_t datagram_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait); int skb_copy_datagram_iter(const struct sk_buff *from, int offset, struct iov_iter *to, int size); static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset, diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 53ce8176c313..ec9d6bc65855 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -271,7 +271,7 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); int bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); -__poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events); +__poll_t bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait); int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo); int bt_sock_wait_ready(struct sock *sk, unsigned long flags); diff --git a/include/net/iucv/af_iucv.h b/include/net/iucv/af_iucv.h index b0eaeb02d46d..f4c21b5a1242 100644 --- a/include/net/iucv/af_iucv.h +++ b/include/net/iucv/af_iucv.h @@ -153,6 +153,8 @@ struct iucv_sock_list { atomic_t autobind_name; }; +__poll_t iucv_sock_poll(struct file *file, struct socket *sock, + poll_table *wait); void iucv_sock_link(struct iucv_sock_list *l, struct sock *s); void iucv_sock_unlink(struct iucv_sock_list *l, struct sock *s); void iucv_accept_enqueue(struct sock *parent, struct sock *sk); diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 30b3e2fe240a..8c2caa370e0f 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -109,7 +109,8 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb); int sctp_inet_listen(struct socket *sock, int backlog); void sctp_write_space(struct sock *sk); void sctp_data_ready(struct sock *sk); -__poll_t sctp_poll_mask(struct socket *sock, __poll_t events); +__poll_t sctp_poll(struct file *file, struct socket *sock, + poll_table *wait); void sctp_sock_rfree(struct sk_buff *skb); void sctp_copy_sock(struct sock *newsk, struct sock *sk, struct sctp_association *asoc); diff --git a/include/net/tcp.h b/include/net/tcp.h index 0448e7c5d2b4..800582b5dd54 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -388,7 +388,8 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); void tcp_init_transfer(struct sock *sk, int bpf_op); -__poll_t tcp_poll_mask(struct socket *sock, __poll_t events); +__poll_t tcp_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int tcp_setsockopt(struct sock *sk, int level, int optname, diff --git a/include/net/tls.h b/include/net/tls.h index 7f84ea3e217c..70c273777fe9 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -109,7 +109,8 @@ struct tls_sw_context_rx { struct strparser strp; void (*saved_data_ready)(struct sock *sk); - __poll_t (*sk_poll_mask)(struct socket *sock, __poll_t events); + unsigned int (*sk_poll)(struct file *file, struct socket *sock, + struct poll_table_struct *wait); struct sk_buff *recv_pkt; u8 control; bool decrypted; @@ -224,7 +225,8 @@ void tls_sw_free_resources_tx(struct sock *sk); void tls_sw_free_resources_rx(struct sock *sk); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); -__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events); +unsigned int tls_sw_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait); ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); diff --git a/include/net/udp.h b/include/net/udp.h index b1ea8b0f5e6a..81afdacd4fff 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -285,7 +285,7 @@ int udp_init_sock(struct sock *sk); int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int __udp_disconnect(struct sock *sk, int flags); int udp_disconnect(struct sock *sk, int flags); -__poll_t udp_poll_mask(struct socket *sock, __poll_t events); +__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait); struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, bool is_ipv6); diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h index d00221345c19..d4e768d55d14 100644 --- a/include/uapi/linux/aio_abi.h +++ b/include/uapi/linux/aio_abi.h @@ -39,8 +39,10 @@ enum { IOCB_CMD_PWRITE = 1, IOCB_CMD_FSYNC = 2, IOCB_CMD_FDSYNC = 3, - /* 4 was the experimental IOCB_CMD_PREADX */ - IOCB_CMD_POLL = 5, + /* These two are experimental. + * IOCB_CMD_PREADX = 4, + * IOCB_CMD_POLL = 5, + */ IOCB_CMD_NOOP = 6, IOCB_CMD_PREADV = 7, IOCB_CMD_PWRITEV = 8, @@ -109,7 +111,7 @@ struct iocb { #undef IFLITTLE struct __aio_sigset { - const sigset_t __user *sigmask; + sigset_t __user *sigmask; size_t sigsetsize; }; diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 55fdba05d7d9..9b6bc5abe946 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1869,7 +1869,7 @@ static const struct proto_ops atalk_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = atalk_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = atalk_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = atalk_compat_ioctl, diff --git a/net/atm/common.c b/net/atm/common.c index ff5748b2190f..a7a68e509628 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -647,11 +647,16 @@ out: return error; } -__poll_t vcc_poll_mask(struct socket *sock, __poll_t events) +__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - struct atm_vcc *vcc = ATM_SD(sock); - __poll_t mask = 0; + struct atm_vcc *vcc; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; + + vcc = ATM_SD(sock); /* exceptional events */ if (sk->sk_err) diff --git a/net/atm/common.h b/net/atm/common.h index 526796ad230f..5850649068bb 100644 --- a/net/atm/common.h +++ b/net/atm/common.h @@ -17,7 +17,7 @@ int vcc_connect(struct socket *sock, int itf, short vpi, int vci); int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len); -__poll_t vcc_poll_mask(struct socket *sock, __poll_t events); +__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait); int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int vcc_setsockopt(struct socket *sock, int level, int optname, diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 9f75092fe778..2cb10af16afc 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -113,7 +113,7 @@ static const struct proto_ops pvc_proto_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pvc_getname, - .poll_mask = vcc_poll_mask, + .poll = vcc_poll, .ioctl = vcc_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = vcc_compat_ioctl, diff --git a/net/atm/svc.c b/net/atm/svc.c index 53f4ad7087b1..2f91b766ac42 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -636,7 +636,7 @@ static const struct proto_ops svc_proto_ops = { .socketpair = sock_no_socketpair, .accept = svc_accept, .getname = svc_getname, - .poll_mask = vcc_poll_mask, + .poll = vcc_poll, .ioctl = svc_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = svc_compat_ioctl, diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index d1d2442ce573..c603d33d5410 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1941,7 +1941,7 @@ static const struct proto_ops ax25_proto_ops = { .socketpair = sock_no_socketpair, .accept = ax25_accept, .getname = ax25_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = ax25_ioctl, .listen = ax25_listen, .shutdown = ax25_shutdown, diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 510ab4f55df5..3264e1873219 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -437,13 +437,16 @@ static inline __poll_t bt_accept_poll(struct sock *parent) return 0; } -__poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events) +__poll_t bt_sock_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; BT_DBG("sock %p, sk %p", sock, sk); + poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_state == BT_LISTEN) return bt_accept_poll(sk); @@ -475,7 +478,7 @@ __poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events) return mask; } -EXPORT_SYMBOL(bt_sock_poll_mask); +EXPORT_SYMBOL(bt_sock_poll); int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index d6c099861538..1506e1632394 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1975,7 +1975,7 @@ static const struct proto_ops hci_sock_ops = { .sendmsg = hci_sock_sendmsg, .recvmsg = hci_sock_recvmsg, .ioctl = hci_sock_ioctl, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = hci_sock_setsockopt, diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 742a190034e6..686bdc6b35b0 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1653,7 +1653,7 @@ static const struct proto_ops l2cap_sock_ops = { .getname = l2cap_sock_getname, .sendmsg = l2cap_sock_sendmsg, .recvmsg = l2cap_sock_recvmsg, - .poll_mask = bt_sock_poll_mask, + .poll = bt_sock_poll, .ioctl = bt_sock_ioctl, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 1cf57622473a..d606e9212291 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -1049,7 +1049,7 @@ static const struct proto_ops rfcomm_sock_ops = { .setsockopt = rfcomm_sock_setsockopt, .getsockopt = rfcomm_sock_getsockopt, .ioctl = rfcomm_sock_ioctl, - .poll_mask = bt_sock_poll_mask, + .poll = bt_sock_poll, .socketpair = sock_no_socketpair, .mmap = sock_no_mmap }; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index d60dbc61d170..413b8ee49fec 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -1197,7 +1197,7 @@ static const struct proto_ops sco_sock_ops = { .getname = sco_sock_getname, .sendmsg = sco_sock_sendmsg, .recvmsg = sco_sock_recvmsg, - .poll_mask = bt_sock_poll_mask, + .poll = bt_sock_poll, .ioctl = bt_sock_ioctl, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index c7991867d622..a6fb1b3bcad9 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -934,11 +934,15 @@ static int caif_release(struct socket *sock) } /* Copied from af_unix.c:unix_poll(), added CAIF tx_flow handling */ -static __poll_t caif_poll_mask(struct socket *sock, __poll_t events) +static __poll_t caif_poll(struct file *file, + struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; + __poll_t mask; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); - __poll_t mask = 0; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; /* exceptional events? */ if (sk->sk_err) @@ -972,7 +976,7 @@ static const struct proto_ops caif_seqpacket_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = caif_poll_mask, + .poll = caif_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -993,7 +997,7 @@ static const struct proto_ops caif_stream_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = caif_poll_mask, + .poll = caif_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/can/bcm.c b/net/can/bcm.c index 9393f25df08d..0af8f0db892a 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1660,7 +1660,7 @@ static const struct proto_ops bcm_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/can/raw.c b/net/can/raw.c index fd7e2f49ea6a..1051eee82581 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -843,7 +843,7 @@ static const struct proto_ops raw_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = raw_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/core/datagram.c b/net/core/datagram.c index f19bf3dc2bd6..9938952c5c78 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -819,8 +819,9 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); /** * datagram_poll - generic datagram poll + * @file: file struct * @sock: socket - * @events to wait for + * @wait: poll table * * Datagram poll: Again totally generic. This also handles * sequenced packet sockets providing the socket receive queue @@ -830,10 +831,14 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); * and you use a different write policy from sock_writeable() * then please supply your own write_space callback. */ -__poll_t datagram_poll_mask(struct socket *sock, __poll_t events) +__poll_t datagram_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; - __poll_t mask = 0; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; /* exceptional events? */ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) @@ -866,4 +871,4 @@ __poll_t datagram_poll_mask(struct socket *sock, __poll_t events) return mask; } -EXPORT_SYMBOL(datagram_poll_mask); +EXPORT_SYMBOL(datagram_poll); diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 0ea2ee56ac1b..f91e3816806b 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -316,7 +316,8 @@ int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); void dccp_shutdown(struct sock *sk, int how); int inet_dccp_listen(struct socket *sock, int backlog); -__poll_t dccp_poll_mask(struct socket *sock, __poll_t events); +__poll_t dccp_poll(struct file *file, struct socket *sock, + poll_table *wait); int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); void dccp_req_err(struct sock *sk, u64 seq); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index a9e478cd3787..b08feb219b44 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -984,7 +984,7 @@ static const struct proto_ops inet_dccp_ops = { .accept = inet_accept, .getname = inet_getname, /* FIXME: work on tcp_poll to rename it to inet_csk_poll */ - .poll_mask = dccp_poll_mask, + .poll = dccp_poll, .ioctl = inet_ioctl, /* FIXME: work on inet_listen to rename it to sock_common_listen */ .listen = inet_dccp_listen, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 17fc4e0166ba..6344f1b18a6a 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1070,7 +1070,7 @@ static const struct proto_ops inet6_dccp_ops = { .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet6_getname, - .poll_mask = dccp_poll_mask, + .poll = dccp_poll, .ioctl = inet6_ioctl, .listen = inet_dccp_listen, .shutdown = inet_shutdown, diff --git a/net/dccp/proto.c b/net/dccp/proto.c index ca21c1c76da0..0d56e36a6db7 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -312,11 +312,20 @@ int dccp_disconnect(struct sock *sk, int flags) EXPORT_SYMBOL_GPL(dccp_disconnect); -__poll_t dccp_poll_mask(struct socket *sock, __poll_t events) +/* + * Wait for a DCCP event. + * + * Note that we don't need to lock the socket, as the upper poll layers + * take care of normal races (between the test and the event) and we don't + * go look at any of the socket buffers directly. + */ +__poll_t dccp_poll(struct file *file, struct socket *sock, + poll_table *wait) { __poll_t mask; struct sock *sk = sock->sk; + sock_poll_wait(file, sk_sleep(sk), wait); if (sk->sk_state == DCCP_LISTEN) return inet_csk_listen_poll(sk); @@ -358,7 +367,7 @@ __poll_t dccp_poll_mask(struct socket *sock, __poll_t events) return mask; } -EXPORT_SYMBOL_GPL(dccp_poll_mask); +EXPORT_SYMBOL_GPL(dccp_poll); int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) { diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 9a686d890bfa..7d6ff983ba2c 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1207,11 +1207,11 @@ static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int peer) } -static __poll_t dn_poll_mask(struct socket *sock, __poll_t events) +static __poll_t dn_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct dn_scp *scp = DN_SK(sk); - __poll_t mask = datagram_poll_mask(sock, events); + __poll_t mask = datagram_poll(file, sock, wait); if (!skb_queue_empty(&scp->other_receive_queue)) mask |= EPOLLRDBAND; @@ -2331,7 +2331,7 @@ static const struct proto_ops dn_proto_ops = { .socketpair = sock_no_socketpair, .accept = dn_accept, .getname = dn_getname, - .poll_mask = dn_poll_mask, + .poll = dn_poll, .ioctl = dn_ioctl, .listen = dn_listen, .shutdown = dn_shutdown, diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index a0768d2759b8..a60658c85a9a 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -423,7 +423,7 @@ static const struct proto_ops ieee802154_raw_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = ieee802154_sock_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -969,7 +969,7 @@ static const struct proto_ops ieee802154_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = ieee802154_sock_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 15e125558c76..b403499fdabe 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -986,7 +986,7 @@ const struct proto_ops inet_stream_ops = { .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet_getname, - .poll_mask = tcp_poll_mask, + .poll = tcp_poll, .ioctl = inet_ioctl, .listen = inet_listen, .shutdown = inet_shutdown, @@ -1021,7 +1021,7 @@ const struct proto_ops inet_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = inet_getname, - .poll_mask = udp_poll_mask, + .poll = udp_poll, .ioctl = inet_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, @@ -1042,7 +1042,7 @@ EXPORT_SYMBOL(inet_dgram_ops); /* * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without - * udp_poll_mask + * udp_poll */ static const struct proto_ops inet_sockraw_ops = { .family = PF_INET, @@ -1053,7 +1053,7 @@ static const struct proto_ops inet_sockraw_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = inet_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = inet_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 141acd92e58a..e7b53d2a971f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -494,21 +494,32 @@ static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, } /* - * Socket is not locked. We are protected from async events by poll logic and - * correct handling of state changes made by other threads is impossible in - * any case. + * Wait for a TCP event. + * + * Note that we don't need to lock the socket, as the upper poll layers + * take care of normal races (between the test and the event) and we don't + * go look at any of the socket buffers directly. */ -__poll_t tcp_poll_mask(struct socket *sock, __poll_t events) +__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) { + __poll_t mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); - __poll_t mask = 0; int state; + sock_poll_wait(file, sk_sleep(sk), wait); + state = inet_sk_state_load(sk); if (state == TCP_LISTEN) return inet_csk_listen_poll(sk); + /* Socket is not locked. We are protected from async events + * by poll logic and correct handling of state changes + * made by other threads is impossible in any case. + */ + + mask = 0; + /* * EPOLLHUP is certainly not done right. But poll() doesn't * have a notion of HUP in just one direction, and for a @@ -589,7 +600,7 @@ __poll_t tcp_poll_mask(struct socket *sock, __poll_t events) return mask; } -EXPORT_SYMBOL(tcp_poll_mask); +EXPORT_SYMBOL(tcp_poll); int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9bb27df4dac5..24e116ddae79 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2591,7 +2591,7 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname, * udp_poll - wait for a UDP event. * @file - file struct * @sock - socket - * @events - events to wait for + * @wait - poll table * * This is same as datagram poll, except for the special case of * blocking sockets. If application is using a blocking fd @@ -2600,23 +2600,23 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname, * but then block when reading it. Add special case code * to work around these arguably broken applications. */ -__poll_t udp_poll_mask(struct socket *sock, __poll_t events) +__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait) { - __poll_t mask = datagram_poll_mask(sock, events); + __poll_t mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; if (!skb_queue_empty(&udp_sk(sk)->reader_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Check for false positives due to checksum errors */ - if ((mask & EPOLLRDNORM) && !(sock->file->f_flags & O_NONBLOCK) && + if ((mask & EPOLLRDNORM) && !(file->f_flags & O_NONBLOCK) && !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1) mask &= ~(EPOLLIN | EPOLLRDNORM); return mask; } -EXPORT_SYMBOL(udp_poll_mask); +EXPORT_SYMBOL(udp_poll); int udp_abort(struct sock *sk, int err) { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 74f2a261e8df..9ed0eae91758 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -570,7 +570,7 @@ const struct proto_ops inet6_stream_ops = { .socketpair = sock_no_socketpair, /* a do nothing */ .accept = inet_accept, /* ok */ .getname = inet6_getname, - .poll_mask = tcp_poll_mask, /* ok */ + .poll = tcp_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ .listen = inet_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ @@ -603,7 +603,7 @@ const struct proto_ops inet6_dgram_ops = { .socketpair = sock_no_socketpair, /* a do nothing */ .accept = sock_no_accept, /* a do nothing */ .getname = inet6_getname, - .poll_mask = udp_poll_mask, /* ok */ + .poll = udp_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ .listen = sock_no_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index ce6f0d15b5dd..afc307c89d1a 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1334,7 +1334,7 @@ void raw6_proc_exit(void) } #endif /* CONFIG_PROC_FS */ -/* Same as inet6_dgram_ops, sans udp_poll_mask. */ +/* Same as inet6_dgram_ops, sans udp_poll. */ const struct proto_ops inet6_sockraw_ops = { .family = PF_INET6, .owner = THIS_MODULE, @@ -1344,7 +1344,7 @@ const struct proto_ops inet6_sockraw_ops = { .socketpair = sock_no_socketpair, /* a do nothing */ .accept = sock_no_accept, /* a do nothing */ .getname = inet6_getname, - .poll_mask = datagram_poll_mask, /* ok */ + .poll = datagram_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ .listen = sock_no_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 68e86257a549..893a022f9620 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1488,11 +1488,14 @@ static inline __poll_t iucv_accept_poll(struct sock *parent) return 0; } -static __poll_t iucv_sock_poll_mask(struct socket *sock, __poll_t events) +__poll_t iucv_sock_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; + sock_poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_state == IUCV_LISTEN) return iucv_accept_poll(sk); @@ -2385,7 +2388,7 @@ static const struct proto_ops iucv_sock_ops = { .getname = iucv_sock_getname, .sendmsg = iucv_sock_sendmsg, .recvmsg = iucv_sock_recvmsg, - .poll_mask = iucv_sock_poll_mask, + .poll = iucv_sock_poll, .ioctl = sock_no_ioctl, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 84b7d5c6fec8..d3601d421571 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1336,9 +1336,9 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux) struct list_head *head; int index = 0; - /* For SOCK_SEQPACKET sock type, datagram_poll_mask checks the sk_state, - * so we set sk_state, otherwise epoll_wait always returns right away - * with EPOLLHUP + /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so + * we set sk_state, otherwise epoll_wait always returns right away with + * EPOLLHUP */ kcm->sk.sk_state = TCP_ESTABLISHED; @@ -1903,7 +1903,7 @@ static const struct proto_ops kcm_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = kcm_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -1924,7 +1924,7 @@ static const struct proto_ops kcm_seqpacket_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = kcm_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/key/af_key.c b/net/key/af_key.c index 8bdc1cbe490a..5e1d2946ffbf 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3751,7 +3751,7 @@ static const struct proto_ops pfkey_ops = { /* Now the operations that really occur. */ .release = pfkey_release, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .sendmsg = pfkey_sendmsg, .recvmsg = pfkey_recvmsg, }; diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 181073bf6925..a9c05b2bc1b0 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -613,7 +613,7 @@ static const struct proto_ops l2tp_ip_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = l2tp_ip_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = inet_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 336e4c00abbc..957369192ca1 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -754,7 +754,7 @@ static const struct proto_ops l2tp_ip6_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = l2tp_ip6_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = inet6_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 55188382845c..e398797878a9 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1818,7 +1818,7 @@ static const struct proto_ops pppol2tp_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pppol2tp_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = pppol2tp_setsockopt, diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 804de8490186..1beeea9549fa 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -1192,7 +1192,7 @@ static const struct proto_ops llc_ui_ops = { .socketpair = sock_no_socketpair, .accept = llc_ui_accept, .getname = llc_ui_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = llc_ui_ioctl, .listen = llc_ui_listen, .shutdown = llc_ui_shutdown, diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 1189b84413d5..393573a99a5a 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2658,7 +2658,7 @@ static const struct proto_ops netlink_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = netlink_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = netlink_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 93fbcafbf388..03f37c4e64fe 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1355,7 +1355,7 @@ static const struct proto_ops nr_proto_ops = { .socketpair = sock_no_socketpair, .accept = nr_accept, .getname = nr_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = nr_ioctl, .listen = nr_listen, .shutdown = sock_no_shutdown, diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index ab5bb14b49af..ea0c0c6f1874 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -548,13 +548,16 @@ static inline __poll_t llcp_accept_poll(struct sock *parent) return 0; } -static __poll_t llcp_sock_poll_mask(struct socket *sock, __poll_t events) +static __poll_t llcp_sock_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; pr_debug("%p\n", sk); + sock_poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_state == LLCP_LISTEN) return llcp_accept_poll(sk); @@ -896,7 +899,7 @@ static const struct proto_ops llcp_sock_ops = { .socketpair = sock_no_socketpair, .accept = llcp_sock_accept, .getname = llcp_sock_getname, - .poll_mask = llcp_sock_poll_mask, + .poll = llcp_sock_poll, .ioctl = sock_no_ioctl, .listen = llcp_sock_listen, .shutdown = sock_no_shutdown, @@ -916,7 +919,7 @@ static const struct proto_ops llcp_rawsock_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = llcp_sock_getname, - .poll_mask = llcp_sock_poll_mask, + .poll = llcp_sock_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 60c322531c49..e2188deb08dc 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -284,7 +284,7 @@ static const struct proto_ops rawsock_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -304,7 +304,7 @@ static const struct proto_ops rawsock_raw_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ff8e7e245c37..57634bc3da74 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4076,11 +4076,12 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd, return 0; } -static __poll_t packet_poll_mask(struct socket *sock, __poll_t events) +static __poll_t packet_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); - __poll_t mask = datagram_poll_mask(sock, events); + __poll_t mask = datagram_poll(file, sock, wait); spin_lock_bh(&sk->sk_receive_queue.lock); if (po->rx_ring.pg_vec) { @@ -4422,7 +4423,7 @@ static const struct proto_ops packet_ops_spkt = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname_spkt, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = packet_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -4443,7 +4444,7 @@ static const struct proto_ops packet_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname, - .poll_mask = packet_poll_mask, + .poll = packet_poll, .ioctl = packet_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/phonet/socket.c b/net/phonet/socket.c index c295c4e20f01..30187990257f 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -340,12 +340,15 @@ static int pn_socket_getname(struct socket *sock, struct sockaddr *addr, return sizeof(struct sockaddr_pn); } -static __poll_t pn_socket_poll_mask(struct socket *sock, __poll_t events) +static __poll_t pn_socket_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; struct pep_sock *pn = pep_sk(sk); __poll_t mask = 0; + poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_state == TCP_CLOSE) return EPOLLERR; if (!skb_queue_empty(&sk->sk_receive_queue)) @@ -445,7 +448,7 @@ const struct proto_ops phonet_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pn_socket_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = pn_socket_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -470,7 +473,7 @@ const struct proto_ops phonet_stream_ops = { .socketpair = sock_no_socketpair, .accept = pn_socket_accept, .getname = pn_socket_getname, - .poll_mask = pn_socket_poll_mask, + .poll = pn_socket_poll, .ioctl = pn_socket_ioctl, .listen = pn_socket_listen, .shutdown = sock_no_shutdown, diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 1b5025ea5b04..2aa07b547b16 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -1023,7 +1023,7 @@ static const struct proto_ops qrtr_proto_ops = { .recvmsg = qrtr_recvmsg, .getname = qrtr_getname, .ioctl = qrtr_ioctl, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, .getsockopt = sock_no_getsockopt, diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index ebe42e7eb456..d00a0ef39a56 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1470,7 +1470,7 @@ static const struct proto_ops rose_proto_ops = { .socketpair = sock_no_socketpair, .accept = rose_accept, .getname = rose_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = rose_ioctl, .listen = rose_listen, .shutdown = sock_no_shutdown, diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 3b1ac93efee2..2b463047dd7b 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -734,11 +734,15 @@ static int rxrpc_getsockopt(struct socket *sock, int level, int optname, /* * permit an RxRPC socket to be polled */ -static __poll_t rxrpc_poll_mask(struct socket *sock, __poll_t events) +static __poll_t rxrpc_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; struct rxrpc_sock *rx = rxrpc_sk(sk); - __poll_t mask = 0; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; /* the socket is readable if there are any messages waiting on the Rx * queue */ @@ -945,7 +949,7 @@ static const struct proto_ops rxrpc_rpc_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = rxrpc_poll_mask, + .poll = rxrpc_poll, .ioctl = sock_no_ioctl, .listen = rxrpc_listen, .shutdown = rxrpc_shutdown, diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 7339918a805d..0cd2e764f47f 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -1010,7 +1010,7 @@ static const struct proto_ops inet6_seqpacket_ops = { .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = sctp_getname, - .poll_mask = sctp_poll_mask, + .poll = sctp_poll, .ioctl = inet6_ioctl, .listen = sctp_inet_listen, .shutdown = inet_shutdown, diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 5dffbc493008..67f73d3a1356 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1016,7 +1016,7 @@ static const struct proto_ops inet_seqpacket_ops = { .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet_getname, /* Semantics are different. */ - .poll_mask = sctp_poll_mask, + .poll = sctp_poll, .ioctl = inet_ioctl, .listen = sctp_inet_listen, .shutdown = inet_shutdown, /* Looks harmless. */ diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d20f7addee19..ce620e878538 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -7717,12 +7717,14 @@ out: * here, again, by modeling the current TCP/UDP code. We don't have * a good way to test with it yet. */ -__poll_t sctp_poll_mask(struct socket *sock, __poll_t events) +__poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct sctp_sock *sp = sctp_sk(sk); __poll_t mask; + poll_wait(file, sk_sleep(sk), wait); + sock_rps_record_flow(sk); /* A TCP-style listening socket becomes readable when the accept queue diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index da7f02edcd37..973b4471b532 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1273,7 +1273,8 @@ static __poll_t smc_accept_poll(struct sock *parent) return mask; } -static __poll_t smc_poll_mask(struct socket *sock, __poll_t events) +static __poll_t smc_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; @@ -1289,7 +1290,7 @@ static __poll_t smc_poll_mask(struct socket *sock, __poll_t events) if ((sk->sk_state == SMC_INIT) || smc->use_fallback) { /* delegate to CLC child sock */ release_sock(sk); - mask = smc->clcsock->ops->poll_mask(smc->clcsock, events); + mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); lock_sock(sk); sk->sk_err = smc->clcsock->sk->sk_err; if (sk->sk_err) { @@ -1307,6 +1308,11 @@ static __poll_t smc_poll_mask(struct socket *sock, __poll_t events) } } } else { + if (sk->sk_state != SMC_CLOSED) { + release_sock(sk); + sock_poll_wait(file, sk_sleep(sk), wait); + lock_sock(sk); + } if (sk->sk_err) mask |= EPOLLERR; if ((sk->sk_shutdown == SHUTDOWN_MASK) || @@ -1619,7 +1625,7 @@ static const struct proto_ops smc_sock_ops = { .socketpair = sock_no_socketpair, .accept = smc_accept, .getname = smc_getname, - .poll_mask = smc_poll_mask, + .poll = smc_poll, .ioctl = smc_ioctl, .listen = smc_listen, .shutdown = smc_shutdown, diff --git a/net/socket.c b/net/socket.c index 8a109012608a..a564c6ed19d5 100644 --- a/net/socket.c +++ b/net/socket.c @@ -117,10 +117,8 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from); static int sock_mmap(struct file *file, struct vm_area_struct *vma); static int sock_close(struct inode *inode, struct file *file); -static struct wait_queue_head *sock_get_poll_head(struct file *file, - __poll_t events); -static __poll_t sock_poll_mask(struct file *file, __poll_t); -static __poll_t sock_poll(struct file *file, struct poll_table_struct *wait); +static __poll_t sock_poll(struct file *file, + struct poll_table_struct *wait); static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT static long compat_sock_ioctl(struct file *file, @@ -143,8 +141,6 @@ static const struct file_operations socket_file_ops = { .llseek = no_llseek, .read_iter = sock_read_iter, .write_iter = sock_write_iter, - .get_poll_head = sock_get_poll_head, - .poll_mask = sock_poll_mask, .poll = sock_poll, .unlocked_ioctl = sock_ioctl, #ifdef CONFIG_COMPAT @@ -1130,48 +1126,14 @@ out_release: } EXPORT_SYMBOL(sock_create_lite); -static struct wait_queue_head *sock_get_poll_head(struct file *file, - __poll_t events) -{ - struct socket *sock = file->private_data; - - if (!sock->ops->poll_mask) - return NULL; - sock_poll_busy_loop(sock, events); - return sk_sleep(sock->sk); -} - -static __poll_t sock_poll_mask(struct file *file, __poll_t events) -{ - struct socket *sock = file->private_data; - - /* - * We need to be sure we are in sync with the socket flags modification. - * - * This memory barrier is paired in the wq_has_sleeper. - */ - smp_mb(); - - /* this socket can poll_ll so tell the system call */ - return sock->ops->poll_mask(sock, events) | - (sk_can_busy_loop(sock->sk) ? POLL_BUSY_LOOP : 0); -} - /* No kernel lock held - perfect */ static __poll_t sock_poll(struct file *file, poll_table *wait) { struct socket *sock = file->private_data; - __poll_t events = poll_requested_events(wait), mask = 0; - - if (sock->ops->poll) { - sock_poll_busy_loop(sock, events); - mask = sock->ops->poll(file, sock, wait); - } else if (sock->ops->poll_mask) { - sock_poll_wait(file, sock_get_poll_head(file, events), wait); - mask = sock->ops->poll_mask(sock, events); - } + __poll_t events = poll_requested_events(wait); - return mask | sock_poll_busy_flag(sock); + sock_poll_busy_loop(sock, events); + return sock->ops->poll(file, sock, wait) | sock_poll_busy_flag(sock); } static int sock_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 14a5d055717d..930852c54d7a 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -692,9 +692,10 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, } /** - * tipc_poll - read pollmask + * tipc_poll - read and possibly block on pollmask * @file: file structure associated with the socket * @sock: socket for which to calculate the poll bits + * @wait: ??? * * Returns pollmask value * @@ -708,12 +709,15 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, * imply that the operation will succeed, merely that it should be performed * and will not block. */ -static __poll_t tipc_poll_mask(struct socket *sock, __poll_t events) +static __poll_t tipc_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); __poll_t revents = 0; + sock_poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_shutdown & RCV_SHUTDOWN) revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) @@ -3033,7 +3037,7 @@ static const struct proto_ops msg_ops = { .socketpair = tipc_socketpair, .accept = sock_no_accept, .getname = tipc_getname, - .poll_mask = tipc_poll_mask, + .poll = tipc_poll, .ioctl = tipc_ioctl, .listen = sock_no_listen, .shutdown = tipc_shutdown, @@ -3054,7 +3058,7 @@ static const struct proto_ops packet_ops = { .socketpair = tipc_socketpair, .accept = tipc_accept, .getname = tipc_getname, - .poll_mask = tipc_poll_mask, + .poll = tipc_poll, .ioctl = tipc_ioctl, .listen = tipc_listen, .shutdown = tipc_shutdown, @@ -3075,7 +3079,7 @@ static const struct proto_ops stream_ops = { .socketpair = tipc_socketpair, .accept = tipc_accept, .getname = tipc_getname, - .poll_mask = tipc_poll_mask, + .poll = tipc_poll, .ioctl = tipc_ioctl, .listen = tipc_listen, .shutdown = tipc_shutdown, diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index a127d61e8af9..301f22430469 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -712,7 +712,7 @@ static int __init tls_register(void) build_protos(tls_prots[TLSV4], &tcp_prot); tls_sw_proto_ops = inet_stream_ops; - tls_sw_proto_ops.poll_mask = tls_sw_poll_mask; + tls_sw_proto_ops.poll = tls_sw_poll; tls_sw_proto_ops.splice_read = tls_sw_splice_read; #ifdef CONFIG_TLS_DEVICE diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index f127fac88acf..d2380548f8f6 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -919,22 +919,23 @@ splice_read_end: return copied ? : err; } -__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events) +unsigned int tls_sw_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) { + unsigned int ret; struct sock *sk = sock->sk; struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - __poll_t mask; - /* Grab EPOLLOUT and EPOLLHUP from the underlying socket */ - mask = ctx->sk_poll_mask(sock, events); + /* Grab POLLOUT and POLLHUP from the underlying socket */ + ret = ctx->sk_poll(file, sock, wait); - /* Clear EPOLLIN bits, and set based on recv_pkt */ - mask &= ~(EPOLLIN | EPOLLRDNORM); + /* Clear POLLIN bits, and set based on recv_pkt */ + ret &= ~(POLLIN | POLLRDNORM); if (ctx->recv_pkt) - mask |= EPOLLIN | EPOLLRDNORM; + ret |= POLLIN | POLLRDNORM; - return mask; + return ret; } static int tls_read_size(struct strparser *strp, struct sk_buff *skb) @@ -1191,7 +1192,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) sk->sk_data_ready = tls_data_ready; write_unlock_bh(&sk->sk_callback_lock); - sw_ctx_rx->sk_poll_mask = sk->sk_socket->ops->poll_mask; + sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll; strp_check_rcv(&sw_ctx_rx->strp); } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 95b02a71fd47..e5473c03d667 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -638,8 +638,9 @@ static int unix_stream_connect(struct socket *, struct sockaddr *, static int unix_socketpair(struct socket *, struct socket *); static int unix_accept(struct socket *, struct socket *, int, bool); static int unix_getname(struct socket *, struct sockaddr *, int); -static __poll_t unix_poll_mask(struct socket *, __poll_t); -static __poll_t unix_dgram_poll_mask(struct socket *, __poll_t); +static __poll_t unix_poll(struct file *, struct socket *, poll_table *); +static __poll_t unix_dgram_poll(struct file *, struct socket *, + poll_table *); static int unix_ioctl(struct socket *, unsigned int, unsigned long); static int unix_shutdown(struct socket *, int); static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); @@ -680,7 +681,7 @@ static const struct proto_ops unix_stream_ops = { .socketpair = unix_socketpair, .accept = unix_accept, .getname = unix_getname, - .poll_mask = unix_poll_mask, + .poll = unix_poll, .ioctl = unix_ioctl, .listen = unix_listen, .shutdown = unix_shutdown, @@ -703,7 +704,7 @@ static const struct proto_ops unix_dgram_ops = { .socketpair = unix_socketpair, .accept = sock_no_accept, .getname = unix_getname, - .poll_mask = unix_dgram_poll_mask, + .poll = unix_dgram_poll, .ioctl = unix_ioctl, .listen = sock_no_listen, .shutdown = unix_shutdown, @@ -725,7 +726,7 @@ static const struct proto_ops unix_seqpacket_ops = { .socketpair = unix_socketpair, .accept = unix_accept, .getname = unix_getname, - .poll_mask = unix_dgram_poll_mask, + .poll = unix_dgram_poll, .ioctl = unix_ioctl, .listen = unix_listen, .shutdown = unix_shutdown, @@ -2629,10 +2630,13 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return err; } -static __poll_t unix_poll_mask(struct socket *sock, __poll_t events) +static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - __poll_t mask = 0; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; /* exceptional events? */ if (sk->sk_err) @@ -2661,11 +2665,15 @@ static __poll_t unix_poll_mask(struct socket *sock, __poll_t events) return mask; } -static __poll_t unix_dgram_poll_mask(struct socket *sock, __poll_t events) +static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk, *other; - int writable; - __poll_t mask = 0; + unsigned int writable; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; /* exceptional events? */ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) @@ -2691,7 +2699,7 @@ static __poll_t unix_dgram_poll_mask(struct socket *sock, __poll_t events) } /* No write status requested, avoid expensive OUT tests. */ - if (!(events & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) + if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) return mask; writable = unix_writable(sk); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index bb5d5fa68c35..c1076c19b858 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -850,11 +850,18 @@ static int vsock_shutdown(struct socket *sock, int mode) return err; } -static __poll_t vsock_poll_mask(struct socket *sock, __poll_t events) +static __poll_t vsock_poll(struct file *file, struct socket *sock, + poll_table *wait) { - struct sock *sk = sock->sk; - struct vsock_sock *vsk = vsock_sk(sk); - __poll_t mask = 0; + struct sock *sk; + __poll_t mask; + struct vsock_sock *vsk; + + sk = sock->sk; + vsk = vsock_sk(sk); + + poll_wait(file, sk_sleep(sk), wait); + mask = 0; if (sk->sk_err) /* Signify that there has been an error on this socket. */ @@ -1084,7 +1091,7 @@ static const struct proto_ops vsock_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = vsock_getname, - .poll_mask = vsock_poll_mask, + .poll = vsock_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = vsock_shutdown, @@ -1842,7 +1849,7 @@ static const struct proto_ops vsock_stream_ops = { .socketpair = sock_no_socketpair, .accept = vsock_accept, .getname = vsock_getname, - .poll_mask = vsock_poll_mask, + .poll = vsock_poll, .ioctl = sock_no_ioctl, .listen = vsock_listen, .shutdown = vsock_shutdown, diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index f93365ae0fdd..d49aa79b7997 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1750,7 +1750,7 @@ static const struct proto_ops x25_proto_ops = { .socketpair = sock_no_socketpair, .accept = x25_accept, .getname = x25_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = x25_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_x25_ioctl, diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 3b3410ada097..59fb7d3c36a3 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -303,9 +303,10 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len); } -static __poll_t xsk_poll_mask(struct socket *sock, __poll_t events) +static unsigned int xsk_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) { - __poll_t mask = datagram_poll_mask(sock, events); + unsigned int mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); @@ -696,7 +697,7 @@ static const struct proto_ops xsk_proto_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = xsk_poll_mask, + .poll = xsk_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, -- cgit v1.2.3 From 4c79579b44b1876444f4d04de31c1a37098a0350 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 26 Jun 2018 16:21:18 -0700 Subject: bpf: Change bpf_fib_lookup to return lookup status For ACLs implemented using either FIB rules or FIB entries, the BPF program needs the FIB lookup status to be able to drop the packet. Since the bpf_fib_lookup API has not reached a released kernel yet, change the return code to contain an encoding of the FIB lookup result and return the nexthop device index in the params struct. In addition, inform the BPF program of any post FIB lookup reason as to why the packet needs to go up the stack. The fib result for unicast routes must have an egress device, so remove the check that it is non-NULL. Signed-off-by: David Ahern Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 28 ++++++++++++--- net/core/filter.c | 86 +++++++++++++++++++++++++++++----------------- samples/bpf/xdp_fwd_kern.c | 8 ++--- 3 files changed, 81 insertions(+), 41 deletions(-) (limited to 'net/core') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 59b19b6a40d7..b7db3261c62d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1857,7 +1857,8 @@ union bpf_attr { * is resolved), the nexthop address is returned in ipv4_dst * or ipv6_dst based on family, smac is set to mac address of * egress device, dmac is set to nexthop mac address, rt_metric - * is set to metric from route (IPv4/IPv6 only). + * is set to metric from route (IPv4/IPv6 only), and ifindex + * is set to the device index of the nexthop from the FIB lookup. * * *plen* argument is the size of the passed in struct. * *flags* argument can be a combination of one or more of the @@ -1873,9 +1874,10 @@ union bpf_attr { * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. * Return - * Egress device index on success, 0 if packet needs to continue - * up the stack for further processing or a negative error in case - * of failure. + * * < 0 if any input argument is invalid + * * 0 on success (packet is forwarded, nexthop neighbor exists) + * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the + * * packet is not forwarded or needs assist from full stack * * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) * Description @@ -2612,6 +2614,18 @@ struct bpf_raw_tracepoint_args { #define BPF_FIB_LOOKUP_DIRECT BIT(0) #define BPF_FIB_LOOKUP_OUTPUT BIT(1) +enum { + BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ + BPF_FIB_LKUP_RET_BLACKHOLE, /* dest is blackholed; can be dropped */ + BPF_FIB_LKUP_RET_UNREACHABLE, /* dest is unreachable; can be dropped */ + BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed; can be dropped */ + BPF_FIB_LKUP_RET_NOT_FWDED, /* packet is not forwarded */ + BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */ + BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */ + BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */ + BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */ +}; + struct bpf_fib_lookup { /* input: network family for lookup (AF_INET, AF_INET6) * output: network family of egress nexthop @@ -2625,7 +2639,11 @@ struct bpf_fib_lookup { /* total length of packet from network header - used for MTU check */ __u16 tot_len; - __u32 ifindex; /* L3 device index for lookup */ + + /* input: L3 device index for lookup + * output: device index from FIB lookup + */ + __u32 ifindex; union { /* inputs to lookup */ diff --git a/net/core/filter.c b/net/core/filter.c index e7f12e9f598c..0ca6907d7efe 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4073,8 +4073,9 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, memcpy(params->smac, dev->dev_addr, ETH_ALEN); params->h_vlan_TCI = 0; params->h_vlan_proto = 0; + params->ifindex = dev->ifindex; - return dev->ifindex; + return 0; } #endif @@ -4098,7 +4099,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, /* verify forwarding is enabled on this interface */ in_dev = __in_dev_get_rcu(dev); if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) - return 0; + return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { fl4.flowi4_iif = 1; @@ -4123,7 +4124,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, tb = fib_get_table(net, tbid); if (unlikely(!tb)) - return 0; + return BPF_FIB_LKUP_RET_NOT_FWDED; err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); } else { @@ -4135,8 +4136,20 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); } - if (err || res.type != RTN_UNICAST) - return 0; + if (err) { + /* map fib lookup errors to RTN_ type */ + if (err == -EINVAL) + return BPF_FIB_LKUP_RET_BLACKHOLE; + if (err == -EHOSTUNREACH) + return BPF_FIB_LKUP_RET_UNREACHABLE; + if (err == -EACCES) + return BPF_FIB_LKUP_RET_PROHIBIT; + + return BPF_FIB_LKUP_RET_NOT_FWDED; + } + + if (res.type != RTN_UNICAST) + return BPF_FIB_LKUP_RET_NOT_FWDED; if (res.fi->fib_nhs > 1) fib_select_path(net, &res, &fl4, NULL); @@ -4144,19 +4157,16 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (check_mtu) { mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); if (params->tot_len > mtu) - return 0; + return BPF_FIB_LKUP_RET_FRAG_NEEDED; } nh = &res.fi->fib_nh[res.nh_sel]; /* do not handle lwt encaps right now */ if (nh->nh_lwtstate) - return 0; + return BPF_FIB_LKUP_RET_UNSUPP_LWT; dev = nh->nh_dev; - if (unlikely(!dev)) - return 0; - if (nh->nh_gw) params->ipv4_dst = nh->nh_gw; @@ -4166,10 +4176,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, * rcu_read_lock_bh is not needed here */ neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); - if (neigh) - return bpf_fib_set_fwd_params(params, neigh, dev); + if (!neigh) + return BPF_FIB_LKUP_RET_NO_NEIGH; - return 0; + return bpf_fib_set_fwd_params(params, neigh, dev); } #endif @@ -4190,7 +4200,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, /* link local addresses are never forwarded */ if (rt6_need_strict(dst) || rt6_need_strict(src)) - return 0; + return BPF_FIB_LKUP_RET_NOT_FWDED; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) @@ -4198,7 +4208,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, idev = __in6_dev_get_safely(dev); if (unlikely(!idev || !net->ipv6.devconf_all->forwarding)) - return 0; + return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { fl6.flowi6_iif = 1; @@ -4225,7 +4235,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, tb = ipv6_stub->fib6_get_table(net, tbid); if (unlikely(!tb)) - return 0; + return BPF_FIB_LKUP_RET_NOT_FWDED; f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict); } else { @@ -4238,11 +4248,23 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, } if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry)) - return 0; + return BPF_FIB_LKUP_RET_NOT_FWDED; + + if (unlikely(f6i->fib6_flags & RTF_REJECT)) { + switch (f6i->fib6_type) { + case RTN_BLACKHOLE: + return BPF_FIB_LKUP_RET_BLACKHOLE; + case RTN_UNREACHABLE: + return BPF_FIB_LKUP_RET_UNREACHABLE; + case RTN_PROHIBIT: + return BPF_FIB_LKUP_RET_PROHIBIT; + default: + return BPF_FIB_LKUP_RET_NOT_FWDED; + } + } - if (unlikely(f6i->fib6_flags & RTF_REJECT || - f6i->fib6_type != RTN_UNICAST)) - return 0; + if (f6i->fib6_type != RTN_UNICAST) + return BPF_FIB_LKUP_RET_NOT_FWDED; if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0) f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6, @@ -4252,11 +4274,11 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (check_mtu) { mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src); if (params->tot_len > mtu) - return 0; + return BPF_FIB_LKUP_RET_FRAG_NEEDED; } if (f6i->fib6_nh.nh_lwtstate) - return 0; + return BPF_FIB_LKUP_RET_UNSUPP_LWT; if (f6i->fib6_flags & RTF_GATEWAY) *dst = f6i->fib6_nh.nh_gw; @@ -4270,10 +4292,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, */ neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, ndisc_hashfn, dst, dev); - if (neigh) - return bpf_fib_set_fwd_params(params, neigh, dev); + if (!neigh) + return BPF_FIB_LKUP_RET_NO_NEIGH; - return 0; + return bpf_fib_set_fwd_params(params, neigh, dev); } #endif @@ -4315,7 +4337,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, struct bpf_fib_lookup *, params, int, plen, u32, flags) { struct net *net = dev_net(skb->dev); - int index = -EAFNOSUPPORT; + int rc = -EAFNOSUPPORT; if (plen < sizeof(*params)) return -EINVAL; @@ -4326,25 +4348,25 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: - index = bpf_ipv4_fib_lookup(net, params, flags, false); + rc = bpf_ipv4_fib_lookup(net, params, flags, false); break; #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - index = bpf_ipv6_fib_lookup(net, params, flags, false); + rc = bpf_ipv6_fib_lookup(net, params, flags, false); break; #endif } - if (index > 0) { + if (!rc) { struct net_device *dev; - dev = dev_get_by_index_rcu(net, index); + dev = dev_get_by_index_rcu(net, params->ifindex); if (!is_skb_forwardable(dev, skb)) - index = 0; + rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; } - return index; + return rc; } static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c index 6673cdb9f55c..a7e94e7ff87d 100644 --- a/samples/bpf/xdp_fwd_kern.c +++ b/samples/bpf/xdp_fwd_kern.c @@ -48,9 +48,9 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags) struct ethhdr *eth = data; struct ipv6hdr *ip6h; struct iphdr *iph; - int out_index; u16 h_proto; u64 nh_off; + int rc; nh_off = sizeof(*eth); if (data + nh_off > data_end) @@ -101,7 +101,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags) fib_params.ifindex = ctx->ingress_ifindex; - out_index = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags); + rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags); /* verify egress index has xdp support * TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with @@ -109,7 +109,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags) * NOTE: without verification that egress index supports XDP * forwarding packets are dropped. */ - if (out_index > 0) { + if (rc == 0) { if (h_proto == htons(ETH_P_IP)) ip_decrease_ttl(iph); else if (h_proto == htons(ETH_P_IPV6)) @@ -117,7 +117,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags) memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN); memcpy(eth->h_source, fib_params.smac, ETH_ALEN); - return bpf_redirect_map(&tx_port, out_index, 0); + return bpf_redirect_map(&tx_port, fib_params.ifindex, 0); } return XDP_PASS; -- cgit v1.2.3 From e699e2c6a654ff8d7303f5297ab5dd83da7b23e0 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 27 Jun 2018 15:16:42 -0700 Subject: net, mm: account sock objects to kmemcg Currently the kernel accounts the memory for network traffic through mem_cgroup_[un]charge_skmem() interface. However the memory accounted only includes the truesize of sk_buff which does not include the size of sock objects. In our production environment, with opt-out kmem accounting, the sock kmem caches (TCP[v6], UDP[v6], RAW[v6], UNIX) are among the top most charged kmem caches and consume a significant amount of memory which can not be left as system overhead. So, this patch converts the kmem caches of all sock objects to SLAB_ACCOUNT. Signed-off-by: Shakeel Butt Suggested-by: Eric Dumazet Reviewed-by: Kirill Tkhai Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index bcc41829a16d..9e8f65585b81 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3243,7 +3243,8 @@ static int req_prot_init(const struct proto *prot) rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, rsk_prot->obj_size, 0, - prot->slab_flags, NULL); + SLAB_ACCOUNT | prot->slab_flags, + NULL); if (!rsk_prot->slab) { pr_crit("%s: Can't create request sock SLAB cache!\n", @@ -3258,7 +3259,8 @@ int proto_register(struct proto *prot, int alloc_slab) if (alloc_slab) { prot->slab = kmem_cache_create_usercopy(prot->name, prot->obj_size, 0, - SLAB_HWCACHE_ALIGN | prot->slab_flags, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | + prot->slab_flags, prot->useroffset, prot->usersize, NULL); @@ -3281,6 +3283,7 @@ int proto_register(struct proto *prot, int alloc_slab) kmem_cache_create(prot->twsk_prot->twsk_slab_name, prot->twsk_prot->twsk_obj_size, 0, + SLAB_ACCOUNT | prot->slab_flags, NULL); if (prot->twsk_prot->twsk_slab == NULL) -- cgit v1.2.3 From d14b56f508ad70eca3e659545aab3c45200f258c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 28 Jun 2018 17:53:06 +0200 Subject: net: cleanup gfp mask in alloc_skb_with_frags alloc_skb_with_frags uses __GFP_NORETRY for non-sleeping allocations which is just a noop and a little bit confusing. __GFP_NORETRY was added by ed98df3361f0 ("net: use __GFP_NORETRY for high order allocations") to prevent from the OOM killer. Yet this was not enough because fb05e7a89f50 ("net: don't wait for order-3 page allocation") didn't want an excessive reclaim for non-costly orders so it made it completely NOWAIT while it preserved __GFP_NORETRY in place which is now redundant. Drop the pointless __GFP_NORETRY because this function is used as copy&paste source for other places. Reviewed-by: Eric Dumazet Signed-off-by: Michal Hocko Signed-off-by: David S. Miller --- net/core/skbuff.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c642304f178c..eba8dae22c25 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5276,8 +5276,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, if (npages >= 1 << order) { page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | - __GFP_NOWARN | - __GFP_NORETRY, + __GFP_NOWARN, order); if (page) goto fill_page; -- cgit v1.2.3 From 3f76df198288ceec92fc9eddecad1e73c52769b0 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 29 Jun 2018 13:42:48 -0700 Subject: net: use dev_change_tx_queue_len() for SIOCSIFTXQLEN As noticed by Eric, we need to switch to the helper dev_change_tx_queue_len() for SIOCSIFTXQLEN call path too, otheriwse still miss dev_qdisc_change_tx_queue_len(). Fixes: 6a643ddb5624 ("net: introduce helper dev_change_tx_queue_len()") Reported-by: Eric Dumazet Signed-off-by: Cong Wang Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev_ioctl.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index a04e1e88bf3a..50537ff961a7 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -285,16 +285,9 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) if (ifr->ifr_qlen < 0) return -EINVAL; if (dev->tx_queue_len ^ ifr->ifr_qlen) { - unsigned int orig_len = dev->tx_queue_len; - - dev->tx_queue_len = ifr->ifr_qlen; - err = call_netdevice_notifiers( - NETDEV_CHANGE_TX_QUEUE_LEN, dev); - err = notifier_to_errno(err); - if (err) { - dev->tx_queue_len = orig_len; + err = dev_change_tx_queue_len(dev, ifr->ifr_qlen); + if (err) return err; - } } return 0; -- cgit v1.2.3 From 35e8c7ba0863acadd4b9badd09740af7a8331125 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Fri, 29 Jun 2018 14:32:15 -0700 Subject: net: fib_rules: bring back rule_exists to match rule during add After commit f9d4b0c1e969 ("fib_rules: move common handling of newrule delrule msgs into fib_nl2rule"), rule_exists got replaced by rule_find for existing rule lookup in both the add and del paths. While this is good for the delete path, it solves a few problems but opens up a few invalid key matches in the add path. $ip -4 rule add table main tos 10 fwmark 1 $ip -4 rule add table main tos 10 RTNETLINK answers: File exists The problem here is rule_find does not check if the key masks in the new and old rule are the same and hence ends up matching a more secific rule. Rule key masks cannot be easily compared today without an elaborate if-else block. Its best to introduce key masks for easier and accurate rule comparison in the future. Until then, due to fear of regressions this patch re-introduces older loose rule_exists during add. Also fixes both rule_exists and rule_find to cover missing attributes. Fixes: f9d4b0c1e969 ("fib_rules: move common handling of newrule delrule msgs into fib_nl2rule") Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/core/fib_rules.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index bc8425d81022..f64aa13811ea 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -444,6 +444,9 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops, if (rule->ip_proto && r->ip_proto != rule->ip_proto) continue; + if (rule->proto && r->proto != rule->proto) + continue; + if (fib_rule_port_range_set(&rule->sport_range) && !fib_rule_port_range_compare(&r->sport_range, &rule->sport_range)) @@ -653,6 +656,73 @@ errout: return err; } +static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, + struct nlattr **tb, struct fib_rule *rule) +{ + struct fib_rule *r; + + list_for_each_entry(r, &ops->rules_list, list) { + if (r->action != rule->action) + continue; + + if (r->table != rule->table) + continue; + + if (r->pref != rule->pref) + continue; + + if (memcmp(r->iifname, rule->iifname, IFNAMSIZ)) + continue; + + if (memcmp(r->oifname, rule->oifname, IFNAMSIZ)) + continue; + + if (r->mark != rule->mark) + continue; + + if (r->suppress_ifgroup != rule->suppress_ifgroup) + continue; + + if (r->suppress_prefixlen != rule->suppress_prefixlen) + continue; + + if (r->mark_mask != rule->mark_mask) + continue; + + if (r->tun_id != rule->tun_id) + continue; + + if (r->fr_net != rule->fr_net) + continue; + + if (r->l3mdev != rule->l3mdev) + continue; + + if (!uid_eq(r->uid_range.start, rule->uid_range.start) || + !uid_eq(r->uid_range.end, rule->uid_range.end)) + continue; + + if (r->ip_proto != rule->ip_proto) + continue; + + if (r->proto != rule->proto) + continue; + + if (!fib_rule_port_range_compare(&r->sport_range, + &rule->sport_range)) + continue; + + if (!fib_rule_port_range_compare(&r->dport_range, + &rule->dport_range)) + continue; + + if (!ops->compare(r, frh, tb)) + continue; + return 1; + } + return 0; +} + int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -687,7 +757,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; if ((nlh->nlmsg_flags & NLM_F_EXCL) && - rule_find(ops, frh, tb, rule, user_priority)) { + rule_exists(ops, frh, tb, rule)) { err = -EEXIST; goto errout_free; } -- cgit v1.2.3