From 8c43bd1706885ba1acfa88da02bc60a2ec16f68c Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 18 Jun 2018 12:30:37 -0700
Subject: net/tcp: Fix socket lookups with SO_BINDTODEVICE

Similar to 69678bcd4d2d ("udp: fix SO_BINDTODEVICE"), TCP socket lookups
need to fail if dev_match is not true. Currently, a packet to a given port
can match a socket bound to device when it should not. In the VRF case,
this causes the lookup to hit a VRF socket and not a global socket
resulting in a response trying to go through the VRF when it should not.

Fixes: 3fa6f616a7a4d ("net: ipv4: add second dif to inet socket lookups")
Fixes: 4297a0ef08572 ("net: ipv6: add second dif to inet6 socket lookups")
Reported-by: Lou Berger <lberger@labn.net>
Diagnosed-by: Renato Westphal <renato@opensourcerouting.org>
Tested-by: Renato Westphal <renato@opensourcerouting.org>
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_hashtables.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 31ff46daae97..3647167c8fa3 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -243,9 +243,9 @@ static inline int compute_score(struct sock *sk, struct net *net,
 			bool dev_match = (sk->sk_bound_dev_if == dif ||
 					  sk->sk_bound_dev_if == sdif);
 
-			if (exact_dif && !dev_match)
+			if (!dev_match)
 				return -1;
-			if (sk->sk_bound_dev_if && dev_match)
+			if (sk->sk_bound_dev_if)
 				score += 4;
 		}
 		if (sk->sk_incoming_cpu == raw_smp_processor_id())
-- 
cgit v1.2.3


From 9887cba19978a5f288100ef90a37684cc8d5e0a6 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Tue, 19 Jun 2018 12:47:52 -0400
Subject: ip: limit use of gso_size to udp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ipcm(6)_cookie field gso_size is set only in the udp path. The ip
layer copies this to cork only if sk_type is SOCK_DGRAM. This check
proved too permissive. Ping and l2tp sockets have the same type.

Limit to sockets of type SOCK_DGRAM and protocol IPPROTO_UDP to
exclude ping sockets.

v1 -> v2
- remove irrelevant whitespace changes

Fixes: bec1f6f69736 ("udp: generate gso with UDP_SEGMENT")
Reported-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c  | 3 ++-
 net/ipv6/ip6_output.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index af5a830ff6ad..b3308e9d9762 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1145,7 +1145,8 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
 	cork->fragsize = ip_sk_use_pmtu(sk) ?
 			 dst_mtu(&rt->dst) : rt->dst.dev->mtu;
 
-	cork->gso_size = sk->sk_type == SOCK_DGRAM ? ipc->gso_size : 0;
+	cork->gso_size = sk->sk_type == SOCK_DGRAM &&
+			 sk->sk_protocol == IPPROTO_UDP ? ipc->gso_size : 0;
 	cork->dst = &rt->dst;
 	cork->length = 0;
 	cork->ttl = ipc->ttl;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 021e5aef6ba3..a14fb4fcdf18 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1219,7 +1219,8 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
 	if (mtu < IPV6_MIN_MTU)
 		return -EINVAL;
 	cork->base.fragsize = mtu;
-	cork->base.gso_size = sk->sk_type == SOCK_DGRAM ? ipc6->gso_size : 0;
+	cork->base.gso_size = sk->sk_type == SOCK_DGRAM &&
+			      sk->sk_protocol == IPPROTO_UDP ? ipc6->gso_size : 0;
 
 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
 		cork->base.flags |= IPCORK_ALLFRAG;
-- 
cgit v1.2.3


From 15ecbe94a45ef88491ca459b26efdd02f91edb6d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 27 Jun 2018 08:47:21 -0700
Subject: tcp: add one more quick ack after after ECN events

Larry Brakmo proposal ( https://patchwork.ozlabs.org/patch/935233/
tcp: force cwnd at least 2 in tcp_cwnd_reduction) made us rethink
about our recent patch removing ~16 quick acks after ECN events.

tcp_enter_quickack_mode(sk, 1) makes sure one immediate ack is sent,
but in the case the sender cwnd was lowered to 1, we do not want
to have a delayed ack for the next packet we will receive.

Fixes: 522040ea5fdd ("tcp: do not aggressively quick ack after ECN events")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Neal Cardwell <ncardwell@google.com>
Cc: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 355d3dffd021..045d930d01a9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -265,7 +265,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
 		 * it is probably a retransmit.
 		 */
 		if (tp->ecn_flags & TCP_ECN_SEEN)
-			tcp_enter_quickack_mode(sk, 1);
+			tcp_enter_quickack_mode(sk, 2);
 		break;
 	case INET_ECN_CE:
 		if (tcp_ca_needs_ecn(sk))
@@ -273,7 +273,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
 
 		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
 			/* Better not delay acks, sender can have a very low cwnd */
-			tcp_enter_quickack_mode(sk, 1);
+			tcp_enter_quickack_mode(sk, 2);
 			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
 		}
 		tp->ecn_flags |= TCP_ECN_SEEN;
-- 
cgit v1.2.3


From a11e1d432b51f63ba698d044441284a661f01144 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 28 Jun 2018 09:43:44 -0700
Subject: Revert changes to convert to ->poll_mask() and aio IOCB_CMD_POLL

The poll() changes were not well thought out, and completely
unexplained.  They also caused a huge performance regression, because
"->poll()" was no longer a trivial file operation that just called down
to the underlying file operations, but instead did at least two indirect
calls.

Indirect calls are sadly slow now with the Spectre mitigation, but the
performance problem could at least be largely mitigated by changing the
"->get_poll_head()" operation to just have a per-file-descriptor pointer
to the poll head instead.  That gets rid of one of the new indirections.

But that doesn't fix the new complexity that is completely unwarranted
for the regular case.  The (undocumented) reason for the poll() changes
was some alleged AIO poll race fixing, but we don't make the common case
slower and more complex for some uncommon special case, so this all
really needs way more explanations and most likely a fundamental
redesign.

[ This revert is a revert of about 30 different commits, not reverted
  individually because that would just be unnecessarily messy  - Linus ]

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/Locking |   7 +-
 Documentation/filesystems/vfs.txt |  13 ----
 crypto/af_alg.c                   |  13 +++-
 crypto/algif_aead.c               |   4 +-
 crypto/algif_skcipher.c           |   4 +-
 drivers/char/random.c             |  29 ++++----
 drivers/isdn/mISDN/socket.c       |   2 +-
 drivers/net/ppp/pppoe.c           |   2 +-
 fs/aio.c                          | 148 +-------------------------------------
 fs/eventfd.c                      |  19 ++---
 fs/eventpoll.c                    |  15 ++--
 fs/pipe.c                         |  22 +++---
 fs/select.c                       |  23 ------
 fs/timerfd.c                      |  22 +++---
 include/crypto/if_alg.h           |   3 +-
 include/linux/fs.h                |   2 -
 include/linux/net.h               |   1 -
 include/linux/poll.h              |  12 ++--
 include/linux/skbuff.h            |   3 +-
 include/net/bluetooth/bluetooth.h |   2 +-
 include/net/iucv/af_iucv.h        |   2 +
 include/net/sctp/sctp.h           |   3 +-
 include/net/tcp.h                 |   3 +-
 include/net/tls.h                 |   6 +-
 include/net/udp.h                 |   2 +-
 include/uapi/linux/aio_abi.h      |   8 ++-
 net/appletalk/ddp.c               |   2 +-
 net/atm/common.c                  |  11 ++-
 net/atm/common.h                  |   2 +-
 net/atm/pvc.c                     |   2 +-
 net/atm/svc.c                     |   2 +-
 net/ax25/af_ax25.c                |   2 +-
 net/bluetooth/af_bluetooth.c      |   7 +-
 net/bluetooth/hci_sock.c          |   2 +-
 net/bluetooth/l2cap_sock.c        |   2 +-
 net/bluetooth/rfcomm/sock.c       |   2 +-
 net/bluetooth/sco.c               |   2 +-
 net/caif/caif_socket.c            |  12 ++--
 net/can/bcm.c                     |   2 +-
 net/can/raw.c                     |   2 +-
 net/core/datagram.c               |  13 ++--
 net/dccp/dccp.h                   |   3 +-
 net/dccp/ipv4.c                   |   2 +-
 net/dccp/ipv6.c                   |   2 +-
 net/dccp/proto.c                  |  13 +++-
 net/decnet/af_decnet.c            |   6 +-
 net/ieee802154/socket.c           |   4 +-
 net/ipv4/af_inet.c                |   8 +--
 net/ipv4/tcp.c                    |  23 ++++--
 net/ipv4/udp.c                    |  10 +--
 net/ipv6/af_inet6.c               |   4 +-
 net/ipv6/raw.c                    |   4 +-
 net/iucv/af_iucv.c                |   7 +-
 net/kcm/kcmsock.c                 |  10 +--
 net/key/af_key.c                  |   2 +-
 net/l2tp/l2tp_ip.c                |   2 +-
 net/l2tp/l2tp_ip6.c               |   2 +-
 net/l2tp/l2tp_ppp.c               |   2 +-
 net/llc/af_llc.c                  |   2 +-
 net/netlink/af_netlink.c          |   2 +-
 net/netrom/af_netrom.c            |   2 +-
 net/nfc/llcp_sock.c               |   9 ++-
 net/nfc/rawsock.c                 |   4 +-
 net/packet/af_packet.c            |   9 +--
 net/phonet/socket.c               |   9 ++-
 net/qrtr/qrtr.c                   |   2 +-
 net/rose/af_rose.c                |   2 +-
 net/rxrpc/af_rxrpc.c              |  10 ++-
 net/sctp/ipv6.c                   |   2 +-
 net/sctp/protocol.c               |   2 +-
 net/sctp/socket.c                 |   4 +-
 net/smc/af_smc.c                  |  12 +++-
 net/socket.c                      |  48 ++-----------
 net/tipc/socket.c                 |  14 ++--
 net/tls/tls_main.c                |   2 +-
 net/tls/tls_sw.c                  |  19 ++---
 net/unix/af_unix.c                |  30 +++++---
 net/vmw_vsock/af_vsock.c          |  19 +++--
 net/x25/af_x25.c                  |   2 +-
 net/xdp/xsk.c                     |   7 +-
 80 files changed, 301 insertions(+), 450 deletions(-)

(limited to 'net/ipv4')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 2c391338c675..37bf0a9de75c 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -441,8 +441,6 @@ prototypes:
 	int (*iterate) (struct file *, struct dir_context *);
 	int (*iterate_shared) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
-	struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t);
-	__poll_t (*poll_mask) (struct file *, __poll_t);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
@@ -473,7 +471,7 @@ prototypes:
 };
 
 locking rules:
-	All except for ->poll_mask may block.
+	All may block.
 
 ->llseek() locking has moved from llseek to the individual llseek
 implementations.  If your fs is not using generic_file_llseek, you
@@ -505,9 +503,6 @@ in sys_read() and friends.
 the lease within the individual filesystem to record the result of the
 operation
 
-->poll_mask can be called with or without the waitqueue lock for the waitqueue
-returned from ->get_poll_head.
-
 --------------------------- dquot_operations -------------------------------
 prototypes:
 	int (*write_dquot) (struct dquot *);
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 829a7b7857a4..f608180ad59d 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -857,8 +857,6 @@ struct file_operations {
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
 	int (*iterate) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
-	struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t);
-	__poll_t (*poll_mask) (struct file *, __poll_t);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
@@ -903,17 +901,6 @@ otherwise noted.
 	activity on this file and (optionally) go to sleep until there
 	is activity. Called by the select(2) and poll(2) system calls
 
-  get_poll_head: Returns the struct wait_queue_head that callers can
-  wait on.  Callers need to check the returned events using ->poll_mask
-  once woken.  Can return NULL to indicate polling is not supported,
-  or any error code using the ERR_PTR convention to indicate that a
-  grave error occured and ->poll_mask shall not be called.
-
-  poll_mask: return the mask of EPOLL* values describing the file descriptor
-  state.  Called either before going to sleep on the waitqueue returned by
-  get_poll_head, or after it has been woken.  If ->get_poll_head and
-  ->poll_mask are implemented ->poll does not need to be implement.
-
   unlocked_ioctl: called by the ioctl(2) system call.
 
   compat_ioctl: called by the ioctl(2) system call when 32 bit system calls
diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index 49fa8582138b..314c52c967e5 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -1060,12 +1060,19 @@ void af_alg_async_cb(struct crypto_async_request *_req, int err)
 }
 EXPORT_SYMBOL_GPL(af_alg_async_cb);
 
-__poll_t af_alg_poll_mask(struct socket *sock, __poll_t events)
+/**
+ * af_alg_poll - poll system call handler
+ */
+__poll_t af_alg_poll(struct file *file, struct socket *sock,
+			 poll_table *wait)
 {
 	struct sock *sk = sock->sk;
 	struct alg_sock *ask = alg_sk(sk);
 	struct af_alg_ctx *ctx = ask->private;
-	__poll_t mask = 0;
+	__poll_t mask;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+	mask = 0;
 
 	if (!ctx->more || ctx->used)
 		mask |= EPOLLIN | EPOLLRDNORM;
@@ -1075,7 +1082,7 @@ __poll_t af_alg_poll_mask(struct socket *sock, __poll_t events)
 
 	return mask;
 }
-EXPORT_SYMBOL_GPL(af_alg_poll_mask);
+EXPORT_SYMBOL_GPL(af_alg_poll);
 
 /**
  * af_alg_alloc_areq - allocate struct af_alg_async_req
diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c
index 825524f27438..c40a8c7ee8ae 100644
--- a/crypto/algif_aead.c
+++ b/crypto/algif_aead.c
@@ -375,7 +375,7 @@ static struct proto_ops algif_aead_ops = {
 	.sendmsg	=	aead_sendmsg,
 	.sendpage	=	af_alg_sendpage,
 	.recvmsg	=	aead_recvmsg,
-	.poll_mask	=	af_alg_poll_mask,
+	.poll		=	af_alg_poll,
 };
 
 static int aead_check_key(struct socket *sock)
@@ -471,7 +471,7 @@ static struct proto_ops algif_aead_ops_nokey = {
 	.sendmsg	=	aead_sendmsg_nokey,
 	.sendpage	=	aead_sendpage_nokey,
 	.recvmsg	=	aead_recvmsg_nokey,
-	.poll_mask	=	af_alg_poll_mask,
+	.poll		=	af_alg_poll,
 };
 
 static void *aead_bind(const char *name, u32 type, u32 mask)
diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c
index 4c04eb9888ad..cfdaab2b7d76 100644
--- a/crypto/algif_skcipher.c
+++ b/crypto/algif_skcipher.c
@@ -206,7 +206,7 @@ static struct proto_ops algif_skcipher_ops = {
 	.sendmsg	=	skcipher_sendmsg,
 	.sendpage	=	af_alg_sendpage,
 	.recvmsg	=	skcipher_recvmsg,
-	.poll_mask	=	af_alg_poll_mask,
+	.poll		=	af_alg_poll,
 };
 
 static int skcipher_check_key(struct socket *sock)
@@ -302,7 +302,7 @@ static struct proto_ops algif_skcipher_ops_nokey = {
 	.sendmsg	=	skcipher_sendmsg_nokey,
 	.sendpage	=	skcipher_sendpage_nokey,
 	.recvmsg	=	skcipher_recvmsg_nokey,
-	.poll_mask	=	af_alg_poll_mask,
+	.poll		=	af_alg_poll,
 };
 
 static void *skcipher_bind(const char *name, u32 type, u32 mask)
diff --git a/drivers/char/random.c b/drivers/char/random.c
index a8fb0020ba5c..cd888d4ee605 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -402,7 +402,8 @@ static struct poolinfo {
 /*
  * Static global variables
  */
-static DECLARE_WAIT_QUEUE_HEAD(random_wait);
+static DECLARE_WAIT_QUEUE_HEAD(random_read_wait);
+static DECLARE_WAIT_QUEUE_HEAD(random_write_wait);
 static struct fasync_struct *fasync;
 
 static DEFINE_SPINLOCK(random_ready_list_lock);
@@ -721,8 +722,8 @@ retry:
 
 		/* should we wake readers? */
 		if (entropy_bits >= random_read_wakeup_bits &&
-		    wq_has_sleeper(&random_wait)) {
-			wake_up_interruptible_poll(&random_wait, POLLIN);
+		    wq_has_sleeper(&random_read_wait)) {
+			wake_up_interruptible(&random_read_wait);
 			kill_fasync(&fasync, SIGIO, POLL_IN);
 		}
 		/* If the input pool is getting full, send some
@@ -1396,7 +1397,7 @@ retry:
 	trace_debit_entropy(r->name, 8 * ibytes);
 	if (ibytes &&
 	    (r->entropy_count >> ENTROPY_SHIFT) < random_write_wakeup_bits) {
-		wake_up_interruptible_poll(&random_wait, POLLOUT);
+		wake_up_interruptible(&random_write_wait);
 		kill_fasync(&fasync, SIGIO, POLL_OUT);
 	}
 
@@ -1838,7 +1839,7 @@ _random_read(int nonblock, char __user *buf, size_t nbytes)
 		if (nonblock)
 			return -EAGAIN;
 
-		wait_event_interruptible(random_wait,
+		wait_event_interruptible(random_read_wait,
 			ENTROPY_BITS(&input_pool) >=
 			random_read_wakeup_bits);
 		if (signal_pending(current))
@@ -1875,17 +1876,14 @@ urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 	return ret;
 }
 
-static struct wait_queue_head *
-random_get_poll_head(struct file *file, __poll_t events)
-{
-	return &random_wait;
-}
-
 static __poll_t
-random_poll_mask(struct file *file, __poll_t events)
+random_poll(struct file *file, poll_table * wait)
 {
-	__poll_t mask = 0;
+	__poll_t mask;
 
+	poll_wait(file, &random_read_wait, wait);
+	poll_wait(file, &random_write_wait, wait);
+	mask = 0;
 	if (ENTROPY_BITS(&input_pool) >= random_read_wakeup_bits)
 		mask |= EPOLLIN | EPOLLRDNORM;
 	if (ENTROPY_BITS(&input_pool) < random_write_wakeup_bits)
@@ -1992,8 +1990,7 @@ static int random_fasync(int fd, struct file *filp, int on)
 const struct file_operations random_fops = {
 	.read  = random_read,
 	.write = random_write,
-	.get_poll_head  = random_get_poll_head,
-	.poll_mask  = random_poll_mask,
+	.poll  = random_poll,
 	.unlocked_ioctl = random_ioctl,
 	.fasync = random_fasync,
 	.llseek = noop_llseek,
@@ -2326,7 +2323,7 @@ void add_hwgenerator_randomness(const char *buffer, size_t count,
 	 * We'll be woken up again once below random_write_wakeup_thresh,
 	 * or when the calling thread is about to terminate.
 	 */
-	wait_event_interruptible(random_wait, kthread_should_stop() ||
+	wait_event_interruptible(random_write_wait, kthread_should_stop() ||
 			ENTROPY_BITS(&input_pool) <= random_write_wakeup_bits);
 	mix_pool_bytes(poolp, buffer, count);
 	credit_entropy_bits(poolp, entropy);
diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c
index 98f90aadd141..18c0a1281914 100644
--- a/drivers/isdn/mISDN/socket.c
+++ b/drivers/isdn/mISDN/socket.c
@@ -588,7 +588,7 @@ static const struct proto_ops data_sock_ops = {
 	.getname	= data_sock_getname,
 	.sendmsg	= mISDN_sock_sendmsg,
 	.recvmsg	= mISDN_sock_recvmsg,
-	.poll_mask	= datagram_poll_mask,
+	.poll		= datagram_poll,
 	.listen		= sock_no_listen,
 	.shutdown	= sock_no_shutdown,
 	.setsockopt	= data_sock_setsockopt,
diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index de51e8f70f44..ce61231e96ea 100644
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -1107,7 +1107,7 @@ static const struct proto_ops pppoe_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= sock_no_accept,
 	.getname	= pppoe_getname,
-	.poll_mask	= datagram_poll_mask,
+	.poll		= datagram_poll,
 	.listen		= sock_no_listen,
 	.shutdown	= sock_no_shutdown,
 	.setsockopt	= sock_no_setsockopt,
diff --git a/fs/aio.c b/fs/aio.c
index e1d20124ec0e..210df9da1283 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -5,7 +5,6 @@
  *	Implements an efficient asynchronous io interface.
  *
  *	Copyright 2000, 2001, 2002 Red Hat, Inc.  All Rights Reserved.
- *	Copyright 2018 Christoph Hellwig.
  *
  *	See ../COPYING for licensing terms.
  */
@@ -165,22 +164,10 @@ struct fsync_iocb {
 	bool			datasync;
 };
 
-struct poll_iocb {
-	struct file		*file;
-	__poll_t		events;
-	struct wait_queue_head	*head;
-
-	union {
-		struct wait_queue_entry	wait;
-		struct work_struct	work;
-	};
-};
-
 struct aio_kiocb {
 	union {
 		struct kiocb		rw;
 		struct fsync_iocb	fsync;
-		struct poll_iocb	poll;
 	};
 
 	struct kioctx		*ki_ctx;
@@ -1590,6 +1577,7 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
 	if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes ||
 			iocb->aio_rw_flags))
 		return -EINVAL;
+
 	req->file = fget(iocb->aio_fildes);
 	if (unlikely(!req->file))
 		return -EBADF;
@@ -1604,137 +1592,6 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
 	return 0;
 }
 
-/* need to use list_del_init so we can check if item was present */
-static inline bool __aio_poll_remove(struct poll_iocb *req)
-{
-	if (list_empty(&req->wait.entry))
-		return false;
-	list_del_init(&req->wait.entry);
-	return true;
-}
-
-static inline void __aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
-{
-	fput(iocb->poll.file);
-	aio_complete(iocb, mangle_poll(mask), 0);
-}
-
-static void aio_poll_work(struct work_struct *work)
-{
-	struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, poll.work);
-
-	if (!list_empty_careful(&iocb->ki_list))
-		aio_remove_iocb(iocb);
-	__aio_poll_complete(iocb, iocb->poll.events);
-}
-
-static int aio_poll_cancel(struct kiocb *iocb)
-{
-	struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
-	struct poll_iocb *req = &aiocb->poll;
-	struct wait_queue_head *head = req->head;
-	bool found = false;
-
-	spin_lock(&head->lock);
-	found = __aio_poll_remove(req);
-	spin_unlock(&head->lock);
-
-	if (found) {
-		req->events = 0;
-		INIT_WORK(&req->work, aio_poll_work);
-		schedule_work(&req->work);
-	}
-	return 0;
-}
-
-static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
-		void *key)
-{
-	struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
-	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
-	struct file *file = req->file;
-	__poll_t mask = key_to_poll(key);
-
-	assert_spin_locked(&req->head->lock);
-
-	/* for instances that support it check for an event match first: */
-	if (mask && !(mask & req->events))
-		return 0;
-
-	mask = file->f_op->poll_mask(file, req->events) & req->events;
-	if (!mask)
-		return 0;
-
-	__aio_poll_remove(req);
-
-	/*
-	 * Try completing without a context switch if we can acquire ctx_lock
-	 * without spinning.  Otherwise we need to defer to a workqueue to
-	 * avoid a deadlock due to the lock order.
-	 */
-	if (spin_trylock(&iocb->ki_ctx->ctx_lock)) {
-		list_del_init(&iocb->ki_list);
-		spin_unlock(&iocb->ki_ctx->ctx_lock);
-
-		__aio_poll_complete(iocb, mask);
-	} else {
-		req->events = mask;
-		INIT_WORK(&req->work, aio_poll_work);
-		schedule_work(&req->work);
-	}
-
-	return 1;
-}
-
-static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
-{
-	struct kioctx *ctx = aiocb->ki_ctx;
-	struct poll_iocb *req = &aiocb->poll;
-	__poll_t mask;
-
-	/* reject any unknown events outside the normal event mask. */
-	if ((u16)iocb->aio_buf != iocb->aio_buf)
-		return -EINVAL;
-	/* reject fields that are not defined for poll */
-	if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)
-		return -EINVAL;
-
-	req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
-	req->file = fget(iocb->aio_fildes);
-	if (unlikely(!req->file))
-		return -EBADF;
-	if (!file_has_poll_mask(req->file))
-		goto out_fail;
-
-	req->head = req->file->f_op->get_poll_head(req->file, req->events);
-	if (!req->head)
-		goto out_fail;
-	if (IS_ERR(req->head)) {
-		mask = EPOLLERR;
-		goto done;
-	}
-
-	init_waitqueue_func_entry(&req->wait, aio_poll_wake);
-	aiocb->ki_cancel = aio_poll_cancel;
-
-	spin_lock_irq(&ctx->ctx_lock);
-	spin_lock(&req->head->lock);
-	mask = req->file->f_op->poll_mask(req->file, req->events) & req->events;
-	if (!mask) {
-		__add_wait_queue(req->head, &req->wait);
-		list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
-	}
-	spin_unlock(&req->head->lock);
-	spin_unlock_irq(&ctx->ctx_lock);
-done:
-	if (mask)
-		__aio_poll_complete(aiocb, mask);
-	return 0;
-out_fail:
-	fput(req->file);
-	return -EINVAL; /* same as no support for IOCB_CMD_POLL */
-}
-
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 			 bool compat)
 {
@@ -1808,9 +1665,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	case IOCB_CMD_FDSYNC:
 		ret = aio_fsync(&req->fsync, &iocb, true);
 		break;
-	case IOCB_CMD_POLL:
-		ret = aio_poll(req, &iocb);
-		break;
 	default:
 		pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode);
 		ret = -EINVAL;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index ceb1031f1cac..08d3bd602f73 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -101,20 +101,14 @@ static int eventfd_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static struct wait_queue_head *
-eventfd_get_poll_head(struct file *file, __poll_t events)
-{
-	struct eventfd_ctx *ctx = file->private_data;
-
-	return &ctx->wqh;
-}
-
-static __poll_t eventfd_poll_mask(struct file *file, __poll_t eventmask)
+static __poll_t eventfd_poll(struct file *file, poll_table *wait)
 {
 	struct eventfd_ctx *ctx = file->private_data;
 	__poll_t events = 0;
 	u64 count;
 
+	poll_wait(file, &ctx->wqh, wait);
+
 	/*
 	 * All writes to ctx->count occur within ctx->wqh.lock.  This read
 	 * can be done outside ctx->wqh.lock because we know that poll_wait
@@ -156,11 +150,11 @@ static __poll_t eventfd_poll_mask(struct file *file, __poll_t eventmask)
 	count = READ_ONCE(ctx->count);
 
 	if (count > 0)
-		events |= (EPOLLIN & eventmask);
+		events |= EPOLLIN;
 	if (count == ULLONG_MAX)
 		events |= EPOLLERR;
 	if (ULLONG_MAX - 1 > count)
-		events |= (EPOLLOUT & eventmask);
+		events |= EPOLLOUT;
 
 	return events;
 }
@@ -311,8 +305,7 @@ static const struct file_operations eventfd_fops = {
 	.show_fdinfo	= eventfd_show_fdinfo,
 #endif
 	.release	= eventfd_release,
-	.get_poll_head	= eventfd_get_poll_head,
-	.poll_mask	= eventfd_poll_mask,
+	.poll		= eventfd_poll,
 	.read		= eventfd_read,
 	.write		= eventfd_write,
 	.llseek		= noop_llseek,
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index ea4436f409fb..67db22fe99c5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -922,18 +922,14 @@ static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head
 	return 0;
 }
 
-static struct wait_queue_head *ep_eventpoll_get_poll_head(struct file *file,
-		__poll_t eventmask)
-{
-	struct eventpoll *ep = file->private_data;
-	return &ep->poll_wait;
-}
-
-static __poll_t ep_eventpoll_poll_mask(struct file *file, __poll_t eventmask)
+static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
 {
 	struct eventpoll *ep = file->private_data;
 	int depth = 0;
 
+	/* Insert inside our poll wait queue */
+	poll_wait(file, &ep->poll_wait, wait);
+
 	/*
 	 * Proceed to find out if wanted events are really available inside
 	 * the ready list.
@@ -972,8 +968,7 @@ static const struct file_operations eventpoll_fops = {
 	.show_fdinfo	= ep_show_fdinfo,
 #endif
 	.release	= ep_eventpoll_release,
-	.get_poll_head	= ep_eventpoll_get_poll_head,
-	.poll_mask	= ep_eventpoll_poll_mask,
+	.poll		= ep_eventpoll_poll,
 	.llseek		= noop_llseek,
 };
 
diff --git a/fs/pipe.c b/fs/pipe.c
index bb0840e234f3..39d6f431da83 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -509,22 +509,19 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	}
 }
 
-static struct wait_queue_head *
-pipe_get_poll_head(struct file *filp, __poll_t events)
-{
-	struct pipe_inode_info *pipe = filp->private_data;
-
-	return &pipe->wait;
-}
-
 /* No kernel lock held - fine */
-static __poll_t pipe_poll_mask(struct file *filp, __poll_t events)
+static __poll_t
+pipe_poll(struct file *filp, poll_table *wait)
 {
+	__poll_t mask;
 	struct pipe_inode_info *pipe = filp->private_data;
-	int nrbufs = pipe->nrbufs;
-	__poll_t mask = 0;
+	int nrbufs;
+
+	poll_wait(filp, &pipe->wait, wait);
 
 	/* Reading only -- no need for acquiring the semaphore.  */
+	nrbufs = pipe->nrbufs;
+	mask = 0;
 	if (filp->f_mode & FMODE_READ) {
 		mask = (nrbufs > 0) ? EPOLLIN | EPOLLRDNORM : 0;
 		if (!pipe->writers && filp->f_version != pipe->w_counter)
@@ -1023,8 +1020,7 @@ const struct file_operations pipefifo_fops = {
 	.llseek		= no_llseek,
 	.read_iter	= pipe_read,
 	.write_iter	= pipe_write,
-	.get_poll_head	= pipe_get_poll_head,
-	.poll_mask	= pipe_poll_mask,
+	.poll		= pipe_poll,
 	.unlocked_ioctl	= pipe_ioctl,
 	.release	= pipe_release,
 	.fasync		= pipe_fasync,
diff --git a/fs/select.c b/fs/select.c
index 317891ff8165..4a6b6e4b21cb 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -34,29 +34,6 @@
 
 #include <linux/uaccess.h>
 
-__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
-{
-	if (file->f_op->poll) {
-		return file->f_op->poll(file, pt);
-	} else if (file_has_poll_mask(file)) {
-		unsigned int events = poll_requested_events(pt);
-		struct wait_queue_head *head;
-
-		if (pt && pt->_qproc) {
-			head = file->f_op->get_poll_head(file, events);
-			if (!head)
-				return DEFAULT_POLLMASK;
-			if (IS_ERR(head))
-				return EPOLLERR;
-			pt->_qproc(file, head, pt);
-		}
-
-		return file->f_op->poll_mask(file, events);
-	} else {
-		return DEFAULT_POLLMASK;
-	}
-}
-EXPORT_SYMBOL_GPL(vfs_poll);
 
 /*
  * Estimate expected accuracy in ns from a timeval.
diff --git a/fs/timerfd.c b/fs/timerfd.c
index d84a2bee4f82..cdad49da3ff7 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -226,20 +226,21 @@ static int timerfd_release(struct inode *inode, struct file *file)
 	kfree_rcu(ctx, rcu);
 	return 0;
 }
-	
-static struct wait_queue_head *timerfd_get_poll_head(struct file *file,
-		__poll_t eventmask)
+
+static __poll_t timerfd_poll(struct file *file, poll_table *wait)
 {
 	struct timerfd_ctx *ctx = file->private_data;
+	__poll_t events = 0;
+	unsigned long flags;
 
-	return &ctx->wqh;
-}
+	poll_wait(file, &ctx->wqh, wait);
 
-static __poll_t timerfd_poll_mask(struct file *file, __poll_t eventmask)
-{
-	struct timerfd_ctx *ctx = file->private_data;
+	spin_lock_irqsave(&ctx->wqh.lock, flags);
+	if (ctx->ticks)
+		events |= EPOLLIN;
+	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
-	return ctx->ticks ? EPOLLIN : 0;
+	return events;
 }
 
 static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
@@ -363,8 +364,7 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg
 
 static const struct file_operations timerfd_fops = {
 	.release	= timerfd_release,
-	.get_poll_head	= timerfd_get_poll_head,
-	.poll_mask	= timerfd_poll_mask,
+	.poll		= timerfd_poll,
 	.read		= timerfd_read,
 	.llseek		= noop_llseek,
 	.show_fdinfo	= timerfd_show,
diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h
index cc414db9da0a..482461d8931d 100644
--- a/include/crypto/if_alg.h
+++ b/include/crypto/if_alg.h
@@ -245,7 +245,8 @@ ssize_t af_alg_sendpage(struct socket *sock, struct page *page,
 			int offset, size_t size, int flags);
 void af_alg_free_resources(struct af_alg_async_req *areq);
 void af_alg_async_cb(struct crypto_async_request *_req, int err);
-__poll_t af_alg_poll_mask(struct socket *sock, __poll_t events);
+__poll_t af_alg_poll(struct file *file, struct socket *sock,
+			 poll_table *wait);
 struct af_alg_async_req *af_alg_alloc_areq(struct sock *sk,
 					   unsigned int areqlen);
 int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5c91108846db..d78d146a98da 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1720,8 +1720,6 @@ struct file_operations {
 	int (*iterate) (struct file *, struct dir_context *);
 	int (*iterate_shared) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
-	struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t);
-	__poll_t (*poll_mask) (struct file *, __poll_t);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
diff --git a/include/linux/net.h b/include/linux/net.h
index 08b6eb964dd6..6554d3ba4396 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -147,7 +147,6 @@ struct proto_ops {
 	int		(*getname)   (struct socket *sock,
 				      struct sockaddr *addr,
 				      int peer);
-	__poll_t	(*poll_mask) (struct socket *sock, __poll_t events);
 	__poll_t	(*poll)	     (struct file *file, struct socket *sock,
 				      struct poll_table_struct *wait);
 	int		(*ioctl)     (struct socket *sock, unsigned int cmd,
diff --git a/include/linux/poll.h b/include/linux/poll.h
index fdf86b4cbc71..7e0fdcf905d2 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -74,18 +74,18 @@ static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
 	pt->_key   = ~(__poll_t)0; /* all events enabled */
 }
 
-static inline bool file_has_poll_mask(struct file *file)
+static inline bool file_can_poll(struct file *file)
 {
-	return file->f_op->get_poll_head && file->f_op->poll_mask;
+	return file->f_op->poll;
 }
 
-static inline bool file_can_poll(struct file *file)
+static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
 {
-	return file->f_op->poll || file_has_poll_mask(file);
+	if (unlikely(!file->f_op->poll))
+		return DEFAULT_POLLMASK;
+	return file->f_op->poll(file, pt);
 }
 
-__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt);
-
 struct poll_table_entry {
 	struct file *filp;
 	__poll_t key;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c86885954994..164cdedf6012 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3252,7 +3252,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
 				    int *peeked, int *off, int *err);
 struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock,
 				  int *err);
-__poll_t datagram_poll_mask(struct socket *sock, __poll_t events);
+__poll_t datagram_poll(struct file *file, struct socket *sock,
+			   struct poll_table_struct *wait);
 int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
 			   struct iov_iter *to, int size);
 static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset,
diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index 53ce8176c313..ec9d6bc65855 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -271,7 +271,7 @@ int  bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 		     int flags);
 int  bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg,
 			    size_t len, int flags);
-__poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events);
+__poll_t bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait);
 int  bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 int  bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo);
 int  bt_sock_wait_ready(struct sock *sk, unsigned long flags);
diff --git a/include/net/iucv/af_iucv.h b/include/net/iucv/af_iucv.h
index b0eaeb02d46d..f4c21b5a1242 100644
--- a/include/net/iucv/af_iucv.h
+++ b/include/net/iucv/af_iucv.h
@@ -153,6 +153,8 @@ struct iucv_sock_list {
 	atomic_t	  autobind_name;
 };
 
+__poll_t iucv_sock_poll(struct file *file, struct socket *sock,
+			    poll_table *wait);
 void iucv_sock_link(struct iucv_sock_list *l, struct sock *s);
 void iucv_sock_unlink(struct iucv_sock_list *l, struct sock *s);
 void iucv_accept_enqueue(struct sock *parent, struct sock *sk);
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 30b3e2fe240a..8c2caa370e0f 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -109,7 +109,8 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb);
 int sctp_inet_listen(struct socket *sock, int backlog);
 void sctp_write_space(struct sock *sk);
 void sctp_data_ready(struct sock *sk);
-__poll_t sctp_poll_mask(struct socket *sock, __poll_t events);
+__poll_t sctp_poll(struct file *file, struct socket *sock,
+		poll_table *wait);
 void sctp_sock_rfree(struct sk_buff *skb);
 void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 		    struct sctp_association *asoc);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0448e7c5d2b4..800582b5dd54 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -388,7 +388,8 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
 void tcp_close(struct sock *sk, long timeout);
 void tcp_init_sock(struct sock *sk);
 void tcp_init_transfer(struct sock *sk, int bpf_op);
-__poll_t tcp_poll_mask(struct socket *sock, __poll_t events);
+__poll_t tcp_poll(struct file *file, struct socket *sock,
+		      struct poll_table_struct *wait);
 int tcp_getsockopt(struct sock *sk, int level, int optname,
 		   char __user *optval, int __user *optlen);
 int tcp_setsockopt(struct sock *sk, int level, int optname,
diff --git a/include/net/tls.h b/include/net/tls.h
index 7f84ea3e217c..70c273777fe9 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -109,7 +109,8 @@ struct tls_sw_context_rx {
 
 	struct strparser strp;
 	void (*saved_data_ready)(struct sock *sk);
-	__poll_t (*sk_poll_mask)(struct socket *sock, __poll_t events);
+	unsigned int (*sk_poll)(struct file *file, struct socket *sock,
+				struct poll_table_struct *wait);
 	struct sk_buff *recv_pkt;
 	u8 control;
 	bool decrypted;
@@ -224,7 +225,8 @@ void tls_sw_free_resources_tx(struct sock *sk);
 void tls_sw_free_resources_rx(struct sock *sk);
 int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		   int nonblock, int flags, int *addr_len);
-__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events);
+unsigned int tls_sw_poll(struct file *file, struct socket *sock,
+			 struct poll_table_struct *wait);
 ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
 			   struct pipe_inode_info *pipe,
 			   size_t len, unsigned int flags);
diff --git a/include/net/udp.h b/include/net/udp.h
index b1ea8b0f5e6a..81afdacd4fff 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -285,7 +285,7 @@ int udp_init_sock(struct sock *sk);
 int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
 int __udp_disconnect(struct sock *sk, int flags);
 int udp_disconnect(struct sock *sk, int flags);
-__poll_t udp_poll_mask(struct socket *sock, __poll_t events);
+__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait);
 struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
 				       netdev_features_t features,
 				       bool is_ipv6);
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index d00221345c19..d4e768d55d14 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -39,8 +39,10 @@ enum {
 	IOCB_CMD_PWRITE = 1,
 	IOCB_CMD_FSYNC = 2,
 	IOCB_CMD_FDSYNC = 3,
-	/* 4 was the experimental IOCB_CMD_PREADX */
-	IOCB_CMD_POLL = 5,
+	/* These two are experimental.
+	 * IOCB_CMD_PREADX = 4,
+	 * IOCB_CMD_POLL = 5,
+	 */
 	IOCB_CMD_NOOP = 6,
 	IOCB_CMD_PREADV = 7,
 	IOCB_CMD_PWRITEV = 8,
@@ -109,7 +111,7 @@ struct iocb {
 #undef IFLITTLE
 
 struct __aio_sigset {
-	const sigset_t __user	*sigmask;
+	sigset_t __user	*sigmask;
 	size_t		sigsetsize;
 };
 
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 55fdba05d7d9..9b6bc5abe946 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1869,7 +1869,7 @@ static const struct proto_ops atalk_dgram_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= sock_no_accept,
 	.getname	= atalk_getname,
-	.poll_mask	= datagram_poll_mask,
+	.poll		= datagram_poll,
 	.ioctl		= atalk_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= atalk_compat_ioctl,
diff --git a/net/atm/common.c b/net/atm/common.c
index ff5748b2190f..a7a68e509628 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -647,11 +647,16 @@ out:
 	return error;
 }
 
-__poll_t vcc_poll_mask(struct socket *sock, __poll_t events)
+__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
 {
 	struct sock *sk = sock->sk;
-	struct atm_vcc *vcc = ATM_SD(sock);
-	__poll_t mask = 0;
+	struct atm_vcc *vcc;
+	__poll_t mask;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+	mask = 0;
+
+	vcc = ATM_SD(sock);
 
 	/* exceptional events */
 	if (sk->sk_err)
diff --git a/net/atm/common.h b/net/atm/common.h
index 526796ad230f..5850649068bb 100644
--- a/net/atm/common.h
+++ b/net/atm/common.h
@@ -17,7 +17,7 @@ int vcc_connect(struct socket *sock, int itf, short vpi, int vci);
 int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 		int flags);
 int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len);
-__poll_t vcc_poll_mask(struct socket *sock, __poll_t events);
+__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait);
 int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 int vcc_setsockopt(struct socket *sock, int level, int optname,
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
index 9f75092fe778..2cb10af16afc 100644
--- a/net/atm/pvc.c
+++ b/net/atm/pvc.c
@@ -113,7 +113,7 @@ static const struct proto_ops pvc_proto_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	pvc_getname,
-	.poll_mask =	vcc_poll_mask,
+	.poll =		vcc_poll,
 	.ioctl =	vcc_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = vcc_compat_ioctl,
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 53f4ad7087b1..2f91b766ac42 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -636,7 +636,7 @@ static const struct proto_ops svc_proto_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	svc_accept,
 	.getname =	svc_getname,
-	.poll_mask =	vcc_poll_mask,
+	.poll =		vcc_poll,
 	.ioctl =	svc_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl =	svc_compat_ioctl,
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index d1d2442ce573..c603d33d5410 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1941,7 +1941,7 @@ static const struct proto_ops ax25_proto_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= ax25_accept,
 	.getname	= ax25_getname,
-	.poll_mask	= datagram_poll_mask,
+	.poll		= datagram_poll,
 	.ioctl		= ax25_ioctl,
 	.listen		= ax25_listen,
 	.shutdown	= ax25_shutdown,
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 510ab4f55df5..3264e1873219 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -437,13 +437,16 @@ static inline __poll_t bt_accept_poll(struct sock *parent)
 	return 0;
 }
 
-__poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events)
+__poll_t bt_sock_poll(struct file *file, struct socket *sock,
+			  poll_table *wait)
 {
 	struct sock *sk = sock->sk;
 	__poll_t mask = 0;
 
 	BT_DBG("sock %p, sk %p", sock, sk);
 
+	poll_wait(file, sk_sleep(sk), wait);
+
 	if (sk->sk_state == BT_LISTEN)
 		return bt_accept_poll(sk);
 
@@ -475,7 +478,7 @@ __poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events)
 
 	return mask;
 }
-EXPORT_SYMBOL(bt_sock_poll_mask);
+EXPORT_SYMBOL(bt_sock_poll);
 
 int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 {
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index d6c099861538..1506e1632394 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -1975,7 +1975,7 @@ static const struct proto_ops hci_sock_ops = {
 	.sendmsg	= hci_sock_sendmsg,
 	.recvmsg	= hci_sock_recvmsg,
 	.ioctl		= hci_sock_ioctl,
-	.poll_mask	= datagram_poll_mask,
+	.poll		= datagram_poll,
 	.listen		= sock_no_listen,
 	.shutdown	= sock_no_shutdown,
 	.setsockopt	= hci_sock_setsockopt,
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 742a190034e6..686bdc6b35b0 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1653,7 +1653,7 @@ static const struct proto_ops l2cap_sock_ops = {
 	.getname	= l2cap_sock_getname,
 	.sendmsg	= l2cap_sock_sendmsg,
 	.recvmsg	= l2cap_sock_recvmsg,
-	.poll_mask	= bt_sock_poll_mask,
+	.poll		= bt_sock_poll,
 	.ioctl		= bt_sock_ioctl,
 	.mmap		= sock_no_mmap,
 	.socketpair	= sock_no_socketpair,
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 1cf57622473a..d606e9212291 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -1049,7 +1049,7 @@ static const struct proto_ops rfcomm_sock_ops = {
 	.setsockopt	= rfcomm_sock_setsockopt,
 	.getsockopt	= rfcomm_sock_getsockopt,
 	.ioctl		= rfcomm_sock_ioctl,
-	.poll_mask	= bt_sock_poll_mask,
+	.poll		= bt_sock_poll,
 	.socketpair	= sock_no_socketpair,
 	.mmap		= sock_no_mmap
 };
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index d60dbc61d170..413b8ee49fec 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -1197,7 +1197,7 @@ static const struct proto_ops sco_sock_ops = {
 	.getname	= sco_sock_getname,
 	.sendmsg	= sco_sock_sendmsg,
 	.recvmsg	= sco_sock_recvmsg,
-	.poll_mask	= bt_sock_poll_mask,
+	.poll		= bt_sock_poll,
 	.ioctl		= bt_sock_ioctl,
 	.mmap		= sock_no_mmap,
 	.socketpair	= sock_no_socketpair,
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index c7991867d622..a6fb1b3bcad9 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -934,11 +934,15 @@ static int caif_release(struct socket *sock)
 }
 
 /* Copied from af_unix.c:unix_poll(), added CAIF tx_flow handling */
-static __poll_t caif_poll_mask(struct socket *sock, __poll_t events)
+static __poll_t caif_poll(struct file *file,
+			      struct socket *sock, poll_table *wait)
 {
 	struct sock *sk = sock->sk;
+	__poll_t mask;
 	struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
-	__poll_t mask = 0;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+	mask = 0;
 
 	/* exceptional events? */
 	if (sk->sk_err)
@@ -972,7 +976,7 @@ static const struct proto_ops caif_seqpacket_ops = {
 	.socketpair = sock_no_socketpair,
 	.accept = sock_no_accept,
 	.getname = sock_no_getname,
-	.poll_mask = caif_poll_mask,
+	.poll = caif_poll,
 	.ioctl = sock_no_ioctl,
 	.listen = sock_no_listen,
 	.shutdown = sock_no_shutdown,
@@ -993,7 +997,7 @@ static const struct proto_ops caif_stream_ops = {
 	.socketpair = sock_no_socketpair,
 	.accept = sock_no_accept,
 	.getname = sock_no_getname,
-	.poll_mask = caif_poll_mask,
+	.poll = caif_poll,
 	.ioctl = sock_no_ioctl,
 	.listen = sock_no_listen,
 	.shutdown = sock_no_shutdown,
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 9393f25df08d..0af8f0db892a 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -1660,7 +1660,7 @@ static const struct proto_ops bcm_ops = {
 	.socketpair    = sock_no_socketpair,
 	.accept        = sock_no_accept,
 	.getname       = sock_no_getname,
-	.poll_mask     = datagram_poll_mask,
+	.poll          = datagram_poll,
 	.ioctl         = can_ioctl,	/* use can_ioctl() from af_can.c */
 	.listen        = sock_no_listen,
 	.shutdown      = sock_no_shutdown,
diff --git a/net/can/raw.c b/net/can/raw.c
index fd7e2f49ea6a..1051eee82581 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -843,7 +843,7 @@ static const struct proto_ops raw_ops = {
 	.socketpair    = sock_no_socketpair,
 	.accept        = sock_no_accept,
 	.getname       = raw_getname,
-	.poll_mask     = datagram_poll_mask,
+	.poll          = datagram_poll,
 	.ioctl         = can_ioctl,	/* use can_ioctl() from af_can.c */
 	.listen        = sock_no_listen,
 	.shutdown      = sock_no_shutdown,
diff --git a/net/core/datagram.c b/net/core/datagram.c
index f19bf3dc2bd6..9938952c5c78 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -819,8 +819,9 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);
 
 /**
  * 	datagram_poll - generic datagram poll
+ *	@file: file struct
  *	@sock: socket
- *	@events to wait for
+ *	@wait: poll table
  *
  *	Datagram poll: Again totally generic. This also handles
  *	sequenced packet sockets providing the socket receive queue
@@ -830,10 +831,14 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);
  *	and you use a different write policy from sock_writeable()
  *	then please supply your own write_space callback.
  */
-__poll_t datagram_poll_mask(struct socket *sock, __poll_t events)
+__poll_t datagram_poll(struct file *file, struct socket *sock,
+			   poll_table *wait)
 {
 	struct sock *sk = sock->sk;
-	__poll_t mask = 0;
+	__poll_t mask;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+	mask = 0;
 
 	/* exceptional events? */
 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
@@ -866,4 +871,4 @@ __poll_t datagram_poll_mask(struct socket *sock, __poll_t events)
 
 	return mask;
 }
-EXPORT_SYMBOL(datagram_poll_mask);
+EXPORT_SYMBOL(datagram_poll);
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 0ea2ee56ac1b..f91e3816806b 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -316,7 +316,8 @@ int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 		 int flags, int *addr_len);
 void dccp_shutdown(struct sock *sk, int how);
 int inet_dccp_listen(struct socket *sock, int backlog);
-__poll_t dccp_poll_mask(struct socket *sock, __poll_t events);
+__poll_t dccp_poll(struct file *file, struct socket *sock,
+		       poll_table *wait);
 int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
 void dccp_req_err(struct sock *sk, u64 seq);
 
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index a9e478cd3787..b08feb219b44 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -984,7 +984,7 @@ static const struct proto_ops inet_dccp_ops = {
 	.accept		   = inet_accept,
 	.getname	   = inet_getname,
 	/* FIXME: work on tcp_poll to rename it to inet_csk_poll */
-	.poll_mask	   = dccp_poll_mask,
+	.poll		   = dccp_poll,
 	.ioctl		   = inet_ioctl,
 	/* FIXME: work on inet_listen to rename it to sock_common_listen */
 	.listen		   = inet_dccp_listen,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 17fc4e0166ba..6344f1b18a6a 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1070,7 +1070,7 @@ static const struct proto_ops inet6_dccp_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = inet_accept,
 	.getname	   = inet6_getname,
-	.poll_mask	   = dccp_poll_mask,
+	.poll		   = dccp_poll,
 	.ioctl		   = inet6_ioctl,
 	.listen		   = inet_dccp_listen,
 	.shutdown	   = inet_shutdown,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index ca21c1c76da0..0d56e36a6db7 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -312,11 +312,20 @@ int dccp_disconnect(struct sock *sk, int flags)
 
 EXPORT_SYMBOL_GPL(dccp_disconnect);
 
-__poll_t dccp_poll_mask(struct socket *sock, __poll_t events)
+/*
+ *	Wait for a DCCP event.
+ *
+ *	Note that we don't need to lock the socket, as the upper poll layers
+ *	take care of normal races (between the test and the event) and we don't
+ *	go look at any of the socket buffers directly.
+ */
+__poll_t dccp_poll(struct file *file, struct socket *sock,
+		       poll_table *wait)
 {
 	__poll_t mask;
 	struct sock *sk = sock->sk;
 
+	sock_poll_wait(file, sk_sleep(sk), wait);
 	if (sk->sk_state == DCCP_LISTEN)
 		return inet_csk_listen_poll(sk);
 
@@ -358,7 +367,7 @@ __poll_t dccp_poll_mask(struct socket *sock, __poll_t events)
 	return mask;
 }
 
-EXPORT_SYMBOL_GPL(dccp_poll_mask);
+EXPORT_SYMBOL_GPL(dccp_poll);
 
 int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 {
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 9a686d890bfa..7d6ff983ba2c 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -1207,11 +1207,11 @@ static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int peer)
 }
 
 
-static __poll_t dn_poll_mask(struct socket *sock, __poll_t events)
+static __poll_t dn_poll(struct file *file, struct socket *sock, poll_table  *wait)
 {
 	struct sock *sk = sock->sk;
 	struct dn_scp *scp = DN_SK(sk);
-	__poll_t mask = datagram_poll_mask(sock, events);
+	__poll_t mask = datagram_poll(file, sock, wait);
 
 	if (!skb_queue_empty(&scp->other_receive_queue))
 		mask |= EPOLLRDBAND;
@@ -2331,7 +2331,7 @@ static const struct proto_ops dn_proto_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	dn_accept,
 	.getname =	dn_getname,
-	.poll_mask =	dn_poll_mask,
+	.poll =		dn_poll,
 	.ioctl =	dn_ioctl,
 	.listen =	dn_listen,
 	.shutdown =	dn_shutdown,
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index a0768d2759b8..a60658c85a9a 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -423,7 +423,7 @@ static const struct proto_ops ieee802154_raw_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = sock_no_accept,
 	.getname	   = sock_no_getname,
-	.poll_mask	   = datagram_poll_mask,
+	.poll		   = datagram_poll,
 	.ioctl		   = ieee802154_sock_ioctl,
 	.listen		   = sock_no_listen,
 	.shutdown	   = sock_no_shutdown,
@@ -969,7 +969,7 @@ static const struct proto_ops ieee802154_dgram_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = sock_no_accept,
 	.getname	   = sock_no_getname,
-	.poll_mask	   = datagram_poll_mask,
+	.poll		   = datagram_poll,
 	.ioctl		   = ieee802154_sock_ioctl,
 	.listen		   = sock_no_listen,
 	.shutdown	   = sock_no_shutdown,
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 15e125558c76..b403499fdabe 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -986,7 +986,7 @@ const struct proto_ops inet_stream_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = inet_accept,
 	.getname	   = inet_getname,
-	.poll_mask	   = tcp_poll_mask,
+	.poll		   = tcp_poll,
 	.ioctl		   = inet_ioctl,
 	.listen		   = inet_listen,
 	.shutdown	   = inet_shutdown,
@@ -1021,7 +1021,7 @@ const struct proto_ops inet_dgram_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = sock_no_accept,
 	.getname	   = inet_getname,
-	.poll_mask	   = udp_poll_mask,
+	.poll		   = udp_poll,
 	.ioctl		   = inet_ioctl,
 	.listen		   = sock_no_listen,
 	.shutdown	   = inet_shutdown,
@@ -1042,7 +1042,7 @@ EXPORT_SYMBOL(inet_dgram_ops);
 
 /*
  * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
- * udp_poll_mask
+ * udp_poll
  */
 static const struct proto_ops inet_sockraw_ops = {
 	.family		   = PF_INET,
@@ -1053,7 +1053,7 @@ static const struct proto_ops inet_sockraw_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = sock_no_accept,
 	.getname	   = inet_getname,
-	.poll_mask	   = datagram_poll_mask,
+	.poll		   = datagram_poll,
 	.ioctl		   = inet_ioctl,
 	.listen		   = sock_no_listen,
 	.shutdown	   = inet_shutdown,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 141acd92e58a..e7b53d2a971f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -494,21 +494,32 @@ static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
 }
 
 /*
- * Socket is not locked. We are protected from async events by poll logic and
- * correct handling of state changes made by other threads is impossible in
- * any case.
+ *	Wait for a TCP event.
+ *
+ *	Note that we don't need to lock the socket, as the upper poll layers
+ *	take care of normal races (between the test and the event) and we don't
+ *	go look at any of the socket buffers directly.
  */
-__poll_t tcp_poll_mask(struct socket *sock, __poll_t events)
+__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 {
+	__poll_t mask;
 	struct sock *sk = sock->sk;
 	const struct tcp_sock *tp = tcp_sk(sk);
-	__poll_t mask = 0;
 	int state;
 
+	sock_poll_wait(file, sk_sleep(sk), wait);
+
 	state = inet_sk_state_load(sk);
 	if (state == TCP_LISTEN)
 		return inet_csk_listen_poll(sk);
 
+	/* Socket is not locked. We are protected from async events
+	 * by poll logic and correct handling of state changes
+	 * made by other threads is impossible in any case.
+	 */
+
+	mask = 0;
+
 	/*
 	 * EPOLLHUP is certainly not done right. But poll() doesn't
 	 * have a notion of HUP in just one direction, and for a
@@ -589,7 +600,7 @@ __poll_t tcp_poll_mask(struct socket *sock, __poll_t events)
 
 	return mask;
 }
-EXPORT_SYMBOL(tcp_poll_mask);
+EXPORT_SYMBOL(tcp_poll);
 
 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 9bb27df4dac5..24e116ddae79 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2591,7 +2591,7 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname,
  * 	udp_poll - wait for a UDP event.
  *	@file - file struct
  *	@sock - socket
- *	@events - events to wait for
+ *	@wait - poll table
  *
  *	This is same as datagram poll, except for the special case of
  *	blocking sockets. If application is using a blocking fd
@@ -2600,23 +2600,23 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname,
  *	but then block when reading it. Add special case code
  *	to work around these arguably broken applications.
  */
-__poll_t udp_poll_mask(struct socket *sock, __poll_t events)
+__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
 {
-	__poll_t mask = datagram_poll_mask(sock, events);
+	__poll_t mask = datagram_poll(file, sock, wait);
 	struct sock *sk = sock->sk;
 
 	if (!skb_queue_empty(&udp_sk(sk)->reader_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* Check for false positives due to checksum errors */
-	if ((mask & EPOLLRDNORM) && !(sock->file->f_flags & O_NONBLOCK) &&
+	if ((mask & EPOLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
 	    !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
 		mask &= ~(EPOLLIN | EPOLLRDNORM);
 
 	return mask;
 
 }
-EXPORT_SYMBOL(udp_poll_mask);
+EXPORT_SYMBOL(udp_poll);
 
 int udp_abort(struct sock *sk, int err)
 {
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 74f2a261e8df..9ed0eae91758 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -570,7 +570,7 @@ const struct proto_ops inet6_stream_ops = {
 	.socketpair	   = sock_no_socketpair,	/* a do nothing	*/
 	.accept		   = inet_accept,		/* ok		*/
 	.getname	   = inet6_getname,
-	.poll_mask	   = tcp_poll_mask,		/* ok		*/
+	.poll		   = tcp_poll,			/* ok		*/
 	.ioctl		   = inet6_ioctl,		/* must change  */
 	.listen		   = inet_listen,		/* ok		*/
 	.shutdown	   = inet_shutdown,		/* ok		*/
@@ -603,7 +603,7 @@ const struct proto_ops inet6_dgram_ops = {
 	.socketpair	   = sock_no_socketpair,	/* a do nothing	*/
 	.accept		   = sock_no_accept,		/* a do nothing	*/
 	.getname	   = inet6_getname,
-	.poll_mask	   = udp_poll_mask,		/* ok		*/
+	.poll		   = udp_poll,			/* ok		*/
 	.ioctl		   = inet6_ioctl,		/* must change  */
 	.listen		   = sock_no_listen,		/* ok		*/
 	.shutdown	   = inet_shutdown,		/* ok		*/
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index ce6f0d15b5dd..afc307c89d1a 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -1334,7 +1334,7 @@ void raw6_proc_exit(void)
 }
 #endif	/* CONFIG_PROC_FS */
 
-/* Same as inet6_dgram_ops, sans udp_poll_mask.  */
+/* Same as inet6_dgram_ops, sans udp_poll.  */
 const struct proto_ops inet6_sockraw_ops = {
 	.family		   = PF_INET6,
 	.owner		   = THIS_MODULE,
@@ -1344,7 +1344,7 @@ const struct proto_ops inet6_sockraw_ops = {
 	.socketpair	   = sock_no_socketpair,	/* a do nothing	*/
 	.accept		   = sock_no_accept,		/* a do nothing	*/
 	.getname	   = inet6_getname,
-	.poll_mask	   = datagram_poll_mask,	/* ok		*/
+	.poll		   = datagram_poll,		/* ok		*/
 	.ioctl		   = inet6_ioctl,		/* must change  */
 	.listen		   = sock_no_listen,		/* ok		*/
 	.shutdown	   = inet_shutdown,		/* ok		*/
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 68e86257a549..893a022f9620 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1488,11 +1488,14 @@ static inline __poll_t iucv_accept_poll(struct sock *parent)
 	return 0;
 }
 
-static __poll_t iucv_sock_poll_mask(struct socket *sock, __poll_t events)
+__poll_t iucv_sock_poll(struct file *file, struct socket *sock,
+			    poll_table *wait)
 {
 	struct sock *sk = sock->sk;
 	__poll_t mask = 0;
 
+	sock_poll_wait(file, sk_sleep(sk), wait);
+
 	if (sk->sk_state == IUCV_LISTEN)
 		return iucv_accept_poll(sk);
 
@@ -2385,7 +2388,7 @@ static const struct proto_ops iucv_sock_ops = {
 	.getname	= iucv_sock_getname,
 	.sendmsg	= iucv_sock_sendmsg,
 	.recvmsg	= iucv_sock_recvmsg,
-	.poll_mask	= iucv_sock_poll_mask,
+	.poll		= iucv_sock_poll,
 	.ioctl		= sock_no_ioctl,
 	.mmap		= sock_no_mmap,
 	.socketpair	= sock_no_socketpair,
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 84b7d5c6fec8..d3601d421571 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1336,9 +1336,9 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
 	struct list_head *head;
 	int index = 0;
 
-	/* For SOCK_SEQPACKET sock type, datagram_poll_mask checks the sk_state,
-	 * so  we set sk_state, otherwise epoll_wait always returns right away
-	 * with EPOLLHUP
+	/* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so
+	 * we set sk_state, otherwise epoll_wait always returns right away with
+	 * EPOLLHUP
 	 */
 	kcm->sk.sk_state = TCP_ESTABLISHED;
 
@@ -1903,7 +1903,7 @@ static const struct proto_ops kcm_dgram_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	sock_no_getname,
-	.poll_mask =	datagram_poll_mask,
+	.poll =		datagram_poll,
 	.ioctl =	kcm_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
@@ -1924,7 +1924,7 @@ static const struct proto_ops kcm_seqpacket_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	sock_no_getname,
-	.poll_mask =	datagram_poll_mask,
+	.poll =		datagram_poll,
 	.ioctl =	kcm_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 8bdc1cbe490a..5e1d2946ffbf 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -3751,7 +3751,7 @@ static const struct proto_ops pfkey_ops = {
 
 	/* Now the operations that really occur. */
 	.release	=	pfkey_release,
-	.poll_mask	=	datagram_poll_mask,
+	.poll		=	datagram_poll,
 	.sendmsg	=	pfkey_sendmsg,
 	.recvmsg	=	pfkey_recvmsg,
 };
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 181073bf6925..a9c05b2bc1b0 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -613,7 +613,7 @@ static const struct proto_ops l2tp_ip_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = sock_no_accept,
 	.getname	   = l2tp_ip_getname,
-	.poll_mask	   = datagram_poll_mask,
+	.poll		   = datagram_poll,
 	.ioctl		   = inet_ioctl,
 	.listen		   = sock_no_listen,
 	.shutdown	   = inet_shutdown,
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 336e4c00abbc..957369192ca1 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -754,7 +754,7 @@ static const struct proto_ops l2tp_ip6_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = sock_no_accept,
 	.getname	   = l2tp_ip6_getname,
-	.poll_mask	   = datagram_poll_mask,
+	.poll		   = datagram_poll,
 	.ioctl		   = inet6_ioctl,
 	.listen		   = sock_no_listen,
 	.shutdown	   = inet_shutdown,
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 55188382845c..e398797878a9 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -1818,7 +1818,7 @@ static const struct proto_ops pppol2tp_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= sock_no_accept,
 	.getname	= pppol2tp_getname,
-	.poll_mask	= datagram_poll_mask,
+	.poll		= datagram_poll,
 	.listen		= sock_no_listen,
 	.shutdown	= sock_no_shutdown,
 	.setsockopt	= pppol2tp_setsockopt,
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 804de8490186..1beeea9549fa 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -1192,7 +1192,7 @@ static const struct proto_ops llc_ui_ops = {
 	.socketpair  = sock_no_socketpair,
 	.accept      = llc_ui_accept,
 	.getname     = llc_ui_getname,
-	.poll_mask   = datagram_poll_mask,
+	.poll	     = datagram_poll,
 	.ioctl       = llc_ui_ioctl,
 	.listen      = llc_ui_listen,
 	.shutdown    = llc_ui_shutdown,
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 1189b84413d5..393573a99a5a 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2658,7 +2658,7 @@ static const struct proto_ops netlink_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	netlink_getname,
-	.poll_mask =	datagram_poll_mask,
+	.poll =		datagram_poll,
 	.ioctl =	netlink_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 93fbcafbf388..03f37c4e64fe 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -1355,7 +1355,7 @@ static const struct proto_ops nr_proto_ops = {
 	.socketpair	=	sock_no_socketpair,
 	.accept		=	nr_accept,
 	.getname	=	nr_getname,
-	.poll_mask	=	datagram_poll_mask,
+	.poll		=	datagram_poll,
 	.ioctl		=	nr_ioctl,
 	.listen		=	nr_listen,
 	.shutdown	=	sock_no_shutdown,
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index ab5bb14b49af..ea0c0c6f1874 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -548,13 +548,16 @@ static inline __poll_t llcp_accept_poll(struct sock *parent)
 	return 0;
 }
 
-static __poll_t llcp_sock_poll_mask(struct socket *sock, __poll_t events)
+static __poll_t llcp_sock_poll(struct file *file, struct socket *sock,
+				   poll_table *wait)
 {
 	struct sock *sk = sock->sk;
 	__poll_t mask = 0;
 
 	pr_debug("%p\n", sk);
 
+	sock_poll_wait(file, sk_sleep(sk), wait);
+
 	if (sk->sk_state == LLCP_LISTEN)
 		return llcp_accept_poll(sk);
 
@@ -896,7 +899,7 @@ static const struct proto_ops llcp_sock_ops = {
 	.socketpair     = sock_no_socketpair,
 	.accept         = llcp_sock_accept,
 	.getname        = llcp_sock_getname,
-	.poll_mask      = llcp_sock_poll_mask,
+	.poll           = llcp_sock_poll,
 	.ioctl          = sock_no_ioctl,
 	.listen         = llcp_sock_listen,
 	.shutdown       = sock_no_shutdown,
@@ -916,7 +919,7 @@ static const struct proto_ops llcp_rawsock_ops = {
 	.socketpair     = sock_no_socketpair,
 	.accept         = sock_no_accept,
 	.getname        = llcp_sock_getname,
-	.poll_mask      = llcp_sock_poll_mask,
+	.poll           = llcp_sock_poll,
 	.ioctl          = sock_no_ioctl,
 	.listen         = sock_no_listen,
 	.shutdown       = sock_no_shutdown,
diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c
index 60c322531c49..e2188deb08dc 100644
--- a/net/nfc/rawsock.c
+++ b/net/nfc/rawsock.c
@@ -284,7 +284,7 @@ static const struct proto_ops rawsock_ops = {
 	.socketpair     = sock_no_socketpair,
 	.accept         = sock_no_accept,
 	.getname        = sock_no_getname,
-	.poll_mask      = datagram_poll_mask,
+	.poll           = datagram_poll,
 	.ioctl          = sock_no_ioctl,
 	.listen         = sock_no_listen,
 	.shutdown       = sock_no_shutdown,
@@ -304,7 +304,7 @@ static const struct proto_ops rawsock_raw_ops = {
 	.socketpair     = sock_no_socketpair,
 	.accept         = sock_no_accept,
 	.getname        = sock_no_getname,
-	.poll_mask      = datagram_poll_mask,
+	.poll           = datagram_poll,
 	.ioctl          = sock_no_ioctl,
 	.listen         = sock_no_listen,
 	.shutdown       = sock_no_shutdown,
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index ff8e7e245c37..57634bc3da74 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -4076,11 +4076,12 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd,
 	return 0;
 }
 
-static __poll_t packet_poll_mask(struct socket *sock, __poll_t events)
+static __poll_t packet_poll(struct file *file, struct socket *sock,
+				poll_table *wait)
 {
 	struct sock *sk = sock->sk;
 	struct packet_sock *po = pkt_sk(sk);
-	__poll_t mask = datagram_poll_mask(sock, events);
+	__poll_t mask = datagram_poll(file, sock, wait);
 
 	spin_lock_bh(&sk->sk_receive_queue.lock);
 	if (po->rx_ring.pg_vec) {
@@ -4422,7 +4423,7 @@ static const struct proto_ops packet_ops_spkt = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	packet_getname_spkt,
-	.poll_mask =	datagram_poll_mask,
+	.poll =		datagram_poll,
 	.ioctl =	packet_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
@@ -4443,7 +4444,7 @@ static const struct proto_ops packet_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	packet_getname,
-	.poll_mask =	packet_poll_mask,
+	.poll =		packet_poll,
 	.ioctl =	packet_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index c295c4e20f01..30187990257f 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -340,12 +340,15 @@ static int pn_socket_getname(struct socket *sock, struct sockaddr *addr,
 	return sizeof(struct sockaddr_pn);
 }
 
-static __poll_t pn_socket_poll_mask(struct socket *sock, __poll_t events)
+static __poll_t pn_socket_poll(struct file *file, struct socket *sock,
+					poll_table *wait)
 {
 	struct sock *sk = sock->sk;
 	struct pep_sock *pn = pep_sk(sk);
 	__poll_t mask = 0;
 
+	poll_wait(file, sk_sleep(sk), wait);
+
 	if (sk->sk_state == TCP_CLOSE)
 		return EPOLLERR;
 	if (!skb_queue_empty(&sk->sk_receive_queue))
@@ -445,7 +448,7 @@ const struct proto_ops phonet_dgram_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= sock_no_accept,
 	.getname	= pn_socket_getname,
-	.poll_mask	= datagram_poll_mask,
+	.poll		= datagram_poll,
 	.ioctl		= pn_socket_ioctl,
 	.listen		= sock_no_listen,
 	.shutdown	= sock_no_shutdown,
@@ -470,7 +473,7 @@ const struct proto_ops phonet_stream_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= pn_socket_accept,
 	.getname	= pn_socket_getname,
-	.poll_mask	= pn_socket_poll_mask,
+	.poll		= pn_socket_poll,
 	.ioctl		= pn_socket_ioctl,
 	.listen		= pn_socket_listen,
 	.shutdown	= sock_no_shutdown,
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index 1b5025ea5b04..2aa07b547b16 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -1023,7 +1023,7 @@ static const struct proto_ops qrtr_proto_ops = {
 	.recvmsg	= qrtr_recvmsg,
 	.getname	= qrtr_getname,
 	.ioctl		= qrtr_ioctl,
-	.poll_mask	= datagram_poll_mask,
+	.poll		= datagram_poll,
 	.shutdown	= sock_no_shutdown,
 	.setsockopt	= sock_no_setsockopt,
 	.getsockopt	= sock_no_getsockopt,
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index ebe42e7eb456..d00a0ef39a56 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -1470,7 +1470,7 @@ static const struct proto_ops rose_proto_ops = {
 	.socketpair	=	sock_no_socketpair,
 	.accept		=	rose_accept,
 	.getname	=	rose_getname,
-	.poll_mask	=	datagram_poll_mask,
+	.poll		=	datagram_poll,
 	.ioctl		=	rose_ioctl,
 	.listen		=	rose_listen,
 	.shutdown	=	sock_no_shutdown,
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 3b1ac93efee2..2b463047dd7b 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -734,11 +734,15 @@ static int rxrpc_getsockopt(struct socket *sock, int level, int optname,
 /*
  * permit an RxRPC socket to be polled
  */
-static __poll_t rxrpc_poll_mask(struct socket *sock, __poll_t events)
+static __poll_t rxrpc_poll(struct file *file, struct socket *sock,
+			       poll_table *wait)
 {
 	struct sock *sk = sock->sk;
 	struct rxrpc_sock *rx = rxrpc_sk(sk);
-	__poll_t mask = 0;
+	__poll_t mask;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+	mask = 0;
 
 	/* the socket is readable if there are any messages waiting on the Rx
 	 * queue */
@@ -945,7 +949,7 @@ static const struct proto_ops rxrpc_rpc_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= sock_no_accept,
 	.getname	= sock_no_getname,
-	.poll_mask	= rxrpc_poll_mask,
+	.poll		= rxrpc_poll,
 	.ioctl		= sock_no_ioctl,
 	.listen		= rxrpc_listen,
 	.shutdown	= rxrpc_shutdown,
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 7339918a805d..0cd2e764f47f 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -1010,7 +1010,7 @@ static const struct proto_ops inet6_seqpacket_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = inet_accept,
 	.getname	   = sctp_getname,
-	.poll_mask	   = sctp_poll_mask,
+	.poll		   = sctp_poll,
 	.ioctl		   = inet6_ioctl,
 	.listen		   = sctp_inet_listen,
 	.shutdown	   = inet_shutdown,
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 5dffbc493008..67f73d3a1356 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1016,7 +1016,7 @@ static const struct proto_ops inet_seqpacket_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = inet_accept,
 	.getname	   = inet_getname,	/* Semantics are different.  */
-	.poll_mask	   = sctp_poll_mask,
+	.poll		   = sctp_poll,
 	.ioctl		   = inet_ioctl,
 	.listen		   = sctp_inet_listen,
 	.shutdown	   = inet_shutdown,	/* Looks harmless.  */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index d20f7addee19..ce620e878538 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -7717,12 +7717,14 @@ out:
  * here, again, by modeling the current TCP/UDP code.  We don't have
  * a good way to test with it yet.
  */
-__poll_t sctp_poll_mask(struct socket *sock, __poll_t events)
+__poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
 {
 	struct sock *sk = sock->sk;
 	struct sctp_sock *sp = sctp_sk(sk);
 	__poll_t mask;
 
+	poll_wait(file, sk_sleep(sk), wait);
+
 	sock_rps_record_flow(sk);
 
 	/* A TCP-style listening socket becomes readable when the accept queue
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index da7f02edcd37..973b4471b532 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1273,7 +1273,8 @@ static __poll_t smc_accept_poll(struct sock *parent)
 	return mask;
 }
 
-static __poll_t smc_poll_mask(struct socket *sock, __poll_t events)
+static __poll_t smc_poll(struct file *file, struct socket *sock,
+			     poll_table *wait)
 {
 	struct sock *sk = sock->sk;
 	__poll_t mask = 0;
@@ -1289,7 +1290,7 @@ static __poll_t smc_poll_mask(struct socket *sock, __poll_t events)
 	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
 		/* delegate to CLC child sock */
 		release_sock(sk);
-		mask = smc->clcsock->ops->poll_mask(smc->clcsock, events);
+		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
 		lock_sock(sk);
 		sk->sk_err = smc->clcsock->sk->sk_err;
 		if (sk->sk_err) {
@@ -1307,6 +1308,11 @@ static __poll_t smc_poll_mask(struct socket *sock, __poll_t events)
 			}
 		}
 	} else {
+		if (sk->sk_state != SMC_CLOSED) {
+			release_sock(sk);
+			sock_poll_wait(file, sk_sleep(sk), wait);
+			lock_sock(sk);
+		}
 		if (sk->sk_err)
 			mask |= EPOLLERR;
 		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
@@ -1619,7 +1625,7 @@ static const struct proto_ops smc_sock_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= smc_accept,
 	.getname	= smc_getname,
-	.poll_mask	= smc_poll_mask,
+	.poll		= smc_poll,
 	.ioctl		= smc_ioctl,
 	.listen		= smc_listen,
 	.shutdown	= smc_shutdown,
diff --git a/net/socket.c b/net/socket.c
index 8a109012608a..a564c6ed19d5 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -117,10 +117,8 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from);
 static int sock_mmap(struct file *file, struct vm_area_struct *vma);
 
 static int sock_close(struct inode *inode, struct file *file);
-static struct wait_queue_head *sock_get_poll_head(struct file *file,
-		__poll_t events);
-static __poll_t sock_poll_mask(struct file *file, __poll_t);
-static __poll_t sock_poll(struct file *file, struct poll_table_struct *wait);
+static __poll_t sock_poll(struct file *file,
+			      struct poll_table_struct *wait);
 static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 #ifdef CONFIG_COMPAT
 static long compat_sock_ioctl(struct file *file,
@@ -143,8 +141,6 @@ static const struct file_operations socket_file_ops = {
 	.llseek =	no_llseek,
 	.read_iter =	sock_read_iter,
 	.write_iter =	sock_write_iter,
-	.get_poll_head = sock_get_poll_head,
-	.poll_mask =	sock_poll_mask,
 	.poll =		sock_poll,
 	.unlocked_ioctl = sock_ioctl,
 #ifdef CONFIG_COMPAT
@@ -1130,48 +1126,14 @@ out_release:
 }
 EXPORT_SYMBOL(sock_create_lite);
 
-static struct wait_queue_head *sock_get_poll_head(struct file *file,
-		__poll_t events)
-{
-	struct socket *sock = file->private_data;
-
-	if (!sock->ops->poll_mask)
-		return NULL;
-	sock_poll_busy_loop(sock, events);
-	return sk_sleep(sock->sk);
-}
-
-static __poll_t sock_poll_mask(struct file *file, __poll_t events)
-{
-	struct socket *sock = file->private_data;
-
-	/*
-	 * We need to be sure we are in sync with the socket flags modification.
-	 *
-	 * This memory barrier is paired in the wq_has_sleeper.
-	 */
-	smp_mb();
-
-	/* this socket can poll_ll so tell the system call */
-	return sock->ops->poll_mask(sock, events) |
-		(sk_can_busy_loop(sock->sk) ? POLL_BUSY_LOOP : 0);
-}
-
 /* No kernel lock held - perfect */
 static __poll_t sock_poll(struct file *file, poll_table *wait)
 {
 	struct socket *sock = file->private_data;
-	__poll_t events = poll_requested_events(wait), mask = 0;
-
-	if (sock->ops->poll) {
-		sock_poll_busy_loop(sock, events);
-		mask = sock->ops->poll(file, sock, wait);
-	} else if (sock->ops->poll_mask) {
-		sock_poll_wait(file, sock_get_poll_head(file, events), wait);
-		mask = sock->ops->poll_mask(sock, events);
-	}
+	__poll_t events = poll_requested_events(wait);
 
-	return mask | sock_poll_busy_flag(sock);
+	sock_poll_busy_loop(sock, events);
+	return sock->ops->poll(file, sock, wait) | sock_poll_busy_flag(sock);
 }
 
 static int sock_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 14a5d055717d..930852c54d7a 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -692,9 +692,10 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
 }
 
 /**
- * tipc_poll - read pollmask
+ * tipc_poll - read and possibly block on pollmask
  * @file: file structure associated with the socket
  * @sock: socket for which to calculate the poll bits
+ * @wait: ???
  *
  * Returns pollmask value
  *
@@ -708,12 +709,15 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
  * imply that the operation will succeed, merely that it should be performed
  * and will not block.
  */
-static __poll_t tipc_poll_mask(struct socket *sock, __poll_t events)
+static __poll_t tipc_poll(struct file *file, struct socket *sock,
+			      poll_table *wait)
 {
 	struct sock *sk = sock->sk;
 	struct tipc_sock *tsk = tipc_sk(sk);
 	__poll_t revents = 0;
 
+	sock_poll_wait(file, sk_sleep(sk), wait);
+
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
 		revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
@@ -3033,7 +3037,7 @@ static const struct proto_ops msg_ops = {
 	.socketpair	= tipc_socketpair,
 	.accept		= sock_no_accept,
 	.getname	= tipc_getname,
-	.poll_mask	= tipc_poll_mask,
+	.poll		= tipc_poll,
 	.ioctl		= tipc_ioctl,
 	.listen		= sock_no_listen,
 	.shutdown	= tipc_shutdown,
@@ -3054,7 +3058,7 @@ static const struct proto_ops packet_ops = {
 	.socketpair	= tipc_socketpair,
 	.accept		= tipc_accept,
 	.getname	= tipc_getname,
-	.poll_mask	= tipc_poll_mask,
+	.poll		= tipc_poll,
 	.ioctl		= tipc_ioctl,
 	.listen		= tipc_listen,
 	.shutdown	= tipc_shutdown,
@@ -3075,7 +3079,7 @@ static const struct proto_ops stream_ops = {
 	.socketpair	= tipc_socketpair,
 	.accept		= tipc_accept,
 	.getname	= tipc_getname,
-	.poll_mask	= tipc_poll_mask,
+	.poll		= tipc_poll,
 	.ioctl		= tipc_ioctl,
 	.listen		= tipc_listen,
 	.shutdown	= tipc_shutdown,
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index a127d61e8af9..301f22430469 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -712,7 +712,7 @@ static int __init tls_register(void)
 	build_protos(tls_prots[TLSV4], &tcp_prot);
 
 	tls_sw_proto_ops = inet_stream_ops;
-	tls_sw_proto_ops.poll_mask = tls_sw_poll_mask;
+	tls_sw_proto_ops.poll = tls_sw_poll;
 	tls_sw_proto_ops.splice_read = tls_sw_splice_read;
 
 #ifdef CONFIG_TLS_DEVICE
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index f127fac88acf..d2380548f8f6 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -919,22 +919,23 @@ splice_read_end:
 	return copied ? : err;
 }
 
-__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events)
+unsigned int tls_sw_poll(struct file *file, struct socket *sock,
+			 struct poll_table_struct *wait)
 {
+	unsigned int ret;
 	struct sock *sk = sock->sk;
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
-	__poll_t mask;
 
-	/* Grab EPOLLOUT and EPOLLHUP from the underlying socket */
-	mask = ctx->sk_poll_mask(sock, events);
+	/* Grab POLLOUT and POLLHUP from the underlying socket */
+	ret = ctx->sk_poll(file, sock, wait);
 
-	/* Clear EPOLLIN bits, and set based on recv_pkt */
-	mask &= ~(EPOLLIN | EPOLLRDNORM);
+	/* Clear POLLIN bits, and set based on recv_pkt */
+	ret &= ~(POLLIN | POLLRDNORM);
 	if (ctx->recv_pkt)
-		mask |= EPOLLIN | EPOLLRDNORM;
+		ret |= POLLIN | POLLRDNORM;
 
-	return mask;
+	return ret;
 }
 
 static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
@@ -1191,7 +1192,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		sk->sk_data_ready = tls_data_ready;
 		write_unlock_bh(&sk->sk_callback_lock);
 
-		sw_ctx_rx->sk_poll_mask = sk->sk_socket->ops->poll_mask;
+		sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll;
 
 		strp_check_rcv(&sw_ctx_rx->strp);
 	}
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 95b02a71fd47..e5473c03d667 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -638,8 +638,9 @@ static int unix_stream_connect(struct socket *, struct sockaddr *,
 static int unix_socketpair(struct socket *, struct socket *);
 static int unix_accept(struct socket *, struct socket *, int, bool);
 static int unix_getname(struct socket *, struct sockaddr *, int);
-static __poll_t unix_poll_mask(struct socket *, __poll_t);
-static __poll_t unix_dgram_poll_mask(struct socket *, __poll_t);
+static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
+static __poll_t unix_dgram_poll(struct file *, struct socket *,
+				    poll_table *);
 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 static int unix_shutdown(struct socket *, int);
 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
@@ -680,7 +681,7 @@ static const struct proto_ops unix_stream_ops = {
 	.socketpair =	unix_socketpair,
 	.accept =	unix_accept,
 	.getname =	unix_getname,
-	.poll_mask =	unix_poll_mask,
+	.poll =		unix_poll,
 	.ioctl =	unix_ioctl,
 	.listen =	unix_listen,
 	.shutdown =	unix_shutdown,
@@ -703,7 +704,7 @@ static const struct proto_ops unix_dgram_ops = {
 	.socketpair =	unix_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	unix_getname,
-	.poll_mask =	unix_dgram_poll_mask,
+	.poll =		unix_dgram_poll,
 	.ioctl =	unix_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	unix_shutdown,
@@ -725,7 +726,7 @@ static const struct proto_ops unix_seqpacket_ops = {
 	.socketpair =	unix_socketpair,
 	.accept =	unix_accept,
 	.getname =	unix_getname,
-	.poll_mask =	unix_dgram_poll_mask,
+	.poll =		unix_dgram_poll,
 	.ioctl =	unix_ioctl,
 	.listen =	unix_listen,
 	.shutdown =	unix_shutdown,
@@ -2629,10 +2630,13 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	return err;
 }
 
-static __poll_t unix_poll_mask(struct socket *sock, __poll_t events)
+static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
 {
 	struct sock *sk = sock->sk;
-	__poll_t mask = 0;
+	__poll_t mask;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+	mask = 0;
 
 	/* exceptional events? */
 	if (sk->sk_err)
@@ -2661,11 +2665,15 @@ static __poll_t unix_poll_mask(struct socket *sock, __poll_t events)
 	return mask;
 }
 
-static __poll_t unix_dgram_poll_mask(struct socket *sock, __poll_t events)
+static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
+				    poll_table *wait)
 {
 	struct sock *sk = sock->sk, *other;
-	int writable;
-	__poll_t mask = 0;
+	unsigned int writable;
+	__poll_t mask;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+	mask = 0;
 
 	/* exceptional events? */
 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
@@ -2691,7 +2699,7 @@ static __poll_t unix_dgram_poll_mask(struct socket *sock, __poll_t events)
 	}
 
 	/* No write status requested, avoid expensive OUT tests. */
-	if (!(events & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
+	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
 		return mask;
 
 	writable = unix_writable(sk);
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index bb5d5fa68c35..c1076c19b858 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -850,11 +850,18 @@ static int vsock_shutdown(struct socket *sock, int mode)
 	return err;
 }
 
-static __poll_t vsock_poll_mask(struct socket *sock, __poll_t events)
+static __poll_t vsock_poll(struct file *file, struct socket *sock,
+			       poll_table *wait)
 {
-	struct sock *sk = sock->sk;
-	struct vsock_sock *vsk = vsock_sk(sk);
-	__poll_t mask = 0;
+	struct sock *sk;
+	__poll_t mask;
+	struct vsock_sock *vsk;
+
+	sk = sock->sk;
+	vsk = vsock_sk(sk);
+
+	poll_wait(file, sk_sleep(sk), wait);
+	mask = 0;
 
 	if (sk->sk_err)
 		/* Signify that there has been an error on this socket. */
@@ -1084,7 +1091,7 @@ static const struct proto_ops vsock_dgram_ops = {
 	.socketpair = sock_no_socketpair,
 	.accept = sock_no_accept,
 	.getname = vsock_getname,
-	.poll_mask = vsock_poll_mask,
+	.poll = vsock_poll,
 	.ioctl = sock_no_ioctl,
 	.listen = sock_no_listen,
 	.shutdown = vsock_shutdown,
@@ -1842,7 +1849,7 @@ static const struct proto_ops vsock_stream_ops = {
 	.socketpair = sock_no_socketpair,
 	.accept = vsock_accept,
 	.getname = vsock_getname,
-	.poll_mask = vsock_poll_mask,
+	.poll = vsock_poll,
 	.ioctl = sock_no_ioctl,
 	.listen = vsock_listen,
 	.shutdown = vsock_shutdown,
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index f93365ae0fdd..d49aa79b7997 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -1750,7 +1750,7 @@ static const struct proto_ops x25_proto_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	x25_accept,
 	.getname =	x25_getname,
-	.poll_mask =	datagram_poll_mask,
+	.poll =		datagram_poll,
 	.ioctl =	x25_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = compat_x25_ioctl,
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 3b3410ada097..59fb7d3c36a3 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -303,9 +303,10 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 	return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len);
 }
 
-static __poll_t xsk_poll_mask(struct socket *sock, __poll_t events)
+static unsigned int xsk_poll(struct file *file, struct socket *sock,
+			     struct poll_table_struct *wait)
 {
-	__poll_t mask = datagram_poll_mask(sock, events);
+	unsigned int mask = datagram_poll(file, sock, wait);
 	struct sock *sk = sock->sk;
 	struct xdp_sock *xs = xdp_sk(sk);
 
@@ -696,7 +697,7 @@ static const struct proto_ops xsk_proto_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= sock_no_accept,
 	.getname	= sock_no_getname,
-	.poll_mask	= xsk_poll_mask,
+	.poll		= xsk_poll,
 	.ioctl		= sock_no_ioctl,
 	.listen		= sock_no_listen,
 	.shutdown	= sock_no_shutdown,
-- 
cgit v1.2.3


From c860e997e9170a6d68f9d1e6e2cf61f572191aaf Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 27 Jun 2018 16:04:48 -0700
Subject: tcp: fix Fast Open key endianness

Fast Open key could be stored in different endian based on the CPU.
Previously hosts in different endianness in a server farm using
the same key config (sysctl value) would produce different cookies.
This patch fixes it by always storing it as little endian to keep
same API for LE hosts.

Reported-by: Daniele Iamartino <danielei@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/sysctl_net_ipv4.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d06247ba08b2..af0a857d8352 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -265,8 +265,9 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
 	    ipv4.sysctl_tcp_fastopen);
 	struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
 	struct tcp_fastopen_context *ctxt;
-	int ret;
 	u32  user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */
+	__le32 key[4];
+	int ret, i;
 
 	tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
 	if (!tbl.data)
@@ -275,11 +276,14 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
 	rcu_read_lock();
 	ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
 	if (ctxt)
-		memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
+		memcpy(key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
 	else
-		memset(user_key, 0, sizeof(user_key));
+		memset(key, 0, sizeof(key));
 	rcu_read_unlock();
 
+	for (i = 0; i < ARRAY_SIZE(key); i++)
+		user_key[i] = le32_to_cpu(key[i]);
+
 	snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x",
 		user_key[0], user_key[1], user_key[2], user_key[3]);
 	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
@@ -290,13 +294,17 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
 			ret = -EINVAL;
 			goto bad_key;
 		}
-		tcp_fastopen_reset_cipher(net, NULL, user_key,
+
+		for (i = 0; i < ARRAY_SIZE(user_key); i++)
+			key[i] = cpu_to_le32(user_key[i]);
+
+		tcp_fastopen_reset_cipher(net, NULL, key,
 					  TCP_FASTOPEN_KEY_LENGTH);
 	}
 
 bad_key:
 	pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
-	       user_key[0], user_key[1], user_key[2], user_key[3],
+		user_key[0], user_key[1], user_key[2], user_key[3],
 	       (char *)tbl.data, ret);
 	kfree(tbl.data);
 	return ret;
-- 
cgit v1.2.3


From 1236f22fbae15df3736ab4a984c64c0c6ee6254c Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Date: Fri, 29 Jun 2018 13:07:53 +0300
Subject: tcp: prevent bogus FRTO undos with non-SACK flows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If SACK is not enabled and the first cumulative ACK after the RTO
retransmission covers more than the retransmitted skb, a spurious
FRTO undo will trigger (assuming FRTO is enabled for that RTO).
The reason is that any non-retransmitted segment acknowledged will
set FLAG_ORIG_SACK_ACKED in tcp_clean_rtx_queue even if there is
no indication that it would have been delivered for real (the
scoreboard is not kept with TCPCB_SACKED_ACKED bits in the non-SACK
case so the check for that bit won't help like it does with SACK).
Having FLAG_ORIG_SACK_ACKED set results in the spurious FRTO undo
in tcp_process_loss.

We need to use more strict condition for non-SACK case and check
that none of the cumulatively ACKed segments were retransmitted
to prove that progress is due to original transmissions. Only then
keep FLAG_ORIG_SACK_ACKED set, allowing FRTO undo to proceed in
non-SACK case.

(FLAG_ORIG_SACK_ACKED is planned to be renamed to FLAG_ORIG_PROGRESS
to better indicate its purpose but to keep this change minimal, it
will be done in another patch).

Besides burstiness and congestion control violations, this problem
can result in RTO loop: When the loss recovery is prematurely
undoed, only new data will be transmitted (if available) and
the next retransmission can occur only after a new RTO which in case
of multiple losses (that are not for consecutive packets) requires
one RTO per loss to recover.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Tested-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 045d930d01a9..8e5522c6833a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3181,6 +3181,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
 
 		if (tcp_is_reno(tp)) {
 			tcp_remove_reno_sacks(sk, pkts_acked);
+
+			/* If any of the cumulatively ACKed segments was
+			 * retransmitted, non-SACK case cannot confirm that
+			 * progress was due to original transmission due to
+			 * lack of TCPCB_SACKED_ACKED bits even if some of
+			 * the packets may have been never retransmitted.
+			 */
+			if (flag & FLAG_RETRANS_DATA_ACKED)
+				flag &= ~FLAG_ORIG_SACK_ACKED;
 		} else {
 			int delta;
 
-- 
cgit v1.2.3


From 603d4cf8fe095b1ee78f423d514427be507fb513 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Sat, 30 Jun 2018 17:38:55 +0200
Subject: net: fix use-after-free in GRO with ESP

Since the addition of GRO for ESP, gro_receive can consume the skb and
return -EINPROGRESS. In that case, the lower layer GRO handler cannot
touch the skb anymore.

Commit 5f114163f2f5 ("net: Add a skb_gro_flush_final helper.") converted
some of the gro_receive handlers that can lead to ESP's gro_receive so
that they wouldn't access the skb when -EINPROGRESS is returned, but
missed other spots, mainly in tunneling protocols.

This patch finishes the conversion to using skb_gro_flush_final(), and
adds a new helper, skb_gro_flush_final_remcsum(), used in VXLAN and
GUE.

Fixes: 5f114163f2f5 ("net: Add a skb_gro_flush_final helper.")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c      |  2 +-
 drivers/net/vxlan.c       |  4 +---
 include/linux/netdevice.h | 20 ++++++++++++++++++++
 net/8021q/vlan.c          |  2 +-
 net/ipv4/fou.c            |  4 +---
 net/ipv4/gre_offload.c    |  2 +-
 net/ipv4/udp_offload.c    |  2 +-
 7 files changed, 26 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 750eaa53bf0c..ada33c2d9ac2 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -476,7 +476,7 @@ static struct sk_buff **geneve_gro_receive(struct sock *sk,
 out_unlock:
 	rcu_read_unlock();
 out:
-	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_flush_final(skb, pp, flush);
 
 	return pp;
 }
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index aee0e60471f1..f6bb1d54d4bd 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -623,9 +623,7 @@ static struct sk_buff **vxlan_gro_receive(struct sock *sk,
 	flush = 0;
 
 out:
-	skb_gro_remcsum_cleanup(skb, &grc);
-	skb->remcsum_offload = 0;
-	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_flush_final_remcsum(skb, pp, flush, &grc);
 
 	return pp;
 }
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3ec9850c7936..3d0cc0b5cec2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2789,11 +2789,31 @@ static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp,
 	if (PTR_ERR(pp) != -EINPROGRESS)
 		NAPI_GRO_CB(skb)->flush |= flush;
 }
+static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb,
+					       struct sk_buff **pp,
+					       int flush,
+					       struct gro_remcsum *grc)
+{
+	if (PTR_ERR(pp) != -EINPROGRESS) {
+		NAPI_GRO_CB(skb)->flush |= flush;
+		skb_gro_remcsum_cleanup(skb, grc);
+		skb->remcsum_offload = 0;
+	}
+}
 #else
 static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, int flush)
 {
 	NAPI_GRO_CB(skb)->flush |= flush;
 }
+static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb,
+					       struct sk_buff **pp,
+					       int flush,
+					       struct gro_remcsum *grc)
+{
+	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_remcsum_cleanup(skb, grc);
+	skb->remcsum_offload = 0;
+}
 #endif
 
 static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 73a65789271b..8ccee3d01822 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -693,7 +693,7 @@ static struct sk_buff **vlan_gro_receive(struct sk_buff **head,
 out_unlock:
 	rcu_read_unlock();
 out:
-	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_flush_final(skb, pp, flush);
 
 	return pp;
 }
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 1540db65241a..c9ec1603666b 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -448,9 +448,7 @@ next_proto:
 out_unlock:
 	rcu_read_unlock();
 out:
-	NAPI_GRO_CB(skb)->flush |= flush;
-	skb_gro_remcsum_cleanup(skb, &grc);
-	skb->remcsum_offload = 0;
+	skb_gro_flush_final_remcsum(skb, pp, flush, &grc);
 
 	return pp;
 }
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 1859c473b21a..6a7d980105f6 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -223,7 +223,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
 out_unlock:
 	rcu_read_unlock();
 out:
-	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_flush_final(skb, pp, flush);
 
 	return pp;
 }
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 92dc9e5a7ff3..69c54540d5b4 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -394,7 +394,7 @@ unflush:
 out_unlock:
 	rcu_read_unlock();
 out:
-	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_flush_final(skb, pp, flush);
 	return pp;
 }
 EXPORT_SYMBOL(udp_gro_receive);
-- 
cgit v1.2.3


From d376bef9c29b3c65aeee4e785fffcd97ef0a9a81 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 4 Jul 2018 20:25:32 +0200
Subject: netfilter: x_tables: set module owner for icmp(6) matches

nft_compat relies on xt_request_find_match to increment
refcount of the module that provides the match/target.

The (builtin) icmp matches did't set the module owner so it
was possible to rmmod ip(6)tables while icmp extensions were still in use.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ip_tables.c  | 1 +
 net/ipv6/netfilter/ip6_tables.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index ca0dad90803a..e77872c93c20 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1898,6 +1898,7 @@ static struct xt_match ipt_builtin_mt[] __read_mostly = {
 		.checkentry = icmp_checkentry,
 		.proto      = IPPROTO_ICMP,
 		.family     = NFPROTO_IPV4,
+		.me	    = THIS_MODULE,
 	},
 };
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 7eab959734bc..daf2e9e9193d 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1909,6 +1909,7 @@ static struct xt_match ip6t_builtin_mt[] __read_mostly = {
 		.checkentry = icmp6_checkentry,
 		.proto      = IPPROTO_ICMPV6,
 		.family     = NFPROTO_IPV6,
+		.me	    = THIS_MODULE,
 	},
 };
 
-- 
cgit v1.2.3


From 70ba5b6db96ff7324b8cfc87e0d0383cf59c9677 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@canonical.com>
Date: Thu, 5 Jul 2018 18:49:23 +0000
Subject: ipv4: Return EINVAL when ping_group_range sysctl doesn't map to user
 ns

The low and high values of the net.ipv4.ping_group_range sysctl were
being silently forced to the default disabled state when a write to the
sysctl contained GIDs that didn't map to the associated user namespace.
Confusingly, the sysctl's write operation would return success and then
a subsequent read of the sysctl would indicate that the low and high
values are the overflowgid.

This patch changes the behavior by clearly returning an error when the
sysctl write operation receives a GID range that doesn't map to the
associated user namespace. In such a situation, the previous value of
the sysctl is preserved and that range will be returned in a subsequent
read of the sysctl.

Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/sysctl_net_ipv4.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index af0a857d8352..5fa335fd3852 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -189,8 +189,9 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
 	if (write && ret == 0) {
 		low = make_kgid(user_ns, urange[0]);
 		high = make_kgid(user_ns, urange[1]);
-		if (!gid_valid(low) || !gid_valid(high) ||
-		    (urange[1] < urange[0]) || gid_lt(high, low)) {
+		if (!gid_valid(low) || !gid_valid(high))
+			return -EINVAL;
+		if (urange[1] < urange[0] || gid_lt(high, low)) {
 			low = make_kgid(&init_user_ns, 1);
 			high = make_kgid(&init_user_ns, 0);
 		}
-- 
cgit v1.2.3


From 5711b4e89319c2912f20b2a4f371c1525fc9551d Mon Sep 17 00:00:00 2001
From: Máté Eckl <ecklm94@gmail.com>
Date: Thu, 5 Jul 2018 12:01:53 +0200
Subject: netfilter: nf_tproxy: fix possible non-linear access to transport
 header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch fixes a silent out-of-bound read possibility that was present
because of the misuse of this function.

Mostly it was called with a struct udphdr *hp which had only the udphdr
part linearized by the skb_header_pointer, however
nf_tproxy_get_sock_v{4,6} uses it as a tcphdr pointer, so some reads for
tcp specific attributes may be invalid.

Fixes: a583636a83ea ("inet: refactor inet[6]_lookup functions to take skb")
Signed-off-by: Máté Eckl <ecklm94@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tproxy.h   |  4 ++--
 net/ipv4/netfilter/nf_tproxy_ipv4.c | 18 ++++++++++++------
 net/ipv6/netfilter/nf_tproxy_ipv6.c | 18 ++++++++++++------
 net/netfilter/xt_TPROXY.c           |  8 ++++----
 4 files changed, 30 insertions(+), 18 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/netfilter/nf_tproxy.h b/include/net/netfilter/nf_tproxy.h
index 9754a50ecde9..4cc64c8446eb 100644
--- a/include/net/netfilter/nf_tproxy.h
+++ b/include/net/netfilter/nf_tproxy.h
@@ -64,7 +64,7 @@ nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
  * belonging to established connections going through that one.
  */
 struct sock *
-nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
+nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
 		      const u8 protocol,
 		      const __be32 saddr, const __be32 daddr,
 		      const __be16 sport, const __be16 dport,
@@ -103,7 +103,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
 			    struct sock *sk);
 
 struct sock *
-nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
+nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff,
 		      const u8 protocol,
 		      const struct in6_addr *saddr, const struct in6_addr *daddr,
 		      const __be16 sport, const __be16 dport,
diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c
index 805e83ec3ad9..164714104965 100644
--- a/net/ipv4/netfilter/nf_tproxy_ipv4.c
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -37,7 +37,7 @@ nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
 		 * to a listener socket if there's one */
 		struct sock *sk2;
 
-		sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
+		sk2 = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
 					    iph->saddr, laddr ? laddr : iph->daddr,
 					    hp->source, lport ? lport : hp->dest,
 					    skb->dev, NF_TPROXY_LOOKUP_LISTENER);
@@ -71,7 +71,7 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
 EXPORT_SYMBOL_GPL(nf_tproxy_laddr4);
 
 struct sock *
-nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
+nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
 		      const u8 protocol,
 		      const __be32 saddr, const __be32 daddr,
 		      const __be16 sport, const __be16 dport,
@@ -79,16 +79,21 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
 		      const enum nf_tproxy_lookup_t lookup_type)
 {
 	struct sock *sk;
-	struct tcphdr *tcph;
 
 	switch (protocol) {
-	case IPPROTO_TCP:
+	case IPPROTO_TCP: {
+		struct tcphdr _hdr, *hp;
+
+		hp = skb_header_pointer(skb, ip_hdrlen(skb),
+					sizeof(struct tcphdr), &_hdr);
+		if (hp == NULL)
+			return NULL;
+
 		switch (lookup_type) {
 		case NF_TPROXY_LOOKUP_LISTENER:
-			tcph = hp;
 			sk = inet_lookup_listener(net, &tcp_hashinfo, skb,
 						    ip_hdrlen(skb) +
-						      __tcp_hdrlen(tcph),
+						      __tcp_hdrlen(hp),
 						    saddr, sport,
 						    daddr, dport,
 						    in->ifindex, 0);
@@ -110,6 +115,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
 			BUG();
 		}
 		break;
+		}
 	case IPPROTO_UDP:
 		sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
 				     in->ifindex);
diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c
index bf1d6c421e3b..5dfd33af6451 100644
--- a/net/ipv6/netfilter/nf_tproxy_ipv6.c
+++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c
@@ -55,7 +55,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
 		 * to a listener socket if there's one */
 		struct sock *sk2;
 
-		sk2 = nf_tproxy_get_sock_v6(net, skb, thoff, hp, tproto,
+		sk2 = nf_tproxy_get_sock_v6(net, skb, thoff, tproto,
 					    &iph->saddr,
 					    nf_tproxy_laddr6(skb, laddr, &iph->daddr),
 					    hp->source,
@@ -72,7 +72,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
 EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait6);
 
 struct sock *
-nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
+nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff,
 		      const u8 protocol,
 		      const struct in6_addr *saddr, const struct in6_addr *daddr,
 		      const __be16 sport, const __be16 dport,
@@ -80,15 +80,20 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
 		      const enum nf_tproxy_lookup_t lookup_type)
 {
 	struct sock *sk;
-	struct tcphdr *tcph;
 
 	switch (protocol) {
-	case IPPROTO_TCP:
+	case IPPROTO_TCP: {
+		struct tcphdr _hdr, *hp;
+
+		hp = skb_header_pointer(skb, thoff,
+					sizeof(struct tcphdr), &_hdr);
+		if (hp == NULL)
+			return NULL;
+
 		switch (lookup_type) {
 		case NF_TPROXY_LOOKUP_LISTENER:
-			tcph = hp;
 			sk = inet6_lookup_listener(net, &tcp_hashinfo, skb,
-						   thoff + __tcp_hdrlen(tcph),
+						   thoff + __tcp_hdrlen(hp),
 						   saddr, sport,
 						   daddr, ntohs(dport),
 						   in->ifindex, 0);
@@ -110,6 +115,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
 			BUG();
 		}
 		break;
+		}
 	case IPPROTO_UDP:
 		sk = udp6_lib_lookup(net, saddr, sport, daddr, dport,
 				     in->ifindex);
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 58fce4e749a9..d76550a8b642 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -61,7 +61,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
 	 * addresses, this happens if the redirect already happened
 	 * and the current packet belongs to an already established
 	 * connection */
-	sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
+	sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
 				   iph->saddr, iph->daddr,
 				   hp->source, hp->dest,
 				   skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);
@@ -77,7 +77,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
 	else if (!sk)
 		/* no, there's no established connection, check if
 		 * there's a listener on the redirected addr/port */
-		sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
+		sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
 					   iph->saddr, laddr,
 					   hp->source, lport,
 					   skb->dev, NF_TPROXY_LOOKUP_LISTENER);
@@ -150,7 +150,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 	 * addresses, this happens if the redirect already happened
 	 * and the current packet belongs to an already established
 	 * connection */
-	sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto,
+	sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, tproto,
 				   &iph->saddr, &iph->daddr,
 				   hp->source, hp->dest,
 				   xt_in(par), NF_TPROXY_LOOKUP_ESTABLISHED);
@@ -171,7 +171,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 	else if (!sk)
 		/* no there's no established connection, check if
 		 * there's a listener on the redirected addr/port */
-		sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp,
+		sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff,
 					   tproto, &iph->saddr, laddr,
 					   hp->source, lport,
 					   xt_in(par), NF_TPROXY_LOOKUP_LISTENER);
-- 
cgit v1.2.3


From e7372197e15856ec4ee66b668020a662994db103 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sat, 7 Jul 2018 16:15:26 -0700
Subject: net/ipv4: Set oif in fib_compute_spec_dst

Xin reported that icmp replies may not use the address on the device the
echo request is received if the destination address is broadcast. Instead
a route lookup is done without considering VRF context. Fix by setting
oif in flow struct to the master device if it is enslaved. That directs
the lookup to the VRF table. If the device is not enslaved, oif is still
0 so no affect.

Fixes: cd2fbe1b6b51 ("net: Use VRF device index for lookups on RX")
Reported-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index b21833651394..e46cdd310e5f 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -300,6 +300,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
 	if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
 		struct flowi4 fl4 = {
 			.flowi4_iif = LOOPBACK_IFINDEX,
+			.flowi4_oif = l3mdev_master_ifindex_rcu(dev),
 			.daddr = ip_hdr(skb)->saddr,
 			.flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
 			.flowi4_scope = scope,
-- 
cgit v1.2.3


From acc2cf4e37174646a24cba42fa53c668b2338d4e Mon Sep 17 00:00:00 2001
From: Lorenzo Colitti <lorenzo@google.com>
Date: Sat, 7 Jul 2018 16:31:40 +0900
Subject: net: diag: Don't double-free TCP_NEW_SYN_RECV sockets in tcp_abort

When tcp_diag_destroy closes a TCP_NEW_SYN_RECV socket, it first
frees it by calling inet_csk_reqsk_queue_drop_and_and_put in
tcp_abort, and then frees it again by calling sock_gen_put.

Since tcp_abort only has one caller, and all the other codepaths
in tcp_abort don't free the socket, just remove the free in that
function.

Cc: David Ahern <dsa@cumulusnetworks.com>
Tested: passes Android sock_diag_test.py, which exercises this codepath
Fixes: d7226c7a4dd1 ("net: diag: Fix refcnt leak in error path destroying socket")
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsa@cumulusnetworks.com>
Tested-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e7b53d2a971f..c959bb6ea4ed 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3720,8 +3720,7 @@ int tcp_abort(struct sock *sk, int err)
 			struct request_sock *req = inet_reqsk(sk);
 
 			local_bh_disable();
-			inet_csk_reqsk_queue_drop_and_put(req->rsk_listener,
-							  req);
+			inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 			local_bh_enable();
 			return 0;
 		}
-- 
cgit v1.2.3


From f6f2a4a2eb92bc73671204198bb2f8ab53ff59fb Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 6 Jul 2018 12:30:20 +0200
Subject: ipfrag: really prevent allocation on netns exit

Setting the low threshold to 0 has no effect on frags allocation,
we need to clear high_thresh instead.

The code was pre-existent to commit 648700f76b03 ("inet: frags:
use rhashtables for reassembly units"), but before the above,
such assignment had a different role: prevent concurrent eviction
from the worker and the netns cleanup helper.

Fixes: 648700f76b03 ("inet: frags: use rhashtables for reassembly units")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_fragment.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index c9e35b81d093..1e4cf3ab560f 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -90,7 +90,7 @@ static void inet_frags_free_cb(void *ptr, void *arg)
 
 void inet_frags_exit_net(struct netns_frags *nf)
 {
-	nf->low_thresh = 0; /* prevent creation of new frags */
+	nf->high_thresh = 0; /* prevent creation of new frags */
 
 	rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
 }
-- 
cgit v1.2.3


From 6508b6781be076f889e3077a1a5fadf1930a569d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 7 Jul 2018 23:00:01 -0700
Subject: tcp: cleanup copied_seq and urg_data in tcp_disconnect

tcp_zerocopy_receive() relies on tcp_inq() to limit number of bytes
requested by user.

syzbot found that after tcp_disconnect(), tcp_inq() was returning
a stale value (number of bytes in queue before the disconnect).

Note that after this patch, ioctl(fd, SIOCINQ, &val) is also fixed
and returns 0, so this might be a candidate for all known linux kernels.

While we are at this, we probably also should clear urg_data to
avoid other syzkaller reports after it discovers how to deal with
urgent data.

syzkaller repro :

socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 3
bind(3, {sa_family=AF_INET, sin_port=htons(20000), sin_addr=inet_addr("224.0.0.1")}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(20000), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
send(3, ..., 4096, 0) = 4096
connect(3, {sa_family=AF_UNSPEC, sa_data="\0\0\0\0\0\0\0\0\0\0\0\0\0\0"}, 128) = 0
getsockopt(3, SOL_TCP, TCP_ZEROCOPY_RECEIVE, ..., [16]) = 0 // CRASH

Fixes: 05255b823a61 ("tcp: add TCP_ZEROCOPY_RECEIVE support for zerocopy receive")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c959bb6ea4ed..0d43705dd001 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2562,6 +2562,8 @@ int tcp_disconnect(struct sock *sk, int flags)
 
 	tcp_clear_xmit_timers(sk);
 	__skb_queue_purge(&sk->sk_receive_queue);
+	tp->copied_seq = tp->rcv_nxt;
+	tp->urg_data = 0;
 	tcp_write_queue_purge(sk);
 	tcp_fastopen_active_disable_ofo_check(sk);
 	skb_rbtree_purge(&tp->out_of_order_queue);
-- 
cgit v1.2.3


From 21684dc46c598e477707487c009f9773f7c0382d Mon Sep 17 00:00:00 2001
From: Stefan Baranoff <sbaranoff@gmail.com>
Date: Tue, 10 Jul 2018 17:25:20 -0400
Subject: tcp: fix sequence numbers for repaired sockets re-using TIME-WAIT
 sockets

This patch fixes a bug where the sequence numbers of a socket created using
TCP repair functionality are lower than set after connect is called.
This occurs when the repair socket overlaps with a TIME-WAIT socket and
triggers the re-use code. The amount lower is equal to the number of times
that a particular IP/port set is re-used and then put back into TIME-WAIT.
Re-using the first time the sequence number is 1 lower, closing that socket
and then re-opening (with repair) a new socket with the same addresses/ports
puts the sequence number 2 lower than set via setsockopt. The third time is
3 lower, etc. I have not tested what the limit of this acrewal is, if any.

The fix is, if a socket is in repair mode, to respect the already set
sequence number and timestamp when it would have already re-used the
TIME-WAIT socket.

Signed-off-by: Stefan Baranoff <sbaranoff@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index bea17f1e8302..3b2711e33e4c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -156,11 +156,24 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 	 */
 	if (tcptw->tw_ts_recent_stamp &&
 	    (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
-		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
-		if (tp->write_seq == 0)
-			tp->write_seq = 1;
-		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
-		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+		/* In case of repair and re-using TIME-WAIT sockets we still
+		 * want to be sure that it is safe as above but honor the
+		 * sequence numbers and time stamps set as part of the repair
+		 * process.
+		 *
+		 * Without this check re-using a TIME-WAIT socket with TCP
+		 * repair would accumulate a -1 on the repair assigned
+		 * sequence number. The first time it is reused the sequence
+		 * is -1, the second time -2, etc. This fixes that issue
+		 * without appearing to create any others.
+		 */
+		if (likely(!tp->repair)) {
+			tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
+			if (tp->write_seq == 0)
+				tp->write_seq = 1;
+			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
+			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+		}
 		sock_hold(sktw);
 		return 1;
 	}
-- 
cgit v1.2.3


From 70b7ff130224d2d22a158c7f4aa5e7fb1c95949d Mon Sep 17 00:00:00 2001
From: Stefan Baranoff <sbaranoff@gmail.com>
Date: Tue, 10 Jul 2018 17:31:10 -0400
Subject: tcp: allow user to create repair socket without window probes

Under rare conditions where repair code may be used it is possible that
window probes are either unnecessary or undesired. If the user knows that
window probes are not wanted or needed this change allows them to skip
sending them when a socket comes out of repair.

Signed-off-by: Stefan Baranoff <sbaranoff@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0d43705dd001..8e5e2ca9ab1b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2823,14 +2823,16 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 	case TCP_REPAIR:
 		if (!tcp_can_repair_sock(sk))
 			err = -EPERM;
-		else if (val == 1) {
-			tp->repair = 1;
+		/* 1 for normal repair, 2 for no window probes */
+		else if (val == 1 || val == 2) {
+			tp->repair = val;
 			sk->sk_reuse = SK_FORCE_REUSE;
 			tp->repair_queue = TCP_NO_QUEUE;
 		} else if (val == 0) {
 			tp->repair = 0;
 			sk->sk_reuse = SK_NO_REUSE;
-			tcp_send_window_probe(sk);
+			if (tp->repair == 1)
+				tcp_send_window_probe(sk);
 		} else
 			err = -EINVAL;
 
-- 
cgit v1.2.3


From b0c05d0e99d98d7f0cd41efc1eeec94efdc3325d Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 12 Jul 2018 06:04:52 -0700
Subject: tcp: fix dctcp delayed ACK schedule

Previously, when a data segment was sent an ACK was piggybacked
on the data segment without generating a CA_EVENT_NON_DELAYED_ACK
event to notify congestion control modules. So the DCTCP
ca->delayed_ack_reserved flag could incorrectly stay set when
in fact there were no delayed ACKs being reserved. This could result
in sending a special ECN notification ACK that carries an older
ACK sequence, when in fact there was no need for such an ACK.
DCTCP keeps track of the delayed ACK status with its own separate
state ca->delayed_ack_reserved. Previously it may accidentally cancel
the delayed ACK without updating this field upon sending a special
ACK that carries a older ACK sequence. This inconsistency would
lead to DCTCP receiver never acknowledging the latest data until the
sender times out and retry in some cases.

Packetdrill script (provided by Larry Brakmo)

0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
0.000 setsockopt(3, SOL_TCP, TCP_CONGESTION, "dctcp", 5) = 0
0.000 bind(3, ..., ...) = 0
0.000 listen(3, 1) = 0

0.100 < [ect0] SEW 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
0.100 > SE. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
0.110 < [ect0] . 1:1(0) ack 1 win 257
0.200 accept(3, ..., ...) = 4

0.200 < [ect0] . 1:1001(1000) ack 1 win 257
0.200 > [ect01] . 1:1(0) ack 1001

0.200 write(4, ..., 1) = 1
0.200 > [ect01] P. 1:2(1) ack 1001

0.200 < [ect0] . 1001:2001(1000) ack 2 win 257
0.200 write(4, ..., 1) = 1
0.200 > [ect01] P. 2:3(1) ack 2001

0.200 < [ect0] . 2001:3001(1000) ack 3 win 257
0.200 < [ect0] . 3001:4001(1000) ack 3 win 257
0.200 > [ect01] . 3:3(0) ack 4001

0.210 < [ce] P. 4001:4501(500) ack 3 win 257

+0.001 read(4, ..., 4500) = 4500
+0 write(4, ..., 1) = 1
+0 > [ect01] PE. 3:4(1) ack 4501

+0.010 < [ect0] W. 4501:5501(1000) ack 4 win 257
// Previously the ACK sequence below would be 4501, causing a long RTO
+0.040~+0.045 > [ect01] . 4:4(0) ack 5501   // delayed ack

+0.311 < [ect0] . 5501:6501(1000) ack 4 win 257  // More data
+0 > [ect01] . 4:4(0) ack 6501     // now acks everything

+0.500 < F. 9501:9501(0) ack 4 win 257

Reported-by: Larry Brakmo <brakmo@fb.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_dctcp.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 5f5e5936760e..89f88b0d8167 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -134,7 +134,8 @@ static void dctcp_ce_state_0_to_1(struct sock *sk)
 	/* State has changed from CE=0 to CE=1 and delayed
 	 * ACK has not sent yet.
 	 */
-	if (!ca->ce_state && ca->delayed_ack_reserved) {
+	if (!ca->ce_state &&
+	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
 		u32 tmp_rcv_nxt;
 
 		/* Save current rcv_nxt. */
@@ -164,7 +165,8 @@ static void dctcp_ce_state_1_to_0(struct sock *sk)
 	/* State has changed from CE=1 to CE=0 and delayed
 	 * ACK has not sent yet.
 	 */
-	if (ca->ce_state && ca->delayed_ack_reserved) {
+	if (ca->ce_state &&
+	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
 		u32 tmp_rcv_nxt;
 
 		/* Save current rcv_nxt. */
-- 
cgit v1.2.3


From a69258f7aa2623e0930212f09c586fd06674ad79 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 12 Jul 2018 06:04:53 -0700
Subject: tcp: remove DELAYED ACK events in DCTCP

After fixing the way DCTCP tracking delayed ACKs, the delayed-ACK
related callbacks are no longer needed

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     |  2 --
 net/ipv4/tcp_dctcp.c  | 25 -------------------------
 net/ipv4/tcp_output.c |  4 ----
 3 files changed, 31 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index af3ec72d5d41..3482d13d655b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -912,8 +912,6 @@ enum tcp_ca_event {
 	CA_EVENT_LOSS,		/* loss timeout */
 	CA_EVENT_ECN_NO_CE,	/* ECT set, but not CE marked */
 	CA_EVENT_ECN_IS_CE,	/* received CE marked IP packet */
-	CA_EVENT_DELAYED_ACK,	/* Delayed ack is sent */
-	CA_EVENT_NON_DELAYED_ACK,
 };
 
 /* Information about inbound ACK, passed to cong_ops->in_ack_event() */
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 89f88b0d8167..5869f89ca656 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -55,7 +55,6 @@ struct dctcp {
 	u32 dctcp_alpha;
 	u32 next_seq;
 	u32 ce_state;
-	u32 delayed_ack_reserved;
 	u32 loss_cwnd;
 };
 
@@ -96,7 +95,6 @@ static void dctcp_init(struct sock *sk)
 
 		ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
 
-		ca->delayed_ack_reserved = 0;
 		ca->loss_cwnd = 0;
 		ca->ce_state = 0;
 
@@ -250,25 +248,6 @@ static void dctcp_state(struct sock *sk, u8 new_state)
 	}
 }
 
-static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev)
-{
-	struct dctcp *ca = inet_csk_ca(sk);
-
-	switch (ev) {
-	case CA_EVENT_DELAYED_ACK:
-		if (!ca->delayed_ack_reserved)
-			ca->delayed_ack_reserved = 1;
-		break;
-	case CA_EVENT_NON_DELAYED_ACK:
-		if (ca->delayed_ack_reserved)
-			ca->delayed_ack_reserved = 0;
-		break;
-	default:
-		/* Don't care for the rest. */
-		break;
-	}
-}
-
 static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
 {
 	switch (ev) {
@@ -278,10 +257,6 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
 	case CA_EVENT_ECN_NO_CE:
 		dctcp_ce_state_1_to_0(sk);
 		break;
-	case CA_EVENT_DELAYED_ACK:
-	case CA_EVENT_NON_DELAYED_ACK:
-		dctcp_update_ack_reserved(sk, ev);
-		break;
 	default:
 		/* Don't care for the rest. */
 		break;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8e08b409c71e..00e5a300ddb9 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3523,8 +3523,6 @@ void tcp_send_delayed_ack(struct sock *sk)
 	int ato = icsk->icsk_ack.ato;
 	unsigned long timeout;
 
-	tcp_ca_event(sk, CA_EVENT_DELAYED_ACK);
-
 	if (ato > TCP_DELACK_MIN) {
 		const struct tcp_sock *tp = tcp_sk(sk);
 		int max_ato = HZ / 2;
@@ -3581,8 +3579,6 @@ void tcp_send_ack(struct sock *sk)
 	if (sk->sk_state == TCP_CLOSE)
 		return;
 
-	tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
-
 	/* We are not putting this on the write queue, so
 	 * tcp_transmit_skb() will set the ownership to this
 	 * sock.
-- 
cgit v1.2.3


From 6e2059b53f9885f202b086d7b4ca10a98926e974 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Tue, 10 Jul 2018 22:41:26 +0800
Subject: ipv4/igmp: init group mode as INCLUDE when join source group

Based on RFC3376 5.1
   If no interface
   state existed for that multicast address before the change (i.e., the
   change consisted of creating a new per-interface record), or if no
   state exists after the change (i.e., the change consisted of deleting
   a per-interface record), then the "non-existent" state is considered
   to have a filter mode of INCLUDE and an empty source list.

Which means a new multicast group should start with state IN().

Function ip_mc_join_group() works correctly for IGMP ASM(Any-Source Multicast)
mode. It adds a group with state EX() and inits crcount to mc_qrv,
so the kernel will send a TO_EX() report message after adding group.

But for IGMPv3 SSM(Source-specific multicast) JOIN_SOURCE_GROUP mode, we
split the group joining into two steps. First we join the group like ASM,
i.e. via ip_mc_join_group(). So the state changes from IN() to EX().

Then we add the source-specific address with INCLUDE mode. So the state
changes from EX() to IN(A).

Before the first step sends a group change record, we finished the second
step. So we will only send the second change record. i.e. TO_IN(A).

Regarding the RFC stands, we should actually send an ALLOW(A) message for
SSM JOIN_SOURCE_GROUP as the state should mimic the 'IN() to IN(A)'
transition.

The issue was exposed by commit a052517a8ff65 ("net/multicast: should not
send source list records when have filter mode change"). Before this change,
we used to send both ALLOW(A) and TO_IN(A). After this change we only send
TO_IN(A).

Fix it by adding a new parameter to init group mode. Also add new wrapper
functions so we don't need to change too much code.

v1 -> v2:
In my first version I only cleared the group change record. But this is not
enough. Because when a new group join, it will init as EXCLUDE and trigger
an filter mode change in ip/ip6_mc_add_src(), which will clear all source
addresses' sf_crcount. This will prevent early joined address sending state
change records if multi source addressed joined at the same time.

In v2 patch, I fixed it by directly initializing the mode to INCLUDE for SSM
JOIN_SOURCE_GROUP. I also split the original patch into two separated patches
for IPv4 and IPv6.

Fixes: a052517a8ff65 ("net/multicast: should not send source list records when have filter mode change")
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/igmp.h   |  2 ++
 net/ipv4/igmp.c        | 58 ++++++++++++++++++++++++++++++++++++--------------
 net/ipv4/ip_sockglue.c |  4 ++--
 3 files changed, 46 insertions(+), 18 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index f8231854b5d6..119f53941c12 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -109,6 +109,8 @@ struct ip_mc_list {
 extern int ip_check_mc_rcu(struct in_device *dev, __be32 mc_addr, __be32 src_addr, u8 proto);
 extern int igmp_rcv(struct sk_buff *);
 extern int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr);
+extern int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr,
+				unsigned int mode);
 extern int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr);
 extern void ip_mc_drop_socket(struct sock *sk);
 extern int ip_mc_source(int add, int omode, struct sock *sk,
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 85b617b655bc..b3c899a630a0 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1200,13 +1200,14 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
 	spin_lock_bh(&im->lock);
 	if (pmc) {
 		im->interface = pmc->interface;
-		im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
 		im->sfmode = pmc->sfmode;
 		if (pmc->sfmode == MCAST_INCLUDE) {
 			im->tomb = pmc->tomb;
 			im->sources = pmc->sources;
 			for (psf = im->sources; psf; psf = psf->sf_next)
-				psf->sf_crcount = im->crcount;
+				psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+		} else {
+			im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
 		}
 		in_dev_put(pmc->interface);
 		kfree(pmc);
@@ -1288,7 +1289,7 @@ static void igmp_group_dropped(struct ip_mc_list *im)
 #endif
 }
 
-static void igmp_group_added(struct ip_mc_list *im)
+static void igmp_group_added(struct ip_mc_list *im, unsigned int mode)
 {
 	struct in_device *in_dev = im->interface;
 #ifdef CONFIG_IP_MULTICAST
@@ -1316,7 +1317,13 @@ static void igmp_group_added(struct ip_mc_list *im)
 	}
 	/* else, v3 */
 
-	im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+	/* Based on RFC3376 5.1, for newly added INCLUDE SSM, we should
+	 * not send filter-mode change record as the mode should be from
+	 * IN() to IN(A).
+	 */
+	if (mode == MCAST_EXCLUDE)
+		im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+
 	igmp_ifc_event(in_dev);
 #endif
 }
@@ -1381,8 +1388,7 @@ static void ip_mc_hash_remove(struct in_device *in_dev,
 /*
  *	A socket has joined a multicast group on device dev.
  */
-
-void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
+void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, unsigned int mode)
 {
 	struct ip_mc_list *im;
 #ifdef CONFIG_IP_MULTICAST
@@ -1394,7 +1400,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 	for_each_pmc_rtnl(in_dev, im) {
 		if (im->multiaddr == addr) {
 			im->users++;
-			ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0);
+			ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0);
 			goto out;
 		}
 	}
@@ -1408,8 +1414,8 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 	in_dev_hold(in_dev);
 	im->multiaddr = addr;
 	/* initial mode is (EX, empty) */
-	im->sfmode = MCAST_EXCLUDE;
-	im->sfcount[MCAST_EXCLUDE] = 1;
+	im->sfmode = mode;
+	im->sfcount[mode] = 1;
 	refcount_set(&im->refcnt, 1);
 	spin_lock_init(&im->lock);
 #ifdef CONFIG_IP_MULTICAST
@@ -1426,12 +1432,17 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 #ifdef CONFIG_IP_MULTICAST
 	igmpv3_del_delrec(in_dev, im);
 #endif
-	igmp_group_added(im);
+	igmp_group_added(im, mode);
 	if (!in_dev->dead)
 		ip_rt_multicast_event(in_dev);
 out:
 	return;
 }
+
+void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
+{
+	__ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE);
+}
 EXPORT_SYMBOL(ip_mc_inc_group);
 
 static int ip_mc_check_iphdr(struct sk_buff *skb)
@@ -1688,7 +1699,7 @@ void ip_mc_remap(struct in_device *in_dev)
 #ifdef CONFIG_IP_MULTICAST
 		igmpv3_del_delrec(in_dev, pmc);
 #endif
-		igmp_group_added(pmc);
+		igmp_group_added(pmc, pmc->sfmode);
 	}
 }
 
@@ -1751,7 +1762,7 @@ void ip_mc_up(struct in_device *in_dev)
 #ifdef CONFIG_IP_MULTICAST
 		igmpv3_del_delrec(in_dev, pmc);
 #endif
-		igmp_group_added(pmc);
+		igmp_group_added(pmc, pmc->sfmode);
 	}
 }
 
@@ -2130,8 +2141,8 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc)
 
 /* Join a multicast group
  */
-
-int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
+static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr,
+			      unsigned int mode)
 {
 	__be32 addr = imr->imr_multiaddr.s_addr;
 	struct ip_mc_socklist *iml, *i;
@@ -2172,15 +2183,30 @@ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
 	memcpy(&iml->multi, imr, sizeof(*imr));
 	iml->next_rcu = inet->mc_list;
 	iml->sflist = NULL;
-	iml->sfmode = MCAST_EXCLUDE;
+	iml->sfmode = mode;
 	rcu_assign_pointer(inet->mc_list, iml);
-	ip_mc_inc_group(in_dev, addr);
+	__ip_mc_inc_group(in_dev, addr, mode);
 	err = 0;
 done:
 	return err;
 }
+
+/* Join ASM (Any-Source Multicast) group
+ */
+int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
+{
+	return __ip_mc_join_group(sk, imr, MCAST_EXCLUDE);
+}
 EXPORT_SYMBOL(ip_mc_join_group);
 
+/* Join SSM (Source-Specific Multicast) group
+ */
+int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr,
+			 unsigned int mode)
+{
+	return __ip_mc_join_group(sk, imr, mode);
+}
+
 static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
 			   struct in_device *in_dev)
 {
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index fc32fdbeefa6..64c76dcf7386 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -984,7 +984,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 			mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr;
 			mreq.imr_address.s_addr = mreqs.imr_interface;
 			mreq.imr_ifindex = 0;
-			err = ip_mc_join_group(sk, &mreq);
+			err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE);
 			if (err && err != -EADDRINUSE)
 				break;
 			omode = MCAST_INCLUDE;
@@ -1061,7 +1061,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 			mreq.imr_multiaddr = psin->sin_addr;
 			mreq.imr_address.s_addr = 0;
 			mreq.imr_ifindex = greqs.gsr_interface;
-			err = ip_mc_join_group(sk, &mreq);
+			err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE);
 			if (err && err != -EADDRINUSE)
 				break;
 			greqs.gsr_interface = mreq.imr_ifindex;
-- 
cgit v1.2.3


From 31048d7aedf31bf0f69c54a662944632f29d82f2 Mon Sep 17 00:00:00 2001
From: Stefan Baranoff <sbaranoff@gmail.com>
Date: Sun, 15 Jul 2018 11:36:37 -0400
Subject: tcp: Fix broken repair socket window probe patch

Correct previous bad attempt at allowing sockets to come out of TCP
repair without sending window probes. To avoid changing size of
the repair variable in struct tcp_sock, this lets the decision for
sending probes or not to be made when coming out of repair by
introducing two ways to turn it off.

v2:
* Remove erroneous comment; defines now make behavior clear

Fixes: 70b7ff130224 ("tcp: allow user to create repair socket without window probes")
Signed-off-by: Stefan Baranoff <sbaranoff@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tcp.h |  4 ++++
 net/ipv4/tcp.c           | 13 +++++++------
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 29eb659aa77a..e3f6ed8a7064 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -127,6 +127,10 @@ enum {
 
 #define TCP_CM_INQ		TCP_INQ
 
+#define TCP_REPAIR_ON		1
+#define TCP_REPAIR_OFF		0
+#define TCP_REPAIR_OFF_NO_WP	-1	/* Turn off without window probes */
+
 struct tcp_repair_opt {
 	__u32	opt_code;
 	__u32	opt_val;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8e5e2ca9ab1b..ec2186e3087f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2823,16 +2823,17 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 	case TCP_REPAIR:
 		if (!tcp_can_repair_sock(sk))
 			err = -EPERM;
-		/* 1 for normal repair, 2 for no window probes */
-		else if (val == 1 || val == 2) {
-			tp->repair = val;
+		else if (val == TCP_REPAIR_ON) {
+			tp->repair = 1;
 			sk->sk_reuse = SK_FORCE_REUSE;
 			tp->repair_queue = TCP_NO_QUEUE;
-		} else if (val == 0) {
+		} else if (val == TCP_REPAIR_OFF) {
+			tp->repair = 0;
+			sk->sk_reuse = SK_NO_REUSE;
+			tcp_send_window_probe(sk);
+		} else if (val == TCP_REPAIR_OFF_NO_WP) {
 			tp->repair = 0;
 			sk->sk_reuse = SK_NO_REUSE;
-			if (tp->repair == 1)
-				tcp_send_window_probe(sk);
 		} else
 			err = -EINVAL;
 
-- 
cgit v1.2.3


From e56b8ce363a36fb7b74b80aaa5cc9084f2c908b4 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 17 Jul 2018 18:27:45 -0700
Subject: tcp: identify cryptic messages as TCP seq # bugs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Attempt to make cryptic TCP seq number error messages clearer by
(1) identifying the source of the message as "TCP", (2) identifying the
errors as "seq # bug", and (3) grouping the field identifiers and values
by separating them with commas.

E.g., the following message is changed from:

recvmsg bug 2: copied 73BCB6CD seq 70F17CBE rcvnxt 73BCB9AA fl 0
WARNING: CPU: 2 PID: 1501 at /linux/net/ipv4/tcp.c:1881 tcp_recvmsg+0x649/0xb90

to:

TCP recvmsg seq # bug 2: copied 73BCB6CD, seq 70F17CBE, rcvnxt 73BCB9AA, fl 0
WARNING: CPU: 2 PID: 1501 at /linux/net/ipv4/tcp.c:2011 tcp_recvmsg+0x694/0xba0

Suggested-by: 積丹尼 Dan Jacobson <jidanni@jidanni.org>
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ec2186e3087f..4491faf83f4f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1998,7 +1998,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 			 * shouldn't happen.
 			 */
 			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
-				 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
+				 "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
 				 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
 				 flags))
 				break;
@@ -2013,7 +2013,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 				goto found_fin_ok;
 			WARN(!(flags & MSG_PEEK),
-			     "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
+			     "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
 			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
 		}
 
-- 
cgit v1.2.3


From 2987babb6982306509380fc11b450227a844493b Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 18 Jul 2018 13:56:34 -0700
Subject: tcp: helpers to send special DCTCP ack

Refactor and create helpers to send the special ACK in DCTCP.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 00e5a300ddb9..ee1b0705321d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1023,8 +1023,8 @@ static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
  * We are working here with either a clone of the original
  * SKB, or a fresh unique copy made by the retransmit engine.
  */
-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
-			    gfp_t gfp_mask)
+static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
+			      int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct inet_sock *inet;
@@ -1100,7 +1100,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	th->source		= inet->inet_sport;
 	th->dest		= inet->inet_dport;
 	th->seq			= htonl(tcb->seq);
-	th->ack_seq		= htonl(tp->rcv_nxt);
+	th->ack_seq		= htonl(rcv_nxt);
 	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
 					tcb->tcp_flags);
 
@@ -1178,6 +1178,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	return err;
 }
 
+static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+			    gfp_t gfp_mask)
+{
+	return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
+				  tcp_sk(sk)->rcv_nxt);
+}
+
 /* This routine just queues the buffer for sending.
  *
  * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
@@ -3571,7 +3578,7 @@ void tcp_send_delayed_ack(struct sock *sk)
 }
 
 /* This routine sends an ack and also updates the window. */
-void tcp_send_ack(struct sock *sk)
+void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
 {
 	struct sk_buff *buff;
 
@@ -3604,7 +3611,12 @@ void tcp_send_ack(struct sock *sk)
 	skb_set_tcp_pure_ack(buff);
 
 	/* Send it off, this clears delayed acks for us. */
-	tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0);
+	__tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
+}
+
+void tcp_send_ack(struct sock *sk)
+{
+	__tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
 }
 EXPORT_SYMBOL_GPL(tcp_send_ack);
 
-- 
cgit v1.2.3


From 27cde44a259c380a3c09066fc4b42de7dde9b1ad Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 18 Jul 2018 13:56:35 -0700
Subject: tcp: do not cancel delay-AcK on DCTCP special ACK

Currently when a DCTCP receiver delays an ACK and receive a
data packet with a different CE mark from the previous one's, it
sends two immediate ACKs acking previous and latest sequences
respectly (for ECN accounting).

Previously sending the first ACK may mark off the delayed ACK timer
(tcp_event_ack_sent). This may subsequently prevent sending the
second ACK to acknowledge the latest sequence (tcp_ack_snd_check).
The culprit is that tcp_send_ack() assumes it always acknowleges
the latest sequence, which is not true for the first special ACK.

The fix is to not make the assumption in tcp_send_ack and check the
actual ack sequence before cancelling the delayed ACK. Further it's
safer to pass the ack sequence number as a local variable into
tcp_send_ack routine, instead of intercepting tp->rcv_nxt to avoid
future bugs like this.

Reported-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     |  1 +
 net/ipv4/tcp_dctcp.c  | 34 ++++------------------------------
 net/ipv4/tcp_output.c | 10 +++++++---
 3 files changed, 12 insertions(+), 33 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3482d13d655b..a08de496d1b2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -539,6 +539,7 @@ void tcp_send_fin(struct sock *sk);
 void tcp_send_active_reset(struct sock *sk, gfp_t priority);
 int tcp_send_synack(struct sock *);
 void tcp_push_one(struct sock *, unsigned int mss_now);
+void __tcp_send_ack(struct sock *sk, u32 rcv_nxt);
 void tcp_send_ack(struct sock *sk);
 void tcp_send_delayed_ack(struct sock *sk);
 void tcp_send_loss_probe(struct sock *sk);
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 5869f89ca656..078328afbfe3 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -133,21 +133,8 @@ static void dctcp_ce_state_0_to_1(struct sock *sk)
 	 * ACK has not sent yet.
 	 */
 	if (!ca->ce_state &&
-	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
-		u32 tmp_rcv_nxt;
-
-		/* Save current rcv_nxt. */
-		tmp_rcv_nxt = tp->rcv_nxt;
-
-		/* Generate previous ack with CE=0. */
-		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
-		tp->rcv_nxt = ca->prior_rcv_nxt;
-
-		tcp_send_ack(sk);
-
-		/* Recover current rcv_nxt. */
-		tp->rcv_nxt = tmp_rcv_nxt;
-	}
+	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
+		__tcp_send_ack(sk, ca->prior_rcv_nxt);
 
 	ca->prior_rcv_nxt = tp->rcv_nxt;
 	ca->ce_state = 1;
@@ -164,21 +151,8 @@ static void dctcp_ce_state_1_to_0(struct sock *sk)
 	 * ACK has not sent yet.
 	 */
 	if (ca->ce_state &&
-	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
-		u32 tmp_rcv_nxt;
-
-		/* Save current rcv_nxt. */
-		tmp_rcv_nxt = tp->rcv_nxt;
-
-		/* Generate previous ack with CE=1. */
-		tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
-		tp->rcv_nxt = ca->prior_rcv_nxt;
-
-		tcp_send_ack(sk);
-
-		/* Recover current rcv_nxt. */
-		tp->rcv_nxt = tmp_rcv_nxt;
-	}
+	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
+		__tcp_send_ack(sk, ca->prior_rcv_nxt);
 
 	ca->prior_rcv_nxt = tp->rcv_nxt;
 	ca->ce_state = 0;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ee1b0705321d..c4172c1fb198 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -160,7 +160,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
 }
 
 /* Account for an ACK we sent. */
-static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
+static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
+				      u32 rcv_nxt)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -171,6 +172,9 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
 		if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
 			__sock_put(sk);
 	}
+
+	if (unlikely(rcv_nxt != tp->rcv_nxt))
+		return;  /* Special ACK sent by DCTCP to reflect ECN */
 	tcp_dec_quickack_mode(sk, pkts);
 	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
 }
@@ -1141,7 +1145,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	icsk->icsk_af_ops->send_check(sk, skb);
 
 	if (likely(tcb->tcp_flags & TCPHDR_ACK))
-		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
+		tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
 
 	if (skb->len != tcp_header_size) {
 		tcp_event_data_sent(tp, sk);
@@ -3613,12 +3617,12 @@ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
 	/* Send it off, this clears delayed acks for us. */
 	__tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
 }
+EXPORT_SYMBOL_GPL(__tcp_send_ack);
 
 void tcp_send_ack(struct sock *sk)
 {
 	__tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
 }
-EXPORT_SYMBOL_GPL(tcp_send_ack);
 
 /* This routine sends a packet with an out of date sequence
  * number. It assumes the other end will try to ack it.
-- 
cgit v1.2.3


From a0496ef2c23b3b180902dd185d0d63ccbc624cf8 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 18 Jul 2018 13:56:36 -0700
Subject: tcp: do not delay ACK in DCTCP upon CE status change

Per DCTCP RFC8257 (Section 3.2) the ACK reflecting the CE status change
has to be sent immediately so the sender can respond quickly:

""" When receiving packets, the CE codepoint MUST be processed as follows:

   1.  If the CE codepoint is set and DCTCP.CE is false, set DCTCP.CE to
       true and send an immediate ACK.

   2.  If the CE codepoint is not set and DCTCP.CE is true, set DCTCP.CE
       to false and send an immediate ACK.
"""

Previously DCTCP implementation may continue to delay the ACK. This
patch fixes that to implement the RFC by forcing an immediate ACK.

Tested with this packetdrill script provided by Larry Brakmo

0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
0.000 setsockopt(3, SOL_TCP, TCP_CONGESTION, "dctcp", 5) = 0
0.000 bind(3, ..., ...) = 0
0.000 listen(3, 1) = 0

0.100 < [ect0] SEW 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
0.100 > SE. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
0.110 < [ect0] . 1:1(0) ack 1 win 257
0.200 accept(3, ..., ...) = 4
   +0 setsockopt(4, SOL_SOCKET, SO_DEBUG, [1], 4) = 0

0.200 < [ect0] . 1:1001(1000) ack 1 win 257
0.200 > [ect01] . 1:1(0) ack 1001

0.200 write(4, ..., 1) = 1
0.200 > [ect01] P. 1:2(1) ack 1001

0.200 < [ect0] . 1001:2001(1000) ack 2 win 257
+0.005 < [ce] . 2001:3001(1000) ack 2 win 257

+0.000 > [ect01] . 2:2(0) ack 2001
// Previously the ACK below would be delayed by 40ms
+0.000 > [ect01] E. 2:2(0) ack 3001

+0.500 < F. 9501:9501(0) ack 4 win 257

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h    |  1 +
 net/ipv4/tcp_dctcp.c | 30 ++++++++++++++++++------------
 net/ipv4/tcp_input.c |  3 ++-
 3 files changed, 21 insertions(+), 13 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index a08de496d1b2..25116ec02087 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -342,6 +342,7 @@ ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos,
 			struct pipe_inode_info *pipe, size_t len,
 			unsigned int flags);
 
+void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks);
 static inline void tcp_dec_quickack_mode(struct sock *sk,
 					 const unsigned int pkts)
 {
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 078328afbfe3..8b637f9f23a2 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -129,12 +129,15 @@ static void dctcp_ce_state_0_to_1(struct sock *sk)
 	struct dctcp *ca = inet_csk_ca(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	/* State has changed from CE=0 to CE=1 and delayed
-	 * ACK has not sent yet.
-	 */
-	if (!ca->ce_state &&
-	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
-		__tcp_send_ack(sk, ca->prior_rcv_nxt);
+	if (!ca->ce_state) {
+		/* State has changed from CE=0 to CE=1, force an immediate
+		 * ACK to reflect the new CE state. If an ACK was delayed,
+		 * send that first to reflect the prior CE state.
+		 */
+		if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
+			__tcp_send_ack(sk, ca->prior_rcv_nxt);
+		tcp_enter_quickack_mode(sk, 1);
+	}
 
 	ca->prior_rcv_nxt = tp->rcv_nxt;
 	ca->ce_state = 1;
@@ -147,12 +150,15 @@ static void dctcp_ce_state_1_to_0(struct sock *sk)
 	struct dctcp *ca = inet_csk_ca(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	/* State has changed from CE=1 to CE=0 and delayed
-	 * ACK has not sent yet.
-	 */
-	if (ca->ce_state &&
-	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
-		__tcp_send_ack(sk, ca->prior_rcv_nxt);
+	if (ca->ce_state) {
+		/* State has changed from CE=1 to CE=0, force an immediate
+		 * ACK to reflect the new CE state. If an ACK was delayed,
+		 * send that first to reflect the prior CE state.
+		 */
+		if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
+			__tcp_send_ack(sk, ca->prior_rcv_nxt);
+		tcp_enter_quickack_mode(sk, 1);
+	}
 
 	ca->prior_rcv_nxt = tp->rcv_nxt;
 	ca->ce_state = 0;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8e5522c6833a..6bade06aaf72 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -215,7 +215,7 @@ static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
 		icsk->icsk_ack.quick = quickacks;
 }
 
-static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
+void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
@@ -223,6 +223,7 @@ static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
 	icsk->icsk_ack.pingpong = 0;
 	icsk->icsk_ack.ato = TCP_ATO_MIN;
 }
+EXPORT_SYMBOL(tcp_enter_quickack_mode);
 
 /* Send ACKs quickly, if "quick" count is not exhausted
  * and the session is not interactive.
-- 
cgit v1.2.3


From 08d3ffcc0cfaba36f6b86fd568cc3bc773061fa6 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Fri, 20 Jul 2018 14:04:27 +0800
Subject: multicast: do not restore deleted record source filter mode to new
 one

There are two scenarios that we will restore deleted records. The first is
when device down and up(or unmap/remap). In this scenario the new filter
mode is same with previous one. Because we get it from in_dev->mc_list and
we do not touch it during device down and up.

The other scenario is when a new socket join a group which was just delete
and not finish sending status reports. In this scenario, we should use the
current filter mode instead of restore old one. Here are 4 cases in total.

old_socket        new_socket       before_fix       after_fix
  IN(A)             IN(A)           ALLOW(A)         ALLOW(A)
  IN(A)             EX( )           TO_IN( )         TO_EX( )
  EX( )             IN(A)           TO_EX( )         ALLOW(A)
  EX( )             EX( )           TO_EX( )         TO_EX( )

Fixes: 24803f38a5c0b (igmp: do not remove igmp souce list info when set link down)
Fixes: 1666d49e1d416 (mld: do not remove mld souce list info when set link down)
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c  | 3 +--
 net/ipv6/mcast.c | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index b3c899a630a0..28fef7d15959 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1200,8 +1200,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
 	spin_lock_bh(&im->lock);
 	if (pmc) {
 		im->interface = pmc->interface;
-		im->sfmode = pmc->sfmode;
-		if (pmc->sfmode == MCAST_INCLUDE) {
+		if (im->sfmode == MCAST_INCLUDE) {
 			im->tomb = pmc->tomb;
 			im->sources = pmc->sources;
 			for (psf = im->sources; psf; psf = psf->sf_next)
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 2699be7202be..f60f310785fd 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -790,8 +790,7 @@ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
 	spin_lock_bh(&im->mca_lock);
 	if (pmc) {
 		im->idev = pmc->idev;
-		im->mca_sfmode = pmc->mca_sfmode;
-		if (pmc->mca_sfmode == MCAST_INCLUDE) {
+		if (im->mca_sfmode == MCAST_INCLUDE) {
 			im->mca_tomb = pmc->mca_tomb;
 			im->mca_sources = pmc->mca_sources;
 			for (psf = im->mca_sources; psf; psf = psf->sf_next)
-- 
cgit v1.2.3


From 3dd1c9a1270736029ffca670e9bd0265f4120600 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Mon, 23 Jul 2018 16:50:48 +0200
Subject: ip: hash fragments consistently

The skb hash for locally generated ip[v6] fragments belonging
to the same datagram can vary in several circumstances:
* for connected UDP[v6] sockets, the first fragment get its hash
  via set_owner_w()/skb_set_hash_from_sk()
* for unconnected IPv6 UDPv6 sockets, the first fragment can get
  its hash via ip6_make_flowlabel()/skb_get_hash_flowi6(), if
  auto_flowlabel is enabled

For the following frags the hash is usually computed via
skb_get_hash().
The above can cause OoO for unconnected IPv6 UDPv6 socket: in that
scenario the egress tx queue can be selected on a per packet basis
via the skb hash.
It may also fool flow-oriented schedulers to place fragments belonging
to the same datagram in different flows.

Fix the issue by copying the skb hash from the head frag into
the others at fragmentation time.

Before this commit:
perf probe -a "dev_queue_xmit skb skb->hash skb->l4_hash:b1@0/8 skb->sw_hash:b1@1/8"
netperf -H $IPV4 -t UDP_STREAM -l 5 -- -m 2000 -n &
perf record -e probe:dev_queue_xmit -e probe:skb_set_owner_w -a sleep 0.1
perf script
probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=3713014309 l4_hash=1 sw_hash=0
probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=0 l4_hash=0 sw_hash=0

After this commit:
probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=2171763177 l4_hash=1 sw_hash=0
probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=2171763177 l4_hash=1 sw_hash=0

Fixes: b73c3d0e4f0e ("net: Save TX flow hash in sock and set in skbuf on xmit")
Fixes: 67800f9b1f4e ("ipv6: Call skb_get_hash_flowi6 to get skb->hash in ip6_make_flowlabel")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c  | 2 ++
 net/ipv6/ip6_output.c | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index b3308e9d9762..0e3edd25f881 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -523,6 +523,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->dev = from->dev;
 	to->mark = from->mark;
 
+	skb_copy_hash(to, from);
+
 	/* Copy the flags to each fragment. */
 	IPCB(to)->flags = IPCB(from)->flags;
 
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index a14fb4fcdf18..3168847c30d1 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -570,6 +570,8 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->dev = from->dev;
 	to->mark = from->mark;
 
+	skb_copy_hash(to, from);
+
 #ifdef CONFIG_NET_SCHED
 	to->tc_index = from->tc_index;
 #endif
-- 
cgit v1.2.3


From 72cd43ba64fc172a443410ce01645895850844c8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 23 Jul 2018 09:28:17 -0700
Subject: tcp: free batches of packets in tcp_prune_ofo_queue()

Juha-Matti Tilli reported that malicious peers could inject tiny
packets in out_of_order_queue, forcing very expensive calls
to tcp_collapse_ofo_queue() and tcp_prune_ofo_queue() for
every incoming packet. out_of_order_queue rb-tree can contain
thousands of nodes, iterating over all of them is not nice.

Before linux-4.9, we would have pruned all packets in ofo_queue
in one go, every XXXX packets. XXXX depends on sk_rcvbuf and skbs
truesize, but is about 7000 packets with tcp_rmem[2] default of 6 MB.

Since we plan to increase tcp_rmem[2] in the future to cope with
modern BDP, can not revert to the old behavior, without great pain.

Strategy taken in this patch is to purge ~12.5 % of the queue capacity.

Fixes: 36a6503fedda ("tcp: refine tcp_prune_ofo_queue() to not drop all packets")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Juha-Matti Tilli <juha-matti.tilli@iki.fi>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6bade06aaf72..64e45b279431 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4942,6 +4942,7 @@ new_range:
  * 2) not add too big latencies if thousands of packets sit there.
  *    (But if application shrinks SO_RCVBUF, we could still end up
  *     freeing whole queue here)
+ * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
  *
  * Return true if queue has shrunk.
  */
@@ -4949,20 +4950,26 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct rb_node *node, *prev;
+	int goal;
 
 	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
 		return false;
 
 	NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
+	goal = sk->sk_rcvbuf >> 3;
 	node = &tp->ooo_last_skb->rbnode;
 	do {
 		prev = rb_prev(node);
 		rb_erase(node, &tp->out_of_order_queue);
+		goal -= rb_to_skb(node)->truesize;
 		tcp_drop(sk, rb_to_skb(node));
-		sk_mem_reclaim(sk);
-		if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
-		    !tcp_under_memory_pressure(sk))
-			break;
+		if (!prev || goal <= 0) {
+			sk_mem_reclaim(sk);
+			if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+			    !tcp_under_memory_pressure(sk))
+				break;
+			goal = sk->sk_rcvbuf >> 3;
+		}
 		node = prev;
 	} while (node);
 	tp->ooo_last_skb = rb_to_skb(prev);
-- 
cgit v1.2.3


From f4a3313d8e2ca9fd8d8f45e40a2903ba782607e7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 23 Jul 2018 09:28:18 -0700
Subject: tcp: avoid collapses in tcp_prune_queue() if possible

Right after a TCP flow is created, receiving tiny out of order
packets allways hit the condition :

if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
	tcp_clamp_window(sk);

tcp_clamp_window() increases sk_rcvbuf to match sk_rmem_alloc
(guarded by tcp_rmem[2])

Calling tcp_collapse_ofo_queue() in this case is not useful,
and offers a O(N^2) surface attack to malicious peers.

Better not attempt anything before full queue capacity is reached,
forcing attacker to spend lots of resource and allow us to more
easily detect the abuse.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 64e45b279431..53289911362a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5004,6 +5004,9 @@ static int tcp_prune_queue(struct sock *sk)
 	else if (tcp_under_memory_pressure(sk))
 		tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
 
+	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+		return 0;
+
 	tcp_collapse_ofo_queue(sk);
 	if (!skb_queue_empty(&sk->sk_receive_queue))
 		tcp_collapse(sk, &sk->sk_receive_queue, NULL,
-- 
cgit v1.2.3


From 3d4bf93ac12003f9b8e1e2de37fe27983deebdcf Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 23 Jul 2018 09:28:19 -0700
Subject: tcp: detect malicious patterns in tcp_collapse_ofo_queue()

In case an attacker feeds tiny packets completely out of order,
tcp_collapse_ofo_queue() might scan the whole rb-tree, performing
expensive copies, but not changing socket memory usage at all.

1) Do not attempt to collapse tiny skbs.
2) Add logic to exit early when too many tiny skbs are detected.

We prefer not doing aggressive collapsing (which copies packets)
for pathological flows, and revert to tcp_prune_ofo_queue() which
will be less expensive.

In the future, we might add the possibility of terminating flows
that are proven to be malicious.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 53289911362a..78068b902e7b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4902,6 +4902,7 @@ end:
 static void tcp_collapse_ofo_queue(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	u32 range_truesize, sum_tiny = 0;
 	struct sk_buff *skb, *head;
 	u32 start, end;
 
@@ -4913,6 +4914,7 @@ new_range:
 	}
 	start = TCP_SKB_CB(skb)->seq;
 	end = TCP_SKB_CB(skb)->end_seq;
+	range_truesize = skb->truesize;
 
 	for (head = skb;;) {
 		skb = skb_rb_next(skb);
@@ -4923,11 +4925,20 @@ new_range:
 		if (!skb ||
 		    after(TCP_SKB_CB(skb)->seq, end) ||
 		    before(TCP_SKB_CB(skb)->end_seq, start)) {
-			tcp_collapse(sk, NULL, &tp->out_of_order_queue,
-				     head, skb, start, end);
+			/* Do not attempt collapsing tiny skbs */
+			if (range_truesize != head->truesize ||
+			    end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
+				tcp_collapse(sk, NULL, &tp->out_of_order_queue,
+					     head, skb, start, end);
+			} else {
+				sum_tiny += range_truesize;
+				if (sum_tiny > sk->sk_rcvbuf >> 3)
+					return;
+			}
 			goto new_range;
 		}
 
+		range_truesize += skb->truesize;
 		if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
 			start = TCP_SKB_CB(skb)->seq;
 		if (after(TCP_SKB_CB(skb)->end_seq, end))
-- 
cgit v1.2.3


From 8541b21e781a22dce52a74fef0b9bed00404a1cd Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 23 Jul 2018 09:28:20 -0700
Subject: tcp: call tcp_drop() from tcp_data_queue_ofo()

In order to be able to give better diagnostics and detect
malicious traffic, we need to have better sk->sk_drops tracking.

Fixes: 9f5afeae5152 ("tcp: use an RB tree for ooo receive queue")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 78068b902e7b..b062a7692238 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4510,7 +4510,7 @@ coalesce_done:
 				/* All the bits are present. Drop. */
 				NET_INC_STATS(sock_net(sk),
 					      LINUX_MIB_TCPOFOMERGE);
-				__kfree_skb(skb);
+				tcp_drop(sk, skb);
 				skb = NULL;
 				tcp_dsack_set(sk, seq, end_seq);
 				goto add_sack;
@@ -4529,7 +4529,7 @@ coalesce_done:
 						 TCP_SKB_CB(skb1)->end_seq);
 				NET_INC_STATS(sock_net(sk),
 					      LINUX_MIB_TCPOFOMERGE);
-				__kfree_skb(skb1);
+				tcp_drop(sk, skb1);
 				goto merge_right;
 			}
 		} else if (tcp_try_coalesce(sk, skb1,
-- 
cgit v1.2.3


From 58152ecbbcc6a0ce7fddd5bf5f6ee535834ece0c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 23 Jul 2018 09:28:21 -0700
Subject: tcp: add tcp_ooo_try_coalesce() helper

In case skb in out_or_order_queue is the result of
multiple skbs coalescing, we would like to get a proper gso_segs
counter tracking, so that future tcp_drop() can report an accurate
number.

I chose to not implement this tracking for skbs in receive queue,
since they are not dropped, unless socket is disconnected.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b062a7692238..3bcd30a2ba06 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4358,6 +4358,23 @@ static bool tcp_try_coalesce(struct sock *sk,
 	return true;
 }
 
+static bool tcp_ooo_try_coalesce(struct sock *sk,
+			     struct sk_buff *to,
+			     struct sk_buff *from,
+			     bool *fragstolen)
+{
+	bool res = tcp_try_coalesce(sk, to, from, fragstolen);
+
+	/* In case tcp_drop() is called later, update to->gso_segs */
+	if (res) {
+		u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
+			       max_t(u16, 1, skb_shinfo(from)->gso_segs);
+
+		skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
+	}
+	return res;
+}
+
 static void tcp_drop(struct sock *sk, struct sk_buff *skb)
 {
 	sk_drops_add(sk, skb);
@@ -4481,8 +4498,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	/* In the typical case, we are adding an skb to the end of the list.
 	 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
 	 */
-	if (tcp_try_coalesce(sk, tp->ooo_last_skb,
-			     skb, &fragstolen)) {
+	if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
+				 skb, &fragstolen)) {
 coalesce_done:
 		tcp_grow_window(sk, skb);
 		kfree_skb_partial(skb, fragstolen);
@@ -4532,8 +4549,8 @@ coalesce_done:
 				tcp_drop(sk, skb1);
 				goto merge_right;
 			}
-		} else if (tcp_try_coalesce(sk, skb1,
-					    skb, &fragstolen)) {
+		} else if (tcp_ooo_try_coalesce(sk, skb1,
+						skb, &fragstolen)) {
 			goto coalesce_done;
 		}
 		p = &parent->rb_right;
-- 
cgit v1.2.3


From 2efd4fca703a6707cad16ab486eaab8fc7f0fd49 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Mon, 23 Jul 2018 19:36:48 -0400
Subject: ip: in cmsg IP(V6)_ORIGDSTADDR call pskb_may_pull

Syzbot reported a read beyond the end of the skb head when returning
IPV6_ORIGDSTADDR:

  BUG: KMSAN: kernel-infoleak in put_cmsg+0x5ef/0x860 net/core/scm.c:242
  CPU: 0 PID: 4501 Comm: syz-executor128 Not tainted 4.17.0+ #9
  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
  Google 01/01/2011
  Call Trace:
    __dump_stack lib/dump_stack.c:77 [inline]
    dump_stack+0x185/0x1d0 lib/dump_stack.c:113
    kmsan_report+0x188/0x2a0 mm/kmsan/kmsan.c:1125
    kmsan_internal_check_memory+0x138/0x1f0 mm/kmsan/kmsan.c:1219
    kmsan_copy_to_user+0x7a/0x160 mm/kmsan/kmsan.c:1261
    copy_to_user include/linux/uaccess.h:184 [inline]
    put_cmsg+0x5ef/0x860 net/core/scm.c:242
    ip6_datagram_recv_specific_ctl+0x1cf3/0x1eb0 net/ipv6/datagram.c:719
    ip6_datagram_recv_ctl+0x41c/0x450 net/ipv6/datagram.c:733
    rawv6_recvmsg+0x10fb/0x1460 net/ipv6/raw.c:521
    [..]

This logic and its ipv4 counterpart read the destination port from
the packet at skb_transport_offset(skb) + 4.

With MSG_MORE and a local SOCK_RAW sender, syzbot was able to cook a
packet that stores headers exactly up to skb_transport_offset(skb) in
the head and the remainder in a frag.

Call pskb_may_pull before accessing the pointer to ensure that it lies
in skb head.

Link: http://lkml.kernel.org/r/CAF=yD-LEJwZj5a1-bAAj2Oy_hKmGygV6rsJ_WOrAYnv-fnayiQ@mail.gmail.com
Reported-by: syzbot+9adb4b567003cac781f0@syzkaller.appspotmail.com
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_sockglue.c | 7 +++++--
 net/ipv6/datagram.c    | 7 +++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 64c76dcf7386..c0fe5ad996f2 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -150,15 +150,18 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
 {
 	struct sockaddr_in sin;
 	const struct iphdr *iph = ip_hdr(skb);
-	__be16 *ports = (__be16 *)skb_transport_header(skb);
+	__be16 *ports;
+	int end;
 
-	if (skb_transport_offset(skb) + 4 > (int)skb->len)
+	end = skb_transport_offset(skb) + 4;
+	if (end > 0 && !pskb_may_pull(skb, end))
 		return;
 
 	/* All current transport protocols have the port numbers in the
 	 * first four bytes of the transport header and this function is
 	 * written with this assumption in mind.
 	 */
+	ports = (__be16 *)skb_transport_header(skb);
 
 	sin.sin_family = AF_INET;
 	sin.sin_addr.s_addr = iph->daddr;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 2ee08b6a86a4..1a1f876f8e28 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -700,13 +700,16 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
 	}
 	if (np->rxopt.bits.rxorigdstaddr) {
 		struct sockaddr_in6 sin6;
-		__be16 *ports = (__be16 *) skb_transport_header(skb);
+		__be16 *ports;
+		int end;
 
-		if (skb_transport_offset(skb) + 4 <= (int)skb->len) {
+		end = skb_transport_offset(skb) + 4;
+		if (end <= 0 || pskb_may_pull(skb, end)) {
 			/* All current transport protocols have the port numbers in the
 			 * first four bytes of the transport header and this function is
 			 * written with this assumption in mind.
 			 */
+			ports = (__be16 *)skb_transport_header(skb);
 
 			sin6.sin6_family = AF_INET6;
 			sin6.sin6_addr = ipv6_hdr(skb)->daddr;
-- 
cgit v1.2.3