From d6990976af7c5d8f55903bfb4289b6fb030bf754 Mon Sep 17 00:00:00 2001
From: Eyal Birger <eyal.birger@gmail.com>
Date: Thu, 7 Jun 2018 10:11:02 +0300
Subject: vti6: fix PMTU caching and reporting on xmit

When setting the skb->dst before doing the MTU check, the route PMTU
caching and reporting is done on the new dst which is about to be
released.

Instead, PMTU handling should be done using the original dst.

This is aligned with IPv4 VTI.

Fixes: ccd740cbc6 ("vti6: Add pmtu handling to vti6_xmit.")
Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv6/ip6_vti.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index b7f28deddaea..c72ae3a4fe09 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -480,10 +480,6 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
 		goto tx_err_dst_release;
 	}
 
-	skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev)));
-	skb_dst_set(skb, dst);
-	skb->dev = skb_dst(skb)->dev;
-
 	mtu = dst_mtu(dst);
 	if (!skb->ignore_df && skb->len > mtu) {
 		skb_dst_update_pmtu(skb, mtu);
@@ -498,9 +494,14 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
 				  htonl(mtu));
 		}
 
-		return -EMSGSIZE;
+		err = -EMSGSIZE;
+		goto tx_err_dst_release;
 	}
 
+	skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev)));
+	skb_dst_set(skb, dst);
+	skb->dev = skb_dst(skb)->dev;
+
 	err = dst_output(t->net, skb->sk, skb);
 	if (net_xmit_eval(err) == 0) {
 		struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
-- 
cgit v1.2.3


From 9ce7bc036ae4cfe3393232c86e9e1fea2153c237 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 13 Jun 2018 10:11:56 -0700
Subject: netfilter: ipv6: nf_defrag: reduce struct net memory waste

It is a waste of memory to use a full "struct netns_sysctl_ipv6"
while only one pointer is really used, considering netns_sysctl_ipv6
keeps growing.

Also, since "struct netns_frags" has cache line alignment,
it is better to move the frags_hdr pointer outside, otherwise
we spend a full cache line for this pointer.

This saves 192 bytes of memory per netns.

Fixes: c038a767cd69 ("ipv6: add a new namespace for nf_conntrack_reasm")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/net_namespace.h             | 1 +
 include/net/netns/ipv6.h                | 1 -
 net/ipv6/netfilter/nf_conntrack_reasm.c | 6 +++---
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net/ipv6')

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 47e35cce3b64..a71264d75d7f 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -128,6 +128,7 @@ struct net {
 #endif
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
 	struct netns_nf_frag	nf_frag;
+	struct ctl_table_header *nf_frag_frags_hdr;
 #endif
 	struct sock		*nfnl;
 	struct sock		*nfnl_stash;
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index c978a31b0f84..762ac9931b62 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -109,7 +109,6 @@ struct netns_ipv6 {
 
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
 struct netns_nf_frag {
-	struct netns_sysctl_ipv6 sysctl;
 	struct netns_frags	frags;
 };
 #endif
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 5e0332014c17..a452d99c9f52 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -107,7 +107,7 @@ static int nf_ct_frag6_sysctl_register(struct net *net)
 	if (hdr == NULL)
 		goto err_reg;
 
-	net->nf_frag.sysctl.frags_hdr = hdr;
+	net->nf_frag_frags_hdr = hdr;
 	return 0;
 
 err_reg:
@@ -121,8 +121,8 @@ static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net)
 {
 	struct ctl_table *table;
 
-	table = net->nf_frag.sysctl.frags_hdr->ctl_table_arg;
-	unregister_net_sysctl_table(net->nf_frag.sysctl.frags_hdr);
+	table = net->nf_frag_frags_hdr->ctl_table_arg;
+	unregister_net_sysctl_table(net->nf_frag_frags_hdr);
 	if (!net_eq(net, &init_net))
 		kfree(table);
 }
-- 
cgit v1.2.3


From 9b0a8da8c4c6e91012ab03a801acc5d8011c7c2f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 18 Jun 2018 05:24:31 -0700
Subject: net/ipv6: respect rcu grace period before freeing fib6_info

syzbot reported use after free that is caused by fib6_info being
freed without a proper RCU grace period.

CPU: 0 PID: 1407 Comm: udevd Not tainted 4.17.0+ #39
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 <IRQ>
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1b9/0x294 lib/dump_stack.c:113
 print_address_description+0x6c/0x20b mm/kasan/report.c:256
 kasan_report_error mm/kasan/report.c:354 [inline]
 kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412
 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433
 __read_once_size include/linux/compiler.h:188 [inline]
 find_rr_leaf net/ipv6/route.c:705 [inline]
 rt6_select net/ipv6/route.c:761 [inline]
 fib6_table_lookup+0x12b7/0x14d0 net/ipv6/route.c:1823
 ip6_pol_route+0x1c2/0x1020 net/ipv6/route.c:1856
 ip6_pol_route_output+0x54/0x70 net/ipv6/route.c:2082
 fib6_rule_lookup+0x211/0x6d0 net/ipv6/fib6_rules.c:122
 ip6_route_output_flags+0x2c5/0x350 net/ipv6/route.c:2110
 ip6_route_output include/net/ip6_route.h:82 [inline]
 icmpv6_xrlim_allow net/ipv6/icmp.c:211 [inline]
 icmp6_send+0x147c/0x2da0 net/ipv6/icmp.c:535
 icmpv6_send+0x17a/0x300 net/ipv6/ip6_icmp.c:43
 ip6_link_failure+0xa5/0x790 net/ipv6/route.c:2244
 dst_link_failure include/net/dst.h:427 [inline]
 ndisc_error_report+0xd1/0x1c0 net/ipv6/ndisc.c:695
 neigh_invalidate+0x246/0x550 net/core/neighbour.c:892
 neigh_timer_handler+0xaf9/0xde0 net/core/neighbour.c:978
 call_timer_fn+0x230/0x940 kernel/time/timer.c:1326
 expire_timers kernel/time/timer.c:1363 [inline]
 __run_timers+0x79e/0xc50 kernel/time/timer.c:1666
 run_timer_softirq+0x4c/0x70 kernel/time/timer.c:1692
 __do_softirq+0x2e0/0xaf5 kernel/softirq.c:284
 invoke_softirq kernel/softirq.c:364 [inline]
 irq_exit+0x1d1/0x200 kernel/softirq.c:404
 exiting_irq arch/x86/include/asm/apic.h:527 [inline]
 smp_apic_timer_interrupt+0x17e/0x710 arch/x86/kernel/apic/apic.c:1052
 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:863
 </IRQ>
RIP: 0010:strlen+0x5e/0xa0 lib/string.c:482
Code: 24 00 74 3b 48 bb 00 00 00 00 00 fc ff df 4c 89 e0 48 83 c0 01 48 89 c2 48 89 c1 48 c1 ea 03 83 e1 07 0f b6 14 1a 38 ca 7f 04 <84> d2 75 23 80 38 00 75 de 48 83 c4 08 4c 29 e0 5b 41 5c 5d c3 48
RSP: 0018:ffff8801af117850 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13
RAX: ffff880197f53bd0 RBX: dffffc0000000000 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffffffff81c5b06c RDI: ffff880197f53bc0
RBP: ffff8801af117868 R08: ffff88019a976540 R09: 0000000000000000
R10: ffff88019a976540 R11: 0000000000000000 R12: ffff880197f53bc0
R13: ffff880197f53bc0 R14: ffffffff899e4e90 R15: ffff8801d91c6a00
 strlen include/linux/string.h:267 [inline]
 getname_kernel+0x24/0x370 fs/namei.c:218
 open_exec+0x17/0x70 fs/exec.c:882
 load_elf_binary+0x968/0x5610 fs/binfmt_elf.c:780
 search_binary_handler+0x17d/0x570 fs/exec.c:1653
 exec_binprm fs/exec.c:1695 [inline]
 __do_execve_file.isra.35+0x16fe/0x2710 fs/exec.c:1819
 do_execveat_common fs/exec.c:1866 [inline]
 do_execve fs/exec.c:1883 [inline]
 __do_sys_execve fs/exec.c:1964 [inline]
 __se_sys_execve fs/exec.c:1959 [inline]
 __x64_sys_execve+0x8f/0xc0 fs/exec.c:1959
 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x7f1576a46207
Code: 77 19 f4 48 89 d7 44 89 c0 0f 05 48 3d 00 f0 ff ff 76 e0 f7 d8 64 41 89 01 eb d8 f7 d8 64 41 89 01 eb df b8 3b 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 02 f3 c3 48 8b 15 00 8c 2d 00 f7 d8 64 89 02
RSP: 002b:00007ffff2784568 EFLAGS: 00000202 ORIG_RAX: 000000000000003b
RAX: ffffffffffffffda RBX: 00000000ffffffff RCX: 00007f1576a46207
RDX: 0000000001215b10 RSI: 00007ffff2784660 RDI: 00007ffff2785670
RBP: 0000000000625500 R08: 000000000000589c R09: 000000000000589c
R10: 0000000000000000 R11: 0000000000000202 R12: 0000000001215b10
R13: 0000000000000007 R14: 0000000001204250 R15: 0000000000000005

Allocated by task 12188:
 save_stack+0x43/0xd0 mm/kasan/kasan.c:448
 set_track mm/kasan/kasan.c:460 [inline]
 kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553
 kmem_cache_alloc_trace+0x152/0x780 mm/slab.c:3620
 kmalloc include/linux/slab.h:513 [inline]
 kzalloc include/linux/slab.h:706 [inline]
 fib6_info_alloc+0xbb/0x280 net/ipv6/ip6_fib.c:152
 ip6_route_info_create+0x782/0x2b50 net/ipv6/route.c:3013
 ip6_route_add+0x23/0xb0 net/ipv6/route.c:3154
 ipv6_route_ioctl+0x5a5/0x760 net/ipv6/route.c:3660
 inet6_ioctl+0x100/0x1f0 net/ipv6/af_inet6.c:546
 sock_do_ioctl+0xe4/0x3e0 net/socket.c:973
 sock_ioctl+0x30d/0x680 net/socket.c:1097
 vfs_ioctl fs/ioctl.c:46 [inline]
 file_ioctl fs/ioctl.c:500 [inline]
 do_vfs_ioctl+0x1cf/0x16f0 fs/ioctl.c:684
 ksys_ioctl+0xa9/0xd0 fs/ioctl.c:701
 __do_sys_ioctl fs/ioctl.c:708 [inline]
 __se_sys_ioctl fs/ioctl.c:706 [inline]
 __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:706
 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Freed by task 1402:
 save_stack+0x43/0xd0 mm/kasan/kasan.c:448
 set_track mm/kasan/kasan.c:460 [inline]
 __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521
 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
 __cache_free mm/slab.c:3498 [inline]
 kfree+0xd9/0x260 mm/slab.c:3813
 fib6_info_destroy+0x29b/0x350 net/ipv6/ip6_fib.c:207
 fib6_info_release include/net/ip6_fib.h:286 [inline]
 __ip6_del_rt_siblings net/ipv6/route.c:3235 [inline]
 ip6_route_del+0x11c4/0x13b0 net/ipv6/route.c:3316
 ipv6_route_ioctl+0x616/0x760 net/ipv6/route.c:3663
 inet6_ioctl+0x100/0x1f0 net/ipv6/af_inet6.c:546
 sock_do_ioctl+0xe4/0x3e0 net/socket.c:973
 sock_ioctl+0x30d/0x680 net/socket.c:1097
 vfs_ioctl fs/ioctl.c:46 [inline]
 file_ioctl fs/ioctl.c:500 [inline]
 do_vfs_ioctl+0x1cf/0x16f0 fs/ioctl.c:684
 ksys_ioctl+0xa9/0xd0 fs/ioctl.c:701
 __do_sys_ioctl fs/ioctl.c:708 [inline]
 __se_sys_ioctl fs/ioctl.c:706 [inline]
 __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:706
 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

The buggy address belongs to the object at ffff8801b5df2580
 which belongs to the cache kmalloc-256 of size 256
The buggy address is located 8 bytes inside of
 256-byte region [ffff8801b5df2580, ffff8801b5df2680)
The buggy address belongs to the page:
page:ffffea0006d77c80 count:1 mapcount:0 mapping:ffff8801da8007c0 index:0xffff8801b5df2e40
flags: 0x2fffc0000000100(slab)
raw: 02fffc0000000100 ffffea0006c5cc48 ffffea0007363308 ffff8801da8007c0
raw: ffff8801b5df2e40 ffff8801b5df2080 0000000100000006 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff8801b5df2480: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff8801b5df2500: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc
> ffff8801b5df2580: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                      ^
 ffff8801b5df2600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff8801b5df2680: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb

Fixes: a64efe142f5e ("net/ipv6: introduce fib6_info struct and helpers")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: David Ahern <dsahern@gmail.com>
Reported-by: syzbot+9e6d75e3edef427ee888@syzkaller.appspotmail.com
Acked-by: David Ahern <dsahern@gmail.com>
Tested-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 5 +++--
 net/ipv6/ip6_fib.c    | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'net/ipv6')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 5cba71d2dc44..71b9043aa0e7 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -170,6 +170,7 @@ struct fib6_info {
 					unused:3;
 
 	struct fib6_nh			fib6_nh;
+	struct rcu_head			rcu;
 };
 
 struct rt6_info {
@@ -273,7 +274,7 @@ static inline void ip6_rt_put(struct rt6_info *rt)
 }
 
 struct fib6_info *fib6_info_alloc(gfp_t gfp_flags);
-void fib6_info_destroy(struct fib6_info *f6i);
+void fib6_info_destroy_rcu(struct rcu_head *head);
 
 static inline void fib6_info_hold(struct fib6_info *f6i)
 {
@@ -283,7 +284,7 @@ static inline void fib6_info_hold(struct fib6_info *f6i)
 static inline void fib6_info_release(struct fib6_info *f6i)
 {
 	if (f6i && atomic_dec_and_test(&f6i->fib6_ref))
-		fib6_info_destroy(f6i);
+		call_rcu(&f6i->rcu, fib6_info_destroy_rcu);
 }
 
 enum fib6_walk_state {
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 39d1d487eca2..1fb2f3118d60 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -167,8 +167,9 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
 	return f6i;
 }
 
-void fib6_info_destroy(struct fib6_info *f6i)
+void fib6_info_destroy_rcu(struct rcu_head *head)
 {
+	struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);
 	struct rt6_exception_bucket *bucket;
 	struct dst_metrics *m;
 
@@ -206,7 +207,7 @@ void fib6_info_destroy(struct fib6_info *f6i)
 
 	kfree(f6i);
 }
-EXPORT_SYMBOL_GPL(fib6_info_destroy);
+EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu);
 
 static struct fib6_node *node_alloc(struct net *net)
 {
-- 
cgit v1.2.3


From 8c43bd1706885ba1acfa88da02bc60a2ec16f68c Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 18 Jun 2018 12:30:37 -0700
Subject: net/tcp: Fix socket lookups with SO_BINDTODEVICE

Similar to 69678bcd4d2d ("udp: fix SO_BINDTODEVICE"), TCP socket lookups
need to fail if dev_match is not true. Currently, a packet to a given port
can match a socket bound to device when it should not. In the VRF case,
this causes the lookup to hit a VRF socket and not a global socket
resulting in a response trying to go through the VRF when it should not.

Fixes: 3fa6f616a7a4d ("net: ipv4: add second dif to inet socket lookups")
Fixes: 4297a0ef08572 ("net: ipv6: add second dif to inet6 socket lookups")
Reported-by: Lou Berger <lberger@labn.net>
Diagnosed-by: Renato Westphal <renato@opensourcerouting.org>
Tested-by: Renato Westphal <renato@opensourcerouting.org>
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_hashtables.c  | 4 ++--
 net/ipv6/inet6_hashtables.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 31ff46daae97..3647167c8fa3 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -243,9 +243,9 @@ static inline int compute_score(struct sock *sk, struct net *net,
 			bool dev_match = (sk->sk_bound_dev_if == dif ||
 					  sk->sk_bound_dev_if == sdif);
 
-			if (exact_dif && !dev_match)
+			if (!dev_match)
 				return -1;
-			if (sk->sk_bound_dev_if && dev_match)
+			if (sk->sk_bound_dev_if)
 				score += 4;
 		}
 		if (sk->sk_incoming_cpu == raw_smp_processor_id())
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 2febe26de6a1..595ad408dba0 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -113,9 +113,9 @@ static inline int compute_score(struct sock *sk, struct net *net,
 			bool dev_match = (sk->sk_bound_dev_if == dif ||
 					  sk->sk_bound_dev_if == sdif);
 
-			if (exact_dif && !dev_match)
+			if (!dev_match)
 				return -1;
-			if (sk->sk_bound_dev_if && dev_match)
+			if (sk->sk_bound_dev_if)
 				score++;
 		}
 		if (sk->sk_incoming_cpu == raw_smp_processor_id())
-- 
cgit v1.2.3


From 9887cba19978a5f288100ef90a37684cc8d5e0a6 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Tue, 19 Jun 2018 12:47:52 -0400
Subject: ip: limit use of gso_size to udp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ipcm(6)_cookie field gso_size is set only in the udp path. The ip
layer copies this to cork only if sk_type is SOCK_DGRAM. This check
proved too permissive. Ping and l2tp sockets have the same type.

Limit to sockets of type SOCK_DGRAM and protocol IPPROTO_UDP to
exclude ping sockets.

v1 -> v2
- remove irrelevant whitespace changes

Fixes: bec1f6f69736 ("udp: generate gso with UDP_SEGMENT")
Reported-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c  | 3 ++-
 net/ipv6/ip6_output.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index af5a830ff6ad..b3308e9d9762 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1145,7 +1145,8 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
 	cork->fragsize = ip_sk_use_pmtu(sk) ?
 			 dst_mtu(&rt->dst) : rt->dst.dev->mtu;
 
-	cork->gso_size = sk->sk_type == SOCK_DGRAM ? ipc->gso_size : 0;
+	cork->gso_size = sk->sk_type == SOCK_DGRAM &&
+			 sk->sk_protocol == IPPROTO_UDP ? ipc->gso_size : 0;
 	cork->dst = &rt->dst;
 	cork->length = 0;
 	cork->ttl = ipc->ttl;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 021e5aef6ba3..a14fb4fcdf18 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1219,7 +1219,8 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
 	if (mtu < IPV6_MIN_MTU)
 		return -EINVAL;
 	cork->base.fragsize = mtu;
-	cork->base.gso_size = sk->sk_type == SOCK_DGRAM ? ipc6->gso_size : 0;
+	cork->base.gso_size = sk->sk_type == SOCK_DGRAM &&
+			      sk->sk_protocol == IPPROTO_UDP ? ipc6->gso_size : 0;
 
 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
 		cork->base.flags |= IPCORK_ALLFRAG;
-- 
cgit v1.2.3


From 6c6da92808442908287fae8ebb0ca041a52469f4 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Thu, 21 Jun 2018 19:49:36 +0800
Subject: ipv6: mcast: fix unsolicited report interval after receiving querys

After recieving MLD querys, we update idev->mc_maxdelay with max_delay
from query header. This make the later unsolicited reports have the same
interval with mc_maxdelay, which means we may send unsolicited reports with
long interval time instead of default configured interval time.

Also as we will not call ipv6_mc_reset() after device up. This issue will
be there even after leave the group and join other groups.

Fixes: fc4eba58b4c14 ("ipv6: make unsolicited report intervals configurable for mld")
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/mcast.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 975021df7c1c..c0c74088f2af 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -2082,7 +2082,8 @@ void ipv6_mc_dad_complete(struct inet6_dev *idev)
 		mld_send_initial_cr(idev);
 		idev->mc_dad_count--;
 		if (idev->mc_dad_count)
-			mld_dad_start_timer(idev, idev->mc_maxdelay);
+			mld_dad_start_timer(idev,
+					    unsolicited_report_interval(idev));
 	}
 }
 
@@ -2094,7 +2095,8 @@ static void mld_dad_timer_expire(struct timer_list *t)
 	if (idev->mc_dad_count) {
 		idev->mc_dad_count--;
 		if (idev->mc_dad_count)
-			mld_dad_start_timer(idev, idev->mc_maxdelay);
+			mld_dad_start_timer(idev,
+					    unsolicited_report_interval(idev));
 	}
 	in6_dev_put(idev);
 }
@@ -2452,7 +2454,8 @@ static void mld_ifc_timer_expire(struct timer_list *t)
 	if (idev->mc_ifc_count) {
 		idev->mc_ifc_count--;
 		if (idev->mc_ifc_count)
-			mld_ifc_start_timer(idev, idev->mc_maxdelay);
+			mld_ifc_start_timer(idev,
+					    unsolicited_report_interval(idev));
 	}
 	in6_dev_put(idev);
 }
-- 
cgit v1.2.3


From 7284fdf39a912322ce97de2d30def3c6068a418c Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Wed, 27 Jun 2018 11:49:28 +0800
Subject: esp6: fix memleak on error path in esp6_input

This ought to be an omission in e6194923237 ("esp: Fix memleaks on error
paths."). The memleak on error path in esp6_input is similar to esp_input
of esp4.

Fixes: e6194923237 ("esp: Fix memleaks on error paths.")
Fixes: 3f29770723f ("ipsec: check return value of skb_to_sgvec always")
Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv6/esp6.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 97513f35bcc5..88a7579c23bd 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -669,8 +669,10 @@ skip_cow:
 
 	sg_init_table(sg, nfrags);
 	ret = skb_to_sgvec(skb, sg, 0, skb->len);
-	if (unlikely(ret < 0))
+	if (unlikely(ret < 0)) {
+		kfree(tmp);
 		goto out;
+	}
 
 	skb->ip_summed = CHECKSUM_NONE;
 
-- 
cgit v1.2.3


From a11e1d432b51f63ba698d044441284a661f01144 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 28 Jun 2018 09:43:44 -0700
Subject: Revert changes to convert to ->poll_mask() and aio IOCB_CMD_POLL

The poll() changes were not well thought out, and completely
unexplained.  They also caused a huge performance regression, because
"->poll()" was no longer a trivial file operation that just called down
to the underlying file operations, but instead did at least two indirect
calls.

Indirect calls are sadly slow now with the Spectre mitigation, but the
performance problem could at least be largely mitigated by changing the
"->get_poll_head()" operation to just have a per-file-descriptor pointer
to the poll head instead.  That gets rid of one of the new indirections.

But that doesn't fix the new complexity that is completely unwarranted
for the regular case.  The (undocumented) reason for the poll() changes
was some alleged AIO poll race fixing, but we don't make the common case
slower and more complex for some uncommon special case, so this all
really needs way more explanations and most likely a fundamental
redesign.

[ This revert is a revert of about 30 different commits, not reverted
  individually because that would just be unnecessarily messy  - Linus ]

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/Locking |   7 +-
 Documentation/filesystems/vfs.txt |  13 ----
 crypto/af_alg.c                   |  13 +++-
 crypto/algif_aead.c               |   4 +-
 crypto/algif_skcipher.c           |   4 +-
 drivers/char/random.c             |  29 ++++----
 drivers/isdn/mISDN/socket.c       |   2 +-
 drivers/net/ppp/pppoe.c           |   2 +-
 fs/aio.c                          | 148 +-------------------------------------
 fs/eventfd.c                      |  19 ++---
 fs/eventpoll.c                    |  15 ++--
 fs/pipe.c                         |  22 +++---
 fs/select.c                       |  23 ------
 fs/timerfd.c                      |  22 +++---
 include/crypto/if_alg.h           |   3 +-
 include/linux/fs.h                |   2 -
 include/linux/net.h               |   1 -
 include/linux/poll.h              |  12 ++--
 include/linux/skbuff.h            |   3 +-
 include/net/bluetooth/bluetooth.h |   2 +-
 include/net/iucv/af_iucv.h        |   2 +
 include/net/sctp/sctp.h           |   3 +-
 include/net/tcp.h                 |   3 +-
 include/net/tls.h                 |   6 +-
 include/net/udp.h                 |   2 +-
 include/uapi/linux/aio_abi.h      |   8 ++-
 net/appletalk/ddp.c               |   2 +-
 net/atm/common.c                  |  11 ++-
 net/atm/common.h                  |   2 +-
 net/atm/pvc.c                     |   2 +-
 net/atm/svc.c                     |   2 +-
 net/ax25/af_ax25.c                |   2 +-
 net/bluetooth/af_bluetooth.c      |   7 +-
 net/bluetooth/hci_sock.c          |   2 +-
 net/bluetooth/l2cap_sock.c        |   2 +-
 net/bluetooth/rfcomm/sock.c       |   2 +-
 net/bluetooth/sco.c               |   2 +-
 net/caif/caif_socket.c            |  12 ++--
 net/can/bcm.c                     |   2 +-
 net/can/raw.c                     |   2 +-
 net/core/datagram.c               |  13 ++--
 net/dccp/dccp.h                   |   3 +-
 net/dccp/ipv4.c                   |   2 +-
 net/dccp/ipv6.c                   |   2 +-
 net/dccp/proto.c                  |  13 +++-
 net/decnet/af_decnet.c            |   6 +-
 net/ieee802154/socket.c           |   4 +-
 net/ipv4/af_inet.c                |   8 +--
 net/ipv4/tcp.c                    |  23 ++++--
 net/ipv4/udp.c                    |  10 +--
 net/ipv6/af_inet6.c               |   4 +-
 net/ipv6/raw.c                    |   4 +-
 net/iucv/af_iucv.c                |   7 +-
 net/kcm/kcmsock.c                 |  10 +--
 net/key/af_key.c                  |   2 +-
 net/l2tp/l2tp_ip.c                |   2 +-
 net/l2tp/l2tp_ip6.c               |   2 +-
 net/l2tp/l2tp_ppp.c               |   2 +-
 net/llc/af_llc.c                  |   2 +-
 net/netlink/af_netlink.c          |   2 +-
 net/netrom/af_netrom.c            |   2 +-
 net/nfc/llcp_sock.c               |   9 ++-
 net/nfc/rawsock.c                 |   4 +-
 net/packet/af_packet.c            |   9 +--
 net/phonet/socket.c               |   9 ++-
 net/qrtr/qrtr.c                   |   2 +-
 net/rose/af_rose.c                |   2 +-
 net/rxrpc/af_rxrpc.c              |  10 ++-
 net/sctp/ipv6.c                   |   2 +-
 net/sctp/protocol.c               |   2 +-
 net/sctp/socket.c                 |   4 +-
 net/smc/af_smc.c                  |  12 +++-
 net/socket.c                      |  48 ++-----------
 net/tipc/socket.c                 |  14 ++--
 net/tls/tls_main.c                |   2 +-
 net/tls/tls_sw.c                  |  19 ++---
 net/unix/af_unix.c                |  30 +++++---
 net/vmw_vsock/af_vsock.c          |  19 +++--
 net/x25/af_x25.c                  |   2 +-
 net/xdp/xsk.c                     |   7 +-
 80 files changed, 301 insertions(+), 450 deletions(-)

(limited to 'net/ipv6')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 2c391338c675..37bf0a9de75c 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -441,8 +441,6 @@ prototypes:
 	int (*iterate) (struct file *, struct dir_context *);
 	int (*iterate_shared) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
-	struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t);
-	__poll_t (*poll_mask) (struct file *, __poll_t);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
@@ -473,7 +471,7 @@ prototypes:
 };
 
 locking rules:
-	All except for ->poll_mask may block.
+	All may block.
 
 ->llseek() locking has moved from llseek to the individual llseek
 implementations.  If your fs is not using generic_file_llseek, you
@@ -505,9 +503,6 @@ in sys_read() and friends.
 the lease within the individual filesystem to record the result of the
 operation
 
-->poll_mask can be called with or without the waitqueue lock for the waitqueue
-returned from ->get_poll_head.
-
 --------------------------- dquot_operations -------------------------------
 prototypes:
 	int (*write_dquot) (struct dquot *);
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 829a7b7857a4..f608180ad59d 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -857,8 +857,6 @@ struct file_operations {
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
 	int (*iterate) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
-	struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t);
-	__poll_t (*poll_mask) (struct file *, __poll_t);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
@@ -903,17 +901,6 @@ otherwise noted.
 	activity on this file and (optionally) go to sleep until there
 	is activity. Called by the select(2) and poll(2) system calls
 
-  get_poll_head: Returns the struct wait_queue_head that callers can
-  wait on.  Callers need to check the returned events using ->poll_mask
-  once woken.  Can return NULL to indicate polling is not supported,
-  or any error code using the ERR_PTR convention to indicate that a
-  grave error occured and ->poll_mask shall not be called.
-
-  poll_mask: return the mask of EPOLL* values describing the file descriptor
-  state.  Called either before going to sleep on the waitqueue returned by
-  get_poll_head, or after it has been woken.  If ->get_poll_head and
-  ->poll_mask are implemented ->poll does not need to be implement.
-
   unlocked_ioctl: called by the ioctl(2) system call.
 
   compat_ioctl: called by the ioctl(2) system call when 32 bit system calls
diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index 49fa8582138b..314c52c967e5 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -1060,12 +1060,19 @@ void af_alg_async_cb(struct crypto_async_request *_req, int err)
 }
 EXPORT_SYMBOL_GPL(af_alg_async_cb);
 
-__poll_t af_alg_poll_mask(struct socket *sock, __poll_t events)
+/**
+ * af_alg_poll - poll system call handler
+ */
+__poll_t af_alg_poll(struct file *file, struct socket *sock,
+			 poll_table *wait)
 {
 	struct sock *sk = sock->sk;
 	struct alg_sock *ask = alg_sk(sk);
 	struct af_alg_ctx *ctx = ask->private;
-	__poll_t mask = 0;
+	__poll_t mask;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+	mask = 0;
 
 	if (!ctx->more || ctx->used)
 		mask |= EPOLLIN | EPOLLRDNORM;
@@ -1075,7 +1082,7 @@ __poll_t af_alg_poll_mask(struct socket *sock, __poll_t events)
 
 	return mask;
 }
-EXPORT_SYMBOL_GPL(af_alg_poll_mask);
+EXPORT_SYMBOL_GPL(af_alg_poll);
 
 /**
  * af_alg_alloc_areq - allocate struct af_alg_async_req
diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c
index 825524f27438..c40a8c7ee8ae 100644
--- a/crypto/algif_aead.c
+++ b/crypto/algif_aead.c
@@ -375,7 +375,7 @@ static struct proto_ops algif_aead_ops = {
 	.sendmsg	=	aead_sendmsg,
 	.sendpage	=	af_alg_sendpage,
 	.recvmsg	=	aead_recvmsg,
-	.poll_mask	=	af_alg_poll_mask,
+	.poll		=	af_alg_poll,
 };
 
 static int aead_check_key(struct socket *sock)
@@ -471,7 +471,7 @@ static struct proto_ops algif_aead_ops_nokey = {
 	.sendmsg	=	aead_sendmsg_nokey,
 	.sendpage	=	aead_sendpage_nokey,
 	.recvmsg	=	aead_recvmsg_nokey,
-	.poll_mask	=	af_alg_poll_mask,
+	.poll		=	af_alg_poll,
 };
 
 static void *aead_bind(const char *name, u32 type, u32 mask)
diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c
index 4c04eb9888ad..cfdaab2b7d76 100644
--- a/crypto/algif_skcipher.c
+++ b/crypto/algif_skcipher.c
@@ -206,7 +206,7 @@ static struct proto_ops algif_skcipher_ops = {
 	.sendmsg	=	skcipher_sendmsg,
 	.sendpage	=	af_alg_sendpage,
 	.recvmsg	=	skcipher_recvmsg,
-	.poll_mask	=	af_alg_poll_mask,
+	.poll		=	af_alg_poll,
 };
 
 static int skcipher_check_key(struct socket *sock)
@@ -302,7 +302,7 @@ static struct proto_ops algif_skcipher_ops_nokey = {
 	.sendmsg	=	skcipher_sendmsg_nokey,
 	.sendpage	=	skcipher_sendpage_nokey,
 	.recvmsg	=	skcipher_recvmsg_nokey,
-	.poll_mask	=	af_alg_poll_mask,
+	.poll		=	af_alg_poll,
 };
 
 static void *skcipher_bind(const char *name, u32 type, u32 mask)
diff --git a/drivers/char/random.c b/drivers/char/random.c
index a8fb0020ba5c..cd888d4ee605 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -402,7 +402,8 @@ static struct poolinfo {
 /*
  * Static global variables
  */
-static DECLARE_WAIT_QUEUE_HEAD(random_wait);
+static DECLARE_WAIT_QUEUE_HEAD(random_read_wait);
+static DECLARE_WAIT_QUEUE_HEAD(random_write_wait);
 static struct fasync_struct *fasync;
 
 static DEFINE_SPINLOCK(random_ready_list_lock);
@@ -721,8 +722,8 @@ retry:
 
 		/* should we wake readers? */
 		if (entropy_bits >= random_read_wakeup_bits &&
-		    wq_has_sleeper(&random_wait)) {
-			wake_up_interruptible_poll(&random_wait, POLLIN);
+		    wq_has_sleeper(&random_read_wait)) {
+			wake_up_interruptible(&random_read_wait);
 			kill_fasync(&fasync, SIGIO, POLL_IN);
 		}
 		/* If the input pool is getting full, send some
@@ -1396,7 +1397,7 @@ retry:
 	trace_debit_entropy(r->name, 8 * ibytes);
 	if (ibytes &&
 	    (r->entropy_count >> ENTROPY_SHIFT) < random_write_wakeup_bits) {
-		wake_up_interruptible_poll(&random_wait, POLLOUT);
+		wake_up_interruptible(&random_write_wait);
 		kill_fasync(&fasync, SIGIO, POLL_OUT);
 	}
 
@@ -1838,7 +1839,7 @@ _random_read(int nonblock, char __user *buf, size_t nbytes)
 		if (nonblock)
 			return -EAGAIN;
 
-		wait_event_interruptible(random_wait,
+		wait_event_interruptible(random_read_wait,
 			ENTROPY_BITS(&input_pool) >=
 			random_read_wakeup_bits);
 		if (signal_pending(current))
@@ -1875,17 +1876,14 @@ urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 	return ret;
 }
 
-static struct wait_queue_head *
-random_get_poll_head(struct file *file, __poll_t events)
-{
-	return &random_wait;
-}
-
 static __poll_t
-random_poll_mask(struct file *file, __poll_t events)
+random_poll(struct file *file, poll_table * wait)
 {
-	__poll_t mask = 0;
+	__poll_t mask;
 
+	poll_wait(file, &random_read_wait, wait);
+	poll_wait(file, &random_write_wait, wait);
+	mask = 0;
 	if (ENTROPY_BITS(&input_pool) >= random_read_wakeup_bits)
 		mask |= EPOLLIN | EPOLLRDNORM;
 	if (ENTROPY_BITS(&input_pool) < random_write_wakeup_bits)
@@ -1992,8 +1990,7 @@ static int random_fasync(int fd, struct file *filp, int on)
 const struct file_operations random_fops = {
 	.read  = random_read,
 	.write = random_write,
-	.get_poll_head  = random_get_poll_head,
-	.poll_mask  = random_poll_mask,
+	.poll  = random_poll,
 	.unlocked_ioctl = random_ioctl,
 	.fasync = random_fasync,
 	.llseek = noop_llseek,
@@ -2326,7 +2323,7 @@ void add_hwgenerator_randomness(const char *buffer, size_t count,
 	 * We'll be woken up again once below random_write_wakeup_thresh,
 	 * or when the calling thread is about to terminate.
 	 */
-	wait_event_interruptible(random_wait, kthread_should_stop() ||
+	wait_event_interruptible(random_write_wait, kthread_should_stop() ||
 			ENTROPY_BITS(&input_pool) <= random_write_wakeup_bits);
 	mix_pool_bytes(poolp, buffer, count);
 	credit_entropy_bits(poolp, entropy);
diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c
index 98f90aadd141..18c0a1281914 100644
--- a/drivers/isdn/mISDN/socket.c
+++ b/drivers/isdn/mISDN/socket.c
@@ -588,7 +588,7 @@ static const struct proto_ops data_sock_ops = {
 	.getname	= data_sock_getname,
 	.sendmsg	= mISDN_sock_sendmsg,
 	.recvmsg	= mISDN_sock_recvmsg,
-	.poll_mask	= datagram_poll_mask,
+	.poll		= datagram_poll,
 	.listen		= sock_no_listen,
 	.shutdown	= sock_no_shutdown,
 	.setsockopt	= data_sock_setsockopt,
diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index de51e8f70f44..ce61231e96ea 100644
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -1107,7 +1107,7 @@ static const struct proto_ops pppoe_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= sock_no_accept,
 	.getname	= pppoe_getname,
-	.poll_mask	= datagram_poll_mask,
+	.poll		= datagram_poll,
 	.listen		= sock_no_listen,
 	.shutdown	= sock_no_shutdown,
 	.setsockopt	= sock_no_setsockopt,
diff --git a/fs/aio.c b/fs/aio.c
index e1d20124ec0e..210df9da1283 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -5,7 +5,6 @@
  *	Implements an efficient asynchronous io interface.
  *
  *	Copyright 2000, 2001, 2002 Red Hat, Inc.  All Rights Reserved.
- *	Copyright 2018 Christoph Hellwig.
  *
  *	See ../COPYING for licensing terms.
  */
@@ -165,22 +164,10 @@ struct fsync_iocb {
 	bool			datasync;
 };
 
-struct poll_iocb {
-	struct file		*file;
-	__poll_t		events;
-	struct wait_queue_head	*head;
-
-	union {
-		struct wait_queue_entry	wait;
-		struct work_struct	work;
-	};
-};
-
 struct aio_kiocb {
 	union {
 		struct kiocb		rw;
 		struct fsync_iocb	fsync;
-		struct poll_iocb	poll;
 	};
 
 	struct kioctx		*ki_ctx;
@@ -1590,6 +1577,7 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
 	if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes ||
 			iocb->aio_rw_flags))
 		return -EINVAL;
+
 	req->file = fget(iocb->aio_fildes);
 	if (unlikely(!req->file))
 		return -EBADF;
@@ -1604,137 +1592,6 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
 	return 0;
 }
 
-/* need to use list_del_init so we can check if item was present */
-static inline bool __aio_poll_remove(struct poll_iocb *req)
-{
-	if (list_empty(&req->wait.entry))
-		return false;
-	list_del_init(&req->wait.entry);
-	return true;
-}
-
-static inline void __aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
-{
-	fput(iocb->poll.file);
-	aio_complete(iocb, mangle_poll(mask), 0);
-}
-
-static void aio_poll_work(struct work_struct *work)
-{
-	struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, poll.work);
-
-	if (!list_empty_careful(&iocb->ki_list))
-		aio_remove_iocb(iocb);
-	__aio_poll_complete(iocb, iocb->poll.events);
-}
-
-static int aio_poll_cancel(struct kiocb *iocb)
-{
-	struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
-	struct poll_iocb *req = &aiocb->poll;
-	struct wait_queue_head *head = req->head;
-	bool found = false;
-
-	spin_lock(&head->lock);
-	found = __aio_poll_remove(req);
-	spin_unlock(&head->lock);
-
-	if (found) {
-		req->events = 0;
-		INIT_WORK(&req->work, aio_poll_work);
-		schedule_work(&req->work);
-	}
-	return 0;
-}
-
-static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
-		void *key)
-{
-	struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
-	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
-	struct file *file = req->file;
-	__poll_t mask = key_to_poll(key);
-
-	assert_spin_locked(&req->head->lock);
-
-	/* for instances that support it check for an event match first: */
-	if (mask && !(mask & req->events))
-		return 0;
-
-	mask = file->f_op->poll_mask(file, req->events) & req->events;
-	if (!mask)
-		return 0;
-
-	__aio_poll_remove(req);
-
-	/*
-	 * Try completing without a context switch if we can acquire ctx_lock
-	 * without spinning.  Otherwise we need to defer to a workqueue to
-	 * avoid a deadlock due to the lock order.
-	 */
-	if (spin_trylock(&iocb->ki_ctx->ctx_lock)) {
-		list_del_init(&iocb->ki_list);
-		spin_unlock(&iocb->ki_ctx->ctx_lock);
-
-		__aio_poll_complete(iocb, mask);
-	} else {
-		req->events = mask;
-		INIT_WORK(&req->work, aio_poll_work);
-		schedule_work(&req->work);
-	}
-
-	return 1;
-}
-
-static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
-{
-	struct kioctx *ctx = aiocb->ki_ctx;
-	struct poll_iocb *req = &aiocb->poll;
-	__poll_t mask;
-
-	/* reject any unknown events outside the normal event mask. */
-	if ((u16)iocb->aio_buf != iocb->aio_buf)
-		return -EINVAL;
-	/* reject fields that are not defined for poll */
-	if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)
-		return -EINVAL;
-
-	req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
-	req->file = fget(iocb->aio_fildes);
-	if (unlikely(!req->file))
-		return -EBADF;
-	if (!file_has_poll_mask(req->file))
-		goto out_fail;
-
-	req->head = req->file->f_op->get_poll_head(req->file, req->events);
-	if (!req->head)
-		goto out_fail;
-	if (IS_ERR(req->head)) {
-		mask = EPOLLERR;
-		goto done;
-	}
-
-	init_waitqueue_func_entry(&req->wait, aio_poll_wake);
-	aiocb->ki_cancel = aio_poll_cancel;
-
-	spin_lock_irq(&ctx->ctx_lock);
-	spin_lock(&req->head->lock);
-	mask = req->file->f_op->poll_mask(req->file, req->events) & req->events;
-	if (!mask) {
-		__add_wait_queue(req->head, &req->wait);
-		list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
-	}
-	spin_unlock(&req->head->lock);
-	spin_unlock_irq(&ctx->ctx_lock);
-done:
-	if (mask)
-		__aio_poll_complete(aiocb, mask);
-	return 0;
-out_fail:
-	fput(req->file);
-	return -EINVAL; /* same as no support for IOCB_CMD_POLL */
-}
-
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 			 bool compat)
 {
@@ -1808,9 +1665,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	case IOCB_CMD_FDSYNC:
 		ret = aio_fsync(&req->fsync, &iocb, true);
 		break;
-	case IOCB_CMD_POLL:
-		ret = aio_poll(req, &iocb);
-		break;
 	default:
 		pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode);
 		ret = -EINVAL;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index ceb1031f1cac..08d3bd602f73 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -101,20 +101,14 @@ static int eventfd_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static struct wait_queue_head *
-eventfd_get_poll_head(struct file *file, __poll_t events)
-{
-	struct eventfd_ctx *ctx = file->private_data;
-
-	return &ctx->wqh;
-}
-
-static __poll_t eventfd_poll_mask(struct file *file, __poll_t eventmask)
+static __poll_t eventfd_poll(struct file *file, poll_table *wait)
 {
 	struct eventfd_ctx *ctx = file->private_data;
 	__poll_t events = 0;
 	u64 count;
 
+	poll_wait(file, &ctx->wqh, wait);
+
 	/*
 	 * All writes to ctx->count occur within ctx->wqh.lock.  This read
 	 * can be done outside ctx->wqh.lock because we know that poll_wait
@@ -156,11 +150,11 @@ static __poll_t eventfd_poll_mask(struct file *file, __poll_t eventmask)
 	count = READ_ONCE(ctx->count);
 
 	if (count > 0)
-		events |= (EPOLLIN & eventmask);
+		events |= EPOLLIN;
 	if (count == ULLONG_MAX)
 		events |= EPOLLERR;
 	if (ULLONG_MAX - 1 > count)
-		events |= (EPOLLOUT & eventmask);
+		events |= EPOLLOUT;
 
 	return events;
 }
@@ -311,8 +305,7 @@ static const struct file_operations eventfd_fops = {
 	.show_fdinfo	= eventfd_show_fdinfo,
 #endif
 	.release	= eventfd_release,
-	.get_poll_head	= eventfd_get_poll_head,
-	.poll_mask	= eventfd_poll_mask,
+	.poll		= eventfd_poll,
 	.read		= eventfd_read,
 	.write		= eventfd_write,
 	.llseek		= noop_llseek,
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index ea4436f409fb..67db22fe99c5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -922,18 +922,14 @@ static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head
 	return 0;
 }
 
-static struct wait_queue_head *ep_eventpoll_get_poll_head(struct file *file,
-		__poll_t eventmask)
-{
-	struct eventpoll *ep = file->private_data;
-	return &ep->poll_wait;
-}
-
-static __poll_t ep_eventpoll_poll_mask(struct file *file, __poll_t eventmask)
+static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
 {
 	struct eventpoll *ep = file->private_data;
 	int depth = 0;
 
+	/* Insert inside our poll wait queue */
+	poll_wait(file, &ep->poll_wait, wait);
+
 	/*
 	 * Proceed to find out if wanted events are really available inside
 	 * the ready list.
@@ -972,8 +968,7 @@ static const struct file_operations eventpoll_fops = {
 	.show_fdinfo	= ep_show_fdinfo,
 #endif
 	.release	= ep_eventpoll_release,
-	.get_poll_head	= ep_eventpoll_get_poll_head,
-	.poll_mask	= ep_eventpoll_poll_mask,
+	.poll		= ep_eventpoll_poll,
 	.llseek		= noop_llseek,
 };
 
diff --git a/fs/pipe.c b/fs/pipe.c
index bb0840e234f3..39d6f431da83 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -509,22 +509,19 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	}
 }
 
-static struct wait_queue_head *
-pipe_get_poll_head(struct file *filp, __poll_t events)
-{
-	struct pipe_inode_info *pipe = filp->private_data;
-
-	return &pipe->wait;
-}
-
 /* No kernel lock held - fine */
-static __poll_t pipe_poll_mask(struct file *filp, __poll_t events)
+static __poll_t
+pipe_poll(struct file *filp, poll_table *wait)
 {
+	__poll_t mask;
 	struct pipe_inode_info *pipe = filp->private_data;
-	int nrbufs = pipe->nrbufs;
-	__poll_t mask = 0;
+	int nrbufs;
+
+	poll_wait(filp, &pipe->wait, wait);
 
 	/* Reading only -- no need for acquiring the semaphore.  */
+	nrbufs = pipe->nrbufs;
+	mask = 0;
 	if (filp->f_mode & FMODE_READ) {
 		mask = (nrbufs > 0) ? EPOLLIN | EPOLLRDNORM : 0;
 		if (!pipe->writers && filp->f_version != pipe->w_counter)
@@ -1023,8 +1020,7 @@ const struct file_operations pipefifo_fops = {
 	.llseek		= no_llseek,
 	.read_iter	= pipe_read,
 	.write_iter	= pipe_write,
-	.get_poll_head	= pipe_get_poll_head,
-	.poll_mask	= pipe_poll_mask,
+	.poll		= pipe_poll,
 	.unlocked_ioctl	= pipe_ioctl,
 	.release	= pipe_release,
 	.fasync		= pipe_fasync,
diff --git a/fs/select.c b/fs/select.c
index 317891ff8165..4a6b6e4b21cb 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -34,29 +34,6 @@
 
 #include <linux/uaccess.h>
 
-__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
-{
-	if (file->f_op->poll) {
-		return file->f_op->poll(file, pt);
-	} else if (file_has_poll_mask(file)) {
-		unsigned int events = poll_requested_events(pt);
-		struct wait_queue_head *head;
-
-		if (pt && pt->_qproc) {
-			head = file->f_op->get_poll_head(file, events);
-			if (!head)
-				return DEFAULT_POLLMASK;
-			if (IS_ERR(head))
-				return EPOLLERR;
-			pt->_qproc(file, head, pt);
-		}
-
-		return file->f_op->poll_mask(file, events);
-	} else {
-		return DEFAULT_POLLMASK;
-	}
-}
-EXPORT_SYMBOL_GPL(vfs_poll);
 
 /*
  * Estimate expected accuracy in ns from a timeval.
diff --git a/fs/timerfd.c b/fs/timerfd.c
index d84a2bee4f82..cdad49da3ff7 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -226,20 +226,21 @@ static int timerfd_release(struct inode *inode, struct file *file)
 	kfree_rcu(ctx, rcu);
 	return 0;
 }
-	
-static struct wait_queue_head *timerfd_get_poll_head(struct file *file,
-		__poll_t eventmask)
+
+static __poll_t timerfd_poll(struct file *file, poll_table *wait)
 {
 	struct timerfd_ctx *ctx = file->private_data;
+	__poll_t events = 0;
+	unsigned long flags;
 
-	return &ctx->wqh;
-}
+	poll_wait(file, &ctx->wqh, wait);
 
-static __poll_t timerfd_poll_mask(struct file *file, __poll_t eventmask)
-{
-	struct timerfd_ctx *ctx = file->private_data;
+	spin_lock_irqsave(&ctx->wqh.lock, flags);
+	if (ctx->ticks)
+		events |= EPOLLIN;
+	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
-	return ctx->ticks ? EPOLLIN : 0;
+	return events;
 }
 
 static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
@@ -363,8 +364,7 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg
 
 static const struct file_operations timerfd_fops = {
 	.release	= timerfd_release,
-	.get_poll_head	= timerfd_get_poll_head,
-	.poll_mask	= timerfd_poll_mask,
+	.poll		= timerfd_poll,
 	.read		= timerfd_read,
 	.llseek		= noop_llseek,
 	.show_fdinfo	= timerfd_show,
diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h
index cc414db9da0a..482461d8931d 100644
--- a/include/crypto/if_alg.h
+++ b/include/crypto/if_alg.h
@@ -245,7 +245,8 @@ ssize_t af_alg_sendpage(struct socket *sock, struct page *page,
 			int offset, size_t size, int flags);
 void af_alg_free_resources(struct af_alg_async_req *areq);
 void af_alg_async_cb(struct crypto_async_request *_req, int err);
-__poll_t af_alg_poll_mask(struct socket *sock, __poll_t events);
+__poll_t af_alg_poll(struct file *file, struct socket *sock,
+			 poll_table *wait);
 struct af_alg_async_req *af_alg_alloc_areq(struct sock *sk,
 					   unsigned int areqlen);
 int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5c91108846db..d78d146a98da 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1720,8 +1720,6 @@ struct file_operations {
 	int (*iterate) (struct file *, struct dir_context *);
 	int (*iterate_shared) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
-	struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t);
-	__poll_t (*poll_mask) (struct file *, __poll_t);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
diff --git a/include/linux/net.h b/include/linux/net.h
index 08b6eb964dd6..6554d3ba4396 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -147,7 +147,6 @@ struct proto_ops {
 	int		(*getname)   (struct socket *sock,
 				      struct sockaddr *addr,
 				      int peer);
-	__poll_t	(*poll_mask) (struct socket *sock, __poll_t events);
 	__poll_t	(*poll)	     (struct file *file, struct socket *sock,
 				      struct poll_table_struct *wait);
 	int		(*ioctl)     (struct socket *sock, unsigned int cmd,
diff --git a/include/linux/poll.h b/include/linux/poll.h
index fdf86b4cbc71..7e0fdcf905d2 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -74,18 +74,18 @@ static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
 	pt->_key   = ~(__poll_t)0; /* all events enabled */
 }
 
-static inline bool file_has_poll_mask(struct file *file)
+static inline bool file_can_poll(struct file *file)
 {
-	return file->f_op->get_poll_head && file->f_op->poll_mask;
+	return file->f_op->poll;
 }
 
-static inline bool file_can_poll(struct file *file)
+static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
 {
-	return file->f_op->poll || file_has_poll_mask(file);
+	if (unlikely(!file->f_op->poll))
+		return DEFAULT_POLLMASK;
+	return file->f_op->poll(file, pt);
 }
 
-__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt);
-
 struct poll_table_entry {
 	struct file *filp;
 	__poll_t key;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c86885954994..164cdedf6012 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3252,7 +3252,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
 				    int *peeked, int *off, int *err);
 struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock,
 				  int *err);
-__poll_t datagram_poll_mask(struct socket *sock, __poll_t events);
+__poll_t datagram_poll(struct file *file, struct socket *sock,
+			   struct poll_table_struct *wait);
 int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
 			   struct iov_iter *to, int size);
 static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset,
diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index 53ce8176c313..ec9d6bc65855 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -271,7 +271,7 @@ int  bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 		     int flags);
 int  bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg,
 			    size_t len, int flags);
-__poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events);
+__poll_t bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait);
 int  bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 int  bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo);
 int  bt_sock_wait_ready(struct sock *sk, unsigned long flags);
diff --git a/include/net/iucv/af_iucv.h b/include/net/iucv/af_iucv.h
index b0eaeb02d46d..f4c21b5a1242 100644
--- a/include/net/iucv/af_iucv.h
+++ b/include/net/iucv/af_iucv.h
@@ -153,6 +153,8 @@ struct iucv_sock_list {
 	atomic_t	  autobind_name;
 };
 
+__poll_t iucv_sock_poll(struct file *file, struct socket *sock,
+			    poll_table *wait);
 void iucv_sock_link(struct iucv_sock_list *l, struct sock *s);
 void iucv_sock_unlink(struct iucv_sock_list *l, struct sock *s);
 void iucv_accept_enqueue(struct sock *parent, struct sock *sk);
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 30b3e2fe240a..8c2caa370e0f 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -109,7 +109,8 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb);
 int sctp_inet_listen(struct socket *sock, int backlog);
 void sctp_write_space(struct sock *sk);
 void sctp_data_ready(struct sock *sk);
-__poll_t sctp_poll_mask(struct socket *sock, __poll_t events);
+__poll_t sctp_poll(struct file *file, struct socket *sock,
+		poll_table *wait);
 void sctp_sock_rfree(struct sk_buff *skb);
 void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 		    struct sctp_association *asoc);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0448e7c5d2b4..800582b5dd54 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -388,7 +388,8 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
 void tcp_close(struct sock *sk, long timeout);
 void tcp_init_sock(struct sock *sk);
 void tcp_init_transfer(struct sock *sk, int bpf_op);
-__poll_t tcp_poll_mask(struct socket *sock, __poll_t events);
+__poll_t tcp_poll(struct file *file, struct socket *sock,
+		      struct poll_table_struct *wait);
 int tcp_getsockopt(struct sock *sk, int level, int optname,
 		   char __user *optval, int __user *optlen);
 int tcp_setsockopt(struct sock *sk, int level, int optname,
diff --git a/include/net/tls.h b/include/net/tls.h
index 7f84ea3e217c..70c273777fe9 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -109,7 +109,8 @@ struct tls_sw_context_rx {
 
 	struct strparser strp;
 	void (*saved_data_ready)(struct sock *sk);
-	__poll_t (*sk_poll_mask)(struct socket *sock, __poll_t events);
+	unsigned int (*sk_poll)(struct file *file, struct socket *sock,
+				struct poll_table_struct *wait);
 	struct sk_buff *recv_pkt;
 	u8 control;
 	bool decrypted;
@@ -224,7 +225,8 @@ void tls_sw_free_resources_tx(struct sock *sk);
 void tls_sw_free_resources_rx(struct sock *sk);
 int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		   int nonblock, int flags, int *addr_len);
-__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events);
+unsigned int tls_sw_poll(struct file *file, struct socket *sock,
+			 struct poll_table_struct *wait);
 ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
 			   struct pipe_inode_info *pipe,
 			   size_t len, unsigned int flags);
diff --git a/include/net/udp.h b/include/net/udp.h
index b1ea8b0f5e6a..81afdacd4fff 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -285,7 +285,7 @@ int udp_init_sock(struct sock *sk);
 int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
 int __udp_disconnect(struct sock *sk, int flags);
 int udp_disconnect(struct sock *sk, int flags);
-__poll_t udp_poll_mask(struct socket *sock, __poll_t events);
+__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait);
 struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
 				       netdev_features_t features,
 				       bool is_ipv6);
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index d00221345c19..d4e768d55d14 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -39,8 +39,10 @@ enum {
 	IOCB_CMD_PWRITE = 1,
 	IOCB_CMD_FSYNC = 2,
 	IOCB_CMD_FDSYNC = 3,
-	/* 4 was the experimental IOCB_CMD_PREADX */
-	IOCB_CMD_POLL = 5,
+	/* These two are experimental.
+	 * IOCB_CMD_PREADX = 4,
+	 * IOCB_CMD_POLL = 5,
+	 */
 	IOCB_CMD_NOOP = 6,
 	IOCB_CMD_PREADV = 7,
 	IOCB_CMD_PWRITEV = 8,
@@ -109,7 +111,7 @@ struct iocb {
 #undef IFLITTLE
 
 struct __aio_sigset {
-	const sigset_t __user	*sigmask;
+	sigset_t __user	*sigmask;
 	size_t		sigsetsize;
 };
 
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 55fdba05d7d9..9b6bc5abe946 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1869,7 +1869,7 @@ static const struct proto_ops atalk_dgram_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= sock_no_accept,
 	.getname	= atalk_getname,
-	.poll_mask	= datagram_poll_mask,
+	.poll		= datagram_poll,
 	.ioctl		= atalk_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= atalk_compat_ioctl,
diff --git a/net/atm/common.c b/net/atm/common.c
index ff5748b2190f..a7a68e509628 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -647,11 +647,16 @@ out:
 	return error;
 }
 
-__poll_t vcc_poll_mask(struct socket *sock, __poll_t events)
+__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
 {
 	struct sock *sk = sock->sk;
-	struct atm_vcc *vcc = ATM_SD(sock);
-	__poll_t mask = 0;
+	struct atm_vcc *vcc;
+	__poll_t mask;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+	mask = 0;
+
+	vcc = ATM_SD(sock);
 
 	/* exceptional events */
 	if (sk->sk_err)
diff --git a/net/atm/common.h b/net/atm/common.h
index 526796ad230f..5850649068bb 100644
--- a/net/atm/common.h
+++ b/net/atm/common.h
@@ -17,7 +17,7 @@ int vcc_connect(struct socket *sock, int itf, short vpi, int vci);
 int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 		int flags);
 int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len);
-__poll_t vcc_poll_mask(struct socket *sock, __poll_t events);
+__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait);
 int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 int vcc_setsockopt(struct socket *sock, int level, int optname,
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
index 9f75092fe778..2cb10af16afc 100644
--- a/net/atm/pvc.c
+++ b/net/atm/pvc.c
@@ -113,7 +113,7 @@ static const struct proto_ops pvc_proto_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	pvc_getname,
-	.poll_mask =	vcc_poll_mask,
+	.poll =		vcc_poll,
 	.ioctl =	vcc_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = vcc_compat_ioctl,
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 53f4ad7087b1..2f91b766ac42 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -636,7 +636,7 @@ static const struct proto_ops svc_proto_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	svc_accept,
 	.getname =	svc_getname,
-	.poll_mask =	vcc_poll_mask,
+	.poll =		vcc_poll,
 	.ioctl =	svc_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl =	svc_compat_ioctl,
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index d1d2442ce573..c603d33d5410 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1941,7 +1941,7 @@ static const struct proto_ops ax25_proto_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= ax25_accept,
 	.getname	= ax25_getname,
-	.poll_mask	= datagram_poll_mask,
+	.poll		= datagram_poll,
 	.ioctl		= ax25_ioctl,
 	.listen		= ax25_listen,
 	.shutdown	= ax25_shutdown,
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 510ab4f55df5..3264e1873219 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -437,13 +437,16 @@ static inline __poll_t bt_accept_poll(struct sock *parent)
 	return 0;
 }
 
-__poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events)
+__poll_t bt_sock_poll(struct file *file, struct socket *sock,
+			  poll_table *wait)
 {
 	struct sock *sk = sock->sk;
 	__poll_t mask = 0;
 
 	BT_DBG("sock %p, sk %p", sock, sk);
 
+	poll_wait(file, sk_sleep(sk), wait);
+
 	if (sk->sk_state == BT_LISTEN)
 		return bt_accept_poll(sk);
 
@@ -475,7 +478,7 @@ __poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events)
 
 	return mask;
 }
-EXPORT_SYMBOL(bt_sock_poll_mask);
+EXPORT_SYMBOL(bt_sock_poll);
 
 int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 {
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index d6c099861538..1506e1632394 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -1975,7 +1975,7 @@ static const struct proto_ops hci_sock_ops = {
 	.sendmsg	= hci_sock_sendmsg,
 	.recvmsg	= hci_sock_recvmsg,
 	.ioctl		= hci_sock_ioctl,
-	.poll_mask	= datagram_poll_mask,
+	.poll		= datagram_poll,
 	.listen		= sock_no_listen,
 	.shutdown	= sock_no_shutdown,
 	.setsockopt	= hci_sock_setsockopt,
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 742a190034e6..686bdc6b35b0 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1653,7 +1653,7 @@ static const struct proto_ops l2cap_sock_ops = {
 	.getname	= l2cap_sock_getname,
 	.sendmsg	= l2cap_sock_sendmsg,
 	.recvmsg	= l2cap_sock_recvmsg,
-	.poll_mask	= bt_sock_poll_mask,
+	.poll		= bt_sock_poll,
 	.ioctl		= bt_sock_ioctl,
 	.mmap		= sock_no_mmap,
 	.socketpair	= sock_no_socketpair,
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 1cf57622473a..d606e9212291 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -1049,7 +1049,7 @@ static const struct proto_ops rfcomm_sock_ops = {
 	.setsockopt	= rfcomm_sock_setsockopt,
 	.getsockopt	= rfcomm_sock_getsockopt,
 	.ioctl		= rfcomm_sock_ioctl,
-	.poll_mask	= bt_sock_poll_mask,
+	.poll		= bt_sock_poll,
 	.socketpair	= sock_no_socketpair,
 	.mmap		= sock_no_mmap
 };
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index d60dbc61d170..413b8ee49fec 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -1197,7 +1197,7 @@ static const struct proto_ops sco_sock_ops = {
 	.getname	= sco_sock_getname,
 	.sendmsg	= sco_sock_sendmsg,
 	.recvmsg	= sco_sock_recvmsg,
-	.poll_mask	= bt_sock_poll_mask,
+	.poll		= bt_sock_poll,
 	.ioctl		= bt_sock_ioctl,
 	.mmap		= sock_no_mmap,
 	.socketpair	= sock_no_socketpair,
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index c7991867d622..a6fb1b3bcad9 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -934,11 +934,15 @@ static int caif_release(struct socket *sock)
 }
 
 /* Copied from af_unix.c:unix_poll(), added CAIF tx_flow handling */
-static __poll_t caif_poll_mask(struct socket *sock, __poll_t events)
+static __poll_t caif_poll(struct file *file,
+			      struct socket *sock, poll_table *wait)
 {
 	struct sock *sk = sock->sk;
+	__poll_t mask;
 	struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
-	__poll_t mask = 0;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+	mask = 0;
 
 	/* exceptional events? */
 	if (sk->sk_err)
@@ -972,7 +976,7 @@ static const struct proto_ops caif_seqpacket_ops = {
 	.socketpair = sock_no_socketpair,
 	.accept = sock_no_accept,
 	.getname = sock_no_getname,
-	.poll_mask = caif_poll_mask,
+	.poll = caif_poll,
 	.ioctl = sock_no_ioctl,
 	.listen = sock_no_listen,
 	.shutdown = sock_no_shutdown,
@@ -993,7 +997,7 @@ static const struct proto_ops caif_stream_ops = {
 	.socketpair = sock_no_socketpair,
 	.accept = sock_no_accept,
 	.getname = sock_no_getname,
-	.poll_mask = caif_poll_mask,
+	.poll = caif_poll,
 	.ioctl = sock_no_ioctl,
 	.listen = sock_no_listen,
 	.shutdown = sock_no_shutdown,
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 9393f25df08d..0af8f0db892a 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -1660,7 +1660,7 @@ static const struct proto_ops bcm_ops = {
 	.socketpair    = sock_no_socketpair,
 	.accept        = sock_no_accept,
 	.getname       = sock_no_getname,
-	.poll_mask     = datagram_poll_mask,
+	.poll          = datagram_poll,
 	.ioctl         = can_ioctl,	/* use can_ioctl() from af_can.c */
 	.listen        = sock_no_listen,
 	.shutdown      = sock_no_shutdown,
diff --git a/net/can/raw.c b/net/can/raw.c
index fd7e2f49ea6a..1051eee82581 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -843,7 +843,7 @@ static const struct proto_ops raw_ops = {
 	.socketpair    = sock_no_socketpair,
 	.accept        = sock_no_accept,
 	.getname       = raw_getname,
-	.poll_mask     = datagram_poll_mask,
+	.poll          = datagram_poll,
 	.ioctl         = can_ioctl,	/* use can_ioctl() from af_can.c */
 	.listen        = sock_no_listen,
 	.shutdown      = sock_no_shutdown,
diff --git a/net/core/datagram.c b/net/core/datagram.c
index f19bf3dc2bd6..9938952c5c78 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -819,8 +819,9 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);
 
 /**
  * 	datagram_poll - generic datagram poll
+ *	@file: file struct
  *	@sock: socket
- *	@events to wait for
+ *	@wait: poll table
  *
  *	Datagram poll: Again totally generic. This also handles
  *	sequenced packet sockets providing the socket receive queue
@@ -830,10 +831,14 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);
  *	and you use a different write policy from sock_writeable()
  *	then please supply your own write_space callback.
  */
-__poll_t datagram_poll_mask(struct socket *sock, __poll_t events)
+__poll_t datagram_poll(struct file *file, struct socket *sock,
+			   poll_table *wait)
 {
 	struct sock *sk = sock->sk;
-	__poll_t mask = 0;
+	__poll_t mask;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+	mask = 0;
 
 	/* exceptional events? */
 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
@@ -866,4 +871,4 @@ __poll_t datagram_poll_mask(struct socket *sock, __poll_t events)
 
 	return mask;
 }
-EXPORT_SYMBOL(datagram_poll_mask);
+EXPORT_SYMBOL(datagram_poll);
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 0ea2ee56ac1b..f91e3816806b 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -316,7 +316,8 @@ int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 		 int flags, int *addr_len);
 void dccp_shutdown(struct sock *sk, int how);
 int inet_dccp_listen(struct socket *sock, int backlog);
-__poll_t dccp_poll_mask(struct socket *sock, __poll_t events);
+__poll_t dccp_poll(struct file *file, struct socket *sock,
+		       poll_table *wait);
 int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
 void dccp_req_err(struct sock *sk, u64 seq);
 
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index a9e478cd3787..b08feb219b44 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -984,7 +984,7 @@ static const struct proto_ops inet_dccp_ops = {
 	.accept		   = inet_accept,
 	.getname	   = inet_getname,
 	/* FIXME: work on tcp_poll to rename it to inet_csk_poll */
-	.poll_mask	   = dccp_poll_mask,
+	.poll		   = dccp_poll,
 	.ioctl		   = inet_ioctl,
 	/* FIXME: work on inet_listen to rename it to sock_common_listen */
 	.listen		   = inet_dccp_listen,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 17fc4e0166ba..6344f1b18a6a 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1070,7 +1070,7 @@ static const struct proto_ops inet6_dccp_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = inet_accept,
 	.getname	   = inet6_getname,
-	.poll_mask	   = dccp_poll_mask,
+	.poll		   = dccp_poll,
 	.ioctl		   = inet6_ioctl,
 	.listen		   = inet_dccp_listen,
 	.shutdown	   = inet_shutdown,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index ca21c1c76da0..0d56e36a6db7 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -312,11 +312,20 @@ int dccp_disconnect(struct sock *sk, int flags)
 
 EXPORT_SYMBOL_GPL(dccp_disconnect);
 
-__poll_t dccp_poll_mask(struct socket *sock, __poll_t events)
+/*
+ *	Wait for a DCCP event.
+ *
+ *	Note that we don't need to lock the socket, as the upper poll layers
+ *	take care of normal races (between the test and the event) and we don't
+ *	go look at any of the socket buffers directly.
+ */
+__poll_t dccp_poll(struct file *file, struct socket *sock,
+		       poll_table *wait)
 {
 	__poll_t mask;
 	struct sock *sk = sock->sk;
 
+	sock_poll_wait(file, sk_sleep(sk), wait);
 	if (sk->sk_state == DCCP_LISTEN)
 		return inet_csk_listen_poll(sk);
 
@@ -358,7 +367,7 @@ __poll_t dccp_poll_mask(struct socket *sock, __poll_t events)
 	return mask;
 }
 
-EXPORT_SYMBOL_GPL(dccp_poll_mask);
+EXPORT_SYMBOL_GPL(dccp_poll);
 
 int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 {
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 9a686d890bfa..7d6ff983ba2c 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -1207,11 +1207,11 @@ static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int peer)
 }
 
 
-static __poll_t dn_poll_mask(struct socket *sock, __poll_t events)
+static __poll_t dn_poll(struct file *file, struct socket *sock, poll_table  *wait)
 {
 	struct sock *sk = sock->sk;
 	struct dn_scp *scp = DN_SK(sk);
-	__poll_t mask = datagram_poll_mask(sock, events);
+	__poll_t mask = datagram_poll(file, sock, wait);
 
 	if (!skb_queue_empty(&scp->other_receive_queue))
 		mask |= EPOLLRDBAND;
@@ -2331,7 +2331,7 @@ static const struct proto_ops dn_proto_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	dn_accept,
 	.getname =	dn_getname,
-	.poll_mask =	dn_poll_mask,
+	.poll =		dn_poll,
 	.ioctl =	dn_ioctl,
 	.listen =	dn_listen,
 	.shutdown =	dn_shutdown,
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index a0768d2759b8..a60658c85a9a 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -423,7 +423,7 @@ static const struct proto_ops ieee802154_raw_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = sock_no_accept,
 	.getname	   = sock_no_getname,
-	.poll_mask	   = datagram_poll_mask,
+	.poll		   = datagram_poll,
 	.ioctl		   = ieee802154_sock_ioctl,
 	.listen		   = sock_no_listen,
 	.shutdown	   = sock_no_shutdown,
@@ -969,7 +969,7 @@ static const struct proto_ops ieee802154_dgram_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = sock_no_accept,
 	.getname	   = sock_no_getname,
-	.poll_mask	   = datagram_poll_mask,
+	.poll		   = datagram_poll,
 	.ioctl		   = ieee802154_sock_ioctl,
 	.listen		   = sock_no_listen,
 	.shutdown	   = sock_no_shutdown,
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 15e125558c76..b403499fdabe 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -986,7 +986,7 @@ const struct proto_ops inet_stream_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = inet_accept,
 	.getname	   = inet_getname,
-	.poll_mask	   = tcp_poll_mask,
+	.poll		   = tcp_poll,
 	.ioctl		   = inet_ioctl,
 	.listen		   = inet_listen,
 	.shutdown	   = inet_shutdown,
@@ -1021,7 +1021,7 @@ const struct proto_ops inet_dgram_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = sock_no_accept,
 	.getname	   = inet_getname,
-	.poll_mask	   = udp_poll_mask,
+	.poll		   = udp_poll,
 	.ioctl		   = inet_ioctl,
 	.listen		   = sock_no_listen,
 	.shutdown	   = inet_shutdown,
@@ -1042,7 +1042,7 @@ EXPORT_SYMBOL(inet_dgram_ops);
 
 /*
  * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
- * udp_poll_mask
+ * udp_poll
  */
 static const struct proto_ops inet_sockraw_ops = {
 	.family		   = PF_INET,
@@ -1053,7 +1053,7 @@ static const struct proto_ops inet_sockraw_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = sock_no_accept,
 	.getname	   = inet_getname,
-	.poll_mask	   = datagram_poll_mask,
+	.poll		   = datagram_poll,
 	.ioctl		   = inet_ioctl,
 	.listen		   = sock_no_listen,
 	.shutdown	   = inet_shutdown,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 141acd92e58a..e7b53d2a971f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -494,21 +494,32 @@ static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
 }
 
 /*
- * Socket is not locked. We are protected from async events by poll logic and
- * correct handling of state changes made by other threads is impossible in
- * any case.
+ *	Wait for a TCP event.
+ *
+ *	Note that we don't need to lock the socket, as the upper poll layers
+ *	take care of normal races (between the test and the event) and we don't
+ *	go look at any of the socket buffers directly.
  */
-__poll_t tcp_poll_mask(struct socket *sock, __poll_t events)
+__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 {
+	__poll_t mask;
 	struct sock *sk = sock->sk;
 	const struct tcp_sock *tp = tcp_sk(sk);
-	__poll_t mask = 0;
 	int state;
 
+	sock_poll_wait(file, sk_sleep(sk), wait);
+
 	state = inet_sk_state_load(sk);
 	if (state == TCP_LISTEN)
 		return inet_csk_listen_poll(sk);
 
+	/* Socket is not locked. We are protected from async events
+	 * by poll logic and correct handling of state changes
+	 * made by other threads is impossible in any case.
+	 */
+
+	mask = 0;
+
 	/*
 	 * EPOLLHUP is certainly not done right. But poll() doesn't
 	 * have a notion of HUP in just one direction, and for a
@@ -589,7 +600,7 @@ __poll_t tcp_poll_mask(struct socket *sock, __poll_t events)
 
 	return mask;
 }
-EXPORT_SYMBOL(tcp_poll_mask);
+EXPORT_SYMBOL(tcp_poll);
 
 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 9bb27df4dac5..24e116ddae79 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2591,7 +2591,7 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname,
  * 	udp_poll - wait for a UDP event.
  *	@file - file struct
  *	@sock - socket
- *	@events - events to wait for
+ *	@wait - poll table
  *
  *	This is same as datagram poll, except for the special case of
  *	blocking sockets. If application is using a blocking fd
@@ -2600,23 +2600,23 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname,
  *	but then block when reading it. Add special case code
  *	to work around these arguably broken applications.
  */
-__poll_t udp_poll_mask(struct socket *sock, __poll_t events)
+__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
 {
-	__poll_t mask = datagram_poll_mask(sock, events);
+	__poll_t mask = datagram_poll(file, sock, wait);
 	struct sock *sk = sock->sk;
 
 	if (!skb_queue_empty(&udp_sk(sk)->reader_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* Check for false positives due to checksum errors */
-	if ((mask & EPOLLRDNORM) && !(sock->file->f_flags & O_NONBLOCK) &&
+	if ((mask & EPOLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
 	    !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
 		mask &= ~(EPOLLIN | EPOLLRDNORM);
 
 	return mask;
 
 }
-EXPORT_SYMBOL(udp_poll_mask);
+EXPORT_SYMBOL(udp_poll);
 
 int udp_abort(struct sock *sk, int err)
 {
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 74f2a261e8df..9ed0eae91758 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -570,7 +570,7 @@ const struct proto_ops inet6_stream_ops = {
 	.socketpair	   = sock_no_socketpair,	/* a do nothing	*/
 	.accept		   = inet_accept,		/* ok		*/
 	.getname	   = inet6_getname,
-	.poll_mask	   = tcp_poll_mask,		/* ok		*/
+	.poll		   = tcp_poll,			/* ok		*/
 	.ioctl		   = inet6_ioctl,		/* must change  */
 	.listen		   = inet_listen,		/* ok		*/
 	.shutdown	   = inet_shutdown,		/* ok		*/
@@ -603,7 +603,7 @@ const struct proto_ops inet6_dgram_ops = {
 	.socketpair	   = sock_no_socketpair,	/* a do nothing	*/
 	.accept		   = sock_no_accept,		/* a do nothing	*/
 	.getname	   = inet6_getname,
-	.poll_mask	   = udp_poll_mask,		/* ok		*/
+	.poll		   = udp_poll,			/* ok		*/
 	.ioctl		   = inet6_ioctl,		/* must change  */
 	.listen		   = sock_no_listen,		/* ok		*/
 	.shutdown	   = inet_shutdown,		/* ok		*/
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index ce6f0d15b5dd..afc307c89d1a 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -1334,7 +1334,7 @@ void raw6_proc_exit(void)
 }
 #endif	/* CONFIG_PROC_FS */
 
-/* Same as inet6_dgram_ops, sans udp_poll_mask.  */
+/* Same as inet6_dgram_ops, sans udp_poll.  */
 const struct proto_ops inet6_sockraw_ops = {
 	.family		   = PF_INET6,
 	.owner		   = THIS_MODULE,
@@ -1344,7 +1344,7 @@ const struct proto_ops inet6_sockraw_ops = {
 	.socketpair	   = sock_no_socketpair,	/* a do nothing	*/
 	.accept		   = sock_no_accept,		/* a do nothing	*/
 	.getname	   = inet6_getname,
-	.poll_mask	   = datagram_poll_mask,	/* ok		*/
+	.poll		   = datagram_poll,		/* ok		*/
 	.ioctl		   = inet6_ioctl,		/* must change  */
 	.listen		   = sock_no_listen,		/* ok		*/
 	.shutdown	   = inet_shutdown,		/* ok		*/
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 68e86257a549..893a022f9620 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1488,11 +1488,14 @@ static inline __poll_t iucv_accept_poll(struct sock *parent)
 	return 0;
 }
 
-static __poll_t iucv_sock_poll_mask(struct socket *sock, __poll_t events)
+__poll_t iucv_sock_poll(struct file *file, struct socket *sock,
+			    poll_table *wait)
 {
 	struct sock *sk = sock->sk;
 	__poll_t mask = 0;
 
+	sock_poll_wait(file, sk_sleep(sk), wait);
+
 	if (sk->sk_state == IUCV_LISTEN)
 		return iucv_accept_poll(sk);
 
@@ -2385,7 +2388,7 @@ static const struct proto_ops iucv_sock_ops = {
 	.getname	= iucv_sock_getname,
 	.sendmsg	= iucv_sock_sendmsg,
 	.recvmsg	= iucv_sock_recvmsg,
-	.poll_mask	= iucv_sock_poll_mask,
+	.poll		= iucv_sock_poll,
 	.ioctl		= sock_no_ioctl,
 	.mmap		= sock_no_mmap,
 	.socketpair	= sock_no_socketpair,
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 84b7d5c6fec8..d3601d421571 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1336,9 +1336,9 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
 	struct list_head *head;
 	int index = 0;
 
-	/* For SOCK_SEQPACKET sock type, datagram_poll_mask checks the sk_state,
-	 * so  we set sk_state, otherwise epoll_wait always returns right away
-	 * with EPOLLHUP
+	/* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so
+	 * we set sk_state, otherwise epoll_wait always returns right away with
+	 * EPOLLHUP
 	 */
 	kcm->sk.sk_state = TCP_ESTABLISHED;
 
@@ -1903,7 +1903,7 @@ static const struct proto_ops kcm_dgram_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	sock_no_getname,
-	.poll_mask =	datagram_poll_mask,
+	.poll =		datagram_poll,
 	.ioctl =	kcm_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
@@ -1924,7 +1924,7 @@ static const struct proto_ops kcm_seqpacket_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	sock_no_getname,
-	.poll_mask =	datagram_poll_mask,
+	.poll =		datagram_poll,
 	.ioctl =	kcm_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 8bdc1cbe490a..5e1d2946ffbf 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -3751,7 +3751,7 @@ static const struct proto_ops pfkey_ops = {
 
 	/* Now the operations that really occur. */
 	.release	=	pfkey_release,
-	.poll_mask	=	datagram_poll_mask,
+	.poll		=	datagram_poll,
 	.sendmsg	=	pfkey_sendmsg,
 	.recvmsg	=	pfkey_recvmsg,
 };
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 181073bf6925..a9c05b2bc1b0 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -613,7 +613,7 @@ static const struct proto_ops l2tp_ip_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = sock_no_accept,
 	.getname	   = l2tp_ip_getname,
-	.poll_mask	   = datagram_poll_mask,
+	.poll		   = datagram_poll,
 	.ioctl		   = inet_ioctl,
 	.listen		   = sock_no_listen,
 	.shutdown	   = inet_shutdown,
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 336e4c00abbc..957369192ca1 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -754,7 +754,7 @@ static const struct proto_ops l2tp_ip6_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = sock_no_accept,
 	.getname	   = l2tp_ip6_getname,
-	.poll_mask	   = datagram_poll_mask,
+	.poll		   = datagram_poll,
 	.ioctl		   = inet6_ioctl,
 	.listen		   = sock_no_listen,
 	.shutdown	   = inet_shutdown,
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 55188382845c..e398797878a9 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -1818,7 +1818,7 @@ static const struct proto_ops pppol2tp_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= sock_no_accept,
 	.getname	= pppol2tp_getname,
-	.poll_mask	= datagram_poll_mask,
+	.poll		= datagram_poll,
 	.listen		= sock_no_listen,
 	.shutdown	= sock_no_shutdown,
 	.setsockopt	= pppol2tp_setsockopt,
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 804de8490186..1beeea9549fa 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -1192,7 +1192,7 @@ static const struct proto_ops llc_ui_ops = {
 	.socketpair  = sock_no_socketpair,
 	.accept      = llc_ui_accept,
 	.getname     = llc_ui_getname,
-	.poll_mask   = datagram_poll_mask,
+	.poll	     = datagram_poll,
 	.ioctl       = llc_ui_ioctl,
 	.listen      = llc_ui_listen,
 	.shutdown    = llc_ui_shutdown,
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 1189b84413d5..393573a99a5a 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2658,7 +2658,7 @@ static const struct proto_ops netlink_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	netlink_getname,
-	.poll_mask =	datagram_poll_mask,
+	.poll =		datagram_poll,
 	.ioctl =	netlink_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 93fbcafbf388..03f37c4e64fe 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -1355,7 +1355,7 @@ static const struct proto_ops nr_proto_ops = {
 	.socketpair	=	sock_no_socketpair,
 	.accept		=	nr_accept,
 	.getname	=	nr_getname,
-	.poll_mask	=	datagram_poll_mask,
+	.poll		=	datagram_poll,
 	.ioctl		=	nr_ioctl,
 	.listen		=	nr_listen,
 	.shutdown	=	sock_no_shutdown,
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index ab5bb14b49af..ea0c0c6f1874 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -548,13 +548,16 @@ static inline __poll_t llcp_accept_poll(struct sock *parent)
 	return 0;
 }
 
-static __poll_t llcp_sock_poll_mask(struct socket *sock, __poll_t events)
+static __poll_t llcp_sock_poll(struct file *file, struct socket *sock,
+				   poll_table *wait)
 {
 	struct sock *sk = sock->sk;
 	__poll_t mask = 0;
 
 	pr_debug("%p\n", sk);
 
+	sock_poll_wait(file, sk_sleep(sk), wait);
+
 	if (sk->sk_state == LLCP_LISTEN)
 		return llcp_accept_poll(sk);
 
@@ -896,7 +899,7 @@ static const struct proto_ops llcp_sock_ops = {
 	.socketpair     = sock_no_socketpair,
 	.accept         = llcp_sock_accept,
 	.getname        = llcp_sock_getname,
-	.poll_mask      = llcp_sock_poll_mask,
+	.poll           = llcp_sock_poll,
 	.ioctl          = sock_no_ioctl,
 	.listen         = llcp_sock_listen,
 	.shutdown       = sock_no_shutdown,
@@ -916,7 +919,7 @@ static const struct proto_ops llcp_rawsock_ops = {
 	.socketpair     = sock_no_socketpair,
 	.accept         = sock_no_accept,
 	.getname        = llcp_sock_getname,
-	.poll_mask      = llcp_sock_poll_mask,
+	.poll           = llcp_sock_poll,
 	.ioctl          = sock_no_ioctl,
 	.listen         = sock_no_listen,
 	.shutdown       = sock_no_shutdown,
diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c
index 60c322531c49..e2188deb08dc 100644
--- a/net/nfc/rawsock.c
+++ b/net/nfc/rawsock.c
@@ -284,7 +284,7 @@ static const struct proto_ops rawsock_ops = {
 	.socketpair     = sock_no_socketpair,
 	.accept         = sock_no_accept,
 	.getname        = sock_no_getname,
-	.poll_mask      = datagram_poll_mask,
+	.poll           = datagram_poll,
 	.ioctl          = sock_no_ioctl,
 	.listen         = sock_no_listen,
 	.shutdown       = sock_no_shutdown,
@@ -304,7 +304,7 @@ static const struct proto_ops rawsock_raw_ops = {
 	.socketpair     = sock_no_socketpair,
 	.accept         = sock_no_accept,
 	.getname        = sock_no_getname,
-	.poll_mask      = datagram_poll_mask,
+	.poll           = datagram_poll,
 	.ioctl          = sock_no_ioctl,
 	.listen         = sock_no_listen,
 	.shutdown       = sock_no_shutdown,
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index ff8e7e245c37..57634bc3da74 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -4076,11 +4076,12 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd,
 	return 0;
 }
 
-static __poll_t packet_poll_mask(struct socket *sock, __poll_t events)
+static __poll_t packet_poll(struct file *file, struct socket *sock,
+				poll_table *wait)
 {
 	struct sock *sk = sock->sk;
 	struct packet_sock *po = pkt_sk(sk);
-	__poll_t mask = datagram_poll_mask(sock, events);
+	__poll_t mask = datagram_poll(file, sock, wait);
 
 	spin_lock_bh(&sk->sk_receive_queue.lock);
 	if (po->rx_ring.pg_vec) {
@@ -4422,7 +4423,7 @@ static const struct proto_ops packet_ops_spkt = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	packet_getname_spkt,
-	.poll_mask =	datagram_poll_mask,
+	.poll =		datagram_poll,
 	.ioctl =	packet_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
@@ -4443,7 +4444,7 @@ static const struct proto_ops packet_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	packet_getname,
-	.poll_mask =	packet_poll_mask,
+	.poll =		packet_poll,
 	.ioctl =	packet_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index c295c4e20f01..30187990257f 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -340,12 +340,15 @@ static int pn_socket_getname(struct socket *sock, struct sockaddr *addr,
 	return sizeof(struct sockaddr_pn);
 }
 
-static __poll_t pn_socket_poll_mask(struct socket *sock, __poll_t events)
+static __poll_t pn_socket_poll(struct file *file, struct socket *sock,
+					poll_table *wait)
 {
 	struct sock *sk = sock->sk;
 	struct pep_sock *pn = pep_sk(sk);
 	__poll_t mask = 0;
 
+	poll_wait(file, sk_sleep(sk), wait);
+
 	if (sk->sk_state == TCP_CLOSE)
 		return EPOLLERR;
 	if (!skb_queue_empty(&sk->sk_receive_queue))
@@ -445,7 +448,7 @@ const struct proto_ops phonet_dgram_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= sock_no_accept,
 	.getname	= pn_socket_getname,
-	.poll_mask	= datagram_poll_mask,
+	.poll		= datagram_poll,
 	.ioctl		= pn_socket_ioctl,
 	.listen		= sock_no_listen,
 	.shutdown	= sock_no_shutdown,
@@ -470,7 +473,7 @@ const struct proto_ops phonet_stream_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= pn_socket_accept,
 	.getname	= pn_socket_getname,
-	.poll_mask	= pn_socket_poll_mask,
+	.poll		= pn_socket_poll,
 	.ioctl		= pn_socket_ioctl,
 	.listen		= pn_socket_listen,
 	.shutdown	= sock_no_shutdown,
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index 1b5025ea5b04..2aa07b547b16 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -1023,7 +1023,7 @@ static const struct proto_ops qrtr_proto_ops = {
 	.recvmsg	= qrtr_recvmsg,
 	.getname	= qrtr_getname,
 	.ioctl		= qrtr_ioctl,
-	.poll_mask	= datagram_poll_mask,
+	.poll		= datagram_poll,
 	.shutdown	= sock_no_shutdown,
 	.setsockopt	= sock_no_setsockopt,
 	.getsockopt	= sock_no_getsockopt,
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index ebe42e7eb456..d00a0ef39a56 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -1470,7 +1470,7 @@ static const struct proto_ops rose_proto_ops = {
 	.socketpair	=	sock_no_socketpair,
 	.accept		=	rose_accept,
 	.getname	=	rose_getname,
-	.poll_mask	=	datagram_poll_mask,
+	.poll		=	datagram_poll,
 	.ioctl		=	rose_ioctl,
 	.listen		=	rose_listen,
 	.shutdown	=	sock_no_shutdown,
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 3b1ac93efee2..2b463047dd7b 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -734,11 +734,15 @@ static int rxrpc_getsockopt(struct socket *sock, int level, int optname,
 /*
  * permit an RxRPC socket to be polled
  */
-static __poll_t rxrpc_poll_mask(struct socket *sock, __poll_t events)
+static __poll_t rxrpc_poll(struct file *file, struct socket *sock,
+			       poll_table *wait)
 {
 	struct sock *sk = sock->sk;
 	struct rxrpc_sock *rx = rxrpc_sk(sk);
-	__poll_t mask = 0;
+	__poll_t mask;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+	mask = 0;
 
 	/* the socket is readable if there are any messages waiting on the Rx
 	 * queue */
@@ -945,7 +949,7 @@ static const struct proto_ops rxrpc_rpc_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= sock_no_accept,
 	.getname	= sock_no_getname,
-	.poll_mask	= rxrpc_poll_mask,
+	.poll		= rxrpc_poll,
 	.ioctl		= sock_no_ioctl,
 	.listen		= rxrpc_listen,
 	.shutdown	= rxrpc_shutdown,
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 7339918a805d..0cd2e764f47f 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -1010,7 +1010,7 @@ static const struct proto_ops inet6_seqpacket_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = inet_accept,
 	.getname	   = sctp_getname,
-	.poll_mask	   = sctp_poll_mask,
+	.poll		   = sctp_poll,
 	.ioctl		   = inet6_ioctl,
 	.listen		   = sctp_inet_listen,
 	.shutdown	   = inet_shutdown,
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 5dffbc493008..67f73d3a1356 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1016,7 +1016,7 @@ static const struct proto_ops inet_seqpacket_ops = {
 	.socketpair	   = sock_no_socketpair,
 	.accept		   = inet_accept,
 	.getname	   = inet_getname,	/* Semantics are different.  */
-	.poll_mask	   = sctp_poll_mask,
+	.poll		   = sctp_poll,
 	.ioctl		   = inet_ioctl,
 	.listen		   = sctp_inet_listen,
 	.shutdown	   = inet_shutdown,	/* Looks harmless.  */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index d20f7addee19..ce620e878538 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -7717,12 +7717,14 @@ out:
  * here, again, by modeling the current TCP/UDP code.  We don't have
  * a good way to test with it yet.
  */
-__poll_t sctp_poll_mask(struct socket *sock, __poll_t events)
+__poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
 {
 	struct sock *sk = sock->sk;
 	struct sctp_sock *sp = sctp_sk(sk);
 	__poll_t mask;
 
+	poll_wait(file, sk_sleep(sk), wait);
+
 	sock_rps_record_flow(sk);
 
 	/* A TCP-style listening socket becomes readable when the accept queue
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index da7f02edcd37..973b4471b532 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1273,7 +1273,8 @@ static __poll_t smc_accept_poll(struct sock *parent)
 	return mask;
 }
 
-static __poll_t smc_poll_mask(struct socket *sock, __poll_t events)
+static __poll_t smc_poll(struct file *file, struct socket *sock,
+			     poll_table *wait)
 {
 	struct sock *sk = sock->sk;
 	__poll_t mask = 0;
@@ -1289,7 +1290,7 @@ static __poll_t smc_poll_mask(struct socket *sock, __poll_t events)
 	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
 		/* delegate to CLC child sock */
 		release_sock(sk);
-		mask = smc->clcsock->ops->poll_mask(smc->clcsock, events);
+		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
 		lock_sock(sk);
 		sk->sk_err = smc->clcsock->sk->sk_err;
 		if (sk->sk_err) {
@@ -1307,6 +1308,11 @@ static __poll_t smc_poll_mask(struct socket *sock, __poll_t events)
 			}
 		}
 	} else {
+		if (sk->sk_state != SMC_CLOSED) {
+			release_sock(sk);
+			sock_poll_wait(file, sk_sleep(sk), wait);
+			lock_sock(sk);
+		}
 		if (sk->sk_err)
 			mask |= EPOLLERR;
 		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
@@ -1619,7 +1625,7 @@ static const struct proto_ops smc_sock_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= smc_accept,
 	.getname	= smc_getname,
-	.poll_mask	= smc_poll_mask,
+	.poll		= smc_poll,
 	.ioctl		= smc_ioctl,
 	.listen		= smc_listen,
 	.shutdown	= smc_shutdown,
diff --git a/net/socket.c b/net/socket.c
index 8a109012608a..a564c6ed19d5 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -117,10 +117,8 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from);
 static int sock_mmap(struct file *file, struct vm_area_struct *vma);
 
 static int sock_close(struct inode *inode, struct file *file);
-static struct wait_queue_head *sock_get_poll_head(struct file *file,
-		__poll_t events);
-static __poll_t sock_poll_mask(struct file *file, __poll_t);
-static __poll_t sock_poll(struct file *file, struct poll_table_struct *wait);
+static __poll_t sock_poll(struct file *file,
+			      struct poll_table_struct *wait);
 static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 #ifdef CONFIG_COMPAT
 static long compat_sock_ioctl(struct file *file,
@@ -143,8 +141,6 @@ static const struct file_operations socket_file_ops = {
 	.llseek =	no_llseek,
 	.read_iter =	sock_read_iter,
 	.write_iter =	sock_write_iter,
-	.get_poll_head = sock_get_poll_head,
-	.poll_mask =	sock_poll_mask,
 	.poll =		sock_poll,
 	.unlocked_ioctl = sock_ioctl,
 #ifdef CONFIG_COMPAT
@@ -1130,48 +1126,14 @@ out_release:
 }
 EXPORT_SYMBOL(sock_create_lite);
 
-static struct wait_queue_head *sock_get_poll_head(struct file *file,
-		__poll_t events)
-{
-	struct socket *sock = file->private_data;
-
-	if (!sock->ops->poll_mask)
-		return NULL;
-	sock_poll_busy_loop(sock, events);
-	return sk_sleep(sock->sk);
-}
-
-static __poll_t sock_poll_mask(struct file *file, __poll_t events)
-{
-	struct socket *sock = file->private_data;
-
-	/*
-	 * We need to be sure we are in sync with the socket flags modification.
-	 *
-	 * This memory barrier is paired in the wq_has_sleeper.
-	 */
-	smp_mb();
-
-	/* this socket can poll_ll so tell the system call */
-	return sock->ops->poll_mask(sock, events) |
-		(sk_can_busy_loop(sock->sk) ? POLL_BUSY_LOOP : 0);
-}
-
 /* No kernel lock held - perfect */
 static __poll_t sock_poll(struct file *file, poll_table *wait)
 {
 	struct socket *sock = file->private_data;
-	__poll_t events = poll_requested_events(wait), mask = 0;
-
-	if (sock->ops->poll) {
-		sock_poll_busy_loop(sock, events);
-		mask = sock->ops->poll(file, sock, wait);
-	} else if (sock->ops->poll_mask) {
-		sock_poll_wait(file, sock_get_poll_head(file, events), wait);
-		mask = sock->ops->poll_mask(sock, events);
-	}
+	__poll_t events = poll_requested_events(wait);
 
-	return mask | sock_poll_busy_flag(sock);
+	sock_poll_busy_loop(sock, events);
+	return sock->ops->poll(file, sock, wait) | sock_poll_busy_flag(sock);
 }
 
 static int sock_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 14a5d055717d..930852c54d7a 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -692,9 +692,10 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
 }
 
 /**
- * tipc_poll - read pollmask
+ * tipc_poll - read and possibly block on pollmask
  * @file: file structure associated with the socket
  * @sock: socket for which to calculate the poll bits
+ * @wait: ???
  *
  * Returns pollmask value
  *
@@ -708,12 +709,15 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
  * imply that the operation will succeed, merely that it should be performed
  * and will not block.
  */
-static __poll_t tipc_poll_mask(struct socket *sock, __poll_t events)
+static __poll_t tipc_poll(struct file *file, struct socket *sock,
+			      poll_table *wait)
 {
 	struct sock *sk = sock->sk;
 	struct tipc_sock *tsk = tipc_sk(sk);
 	__poll_t revents = 0;
 
+	sock_poll_wait(file, sk_sleep(sk), wait);
+
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
 		revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
@@ -3033,7 +3037,7 @@ static const struct proto_ops msg_ops = {
 	.socketpair	= tipc_socketpair,
 	.accept		= sock_no_accept,
 	.getname	= tipc_getname,
-	.poll_mask	= tipc_poll_mask,
+	.poll		= tipc_poll,
 	.ioctl		= tipc_ioctl,
 	.listen		= sock_no_listen,
 	.shutdown	= tipc_shutdown,
@@ -3054,7 +3058,7 @@ static const struct proto_ops packet_ops = {
 	.socketpair	= tipc_socketpair,
 	.accept		= tipc_accept,
 	.getname	= tipc_getname,
-	.poll_mask	= tipc_poll_mask,
+	.poll		= tipc_poll,
 	.ioctl		= tipc_ioctl,
 	.listen		= tipc_listen,
 	.shutdown	= tipc_shutdown,
@@ -3075,7 +3079,7 @@ static const struct proto_ops stream_ops = {
 	.socketpair	= tipc_socketpair,
 	.accept		= tipc_accept,
 	.getname	= tipc_getname,
-	.poll_mask	= tipc_poll_mask,
+	.poll		= tipc_poll,
 	.ioctl		= tipc_ioctl,
 	.listen		= tipc_listen,
 	.shutdown	= tipc_shutdown,
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index a127d61e8af9..301f22430469 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -712,7 +712,7 @@ static int __init tls_register(void)
 	build_protos(tls_prots[TLSV4], &tcp_prot);
 
 	tls_sw_proto_ops = inet_stream_ops;
-	tls_sw_proto_ops.poll_mask = tls_sw_poll_mask;
+	tls_sw_proto_ops.poll = tls_sw_poll;
 	tls_sw_proto_ops.splice_read = tls_sw_splice_read;
 
 #ifdef CONFIG_TLS_DEVICE
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index f127fac88acf..d2380548f8f6 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -919,22 +919,23 @@ splice_read_end:
 	return copied ? : err;
 }
 
-__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events)
+unsigned int tls_sw_poll(struct file *file, struct socket *sock,
+			 struct poll_table_struct *wait)
 {
+	unsigned int ret;
 	struct sock *sk = sock->sk;
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
-	__poll_t mask;
 
-	/* Grab EPOLLOUT and EPOLLHUP from the underlying socket */
-	mask = ctx->sk_poll_mask(sock, events);
+	/* Grab POLLOUT and POLLHUP from the underlying socket */
+	ret = ctx->sk_poll(file, sock, wait);
 
-	/* Clear EPOLLIN bits, and set based on recv_pkt */
-	mask &= ~(EPOLLIN | EPOLLRDNORM);
+	/* Clear POLLIN bits, and set based on recv_pkt */
+	ret &= ~(POLLIN | POLLRDNORM);
 	if (ctx->recv_pkt)
-		mask |= EPOLLIN | EPOLLRDNORM;
+		ret |= POLLIN | POLLRDNORM;
 
-	return mask;
+	return ret;
 }
 
 static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
@@ -1191,7 +1192,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		sk->sk_data_ready = tls_data_ready;
 		write_unlock_bh(&sk->sk_callback_lock);
 
-		sw_ctx_rx->sk_poll_mask = sk->sk_socket->ops->poll_mask;
+		sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll;
 
 		strp_check_rcv(&sw_ctx_rx->strp);
 	}
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 95b02a71fd47..e5473c03d667 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -638,8 +638,9 @@ static int unix_stream_connect(struct socket *, struct sockaddr *,
 static int unix_socketpair(struct socket *, struct socket *);
 static int unix_accept(struct socket *, struct socket *, int, bool);
 static int unix_getname(struct socket *, struct sockaddr *, int);
-static __poll_t unix_poll_mask(struct socket *, __poll_t);
-static __poll_t unix_dgram_poll_mask(struct socket *, __poll_t);
+static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
+static __poll_t unix_dgram_poll(struct file *, struct socket *,
+				    poll_table *);
 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 static int unix_shutdown(struct socket *, int);
 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
@@ -680,7 +681,7 @@ static const struct proto_ops unix_stream_ops = {
 	.socketpair =	unix_socketpair,
 	.accept =	unix_accept,
 	.getname =	unix_getname,
-	.poll_mask =	unix_poll_mask,
+	.poll =		unix_poll,
 	.ioctl =	unix_ioctl,
 	.listen =	unix_listen,
 	.shutdown =	unix_shutdown,
@@ -703,7 +704,7 @@ static const struct proto_ops unix_dgram_ops = {
 	.socketpair =	unix_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	unix_getname,
-	.poll_mask =	unix_dgram_poll_mask,
+	.poll =		unix_dgram_poll,
 	.ioctl =	unix_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	unix_shutdown,
@@ -725,7 +726,7 @@ static const struct proto_ops unix_seqpacket_ops = {
 	.socketpair =	unix_socketpair,
 	.accept =	unix_accept,
 	.getname =	unix_getname,
-	.poll_mask =	unix_dgram_poll_mask,
+	.poll =		unix_dgram_poll,
 	.ioctl =	unix_ioctl,
 	.listen =	unix_listen,
 	.shutdown =	unix_shutdown,
@@ -2629,10 +2630,13 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	return err;
 }
 
-static __poll_t unix_poll_mask(struct socket *sock, __poll_t events)
+static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
 {
 	struct sock *sk = sock->sk;
-	__poll_t mask = 0;
+	__poll_t mask;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+	mask = 0;
 
 	/* exceptional events? */
 	if (sk->sk_err)
@@ -2661,11 +2665,15 @@ static __poll_t unix_poll_mask(struct socket *sock, __poll_t events)
 	return mask;
 }
 
-static __poll_t unix_dgram_poll_mask(struct socket *sock, __poll_t events)
+static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
+				    poll_table *wait)
 {
 	struct sock *sk = sock->sk, *other;
-	int writable;
-	__poll_t mask = 0;
+	unsigned int writable;
+	__poll_t mask;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+	mask = 0;
 
 	/* exceptional events? */
 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
@@ -2691,7 +2699,7 @@ static __poll_t unix_dgram_poll_mask(struct socket *sock, __poll_t events)
 	}
 
 	/* No write status requested, avoid expensive OUT tests. */
-	if (!(events & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
+	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
 		return mask;
 
 	writable = unix_writable(sk);
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index bb5d5fa68c35..c1076c19b858 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -850,11 +850,18 @@ static int vsock_shutdown(struct socket *sock, int mode)
 	return err;
 }
 
-static __poll_t vsock_poll_mask(struct socket *sock, __poll_t events)
+static __poll_t vsock_poll(struct file *file, struct socket *sock,
+			       poll_table *wait)
 {
-	struct sock *sk = sock->sk;
-	struct vsock_sock *vsk = vsock_sk(sk);
-	__poll_t mask = 0;
+	struct sock *sk;
+	__poll_t mask;
+	struct vsock_sock *vsk;
+
+	sk = sock->sk;
+	vsk = vsock_sk(sk);
+
+	poll_wait(file, sk_sleep(sk), wait);
+	mask = 0;
 
 	if (sk->sk_err)
 		/* Signify that there has been an error on this socket. */
@@ -1084,7 +1091,7 @@ static const struct proto_ops vsock_dgram_ops = {
 	.socketpair = sock_no_socketpair,
 	.accept = sock_no_accept,
 	.getname = vsock_getname,
-	.poll_mask = vsock_poll_mask,
+	.poll = vsock_poll,
 	.ioctl = sock_no_ioctl,
 	.listen = sock_no_listen,
 	.shutdown = vsock_shutdown,
@@ -1842,7 +1849,7 @@ static const struct proto_ops vsock_stream_ops = {
 	.socketpair = sock_no_socketpair,
 	.accept = vsock_accept,
 	.getname = vsock_getname,
-	.poll_mask = vsock_poll_mask,
+	.poll = vsock_poll,
 	.ioctl = sock_no_ioctl,
 	.listen = vsock_listen,
 	.shutdown = vsock_shutdown,
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index f93365ae0fdd..d49aa79b7997 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -1750,7 +1750,7 @@ static const struct proto_ops x25_proto_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	x25_accept,
 	.getname =	x25_getname,
-	.poll_mask =	datagram_poll_mask,
+	.poll =		datagram_poll,
 	.ioctl =	x25_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = compat_x25_ioctl,
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 3b3410ada097..59fb7d3c36a3 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -303,9 +303,10 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 	return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len);
 }
 
-static __poll_t xsk_poll_mask(struct socket *sock, __poll_t events)
+static unsigned int xsk_poll(struct file *file, struct socket *sock,
+			     struct poll_table_struct *wait)
 {
-	__poll_t mask = datagram_poll_mask(sock, events);
+	unsigned int mask = datagram_poll(file, sock, wait);
 	struct sock *sk = sock->sk;
 	struct xdp_sock *xs = xdp_sk(sk);
 
@@ -696,7 +697,7 @@ static const struct proto_ops xsk_proto_ops = {
 	.socketpair	= sock_no_socketpair,
 	.accept		= sock_no_accept,
 	.getname	= sock_no_getname,
-	.poll_mask	= xsk_poll_mask,
+	.poll		= xsk_poll,
 	.ioctl		= sock_no_ioctl,
 	.listen		= sock_no_listen,
 	.shutdown	= sock_no_shutdown,
-- 
cgit v1.2.3


From e7c7faa936680a2f0e78fc6d9683a5728421a150 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Thu, 28 Jun 2018 13:36:55 -0700
Subject: net/ipv6: Fix updates to prefix route

Sowmini reported that a recent commit broke prefix routes for linklocal
addresses. The newly added modify_prefix_route is attempting to add a
new prefix route when the ifp priority does not match the route metric
however the check needs to account for the default priority. In addition,
the route add fails because the route already exists, and then the delete
removes the one that exists. Flip the order to do the delete first.

Fixes: 8308f3ff1753 ("net/ipv6: Add support for specifying metric of connected routes")
Reported-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Tested-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index c134286d6a41..91580c62bb86 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4528,6 +4528,7 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp,
 			       unsigned long expires, u32 flags)
 {
 	struct fib6_info *f6i;
+	u32 prio;
 
 	f6i = addrconf_get_prefix_route(&ifp->addr,
 					ifp->prefix_len,
@@ -4536,13 +4537,15 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp,
 	if (!f6i)
 		return -ENOENT;
 
-	if (f6i->fib6_metric != ifp->rt_priority) {
+	prio = ifp->rt_priority ? : IP6_RT_PRIO_ADDRCONF;
+	if (f6i->fib6_metric != prio) {
+		/* delete old one */
+		ip6_del_rt(dev_net(ifp->idev->dev), f6i);
+
 		/* add new one */
 		addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
 				      ifp->rt_priority, ifp->idev->dev,
 				      expires, flags, GFP_KERNEL);
-		/* delete old one */
-		ip6_del_rt(dev_net(ifp->idev->dev), f6i);
 	} else {
 		if (!expires)
 			fib6_clean_expires(f6i);
-- 
cgit v1.2.3


From fc9c2029e37c3ae9efc28bf47045e0b87e09660c Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 30 Jun 2018 15:26:56 -0700
Subject: ipv6: sr: fix passing wrong flags to crypto_alloc_shash()

The 'mask' argument to crypto_alloc_shash() uses the CRYPTO_ALG_* flags,
not 'gfp_t'.  So don't pass GFP_KERNEL to it.

Fixes: bf355b8d2c30 ("ipv6: sr: add core files for SR HMAC support")
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/seg6_hmac.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index 33fb35cbfac1..558fe8cc6d43 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -373,7 +373,7 @@ static int seg6_hmac_init_algo(void)
 			return -ENOMEM;
 
 		for_each_possible_cpu(cpu) {
-			tfm = crypto_alloc_shash(algo->name, 0, GFP_KERNEL);
+			tfm = crypto_alloc_shash(algo->name, 0, 0);
 			if (IS_ERR(tfm))
 				return PTR_ERR(tfm);
 			p_tfm = per_cpu_ptr(algo->tfms, cpu);
-- 
cgit v1.2.3


From 33bd5ac54dc47e002da4a395aaf9bf158dd17709 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 3 Jul 2018 14:36:21 -0700
Subject: net/ipv6: Revert attempt to simplify route replace and append

NetworkManager likes to manage linklocal prefix routes and does so with
the NLM_F_APPEND flag, breaking attempts to simplify the IPv6 route
code and by extension enable multipath routes with device only nexthops.

Revert f34436a43092 and these followup patches:
6eba08c3626b ("ipv6: Only emit append events for appended routes").
ce45bded6435 ("mlxsw: spectrum_router: Align with new route replace logic")
53b562df8c20 ("mlxsw: spectrum_router: Allow appending to dev-only routes")

Update the fib_tests cases to reflect the old behavior.

Fixes: f34436a43092 ("net/ipv6: Simplify route replace and appending into multipath route")
Signed-off-by: David Ahern <dsahern@gmail.com>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  |  48 +++----
 include/net/ip6_route.h                            |   6 +
 net/ipv6/ip6_fib.c                                 | 156 ++++++++++++---------
 net/ipv6/route.c                                   |   3 +-
 tools/testing/selftests/net/fib_tests.sh           |  41 ------
 5 files changed, 117 insertions(+), 137 deletions(-)

(limited to 'net/ipv6')

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 6aaaf3d9ba31..77b2adb29341 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -4756,6 +4756,12 @@ static void mlxsw_sp_rt6_destroy(struct mlxsw_sp_rt6 *mlxsw_sp_rt6)
 	kfree(mlxsw_sp_rt6);
 }
 
+static bool mlxsw_sp_fib6_rt_can_mp(const struct fib6_info *rt)
+{
+	/* RTF_CACHE routes are ignored */
+	return (rt->fib6_flags & (RTF_GATEWAY | RTF_ADDRCONF)) == RTF_GATEWAY;
+}
+
 static struct fib6_info *
 mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry)
 {
@@ -4765,11 +4771,11 @@ mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry)
 
 static struct mlxsw_sp_fib6_entry *
 mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node,
-				 const struct fib6_info *nrt, bool append)
+				 const struct fib6_info *nrt, bool replace)
 {
 	struct mlxsw_sp_fib6_entry *fib6_entry;
 
-	if (!append)
+	if (!mlxsw_sp_fib6_rt_can_mp(nrt) || replace)
 		return NULL;
 
 	list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) {
@@ -4784,7 +4790,8 @@ mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node,
 			break;
 		if (rt->fib6_metric < nrt->fib6_metric)
 			continue;
-		if (rt->fib6_metric == nrt->fib6_metric)
+		if (rt->fib6_metric == nrt->fib6_metric &&
+		    mlxsw_sp_fib6_rt_can_mp(rt))
 			return fib6_entry;
 		if (rt->fib6_metric > nrt->fib6_metric)
 			break;
@@ -5163,7 +5170,7 @@ static struct mlxsw_sp_fib6_entry *
 mlxsw_sp_fib6_node_entry_find(const struct mlxsw_sp_fib_node *fib_node,
 			      const struct fib6_info *nrt, bool replace)
 {
-	struct mlxsw_sp_fib6_entry *fib6_entry;
+	struct mlxsw_sp_fib6_entry *fib6_entry, *fallback = NULL;
 
 	list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) {
 		struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry);
@@ -5172,13 +5179,18 @@ mlxsw_sp_fib6_node_entry_find(const struct mlxsw_sp_fib_node *fib_node,
 			continue;
 		if (rt->fib6_table->tb6_id != nrt->fib6_table->tb6_id)
 			break;
-		if (replace && rt->fib6_metric == nrt->fib6_metric)
-			return fib6_entry;
+		if (replace && rt->fib6_metric == nrt->fib6_metric) {
+			if (mlxsw_sp_fib6_rt_can_mp(rt) ==
+			    mlxsw_sp_fib6_rt_can_mp(nrt))
+				return fib6_entry;
+			if (mlxsw_sp_fib6_rt_can_mp(nrt))
+				fallback = fallback ?: fib6_entry;
+		}
 		if (rt->fib6_metric > nrt->fib6_metric)
-			return fib6_entry;
+			return fallback ?: fib6_entry;
 	}
 
-	return NULL;
+	return fallback;
 }
 
 static int
@@ -5304,8 +5316,7 @@ static void mlxsw_sp_fib6_entry_replace(struct mlxsw_sp *mlxsw_sp,
 }
 
 static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp,
-				    struct fib6_info *rt, bool replace,
-				    bool append)
+				    struct fib6_info *rt, bool replace)
 {
 	struct mlxsw_sp_fib6_entry *fib6_entry;
 	struct mlxsw_sp_fib_node *fib_node;
@@ -5331,7 +5342,7 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp,
 	/* Before creating a new entry, try to append route to an existing
 	 * multipath entry.
 	 */
-	fib6_entry = mlxsw_sp_fib6_node_mp_entry_find(fib_node, rt, append);
+	fib6_entry = mlxsw_sp_fib6_node_mp_entry_find(fib_node, rt, replace);
 	if (fib6_entry) {
 		err = mlxsw_sp_fib6_entry_nexthop_add(mlxsw_sp, fib6_entry, rt);
 		if (err)
@@ -5339,14 +5350,6 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp,
 		return 0;
 	}
 
-	/* We received an append event, yet did not find any route to
-	 * append to.
-	 */
-	if (WARN_ON(append)) {
-		err = -EINVAL;
-		goto err_fib6_entry_append;
-	}
-
 	fib6_entry = mlxsw_sp_fib6_entry_create(mlxsw_sp, fib_node, rt);
 	if (IS_ERR(fib6_entry)) {
 		err = PTR_ERR(fib6_entry);
@@ -5364,7 +5367,6 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp,
 err_fib6_node_entry_link:
 	mlxsw_sp_fib6_entry_destroy(mlxsw_sp, fib6_entry);
 err_fib6_entry_create:
-err_fib6_entry_append:
 err_fib6_entry_nexthop_add:
 	mlxsw_sp_fib_node_put(mlxsw_sp, fib_node);
 	return err;
@@ -5715,7 +5717,7 @@ static void mlxsw_sp_router_fib6_event_work(struct work_struct *work)
 	struct mlxsw_sp_fib_event_work *fib_work =
 		container_of(work, struct mlxsw_sp_fib_event_work, work);
 	struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp;
-	bool replace, append;
+	bool replace;
 	int err;
 
 	rtnl_lock();
@@ -5726,10 +5728,8 @@ static void mlxsw_sp_router_fib6_event_work(struct work_struct *work)
 	case FIB_EVENT_ENTRY_APPEND: /* fall through */
 	case FIB_EVENT_ENTRY_ADD:
 		replace = fib_work->event == FIB_EVENT_ENTRY_REPLACE;
-		append = fib_work->event == FIB_EVENT_ENTRY_APPEND;
 		err = mlxsw_sp_router_fib6_add(mlxsw_sp,
-					       fib_work->fen6_info.rt, replace,
-					       append);
+					       fib_work->fen6_info.rt, replace);
 		if (err)
 			mlxsw_sp_router_fib_abort(mlxsw_sp);
 		mlxsw_sp_rt6_release(fib_work->fen6_info.rt);
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 59656fc580df..7b9c82de11cc 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -66,6 +66,12 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
 }
 
+static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
+{
+	return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
+	       RTF_GATEWAY;
+}
+
 void ip6_route_input(struct sk_buff *skb);
 struct dst_entry *ip6_route_input_lookup(struct net *net,
 					 struct net_device *dev,
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 1fb2f3118d60..d212738e9d10 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -935,20 +935,19 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 {
 	struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
 				    lockdep_is_held(&rt->fib6_table->tb6_lock));
-	enum fib_event_type event = FIB_EVENT_ENTRY_ADD;
-	struct fib6_info *iter = NULL, *match = NULL;
+	struct fib6_info *iter = NULL;
 	struct fib6_info __rcu **ins;
+	struct fib6_info __rcu **fallback_ins = NULL;
 	int replace = (info->nlh &&
 		       (info->nlh->nlmsg_flags & NLM_F_REPLACE));
-	int append = (info->nlh &&
-		       (info->nlh->nlmsg_flags & NLM_F_APPEND));
 	int add = (!info->nlh ||
 		   (info->nlh->nlmsg_flags & NLM_F_CREATE));
 	int found = 0;
+	bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
 	u16 nlflags = NLM_F_EXCL;
 	int err;
 
-	if (append)
+	if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
 		nlflags |= NLM_F_APPEND;
 
 	ins = &fn->leaf;
@@ -970,8 +969,13 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 
 			nlflags &= ~NLM_F_EXCL;
 			if (replace) {
-				found++;
-				break;
+				if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
+					found++;
+					break;
+				}
+				if (rt_can_ecmp)
+					fallback_ins = fallback_ins ?: ins;
+				goto next_iter;
 			}
 
 			if (rt6_duplicate_nexthop(iter, rt)) {
@@ -986,51 +990,71 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 				fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu);
 				return -EEXIST;
 			}
-
-			/* first route that matches */
-			if (!match)
-				match = iter;
+			/* If we have the same destination and the same metric,
+			 * but not the same gateway, then the route we try to
+			 * add is sibling to this route, increment our counter
+			 * of siblings, and later we will add our route to the
+			 * list.
+			 * Only static routes (which don't have flag
+			 * RTF_EXPIRES) are used for ECMPv6.
+			 *
+			 * To avoid long list, we only had siblings if the
+			 * route have a gateway.
+			 */
+			if (rt_can_ecmp &&
+			    rt6_qualify_for_ecmp(iter))
+				rt->fib6_nsiblings++;
 		}
 
 		if (iter->fib6_metric > rt->fib6_metric)
 			break;
 
+next_iter:
 		ins = &iter->fib6_next;
 	}
 
+	if (fallback_ins && !found) {
+		/* No ECMP-able route found, replace first non-ECMP one */
+		ins = fallback_ins;
+		iter = rcu_dereference_protected(*ins,
+				    lockdep_is_held(&rt->fib6_table->tb6_lock));
+		found++;
+	}
+
 	/* Reset round-robin state, if necessary */
 	if (ins == &fn->leaf)
 		fn->rr_ptr = NULL;
 
 	/* Link this route to others same route. */
-	if (append && match) {
+	if (rt->fib6_nsiblings) {
+		unsigned int fib6_nsiblings;
 		struct fib6_info *sibling, *temp_sibling;
 
-		if (rt->fib6_flags & RTF_REJECT) {
-			NL_SET_ERR_MSG(extack,
-				       "Can not append a REJECT route");
-			return -EINVAL;
-		} else if (match->fib6_flags & RTF_REJECT) {
-			NL_SET_ERR_MSG(extack,
-				       "Can not append to a REJECT route");
-			return -EINVAL;
+		/* Find the first route that have the same metric */
+		sibling = leaf;
+		while (sibling) {
+			if (sibling->fib6_metric == rt->fib6_metric &&
+			    rt6_qualify_for_ecmp(sibling)) {
+				list_add_tail(&rt->fib6_siblings,
+					      &sibling->fib6_siblings);
+				break;
+			}
+			sibling = rcu_dereference_protected(sibling->fib6_next,
+				    lockdep_is_held(&rt->fib6_table->tb6_lock));
 		}
-		event = FIB_EVENT_ENTRY_APPEND;
-		rt->fib6_nsiblings = match->fib6_nsiblings;
-		list_add_tail(&rt->fib6_siblings, &match->fib6_siblings);
-		match->fib6_nsiblings++;
-
 		/* For each sibling in the list, increment the counter of
 		 * siblings. BUG() if counters does not match, list of siblings
 		 * is broken!
 		 */
+		fib6_nsiblings = 0;
 		list_for_each_entry_safe(sibling, temp_sibling,
-					 &match->fib6_siblings, fib6_siblings) {
+					 &rt->fib6_siblings, fib6_siblings) {
 			sibling->fib6_nsiblings++;
-			BUG_ON(sibling->fib6_nsiblings != match->fib6_nsiblings);
+			BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
+			fib6_nsiblings++;
 		}
-
-		rt6_multipath_rebalance(match);
+		BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
+		rt6_multipath_rebalance(temp_sibling);
 	}
 
 	/*
@@ -1043,8 +1067,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 add:
 		nlflags |= NLM_F_CREATE;
 
-		err = call_fib6_entry_notifiers(info->nl_net, event, rt,
-						extack);
+		err = call_fib6_entry_notifiers(info->nl_net,
+						FIB_EVENT_ENTRY_ADD,
+						rt, extack);
 		if (err)
 			return err;
 
@@ -1062,7 +1087,7 @@ add:
 		}
 
 	} else {
-		struct fib6_info *tmp;
+		int nsiblings;
 
 		if (!found) {
 			if (add)
@@ -1077,57 +1102,48 @@ add:
 		if (err)
 			return err;
 
-		/* if route being replaced has siblings, set tmp to
-		 * last one, otherwise tmp is current route. this is
-		 * used to set fib6_next for new route
-		 */
-		if (iter->fib6_nsiblings)
-			tmp = list_last_entry(&iter->fib6_siblings,
-					      struct fib6_info,
-					      fib6_siblings);
-		else
-			tmp = iter;
-
-		/* insert new route */
 		atomic_inc(&rt->fib6_ref);
 		rcu_assign_pointer(rt->fib6_node, fn);
-		rt->fib6_next = tmp->fib6_next;
+		rt->fib6_next = iter->fib6_next;
 		rcu_assign_pointer(*ins, rt);
-
 		if (!info->skip_notify)
 			inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
 		if (!(fn->fn_flags & RTN_RTINFO)) {
 			info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
 			fn->fn_flags |= RTN_RTINFO;
 		}
+		nsiblings = iter->fib6_nsiblings;
+		iter->fib6_node = NULL;
+		fib6_purge_rt(iter, fn, info->nl_net);
+		if (rcu_access_pointer(fn->rr_ptr) == iter)
+			fn->rr_ptr = NULL;
+		fib6_info_release(iter);
 
-		/* delete old route */
-		rt = iter;
-
-		if (rt->fib6_nsiblings) {
-			struct fib6_info *tmp;
-
+		if (nsiblings) {
 			/* Replacing an ECMP route, remove all siblings */
-			list_for_each_entry_safe(iter, tmp, &rt->fib6_siblings,
-						 fib6_siblings) {
-				iter->fib6_node = NULL;
-				fib6_purge_rt(iter, fn, info->nl_net);
-				if (rcu_access_pointer(fn->rr_ptr) == iter)
-					fn->rr_ptr = NULL;
-				fib6_info_release(iter);
-
-				rt->fib6_nsiblings--;
-				info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
+			ins = &rt->fib6_next;
+			iter = rcu_dereference_protected(*ins,
+				    lockdep_is_held(&rt->fib6_table->tb6_lock));
+			while (iter) {
+				if (iter->fib6_metric > rt->fib6_metric)
+					break;
+				if (rt6_qualify_for_ecmp(iter)) {
+					*ins = iter->fib6_next;
+					iter->fib6_node = NULL;
+					fib6_purge_rt(iter, fn, info->nl_net);
+					if (rcu_access_pointer(fn->rr_ptr) == iter)
+						fn->rr_ptr = NULL;
+					fib6_info_release(iter);
+					nsiblings--;
+					info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
+				} else {
+					ins = &iter->fib6_next;
+				}
+				iter = rcu_dereference_protected(*ins,
+					lockdep_is_held(&rt->fib6_table->tb6_lock));
 			}
+			WARN_ON(nsiblings != 0);
 		}
-
-		WARN_ON(rt->fib6_nsiblings != 0);
-
-		rt->fib6_node = NULL;
-		fib6_purge_rt(rt, fn, info->nl_net);
-		if (rcu_access_pointer(fn->rr_ptr) == rt)
-			fn->rr_ptr = NULL;
-		fib6_info_release(rt);
 	}
 
 	return 0;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 86a0e4333d42..63f99411f0de 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3842,7 +3842,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
 			lockdep_is_held(&rt->fib6_table->tb6_lock));
 	while (iter) {
 		if (iter->fib6_metric == rt->fib6_metric &&
-		    iter->fib6_nsiblings)
+		    rt6_qualify_for_ecmp(iter))
 			return iter;
 		iter = rcu_dereference_protected(iter->fib6_next,
 				lockdep_is_held(&rt->fib6_table->tb6_lock));
@@ -4439,7 +4439,6 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
 		 */
 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
 						     NLM_F_REPLACE);
-		cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND;
 		nhn++;
 	}
 
diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
index 78245d60d8bc..0f45633bd634 100755
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -740,13 +740,6 @@ ipv6_rt_add()
 	run_cmd "$IP -6 ro add unreachable 2001:db8:104::/64"
 	log_test $? 2 "Attempt to add duplicate route - reject route"
 
-	# iproute2 prepend only sets NLM_F_CREATE
-	# - adds a new route; does NOT convert existing route to ECMP
-	add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
-	run_cmd "$IP -6 ro prepend 2001:db8:104::/64 via 2001:db8:103::2"
-	check_route6 "2001:db8:104::/64 via 2001:db8:101::2 dev veth1 metric 1024 2001:db8:104::/64 via 2001:db8:103::2 dev veth3 metric 1024"
-	log_test $? 0 "Add new route for existing prefix (w/o NLM_F_EXCL)"
-
 	# route append with same prefix adds a new route
 	# - iproute2 sets NLM_F_CREATE | NLM_F_APPEND
 	add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
@@ -754,27 +747,6 @@ ipv6_rt_add()
 	check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
 	log_test $? 0 "Append nexthop to existing route - gw"
 
-	add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
-	run_cmd "$IP -6 ro append 2001:db8:104::/64 dev veth3"
-	check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop dev veth3 weight 1"
-	log_test $? 0 "Append nexthop to existing route - dev only"
-
-	# multipath route can not have a nexthop that is a reject route
-	add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
-	run_cmd "$IP -6 ro append unreachable 2001:db8:104::/64"
-	log_test $? 2 "Append nexthop to existing route - reject route"
-
-	# reject route can not be converted to multipath route
-	run_cmd "$IP -6 ro flush 2001:db8:104::/64"
-	run_cmd "$IP -6 ro add unreachable 2001:db8:104::/64"
-	run_cmd "$IP -6 ro append 2001:db8:104::/64 via 2001:db8:103::2"
-	log_test $? 2 "Append nexthop to existing reject route - gw"
-
-	run_cmd "$IP -6 ro flush 2001:db8:104::/64"
-	run_cmd "$IP -6 ro add unreachable 2001:db8:104::/64"
-	run_cmd "$IP -6 ro append 2001:db8:104::/64 dev veth3"
-	log_test $? 2 "Append nexthop to existing reject route - dev only"
-
 	# insert mpath directly
 	add_route6 "2001:db8:104::/64" "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
 	check_route6  "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
@@ -819,13 +791,6 @@ ipv6_rt_replace_single()
 	check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::3 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
 	log_test $? 0 "Single path with multipath"
 
-	# single path with reject
-	#
-	add_initial_route6 "nexthop via 2001:db8:101::2"
-	run_cmd "$IP -6 ro replace unreachable 2001:db8:104::/64"
-	check_route6 "unreachable 2001:db8:104::/64 dev lo metric 1024"
-	log_test $? 0 "Single path with reject route"
-
 	# single path with single path using MULTIPATH attribute
 	#
 	add_initial_route6 "via 2001:db8:101::2"
@@ -873,12 +838,6 @@ ipv6_rt_replace_mpath()
 	check_route6 "2001:db8:104::/64 via 2001:db8:101::3 dev veth1 metric 1024"
 	log_test $? 0 "Multipath with single path via multipath attribute"
 
-	# multipath with reject
-	add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
-	run_cmd "$IP -6 ro replace unreachable 2001:db8:104::/64"
-	check_route6 "unreachable 2001:db8:104::/64 dev lo metric 1024"
-	log_test $? 0 "Multipath with reject route"
-
 	# route replace fails - invalid nexthop 1
 	add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
 	run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:111::3 nexthop via 2001:db8:103::3"
-- 
cgit v1.2.3


From d376bef9c29b3c65aeee4e785fffcd97ef0a9a81 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 4 Jul 2018 20:25:32 +0200
Subject: netfilter: x_tables: set module owner for icmp(6) matches

nft_compat relies on xt_request_find_match to increment
refcount of the module that provides the match/target.

The (builtin) icmp matches did't set the module owner so it
was possible to rmmod ip(6)tables while icmp extensions were still in use.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ip_tables.c  | 1 +
 net/ipv6/netfilter/ip6_tables.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index ca0dad90803a..e77872c93c20 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1898,6 +1898,7 @@ static struct xt_match ipt_builtin_mt[] __read_mostly = {
 		.checkentry = icmp_checkentry,
 		.proto      = IPPROTO_ICMP,
 		.family     = NFPROTO_IPV4,
+		.me	    = THIS_MODULE,
 	},
 };
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 7eab959734bc..daf2e9e9193d 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1909,6 +1909,7 @@ static struct xt_match ip6t_builtin_mt[] __read_mostly = {
 		.checkentry = icmp6_checkentry,
 		.proto      = IPPROTO_ICMPV6,
 		.family     = NFPROTO_IPV6,
+		.me	    = THIS_MODULE,
 	},
 };
 
-- 
cgit v1.2.3


From a9ba23d48dbc6ffd08426bb10f05720e0b9f5c14 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Wed, 4 Jul 2018 09:58:05 -0400
Subject: ipv6: make ipv6_renew_options() interrupt/kernel safe

At present the ipv6_renew_options_kern() function ends up calling into
access_ok() which is problematic if done from inside an interrupt as
access_ok() calls WARN_ON_IN_IRQ() on some (all?) architectures
(x86-64 is affected).  Example warning/backtrace is shown below:

 WARNING: CPU: 1 PID: 3144 at lib/usercopy.c:11 _copy_from_user+0x85/0x90
 ...
 Call Trace:
  <IRQ>
  ipv6_renew_option+0xb2/0xf0
  ipv6_renew_options+0x26a/0x340
  ipv6_renew_options_kern+0x2c/0x40
  calipso_req_setattr+0x72/0xe0
  netlbl_req_setattr+0x126/0x1b0
  selinux_netlbl_inet_conn_request+0x80/0x100
  selinux_inet_conn_request+0x6d/0xb0
  security_inet_conn_request+0x32/0x50
  tcp_conn_request+0x35f/0xe00
  ? __lock_acquire+0x250/0x16c0
  ? selinux_socket_sock_rcv_skb+0x1ae/0x210
  ? tcp_rcv_state_process+0x289/0x106b
  tcp_rcv_state_process+0x289/0x106b
  ? tcp_v6_do_rcv+0x1a7/0x3c0
  tcp_v6_do_rcv+0x1a7/0x3c0
  tcp_v6_rcv+0xc82/0xcf0
  ip6_input_finish+0x10d/0x690
  ip6_input+0x45/0x1e0
  ? ip6_rcv_finish+0x1d0/0x1d0
  ipv6_rcv+0x32b/0x880
  ? ip6_make_skb+0x1e0/0x1e0
  __netif_receive_skb_core+0x6f2/0xdf0
  ? process_backlog+0x85/0x250
  ? process_backlog+0x85/0x250
  ? process_backlog+0xec/0x250
  process_backlog+0xec/0x250
  net_rx_action+0x153/0x480
  __do_softirq+0xd9/0x4f7
  do_softirq_own_stack+0x2a/0x40
  </IRQ>
  ...

While not present in the backtrace, ipv6_renew_option() ends up calling
access_ok() via the following chain:

  access_ok()
  _copy_from_user()
  copy_from_user()
  ipv6_renew_option()

The fix presented in this patch is to perform the userspace copy
earlier in the call chain such that it is only called when the option
data is actually coming from userspace; that place is
do_ipv6_setsockopt().  Not only does this solve the problem seen in
the backtrace above, it also allows us to simplify the code quite a
bit by removing ipv6_renew_options_kern() completely.  We also take
this opportunity to cleanup ipv6_renew_options()/ipv6_renew_option()
a small amount as well.

This patch is heavily based on a rough patch by Al Viro.  I've taken
his original patch, converted a kmemdup() call in do_ipv6_setsockopt()
to a memdup_user() call, made better use of the e_inval jump target in
the same function, and cleaned up the use ipv6_renew_option() by
ipv6_renew_options().

CC: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h       |   9 +---
 net/ipv6/calipso.c       |   9 ++--
 net/ipv6/exthdrs.c       | 111 +++++++++++++----------------------------------
 net/ipv6/ipv6_sockglue.c |  27 ++++++++----
 4 files changed, 53 insertions(+), 103 deletions(-)

(limited to 'net/ipv6')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 16475c269749..d02881e4ad1f 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -355,14 +355,7 @@ struct ipv6_txoptions *ipv6_dup_options(struct sock *sk,
 struct ipv6_txoptions *ipv6_renew_options(struct sock *sk,
 					  struct ipv6_txoptions *opt,
 					  int newtype,
-					  struct ipv6_opt_hdr __user *newopt,
-					  int newoptlen);
-struct ipv6_txoptions *
-ipv6_renew_options_kern(struct sock *sk,
-			struct ipv6_txoptions *opt,
-			int newtype,
-			struct ipv6_opt_hdr *newopt,
-			int newoptlen);
+					  struct ipv6_opt_hdr *newopt);
 struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
 					  struct ipv6_txoptions *opt);
 
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
index 1323b9679cf7..1c0bb9fb76e6 100644
--- a/net/ipv6/calipso.c
+++ b/net/ipv6/calipso.c
@@ -799,8 +799,7 @@ static int calipso_opt_update(struct sock *sk, struct ipv6_opt_hdr *hop)
 {
 	struct ipv6_txoptions *old = txopt_get(inet6_sk(sk)), *txopts;
 
-	txopts = ipv6_renew_options_kern(sk, old, IPV6_HOPOPTS,
-					 hop, hop ? ipv6_optlen(hop) : 0);
+	txopts = ipv6_renew_options(sk, old, IPV6_HOPOPTS, hop);
 	txopt_put(old);
 	if (IS_ERR(txopts))
 		return PTR_ERR(txopts);
@@ -1222,8 +1221,7 @@ static int calipso_req_setattr(struct request_sock *req,
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
-	txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS,
-					 new, new ? ipv6_optlen(new) : 0);
+	txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new);
 
 	kfree(new);
 
@@ -1260,8 +1258,7 @@ static void calipso_req_delattr(struct request_sock *req)
 	if (calipso_opt_del(req_inet->ipv6_opt->hopopt, &new))
 		return; /* Nothing to do */
 
-	txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS,
-					 new, new ? ipv6_optlen(new) : 0);
+	txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new);
 
 	if (!IS_ERR(txopts)) {
 		txopts = xchg(&req_inet->ipv6_opt, txopts);
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 5bc2bf3733ab..20291c2036fc 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -1015,29 +1015,21 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt)
 }
 EXPORT_SYMBOL_GPL(ipv6_dup_options);
 
-static int ipv6_renew_option(void *ohdr,
-			     struct ipv6_opt_hdr __user *newopt, int newoptlen,
-			     int inherit,
-			     struct ipv6_opt_hdr **hdr,
-			     char **p)
+static void ipv6_renew_option(int renewtype,
+			      struct ipv6_opt_hdr **dest,
+			      struct ipv6_opt_hdr *old,
+			      struct ipv6_opt_hdr *new,
+			      int newtype, char **p)
 {
-	if (inherit) {
-		if (ohdr) {
-			memcpy(*p, ohdr, ipv6_optlen((struct ipv6_opt_hdr *)ohdr));
-			*hdr = (struct ipv6_opt_hdr *)*p;
-			*p += CMSG_ALIGN(ipv6_optlen(*hdr));
-		}
-	} else {
-		if (newopt) {
-			if (copy_from_user(*p, newopt, newoptlen))
-				return -EFAULT;
-			*hdr = (struct ipv6_opt_hdr *)*p;
-			if (ipv6_optlen(*hdr) > newoptlen)
-				return -EINVAL;
-			*p += CMSG_ALIGN(newoptlen);
-		}
-	}
-	return 0;
+	struct ipv6_opt_hdr *src;
+
+	src = (renewtype == newtype ? new : old);
+	if (!src)
+		return;
+
+	memcpy(*p, src, ipv6_optlen(src));
+	*dest = (struct ipv6_opt_hdr *)*p;
+	*p += CMSG_ALIGN(ipv6_optlen(*dest));
 }
 
 /**
@@ -1063,13 +1055,11 @@ static int ipv6_renew_option(void *ohdr,
  */
 struct ipv6_txoptions *
 ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
-		   int newtype,
-		   struct ipv6_opt_hdr __user *newopt, int newoptlen)
+		   int newtype, struct ipv6_opt_hdr *newopt)
 {
 	int tot_len = 0;
 	char *p;
 	struct ipv6_txoptions *opt2;
-	int err;
 
 	if (opt) {
 		if (newtype != IPV6_HOPOPTS && opt->hopopt)
@@ -1082,8 +1072,8 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
 			tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst1opt));
 	}
 
-	if (newopt && newoptlen)
-		tot_len += CMSG_ALIGN(newoptlen);
+	if (newopt)
+		tot_len += CMSG_ALIGN(ipv6_optlen(newopt));
 
 	if (!tot_len)
 		return NULL;
@@ -1098,29 +1088,19 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
 	opt2->tot_len = tot_len;
 	p = (char *)(opt2 + 1);
 
-	err = ipv6_renew_option(opt ? opt->hopopt : NULL, newopt, newoptlen,
-				newtype != IPV6_HOPOPTS,
-				&opt2->hopopt, &p);
-	if (err)
-		goto out;
-
-	err = ipv6_renew_option(opt ? opt->dst0opt : NULL, newopt, newoptlen,
-				newtype != IPV6_RTHDRDSTOPTS,
-				&opt2->dst0opt, &p);
-	if (err)
-		goto out;
-
-	err = ipv6_renew_option(opt ? opt->srcrt : NULL, newopt, newoptlen,
-				newtype != IPV6_RTHDR,
-				(struct ipv6_opt_hdr **)&opt2->srcrt, &p);
-	if (err)
-		goto out;
-
-	err = ipv6_renew_option(opt ? opt->dst1opt : NULL, newopt, newoptlen,
-				newtype != IPV6_DSTOPTS,
-				&opt2->dst1opt, &p);
-	if (err)
-		goto out;
+	ipv6_renew_option(IPV6_HOPOPTS, &opt2->hopopt,
+			  (opt ? opt->hopopt : NULL),
+			  newopt, newtype, &p);
+	ipv6_renew_option(IPV6_RTHDRDSTOPTS, &opt2->dst0opt,
+			  (opt ? opt->dst0opt : NULL),
+			  newopt, newtype, &p);
+	ipv6_renew_option(IPV6_RTHDR,
+			  (struct ipv6_opt_hdr **)&opt2->srcrt,
+			  (opt ? (struct ipv6_opt_hdr *)opt->srcrt : NULL),
+			  newopt, newtype, &p);
+	ipv6_renew_option(IPV6_DSTOPTS, &opt2->dst1opt,
+			  (opt ? opt->dst1opt : NULL),
+			  newopt, newtype, &p);
 
 	opt2->opt_nflen = (opt2->hopopt ? ipv6_optlen(opt2->hopopt) : 0) +
 			  (opt2->dst0opt ? ipv6_optlen(opt2->dst0opt) : 0) +
@@ -1128,37 +1108,6 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
 	opt2->opt_flen = (opt2->dst1opt ? ipv6_optlen(opt2->dst1opt) : 0);
 
 	return opt2;
-out:
-	sock_kfree_s(sk, opt2, opt2->tot_len);
-	return ERR_PTR(err);
-}
-
-/**
- * ipv6_renew_options_kern - replace a specific ext hdr with a new one.
- *
- * @sk: sock from which to allocate memory
- * @opt: original options
- * @newtype: option type to replace in @opt
- * @newopt: new option of type @newtype to replace (kernel-mem)
- * @newoptlen: length of @newopt
- *
- * See ipv6_renew_options().  The difference is that @newopt is
- * kernel memory, rather than user memory.
- */
-struct ipv6_txoptions *
-ipv6_renew_options_kern(struct sock *sk, struct ipv6_txoptions *opt,
-			int newtype, struct ipv6_opt_hdr *newopt,
-			int newoptlen)
-{
-	struct ipv6_txoptions *ret_val;
-	const mm_segment_t old_fs = get_fs();
-
-	set_fs(KERNEL_DS);
-	ret_val = ipv6_renew_options(sk, opt, newtype,
-				     (struct ipv6_opt_hdr __user *)newopt,
-				     newoptlen);
-	set_fs(old_fs);
-	return ret_val;
 }
 
 struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 4d780c7f0130..c95c3486d904 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -398,6 +398,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 	case IPV6_DSTOPTS:
 	{
 		struct ipv6_txoptions *opt;
+		struct ipv6_opt_hdr *new = NULL;
+
+		/* hop-by-hop / destination options are privileged option */
+		retv = -EPERM;
+		if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW))
+			break;
 
 		/* remove any sticky options header with a zero option
 		 * length, per RFC3542.
@@ -409,17 +415,22 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		else if (optlen < sizeof(struct ipv6_opt_hdr) ||
 			 optlen & 0x7 || optlen > 8 * 255)
 			goto e_inval;
-
-		/* hop-by-hop / destination options are privileged option */
-		retv = -EPERM;
-		if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW))
-			break;
+		else {
+			new = memdup_user(optval, optlen);
+			if (IS_ERR(new)) {
+				retv = PTR_ERR(new);
+				break;
+			}
+			if (unlikely(ipv6_optlen(new) > optlen)) {
+				kfree(new);
+				goto e_inval;
+			}
+		}
 
 		opt = rcu_dereference_protected(np->opt,
 						lockdep_sock_is_held(sk));
-		opt = ipv6_renew_options(sk, opt, optname,
-					 (struct ipv6_opt_hdr __user *)optval,
-					 optlen);
+		opt = ipv6_renew_options(sk, opt, optname, new);
+		kfree(new);
 		if (IS_ERR(opt)) {
 			retv = PTR_ERR(opt);
 			break;
-- 
cgit v1.2.3


From 5711b4e89319c2912f20b2a4f371c1525fc9551d Mon Sep 17 00:00:00 2001
From: Máté Eckl <ecklm94@gmail.com>
Date: Thu, 5 Jul 2018 12:01:53 +0200
Subject: netfilter: nf_tproxy: fix possible non-linear access to transport
 header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch fixes a silent out-of-bound read possibility that was present
because of the misuse of this function.

Mostly it was called with a struct udphdr *hp which had only the udphdr
part linearized by the skb_header_pointer, however
nf_tproxy_get_sock_v{4,6} uses it as a tcphdr pointer, so some reads for
tcp specific attributes may be invalid.

Fixes: a583636a83ea ("inet: refactor inet[6]_lookup functions to take skb")
Signed-off-by: Máté Eckl <ecklm94@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tproxy.h   |  4 ++--
 net/ipv4/netfilter/nf_tproxy_ipv4.c | 18 ++++++++++++------
 net/ipv6/netfilter/nf_tproxy_ipv6.c | 18 ++++++++++++------
 net/netfilter/xt_TPROXY.c           |  8 ++++----
 4 files changed, 30 insertions(+), 18 deletions(-)

(limited to 'net/ipv6')

diff --git a/include/net/netfilter/nf_tproxy.h b/include/net/netfilter/nf_tproxy.h
index 9754a50ecde9..4cc64c8446eb 100644
--- a/include/net/netfilter/nf_tproxy.h
+++ b/include/net/netfilter/nf_tproxy.h
@@ -64,7 +64,7 @@ nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
  * belonging to established connections going through that one.
  */
 struct sock *
-nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
+nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
 		      const u8 protocol,
 		      const __be32 saddr, const __be32 daddr,
 		      const __be16 sport, const __be16 dport,
@@ -103,7 +103,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
 			    struct sock *sk);
 
 struct sock *
-nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
+nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff,
 		      const u8 protocol,
 		      const struct in6_addr *saddr, const struct in6_addr *daddr,
 		      const __be16 sport, const __be16 dport,
diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c
index 805e83ec3ad9..164714104965 100644
--- a/net/ipv4/netfilter/nf_tproxy_ipv4.c
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -37,7 +37,7 @@ nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
 		 * to a listener socket if there's one */
 		struct sock *sk2;
 
-		sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
+		sk2 = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
 					    iph->saddr, laddr ? laddr : iph->daddr,
 					    hp->source, lport ? lport : hp->dest,
 					    skb->dev, NF_TPROXY_LOOKUP_LISTENER);
@@ -71,7 +71,7 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
 EXPORT_SYMBOL_GPL(nf_tproxy_laddr4);
 
 struct sock *
-nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
+nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
 		      const u8 protocol,
 		      const __be32 saddr, const __be32 daddr,
 		      const __be16 sport, const __be16 dport,
@@ -79,16 +79,21 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
 		      const enum nf_tproxy_lookup_t lookup_type)
 {
 	struct sock *sk;
-	struct tcphdr *tcph;
 
 	switch (protocol) {
-	case IPPROTO_TCP:
+	case IPPROTO_TCP: {
+		struct tcphdr _hdr, *hp;
+
+		hp = skb_header_pointer(skb, ip_hdrlen(skb),
+					sizeof(struct tcphdr), &_hdr);
+		if (hp == NULL)
+			return NULL;
+
 		switch (lookup_type) {
 		case NF_TPROXY_LOOKUP_LISTENER:
-			tcph = hp;
 			sk = inet_lookup_listener(net, &tcp_hashinfo, skb,
 						    ip_hdrlen(skb) +
-						      __tcp_hdrlen(tcph),
+						      __tcp_hdrlen(hp),
 						    saddr, sport,
 						    daddr, dport,
 						    in->ifindex, 0);
@@ -110,6 +115,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
 			BUG();
 		}
 		break;
+		}
 	case IPPROTO_UDP:
 		sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
 				     in->ifindex);
diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c
index bf1d6c421e3b..5dfd33af6451 100644
--- a/net/ipv6/netfilter/nf_tproxy_ipv6.c
+++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c
@@ -55,7 +55,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
 		 * to a listener socket if there's one */
 		struct sock *sk2;
 
-		sk2 = nf_tproxy_get_sock_v6(net, skb, thoff, hp, tproto,
+		sk2 = nf_tproxy_get_sock_v6(net, skb, thoff, tproto,
 					    &iph->saddr,
 					    nf_tproxy_laddr6(skb, laddr, &iph->daddr),
 					    hp->source,
@@ -72,7 +72,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
 EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait6);
 
 struct sock *
-nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
+nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff,
 		      const u8 protocol,
 		      const struct in6_addr *saddr, const struct in6_addr *daddr,
 		      const __be16 sport, const __be16 dport,
@@ -80,15 +80,20 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
 		      const enum nf_tproxy_lookup_t lookup_type)
 {
 	struct sock *sk;
-	struct tcphdr *tcph;
 
 	switch (protocol) {
-	case IPPROTO_TCP:
+	case IPPROTO_TCP: {
+		struct tcphdr _hdr, *hp;
+
+		hp = skb_header_pointer(skb, thoff,
+					sizeof(struct tcphdr), &_hdr);
+		if (hp == NULL)
+			return NULL;
+
 		switch (lookup_type) {
 		case NF_TPROXY_LOOKUP_LISTENER:
-			tcph = hp;
 			sk = inet6_lookup_listener(net, &tcp_hashinfo, skb,
-						   thoff + __tcp_hdrlen(tcph),
+						   thoff + __tcp_hdrlen(hp),
 						   saddr, sport,
 						   daddr, ntohs(dport),
 						   in->ifindex, 0);
@@ -110,6 +115,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
 			BUG();
 		}
 		break;
+		}
 	case IPPROTO_UDP:
 		sk = udp6_lib_lookup(net, saddr, sport, daddr, dport,
 				     in->ifindex);
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 58fce4e749a9..d76550a8b642 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -61,7 +61,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
 	 * addresses, this happens if the redirect already happened
 	 * and the current packet belongs to an already established
 	 * connection */
-	sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
+	sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
 				   iph->saddr, iph->daddr,
 				   hp->source, hp->dest,
 				   skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);
@@ -77,7 +77,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
 	else if (!sk)
 		/* no, there's no established connection, check if
 		 * there's a listener on the redirected addr/port */
-		sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
+		sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
 					   iph->saddr, laddr,
 					   hp->source, lport,
 					   skb->dev, NF_TPROXY_LOOKUP_LISTENER);
@@ -150,7 +150,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 	 * addresses, this happens if the redirect already happened
 	 * and the current packet belongs to an already established
 	 * connection */
-	sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto,
+	sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, tproto,
 				   &iph->saddr, &iph->daddr,
 				   hp->source, hp->dest,
 				   xt_in(par), NF_TPROXY_LOOKUP_ESTABLISHED);
@@ -171,7 +171,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 	else if (!sk)
 		/* no there's no established connection, check if
 		 * there's a listener on the redirected addr/port */
-		sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp,
+		sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff,
 					   tproto, &iph->saddr, laddr,
 					   hp->source, lport,
 					   xt_in(par), NF_TPROXY_LOOKUP_LISTENER);
-- 
cgit v1.2.3


From 84379c9afe011020e797e3f50a662b08a6355dcf Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 9 Jul 2018 13:43:38 +0200
Subject: netfilter: ipv6: nf_defrag: drop skb dst before queueing

Eric Dumazet reports:
 Here is a reproducer of an annoying bug detected by syzkaller on our production kernel
 [..]
 ./b78305423 enable_conntrack
 Then :
 sleep 60
 dmesg | tail -10
 [  171.599093] unregister_netdevice: waiting for lo to become free. Usage count = 2
 [  181.631024] unregister_netdevice: waiting for lo to become free. Usage count = 2
 [  191.687076] unregister_netdevice: waiting for lo to become free. Usage count = 2
 [  201.703037] unregister_netdevice: waiting for lo to become free. Usage count = 2
 [  211.711072] unregister_netdevice: waiting for lo to become free. Usage count = 2
 [  221.959070] unregister_netdevice: waiting for lo to become free. Usage count = 2

Reproducer sends ipv6 fragment that hits nfct defrag via LOCAL_OUT hook.
skb gets queued until frag timer expiry -- 1 minute.

Normally nf_conntrack_reasm gets called during prerouting, so skb has
no dst yet which might explain why this wasn't spotted earlier.

Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Reported-by: John Sperbeck <jsperbeck@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Tested-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_conntrack_reasm.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index a452d99c9f52..e4d9e6976d3c 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -585,6 +585,8 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 	    fq->q.meat == fq->q.len &&
 	    nf_ct_frag6_reasm(fq, skb, dev))
 		ret = 0;
+	else
+		skb_dst_drop(skb);
 
 out_unlock:
 	spin_unlock_bh(&fq->q.lock);
-- 
cgit v1.2.3


From c7ea20c9da5b94e400c8dcc0adb99411f2e430a6 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Tue, 10 Jul 2018 22:41:27 +0800
Subject: ipv6/mcast: init as INCLUDE when join SSM INCLUDE group

This an IPv6 version patch of "ipv4/igmp: init group mode as INCLUDE when
join source group". From RFC3810, part 6.1:

   If no per-interface state existed for that
   multicast address before the change (i.e., the change consisted of
   creating a new per-interface record), or if no state exists after the
   change (i.e., the change consisted of deleting a per-interface
   record), then the "non-existent" state is considered to have an
   INCLUDE filter mode and an empty source list.

Which means a new multicast group should start with state IN(). Currently,
for MLDv2 SSM JOIN_SOURCE_GROUP mode, we first call ipv6_sock_mc_join(),
then ip6_mc_source(), which will trigger a TO_IN() message instead of
ALLOW().

The issue was exposed by commit a052517a8ff65 ("net/multicast: should not
send source list records when have filter mode change"). Before this change,
we sent both ALLOW(A) and TO_IN(A). Now, we only send TO_IN(A).

Fix it by adding a new parameter to init group mode. Also add some wrapper
functions to avoid changing too much code.

v1 -> v2:
In the first version I only cleared the group change record. But this is not
enough. Because when a new group join, it will init as EXCLUDE and trigger
a filter mode change in ip/ip6_mc_add_src(), which will clear all source
addresses sf_crcount. This will prevent early joined address sending state
change records if multi source addressed joined at the same time.

In v2 patch, I fixed it by directly initializing the mode to INCLUDE for SSM
JOIN_SOURCE_GROUP. I also split the original patch into two separated patches
for IPv4 and IPv6.

There is also a difference between v4 and v6 version. For IPv6, when the
interface goes down and up, we will send correct state change record with
unspecified IPv6 address (::) with function ipv6_mc_up(). But after DAD is
completed, we resend the change record TO_IN() in mld_send_initial_cr().
Fix it by sending ALLOW() for INCLUDE mode in mld_send_initial_cr().

Fixes: a052517a8ff65 ("net/multicast: should not send source list records when have filter mode change")
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h       |  2 ++
 net/ipv6/ipv6_sockglue.c |  5 ++--
 net/ipv6/mcast.c         | 64 ++++++++++++++++++++++++++++++++++--------------
 3 files changed, 50 insertions(+), 21 deletions(-)

(limited to 'net/ipv6')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index d02881e4ad1f..7528632bcf2a 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1100,6 +1100,8 @@ void ipv6_sysctl_unregister(void);
 
 int ipv6_sock_mc_join(struct sock *sk, int ifindex,
 		      const struct in6_addr *addr);
+int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex,
+			  const struct in6_addr *addr, unsigned int mode);
 int ipv6_sock_mc_drop(struct sock *sk, int ifindex,
 		      const struct in6_addr *addr);
 #endif /* _NET_IPV6_H */
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index c95c3486d904..568ca4187cd1 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -729,8 +729,9 @@ done:
 			struct sockaddr_in6 *psin6;
 
 			psin6 = (struct sockaddr_in6 *)&greqs.gsr_group;
-			retv = ipv6_sock_mc_join(sk, greqs.gsr_interface,
-						 &psin6->sin6_addr);
+			retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface,
+						     &psin6->sin6_addr,
+						     MCAST_INCLUDE);
 			/* prior join w/ different source is ok */
 			if (retv && retv != -EADDRINUSE)
 				break;
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index c0c74088f2af..2699be7202be 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -95,6 +95,8 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca,
 			  int delta);
 static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
 			    struct inet6_dev *idev);
+static int __ipv6_dev_mc_inc(struct net_device *dev,
+			     const struct in6_addr *addr, unsigned int mode);
 
 #define MLD_QRV_DEFAULT		2
 /* RFC3810, 9.2. Query Interval */
@@ -132,7 +134,8 @@ static int unsolicited_report_interval(struct inet6_dev *idev)
 	return iv > 0 ? iv : 1;
 }
 
-int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
+static int __ipv6_sock_mc_join(struct sock *sk, int ifindex,
+			       const struct in6_addr *addr, unsigned int mode)
 {
 	struct net_device *dev = NULL;
 	struct ipv6_mc_socklist *mc_lst;
@@ -179,7 +182,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 	}
 
 	mc_lst->ifindex = dev->ifindex;
-	mc_lst->sfmode = MCAST_EXCLUDE;
+	mc_lst->sfmode = mode;
 	rwlock_init(&mc_lst->sflock);
 	mc_lst->sflist = NULL;
 
@@ -187,7 +190,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 	 *	now add/increase the group membership on the device
 	 */
 
-	err = ipv6_dev_mc_inc(dev, addr);
+	err = __ipv6_dev_mc_inc(dev, addr, mode);
 
 	if (err) {
 		sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
@@ -199,8 +202,19 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 
 	return 0;
 }
+
+int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
+{
+	return __ipv6_sock_mc_join(sk, ifindex, addr, MCAST_EXCLUDE);
+}
 EXPORT_SYMBOL(ipv6_sock_mc_join);
 
+int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex,
+			  const struct in6_addr *addr, unsigned int mode)
+{
+	return __ipv6_sock_mc_join(sk, ifindex, addr, mode);
+}
+
 /*
  *	socket leave on multicast group
  */
@@ -646,7 +660,7 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr,
 	return rv;
 }
 
-static void igmp6_group_added(struct ifmcaddr6 *mc)
+static void igmp6_group_added(struct ifmcaddr6 *mc, unsigned int mode)
 {
 	struct net_device *dev = mc->idev->dev;
 	char buf[MAX_ADDR_LEN];
@@ -672,7 +686,13 @@ static void igmp6_group_added(struct ifmcaddr6 *mc)
 	}
 	/* else v2 */
 
-	mc->mca_crcount = mc->idev->mc_qrv;
+	/* Based on RFC3810 6.1, for newly added INCLUDE SSM, we
+	 * should not send filter-mode change record as the mode
+	 * should be from IN() to IN(A).
+	 */
+	if (mode == MCAST_EXCLUDE)
+		mc->mca_crcount = mc->idev->mc_qrv;
+
 	mld_ifc_event(mc->idev);
 }
 
@@ -770,13 +790,14 @@ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
 	spin_lock_bh(&im->mca_lock);
 	if (pmc) {
 		im->idev = pmc->idev;
-		im->mca_crcount = idev->mc_qrv;
 		im->mca_sfmode = pmc->mca_sfmode;
 		if (pmc->mca_sfmode == MCAST_INCLUDE) {
 			im->mca_tomb = pmc->mca_tomb;
 			im->mca_sources = pmc->mca_sources;
 			for (psf = im->mca_sources; psf; psf = psf->sf_next)
-				psf->sf_crcount = im->mca_crcount;
+				psf->sf_crcount = idev->mc_qrv;
+		} else {
+			im->mca_crcount = idev->mc_qrv;
 		}
 		in6_dev_put(pmc->idev);
 		kfree(pmc);
@@ -831,7 +852,8 @@ static void ma_put(struct ifmcaddr6 *mc)
 }
 
 static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
-				   const struct in6_addr *addr)
+				   const struct in6_addr *addr,
+				   unsigned int mode)
 {
 	struct ifmcaddr6 *mc;
 
@@ -849,9 +871,8 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
 	refcount_set(&mc->mca_refcnt, 1);
 	spin_lock_init(&mc->mca_lock);
 
-	/* initial mode is (EX, empty) */
-	mc->mca_sfmode = MCAST_EXCLUDE;
-	mc->mca_sfcount[MCAST_EXCLUDE] = 1;
+	mc->mca_sfmode = mode;
+	mc->mca_sfcount[mode] = 1;
 
 	if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) ||
 	    IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL)
@@ -863,7 +884,8 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
 /*
  *	device multicast group inc (add if not found)
  */
-int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
+static int __ipv6_dev_mc_inc(struct net_device *dev,
+			     const struct in6_addr *addr, unsigned int mode)
 {
 	struct ifmcaddr6 *mc;
 	struct inet6_dev *idev;
@@ -887,14 +909,13 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
 		if (ipv6_addr_equal(&mc->mca_addr, addr)) {
 			mc->mca_users++;
 			write_unlock_bh(&idev->lock);
-			ip6_mc_add_src(idev, &mc->mca_addr, MCAST_EXCLUDE, 0,
-				NULL, 0);
+			ip6_mc_add_src(idev, &mc->mca_addr, mode, 0, NULL, 0);
 			in6_dev_put(idev);
 			return 0;
 		}
 	}
 
-	mc = mca_alloc(idev, addr);
+	mc = mca_alloc(idev, addr, mode);
 	if (!mc) {
 		write_unlock_bh(&idev->lock);
 		in6_dev_put(idev);
@@ -911,11 +932,16 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
 	write_unlock_bh(&idev->lock);
 
 	mld_del_delrec(idev, mc);
-	igmp6_group_added(mc);
+	igmp6_group_added(mc, mode);
 	ma_put(mc);
 	return 0;
 }
 
+int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
+{
+	return __ipv6_dev_mc_inc(dev, addr, MCAST_EXCLUDE);
+}
+
 /*
  *	device multicast group del
  */
@@ -1751,7 +1777,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
 
 		psf_next = psf->sf_next;
 
-		if (!is_in(pmc, psf, type, gdeleted, sdeleted)) {
+		if (!is_in(pmc, psf, type, gdeleted, sdeleted) && !crsend) {
 			psf_prev = psf;
 			continue;
 		}
@@ -2066,7 +2092,7 @@ static void mld_send_initial_cr(struct inet6_dev *idev)
 		if (pmc->mca_sfcount[MCAST_EXCLUDE])
 			type = MLD2_CHANGE_TO_EXCLUDE;
 		else
-			type = MLD2_CHANGE_TO_INCLUDE;
+			type = MLD2_ALLOW_NEW_SOURCES;
 		skb = add_grec(skb, pmc, type, 0, 0, 1);
 		spin_unlock_bh(&pmc->mca_lock);
 	}
@@ -2546,7 +2572,7 @@ void ipv6_mc_up(struct inet6_dev *idev)
 	ipv6_mc_reset(idev);
 	for (i = idev->mc_list; i; i = i->next) {
 		mld_del_delrec(idev, i);
-		igmp6_group_added(i);
+		igmp6_group_added(i, i->mca_sfmode);
 	}
 	read_unlock_bh(&idev->lock);
 }
-- 
cgit v1.2.3


From b7ed879425be371905d856410d19e9a42a62bcf3 Mon Sep 17 00:00:00 2001
From: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Date: Fri, 13 Jul 2018 14:40:50 +0900
Subject: net: ip6_gre: get ipv6hdr after skb_cow_head()

A KASAN:use-after-free bug was found related to ip6-erspan
while running selftests/net/ip6_gre_headroom.sh

It happens because of following sequence:
- ipv6hdr pointer is obtained from skb
- skb_cow_head() is called, skb->head memory is reallocated
- old data is accessed using ipv6hdr pointer

skb_cow_head() call was added in e41c7c68ea77 ("ip6erspan: make sure
enough headroom at xmit."), but looking at the history there was a
chance of similar bug because gre_handle_offloads() and pskb_trim()
can also reallocate skb->head memory. Fixes tag points to commit
which introduced possibility of this bug.

This patch moves ipv6hdr pointer assignment after skb_cow_head() call.

Fixes: 5a963eb61b7c ("ip6_gre: Add ERSPAN native tunnel support")
Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Reviewed-by: Greg Rose <gvrose8192@gmail.com>
Acked-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index c8cf2fdbb13b..cd2cfb04e5d8 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -927,7 +927,6 @@ tx_err:
 static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 					 struct net_device *dev)
 {
-	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
 	struct ip6_tnl *t = netdev_priv(dev);
 	struct dst_entry *dst = skb_dst(skb);
 	struct net_device_stats *stats;
@@ -1010,6 +1009,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 			goto tx_err;
 		}
 	} else {
+		struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+
 		switch (skb->protocol) {
 		case htons(ETH_P_IP):
 			memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-- 
cgit v1.2.3


From e66515999b627368892ccc9b3a13a506f2ea1357 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Fri, 13 Jul 2018 17:21:42 +0200
Subject: ipv6: make DAD fail with enhanced DAD when nonce length differs

Commit adc176c54722 ("ipv6 addrconf: Implemented enhanced DAD (RFC7527)")
added enhanced DAD with a nonce length of 6 bytes. However, RFC7527
doesn't specify the length of the nonce, other than being 6 + 8*k bytes,
with integer k >= 0 (RFC3971 5.3.2). The current implementation simply
assumes that the nonce will always be 6 bytes, but others systems are
free to choose different sizes.

If another system sends a nonce of different length but with the same 6
bytes prefix, it shouldn't be considered as the same nonce. Thus, check
that the length of the received nonce is the same as the length we sent.

Ugly scapy test script running on veth0:

def loop():
    pkt=sniff(iface="veth0", filter="icmp6", count=1)
    pkt = pkt[0]
    b = bytearray(pkt[Raw].load)
    b[1] += 1
    b += b'\xde\xad\xbe\xef\xde\xad\xbe\xef'
    pkt[Raw].load = bytes(b)
    pkt[IPv6].plen += 8
    # fixup checksum after modifying the payload
    pkt[IPv6].payload.cksum -= 0x3b44
    if pkt[IPv6].payload.cksum < 0:
        pkt[IPv6].payload.cksum += 0xffff
    sendp(pkt, iface="veth0")

This should result in DAD failure for any address added to veth0's peer,
but is currently ignored.

Fixes: adc176c54722 ("ipv6 addrconf: Implemented enhanced DAD (RFC7527)")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ndisc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index e640d2f3c55c..0ec273997d1d 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -811,7 +811,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
 			return;
 		}
 	}
-	if (ndopts.nd_opts_nonce)
+	if (ndopts.nd_opts_nonce && ndopts.nd_opts_nonce->nd_opt_len == 1)
 		memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6);
 
 	inc = ipv6_addr_is_multicast(daddr);
-- 
cgit v1.2.3


From b5d2d75e079a918be686957b1a8d2f6c5cc95a0a Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 15 Jul 2018 09:35:19 -0700
Subject: net/ipv6: Do not allow device only routes via the multipath API

Eric reported that reverting the patch that fixed and simplified IPv6
multipath routes means reverting back to invalid userspace notifications.
eg.,
$ ip -6 route add 2001:db8:1::/64 nexthop dev eth0 nexthop dev eth1

only generates a single notification:
2001:db8:1::/64 dev eth0 metric 1024 pref medium

While working on a fix for this problem I found another case that is just
broken completely - a multipath route with a gateway followed by device
followed by gateway:
    $ ip -6 ro add 2001:db8:103::/64
          nexthop via 2001:db8:1::64
          nexthop dev dummy2
          nexthop via 2001:db8:3::64

In this case the device only route is dropped completely - no notification
to userpsace but no addition to the FIB either:

$ ip -6 ro ls
2001:db8:1::/64 dev dummy1 proto kernel metric 256 pref medium
2001:db8:2::/64 dev dummy2 proto kernel metric 256 pref medium
2001:db8:3::/64 dev dummy3 proto kernel metric 256 pref medium
2001:db8:103::/64 metric 1024
	nexthop via 2001:db8:1::64 dev dummy1 weight 1
	nexthop via 2001:db8:3::64 dev dummy3 weight 1 pref medium
fe80::/64 dev dummy1 proto kernel metric 256 pref medium
fe80::/64 dev dummy2 proto kernel metric 256 pref medium
fe80::/64 dev dummy3 proto kernel metric 256 pref medium

Really, IPv6 multipath is just FUBAR'ed beyond repair when it comes to
device only routes, so do not allow it all.

This change will break any scripts relying on the mpath api for insert,
but I don't see any other way to handle the permutations. Besides, since
the routes are added to the FIB as standalone (non-multipath) routes the
kernel is not doing what the user requested, so it might as well tell the
user that.

Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 63f99411f0de..2ce0bd17de4f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4388,6 +4388,13 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
 			rt = NULL;
 			goto cleanup;
 		}
+		if (!rt6_qualify_for_ecmp(rt)) {
+			err = -EINVAL;
+			NL_SET_ERR_MSG(extack,
+				       "Device only routes can not be added for IPv6 using the multipath API.");
+			fib6_info_release(rt);
+			goto cleanup;
+		}
 
 		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
 
-- 
cgit v1.2.3


From 83ed7d1fe2d2d4a11b30660dec20168bb473d9c1 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 18 Jul 2018 10:48:56 +0200
Subject: ipv6: ila: select CONFIG_DST_CACHE

My randconfig builds came across an old missing dependency for ILA:

ERROR: "dst_cache_set_ip6" [net/ipv6/ila/ila.ko] undefined!
ERROR: "dst_cache_get" [net/ipv6/ila/ila.ko] undefined!
ERROR: "dst_cache_init" [net/ipv6/ila/ila.ko] undefined!
ERROR: "dst_cache_destroy" [net/ipv6/ila/ila.ko] undefined!

We almost never run into this by accident because randconfig builds
end up selecting DST_CACHE from some other tunnel protocol, and this
one appears to be the only one missing the explicit 'select'.

>From all I can tell, this problem first appeared in linux-4.9
when dst_cache support got added to ILA.

Fixes: 79ff2fc31e0f ("ila: Cache a route to translated address")
Cc: Tom Herbert <tom@herbertland.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 0eff75525da1..b3885ca22d6f 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -108,6 +108,7 @@ config IPV6_MIP6
 config IPV6_ILA
 	tristate "IPv6: Identifier Locator Addressing (ILA)"
 	depends on NETFILTER
+	select DST_CACHE
 	select LWTUNNEL
 	---help---
 	  Support for IPv6 Identifier Locator Addressing (ILA).
-- 
cgit v1.2.3


From 3ee593adbbb46d9e1bb1320915943926f4744483 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 17 Jul 2018 16:52:54 +0100
Subject: ipv6: sr: fix useless rol32 call on hash

The rol32 call is currently rotating hash but the rol'd value is
being discarded. I believe the current code is incorrect and hash
should be assigned the rotated value returned from rol32.

Detected by CoverityScan, CID#1468411 ("Useless call")

Fixes: b5facfdba14c ("ipv6: sr: Compute flowlabel for outer IPv6 header of seg6 encap mode")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: dlebrun@google.com
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/seg6_iptunnel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index 19ccf0dc996c..a8854dd3e9c5 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -101,7 +101,7 @@ static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb,
 
 	if (do_flowlabel > 0) {
 		hash = skb_get_hash(skb);
-		rol32(hash, 16);
+		hash = rol32(hash, 16);
 		flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK;
 	} else if (!do_flowlabel && skb->protocol == htons(ETH_P_IPV6)) {
 		flowlabel = ip6_flowlabel(inner_hdr);
-- 
cgit v1.2.3


From 24b711edfc34bc45777a3f068812b7d1ed004a5d Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Thu, 19 Jul 2018 12:41:18 -0700
Subject: net/ipv6: Fix linklocal to global address with VRF

Example setup:
    host: ip -6 addr add dev eth1 2001:db8:104::4
           where eth1 is enslaved to a VRF

    switch: ip -6 ro add 2001:db8:104::4/128 dev br1
            where br1 only has an LLA

           ping6 2001:db8:104::4
           ssh   2001:db8:104::4

(NOTE: UDP works fine if the PKTINFO has the address set to the global
address and ifindex is set to the index of eth1 with a destination an
LLA).

For ICMP, icmp6_iif needs to be updated to check if skb->dev is an
L3 master. If it is then return the ifindex from rt6i_idev similar
to what is done for loopback.

For TCP, restore the original tcp_v6_iif definition which is needed in
most places and add a new tcp_v6_iif_l3_slave that considers the
l3_slave variability. This latter check is only needed for socket
lookups.

Fixes: 9ff74384600a ("net: vrf: Handle ipv6 multicast and link-local addresses")
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h   | 5 +++++
 net/ipv6/icmp.c     | 5 +++--
 net/ipv6/tcp_ipv6.c | 6 ++++--
 3 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'net/ipv6')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 25116ec02087..cd3ecda9386a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -840,6 +840,11 @@ static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
  * as TCP moves IP6CB into a different location in skb->cb[]
  */
 static inline int tcp_v6_iif(const struct sk_buff *skb)
+{
+	return TCP_SKB_CB(skb)->header.h6.iif;
+}
+
+static inline int tcp_v6_iif_l3_slave(const struct sk_buff *skb)
 {
 	bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags);
 
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index be491bf6ab6e..ef2505aefc15 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -402,9 +402,10 @@ static int icmp6_iif(const struct sk_buff *skb)
 
 	/* for local traffic to local address, skb dev is the loopback
 	 * device. Check if there is a dst attached to the skb and if so
-	 * get the real device index.
+	 * get the real device index. Same is needed for replies to a link
+	 * local address on a device enslaved to an L3 master device
 	 */
-	if (unlikely(iif == LOOPBACK_IFINDEX)) {
+	if (unlikely(iif == LOOPBACK_IFINDEX || netif_is_l3_master(skb->dev))) {
 		const struct rt6_info *rt6 = skb_rt6_info(skb);
 
 		if (rt6)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 7efa9fd7e109..03e6b7a2bc53 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -938,7 +938,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 					   &tcp_hashinfo, NULL, 0,
 					   &ipv6h->saddr,
 					   th->source, &ipv6h->daddr,
-					   ntohs(th->source), tcp_v6_iif(skb),
+					   ntohs(th->source),
+					   tcp_v6_iif_l3_slave(skb),
 					   tcp_v6_sdif(skb));
 		if (!sk1)
 			goto out;
@@ -1609,7 +1610,8 @@ do_time_wait:
 					    skb, __tcp_hdrlen(th),
 					    &ipv6_hdr(skb)->saddr, th->source,
 					    &ipv6_hdr(skb)->daddr,
-					    ntohs(th->dest), tcp_v6_iif(skb),
+					    ntohs(th->dest),
+					    tcp_v6_iif_l3_slave(skb),
 					    sdif);
 		if (sk2) {
 			struct inet_timewait_sock *tw = inet_twsk(sk);
-- 
cgit v1.2.3


From 08d3ffcc0cfaba36f6b86fd568cc3bc773061fa6 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Fri, 20 Jul 2018 14:04:27 +0800
Subject: multicast: do not restore deleted record source filter mode to new
 one

There are two scenarios that we will restore deleted records. The first is
when device down and up(or unmap/remap). In this scenario the new filter
mode is same with previous one. Because we get it from in_dev->mc_list and
we do not touch it during device down and up.

The other scenario is when a new socket join a group which was just delete
and not finish sending status reports. In this scenario, we should use the
current filter mode instead of restore old one. Here are 4 cases in total.

old_socket        new_socket       before_fix       after_fix
  IN(A)             IN(A)           ALLOW(A)         ALLOW(A)
  IN(A)             EX( )           TO_IN( )         TO_EX( )
  EX( )             IN(A)           TO_EX( )         ALLOW(A)
  EX( )             EX( )           TO_EX( )         TO_EX( )

Fixes: 24803f38a5c0b (igmp: do not remove igmp souce list info when set link down)
Fixes: 1666d49e1d416 (mld: do not remove mld souce list info when set link down)
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c  | 3 +--
 net/ipv6/mcast.c | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index b3c899a630a0..28fef7d15959 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1200,8 +1200,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
 	spin_lock_bh(&im->lock);
 	if (pmc) {
 		im->interface = pmc->interface;
-		im->sfmode = pmc->sfmode;
-		if (pmc->sfmode == MCAST_INCLUDE) {
+		if (im->sfmode == MCAST_INCLUDE) {
 			im->tomb = pmc->tomb;
 			im->sources = pmc->sources;
 			for (psf = im->sources; psf; psf = psf->sf_next)
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 2699be7202be..f60f310785fd 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -790,8 +790,7 @@ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
 	spin_lock_bh(&im->mca_lock);
 	if (pmc) {
 		im->idev = pmc->idev;
-		im->mca_sfmode = pmc->mca_sfmode;
-		if (pmc->mca_sfmode == MCAST_INCLUDE) {
+		if (im->mca_sfmode == MCAST_INCLUDE) {
 			im->mca_tomb = pmc->mca_tomb;
 			im->mca_sources = pmc->mca_sources;
 			for (psf = im->mca_sources; psf; psf = psf->sf_next)
-- 
cgit v1.2.3


From e873e4b9cc7e8ce79e5c5627b32b107035bb3f5d Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Sat, 21 Jul 2018 20:56:32 -0700
Subject: ipv6: use fib6_info_hold_safe() when necessary

In the code path where only rcu read lock is held, e.g. in the route
lookup code path, it is not safe to directly call fib6_info_hold()
because the fib6_info may already have been deleted but still exists
in the rcu grace period. Holding reference to it could cause double
free and crash the kernel.

This patch adds a new function fib6_info_hold_safe() and replace
fib6_info_hold() in all necessary places.

Syzbot reported 3 crash traces because of this. One of them is:
8021q: adding VLAN 0 to HW filter on device team0
IPv6: ADDRCONF(NETDEV_CHANGE): team0: link becomes ready
dst_release: dst:(____ptrval____) refcnt:-1
dst_release: dst:(____ptrval____) refcnt:-2
WARNING: CPU: 1 PID: 4845 at include/net/dst.h:239 dst_hold include/net/dst.h:239 [inline]
WARNING: CPU: 1 PID: 4845 at include/net/dst.h:239 ip6_setup_cork+0xd66/0x1830 net/ipv6/ip6_output.c:1204
dst_release: dst:(____ptrval____) refcnt:-1
Kernel panic - not syncing: panic_on_warn set ...

CPU: 1 PID: 4845 Comm: syz-executor493 Not tainted 4.18.0-rc3+ #10
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113
 panic+0x238/0x4e7 kernel/panic.c:184
dst_release: dst:(____ptrval____) refcnt:-2
dst_release: dst:(____ptrval____) refcnt:-3
 __warn.cold.8+0x163/0x1ba kernel/panic.c:536
dst_release: dst:(____ptrval____) refcnt:-4
 report_bug+0x252/0x2d0 lib/bug.c:186
 fixup_bug arch/x86/kernel/traps.c:178 [inline]
 do_error_trap+0x1fc/0x4d0 arch/x86/kernel/traps.c:296
dst_release: dst:(____ptrval____) refcnt:-5
 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:316
 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:992
RIP: 0010:dst_hold include/net/dst.h:239 [inline]
RIP: 0010:ip6_setup_cork+0xd66/0x1830 net/ipv6/ip6_output.c:1204
Code: c1 ed 03 89 9d 18 ff ff ff 48 b8 00 00 00 00 00 fc ff df 41 c6 44 05 00 f8 e9 2d 01 00 00 4c 8b a5 c8 fe ff ff e8 1a f6 e6 fa <0f> 0b e9 6a fc ff ff e8 0e f6 e6 fa 48 8b 85 d0 fe ff ff 48 8d 78
RSP: 0018:ffff8801a8fcf178 EFLAGS: 00010293
RAX: ffff8801a8eba5c0 RBX: 0000000000000000 RCX: ffffffff869511e6
RDX: 0000000000000000 RSI: ffffffff869515b6 RDI: 0000000000000005
RBP: ffff8801a8fcf2c8 R08: ffff8801a8eba5c0 R09: ffffed0035ac8338
R10: ffffed0035ac8338 R11: ffff8801ad6419c3 R12: ffff8801a8fcf720
R13: ffff8801a8fcf6a0 R14: ffff8801ad6419c0 R15: ffff8801ad641980
 ip6_make_skb+0x2c8/0x600 net/ipv6/ip6_output.c:1768
 udpv6_sendmsg+0x2c90/0x35f0 net/ipv6/udp.c:1376
 inet_sendmsg+0x1a1/0x690 net/ipv4/af_inet.c:798
 sock_sendmsg_nosec net/socket.c:641 [inline]
 sock_sendmsg+0xd5/0x120 net/socket.c:651
 ___sys_sendmsg+0x51d/0x930 net/socket.c:2125
 __sys_sendmmsg+0x240/0x6f0 net/socket.c:2220
 __do_sys_sendmmsg net/socket.c:2249 [inline]
 __se_sys_sendmmsg net/socket.c:2246 [inline]
 __x64_sys_sendmmsg+0x9d/0x100 net/socket.c:2246
 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x446ba9
Code: e8 cc bb 02 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 eb 08 fc ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007fb39a469da8 EFLAGS: 00000246 ORIG_RAX: 0000000000000133
RAX: ffffffffffffffda RBX: 00000000006dcc54 RCX: 0000000000446ba9
RDX: 00000000000000b8 RSI: 0000000020001b00 RDI: 0000000000000003
RBP: 00000000006dcc50 R08: 00007fb39a46a700 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 45c828efc7a64843
R13: e6eeb815b9d8a477 R14: 5068caf6f713c6fc R15: 0000000000000001
Dumping ftrace buffer:
   (ftrace buffer empty)
Kernel Offset: disabled
Rebooting in 86400 seconds..

Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes")
Reported-by: syzbot+902e2a1bcd4f7808cef5@syzkaller.appspotmail.com
Reported-by: syzbot+8ae62d67f647abeeceb9@syzkaller.appspotmail.com
Reported-by: syzbot+3f08feb14086930677d0@syzkaller.appspotmail.com
Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  5 +++++
 net/ipv6/addrconf.c   |  3 ++-
 net/ipv6/route.c      | 41 +++++++++++++++++++++++++++++++----------
 3 files changed, 38 insertions(+), 11 deletions(-)

(limited to 'net/ipv6')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 71b9043aa0e7..3d4930528db0 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -281,6 +281,11 @@ static inline void fib6_info_hold(struct fib6_info *f6i)
 	atomic_inc(&f6i->fib6_ref);
 }
 
+static inline bool fib6_info_hold_safe(struct fib6_info *f6i)
+{
+	return atomic_inc_not_zero(&f6i->fib6_ref);
+}
+
 static inline void fib6_info_release(struct fib6_info *f6i)
 {
 	if (f6i && atomic_dec_and_test(&f6i->fib6_ref))
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 91580c62bb86..f66a1cae3366 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2374,7 +2374,8 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
 			continue;
 		if ((rt->fib6_flags & noflags) != 0)
 			continue;
-		fib6_info_hold(rt);
+		if (!fib6_info_hold_safe(rt))
+			continue;
 		break;
 	}
 out:
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 2ce0bd17de4f..ec18b3ce8b6d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -972,10 +972,10 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
 	rt->dst.lastuse = jiffies;
 }
 
+/* Caller must already hold reference to @from */
 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
 {
 	rt->rt6i_flags &= ~RTF_EXPIRES;
-	fib6_info_hold(from);
 	rcu_assign_pointer(rt->from, from);
 	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
 	if (from->fib6_metrics != &dst_default_metrics) {
@@ -984,6 +984,7 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
 	}
 }
 
+/* Caller must already hold reference to @ort */
 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
 {
 	struct net_device *dev = fib6_info_nh_dev(ort);
@@ -1044,9 +1045,14 @@ static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
 	struct net_device *dev = rt->fib6_nh.nh_dev;
 	struct rt6_info *nrt;
 
+	if (!fib6_info_hold_safe(rt))
+		return NULL;
+
 	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
 	if (nrt)
 		ip6_rt_copy_init(nrt, rt);
+	else
+		fib6_info_release(rt);
 
 	return nrt;
 }
@@ -1178,10 +1184,15 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
 	 *	Clone the route.
 	 */
 
+	if (!fib6_info_hold_safe(ort))
+		return NULL;
+
 	dev = ip6_rt_get_dev_rcu(ort);
 	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
-	if (!rt)
+	if (!rt) {
+		fib6_info_release(ort);
 		return NULL;
+	}
 
 	ip6_rt_copy_init(rt, ort);
 	rt->rt6i_flags |= RTF_CACHE;
@@ -1210,12 +1221,17 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
 	struct net_device *dev;
 	struct rt6_info *pcpu_rt;
 
+	if (!fib6_info_hold_safe(rt))
+		return NULL;
+
 	rcu_read_lock();
 	dev = ip6_rt_get_dev_rcu(rt);
 	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
 	rcu_read_unlock();
-	if (!pcpu_rt)
+	if (!pcpu_rt) {
+		fib6_info_release(rt);
 		return NULL;
+	}
 	ip6_rt_copy_init(pcpu_rt, rt);
 	pcpu_rt->rt6i_flags |= RTF_PCPU;
 	return pcpu_rt;
@@ -2486,7 +2502,7 @@ restart:
 
 out:
 	if (ret)
-		dst_hold(&ret->dst);
+		ip6_hold_safe(net, &ret, true);
 	else
 		ret = ip6_create_rt_rcu(rt);
 
@@ -3303,7 +3319,8 @@ static int ip6_route_del(struct fib6_config *cfg,
 				continue;
 			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
 				continue;
-			fib6_info_hold(rt);
+			if (!fib6_info_hold_safe(rt))
+				continue;
 			rcu_read_unlock();
 
 			/* if gateway was specified only delete the one hop */
@@ -3409,6 +3426,9 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 
 	rcu_read_lock();
 	from = rcu_dereference(rt->from);
+	/* This fib6_info_hold() is safe here because we hold reference to rt
+	 * and rt already holds reference to fib6_info.
+	 */
 	fib6_info_hold(from);
 	rcu_read_unlock();
 
@@ -3470,7 +3490,8 @@ static struct fib6_info *rt6_get_route_info(struct net *net,
 			continue;
 		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
 			continue;
-		fib6_info_hold(rt);
+		if (!fib6_info_hold_safe(rt))
+			continue;
 		break;
 	}
 out:
@@ -3530,8 +3551,8 @@ struct fib6_info *rt6_get_dflt_router(struct net *net,
 		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
 			break;
 	}
-	if (rt)
-		fib6_info_hold(rt);
+	if (rt && !fib6_info_hold_safe(rt))
+		rt = NULL;
 	rcu_read_unlock();
 	return rt;
 }
@@ -3579,8 +3600,8 @@ restart:
 		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
 
 		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
-		    (!idev || idev->cnf.accept_ra != 2)) {
-			fib6_info_hold(rt);
+		    (!idev || idev->cnf.accept_ra != 2) &&
+		    fib6_info_hold_safe(rt)) {
 			rcu_read_unlock();
 			ip6_del_rt(net, rt);
 			goto restart;
-- 
cgit v1.2.3


From 3dd1c9a1270736029ffca670e9bd0265f4120600 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Mon, 23 Jul 2018 16:50:48 +0200
Subject: ip: hash fragments consistently

The skb hash for locally generated ip[v6] fragments belonging
to the same datagram can vary in several circumstances:
* for connected UDP[v6] sockets, the first fragment get its hash
  via set_owner_w()/skb_set_hash_from_sk()
* for unconnected IPv6 UDPv6 sockets, the first fragment can get
  its hash via ip6_make_flowlabel()/skb_get_hash_flowi6(), if
  auto_flowlabel is enabled

For the following frags the hash is usually computed via
skb_get_hash().
The above can cause OoO for unconnected IPv6 UDPv6 socket: in that
scenario the egress tx queue can be selected on a per packet basis
via the skb hash.
It may also fool flow-oriented schedulers to place fragments belonging
to the same datagram in different flows.

Fix the issue by copying the skb hash from the head frag into
the others at fragmentation time.

Before this commit:
perf probe -a "dev_queue_xmit skb skb->hash skb->l4_hash:b1@0/8 skb->sw_hash:b1@1/8"
netperf -H $IPV4 -t UDP_STREAM -l 5 -- -m 2000 -n &
perf record -e probe:dev_queue_xmit -e probe:skb_set_owner_w -a sleep 0.1
perf script
probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=3713014309 l4_hash=1 sw_hash=0
probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=0 l4_hash=0 sw_hash=0

After this commit:
probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=2171763177 l4_hash=1 sw_hash=0
probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=2171763177 l4_hash=1 sw_hash=0

Fixes: b73c3d0e4f0e ("net: Save TX flow hash in sock and set in skbuf on xmit")
Fixes: 67800f9b1f4e ("ipv6: Call skb_get_hash_flowi6 to get skb->hash in ip6_make_flowlabel")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c  | 2 ++
 net/ipv6/ip6_output.c | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index b3308e9d9762..0e3edd25f881 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -523,6 +523,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->dev = from->dev;
 	to->mark = from->mark;
 
+	skb_copy_hash(to, from);
+
 	/* Copy the flags to each fragment. */
 	IPCB(to)->flags = IPCB(from)->flags;
 
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index a14fb4fcdf18..3168847c30d1 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -570,6 +570,8 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->dev = from->dev;
 	to->mark = from->mark;
 
+	skb_copy_hash(to, from);
+
 #ifdef CONFIG_NET_SCHED
 	to->tc_index = from->tc_index;
 #endif
-- 
cgit v1.2.3


From 2efd4fca703a6707cad16ab486eaab8fc7f0fd49 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Mon, 23 Jul 2018 19:36:48 -0400
Subject: ip: in cmsg IP(V6)_ORIGDSTADDR call pskb_may_pull

Syzbot reported a read beyond the end of the skb head when returning
IPV6_ORIGDSTADDR:

  BUG: KMSAN: kernel-infoleak in put_cmsg+0x5ef/0x860 net/core/scm.c:242
  CPU: 0 PID: 4501 Comm: syz-executor128 Not tainted 4.17.0+ #9
  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
  Google 01/01/2011
  Call Trace:
    __dump_stack lib/dump_stack.c:77 [inline]
    dump_stack+0x185/0x1d0 lib/dump_stack.c:113
    kmsan_report+0x188/0x2a0 mm/kmsan/kmsan.c:1125
    kmsan_internal_check_memory+0x138/0x1f0 mm/kmsan/kmsan.c:1219
    kmsan_copy_to_user+0x7a/0x160 mm/kmsan/kmsan.c:1261
    copy_to_user include/linux/uaccess.h:184 [inline]
    put_cmsg+0x5ef/0x860 net/core/scm.c:242
    ip6_datagram_recv_specific_ctl+0x1cf3/0x1eb0 net/ipv6/datagram.c:719
    ip6_datagram_recv_ctl+0x41c/0x450 net/ipv6/datagram.c:733
    rawv6_recvmsg+0x10fb/0x1460 net/ipv6/raw.c:521
    [..]

This logic and its ipv4 counterpart read the destination port from
the packet at skb_transport_offset(skb) + 4.

With MSG_MORE and a local SOCK_RAW sender, syzbot was able to cook a
packet that stores headers exactly up to skb_transport_offset(skb) in
the head and the remainder in a frag.

Call pskb_may_pull before accessing the pointer to ensure that it lies
in skb head.

Link: http://lkml.kernel.org/r/CAF=yD-LEJwZj5a1-bAAj2Oy_hKmGygV6rsJ_WOrAYnv-fnayiQ@mail.gmail.com
Reported-by: syzbot+9adb4b567003cac781f0@syzkaller.appspotmail.com
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_sockglue.c | 7 +++++--
 net/ipv6/datagram.c    | 7 +++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 64c76dcf7386..c0fe5ad996f2 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -150,15 +150,18 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
 {
 	struct sockaddr_in sin;
 	const struct iphdr *iph = ip_hdr(skb);
-	__be16 *ports = (__be16 *)skb_transport_header(skb);
+	__be16 *ports;
+	int end;
 
-	if (skb_transport_offset(skb) + 4 > (int)skb->len)
+	end = skb_transport_offset(skb) + 4;
+	if (end > 0 && !pskb_may_pull(skb, end))
 		return;
 
 	/* All current transport protocols have the port numbers in the
 	 * first four bytes of the transport header and this function is
 	 * written with this assumption in mind.
 	 */
+	ports = (__be16 *)skb_transport_header(skb);
 
 	sin.sin_family = AF_INET;
 	sin.sin_addr.s_addr = iph->daddr;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 2ee08b6a86a4..1a1f876f8e28 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -700,13 +700,16 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
 	}
 	if (np->rxopt.bits.rxorigdstaddr) {
 		struct sockaddr_in6 sin6;
-		__be16 *ports = (__be16 *) skb_transport_header(skb);
+		__be16 *ports;
+		int end;
 
-		if (skb_transport_offset(skb) + 4 <= (int)skb->len) {
+		end = skb_transport_offset(skb) + 4;
+		if (end <= 0 || pskb_may_pull(skb, end)) {
 			/* All current transport protocols have the port numbers in the
 			 * first four bytes of the transport header and this function is
 			 * written with this assumption in mind.
 			 */
+			ports = (__be16 *)skb_transport_header(skb);
 
 			sin6.sin6_family = AF_INET6;
 			sin6.sin6_addr = ipv6_hdr(skb)->daddr;
-- 
cgit v1.2.3


From df18b50448fab1dff093731dfd0e25e77e1afcd1 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Mon, 30 Jul 2018 16:23:10 +0200
Subject: net/ipv6: fix metrics leak

Since commit d4ead6b34b67 ("net/ipv6: move metrics from dst to
rt6_info"), ipv6 metrics are shared and refcounted. rt6_set_from()
assigns the rt->from pointer and increases the refcount on from's
metrics. This reference is never released.

Introduce the fib6_metrics_release() helper and use it to release the
metrics.

Fixes: d4ead6b34b67 ("net/ipv6: move metrics from dst to rt6_info")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index d212738e9d10..211a2d437b56 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -167,11 +167,22 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
 	return f6i;
 }
 
+static void fib6_metrics_release(struct fib6_info *f6i)
+{
+	struct dst_metrics *m;
+
+	if (!f6i)
+		return;
+
+	m = f6i->fib6_metrics;
+	if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
+		kfree(m);
+}
+
 void fib6_info_destroy_rcu(struct rcu_head *head)
 {
 	struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);
 	struct rt6_exception_bucket *bucket;
-	struct dst_metrics *m;
 
 	WARN_ON(f6i->fib6_node);
 
@@ -201,9 +212,7 @@ void fib6_info_destroy_rcu(struct rcu_head *head)
 	if (f6i->fib6_nh.nh_dev)
 		dev_put(f6i->fib6_nh.nh_dev);
 
-	m = f6i->fib6_metrics;
-	if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
-		kfree(m);
+	fib6_metrics_release(f6i);
 
 	kfree(f6i);
 }
@@ -887,6 +896,7 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i,
 
 			from = rcu_dereference_protected(pcpu_rt->from,
 					     lockdep_is_held(&table->tb6_lock));
+			fib6_metrics_release(from);
 			rcu_assign_pointer(pcpu_rt->from, NULL);
 			fib6_info_release(from);
 		}
-- 
cgit v1.2.3


From e6aed040eafb4ce1881bbc59a225f6b27d250396 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 1 Aug 2018 21:32:30 -0700
Subject: Revert "net/ipv6: fix metrics leak"

This reverts commit df18b50448fab1dff093731dfd0e25e77e1afcd1.

This change causes other problems and use-after-free situations as
found by syzbot.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 211a2d437b56..d212738e9d10 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -167,22 +167,11 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
 	return f6i;
 }
 
-static void fib6_metrics_release(struct fib6_info *f6i)
-{
-	struct dst_metrics *m;
-
-	if (!f6i)
-		return;
-
-	m = f6i->fib6_metrics;
-	if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
-		kfree(m);
-}
-
 void fib6_info_destroy_rcu(struct rcu_head *head)
 {
 	struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);
 	struct rt6_exception_bucket *bucket;
+	struct dst_metrics *m;
 
 	WARN_ON(f6i->fib6_node);
 
@@ -212,7 +201,9 @@ void fib6_info_destroy_rcu(struct rcu_head *head)
 	if (f6i->fib6_nh.nh_dev)
 		dev_put(f6i->fib6_nh.nh_dev);
 
-	fib6_metrics_release(f6i);
+	m = f6i->fib6_metrics;
+	if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
+		kfree(m);
 
 	kfree(f6i);
 }
@@ -896,7 +887,6 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i,
 
 			from = rcu_dereference_protected(pcpu_rt->from,
 					     lockdep_is_held(&table->tb6_lock));
-			fib6_metrics_release(from);
 			rcu_assign_pointer(pcpu_rt->from, NULL);
 			fib6_info_release(from);
 		}
-- 
cgit v1.2.3


From e70a3aad44cc8b24986687ffc98c4a4f6ecf25ea Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 2 Aug 2018 23:20:38 -0700
Subject: ipv6: fix double refcount of fib6_metrics

All the callers of ip6_rt_copy_init()/rt6_set_from() hold refcnt
of the "from" fib6_info, so there is no need to hold fib6_metrics
refcnt again, because fib6_metrics refcnt is only released when
fib6_info is gone, that is, they have the same life time, so the
whole fib6_metrics refcnt can be removed actually.

This fixes a kmemleak warning reported by Sabrina.

Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes")
Reported-by: Sabrina Dubroca <sd@queasysnail.net>
Cc: Sabrina Dubroca <sd@queasysnail.net>
Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ec18b3ce8b6d..7208c16302f6 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -978,10 +978,6 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
 	rt->rt6i_flags &= ~RTF_EXPIRES;
 	rcu_assign_pointer(rt->from, from);
 	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
-	if (from->fib6_metrics != &dst_default_metrics) {
-		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
-		refcount_inc(&from->fib6_metrics->refcnt);
-	}
 }
 
 /* Caller must already hold reference to @ort */
-- 
cgit v1.2.3


From 82a40777de12728dedf4075453b694f0d1baee80 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 5 Aug 2018 22:46:07 +0800
Subject: ip6_tunnel: use the right value for ipv4 min mtu check in
 ip6_tnl_xmit

According to RFC791, 68 bytes is the minimum size of IPv4 datagram every
device must be able to forward without further fragmentation while 576
bytes is the minimum size of IPv4 datagram every device has to be able
to receive, so in ip6_tnl_xmit(), 68(IPV4_MIN_MTU) should be the right
value for the ipv4 min mtu check in ip6_tnl_xmit.

While at it, change to use max() instead of if statement.

Fixes: c9fefa08190f ("ip6_tunnel: get the min mtu properly in ip6_tnl_xmit")
Reported-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_tunnel.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 00e138a44cbb..1cc9650af9fb 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1133,12 +1133,8 @@ route_lookup:
 		max_headroom += 8;
 		mtu -= 8;
 	}
-	if (skb->protocol == htons(ETH_P_IPV6)) {
-		if (mtu < IPV6_MIN_MTU)
-			mtu = IPV6_MIN_MTU;
-	} else if (mtu < 576) {
-		mtu = 576;
-	}
+	mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ?
+		       IPV6_MIN_MTU : IPV4_MIN_MTU);
 
 	skb_dst_update_pmtu(skb, mtu);
 	if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) {
-- 
cgit v1.2.3