From 22f6bbb7bcfcef0b373b0502a7ff390275c575dd Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 4 Dec 2018 17:37:57 +0000 Subject: net: use skb_list_del_init() to remove from RX sublists list_del() leaves the skb->next pointer poisoned, which can then lead to a crash in e.g. OVS forwarding. For example, setting up an OVS VXLAN forwarding bridge on sfc as per: ======== $ ovs-vsctl show 5dfd9c47-f04b-4aaa-aa96-4fbb0a522a30 Bridge "br0" Port "br0" Interface "br0" type: internal Port "enp6s0f0" Interface "enp6s0f0" Port "vxlan0" Interface "vxlan0" type: vxlan options: {key="1", local_ip="10.0.0.5", remote_ip="10.0.0.4"} ovs_version: "2.5.0" ======== (where 10.0.0.5 is an address on enp6s0f1) and sending traffic across it will lead to the following panic: ======== general protection fault: 0000 [#1] SMP PTI CPU: 5 PID: 0 Comm: swapper/5 Not tainted 4.20.0-rc3-ehc+ #701 Hardware name: Dell Inc. PowerEdge R710/0M233H, BIOS 6.4.0 07/23/2013 RIP: 0010:dev_hard_start_xmit+0x38/0x200 Code: 53 48 89 fb 48 83 ec 20 48 85 ff 48 89 54 24 08 48 89 4c 24 18 0f 84 ab 01 00 00 48 8d 86 90 00 00 00 48 89 f5 48 89 44 24 10 <4c> 8b 33 48 c7 03 00 00 00 00 48 8b 05 c7 d1 b3 00 4d 85 f6 0f 95 RSP: 0018:ffff888627b437e0 EFLAGS: 00010202 RAX: 0000000000000000 RBX: dead000000000100 RCX: ffff88862279c000 RDX: ffff888614a342c0 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffff888618a88000 R08: 0000000000000001 R09: 00000000000003e8 R10: 0000000000000000 R11: ffff888614a34140 R12: 0000000000000000 R13: 0000000000000062 R14: dead000000000100 R15: ffff888616430000 FS: 0000000000000000(0000) GS:ffff888627b40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f6d2bc6d000 CR3: 000000000200a000 CR4: 00000000000006e0 Call Trace: __dev_queue_xmit+0x623/0x870 ? masked_flow_lookup+0xf7/0x220 [openvswitch] ? ep_poll_callback+0x101/0x310 do_execute_actions+0xaba/0xaf0 [openvswitch] ? __wake_up_common+0x8a/0x150 ? __wake_up_common_lock+0x87/0xc0 ? queue_userspace_packet+0x31c/0x5b0 [openvswitch] ovs_execute_actions+0x47/0x120 [openvswitch] ovs_dp_process_packet+0x7d/0x110 [openvswitch] ovs_vport_receive+0x6e/0xd0 [openvswitch] ? dst_alloc+0x64/0x90 ? rt_dst_alloc+0x50/0xd0 ? ip_route_input_slow+0x19a/0x9a0 ? __udp_enqueue_schedule_skb+0x198/0x1b0 ? __udp4_lib_rcv+0x856/0xa30 ? __udp4_lib_rcv+0x856/0xa30 ? cpumask_next_and+0x19/0x20 ? find_busiest_group+0x12d/0xcd0 netdev_frame_hook+0xce/0x150 [openvswitch] __netif_receive_skb_core+0x205/0xae0 __netif_receive_skb_list_core+0x11e/0x220 netif_receive_skb_list+0x203/0x460 ? __efx_rx_packet+0x335/0x5e0 [sfc] efx_poll+0x182/0x320 [sfc] net_rx_action+0x294/0x3c0 __do_softirq+0xca/0x297 irq_exit+0xa6/0xb0 do_IRQ+0x54/0xd0 common_interrupt+0xf/0xf ======== So, in all listified-receive handling, instead pull skbs off the lists with skb_list_del_init(). Fixes: 9af86f933894 ("net: core: fix use-after-free in __netif_receive_skb_list_core") Fixes: 7da517a3bc52 ("net: core: Another step of skb receive list processing") Fixes: a4ca8b7df73c ("net: ipv4: fix drop handling in ip_list_rcv() and ip_list_rcv_finish()") Fixes: d8269e2cbf90 ("net: ipv6: listify ipv6_rcv() and ip6_rcv_finish()") Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- net/ipv6/ip6_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 96577e742afd..c1d85830c906 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -95,7 +95,7 @@ static void ip6_list_rcv_finish(struct net *net, struct sock *sk, list_for_each_entry_safe(skb, next, head, list) { struct dst_entry *dst; - list_del(&skb->list); + skb_list_del_init(skb); /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ @@ -296,7 +296,7 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *dev = skb->dev; struct net *net = dev_net(dev); - list_del(&skb->list); + skb_list_del_init(skb); skb = ip6_rcv_core(skb, dev, net); if (skb == NULL) continue; -- cgit v1.2.3 From ebaf39e6032faf77218220707fc3fa22487784e0 Mon Sep 17 00:00:00 2001 From: Jiri Wiesner Date: Wed, 5 Dec 2018 16:55:29 +0100 Subject: ipv4: ipv6: netfilter: Adjust the frag mem limit when truesize changes The *_frag_reasm() functions are susceptible to miscalculating the byte count of packet fragments in case the truesize of a head buffer changes. The truesize member may be changed by the call to skb_unclone(), leaving the fragment memory limit counter unbalanced even if all fragments are processed. This miscalculation goes unnoticed as long as the network namespace which holds the counter is not destroyed. Should an attempt be made to destroy a network namespace that holds an unbalanced fragment memory limit counter the cleanup of the namespace never finishes. The thread handling the cleanup gets stuck in inet_frags_exit_net() waiting for the percpu counter to reach zero. The thread is usually in running state with a stacktrace similar to: PID: 1073 TASK: ffff880626711440 CPU: 1 COMMAND: "kworker/u48:4" #5 [ffff880621563d48] _raw_spin_lock at ffffffff815f5480 #6 [ffff880621563d48] inet_evict_bucket at ffffffff8158020b #7 [ffff880621563d80] inet_frags_exit_net at ffffffff8158051c #8 [ffff880621563db0] ops_exit_list at ffffffff814f5856 #9 [ffff880621563dd8] cleanup_net at ffffffff814f67c0 #10 [ffff880621563e38] process_one_work at ffffffff81096f14 It is not possible to create new network namespaces, and processes that call unshare() end up being stuck in uninterruptible sleep state waiting to acquire the net_mutex. The bug was observed in the IPv6 netfilter code by Per Sundstrom. I thank him for his analysis of the problem. The parts of this patch that apply to IPv4 and IPv6 fragment reassembly are preemptive measures. Signed-off-by: Jiri Wiesner Reported-by: Per Sundstrom Acked-by: Peter Oskolkov Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 7 +++++++ net/ipv6/netfilter/nf_conntrack_reasm.c | 8 +++++++- net/ipv6/reassembly.c | 8 +++++++- 3 files changed, 21 insertions(+), 2 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index d6ee343fdb86..aa0b22697998 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -515,6 +515,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct rb_node *rbn; int len; int ihlen; + int delta; int err; u8 ecn; @@ -556,10 +557,16 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, if (len > 65535) goto out_oversize; + delta = - head->truesize; + /* Head of list must not be cloned. */ if (skb_unclone(head, GFP_ATOMIC)) goto out_nomem; + delta += head->truesize; + if (delta) + add_frag_mem_limit(qp->q.net, delta); + /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index d219979c3e52..181da2c40f9a 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -341,7 +341,7 @@ static bool nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev) { struct sk_buff *fp, *head = fq->q.fragments; - int payload_len; + int payload_len, delta; u8 ecn; inet_frag_kill(&fq->q); @@ -363,10 +363,16 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic return false; } + delta = - head->truesize; + /* Head of list must not be cloned. */ if (skb_unclone(head, GFP_ATOMIC)) return false; + delta += head->truesize; + if (delta) + add_frag_mem_limit(fq->q.net, delta); + /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 5c3c92713096..aa26c45486d9 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -281,7 +281,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, { struct net *net = container_of(fq->q.net, struct net, ipv6.frags); struct sk_buff *fp, *head = fq->q.fragments; - int payload_len; + int payload_len, delta; unsigned int nhoff; int sum_truesize; u8 ecn; @@ -322,10 +322,16 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, if (payload_len > IPV6_MAXPLEN) goto out_oversize; + delta = - head->truesize; + /* Head of list must not be cloned. */ if (skb_unclone(head, GFP_ATOMIC)) goto out_oom; + delta += head->truesize; + if (delta) + add_frag_mem_limit(fq->q.net, delta); + /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ -- cgit v1.2.3 From 1b4e5ad5d6b9f15cd0b5121f86d4719165958417 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Fri, 7 Dec 2018 09:50:17 +0200 Subject: ipv6: sr: properly initialize flowi6 prior passing to ip6_route_output In 'seg6_output', stack variable 'struct flowi6 fl6' was missing initialization. Fixes: 6c8702c60b88 ("ipv6: sr: add support for SRH encapsulation and injection with lwtunnels") Signed-off-by: Shmulik Ladkani Signed-off-by: David S. Miller --- net/ipv6/seg6_iptunnel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv6') diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index a8854dd3e9c5..8181ee7e1e27 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -347,6 +347,7 @@ static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) struct ipv6hdr *hdr = ipv6_hdr(skb); struct flowi6 fl6; + memset(&fl6, 0, sizeof(fl6)); fl6.daddr = hdr->daddr; fl6.saddr = hdr->saddr; fl6.flowlabel = ip6_flowinfo(hdr); -- cgit v1.2.3 From 66033f47ca60294a95fc85ec3a3cc909dab7b765 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Thu, 6 Dec 2018 19:30:36 +0100 Subject: ipv6: Check available headroom in ip6_xmit() even without options Even if we send an IPv6 packet without options, MAX_HEADER might not be enough to account for the additional headroom required by alignment of hardware headers. On a configuration without HYPERV_NET, WLAN, AX25, and with IPV6_TUNNEL, sending short SCTP packets over IPv4 over L2TP over IPv6, we start with 100 bytes of allocated headroom in sctp_packet_transmit(), end up with 54 bytes after l2tp_xmit_skb(), and 14 bytes in ip6_finish_output2(). Those would be enough to append our 14 bytes header, but we're going to align that to 16 bytes, and write 2 bytes out of the allocated slab in neigh_hh_output(). KASan says: [ 264.967848] ================================================================== [ 264.967861] BUG: KASAN: slab-out-of-bounds in ip6_finish_output2+0x1aec/0x1c70 [ 264.967866] Write of size 16 at addr 000000006af1c7fe by task netperf/6201 [ 264.967870] [ 264.967876] CPU: 0 PID: 6201 Comm: netperf Not tainted 4.20.0-rc4+ #1 [ 264.967881] Hardware name: IBM 2827 H43 400 (z/VM 6.4.0) [ 264.967887] Call Trace: [ 264.967896] ([<00000000001347d6>] show_stack+0x56/0xa0) [ 264.967903] [<00000000017e379c>] dump_stack+0x23c/0x290 [ 264.967912] [<00000000007bc594>] print_address_description+0xf4/0x290 [ 264.967919] [<00000000007bc8fc>] kasan_report+0x13c/0x240 [ 264.967927] [<000000000162f5e4>] ip6_finish_output2+0x1aec/0x1c70 [ 264.967935] [<000000000163f890>] ip6_finish_output+0x430/0x7f0 [ 264.967943] [<000000000163fe44>] ip6_output+0x1f4/0x580 [ 264.967953] [<000000000163882a>] ip6_xmit+0xfea/0x1ce8 [ 264.967963] [<00000000017396e2>] inet6_csk_xmit+0x282/0x3f8 [ 264.968033] [<000003ff805fb0ba>] l2tp_xmit_skb+0xe02/0x13e0 [l2tp_core] [ 264.968037] [<000003ff80631192>] l2tp_eth_dev_xmit+0xda/0x150 [l2tp_eth] [ 264.968041] [<0000000001220020>] dev_hard_start_xmit+0x268/0x928 [ 264.968069] [<0000000001330e8e>] sch_direct_xmit+0x7ae/0x1350 [ 264.968071] [<000000000122359c>] __dev_queue_xmit+0x2b7c/0x3478 [ 264.968075] [<00000000013d2862>] ip_finish_output2+0xce2/0x11a0 [ 264.968078] [<00000000013d9b14>] ip_finish_output+0x56c/0x8c8 [ 264.968081] [<00000000013ddd1e>] ip_output+0x226/0x4c0 [ 264.968083] [<00000000013dbd6c>] __ip_queue_xmit+0x894/0x1938 [ 264.968100] [<000003ff80bc3a5c>] sctp_packet_transmit+0x29d4/0x3648 [sctp] [ 264.968116] [<000003ff80b7bf68>] sctp_outq_flush_ctrl.constprop.5+0x8d0/0xe50 [sctp] [ 264.968131] [<000003ff80b7c716>] sctp_outq_flush+0x22e/0x7d8 [sctp] [ 264.968146] [<000003ff80b35c68>] sctp_cmd_interpreter.isra.16+0x530/0x6800 [sctp] [ 264.968161] [<000003ff80b3410a>] sctp_do_sm+0x222/0x648 [sctp] [ 264.968177] [<000003ff80bbddac>] sctp_primitive_ASSOCIATE+0xbc/0xf8 [sctp] [ 264.968192] [<000003ff80b93328>] __sctp_connect+0x830/0xc20 [sctp] [ 264.968208] [<000003ff80bb11ce>] sctp_inet_connect+0x2e6/0x378 [sctp] [ 264.968212] [<0000000001197942>] __sys_connect+0x21a/0x450 [ 264.968215] [<000000000119aff8>] sys_socketcall+0x3d0/0xb08 [ 264.968218] [<000000000184ea7a>] system_call+0x2a2/0x2c0 [...] Just like ip_finish_output2() does for IPv4, check that we have enough headroom in ip6_xmit(), and reallocate it if we don't. This issue is older than git history. Reported-by: Jianlin Shi Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 827a3f5ff3bb..fcd3c66ded16 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -195,37 +195,37 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *first_hop = &fl6->daddr; struct dst_entry *dst = skb_dst(skb); + unsigned int head_room; struct ipv6hdr *hdr; u8 proto = fl6->flowi6_proto; int seg_len = skb->len; int hlimit = -1; u32 mtu; - if (opt) { - unsigned int head_room; + head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); + if (opt) + head_room += opt->opt_nflen + opt->opt_flen; - /* First: exthdrs may take lots of space (~8K for now) - MAX_HEADER is not enough. - */ - head_room = opt->opt_nflen + opt->opt_flen; - seg_len += head_room; - head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); - - if (skb_headroom(skb) < head_room) { - struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); - if (!skb2) { - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_OUTDISCARDS); - kfree_skb(skb); - return -ENOBUFS; - } - if (skb->sk) - skb_set_owner_w(skb2, skb->sk); - consume_skb(skb); - skb = skb2; + if (unlikely(skb_headroom(skb) < head_room)) { + struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); + if (!skb2) { + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + return -ENOBUFS; } + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); + consume_skb(skb); + skb = skb2; + } + + if (opt) { + seg_len += opt->opt_nflen + opt->opt_flen; + if (opt->opt_flen) ipv6_push_frag_opts(skb, opt, &proto); + if (opt->opt_nflen) ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop, &fl6->saddr); -- cgit v1.2.3