From 5bd8baab087dff657e05387aee802e70304cc813 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 13 Apr 2022 10:10:50 +0200 Subject: esp: limit skb_page_frag_refill use to a single page Commit ebe48d368e97 ("esp: Fix possible buffer overflow in ESP transformation") tried to fix skb_page_frag_refill usage in ESP by capping allocsize to 32k, but that doesn't completely solve the issue, as skb_page_frag_refill may return a single page. If that happens, we will write out of bounds, despite the check introduced in the previous patch. This patch forces COW in cases where we would end up calling skb_page_frag_refill with a size larger than a page (first in esp_output_head with tailen, then in esp_output_tail with skb->data_len). Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible") Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/esp.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/esp.h b/include/net/esp.h index 90cd02ff77ef..9c5637d41d95 100644 --- a/include/net/esp.h +++ b/include/net/esp.h @@ -4,8 +4,6 @@ #include -#define ESP_SKB_FRAG_MAXSIZE (PAGE_SIZE << SKB_FRAG_PAGE_ORDER) - struct ip_esp_hdr; static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) -- cgit v1.2.3 From db53cd3d88dc328dea2e968c9c8d3b4294a8a674 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 13 Apr 2022 11:43:20 -0600 Subject: net: Handle l3mdev in ip_tunnel_init_flow Ido reported that the commit referenced in the Fixes tag broke a gre use case with dummy devices. Add a check to ip_tunnel_init_flow to see if the oif is an l3mdev port and if so set the oif to 0 to avoid the oif comparison in fib_lookup_good_nhc. Fixes: 40867d74c374 ("net: Add l3mdev index to flow struct and avoid oif reset for port devices") Reported-by: Ido Schimmel Signed-off-by: David Ahern Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c | 2 +- include/net/ip_tunnels.h | 11 +++++++++-- net/ipv4/ip_gre.c | 4 ++-- net/ipv4/ip_tunnel.c | 9 +++++---- 4 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c index b73466470f75..fe663b0ab708 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -423,7 +423,7 @@ mlxsw_sp_span_gretap4_route(const struct net_device *to_dev, parms = mlxsw_sp_ipip_netdev_parms4(to_dev); ip_tunnel_init_flow(&fl4, parms.iph.protocol, *daddrp, *saddrp, - 0, 0, parms.link, tun->fwmark, 0); + 0, 0, dev_net(to_dev), parms.link, tun->fwmark, 0); rt = ip_route_output_key(tun->net, &fl4); if (IS_ERR(rt)) diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 0219fe907b26..88dee57eac8a 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -243,11 +243,18 @@ static inline __be32 tunnel_id_to_key32(__be64 tun_id) static inline void ip_tunnel_init_flow(struct flowi4 *fl4, int proto, __be32 daddr, __be32 saddr, - __be32 key, __u8 tos, int oif, + __be32 key, __u8 tos, + struct net *net, int oif, __u32 mark, __u32 tun_inner_hash) { memset(fl4, 0, sizeof(*fl4)); - fl4->flowi4_oif = oif; + + if (oif) { + fl4->flowi4_l3mdev = l3mdev_master_upper_ifindex_by_index_rcu(net, oif); + /* Legacy VRF/l3mdev use case */ + fl4->flowi4_oif = fl4->flowi4_l3mdev ? 0 : oif; + } + fl4->daddr = daddr; fl4->saddr = saddr; fl4->flowi4_tos = tos; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 99db2e41ed10..365caebf51ab 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -605,8 +605,8 @@ static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) key = &info->key; ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src, tunnel_id_to_key32(key->tun_id), - key->tos & ~INET_ECN_MASK, 0, skb->mark, - skb_get_hash(skb)); + key->tos & ~INET_ECN_MASK, dev_net(dev), 0, + skb->mark, skb_get_hash(skb)); rt = ip_route_output_key(dev_net(dev), &fl4); if (IS_ERR(rt)) return PTR_ERR(rt); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 5a473319d3a5..94017a8c3994 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -294,8 +294,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev) ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr, iph->saddr, tunnel->parms.o_key, - RT_TOS(iph->tos), tunnel->parms.link, - tunnel->fwmark, 0); + RT_TOS(iph->tos), dev_net(dev), + tunnel->parms.link, tunnel->fwmark, 0); rt = ip_route_output_key(tunnel->net, &fl4); if (!IS_ERR(rt)) { @@ -570,7 +570,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, tunnel_id_to_key32(key->tun_id), RT_TOS(tos), - 0, skb->mark, skb_get_hash(skb)); + dev_net(dev), 0, skb->mark, skb_get_hash(skb)); if (tunnel->encap.type != TUNNEL_ENCAP_NONE) goto tx_error; @@ -726,7 +726,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, - tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link, + tunnel->parms.o_key, RT_TOS(tos), + dev_net(dev), tunnel->parms.link, tunnel->fwmark, skb_get_hash(skb)); if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) -- cgit v1.2.3 From 9cb7c013420f98fa6fd12fc6a5dc055170c108db Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 13 Apr 2022 11:13:33 -0700 Subject: ipv6: make ip6_rt_gc_expire an atomic_t Reads and Writes to ip6_rt_gc_expire always have been racy, as syzbot reported lately [1] There is a possible risk of under-flow, leading to unexpected high value passed to fib6_run_gc(), although I have not observed this in the field. Hosts hitting ip6_dst_gc() very hard are under pretty bad state anyway. [1] BUG: KCSAN: data-race in ip6_dst_gc / ip6_dst_gc read-write to 0xffff888102110744 of 4 bytes by task 13165 on cpu 1: ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311 dst_alloc+0x9b/0x160 net/core/dst.c:86 ip6_dst_alloc net/ipv6/route.c:344 [inline] icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261 mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807 mld_send_cr net/ipv6/mcast.c:2119 [inline] mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 read-write to 0xffff888102110744 of 4 bytes by task 11607 on cpu 0: ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311 dst_alloc+0x9b/0x160 net/core/dst.c:86 ip6_dst_alloc net/ipv6/route.c:344 [inline] icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261 mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807 mld_send_cr net/ipv6/mcast.c:2119 [inline] mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 value changed: 0x00000bb3 -> 0x00000ba9 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 11607 Comm: kworker/0:21 Not tainted 5.18.0-rc1-syzkaller-00037-g42e7a03d3bad-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: mld mld_ifc_work Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220413181333.649424-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv6.h | 4 ++-- net/ipv6/route.c | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 3d83b64471d3..b4af4837d80b 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -75,8 +75,8 @@ struct netns_ipv6 { struct list_head fib6_walkers; rwlock_t fib6_walker_lock; spinlock_t fib6_gc_lock; - unsigned int ip6_rt_gc_expire; - unsigned long ip6_rt_last_gc; + atomic_t ip6_rt_gc_expire; + unsigned long ip6_rt_last_gc; unsigned char flowlabel_has_excl; #ifdef CONFIG_IPV6_MULTIPLE_TABLES bool fib6_has_custom_rules; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 169e9df6d172..c4b6ce017d5e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3292,6 +3292,7 @@ static int ip6_dst_gc(struct dst_ops *ops) int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; + unsigned int val; int entries; entries = dst_entries_get_fast(ops); @@ -3302,13 +3303,13 @@ static int ip6_dst_gc(struct dst_ops *ops) entries <= rt_max_size) goto out; - net->ipv6.ip6_rt_gc_expire++; - fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); + fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); entries = dst_entries_get_slow(ops); if (entries < ops->gc_thresh) - net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; + atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1); out: - net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; + val = atomic_read(&net->ipv6.ip6_rt_gc_expire); + atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); return entries > rt_max_size; } @@ -6509,7 +6510,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; net->ipv6.sysctl.skip_notify_on_dev_down = 0; - net->ipv6.ip6_rt_gc_expire = 30*HZ; + atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ); ret = 0; out: -- cgit v1.2.3