From 04c494e68a1340cb5c70d4704ac32d863dc64293 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 12 May 2022 14:14:56 -0700 Subject: Revert "tcp/dccp: get rid of inet_twsk_purge()" This reverts commits: 0dad4087a86a2cbe177404dc73f18ada26a2c390 ("tcp/dccp: get rid of inet_twsk_purge()") d507204d3c5cc57d9a8bdf0a477615bb59ea1611 ("tcp/dccp: add tw->tw_bslot") As Leonard pointed out, a newly allocated netns can happen to reuse a freed 'struct net'. While TCP TW timers were covered by my patches, other things were not: 1) Lookups in rx path (INET_MATCH() and INET6_MATCH()), as they look at 4-tuple plus the 'struct net' pointer. 2) /proc/net/tcp[6] and inet_diag, same reason. 3) hashinfo->bhash[], same reason. Fixing all this seems risky, lets instead revert. In the future, we might have a per netns tcp hash table, or a per netns list of timewait sockets... Fixes: 0dad4087a86a ("tcp/dccp: get rid of inet_twsk_purge()") Signed-off-by: Eric Dumazet Reported-by: Leonard Crestez Tested-by: Leonard Crestez Signed-off-by: David S. Miller --- net/ipv4/inet_timewait_sock.c | 58 +++++++++++++++++++++++++++++++++++++------ net/ipv4/tcp_ipv4.c | 2 ++ 2 files changed, 53 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 9e0bbd026560..0ec501845cb3 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -52,7 +52,8 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw) spin_unlock(lock); /* Disassociate with bind bucket. */ - bhead = &hashinfo->bhash[tw->tw_bslot]; + bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num, + hashinfo->bhash_size)]; spin_lock(&bhead->lock); inet_twsk_bind_unhash(tw, hashinfo); @@ -111,12 +112,8 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, Note, that any socket with inet->num != 0 MUST be bound in binding cache, even if it is closed. */ - /* Cache inet_bhashfn(), because 'struct net' might be no longer - * available later in inet_twsk_kill(). - */ - tw->tw_bslot = inet_bhashfn(twsk_net(tw), inet->inet_num, - hashinfo->bhash_size); - bhead = &hashinfo->bhash[tw->tw_bslot]; + bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, + hashinfo->bhash_size)]; spin_lock(&bhead->lock); tw->tw_tb = icsk->icsk_bind_hash; WARN_ON(!icsk->icsk_bind_hash); @@ -257,3 +254,50 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) } } EXPORT_SYMBOL_GPL(__inet_twsk_schedule); + +void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family) +{ + struct inet_timewait_sock *tw; + struct sock *sk; + struct hlist_nulls_node *node; + unsigned int slot; + + for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { + struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; +restart_rcu: + cond_resched(); + rcu_read_lock(); +restart: + sk_nulls_for_each_rcu(sk, node, &head->chain) { + if (sk->sk_state != TCP_TIME_WAIT) + continue; + tw = inet_twsk(sk); + if ((tw->tw_family != family) || + refcount_read(&twsk_net(tw)->ns.count)) + continue; + + if (unlikely(!refcount_inc_not_zero(&tw->tw_refcnt))) + continue; + + if (unlikely((tw->tw_family != family) || + refcount_read(&twsk_net(tw)->ns.count))) { + inet_twsk_put(tw); + goto restart; + } + + rcu_read_unlock(); + local_bh_disable(); + inet_twsk_deschedule_put(tw); + local_bh_enable(); + goto restart_rcu; + } + /* If the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != slot) + goto restart; + rcu_read_unlock(); + } +} +EXPORT_SYMBOL_GPL(inet_twsk_purge); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f9cec624068d..457f5b5d5d4a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3173,6 +3173,8 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { struct net *net; + inet_twsk_purge(&tcp_hashinfo, AF_INET); + list_for_each_entry(net, net_exit_list, exit_list) tcp_fastopen_ctx_destroy(net); } -- cgit v1.2.3 From e6175a2ed1f18bf2f649625bf725e07adcfa6a28 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Fri, 13 May 2022 23:34:02 +0300 Subject: xfrm: fix "disable_policy" flag use when arriving from different devices In IPv4 setting the "disable_policy" flag on a device means no policy should be enforced for traffic originating from the device. This was implemented by seting the DST_NOPOLICY flag in the dst based on the originating device. However, dsts are cached in nexthops regardless of the originating devices, in which case, the DST_NOPOLICY flag value may be incorrect. Consider the following setup: +------------------------------+ | ROUTER | +-------------+ | +-----------------+ | | ipsec src |----|-|ipsec0 | | +-------------+ | |disable_policy=0 | +----+ | | +-----------------+ |eth1|-|----- +-------------+ | +-----------------+ +----+ | | noipsec src |----|-|eth0 | | +-------------+ | |disable_policy=1 | | | +-----------------+ | +------------------------------+ Where ROUTER has a default route towards eth1. dst entries for traffic arriving from eth0 would have DST_NOPOLICY and would be cached and therefore can be reused by traffic originating from ipsec0, skipping policy check. Fix by setting a IPSKB_NOPOLICY flag in IPCB and observing it instead of the DST in IN/FWD IPv4 policy checks. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Shmulik Ladkani Signed-off-by: Eyal Birger Signed-off-by: Steffen Klassert --- include/net/ip.h | 1 + include/net/xfrm.h | 14 +++++++++++++- net/ipv4/route.c | 23 ++++++++++++++++++----- 3 files changed, 32 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip.h b/include/net/ip.h index 3984f2c39c4b..0161137914cf 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -56,6 +56,7 @@ struct inet_skb_parm { #define IPSKB_DOREDIRECT BIT(5) #define IPSKB_FRAG_PMTU BIT(6) #define IPSKB_L3SLAVE BIT(7) +#define IPSKB_NOPOLICY BIT(8) u16 frag_max_size; }; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 6fb899ff5afc..d2efddce65d4 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1093,6 +1093,18 @@ static inline bool __xfrm_check_nopolicy(struct net *net, struct sk_buff *skb, return false; } +static inline bool __xfrm_check_dev_nopolicy(struct sk_buff *skb, + int dir, unsigned short family) +{ + if (dir != XFRM_POLICY_OUT && family == AF_INET) { + /* same dst may be used for traffic originating from + * devices with different policy settings. + */ + return IPCB(skb)->flags & IPSKB_NOPOLICY; + } + return skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY); +} + static inline int __xfrm_policy_check2(struct sock *sk, int dir, struct sk_buff *skb, unsigned int family, int reverse) @@ -1104,7 +1116,7 @@ static inline int __xfrm_policy_check2(struct sock *sk, int dir, return __xfrm_policy_check(sk, ndir, skb, family); return __xfrm_check_nopolicy(net, skb, dir) || - (skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY)) || + __xfrm_check_dev_nopolicy(skb, dir, family) || __xfrm_policy_check(sk, ndir, skb, family); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 98c6f3429593..fe5d14ef5c4d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1726,6 +1726,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct in_device *in_dev = __in_dev_get_rcu(dev); unsigned int flags = RTCF_MULTICAST; struct rtable *rth; + bool no_policy; u32 itag = 0; int err; @@ -1736,8 +1737,12 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (our) flags |= RTCF_LOCAL; + no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY); + if (no_policy) + IPCB(skb)->flags |= IPSKB_NOPOLICY; + rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, - IN_DEV_ORCONF(in_dev, NOPOLICY), false); + no_policy, false); if (!rth) return -ENOBUFS; @@ -1795,7 +1800,7 @@ static int __mkroute_input(struct sk_buff *skb, struct rtable *rth; int err; struct in_device *out_dev; - bool do_cache; + bool do_cache, no_policy; u32 itag = 0; /* get a working reference to the output device */ @@ -1840,6 +1845,10 @@ static int __mkroute_input(struct sk_buff *skb, } } + no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY); + if (no_policy) + IPCB(skb)->flags |= IPSKB_NOPOLICY; + fnhe = find_exception(nhc, daddr); if (do_cache) { if (fnhe) @@ -1852,8 +1861,7 @@ static int __mkroute_input(struct sk_buff *skb, } } - rth = rt_dst_alloc(out_dev->dev, 0, res->type, - IN_DEV_ORCONF(in_dev, NOPOLICY), + rth = rt_dst_alloc(out_dev->dev, 0, res->type, no_policy, IN_DEV_ORCONF(out_dev, NOXFRM)); if (!rth) { err = -ENOBUFS; @@ -2228,6 +2236,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct rtable *rth; struct flowi4 fl4; bool do_cache = true; + bool no_policy; /* IP on this device is disabled. */ @@ -2346,6 +2355,10 @@ brd_input: RT_CACHE_STAT_INC(in_brd); local_input: + no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY); + if (no_policy) + IPCB(skb)->flags |= IPSKB_NOPOLICY; + do_cache &= res->fi && !itag; if (do_cache) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); @@ -2360,7 +2373,7 @@ local_input: rth = rt_dst_alloc(ip_rt_get_dev(net, res), flags | RTCF_LOCAL, res->type, - IN_DEV_ORCONF(in_dev, NOPOLICY), false); + no_policy, false); if (!rth) goto e_nobufs; -- cgit v1.2.3