From 58189ca7b27411c3dc9a5cb9eeee0906da684c59 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 15 Sep 2015 15:10:50 -0700 Subject: net: Fix vti use case with oif in dst lookups Steffen reported that the recent change to add oif to dst lookups breaks the VTI use case. The problem is that with the oif set in the flow struct the comparison to the nh_oif is triggered. Fix by splitting the FLOWI_FLAG_VRFSRC into 2 flags -- one that triggers the vrf device cache bypass (FLOWI_FLAG_VRFSRC) and another telling the lookup to not compare nh oif (FLOWI_FLAG_SKIP_NH_OIF). Fixes: 42a7b32b73d6 ("xfrm: Add oif to dst lookups") Signed-off-by: David Ahern Acked-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 2 +- net/ipv4/udp.c | 3 ++- net/ipv4/xfrm4_policy.c | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 26d6ffb6d23c..6c2af797f2f9 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1426,7 +1426,7 @@ found: nh->nh_flags & RTNH_F_LINKDOWN && !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) continue; - if (!(flp->flowi4_flags & FLOWI_FLAG_VRFSRC)) { + if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) { if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) continue; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c0a15e7f359f..f7d1d5e19e95 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1024,7 +1024,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (netif_index_is_vrf(net, ipc.oif)) { flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, - (flow_flags | FLOWI_FLAG_VRFSRC), + (flow_flags | FLOWI_FLAG_VRFSRC | + FLOWI_FLAG_SKIP_NH_OIF), faddr, saddr, dport, inet->inet_sport); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index bb919b28619f..c10a9ee68433 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -33,6 +33,8 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, if (saddr) fl4->saddr = saddr->a4; + fl4->flowi4_flags = FLOWI_FLAG_SKIP_NH_OIF; + rt = __ip_route_output_key(net, fl4); if (!IS_ERR(rt)) return &rt->dst; -- cgit v1.2.3 From c2e7204d180f8efc80f27959ca9cf16fa17f67db Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 Sep 2015 08:38:00 -0700 Subject: tcp_cubic: do not set epoch_start in the future Tracking idle time in bictcp_cwnd_event() is imprecise, as epoch_start is normally set at ACK processing time, not at send time. Doing a proper fix would need to add an additional state variable, and does not seem worth the trouble, given CUBIC bug has been there forever before Jana noticed it. Let's simply not set epoch_start in the future, otherwise bictcp_update() could overflow and CUBIC would again grow cwnd too fast. This was detected thanks to a packetdrill test Neal wrote that was flaky before applying this fix. Fixes: 30927520dbae ("tcp_cubic: better follow cubic curve after idle period") Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Cc: Jana Iyengar Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index c6ded6b2a79f..448c2615fece 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -154,14 +154,20 @@ static void bictcp_init(struct sock *sk) static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_TX_START) { - s32 delta = tcp_time_stamp - tcp_sk(sk)->lsndtime; struct bictcp *ca = inet_csk_ca(sk); + u32 now = tcp_time_stamp; + s32 delta; + + delta = now - tcp_sk(sk)->lsndtime; /* We were application limited (idle) for a while. * Shift epoch_start to keep cwnd growth to cubic curve. */ - if (ca->epoch_start && delta > 0) + if (ca->epoch_start && delta > 0) { ca->epoch_start += delta; + if (after(ca->epoch_start, now)) + ca->epoch_start = now; + } return; } } -- cgit v1.2.3 From 0315e382704817b279e5693dca8ab9d89aa20b3f Mon Sep 17 00:00:00 2001 From: Nikola Forró Date: Thu, 17 Sep 2015 16:01:32 +0200 Subject: net: Fix behaviour of unreachable, blackhole and prohibit routes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Man page of ip-route(8) says following about route types: unreachable - these destinations are unreachable. Packets are dis‐ carded and the ICMP message host unreachable is generated. The local senders get an EHOSTUNREACH error. blackhole - these destinations are unreachable. Packets are dis‐ carded silently. The local senders get an EINVAL error. prohibit - these destinations are unreachable. Packets are discarded and the ICMP message communication administratively prohibited is generated. The local senders get an EACCES error. In the inet6 address family, this was correct, except the local senders got ENETUNREACH error instead of EHOSTUNREACH in case of unreachable route. In the inet address family, all three route types generated ICMP message net unreachable, and the local senders got ENETUNREACH error. In both address families all three route types now behave consistently with documentation. Signed-off-by: Nikola Forró Signed-off-by: David S. Miller --- include/net/ip_fib.h | 30 +++++++++++++++++++----------- net/ipv4/route.c | 6 ++++-- net/ipv6/route.c | 4 +++- 3 files changed, 26 insertions(+), 14 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index a37d0432bebd..727d6e9a9685 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -236,8 +236,11 @@ static inline int fib_lookup(struct net *net, const struct flowi4 *flp, rcu_read_lock(); tb = fib_get_table(net, RT_TABLE_MAIN); - if (tb && !fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF)) - err = 0; + if (tb) + err = fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF); + + if (err == -EAGAIN) + err = -ENETUNREACH; rcu_read_unlock(); @@ -258,7 +261,7 @@ static inline int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags) { struct fib_table *tb; - int err; + int err = -ENETUNREACH; flags |= FIB_LOOKUP_NOREF; if (net->ipv4.fib_has_custom_rules) @@ -268,15 +271,20 @@ static inline int fib_lookup(struct net *net, struct flowi4 *flp, res->tclassid = 0; - for (err = 0; !err; err = -ENETUNREACH) { - tb = rcu_dereference_rtnl(net->ipv4.fib_main); - if (tb && !fib_table_lookup(tb, flp, res, flags)) - break; + tb = rcu_dereference_rtnl(net->ipv4.fib_main); + if (tb) + err = fib_table_lookup(tb, flp, res, flags); + + if (!err) + goto out; + + tb = rcu_dereference_rtnl(net->ipv4.fib_default); + if (tb) + err = fib_table_lookup(tb, flp, res, flags); - tb = rcu_dereference_rtnl(net->ipv4.fib_default); - if (tb && !fib_table_lookup(tb, flp, res, flags)) - break; - } +out: + if (err == -EAGAIN) + err = -ENETUNREACH; rcu_read_unlock(); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 5f4a5565ad8b..c6ad99ad0ffb 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2045,6 +2045,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) struct fib_result res; struct rtable *rth; int orig_oif; + int err = -ENETUNREACH; res.tclassid = 0; res.fi = NULL; @@ -2153,7 +2154,8 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) goto make_route; } - if (fib_lookup(net, fl4, &res, 0)) { + err = fib_lookup(net, fl4, &res, 0); + if (err) { res.fi = NULL; res.table = NULL; if (fl4->flowi4_oif) { @@ -2181,7 +2183,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) res.type = RTN_UNICAST; goto make_route; } - rth = ERR_PTR(-ENETUNREACH); + rth = ERR_PTR(err); goto out; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d5fa50297f80..f204089e854c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1885,9 +1885,11 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret) rt->dst.input = ip6_pkt_prohibit; break; case RTN_THROW: + case RTN_UNREACHABLE: default: rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN - : -ENETUNREACH; + : (cfg->fc_type == RTN_UNREACHABLE) + ? -EHOSTUNREACH : -ENETUNREACH; rt->dst.output = ip6_pkt_discard_out; rt->dst.input = ip6_pkt_discard; break; -- cgit v1.2.3 From bc22a0e2ea03b75b51a1f722f93821744b5b5ff1 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 18 Sep 2015 11:47:40 +0200 Subject: iptunnel: make rx/tx bytes counters consistent This was already done a long time ago in commit 64194c31a0b6 ("inet: Make tunnel RX/TX byte counters more consistent") but tx path was broken (at least since 3.10). Before the patch the gre header was included on tx. After the patch: $ ping -c1 192.168.0.121 ; ip -s l ls dev gre1 PING 192.168.0.121 (192.168.0.121) 56(84) bytes of data. 64 bytes from 192.168.0.121: icmp_req=1 ttl=64 time=2.95 ms --- 192.168.0.121 ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 2.955/2.955/2.955/0.000 ms 7: gre1@NONE: mtu 1468 qdisc noqueue state UNKNOWN mode DEFAULT group default link/gre 10.16.0.249 peer 10.16.0.121 RX: bytes packets errors dropped overrun mcast 84 1 0 0 0 0 TX: bytes packets errors dropped carrier collsns 84 1 0 0 0 0 Reported-by: Julien Meunier Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 29ed6c5a5185..9b97204b8c81 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -51,7 +51,7 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, __be16 df, bool xnet) { - int pkt_len = skb->len; + int pkt_len = skb->len - skb_inner_network_offset(skb); struct iphdr *iph; int err; -- cgit v1.2.3 From ed2e923945892a8372ab70d2f61d364b0b6d9054 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 19 Sep 2015 09:08:34 -0700 Subject: tcp/dccp: fix timewait races in timer handling When creating a timewait socket, we need to arm the timer before allowing other cpus to find it. The signal allowing cpus to find the socket is setting tw_refcnt to non zero value. As we set tw_refcnt in __inet_twsk_hashdance(), we therefore need to call inet_twsk_schedule() first. This also means we need to remove tw_refcnt changes from inet_twsk_schedule() and let the caller handle it. Note that because we use mod_timer_pinned(), we have the guarantee the timer wont expire before we set tw_refcnt as we run in BH context. To make things more readable I introduced inet_twsk_reschedule() helper. When rearming the timer, we can use mod_timer_pending() to make sure we do not rearm a canceled timer. Note: This bug can possibly trigger if packets of a flow can hit multiple cpus. This does not normally happen, unless flow steering is broken somehow. This explains this bug was spotted ~5 months after its introduction. A similar fix is needed for SYN_RECV sockets in reqsk_queue_hash_req(), but will be provided in a separate patch for proper tracking. Fixes: 789f558cfb36 ("tcp/dccp: get rid of central timewait timer") Signed-off-by: Eric Dumazet Reported-by: Ying Cai Signed-off-by: David S. Miller --- include/net/inet_timewait_sock.h | 14 +++++++++++++- net/dccp/minisocks.c | 4 ++-- net/ipv4/inet_timewait_sock.c | 16 ++++++++++------ net/ipv4/tcp_minisocks.c | 13 ++++++------- 4 files changed, 31 insertions(+), 16 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 879d6e5a973b..186f3a1e1b1f 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -110,7 +110,19 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, struct inet_hashinfo *hashinfo); -void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo); +void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, + bool rearm); + +static void inline inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo) +{ + __inet_twsk_schedule(tw, timeo, false); +} + +static void inline inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo) +{ + __inet_twsk_schedule(tw, timeo, true); +} + void inet_twsk_deschedule_put(struct inet_timewait_sock *tw); void inet_twsk_purge(struct inet_hashinfo *hashinfo, diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 30addee2dd03..838f524cf11a 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -48,8 +48,6 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) tw->tw_ipv6only = sk->sk_ipv6only; } #endif - /* Linkage updates. */ - __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) @@ -60,6 +58,8 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) timeo = DCCP_TIMEWAIT_LEN; inet_twsk_schedule(tw, timeo); + /* Linkage updates. */ + __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index ae22cc24fbe8..c67f9bd7699c 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -123,13 +123,15 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, /* * Step 2: Hash TW into tcp ehash chain. * Notes : - * - tw_refcnt is set to 3 because : + * - tw_refcnt is set to 4 because : * - We have one reference from bhash chain. * - We have one reference from ehash chain. + * - We have one reference from timer. + * - One reference for ourself (our caller will release it). * We can use atomic_set() because prior spin_lock()/spin_unlock() * committed into memory all tw fields. */ - atomic_set(&tw->tw_refcnt, 1 + 1 + 1); + atomic_set(&tw->tw_refcnt, 4); inet_twsk_add_node_rcu(tw, &ehead->chain); /* Step 3: Remove SK from hash chain */ @@ -217,7 +219,7 @@ void inet_twsk_deschedule_put(struct inet_timewait_sock *tw) } EXPORT_SYMBOL(inet_twsk_deschedule_put); -void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo) +void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) { /* timeout := RTO * 3.5 * @@ -245,12 +247,14 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo) */ tw->tw_kill = timeo <= 4*HZ; - if (!mod_timer_pinned(&tw->tw_timer, jiffies + timeo)) { - atomic_inc(&tw->tw_refcnt); + if (!rearm) { + BUG_ON(mod_timer_pinned(&tw->tw_timer, jiffies + timeo)); atomic_inc(&tw->tw_dr->tw_count); + } else { + mod_timer_pending(&tw->tw_timer, jiffies + timeo); } } -EXPORT_SYMBOL_GPL(inet_twsk_schedule); +EXPORT_SYMBOL_GPL(__inet_twsk_schedule); void inet_twsk_purge(struct inet_hashinfo *hashinfo, struct inet_timewait_death_row *twdr, int family) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 6d8795b066ac..def765911ff8 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -162,9 +162,9 @@ kill_with_rst: if (tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && tcp_tw_remember_stamp(tw)) - inet_twsk_schedule(tw, tw->tw_timeout); + inet_twsk_reschedule(tw, tw->tw_timeout); else - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } @@ -201,7 +201,7 @@ kill: return TCP_TW_SUCCESS; } } - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; @@ -251,7 +251,7 @@ kill: * Do not reschedule in the last case. */ if (paws_reject || th->ack) - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); @@ -322,9 +322,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } while (0); #endif - /* Linkage updates. */ - __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); - /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) timeo = rto; @@ -338,6 +335,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } inet_twsk_schedule(tw, timeo); + /* Linkage updates. */ + __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this -- cgit v1.2.3 From 29c6852602e259d2c1882f320b29d5c3fec0de04 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 19 Sep 2015 09:48:04 -0700 Subject: inet: fix races in reqsk_queue_hash_req() Before allowing lockless LISTEN processing, we need to make sure to arm the SYN_RECV timer before the req socket is visible in hash tables. Also, req->rsk_hash should be written before we set rsk_refcnt to a non zero value. Fixes: fa76ce7328b2 ("inet: get rid of central tcp/dccp listener timer") Signed-off-by: Eric Dumazet Cc: Ying Cai Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 134957159c27..7bb9c39e0a4d 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -685,20 +685,20 @@ void reqsk_queue_hash_req(struct request_sock_queue *queue, req->num_timeout = 0; req->sk = NULL; + setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); + mod_timer_pinned(&req->rsk_timer, jiffies + timeout); + req->rsk_hash = hash; + /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. */ smp_wmb(); atomic_set(&req->rsk_refcnt, 2); - setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); - req->rsk_hash = hash; spin_lock(&queue->syn_wait_lock); req->dl_next = lopt->syn_table[hash]; lopt->syn_table[hash] = req; spin_unlock(&queue->syn_wait_lock); - - mod_timer_pinned(&req->rsk_timer, jiffies + timeout); } EXPORT_SYMBOL(reqsk_queue_hash_req); -- cgit v1.2.3 From 675ee231d960af2af3606b4480324e26797eb010 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 23 Sep 2015 14:00:21 -0700 Subject: tcp: add proper TS val into RST packets RST packets sent on behalf of TCP connections with TS option (RFC 7323 TCP timestamps) have incorrect TS val (set to 0), but correct TS ecr. A > B: Flags [S], seq 0, win 65535, options [mss 1000,nop,nop,TS val 100 ecr 0], length 0 B > A: Flags [S.], seq 2444755794, ack 1, win 28960, options [mss 1460,nop,nop,TS val 7264344 ecr 100], length 0 A > B: Flags [.], ack 1, win 65535, options [nop,nop,TS val 110 ecr 7264344], length 0 B > A: Flags [R.], seq 1, ack 1, win 28960, options [nop,nop,TS val 0 ecr 110], length 0 We need to call skb_mstamp_get() to get proper TS val, derived from skb->skb_mstamp Note that RFC 1323 was advocating to not send TS option in RST segment, but RFC 7323 recommends the opposite : Once TSopt has been successfully negotiated, that is both and contain TSopt, the TSopt MUST be sent in every non- segment for the duration of the connection, and SHOULD be sent in an segment (see Section 5.2 for details) Note this RFC recommends to send TS val = 0, but we believe it is premature : We do not know if all TCP stacks are properly handling the receive side : When an segment is received, it MUST NOT be subjected to the PAWS check by verifying an acceptable value in SEG.TSval, and information from the Timestamps option MUST NOT be used to update connection state information. SEG.TSecr MAY be used to provide stricter acceptance checks. In 5 years, if/when all TCP stack are RFC 7323 ready, we might consider to decide to send TS val = 0, if it buys something. Fixes: 7faee5c0d514 ("tcp: remove TCP_SKB_CB(skb)->when") Signed-off-by: Eric Dumazet Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f9a8a12b62ee..1100ffe4a722 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2897,6 +2897,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) skb_reserve(skb, MAX_TCP_HEADER); tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); + skb_mstamp_get(&skb->skb_mstamp); /* Send it off. */ if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); -- cgit v1.2.3 From 63d008a4e9ee86614ca5671b7f3ba447df007190 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Tue, 22 Sep 2015 18:12:11 +0200 Subject: ipv4: send arp replies to the correct tunnel When using ip lwtunnels, the additional data for xmit (basically, the actual tunnel to use) are carried in ip_tunnel_info either in dst->lwtstate or in metadata dst. When replying to ARP requests, we need to send the reply to the same tunnel the request came from. This means we need to construct proper metadata dst for ARP replies. We could perform another route lookup to get a dst entry with the correct lwtstate. However, this won't always ensure that the outgoing tunnel is the same as the incoming one, and it won't work anyway for IPv4 duplicate address detection. The only thing to do is to "reverse" the ip_tunnel_info. Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 2 ++ net/ipv4/arp.c | 39 +++++++++++++++++++++++++-------------- net/ipv4/ip_tunnel_core.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 14 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 9a6a3ba888e8..f6dafec9102c 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -276,6 +276,8 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto); int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, u8 proto, u8 tos, u8 ttl, __be16 df, bool xnet); +struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, + gfp_t flags); struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, bool gre_csum, int gso_type_mask); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 30409b75e925..f03db8b7abee 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -113,6 +113,8 @@ #include #include #include +#include +#include #include @@ -296,7 +298,8 @@ static void arp_send_dst(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, - const unsigned char *target_hw, struct sk_buff *oskb) + const unsigned char *target_hw, + struct dst_entry *dst) { struct sk_buff *skb; @@ -309,9 +312,7 @@ static void arp_send_dst(int type, int ptype, __be32 dest_ip, if (!skb) return; - if (oskb) - skb_dst_copy(skb, oskb); - + skb_dst_set(skb, dst); arp_xmit(skb); } @@ -333,6 +334,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) __be32 target = *(__be32 *)neigh->primary_key; int probes = atomic_read(&neigh->probes); struct in_device *in_dev; + struct dst_entry *dst = NULL; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); @@ -381,9 +383,10 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) } } + if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE)) + dst = dst_clone(skb_dst(skb)); arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, - dst_hw, dev->dev_addr, NULL, - dev->priv_flags & IFF_XMIT_DST_RELEASE ? NULL : skb); + dst_hw, dev->dev_addr, NULL, dst); } static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) @@ -649,6 +652,7 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) int addr_type; struct neighbour *n; struct net *net = dev_net(dev); + struct dst_entry *reply_dst = NULL; bool is_garp = false; /* arp_rcv below verifies the ARP header and verifies the device @@ -749,13 +753,18 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) * cache. */ + if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb)) + reply_dst = (struct dst_entry *) + iptunnel_metadata_reply(skb_metadata_dst(skb), + GFP_ATOMIC); + /* Special case: IPv4 duplicate address detection packet (RFC2131) */ if (sip == 0) { if (arp->ar_op == htons(ARPOP_REQUEST) && inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL && !arp_ignore(in_dev, sip, tip)) - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, - dev->dev_addr, sha); + arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, + sha, dev->dev_addr, sha, reply_dst); goto out; } @@ -774,9 +783,10 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) if (!dont_send) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) { - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, - dev, tip, sha, dev->dev_addr, - sha); + arp_send_dst(ARPOP_REPLY, ETH_P_ARP, + sip, dev, tip, sha, + dev->dev_addr, sha, + reply_dst); neigh_release(n); } } @@ -794,9 +804,10 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) { - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, - dev, tip, sha, dev->dev_addr, - sha); + arp_send_dst(ARPOP_REPLY, ETH_P_ARP, + sip, dev, tip, sha, + dev->dev_addr, sha, + reply_dst); } else { pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 9b97204b8c81..ce3a1e728606 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -46,6 +46,7 @@ #include #include #include +#include int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, @@ -119,6 +120,33 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) } EXPORT_SYMBOL_GPL(iptunnel_pull_header); +struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, + gfp_t flags) +{ + struct metadata_dst *res; + struct ip_tunnel_info *dst, *src; + + if (!md || md->u.tun_info.mode & IP_TUNNEL_INFO_TX) + return NULL; + + res = metadata_dst_alloc(0, flags); + if (!res) + return NULL; + + dst = &res->u.tun_info; + src = &md->u.tun_info; + dst->key.tun_id = src->key.tun_id; + if (src->mode & IP_TUNNEL_INFO_IPV6) + memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src, + sizeof(struct in6_addr)); + else + dst->key.u.ipv4.dst = src->key.u.ipv4.src; + dst->mode = src->mode | IP_TUNNEL_INFO_TX; + + return res; +} +EXPORT_SYMBOL_GPL(iptunnel_metadata_reply); + struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, bool csum_help, int gso_type_mask) -- cgit v1.2.3 From b194f30c61efb0767a98f47a64530baa8b731670 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Tue, 22 Sep 2015 18:12:12 +0200 Subject: lwtunnel: remove source and destination UDP port config option The UDP tunnel config is asymmetric wrt. to the ports used. The source and destination ports from one direction of the tunnel are not related to the ports of the other direction. We need to be able to respond to ARP requests using the correct ports without involving routing. As the consequence, UDP ports need to be fixed property of the tunnel interface and cannot be set per route. Remove the ability to set ports per route. This is still okay to do, as no kernel has been released with these attributes yet. Note that the ability to specify source and destination ports is preserved for other users of the lwtunnel API which don't use routes for tunnel key specification (like openvswitch). If in the future we rework ARP handling to allow port specification, the attributes can be added back. Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/uapi/linux/lwtunnel.h | 4 ---- net/ipv4/ip_tunnel_core.c | 24 ------------------------ 2 files changed, 28 deletions(-) (limited to 'net/ipv4') diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 34141a5dfe74..f8b01887a495 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -21,8 +21,6 @@ enum lwtunnel_ip_t { LWTUNNEL_IP_SRC, LWTUNNEL_IP_TTL, LWTUNNEL_IP_TOS, - LWTUNNEL_IP_SPORT, - LWTUNNEL_IP_DPORT, LWTUNNEL_IP_FLAGS, __LWTUNNEL_IP_MAX, }; @@ -36,8 +34,6 @@ enum lwtunnel_ip6_t { LWTUNNEL_IP6_SRC, LWTUNNEL_IP6_HOPLIMIT, LWTUNNEL_IP6_TC, - LWTUNNEL_IP6_SPORT, - LWTUNNEL_IP6_DPORT, LWTUNNEL_IP6_FLAGS, __LWTUNNEL_IP6_MAX, }; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index ce3a1e728606..84dce6a92f93 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -226,8 +226,6 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { [LWTUNNEL_IP_SRC] = { .type = NLA_U32 }, [LWTUNNEL_IP_TTL] = { .type = NLA_U8 }, [LWTUNNEL_IP_TOS] = { .type = NLA_U8 }, - [LWTUNNEL_IP_SPORT] = { .type = NLA_U16 }, - [LWTUNNEL_IP_DPORT] = { .type = NLA_U16 }, [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 }, }; @@ -267,12 +265,6 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, if (tb[LWTUNNEL_IP_TOS]) tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); - if (tb[LWTUNNEL_IP_SPORT]) - tun_info->key.tp_src = nla_get_be16(tb[LWTUNNEL_IP_SPORT]); - - if (tb[LWTUNNEL_IP_DPORT]) - tun_info->key.tp_dst = nla_get_be16(tb[LWTUNNEL_IP_DPORT]); - if (tb[LWTUNNEL_IP_FLAGS]) tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP_FLAGS]); @@ -294,8 +286,6 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || - nla_put_u16(skb, LWTUNNEL_IP_SPORT, tun_info->key.tp_src) || - nla_put_u16(skb, LWTUNNEL_IP_DPORT, tun_info->key.tp_dst) || nla_put_u16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags)) return -ENOMEM; @@ -309,8 +299,6 @@ static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) + nla_total_size(4) /* LWTUNNEL_IP_SRC */ + nla_total_size(1) /* LWTUNNEL_IP_TOS */ + nla_total_size(1) /* LWTUNNEL_IP_TTL */ - + nla_total_size(2) /* LWTUNNEL_IP_SPORT */ - + nla_total_size(2) /* LWTUNNEL_IP_DPORT */ + nla_total_size(2); /* LWTUNNEL_IP_FLAGS */ } @@ -333,8 +321,6 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { [LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) }, [LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 }, [LWTUNNEL_IP6_TC] = { .type = NLA_U8 }, - [LWTUNNEL_IP6_SPORT] = { .type = NLA_U16 }, - [LWTUNNEL_IP6_DPORT] = { .type = NLA_U16 }, [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 }, }; @@ -374,12 +360,6 @@ static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr, if (tb[LWTUNNEL_IP6_TC]) tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]); - if (tb[LWTUNNEL_IP6_SPORT]) - tun_info->key.tp_src = nla_get_be16(tb[LWTUNNEL_IP6_SPORT]); - - if (tb[LWTUNNEL_IP6_DPORT]) - tun_info->key.tp_dst = nla_get_be16(tb[LWTUNNEL_IP6_DPORT]); - if (tb[LWTUNNEL_IP6_FLAGS]) tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP6_FLAGS]); @@ -401,8 +381,6 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb, nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.ttl) || - nla_put_u16(skb, LWTUNNEL_IP6_SPORT, tun_info->key.tp_src) || - nla_put_u16(skb, LWTUNNEL_IP6_DPORT, tun_info->key.tp_dst) || nla_put_u16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags)) return -ENOMEM; @@ -416,8 +394,6 @@ static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate) + nla_total_size(16) /* LWTUNNEL_IP6_SRC */ + nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */ + nla_total_size(1) /* LWTUNNEL_IP6_TC */ - + nla_total_size(2) /* LWTUNNEL_IP6_SPORT */ - + nla_total_size(2) /* LWTUNNEL_IP6_DPORT */ + nla_total_size(2); /* LWTUNNEL_IP6_FLAGS */ } -- cgit v1.2.3 From bdb06cbf77cb01911694cc9076ffa8196b7b9b61 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 24 Sep 2015 15:31:29 -0600 Subject: net: Fix panic in icmp_route_lookup Andrey reported a panic: [ 7249.865507] BUG: unable to handle kernel pointer dereference at 000000b4 [ 7249.865559] IP: [] icmp_route_lookup+0xaa/0x320 [ 7249.865598] *pdpt = 0000000030f7f001 *pde = 0000000000000000 [ 7249.865637] Oops: 0000 [#1] ... [ 7249.866811] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.3.0-999-generic #201509220155 [ 7249.866876] Hardware name: MSI MS-7250/MS-7250, BIOS 080014 08/02/2006 [ 7249.866916] task: c1a5ab00 ti: c1a52000 task.ti: c1a52000 [ 7249.866949] EIP: 0060:[] EFLAGS: 00210246 CPU: 0 [ 7249.866981] EIP is at icmp_route_lookup+0xaa/0x320 [ 7249.867012] EAX: 00000000 EBX: f483ba48 ECX: 00000000 EDX: f2e18a00 [ 7249.867045] ESI: 000000c0 EDI: f483ba70 EBP: f483b9ec ESP: f483b974 [ 7249.867077] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 [ 7249.867108] CR0: 8005003b CR2: 000000b4 CR3: 36ee07c0 CR4: 000006f0 [ 7249.867141] Stack: [ 7249.867165] 320310ee 00000000 00000042 320310ee 00000000 c1aeca00 f3920240 f0c69180 [ 7249.867268] f483ba04 f855058b a89b66cd f483ba44 f8962f4b 00000000 e659266c f483ba54 [ 7249.867361] 8004753c f483ba5c f8962f4b f2031140 000003c1 ffbd8fa0 c16b0e00 00000064 [ 7249.867448] Call Trace: [ 7249.867494] [] ? e1000_xmit_frame+0x87b/0xdc0 [e1000e] [ 7249.867534] [] ? tcp_in_window+0xeb/0xb10 [nf_conntrack] [ 7249.867576] [] ? tcp_in_window+0xeb/0xb10 [nf_conntrack] [ 7249.867615] [] ? icmp_send+0xa0/0x380 [ 7249.867648] [] icmp_send+0x2cf/0x380 [ 7249.867681] [] nf_send_unreach+0xa6/0xc0 [nf_reject_ipv4] [ 7249.867714] [] reject_tg+0x7a/0x9f [ipt_REJECT] [ 7249.867746] [] ipt_do_table+0x317/0x70c [ip_tables] [ 7249.867780] [] ? __nf_conntrack_find_get+0x166/0x3b0 [nf_conntrack] [ 7249.867838] [] ? nf_conntrack_in+0x398/0x600 [nf_conntrack] [ 7249.867889] [] iptable_filter_hook+0x35/0x80 [iptable_filter] [ 7249.867933] [] nf_iterate+0x71/0x80 [ 7249.867970] [] nf_hook_slow+0x65/0xc0 [ 7249.868002] [] __ip_local_out_sk+0xc1/0xd0 [ 7249.868034] [] ? ip_forward_options+0x1a0/0x1a0 [ 7249.868066] [] ip_local_out_sk+0x16/0x30 [ 7249.868097] [] ip_send_skb+0x14/0x80 [ 7249.868129] [] ip_push_pending_frames+0x34/0x40 [ 7249.868163] [] ip_send_unicast_reply+0x282/0x310 [ 7249.868196] [] tcp_v4_send_reset+0x1b3/0x380 [ 7249.868227] [] tcp_v4_rcv+0x323/0x990 [ 7249.868257] [] ? nf_iterate+0x71/0x80 [ 7249.868289] [] ip_local_deliver_finish+0x8b/0x230 [ 7249.868322] [] ip_local_deliver+0x4c/0xa0 [ 7249.868353] [] ? ip_rcv_finish+0x390/0x390 [ 7249.868384] [] ip_rcv_finish+0x7c/0x390 [ 7249.868415] [] ip_rcv+0x2e0/0x420 ... Prior to the VRF change the oif was not set in the flow struct, so the VRF support should really have only added the vrf_master_ifindex lookup. Fixes: 613d09b30f8b ("net: Use VRF device index for lookups on TX") Cc: Andrey Melnikov Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 79fe05befcae..e5eb8ac4089d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -427,7 +427,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.flowi4_mark = mark; fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; - fl4.flowi4_oif = vrf_master_ifindex(skb->dev) ? : skb->dev->ifindex; + fl4.flowi4_oif = vrf_master_ifindex(skb->dev); security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) @@ -461,7 +461,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; - fl4->flowi4_oif = vrf_master_ifindex(skb_in->dev) ? : skb_in->dev->ifindex; + fl4->flowi4_oif = vrf_master_ifindex(skb_in->dev); security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); rt = __ip_route_output_key(net, fl4); -- cgit v1.2.3 From b84f78782052ee4516903e5d0566a5eee365b771 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 29 Sep 2015 19:07:07 -0700 Subject: net: Initialize flow flags in input path The fib_table_lookup tracepoint found 2 places where the flowi4_flags is not initialized. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 1 + net/ipv4/route.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 6fcbd215cdbc..690bcbc59f26 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -340,6 +340,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_tun_key.tun_id = 0; + fl4.flowi4_flags = 0; no_addr = idev->ifa_list == NULL; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c6ad99ad0ffb..c81deb85acb4 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1737,6 +1737,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.flowi4_mark = skb->mark; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.flowi4_flags = 0; fl4.daddr = daddr; fl4.saddr = saddr; err = fib_lookup(net, &fl4, &res, 0); -- cgit v1.2.3