From ed0dfffd7dcd3f517b1507929642c2aed4ef00fb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Jan 2016 08:36:43 -0800 Subject: udp: fix potential infinite loop in SO_REUSEPORT logic Using a combination of connected and un-connected sockets, Dmitry was able to trigger soft lockups with his fuzzer. The problem is that sockets in the SO_REUSEPORT array might have different scores. Right after sk2=socket(), setsockopt(sk2,...,SO_REUSEPORT, on) and bind(sk2, ...), but _before_ the connect(sk2) is done, sk2 is added into the soreuseport array, with a score which is smaller than the score of first socket sk1 found in hash table (I am speaking of the regular UDP hash table), if sk1 had the connect() done, giving a +8 to its score. hash bucket [X] -> sk1 -> sk2 -> NULL sk1 score = 14 (because it did a connect()) sk2 score = 6 SO_REUSEPORT fast selection is an optimization. If it turns out the score of the selected socket does not match score of first socket, just fallback to old SO_REUSEPORT logic instead of trying to be too smart. Normal SO_REUSEPORT users do not mix different kind of sockets, as this mechanism is used for load balance traffic. Fixes: e32ea7e74727 ("soreuseport: fast reuseport UDP socket selection") Reported-by: Dmitry Vyukov Signed-off-by: Eric Dumazet Cc: Craig Gallek Acked-by: Craig Gallek Signed-off-by: David S. Miller --- net/ipv4/udp.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index dc45b538e237..be0b21852b13 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -499,6 +499,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, struct sock *sk, *result; struct hlist_nulls_node *node; int score, badness, matches = 0, reuseport = 0; + bool select_ok = true; u32 hash = 0; begin: @@ -512,14 +513,18 @@ begin: badness = score; reuseport = sk->sk_reuseport; if (reuseport) { - struct sock *sk2; hash = udp_ehashfn(net, daddr, hnum, saddr, sport); - sk2 = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - if (sk2) { - result = sk2; - goto found; + if (select_ok) { + struct sock *sk2; + + sk2 = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + if (sk2) { + result = sk2; + select_ok = false; + goto found; + } } matches = 1; } @@ -563,6 +568,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; int score, badness, matches = 0, reuseport = 0; + bool select_ok = true; u32 hash = 0; rcu_read_lock(); @@ -601,14 +607,18 @@ begin: badness = score; reuseport = sk->sk_reuseport; if (reuseport) { - struct sock *sk2; hash = udp_ehashfn(net, daddr, hnum, saddr, sport); - sk2 = reuseport_select_sock(sk, hash, skb, + if (select_ok) { + struct sock *sk2; + + sk2 = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); - if (sk2) { - result = sk2; - goto found; + if (sk2) { + result = sk2; + select_ok = false; + goto found; + } } matches = 1; } -- cgit v1.2.3 From 7c1306723ee916ea9f1fa7d9e4c7a6d029ca7aaf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 20 Jan 2016 16:25:01 -0800 Subject: net: diag: support v4mapped sockets in inet_diag_find_one_icsk() Lorenzo reported that we could not properly find v4mapped sockets in inet_diag_find_one_icsk(). This patch fixes the issue. Reported-by: Lorenzo Colitti Signed-off-by: Eric Dumazet Acked-by: Lorenzo Colitti Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 8bb8e7ad8548..6029157a19ed 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -361,13 +361,20 @@ struct sock *inet_diag_find_one_icsk(struct net *net, req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if); #if IS_ENABLED(CONFIG_IPV6) - else if (req->sdiag_family == AF_INET6) - sk = inet6_lookup(net, hashinfo, - (struct in6_addr *)req->id.idiag_dst, - req->id.idiag_dport, - (struct in6_addr *)req->id.idiag_src, - req->id.idiag_sport, - req->id.idiag_if); + else if (req->sdiag_family == AF_INET6) { + if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && + ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) + sk = inet_lookup(net, hashinfo, req->id.idiag_dst[3], + req->id.idiag_dport, req->id.idiag_src[3], + req->id.idiag_sport, req->id.idiag_if); + else + sk = inet6_lookup(net, hashinfo, + (struct in6_addr *)req->id.idiag_dst, + req->id.idiag_dport, + (struct in6_addr *)req->id.idiag_src, + req->id.idiag_sport, + req->id.idiag_if); + } #endif else return ERR_PTR(-EINVAL); -- cgit v1.2.3 From e62a123b8ef7c5dc4db2c16383d506860ad21b47 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Jan 2016 08:02:54 -0800 Subject: tcp: fix NULL deref in tcp_v4_send_ack() Neal reported crashes with this stack trace : RIP: 0010:[] tcp_v4_send_ack+0x41/0x20f ... CR2: 0000000000000018 CR3: 000000044005c000 CR4: 00000000001427e0 ... [] tcp_v4_reqsk_send_ack+0xa5/0xb4 [] tcp_check_req+0x2ea/0x3e0 [] tcp_rcv_state_process+0x850/0x2500 [] tcp_v4_do_rcv+0x141/0x330 [] sk_backlog_rcv+0x21/0x30 [] tcp_recvmsg+0x75d/0xf90 [] inet_recvmsg+0x80/0xa0 [] sock_aio_read+0xee/0x110 [] do_sync_read+0x6f/0xa0 [] SyS_read+0x1e1/0x290 [] system_call_fastpath+0x16/0x1b The problem here is the skb we provide to tcp_v4_send_ack() had to be parked in the backlog of a new TCP fastopen child because this child was owned by the user at the time an out of window packet arrived. Before queuing a packet, TCP has to set skb->dev to NULL as the device could disappear before packet is removed from the queue. Fix this issue by using the net pointer provided by the socket (being a timewait or a request socket). IPv6 is immune to the bug : tcp_v6_send_response() already gets the net pointer from the socket if provided. Fixes: 168a8f58059a ("tcp: TCP Fast Open Server - main code path") Reported-by: Neal Cardwell Signed-off-by: Eric Dumazet Cc: Jerry Chu Cc: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c7d1fb50f381..2a67244f97ca 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -708,7 +708,8 @@ release_sk1: outside socket context is ugly, certainly. What can I do? */ -static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, +static void tcp_v4_send_ack(struct net *net, + struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, int reply_flags, u8 tos) @@ -723,7 +724,6 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, ]; } rep; struct ip_reply_arg arg; - struct net *net = dev_net(skb_dst(skb)->dev); memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&arg, 0, sizeof(arg)); @@ -785,7 +785,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcp_v4_send_ack(sock_net(sk), skb, + tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp + tcptw->tw_ts_offset, tcptw->tw_ts_recent, @@ -804,8 +805,10 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ - tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? - tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, + u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : + tcp_sk(sk)->snd_nxt; + + tcp_v4_send_ack(sock_net(sk), skb, seq, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd, tcp_time_stamp, req->ts_recent, -- cgit v1.2.3 From 32b6170ca59ccf07d0e394561e54b2cd9726038c Mon Sep 17 00:00:00 2001 From: Thomas Egerer Date: Mon, 25 Jan 2016 12:58:44 +0100 Subject: ipv4+ipv6: Make INET*_ESP select CRYPTO_ECHAINIV The ESP algorithms using CBC mode require echainiv. Hence INET*_ESP have to select CRYPTO_ECHAINIV in order to work properly. This solves the issues caused by a misconfiguration as described in [1]. The original approach, patching crypto/Kconfig was turned down by Herbert Xu [2]. [1] https://lists.strongswan.org/pipermail/users/2015-December/009074.html [2] http://marc.info/?l=linux-crypto-vger&m=145224655809562&w=2 Signed-off-by: Thomas Egerer Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 1 + net/ipv6/Kconfig | 1 + 2 files changed, 2 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index c22920525e5d..775824720b6b 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -353,6 +353,7 @@ config INET_ESP select CRYPTO_CBC select CRYPTO_SHA1 select CRYPTO_DES + select CRYPTO_ECHAINIV ---help--- Support for IPsec ESP. diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index bb7dabe2ebbf..40c897515ddc 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -69,6 +69,7 @@ config INET6_ESP select CRYPTO_CBC select CRYPTO_SHA1 select CRYPTO_DES + select CRYPTO_ECHAINIV ---help--- Support for IPsec ESP. -- cgit v1.2.3 From 8282f27449bf15548cb82c77b6e04ee0ab827bdc Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Fri, 22 Jan 2016 15:49:12 -0800 Subject: inet: frag: Always orphan skbs inside ip_defrag() Later parts of the stack (including fragmentation) expect that there is never a socket attached to frag in a frag_list, however this invariant was not enforced on all defrag paths. This could lead to the BUG_ON(skb->sk) during ip_do_fragment(), as per the call stack at the end of this commit message. While the call could be added to openvswitch to fix this particular error, the head and tail of the frags list are already orphaned indirectly inside ip_defrag(), so it seems like the remaining fragments should all be orphaned in all circumstances. kernel BUG at net/ipv4/ip_output.c:586! [...] Call Trace: [] ? do_output.isra.29+0x1b0/0x1b0 [openvswitch] [] ovs_fragment+0xcc/0x214 [openvswitch] [] ? dst_discard_out+0x20/0x20 [] ? dst_ifdown+0x80/0x80 [] ? find_bucket.isra.2+0x62/0x70 [openvswitch] [] ? mod_timer_pending+0x65/0x210 [] ? __lock_acquire+0x3db/0x1b90 [] ? nf_conntrack_in+0x252/0x500 [nf_conntrack] [] ? __lock_is_held+0x54/0x70 [] do_output.isra.29+0xe3/0x1b0 [openvswitch] [] do_execute_actions+0xe11/0x11f0 [openvswitch] [] ? __lock_is_held+0x54/0x70 [] ovs_execute_actions+0x32/0xd0 [openvswitch] [] ovs_dp_process_packet+0x85/0x140 [openvswitch] [] ? __lock_is_held+0x54/0x70 [] ovs_execute_actions+0xb2/0xd0 [openvswitch] [] ovs_dp_process_packet+0x85/0x140 [openvswitch] [] ? ovs_ct_get_labels+0x49/0x80 [openvswitch] [] ovs_vport_receive+0x5d/0xa0 [openvswitch] [] ? __lock_acquire+0x3db/0x1b90 [] ? __lock_acquire+0x3db/0x1b90 [] ? __lock_acquire+0x3db/0x1b90 [] ? internal_dev_xmit+0x5/0x140 [openvswitch] [] internal_dev_xmit+0x6c/0x140 [openvswitch] [] ? internal_dev_xmit+0x5/0x140 [openvswitch] [] dev_hard_start_xmit+0x2b9/0x5e0 [] ? netif_skb_features+0xd1/0x1f0 [] __dev_queue_xmit+0x800/0x930 [] ? __dev_queue_xmit+0x50/0x930 [] ? mark_held_locks+0x71/0x90 [] ? neigh_resolve_output+0x106/0x220 [] dev_queue_xmit+0x10/0x20 [] neigh_resolve_output+0x178/0x220 [] ? ip_finish_output2+0x1ff/0x590 [] ip_finish_output2+0x1ff/0x590 [] ? ip_finish_output2+0x7e/0x590 [] ip_do_fragment+0x831/0x8a0 [] ? ip_copy_metadata+0x1b0/0x1b0 [] ip_fragment.constprop.49+0x43/0x80 [] ip_finish_output+0x17c/0x340 [] ? nf_hook_slow+0xe4/0x190 [] ip_output+0x70/0x110 [] ? ip_fragment.constprop.49+0x80/0x80 [] ip_local_out+0x39/0x70 [] ip_send_skb+0x19/0x40 [] ip_push_pending_frames+0x33/0x40 [] icmp_push_reply+0xea/0x120 [] icmp_reply.constprop.23+0x1ed/0x230 [] icmp_echo.part.21+0x4e/0x50 [] ? __lock_is_held+0x54/0x70 [] ? rcu_read_lock_held+0x5e/0x70 [] icmp_echo+0x36/0x70 [] icmp_rcv+0x271/0x450 [] ip_local_deliver_finish+0x127/0x3a0 [] ? ip_local_deliver_finish+0x41/0x3a0 [] ip_local_deliver+0x60/0xd0 [] ? ip_rcv_finish+0x560/0x560 [] ip_rcv_finish+0xdd/0x560 [] ip_rcv+0x283/0x3e0 [] ? match_held_lock+0x192/0x200 [] ? inet_del_offload+0x40/0x40 [] __netif_receive_skb_core+0x392/0xae0 [] ? process_backlog+0x8e/0x230 [] ? mark_held_locks+0x71/0x90 [] __netif_receive_skb+0x18/0x60 [] process_backlog+0x78/0x230 [] ? process_backlog+0xdd/0x230 [] net_rx_action+0x155/0x400 [] __do_softirq+0xcc/0x420 [] ? ip_finish_output2+0x217/0x590 [] do_softirq_own_stack+0x1c/0x30 [] do_softirq+0x4e/0x60 [] __local_bh_enable_ip+0xa8/0xb0 [] ip_finish_output2+0x240/0x590 [] ? ip_do_fragment+0x831/0x8a0 [] ip_do_fragment+0x831/0x8a0 [] ? ip_copy_metadata+0x1b0/0x1b0 [] ip_fragment.constprop.49+0x43/0x80 [] ip_finish_output+0x17c/0x340 [] ? nf_hook_slow+0xe4/0x190 [] ip_output+0x70/0x110 [] ? ip_fragment.constprop.49+0x80/0x80 [] ip_local_out+0x39/0x70 [] ip_send_skb+0x19/0x40 [] ip_push_pending_frames+0x33/0x40 [] raw_sendmsg+0x7d3/0xc30 [] ? __lock_acquire+0x3db/0x1b90 [] ? inet_sendmsg+0xc7/0x1d0 [] ? __lock_is_held+0x54/0x70 [] inet_sendmsg+0x10a/0x1d0 [] ? inet_sendmsg+0x5/0x1d0 [] sock_sendmsg+0x38/0x50 [] ___sys_sendmsg+0x25f/0x270 [] ? handle_mm_fault+0x8dd/0x1320 [] ? _raw_spin_unlock+0x27/0x40 [] ? __do_page_fault+0x1e2/0x460 [] ? __fget_light+0x66/0x90 [] __sys_sendmsg+0x42/0x80 [] SyS_sendmsg+0x12/0x20 [] entry_SYSCALL_64_fastpath+0x12/0x6f Code: 00 00 44 89 e0 e9 7c fb ff ff 4c 89 ff e8 e7 e7 ff ff 41 8b 9d 80 00 00 00 2b 5d d4 89 d8 c1 f8 03 0f b7 c0 e9 33 ff ff f 66 66 66 2e 0f 1f 84 00 00 00 00 00 66 66 66 66 90 55 48 RIP [] ip_do_fragment+0x892/0x8a0 RSP Fixes: 7f8a436eaa2c ("openvswitch: Add conntrack action") Signed-off-by: Joe Stringer Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 1 + net/ipv4/netfilter/nf_defrag_ipv4.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 3f00810b7288..187c6fcc3027 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -661,6 +661,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) struct ipq *qp; IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); + skb_orphan(skb); /* Lookup (or create) queue header */ qp = ip_find(net, ip_hdr(skb), user, vif); diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 6fb869f646bf..a04dee536b8e 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -27,8 +27,6 @@ static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, { int err; - skb_orphan(skb); - local_bh_disable(); err = ip_defrag(net, skb, user); local_bh_enable(); -- cgit v1.2.3 From d88270eef4b56bd7973841dd1fed387ccfa83709 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 25 Jan 2016 14:01:53 -0800 Subject: tcp: fix tcp_mark_head_lost to check skb len before fragmenting This commit fixes a corner case in tcp_mark_head_lost() which was causing the WARN_ON(len > skb->len) in tcp_fragment() to fire. tcp_mark_head_lost() was assuming that if a packet has tcp_skb_pcount(skb) of N, then it's safe to fragment off a prefix of M*mss bytes, for any M < N. But with the tricky way TCP pcounts are maintained, this is not always true. For example, suppose the sender sends 4 1-byte packets and have the last 3 packet sacked. It will merge the last 3 packets in the write queue into an skb with pcount = 3 and len = 3 bytes. If another recovery happens after a sack reneging event, tcp_mark_head_lost() may attempt to split the skb assuming it has more than 2*MSS bytes. This sounds very counterintuitive, but as the commit description for the related commit c0638c247f55 ("tcp: don't fragment SACKed skbs in tcp_mark_head_lost()") notes, this is because tcp_shifted_skb() coalesces adjacent regions of SACKed skbs, and when doing this it preserves the sum of their packet counts in order to reflect the real-world dynamics on the wire. The c0638c247f55 commit tried to avoid problems by not fragmenting SACKed skbs, since SACKed skbs are where the non-proportionality between pcount and skb->len/mss is known to be possible. However, that commit did not handle the case where during a reneging event one of these weird SACKed skbs becomes an un-SACKed skb, which tcp_mark_head_lost() can then try to fragment. The fix is to simply mark the entire skb lost when this happens. This makes the recovery slightly more aggressive in such corner cases before we detect reordering. But once we detect reordering this code path is by-passed because FACK is disabled. Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0003d409fec5..d2ad4337b63d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2164,8 +2164,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int cnt, oldcnt; - int err; + int cnt, oldcnt, lost; unsigned int mss; /* Use SACK to deduce losses of new sequences sent during recovery */ const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; @@ -2205,9 +2204,10 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) break; mss = tcp_skb_mss(skb); - err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, - mss, GFP_ATOMIC); - if (err < 0) + /* If needed, chop off the prefix to mark as lost. */ + lost = (packets - oldcnt) * mss; + if (lost < skb->len && + tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0) break; cnt = packets; } -- cgit v1.2.3 From ff5d749772018602c47509bdc0093ff72acd82ec Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 Jan 2016 10:52:43 -0800 Subject: tcp: beware of alignments in tcp_get_info() With some combinations of user provided flags in netlink command, it is possible to call tcp_get_info() with a buffer that is not 8-bytes aligned. It does matter on some arches, so we need to use put_unaligned() to store the u64 fields. Current iproute2 package does not trigger this particular issue. Fixes: 0df48c26d841 ("tcp: add tcpi_bytes_acked to tcp_info") Fixes: 977cb0ecf82e ("tcp: add pacing_rate information into tcp_info") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index fd17eec93525..19746b3fcbbe 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -279,6 +279,7 @@ #include #include +#include #include int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; @@ -2638,6 +2639,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; unsigned int start; + u64 rate64; u32 rate; memset(info, 0, sizeof(*info)); @@ -2703,15 +2705,17 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_total_retrans = tp->total_retrans; rate = READ_ONCE(sk->sk_pacing_rate); - info->tcpi_pacing_rate = rate != ~0U ? rate : ~0ULL; + rate64 = rate != ~0U ? rate : ~0ULL; + put_unaligned(rate64, &info->tcpi_pacing_rate); rate = READ_ONCE(sk->sk_max_pacing_rate); - info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL; + rate64 = rate != ~0U ? rate : ~0ULL; + put_unaligned(rate64, &info->tcpi_max_pacing_rate); do { start = u64_stats_fetch_begin_irq(&tp->syncp); - info->tcpi_bytes_acked = tp->bytes_acked; - info->tcpi_bytes_received = tp->bytes_received; + put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked); + put_unaligned(tp->bytes_received, &info->tcpi_bytes_received); } while (u64_stats_fetch_retry_irq(&tp->syncp, start)); info->tcpi_segs_out = tp->segs_out; info->tcpi_segs_in = tp->segs_in; -- cgit v1.2.3 From 63e51b6a24f1bee5363056b7aee3a468b12c546b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Jan 2016 16:59:42 -0800 Subject: ipv4: early demux should be aware of fragments We should not assume a valid protocol header is present, as this is not the case for IPv4 fragments. Lets avoid extra cache line misses and potential bugs if we actually find a socket and incorrectly uses its dst. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index b1209b63381f..d77eb0c3b684 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -316,7 +316,10 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; - if (sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk) { + if (sysctl_ip_early_demux && + !skb_dst(skb) && + !skb->sk && + !ip_is_fragment(iph)) { const struct net_protocol *ipprot; int protocol = iph->protocol; -- cgit v1.2.3 From 52b79e2bdf92b07b37c805c50811eaf69a33683d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 28 Jan 2016 17:39:24 +0100 Subject: ipv4: ipconfig: avoid unused ic_proto_used symbol When CONFIG_PROC_FS, CONFIG_IP_PNP_BOOTP, CONFIG_IP_PNP_DHCP and CONFIG_IP_PNP_RARP are all disabled, we get a warning about the ic_proto_used variable being unused: net/ipv4/ipconfig.c:146:12: error: 'ic_proto_used' defined but not used [-Werror=unused-variable] This avoids the warning, by making the definition conditional on whether a dynamic IP configuration protocol is configured. If not, we know that the value is always zero, so we can optimize away the variable and all code that depends on it. Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/ipv4/ipconfig.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 67f7c9de0b16..2ed9dd2b5f2f 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -143,7 +143,11 @@ static char dhcp_client_identifier[253] __initdata; /* Persistent data: */ +#ifdef IPCONFIG_DYNAMIC static int ic_proto_used; /* Protocol used, if any */ +#else +#define ic_proto_used 0 +#endif static __be32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */ static u8 ic_domain[64]; /* DNS (not NIS) domain name */ -- cgit v1.2.3 From a5829f536b3d11a57617e83d4bcb2b7d70671e98 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 28 Jan 2016 13:42:24 -0800 Subject: fib_trie: Fix shift by 32 in fib_table_lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fib_table_lookup function had a shift by 32 that triggered a UBSAN warning. This was due to the fact that I had placed the shift first and then followed it with the check for the suffix length to ignore the undefined behavior. If we reorder this so that we verify the suffix is less than 32 before shifting the value we can avoid the issue. Reported-by: Toralf Förster Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 744e5936c10d..22e73171ea63 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1396,9 +1396,10 @@ found: struct fib_info *fi = fa->fa_info; int nhsel, err; - if ((index >= (1ul << fa->fa_slen)) && - ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen != KEYLENGTH))) - continue; + if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) { + if (index >= (1ul << fa->fa_slen)) + continue; + } if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) continue; if (fi->fib_dead) -- cgit v1.2.3 From 99b4dd9f2423130875ac486fe587cd103c64f753 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 29 Jan 2016 15:11:50 -0800 Subject: tcp: avoid cwnd undo after receiving ECN RFC 4015 section 3.4 says the TCP sender MUST refrain from reversing the congestion control state when the ACK signals congestion through the ECN-Echo flag. Currently we may not always do that when prior_ssthresh is reset upon receiving ACKs with ECE marks. This patch fixes that. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d2ad4337b63d..1c2a73406261 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2366,8 +2366,6 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) tp->snd_ssthresh = tp->prior_ssthresh; tcp_ecn_withdraw_cwr(tp); } - } else { - tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); } tp->snd_cwnd_stamp = tcp_time_stamp; tp->undo_marker = 0; -- cgit v1.2.3