From 9bdfb3b79e61c60e1a3e2dc05ad164528afa6b8a Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sun, 21 Feb 2016 10:12:39 +0300 Subject: tcp: convert cached rtt from usec to jiffies when feeding initial rto Currently it's converted into msecs, thus HZ=1000 intact. Signed-off-by: Konstantin Khlebnikov Fixes: 740b0f1841f6 ("tcp: switch rtt estimations to usec resolution") Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index c8cbc2b4b792..a726d7853ce5 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -550,7 +550,7 @@ reset: */ if (crtt > tp->srtt_us) { /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */ - crtt /= 8 * USEC_PER_MSEC; + crtt /= 8 * USEC_PER_SEC / HZ; inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk)); } else if (tp->srtt_us == 0) { /* RFC6298: 5.7 We've failed to get a valid RTT sample from -- cgit v1.2.3 From 5146d1f151122e868e594c7b45115d64825aee5f Mon Sep 17 00:00:00 2001 From: Bernie Harris Date: Mon, 22 Feb 2016 12:58:05 +1300 Subject: tunnel: Clear IPCB(skb)->opt before dst_link_failure called IPCB may contain data from previous layers (in the observed case the qdisc layer). In the observed scenario, the data was misinterpreted as ip header options, which later caused the ihl to be set to an invalid value (<5). This resulted in an infinite loop in the mips implementation of ip_fast_csum. This patch clears IPCB(skb)->opt before dst_link_failure can be called for various types of tunnels. This change only applies to encapsulated ipv4 packets. The code introduced in 11c21a30 which clears all of IPCB has been removed to be consistent with these changes, and instead the opt field is cleared unconditionally in ip_tunnel_xmit. The change in ip_tunnel_xmit applies to SIT, GRE, and IPIP tunnels. The relevant vti, l2tp, and pptp functions already contain similar code for clearing the IPCB. Signed-off-by: Bernie Harris Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 3 ++- net/ipv4/udp_tunnel.c | 2 ++ net/ipv6/ip6_gre.c | 2 ++ net/ipv6/ip6_tunnel.c | 2 ++ 4 files changed, 8 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 89e8861e05fc..336e6892a93c 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -661,6 +661,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, inner_iph = (const struct iphdr *)skb_inner_network_header(skb); connected = (tunnel->parms.iph.daddr != 0); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + dst = tnl_params->daddr; if (dst == 0) { /* NBMA tunnel */ @@ -758,7 +760,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); dst_link_failure(skb); } else tunnel->err_count = 0; diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 0ec08814f37d..96599d1a1318 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -89,6 +89,8 @@ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb uh->source = src_port; uh->len = htons(skb->len); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + udp_set_csum(nocheck, skb, src, dst, skb->len); iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index a69aad1e29d1..c0d4dc1c5ea4 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -777,6 +777,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) __u32 mtu; int err; + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 137fca42aaa6..6c5dfec7a377 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1180,6 +1180,8 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; int err; + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + tproto = ACCESS_ONCE(t->parms.proto); if (tproto != IPPROTO_IPIP && tproto != 0) return -1; -- cgit v1.2.3 From a8c4a2522a0808c5c2143612909717d1115c40cf Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 22 Feb 2016 18:43:25 +0100 Subject: ipv4: only create late gso-skb if skb is already set up with CHECKSUM_PARTIAL Otherwise we break the contract with GSO to only pass CHECKSUM_PARTIAL skbs down. This can easily happen with UDP+IPv4 sockets with the first MSG_MORE write smaller than the MTU, second write is a sendfile. Returning -EOPNOTSUPP lets the callers fall back into normal sendmsg path, were we calculate the checksum manually during copying. Commit d749c9cbffd6 ("ipv4: no CHECKSUM_PARTIAL on MSG_MORE corked sockets") started to exposes this bug. Fixes: d749c9cbffd6 ("ipv4: no CHECKSUM_PARTIAL on MSG_MORE corked sockets") Reported-by: Jiri Benc Cc: Jiri Benc Reported-by: Wakko Warner Cc: Wakko Warner Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 64878efa045c..565bf64b2b7d 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1236,13 +1236,16 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, if (!skb) return -EINVAL; - cork->length += size; if ((size + skb->len > mtu) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO)) { + if (skb->ip_summed != CHECKSUM_PARTIAL) + return -EOPNOTSUPP; + skb_shinfo(skb)->gso_size = mtu - fragheaderlen; skb_shinfo(skb)->gso_type = SKB_GSO_UDP; } + cork->length += size; while (size > 0) { if (skb_is_gso(skb)) { -- cgit v1.2.3 From 1837b2e2bcd23137766555a63867e649c0b637f0 Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Mon, 29 Feb 2016 15:03:33 -0800 Subject: mld, igmp: Fix reserved tailroom calculation The current reserved_tailroom calculation fails to take hlen and tlen into account. skb: [__hlen__|__data____________|__tlen___|__extra__] ^ ^ head skb_end_offset In this representation, hlen + data + tlen is the size passed to alloc_skb. "extra" is the extra space made available in __alloc_skb because of rounding up by kmalloc. We can reorder the representation like so: [__hlen__|__data____________|__extra__|__tlen___] ^ ^ head skb_end_offset The maximum space available for ip headers and payload without fragmentation is min(mtu, data + extra). Therefore, reserved_tailroom = data + extra + tlen - min(mtu, data + extra) = skb_end_offset - hlen - min(mtu, skb_end_offset - hlen - tlen) = skb_tailroom - min(mtu, skb_tailroom - tlen) ; after skb_reserve(hlen) Compare the second line to the current expression: reserved_tailroom = skb_end_offset - min(mtu, skb_end_offset) and we can see that hlen and tlen are not taken into account. The min() in the third line can be expanded into: if mtu < skb_tailroom - tlen: reserved_tailroom = skb_tailroom - mtu else: reserved_tailroom = tlen Depending on hlen, tlen, mtu and the number of multicast address records, the current code may output skbs that have less tailroom than dev->needed_tailroom or it may output more skbs than needed because not all space available is used. Fixes: 4c672e4b ("ipv6: mld: fix add_grhead skb_over_panic for devs with large MTUs") Signed-off-by: Benjamin Poirier Acked-by: Hannes Frederic Sowa Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/skbuff.h | 24 ++++++++++++++++++++++++ net/ipv4/igmp.c | 3 +-- net/ipv6/mcast.c | 3 +-- 3 files changed, 26 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4ce9ff7086f4..d3fcd4591ce4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1985,6 +1985,30 @@ static inline void skb_reserve(struct sk_buff *skb, int len) skb->tail += len; } +/** + * skb_tailroom_reserve - adjust reserved_tailroom + * @skb: buffer to alter + * @mtu: maximum amount of headlen permitted + * @needed_tailroom: minimum amount of reserved_tailroom + * + * Set reserved_tailroom so that headlen can be as large as possible but + * not larger than mtu and tailroom cannot be smaller than + * needed_tailroom. + * The required headroom should already have been reserved before using + * this function. + */ +static inline void skb_tailroom_reserve(struct sk_buff *skb, unsigned int mtu, + unsigned int needed_tailroom) +{ + SKB_LINEAR_ASSERT(skb); + if (mtu < skb_tailroom(skb) - needed_tailroom) + /* use at most mtu */ + skb->reserved_tailroom = skb_tailroom(skb) - mtu; + else + /* use up to all available space */ + skb->reserved_tailroom = needed_tailroom; +} + #define ENCAP_TYPE_ETHER 0 #define ENCAP_TYPE_IPPROTO 1 diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 05e4cba14162..b3086cf27027 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -356,9 +356,8 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) skb_dst_set(skb, &rt->dst); skb->dev = dev; - skb->reserved_tailroom = skb_end_offset(skb) - - min(mtu, skb_end_offset(skb)); skb_reserve(skb, hlen); + skb_tailroom_reserve(skb, mtu, tlen); skb_reset_network_header(skb); pip = ip_hdr(skb); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 5ee56d0a8699..d64ee7e83664 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1574,9 +1574,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) return NULL; skb->priority = TC_PRIO_CONTROL; - skb->reserved_tailroom = skb_end_offset(skb) - - min(mtu, skb_end_offset(skb)); skb_reserve(skb, hlen); + skb_tailroom_reserve(skb, mtu, tlen); if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) { /* : -- cgit v1.2.3 From a9d99ce28ed359d68cf6f3c1a69038aefedf6d6a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 6 Mar 2016 09:29:21 -0800 Subject: tcp: fix tcpi_segs_in after connection establishment If final packet (ACK) of 3WHS is lost, it appears we do not properly account the following incoming segment into tcpi_segs_in While we are at it, starts segs_in with one, to count the SYN packet. We do not yet count number of SYN we received for a request sock, we might add this someday. packetdrill script showing proper behavior after fix : // Tests tcpi_segs_in when 3rd packet (ACK) of 3WHS is lost 0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0 bind(3, ..., ...) = 0 +0 listen(3, 1) = 0 +0 < S 0:0(0) win 32792 +0 > S. 0:0(0) ack 1 +.020 < P. 1:1001(1000) ack 1 win 32792 +0 accept(3, ..., ...) = 4 +.000 %{ assert tcpi_segs_in == 2, 'tcpi_segs_in=%d' % tcpi_segs_in }% Fixes: 2efd055c53c06 ("tcp: add tcpi_segs_in and tcpi_segs_out to tcp_info") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_minisocks.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 75632a925824..9b02af2139d3 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -455,7 +455,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; - newtp->segs_in = 0; + newtp->segs_in = 1; newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; @@ -815,6 +815,7 @@ int tcp_child_process(struct sock *parent, struct sock *child, int ret = 0; int state = child->sk_state; + tcp_sk(child)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); if (!sock_owned_by_user(child)) { ret = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ -- cgit v1.2.3