diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-15 11:56:19 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-15 11:56:19 -0800 |
commit | 5bbcc0f595fadb4cac0eddc4401035ec0bd95b09 (patch) | |
tree | 3b65e490cc36a6c6fecac1fa24d9e0ac9ced4455 /net/ipv4/tcp_recovery.c | |
parent | 892204e06cb9e89fbc4b299a678f9ca358e97cac (diff) | |
parent | 50895b9de1d3e0258e015e8e55128d835d9a9f19 (diff) | |
download | linux-5bbcc0f595fadb4cac0eddc4401035ec0bd95b09.tar.bz2 |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
"Highlights:
1) Maintain the TCP retransmit queue using an rbtree, with 1GB
windows at 100Gb this really has become necessary. From Eric
Dumazet.
2) Multi-program support for cgroup+bpf, from Alexei Starovoitov.
3) Perform broadcast flooding in hardware in mv88e6xxx, from Andrew
Lunn.
4) Add meter action support to openvswitch, from Andy Zhou.
5) Add a data meta pointer for BPF accessible packets, from Daniel
Borkmann.
6) Namespace-ify almost all TCP sysctl knobs, from Eric Dumazet.
7) Turn on Broadcom Tags in b53 driver, from Florian Fainelli.
8) More work to move the RTNL mutex down, from Florian Westphal.
9) Add 'bpftool' utility, to help with bpf program introspection.
From Jakub Kicinski.
10) Add new 'cpumap' type for XDP_REDIRECT action, from Jesper
Dangaard Brouer.
11) Support 'blocks' of transformations in the packet scheduler which
can span multiple network devices, from Jiri Pirko.
12) TC flower offload support in cxgb4, from Kumar Sanghvi.
13) Priority based stream scheduler for SCTP, from Marcelo Ricardo
Leitner.
14) Thunderbolt networking driver, from Amir Levy and Mika Westerberg.
15) Add RED qdisc offloadability, and use it in mlxsw driver. From
Nogah Frankel.
16) eBPF based device controller for cgroup v2, from Roman Gushchin.
17) Add some fundamental tracepoints for TCP, from Song Liu.
18) Remove garbage collection from ipv6 route layer, this is a
significant accomplishment. From Wei Wang.
19) Add multicast route offload support to mlxsw, from Yotam Gigi"
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (2177 commits)
tcp: highest_sack fix
geneve: fix fill_info when link down
bpf: fix lockdep splat
net: cdc_ncm: GetNtbFormat endian fix
openvswitch: meter: fix NULL pointer dereference in ovs_meter_cmd_reply_start
netem: remove unnecessary 64 bit modulus
netem: use 64 bit divide by rate
tcp: Namespace-ify sysctl_tcp_default_congestion_control
net: Protect iterations over net::fib_notifier_ops in fib_seq_sum()
ipv6: set all.accept_dad to 0 by default
uapi: fix linux/tls.h userspace compilation error
usbnet: ipheth: prevent TX queue timeouts when device not ready
vhost_net: conditionally enable tx polling
uapi: fix linux/rxrpc.h userspace compilation errors
net: stmmac: fix LPI transitioning for dwmac4
atm: horizon: Fix irq release error
net-sysfs: trigger netlink notification on ifalias change via sysfs
openvswitch: Using kfree_rcu() to simplify the code
openvswitch: Make local function ovs_nsh_key_attr_size() static
openvswitch: Fix return value check in ovs_meter_cmd_features()
...
Diffstat (limited to 'net/ipv4/tcp_recovery.c')
-rw-r--r-- | net/ipv4/tcp_recovery.c | 102 |
1 files changed, 65 insertions, 37 deletions
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index be8ef1e5dfef..d3ea89020c69 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -2,8 +2,6 @@ #include <linux/tcp.h> #include <net/tcp.h> -int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION; - static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -46,7 +44,8 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; + u32 min_rtt = tcp_min_rtt(tp); + struct sk_buff *skb, *n; u32 reo_wnd; *reo_timeout = 0; @@ -56,48 +55,36 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) * to queuing or delayed ACKs. */ reo_wnd = 1000; - if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U) - reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); + if ((tp->rack.reord || !tp->lost_out) && min_rtt != ~0U) { + reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd); + reo_wnd = min(reo_wnd, tp->srtt_us >> 3); + } - tcp_for_write_queue(skb, sk) { + list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue, + tcp_tsorted_anchor) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + s32 remaining; - if (skb == tcp_send_head(sk)) - break; - - /* Skip ones already (s)acked */ - if (!after(scb->end_seq, tp->snd_una) || - scb->sacked & TCPCB_SACKED_ACKED) + /* Skip ones marked lost but not yet retransmitted */ + if ((scb->sacked & TCPCB_LOST) && + !(scb->sacked & TCPCB_SACKED_RETRANS)) continue; - if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, - tp->rack.end_seq, scb->end_seq)) { - /* Step 3 in draft-cheng-tcpm-rack-00.txt: - * A packet is lost if its elapsed time is beyond - * the recent RTT plus the reordering window. - */ - u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp, - skb->skb_mstamp); - s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed; - - if (remaining < 0) { - tcp_rack_mark_skb_lost(sk, skb); - continue; - } - - /* Skip ones marked lost but not yet retransmitted */ - if ((scb->sacked & TCPCB_LOST) && - !(scb->sacked & TCPCB_SACKED_RETRANS)) - continue; + if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, + tp->rack.end_seq, scb->end_seq)) + break; + /* A packet is lost if it has not been s/acked beyond + * the recent RTT plus the reordering window. + */ + remaining = tp->rack.rtt_us + reo_wnd - + tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp); + if (remaining < 0) { + tcp_rack_mark_skb_lost(sk, skb); + list_del_init(&skb->tcp_tsorted_anchor); + } else { /* Record maximum wait time (+1 to avoid 0) */ *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining); - - } else if (!(scb->sacked & TCPCB_RETRANS)) { - /* Original data are sent sequentially so stop early - * b/c the rest are all sent after rack_sent - */ - break; } } } @@ -176,3 +163,44 @@ void tcp_rack_reo_timeout(struct sock *sk) if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS) tcp_rearm_rto(sk); } + +/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries. + * + * If DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded + * by srtt), since there is possibility that spurious retransmission was + * due to reordering delay longer than reo_wnd. + * + * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16) + * no. of successful recoveries (accounts for full DSACK-based loss + * recovery undo). After that, reset it to default (min_rtt/4). + * + * At max, reo_wnd is incremented only once per rtt. So that the new + * DSACK on which we are reacting, is due to the spurious retx (approx) + * after the reo_wnd has been updated last time. + * + * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than + * absolute value to account for change in rtt. + */ +void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_STATIC_REO_WND || + !rs->prior_delivered) + return; + + /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */ + if (before(rs->prior_delivered, tp->rack.last_delivered)) + tp->rack.dsack_seen = 0; + + /* Adjust the reo_wnd if update is pending */ + if (tp->rack.dsack_seen) { + tp->rack.reo_wnd_steps = min_t(u32, 0xFF, + tp->rack.reo_wnd_steps + 1); + tp->rack.dsack_seen = 0; + tp->rack.last_delivered = tp->delivered; + tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH; + } else if (!tp->rack.reo_wnd_persist) { + tp->rack.reo_wnd_steps = 1; + } +} |