diff options
author | Yuchung Cheng <ycheng@google.com> | 2017-01-12 22:11:36 -0800 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2017-01-13 22:37:16 -0500 |
commit | a0370b3f3f2cfb8b424b04c0545414abaa53f5ee (patch) | |
tree | 3b5e28b49001af777d99edfa784d9c8f1256616c /net | |
parent | 98e36d449cc681f1bb2ce2230243f7f977a7da1b (diff) | |
download | linux-a0370b3f3f2cfb8b424b04c0545414abaa53f5ee.tar.bz2 |
tcp: enable RACK loss detection to trigger recovery
This patch changes two things:
1. Start fast recovery with RACK in addition to other heuristics
(e.g., DUPACK threshold, FACK). Prior to this change RACK
is enabled to detect losses only after the recovery has
started by other algorithms.
2. Disable TCP early retransmit. RACK subsumes the early retransmit
with the new reordering timer feature. A latter patch in this
series removes the early retransmit code.
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r-- | net/ipv4/tcp_input.c | 29 | ||||
-rw-r--r-- | net/ipv4/tcp_recovery.c | 16 |
2 files changed, 31 insertions, 14 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9c98dc874825..4ad75b8c4fee 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2129,10 +2129,25 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) * F.e. after RTO, when all the queue is considered as lost, * lost_out = packets_out and in_flight = retrans_out. * - * Essentially, we have now two algorithms counting + * Essentially, we have now a few algorithms detecting * lost packets. * - * FACK: It is the simplest heuristics. As soon as we decided + * If the receiver supports SACK: + * + * RFC6675/3517: It is the conventional algorithm. A packet is + * considered lost if the number of higher sequence packets + * SACKed is greater than or equal the DUPACK thoreshold + * (reordering). This is implemented in tcp_mark_head_lost and + * tcp_update_scoreboard. + * + * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm + * (2017-) that checks timing instead of counting DUPACKs. + * Essentially a packet is considered lost if it's not S/ACKed + * after RTT + reordering_window, where both metrics are + * dynamically measured and adjusted. This is implemented in + * tcp_rack_mark_lost. + * + * FACK: it is the simplest heuristics. As soon as we decided * that something is lost, we decide that _all_ not SACKed * packets until the most forward SACK are lost. I.e. * lost_out = fackets_out - sacked_out and left_out = fackets_out. @@ -2141,16 +2156,14 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) * takes place. We use FACK by default until reordering * is suspected on the path to this destination. * - * NewReno: when Recovery is entered, we assume that one segment + * If the receiver does not support SACK: + * + * NewReno (RFC6582): in Recovery we assume that one segment * is lost (classic Reno). While we are in Recovery and * a partial ACK arrives, we assume that one more packet * is lost (NewReno). This heuristics are the same in NewReno * and SACK. * - * Imagine, that's all! Forget about all this shamanism about CWND inflation - * deflation etc. CWND is real congestion window, never inflated, changes - * only according to classic VJ rules. - * * Really tricky (and requiring careful tuning) part of algorithm * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue(). * The first determines the moment _when_ we should reduce CWND and, @@ -2807,7 +2820,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag, struct tcp_sock *tp = tcp_sk(sk); /* Use RACK to detect loss */ - if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS) { + if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { u32 prior_retrans = tp->retrans_out; tcp_rack_mark_lost(sk, ack_time); diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 1e330a2f913d..4ecb38ae8504 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -1,7 +1,7 @@ #include <linux/tcp.h> #include <net/tcp.h> -int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS; +int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION; static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb) { @@ -24,7 +24,9 @@ static bool tcp_rack_sent_after(const struct skb_mstamp *t1, (t1->v64 == t2->v64 && after(seq1, seq2)); } -/* Marks a packet lost, if some packet sent later has been (s)acked. +/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01): + * + * Marks a packet lost, if some packet sent later has been (s)acked. * The underlying idea is similar to the traditional dupthresh and FACK * but they look at different metrics: * @@ -37,8 +39,10 @@ static bool tcp_rack_sent_after(const struct skb_mstamp *t1, * is being more resilient to reordering by simply allowing some * "settling delay", instead of tweaking the dupthresh. * - * The current version is only used after recovery starts but can be - * easily extended to detect the first loss. + * When tcp_rack_detect_loss() detects some packets are lost and we + * are not already in the CA_Recovery state, either tcp_rack_reo_timeout() + * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will + * make us enter the CA_Recovery state. */ static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now, u32 *reo_timeout) @@ -54,7 +58,7 @@ static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now, * to queuing or delayed ACKs. */ reo_wnd = 1000; - if (tp->rack.reord && tcp_min_rtt(tp) != ~0U) + if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U) reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); tcp_for_write_queue(skb, sk) { @@ -105,7 +109,7 @@ void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now) struct tcp_sock *tp = tcp_sk(sk); u32 timeout; - if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced) + if (!tp->rack.advanced) return; /* Reset the advanced flag to avoid unnecessary queue scanning */ |