summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2017-08-03 15:38:31 -0700
committerDavid S. Miller <davem@davemloft.net>2017-08-03 15:38:31 -0700
commit337f1b07db97b75bfd91eb333ff3ec3b82c5b1e6 (patch)
tree423c7984d203560f17751f0ba8ba9f0a8ff2ae4e
parentb91d532928dff2141ea9c107c3e73104d9843767 (diff)
parentdf92c8394e6ea0469e8056946ef8add740ab8046 (diff)
downloadlinux-337f1b07db97b75bfd91eb333ff3ec3b82c5b1e6.tar.bz2
Merge branch 'tcp-xmit-timer-rearming'
Neal Cardwell says: ==================== tcp: fix xmit timer rearming to avoid stalls This patch series is a bug fix for a TCP loss recovery performance bug reported independently in recent netdev threads: (i) July 26, 2017: netdev thread "TCP fast retransmit issues" (ii) July 26, 2017: netdev thread: "[PATCH V2 net-next] TLP: Don't reschedule PTO when there's one outstanding TLP retransmission" Many thanks to Klavs Klavsen and Mao Wenan for the detailed reports, traces, and packetdrill test cases, which enabled us to root-cause this issue and verify the fix. - v1 -> v2: - In patch 2/3, changed an unclear comment in the pre-existing code in tcp_schedule_loss_probe() to be more clear (thanks to Eric Dumazet for suggesting we improve this). ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/net/tcp.h10
-rw-r--r--net/ipv4/tcp_input.c30
-rw-r--r--net/ipv4/tcp_output.c23
3 files changed, 32 insertions, 31 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 70483296157f..ada65e767b28 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1916,6 +1916,16 @@ extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
u64 xmit_time);
extern void tcp_rack_reo_timeout(struct sock *sk);
+/* At how many usecs into the future should the RTO fire? */
+static inline s64 tcp_rto_delta_us(const struct sock *sk)
+{
+ const struct sk_buff *skb = tcp_write_queue_head(sk);
+ u32 rto = inet_csk(sk)->icsk_rto;
+ u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto);
+
+ return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp;
+}
+
/*
* Save and compile IPv4 options, return a pointer to it
*/
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index dad026fcfd09..53de1424c13c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -107,6 +107,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
+#define FLAG_SET_XMIT_TIMER 0x1000 /* Set TLP or RTO timer */
#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */
@@ -3004,10 +3005,7 @@ void tcp_rearm_rto(struct sock *sk)
/* Offset the time elapsed after installing regular RTO */
if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
- struct sk_buff *skb = tcp_write_queue_head(sk);
- u64 rto_time_stamp = skb->skb_mstamp +
- jiffies_to_usecs(rto);
- s64 delta_us = rto_time_stamp - tp->tcp_mstamp;
+ s64 delta_us = tcp_rto_delta_us(sk);
/* delta_us may not be positive if the socket is locked
* when the retrans timer fires and is rescheduled.
*/
@@ -3019,6 +3017,13 @@ void tcp_rearm_rto(struct sock *sk)
}
}
+/* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */
+static void tcp_set_xmit_timer(struct sock *sk)
+{
+ if (!tcp_schedule_loss_probe(sk))
+ tcp_rearm_rto(sk);
+}
+
/* If we get here, the whole TSO packet has not been acked. */
static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
{
@@ -3180,7 +3185,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
ca_rtt_us, sack->rate);
if (flag & FLAG_ACKED) {
- tcp_rearm_rto(sk);
+ flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */
if (unlikely(icsk->icsk_mtup.probe_size &&
!after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
tcp_mtup_probe_success(sk);
@@ -3208,7 +3213,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
* after when the head was last (re)transmitted. Otherwise the
* timeout may continue to extend in loss recovery.
*/
- tcp_rearm_rto(sk);
+ flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */
}
if (icsk->icsk_ca_ops->pkts_acked) {
@@ -3580,9 +3585,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, tp->snd_nxt))
goto invalid_ack;
- if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
- tcp_rearm_rto(sk);
-
if (after(ack, prior_snd_una)) {
flag |= FLAG_SND_UNA_ADVANCED;
icsk->icsk_retransmits = 0;
@@ -3647,18 +3649,20 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
&sack_state);
+ if (tp->tlp_high_seq)
+ tcp_process_tlp_ack(sk, ack, flag);
+ /* If needed, reset TLP/RTO timer; RACK may later override this. */
+ if (flag & FLAG_SET_XMIT_TIMER)
+ tcp_set_xmit_timer(sk);
+
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
}
- if (tp->tlp_high_seq)
- tcp_process_tlp_ack(sk, ack, flag);
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
sk_dst_confirm(sk);
- if (icsk->icsk_pending == ICSK_TIME_RETRANS)
- tcp_schedule_loss_probe(sk);
delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
lost = tp->lost - lost; /* freshly marked lost */
tcp_rate_gen(sk, delivered, lost, sack_state.rate);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 2f1588bf73da..276406a83a37 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2377,24 +2377,15 @@ bool tcp_schedule_loss_probe(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- u32 timeout, tlp_time_stamp, rto_time_stamp;
u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
+ u32 timeout, rto_delta_us;
- /* No consecutive loss probes. */
- if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
- tcp_rearm_rto(sk);
- return false;
- }
/* Don't do any loss probe on a Fast Open connection before 3WHS
* finishes.
*/
if (tp->fastopen_rsk)
return false;
- /* TLP is only scheduled when next timer event is RTO. */
- if (icsk->icsk_pending != ICSK_TIME_RETRANS)
- return false;
-
/* Schedule a loss probe in 2*RTT for SACK capable connections
* in Open state, that are either limited by cwnd or application.
*/
@@ -2417,14 +2408,10 @@ bool tcp_schedule_loss_probe(struct sock *sk)
(rtt + (rtt >> 1) + TCP_DELACK_MAX));
timeout = max_t(u32, timeout, msecs_to_jiffies(10));
- /* If RTO is shorter, just schedule TLP in its place. */
- tlp_time_stamp = tcp_jiffies32 + timeout;
- rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout;
- if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) {
- s32 delta = rto_time_stamp - tcp_jiffies32;
- if (delta > 0)
- timeout = delta;
- }
+ /* If the RTO formula yields an earlier time, then use that time. */
+ rto_delta_us = tcp_rto_delta_us(sk); /* How far in future is RTO? */
+ if (rto_delta_us > 0)
+ timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
TCP_RTO_MAX);