From 75c119afe14f74b4dd967d75ed9f57ab6c0ef045 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 5 Oct 2017 22:21:27 -0700
Subject: tcp: implement rb-tree based retransmit queue

Using a linear list to store all skbs in the write queue has been okay
for quite a while: O(N) is not too bad when N < 500.

Things get messy when N is on the order of 100,000: modern TCP stacks
want 10Gbit+ of throughput even on 200 ms RTT flows.

At 40 ns per cache line miss, a full scan can take 4 ms, blowing away
CPU caches.

SACK processing can often use various hints to avoid parsing the whole
retransmit queue. But with high packet loss and/or heavy reordering,
the hints no longer work.

The sender then has to process thousands of unfriendly SACKs,
accumulating a huge socket backlog, burning a cpu and massively
dropping packets.

Using an rb-tree for the retransmit queue has been avoided for years
because it added complexity and overhead, but now is the time to be
more resistant and say no to quadratic behavior.

1) The RTX queue is no longer part of the write queue: already sent
   skbs are stored in one rb-tree.

2) Since reaching the head of the write queue no longer needs
   sk->sk_send_head, we added a union of sk_send_head and tcp_rtx_queue.

Tested:

 On receiver:
 netem on ingress: delay 150ms 200us loss 1
 GRO disabled to force stress and SACK storms.

for f in `seq 1 10`
do
 ./netperf -H lpaa6 -l30 -- -K bbr -o THROUGHPUT|tail -1
done | awk '{print $0} {sum += $0} END {printf "%7u\n",sum}'

Before patch:

323.87
351.48
339.59
338.62
306.72
204.07
304.93
291.88
202.47
176.88
   2840

After patch:

1700.83
2207.98
2070.17
1544.26
2114.76
2124.89
1693.14
1080.91
2216.82
1299.94
  18053

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_timer.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

(limited to 'net/ipv4/tcp_timer.c')

diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 655dd8d7f064..7014cc00c74c 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -156,8 +156,13 @@ static bool retransmits_timed_out(struct sock *sk,
                 return false;
 
         start_ts = tcp_sk(sk)->retrans_stamp;
-        if (unlikely(!start_ts))
-                start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk));
+        if (unlikely(!start_ts)) {
+                struct sk_buff *head = tcp_rtx_queue_head(sk);
+
+                if (!head)
+                        return false;
+                start_ts = tcp_skb_timestamp(head);
+        }
 
         if (likely(timeout == 0)) {
                 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
@@ -304,11 +309,12 @@ static void tcp_delack_timer(unsigned long data)
 static void tcp_probe_timer(struct sock *sk)
 {
         struct inet_connection_sock *icsk = inet_csk(sk);
+        struct sk_buff *skb = tcp_send_head(sk);
         struct tcp_sock *tp = tcp_sk(sk);
         int max_probes;
         u32 start_ts;
 
-        if (tp->packets_out || !tcp_send_head(sk)) {
+        if (tp->packets_out || !skb) {
                 icsk->icsk_probes_out = 0;
                 return;
         }
@@ -321,9 +327,9 @@ static void tcp_probe_timer(struct sock *sk)
          * corresponding system limit. We also implement similar policy when
          * we use RTO to probe window in tcp_retransmit_timer().
          */
-        start_ts = tcp_skb_timestamp(tcp_send_head(sk));
+        start_ts = tcp_skb_timestamp(skb);
         if (!start_ts)
-                tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp;
+                skb->skb_mstamp = tp->tcp_mstamp;
         else if (icsk->icsk_user_timeout &&
                  (s32)(tcp_time_stamp(tp) - start_ts) >
                  jiffies_to_msecs(icsk->icsk_user_timeout))
@@ -408,7 +414,7 @@ void tcp_retransmit_timer(struct sock *sk)
         if (!tp->packets_out)
                 goto out;
 
-        WARN_ON(tcp_write_queue_empty(sk));
+        WARN_ON(tcp_rtx_queue_empty(sk));
 
         tp->tlp_high_seq = 0;
 
@@ -441,7 +447,7 @@ void tcp_retransmit_timer(struct sock *sk)
                         goto out;
                 }
                 tcp_enter_loss(sk);
-                tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1);
+                tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1);
                 __sk_dst_reset(sk);
                 goto out_reset_timer;
         }
@@ -473,7 +479,7 @@ void tcp_retransmit_timer(struct sock *sk)
 
         tcp_enter_loss(sk);
 
-        if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) {
+        if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
                 /* Retransmission failed because of local congestion,
                  * do not backoff.
                  */
@@ -647,7 +653,7 @@ static void tcp_keepalive_timer (unsigned long data)
                 elapsed = keepalive_time_when(tp);
 
                 /* It is alive without keepalive 8) */
-                if (tp->packets_out || tcp_send_head(sk))
+                if (tp->packets_out || !tcp_write_queue_empty(sk))
                         goto resched;
 
                 elapsed = keepalive_time_elapsed(tp);
-- 
cgit v1.2.3
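
The gain here comes from replacing a linear walk of sk_write_queue with an
O(log N) descent of an rb-tree keyed by sequence number. The sketch below is
illustrative only: rtx_rb_lookup() is a hypothetical helper, not code from
this patch, but it shows the lookup pattern such a tree enables, using the
in-kernel rbtree API and the existing before() sequence comparison.

/* Hypothetical sketch: find the skb in an rb-tree retransmit queue that
 * covers sequence number 'seq'.  With a list this is an O(N) scan; with
 * an rb-tree keyed on TCP_SKB_CB(skb)->seq it is O(log N).
 */
#include <linux/rbtree.h>
#include <net/tcp.h>

static struct sk_buff *rtx_rb_lookup(struct rb_root *root, u32 seq)
{
        struct rb_node *node = root->rb_node;

        while (node) {
                struct sk_buff *skb = rb_entry(node, struct sk_buff, rbnode);

                if (before(seq, TCP_SKB_CB(skb)->seq))
                        node = node->rb_left;   /* seq is before this skb */
                else if (!before(seq, TCP_SKB_CB(skb)->end_seq))
                        node = node->rb_right;  /* seq is past this skb */
                else
                        return skb;             /* seq falls inside this skb */
        }
        return NULL;
}

With such a lookup, SACK processing can jump straight to the first skb a SACK
block touches instead of rescanning from the head of the queue when the hints
fail.
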
From 59f379f9046a9e0532ffd19b44e3c32fe79ec51b Mon Sep 17 00:00:00 2001
From: Kees Cook
Date: Mon, 16 Oct 2017 17:29:19 -0700
Subject: inet/connection_sock: Convert timers to use timer_setup()

In preparation for unconditionally passing the struct timer_list pointer
to all timer callbacks, switch to using the new timer_setup() and
from_timer() to pass the timer pointer explicitly.

Cc: "David S. Miller"
Cc: Gerrit Renker
Cc: Alexey Kuznetsov
Cc: Hideaki YOSHIFUJI
Cc: netdev@vger.kernel.org
Cc: dccp@vger.kernel.org
Signed-off-by: Kees Cook
Signed-off-by: David S. Miller
---
 include/net/inet_connection_sock.h |  6 +++---
 net/dccp/timer.c                   | 18 ++++++++++--------
 net/ipv4/inet_connection_sock.c    | 21 +++++++++------------
 net/ipv4/tcp_timer.c               | 18 +++++++++++-------
 4 files changed, 33 insertions(+), 30 deletions(-)

(limited to 'net/ipv4/tcp_timer.c')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 13e4c89a8231..0358745ea059 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -169,9 +169,9 @@ enum inet_csk_ack_state_t {
 };
 
 void inet_csk_init_xmit_timers(struct sock *sk,
-                               void (*retransmit_handler)(unsigned long),
-                               void (*delack_handler)(unsigned long),
-                               void (*keepalive_handler)(unsigned long));
+                               void (*retransmit_handler)(struct timer_list *),
+                               void (*delack_handler)(struct timer_list *),
+                               void (*keepalive_handler)(struct timer_list *));
 void inet_csk_clear_xmit_timers(struct sock *sk);
 
 static inline void inet_csk_schedule_ack(struct sock *sk)
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index 3a2c34027758..1e35526bf436 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -125,10 +125,11 @@ static void dccp_retransmit_timer(struct sock *sk)
         __sk_dst_reset(sk);
 }
 
-static void dccp_write_timer(unsigned long data)
+static void dccp_write_timer(struct timer_list *t)
 {
-        struct sock *sk = (struct sock *)data;
-        struct inet_connection_sock *icsk = inet_csk(sk);
+        struct inet_connection_sock *icsk =
+                        from_timer(icsk, t, icsk_retransmit_timer);
+        struct sock *sk = &icsk->icsk_inet.sk;
         int event = 0;
 
         bh_lock_sock(sk);
@@ -161,19 +162,20 @@ out:
         sock_put(sk);
 }
 
-static void dccp_keepalive_timer(unsigned long data)
+static void dccp_keepalive_timer(struct timer_list *t)
 {
-        struct sock *sk = (struct sock *)data;
+        struct sock *sk = from_timer(sk, t, sk_timer);
 
         pr_err("dccp should not use a keepalive timer !\n");
         sock_put(sk);
 }
 
 /* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */
-static void dccp_delack_timer(unsigned long data)
+static void dccp_delack_timer(struct timer_list *t)
 {
-        struct sock *sk = (struct sock *)data;
-        struct inet_connection_sock *icsk = inet_csk(sk);
+        struct inet_connection_sock *icsk =
+                        from_timer(icsk, t, icsk_delack_timer);
+        struct sock *sk = &icsk->icsk_inet.sk;
 
         bh_lock_sock(sk);
         if (sock_owned_by_user(sk)) {
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 8a91ebbf0c01..5c965ecc96a0 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -494,17 +494,15 @@ EXPORT_SYMBOL(inet_csk_accept);
  * to optimize.
  */
 void inet_csk_init_xmit_timers(struct sock *sk,
-                               void (*retransmit_handler)(unsigned long),
-                               void (*delack_handler)(unsigned long),
-                               void (*keepalive_handler)(unsigned long))
+                               void (*retransmit_handler)(struct timer_list *t),
+                               void (*delack_handler)(struct timer_list *t),
+                               void (*keepalive_handler)(struct timer_list *t))
 {
         struct inet_connection_sock *icsk = inet_csk(sk);
 
-        setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler,
-                        (unsigned long)sk);
-        setup_timer(&icsk->icsk_delack_timer, delack_handler,
-                        (unsigned long)sk);
-        setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk);
+        timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0);
+        timer_setup(&icsk->icsk_delack_timer, delack_handler, 0);
+        timer_setup(&sk->sk_timer, keepalive_handler, 0);
         icsk->icsk_pending = icsk->icsk_ack.pending = 0;
 }
 EXPORT_SYMBOL(inet_csk_init_xmit_timers);
@@ -676,9 +674,9 @@ void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req)
 }
 EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put);
 
-static void reqsk_timer_handler(unsigned long data)
+static void reqsk_timer_handler(struct timer_list *t)
 {
-        struct request_sock *req = (struct request_sock *)data;
+        struct request_sock *req = from_timer(req, t, rsk_timer);
         struct sock *sk_listener = req->rsk_listener;
         struct net *net = sock_net(sk_listener);
         struct inet_connection_sock *icsk = inet_csk(sk_listener);
@@ -749,8 +747,7 @@ static void reqsk_queue_hash_req(struct request_sock *req,
         req->num_timeout = 0;
         req->sk = NULL;
 
-        setup_pinned_timer(&req->rsk_timer, reqsk_timer_handler,
-                           (unsigned long)req);
+        timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
         mod_timer(&req->rsk_timer, jiffies + timeout);
 
         inet_ehash_insert(req_to_sk(req), NULL);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 7014cc00c74c..804a8d34ce86 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -288,15 +288,17 @@ out:
  *
  * Returns: Nothing (void)
  */
-static void tcp_delack_timer(unsigned long data)
+static void tcp_delack_timer(struct timer_list *t)
 {
-        struct sock *sk = (struct sock *)data;
+        struct inet_connection_sock *icsk =
+                        from_timer(icsk, t, icsk_delack_timer);
+        struct sock *sk = &icsk->icsk_inet.sk;
 
         bh_lock_sock(sk);
         if (!sock_owned_by_user(sk)) {
                 tcp_delack_timer_handler(sk);
         } else {
-                inet_csk(sk)->icsk_ack.blocked = 1;
+                icsk->icsk_ack.blocked = 1;
                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
                 /* deleguate our work to tcp_release_cb() */
                 if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
@@ -576,9 +578,11 @@ out:
         sk_mem_reclaim(sk);
 }
 
-static void tcp_write_timer(unsigned long data)
+static void tcp_write_timer(struct timer_list *t)
 {
-        struct sock *sk = (struct sock *)data;
+        struct inet_connection_sock *icsk =
+                        from_timer(icsk, t, icsk_retransmit_timer);
+        struct sock *sk = &icsk->icsk_inet.sk;
 
         bh_lock_sock(sk);
         if (!sock_owned_by_user(sk)) {
@@ -613,9 +617,9 @@ void tcp_set_keepalive(struct sock *sk, int val)
 }
 EXPORT_SYMBOL_GPL(tcp_set_keepalive);
 
-static void tcp_keepalive_timer (unsigned long data)
+static void tcp_keepalive_timer (struct timer_list *t)
 {
-        struct sock *sk = (struct sock *) data;
+        struct sock *sk = from_timer(sk, t, sk_timer);
         struct inet_connection_sock *icsk = inet_csk(sk);
         struct tcp_sock *tp = tcp_sk(sk);
         u32 elapsed;
-- 
cgit v1.2.3
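
For readers unfamiliar with the new API: timer_setup() registers only the
timer itself, and the callback recovers its containing object with
from_timer(), which is a container_of() wrapper, instead of abusing an
unsigned long cookie. A minimal, self-contained sketch of the pattern using a
made-up my_conn structure rather than inet_connection_sock:

#include <linux/timer.h>
#include <linux/jiffies.h>

struct my_conn {
        struct timer_list retrans_timer;
        int retries;
};

static void my_retrans_timer(struct timer_list *t)
{
        /* from_timer() maps the timer_list pointer back to the object
         * that embeds it.
         */
        struct my_conn *conn = from_timer(conn, t, retrans_timer);

        conn->retries++;
}

static void my_conn_init(struct my_conn *conn)
{
        /* Old style: setup_timer(&conn->retrans_timer, cb, (unsigned long)conn); */
        timer_setup(&conn->retrans_timer, my_retrans_timer, 0);
        mod_timer(&conn->retrans_timer, jiffies + HZ);
}
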
From 2c04ac8ae0b61e0780a30b7069a11bb202b21f26 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 26 Oct 2017 21:54:58 -0700
Subject: tcp: Namespace-ify sysctl_tcp_thin_linear_timeouts

Note that sysctl_tcp_thin_dupack was not used, so I deleted it.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/netns/ipv4.h   |  1 +
 include/net/tcp.h          |  2 --
 net/ipv4/sysctl_net_ipv4.c | 14 +++++++-------
 net/ipv4/tcp_timer.c       |  4 +---
 4 files changed, 9 insertions(+), 12 deletions(-)

(limited to 'net/ipv4/tcp_timer.c')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index d6ed718075d4..2a9f37b39c45 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -130,6 +130,7 @@ struct netns_ipv4 {
         int sysctl_tcp_timestamps;
         int sysctl_tcp_early_retrans;
         int sysctl_tcp_recovery;
+        int sysctl_tcp_thin_linear_timeouts;
         struct inet_timewait_death_row tcp_death_row;
         int sysctl_max_syn_backlog;
         int sysctl_tcp_fastopen;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index c7f51534fc44..063a7a48b7fe 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -263,8 +263,6 @@ extern int sysctl_tcp_moderate_rcvbuf;
 extern int sysctl_tcp_tso_win_divisor;
 extern int sysctl_tcp_workaround_signed_windows;
 extern int sysctl_tcp_slow_start_after_idle;
-extern int sysctl_tcp_thin_linear_timeouts;
-extern int sysctl_tcp_thin_dupack;
 
 #define TCP_RACK_LOSS_DETECTION  0x1 /* Use RACK to detect losses */
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 78019adcae87..12003214f4d8 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -620,13 +620,6 @@ static struct ctl_table ipv4_table[] = {
                 .mode           = 0644,
                 .proc_handler   = proc_allowed_congestion_control,
         },
-        {
-                .procname       = "tcp_thin_linear_timeouts",
-                .data           = &sysctl_tcp_thin_linear_timeouts,
-                .maxlen         = sizeof(int),
-                .mode           = 0644,
-                .proc_handler   = proc_dointvec
-        },
         {
                 .procname       = "tcp_min_tso_segs",
                 .data           = &sysctl_tcp_min_tso_segs,
@@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = {
                 .mode           = 0644,
                 .proc_handler   = proc_dointvec,
         },
+        {
+                .procname       = "tcp_thin_linear_timeouts",
+                .data           = &init_net.ipv4.sysctl_tcp_thin_linear_timeouts,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = proc_dointvec
+        },
         { }
 };
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 804a8d34ce86..035a1ef1f2d8 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,8 +22,6 @@
 #include
 #include
 
-int sysctl_tcp_thin_linear_timeouts __read_mostly;
-
 /**
  * tcp_write_err() - close socket and save error info
  * @sk:  The socket the error has appeared on.
@@ -522,7 +520,7 @@ out_reset_timer:
          * linear-timeout retransmissions into a black hole
          */
         if (sk->sk_state == TCP_ESTABLISHED &&
-            (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
+            (tp->thin_lto || net->ipv4.sysctl_tcp_thin_linear_timeouts) &&
             tcp_stream_is_thin(tp) &&
             icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
                 icsk->icsk_backoff = 0;
-- 
cgit v1.2.3
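
The conversion pattern, for reference: the knob moves from a global integer
into struct netns_ipv4, and readers dereference it through the socket's own
namespace. A minimal sketch, assuming an illustrative wrapper function that is
not part of the patch:

#include <net/sock.h>
#include <net/netns/ipv4.h>

/* Illustrative helper: each network namespace now carries its own copy of
 * the knob, so containers can enable thin-stream linear timeouts
 * independently of the host.
 */
static bool tcp_thin_linear_timeouts_enabled(const struct sock *sk)
{
        return sock_net(sk)->ipv4.sysctl_tcp_thin_linear_timeouts != 0;
}
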
From d0f36847016276920d860d5c089934ff3fea7e30 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 3 Nov 2017 06:09:17 -0700
Subject: tcp: tcp_mtu_probing() cleanup

Reduce one indentation level to make the code more readable.
The tcp_sync_mss() call can be factored out.

Signed-off-by: Eric Dumazet
Acked-by: Neal Cardwell
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_timer.c | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

(limited to 'net/ipv4/tcp_timer.c')

diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 035a1ef1f2d8..16df6dd44b98 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -107,26 +107,23 @@ static int tcp_orphan_retries(struct sock *sk, bool alive)
 
 static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
 {
-        struct net *net = sock_net(sk);
+        const struct net *net = sock_net(sk);
+        int mss;
 
         /* Black hole detection */
-        if (net->ipv4.sysctl_tcp_mtu_probing) {
-                if (!icsk->icsk_mtup.enabled) {
-                        icsk->icsk_mtup.enabled = 1;
-                        icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
-                        tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
-                } else {
-                        struct net *net = sock_net(sk);
-                        struct tcp_sock *tp = tcp_sk(sk);
-                        int mss;
-
-                        mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
-                        mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
-                        mss = max(mss, 68 - tp->tcp_header_len);
-                        icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
-                        tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
-                }
+        if (!net->ipv4.sysctl_tcp_mtu_probing)
+                return;
+
+        if (!icsk->icsk_mtup.enabled) {
+                icsk->icsk_mtup.enabled = 1;
+                icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
+        } else {
+                mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
+                mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
+                mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len);
+                icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
         }
+        tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 }
-- 
cgit v1.2.3
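
To make the shrink step in the else branch concrete, here is a small worked
example as standalone userspace C with illustrative numbers (these values are
assumptions, not taken from the patch): a search_low whose MSS is 1460 bytes,
40 bytes of TCP/IP headers, and tcp_base_mss = 1024.

#include <stdio.h>

static int min_int(int a, int b) { return a < b ? a : b; }
static int max_int(int a, int b) { return a > b ? a : b; }

int main(void)
{
        int mss_of_search_low = 1460; /* roughly tcp_mtu_to_mss() of a 1500-byte MTU */
        int base_mss = 1024;          /* net.ipv4.tcp_base_mss */
        int header_len = 40;          /* tp->tcp_header_len */
        int mss;

        mss = mss_of_search_low >> 1;        /* halve on repeated RTOs: 730 */
        mss = min_int(base_mss, mss);        /* never above tcp_base_mss: 730 */
        mss = max_int(mss, 68 - header_len); /* never below the 68-byte MTU floor: 730 */
        printf("new search_low corresponds to an MSS of %d\n", mss);
        return 0;
}
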