diff options
-rw-r--r-- | include/linux/tcp.h | 2 | ||||
-rw-r--r-- | include/net/tcp.h | 19 | ||||
-rw-r--r-- | include/uapi/linux/tcp.h | 3 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 24 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 10 | ||||
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 23 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 1 |
8 files changed, 76 insertions, 8 deletions
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 711361af9ce0..c23019a3b264 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -245,6 +245,7 @@ struct tcp_sock { syn_smc:1; /* SYN includes SMC */ u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */ + u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ u64 tcp_wstamp_ns; /* departure time for next sent data packet */ u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ @@ -436,6 +437,7 @@ struct tcp_timewait_sock { u32 tw_last_oow_ack_time; int tw_ts_recent_stamp; + u32 tw_tx_delay; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *tw_md5_key; #endif diff --git a/include/net/tcp.h b/include/net/tcp.h index 204328b88412..49a178b8d5b2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2232,4 +2232,23 @@ void clean_acked_data_disable(struct inet_connection_sock *icsk); void clean_acked_data_flush(void); #endif +DECLARE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); +static inline void tcp_add_tx_delay(struct sk_buff *skb, + const struct tcp_sock *tp) +{ + if (static_branch_unlikely(&tcp_tx_delay_enabled)) + skb->skb_mstamp_ns += (u64)tp->tcp_tx_delay * NSEC_PER_USEC; +} + +static inline void tcp_set_tx_time(struct sk_buff *skb, + const struct sock *sk) +{ + if (static_branch_unlikely(&tcp_tx_delay_enabled)) { + u32 delay = (sk->sk_state == TCP_TIME_WAIT) ? + tcp_twsk(sk)->tw_tx_delay : tcp_sk(sk)->tcp_tx_delay; + + skb->skb_mstamp_ns = tcp_clock_ns() + (u64)delay * NSEC_PER_USEC; + } +} + #endif /* _TCP_H */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index b521464ea962..b3564f85a762 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -127,6 +127,9 @@ enum { #define TCP_CM_INQ TCP_INQ +#define TCP_TX_DELAY 37 /* delay outgoing packets by XX usec */ + + #define TCP_REPAIR_ON 1 #define TCP_REPAIR_OFF 0 #define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bd0856ac680a..5542e3d778e6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2736,6 +2736,21 @@ static int tcp_repair_options_est(struct sock *sk, return 0; } +DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); +EXPORT_SYMBOL(tcp_tx_delay_enabled); + +static void tcp_enable_tx_delay(void) +{ + if (!static_branch_unlikely(&tcp_tx_delay_enabled)) { + static int __tcp_tx_delay_enabled = 0; + + if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) { + static_branch_enable(&tcp_tx_delay_enabled); + pr_info("TCP_TX_DELAY enabled\n"); + } + } +} + /* * Socket option code for TCP. */ @@ -3087,6 +3102,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level, else tp->recvmsg_inq = val; break; + case TCP_TX_DELAY: + if (val) + tcp_enable_tx_delay(); + tp->tcp_tx_delay = val; + break; default: err = -ENOPROTOOPT; break; @@ -3546,6 +3566,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = tp->fastopen_no_cookie; break; + case TCP_TX_DELAY: + val = tp->tcp_tx_delay; + break; + case TCP_TIMESTAMP: val = tcp_time_stamp_raw() + tp->tsoffset; break; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f059fbd81a84..1b7e9e1fbd3b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -767,9 +767,11 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); local_bh_disable(); ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); - if (sk) + if (sk) { ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_mark : sk->sk_mark; + tcp_set_tx_time(skb, sk); + } ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, @@ -859,9 +861,9 @@ static void tcp_v4_send_ack(const struct sock *sk, arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); local_bh_disable(); ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); - if (sk) - ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? - inet_twsk(sk)->tw_mark : sk->sk_mark; + ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_mark : sk->sk_mark; + tcp_set_tx_time(skb, sk); ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 11011e8386dc..8bcaf2586b68 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -274,7 +274,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; tcptw->tw_ts_offset = tp->tsoffset; tcptw->tw_last_oow_ack_time = 0; - + tcptw->tw_tx_delay = tp->tcp_tx_delay; #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f429e856e263..d954ff9069e8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1153,6 +1153,8 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), sizeof(struct inet6_skb_parm))); + tcp_add_tx_delay(skb, tp); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); if (unlikely(err > 0)) { @@ -2234,6 +2236,18 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); limit <<= factor; + if (static_branch_unlikely(&tcp_tx_delay_enabled) && + tcp_sk(sk)->tcp_tx_delay) { + u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay; + + /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we + * approximate our needs assuming an ~100% skb->truesize overhead. + * USEC_PER_SEC is approximated by 2^20. + * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift. + */ + extra_bytes >>= (20 - 1); + limit += extra_bytes; + } if (refcount_read(&sk->sk_wmem_alloc) > limit) { /* Always send skb if rtx queue is empty. * No need to wait for TX completion to call us back, @@ -3212,6 +3226,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, int tcp_header_size; struct tcphdr *th; int mss; + u64 now; skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (unlikely(!skb)) { @@ -3243,13 +3258,14 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); memset(&opts, 0, sizeof(opts)); + now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) skb->skb_mstamp_ns = cookie_init_timestamp(req); else #endif { - skb->skb_mstamp_ns = tcp_clock_ns(); + skb->skb_mstamp_ns = now; if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); } @@ -3292,8 +3308,9 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, rcu_read_unlock(); #endif - /* Do not fool tcpdump (if any), clean our debris */ - skb->tstamp = 0; + skb->skb_mstamp_ns = now; + tcp_add_tx_delay(skb, tp); + return skb; } EXPORT_SYMBOL(tcp_make_synack); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ad7039137a20..5606b2131b65 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -892,6 +892,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 } else { mark = sk->sk_mark; } + tcp_set_tx_time(buff, sk); } fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark; fl6.fl6_dport = t1->dest; |