author		Linus Torvalds <torvalds@linux-foundation.org>	2017-07-05 12:31:59 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-07-05 12:31:59 -0700
commit		5518b69b76680a4f2df96b1deca260059db0c2de (patch)
tree		f33cd1519c8efb4590500f2f9617400be233238c /net/ipv4/tcp_output.c
parent		8ad06e56dcbc1984ef0ff8f6e3c19982c5809f73 (diff)
parent		0e72582270c07850b92cac351c8b97d4f9c123b9 (diff)
download	linux-5518b69b76680a4f2df96b1deca260059db0c2de.tar.bz2
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
"Reasonably busy this cycle, but perhaps not as busy as in the 4.12
merge window:
1) Several optimizations for UDP processing under high load, from
Paolo Abeni.
2) Support pacing internally in TCP, for cases where using the sch_fq
packet scheduler for this purpose is not practical. From Eric Dumazet.
3) Support multiple filter chains per qdisc, from Jiri Pirko.
4) Move to 1ms TCP timestamp clock, from Eric Dumazet.
5) Add batch dequeueing to vhost_net, from Jason Wang.
6) Flesh out SCTP checksum offload support more completely, from
Davide Caratti.
7) More plumbing of extended netlink ACKs, from David Ahern, Pablo
Neira Ayuso, and Matthias Schiffer.
8) Add devlink support to nfp driver, from Simon Horman.
9) Add RTM_F_FIB_MATCH flag to RTM_GETROUTE queries, from Roopa
Prabhu.
10) Add stack depth tracking to BPF verifier and use this information
in the various eBPF JITs. From Alexei Starovoitov.
11) Support XDP on qed device VFs, from Yuval Mintz.
12) Introduce BPF PROG ID for better introspection of installed BPF
programs. From Martin KaFai Lau.
13) Add bpf_set_hash helper for TC bpf programs, from Daniel Borkmann.
14) For loads, allow narrower accesses in bpf verifier checking, from
Yonghong Song.
15) Support MIPS in the BPF selftests and samples infrastructure; the
MIPS eBPF JIT will be merged in via the MIPS GIT tree. From David
Daney.
16) Support kernel based TLS, from Dave Watson and others.
17) Completely remove DST garbage collection, from Wei Wang.
18) Allow installing TCP MD5 rules using prefixes, from Ivan
Delalande.
19) Add XDP support to Intel i40e driver, from Björn Töpel.
20) Add support for TC flower offload in nfp driver, from Simon
Horman, Pieter Jansen van Vuuren, Benjamin LaHaise, Jakub
Kicinski, and Bert van Leeuwen.
21) IPsec offloading support in mlx5, from Ilan Tayari.
22) Add HW PTP support to macb driver, from Rafal Ozieblo.
23) Networking refcount_t conversions, from Elena Reshetova.
24) Add sock_ops support to BPF, from Lawrence Brakmo. This is useful
for tuning the TCP sockopt settings of a group of applications,
currently via cgroups"
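
The sock_ops hooks from item 24 are what the tcp_output.c diff further down consults through tcp_timeout_init(), tcp_rwnd_init_bpf() and tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB). As a rough, hedged sketch only (not part of this merge; modelled on the samples/bpf programs in the series, with illustrative names and values such as bpf_tcp_tuner, 40 and 10), a cgroup-attached sock_ops program answering two of those callbacks could look like:

/* Hedged sketch: replies to the RWND_INIT and TIMEOUT_INIT callbacks that
 * tcp_rwnd_init_bpf() and tcp_timeout_init() issue. Program name and the
 * numeric values are illustrative, not taken from this merge.
 */
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"	/* SEC() macro, as used in samples/bpf */

SEC("sockops")
int bpf_tcp_tuner(struct bpf_sock_ops *skops)
{
	int rv = -1;		/* -1: no opinion, keep the kernel default */

	switch (skops->op) {
	case BPF_SOCK_OPS_RWND_INIT:
		rv = 40;	/* initial advertised window, in MSS units */
		break;
	case BPF_SOCK_OPS_TIMEOUT_INIT:
		rv = 10;	/* initial RTO, in jiffies */
		break;
	}
	skops->reply = rv;
	return 1;
}

char _license[] SEC("license") = "GPL";

Such a program is loaded as BPF_PROG_TYPE_SOCK_OPS and attached to a cgroup-v2 descriptor with attach type BPF_CGROUP_SOCK_OPS, so only sockets created inside that cgroup see the tuned initial RTO and receive window.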
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1899 commits)
net: phy: dp83867: add workaround for incorrect RX_CTRL pin strap
dt-bindings: phy: dp83867: provide a workaround for incorrect RX_CTRL pin strap
cxgb4: Support for get_ts_info ethtool method
cxgb4: Add PTP Hardware Clock (PHC) support
cxgb4: time stamping interface for PTP
nfp: default to chained metadata prepend format
nfp: remove legacy MAC address lookup
nfp: improve order of interfaces in breakout mode
net: macb: remove extraneous return when MACB_EXT_DESC is defined
bpf: add missing break in for the TCP_BPF_SNDCWND_CLAMP case
bpf: fix return in load_bpf_file
mpls: fix rtm policy in mpls_getroute
net, ax25: convert ax25_cb.refcount from atomic_t to refcount_t
net, ax25: convert ax25_route.refcount from atomic_t to refcount_t
net, ax25: convert ax25_uid_assoc.refcount from atomic_t to refcount_t
net, sctp: convert sctp_ep_common.refcnt from atomic_t to refcount_t
net, sctp: convert sctp_transport.refcnt from atomic_t to refcount_t
net, sctp: convert sctp_chunk.refcnt from atomic_t to refcount_t
net, sctp: convert sctp_datamsg.refcnt from atomic_t to refcount_t
net, sctp: convert sctp_auth_bytes.refcnt from atomic_t to refcount_t
...
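
The ax25 and sctp entries above, like the sk_wmem_alloc changes in the tcp_output.c diff below, all apply the same atomic_t to refcount_t conversion. A hedged, self-contained sketch of that pattern (a made-up demo_obj structure, not one of the converted networking structs; only the refcount_*() calls from <linux/refcount.h> are the real API), buildable as a trivial out-of-tree module:

/* Illustration of the conversion pattern only. */
#include <linux/module.h>
#include <linux/refcount.h>
#include <linux/slab.h>

struct demo_obj {
	refcount_t refcnt;			/* was: atomic_t refcnt */
};

static struct demo_obj *demo_get(struct demo_obj *obj)
{
	/* was: atomic_inc(); refcount_inc() additionally saturates on
	 * overflow and WARNs on increment-from-zero (use-after-free hint).
	 */
	refcount_inc(&obj->refcnt);
	return obj;
}

static void demo_put(struct demo_obj *obj)
{
	/* was: if (atomic_dec_and_test(&obj->refcnt)) kfree(obj); */
	if (refcount_dec_and_test(&obj->refcnt))
		kfree(obj);
}

static int __init demo_init(void)
{
	struct demo_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (!obj)
		return -ENOMEM;
	refcount_set(&obj->refcnt, 1);		/* was: atomic_set(..., 1) */
	demo_put(demo_get(obj));		/* take and drop an extra reference */
	demo_put(obj);				/* drops to zero, frees obj */
	return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The refcount_read(), refcount_add() and refcount_sub_and_test() calls that replace atomic_read()/atomic_add()/atomic_sub_return() on sk_wmem_alloc in the diff below follow the same scheme.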
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--	net/ipv4/tcp_output.c	182
1 file changed, 135 insertions, 47 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4858e190f6ac..4e985dea1dd2 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -151,7 +151,7 @@ void tcp_cwnd_restart(struct sock *sk, s32 delta)
 	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
 		cwnd >>= 1;
 	tp->snd_cwnd = max(cwnd, restart_cwnd);
-	tp->snd_cwnd_stamp = tcp_time_stamp;
+	tp->snd_cwnd_stamp = tcp_jiffies32;
 	tp->snd_cwnd_used = 0;
 }
@@ -160,7 +160,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp, struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
-	const u32 now = tcp_time_stamp;
+	const u32 now = tcp_jiffies32;
 	if (tcp_packets_in_flight(tp) == 0)
 		tcp_ca_event(sk, CA_EVENT_TX_START);
@@ -316,7 +316,8 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
 	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
 	if (!(tp->ecn_flags & TCP_ECN_OK))
 		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
-	else if (tcp_ca_needs_ecn(sk))
+	else if (tcp_ca_needs_ecn(sk) ||
+		 tcp_bpf_ca_needs_ecn(sk))
 		INET_ECN_xmit(sk);
 }
@@ -324,8 +325,9 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
 static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
 	bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
-		tcp_ca_needs_ecn(sk);
+		tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
 	if (!use_ecn) {
 		const struct dst_entry *dst = __sk_dst_get(sk);
@@ -339,7 +341,7 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
 	if (use_ecn) {
 		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
 		tp->ecn_flags = TCP_ECN_OK;
-		if (tcp_ca_needs_ecn(sk))
+		if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
 			INET_ECN_xmit(sk);
 	}
 }
@@ -569,18 +571,18 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 	opts->mss = tcp_advertise_mss(sk);
 	remaining -= TCPOLEN_MSS_ALIGNED;
-	if (likely(sysctl_tcp_timestamps && !*md5)) {
+	if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) {
 		opts->options |= OPTION_TS;
 		opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
 		opts->tsecr = tp->rx_opt.ts_recent;
 		remaining -= TCPOLEN_TSTAMP_ALIGNED;
 	}
-	if (likely(sysctl_tcp_window_scaling)) {
+	if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) {
 		opts->ws = tp->rx_opt.rcv_wscale;
 		opts->options |= OPTION_WSCALE;
 		remaining -= TCPOLEN_WSCALE_ALIGNED;
 	}
-	if (likely(sysctl_tcp_sack)) {
+	if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) {
 		opts->options |= OPTION_SACK_ADVERTISE;
 		if (unlikely(!(OPTION_TS & opts->options)))
 			remaining -= TCPOLEN_SACKPERM_ALIGNED;
@@ -861,12 +863,11 @@ void tcp_wfree(struct sk_buff *skb)
 	struct sock *sk = skb->sk;
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned long flags, nval, oval;
-	int wmem;
 	/* Keep one reference on sk_wmem_alloc.
 	 * Will be released by sk_free() from here or tcp_tasklet_func()
 	 */
-	wmem = atomic_sub_return(skb->truesize - 1, &sk->sk_wmem_alloc);
+	WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));
 	/* If this softirq is serviced by ksoftirqd, we are likely under stress.
 	 * Wait until our queues (qdisc + devices) are drained.
@@ -875,7 +876,7 @@ void tcp_wfree(struct sk_buff *skb)
 	 * - chance for incoming ACK (processed by another cpu maybe)
 	 *   to migrate this flow (skb->ooo_okay will be eventually set)
 	 */
-	if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
+	if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
 		goto out;
 	for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
@@ -904,6 +905,72 @@ out:
 	sk_free(sk);
 }
+/* Note: Called under hard irq.
+ * We can not call TCP stack right away.
+ */
+enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
+{
+	struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
+	struct sock *sk = (struct sock *)tp;
+	unsigned long nval, oval;
+
+	for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
+		struct tsq_tasklet *tsq;
+		bool empty;
+
+		if (oval & TSQF_QUEUED)
+			break;
+
+		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
+		nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
+		if (nval != oval)
+			continue;
+
+		if (!refcount_inc_not_zero(&sk->sk_wmem_alloc))
+			break;
+		/* queue this socket to tasklet queue */
+		tsq = this_cpu_ptr(&tsq_tasklet);
+		empty = list_empty(&tsq->head);
+		list_add(&tp->tsq_node, &tsq->head);
+		if (empty)
+			tasklet_schedule(&tsq->tasklet);
+		break;
+	}
+	return HRTIMER_NORESTART;
+}
+
+/* BBR congestion control needs pacing.
+ * Same remark for SO_MAX_PACING_RATE.
+ * sch_fq packet scheduler is efficiently handling pacing,
+ * but is not always installed/used.
+ * Return true if TCP stack should pace packets itself.
+ */
+static bool tcp_needs_internal_pacing(const struct sock *sk)
+{
+	return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
+}
+
+static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
+{
+	u64 len_ns;
+	u32 rate;
+
+	if (!tcp_needs_internal_pacing(sk))
+		return;
+	rate = sk->sk_pacing_rate;
+	if (!rate || rate == ~0U)
+		return;
+
+	/* Should account for header sizes as sch_fq does,
+	 * but lets make things simple.
+	 */
+	len_ns = (u64)skb->len * NSEC_PER_SEC;
+	do_div(len_ns, rate);
+	hrtimer_start(&tcp_sk(sk)->pacing_timer,
+		      ktime_add_ns(ktime_get(), len_ns),
+		      HRTIMER_MODE_ABS_PINNED);
+}
+
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg(). This is used by both the initial
  * transmission and possible later retransmissions.
@@ -931,8 +998,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	BUG_ON(!skb || !tcp_skb_pcount(skb));
 	tp = tcp_sk(sk);
+	skb->skb_mstamp = tp->tcp_mstamp;
 	if (clone_it) {
-		skb_mstamp_get(&skb->skb_mstamp);
 		TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una;
 		tcp_rate_skb_sent(sk, skb);
@@ -979,7 +1046,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	skb->sk = sk;
 	skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
 	skb_set_hash_from_sk(skb, sk);
-	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
+	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
 	skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
@@ -1034,6 +1101,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	if (skb->len != tcp_header_size) {
 		tcp_event_data_sent(tp, sk);
 		tp->data_segs_out += tcp_skb_pcount(skb);
+		tcp_internal_pacing(sk, skb);
 	}
 	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
@@ -1261,9 +1329,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 	return 0;
 }
-/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
- * eventually). The difference is that pulled data not copied, but
- * immediately discarded.
+/* This is similar to __pskb_pull_tail(). The difference is that pulled
+ * data is not copied, but immediately discarded.
  */
 static int __pskb_trim_head(struct sk_buff *skb, int len)
 {
@@ -1298,7 +1365,6 @@ static int __pskb_trim_head(struct sk_buff *skb, int len)
 	}
 	shinfo->nr_frags = k;
-	skb_reset_tail_pointer(skb);
 	skb->data_len -= len;
 	skb->len = skb->data_len;
 	return len;
@@ -1408,7 +1474,7 @@ void tcp_mtup_init(struct sock *sk)
 	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
 	icsk->icsk_mtup.probe_size = 0;
 	if (icsk->icsk_mtup.enabled)
-		icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
+		icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
 }
 EXPORT_SYMBOL(tcp_mtup_init);
@@ -1509,7 +1575,7 @@ static void tcp_cwnd_application_limited(struct sock *sk)
 		}
 		tp->snd_cwnd_used = 0;
 	}
-	tp->snd_cwnd_stamp = tcp_time_stamp;
+	tp->snd_cwnd_stamp = tcp_jiffies32;
 }
 static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
@@ -1530,14 +1596,14 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
 	if (tcp_is_cwnd_limited(sk)) {
 		/* Network is feed fully. */
 		tp->snd_cwnd_used = 0;
-		tp->snd_cwnd_stamp = tcp_time_stamp;
+		tp->snd_cwnd_stamp = tcp_jiffies32;
 	} else {
 		/* Network starves. */
 		if (tp->packets_out > tp->snd_cwnd_used)
 			tp->snd_cwnd_used = tp->packets_out;
 		if (sysctl_tcp_slow_start_after_idle &&
-		    (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
+		    (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
 		    !ca_ops->cong_control)
 			tcp_cwnd_application_limited(sk);
@@ -1839,7 +1905,6 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 age, send_win, cong_win, limit, in_flight;
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct skb_mstamp now;
 	struct sk_buff *head;
 	int win_divisor;
@@ -1852,7 +1917,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 	/* Avoid bursty behavior by allowing defer
 	 * only if the last write was recent.
 	 */
-	if ((s32)(tcp_time_stamp - tp->lsndtime) > 0)
+	if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0)
 		goto send_now;
 	in_flight = tcp_packets_in_flight(tp);
@@ -1895,8 +1960,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 	}
 	head = tcp_write_queue_head(sk);
-	skb_mstamp_get(&now);
-	age = skb_mstamp_us_delta(&now, &head->skb_mstamp);
+
+	age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
 	/* If next ACK is likely to come too late (half srtt), do not defer */
 	if (age < (tp->srtt_us >> 4))
 		goto send_now;
@@ -1921,7 +1986,7 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk)
 	s32 delta;
 	interval = net->ipv4.sysctl_tcp_probe_interval;
-	delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp;
+	delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
 	if (unlikely(delta >= interval * HZ)) {
 		int mss = tcp_current_mss(sk);
@@ -1933,7 +1998,7 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk)
 		icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
 		/* Update probe time stamp */
-		icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
+		icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
 	}
 }
@@ -2086,6 +2151,12 @@ static int tcp_mtu_probe(struct sock *sk)
 	return -1;
 }
+static bool tcp_pacing_check(const struct sock *sk)
+{
+	return tcp_needs_internal_pacing(sk) &&
+	       hrtimer_active(&tcp_sk(sk)->pacing_timer);
+}
+
 /* TCP Small Queues :
  * Control number of packets in qdisc/devices to two packets / or ~1 ms.
  * (These limits are doubled for retransmits)
@@ -2106,7 +2177,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 	limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
 	limit <<= factor;
-	if (atomic_read(&sk->sk_wmem_alloc) > limit) {
+	if (refcount_read(&sk->sk_wmem_alloc) > limit) {
 		/* Always send the 1st or 2nd skb in write queue.
 		 * No need to wait for TX completion to call us back,
 		 * after softirq/tasklet schedule.
@@ -2122,7 +2193,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 		 * test again the condition.
 		 */
 		smp_mb__after_atomic();
-		if (atomic_read(&sk->sk_wmem_alloc) > limit)
+		if (refcount_read(&sk->sk_wmem_alloc) > limit)
 			return true;
 	}
 	return false;
@@ -2130,7 +2201,7 @@ static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
 {
-	const u32 now = tcp_time_stamp;
+	const u32 now = tcp_jiffies32;
 	if (tp->chrono_type > TCP_CHRONO_UNSPEC)
 		tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start;
@@ -2207,15 +2278,19 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	}
 	max_segs = tcp_tso_segs(sk, mss_now);
+	tcp_mstamp_refresh(tp);
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
+		if (tcp_pacing_check(sk))
+			break;
+
 		tso_segs = tcp_init_tso_segs(skb, mss_now);
 		BUG_ON(!tso_segs);
 		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
 			/* "skb_mstamp" is used as a start point for the retransmit timer */
-			skb_mstamp_get(&skb->skb_mstamp);
+			skb->skb_mstamp = tp->tcp_mstamp;
 			goto repair; /* Skip network transmission */
 		}
@@ -2342,10 +2417,10 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 	timeout = max_t(u32, timeout, msecs_to_jiffies(10));
 	/* If RTO is shorter, just schedule TLP in its place. */
-	tlp_time_stamp = tcp_time_stamp + timeout;
+	tlp_time_stamp = tcp_jiffies32 + timeout;
 	rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout;
 	if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) {
-		s32 delta = rto_time_stamp - tcp_time_stamp;
+		s32 delta = rto_time_stamp - tcp_jiffies32;
 		if (delta > 0)
 			timeout = delta;
 	}
@@ -2738,7 +2813,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 	/* Do not sent more than we queued. 1/4 is reserved for possible
 	 * copying overhead: fragmentation, tunneling, mangling etc.
 	 */
-	if (atomic_read(&sk->sk_wmem_alloc) >
+	if (refcount_read(&sk->sk_wmem_alloc) >
 	    min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
 		return -EAGAIN;
@@ -2803,7 +2878,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		     skb_headroom(skb) >= 0xFFFF)) {
 		struct sk_buff *nskb;
-		skb_mstamp_get(&skb->skb_mstamp);
+		skb->skb_mstamp = tp->tcp_mstamp;
 		nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
 		err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : -ENOBUFS;
@@ -2878,6 +2953,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 		if (skb == tcp_send_head(sk))
 			break;
+
+		if (tcp_pacing_check(sk))
+			break;
+
 		/* we could do better than to assign each time */
 		if (!hole)
 			tp->retransmit_skb_hint = skb;
@@ -3015,7 +3094,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
 	skb_reserve(skb, MAX_TCP_HEADER);
 	tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST);
-	skb_mstamp_get(&skb->skb_mstamp);
+	tcp_mstamp_refresh(tcp_sk(sk));
 	/* Send it off. */
 	if (tcp_transmit_skb(sk, skb, 0, priority))
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
@@ -3111,10 +3190,10 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	memset(&opts, 0, sizeof(opts));
 #ifdef CONFIG_SYN_COOKIES
 	if (unlikely(req->cookie_ts))
-		skb->skb_mstamp.stamp_jiffies = cookie_init_timestamp(req);
+		skb->skb_mstamp = cookie_init_timestamp(req);
 	else
 #endif
-	skb_mstamp_get(&skb->skb_mstamp);
+		skb->skb_mstamp = tcp_clock_us();
#ifdef CONFIG_TCP_MD5SIG
 	rcu_read_lock();
@@ -3134,6 +3213,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	tcp_ecn_make_synack(req, th);
 	th->source = htons(ireq->ir_num);
 	th->dest = ireq->ir_rmt_port;
+	skb->mark = ireq->ir_mark;
 	/* Setting of flags are superfluous here for callers (and ECE is
 	 * not even correctly set)
 	 */
@@ -3189,12 +3269,14 @@ static void tcp_connect_init(struct sock *sk)
 	const struct dst_entry *dst = __sk_dst_get(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	__u8 rcv_wscale;
+	u32 rcv_wnd;
 	/* We'll fix this up when we get a response from the other end.
 	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
 	 */
-	tp->tcp_header_len = sizeof(struct tcphdr) +
-		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
+	tp->tcp_header_len = sizeof(struct tcphdr);
+	if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
+		tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
#ifdef CONFIG_TCP_MD5SIG
 	if (tp->af_specific->md5_lookup(sk, sk))
@@ -3221,13 +3303,17 @@ static void tcp_connect_init(struct sock *sk)
 	    (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
 		tp->window_clamp = tcp_full_space(sk);
+	rcv_wnd = tcp_rwnd_init_bpf(sk);
+	if (rcv_wnd == 0)
+		rcv_wnd = dst_metric(dst, RTAX_INITRWND);
+
 	tcp_select_initial_window(tcp_full_space(sk),
 				  tp->advmss - (tp->rx_opt.ts_recent_stamp ?
 				  tp->tcp_header_len - sizeof(struct tcphdr) : 0),
 				  &tp->rcv_wnd,
 				  &tp->window_clamp,
-				  sysctl_tcp_window_scaling,
+				  sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
 				  &rcv_wscale,
-				  dst_metric(dst, RTAX_INITRWND));
+				  rcv_wnd);
 	tp->rx_opt.rcv_wscale = rcv_wscale;
 	tp->rcv_ssthresh = tp->rcv_wnd;
@@ -3244,11 +3330,11 @@ static void tcp_connect_init(struct sock *sk)
 	if (likely(!tp->repair))
 		tp->rcv_nxt = 0;
 	else
-		tp->rcv_tstamp = tcp_time_stamp;
+		tp->rcv_tstamp = tcp_jiffies32;
 	tp->rcv_wup = tp->rcv_nxt;
 	tp->copied_seq = tp->rcv_nxt;
-	inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
+	inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
 	inet_csk(sk)->icsk_retransmits = 0;
 	tcp_clear_retrans(tp);
 }
@@ -3361,6 +3447,7 @@ int tcp_connect(struct sock *sk)
 	struct sk_buff *buff;
 	int err;
+	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB);
 	tcp_connect_init(sk);
 	if (unlikely(tp->repair)) {
@@ -3373,7 +3460,8 @@ int tcp_connect(struct sock *sk)
 		return -ENOBUFS;
 	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
-	tp->retrans_stamp = tcp_time_stamp;
+	tcp_mstamp_refresh(tp);
+	tp->retrans_stamp = tcp_time_stamp(tp);
 	tcp_connect_queue_skb(sk, buff);
 	tcp_ecn_send_syn(sk, buff);
@@ -3492,7 +3580,6 @@ void tcp_send_ack(struct sock *sk)
 	skb_set_tcp_pure_ack(buff);
 	/* Send it off, this clears delayed acks for us. */
-	skb_mstamp_get(&buff->skb_mstamp);
 	tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0);
 }
 EXPORT_SYMBOL_GPL(tcp_send_ack);
@@ -3526,15 +3613,16 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
 	 * send it.
 	 */
 	tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
-	skb_mstamp_get(&skb->skb_mstamp);
 	NET_INC_STATS(sock_net(sk), mib);
 	return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
 }
+/* Called from setsockopt( ... TCP_REPAIR ) */
 void tcp_send_window_probe(struct sock *sk)
 {
 	if (sk->sk_state == TCP_ESTABLISHED) {
 		tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
+		tcp_mstamp_refresh(tcp_sk(sk));
 		tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
 	}
 }
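
The tcp_internal_pacing() helper added above arms tp->pacing_timer for roughly skb->len * NSEC_PER_SEC / sk_pacing_rate nanoseconds after each data segment, but only while sk_pacing_status is SK_PACING_NEEDED, i.e. when sch_fq is not already pacing the flow. A hedged userspace sketch (not from this patch) of one way such a rate ends up on a socket, via the pre-existing SO_MAX_PACING_RATE option:

/* Illustrative only: cap the socket's pacing rate from userspace so that
 * either sch_fq or, failing that, the internal TCP pacing added above
 * spaces out transmissions. SO_MAX_PACING_RATE itself predates this merge;
 * the hrtimer fallback is what the diff adds.
 */
#include <stdio.h>
#include <sys/socket.h>

#ifndef SO_MAX_PACING_RATE
#define SO_MAX_PACING_RATE 47	/* value from asm-generic/socket.h */
#endif

int main(void)
{
	unsigned int rate = 1250000;	/* bytes per second, i.e. 10 Mbit/s */
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0 || setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE,
				 &rate, sizeof(rate))) {
		perror("SO_MAX_PACING_RATE");
		return 1;
	}
	/* At this rate a 1448-byte segment is followed by a gap of about
	 * 1448 * 1e9 / 1250000 ns, roughly 1.16 ms, which is the delay
	 * tcp_internal_pacing() programs into tp->pacing_timer.
	 */
	return 0;
}

When neither sch_fq nor a pacing request is present, nothing changes: tcp_needs_internal_pacing() returns false and no timer is armed.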