summaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp_output.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-07-05 12:31:59 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2017-07-05 12:31:59 -0700
commit5518b69b76680a4f2df96b1deca260059db0c2de (patch)
treef33cd1519c8efb4590500f2f9617400be233238c /net/ipv4/tcp_output.c
parent8ad06e56dcbc1984ef0ff8f6e3c19982c5809f73 (diff)
parent0e72582270c07850b92cac351c8b97d4f9c123b9 (diff)
downloadlinux-5518b69b76680a4f2df96b1deca260059db0c2de.tar.bz2
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: "Reasonably busy this cycle, but perhaps not as busy as in the 4.12 merge window: 1) Several optimizations for UDP processing under high load from Paolo Abeni. 2) Support pacing internally in TCP when using the sch_fq packet scheduler for this is not practical. From Eric Dumazet. 3) Support mutliple filter chains per qdisc, from Jiri Pirko. 4) Move to 1ms TCP timestamp clock, from Eric Dumazet. 5) Add batch dequeueing to vhost_net, from Jason Wang. 6) Flesh out more completely SCTP checksum offload support, from Davide Caratti. 7) More plumbing of extended netlink ACKs, from David Ahern, Pablo Neira Ayuso, and Matthias Schiffer. 8) Add devlink support to nfp driver, from Simon Horman. 9) Add RTM_F_FIB_MATCH flag to RTM_GETROUTE queries, from Roopa Prabhu. 10) Add stack depth tracking to BPF verifier and use this information in the various eBPF JITs. From Alexei Starovoitov. 11) Support XDP on qed device VFs, from Yuval Mintz. 12) Introduce BPF PROG ID for better introspection of installed BPF programs. From Martin KaFai Lau. 13) Add bpf_set_hash helper for TC bpf programs, from Daniel Borkmann. 14) For loads, allow narrower accesses in bpf verifier checking, from Yonghong Song. 15) Support MIPS in the BPF selftests and samples infrastructure, the MIPS eBPF JIT will be merged in via the MIPS GIT tree. From David Daney. 16) Support kernel based TLS, from Dave Watson and others. 17) Remove completely DST garbage collection, from Wei Wang. 18) Allow installing TCP MD5 rules using prefixes, from Ivan Delalande. 19) Add XDP support to Intel i40e driver, from Björn Töpel 20) Add support for TC flower offload in nfp driver, from Simon Horman, Pieter Jansen van Vuuren, Benjamin LaHaise, Jakub Kicinski, and Bert van Leeuwen. 21) IPSEC offloading support in mlx5, from Ilan Tayari. 22) Add HW PTP support to macb driver, from Rafal Ozieblo. 23) Networking refcount_t conversions, From Elena Reshetova. 24) Add sock_ops support to BPF, from Lawrence Brako. This is useful for tuning the TCP sockopt settings of a group of applications, currently via CGROUPs" * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1899 commits) net: phy: dp83867: add workaround for incorrect RX_CTRL pin strap dt-bindings: phy: dp83867: provide a workaround for incorrect RX_CTRL pin strap cxgb4: Support for get_ts_info ethtool method cxgb4: Add PTP Hardware Clock (PHC) support cxgb4: time stamping interface for PTP nfp: default to chained metadata prepend format nfp: remove legacy MAC address lookup nfp: improve order of interfaces in breakout mode net: macb: remove extraneous return when MACB_EXT_DESC is defined bpf: add missing break in for the TCP_BPF_SNDCWND_CLAMP case bpf: fix return in load_bpf_file mpls: fix rtm policy in mpls_getroute net, ax25: convert ax25_cb.refcount from atomic_t to refcount_t net, ax25: convert ax25_route.refcount from atomic_t to refcount_t net, ax25: convert ax25_uid_assoc.refcount from atomic_t to refcount_t net, sctp: convert sctp_ep_common.refcnt from atomic_t to refcount_t net, sctp: convert sctp_transport.refcnt from atomic_t to refcount_t net, sctp: convert sctp_chunk.refcnt from atomic_t to refcount_t net, sctp: convert sctp_datamsg.refcnt from atomic_t to refcount_t net, sctp: convert sctp_auth_bytes.refcnt from atomic_t to refcount_t ...
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--net/ipv4/tcp_output.c182
1 files changed, 135 insertions, 47 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4858e190f6ac..4e985dea1dd2 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -151,7 +151,7 @@ void tcp_cwnd_restart(struct sock *sk, s32 delta)
while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
cwnd >>= 1;
tp->snd_cwnd = max(cwnd, restart_cwnd);
- tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->snd_cwnd_stamp = tcp_jiffies32;
tp->snd_cwnd_used = 0;
}
@@ -160,7 +160,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- const u32 now = tcp_time_stamp;
+ const u32 now = tcp_jiffies32;
if (tcp_packets_in_flight(tp) == 0)
tcp_ca_event(sk, CA_EVENT_TX_START);
@@ -316,7 +316,8 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
if (!(tp->ecn_flags & TCP_ECN_OK))
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
- else if (tcp_ca_needs_ecn(sk))
+ else if (tcp_ca_needs_ecn(sk) ||
+ tcp_bpf_ca_needs_ecn(sk))
INET_ECN_xmit(sk);
}
@@ -324,8 +325,9 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
+ bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
- tcp_ca_needs_ecn(sk);
+ tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
if (!use_ecn) {
const struct dst_entry *dst = __sk_dst_get(sk);
@@ -339,7 +341,7 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
if (use_ecn) {
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
tp->ecn_flags = TCP_ECN_OK;
- if (tcp_ca_needs_ecn(sk))
+ if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
INET_ECN_xmit(sk);
}
}
@@ -569,18 +571,18 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
opts->mss = tcp_advertise_mss(sk);
remaining -= TCPOLEN_MSS_ALIGNED;
- if (likely(sysctl_tcp_timestamps && !*md5)) {
+ if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) {
opts->options |= OPTION_TS;
opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
opts->tsecr = tp->rx_opt.ts_recent;
remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
- if (likely(sysctl_tcp_window_scaling)) {
+ if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) {
opts->ws = tp->rx_opt.rcv_wscale;
opts->options |= OPTION_WSCALE;
remaining -= TCPOLEN_WSCALE_ALIGNED;
}
- if (likely(sysctl_tcp_sack)) {
+ if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) {
opts->options |= OPTION_SACK_ADVERTISE;
if (unlikely(!(OPTION_TS & opts->options)))
remaining -= TCPOLEN_SACKPERM_ALIGNED;
@@ -861,12 +863,11 @@ void tcp_wfree(struct sk_buff *skb)
struct sock *sk = skb->sk;
struct tcp_sock *tp = tcp_sk(sk);
unsigned long flags, nval, oval;
- int wmem;
/* Keep one reference on sk_wmem_alloc.
* Will be released by sk_free() from here or tcp_tasklet_func()
*/
- wmem = atomic_sub_return(skb->truesize - 1, &sk->sk_wmem_alloc);
+ WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));
/* If this softirq is serviced by ksoftirqd, we are likely under stress.
* Wait until our queues (qdisc + devices) are drained.
@@ -875,7 +876,7 @@ void tcp_wfree(struct sk_buff *skb)
* - chance for incoming ACK (processed by another cpu maybe)
* to migrate this flow (skb->ooo_okay will be eventually set)
*/
- if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
+ if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
goto out;
for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
@@ -904,6 +905,72 @@ out:
sk_free(sk);
}
+/* Note: Called under hard irq.
+ * We can not call TCP stack right away.
+ */
+enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
+{
+ struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
+ struct sock *sk = (struct sock *)tp;
+ unsigned long nval, oval;
+
+ for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
+ struct tsq_tasklet *tsq;
+ bool empty;
+
+ if (oval & TSQF_QUEUED)
+ break;
+
+ nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
+ nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
+ if (nval != oval)
+ continue;
+
+ if (!refcount_inc_not_zero(&sk->sk_wmem_alloc))
+ break;
+ /* queue this socket to tasklet queue */
+ tsq = this_cpu_ptr(&tsq_tasklet);
+ empty = list_empty(&tsq->head);
+ list_add(&tp->tsq_node, &tsq->head);
+ if (empty)
+ tasklet_schedule(&tsq->tasklet);
+ break;
+ }
+ return HRTIMER_NORESTART;
+}
+
+/* BBR congestion control needs pacing.
+ * Same remark for SO_MAX_PACING_RATE.
+ * sch_fq packet scheduler is efficiently handling pacing,
+ * but is not always installed/used.
+ * Return true if TCP stack should pace packets itself.
+ */
+static bool tcp_needs_internal_pacing(const struct sock *sk)
+{
+ return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
+}
+
+static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
+{
+ u64 len_ns;
+ u32 rate;
+
+ if (!tcp_needs_internal_pacing(sk))
+ return;
+ rate = sk->sk_pacing_rate;
+ if (!rate || rate == ~0U)
+ return;
+
+ /* Should account for header sizes as sch_fq does,
+ * but lets make things simple.
+ */
+ len_ns = (u64)skb->len * NSEC_PER_SEC;
+ do_div(len_ns, rate);
+ hrtimer_start(&tcp_sk(sk)->pacing_timer,
+ ktime_add_ns(ktime_get(), len_ns),
+ HRTIMER_MODE_ABS_PINNED);
+}
+
/* This routine actually transmits TCP packets queued in by
* tcp_do_sendmsg(). This is used by both the initial
* transmission and possible later retransmissions.
@@ -931,8 +998,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
BUG_ON(!skb || !tcp_skb_pcount(skb));
tp = tcp_sk(sk);
+ skb->skb_mstamp = tp->tcp_mstamp;
if (clone_it) {
- skb_mstamp_get(&skb->skb_mstamp);
TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
- tp->snd_una;
tcp_rate_skb_sent(sk, skb);
@@ -979,7 +1046,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb->sk = sk;
skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
skb_set_hash_from_sk(skb, sk);
- atomic_add(skb->truesize, &sk->sk_wmem_alloc);
+ refcount_add(skb->truesize, &sk->sk_wmem_alloc);
skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
@@ -1034,6 +1101,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
if (skb->len != tcp_header_size) {
tcp_event_data_sent(tp, sk);
tp->data_segs_out += tcp_skb_pcount(skb);
+ tcp_internal_pacing(sk, skb);
}
if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
@@ -1261,9 +1329,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
return 0;
}
-/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
- * eventually). The difference is that pulled data not copied, but
- * immediately discarded.
+/* This is similar to __pskb_pull_tail(). The difference is that pulled
+ * data is not copied, but immediately discarded.
*/
static int __pskb_trim_head(struct sk_buff *skb, int len)
{
@@ -1298,7 +1365,6 @@ static int __pskb_trim_head(struct sk_buff *skb, int len)
}
shinfo->nr_frags = k;
- skb_reset_tail_pointer(skb);
skb->data_len -= len;
skb->len = skb->data_len;
return len;
@@ -1408,7 +1474,7 @@ void tcp_mtup_init(struct sock *sk)
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
icsk->icsk_mtup.probe_size = 0;
if (icsk->icsk_mtup.enabled)
- icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
+ icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
}
EXPORT_SYMBOL(tcp_mtup_init);
@@ -1509,7 +1575,7 @@ static void tcp_cwnd_application_limited(struct sock *sk)
}
tp->snd_cwnd_used = 0;
}
- tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->snd_cwnd_stamp = tcp_jiffies32;
}
static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
@@ -1530,14 +1596,14 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
if (tcp_is_cwnd_limited(sk)) {
/* Network is feed fully. */
tp->snd_cwnd_used = 0;
- tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->snd_cwnd_stamp = tcp_jiffies32;
} else {
/* Network starves. */
if (tp->packets_out > tp->snd_cwnd_used)
tp->snd_cwnd_used = tp->packets_out;
if (sysctl_tcp_slow_start_after_idle &&
- (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
+ (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
!ca_ops->cong_control)
tcp_cwnd_application_limited(sk);
@@ -1839,7 +1905,6 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
const struct inet_connection_sock *icsk = inet_csk(sk);
u32 age, send_win, cong_win, limit, in_flight;
struct tcp_sock *tp = tcp_sk(sk);
- struct skb_mstamp now;
struct sk_buff *head;
int win_divisor;
@@ -1852,7 +1917,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
/* Avoid bursty behavior by allowing defer
* only if the last write was recent.
*/
- if ((s32)(tcp_time_stamp - tp->lsndtime) > 0)
+ if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0)
goto send_now;
in_flight = tcp_packets_in_flight(tp);
@@ -1895,8 +1960,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
}
head = tcp_write_queue_head(sk);
- skb_mstamp_get(&now);
- age = skb_mstamp_us_delta(&now, &head->skb_mstamp);
+
+ age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
/* If next ACK is likely to come too late (half srtt), do not defer */
if (age < (tp->srtt_us >> 4))
goto send_now;
@@ -1921,7 +1986,7 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk)
s32 delta;
interval = net->ipv4.sysctl_tcp_probe_interval;
- delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp;
+ delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
if (unlikely(delta >= interval * HZ)) {
int mss = tcp_current_mss(sk);
@@ -1933,7 +1998,7 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk)
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
/* Update probe time stamp */
- icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
+ icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
}
}
@@ -2086,6 +2151,12 @@ static int tcp_mtu_probe(struct sock *sk)
return -1;
}
+static bool tcp_pacing_check(const struct sock *sk)
+{
+ return tcp_needs_internal_pacing(sk) &&
+ hrtimer_active(&tcp_sk(sk)->pacing_timer);
+}
+
/* TCP Small Queues :
* Control number of packets in qdisc/devices to two packets / or ~1 ms.
* (These limits are doubled for retransmits)
@@ -2106,7 +2177,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
limit <<= factor;
- if (atomic_read(&sk->sk_wmem_alloc) > limit) {
+ if (refcount_read(&sk->sk_wmem_alloc) > limit) {
/* Always send the 1st or 2nd skb in write queue.
* No need to wait for TX completion to call us back,
* after softirq/tasklet schedule.
@@ -2122,7 +2193,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
* test again the condition.
*/
smp_mb__after_atomic();
- if (atomic_read(&sk->sk_wmem_alloc) > limit)
+ if (refcount_read(&sk->sk_wmem_alloc) > limit)
return true;
}
return false;
@@ -2130,7 +2201,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
{
- const u32 now = tcp_time_stamp;
+ const u32 now = tcp_jiffies32;
if (tp->chrono_type > TCP_CHRONO_UNSPEC)
tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start;
@@ -2207,15 +2278,19 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
}
max_segs = tcp_tso_segs(sk, mss_now);
+ tcp_mstamp_refresh(tp);
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
+ if (tcp_pacing_check(sk))
+ break;
+
tso_segs = tcp_init_tso_segs(skb, mss_now);
BUG_ON(!tso_segs);
if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
/* "skb_mstamp" is used as a start point for the retransmit timer */
- skb_mstamp_get(&skb->skb_mstamp);
+ skb->skb_mstamp = tp->tcp_mstamp;
goto repair; /* Skip network transmission */
}
@@ -2342,10 +2417,10 @@ bool tcp_schedule_loss_probe(struct sock *sk)
timeout = max_t(u32, timeout, msecs_to_jiffies(10));
/* If RTO is shorter, just schedule TLP in its place. */
- tlp_time_stamp = tcp_time_stamp + timeout;
+ tlp_time_stamp = tcp_jiffies32 + timeout;
rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout;
if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) {
- s32 delta = rto_time_stamp - tcp_time_stamp;
+ s32 delta = rto_time_stamp - tcp_jiffies32;
if (delta > 0)
timeout = delta;
}
@@ -2738,7 +2813,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
/* Do not sent more than we queued. 1/4 is reserved for possible
* copying overhead: fragmentation, tunneling, mangling etc.
*/
- if (atomic_read(&sk->sk_wmem_alloc) >
+ if (refcount_read(&sk->sk_wmem_alloc) >
min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2),
sk->sk_sndbuf))
return -EAGAIN;
@@ -2803,7 +2878,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
skb_headroom(skb) >= 0xFFFF)) {
struct sk_buff *nskb;
- skb_mstamp_get(&skb->skb_mstamp);
+ skb->skb_mstamp = tp->tcp_mstamp;
nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
-ENOBUFS;
@@ -2878,6 +2953,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (skb == tcp_send_head(sk))
break;
+
+ if (tcp_pacing_check(sk))
+ break;
+
/* we could do better than to assign each time */
if (!hole)
tp->retransmit_skb_hint = skb;
@@ -3015,7 +3094,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
skb_reserve(skb, MAX_TCP_HEADER);
tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
TCPHDR_ACK | TCPHDR_RST);
- skb_mstamp_get(&skb->skb_mstamp);
+ tcp_mstamp_refresh(tcp_sk(sk));
/* Send it off. */
if (tcp_transmit_skb(sk, skb, 0, priority))
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
@@ -3111,10 +3190,10 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
memset(&opts, 0, sizeof(opts));
#ifdef CONFIG_SYN_COOKIES
if (unlikely(req->cookie_ts))
- skb->skb_mstamp.stamp_jiffies = cookie_init_timestamp(req);
+ skb->skb_mstamp = cookie_init_timestamp(req);
else
#endif
- skb_mstamp_get(&skb->skb_mstamp);
+ skb->skb_mstamp = tcp_clock_us();
#ifdef CONFIG_TCP_MD5SIG
rcu_read_lock();
@@ -3134,6 +3213,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
tcp_ecn_make_synack(req, th);
th->source = htons(ireq->ir_num);
th->dest = ireq->ir_rmt_port;
+ skb->mark = ireq->ir_mark;
/* Setting of flags are superfluous here for callers (and ECE is
* not even correctly set)
*/
@@ -3189,12 +3269,14 @@ static void tcp_connect_init(struct sock *sk)
const struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
__u8 rcv_wscale;
+ u32 rcv_wnd;
/* We'll fix this up when we get a response from the other end.
* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
*/
- tp->tcp_header_len = sizeof(struct tcphdr) +
- (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
+ tp->tcp_header_len = sizeof(struct tcphdr);
+ if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
+ tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
#ifdef CONFIG_TCP_MD5SIG
if (tp->af_specific->md5_lookup(sk, sk))
@@ -3221,13 +3303,17 @@ static void tcp_connect_init(struct sock *sk)
(tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
tp->window_clamp = tcp_full_space(sk);
+ rcv_wnd = tcp_rwnd_init_bpf(sk);
+ if (rcv_wnd == 0)
+ rcv_wnd = dst_metric(dst, RTAX_INITRWND);
+
tcp_select_initial_window(tcp_full_space(sk),
tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
&tp->rcv_wnd,
&tp->window_clamp,
- sysctl_tcp_window_scaling,
+ sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
&rcv_wscale,
- dst_metric(dst, RTAX_INITRWND));
+ rcv_wnd);
tp->rx_opt.rcv_wscale = rcv_wscale;
tp->rcv_ssthresh = tp->rcv_wnd;
@@ -3244,11 +3330,11 @@ static void tcp_connect_init(struct sock *sk)
if (likely(!tp->repair))
tp->rcv_nxt = 0;
else
- tp->rcv_tstamp = tcp_time_stamp;
+ tp->rcv_tstamp = tcp_jiffies32;
tp->rcv_wup = tp->rcv_nxt;
tp->copied_seq = tp->rcv_nxt;
- inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
+ inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
inet_csk(sk)->icsk_retransmits = 0;
tcp_clear_retrans(tp);
}
@@ -3361,6 +3447,7 @@ int tcp_connect(struct sock *sk)
struct sk_buff *buff;
int err;
+ tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB);
tcp_connect_init(sk);
if (unlikely(tp->repair)) {
@@ -3373,7 +3460,8 @@ int tcp_connect(struct sock *sk)
return -ENOBUFS;
tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
- tp->retrans_stamp = tcp_time_stamp;
+ tcp_mstamp_refresh(tp);
+ tp->retrans_stamp = tcp_time_stamp(tp);
tcp_connect_queue_skb(sk, buff);
tcp_ecn_send_syn(sk, buff);
@@ -3492,7 +3580,6 @@ void tcp_send_ack(struct sock *sk)
skb_set_tcp_pure_ack(buff);
/* Send it off, this clears delayed acks for us. */
- skb_mstamp_get(&buff->skb_mstamp);
tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0);
}
EXPORT_SYMBOL_GPL(tcp_send_ack);
@@ -3526,15 +3613,16 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
* send it.
*/
tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
- skb_mstamp_get(&skb->skb_mstamp);
NET_INC_STATS(sock_net(sk), mib);
return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
}
+/* Called from setsockopt( ... TCP_REPAIR ) */
void tcp_send_window_probe(struct sock *sk)
{
if (sk->sk_state == TCP_ESTABLISHED) {
tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
+ tcp_mstamp_refresh(tcp_sk(sk));
tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
}
}