summaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp_input.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--net/ipv4/tcp_input.c221
1 files changed, 201 insertions, 20 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7616cd76f6f6..c5d7656beeee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -103,6 +103,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define FLAG_DATA_SACKED 0x20 /* New SACK. */
#define FLAG_ECE 0x40 /* ECE in this ACK */
#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */
+#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
@@ -3371,6 +3372,12 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
if (tp->snd_wnd != nwin) {
tp->snd_wnd = nwin;
+ /* Note, it is the only place, where
+ * fast path is recovered for sending TCP.
+ */
+ tp->pred_flags = 0;
+ tcp_fast_path_check(sk);
+
if (tcp_send_head(sk))
tcp_slow_start_after_idle_check(sk);
@@ -3552,7 +3559,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
u32 lost = tp->lost;
int acked = 0; /* Number of packets newly acked */
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
- u32 ack_ev_flags = 0;
sack_state.first_sackt = 0;
sack_state.rate = &rs;
@@ -3593,26 +3599,42 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (flag & FLAG_UPDATE_TS_RECENT)
tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
- if (ack_seq != TCP_SKB_CB(skb)->end_seq)
- flag |= FLAG_DATA;
- else
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
+ if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
+ /* Window is constant, pure forward advance.
+ * No more checks are required.
+ * Note, we use the fact that SND.UNA>=SND.WL2.
+ */
+ tcp_update_wl(tp, ack_seq);
+ tcp_snd_una_update(tp, ack);
+ flag |= FLAG_WIN_UPDATE;
- flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
+ tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
- if (TCP_SKB_CB(skb)->sacked)
- flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
- &sack_state);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
+ } else {
+ u32 ack_ev_flags = CA_ACK_SLOWPATH;
- if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
- flag |= FLAG_ECE;
- ack_ev_flags = CA_ACK_ECE;
- }
+ if (ack_seq != TCP_SKB_CB(skb)->end_seq)
+ flag |= FLAG_DATA;
+ else
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
+
+ flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
- if (flag & FLAG_WIN_UPDATE)
- ack_ev_flags |= CA_ACK_WIN_UPDATE;
+ if (TCP_SKB_CB(skb)->sacked)
+ flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+ &sack_state);
- tcp_in_ack_event(sk, ack_ev_flags);
+ if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
+ flag |= FLAG_ECE;
+ ack_ev_flags |= CA_ACK_ECE;
+ }
+
+ if (flag & FLAG_WIN_UPDATE)
+ ack_ev_flags |= CA_ACK_WIN_UPDATE;
+
+ tcp_in_ack_event(sk, ack_ev_flags);
+ }
/* We passed data and got it acked, remove any soft error
* log. Something worked...
@@ -4404,6 +4426,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
if (TCP_SKB_CB(skb)->has_rxtstamp)
TCP_SKB_CB(skb)->swtstamp = skb->tstamp;
+ /* Disable header prediction. */
+ tp->pred_flags = 0;
inet_csk_schedule_ack(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
@@ -4644,6 +4668,8 @@ queue_and_out:
if (tp->rx_opt.num_sacks)
tcp_sack_remove(tp);
+ tcp_fast_path_check(sk);
+
if (eaten > 0)
kfree_skb_partial(skb, fragstolen);
if (!sock_flag(sk, SOCK_DEAD))
@@ -4969,6 +4995,7 @@ static int tcp_prune_queue(struct sock *sk)
NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
/* Massive buffer overcommit. */
+ tp->pred_flags = 0;
return -1;
}
@@ -5140,6 +5167,9 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
tp->urg_data = TCP_URG_NOTYET;
tp->urg_seq = ptr;
+
+ /* Disable header prediction. */
+ tp->pred_flags = 0;
}
/* This is the 'fast' part of urgent handling. */
@@ -5298,6 +5328,26 @@ discard:
/*
* TCP receive function for the ESTABLISHED state.
+ *
+ * It is split into a fast path and a slow path. The fast path is
+ * disabled when:
+ * - A zero window was announced from us - zero window probing
+ * is only handled properly in the slow path.
+ * - Out of order segments arrived.
+ * - Urgent data is expected.
+ * - There is no buffer space left
+ * - Unexpected TCP flags/window values/header lengths are received
+ * (detected by checking the TCP header against pred_flags)
+ * - Data is sent in both directions. Fast path only supports pure senders
+ * or pure receivers (this means either the sequence number or the ack
+ * value must stay constant)
+ * - Unexpected TCP option.
+ *
+ * When these conditions are not satisfied it drops into a standard
+ * receive procedure patterned after RFC793 to handle all cases.
+ * The first three cases are guaranteed by proper pred_flags setting,
+ * the rest is checked inline. Fast processing is turned on in
+ * tcp_data_queue when everything is OK.
*/
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th)
@@ -5308,19 +5358,144 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tcp_mstamp_refresh(tp);
if (unlikely(!sk->sk_rx_dst))
inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
+ /*
+ * Header prediction.
+ * The code loosely follows the one in the famous
+ * "30 instruction TCP receive" Van Jacobson mail.
+ *
+ * Van's trick is to deposit buffers into socket queue
+ * on a device interrupt, to call tcp_recv function
+ * on the receive process context and checksum and copy
+ * the buffer to user space. smart...
+ *
+ * Our current scheme is not silly either but we take the
+ * extra cost of the net_bh soft interrupt processing...
+ * We do checksum and copy also but from device to kernel.
+ */
tp->rx_opt.saw_tstamp = 0;
+ /* pred_flags is 0xS?10 << 16 + snd_wnd
+ * if header_prediction is to be made
+ * 'S' will always be tp->tcp_header_len >> 2
+ * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
+ * turn it off (when there are holes in the receive
+ * space for instance)
+ * PSH flag is ignored.
+ */
+
+ if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
+ TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
+ !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
+ int tcp_header_len = tp->tcp_header_len;
+
+ /* Timestamp header prediction: tcp_header_len
+ * is automatically equal to th->doff*4 due to pred_flags
+ * match.
+ */
+
+ /* Check timestamp */
+ if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
+ /* No? Slow path! */
+ if (!tcp_parse_aligned_timestamp(tp, th))
+ goto slow_path;
+
+ /* If PAWS failed, check it more carefully in slow path */
+ if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
+ goto slow_path;
+
+ /* DO NOT update ts_recent here, if checksum fails
+ * and timestamp was corrupted part, it will result
+ * in a hung connection since we will drop all
+ * future packets due to the PAWS test.
+ */
+ }
+
+ if (len <= tcp_header_len) {
+ /* Bulk data transfer: sender */
+ if (len == tcp_header_len) {
+ /* Predicted packet is in window by definition.
+ * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+ * Hence, check seq<=rcv_wup reduces to:
+ */
+ if (tcp_header_len ==
+ (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+ tp->rcv_nxt == tp->rcv_wup)
+ tcp_store_ts_recent(tp);
+
+ /* We know that such packets are checksummed
+ * on entry.
+ */
+ tcp_ack(sk, skb, 0);
+ __kfree_skb(skb);
+ tcp_data_snd_check(sk);
+ return;
+ } else { /* Header too small */
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+ goto discard;
+ }
+ } else {
+ int eaten = 0;
+ bool fragstolen = false;
+
+ if (tcp_checksum_complete(skb))
+ goto csum_error;
+
+ if ((int)skb->truesize > sk->sk_forward_alloc)
+ goto step5;
+
+ /* Predicted packet is in window by definition.
+ * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+ * Hence, check seq<=rcv_wup reduces to:
+ */
+ if (tcp_header_len ==
+ (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+ tp->rcv_nxt == tp->rcv_wup)
+ tcp_store_ts_recent(tp);
+
+ tcp_rcv_rtt_measure_ts(sk, skb);
+
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
+
+ /* Bulk data transfer: receiver */
+ eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
+ &fragstolen);
+
+ tcp_event_data_recv(sk, skb);
+
+ if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
+ /* Well, only one small jumplet in fast path... */
+ tcp_ack(sk, skb, FLAG_DATA);
+ tcp_data_snd_check(sk);
+ if (!inet_csk_ack_scheduled(sk))
+ goto no_ack;
+ }
+
+ __tcp_ack_snd_check(sk, 0);
+no_ack:
+ if (eaten)
+ kfree_skb_partial(skb, fragstolen);
+ sk->sk_data_ready(sk);
+ return;
+ }
+ }
+
+slow_path:
if (len < (th->doff << 2) || tcp_checksum_complete(skb))
goto csum_error;
if (!th->ack && !th->rst && !th->syn)
goto discard;
+ /*
+ * Standard slow path.
+ */
+
if (!tcp_validate_incoming(sk, skb, th, 1))
return;
- if (tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT) < 0)
+step5:
+ if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
goto discard;
tcp_rcv_rtt_measure_ts(sk, skb);
@@ -5373,6 +5548,11 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
if (sock_flag(sk, SOCK_KEEPOPEN))
inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
+
+ if (!tp->rx_opt.snd_wscale)
+ __tcp_fast_path_on(tp, tp->snd_wnd);
+ else
+ tp->pred_flags = 0;
}
static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
@@ -5501,7 +5681,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_ecn_rcv_synack(tp, th);
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
- tcp_ack(sk, skb, 0);
+ tcp_ack(sk, skb, FLAG_SLOWPATH);
/* Ok.. it's good. Set up sequence numbers and
* move to established.
@@ -5737,8 +5917,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
return 0;
/* step 5: check the ACK field */
-
- acceptable = tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT |
+ acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
+ FLAG_UPDATE_TS_RECENT |
FLAG_NO_CHALLENGE_ACK) > 0;
if (!acceptable) {
@@ -5806,6 +5986,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tp->lsndtime = tcp_jiffies32;
tcp_initialize_rcv_mss(sk);
+ tcp_fast_path_on(tp);
break;
case TCP_FIN_WAIT1: {