diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/Kconfig | 34 | ||||
-rw-r--r-- | net/ipv4/af_inet.c | 6 | ||||
-rw-r--r-- | net/ipv4/esp4_offload.c | 1 | ||||
-rw-r--r-- | net/ipv4/fib_semantics.c | 2 | ||||
-rw-r--r-- | net/ipv4/fou.c | 1 | ||||
-rw-r--r-- | net/ipv4/ip_output.c | 6 | ||||
-rw-r--r-- | net/ipv4/ip_tunnel.c | 14 | ||||
-rw-r--r-- | net/ipv4/netfilter/ip_tables.c | 15 | ||||
-rw-r--r-- | net/ipv4/netfilter/ipt_SYNPROXY.c | 1 | ||||
-rw-r--r-- | net/ipv4/netfilter/iptable_filter.c | 10 | ||||
-rw-r--r-- | net/ipv4/netfilter/iptable_mangle.c | 10 | ||||
-rw-r--r-- | net/ipv4/netfilter/iptable_nat.c | 10 | ||||
-rw-r--r-- | net/ipv4/netfilter/iptable_raw.c | 10 | ||||
-rw-r--r-- | net/ipv4/netfilter/iptable_security.c | 11 | ||||
-rw-r--r-- | net/ipv4/netfilter/nf_flow_table_ipv4.c | 1 | ||||
-rw-r--r-- | net/ipv4/netfilter/nft_dup_ipv4.c | 1 | ||||
-rw-r--r-- | net/ipv4/netfilter/nft_fib_ipv4.c | 1 | ||||
-rw-r--r-- | net/ipv4/netfilter/nft_reject_ipv4.c | 1 | ||||
-rw-r--r-- | net/ipv4/tcp_cubic.c | 5 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 80 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 12 |
21 files changed, 165 insertions, 67 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 6ecbb0ced177..e64e59b536d3 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -340,29 +340,31 @@ config NET_FOU_IP_TUNNELS config INET_AH tristate "IP: AH transformation" - select XFRM_ALGO - select CRYPTO - select CRYPTO_HMAC - select CRYPTO_MD5 - select CRYPTO_SHA1 + select XFRM_AH help - Support for IPsec AH. + Support for IPsec AH (Authentication Header). + + AH can be used with various authentication algorithms. Besides + enabling AH support itself, this option enables the generic + implementations of the algorithms that RFC 8221 lists as MUST be + implemented. If you need any other algorithms, you'll need to enable + them in the crypto API. You should also enable accelerated + implementations of any needed algorithms when available. If unsure, say Y. config INET_ESP tristate "IP: ESP transformation" - select XFRM_ALGO - select CRYPTO - select CRYPTO_AUTHENC - select CRYPTO_HMAC - select CRYPTO_MD5 - select CRYPTO_CBC - select CRYPTO_SHA1 - select CRYPTO_DES - select CRYPTO_ECHAINIV + select XFRM_ESP help - Support for IPsec ESP. + Support for IPsec ESP (Encapsulating Security Payload). + + ESP can be used with various encryption and authentication algorithms. + Besides enabling ESP support itself, this option enables the generic + implementations of the algorithms that RFC 8221 lists as MUST be + implemented. If you need any other algorithms, you'll need to enable + them in the crypto API. You should also enable accelerated + implementations of any needed algorithms when available. If unsure, say Y. diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 02aa5cb3a4fd..ea6ed6d487ed 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1432,10 +1432,6 @@ static struct sk_buff *ipip_gso_segment(struct sk_buff *skb, return inet_gso_segment(skb, features); } -INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *, - struct sk_buff *)); -INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *, - struct sk_buff *)); struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct net_offload *ops; @@ -1608,8 +1604,6 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) return -EINVAL; } -INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *, int)); -INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int)); int inet_gro_complete(struct sk_buff *skb, int nhoff) { __be16 newlen = htons(skb->len - nhoff); diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index d14133eac476..5bda5aeda579 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -361,3 +361,4 @@ module_exit(esp4_offload_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>"); MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET, XFRM_PROTO_ESP); +MODULE_DESCRIPTION("IPV4 GSO/GRO offload support"); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e53871e4a097..1f75dc686b6b 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1109,7 +1109,7 @@ static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table, if (fl4.flowi4_scope < RT_SCOPE_LINK) fl4.flowi4_scope = RT_SCOPE_LINK; - if (table) + if (table && table != RT_TABLE_MAIN) tbl = fib_get_table(net, table); if (tbl) diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index dcc79ff54b41..abd083415f89 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -1304,3 +1304,4 @@ module_init(fou_init); module_exit(fou_fini); MODULE_AUTHOR("Tom Herbert <therbert@google.com>"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Foo over UDP"); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 090d3097ee15..d946356187ed 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -539,6 +539,12 @@ no_route: } EXPORT_SYMBOL(__ip_queue_xmit); +int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) +{ + return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos); +} +EXPORT_SYMBOL(ip_queue_xmit); + static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) { to->pkt_type = from->pkt_type; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index f4f1d11eab50..0c1f36404471 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -85,9 +85,10 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, __be32 remote, __be32 local, __be32 key) { - unsigned int hash; struct ip_tunnel *t, *cand = NULL; struct hlist_head *head; + struct net_device *ndev; + unsigned int hash; hash = ip_tunnel_hash(key, remote); head = &itn->tunnels[hash]; @@ -162,8 +163,9 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, if (t && t->dev->flags & IFF_UP) return t; - if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP) - return netdev_priv(itn->fb_tunnel_dev); + ndev = READ_ONCE(itn->fb_tunnel_dev); + if (ndev && ndev->flags & IFF_UP) + return netdev_priv(ndev); return NULL; } @@ -1259,9 +1261,9 @@ void ip_tunnel_uninit(struct net_device *dev) struct ip_tunnel_net *itn; itn = net_generic(net, tunnel->ip_tnl_net_id); - /* fb_tunnel_dev will be unregisted in net-exit call. */ - if (itn->fb_tunnel_dev != dev) - ip_tunnel_del(itn, netdev_priv(dev)); + ip_tunnel_del(itn, netdev_priv(dev)); + if (itn->fb_tunnel_dev == dev) + WRITE_ONCE(itn->fb_tunnel_dev, NULL); dst_cache_reset(&tunnel->dst_cache); } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index c2670eaa74e6..5bf9fa06aee0 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1797,11 +1797,22 @@ out_free: return ret; } +void ipt_unregister_table_pre_exit(struct net *net, struct xt_table *table, + const struct nf_hook_ops *ops) +{ + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); +} + +void ipt_unregister_table_exit(struct net *net, struct xt_table *table) +{ + __ipt_unregister_table(net, table); +} + void ipt_unregister_table(struct net *net, struct xt_table *table, const struct nf_hook_ops *ops) { if (ops) - nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + ipt_unregister_table_pre_exit(net, table, ops); __ipt_unregister_table(net, table); } @@ -1958,6 +1969,8 @@ static void __exit ip_tables_fini(void) EXPORT_SYMBOL(ipt_register_table); EXPORT_SYMBOL(ipt_unregister_table); +EXPORT_SYMBOL(ipt_unregister_table_pre_exit); +EXPORT_SYMBOL(ipt_unregister_table_exit); EXPORT_SYMBOL(ipt_do_table); module_init(ip_tables_init); module_exit(ip_tables_fini); diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 748dc3ce58d3..f2984c7eef40 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -118,3 +118,4 @@ module_exit(synproxy_tg4_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("Intercept TCP connections and establish them using syncookies"); diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 9d54b4017e50..8f7bc1ee7453 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -72,16 +72,24 @@ static int __net_init iptable_filter_net_init(struct net *net) return 0; } +static void __net_exit iptable_filter_net_pre_exit(struct net *net) +{ + if (net->ipv4.iptable_filter) + ipt_unregister_table_pre_exit(net, net->ipv4.iptable_filter, + filter_ops); +} + static void __net_exit iptable_filter_net_exit(struct net *net) { if (!net->ipv4.iptable_filter) return; - ipt_unregister_table(net, net->ipv4.iptable_filter, filter_ops); + ipt_unregister_table_exit(net, net->ipv4.iptable_filter); net->ipv4.iptable_filter = NULL; } static struct pernet_operations iptable_filter_net_ops = { .init = iptable_filter_net_init, + .pre_exit = iptable_filter_net_pre_exit, .exit = iptable_filter_net_exit, }; diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index bb9266ea3785..f703a717ab1d 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -100,15 +100,23 @@ static int __net_init iptable_mangle_table_init(struct net *net) return ret; } +static void __net_exit iptable_mangle_net_pre_exit(struct net *net) +{ + if (net->ipv4.iptable_mangle) + ipt_unregister_table_pre_exit(net, net->ipv4.iptable_mangle, + mangle_ops); +} + static void __net_exit iptable_mangle_net_exit(struct net *net) { if (!net->ipv4.iptable_mangle) return; - ipt_unregister_table(net, net->ipv4.iptable_mangle, mangle_ops); + ipt_unregister_table_exit(net, net->ipv4.iptable_mangle); net->ipv4.iptable_mangle = NULL; } static struct pernet_operations iptable_mangle_net_ops = { + .pre_exit = iptable_mangle_net_pre_exit, .exit = iptable_mangle_net_exit, }; diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index ad33687b7444..b0143b109f25 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -113,16 +113,22 @@ static int __net_init iptable_nat_table_init(struct net *net) return ret; } +static void __net_exit iptable_nat_net_pre_exit(struct net *net) +{ + if (net->ipv4.nat_table) + ipt_nat_unregister_lookups(net); +} + static void __net_exit iptable_nat_net_exit(struct net *net) { if (!net->ipv4.nat_table) return; - ipt_nat_unregister_lookups(net); - ipt_unregister_table(net, net->ipv4.nat_table, NULL); + ipt_unregister_table_exit(net, net->ipv4.nat_table); net->ipv4.nat_table = NULL; } static struct pernet_operations iptable_nat_net_ops = { + .pre_exit = iptable_nat_net_pre_exit, .exit = iptable_nat_net_exit, }; diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 69697eb4bfc6..9abfe6bf2cb9 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -67,15 +67,23 @@ static int __net_init iptable_raw_table_init(struct net *net) return ret; } +static void __net_exit iptable_raw_net_pre_exit(struct net *net) +{ + if (net->ipv4.iptable_raw) + ipt_unregister_table_pre_exit(net, net->ipv4.iptable_raw, + rawtable_ops); +} + static void __net_exit iptable_raw_net_exit(struct net *net) { if (!net->ipv4.iptable_raw) return; - ipt_unregister_table(net, net->ipv4.iptable_raw, rawtable_ops); + ipt_unregister_table_exit(net, net->ipv4.iptable_raw); net->ipv4.iptable_raw = NULL; } static struct pernet_operations iptable_raw_net_ops = { + .pre_exit = iptable_raw_net_pre_exit, .exit = iptable_raw_net_exit, }; diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index ac633c1db97e..415c1975d770 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -62,16 +62,23 @@ static int __net_init iptable_security_table_init(struct net *net) return ret; } +static void __net_exit iptable_security_net_pre_exit(struct net *net) +{ + if (net->ipv4.iptable_security) + ipt_unregister_table_pre_exit(net, net->ipv4.iptable_security, + sectbl_ops); +} + static void __net_exit iptable_security_net_exit(struct net *net) { if (!net->ipv4.iptable_security) return; - - ipt_unregister_table(net, net->ipv4.iptable_security, sectbl_ops); + ipt_unregister_table_exit(net, net->ipv4.iptable_security); net->ipv4.iptable_security = NULL; } static struct pernet_operations iptable_security_net_ops = { + .pre_exit = iptable_security_net_pre_exit, .exit = iptable_security_net_exit, }; diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c index e32e41b99f0f..aba65fe90345 100644 --- a/net/ipv4/netfilter/nf_flow_table_ipv4.c +++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c @@ -34,3 +34,4 @@ module_exit(nf_flow_ipv4_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_ALIAS_NF_FLOWTABLE(AF_INET); +MODULE_DESCRIPTION("Netfilter flow table support"); diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c index abf89b972094..bcdb37f86a94 100644 --- a/net/ipv4/netfilter/nft_dup_ipv4.c +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -107,3 +107,4 @@ module_exit(nft_dup_ipv4_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "dup"); +MODULE_DESCRIPTION("IPv4 nftables packet duplication support"); diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index ce294113dbcd..03df986217b7 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -210,3 +210,4 @@ module_exit(nft_fib4_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); MODULE_ALIAS_NFT_AF_EXPR(2, "fib"); +MODULE_DESCRIPTION("nftables fib / ip route lookup support"); diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index 7e6fd5cde50f..e408f813f5d8 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -71,3 +71,4 @@ module_exit(nft_reject_ipv4_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "reject"); +MODULE_DESCRIPTION("IPv4 packet rejection for nftables"); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 8f8eefd3a3ce..c7bf5b26bf0c 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -432,10 +432,9 @@ static void hystart_update(struct sock *sk, u32 delay) if (hystart_detect & HYSTART_DELAY) { /* obtain the minimum delay of more than sampling packets */ + if (ca->curr_rtt > delay) + ca->curr_rtt = delay; if (ca->sample_cnt < HYSTART_MIN_SAMPLES) { - if (ca->curr_rtt > delay) - ca->curr_rtt = delay; - ca->sample_cnt++; } else { if (ca->curr_rtt > ca->delay_min + diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 12fda8f27b08..12c26c9565b7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -261,7 +261,8 @@ static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb) * cwnd may be very low (even just 1 packet), so we should ACK * immediately. */ - inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; } } @@ -961,6 +962,15 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) } } +/* Updates the delivered and delivered_ce counts */ +static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, + bool ece_ack) +{ + tp->delivered += delivered; + if (ece_ack) + tp->delivered_ce += delivered; +} + /* This procedure tags the retransmission queue when SACKs arrive. * * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). @@ -1137,6 +1147,7 @@ struct tcp_sacktag_state { struct rate_sample *rate; int flag; unsigned int mss_now; + u32 sack_delivered; }; /* Check if skb is fully within the SACK block. In presence of GSO skbs, @@ -1257,7 +1268,8 @@ static u8 tcp_sacktag_one(struct sock *sk, sacked |= TCPCB_SACKED_ACKED; state->flag |= FLAG_DATA_SACKED; tp->sacked_out += pcount; - tp->delivered += pcount; /* Out-of-order packets delivered */ + /* Out-of-order packets delivered */ + state->sack_delivered += pcount; /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ if (tp->lost_skb_hint && @@ -1683,7 +1695,8 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, num_sacks, prior_snd_una); if (found_dup_sack) { state->flag |= FLAG_DSACKING_ACK; - tp->delivered++; /* A spurious retransmission is delivered */ + /* A spurious retransmission is delivered */ + state->sack_delivered++; } /* Eliminate too old ACKs, but take into @@ -1892,7 +1905,7 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend) /* Emulate SACKs for SACKless connection: account for a new dupack. */ -static void tcp_add_reno_sack(struct sock *sk, int num_dupack) +static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack) { if (num_dupack) { struct tcp_sock *tp = tcp_sk(sk); @@ -1903,20 +1916,21 @@ static void tcp_add_reno_sack(struct sock *sk, int num_dupack) tcp_check_reno_reordering(sk, 0); delivered = tp->sacked_out - prior_sacked; if (delivered > 0) - tp->delivered += delivered; + tcp_count_delivered(tp, delivered, ece_ack); tcp_verify_left_out(tp); } } /* Account for ACK, ACKing some data in Reno Recovery phase. */ -static void tcp_remove_reno_sacks(struct sock *sk, int acked) +static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack) { struct tcp_sock *tp = tcp_sk(sk); if (acked > 0) { /* One ACK acked hole. The rest eat duplicate ACKs. */ - tp->delivered += max_t(int, acked - tp->sacked_out, 1); + tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1), + ece_ack); if (acked - 1 >= tp->sacked_out) tp->sacked_out = 0; else @@ -2696,7 +2710,7 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, * delivered. Lower inflight to clock out (re)tranmissions. */ if (after(tp->snd_nxt, tp->high_seq) && num_dupack) - tcp_add_reno_sack(sk, num_dupack); + tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE); else if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); } @@ -2778,6 +2792,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int fast_rexmit = 0, flag = *ack_flag; + bool ece_ack = flag & FLAG_ECE; bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) && tcp_force_fast_retransmit(sk)); @@ -2786,7 +2801,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, /* Now state machine starts. * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ - if (flag & FLAG_ECE) + if (ece_ack) tp->prior_ssthresh = 0; /* B. In all the states check for reneging SACKs. */ @@ -2827,7 +2842,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, case TCP_CA_Recovery: if (!(flag & FLAG_SND_UNA_ADVANCED)) { if (tcp_is_reno(tp)) - tcp_add_reno_sack(sk, num_dupack); + tcp_add_reno_sack(sk, num_dupack, ece_ack); } else { if (tcp_try_undo_partial(sk, prior_snd_una)) return; @@ -2852,7 +2867,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); - tcp_add_reno_sack(sk, num_dupack); + tcp_add_reno_sack(sk, num_dupack, ece_ack); } if (icsk->icsk_ca_state <= TCP_CA_Disorder) @@ -2876,7 +2891,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, } /* Otherwise enter Recovery state */ - tcp_enter_recovery(sk, (flag & FLAG_ECE)); + tcp_enter_recovery(sk, ece_ack); fast_rexmit = 1; } @@ -3052,7 +3067,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, */ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, u32 prior_snd_una, - struct tcp_sacktag_state *sack) + struct tcp_sacktag_state *sack, bool ece_ack) { const struct inet_connection_sock *icsk = inet_csk(sk); u64 first_ackt, last_ackt; @@ -3077,8 +3092,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, u8 sacked = scb->sacked; u32 acked_pcount; - tcp_ack_tstamp(sk, skb, prior_snd_una); - /* Determine how many packets and what bytes were acked, tso and else */ if (after(scb->end_seq, tp->snd_una)) { if (tcp_skb_pcount(skb) == 1 || @@ -3113,7 +3126,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (sacked & TCPCB_SACKED_ACKED) { tp->sacked_out -= acked_pcount; } else if (tcp_is_sack(tp)) { - tp->delivered += acked_pcount; + tcp_count_delivered(tp, acked_pcount, ece_ack); if (!tcp_skb_spurious_retrans(tp, skb)) tcp_rack_advance(tp, sacked, scb->end_seq, tcp_skb_timestamp_us(skb)); @@ -3142,6 +3155,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (!fully_acked) break; + tcp_ack_tstamp(sk, skb, prior_snd_una); + next = skb_rb_next(skb); if (unlikely(skb == tp->retransmit_skb_hint)) tp->retransmit_skb_hint = NULL; @@ -3157,8 +3172,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) tp->snd_up = tp->snd_una; - if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) - flag |= FLAG_SACK_RENEGING; + if (skb) { + tcp_ack_tstamp(sk, skb, prior_snd_una); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) + flag |= FLAG_SACK_RENEGING; + } if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) { seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); @@ -3190,7 +3208,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, } if (tcp_is_reno(tp)) { - tcp_remove_reno_sacks(sk, pkts_acked); + tcp_remove_reno_sacks(sk, pkts_acked, ece_ack); /* If any of the cumulatively ACKed segments was * retransmitted, non-SACK case cannot confirm that @@ -3557,10 +3575,9 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) delivered = tp->delivered - prior_delivered; NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); - if (flag & FLAG_ECE) { - tp->delivered_ce += delivered; + if (flag & FLAG_ECE) NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); - } + return delivered; } @@ -3584,6 +3601,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) sack_state.first_sackt = 0; sack_state.rate = &rs; + sack_state.sack_delivered = 0; /* We very likely will need to access rtx queue. */ prefetch(sk->tcp_rtx_queue.rb_node); @@ -3659,12 +3677,25 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) ack_ev_flags |= CA_ACK_ECE; } + if (sack_state.sack_delivered) + tcp_count_delivered(tp, sack_state.sack_delivered, + flag & FLAG_ECE); + if (flag & FLAG_WIN_UPDATE) ack_ev_flags |= CA_ACK_WIN_UPDATE; tcp_in_ack_event(sk, ack_ev_flags); } + /* This is a deviation from RFC3168 since it states that: + * "When the TCP data sender is ready to set the CWR bit after reducing + * the congestion window, it SHOULD set the CWR bit only on the first + * new data packet that it transmits." + * We accept CWR on pure ACKs to be more robust + * with widely-deployed TCP implementations that do this. + */ + tcp_ecn_accept_cwr(sk, skb); + /* We passed data and got it acked, remove any soft error * log. Something worked... */ @@ -3675,7 +3706,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state); + flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state, + flag & FLAG_ECE); tcp_rack_update_reo_wnd(sk, &rs); @@ -4800,8 +4832,6 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) skb_dst_drop(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); - tcp_ecn_accept_cwr(sk, skb); - tp->rx_opt.dsack = 0; /* Queue data for delivery to the user. diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a50e1990a845..04b70fe31fa2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1064,6 +1064,10 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); } +INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); +INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); +INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)); + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. @@ -1207,7 +1211,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, } #endif - icsk->icsk_af_ops->send_check(sk, skb); + INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check, + tcp_v6_send_check, tcp_v4_send_check, + sk, skb); if (likely(tcb->tcp_flags & TCPHDR_ACK)) tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt); @@ -1235,7 +1241,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tcp_add_tx_delay(skb, tp); - err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); + err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit, + inet6_csk_xmit, ip_queue_xmit, + sk, skb, &inet->cork.fl); if (unlikely(err > 0)) { tcp_enter_cwr(sk); |