diff options
Diffstat (limited to 'net/ipv4')
43 files changed, 1413 insertions, 456 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index bd2901604842..6fb3c90ad726 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -331,8 +331,8 @@ config NET_FOU_IP_TUNNELS When this option is enabled IP tunnels can be configured to use FOU or GUE encapsulation. -config GENEVE - tristate "Generic Network Virtualization Encapsulation (Geneve)" +config GENEVE_CORE + tristate "Generic Network Virtualization Encapsulation library" depends on INET select NET_UDP_TUNNEL ---help--- @@ -615,6 +615,22 @@ config TCP_CONG_DCTCP For further details see: http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf +config TCP_CONG_CDG + tristate "CAIA Delay-Gradient (CDG)" + default n + ---help--- + CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies + the TCP sender in order to: + + o Use the delay gradient as a congestion signal. + o Back off with an average probability that is independent of the RTT. + o Coexist with flows that use loss-based congestion control. + o Tolerate packet loss unrelated to congestion. + + For further details see: + D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using + delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg + choice prompt "Default TCP congestion control" default DEFAULT_CUBIC @@ -646,6 +662,9 @@ choice config DEFAULT_DCTCP bool "DCTCP" if TCP_CONG_DCTCP=y + config DEFAULT_CDG + bool "CDG" if TCP_CONG_CDG=y + config DEFAULT_RENO bool "Reno" endchoice @@ -668,6 +687,7 @@ config DEFAULT_TCP_CONG default "veno" if DEFAULT_VENO default "reno" if DEFAULT_RENO default "dctcp" if DEFAULT_DCTCP + default "cdg" if DEFAULT_CDG default "cubic" config TCP_MD5SIG diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 518c04ed666e..efc43f300b8c 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -42,6 +42,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o +obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o @@ -56,7 +57,7 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o -obj-$(CONFIG_GENEVE) += geneve.o +obj-$(CONFIG_GENEVE_CORE) += geneve_core.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o xfrm4_protocol.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8b47a4d79d04..9532ee87151f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -228,6 +228,8 @@ int inet_listen(struct socket *sock, int backlog) err = 0; if (err) goto out; + + tcp_fastopen_init_key_once(true); } err = inet_csk_listen_start(sk, backlog); if (err) @@ -317,7 +319,7 @@ lookup_protocol: WARN_ON(!answer_prot->slab); err = -ENOBUFS; - sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot); + sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); if (!sk) goto out; @@ -488,7 +490,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) inet->inet_saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. */ - if (sk->sk_prot->get_port(sk, snum)) { + if ((snum || !inet->bind_address_no_port) && + sk->sk_prot->get_port(sk, snum)) { inet->inet_saddr = inet->inet_rcv_saddr = 0; err = -EADDRINUSE; goto out_release_sock; @@ -1430,7 +1433,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, struct net *net) { struct socket *sock; - int rc = sock_create_kern(family, type, protocol, &sock); + int rc = sock_create_kern(net, family, type, protocol, &sock); if (rc == 0) { *sk = sock->sk; @@ -1440,8 +1443,6 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, * we do not wish this socket to see incoming packets. */ (*sk)->sk_prot->unhash(*sk); - - sk_change_net(*sk, net); } return rc; } @@ -1597,7 +1598,7 @@ static __net_init int inet_init_net(struct net *net) */ seqlock_init(&net->ipv4.ip_local_ports.lock); net->ipv4.ip_local_ports.range[0] = 32768; - net->ipv4.ip_local_ports.range[1] = 61000; + net->ipv4.ip_local_ports.range[1] = 60999; seqlock_init(&net->ipv4.ping_group_range.lock); /* diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 419d23c53ec7..7498716e8f54 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2169,6 +2169,8 @@ static struct devinet_sysctl_table { "igmpv2_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL, "igmpv3_unsolicited_report_interval"), + DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN, + "ignore_routes_with_linkdown"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 872494e6e6eb..6bbc54940eb4 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -280,7 +280,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_scope = scope; fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; - if (!fib_lookup(net, &fl4, &res)) + if (!fib_lookup(net, &fl4, &res, 0)) return FIB_RES_PREFSRC(net, res); } else { scope = RT_SCOPE_LINK; @@ -319,7 +319,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; net = dev_net(dev); - if (fib_lookup(net, &fl4, &res)) + if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; if (res.type != RTN_UNICAST && (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) @@ -354,7 +354,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_oif = dev->ifindex; ret = 0; - if (fib_lookup(net, &fl4, &res) == 0) { + if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) { if (res.type == RTN_UNICAST) ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; } @@ -1063,9 +1063,9 @@ static void nl_fib_lookup_exit(struct net *net) net->ipv4.fibnl = NULL; } -static void fib_disable_ip(struct net_device *dev, int force) +static void fib_disable_ip(struct net_device *dev, unsigned long event) { - if (fib_sync_down_dev(dev, force)) + if (fib_sync_down_dev(dev, event)) fib_flush(dev_net(dev)); rt_cache_flush(dev_net(dev)); arp_ifdown(dev); @@ -1081,7 +1081,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, case NETDEV_UP: fib_add_ifaddr(ifa); #ifdef CONFIG_IP_ROUTE_MULTIPATH - fib_sync_up(dev); + fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(dev_net(dev)); @@ -1093,7 +1093,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, /* Last address was deleted from this interface. * Disable IP. */ - fib_disable_ip(dev, 1); + fib_disable_ip(dev, event); } else { rt_cache_flush(dev_net(dev)); } @@ -1107,9 +1107,10 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct in_device *in_dev; struct net *net = dev_net(dev); + unsigned int flags; if (event == NETDEV_UNREGISTER) { - fib_disable_ip(dev, 2); + fib_disable_ip(dev, event); rt_flush_dev(dev); return NOTIFY_DONE; } @@ -1124,16 +1125,22 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo fib_add_ifaddr(ifa); } endfor_ifa(in_dev); #ifdef CONFIG_IP_ROUTE_MULTIPATH - fib_sync_up(dev); + fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(net); break; case NETDEV_DOWN: - fib_disable_ip(dev, 0); + fib_disable_ip(dev, event); break; - case NETDEV_CHANGEMTU: case NETDEV_CHANGE: + flags = dev_get_flags(dev); + if (flags & (IFF_RUNNING | IFF_LOWER_UP)) + fib_sync_up(dev, RTNH_F_LINKDOWN); + else + fib_sync_down_dev(dev, event); + /* fall through */ + case NETDEV_CHANGEMTU: rt_cache_flush(net); break; } diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 56151982f74e..18123d50f576 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -47,11 +47,12 @@ struct fib4_rule { #endif }; -int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) +int __fib_lookup(struct net *net, struct flowi4 *flp, + struct fib_result *res, unsigned int flags) { struct fib_lookup_arg arg = { .result = res, - .flags = FIB_LOOKUP_NOREF, + .flags = flags, }; int err; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 8d695b6659c7..3bfccd83551c 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -266,7 +266,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif - ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD)) + ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK)) return -1; onh++; } endfor_nexthops(fi); @@ -318,7 +318,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) nfi->fib_type == fi->fib_type && memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(u32) * RTAX_MAX) == 0 && - ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && + !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) && (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) return fi; } @@ -604,6 +604,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, return -ENODEV; if (!(dev->flags & IFF_UP)) return -ENETDOWN; + if (!netif_carrier_ok(dev)) + nh->nh_flags |= RTNH_F_LINKDOWN; nh->nh_dev = dev; dev_hold(dev); nh->nh_scope = RT_SCOPE_LINK; @@ -621,7 +623,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, /* It is not necessary, but requires a bit of thinking */ if (fl4.flowi4_scope < RT_SCOPE_LINK) fl4.flowi4_scope = RT_SCOPE_LINK; - err = fib_lookup(net, &fl4, &res); + err = fib_lookup(net, &fl4, &res, + FIB_LOOKUP_IGNORE_LINKSTATE); if (err) { rcu_read_unlock(); return err; @@ -636,6 +639,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, if (!dev) goto out; dev_hold(dev); + if (!netif_carrier_ok(dev)) + nh->nh_flags |= RTNH_F_LINKDOWN; err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN; } else { struct in_device *in_dev; @@ -654,6 +659,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, nh->nh_dev = in_dev->dev; dev_hold(nh->nh_dev); nh->nh_scope = RT_SCOPE_HOST; + if (!netif_carrier_ok(nh->nh_dev)) + nh->nh_flags |= RTNH_F_LINKDOWN; err = 0; } out: @@ -713,8 +720,6 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, struct hlist_head *dest; unsigned int new_hash; - hlist_del(&fi->fib_hash); - new_hash = fib_info_hashfn(fi); dest = &new_info_hash[new_hash]; hlist_add_head(&fi->fib_hash, dest); @@ -731,8 +736,6 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, struct hlist_head *ldest; unsigned int new_hash; - hlist_del(&fi->fib_lhash); - new_hash = fib_laddr_hashfn(fi->fib_prefsrc); ldest = &new_laddrhash[new_hash]; hlist_add_head(&fi->fib_lhash, ldest); @@ -924,11 +927,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (!nh->nh_dev) goto failure; } else { + int linkdown = 0; + change_nexthops(fi) { err = fib_check_nh(cfg, fi, nexthop_nh); if (err != 0) goto failure; + if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN) + linkdown++; } endfor_nexthops(fi) + if (linkdown == fi->fib_nhs) + fi->fib_flags |= RTNH_F_LINKDOWN; } if (fi->fib_prefsrc) { @@ -1027,12 +1036,20 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc)) goto nla_put_failure; if (fi->fib_nhs == 1) { + struct in_device *in_dev; + if (fi->fib_nh->nh_gw && nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw)) goto nla_put_failure; if (fi->fib_nh->nh_oif && nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif)) goto nla_put_failure; + if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) { + in_dev = __in_dev_get_rcu(fi->fib_nh->nh_dev); + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) + rtm->rtm_flags |= RTNH_F_DEAD; + } #ifdef CONFIG_IP_ROUTE_CLASSID if (fi->fib_nh[0].nh_tclassid && nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) @@ -1049,11 +1066,19 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; for_nexthops(fi) { + struct in_device *in_dev; + rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); if (!rtnh) goto nla_put_failure; rtnh->rtnh_flags = nh->nh_flags & 0xFF; + if (nh->nh_flags & RTNH_F_LINKDOWN) { + in_dev = __in_dev_get_rcu(nh->nh_dev); + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) + rtnh->rtnh_flags |= RTNH_F_DEAD; + } rtnh->rtnh_hops = nh->nh_weight - 1; rtnh->rtnh_ifindex = nh->nh_oif; @@ -1107,7 +1132,7 @@ int fib_sync_down_addr(struct net *net, __be32 local) return ret; } -int fib_sync_down_dev(struct net_device *dev, int force) +int fib_sync_down_dev(struct net_device *dev, unsigned long event) { int ret = 0; int scope = RT_SCOPE_NOWHERE; @@ -1116,7 +1141,8 @@ int fib_sync_down_dev(struct net_device *dev, int force) struct hlist_head *head = &fib_info_devhash[hash]; struct fib_nh *nh; - if (force) + if (event == NETDEV_UNREGISTER || + event == NETDEV_DOWN) scope = -1; hlist_for_each_entry(nh, head, nh_hash) { @@ -1133,7 +1159,15 @@ int fib_sync_down_dev(struct net_device *dev, int force) dead++; else if (nexthop_nh->nh_dev == dev && nexthop_nh->nh_scope != scope) { - nexthop_nh->nh_flags |= RTNH_F_DEAD; + switch (event) { + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + nexthop_nh->nh_flags |= RTNH_F_DEAD; + /* fall through */ + case NETDEV_CHANGE: + nexthop_nh->nh_flags |= RTNH_F_LINKDOWN; + break; + } #ifdef CONFIG_IP_ROUTE_MULTIPATH spin_lock_bh(&fib_multipath_lock); fi->fib_power -= nexthop_nh->nh_power; @@ -1143,14 +1177,23 @@ int fib_sync_down_dev(struct net_device *dev, int force) dead++; } #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (force > 1 && nexthop_nh->nh_dev == dev) { + if (event == NETDEV_UNREGISTER && + nexthop_nh->nh_dev == dev) { dead = fi->fib_nhs; break; } #endif } endfor_nexthops(fi) if (dead == fi->fib_nhs) { - fi->fib_flags |= RTNH_F_DEAD; + switch (event) { + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + fi->fib_flags |= RTNH_F_DEAD; + /* fall through */ + case NETDEV_CHANGE: + fi->fib_flags |= RTNH_F_LINKDOWN; + break; + } ret++; } } @@ -1214,13 +1257,11 @@ out: return; } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - /* * Dead device goes up. We wake up dead nexthops. * It takes sense only on multipath routes. */ -int fib_sync_up(struct net_device *dev) +int fib_sync_up(struct net_device *dev, unsigned int nh_flags) { struct fib_info *prev_fi; unsigned int hash; @@ -1247,7 +1288,7 @@ int fib_sync_up(struct net_device *dev) prev_fi = fi; alive = 0; change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { + if (!(nexthop_nh->nh_flags & nh_flags)) { alive++; continue; } @@ -1258,14 +1299,18 @@ int fib_sync_up(struct net_device *dev) !__in_dev_get_rtnl(dev)) continue; alive++; +#ifdef CONFIG_IP_ROUTE_MULTIPATH spin_lock_bh(&fib_multipath_lock); nexthop_nh->nh_power = 0; - nexthop_nh->nh_flags &= ~RTNH_F_DEAD; + nexthop_nh->nh_flags &= ~nh_flags; spin_unlock_bh(&fib_multipath_lock); +#else + nexthop_nh->nh_flags &= ~nh_flags; +#endif } endfor_nexthops(fi) if (alive > 0) { - fi->fib_flags &= ~RTNH_F_DEAD; + fi->fib_flags &= ~nh_flags; ret++; } } @@ -1273,6 +1318,8 @@ int fib_sync_up(struct net_device *dev) return ret; } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + /* * The algorithm is suboptimal, but it provides really * fair weighted route distribution. @@ -1280,16 +1327,22 @@ int fib_sync_up(struct net_device *dev) void fib_select_multipath(struct fib_result *res) { struct fib_info *fi = res->fi; + struct in_device *in_dev; int w; spin_lock_bh(&fib_multipath_lock); if (fi->fib_power <= 0) { int power = 0; change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { - power += nexthop_nh->nh_weight; - nexthop_nh->nh_power = nexthop_nh->nh_weight; - } + in_dev = __in_dev_get_rcu(nexthop_nh->nh_dev); + if (nexthop_nh->nh_flags & RTNH_F_DEAD) + continue; + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + nexthop_nh->nh_flags & RTNH_F_LINKDOWN) + continue; + power += nexthop_nh->nh_weight; + nexthop_nh->nh_power = nexthop_nh->nh_weight; } endfor_nexthops(fi); fi->fib_power = power; if (power <= 0) { diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 09b62e17dd8c..15d32612e3c6 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -72,6 +72,7 @@ #include <linux/list.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/vmalloc.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> @@ -324,13 +325,15 @@ static inline void empty_child_dec(struct key_vector *n) static struct key_vector *leaf_new(t_key key, struct fib_alias *fa) { - struct tnode *kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); - struct key_vector *l = kv->kv; + struct key_vector *l; + struct tnode *kv; + kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); if (!kv) return NULL; /* initialize key vector */ + l = kv->kv; l->key = key; l->pos = 0; l->bits = 0; @@ -345,24 +348,26 @@ static struct key_vector *leaf_new(t_key key, struct fib_alias *fa) static struct key_vector *tnode_new(t_key key, int pos, int bits) { - struct tnode *tnode = tnode_alloc(bits); unsigned int shift = pos + bits; - struct key_vector *tn = tnode->kv; + struct key_vector *tn; + struct tnode *tnode; /* verify bits and pos their msb bits clear and values are valid */ BUG_ON(!bits || (shift > KEYLENGTH)); - pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0), - sizeof(struct key_vector *) << bits); - + tnode = tnode_alloc(bits); if (!tnode) return NULL; + pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0), + sizeof(struct key_vector *) << bits); + if (bits == KEYLENGTH) tnode->full_children = 1; else tnode->empty_children = 1ul << bits; + tn = tnode->kv; tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0; tn->pos = pos; tn->bits = bits; @@ -1077,6 +1082,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) struct trie *t = (struct trie *)tb->tb_data; struct fib_alias *fa, *new_fa; struct key_vector *l, *tp; + unsigned int nlflags = 0; struct fib_info *fi; u8 plen = cfg->fc_dst_len; u8 slen = KEYLENGTH - plen; @@ -1166,13 +1172,13 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_slen = fa->fa_slen; new_fa->tb_id = tb->tb_id; - err = netdev_switch_fib_ipv4_add(key, plen, fi, - new_fa->fa_tos, - cfg->fc_type, - cfg->fc_nlflags, - tb->tb_id); + err = switchdev_fib_ipv4_add(key, plen, fi, + new_fa->fa_tos, + cfg->fc_type, + cfg->fc_nlflags, + tb->tb_id); if (err) { - netdev_switch_fib_ipv4_abort(fi); + switchdev_fib_ipv4_abort(fi); kmem_cache_free(fn_alias_kmem, new_fa); goto out; } @@ -1196,7 +1202,9 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) if (fa_match) goto out; - if (!(cfg->fc_nlflags & NLM_F_APPEND)) + if (cfg->fc_nlflags & NLM_F_APPEND) + nlflags = NLM_F_APPEND; + else fa = fa_first; } err = -ENOENT; @@ -1216,12 +1224,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->tb_id = tb->tb_id; /* (Optionally) offload fib entry to switch hardware. */ - err = netdev_switch_fib_ipv4_add(key, plen, fi, tos, - cfg->fc_type, - cfg->fc_nlflags, - tb->tb_id); + err = switchdev_fib_ipv4_add(key, plen, fi, tos, cfg->fc_type, + cfg->fc_nlflags, tb->tb_id); if (err) { - netdev_switch_fib_ipv4_abort(fi); + switchdev_fib_ipv4_abort(fi); goto out_free_new_fa; } @@ -1235,12 +1241,12 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) rt_cache_flush(cfg->fc_nlinfo.nl_net); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, - &cfg->fc_nlinfo, 0); + &cfg->fc_nlinfo, nlflags); succeeded: return 0; out_sw_fib_del: - netdev_switch_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id); + switchdev_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: @@ -1406,9 +1412,15 @@ found: continue; for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { const struct fib_nh *nh = &fi->fib_nh[nhsel]; + struct in_device *in_dev = __in_dev_get_rcu(nh->nh_dev); if (nh->nh_flags & RTNH_F_DEAD) continue; + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + nh->nh_flags & RTNH_F_LINKDOWN && + !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) + continue; if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) continue; @@ -1518,8 +1530,8 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) if (!fa_to_delete) return -ESRCH; - netdev_switch_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos, - cfg->fc_type, tb->tb_id); + switchdev_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos, + cfg->fc_type, tb->tb_id); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1768,10 +1780,9 @@ void fib_table_flush_external(struct fib_table *tb) if (!fi || !(fi->fib_flags & RTNH_F_OFFLOAD)) continue; - netdev_switch_fib_ipv4_del(n->key, - KEYLENGTH - fa->fa_slen, - fi, fa->fa_tos, - fa->fa_type, tb->tb_id); + switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen, + fi, fa->fa_tos, fa->fa_type, + tb->tb_id); } /* update leaf slen */ @@ -1836,10 +1847,9 @@ int fib_table_flush(struct fib_table *tb) continue; } - netdev_switch_fib_ipv4_del(n->key, - KEYLENGTH - fa->fa_slen, - fi, fa->fa_tos, - fa->fa_type, tb->tb_id); + switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen, + fi, fa->fa_tos, fa->fa_type, + tb->tb_id); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); @@ -2057,11 +2067,12 @@ static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter) static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter, struct trie *t) { - struct key_vector *n, *pn = t->kv; + struct key_vector *n, *pn; if (!t) return NULL; + pn = t->kv; n = rcu_dereference(pn->tnode[0]); if (!n) return NULL; diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve_core.c index 8986e63f3bda..311a4ba6950a 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve_core.c @@ -60,11 +60,6 @@ struct geneve_net { static int geneve_net_id; -static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) -{ - return (struct genevehdr *)(udp_hdr(skb) + 1); -} - static struct geneve_sock *geneve_find_sock(struct net *net, sa_family_t family, __be16 port) { @@ -435,7 +430,7 @@ static int __init geneve_init_module(void) if (rc) return rc; - pr_info("Geneve driver\n"); + pr_info("Geneve core logic\n"); return 0; } @@ -449,5 +444,4 @@ module_exit(geneve_cleanup_module); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jesse Gross <jesse@nicira.com>"); -MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic"); -MODULE_ALIAS_RTNL_LINK("geneve"); +MODULE_DESCRIPTION("Driver library for GENEVE encapsulated traffic"); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index a3a697f5ffba..651cdf648ec4 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1339,6 +1339,168 @@ out: } EXPORT_SYMBOL(ip_mc_inc_group); +static int ip_mc_check_iphdr(struct sk_buff *skb) +{ + const struct iphdr *iph; + unsigned int len; + unsigned int offset = skb_network_offset(skb) + sizeof(*iph); + + if (!pskb_may_pull(skb, offset)) + return -EINVAL; + + iph = ip_hdr(skb); + + if (iph->version != 4 || ip_hdrlen(skb) < sizeof(*iph)) + return -EINVAL; + + offset += ip_hdrlen(skb) - sizeof(*iph); + + if (!pskb_may_pull(skb, offset)) + return -EINVAL; + + iph = ip_hdr(skb); + + if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) + return -EINVAL; + + len = skb_network_offset(skb) + ntohs(iph->tot_len); + if (skb->len < len || len < offset) + return -EINVAL; + + skb_set_transport_header(skb, offset); + + return 0; +} + +static int ip_mc_check_igmp_reportv3(struct sk_buff *skb) +{ + unsigned int len = skb_transport_offset(skb); + + len += sizeof(struct igmpv3_report); + + return pskb_may_pull(skb, len) ? 0 : -EINVAL; +} + +static int ip_mc_check_igmp_query(struct sk_buff *skb) +{ + unsigned int len = skb_transport_offset(skb); + + len += sizeof(struct igmphdr); + if (skb->len < len) + return -EINVAL; + + /* IGMPv{1,2}? */ + if (skb->len != len) { + /* or IGMPv3? */ + len += sizeof(struct igmpv3_query) - sizeof(struct igmphdr); + if (skb->len < len || !pskb_may_pull(skb, len)) + return -EINVAL; + } + + /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer + * all-systems destination addresses (224.0.0.1) for general queries + */ + if (!igmp_hdr(skb)->group && + ip_hdr(skb)->daddr != htonl(INADDR_ALLHOSTS_GROUP)) + return -EINVAL; + + return 0; +} + +static int ip_mc_check_igmp_msg(struct sk_buff *skb) +{ + switch (igmp_hdr(skb)->type) { + case IGMP_HOST_LEAVE_MESSAGE: + case IGMP_HOST_MEMBERSHIP_REPORT: + case IGMPV2_HOST_MEMBERSHIP_REPORT: + /* fall through */ + return 0; + case IGMPV3_HOST_MEMBERSHIP_REPORT: + return ip_mc_check_igmp_reportv3(skb); + case IGMP_HOST_MEMBERSHIP_QUERY: + return ip_mc_check_igmp_query(skb); + default: + return -ENOMSG; + } +} + +static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb) +{ + return skb_checksum_simple_validate(skb); +} + +static int __ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) + +{ + struct sk_buff *skb_chk; + unsigned int transport_len; + unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr); + int ret; + + transport_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb); + + skb_get(skb); + skb_chk = skb_checksum_trimmed(skb, transport_len, + ip_mc_validate_checksum); + if (!skb_chk) + return -EINVAL; + + if (!pskb_may_pull(skb_chk, len)) { + kfree_skb(skb_chk); + return -EINVAL; + } + + ret = ip_mc_check_igmp_msg(skb_chk); + if (ret) { + kfree_skb(skb_chk); + return ret; + } + + if (skb_trimmed) + *skb_trimmed = skb_chk; + else + kfree_skb(skb_chk); + + return 0; +} + +/** + * ip_mc_check_igmp - checks whether this is a sane IGMP packet + * @skb: the skb to validate + * @skb_trimmed: to store an skb pointer trimmed to IPv4 packet tail (optional) + * + * Checks whether an IPv4 packet is a valid IGMP packet. If so sets + * skb network and transport headers accordingly and returns zero. + * + * -EINVAL: A broken packet was detected, i.e. it violates some internet + * standard + * -ENOMSG: IP header validation succeeded but it is not an IGMP packet. + * -ENOMEM: A memory allocation failure happened. + * + * Optionally, an skb pointer might be provided via skb_trimmed (or set it + * to NULL): After parsing an IGMP packet successfully it will point to + * an skb which has its tail aligned to the IP packet end. This might + * either be the originally provided skb or a trimmed, cloned version if + * the skb frame had data beyond the IP packet. A cloned skb allows us + * to leave the original skb and its full frame unchanged (which might be + * desirable for layer 2 frame jugglers). + * + * The caller needs to release a reference count from any returned skb_trimmed. + */ +int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) +{ + int ret = ip_mc_check_iphdr(skb); + + if (ret < 0) + return ret; + + if (ip_hdr(skb)->protocol != IPPROTO_IGMP) + return -ENOMSG; + + return __ip_mc_check_igmp(skb, skb_trimmed); +} +EXPORT_SYMBOL(ip_mc_check_igmp); + /* * Resend IGMP JOIN report; used by netdev notifier. */ diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 8976ca423a07..60021d0d9326 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -99,6 +99,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) struct net *net = sock_net(sk); int smallest_size = -1, smallest_rover; kuid_t uid = sock_i_uid(sk); + int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; local_bh_disable(); if (!snum) { @@ -106,6 +107,14 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) again: inet_get_local_port_range(net, &low, &high); + if (attempt_half) { + int half = low + ((high - low) >> 1); + + if (attempt_half == 1) + high = half; + else + low = half; + } remaining = (high - low) + 1; smallest_rover = rover = prandom_u32() % remaining + low; @@ -127,11 +136,6 @@ again: (tb->num_owners < smallest_size || smallest_size == -1)) { smallest_size = tb->num_owners; smallest_rover = rover; - if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 && - !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { - snum = smallest_rover; - goto tb_found; - } } if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { snum = rover; @@ -159,6 +163,11 @@ again: snum = smallest_rover; goto have_snum; } + if (attempt_half == 1) { + /* OK we now try the upper half of the range */ + attempt_half = 2; + goto again; + } goto fail; } /* OK, here is the one we will use. HEAD is diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 4d32262c7502..9bc26677058e 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -151,6 +151,10 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, if (nla_put_u8(skb, INET_DIAG_TCLASS, inet6_sk(sk)->tclass) < 0) goto errout; + + if (ipv6_only_sock(sk) && + nla_put_u8(skb, INET_DIAG_SKV6ONLY, 1)) + goto errout; } #endif @@ -200,9 +204,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, } #undef EXPIRES_IN_MS - if (ext & (1 << (INET_DIAG_INFO - 1))) { + if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) { attr = nla_reserve(skb, INET_DIAG_INFO, - sizeof(struct tcp_info)); + handler->idiag_info_size); if (!attr) goto errout; @@ -746,7 +750,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, entry.family = sk->sk_family; - spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); lopt = icsk->icsk_accept_queue.listen_opt; if (!lopt || !listen_sock_qlen(lopt)) @@ -794,7 +798,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, } out: - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); return err; } @@ -1078,14 +1082,62 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) return inet_diag_get_exact(skb, h, nlmsg_data(h)); } +static +int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk) +{ + const struct inet_diag_handler *handler; + struct nlmsghdr *nlh; + struct nlattr *attr; + struct inet_diag_msg *r; + void *info = NULL; + int err = 0; + + nlh = nlmsg_put(skb, 0, 0, SOCK_DIAG_BY_FAMILY, sizeof(*r), 0); + if (!nlh) + return -ENOMEM; + + r = nlmsg_data(nlh); + memset(r, 0, sizeof(*r)); + inet_diag_msg_common_fill(r, sk); + if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_STREAM) + r->id.idiag_sport = inet_sk(sk)->inet_sport; + r->idiag_state = sk->sk_state; + + if ((err = nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))) { + nlmsg_cancel(skb, nlh); + return err; + } + + handler = inet_diag_lock_handler(sk->sk_protocol); + if (IS_ERR(handler)) { + inet_diag_unlock_handler(handler); + nlmsg_cancel(skb, nlh); + return PTR_ERR(handler); + } + + attr = handler->idiag_info_size + ? nla_reserve(skb, INET_DIAG_INFO, handler->idiag_info_size) + : NULL; + if (attr) + info = nla_data(attr); + + handler->idiag_get_info(sk, r, info); + inet_diag_unlock_handler(handler); + + nlmsg_end(skb, nlh); + return 0; +} + static const struct sock_diag_handler inet_diag_handler = { .family = AF_INET, .dump = inet_diag_handler_dump, + .get_info = inet_diag_handler_get_info, }; static const struct sock_diag_handler inet6_diag_handler = { .family = AF_INET6, .dump = inet_diag_handler_dump, + .get_info = inet_diag_handler_get_info, }; int inet_diag_register(const struct inet_diag_handler *h) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index c6fb80bd5826..5f9b063bbe8a 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -18,6 +18,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/wait.h> +#include <linux/vmalloc.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> @@ -90,10 +91,6 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, const unsigned short snum) { - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - - atomic_inc(&hashinfo->bsockets); - inet_sk(sk)->inet_num = snum; sk_add_bind_node(sk, &tb->owners); tb->num_owners++; @@ -111,8 +108,6 @@ static void __inet_put_port(struct sock *sk) struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; struct inet_bind_bucket *tb; - atomic_dec(&hashinfo->bsockets); - spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; __sk_del_bind_node(sk); @@ -399,9 +394,10 @@ not_unique: return -EADDRNOTAVAIL; } -static inline u32 inet_sk_port_offset(const struct sock *sk) +static u32 inet_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); + return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, inet->inet_daddr, inet->inet_dport); @@ -507,8 +503,14 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; + /* By starting with offset being an even number, + * we tend to leave about 50% of ports for other uses, + * like bind(0). + */ + offset &= ~1; + local_bh_disable(); - for (i = 1; i <= remaining; i++) { + for (i = 0; i < remaining; i++) { port = low + (i + offset) % remaining; if (inet_is_local_reserved_port(net, port)) continue; @@ -552,7 +554,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, return -EADDRNOTAVAIL; ok: - hint += i; + hint += (i + 2) & ~1; /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, port); @@ -599,7 +601,11 @@ out: int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { - return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk), + u32 port_offset = 0; + + if (!inet_sk(sk)->inet_num) + port_offset = inet_sk_port_offset(sk); + return __inet_hash_connect(death_row, sk, port_offset, __inet_check_established); } EXPORT_SYMBOL_GPL(inet_hash_connect); @@ -608,7 +614,6 @@ void inet_hashinfo_init(struct inet_hashinfo *h) { int i; - atomic_set(&h->bsockets, 0); for (i = 0; i < INET_LHTABLE_SIZE; i++) { spin_lock_init(&h->listening_hash[i].lock); INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head, @@ -616,3 +621,33 @@ void inet_hashinfo_init(struct inet_hashinfo *h) } } EXPORT_SYMBOL_GPL(inet_hashinfo_init); + +int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) +{ + unsigned int i, nblocks = 1; + + if (sizeof(spinlock_t) != 0) { + /* allocate 2 cache lines or at least one spinlock per cpu */ + nblocks = max_t(unsigned int, + 2 * L1_CACHE_BYTES / sizeof(spinlock_t), + 1); + nblocks = roundup_pow_of_two(nblocks * num_possible_cpus()); + + /* no more locks than number of hash buckets */ + nblocks = min(nblocks, hashinfo->ehash_mask + 1); + + hashinfo->ehash_locks = kmalloc_array(nblocks, sizeof(spinlock_t), + GFP_KERNEL | __GFP_NOWARN); + if (!hashinfo->ehash_locks) + hashinfo->ehash_locks = vmalloc(nblocks * sizeof(spinlock_t)); + + if (!hashinfo->ehash_locks) + return -ENOMEM; + + for (i = 0; i < nblocks; i++) + spin_lock_init(&hashinfo->ehash_locks[i]); + } + hashinfo->ehash_locks_mask = nblocks - 1; + return 0; +} +EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 00ec8d5d7e7e..2ffbd16b79e0 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -170,7 +170,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, } EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); -void tw_timer_handler(unsigned long data) +static void tw_timer_handler(unsigned long data) { struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 3674484946a5..2d3aa408fbdc 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -39,17 +39,21 @@ #include <net/route.h> #include <net/xfrm.h> -static bool ip_may_fragment(const struct sk_buff *skb) -{ - return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) || - skb->ignore_df; -} - static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) return false; + if (unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)) + return false; + + /* original fragment exceeds mtu and DF is set */ + if (unlikely(IPCB(skb)->frag_max_size > mtu)) + return true; + + if (skb->ignore_df) + return false; + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) return false; @@ -114,7 +118,7 @@ int ip_forward(struct sk_buff *skb) IPCB(skb)->flags |= IPSKB_FORWARDED; mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); - if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) { + if (ip_exceeds_mtu(skb, mtu)) { IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index cc1da6d9cb35..a50dc6d408d1 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -75,6 +75,7 @@ struct ipq { __be16 id; u8 protocol; u8 ecn; /* RFC3168 support */ + u16 max_df_size; /* largest frag with DF set seen */ int iif; unsigned int rid; struct inet_peer *peer; @@ -173,6 +174,15 @@ static void ipq_kill(struct ipq *ipq) inet_frag_kill(&ipq->q, &ip4_frags); } +static bool frag_expire_skip_icmp(u32 user) +{ + return user == IP_DEFRAG_AF_PACKET || + ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN, + __IP_DEFRAG_CONNTRACK_IN_END) || + ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN, + __IP_DEFRAG_CONNTRACK_BRIDGE_IN); +} + /* * Oops, a fragment queue timed out. Kill it and send an ICMP reply. */ @@ -217,10 +227,8 @@ static void ip_expire(unsigned long arg) /* Only an end host needs to send an ICMP * "Fragment Reassembly Timeout" message, per RFC792. */ - if (qp->user == IP_DEFRAG_AF_PACKET || - ((qp->user >= IP_DEFRAG_CONNTRACK_IN) && - (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) && - (skb_rtable(head)->rt_type != RTN_LOCAL))) + if (frag_expire_skip_icmp(qp->user) && + (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out_rcu_unlock; /* Send an ICMP "Fragment Reassembly Timeout" message. */ @@ -319,6 +327,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { struct sk_buff *prev, *next; struct net_device *dev; + unsigned int fragsize; int flags, offset; int ihl, end; int err = -ENOENT; @@ -474,9 +483,14 @@ found: if (offset == 0) qp->q.flags |= INET_FRAG_FIRST_IN; + fragsize = skb->len + ihl; + + if (fragsize > qp->q.max_size) + qp->q.max_size = fragsize; + if (ip_hdr(skb)->frag_off & htons(IP_DF) && - skb->len + ihl > qp->q.max_size) - qp->q.max_size = skb->len + ihl; + fragsize > qp->max_df_size) + qp->max_df_size = fragsize; if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && qp->q.meat == qp->q.len) { @@ -606,13 +620,27 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, head->next = NULL; head->dev = dev; head->tstamp = qp->q.stamp; - IPCB(head)->frag_max_size = qp->q.max_size; + IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); iph = ip_hdr(head); - /* max_size != 0 implies at least one fragment had IP_DF set */ - iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0; iph->tot_len = htons(len); iph->tos |= ecn; + + /* When we set IP_DF on a refragmented skb we must also force a + * call to ip_fragment to avoid forwarding a DF-skb of size s while + * original sender only sent fragments of size f (where f < s). + * + * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest + * frag seen to avoid sending tiny DF-fragments in case skb was built + * from one very small df-fragment and one large non-df frag. + */ + if (qp->max_df_size == qp->q.max_size) { + IPCB(head)->flags |= IPSKB_FRAG_PMTU; + iph->frag_off = htons(IP_DF); + } else { + iph->frag_off = 0; + } + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; qp->q.fragments_tail = NULL; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c65b93a7b711..6bf89a6312bc 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -83,6 +83,10 @@ int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; EXPORT_SYMBOL(sysctl_ip_default_ttl); +static int ip_fragment(struct sock *sk, struct sk_buff *skb, + unsigned int mtu, + int (*output)(struct sock *, struct sk_buff *)); + /* Generate a checksum for an outgoing IP datagram. */ void ip_send_check(struct iphdr *iph) { @@ -91,7 +95,7 @@ void ip_send_check(struct iphdr *iph) } EXPORT_SYMBOL(ip_send_check); -int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb) +static int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); @@ -168,7 +172,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, } EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); -static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb) +static int ip_finish_output2(struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = (struct rtable *)dst; @@ -216,7 +220,8 @@ static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb) return -EINVAL; } -static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb) +static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb, + unsigned int mtu) { netdev_features_t features; struct sk_buff *segs; @@ -224,7 +229,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb) /* common case: locally created skb or seglen is <= mtu */ if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || - skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb)) + skb_gso_network_seglen(skb) <= mtu) return ip_finish_output2(sk, skb); /* Slowpath - GSO segment length is exceeding the dst MTU. @@ -248,7 +253,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb) int err; segs->next = NULL; - err = ip_fragment(sk, segs, ip_finish_output2); + err = ip_fragment(sk, segs, mtu, ip_finish_output2); if (err && ret == 0) ret = err; @@ -260,6 +265,8 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb) static int ip_finish_output(struct sock *sk, struct sk_buff *skb) { + unsigned int mtu; + #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { @@ -267,11 +274,12 @@ static int ip_finish_output(struct sock *sk, struct sk_buff *skb) return dst_output_sk(sk, skb); } #endif + mtu = ip_skb_dst_mtu(skb); if (skb_is_gso(skb)) - return ip_finish_output_gso(sk, skb); + return ip_finish_output_gso(sk, skb, mtu); - if (skb->len > ip_skb_dst_mtu(skb)) - return ip_fragment(sk, skb, ip_finish_output2); + if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU)) + return ip_fragment(sk, skb, mtu, ip_finish_output2); return ip_finish_output2(sk, skb); } @@ -478,6 +486,31 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_secmark(to, from); } +static int ip_fragment(struct sock *sk, struct sk_buff *skb, + unsigned int mtu, + int (*output)(struct sock *, struct sk_buff *)) +{ + struct iphdr *iph = ip_hdr(skb); + + if ((iph->frag_off & htons(IP_DF)) == 0) + return ip_do_fragment(sk, skb, output); + + if (unlikely(!skb->ignore_df || + (IPCB(skb)->frag_max_size && + IPCB(skb)->frag_max_size > mtu))) { + struct rtable *rt = skb_rtable(skb); + struct net_device *dev = rt->dst.dev; + + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + kfree_skb(skb); + return -EMSGSIZE; + } + + return ip_do_fragment(sk, skb, output); +} + /* * This IP datagram is too large to be sent in one piece. Break it up into * smaller pieces (each of size equal to IP header plus @@ -485,8 +518,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) * single device frame, and queue such a frame for sending. */ -int ip_fragment(struct sock *sk, struct sk_buff *skb, - int (*output)(struct sock *, struct sk_buff *)) +int ip_do_fragment(struct sock *sk, struct sk_buff *skb, + int (*output)(struct sock *, struct sk_buff *)) { struct iphdr *iph; int ptr; @@ -507,15 +540,8 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb, iph = ip_hdr(skb); mtu = ip_skb_dst_mtu(skb); - if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || - (IPCB(skb)->frag_max_size && - IPCB(skb)->frag_max_size > mtu))) { - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(mtu)); - kfree_skb(skb); - return -EMSGSIZE; - } + if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu) + mtu = IPCB(skb)->frag_max_size; /* * Setup starting values. @@ -523,10 +549,6 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb, hlen = iph->ihl * 4; mtu = mtu - hlen; /* Size of data space */ -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (skb->nf_bridge) - mtu -= nf_bridge_mtu_reduction(skb); -#endif IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; /* When frag_list is given, use it. First, check its validity: @@ -711,6 +733,9 @@ slow_path: iph = ip_hdr(skb2); iph->frag_off = htons((offset >> 3)); + if (IPCB(skb)->flags & IPSKB_FRAG_PMTU) + iph->frag_off |= htons(IP_DF); + /* ANK: dirty, but effective trick. Upgrade options only if * the segment to be fragmented was THE FIRST (otherwise, * options are already fixed) and make it ONCE @@ -751,7 +776,7 @@ fail: IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); return err; } -EXPORT_SYMBOL(ip_fragment); +EXPORT_SYMBOL(ip_do_fragment); int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) @@ -1217,11 +1242,9 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, } while (size > 0) { - int i; - - if (skb_is_gso(skb)) + if (skb_is_gso(skb)) { len = size; - else { + } else { /* Check if the remaining data fits into current packet. */ len = mtu - skb->len; @@ -1273,15 +1296,10 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, continue; } - i = skb_shinfo(skb)->nr_frags; if (len > size) len = size; - if (skb_can_coalesce(skb, i, page, offset)) { - skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len); - } else if (i < MAX_SKB_FRAGS) { - get_page(page); - skb_fill_page_desc(skb, i, page, offset, len); - } else { + + if (skb_append_pagefrags(skb, page, offset, len)) { err = -EMSGSIZE; goto error; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 7cfb0893f263..c3c359ad66e3 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -432,6 +432,15 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf kfree_skb(skb); } +/* For some errors we have valid addr_offset even with zero payload and + * zero port. Also, addr_offset should be supported if port is set. + */ +static inline bool ipv4_datagram_support_addr(struct sock_exterr_skb *serr) +{ + return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || + serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port; +} + /* IPv4 supports cmsg on all imcp errors and some timestamps * * Timestamp code paths do not initialize the fields expected by cmsg: @@ -498,7 +507,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) serr = SKB_EXT_ERR(skb); - if (sin && serr->port) { + if (sin && ipv4_datagram_support_addr(serr)) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset); @@ -582,6 +591,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, case IP_TRANSPARENT: case IP_MINTTL: case IP_NODEFRAG: + case IP_BIND_ADDRESS_NO_PORT: case IP_UNICAST_IF: case IP_MULTICAST_TTL: case IP_MULTICAST_ALL: @@ -732,6 +742,9 @@ static int do_ip_setsockopt(struct sock *sk, int level, } inet->nodefrag = val ? 1 : 0; break; + case IP_BIND_ADDRESS_NO_PORT: + inet->bind_address_no_port = val ? 1 : 0; + break; case IP_MTU_DISCOVER: if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT) goto e_inval; @@ -1324,6 +1337,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_NODEFRAG: val = inet->nodefrag; break; + case IP_BIND_ADDRESS_NO_PORT: + val = inet->bind_address_no_port; + break; case IP_MTU_DISCOVER: val = inet->pmtudisc; break; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index ce63ab21b6cd..6a51a71a6c67 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -98,7 +98,7 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) return -ENOMEM; eh = (struct ethhdr *)skb->data; - if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN)) + if (likely(eth_proto_is_802_3(eh->h_proto))) skb->protocol = eh->h_proto; else skb->protocol = htons(ETH_P_802_2); @@ -165,6 +165,8 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, { int i; + netdev_stats_to_stats64(tot, &dev->stats); + for_each_possible_cpu(i) { const struct pcpu_sw_netstats *tstats = per_cpu_ptr(dev->tstats, i); @@ -185,22 +187,6 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, tot->tx_bytes += tx_bytes; } - tot->multicast = dev->stats.multicast; - - tot->rx_crc_errors = dev->stats.rx_crc_errors; - tot->rx_fifo_errors = dev->stats.rx_fifo_errors; - tot->rx_length_errors = dev->stats.rx_length_errors; - tot->rx_frame_errors = dev->stats.rx_frame_errors; - tot->rx_errors = dev->stats.rx_errors; - - tot->tx_fifo_errors = dev->stats.tx_fifo_errors; - tot->tx_carrier_errors = dev->stats.tx_carrier_errors; - tot->tx_dropped = dev->stats.tx_dropped; - tot->tx_aborted_errors = dev->stats.tx_aborted_errors; - tot->tx_errors = dev->stats.tx_errors; - - tot->collisions = dev->stats.collisions; - return tot; } EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index ff96396ebec5..254238daf58b 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -251,7 +251,8 @@ ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EINVAL; } - p.i_key = p.o_key = p.i_flags = p.o_flags = 0; + p.i_key = p.o_key = 0; + p.i_flags = p.o_flags = 0; if (p.iph.ttl) p.iph.frag_off |= htons(IP_DF); diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index fb20f363151f..2199a5db25e6 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -195,7 +195,8 @@ config IP_NF_MATCH_ECN config IP_NF_MATCH_RPFILTER tristate '"rpfilter" reverse path filter match support' - depends on NETFILTER_ADVANCED && (IP_NF_MANGLE || IP_NF_RAW) + depends on NETFILTER_ADVANCED + depends on IP_NF_MANGLE || IP_NF_RAW ---help--- This option allows you to match packets whose replies would go out via the interface the packet came in. diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index a61200754f4b..95c9b6eece25 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -256,7 +256,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, const struct arphdr *arp; struct arpt_entry *e, *back; const char *indev, *outdev; - void *table_base; + const void *table_base; const struct xt_table_info *private; struct xt_action_param acpar; unsigned int addend; @@ -275,7 +275,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, * pointer. */ smp_read_barrier_depends(); - table_base = private->entries[smp_processor_id()]; + table_base = private->entries; e = get_entry(table_base, private->hook_entry[hook]); back = get_entry(table_base, private->underflow[hook]); @@ -289,13 +289,15 @@ unsigned int arpt_do_table(struct sk_buff *skb, arp = arp_hdr(skb); do { const struct xt_entry_target *t; + struct xt_counters *counter; if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { e = arpt_next_entry(e); continue; } - ADD_COUNTER(e->counters, arp_hdr_len(skb->dev), 1); + counter = xt_get_this_cpu_counter(&e->counters); + ADD_COUNTER(*counter, arp_hdr_len(skb->dev), 1); t = arpt_get_target_c(e); @@ -521,6 +523,10 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) if (ret) return ret; + e->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(e->counters.pcnt)) + return -ENOMEM; + t = arpt_get_target(e); target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, t->u.user.revision); @@ -538,6 +544,8 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) err: module_put(t->u.kernel.target->me); out: + xt_percpu_counter_free(e->counters.pcnt); + return ret; } @@ -614,6 +622,7 @@ static inline void cleanup_entry(struct arpt_entry *e) if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); + xt_percpu_counter_free(e->counters.pcnt); } /* Checks and translates the user-supplied table segment (held in @@ -702,12 +711,6 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) { - if (newinfo->entries[i] && newinfo->entries[i] != entry0) - memcpy(newinfo->entries[i], entry0, newinfo->size); - } - return ret; } @@ -722,14 +725,16 @@ static void get_counters(const struct xt_table_info *t, seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; - xt_entry_foreach(iter, t->entries[cpu], t->size) { + xt_entry_foreach(iter, t->entries, t->size) { + struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); do { start = read_seqcount_begin(s); - bcnt = iter->counters.bcnt; - pcnt = iter->counters.pcnt; + bcnt = tmp->bcnt; + pcnt = tmp->pcnt; } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); @@ -774,7 +779,7 @@ static int copy_entries_to_user(unsigned int total_size, if (IS_ERR(counters)) return PTR_ERR(counters); - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; /* ... then copy entire thing ... */ if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; @@ -863,16 +868,16 @@ static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { struct arpt_entry *iter; - void *loc_cpu_entry; + const void *loc_cpu_entry; int ret; if (!newinfo || !info) return -EINVAL; - /* we dont care about newinfo->entries[] */ + /* we dont care about newinfo->entries */ memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; - loc_cpu_entry = info->entries[raw_smp_processor_id()]; + loc_cpu_entry = info->entries; xt_compat_init_offsets(NFPROTO_ARP, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); @@ -1037,7 +1042,7 @@ static int __do_replace(struct net *net, const char *name, get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + loc_cpu_old_entry = oldinfo->entries; xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) cleanup_entry(iter); @@ -1084,8 +1089,7 @@ static int do_replace(struct net *net, const void __user *user, if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; @@ -1115,7 +1119,7 @@ static int do_replace(struct net *net, const void __user *user, static int do_add_counters(struct net *net, const void __user *user, unsigned int len, int compat) { - unsigned int i, curcpu; + unsigned int i; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1125,7 +1129,6 @@ static int do_add_counters(struct net *net, const void __user *user, struct xt_table *t; const struct xt_table_info *private; int ret = 0; - void *loc_cpu_entry; struct arpt_entry *iter; unsigned int addend; #ifdef CONFIG_COMPAT @@ -1181,12 +1184,13 @@ static int do_add_counters(struct net *net, const void __user *user, } i = 0; - /* Choose the copy that is on our node */ - curcpu = smp_processor_id(); - loc_cpu_entry = private->entries[curcpu]; + addend = xt_write_recseq_begin(); - xt_entry_foreach(iter, loc_cpu_entry, private->size) { - ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + xt_entry_foreach(iter, private->entries, private->size) { + struct xt_counters *tmp; + + tmp = xt_get_this_cpu_counter(&iter->counters); + ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt); ++i; } xt_write_recseq_end(addend); @@ -1396,7 +1400,7 @@ static int translate_compat_table(const char *name, newinfo->hook_entry[i] = info->hook_entry[i]; newinfo->underflow[i] = info->underflow[i]; } - entry1 = newinfo->entries[raw_smp_processor_id()]; + entry1 = newinfo->entries; pos = entry1; size = total_size; xt_entry_foreach(iter0, entry0, total_size) { @@ -1416,9 +1420,17 @@ static int translate_compat_table(const char *name, i = 0; xt_entry_foreach(iter1, entry1, newinfo->size) { + iter1->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(iter1->counters.pcnt)) { + ret = -ENOMEM; + break; + } + ret = check_target(iter1, name); - if (ret != 0) + if (ret != 0) { + xt_percpu_counter_free(iter1->counters.pcnt); break; + } ++i; if (strcmp(arpt_get_target(iter1)->u.user.name, XT_ERROR_TARGET) == 0) @@ -1448,11 +1460,6 @@ static int translate_compat_table(const char *name, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) - if (newinfo->entries[i] && newinfo->entries[i] != entry1) - memcpy(newinfo->entries[i], entry1, newinfo->size); - *pinfo = newinfo; *pentry0 = entry1; xt_free_table_info(info); @@ -1511,8 +1518,7 @@ static int compat_do_replace(struct net *net, void __user *user, if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; @@ -1609,7 +1615,6 @@ static int compat_copy_entries_to_user(unsigned int total_size, void __user *pos; unsigned int size; int ret = 0; - void *loc_cpu_entry; unsigned int i = 0; struct arpt_entry *iter; @@ -1617,11 +1622,9 @@ static int compat_copy_entries_to_user(unsigned int total_size, if (IS_ERR(counters)) return PTR_ERR(counters); - /* choose the copy on our node/cpu */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; pos = userptr; size = total_size; - xt_entry_foreach(iter, loc_cpu_entry, total_size) { + xt_entry_foreach(iter, private->entries, total_size) { ret = compat_copy_entry_to_user(iter, &pos, &size, counters, i++); if (ret != 0) @@ -1790,8 +1793,7 @@ struct xt_table *arpt_register_table(struct net *net, goto out; } - /* choose the copy on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(newinfo, loc_cpu_entry, repl); @@ -1822,7 +1824,7 @@ void arpt_unregister_table(struct xt_table *table) private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) cleanup_entry(iter); if (private->number > private->initial_entries) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 2d0e265fef6e..6c72fbb7b49e 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -254,15 +254,13 @@ static void trace_packet(const struct sk_buff *skb, const struct xt_table_info *private, const struct ipt_entry *e) { - const void *table_base; const struct ipt_entry *root; const char *hookname, *chainname, *comment; const struct ipt_entry *iter; unsigned int rulenum = 0; struct net *net = dev_net(in ? in : out); - table_base = private->entries[smp_processor_id()]; - root = get_entry(table_base, private->hook_entry[hook]); + root = get_entry(private->entries, private->hook_entry[hook]); hookname = chainname = hooknames[hook]; comment = comments[NF_IP_TRACE_COMMENT_RULE]; @@ -331,7 +329,7 @@ ipt_do_table(struct sk_buff *skb, * pointer. */ smp_read_barrier_depends(); - table_base = private->entries[cpu]; + table_base = private->entries; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; stackptr = per_cpu_ptr(private->stackptr, cpu); origptr = *stackptr; @@ -345,6 +343,7 @@ ipt_do_table(struct sk_buff *skb, do { const struct xt_entry_target *t; const struct xt_entry_match *ematch; + struct xt_counters *counter; IP_NF_ASSERT(e); if (!ip_packet_match(ip, indev, outdev, @@ -361,7 +360,8 @@ ipt_do_table(struct sk_buff *skb, goto no_match; } - ADD_COUNTER(e->counters, skb->len, 1); + counter = xt_get_this_cpu_counter(&e->counters); + ADD_COUNTER(*counter, skb->len, 1); t = ipt_get_target(e); IP_NF_ASSERT(t->u.kernel.target); @@ -665,6 +665,10 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, if (ret) return ret; + e->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(e->counters.pcnt)) + return -ENOMEM; + j = 0; mtpar.net = net; mtpar.table = name; @@ -691,6 +695,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, ret = check_target(e, net, name); if (ret) goto err; + return 0; err: module_put(t->u.kernel.target->me); @@ -700,6 +705,9 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, break; cleanup_match(ematch, net); } + + xt_percpu_counter_free(e->counters.pcnt); + return ret; } @@ -784,6 +792,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net) if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); + xt_percpu_counter_free(e->counters.pcnt); } /* Checks and translates the user-supplied table segment (held in @@ -866,12 +875,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) { - if (newinfo->entries[i] && newinfo->entries[i] != entry0) - memcpy(newinfo->entries[i], entry0, newinfo->size); - } - return ret; } @@ -887,14 +890,16 @@ get_counters(const struct xt_table_info *t, seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; - xt_entry_foreach(iter, t->entries[cpu], t->size) { + xt_entry_foreach(iter, t->entries, t->size) { + struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); do { start = read_seqcount_begin(s); - bcnt = iter->counters.bcnt; - pcnt = iter->counters.pcnt; + bcnt = tmp->bcnt; + pcnt = tmp->pcnt; } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); @@ -939,11 +944,7 @@ copy_entries_to_user(unsigned int total_size, if (IS_ERR(counters)) return PTR_ERR(counters); - /* choose the copy that is on our node/cpu, ... - * This choice is lazy (because current thread is - * allowed to migrate to another cpu) - */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; @@ -1051,16 +1052,16 @@ static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { struct ipt_entry *iter; - void *loc_cpu_entry; + const void *loc_cpu_entry; int ret; if (!newinfo || !info) return -EINVAL; - /* we dont care about newinfo->entries[] */ + /* we dont care about newinfo->entries */ memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; - loc_cpu_entry = info->entries[raw_smp_processor_id()]; + loc_cpu_entry = info->entries; xt_compat_init_offsets(AF_INET, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); @@ -1181,7 +1182,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_table *t; struct xt_table_info *oldinfo; struct xt_counters *counters; - void *loc_cpu_old_entry; struct ipt_entry *iter; ret = 0; @@ -1224,8 +1224,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; - xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) + xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) cleanup_entry(iter, net); xt_free_table_info(oldinfo); @@ -1271,8 +1270,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; @@ -1303,7 +1301,7 @@ static int do_add_counters(struct net *net, const void __user *user, unsigned int len, int compat) { - unsigned int i, curcpu; + unsigned int i; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1313,7 +1311,6 @@ do_add_counters(struct net *net, const void __user *user, struct xt_table *t; const struct xt_table_info *private; int ret = 0; - void *loc_cpu_entry; struct ipt_entry *iter; unsigned int addend; #ifdef CONFIG_COMPAT @@ -1369,12 +1366,12 @@ do_add_counters(struct net *net, const void __user *user, } i = 0; - /* Choose the copy that is on our node */ - curcpu = smp_processor_id(); - loc_cpu_entry = private->entries[curcpu]; addend = xt_write_recseq_begin(); - xt_entry_foreach(iter, loc_cpu_entry, private->size) { - ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + xt_entry_foreach(iter, private->entries, private->size) { + struct xt_counters *tmp; + + tmp = xt_get_this_cpu_counter(&iter->counters); + ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt); ++i; } xt_write_recseq_end(addend); @@ -1444,7 +1441,6 @@ static int compat_find_calc_match(struct xt_entry_match *m, const char *name, const struct ipt_ip *ip, - unsigned int hookmask, int *size) { struct xt_match *match; @@ -1513,8 +1509,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, entry_offset = (void *)e - (void *)base; j = 0; xt_ematch_foreach(ematch, e) { - ret = compat_find_calc_match(ematch, name, - &e->ip, e->comefrom, &off); + ret = compat_find_calc_match(ematch, name, &e->ip, &off); if (ret != 0) goto release_matches; ++j; @@ -1610,6 +1605,10 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name) unsigned int j; int ret = 0; + e->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(e->counters.pcnt)) + return -ENOMEM; + j = 0; mtpar.net = net; mtpar.table = name; @@ -1634,6 +1633,9 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name) break; cleanup_match(ematch, net); } + + xt_percpu_counter_free(e->counters.pcnt); + return ret; } @@ -1718,7 +1720,7 @@ translate_compat_table(struct net *net, newinfo->hook_entry[i] = info->hook_entry[i]; newinfo->underflow[i] = info->underflow[i]; } - entry1 = newinfo->entries[raw_smp_processor_id()]; + entry1 = newinfo->entries; pos = entry1; size = total_size; xt_entry_foreach(iter0, entry0, total_size) { @@ -1770,11 +1772,6 @@ translate_compat_table(struct net *net, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) - if (newinfo->entries[i] && newinfo->entries[i] != entry1) - memcpy(newinfo->entries[i], entry1, newinfo->size); - *pinfo = newinfo; *pentry0 = entry1; xt_free_table_info(info); @@ -1821,8 +1818,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; @@ -1893,7 +1889,6 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *pos; unsigned int size; int ret = 0; - const void *loc_cpu_entry; unsigned int i = 0; struct ipt_entry *iter; @@ -1901,14 +1896,9 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, if (IS_ERR(counters)) return PTR_ERR(counters); - /* choose the copy that is on our node/cpu, ... - * This choice is lazy (because current thread is - * allowed to migrate to another cpu) - */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; pos = userptr; size = total_size; - xt_entry_foreach(iter, loc_cpu_entry, total_size) { + xt_entry_foreach(iter, private->entries, total_size) { ret = compat_copy_entry_to_user(iter, &pos, &size, counters, i++); if (ret != 0) @@ -2083,8 +2073,7 @@ struct xt_table *ipt_register_table(struct net *net, goto out; } - /* choose the copy on our node/cpu, but dont care about preemption */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); @@ -2115,7 +2104,7 @@ void ipt_unregister_table(struct net *net, struct xt_table *table) private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) cleanup_entry(iter, net); if (private->number > private->initial_entries) diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 771ab3d01ad3..45cb16a6a4a3 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -367,6 +367,11 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) struct clusterip_config *config; int ret; + if (par->nft_compat) { + pr_err("cannot use CLUSTERIP target from nftables compat\n"); + return -EOPNOTSUPP; + } + if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP && cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT && cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) { diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index e9e67793055f..fe8cc183411e 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -18,7 +18,7 @@ #include <net/netfilter/nf_conntrack_synproxy.h> static struct iphdr * -synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr) +synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr) { struct iphdr *iph; @@ -220,7 +220,7 @@ synproxy_send_client_ack(const struct synproxy_net *snet, nth->ack_seq = th->ack_seq; tcp_flag_word(nth) = TCP_FLAG_ACK; nth->doff = tcp_hdr_size / 4; - nth->window = ntohs(htons(th->window) >> opts->wscale); + nth->window = htons(ntohs(th->window) >> opts->wscale); nth->check = 0; nth->urg_ptr = 0; diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 4bfaedf9b34e..8618fd150c96 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -40,7 +40,7 @@ static bool rpfilter_lookup_reverse(struct flowi4 *fl4, struct net *net = dev_net(dev); int ret __maybe_unused; - if (fib_lookup(net, fl4, &res)) + if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE)) return false; if (res.type != RTN_UNICAST) { diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index e1f3b911dd1e..da5d483e236a 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -298,6 +298,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2), SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT), SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE), + SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE), + SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f45f2a12f37b..d0362a2de3d3 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -457,12 +457,9 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, } #define IP_IDENTS_SZ 2048u -struct ip_ident_bucket { - atomic_t id; - u32 stamp32; -}; -static struct ip_ident_bucket *ip_idents __read_mostly; +static atomic_t *ip_idents __read_mostly; +static u32 *ip_tstamps __read_mostly; /* In order to protect privacy, we add a perturbation to identifiers * if one generator is seldom used. This makes hard for an attacker @@ -470,15 +467,16 @@ static struct ip_ident_bucket *ip_idents __read_mostly; */ u32 ip_idents_reserve(u32 hash, int segs) { - struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ; - u32 old = ACCESS_ONCE(bucket->stamp32); + u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ; + atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; + u32 old = ACCESS_ONCE(*p_tstamp); u32 now = (u32)jiffies; u32 delta = 0; - if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) + if (old != now && cmpxchg(p_tstamp, old, now) == old) delta = prandom_u32_max(now - old); - return atomic_add_return(segs + delta, &bucket->id) - segs; + return atomic_add_return(segs + delta, p_id) - segs; } EXPORT_SYMBOL(ip_idents_reserve); @@ -749,7 +747,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow if (!(n->nud_state & NUD_VALID)) { neigh_event_send(n, NULL); } else { - if (fib_lookup(net, fl4, &res) == 0) { + if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh *nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, new_gw, @@ -977,7 +975,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) return; rcu_read_lock(); - if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) { + if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) { struct fib_nh *nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, 0, mtu, @@ -1188,7 +1186,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) fl4.flowi4_mark = skb->mark; rcu_read_lock(); - if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) + if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0) src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); else src = inet_select_addr(rt->dst.dev, @@ -1718,7 +1716,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.daddr = daddr; fl4.saddr = saddr; - err = fib_lookup(net, &fl4, &res); + err = fib_lookup(net, &fl4, &res, 0); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) err = -EHOSTUNREACH; @@ -2097,7 +2095,8 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) goto out; } if (ipv4_is_local_multicast(fl4->daddr) || - ipv4_is_lbcast(fl4->daddr)) { + ipv4_is_lbcast(fl4->daddr) || + fl4->flowi4_proto == IPPROTO_IGMP) { if (!fl4->saddr) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); @@ -2124,7 +2123,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) goto make_route; } - if (fib_lookup(net, fl4, &res)) { + if (fib_lookup(net, fl4, &res, 0)) { res.fi = NULL; res.table = NULL; if (fl4->flowi4_oif) { @@ -2742,6 +2741,10 @@ int __init ip_rt_init(void) prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); + ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL); + if (!ip_tstamps) + panic("IP: failed to allocate ip_tstamps\n"); + for_each_possible_cpu(cpu) { struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index df849e5a10f1..d70b1f603692 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -219,9 +219,9 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, } EXPORT_SYMBOL_GPL(__cookie_v4_check); -static struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst) +struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); struct sock *child; @@ -235,7 +235,7 @@ static struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, } return child; } - +EXPORT_SYMBOL(tcp_get_cookie_sock); /* * when syncookies are in effect and tcp timestamps are enabled we stored @@ -391,7 +391,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst); - ret = get_cookie_sock(sk, skb, req, &rt->dst); + ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup * Normal sockets get it right from inet_csk_route_child_sock() */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index c3852a7ff3c7..433231ccfb17 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -41,11 +41,19 @@ static int tcp_syn_retries_min = 1; static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; +static int min_sndbuf = SOCK_MIN_SNDBUF; +static int min_rcvbuf = SOCK_MIN_RCVBUF; /* Update system visible IP port range */ static void set_local_port_range(struct net *net, int range[2]) { + bool same_parity = !((range[0] ^ range[1]) & 1); + write_seqlock(&net->ipv4.ip_local_ports.lock); + if (same_parity && !net->ipv4.ip_local_ports.warned) { + net->ipv4.ip_local_ports.warned = true; + pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n"); + } net->ipv4.ip_local_ports.range[0] = range[0]; net->ipv4.ip_local_ports.range[1] = range[1]; write_sequnlock(&net->ipv4.ip_local_ports.lock); @@ -522,7 +530,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(sysctl_tcp_wmem), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_sndbuf, }, { .procname = "tcp_notsent_lowat", @@ -537,7 +545,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(sysctl_tcp_rmem), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_rcvbuf, }, { .procname = "tcp_app_win", @@ -702,7 +710,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = &one, .extra2 = &gso_max_segs, }, { @@ -750,7 +758,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(sysctl_udp_rmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one + .extra1 = &min_rcvbuf, }, { .procname = "udp_wmem_min", @@ -758,7 +766,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(sysctl_udp_wmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one + .extra1 = &min_sndbuf, }, { } }; @@ -821,6 +829,13 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { + .procname = "tcp_ecn_fallback", + .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .procname = "ip_local_port_range", .maxlen = sizeof(init_net.ipv4.ip_local_ports.range), .data = &init_net.ipv4.ip_local_ports.range, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f1377f2a0472..7f4056785acc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -695,8 +695,9 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, struct tcp_splice_state *tss = rd_desc->arg.data; int ret; - ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len), - tss->flags); + ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe, + min(rd_desc->count, len), tss->flags, + skb_socket_splice); if (ret > 0) rd_desc->count -= ret; return ret; @@ -809,16 +810,28 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, } EXPORT_SYMBOL(tcp_splice_read); -struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) +struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, + bool force_schedule) { struct sk_buff *skb; /* The TCP header must be at least 32-bit aligned. */ size = ALIGN(size, 4); + if (unlikely(tcp_under_memory_pressure(sk))) + sk_mem_reclaim_partial(sk); + skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); - if (skb) { - if (sk_wmem_schedule(sk, skb->truesize)) { + if (likely(skb)) { + bool mem_scheduled; + + if (force_schedule) { + mem_scheduled = true; + sk_forced_mem_schedule(sk, skb->truesize); + } else { + mem_scheduled = sk_wmem_schedule(sk, skb->truesize); + } + if (likely(mem_scheduled)) { skb_reserve(skb, sk->sk_prot->max_header); /* * Make sure that we have exactly size bytes @@ -908,7 +921,8 @@ new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, + skb_queue_empty(&sk->sk_write_queue)); if (!skb) goto wait_for_memory; @@ -987,6 +1001,9 @@ do_error: if (copied) goto out; out_err: + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + sk->sk_write_space(sk); return sk_stream_error(sk, flags, err); } @@ -1144,7 +1161,8 @@ new_segment: skb = sk_stream_alloc_skb(sk, select_size(sk, sg), - sk->sk_allocation); + sk->sk_allocation, + skb_queue_empty(&sk->sk_write_queue)); if (!skb) goto wait_for_memory; @@ -1275,6 +1293,9 @@ do_error: goto out; out_err: err = sk_stream_error(sk, flags, err); + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + sk->sk_write_space(sk); release_sock(sk); return err; } @@ -2483,6 +2504,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level, icsk->icsk_syn_retries = val; break; + case TCP_SAVE_SYN: + if (val < 0 || val > 1) + err = -EINVAL; + else + tp->save_syn = val; + break; + case TCP_LINGER2: if (val < 0) tp->linger2 = -1; @@ -2545,10 +2573,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | - TCPF_LISTEN))) + TCPF_LISTEN))) { + tcp_fastopen_init_key_once(true); + err = fastopen_init_queue(sk, val); - else + } else { err = -EINVAL; + } break; case TCP_TIMESTAMP: if (!tp->repair) @@ -2596,13 +2627,15 @@ EXPORT_SYMBOL(compat_tcp_setsockopt); /* Return information about state of tcp endpoint in API format. */ void tcp_get_info(struct sock *sk, struct tcp_info *info) { - const struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; unsigned int start; u32 rate; memset(info, 0, sizeof(*info)); + if (sk->sk_type != SOCK_STREAM) + return; info->tcpi_state = sk->sk_state; info->tcpi_ca_state = icsk->icsk_ca_state; @@ -2672,6 +2705,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_bytes_acked = tp->bytes_acked; info->tcpi_bytes_received = tp->bytes_received; } while (u64_stats_fetch_retry_irq(&tp->syncp, start)); + info->tcpi_segs_out = tp->segs_out; + info->tcpi_segs_in = tp->segs_in; } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2821,6 +2856,42 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_NOTSENT_LOWAT: val = tp->notsent_lowat; break; + case TCP_SAVE_SYN: + val = tp->save_syn; + break; + case TCP_SAVED_SYN: { + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + if (tp->saved_syn) { + if (len < tp->saved_syn[0]) { + if (put_user(tp->saved_syn[0], optlen)) { + release_sock(sk); + return -EFAULT; + } + release_sock(sk); + return -EINVAL; + } + len = tp->saved_syn[0]; + if (put_user(len, optlen)) { + release_sock(sk); + return -EFAULT; + } + if (copy_to_user(optval, tp->saved_syn + 1, len)) { + release_sock(sk); + return -EFAULT; + } + tcp_saved_syn_free(tp); + release_sock(sk); + } else { + release_sock(sk); + len = 0; + if (put_user(len, optlen)) + return -EFAULT; + } + return 0; + } default: return -ENOPROTOOPT; } @@ -3025,11 +3096,12 @@ __setup("thash_entries=", set_thash_entries); static void __init tcp_init_mem(void) { - unsigned long limit = nr_free_buffer_pages() / 8; + unsigned long limit = nr_free_buffer_pages() / 16; + limit = max(limit, 128UL); - sysctl_tcp_mem[0] = limit / 4 * 3; - sysctl_tcp_mem[1] = limit; - sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; + sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */ + sysctl_tcp_mem[1] = limit; /* 6.25 % */ + sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */ } void __init tcp_init(void) diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c new file mode 100644 index 000000000000..8c6fd3d5e40f --- /dev/null +++ b/net/ipv4/tcp_cdg.c @@ -0,0 +1,433 @@ +/* + * CAIA Delay-Gradient (CDG) congestion control + * + * This implementation is based on the paper: + * D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using + * delay gradients." In IFIP Networking, pages 328-341. Springer, 2011. + * + * Scavenger traffic (Less-than-Best-Effort) should disable coexistence + * heuristics using parameters use_shadow=0 and use_ineff=0. + * + * Parameters window, backoff_beta, and backoff_factor are crucial for + * throughput and delay. Future work is needed to determine better defaults, + * and to provide guidelines for use in different environments/contexts. + * + * Except for window, knobs are configured via /sys/module/tcp_cdg/parameters/. + * Parameter window is only configurable when loading tcp_cdg as a module. + * + * Notable differences from paper/FreeBSD: + * o Using Hybrid Slow start and Proportional Rate Reduction. + * o Add toggle for shadow window mechanism. Suggested by David Hayes. + * o Add toggle for non-congestion loss tolerance. + * o Scaling parameter G is changed to a backoff factor; + * conversion is given by: backoff_factor = 1000/(G * window). + * o Limit shadow window to 2 * cwnd, or to cwnd when application limited. + * o More accurate e^-x. + */ +#include <linux/kernel.h> +#include <linux/random.h> +#include <linux/module.h> +#include <net/tcp.h> + +#define HYSTART_ACK_TRAIN 1 +#define HYSTART_DELAY 2 + +static int window __read_mostly = 8; +static unsigned int backoff_beta __read_mostly = 0.7071 * 1024; /* sqrt 0.5 */ +static unsigned int backoff_factor __read_mostly = 42; +static unsigned int hystart_detect __read_mostly = 3; +static unsigned int use_ineff __read_mostly = 5; +static bool use_shadow __read_mostly = true; +static bool use_tolerance __read_mostly; + +module_param(window, int, 0444); +MODULE_PARM_DESC(window, "gradient window size (power of two <= 256)"); +module_param(backoff_beta, uint, 0644); +MODULE_PARM_DESC(backoff_beta, "backoff beta (0-1024)"); +module_param(backoff_factor, uint, 0644); +MODULE_PARM_DESC(backoff_factor, "backoff probability scale factor"); +module_param(hystart_detect, uint, 0644); +MODULE_PARM_DESC(hystart_detect, "use Hybrid Slow start " + "(0: disabled, 1: ACK train, 2: delay threshold, 3: both)"); +module_param(use_ineff, uint, 0644); +MODULE_PARM_DESC(use_ineff, "use ineffectual backoff detection (threshold)"); +module_param(use_shadow, bool, 0644); +MODULE_PARM_DESC(use_shadow, "use shadow window heuristic"); +module_param(use_tolerance, bool, 0644); +MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic"); + +struct minmax { + union { + struct { + s32 min; + s32 max; + }; + u64 v64; + }; +}; + +enum cdg_state { + CDG_UNKNOWN = 0, + CDG_NONFULL = 1, + CDG_FULL = 2, + CDG_BACKOFF = 3, +}; + +struct cdg { + struct minmax rtt; + struct minmax rtt_prev; + struct minmax *gradients; + struct minmax gsum; + bool gfilled; + u8 tail; + u8 state; + u8 delack; + u32 rtt_seq; + u32 undo_cwnd; + u32 shadow_wnd; + u16 backoff_cnt; + u16 sample_cnt; + s32 delay_min; + u32 last_ack; + u32 round_start; +}; + +/** + * nexp_u32 - negative base-e exponential + * @ux: x in units of micro + * + * Returns exp(ux * -1e-6) * U32_MAX. + */ +static u32 __pure nexp_u32(u32 ux) +{ + static const u16 v[] = { + /* exp(-x)*65536-1 for x = 0, 0.000256, 0.000512, ... */ + 65535, + 65518, 65501, 65468, 65401, 65267, 65001, 64470, 63422, + 61378, 57484, 50423, 38795, 22965, 8047, 987, 14, + }; + u32 msb = ux >> 8; + u32 res; + int i; + + /* Cut off when ux >= 2^24 (actual result is <= 222/U32_MAX). */ + if (msb > U16_MAX) + return 0; + + /* Scale first eight bits linearly: */ + res = U32_MAX - (ux & 0xff) * (U32_MAX / 1000000); + + /* Obtain e^(x + y + ...) by computing e^x * e^y * ...: */ + for (i = 1; msb; i++, msb >>= 1) { + u32 y = v[i & -(msb & 1)] + U32_C(1); + + res = ((u64)res * y) >> 16; + } + + return res; +} + +/* Based on the HyStart algorithm (by Ha et al.) that is implemented in + * tcp_cubic. Differences/experimental changes: + * o Using Hayes' delayed ACK filter. + * o Using a usec clock for the ACK train. + * o Reset ACK train when application limited. + * o Invoked at any cwnd (i.e. also when cwnd < 16). + * o Invoked only when cwnd < ssthresh (i.e. not when cwnd == ssthresh). + */ +static void tcp_cdg_hystart_update(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + ca->delay_min = min_not_zero(ca->delay_min, ca->rtt.min); + if (ca->delay_min == 0) + return; + + if (hystart_detect & HYSTART_ACK_TRAIN) { + u32 now_us = div_u64(local_clock(), NSEC_PER_USEC); + + if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) { + ca->last_ack = now_us; + ca->round_start = now_us; + } else if (before(now_us, ca->last_ack + 3000)) { + u32 base_owd = max(ca->delay_min / 2U, 125U); + + ca->last_ack = now_us; + if (after(now_us, ca->round_start + base_owd)) { + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINDETECT); + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINCWND, + tp->snd_cwnd); + tp->snd_ssthresh = tp->snd_cwnd; + return; + } + } + } + + if (hystart_detect & HYSTART_DELAY) { + if (ca->sample_cnt < 8) { + ca->sample_cnt++; + } else { + s32 thresh = max(ca->delay_min + ca->delay_min / 8U, + 125U); + + if (ca->rtt.min > thresh) { + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYDETECT); + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYCWND, + tp->snd_cwnd); + tp->snd_ssthresh = tp->snd_cwnd; + } + } + } +} + +static s32 tcp_cdg_grad(struct cdg *ca) +{ + s32 gmin = ca->rtt.min - ca->rtt_prev.min; + s32 gmax = ca->rtt.max - ca->rtt_prev.max; + s32 grad; + + if (ca->gradients) { + ca->gsum.min += gmin - ca->gradients[ca->tail].min; + ca->gsum.max += gmax - ca->gradients[ca->tail].max; + ca->gradients[ca->tail].min = gmin; + ca->gradients[ca->tail].max = gmax; + ca->tail = (ca->tail + 1) & (window - 1); + gmin = ca->gsum.min; + gmax = ca->gsum.max; + } + + /* We keep sums to ignore gradients during cwnd reductions; + * the paper's smoothed gradients otherwise simplify to: + * (rtt_latest - rtt_oldest) / window. + * + * We also drop division by window here. + */ + grad = gmin > 0 ? gmin : gmax; + + /* Extrapolate missing values in gradient window: */ + if (!ca->gfilled) { + if (!ca->gradients && window > 1) + grad *= window; /* Memory allocation failed. */ + else if (ca->tail == 0) + ca->gfilled = true; + else + grad = (grad * window) / (int)ca->tail; + } + + /* Backoff was effectual: */ + if (gmin <= -32 || gmax <= -32) + ca->backoff_cnt = 0; + + if (use_tolerance) { + /* Reduce small variations to zero: */ + gmin = DIV_ROUND_CLOSEST(gmin, 64); + gmax = DIV_ROUND_CLOSEST(gmax, 64); + + if (gmin > 0 && gmax <= 0) + ca->state = CDG_FULL; + else if ((gmin > 0 && gmax > 0) || gmax < 0) + ca->state = CDG_NONFULL; + } + return grad; +} + +static bool tcp_cdg_backoff(struct sock *sk, u32 grad) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + if (prandom_u32() <= nexp_u32(grad * backoff_factor)) + return false; + + if (use_ineff) { + ca->backoff_cnt++; + if (ca->backoff_cnt > use_ineff) + return false; + } + + ca->shadow_wnd = max(ca->shadow_wnd, tp->snd_cwnd); + ca->state = CDG_BACKOFF; + tcp_enter_cwr(sk); + return true; +} + +/* Not called in CWR or Recovery state. */ +static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + u32 prior_snd_cwnd; + u32 incr; + + if (tp->snd_cwnd < tp->snd_ssthresh && hystart_detect) + tcp_cdg_hystart_update(sk); + + if (after(ack, ca->rtt_seq) && ca->rtt.v64) { + s32 grad = 0; + + if (ca->rtt_prev.v64) + grad = tcp_cdg_grad(ca); + ca->rtt_seq = tp->snd_nxt; + ca->rtt_prev = ca->rtt; + ca->rtt.v64 = 0; + ca->last_ack = 0; + ca->sample_cnt = 0; + + if (grad > 0 && tcp_cdg_backoff(sk, grad)) + return; + } + + if (!tcp_is_cwnd_limited(sk)) { + ca->shadow_wnd = min(ca->shadow_wnd, tp->snd_cwnd); + return; + } + + prior_snd_cwnd = tp->snd_cwnd; + tcp_reno_cong_avoid(sk, ack, acked); + + incr = tp->snd_cwnd - prior_snd_cwnd; + ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr); +} + +static void tcp_cdg_acked(struct sock *sk, u32 num_acked, s32 rtt_us) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + if (rtt_us <= 0) + return; + + /* A heuristic for filtering delayed ACKs, adapted from: + * D.A. Hayes. "Timing enhancements to the FreeBSD kernel to support + * delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010. + */ + if (tp->sacked_out == 0) { + if (num_acked == 1 && ca->delack) { + /* A delayed ACK is only used for the minimum if it is + * provenly lower than an existing non-zero minimum. + */ + ca->rtt.min = min(ca->rtt.min, rtt_us); + ca->delack--; + return; + } else if (num_acked > 1 && ca->delack < 5) { + ca->delack++; + } + } + + ca->rtt.min = min_not_zero(ca->rtt.min, rtt_us); + ca->rtt.max = max(ca->rtt.max, rtt_us); +} + +static u32 tcp_cdg_ssthresh(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + ca->undo_cwnd = tp->snd_cwnd; + + if (ca->state == CDG_BACKOFF) + return max(2U, (tp->snd_cwnd * min(1024U, backoff_beta)) >> 10); + + if (ca->state == CDG_NONFULL && use_tolerance) + return tp->snd_cwnd; + + ca->shadow_wnd = min(ca->shadow_wnd >> 1, tp->snd_cwnd); + if (use_shadow) + return max3(2U, ca->shadow_wnd, tp->snd_cwnd >> 1); + return max(2U, tp->snd_cwnd >> 1); +} + +static u32 tcp_cdg_undo_cwnd(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->undo_cwnd); +} + +static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct minmax *gradients; + + switch (ev) { + case CA_EVENT_CWND_RESTART: + gradients = ca->gradients; + if (gradients) + memset(gradients, 0, window * sizeof(gradients[0])); + memset(ca, 0, sizeof(*ca)); + + ca->gradients = gradients; + ca->rtt_seq = tp->snd_nxt; + ca->shadow_wnd = tp->snd_cwnd; + break; + case CA_EVENT_COMPLETE_CWR: + ca->state = CDG_UNKNOWN; + ca->rtt_seq = tp->snd_nxt; + ca->rtt_prev = ca->rtt; + ca->rtt.v64 = 0; + break; + default: + break; + } +} + +static void tcp_cdg_init(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + /* We silently fall back to window = 1 if allocation fails. */ + if (window > 1) + ca->gradients = kcalloc(window, sizeof(ca->gradients[0]), + GFP_NOWAIT | __GFP_NOWARN); + ca->rtt_seq = tp->snd_nxt; + ca->shadow_wnd = tp->snd_cwnd; +} + +static void tcp_cdg_release(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + + kfree(ca->gradients); +} + +struct tcp_congestion_ops tcp_cdg __read_mostly = { + .cong_avoid = tcp_cdg_cong_avoid, + .cwnd_event = tcp_cdg_cwnd_event, + .pkts_acked = tcp_cdg_acked, + .undo_cwnd = tcp_cdg_undo_cwnd, + .ssthresh = tcp_cdg_ssthresh, + .release = tcp_cdg_release, + .init = tcp_cdg_init, + .owner = THIS_MODULE, + .name = "cdg", +}; + +static int __init tcp_cdg_register(void) +{ + if (backoff_beta > 1024 || window < 1 || window > 256) + return -ERANGE; + if (!is_power_of_2(window)) + return -EINVAL; + + BUILD_BUG_ON(sizeof(struct cdg) > ICSK_CA_PRIV_SIZE); + tcp_register_congestion_control(&tcp_cdg); + return 0; +} + +static void __exit tcp_cdg_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_cdg); +} + +module_init(tcp_cdg_register); +module_exit(tcp_cdg_unregister); +MODULE_AUTHOR("Kenneth Klette Jonassen"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP CDG"); diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 4c41c1287197..7092a61c4dc8 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -204,20 +204,26 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags) /* Expired RTT */ if (!before(tp->snd_una, ca->next_seq)) { - /* For avoiding denominator == 1. */ - if (ca->acked_bytes_total == 0) - ca->acked_bytes_total = 1; + u64 bytes_ecn = ca->acked_bytes_ecn; + u32 alpha = ca->dctcp_alpha; /* alpha = (1 - g) * alpha + g * F */ - ca->dctcp_alpha = ca->dctcp_alpha - - (ca->dctcp_alpha >> dctcp_shift_g) + - (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) / - ca->acked_bytes_total; - if (ca->dctcp_alpha > DCTCP_MAX_ALPHA) - /* Clamp dctcp_alpha to max. */ - ca->dctcp_alpha = DCTCP_MAX_ALPHA; + alpha -= alpha >> dctcp_shift_g; + if (bytes_ecn) { + /* If dctcp_shift_g == 1, a 32bit value would overflow + * after 8 Mbytes. + */ + bytes_ecn <<= (10 - dctcp_shift_g); + do_div(bytes_ecn, max(1U, ca->acked_bytes_total)); + alpha = min(alpha + (u32)bytes_ecn, DCTCP_MAX_ALPHA); + } + /* dctcp_alpha can be read from dctcp_get_info() without + * synchro, so we ask compiler to not use dctcp_alpha + * as a temporary variable in prior operations. + */ + WRITE_ONCE(ca->dctcp_alpha, alpha); dctcp_reset(tp, ca); } } diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 79b34a0f4a4a..479f34946177 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -19,13 +19,14 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *_info) { - const struct tcp_sock *tp = tcp_sk(sk); struct tcp_info *info = _info; if (sk->sk_state == TCP_LISTEN) { r->idiag_rqueue = sk->sk_ack_backlog; r->idiag_wqueue = sk->sk_max_ack_backlog; - } else { + } else if (sk->sk_type == SOCK_STREAM) { + const struct tcp_sock *tp = tcp_sk(sk); + r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); r->idiag_wqueue = tp->write_seq - tp->snd_una; } @@ -50,6 +51,7 @@ static const struct inet_diag_handler tcp_diag_handler = { .dump_one = tcp_diag_dump_one, .idiag_get_info = tcp_diag_get_info, .idiag_type = IPPROTO_TCP, + .idiag_info_size = sizeof(struct tcp_info), }; static int __init tcp_diag_init(void) diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 46b087a27503..f9c0fb84e435 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -78,8 +78,6 @@ static bool __tcp_fastopen_cookie_gen(const void *path, struct tcp_fastopen_context *ctx; bool ok = false; - tcp_fastopen_init_key_once(true); - rcu_read_lock(); ctx = rcu_dereference(tcp_fastopen_ctx); if (ctx) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c9ab964189a0..684f095d196e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -359,7 +359,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) /* Check #1 */ if (tp->rcv_ssthresh < tp->window_clamp && (int)tp->rcv_ssthresh < tcp_space(sk) && - !sk_under_memory_pressure(sk)) { + !tcp_under_memory_pressure(sk)) { int incr; /* Check #2. Increase window, if skb with such overhead @@ -446,7 +446,7 @@ static void tcp_clamp_window(struct sock *sk) if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && - !sk_under_memory_pressure(sk) && + !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), sysctl_tcp_rmem[2]); @@ -1130,7 +1130,12 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sacktag_state { int reord; int fack_count; - long rtt_us; /* RTT measured by SACKing never-retransmitted data */ + /* Timestamps for earliest and latest never-retransmitted segment + * that was SACKed. RTO needs the earliest RTT to stay conservative, + * but congestion control should still get an accurate delay signal. + */ + struct skb_mstamp first_sackt; + struct skb_mstamp last_sackt; int flag; }; @@ -1233,14 +1238,9 @@ static u8 tcp_sacktag_one(struct sock *sk, state->reord); if (!after(end_seq, tp->high_seq)) state->flag |= FLAG_ORIG_SACK_ACKED; - /* Pick the earliest sequence sacked for RTT */ - if (state->rtt_us < 0) { - struct skb_mstamp now; - - skb_mstamp_get(&now); - state->rtt_us = skb_mstamp_us_delta(&now, - xmit_time); - } + if (state->first_sackt.v64 == 0) + state->first_sackt = *xmit_time; + state->last_sackt = *xmit_time; } if (sacked & TCPCB_LOST) { @@ -1316,16 +1316,12 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, * code can come after this skb later on it's better to keep * setting gso_size to something. */ - if (!skb_shinfo(prev)->gso_size) { - skb_shinfo(prev)->gso_size = mss; - skb_shinfo(prev)->gso_type = sk->sk_gso_type; - } + if (!TCP_SKB_CB(prev)->tcp_gso_size) + TCP_SKB_CB(prev)->tcp_gso_size = mss; /* CHECKME: To clear or not to clear? Mimics normal skb currently */ - if (tcp_skb_pcount(skb) <= 1) { - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; - } + if (tcp_skb_pcount(skb) <= 1) + TCP_SKB_CB(skb)->tcp_gso_size = 0; /* Difference in this won't matter, both ACKed by the same cumul. ACK */ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); @@ -1634,7 +1630,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl static int tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, - u32 prior_snd_una, long *sack_rtt_us) + u32 prior_snd_una, struct tcp_sacktag_state *state) { struct tcp_sock *tp = tcp_sk(sk); const unsigned char *ptr = (skb_transport_header(ack_skb) + @@ -1642,7 +1638,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); struct tcp_sack_block sp[TCP_NUM_SACKS]; struct tcp_sack_block *cache; - struct tcp_sacktag_state state; struct sk_buff *skb; int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); int used_sacks; @@ -1650,9 +1645,8 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, int i, j; int first_sack_index; - state.flag = 0; - state.reord = tp->packets_out; - state.rtt_us = -1L; + state->flag = 0; + state->reord = tp->packets_out; if (!tp->sacked_out) { if (WARN_ON(tp->fackets_out)) @@ -1663,7 +1657,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una); if (found_dup_sack) - state.flag |= FLAG_DSACKING_ACK; + state->flag |= FLAG_DSACKING_ACK; /* Eliminate too old ACKs, but take into * account more or less fresh ones, they can @@ -1728,7 +1722,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, } skb = tcp_write_queue_head(sk); - state.fack_count = 0; + state->fack_count = 0; i = 0; if (!tp->sacked_out) { @@ -1762,10 +1756,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, /* Head todo? */ if (before(start_seq, cache->start_seq)) { - skb = tcp_sacktag_skip(skb, sk, &state, + skb = tcp_sacktag_skip(skb, sk, state, start_seq); skb = tcp_sacktag_walk(skb, sk, next_dup, - &state, + state, start_seq, cache->start_seq, dup_sack); @@ -1776,7 +1770,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, goto advance_sp; skb = tcp_maybe_skipping_dsack(skb, sk, next_dup, - &state, + state, cache->end_seq); /* ...tail remains todo... */ @@ -1785,12 +1779,12 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, skb = tcp_highest_sack(sk); if (!skb) break; - state.fack_count = tp->fackets_out; + state->fack_count = tp->fackets_out; cache++; goto walk; } - skb = tcp_sacktag_skip(skb, sk, &state, cache->end_seq); + skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq); /* Check overlap against next cached too (past this one already) */ cache++; continue; @@ -1800,12 +1794,12 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, skb = tcp_highest_sack(sk); if (!skb) break; - state.fack_count = tp->fackets_out; + state->fack_count = tp->fackets_out; } - skb = tcp_sacktag_skip(skb, sk, &state, start_seq); + skb = tcp_sacktag_skip(skb, sk, state, start_seq); walk: - skb = tcp_sacktag_walk(skb, sk, next_dup, &state, + skb = tcp_sacktag_walk(skb, sk, next_dup, state, start_seq, end_seq, dup_sack); advance_sp: @@ -1820,9 +1814,9 @@ advance_sp: for (j = 0; j < used_sacks; j++) tp->recv_sack_cache[i++] = sp[j]; - if ((state.reord < tp->fackets_out) && + if ((state->reord < tp->fackets_out) && ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) - tcp_update_reordering(sk, tp->fackets_out - state.reord, 0); + tcp_update_reordering(sk, tp->fackets_out - state->reord, 0); tcp_mark_lost_retrans(sk); tcp_verify_left_out(tp); @@ -1834,8 +1828,7 @@ out: WARN_ON((int)tp->retrans_out < 0); WARN_ON((int)tcp_packets_in_flight(tp) < 0); #endif - *sack_rtt_us = state.rtt_us; - return state.flag; + return state->flag; } /* Limits sacked_out so that sum with lost_out isn't ever larger than @@ -2255,7 +2248,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) (oldcnt >= packets)) break; - mss = skb_shinfo(skb)->gso_size; + mss = tcp_skb_mss(skb); err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss, GFP_ATOMIC); if (err < 0) @@ -2555,6 +2548,7 @@ void tcp_enter_cwr(struct sock *sk) tcp_set_ca_state(sk, TCP_CA_CWR); } } +EXPORT_SYMBOL(tcp_enter_cwr); static void tcp_try_keep_open(struct sock *sk) { @@ -3055,7 +3049,8 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, * arrived at the other end. */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - u32 prior_snd_una, long sack_rtt_us) + u32 prior_snd_una, + struct tcp_sacktag_state *sack) { const struct inet_connection_sock *icsk = inet_csk(sk); struct skb_mstamp first_ackt, last_ackt, now; @@ -3063,8 +3058,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; bool fully_acked = true; - long ca_seq_rtt_us = -1L; + long sack_rtt_us = -1L; long seq_rtt_us = -1L; + long ca_rtt_us = -1L; struct sk_buff *skb; u32 pkts_acked = 0; bool rtt_update; @@ -3153,15 +3149,16 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, skb_mstamp_get(&now); if (likely(first_ackt.v64)) { seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); - ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); + ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); + } + if (sack->first_sackt.v64) { + sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt); + ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt); } rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); if (flag & FLAG_ACKED) { - const struct tcp_congestion_ops *ca_ops - = inet_csk(sk)->icsk_ca_ops; - tcp_rearm_rto(sk); if (unlikely(icsk->icsk_mtup.probe_size && !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { @@ -3184,11 +3181,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->fackets_out -= min(pkts_acked, tp->fackets_out); - if (ca_ops->pkts_acked) { - long rtt_us = min_t(ulong, ca_seq_rtt_us, sack_rtt_us); - ca_ops->pkts_acked(sk, pkts_acked, rtt_us); - } - } else if (skb && rtt_update && sack_rtt_us >= 0 && sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { /* Do not re-arm RTO if the sack RTT is measured from data sent @@ -3198,6 +3190,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tcp_rearm_rto(sk); } + if (icsk->icsk_ca_ops->pkts_acked) + icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us); + #if FASTRETRANS_DEBUG > 0 WARN_ON((int)tp->sacked_out < 0); WARN_ON((int)tp->lost_out < 0); @@ -3238,7 +3233,7 @@ static void tcp_ack_probe(struct sock *sk) * This function is not for random using! */ } else { - unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); + unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX); @@ -3466,6 +3461,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sacktag_state sack_state; u32 prior_snd_una = tp->snd_una; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; @@ -3474,7 +3470,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) int prior_packets = tp->packets_out; const int prior_unsacked = tp->packets_out - tp->sacked_out; int acked = 0; /* Number of packets newly acked */ - long sack_rtt_us = -1L; + + sack_state.first_sackt.v64 = 0; /* We very likely will need to access write queue head. */ prefetchw(sk->sk_write_queue.next); @@ -3538,7 +3535,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (TCP_SKB_CB(skb)->sacked) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_rtt_us); + &sack_state); if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { flag |= FLAG_ECE; @@ -3563,7 +3560,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ acked = tp->packets_out; flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, - sack_rtt_us); + &sack_state); acked -= tp->packets_out; /* Advance cwnd if state allows */ @@ -3615,7 +3612,7 @@ old_ack: */ if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_rtt_us); + &sack_state); tcp_fastretrans_alert(sk, acked, prior_unsacked, is_dupack, flag); } @@ -4514,10 +4511,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (eaten <= 0) { queue_and_out: - if (eaten < 0 && - tcp_try_rmem_schedule(sk, skb, skb->truesize)) - goto drop; - + if (eaten < 0) { + if (skb_queue_len(&sk->sk_receive_queue) == 0) + sk_forced_mem_schedule(sk, skb->truesize); + else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) + goto drop; + } eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); } tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); @@ -4788,7 +4787,7 @@ static int tcp_prune_queue(struct sock *sk) if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) tcp_clamp_window(sk); - else if (sk_under_memory_pressure(sk)) + else if (tcp_under_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); tcp_collapse_ofo_queue(sk); @@ -4832,7 +4831,7 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk) return false; /* If we are under global TCP memory pressure, do not expand. */ - if (sk_under_memory_pressure(sk)) + if (tcp_under_memory_pressure(sk)) return false; /* If we are under soft global TCP memory pressure, do not expand. */ @@ -6067,6 +6066,23 @@ static bool tcp_syn_flood_action(struct sock *sk, return want_cookie; } +static void tcp_reqsk_record_syn(const struct sock *sk, + struct request_sock *req, + const struct sk_buff *skb) +{ + if (tcp_sk(sk)->save_syn) { + u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb); + u32 *copy; + + copy = kmalloc(len + sizeof(u32), GFP_ATOMIC); + if (copy) { + copy[0] = len; + memcpy(©[1], skb_network_header(skb), len); + req->saved_syn = copy; + } + } +} + int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) @@ -6199,6 +6215,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->tfo_listener = false; af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); } + tcp_reqsk_record_syn(sk, req, skb); return 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fc1c658ec6c1..d7d4c2b79cf2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1400,7 +1400,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; } - if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) + if (tcp_checksum_complete(skb)) goto csum_err; if (sk->sk_state == TCP_LISTEN) { @@ -1626,6 +1626,7 @@ process: skb->dev = NULL; bh_lock_sock_nested(sk); + tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); ret = 0; if (!sock_owned_by_user(sk)) { if (!tcp_prequeue(sk, skb)) @@ -1646,7 +1647,7 @@ no_tcp_socket: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard_it; - if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { + if (tcp_checksum_complete(skb)) { csum_error: TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS); bad_packet: @@ -1670,10 +1671,6 @@ do_time_wait: goto discard_it; } - if (skb->len < (th->doff << 2)) { - inet_twsk_put(inet_twsk(sk)); - goto bad_packet; - } if (tcp_checksum_complete(skb)) { inet_twsk_put(inet_twsk(sk)); goto csum_error; @@ -1802,6 +1799,7 @@ void tcp_v4_destroy_sock(struct sock *sk) /* If socket is aborted during connect operation */ tcp_free_fastopen_req(tp); + tcp_saved_syn_free(tp); sk_sockets_allocated_dec(sk); sock_release_memcg(sk); @@ -2410,12 +2408,15 @@ static int __net_init tcp_sk_init(struct net *net) goto fail; *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; } + net->ipv4.sysctl_tcp_ecn = 2; + net->ipv4.sysctl_tcp_ecn_fallback = 1; + net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; - return 0; + return 0; fail: tcp_sk_exit(net); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 17e7339ee5ca..4bc00cb79e60 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -451,6 +451,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; + newtp->segs_in = 0; newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; @@ -539,6 +540,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->fastopen_rsk = NULL; newtp->syn_data_acked = 0; + newtp->saved_syn = req->saved_syn; + req->saved_syn = NULL; + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); } return newsk; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 3f7c2fca5431..9864a2dbadce 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -77,7 +77,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, oldlen = (u16)~skb->len; __skb_pull(skb, thlen); - mss = tcp_skb_mss(skb); + mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) goto out; @@ -242,7 +242,7 @@ found: flush |= *(u32 *)((u8 *)th + i) ^ *(u32 *)((u8 *)th2 + i); - mss = tcp_skb_mss(p); + mss = skb_shinfo(p)->gso_size; flush |= (len - 1) >= mss; flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a369e8a70b2c..b1c218df2c85 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -50,8 +50,8 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1; */ int sysctl_tcp_workaround_signed_windows __read_mostly = 0; -/* Default TSQ limit of two TSO segments */ -int sysctl_tcp_limit_output_bytes __read_mostly = 131072; +/* Default TSQ limit of four TSO segments */ +int sysctl_tcp_limit_output_bytes __read_mostly = 262144; /* This limits the percentage of the congestion window which we * will allow a single TSO frame to consume. Building TSO frames @@ -350,6 +350,15 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) } } +static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) +{ + if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback) + /* tp->ecn_flags are cleared at a later point in time when + * SYN ACK is ultimatively being received. + */ + TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR); +} + static void tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th, struct sock *sk) @@ -393,8 +402,6 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, */ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) { - struct skb_shared_info *shinfo = skb_shinfo(skb); - skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; @@ -402,8 +409,6 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) TCP_SKB_CB(skb)->sacked = 0; tcp_skb_pcount_set(skb, 1); - shinfo->gso_size = 0; - shinfo->gso_type = 0; TCP_SKB_CB(skb)->seq = seq; if (flags & (TCPHDR_SYN | TCPHDR_FIN)) @@ -994,6 +999,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, } tcp_options_write((__be32 *)(th + 1), tp, &opts); + skb_shinfo(skb)->gso_type = sk->sk_gso_type; if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) tcp_ecn_send(sk, skb, tcp_header_size); @@ -1018,8 +1024,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); - /* OK, its time to fill skb_shinfo(skb)->gso_segs */ + tp->segs_out += tcp_skb_pcount(skb); + /* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */ skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); + skb_shinfo(skb)->gso_size = tcp_skb_mss(skb); /* Our usage of tstamp should remain private */ skb->tstamp.tv64 = 0; @@ -1056,25 +1064,17 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) } /* Initialize TSO segments for a packet. */ -static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, - unsigned int mss_now) +static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { - struct skb_shared_info *shinfo = skb_shinfo(skb); - - /* Make sure we own this skb before messing gso_size/gso_segs */ - WARN_ON_ONCE(skb_cloned(skb)); - if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) { /* Avoid the costly divide in the normal * non-TSO case. */ tcp_skb_pcount_set(skb, 1); - shinfo->gso_size = 0; - shinfo->gso_type = 0; + TCP_SKB_CB(skb)->tcp_gso_size = 0; } else { tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now)); - shinfo->gso_size = mss_now; - shinfo->gso_type = sk->sk_gso_type; + TCP_SKB_CB(skb)->tcp_gso_size = mss_now; } } @@ -1163,7 +1163,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, return -ENOMEM; /* Get a new skb... force flag on. */ - buff = sk_stream_alloc_skb(sk, nsize, gfp); + buff = sk_stream_alloc_skb(sk, nsize, gfp, true); if (!buff) return -ENOMEM; /* We'll just try again later. */ @@ -1206,8 +1206,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, old_factor = tcp_skb_pcount(skb); /* Fix up tso_factor for both original and new SKB. */ - tcp_set_skb_tso_segs(sk, skb, mss_now); - tcp_set_skb_tso_segs(sk, buff, mss_now); + tcp_set_skb_tso_segs(skb, mss_now); + tcp_set_skb_tso_segs(buff, mss_now); /* If this packet has been sent out already, we must * adjust the various packet counters. @@ -1287,7 +1287,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) /* Any change of skb->len requires recalculation of tso factor. */ if (tcp_skb_pcount(skb) > 1) - tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); + tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb)); return 0; } @@ -1619,13 +1619,12 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, * This must be invoked the first time we consider transmitting * SKB onto the wire. */ -static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, - unsigned int mss_now) +static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) { - tcp_set_skb_tso_segs(sk, skb, mss_now); + tcp_set_skb_tso_segs(skb, mss_now); tso_segs = tcp_skb_pcount(skb); } return tso_segs; @@ -1680,7 +1679,7 @@ static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb, const struct tcp_sock *tp = tcp_sk(sk); unsigned int cwnd_quota; - tcp_init_tso_segs(sk, skb, cur_mss); + tcp_init_tso_segs(skb, cur_mss); if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) return 0; @@ -1722,7 +1721,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, if (skb->len != skb->data_len) return tcp_fragment(sk, skb, len, mss_now, gfp); - buff = sk_stream_alloc_skb(sk, 0, gfp); + buff = sk_stream_alloc_skb(sk, 0, gfp, true); if (unlikely(!buff)) return -ENOMEM; @@ -1749,8 +1748,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, tcp_fragment_tstamp(skb, buff); /* Fix up tso_factor for both original and new SKB. */ - tcp_set_skb_tso_segs(sk, skb, mss_now); - tcp_set_skb_tso_segs(sk, buff, mss_now); + tcp_set_skb_tso_segs(skb, mss_now); + tcp_set_skb_tso_segs(buff, mss_now); /* Link BUFF into the send queue. */ __skb_header_release(buff); @@ -1941,7 +1940,7 @@ static int tcp_mtu_probe(struct sock *sk) } /* We're allowed to probe. Build it now. */ - nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC); + nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false); if (!nskb) return -1; sk->sk_wmem_queued += nskb->truesize; @@ -1984,7 +1983,7 @@ static int tcp_mtu_probe(struct sock *sk) skb->len, 0); } else { __pskb_trim_head(skb, copy); - tcp_set_skb_tso_segs(sk, skb, mss_now); + tcp_set_skb_tso_segs(skb, mss_now); } TCP_SKB_CB(skb)->seq += copy; } @@ -1994,7 +1993,7 @@ static int tcp_mtu_probe(struct sock *sk) if (len >= probe_size) break; } - tcp_init_tso_segs(sk, nskb, nskb->len); + tcp_init_tso_segs(nskb, nskb->len); /* We're ready to send. If this fails, the probe will * be resegmented into mss-sized pieces by tcp_write_xmit(). @@ -2056,7 +2055,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, while ((skb = tcp_send_head(sk))) { unsigned int limit; - tso_segs = tcp_init_tso_segs(sk, skb, mss_now); + tso_segs = tcp_init_tso_segs(skb, mss_now); BUG_ON(!tso_segs); if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { @@ -2078,7 +2077,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) break; - if (tso_segs == 1 || !max_segs) { + if (tso_segs == 1) { if (unlikely(!tcp_nagle_test(tp, skb, mss_now, (tcp_skb_is_last(sk, skb) ? nonagle : TCP_NAGLE_PUSH)))) @@ -2091,7 +2090,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } limit = mss_now; - if (tso_segs > 1 && max_segs && !tcp_urg_mode(tp)) + if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, min_t(unsigned int, cwnd_quota, @@ -2392,7 +2391,7 @@ u32 __tcp_select_window(struct sock *sk) if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; - if (sk_under_memory_pressure(sk)) + if (tcp_under_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); @@ -2610,11 +2609,15 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (unlikely(oldpcount > 1)) { if (skb_unclone(skb, GFP_ATOMIC)) return -ENOMEM; - tcp_init_tso_segs(sk, skb, cur_mss); + tcp_init_tso_segs(skb, cur_mss); tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb)); } } + /* RFC3168, section 6.1.1.1. ECN fallback */ + if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) + tcp_ecn_clear_syn(sk, skb); + tcp_retrans_try_collapse(sk, skb, cur_mss); /* Make a copy, if the first transmission SKB clone we made @@ -2816,8 +2819,10 @@ begin_fwd: * connection tear down and (memory) recovery. * Otherwise tcp_send_fin() could be tempted to either delay FIN * or even be forced to close flow without any FIN. + * In general, we want to allow one skb per socket to avoid hangs + * with edge trigger epoll() */ -static void sk_forced_wmem_schedule(struct sock *sk, int size) +void sk_forced_mem_schedule(struct sock *sk, int size) { int amt, status; @@ -2841,7 +2846,7 @@ void tcp_send_fin(struct sock *sk) * Note: in the latter case, FIN packet will be sent after a timeout, * as TCP stack thinks it has already been transmitted. */ - if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) { + if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) { coalesce: TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->end_seq++; @@ -2864,7 +2869,7 @@ coalesce: return; } skb_reserve(skb, MAX_TCP_HEADER); - sk_forced_wmem_schedule(sk, skb->truesize); + sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ tcp_init_nondata_skb(skb, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); @@ -3175,7 +3180,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) /* limit to order-0 allocations */ space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER)); - syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation); + syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false); if (!syn_data) goto fallback; syn_data->ip_summed = CHECKSUM_PARTIAL; @@ -3241,7 +3246,7 @@ int tcp_connect(struct sock *sk) return 0; } - buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); + buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true); if (unlikely(!buff)) return -ENOBUFS; @@ -3382,7 +3387,7 @@ EXPORT_SYMBOL_GPL(tcp_send_ack); * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is * out-of-date with SND.UNA-1 to probe window. */ -static int tcp_xmit_probe_skb(struct sock *sk, int urgent) +static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -3400,6 +3405,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) */ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); skb_mstamp_get(&skb->skb_mstamp); + NET_INC_STATS_BH(sock_net(sk), mib); return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } @@ -3407,12 +3413,12 @@ void tcp_send_window_probe(struct sock *sk) { if (sk->sk_state == TCP_ESTABLISHED) { tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; - tcp_xmit_probe_skb(sk, 0); + tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE); } } /* Initiate keepalive or window probe from timer. */ -int tcp_write_wakeup(struct sock *sk) +int tcp_write_wakeup(struct sock *sk, int mib) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -3440,7 +3446,7 @@ int tcp_write_wakeup(struct sock *sk) if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC)) return -1; } else if (!tcp_skb_pcount(skb)) - tcp_set_skb_tso_segs(sk, skb, mss); + tcp_set_skb_tso_segs(skb, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); @@ -3449,8 +3455,8 @@ int tcp_write_wakeup(struct sock *sk) return err; } else { if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) - tcp_xmit_probe_skb(sk, 1); - return tcp_xmit_probe_skb(sk, 0); + tcp_xmit_probe_skb(sk, 1, mib); + return tcp_xmit_probe_skb(sk, 0, mib); } } @@ -3464,7 +3470,7 @@ void tcp_send_probe0(struct sock *sk) unsigned long probe_max; int err; - err = tcp_write_wakeup(sk); + err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); if (tp->packets_out || !tcp_send_head(sk)) { /* Cancel probe timer, if it is not required. */ @@ -3490,7 +3496,7 @@ void tcp_send_probe0(struct sock *sk) probe_max = TCP_RESOURCE_PROBE_INTERVAL; } inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - inet_csk_rto_backoff(icsk, probe_max), + tcp_probe0_when(sk, probe_max), TCP_RTO_MAX); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 8c65dc147d8b..5b752f58a900 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -247,7 +247,7 @@ void tcp_delack_timer_handler(struct sock *sk) } out: - if (sk_under_memory_pressure(sk)) + if (tcp_under_memory_pressure(sk)) sk_mem_reclaim(sk); } @@ -616,7 +616,7 @@ static void tcp_keepalive_timer (unsigned long data) tcp_write_err(sk); goto out; } - if (tcp_write_wakeup(sk) <= 0) { + if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) { icsk->icsk_probes_out++; elapsed = keepalive_intvl_when(tp); } else { diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index b763c39ae1d7..6116604bf6e8 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -170,6 +170,7 @@ static const struct inet_diag_handler udp_diag_handler = { .dump_one = udp_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDP, + .idiag_info_size = 0, }; static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, @@ -190,6 +191,7 @@ static const struct inet_diag_handler udplite_diag_handler = { .dump_one = udplite_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDPLITE, + .idiag_info_size = 0, }; static int __init udp_diag_init(void) diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 6bb98cc193c9..933ea903f7b8 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -15,12 +15,10 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, struct socket *sock = NULL; struct sockaddr_in udp_addr; - err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock); + err = sock_create_kern(net, AF_INET, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; - sk_change_net(sock->sk, net); - udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->local_ip; udp_addr.sin_port = cfg->local_udp_port; @@ -47,7 +45,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, error: if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); } *sockp = NULL; return err; @@ -101,7 +99,7 @@ void udp_tunnel_sock_release(struct socket *sock) { rcu_assign_sk_user_data(sock->sk, NULL); kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); } EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); |